* Copyright (c) 2004 Novell, Inc. All Rights Reserved. * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; version 2 of the License. * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA /* This code was adapted from the sample RTF reader found here: * http://msdn.microsoft.com/library/default.asp?url=/library/en-us/dnrtfspec/html/rtfspec.asp /* Internal RTF parser error codes */ #define NMRTF_OK 0 /* Everything's fine! */ #define NMRTF_STACK_UNDERFLOW 1 /* Unmatched '}' */ #define NMRTF_STACK_OVERFLOW 2 /* Too many '{' -- memory exhausted */ #define NMRTF_UNMATCHED_BRACE 3 /* RTF ended during an open group. */ #define NMRTF_INVALID_HEX 4 /* invalid hex character found in data */ #define NMRTF_BAD_TABLE 5 /* RTF table (sym or prop) invalid */ #define NMRTF_ASSERTION 6 /* Assertion failure */ #define NMRTF_EOF 7 /* End of file reached while reading RTF */ #define NMRTF_CONVERT_ERROR 8 /* Error converting text */ #define NMRTF_MAX_DEPTH 256 } NMRtfState; /* Rtf State */ /* Property types that we care about */ /* All we care about for now is the font. * bold, italic, underline, etc. should be char *keyword; /* RTF keyword */ int default_val; /* default value to use */ gboolean pass_default; /* true to use default value from this table */ NMRtfKeywordType kwd_type; /* the type of the keyword */ int action; /* property type if the keyword represents a property */ /* destination type if the keyword represents a destination */ /* character to print if the keyword represents a character */ NMRtfState rds; /* destination state */ NMRtfState ris; /* internal state */ NMRtfCharProp chp; /* current character properties (ie. font, bold, italic, etc.) */ GSList *font_table; /* the font table */ GSList *saved; /* saved state stack */ int param; /* numeric parameter for the current keyword */ long bytes_to_skip; /* number of bytes to skip (after encountering \bin) */ int depth; /* how many groups deep are we */ gboolean skip_unknown; /* if true, skip any unknown destinations (this is set after encountering '\*') */ char *input; /* input string */ guchar nextch; /* next char in input */ gboolean nextch_available; /* nextch value is set */ GString *ansi; /* Temporary ansi text, will be convert/flushed to the output string */ GString *output; /* The plain text UTF8 string */ static int rtf_parse(NMRtfContext *ctx); static int rtf_push_state(NMRtfContext *ctx); static int rtf_pop_state(NMRtfContext *ctx); static NMRtfFont *rtf_get_font(NMRtfContext *ctx, int index); static int rtf_get_char(NMRtfContext *ctx, guchar *ch); static int rtf_unget_char(NMRtfContext *ctx, guchar ch); static int rtf_flush_data(NMRtfContext *ctx); static int rtf_parse_keyword(NMRtfContext *ctx); static int rtf_dispatch_control(NMRtfContext *ctx, char *keyword, int param, gboolean param_set); static int rtf_dispatch_char(NMRtfContext *ctx, guchar ch); static int rtf_dispatch_unicode_char(NMRtfContext *ctx, gunichar ch); static int rtf_print_char(NMRtfContext *ctx, guchar ch); static int rtf_print_unicode_char(NMRtfContext *ctx, gunichar ch); static int rtf_change_destination(NMRtfContext *ctx, NMRtfDestinationType dest); static int rtf_dispatch_special(NMRtfContext *ctx, NMRtfSpecialKwd special); static int rtf_apply_property(NMRtfContext *ctx, NMRtfProperty prop, int val); /* Keyword descriptions */ NMRtfSymbol rtf_symbols[] = { /* keyword, default, pass_default, keyword_type, action */ {"fonttbl", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_FONTTABLE}, {"f", 0, FALSE, NMRTF_KWD_PROP, NMRTF_PROP_FONT_IDX}, {"fcharset", 0, FALSE, NMRTF_KWD_PROP, NMRTF_PROP_FONT_CHARSET}, {"par", 0, FALSE, NMRTF_KWD_CHAR, 0x0a}, {"line", 0, FALSE, NMRTF_KWD_CHAR, 0x0a}, {"\0x0a", 0, FALSE, NMRTF_KWD_CHAR, 0x0a}, {"\0x0d", 0, FALSE, NMRTF_KWD_CHAR, 0x0a}, {"tab", 0, FALSE, NMRTF_KWD_CHAR, 0x09}, {"\r", 0, FALSE, NMRTF_KWD_CHAR, '\r'}, {"\n", 0, FALSE, NMRTF_KWD_CHAR, '\n'}, {"ldblquote",0, FALSE, NMRTF_KWD_CHAR, '"'}, {"rdblquote",0, FALSE, NMRTF_KWD_CHAR, '"'}, {"{", 0, FALSE, NMRTF_KWD_CHAR, '{'}, {"}", 0, FALSE, NMRTF_KWD_CHAR, '}'}, {"\\", 0, FALSE, NMRTF_KWD_CHAR, '\\'}, {"bin", 0, FALSE, NMRTF_KWD_SPEC, NMRTF_SPECIAL_BIN}, {"*", 0, FALSE, NMRTF_KWD_SPEC, NMRTF_SPECIAL_SKIP}, {"'", 0, FALSE, NMRTF_KWD_SPEC, NMRTF_SPECIAL_HEX}, {"u", 0, FALSE, NMRTF_KWD_SPEC, NMRTF_SPECIAL_UNICODE}, {"colortbl", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"author", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"buptim", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"comment", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"creatim", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"doccomm", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"footer", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"footerf", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"footerl", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"footerr", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"footnote", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"ftncn", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"ftnsep", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"ftnsepc", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"header", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"headerf", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"headerl", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"headerr", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"info", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"keywords", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"operator", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"pict", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"printim", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"private1", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"revtim", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"rxe", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"stylesheet", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"subject", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"tc", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"title", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"txe", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP}, {"xe", 0, FALSE, NMRTF_KWD_DEST, NMRTF_DEST_SKIP} int table_size = sizeof(rtf_symbols) / sizeof(NMRtfSymbol); NMRtfContext *ctx = g_new0(NMRtfContext, 1); ctx->nextch_available = FALSE; ctx->ansi = g_string_new(""); ctx->output = g_string_new(""); nm_rtf_strip_formatting(NMRtfContext *ctx, const char *input) ctx->input = (char *)input; return g_strdup(ctx->output->str); purple_debug_info("novell", "RTF parser failed with error code %d\n", status); nm_rtf_font_free(NMRtfFont *font) g_return_if_fail(font != NULL); nm_rtf_deinit(NMRtfContext *ctx) g_slist_free_full(ctx->font_table, (GDestroyNotify)nm_rtf_font_free); g_slist_free_full(ctx->saved, g_free); g_string_free(ctx->ansi, TRUE); g_string_free(ctx->output, TRUE); get_current_encoding(NMRtfContext *ctx) font = rtf_get_font(ctx, ctx->chp.font_idx); purple_debug_info("novell", "Unhandled font charset %d\n", font->charset); * Add an entry to the font table rtf_add_font_entry(NMRtfContext *ctx, int number, const char *name, int charset) NMRtfFont *font = g_new0(NMRtfFont, 1); font->name = g_strdup(name); purple_debug_info("novell", "Adding font to table: #%d\t%s\t%d\n", font->number, font->name, font->charset); ctx->font_table = g_slist_append(ctx->font_table, font); * Return the nth entry in the font table rtf_get_font(NMRtfContext *ctx, int nth) font = g_slist_nth_data(ctx->font_table, nth); * Isolate RTF keywords and send them to rtf_parse_keyword; * Push and pop state at the start and end of RTF groups; * Send text to rtf_dispatch_char for further processing. rtf_parse(NMRtfContext *ctx) while (rtf_get_char(ctx, &ch) == NMRTF_OK) { return NMRTF_STACK_UNDERFLOW; /* if we're parsing binary data, handle it directly */ if (ctx->ris == NMRTF_STATE_BIN) { if ((status = rtf_dispatch_char(ctx, ch)) != NMRTF_OK) if (ctx->depth > NMRTF_MAX_DEPTH) return NMRTF_STACK_OVERFLOW; if ((status = rtf_push_state(ctx)) != NMRTF_OK) /* for some reason there is always an unwanted '\par' at the end */ if (ctx->rds == NMRTF_STATE_NORMAL) { if (ctx->output->str[len-1] == '\n') ctx->output = g_string_truncate(ctx->output, len-1); if ((status = rtf_pop_state(ctx)) != NMRTF_OK) return NMRTF_STACK_OVERFLOW; if ((status = rtf_parse_keyword(ctx)) != NMRTF_OK) case 0x0a: /* cr and lf are noise characters... */ if (ctx->ris == NMRTF_STATE_NORMAL) { if ((status = rtf_dispatch_char(ctx, ch)) != NMRTF_OK) } else { /* parsing a hex encoded character */ if (ctx->ris != NMRTF_STATE_HEX) hex_byte = hex_byte << 4; hex_byte += (char) ch - '0'; if (ch < 'a' || ch > 'f') return NMRTF_INVALID_HEX; hex_byte += (char) ch - 'a' + 10; if (ch < 'A' || ch > 'F') return NMRTF_INVALID_HEX; hex_byte += (char) ch - 'A' + 10; if ((status = rtf_dispatch_char(ctx, hex_byte)) != NMRTF_OK) ctx->ris = NMRTF_STATE_NORMAL; return NMRTF_STACK_OVERFLOW; return NMRTF_UNMATCHED_BRACE; * Push the current state onto stack rtf_push_state(NMRtfContext *ctx) NMRtfStateSave *save = g_new0(NMRtfStateSave, 1); ctx->saved = g_slist_prepend(ctx->saved, save); ctx->ris = NMRTF_STATE_NORMAL; * Restore the state at the top of the stack rtf_pop_state(NMRtfContext *ctx) NMRtfStateSave *save_old; return NMRTF_STACK_UNDERFLOW; save_old = ctx->saved->data; ctx->chp = save_old->chp; ctx->rds = save_old->rds; ctx->ris = save_old->ris; ctx->saved = g_slist_delete_link(ctx->saved, link_old); * Get a control word (and its associated value) and rtf_parse_keyword(NMRtfContext *ctx) gboolean param_set = FALSE; if ((status = rtf_get_char(ctx, &ch)) != NMRTF_OK) /* a control symbol; no delimiter. */ return rtf_dispatch_control(ctx, keyword, 0, param_set); for (i = 0; isalpha(ch) && (i < sizeof(keyword) - 1); rtf_get_char(ctx, &ch)) { /* check for '-' indicated a negative parameter value */ if ((status = rtf_get_char(ctx, &ch)) != NMRTF_OK) /* check for numerical param */ for (i = 0; isdigit(ch) && (i < sizeof(parameter) - 1); rtf_get_char(ctx, &ch)) { parameter[i] = (char) ch; ctx->param = param = atoi(parameter); ctx->param = param = -param; /* space after control is optional, put character back if it is not a space */ return rtf_dispatch_control(ctx, keyword, param, param_set); * Route the character to the appropriate destination rtf_dispatch_char(NMRtfContext *ctx, guchar ch) if (ctx->ris == NMRTF_STATE_BIN && --(ctx->bytes_to_skip) <= 0) ctx->ris = NMRTF_STATE_NORMAL; return rtf_print_char(ctx, ch); case NMRTF_STATE_FONTTABLE: rtf_add_font_entry(ctx, ctx->chp.font_idx, ctx->ansi->str, ctx->chp.font_charset); g_string_truncate(ctx->ansi, 0); return rtf_print_char(ctx, ch); /* Handle a unicode character */ rtf_dispatch_unicode_char(NMRtfContext *ctx, gunichar ch) case NMRTF_STATE_FONTTABLE: return rtf_print_unicode_char(ctx, ch); rtf_print_char(NMRtfContext *ctx, guchar ch) ctx->ansi = g_string_append_c(ctx->ansi, ch); * Output a unicode character rtf_print_unicode_char(NMRtfContext *ctx, gunichar ch) /* convert and flush the ansi buffer to the utf8 buffer */ /* convert the unicode character to utf8 and add directly to the output buffer */ num = g_unichar_to_utf8((gunichar) ch, buf); purple_debug_info("novell", "converted unichar 0x%X to utf8 char %s\n", ch, buf); ctx->output = g_string_append(ctx->output, buf); rtf_flush_data(NMRtfContext *ctx) if (ctx->rds == NMRTF_STATE_NORMAL && ctx->ansi->len > 0) { enc = get_current_encoding(ctx); conv_data = g_convert(ctx->ansi->str, ctx->ansi->len, "UTF-8", enc, ctx->output = g_string_append(ctx->output, conv_data); ctx->ansi = g_string_truncate(ctx->ansi, 0); status = NMRTF_CONVERT_ERROR; purple_debug_info("novell", "failed to convert data! error code = %d msg = %s\n", gerror->code, gerror->message); * Handle a property change rtf_apply_property(NMRtfContext *ctx, NMRtfProperty prop, int val) if (ctx->rds == NMRTF_STATE_SKIP) /* If we're skipping text, */ return NMRTF_OK; /* don't do anything. */ /* Need to flush any temporary data before a property change*/ case NMRTF_PROP_FONT_IDX: case NMRTF_PROP_FONT_CHARSET: ctx->chp.font_charset = val; * Search the table for keyword and evaluate it appropriately. * keyword: The RTF control to evaluate. * param: The parameter of the RTF control. * param_set: TRUE if the control had a parameter; (that is, if param is valid) rtf_dispatch_control(NMRtfContext *ctx, char *keyword, int param, gboolean param_set) for (idx = 0; idx < table_size; idx++) { if (purple_strequal(keyword, rtf_symbols[idx].keyword)) ctx->rds = NMRTF_STATE_SKIP; ctx->skip_unknown = FALSE; /* found it! use kwd_type and action to determine what to do with it. */ ctx->skip_unknown = FALSE; switch (rtf_symbols[idx].kwd_type) { if (rtf_symbols[idx].pass_default || !param_set) param = rtf_symbols[idx].default_val; return rtf_apply_property(ctx, rtf_symbols[idx].action, param); return rtf_dispatch_char(ctx, rtf_symbols[idx].action); return rtf_change_destination(ctx, rtf_symbols[idx].action); return rtf_dispatch_special(ctx, rtf_symbols[idx].action); * Change to the destination specified. rtf_change_destination(NMRtfContext *ctx, NMRtfDestinationType type) /* if we're skipping text, don't do anything */ if (ctx->rds == NMRTF_STATE_SKIP) case NMRTF_DEST_FONTTABLE: ctx->rds = NMRTF_STATE_FONTTABLE; g_string_truncate(ctx->ansi, 0); ctx->rds = NMRTF_STATE_SKIP; /* when in doubt, skip it... */ * Dispatch an RTF control that needs special processing rtf_dispatch_special(NMRtfContext *ctx, NMRtfSpecialKwd type) if (ctx->rds == NMRTF_STATE_SKIP && type != NMRTF_SPECIAL_BIN) /* if we're skipping, and it's not */ return NMRTF_OK; /* the \bin keyword, ignore it. */ ctx->ris = NMRTF_STATE_BIN; ctx->bytes_to_skip = ctx->param; ctx->skip_unknown = TRUE; ctx->ris = NMRTF_STATE_HEX; case NMRTF_SPECIAL_UNICODE: purple_debug_info("novell", "parsing unichar\n"); status = rtf_dispatch_unicode_char(ctx, ctx->param); status = rtf_get_char(ctx, &ch); status = NMRTF_BAD_TABLE; * Get the next character from the input stream rtf_get_char(NMRtfContext *ctx, guchar *ch) if (ctx->nextch_available) { ctx->nextch_available = FALSE; * Move a character back into the input stream rtf_unget_char(NMRtfContext *ctx, guchar ch) ctx->nextch_available = TRUE;