HGKeeper

/*
 * Purple - Internet Messaging Library
 * Copyright (C) Pidgin Developers <devel@pidgin.im>
 *
 * Purple is the legal property of its developers, whose names are too numerous
 * to list here. Please refer to the COPYRIGHT file distributed with this
 * source distribution.
 *
 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this library; if not, see <https://www.gnu.org/licenses/>.
 */

#include "purplemarkup.h"

#include "util.h"

const char *

purple_markup_unescape_entity(const char *text, int *length)

{

const char *pln;

int len;

if (!text || *text != '&')

return NULL;

#define IS_ENTITY(s) (!g_ascii_strncasecmp(text, s, (len = sizeof(s) - 1)))

if(IS_ENTITY("&"))

pln = "&";

else if(IS_ENTITY("<"))

pln = "<";

else if(IS_ENTITY(">"))

pln = ">";

else if(IS_ENTITY(" "))

pln = " ";

else if(IS_ENTITY("©"))

pln = "\302\251"; /* or use g_unichar_to_utf8(0xa9); */

else if(IS_ENTITY("""))

pln = "\"";

else if(IS_ENTITY("®"))

pln = "\302\256"; /* or use g_unichar_to_utf8(0xae); */

else if(IS_ENTITY("'"))

pln = "\'";

else if(text[1] == '#' && (g_ascii_isxdigit(text[2]) || text[2] == 'x')) {

static char buf[7];

const char *start = text + 2;

char *end;

guint64 pound;

int base = 10;

int buflen;

if (*start == 'x') {

base = 16;

start++;

}

pound = g_ascii_strtoull(start, &end, base);

if (pound == 0 || pound > INT_MAX || *end != ';') {

return NULL;

}

len = (end - text) + 1;

buflen = g_unichar_to_utf8((gunichar)pound, buf);

buf[buflen] = '\0';

pln = buf;

}

else

return NULL;

if (length)

*length = len;

return pln;

}

/* One entry on the list of currently open tags kept by
 * purple_markup_html_to_xhtml while it walks the input. */
struct purple_parse_tag {
	/* Tag name as it is matched (case-insensitively) against closing tags
	 * in the input. */
	char *src_tag;
	/* Tag name written to the XHTML output when the tag is closed. */
	char *dest_tag;
	/* When TRUE no closing tag is written to the XHTML output for this
	 * entry (set for a <font> that produced no <span>). */
	gboolean ignore;
};

/* ALLOW_TAG_ALT(x, y): if the input at `c` opens tag `x` (with or without
 * attributes), copy it to the XHTML output as tag `y`, remember it on `tags`
 * unless it is self-closing, advance `c` past it and `continue` the
 * enclosing loop.  `x` and `y` must be string literals.  The macro expects
 * the locals `c`, `tags`, `xhtml` and `plain` of
 * purple_markup_html_to_xhtml to be in scope.
 *
 * NOTE: Do not put `do {} while(0)` around this macro (as this is the method
 *       recommended in the GCC docs). It contains 'continue's that should
 *       affect the while-loop in purple_markup_html_to_xhtml and doing the
 *       above would break that.
 *       Also, remember to put braces in constructs that require them for
 *       multiple statements when using this macro. */
#define ALLOW_TAG_ALT(x, y) if(!g_ascii_strncasecmp(c, "<" x " ", strlen("<" x " "))) { \
		const char *o = c + strlen("<" x); \
		const char *p = NULL, *q = NULL, *r = NULL; \
		/* o = iterating over full tag \
		 * p = > (end of tag) \
		 * q = start of quoted bit \
		 * r = < inside tag \
		 */ \
		GString *innards = g_string_new(""); \
		while(o && *o) { \
			if(!q && (*o == '\"' || *o == '\'') ) { \
				q = o; \
			} else if(q) { \
				if(*o == *q) { /* end of quoted bit */ \
					char *unescaped = g_strndup(q+1, o-q-1); \
					char *escaped = g_markup_escape_text(unescaped, -1); \
					g_string_append_printf(innards, "%c%s%c", *q, escaped, *q); \
					g_free(unescaped); \
					g_free(escaped); \
					q = NULL; \
				} else if(*c == '\\') { \
					/* NOTE(review): this tests *c (always '<' here), so the \
					 * branch never runs; *o was presumably intended. Left \
					 * unchanged because enabling it alters parsing. */ \
					o++; \
				} \
			} else if(*o == '<') { \
				r = o; \
			} else if(*o == '>') { \
				p = o; \
				break; \
			} else { \
				innards = g_string_append_c(innards, *o); \
			} \
			o++; \
		} \
		if(p && !r) { /* got an end of tag and no other < earlier */\
			if(*(p-1) != '/') { \
				struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); \
				pt->src_tag = x; \
				pt->dest_tag = y; \
				tags = g_list_prepend(tags, pt); \
			} \
			if(xhtml) { \
				xhtml = g_string_append(xhtml, "<" y); \
				xhtml = g_string_append(xhtml, innards->str); \
				xhtml = g_string_append_c(xhtml, '>'); \
			} \
			c = p + 1; \
		} else { /* got end of tag with earlier < *or* didn't get anything */ \
			if(xhtml) \
				xhtml = g_string_append(xhtml, "&lt;"); \
			if(plain) \
				plain = g_string_append_c(plain, '<'); \
			c++; \
		} \
		g_string_free(innards, TRUE); \
		continue; \
	} \
	if(!g_ascii_strncasecmp(c, "<" x, strlen("<" x)) && \
			(*(c+strlen("<" x)) == '>' || \
			 !g_ascii_strncasecmp(c+strlen("<" x), "/>", 2))) { \
		if(xhtml) \
			xhtml = g_string_append(xhtml, "<" y); \
		c += strlen("<" x); \
		if(*c != '/') { \
			struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); \
			pt->src_tag = x; \
			pt->dest_tag = y; \
			tags = g_list_prepend(tags, pt); \
			if(xhtml) \
				xhtml = g_string_append_c(xhtml, '>'); \
		} else { \
			if(xhtml) \
				xhtml = g_string_append(xhtml, "/>");\
		} \
		c = strchr(c, '>') + 1; \
		continue; \
	}

/* Don't forget to check the note above for ALLOW_TAG_ALT. */
#define ALLOW_TAG(x) ALLOW_TAG_ALT(x, x)

/*
 * Convert loosely formatted HTML into an XHTML rendering and/or a plain-text
 * rendering.
 *
 * @html:      input markup; a NULL pointer is treated like an empty string.
 * @xhtml_out: if non-NULL, receives a newly allocated XHTML string.
 * @plain_out: if non-NULL, receives a newly allocated plain-text string.
 *
 * At least one of the two output pointers must be non-NULL.  The caller owns
 * the returned strings.  Only the tags handled below are passed through; a
 * '<' that does not start a recognised tag is escaped as "&lt;" in the XHTML
 * output and copied literally to the plain-text output.  Tags still open at
 * the end of the input are closed in the XHTML output.
 */
void
purple_markup_html_to_xhtml(const char *html, char **xhtml_out,
                            char **plain_out)
{
	GString *xhtml = NULL;
	GString *plain = NULL;
	GString *url = NULL;    /* href of the most recent <a> */
	GString *cdata = NULL;  /* text seen since the most recent <a href> */
	GList *tags = NULL, *tag;
	const char *c = html;
	char quote = '\0';

/* Remember (and step over) an opening quote at ptr, if there is one. */
#define CHECK_QUOTE(ptr) if (*(ptr) == '\'' || *(ptr) == '\"') \
			quote = *(ptr++); \
		else \
			quote = '\0';

/* True while ptr is still inside the attribute value being read. */
#define VALID_CHAR(ptr) (*(ptr) && *(ptr) != quote && (quote || (*(ptr) != ' ' && *(ptr) != '>')))

	g_return_if_fail(xhtml_out != NULL || plain_out != NULL);

	if(xhtml_out)
		xhtml = g_string_new("");
	if(plain_out)
		plain = g_string_new("");

	while(c && *c) {
		if(*c == '<') {
			if(*(c+1) == '/') { /* closing tag */
				tag = tags;
				while(tag) {
					struct purple_parse_tag *pt = tag->data;
					if(!g_ascii_strncasecmp((c+2), pt->src_tag, strlen(pt->src_tag)) && *(c+strlen(pt->src_tag)+2) == '>') {
						c += strlen(pt->src_tag) + 3;
						break;
					}
					tag = tag->next;
				}
				if(tag) {
					/* Close every tag opened after the matching one, then
					 * the matching one itself. */
					while(tags) {
						struct purple_parse_tag *pt = tags->data;
						if(xhtml && !pt->ignore)
							g_string_append_printf(xhtml, "</%s>", pt->dest_tag);
						if(plain && purple_strequal(pt->src_tag, "a")) {
							/* if this is a link, we have to add the url to the plaintext, too */
							if (cdata && url &&
									(!g_string_equal(cdata, url) && (g_ascii_strncasecmp(url->str, "mailto:", 7) != 0 ||
									                                 g_utf8_collate(url->str + 7, cdata->str) != 0)))
							{
								char *unescaped = purple_unescape_html(url->str);
								g_string_append_printf(plain, " <%s>",
								                       g_strstrip(unescaped));
								g_free(unescaped);
							}
							if (cdata) {
								g_string_free(cdata, TRUE);
								cdata = NULL;
							}
						}
						if(tags == tag)
							break;
						tags = g_list_delete_link(tags, tags);
						g_free(pt);
					}
					g_free(tag->data);
					tags = g_list_delete_link(tags, tag);
				} else {
					/* a closing tag we weren't expecting...
					 * we'll let it slide, if it's really a tag...if it's
					 * just a </ we'll escape it properly */
					const char *end = c+2;
					while(*end && g_ascii_isalpha(*end))
						end++;
					if(*end == '>') {
						c = end+1;
					} else {
						if(xhtml)
							xhtml = g_string_append(xhtml, "&lt;");
						if(plain)
							plain = g_string_append_c(plain, '<');
						c++;
					}
				}
			} else { /* opening tag */
				ALLOW_TAG("blockquote");
				ALLOW_TAG("cite");
				ALLOW_TAG("div");
				ALLOW_TAG("em");
				ALLOW_TAG("h1");
				ALLOW_TAG("h2");
				ALLOW_TAG("h3");
				ALLOW_TAG("h4");
				ALLOW_TAG("h5");
				ALLOW_TAG("h6");
				/* we only allow html to start the message */
				if(c == html) {
					ALLOW_TAG("html");
				}
				ALLOW_TAG_ALT("i", "em");
				ALLOW_TAG_ALT("italic", "em");
				ALLOW_TAG("li");
				ALLOW_TAG("ol");
				ALLOW_TAG("p");
				ALLOW_TAG("pre");
				ALLOW_TAG("q");
				ALLOW_TAG("span");
				ALLOW_TAG("ul");

				/* we skip <HR> because it's not legal in XHTML-IM.  However,
				 * we still want to send something sensible, so we put a
				 * linebreak in its place. <BR> also needs special handling
				 * because putting a </BR> to close it would just be dumb. */
				if((!g_ascii_strncasecmp(c, "<br", 3)
							|| !g_ascii_strncasecmp(c, "<hr", 3))
						&& (*(c+3) == '>' ||
							!g_ascii_strncasecmp(c+3, "/>", 2) ||
							!g_ascii_strncasecmp(c+3, " />", 3))) {
					c = strchr(c, '>') + 1;
					if(xhtml)
						xhtml = g_string_append(xhtml, "<br/>");
					if(plain && *c != '\n')
						plain = g_string_append_c(plain, '\n');
					continue;
				}
				if(!g_ascii_strncasecmp(c, "<b>", 3) || !g_ascii_strncasecmp(c, "<bold>", strlen("<bold>")) || !g_ascii_strncasecmp(c, "<strong>", strlen("<strong>"))) {
					struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1);
					if (*(c+2) == '>')
						pt->src_tag = "b";
					else if (*(c+2) == 'o' || *(c+2) == 'O')
						/* The tag was matched case-insensitively, so <BOLD>
						 * must be told apart from <STRONG> the same way. */
						pt->src_tag = "bold";
					else
						pt->src_tag = "strong";
					pt->dest_tag = "span";
					tags = g_list_prepend(tags, pt);
					c = strchr(c, '>') + 1;
					if(xhtml)
						xhtml = g_string_append(xhtml, "<span style='font-weight: bold;'>");
					continue;
				}
				if(!g_ascii_strncasecmp(c, "<u>", 3) || !g_ascii_strncasecmp(c, "<underline>", strlen("<underline>"))) {
					struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1);
					pt->src_tag = *(c+2) == '>' ? "u" : "underline";
					pt->dest_tag = "span";
					tags = g_list_prepend(tags, pt);
					c = strchr(c, '>') + 1;
					if (xhtml)
						xhtml = g_string_append(xhtml, "<span style='text-decoration: underline;'>");
					continue;
				}
				if(!g_ascii_strncasecmp(c, "<s>", 3) || !g_ascii_strncasecmp(c, "<strike>", strlen("<strike>"))) {
					struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1);
					pt->src_tag = *(c+2) == '>' ? "s" : "strike";
					pt->dest_tag = "span";
					tags = g_list_prepend(tags, pt);
					c = strchr(c, '>') + 1;
					if(xhtml)
						xhtml = g_string_append(xhtml, "<span style='text-decoration: line-through;'>");
					continue;
				}
				if(!g_ascii_strncasecmp(c, "<sub>", 5)) {
					struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1);
					pt->src_tag = "sub";
					pt->dest_tag = "span";
					tags = g_list_prepend(tags, pt);
					c = strchr(c, '>') + 1;
					if(xhtml)
						xhtml = g_string_append(xhtml, "<span style='vertical-align:sub;'>");
					continue;
				}
				if(!g_ascii_strncasecmp(c, "<sup>", 5)) {
					struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1);
					pt->src_tag = "sup";
					pt->dest_tag = "span";
					tags = g_list_prepend(tags, pt);
					c = strchr(c, '>') + 1;
					if(xhtml)
						xhtml = g_string_append(xhtml, "<span style='vertical-align:super;'>");
					continue;
				}
				if (!g_ascii_strncasecmp(c, "<img", 4) && (*(c+4) == '>' || *(c+4) == ' ')) {
					const char *p = c + 4;
					GString *src = NULL, *alt = NULL;
/* Copy one attribute value, escaping bare '&' and single quotes because the
 * value is later emitted inside single quotes. */
#define ESCAPE(from, to)        \
		CHECK_QUOTE(from); \
		while (VALID_CHAR(from)) { \
			int len; \
			if ((*from == '&') && (purple_markup_unescape_entity(from, &len) == NULL)) \
				to = g_string_append(to, "&amp;"); \
			else if (*from == '\'') \
				to = g_string_append(to, "&apos;"); \
			else \
				to = g_string_append_c(to, *from); \
			from++; \
		}

					while (*p && *p != '>') {
						if (!g_ascii_strncasecmp(p, "src=", 4)) {
							const char *q = p + 4;
							if (src)
								g_string_free(src, TRUE);
							src = g_string_new("");
							ESCAPE(q, src);
							p = q;
						} else if (!g_ascii_strncasecmp(p, "alt=", 4)) {
							const char *q = p + 4;
							if (alt)
								g_string_free(alt, TRUE);
							alt = g_string_new("");
							ESCAPE(q, alt);
							p = q;
						} else {
							p++;
						}
					}
#undef ESCAPE
					if ((c = strchr(p, '>')) != NULL)
						c++;
					else
						c = p;
					/* src and alt are required! */
					if(src && xhtml)
						g_string_append_printf(xhtml, "<img src='%s' alt='%s' />", g_strstrip(src->str), alt ? alt->str : "");
					if(alt) {
						if(plain) {
							char *unescaped = purple_unescape_html(alt->str);
							plain = g_string_append(plain, unescaped);
							g_free(unescaped);
						}
						if(!src && xhtml) {
							xhtml = g_string_append(xhtml, alt->str);
						}
						g_string_free(alt, TRUE);
					}
					/* An <img> without src leaves src NULL; g_string_free
					 * must not be handed a NULL string. */
					if (src)
						g_string_free(src, TRUE);
					continue;
				}
				if (!g_ascii_strncasecmp(c, "<a", 2) && (*(c+2) == '>' || *(c+2) == ' ')) {
					const char *p = c + 2;
					struct purple_parse_tag *pt;
					while (*p && *p != '>') {
						if (!g_ascii_strncasecmp(p, "href=", 5)) {
							const char *q = p + 5;
							if (url)
								g_string_free(url, TRUE);
							url = g_string_new("");
							if (cdata)
								g_string_free(cdata, TRUE);
							cdata = g_string_new("");
							CHECK_QUOTE(q);
							while (VALID_CHAR(q)) {
								int len;
								if ((*q == '&') && (purple_markup_unescape_entity(q, &len) == NULL))
									url = g_string_append(url, "&amp;");
								else if (*q == '"')
									url = g_string_append(url, "&quot;");
								else
									url = g_string_append_c(url, *q);
								q++;
							}
							p = q;
						} else {
							p++;
						}
					}
					if ((c = strchr(p, '>')) != NULL)
						c++;
					else
						c = p;
					pt = g_new0(struct purple_parse_tag, 1);
					pt->src_tag = "a";
					pt->dest_tag = "a";
					tags = g_list_prepend(tags, pt);
					if(xhtml)
						g_string_append_printf(xhtml, "<a href=\"%s\">", url ? g_strstrip(url->str) : "");
					continue;
				}
/* Copy one attribute value for use inside a single-quoted style attribute:
 * bare '&' is escaped and single quotes become double quotes. */
#define ESCAPE(from, to)        \
		CHECK_QUOTE(from); \
		while (VALID_CHAR(from)) { \
			int len; \
			if ((*from == '&') && (purple_markup_unescape_entity(from, &len) == NULL)) \
				to = g_string_append(to, "&amp;"); \
			else if (*from == '\'') \
				to = g_string_append_c(to, '\"'); \
			else \
				to = g_string_append_c(to, *from); \
			from++; \
		}
				if(!g_ascii_strncasecmp(c, "<font", 5) && (*(c+5) == '>' || *(c+5) == ' ')) {
					const char *p = c + 5;
					GString *style = g_string_new("");
					struct purple_parse_tag *pt;
					while (*p && *p != '>') {
						if (!g_ascii_strncasecmp(p, "back=", 5)) {
							const char *q = p + 5;
							GString *color = g_string_new("");
							ESCAPE(q, color);
							g_string_append_printf(style, "background: %s; ", color->str);
							g_string_free(color, TRUE);
							p = q;
						} else if (!g_ascii_strncasecmp(p, "color=", 6)) {
							const char *q = p + 6;
							GString *color = g_string_new("");
							ESCAPE(q, color);
							g_string_append_printf(style, "color: %s; ", color->str);
							g_string_free(color, TRUE);
							p = q;
						} else if (!g_ascii_strncasecmp(p, "face=", 5)) {
							const char *q = p + 5;
							GString *face = g_string_new("");
							ESCAPE(q, face);
							g_string_append_printf(style, "font-family: %s; ", g_strstrip(face->str));
							g_string_free(face, TRUE);
							p = q;
						} else if (!g_ascii_strncasecmp(p, "size=", 5)) {
							const char *q = p + 5;
							int sz;
							const char *size = "medium";
							CHECK_QUOTE(q);
							sz = atoi(q);
							switch (sz)
							{
							case 1:
							  size = "xx-small";
							  break;
							case 2:
							  size = "small";
							  break;
							case 3:
							  size = "medium";
							  break;
							case 4:
							  size = "large";
							  break;
							case 5:
							  size = "x-large";
							  break;
							case 6:
							case 7:
							  size = "xx-large";
							  break;
							default:
							  break;
							}
							g_string_append_printf(style, "font-size: %s; ", size);
							p = q;
						} else {
							p++;
						}
					}
					if ((c = strchr(p, '>')) != NULL)
						c++;
					else
						c = p;
					pt = g_new0(struct purple_parse_tag, 1);
					pt->src_tag = "font";
					pt->dest_tag = "span";
					tags = g_list_prepend(tags, pt);
					if(style->len && xhtml)
						g_string_append_printf(xhtml, "<span style='%s'>", g_strstrip(style->str));
					else
						/* No <span> was opened, so none must be closed. */
						pt->ignore = TRUE;
					g_string_free(style, TRUE);
					continue;
				}
#undef ESCAPE
				if (!g_ascii_strncasecmp(c, "<body ", 6)) {
					const char *p = c + 6;
					gboolean did_something = FALSE;
					while (*p && *p != '>') {
						if (!g_ascii_strncasecmp(p, "bgcolor=", 8)) {
							const char *q = p + 8;
							struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1);
							GString *color = g_string_new("");
							CHECK_QUOTE(q);
							while (VALID_CHAR(q)) {
								color = g_string_append_c(color, *q);
								q++;
							}
							if (xhtml)
								g_string_append_printf(xhtml, "<span style='background: %s;'>", g_strstrip(color->str));
							g_string_free(color, TRUE);
							if ((c = strchr(p, '>')) != NULL)
								c++;
							else
								c = p;
							pt->src_tag = "body";
							pt->dest_tag = "span";
							tags = g_list_prepend(tags, pt);
							did_something = TRUE;
							break;
						}
						p++;
					}
					if (did_something) continue;
				}
				/* this has to come after the special case for bgcolor */
				ALLOW_TAG("body");
				if(!g_ascii_strncasecmp(c, "<!--", strlen("<!--"))) {
					/* Only treat it as a comment if it is terminated. */
					char *p = strstr(c + strlen("<!--"), "-->");
					if(p) {
						if(xhtml)
							xhtml = g_string_append(xhtml, "<!--");
						c += strlen("<!--");
						continue;
					}
				}

				/* Not a tag we know: emit the '<' as text. */
				if(xhtml)
					xhtml = g_string_append(xhtml, "&lt;");
				if(plain)
					plain = g_string_append_c(plain, '<');
				c++;
			}
		} else if(*c == '&') {
			char buf[7];
			const char *pln;
			int len;

			if ((pln = purple_markup_unescape_entity(c, &len)) == NULL) {
				len = 1;
				g_snprintf(buf, sizeof(buf), "%c", *c);
				pln = buf;
			}
			/* XHTML keeps the entity as written; plain text gets it decoded. */
			if(xhtml)
				xhtml = g_string_append_len(xhtml, c, len);
			if(plain)
				plain = g_string_append(plain, pln);
			if(cdata)
				cdata = g_string_append_len(cdata, c, len);
			c += len;
		} else {
			if(xhtml)
				xhtml = g_string_append_c(xhtml, *c);
			if(plain)
				plain = g_string_append_c(plain, *c);
			if(cdata)
				cdata = g_string_append_c(cdata, *c);
			c++;
		}
	}
	if(xhtml) {
		/* Close whatever is still open, innermost first. */
		for (tag = tags; tag ; tag = tag->next) {
			struct purple_parse_tag *pt = tag->data;
			if(!pt->ignore)
				g_string_append_printf(xhtml, "</%s>", pt->dest_tag);
		}
	}
	g_clear_list(&tags, g_free);
	if(xhtml_out)
		*xhtml_out = g_string_free(xhtml, FALSE);
	if(plain_out)
		*plain_out = g_string_free(plain, FALSE);
	if(url)
		g_string_free(url, TRUE);
	if (cdata)
		g_string_free(cdata, TRUE);
#undef CHECK_QUOTE
#undef VALID_CHAR
}

/* The following are probably reasonable changes:

* - \n should be converted to a normal space

* - in addition to <br>, <p> and <div> etc. should also be converted into \n

* - We want to turn </td>#whitespace<td> sequences into a single tab

* - We want to turn </tr>#whitespace<tr> sequences into a single \n

* - <script>...</script> and <style>...</style> should be completely removed

char *

purple_markup_strip_html(const char *str)

{

int i, j, k, entlen;

gboolean visible = TRUE;

gboolean closing_td_p = FALSE;

gchar *str2;

const gchar *cdata_close_tag = NULL, *ent;

gchar *href = NULL;

int href_st = 0;

if(!str)

return NULL;

str2 = g_strdup(str);

for (i = 0, j = 0; str2[i]; i++)

{

if (str2[i] == '<')

{

if (cdata_close_tag)

{

/* Note: Don't even assume any other tag is a tag in CDATA */

if (g_ascii_strncasecmp(str2 + i, cdata_close_tag,

strlen(cdata_close_tag)) == 0)

{

i += strlen(cdata_close_tag) - 1;

cdata_close_tag = NULL;

}

continue;

}

else if (g_ascii_strncasecmp(str2 + i, "<td", 3) == 0 && closing_td_p)

{

str2[j++] = '\t';

visible = TRUE;

}

else if (g_ascii_strncasecmp(str2 + i, "</td>", 5) == 0)

{

closing_td_p = TRUE;

visible = FALSE;

}

else

{

closing_td_p = FALSE;

visible = TRUE;

}

k = i + 1;

if(g_ascii_isspace(str2[k]))

visible = TRUE;

else if (str2[k])

{

/* Scan until we end the tag either implicitly (closed start

* tag) or explicitly, using a sloppy method (i.e., < or >

* inside quoted attributes will screw us up)

while (str2[k] && str2[k] != '<' && str2[k] != '>')

{

k++;

}

/* If we've got an <a> tag with an href, save the address

* to print later. */

if (g_ascii_strncasecmp(str2 + i, "<a", 2) == 0 &&

g_ascii_isspace(str2[i+2]))

{

int st; /* start of href, inclusive [ */

int end; /* end of href, exclusive ) */

char delim = ' ';

/* Find start of href */

for (st = i + 3; st < k; st++)

{

if (g_ascii_strncasecmp(str2+st, "href=", 5) == 0)

{

st += 5;

if (str2[st] == '"' || str2[st] == '\'')

{

delim = str2[st];

st++;

}

break;

}

/* find end of address */

for (end = st; end < k && str2[end] != delim; end++)

{

/* All the work is done in the loop construct above. */

}

/* If there's an address, save it. If there was

* already one saved, kill it. */

if (st < k)

{

char *tmp;

g_free(href);

tmp = g_strndup(str2 + st, end - st);

href = purple_unescape_html(tmp);

g_free(tmp);

href_st = j;

}

/* Replace </a> with an ascii representation of the

* address the link was pointing to. */

else if (href != NULL && g_ascii_strncasecmp(str2 + i, "</a>", 4) == 0)

{

size_t hrlen = strlen(href);

/* Only insert the href if it's different from the CDATA. */

if ((hrlen != (gsize)(j - href_st) ||

strncmp(str2 + href_st, href, hrlen)) &&

(hrlen != (gsize)(j - href_st + 7) || /* 7 == strlen("http://") */

strncmp(str2 + href_st, href + 7, hrlen - 7)))

{

str2[j++] = ' ';

str2[j++] = '(';

memmove(str2 + j, href, hrlen);

j += hrlen;

str2[j++] = ')';

g_free(href);

href = NULL;

}

/* Check for tags which should be mapped to newline (but ignore some of

* the tags at the beginning of the text) */

else if ((j && (g_ascii_strncasecmp(str2 + i, "<p>", 3) == 0

|| g_ascii_strncasecmp(str2 + i, "<tr", 3) == 0

|| g_ascii_strncasecmp(str2 + i, "<hr", 3) == 0

|| g_ascii_strncasecmp(str2 + i, "<li", 3) == 0

|| g_ascii_strncasecmp(str2 + i, "<div", 4) == 0))

|| g_ascii_strncasecmp(str2 + i, "<br", 3) == 0

|| g_ascii_strncasecmp(str2 + i, "</table>", 8) == 0)

{

str2[j++] = '\n';

}

/* Check for tags which begin CDATA and need to be closed */

else if (g_ascii_strncasecmp(str2 + i, "<script", 7) == 0)

{

cdata_close_tag = "</script>";

}

else if (g_ascii_strncasecmp(str2 + i, "<style", 6) == 0)

{

cdata_close_tag = "</style>";

}

/* Update the index and continue checking after the tag */

i = (str2[k] == '<' || str2[k] == '\0')? k - 1: k;

continue;

}

else if (cdata_close_tag)

{

continue;

}

else if (!g_ascii_isspace(str2[i]))

{

visible = TRUE;

}

if (str2[i] == '&' && (ent = purple_markup_unescape_entity(str2 + i, &entlen)) != NULL)

{

while (*ent)

str2[j++] = *ent++;

i += entlen - 1;

continue;

}

if (visible)

str2[j++] = g_ascii_isspace(str2[i])? ' ': str2[i];

}

g_free(href);

str2[j] = '\0';

return str2;

}

/* Return TRUE if @c cannot be part of a URL or e-mail address being
 * linkified: whitespace, ',', NUL, angle brackets or a double quote. */
static gboolean
badchar(char c)
{
	switch (c) {
	case ' ':
	case ',':
	case '\0':
	case '\n':
	case '\r':
	case '<':
	case '>':
	case '"':
		return TRUE;
	default:
		return FALSE;
	}
}

static gboolean

badentity(const char *c)

{

if (!g_ascii_strncasecmp(c, "<", 4) ||

!g_ascii_strncasecmp(c, ">", 4) ||

!g_ascii_strncasecmp(c, """, 6)) {

return TRUE;

}

return FALSE;

}

/*
 * Try to turn the URL that begins at @c into an anchor appended to @ret.
 *
 * @ret:          output buffer receiving the generated <a> element.
 * @start:        beginning of the whole text; lower bound when trimming.
 * @c:            first character of the candidate link.
 * @matchlen:     length of the scheme/prefix that triggered the match.
 * @urlprefix:    text to put in front of the URL inside href.
 * @inside_paren: number of currently open '(' in the surrounding text.
 *
 * Returns the position just past the consumed link, or @c unchanged when
 * nothing follows the prefix (in which case nothing is appended).
 */
static const char *
process_link(GString *ret,
		const char *start, const char *c,
		int matchlen,
		const char *urlprefix,
		int inside_paren)
{
	const char *stop;
	char *shown;
	char *target;

	/* Advance to the first character that really terminates the link. */
	for (stop = c; ; stop++) {
		if (!badchar(*stop) && !badentity(stop))
			continue;

		/* Only the bare prefix was present: not a link at all. */
		if (stop - c == matchlen)
			return c;

		/* A comma not followed by a space still belongs to the URL. */
		if (*stop == ',' && *(stop + 1) != ' ')
			continue;

		break;
	}

	/* A trailing '.' is sentence punctuation, and a trailing ')' closes a
	 * parenthesis opened before the link. */
	if (stop > start && *(stop - 1) == '.')
		stop--;
	if (stop > start && *(stop - 1) == ')' && inside_paren > 0)
		stop--;

	shown = g_strndup(c, stop - c);
	target = purple_unescape_html(shown);
	g_string_append_printf(ret, "<a href=\"%s%s\">%s</a>",
			urlprefix,
			target, shown);
	g_free(target);
	g_free(shown);

	return stop;
}

/*
 * Return a newly allocated copy of @text in which URLs and e-mail addresses
 * found outside of HTML tags are wrapped in <a href="..."> elements.
 *
 * Existing <a>...</a> elements are copied through untouched.  Returns NULL
 * when @text is NULL; otherwise the caller owns the result.
 */
char *
purple_markup_linkify(const char *text)
{
	const char *c, *t, *q = NULL;
	char *tmpurlbuf, *url_buf;
	gunichar g;
	gboolean inside_html = FALSE;
	int inside_paren = 0;
	GString *ret;

	if (text == NULL)
		return NULL;

	ret = g_string_new("");

	c = text;
	while (*c) {

		if(*c == '(' && !inside_html) {
			inside_paren++;
			ret = g_string_append_c(ret, *c);
			c++;
		}

		if(inside_html) {
			if(*c == '>') {
				inside_html = FALSE;
			} else if(!q && (*c == '\"' || *c == '\'')) {
				q = c;
			} else if(q) {
				if(*c == *q)
					q = NULL;
			}
		} else if(*c == '<') {
			inside_html = TRUE;
			if (!g_ascii_strncasecmp(c, "<a", 2)) {
				/* Copy an existing anchor verbatim up to its "/a>". */
				while (1) {
					if (!g_ascii_strncasecmp(c, "/a>", 3)) {
						inside_html = FALSE;
						break;
					}
					ret = g_string_append_c(ret, *c);
					c++;
					if (!(*c))
						break;
				}
			}
		} else if (!g_ascii_strncasecmp(c, "http://", 7)) {
			c = process_link(ret, text, c, 7, "", inside_paren);
		} else if (!g_ascii_strncasecmp(c, "https://", 8)) {
			c = process_link(ret, text, c, 8, "", inside_paren);
		} else if (!g_ascii_strncasecmp(c, "ftp://", 6)) {
			c = process_link(ret, text, c, 6, "", inside_paren);
		} else if (!g_ascii_strncasecmp(c, "sftp://", 7)) {
			c = process_link(ret, text, c, 7, "", inside_paren);
		} else if (!g_ascii_strncasecmp(c, "file://", 7)) {
			c = process_link(ret, text, c, 7, "", inside_paren);
		} else if (!g_ascii_strncasecmp(c, "www.", 4) && c[4] != '.' && (c == text || badchar(c[-1]) || badentity(c-1))) {
			c = process_link(ret, text, c, 4, "http://", inside_paren);
		} else if (!g_ascii_strncasecmp(c, "ftp.", 4) && c[4] != '.' && (c == text || badchar(c[-1]) || badentity(c-1))) {
			c = process_link(ret, text, c, 4, "ftp://", inside_paren);
		} else if (!g_ascii_strncasecmp(c, "xmpp:", 5) && (c == text || badchar(c[-1]) || badentity(c-1))) {
			c = process_link(ret, text, c, 5, "", inside_paren);
		} else if (!g_ascii_strncasecmp(c, "mailto:", 7)) {
			t = c;
			while (1) {
				if (badchar(*t) || badentity(t)) {
					char *d;
					if (t - c == 7) {
						break;
					}
					if (t > text && *(t - 1) == '.')
						t--;
					/* Validate only the address, without any "?query". */
					if ((d = strstr(c + 7, "?")) != NULL && d < t)
						url_buf = g_strndup(c + 7, d - c - 7);
					else
						url_buf = g_strndup(c + 7, t - c - 7);
					if (!purple_email_is_valid(url_buf)) {
						g_free(url_buf);
						break;
					}
					g_free(url_buf);
					url_buf = g_strndup(c, t - c);
					tmpurlbuf = purple_unescape_html(url_buf);
					g_string_append_printf(ret, "<a href=\"%s\">%s</a>",
							  tmpurlbuf, url_buf);
					g_free(url_buf);
					g_free(tmpurlbuf);
					c = t;
					break;
				}
				t++;
			}
		} else if (c != text && (*c == '@')) {
			int flag;
			GString *gurl_buf = NULL;
			const char illegal_chars[] = "!@#$%^&*()[]{}/|\\<>\":;\r\n \0";

			if (strchr(illegal_chars,*(c - 1)) || strchr(illegal_chars, *(c + 1)))
				flag = 0;
			else {
				flag = 1;
				gurl_buf = g_string_new("");
			}

			t = c;
			while (flag) {
				/* iterate backwards grabbing the local part of an email address */
				g = g_utf8_get_char(t);
				if (badchar(*t) || (g >= 127) || (*t == '(') ||
					((*t == ';') && ((t > (text+2) && (!g_ascii_strncasecmp(t - 3, "&lt;", 4) ||
					                                   !g_ascii_strncasecmp(t - 3, "&gt;", 4))) ||
					                 (t > (text+4) && (!g_ascii_strncasecmp(t - 5, "&quot;", 6)))))) {
					/* local part will already be part of ret, strip it out */
					ret = g_string_truncate(ret, ret->len - (c - t));
					ret = g_string_append_unichar(ret, g);
					break;
				} else {
					g_string_prepend_unichar(gurl_buf, g);
					t = g_utf8_find_prev_char(text, t);
					/* NULL means we ran off the start of the text: the
					 * whole output so far was the local part. */
					if (t == NULL || t < text) {
						ret = g_string_assign(ret, "");
						break;
					}
				}
			}

			t = g_utf8_find_next_char(c, NULL);

			while (flag) {
				/* iterate forwards grabbing the domain part of an email address */
				g = g_utf8_get_char(t);
				if (badchar(*t) || (g >= 127) || (*t == ')') || badentity(t)) {
					char *d;

					url_buf = g_string_free(gurl_buf, FALSE);
					gurl_buf = NULL;

					/* strip off trailing periods */
					if (*url_buf) {
						for (d = url_buf + strlen(url_buf) - 1; *d == '.'; d--, t--)
							*d = '\0';
					}

					tmpurlbuf = purple_unescape_html(url_buf);
					if (purple_email_is_valid(tmpurlbuf)) {
						g_string_append_printf(ret, "<a href=\"mailto:%s\">%s</a>",
								tmpurlbuf, url_buf);
					} else {
						g_string_append(ret, url_buf);
					}
					g_free(url_buf);
					g_free(tmpurlbuf);
					c = t;

					break;
				} else {
					g_string_append_unichar(gurl_buf, g);
					t = g_utf8_find_next_char(t, NULL);
				}
			}

			if (gurl_buf) {
				g_string_free(gurl_buf, TRUE);
			}
		}

		if(*c == ')' && !inside_html) {
			inside_paren--;
			ret = g_string_append_c(ret, *c);
			c++;
		}

		if (*c == 0)
			break;

		ret = g_string_append_c(ret, *c);
		c++;

	}
	return g_string_free(ret, FALSE);
}

/*
 * Return a newly allocated copy of @in with every entity recognised by
 * purple_markup_unescape_entity replaced by its text; everything else is
 * copied unchanged.  Returns NULL when @in is NULL; otherwise the caller
 * owns the result.
 */
char *purple_unescape_text(const char *in)
{
	GString *ret;
	const char *c = in;

	if (in == NULL)
		return NULL;

	ret = g_string_new("");
	while (*c) {
		int len;
		const char *ent;

		if ((ent = purple_markup_unescape_entity(c, &len)) != NULL) {
			g_string_append(ret, ent);
			c += len;
		} else {
			g_string_append_c(ret, *c);
			c++;
		}
	}

	return g_string_free(ret, FALSE);
}

/*
 * Return a newly allocated copy of @html with every entity recognised by
 * purple_markup_unescape_entity replaced by its text and every literal
 * "<br>" (exact, lower-case spelling) replaced by a newline.  No other tags
 * are touched.  Returns NULL when @html is NULL; otherwise the caller owns
 * the result.
 */
char *purple_unescape_html(const char *html)
{
	GString *ret;
	const char *c = html;

	if (html == NULL)
		return NULL;

	ret = g_string_new("");
	while (*c) {
		int len;
		const char *ent;

		if ((ent = purple_markup_unescape_entity(c, &len)) != NULL) {
			g_string_append(ret, ent);
			c += len;
		} else if (!strncmp(c, "<br>", 4)) {
			g_string_append_c(ret, '\n');
			c += 4;
		} else {
			g_string_append_c(ret, *c);
			c++;
		}
	}

	return g_string_free(ret, FALSE);
}

char *

purple_markup_slice(const char *str, guint x, guint y)

{

GString *ret;

GQueue *q;

guint z = 0;

gboolean appended = FALSE;

gunichar c;

char *tag;

g_return_val_if_fail(str != NULL, NULL);

g_return_val_if_fail(x <= y, NULL);

if (x == y)

return g_strdup("");

ret = g_string_new("");

q = g_queue_new();

while (*str && (z < y)) {

c = g_utf8_get_char(str);

if (c == '<') {

char *end = strchr(str, '>');

if (!end) {

g_string_free(ret, TRUE);

while ((tag = g_queue_pop_head(q)))

g_free(tag);

g_queue_free(q);

return NULL;

}

if (!g_ascii_strncasecmp(str, "<img ", 5)) {

z += strlen("[Image]");

} else if (!g_ascii_strncasecmp(str, "<br", 3)) {

z += 1;

} else if (!g_ascii_strncasecmp(str, "<hr>", 4)) {

z += strlen("\n---\n");

} else if (!g_ascii_strncasecmp(str, "</", 2)) {

/* pop stack */

char *tmp;

tmp = g_queue_pop_head(q);

g_free(tmp);

/* z += 0; */

} else {

/* push it unto the stack */

char *tmp;

tmp = g_strndup(str, end - str + 1);

g_queue_push_head(q, tmp);

/* z += 0; */

}

if (z >= x) {

g_string_append_len(ret, str, end - str + 1);

}

str = end;

} else if (c == '&') {

char *end = strchr(str, ';');

if (!end) {

g_string_free(ret, TRUE);

while ((tag = g_queue_pop_head(q)))

g_free(tag);

g_queue_free(q);

return NULL;

}

if (z >= x)

g_string_append_len(ret, str, end - str + 1);

z++;

str = end;

} else {

if (z == x && z > 0 && !appended) {

GList *l = q->tail;

while (l) {

tag = l->data;

g_string_append(ret, tag);

l = l->prev;

}

appended = TRUE;

}

if (z >= x)

g_string_append_unichar(ret, c);

z++;

}

str = g_utf8_next_char(str);

}

while ((tag = g_queue_pop_head(q))) {

char *name;

name = purple_markup_get_tag_name(tag);

g_string_append_printf(ret, "</%s>", name);

g_free(name);

g_free(tag);

}

g_queue_free(q);

return g_string_free(ret, FALSE);

}

/*
 * Extract the element name from a tag string such as "<font color=red>".
 *
 * @tag must be non-NULL and start with '<'; otherwise NULL is returned.
 * The name is everything after the '<' up to the first '>', ' ' or '/', or
 * the end of the string.  Returns a newly allocated string owned by the
 * caller.
 */
char *
purple_markup_get_tag_name(const char *tag)
{
	const char *name;
	const char *stop;

	g_return_val_if_fail(tag != NULL, NULL);
	g_return_val_if_fail(*tag == '<', NULL);

	name = tag + 1;
	stop = name;
	while (*stop != '\0' && *stop != '>' && *stop != ' ' && *stop != '/')
		stop++;

	return g_strndup(name, stop - name);
}

pidgin/pidgin