HGKeeper

/*
 * Adium is the legal property of its developers, whose names are listed in the copyright file included
 * with this source distribution.
 *
 * This program is free software; you can redistribute it and/or modify it under the terms of the GNU
 * General Public License as published by the Free Software Foundation; either version 2 of the License,
 * or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
 * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
 * Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with this program; if not,
 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

#import "GetMetadataForHTMLLog.h"

#import "GetMetadataForHTMLLog-Additions.h"

#include <sys/stat.h>

#import <AIUtilities/ISO8601DateFormatter.h>

static char *gaim_markup_strip_html(const char *str);

/*
 * Given an Adium log file name, return an NSDate for its creation date.
 *
 * The date is taken from the text between the last "(" in the path and the first ")" after it,
 * e.g. "Destination_Name (2006-03-30).AdiumHTMLLog". Any '|' in that text is replaced with '-'
 * before parsing, and '.' is used as the time separator.
 *
 * pathToFile: Path (or file name) of the log.
 * Returns: Whatever the ISO 8601 formatter yields for the extracted text, or nil if the path
 *          has no "(...)" section.
 */
static NSDate *dateFromHTMLLog(NSString *pathToFile)
{
	ISO8601DateFormatter *formatter = [[[ISO8601DateFormatter alloc] init] autorelease];
	formatter.timeSeparator = '.';

	NSRange openParenRange = [pathToFile rangeOfString:@"(" options:NSBackwardsSearch];
	if (openParenRange.location == NSNotFound) {
		return nil;
	}

	//Only look for the closing parenthesis after the last opening one
	NSRange searchRange = NSMakeRange(openParenRange.location, [pathToFile length] - openParenRange.location);
	NSRange closeParenRange = [pathToFile rangeOfString:@")" options:0 range:searchRange];
	if (closeParenRange.location == NSNotFound) {
		return nil;
	}

	//Add one to the start and subtract one from the length so neither parenthesis is included
	NSRange dateRange = NSMakeRange(openParenRange.location + 1,
									(closeParenRange.location - openParenRange.location) - 1);
	NSString *dateString = [pathToFile substringWithRange:dateRange];

	return [formatter dateFromString:[dateString stringByReplacingOccurrencesOfString:@"|" withString:@"-"]];
}

/*
 * Return the text of an HTML log with its markup stripped.
 *
 * logData: The raw bytes of the log; they are interpreted as UTF-8.
 * Returns: A new string which the caller owns and must release, or nil if there is no data,
 *          memory could not be allocated, or the stripped text is not valid UTF-8.
 */
NSString *CopyTextContentForHTMLLogData(NSData *logData) {
	/* Perhaps we want to decode the HTML instead of stripping it so we can process
	 * the attributed contents to turn links into link (URL) for searching purposes...
	 */
	NSString *textContent = nil;
	const void *logBytes = [logData bytes];
	NSUInteger logLength = [logData length];

	if (logBytes && logLength > 0) {
		//NSData's bytes are not NUL-terminated, but the stripper expects a C string; make a terminated copy
		char *UTF8HTMLCString = malloc(logLength + 1);

		if (UTF8HTMLCString) {
			memcpy(UTF8HTMLCString, logBytes, logLength);
			UTF8HTMLCString[logLength] = '\0';

			//Strip the HTML markup
			char *plainText = gaim_markup_strip_html(UTF8HTMLCString);

			//-initWithUTF8String: must not be handed NULL
			if (plainText) {
				textContent = [[NSString alloc] initWithUTF8String:plainText];
				free(plainText);
			}

			free(UTF8HTMLCString);
		}
	}

	return textContent;
}

/*
 * Fill in the metadata attributes for the HTML log at pathToFile.
 *
 * The service, source account and destination are derived from the last path components;
 * the date comes from the file name and the text content from the file itself.
 *
 * attributes: Dictionary which receives the kMDItem* and com_adiumX_* values.
 * pathToFile: Full path of the log file.
 * Returns: Always TRUE.
 */
Boolean GetMetadataForHTMLLog(NSMutableDictionary *attributes, NSString *pathToFile)
{
	/* HTML log is stored as ServiceID.Account_Name/Destination_Name/Destination_Name (2006|03|30).AdiumHTMLLog
	 * or HTML log is stored as ServiceID.Account_Name/Destination_Name/Destination_Name (2006-03-30).AdiumHTMLLog
	 */
	NSArray *pathComponents = [pathToFile pathComponents];
	NSUInteger count = [pathComponents count];
	NSString *toUID = ((count >= 2) ? [pathComponents objectAtIndex:(count - 2)] : nil);
	NSString *sourceFolder = ((count >= 3) ? [pathComponents objectAtIndex:(count - 3)] : nil);
	NSString *serviceClass, *fromUID;
	NSArray *serviceAndFromUIDArray;

	/* Determine the service and fromUID - should be SERVICE.ACCOUNT_NAME
	 * Check against count to guard in case of old, malformed or otherwise odd folders & whatnot sitting in log base
	 */
	serviceAndFromUIDArray = [sourceFolder componentsSeparatedByString:@"."];

	if ([serviceAndFromUIDArray count] >= 2) {
		serviceClass = [serviceAndFromUIDArray objectAtIndex:0];

		//Use substringFromIndex so we include the rest of the string in the case of a UID with a . in it
		fromUID = [sourceFolder substringFromIndex:([serviceClass length] + 1)]; //One off for the '.'

	} else {
		//Fallback: blank non-nil serviceClass; folderName as the fromUID
		serviceClass = @"";
		fromUID = sourceFolder;
	}

	NSDate *date;
	if ((date = dateFromHTMLLog(pathToFile))) {
		[attributes setObject:date
					   forKey:(NSString *)kMDItemContentCreationDate];
		[attributes setObject:date
					   forKey:(NSString *)kMDItemLastUsedDate];
	}

	NSData *logData = [[NSData alloc] initWithContentsOfURL:[NSURL fileURLWithPath:pathToFile isDirectory:NO]
													options:NSDataReadingUncached error:NULL];
	NSString *textContent = nil;
	if ((textContent = CopyTextContentForHTMLLogData(logData))) {
		[attributes setObject:textContent
					   forKey:(NSString *)kMDItemTextContent];
	}
	[logData release];
	[textContent release];

	[attributes setObject:serviceClass
				   forKey:@"com_adiumX_service"];

	if (fromUID) {
		[attributes setObject:fromUID
					   forKey:@"com_adiumX_chatSource"];
	}

	if (toUID) {
		[attributes setObject:toUID
					   forKey:@"com_adiumX_chatDestination"];

		//Without a parsed date, use the destination alone rather than "toUID on (null)"
		NSString *displayName = toUID;
		if (date) {
			displayName = [NSString stringWithFormat:@"%@ on %@",toUID,[date descriptionWithCalendarFormat:@"%y-%m-%d"
																								   timeZone:nil
																									 locale:nil]];
		}
		[attributes setObject:displayName
					   forKey:(NSString *)kMDItemDisplayName];
	}

	[attributes setObject:@"Chat log"
				   forKey:(NSString *)kMDItemKind];

	return TRUE;
}

#pragma mark Stripping HTML

//Taken from Gaim, 'cause I knew it was there. There may be an easier way to do this...

/* Simplified whitespace test: only a plain space (' ') counts as whitespace here. */
static BOOL g_ascii_isspace(char character)
{
	if (character != ' ') {
		return NO;
	}

	return YES;
}

/* Return the length of STRING, examining no more than MAXLEN bytes.
   When no '\0' terminator occurs within those bytes, the result is MAXLEN. */
size_t
strnlen (const char *string, size_t maxlen)
{
	size_t length = 0;

	/* Stop at the first terminator or once MAXLEN bytes have been examined */
	while (length < maxlen && string[length] != '\0')
		length++;

	return length;
}

/* Copy at most N bytes of S into a newly malloc'd, NUL-terminated buffer.
   Returns NULL if the allocation fails; otherwise the caller frees the result. */
char *strndup (const char *s, size_t n)
{
	size_t copyLength = strnlen (s, n);
	char *copy = malloc (copyLength + 1);

	if (copy != NULL) {
		memcpy (copy, s, copyLength);
		copy[copyLength] = '\0';
	}

	return copy;
}

/*
 * Unescape the entities in a UTF-8 C string.
 *
 * The work is done by -stringByUnescapingFromXMLWithEntities:, which is not defined in this file
 * (presumably it comes from GetMetadataForHTMLLog-Additions.h - confirm).
 *
 * html: NUL-terminated UTF-8 text; may be NULL.
 * Returns: A strdup()'d C string which the caller must free(), or NULL if html is NULL or
 *          no UTF-8 result could be produced (a warning is logged in the latter case).
 */
static char *gaim_unescape_html(const char *html) {
	//+stringWithUTF8String: raises on a NULL argument, which we would get if the caller's strndup() failed
	if (!html) return NULL;

	NSString *unescapedString = [[NSString stringWithUTF8String:html] stringByUnescapingFromXMLWithEntities:nil];
	const char *unescapedStringUTF8String = [unescapedString UTF8String];

	if (!unescapedStringUTF8String) {
		NSLog(@"Warning: Could not unescape %s, or could not make a UTF8 string out of %@",html,unescapedString);
		return NULL;
	}

	return strdup(unescapedStringUTF8String);
}

/* The following are probably reasonable changes:
 * - \n should be converted to a normal space
 * - in addition to <br>, <p> and <div> etc. should also be converted into \n
 * - We want to turn </td>#whitespace<td> sequences into a single tab
 * - We want to turn <td> into a single tab (for msn profile "parsing")
 * - We want to turn </tr>#whitespace<tr> sequences into a single \n
 * - <script>...</script> and <style>...</style> should be completely removed
 */

/*
 * Strip the HTML markup from str and return the remaining text.
 *
 * The text is rewritten in place inside a copy of str: i is the read position, j the write
 * position. Tags are dropped; a <td> following a </td> becomes a tab; <p>, <tr, <br, <li, <div
 * and </table> become newlines; the contents of <script> and <style> are skipped; the href of
 * an <a> tag is appended as " (address)" at the matching </a> unless it equals the link text;
 * and the entities &quot; &amp; &lt; &gt; &apos; are replaced by their characters.
 *
 * str: NUL-terminated HTML; may be NULL.
 * Returns: A malloc'd C string which the caller must free(), or NULL if str is NULL or the
 *          copy could not be allocated.
 */
static char *
gaim_markup_strip_html(const char *str)
{
	size_t i, j, k;
	BOOL visible = TRUE;
	BOOL closing_td_p = FALSE;
	char *str2;
	const char *cdata_close_tag = NULL;
	char *href = NULL;
	size_t href_st = 0;

	if(!str)
		return NULL;

	str2 = strdup(str);
	if(!str2)
		return NULL;

	for (i = 0, j = 0; str2[i]; i++)
	{
		if (str2[i] == '<')
		{
			if (cdata_close_tag)
			{
				/* Note: Don't even assume any other tag is a tag in CDATA */
				if (strncasecmp(str2 + i, cdata_close_tag,
								strlen(cdata_close_tag)) == 0)
				{
					i += strlen(cdata_close_tag) - 1;
					cdata_close_tag = NULL;
				}
				continue;
			}
			else if (strncasecmp(str2 + i, "<td", 3) == 0 && closing_td_p)
			{
				str2[j++] = '\t';
				visible = TRUE;
			}
			else if (strncasecmp(str2 + i, "</td>", 5) == 0)
			{
				closing_td_p = TRUE;
				visible = FALSE;
			}
			else
			{
				closing_td_p = FALSE;
				visible = TRUE;
			}

			k = i + 1;

			if(g_ascii_isspace(str2[k]))
				visible = TRUE;
			else if (str2[k])
			{
				/* Scan until we end the tag either implicitly - closed start
				 * tag - or explicitly, using a sloppy method
				 * i.e., < or >
				 * inside quoted attributes will screw us up
				 */
				while (str2[k] && str2[k] != '<' && str2[k] != '>')
				{
					k++;
				}

				/* If we've got an <a> tag with an href, save the address
				 * to print later. */
				if (strncasecmp(str2 + i, "<a", 2) == 0 &&
					g_ascii_isspace(str2[i+2]))
				{
					size_t st; /* start of href, inclusive [ */
					size_t end; /* end of href, exclusive ) */
					char delim = ' ';

					/* Find start of href */
					for (st = i + 3; st < k; st++)
					{
						if (strncasecmp(str2+st, "href=", 5) == 0)
						{
							st += 5;
							if (str2[st] == '"')
							{
								delim = '"';
								st++;
							}
							break;
						}
					}

					/* find end of address */
					for (end = st; end < k && str2[end] != delim; end++)
					{
						/* All the work is done in the loop construct above. */
					}

					/* If there's an address, save it. If there was
					 * already one saved, kill it. */
					if (st < k)
					{
						char *tmp;
						free(href);
						tmp = strndup(str2 + st, end - st);
						href = gaim_unescape_html(tmp);
						free(tmp);
						href_st = j;
					}
				}

				/* Replace </a> with an ascii representation of the
				 * address the link was pointing to. */
				else if (href != NULL && strncasecmp(str2 + i, "</a>", 4) == 0)
				{
					size_t hrlen = strlen(href);

					/* Only insert the href if it's different from the CDATA.
					 * The second test compares against href + 7, i.e. the address
					 * without a 7 character prefix such as "http://". */
					if ((hrlen != j - href_st ||
						 strncmp(str2 + href_st, href, hrlen)) &&
						(hrlen != j - href_st + 7 ||
						 strncmp(str2 + href_st, href + 7, hrlen - 7))) {
						str2[j++] = ' ';
						str2[j++] = '(';
						memmove(str2 + j, href, hrlen);
						j += hrlen;
						str2[j++] = ')';
						free(href);
						href = NULL;
					}
				}

				/* Check for tags which should be mapped to newline */
				else if (strncasecmp(str2 + i, "<p>", 3) == 0
						 || strncasecmp(str2 + i, "<tr", 3) == 0
						 || strncasecmp(str2 + i, "<br", 3) == 0
						 || strncasecmp(str2 + i, "<li", 3) == 0
						 || strncasecmp(str2 + i, "<div", 4) == 0
						 || strncasecmp(str2 + i, "</table>", 8) == 0) {
					str2[j++] = '\n';
				}

				/* Check for tags which begin CDATA and need to be closed */
				else if (strncasecmp(str2 + i, "<script", 7) == 0) {
					cdata_close_tag = "</script>";
				}
				else if (strncasecmp(str2 + i, "<style", 6) == 0) {
					cdata_close_tag = "</style>";
				}

				/* Update the index and continue checking after the tag */
				i = (str2[k] == '<' || str2[k] == '\0')? k - 1: k;
				continue;
			}
		}
		else if (cdata_close_tag)
		{
			continue;
		}
		else if (!g_ascii_isspace(str2[i]))
		{
			visible = TRUE;
		}

		/* XXX: This sucks. We need to be un-escaping all entities, which
		 * includes these, as well as the &#num; ones */
		if (str2[i] == '&' && strncasecmp(str2 + i, "&quot;", 6) == 0)
		{
			str2[j++] = '\"';
			i = i + 5;
			continue;
		}

		if (str2[i] == '&' && strncasecmp(str2 + i, "&amp;", 5) == 0)
		{
			str2[j++] = '&';
			i = i + 4;
			continue;
		}

		if (str2[i] == '&' && strncasecmp(str2 + i, "&lt;", 4) == 0)
		{
			str2[j++] = '<';
			i = i + 3;
			continue;
		}

		if (str2[i] == '&' && strncasecmp(str2 + i, "&gt;", 4) == 0)
		{
			str2[j++] = '>';
			i = i + 3;
			continue;
		}

		if (str2[i] == '&' && strncasecmp(str2 + i, "&apos;", 6) == 0)
		{
			str2[j++] = '\'';
			i = i + 5;
			continue;
		}

		if (visible)
			str2[j++] = g_ascii_isspace(str2[i])? ' ': str2[i];
	}

	free(href);

	str2[j] = '\0';

	return str2;
}

adium/adium