/*
 * Adium is the legal property of its developers, whose names are listed in the copyright file included
 * with this source distribution.
 *
 * This program is free software; you can redistribute it and/or modify it under the terms of the GNU
 * General Public License as published by the Free Software Foundation; either version 2 of the License,
 * or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
 * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
 * Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with this program; if not,
 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

#import "GetMetadataForHTMLLog.h"
#import "GetMetadataForHTMLLog-Additions.h"
#import <AIUtilities/ISO8601DateFormatter.h>

static char *gaim_markup_strip_html(const char *str);

/*!
 * @brief Given an Adium log file name, return an NSDate for its creation date
 *
 * The date is the text between the last "(" in the path and the first ")" that follows it.
 * Any "|" in that text is replaced with "-" before it is handed to the ISO 8601 formatter,
 * whose time separator is set to '.'.
 *
 * @param pathToFile Path (or file name) of the log
 * @result The parsed date, or nil if the path holds no parenthesized date or it cannot be parsed
 */
static NSDate *dateFromHTMLLog(NSString *pathToFile)
{
	ISO8601DateFormatter *formatter = [[[ISO8601DateFormatter alloc] init] autorelease];
	formatter.timeSeparator = '.';

	NSRange openParenRange = [pathToFile rangeOfString:@"(" options:NSBackwardsSearch];
	if (openParenRange.location == NSNotFound) {
		return nil;
	}

	//Only look for the closing parenthesis after the opening one
	NSRange searchRange = NSMakeRange(openParenRange.location, [pathToFile length] - openParenRange.location);
	NSRange closeParenRange = [pathToFile rangeOfString:@")" options:0 range:searchRange];
	if (closeParenRange.location == NSNotFound) {
		return nil;
	}

	//Add one to the start and subtract one from the length to remove both parentheses
	NSRange dateRange = NSMakeRange(openParenRange.location + 1,
									(closeParenRange.location - openParenRange.location) - 1);
	NSString *dateString = [pathToFile substringWithRange:dateRange];

	return [formatter dateFromString:[dateString stringByReplacingOccurrencesOfString:@"|" withString:@"-"]];
}

/*!
 * @brief Produce the plain text content of an HTML log
 *
 * @param logData The raw bytes of the log; passed to the C HTML stripper as a UTF-8 string
 * @result A new string which the caller owns and must release ("Copy" rule), or nil if there is
 *         no data, memory could not be allocated, or the stripped text is not valid UTF-8
 */
NSString *CopyTextContentForHTMLLogData(NSData *logData) {
	/* Perhaps we want to decode the HTML instead of stripping it so we can process
	 * the attributed contents to turn links into link (URL) for searching purposes...
	 */
	NSString *textContent = nil;
	NSUInteger length = [logData length];

	if (length == 0) {
		return nil;
	}

	//-[NSData bytes] is not NUL-terminated, but the stripper works on C strings; make a terminated copy
	char *UTF8HTMLCString = malloc(length + 1);
	if (!UTF8HTMLCString) {
		return nil;
	}
	memcpy(UTF8HTMLCString, [logData bytes], length);
	UTF8HTMLCString[length] = '\0';

	char *plainText = gaim_markup_strip_html(UTF8HTMLCString);
	free(UTF8HTMLCString);

	if (plainText) {
		textContent = [[NSString alloc] initWithUTF8String:plainText];
		free(plainText);
	}

	return textContent;
}

/*!
 * @brief Fill in the Spotlight attributes for an HTML log
 *
 * @param attributes Dictionary which receives the metadata attributes
 * @param pathToFile Full path to the log file
 * @result TRUE
 */
Boolean GetMetadataForHTMLLog(NSMutableDictionary *attributes, NSString *pathToFile)
{
	/* HTML log is stored as ServiceID.Account_Name/Destination_Name/Destination_Name (2006|03|30).AdiumHTMLLog
	 * or HTML log is stored as ServiceID.Account_Name/Destination_Name/Destination_Name (2006-03-30).AdiumHTMLLog
	 */
	NSArray *pathComponents = [pathToFile pathComponents];
	NSUInteger count = [pathComponents count];
	NSString *toUID = ((count >= 2) ? [pathComponents objectAtIndex:(count - 2)] : nil);
	NSString *sourceFolder = ((count >= 3) ? [pathComponents objectAtIndex:(count - 3)] : nil);
	NSString *serviceClass, *fromUID;
	NSDate *date;
	NSArray *serviceAndFromUIDArray;

	/* Determine the service and fromUID - should be SERVICE.ACCOUNT_NAME
	 * Check against count to guard in case of old, malformed or otherwise odd folders & whatnot sitting in log base
	 */
	serviceAndFromUIDArray = [sourceFolder componentsSeparatedByString:@"."];
	if ([serviceAndFromUIDArray count] >= 2) {
		serviceClass = [serviceAndFromUIDArray objectAtIndex:0];

		//Use substringFromIndex so we include the rest of the string in the case of a UID with a . in it
		fromUID = [sourceFolder substringFromIndex:([serviceClass length] + 1)]; //One off for the '.'

	} else {
		//Fallback: blank non-nil serviceClass; folderName as the fromUID
		serviceClass = @"";
		fromUID = sourceFolder;
	}

	if ((date = dateFromHTMLLog(pathToFile))) {
		[attributes setObject:date forKey:(NSString *)kMDItemContentCreationDate];
		[attributes setObject:date forKey:(NSString *)kMDItemLastUsedDate];
	}

	NSData *logData = [[NSData alloc] initWithContentsOfURL:[NSURL fileURLWithPath:pathToFile isDirectory:NO]
													options:NSDataReadingUncached
													  error:NULL];
	NSString *textContent = nil;
	if ((textContent = CopyTextContentForHTMLLogData(logData))) {
		[attributes setObject:textContent forKey:(NSString *)kMDItemTextContent];
		[textContent release];
	}
	[logData release];

	[attributes setObject:serviceClass forKey:@"com_adiumX_service"];

	//-setObject:forKey: raises on nil, and short or odd paths leave these unset
	if (fromUID) {
		[attributes setObject:fromUID forKey:@"com_adiumX_chatSource"];
	}
	if (toUID) {
		[attributes setObject:toUID forKey:@"com_adiumX_chatDestination"];

		//Without a date, use the destination alone rather than "<destination> on (null)"
		NSString *displayName = toUID;
		if (date) {
			displayName = [NSString stringWithFormat:@"%@ on %@",toUID,[date descriptionWithCalendarFormat:@"%y-%m-%d"
																								   timeZone:nil
																									 locale:nil]];
		}
		[attributes setObject:displayName forKey:(NSString *)kMDItemDisplayName];
	}

	[attributes setObject:@"Chat log" forKey:(NSString *)kMDItemKind];

	return TRUE;
}

#pragma mark Stripping HTML
//Taken from Gaim, 'cause I knew it was there. There may be an easier way to do this...

/* Stand-in for glib's g_ascii_isspace(); only a plain space counts as whitespace here. */
static BOOL g_ascii_isspace(char character)
{
	return (character == ' ');
}

/* Find the length of STRING, but scan at most MAXLEN characters.
   If no '\0' terminator is found in that many characters, return MAXLEN.  */
size_t
strnlen (const char *string, size_t maxlen)
{
	const char *end = memchr (string, '\0', maxlen);
	return end ? (size_t) (end - string) : maxlen;
}

/* Return a newly malloc'd, NUL-terminated copy of at most N characters of S,
   or NULL if the allocation fails. The caller frees the result.  */
char *strndup (const char *s, size_t n)
{
	size_t len = strnlen (s, n);
	char *nouveau = malloc (len + 1);

	if (nouveau == NULL)
		return NULL;

	nouveau[len] = '\0';
	return (char *) memcpy (nouveau, s, len);
}

/* Unescape the XML/HTML entities in HTML.
 * Returns a newly malloc'd UTF-8 C string which the caller frees, or NULL if HTML is NULL,
 * is not valid UTF-8, or cannot be unescaped.
 */
static char *gaim_unescape_html(const char *html) {
	if (!html)
		return NULL;

	NSString *unescapedString = [[NSString stringWithUTF8String:html] stringByUnescapingFromXMLWithEntities:nil];
	const char *unescapedStringUTF8String = [unescapedString UTF8String];
	if (!unescapedStringUTF8String)
		NSLog(@"Warning: Could not unescape %s, or could not make a UTF8 string out of %@",html,unescapedString);

	return (unescapedStringUTF8String ? strdup(unescapedStringUTF8String) : NULL);
}

/* Strip the HTML markup from STR, working in place on a duplicate of it.
 * Returns a newly malloc'd C string which the caller frees, or NULL if STR is NULL
 * or memory could not be allocated.
 *
 * The following are probably reasonable changes:
 * - \n should be converted to a normal space
 * - in addition to <br>, <p> and <div> etc. should also be converted into \n
 * - We want to turn </td>#whitespace<td> sequences into a single tab
 * - We want to turn <td> into a single tab (for msn profile "parsing")
 * - We want to turn </tr>#whitespace<tr> sequences into a single \n
 * - <script>...</script> and <style>...</style> should be completely removed
 */
static char *
gaim_markup_strip_html(const char *str)
{
	size_t i, j, k;		/* i: read index, j: write index, k: end of the current tag */
	BOOL visible = TRUE;
	BOOL closing_td_p = FALSE;
	char *str2;
	const char *cdata_close_tag = NULL;
	char *href = NULL;
	size_t href_st = 0;	/* write index at which the current link's text starts */

	if (!str)
		return NULL;

	str2 = strdup(str);
	if (!str2)
		return NULL;

	for (i = 0, j = 0; str2[i]; i++)
	{
		if (str2[i] == '<')
		{
			if (cdata_close_tag)
			{
				/* Note: Don't even assume any other tag is a tag in CDATA */
				if (strncasecmp(str2 + i, cdata_close_tag, strlen(cdata_close_tag)) == 0)
				{
					i += strlen(cdata_close_tag) - 1;
					cdata_close_tag = NULL;
				}
				continue;
			}
			else if (strncasecmp(str2 + i, "<td", 3) == 0 && closing_td_p)
			{
				str2[j++] = '\t';
				visible = TRUE;
			}
			else if (strncasecmp(str2 + i, "</td>", 5) == 0)
			{
				closing_td_p = TRUE;
				visible = FALSE;
			}
			else
			{
				closing_td_p = FALSE;
				visible = TRUE;
			}

			k = i + 1;

			if(g_ascii_isspace(str2[k]))
				visible = TRUE;
			else if (str2[k])
			{
				/* Scan until we end the tag either implicitly - closed start
				 * tag - or explicitly, using a sloppy method (i.e., < or >
				 * inside quoted attributes will screw us up)
				 */
				while (str2[k] && str2[k] != '<' && str2[k] != '>')
				{
					k++;
				}

				/* If we've got an <a> tag with an href, save the address
				 * to print later. */
				if (strncasecmp(str2 + i, "<a", 2) == 0 && g_ascii_isspace(str2[i+2]))
				{
					size_t st; /* start of href, inclusive [ */
					size_t end; /* end of href, exclusive ) */
					char delim = ' ';

					/* Find start of href */
					for (st = i + 3; st < k; st++)
					{
						if (strncasecmp(str2+st, "href=", 5) == 0)
						{
							st += 5;
							if (str2[st] == '"' || str2[st] == '\'')
							{
								delim = str2[st];
								st++;
							}
							break;
						}
					}

					/* find end of address */
					for (end = st; end < k && str2[end] != delim; end++)
					{
						/* All the work is done in the loop construct above. */
					}

					/* If there's an address, save it.  If there was
					 * already one saved, kill it. */
					if (st < k)
					{
						char *tmp;

						free(href);
						tmp = strndup(str2 + st, end - st);
						href = gaim_unescape_html(tmp);
						free(tmp);
						href_st = j;
					}
				}

				/* Replace </a> with an ascii representation of the
				 * address the link was pointing to. */
				else if (href != NULL && strncasecmp(str2 + i, "</a>", 4) == 0)
				{
					size_t hrlen = strlen(href);

					/* Only insert the href if it's different from the CDATA. */
					if ((hrlen != j - href_st || strncmp(str2 + href_st, href, hrlen)) &&
						(hrlen != j - href_st + 7 || /* 7 == strlen("http://") */
						 strncmp(str2 + href_st, href + 7, hrlen - 7)))
					{
						/* We write " (href)" in place; never let the write index pass the
						 * first unread character (i + 4, just after this "</a>"). */
						if (j + hrlen + 3 <= i + 4)
						{
							str2[j++] = ' ';
							str2[j++] = '(';
							memmove(str2 + j, href, hrlen);
							j += hrlen;
							str2[j++] = ')';
						}
						free(href);
						href = NULL;
					}
				}

				/* Check for tags which should be mapped to newline */
				else if (strncasecmp(str2 + i, "<p>", 3) == 0
						 || strncasecmp(str2 + i, "<tr", 3) == 0
						 || strncasecmp(str2 + i, "<br", 3) == 0
						 || strncasecmp(str2 + i, "<li", 3) == 0
						 || strncasecmp(str2 + i, "<div", 4) == 0
						 || strncasecmp(str2 + i, "</table>", 8) == 0) {
					str2[j++] = '\n';
				}

				/* Check for tags which begin CDATA and need to be closed */
				else if (strncasecmp(str2 + i, "<script", 7) == 0) {
					cdata_close_tag = "</script>";
				}
				else if (strncasecmp(str2 + i, "<style", 6) == 0) {
					cdata_close_tag = "</style>";
				}

				/* Update the index and continue checking after the tag */
				i = (str2[k] == '<' || str2[k] == '\0')? k - 1: k;
				continue;
			}
		}
		else if (cdata_close_tag)
		{
			continue;
		}
		else if (!g_ascii_isspace(str2[i]))
		{
			visible = TRUE;
		}

		/* XXX: This sucks.  We need to be un-escaping all entities, which
		 * includes these, as well as the &#num; ones */

		if (str2[i] == '&' && strncasecmp(str2 + i, "&quot;", 6) == 0)
		{
			str2[j++] = '\"';
			i = i + 5;
			continue;
		}

		if (str2[i] == '&' && strncasecmp(str2 + i, "&amp;", 5) == 0)
		{
			str2[j++] = '&';
			i = i + 4;
			continue;
		}

		if (str2[i] == '&' && strncasecmp(str2 + i, "&lt;", 4) == 0)
		{
			str2[j++] = '<';
			i = i + 3;
			continue;
		}

		if (str2[i] == '&' && strncasecmp(str2 + i, "&gt;", 4) == 0)
		{
			str2[j++] = '>';
			i = i + 3;
			continue;
		}

		if (str2[i] == '&' && strncasecmp(str2 + i, "&apos;", 6) == 0)
		{
			str2[j++] = '\'';
			i = i + 5;
			continue;
		}

		if (visible)
			str2[j++] = g_ascii_isspace(str2[i])? ' ': str2[i];
	}

	free(href);

	str2[j] = '\0';

	return str2;
}