* Purple is the legal property of its developers, whose names are too * numerous to list here. Please refer to the COPYRIGHT file distributed * with this source distribution * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or (at * your option) any later version. * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA /* A single internal (that don't have any children) consists * of 256 + 4 pointers. That's 1040 bytes on 32-bit machine or 2080 bytes * Thus, in 10500-byte pool block we can hold about 5-10 internal states. * Threshold of 100 states means, we'd need 10-20 "small" blocks before * switching to ~1-2 large blocks. #define PURPLE_TRIE_LARGE_THRESHOLD 100 #define PURPLE_TRIE_STATES_SMALL_POOL_BLOCK_SIZE 10880 #define PURPLE_TRIE_STATES_LARGE_POOL_BLOCK_SIZE 102400 typedef struct _PurpleTrieRecord PurpleTrieRecord; typedef struct _PurpleTrieState PurpleTrieState; typedef struct _PurpleTrieRecordList PurpleTrieRecordList; * The trie object instance. PurpleMemoryPool *records_str_mempool; PurpleMemoryPool *records_obj_mempool; PurpleTrieRecordList *records; gsize records_total_size; PurpleMemoryPool *states_mempool; PurpleTrieState *root_state; struct _PurpleTrieRecordList PurpleTrieRecordList *next; PurpleTrieRecordList *prev; PurpleTrieState **children; PurpleTrieState *longest_suffix; PurpleTrieRecord *found_word; PurpleTrieState *root_state; PurpleTrieReplaceCb replace_cb; PurpleTrieFindCb find_cb; /* TODO: an option to make it eager or lazy (now, it's eager) */ static GParamSpec *properties[PROP_LAST]; G_DEFINE_TYPE_WITH_PRIVATE(PurpleTrie, purple_trie, G_TYPE_OBJECT); /******************************************************************************* ******************************************************************************/ static PurpleTrieRecordList * purple_record_list_new(PurpleMemoryPool *mpool, PurpleTrieRecord *rec) PurpleTrieRecordList *node; node = purple_memory_pool_alloc0(mpool, sizeof(PurpleTrieRecordList), sizeof(gpointer)); g_return_val_if_fail(node != NULL, NULL); static PurpleTrieRecordList * purple_record_list_prepend(PurpleMemoryPool *mpool, PurpleTrieRecordList *old_head, PurpleTrieRecord *rec) PurpleTrieRecordList *new_head; new_head = purple_record_list_new(mpool, rec); g_return_val_if_fail(new_head != NULL, NULL); new_head->next = old_head; old_head->prev = new_head; static PurpleTrieRecordList * purple_record_list_copy(PurpleMemoryPool *mpool, const PurpleTrieRecordList *head) PurpleTrieRecordList *new_head = NULL, *new_tail = NULL; PurpleTrieRecordList *node; node = purple_record_list_new(mpool, head->rec); g_return_val_if_fail(node != NULL, NULL); /* there is no leak */ static PurpleTrieRecordList * purple_record_list_remove(PurpleTrieRecordList *head, PurpleTrieRecordList *node) g_return_val_if_fail(head != NULL, NULL); g_return_val_if_fail(node != NULL, head); g_return_val_if_fail(head->prev == NULL, NULL); g_return_val_if_fail(node->prev != NULL, NULL); node->prev->next = node->next; node->next->prev = node->prev; /******************************************************************************* ******************************************************************************/ purple_trie_states_cleanup(PurpleTriePrivate *priv) if (priv->root_state != NULL) { purple_memory_pool_cleanup(priv->states_mempool); /* Allocates a state and binds it to the parent. */ purple_trie_state_new(PurpleTriePrivate *priv, PurpleTrieState *parent, state = purple_memory_pool_alloc0(priv->states_mempool, sizeof(PurpleTrieState), sizeof(gpointer)); g_return_val_if_fail(state != NULL, NULL); if (parent->children == NULL) { parent->children = purple_memory_pool_alloc0( /* PurpleTrieState *children[G_MAXUCHAR + 1] */ if (parent->children == NULL) { purple_memory_pool_free(priv->states_mempool, state); parent->children[character] = state; purple_trie_states_build(PurpleTriePrivate *priv) PurpleMemoryPool *reclist_mpool; PurpleTrieRecordList *reclist, *it; if (priv->root_state != NULL) if (priv->records_total_size < PURPLE_TRIE_LARGE_THRESHOLD) { purple_memory_pool_set_block_size(priv->states_mempool, PURPLE_TRIE_STATES_SMALL_POOL_BLOCK_SIZE); purple_memory_pool_set_block_size(priv->states_mempool, PURPLE_TRIE_STATES_LARGE_POOL_BLOCK_SIZE); priv->root_state = root = purple_trie_state_new(priv, NULL, '\0'); g_return_val_if_fail(root != NULL, FALSE); g_assert(root->longest_suffix == NULL); /* reclist is a list of words not yet added to the trie. Shorter words * are removed from the list, when they are fully added to the trie. */ reclist_mpool = purple_memory_pool_new(); reclist = purple_record_list_copy(reclist_mpool, priv->records); /* extra_data on every element of reclist will be a pointer to a trie * node -- the prefix of the word with len of cur_len */ for (it = reclist; it != NULL; it = it->next) { /* Iterate over indexes of words -- every loop iteration checks certain * index of all remaining words. Loop finishes when there are no words * longer than cur_len. */ for (cur_len = 0; reclist != NULL; cur_len++) { for (it = reclist; it; it = it->next) { PurpleTrieRecord *rec = it->rec; guchar character = rec->word[cur_len]; PurpleTrieState *prefix = it->extra_data; PurpleTrieState *lon_suf_parent; g_assert(character != '\0'); if (prefix->children && prefix->children[character]) { /* Word's prefix is already in the trie, added prefix = prefix->children[character]; /* We need to create a new branch of trie. */ prefix = purple_trie_state_new(priv, prefix, g_object_unref(reclist_mpool); /* prefix is now of length increased by one character. */ /* The whole word is now added to the trie. */ if (rec->word[cur_len + 1] == '\0') { if (prefix->found_word == NULL) prefix->found_word = rec; purple_debug_warning("trie", "found " "a collision of \"%s\" words", /* "it" is not modified here, so it->next is reclist = purple_record_list_remove(reclist, it); /* We need to fill the longest_suffix field -- a longest * complete suffix of the prefix we created. We look for * that suffix in any path starting in root and ending * in the (cur_len - 1) level of trie. */ if (prefix->longest_suffix != NULL) lon_suf_parent = prefix->parent->longest_suffix; if (lon_suf_parent->children && lon_suf_parent->children[character]) prefix->longest_suffix = lon_suf_parent-> lon_suf_parent = lon_suf_parent->longest_suffix; if (prefix->longest_suffix == NULL) prefix->longest_suffix = root; if (prefix->found_word == NULL) { prefix->longest_suffix->found_word; g_object_unref(reclist_mpool); /******************************************************************************* ******************************************************************************/ purple_trie_advance(PurpleTrieMachine *m, const guchar character) /* change state after processing a character */ /* Perfect fit - next character is the same, as the child of the * prefix we reached so far. */ if (m->state->children && m->state->children[character]) { m->state = m->state->children[character]; /* We reached root, that's a pity. */ if (m->state == m->root_state) /* Let's try a bit shorter suffix. */ m->state = m->state->longest_suffix; purple_trie_replace_do_replacement(PurpleTrieMachine *m, GString *out) gboolean was_replaced = FALSE; /* if we reached a "found" state, let's process it */ if (!m->state->found_word) /* let's get back to the beginning of the word */ g_assert(out->len >= m->state->found_word->word_len - 1); out->len -= m->state->found_word->word_len - 1; was_replaced = m->replace_cb(out, m->state->found_word->word, m->state->found_word->data, m->user_data); /* output was untouched, revert to the previous position */ if (was_replaced || m->reset_on_match) m->state = m->root_state; purple_trie_find_do_discovery(PurpleTrieMachine *m) /* if we reached a "found" state, let's process it */ if (!m->state->found_word) was_accepted = m->find_cb(m->state->found_word->word, m->state->found_word->data, m->user_data); if (was_accepted && m->reset_on_match) m->state = m->root_state; purple_trie_replace(PurpleTrie *trie, const gchar *src, PurpleTrieReplaceCb replace_cb, gpointer user_data) PurpleTriePrivate *priv = NULL; PurpleTrieMachine machine; g_return_val_if_fail(replace_cb != NULL, g_strdup(src)); g_return_val_if_fail(PURPLE_IS_TRIE(trie), NULL); priv = purple_trie_get_instance_private(trie); purple_trie_states_build(priv); machine.state = priv->root_state; machine.root_state = priv->root_state; machine.reset_on_match = priv->reset_on_match; machine.replace_cb = replace_cb; machine.user_data = user_data; out = g_string_new(NULL); guchar character = src[i++]; purple_trie_advance(&machine, character); was_replaced = purple_trie_replace_do_replacement(&machine, out); /* We skipped a character without finding any records, * let's just copy it to the output. */ g_string_append_c(out, character); return g_string_free(out, FALSE); purple_trie_multi_replace(const GSList *tries, const gchar *src, PurpleTrieReplaceCb replace_cb, gpointer user_data) guint tries_count, m_idx; PurpleTrieMachine *machines; g_return_val_if_fail(replace_cb != NULL, g_strdup(src)); tries_count = g_slist_length((GSList*)tries); /* Initialize all machines. */ machines = g_new(PurpleTrieMachine, tries_count); for (i = 0; i < tries_count; i++, tries = tries->next) { PurpleTrie *trie = tries->data; PurpleTriePrivate *priv = NULL; if (!PURPLE_TRIE(trie)) { priv = purple_trie_get_instance_private(trie); purple_trie_states_build(priv); machines[i].state = priv->root_state; machines[i].root_state = priv->root_state; machines[i].reset_on_match = priv->reset_on_match; machines[i].replace_cb = replace_cb; machines[i].user_data = user_data; out = g_string_new(NULL); guchar character = src[i++]; gboolean was_replaced = FALSE; /* Advance every machine and possibly perform a replacement. */ for (m_idx = 0; m_idx < tries_count; m_idx++) { purple_trie_advance(&machines[m_idx], character); was_replaced = purple_trie_replace_do_replacement( /* We skipped a character without finding any records, * let's just copy it to the output. */ g_string_append_c(out, character); /* If we replaced a word, reset _all_ machines */ for (m_idx = 0; m_idx < tries_count; m_idx++) { machines[m_idx].root_state; return g_string_free(out, FALSE); purple_trie_find(PurpleTrie *trie, const gchar *src, PurpleTrieFindCb find_cb, gpointer user_data) PurpleTriePrivate *priv = NULL; PurpleTrieMachine machine; g_return_val_if_fail(PURPLE_IS_TRIE(trie), 0); priv = purple_trie_get_instance_private(trie); purple_trie_states_build(priv); machine.state = priv->root_state; machine.root_state = priv->root_state; machine.reset_on_match = priv->reset_on_match; machine.find_cb = find_cb; machine.user_data = user_data; guchar character = src[i++]; purple_trie_advance(&machine, character); was_found = purple_trie_find_do_discovery(&machine); purple_trie_multi_find(const GSList *tries, const gchar *src, PurpleTrieFindCb find_cb, gpointer user_data) guint tries_count, m_idx; PurpleTrieMachine *machines; tries_count = g_slist_length((GSList*)tries); /* Initialize all machines. */ machines = g_new(PurpleTrieMachine, tries_count); for (i = 0; i < tries_count; i++, tries = tries->next) { PurpleTrie *trie = tries->data; PurpleTriePrivate *priv = NULL; if (!PURPLE_IS_TRIE(trie)) { priv = purple_trie_get_instance_private(trie); purple_trie_states_build(priv); machines[i].state = priv->root_state; machines[i].root_state = priv->root_state; machines[i].reset_on_match = priv->reset_on_match; machines[i].find_cb = find_cb; machines[i].user_data = user_data; guchar character = src[i++]; gboolean was_found = FALSE; /* Advance every machine and possibly perform a replacement. */ for (m_idx = 0; m_idx < tries_count; m_idx++) { purple_trie_advance(&machines[m_idx], character); purple_trie_find_do_discovery(&machines[m_idx]); /* If we replaced a word, reset _all_ machines */ for (m_idx = 0; m_idx < tries_count; m_idx++) { if (!machines[m_idx].reset_on_match) machines[m_idx].root_state; /******************************************************************************* ******************************************************************************/ purple_trie_add(PurpleTrie *trie, const gchar *word, gpointer data) PurpleTriePrivate *priv = NULL; g_return_val_if_fail(PURPLE_IS_TRIE(trie), FALSE); g_return_val_if_fail(word != NULL, FALSE); g_return_val_if_fail(word[0] != '\0', FALSE); priv = purple_trie_get_instance_private(trie); if (g_hash_table_lookup(priv->records_map, word) != NULL) { purple_debug_warning("trie", "record exists: %s", word); /* Every change in a trie invalidates longest_suffix map. * These prefixes could be updated instead of cleaning the whole graph. purple_trie_states_cleanup(priv); rec = purple_memory_pool_alloc(priv->records_obj_mempool, sizeof(PurpleTrieRecord), sizeof(gpointer)); rec->word = purple_memory_pool_strdup(priv->records_str_mempool, word); rec->word_len = strlen(word); g_assert(rec->word_len > 0); priv->records_total_size += rec->word_len; priv->records = purple_record_list_prepend(priv->records_obj_mempool, g_hash_table_insert(priv->records_map, rec->word, priv->records); purple_trie_remove(PurpleTrie *trie, const gchar *word) PurpleTriePrivate *priv = NULL; PurpleTrieRecordList *it; g_return_if_fail(PURPLE_IS_TRIE(trie)); g_return_if_fail(word != NULL); g_return_if_fail(word[0] != '\0'); priv = purple_trie_get_instance_private(trie); it = g_hash_table_lookup(priv->records_map, word); /* see purple_trie_add */ purple_trie_states_cleanup(priv); priv->records_total_size -= it->rec->word_len; priv->records = purple_record_list_remove(priv->records, it); g_hash_table_remove(priv->records_map, it->rec->word); purple_memory_pool_free(priv->records_str_mempool, it->rec->word); purple_memory_pool_free(priv->records_obj_mempool, it->rec); purple_memory_pool_free(priv->records_obj_mempool, it); purple_trie_get_size(PurpleTrie *trie) PurpleTriePrivate *priv = NULL; g_return_val_if_fail(PURPLE_IS_TRIE(trie), 0); priv = purple_trie_get_instance_private(trie); return g_hash_table_size(priv->records_map); /******************************************************************************* ******************************************************************************/ purple_trie_get_reset_on_match(PurpleTrie *trie) PurpleTriePrivate *priv = NULL; g_return_val_if_fail(PURPLE_IS_TRIE(trie), FALSE); priv = purple_trie_get_instance_private(trie); return priv->reset_on_match; purple_trie_set_reset_on_match(PurpleTrie *trie, gboolean reset) PurpleTriePrivate *priv = NULL; g_return_if_fail(PURPLE_IS_TRIE(trie)); priv = purple_trie_get_instance_private(trie); priv->reset_on_match = reset; g_object_notify_by_pspec(G_OBJECT(trie), properties[PROP_RESET_ON_MATCH]); /******************************************************************************* ******************************************************************************/ return g_object_new(PURPLE_TYPE_TRIE, NULL); purple_trie_init(PurpleTrie *trie) PurpleTriePrivate *priv = purple_trie_get_instance_private(trie); priv->records_obj_mempool = purple_memory_pool_new(); priv->records_str_mempool = purple_memory_pool_new(); priv->states_mempool = purple_memory_pool_new(); purple_memory_pool_set_block_size(priv->states_mempool, PURPLE_TRIE_STATES_SMALL_POOL_BLOCK_SIZE); priv->records_map = g_hash_table_new(g_str_hash, g_str_equal); purple_trie_finalize(GObject *obj) PurpleTriePrivate *priv = purple_trie_get_instance_private(PURPLE_TRIE(obj)); g_hash_table_destroy(priv->records_map); g_object_unref(priv->records_obj_mempool); g_object_unref(priv->records_str_mempool); g_object_unref(priv->states_mempool); G_OBJECT_CLASS(purple_trie_parent_class)->finalize(obj); purple_trie_get_property(GObject *obj, guint param_id, GValue *value, PurpleTrie *trie = PURPLE_TRIE(obj); PurpleTriePrivate *priv = purple_trie_get_instance_private(trie); case PROP_RESET_ON_MATCH: g_value_set_boolean(value, priv->reset_on_match); G_OBJECT_WARN_INVALID_PROPERTY_ID(obj, param_id, pspec); purple_trie_set_property(GObject *obj, guint param_id, const GValue *value, GParamSpec *pspec) PurpleTrie *trie = PURPLE_TRIE(obj); PurpleTriePrivate *priv = purple_trie_get_instance_private(trie); case PROP_RESET_ON_MATCH: priv->reset_on_match = g_value_get_boolean(value); G_OBJECT_WARN_INVALID_PROPERTY_ID(obj, param_id, pspec); purple_trie_class_init(PurpleTrieClass *klass) GObjectClass *obj_class = G_OBJECT_CLASS(klass); obj_class->finalize = purple_trie_finalize; obj_class->get_property = purple_trie_get_property; obj_class->set_property = purple_trie_set_property; properties[PROP_RESET_ON_MATCH] = g_param_spec_boolean("reset-on-match", "Reset on match", "Determines, if the search state machine " "should be reset to the initial state on every match. This " "ensures, that every match is distinct from each other. " "Please note, that it's not well-defined for a replace " "operation, so it's better to leave this value default, unless " "you perform only find operations.", TRUE, G_PARAM_READWRITE | G_PARAM_CONSTRUCT | G_PARAM_STATIC_STRINGS); g_object_class_install_properties(obj_class, PROP_LAST, properties);