talkatu/talkatu

Parents 18c1d213d019
Children 1feab6429217
Add TalkatuHtmlParser that abstracts out the traversal of HTML documents to make it easier to output other formats

Add an iterative html parser which will be used to create pangomarkup and gtktext later.

Testing Done:
Added some unit tests and ran them under valgrind. Also generated and checked the docs.

Reviewed at https://reviews.imfreedom.org/r/89/
--- a/talkatu/meson.build Thu Sep 03 02:18:02 2020 -0500
+++ b/talkatu/meson.build Thu Sep 03 20:11:33 2020 -0500
@@ -15,6 +15,7 @@
'talkatuhistory.h',
'talkatuhistoryrow.h',
'talkatuhtmlbuffer.h',
+ 'talkatuhtmlparser.h',
'talkatuinput.h',
'talkatulinkdialog.h',
'talkatumarkdownbuffer.h',
@@ -45,6 +46,7 @@
'talkatuhistory.c',
'talkatuhistoryrow.c',
'talkatuhtmlbuffer.c',
+ 'talkatuhtmlparser.c',
'talkatuinput.c',
'talkatulinkdialog.c',
'talkatumarkdownbuffer.c',
@@ -180,7 +182,7 @@
include_directories : [toplevel_inc, talkatu_inc],
link_with : talkatu,
sources : TALKATU_PUBLIC_BUILT_HEADERS, # Ensure they're built before use.
- dependencies : [GLIB, GOBJECT]
+ dependencies : [GLIB, GOBJECT, GTK3]
)
pkgconfig.generate(
--- a/talkatu/reference/talkatu-docs.xml Thu Sep 03 02:18:02 2020 -0500
+++ b/talkatu/reference/talkatu-docs.xml Thu Sep 03 20:11:33 2020 -0500
@@ -40,6 +40,8 @@
<xi:include href="xml/talkatuhistory.xml"/>
<xi:include href="xml/talkatuhistoryrow.xml"/>
+ <xi:include href="xml/talkatuhtmlparser.xml"/>
+
<xi:include href="xml/talkatuinput.xml"/>
<xi:include href="xml/talkatueditor.xml"/>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/talkatu/talkatuhtmlparser.c Thu Sep 03 20:11:33 2020 -0500
@@ -0,0 +1,307 @@
+/*
+ * talkatu
+ * Copyright (C) 2017-2020 Gary Kramlich <grim@reaperworld.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <gumbo.h>
+
+#include "talkatuhtmlparser.h"
+
+/**
+ * SECTION:talkatuhtmlparser
+ * @Title: HTML Parsing
+ * @Short_description: An Expat-like HTML Parser
+ *
+ * #TalkatuHtmlParser is an abstract class that will parse HTML and call the
+ * registered instance methods for each element.
+ */
+
+/**
+ * TALKATU_TYPE_HTML_PARSER:
+ *
+ * The standard _get_type macro for #TalkatuHtmlParser.
+ */
+
+/**
+ * TalkatuHtmlParserClass:
+ * @element_start: The method to call when an element is found. The attribute
+ * names and values are passed in as a %NULL terminated array of
+ * strings.
+ * @element_finish: The method to call when all children of an element have been
+ * processed.
+ * @text: The method to call when text or character data is found.
+ * @comment: The method to call when a comment is found. The passed in comment
+ * is the contents only and does not contain the start (<!--) and end
+ * (-->) tags.
+ *
+ * An abstract class that will walk an HTML document and call the instance
+ * methods of the child class for each node that is found.
+ */
+
+/* Evaluates to non-zero when @node is an element or template node, i.e. one
+ * of the node types whose children this file reads through `v.element`.
+ * @node is expanded twice, so it must not have side effects.
+ */
+#define _GUMBO_NODE_IS_CONTAINER(node) \
+	(GUMBO_NODE_ELEMENT == (node)->type || \
+	 GUMBO_NODE_TEMPLATE == (node)->type)
+
+/* Returns the node that follows @node in its parent's child list, or %NULL
+ * when @node has no parent, the parent is not an element/template node, or
+ * @node is the last child.
+ */
+static GumboNode *
+talkatu_html_parser_find_next_sibling(GumboNode *node) {
+	GumboNode *parent = node->parent;
+	const GumboVector *siblings = NULL;
+
+	if(parent == NULL || !_GUMBO_NODE_IS_CONTAINER(parent)) {
+		return NULL;
+	}
+
+	/* The parent's child vector together with our own
+	 * `index_within_parent` tells us whether anything comes after us.
+	 */
+	siblings = &parent->v.element.children;
+	if(node->index_within_parent == siblings->length - 1) {
+		return NULL;
+	}
+
+	return siblings->data[node->index_within_parent + 1];
+}
+
+/******************************************************************************
+ * Helper Implementations
+ *****************************************************************************/
+/* Collects the attributes of @element into two parallel %NULL terminated
+ * arrays and hands them to the subclass' element_start handler, if one is
+ * set.
+ *
+ * The arrays only hold pointers to the strings stored in @element's
+ * attributes and are freed as soon as the handler returns.
+ */
+static void
+talkatu_html_parser_element_start(TalkatuHtmlParser *parser,
+                                  const gchar *name,
+                                  GumboElement *element,
+                                  gpointer data)
+{
+	TalkatuHtmlParserClass *klass = TALKATU_HTML_PARSER_GET_CLASS(parser);
+	const gchar **names = NULL, **values = NULL;
+	guint length = 0, i = 0;
+
+	if(klass->element_start == NULL) {
+		return;
+	}
+
+	length = element->attributes.length;
+
+	/* Always allocate room for the terminator, even when there are no
+	 * attributes, so handlers receive the documented %NULL terminated arrays
+	 * rather than bare %NULL pointers.
+	 */
+	names = g_new(const gchar *, length + 1);
+	values = g_new(const gchar *, length + 1);
+
+	for(i = 0; i < length; i++) {
+		const GumboAttribute *attr = element->attributes.data[i];
+
+		names[i] = attr->name;
+		values[i] = attr->value;
+	}
+
+	/* add our terminating null values to the end */
+	names[length] = NULL;
+	values[length] = NULL;
+
+	klass->element_start(parser, name, names, values, data);
+
+	g_free(names);
+	g_free(values);
+}
+
+/* Forwards the end of element @name to the subclass' element_finish handler;
+ * does nothing when the subclass did not set one.
+ */
+static void
+talkatu_html_parser_element_finish(TalkatuHtmlParser *parser,
+                                   const gchar *name,
+                                   gpointer data)
+{
+	TalkatuHtmlParserClass *klass = TALKATU_HTML_PARSER_GET_CLASS(parser);
+
+	if(klass->element_finish == NULL) {
+		return;
+	}
+
+	klass->element_finish(parser, name, data);
+}
+
+/* Forwards @text to the subclass' text handler; does nothing when the
+ * subclass did not set one.
+ */
+static void
+talkatu_html_parser_text(TalkatuHtmlParser *parser, const gchar *text,
+                         gpointer data)
+{
+	TalkatuHtmlParserClass *klass = TALKATU_HTML_PARSER_GET_CLASS(parser);
+
+	if(klass->text == NULL) {
+		return;
+	}
+
+	klass->text(parser, text, data);
+}
+
+/* Forwards @comment to the subclass' comment handler; does nothing when the
+ * subclass did not set one.
+ */
+static void
+talkatu_html_parser_comment(TalkatuHtmlParser *parser, const gchar *comment,
+                            gpointer data)
+{
+	TalkatuHtmlParserClass *klass = TALKATU_HTML_PARSER_GET_CLASS(parser);
+
+	if(klass->comment == NULL) {
+		return;
+	}
+
+	klass->comment(parser, comment, data);
+}
+
+/******************************************************************************
+ * GObject Implementation
+ *****************************************************************************/
+G_DEFINE_ABSTRACT_TYPE(TalkatuHtmlParser, talkatu_html_parser, G_TYPE_OBJECT)
+
+/* Nothing to set up per instance; the body is intentionally empty. */
+static void
+talkatu_html_parser_init(TalkatuHtmlParser *parser)
+{
+}
+
+/* No handlers are installed here; the dispatch helpers in this file skip any
+ * handler that a subclass has not set.
+ */
+static void
+talkatu_html_parser_class_init(TalkatuHtmlParserClass *klass)
+{
+}
+
+/******************************************************************************
+ * Public API
+ *****************************************************************************/
+
+/**
+ * talkatu_html_parser_parse:
+ * @parser: The #TalkatuHtmlParser instance.
+ * @html: The HTML text to parse.
+ * @data: User data to pass to all of the handler functions.
+ *
+ * Starts parsing the given @html calling the #TalkatuHtmlParserClass functions
+ * as necessary.
+ *
+ * Nothing is parsed and no handler is called if @parser is not a
+ * #TalkatuHtmlParser or if @html is %NULL.
+ */
+void
+talkatu_html_parser_parse(TalkatuHtmlParser *parser, const gchar *html,
+                          gpointer data)
+{
+	GList *stack = NULL;
+	GumboOutput *output = NULL;
+
+	g_return_if_fail(TALKATU_IS_HTML_PARSER(parser));
+	g_return_if_fail(html != NULL);
+
+	output = gumbo_parse(html);
+
+	stack = g_list_prepend(stack, output->root);
+
+	/* We create a stack with the first node and then process according to the
+	 * node type.
+	 *
+	 * For non-element nodes, we call the text/comment function as appropriate
+	 * and then look for their siblings.  If the node has a sibling, we remove
+	 * the current node from the stack and replace it with its sibling.
+	 *
+	 * For element nodes, we call element_start and push the first child to the
+	 * stack if the node has children and immediately start processing the
+	 * child.  If the element does not have children, we call element_finish,
+	 * remove it from the stack and look for a sibling to push to the stack.
+	 *
+	 * If the node does not have a sibling, we call element_finish on its parent
+	 * and remove it from the stack.  Then we check for its parent and repeat
+	 * the process until we have found a sibling or have exhausted the stack.
+	 *
+	 * The node being handled is always the head of the list, so every pop
+	 * below unlinks the head directly instead of searching the list for it.
+	 *
+	 * NOTE(review): element names come straight from
+	 * gumbo_normalized_tagname(); confirm what that yields for tags gumbo does
+	 * not recognise before relying on @name for custom elements.
+	 */
+
+	while(stack != NULL) {
+		GumboNode *node = (GumboNode *)stack->data;
+		GumboNode *next = NULL;
+		const gchar *tagname = NULL;
+
+		switch(node->type) {
+			case GUMBO_NODE_DOCUMENT:
+				/* this is here to stop a warning from gcc.  We could add a
+				 * default case, but then if a new type is added or something
+				 * we would mask the warning that it would generate.
+				 */
+				break;
+			case GUMBO_NODE_ELEMENT:
+			case GUMBO_NODE_TEMPLATE:
+				tagname = gumbo_normalized_tagname(node->v.element.tag);
+				talkatu_html_parser_element_start(parser, tagname,
+				                                  &node->v.element, data);
+
+				if(node->v.element.children.length > 0) {
+					/* if we have at least one child, we throw it on the
+					 * stack and start processing that node.
+					 */
+					stack = g_list_prepend(stack,
+					                       node->v.element.children.data[0]);
+
+					continue;
+				}
+
+				/* We have no children so we just call the finish method. */
+				talkatu_html_parser_element_finish(parser, tagname, data);
+				break;
+			case GUMBO_NODE_CDATA:
+			case GUMBO_NODE_TEXT:
+			case GUMBO_NODE_WHITESPACE:
+				talkatu_html_parser_text(parser, node->v.text.text, data);
+				break;
+			case GUMBO_NODE_COMMENT:
+				talkatu_html_parser_comment(parser, node->v.text.text, data);
+				break;
+		}
+
+		/* check if we have a sibling */
+		next = talkatu_html_parser_find_next_sibling(node);
+
+		/* remove the node from the stack */
+		stack = g_list_delete_link(stack, stack);
+
+		if(next != NULL) {
+			stack = g_list_prepend(stack, next);
+
+			continue;
+		}
+
+		/* If the node was the last child, we need to send element_finish for
+		 * the parent and pop the parent from the stack as well.  A node whose
+		 * parent is not an element has nothing left to finish.
+		 */
+		if(node->parent == NULL || !_GUMBO_NODE_IS_CONTAINER(node->parent)) {
+			continue;
+		}
+
+		tagname = gumbo_normalized_tagname(node->parent->v.element.tag);
+		talkatu_html_parser_element_finish(parser, tagname, data);
+
+		/* while we still have elements on the list, pop them off until we
+		 * find one that still has siblings we haven't visited yet.
+		 */
+		while(stack != NULL) {
+			node = (GumboNode *)stack->data;
+
+			next = talkatu_html_parser_find_next_sibling(node);
+
+			if(next != NULL) {
+				/* we found a sibling, so drop the top most item and put the
+				 * sibling on the top of the stack.
+				 */
+				stack = g_list_delete_link(stack, stack);
+				stack = g_list_prepend(stack, next);
+
+				break;
+			}
+
+			/* This node was the last child as well, so its parent is done
+			 * too, unless that parent is the document itself which has no
+			 * element to finish.
+			 */
+			if(node->parent->type != GUMBO_NODE_DOCUMENT) {
+				tagname = gumbo_normalized_tagname(node->parent->v.element.tag);
+
+				talkatu_html_parser_element_finish(parser, tagname, data);
+			}
+
+			/* If this node doesn't have a sibling, then pop it off the
+			 * stack.
+			 */
+			stack = g_list_delete_link(stack, stack);
+		}
+	}
+
+	gumbo_destroy_output(&kGumboDefaultOptions, output);
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/talkatu/talkatuhtmlparser.h Thu Sep 03 20:11:33 2020 -0500
@@ -0,0 +1,51 @@
+/*
+ * talkatu
+ * Copyright (C) 2017-2020 Gary Kramlich <grim@reaperworld.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#if !defined(TALKATU_GLOBAL_HEADER_INSIDE) && !defined(TALKATU_COMPILATION)
+#error "only <talkatu.h> may be included directly"
+#endif
+
+#ifndef TALKATU_HTML_PARSER_H
+#define TALKATU_HTML_PARSER_H
+
+#include <glib.h>
+#include <glib-object.h>
+
+/* Everything declared below, including the functions generated by
+ * G_DECLARE_DERIVABLE_TYPE, needs C linkage when included from C++.
+ */
+G_BEGIN_DECLS
+
+#define TALKATU_TYPE_HTML_PARSER (talkatu_html_parser_get_type())
+G_DECLARE_DERIVABLE_TYPE(TalkatuHtmlParser, talkatu_html_parser, TALKATU,
+                         HTML_PARSER, GObject)
+
+struct _TalkatuHtmlParserClass {
+	/*< private >*/
+	GObjectClass parent;
+
+	/*< public >*/
+	void (*element_start)(TalkatuHtmlParser *parser,
+	                      const gchar *name,
+	                      const gchar **attribute_names,
+	                      const gchar **attribute_values,
+	                      gpointer data);
+	void (*element_finish)(TalkatuHtmlParser *parser,
+	                       const gchar *name,
+	                       gpointer data);
+	void (*text)(TalkatuHtmlParser *parser,
+	             const gchar *text,
+	             gpointer data);
+	void (*comment)(TalkatuHtmlParser *parser,
+	                const gchar *comment,
+	                gpointer data);
+};
+
+void talkatu_html_parser_parse(TalkatuHtmlParser *parser, const gchar *html, gpointer data);
+
+G_END_DECLS
+
+#endif /* TALKATU_HTML_PARSER_H */
+
--- a/talkatu/tests/meson.build Thu Sep 03 02:18:02 2020 -0500
+++ b/talkatu/tests/meson.build Thu Sep 03 20:11:33 2020 -0500
@@ -16,4 +16,11 @@
)
test('action-group', TEST_WRAPPER, args : e, is_parallel : false)
+e = executable(
+ 'test-html-parser',
+ 'talkatutesthtmlparser.c',
+ dependencies : [talkatu_dep, GLIB]
+)
+test('html-parser', e)
+
endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/talkatu/tests/talkatutesthtmlparser.c Thu Sep 03 20:11:33 2020 -0500
@@ -0,0 +1,228 @@
+/*
+ * talkatu
+ * Copyright (C) 2017-2020 Gary Kramlich <grim@reaperworld.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <talkatu.h>
+
+/******************************************************************************
+ * TestTalkatuHtmlParser
+ *****************************************************************************/
+#define TEST_TALKATU_TYPE_HTML_PARSER (test_talkatu_html_parser_get_type())
+G_DECLARE_FINAL_TYPE(TestTalkatuHtmlParser, test_talkatu_html_parser,
+                     TEST_TALKATU, HTML_PARSER, TalkatuHtmlParser)
+
+/* A #TalkatuHtmlParser subclass that adds no fields of its own. */
+struct _TestTalkatuHtmlParser {
+	TalkatuHtmlParser parent;
+};
+
+/* Appends the opening tag for @name, with every attribute written as
+ * name="value", to the #GString passed in as @data.  A %NULL value is written
+ * as an empty string.
+ */
+static void
+test_talkatu_html_parser_element_start(TalkatuHtmlParser *parser,
+                                       const gchar *name,
+                                       const gchar **attr_names,
+                                       const gchar **attr_values,
+                                       gpointer data)
+{
+	GString *out = data;
+	const gchar **attr_name = attr_names;
+	const gchar **attr_value = attr_values;
+
+	g_string_append_printf(out, "<%s", name);
+
+	/* walk both arrays in lock step until the names run out */
+	while(attr_name != NULL && *attr_name != NULL) {
+		g_string_append_printf(out, " %s=\"%s\"", *attr_name,
+		                       *attr_value != NULL ? *attr_value : "");
+
+		attr_name++;
+		attr_value++;
+	}
+
+	g_string_append_c(out, '>');
+}
+
+/* Appends the closing tag for @name to the #GString passed in as @data. */
+static void
+test_talkatu_html_parser_element_finish(TalkatuHtmlParser *parser,
+                                        const gchar *name,
+                                        gpointer data)
+{
+	GString *out = data;
+
+	g_string_append_printf(out, "</%s>", name);
+}
+
+/* Appends @text unchanged to the #GString passed in as @data. */
+static void
+test_talkatu_html_parser_text(TalkatuHtmlParser *parser, const gchar *text,
+                              gpointer data)
+{
+	GString *out = data;
+
+	g_string_append_printf(out, "%s", text);
+}
+
+/* Appends @comment, wrapped in the comment delimiters again, to the #GString
+ * passed in as @data.
+ */
+static void
+test_talkatu_html_parser_comment(TalkatuHtmlParser *parser,
+                                 const gchar *comment, gpointer data)
+{
+	GString *out = data;
+
+	g_string_append_printf(out, "<!--%s-->", comment);
+}
+
+/* No trailing semicolon: the macro expands to complete function definitions,
+ * so a `;` here would be an empty declaration at file scope.
+ */
+G_DEFINE_TYPE(TestTalkatuHtmlParser, test_talkatu_html_parser, TALKATU_TYPE_HTML_PARSER)
+
+/* The test parser has no per-instance state to initialise. */
+static void
+test_talkatu_html_parser_init(TestTalkatuHtmlParser *parser) {
+}
+
+/* Installs the four handlers defined in this file so that every callback made
+ * by talkatu_html_parser_parse() ends up in the output string.
+ */
+static void
+test_talkatu_html_parser_class_init(TestTalkatuHtmlParserClass *klass) {
+	TalkatuHtmlParserClass *parser_class = TALKATU_HTML_PARSER_CLASS(klass);
+
+	parser_class->element_start = test_talkatu_html_parser_element_start;
+	parser_class->element_finish = test_talkatu_html_parser_element_finish;
+	parser_class->text = test_talkatu_html_parser_text;
+	parser_class->comment = test_talkatu_html_parser_comment;
+}
+
+/* Creates a new #TestTalkatuHtmlParser, returned as its #TalkatuHtmlParser
+ * parent type.  The caller owns the reference and releases it with
+ * g_object_unref().
+ *
+ * Only used inside this file, so it is static rather than an exported symbol
+ * without a prototype.
+ */
+static TalkatuHtmlParser *
+test_talkatu_html_parser_new(void) {
+	return TALKATU_HTML_PARSER(g_object_new(TEST_TALKATU_TYPE_HTML_PARSER,
+	                                        NULL));
+}
+
+/******************************************************************************
+ * Tests
+ *****************************************************************************/
+/* Markup-free input is expected to come back wrapped in the html/head/body
+ * skeleton.
+ */
+static void
+test_talkatu_html_parser_simple(void) {
+	const gchar *exp = "<html><head></head><body>plain text</body></html>";
+	GString *str = g_string_new("");
+	TalkatuHtmlParser *parser = test_talkatu_html_parser_new();
+
+	talkatu_html_parser_parse(parser, "plain text", str);
+	g_assert_cmpstr(str->str, ==, exp);
+
+	g_object_unref(parser);
+	g_string_free(str, TRUE);
+}
+
+/* Nested inline elements mixed with text siblings must round-trip unchanged. */
+static void
+test_talkatu_html_parser_mixed(void) {
+	const gchar *exp =
+		"<html><head></head><body>"
+		"<i><b>emphasis <u>underline</u></b> <strike>strike</strike> italic</i>"
+		"</body></html>";
+	GString *str = g_string_new("");
+	TalkatuHtmlParser *parser = test_talkatu_html_parser_new();
+
+	talkatu_html_parser_parse(parser, exp, str);
+	g_assert_cmpstr(str->str, ==, exp);
+
+	g_object_unref(parser);
+	g_string_free(str, TRUE);
+}
+
+/* A comment sitting between two text runs must round-trip unchanged. */
+static void
+test_talkatu_html_parser_with_comment(void) {
+	const gchar *exp =
+		"<html><head></head><body>"
+		"Hello, <!--Darkness, my old friend, here to conquer the--> World!"
+		"</body></html>";
+	GString *str = g_string_new("");
+	TalkatuHtmlParser *parser = test_talkatu_html_parser_new();
+
+	talkatu_html_parser_parse(parser, exp, str);
+	g_assert_cmpstr(str->str, ==, exp);
+
+	g_object_unref(parser);
+	g_string_free(str, TRUE);
+}
+
+/* An element with two attributes must round-trip with both attributes in
+ * their original order.
+ */
+static void
+test_talkatu_html_parser_with_attributes(void) {
+	const gchar *exp =
+		"<html><head></head><body>"
+		"<font size=\"2\" color=\"#007f00\">talkatu</font>"
+		"</body></html>";
+	GString *str = g_string_new("");
+	TalkatuHtmlParser *parser = test_talkatu_html_parser_new();
+
+	talkatu_html_parser_parse(parser, exp, str);
+	g_assert_cmpstr(str->str, ==, exp);
+
+	g_object_unref(parser);
+	g_string_free(str, TRUE);
+}
+
+/* An attributed element nested inside another attributed element must
+ * round-trip unchanged.
+ */
+static void
+test_talkatu_html_parser_with_nested_attributes(void) {
+	const gchar *exp =
+		"<html><head></head><body>"
+		"<font size=\"2\" color=\"#007f00\">"
+		"<a href=\"https://keep.imfreedom.org/talkatu/talkatu/\">talkatu</a>"
+		"</font>"
+		"</body></html>";
+	GString *str = g_string_new("");
+	TalkatuHtmlParser *parser = test_talkatu_html_parser_new();
+
+	talkatu_html_parser_parse(parser, exp, str);
+	g_assert_cmpstr(str->str, ==, exp);
+
+	g_object_unref(parser);
+	g_string_free(str, TRUE);
+}
+
+/******************************************************************************
+ * Main
+ *****************************************************************************/
+/* Initialises GTest, GTK and talkatu, registers every parser test in the
+ * table below, runs them and returns the result of g_test_run().
+ */
+gint
+main(gint argc, gchar **argv) {
+	static const struct {
+		const gchar *path;
+		GTestFunc func;
+	} tests[] = {
+		{ "/html-parser/simple",
+		  test_talkatu_html_parser_simple },
+		{ "/html-parser/mixed-children",
+		  test_talkatu_html_parser_mixed },
+		{ "/html-parser/with-comment",
+		  test_talkatu_html_parser_with_comment },
+		{ "/html-parser/with-attributes",
+		  test_talkatu_html_parser_with_attributes },
+		{ "/html-parser/with-nested-attributes",
+		  test_talkatu_html_parser_with_nested_attributes },
+	};
+	gint ret = 0;
+	gsize i = 0;
+
+	g_test_init(&argc, &argv, NULL);
+
+	gtk_init(&argc, &argv);
+
+	talkatu_init();
+
+	for(i = 0; i < G_N_ELEMENTS(tests); i++) {
+		g_test_add_func(tests[i].path, tests[i].func);
+	}
+
+	ret = g_test_run();
+
+	talkatu_uninit();
+
+	return ret;
+}
+