[tracker] tracker-extract: Fixed HTML parser to use TrackerSparqlBuilder
- From: Martyn James Russell <mr src gnome org>
- To: svn-commits-list gnome org
- Cc:
- Subject: [tracker] tracker-extract: Fixed HTML parser to use TrackerSparqlBuilder
- Date: Tue, 3 Nov 2009 17:53:21 +0000 (UTC)
commit 12593e8cefdcd2d3f278a22aab1c977b95c05a4b
Author: Martyn Russell <martyn lanedo com>
Date: Tue Nov 3 17:52:43 2009 +0000
tracker-extract: Fixed HTML parser to use TrackerSparqlBuilder
src/tracker-extract/tracker-extract-html.c | 245 ++++++++++++++--------------
1 files changed, 125 insertions(+), 120 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-html.c b/src/tracker-extract/tracker-extract-html.c
index 514d08d..a90b7ce 100644
--- a/src/tracker-extract/tracker-extract-html.c
+++ b/src/tracker-extract/tracker-extract-html.c
@@ -1,7 +1,7 @@
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/*
- * Copyright (C) 2007, Jason Kivlighn (jkivlighn gmail com)
- * Copyright (C) 2008, Nokia
+ * Copyright (C) 2007, Jason Kivlighn (jkivlighn gmail com)
+ * Copyright (C) 2008-2009, Nokia
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
@@ -31,12 +31,7 @@
#include <libtracker-common/tracker-ontology.h>
-#define NIE_PREFIX TRACKER_NIE_PREFIX
-#define NFO_PREFIX TRACKER_NFO_PREFIX
-#define NCO_PREFIX TRACKER_NCO_PREFIX
-
-#define RDF_PREFIX TRACKER_RDF_PREFIX
-#define RDF_TYPE RDF_PREFIX "type"
+#define RDF_TYPE TRACKER_RDF_PREFIX "type"
typedef enum {
READ_TITLE,
@@ -46,10 +41,10 @@ typedef struct {
TrackerSparqlBuilder *metadata;
tag_type current;
const gchar *uri;
-} HTMLParseInfo;
+} parser_data;
-static void extract_html (const gchar *filename,
- TrackerSparqlBuilder *metadata);
+static void extract_html (const gchar *filename,
+ TrackerSparqlBuilder *metadata);
static TrackerExtractData data[] = {
{ "text/html", extract_html },
@@ -58,19 +53,19 @@ static TrackerExtractData data[] = {
};
static gboolean
-has_attribute (const xmlChar **atts,
- const gchar *attr,
- const gchar *val)
+has_attribute (const gchar **attrs,
+ const gchar *attr,
+ const gchar *val)
{
gint i;
- if (!(atts && attr && val)) {
+ if (!attrs || !attr || !val) {
return FALSE;
}
- for (i = 0; atts[i] && atts[i + 1]; i += 2) {
- if (strcasecmp ((gchar*) atts[i], attr) == 0) {
- if (strcasecmp ((gchar*) atts[i + 1], val) == 0) {
+ for (i = 0; attrs[i] && attrs[i + 1]; i += 2) {
+ if (g_ascii_strcasecmp (attrs[i], attr) == 0) {
+ if (g_ascii_strcasecmp (attrs[i + 1], val) == 0) {
return TRUE;
}
}
@@ -80,174 +75,184 @@ has_attribute (const xmlChar **atts,
}
static const xmlChar *
-lookup_attribute (const xmlChar **atts,
- const gchar *attr)
+lookup_attribute (const gchar **attrs,
+ const gchar *attr)
{
gint i;
- if (!atts || !attr) {
+ if (!attrs || !attr) {
return NULL;
}
- for (i = 0; atts[i] && atts[i + 1]; i += 2) {
- if (strcasecmp ((gchar*) atts[i], attr) == 0) {
- return atts[i + 1];
+ for (i = 0; attrs[i] && attrs[i + 1]; i += 2) {
+ if (g_ascii_strcasecmp (attrs[i], attr) == 0) {
+ return attrs[i + 1];
}
}
return NULL;
}
-void
-startElement (void *info_,
- const xmlChar *name,
- const xmlChar **atts)
+static void
+parser_start_element (void *data,
+ const xmlChar *name_,
+ const xmlChar **attrs_)
{
- HTMLParseInfo* info = info_;
+ parser_data *pd = data;
+ const gchar *name = (const gchar*) name_;
+ const gchar **attrs = (const gchar**) attrs_;
- if (!(info && name)) {
+ if (!pd || !name) {
return;
}
/* Look for RDFa triple describing the license */
- if (strcasecmp ((gchar*) name, "a") == 0) {
+ if (g_ascii_strcasecmp (name, "a") == 0) {
/* This tag is a license. Ignore, however, if it is
* referring to another document.
*/
- if (has_attribute (atts, "rel", "license") &&
- has_attribute (atts, "about", NULL) == FALSE) {
+ if (has_attribute (attrs, "rel", "license") &&
+ has_attribute (attrs, "about", NULL) == FALSE) {
const xmlChar *href;
- href = lookup_attribute (atts, "href");
+ href = lookup_attribute (attrs, "href");
if (href) {
- tracker_statement_list_insert (info->metadata,
- info->uri, NIE_PREFIX "license",
- (const gchar *) href);
+ tracker_sparql_builder_predicate (pd->metadata, "nie:license");
+ tracker_sparql_builder_object_unvalidated (pd->metadata, href);
}
}
- } else if (strcasecmp ((gchar*)name, "title") == 0) {
- info->current = READ_TITLE;
- } else if (strcasecmp ((gchar*)name, "meta") == 0) {
- if (has_attribute (atts, "name", "Author")) {
+ } else if (g_ascii_strcasecmp (name, "title") == 0) {
+ pd->current = READ_TITLE;
+ } else if (g_ascii_strcasecmp (name, "meta") == 0) {
+ if (has_attribute (attrs, "name", "author")) {
const xmlChar *author;
- author = lookup_attribute (atts, "content");
+ author = lookup_attribute (attrs, "content");
if (author) {
- tracker_statement_list_insert (info->metadata, ":", RDF_TYPE, NCO_PREFIX "Contact");
- tracker_statement_list_insert (info->metadata, ":", NCO_PREFIX "fullname", author);
- tracker_statement_list_insert (info->metadata, info->uri, NCO_PREFIX "creator", ":");
+ tracker_sparql_builder_predicate (pd->metadata, "nco:creator");
+ tracker_sparql_builder_object_blank_open (pd->metadata);
+ tracker_sparql_builder_predicate (pd->metadata, "a");
+ tracker_sparql_builder_object (pd->metadata, "nco:Contact");
+ tracker_sparql_builder_predicate (pd->metadata, "nco:fullname");
+ tracker_sparql_builder_object_unvalidated (pd->metadata, author);
+ tracker_sparql_builder_object_blank_close (pd->metadata);
}
}
- if (has_attribute (atts, "name", "DC.Description")) {
+ if (has_attribute (attrs, "name", "description")) {
const xmlChar *desc;
- desc = lookup_attribute (atts,"content");
+ desc = lookup_attribute (attrs,"content");
if (desc) {
- tracker_statement_list_insert (info->metadata,
- info->uri, NIE_PREFIX "comment",
- (const gchar *) desc);
+ tracker_sparql_builder_predicate (pd->metadata, "nie:description");
+ tracker_sparql_builder_object_unvalidated (pd->metadata, desc);
}
}
- if (has_attribute (atts, "name", "KEYWORDS") ||
- has_attribute (atts, "name", "keywords")) {
- const xmlChar* k = lookup_attribute (atts, "content");
-
- if (k) {
- gchar *keywords = g_strdup (k);
- char *lasts, *keyw;
-
- for (keyw = strtok_r (keywords, ",;", &lasts); keyw;
- keyw = strtok_r (NULL, ",;", &lasts)) {
- tracker_statement_list_insert (info->metadata,
- info->uri, NIE_PREFIX "keyword",
- (const gchar*) keyw);
+ if (has_attribute (attrs, "name", "keywords")) {
+ const xmlChar* content = lookup_attribute (attrs, "content");
+
+ if (content) {
+ gchar **keywords;
+ gint i;
+
+ keywords = g_strsplit (content, ",", -1);
+ if (keywords) {
+ for (i = 0; keywords[i] != NULL; i++) {
+ if (!keywords[i] || keywords[i] == '\0') {
+ continue;
+ }
+
+ tracker_sparql_builder_predicate (pd->metadata, "nie:keyword");
+ tracker_sparql_builder_object_unvalidated (pd->metadata, g_strstrip (keywords[i]));
+ }
+
+ g_strfreev (keywords);
}
-
- g_free (keywords);
}
}
}
}
-void
-characters (void *info_,
- const xmlChar *ch,
- int len)
+static void
+parser_characters (void *data,
+ const xmlChar *ch,
+ int len)
{
- HTMLParseInfo* info = info_;
+ parser_data *pd = data;
- switch (info->current) {
+ switch (pd->current) {
case READ_TITLE:
- tracker_statement_list_insert (info->metadata,
- info->uri, NIE_PREFIX "title",
- (const gchar*) ch);
+ tracker_sparql_builder_predicate (pd->metadata, "nie:title");
+ tracker_sparql_builder_object_unvalidated (pd->metadata, ch);
break;
default:
break;
}
- info->current = -1;
+ pd->current = -1;
}
static void
-extract_html (const gchar *uri,
- TrackerSparqlBuilder *metadata)
+extract_html (const gchar *uri,
+ TrackerSparqlBuilder *metadata)
{
- gchar *filename = g_filename_from_uri (uri, NULL, NULL);
- xmlSAXHandler SAXHandlerStruct = {
- NULL, /* internalSubset */
- NULL, /* isStandalone */
- NULL, /* hasInternalSubset */
- NULL, /* hasExternalSubset */
- NULL, /* resolveEntity */
- NULL, /* getEntity */
- NULL, /* entityDecl */
- NULL, /* notationDecl */
- NULL, /* attributeDecl */
- NULL, /* elementDecl */
- NULL, /* unparsedEntityDecl */
- NULL, /* setDocumentLocator */
- NULL, /* startDocument */
- NULL, /* endDocument */
- startElement, /* startElement */
- NULL, /* endElement */
- NULL, /* reference */
- characters, /* characters */
- NULL, /* ignorableWhitespace */
- NULL, /* processingInstruction */
- NULL, /* comment */
- NULL, /* xmlParserWarning */
- NULL, /* xmlParserError */
- NULL, /* xmlParserError */
- NULL, /* getParameterEntity */
- NULL, /* cdataBlock */
- NULL, /* externalSubset */
- 1, /* initialized */
- NULL, /* private */
- NULL, /* startElementNsSAX2Func */
- NULL, /* endElementNsSAX2Func */
- NULL /* xmlStructuredErrorFunc */
+ htmlDocPtr doc;
+ parser_data pd;
+ gchar *filename;
+ xmlSAXHandler handler = {
+ NULL, /* internalSubset */
+ NULL, /* isStandalone */
+ NULL, /* hasInternalSubset */
+ NULL, /* hasExternalSubset */
+ NULL, /* resolveEntity */
+ NULL, /* getEntity */
+ NULL, /* entityDecl */
+ NULL, /* notationDecl */
+ NULL, /* attributeDecl */
+ NULL, /* elementDecl */
+ NULL, /* unparsedEntityDecl */
+ NULL, /* setDocumentLocator */
+ NULL, /* startDocument */
+ NULL, /* endDocument */
+ parser_start_element, /* startElement */
+ NULL, /* endElement */
+ NULL, /* reference */
+ parser_characters, /* characters */
+ NULL, /* ignorableWhitespace */
+ NULL, /* processingInstruction */
+ NULL, /* comment */
+ NULL, /* xmlParserWarning */
+ NULL, /* xmlParserError */
+ NULL, /* xmlParserError */
+ NULL, /* getParameterEntity */
+ NULL, /* cdataBlock */
+ NULL, /* externalSubset */
+ 1, /* initialized */
+ NULL, /* private */
+ NULL, /* startElementNsSAX2Func */
+ NULL, /* endElementNsSAX2Func */
+ NULL /* xmlStructuredErrorFunc */
};
- HTMLParseInfo info = { metadata, -1, uri };
+ tracker_sparql_builder_subject_iri (metadata, uri);
+ tracker_sparql_builder_predicate (metadata, "a");
+ tracker_sparql_builder_object_unvalidated (metadata, "nfo:Document");
- htmlDocPtr doc;
- doc = htmlSAXParseFile (filename, NULL, &SAXHandlerStruct, &info);
- if (doc) {
+ pd.metadata = metadata;
+ pd.current = -1;
+ pd.uri = uri;
- tracker_statement_list_insert (metadata, uri,
- RDF_TYPE,
- NFO_PREFIX "Document");
+ filename = g_filename_from_uri (uri, NULL, NULL);
+ doc = htmlSAXParseFile (filename, NULL, &handler, &pd);
+ g_free (filename);
+ if (doc) {
xmlFreeDoc (doc);
}
-
- g_free (filename);
}
TrackerExtractData *
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]