[tracker/tracker-0.8] Fixes GB#616493 - Remove dependency of unzip from the OASIS extractor
- From: Martyn James Russell <mr src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/tracker-0.8] Fixes GB#616493 - Remove dependency of unzip from the OASIS extractor
- Date: Thu, 29 Apr 2010 13:54:06 +0000 (UTC)
commit be8f6afc8a6690779f12df932f86918b4ea9722b
Author: Aleksander Morgado <aleksander lanedo com>
Date: Wed Apr 21 18:58:09 2010 +0200
Fixes GB#616493 - Remove dependency of unzip from the OASIS extractor
src/tracker-extract/Makefile.am | 14 ++-
src/tracker-extract/tracker-extract-msoffice.c | 120 ++-----------------
src/tracker-extract/tracker-extract-oasis.c | 85 ++++++-------
src/tracker-extract/tracker-gsf.c | 153 ++++++++++++++++++++++++
src/tracker-extract/tracker-gsf.h | 35 ++++++
5 files changed, 246 insertions(+), 161 deletions(-)
---
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index 1346b0b..eede0c1 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -33,13 +33,12 @@ if HAVE_ENCA
INCLUDES += $(ENCA_CFLAGS)
endif
-# NOTE:
+# NOTE:
# We don't always link with libtracker-common, we only link
# against it if we directly use functions in the .so
modules_LTLIBRARIES = \
libextract-abw.la \
libextract-mp3.la \
- libextract-oasis.la \
libextract-png.la \
libextract-ps.la \
libextract-text.la
@@ -69,7 +68,9 @@ modules_LTLIBRARIES += libextract-html.la
endif
if HAVE_LIBGSF
-modules_LTLIBRARIES += libextract-msoffice.la
+modules_LTLIBRARIES += \
+ libextract-msoffice.la \
+ libextract-oasis.la
endif
if HAVE_POPPLER_GLIB
@@ -82,7 +83,7 @@ endif
if HAVE_GSTREAMER_HELIX
modules_LTLIBRARIES += libextract-gstreamer-helix.la
-endif
+endif
if HAVE_LIBXINE
modules_LTLIBRARIES += libextract-xine.la
@@ -320,6 +321,11 @@ tracker_extract_LDADD = \
$(GCOV_LIBS) \
$(GLIB2_LIBS)
+if HAVE_LIBGSF
+tracker_extract_SOURCES += tracker-gsf.c tracker-gsf.h
+tracker_extract_LDADD += $(LIBGSF_LIBS)
+endif
+
if HAVE_LIBSTREAMANALYZER
tracker_extract_SOURCES += tracker-topanalyzer.cpp tracker-topanalyzer.h
tracker_extract_LDADD += $(LIBSTREAMANALYZER_LIBS)
diff --git a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
index d7e6aa1..708a9dd 100644
--- a/src/tracker-extract/tracker-extract-msoffice.c
+++ b/src/tracker-extract/tracker-extract-msoffice.c
@@ -40,6 +40,7 @@
#include <libtracker-extract/tracker-extract.h>
#include "tracker-main.h"
+#include "tracker-gsf.h"
/* Powerpoint files comprise of structures. Each structure contains a
* header. Within that header is a record type that specifies what
@@ -2086,116 +2087,6 @@ xml_text_handler_document_data (GMarkupParseContext *context,
}
}
-/**
- * based on find_member() from vsd_utils.c:
- * http://vsdump.sourcearchive.com/documentation/0.0.44/vsd__utils_8c-source.html
- */
-static GsfInput *
-find_member (GsfInfile *arch,
- char const *name)
-{
- gchar const *slash = strchr (name, '/');
-
- if (slash) {
- gchar *dirname = g_strndup (name, slash - name);
- GsfInput *member;
-
- if ((member = gsf_infile_child_by_name (arch, dirname)) != NULL) {
- GsfInfile *dir = GSF_INFILE (member);
- member = find_member (dir, slash + 1);
- g_object_unref (dir);
- }
-
- g_free (dirname);
- return member;
- } else {
- return gsf_infile_child_by_name (arch, name);
- }
-}
-
-
-#define XML_BUFFER_SIZE 8192 /* bytes */
-/* Note: 20 MBytes of max size is really assumed to be a safe limit. */
-#define XML_MAX_BYTES_READ (20u << 20) /* bytes */
-
-static void
-parse_xml_contents (const gchar *file_uri,
- const gchar *xml_filename,
- GMarkupParseContext *context)
-{
- gchar *filename;
- GError *error = NULL;
- GsfInfile *infile = NULL;
- GsfInput *src = NULL;
- GsfInput *member = NULL;
-
- g_debug ("Parsing '%s' XML file from '%s' zip archive...",
- xml_filename, file_uri);
-
- /* Get filename from the given URI */
- if ((filename = g_filename_from_uri (file_uri,
- NULL, &error)) == NULL) {
- g_warning ("Can't get filename from uri '%s': %s",
- file_uri, error ? error->message : "no error given");
- }
- /* Create a new Input GSF object for the given file */
- else if ((src = gsf_input_stdio_new (filename, &error)) == NULL) {
- g_warning ("Failed creating a GSF Input object for '%s': %s",
- filename, error ? error->message : "no error given");
- }
- /* Input object is a Zip file */
- else if ((infile = gsf_infile_zip_new (src, &error)) == NULL) {
- g_warning ("'%s' Not a zip file: %s",
- filename, error ? error->message : "no error given");
- }
- /* Look for requested filename inside the ZIP file */
- else if ((member = find_member (infile, xml_filename)) == NULL) {
- g_warning ("No member '%s' in zip file '%s'",
- xml_filename, filename);
- }
- /* Load whole contents of the internal file in the xml buffer */
- else {
- guint8 buf[XML_BUFFER_SIZE];
- size_t remaining_size, chunk_size, accum;
-
- /* Get whole size of the contents to read */
- remaining_size = (size_t) gsf_input_size (GSF_INPUT (member));
-
- /* Note that gsf_input_read() needs to be able to read ALL specified
- * number of bytes, or it will fail */
- chunk_size = MIN (remaining_size, XML_BUFFER_SIZE);
-
- accum = 0;
- while (accum <= XML_MAX_BYTES_READ &&
- chunk_size > 0 &&
- gsf_input_read (GSF_INPUT (member), chunk_size, buf) != NULL) {
-
- /* update accumulated count */
- accum += chunk_size;
-
- /* Pass the read stream to the context parser... */
- g_markup_parse_context_parse (context, buf, chunk_size, NULL);
-
- /* update bytes to be read */
- remaining_size -= chunk_size;
- chunk_size = MIN (remaining_size, XML_BUFFER_SIZE);
- }
- }
-
- /* it's safe to call g_free on NULL pointers */
- g_free (filename);
- /* but better don't do it in g_object_unref or g_error_free */
- if (error)
- g_error_free (error);
- if (infile)
- g_object_unref (infile);
- if (src)
- g_object_unref (src);
- if (member)
- g_object_unref (member);
-}
-
-
static gboolean
xml_read (MsOfficeXMLParserInfo *parser_info,
const gchar *xml_filename,
@@ -2254,7 +2145,9 @@ xml_read (MsOfficeXMLParserInfo *parser_info,
if (context) {
/* Load the internal XML file from the Zip archive, and parse it
* using the given context */
- parse_xml_contents (parser_info->uri, xml_filename, context);
+ tracker_gsf_parse_xml_in_zip (parser_info->uri,
+ xml_filename,
+ context);
g_markup_parse_context_free (context);
}
@@ -2395,9 +2288,12 @@ extract_msoffice_xml (const gchar *uri,
info.content = g_string_new ("");
context = g_markup_parse_context_new (&parser, 0, &info, NULL);
+
/* Load the internal XML file from the Zip archive, and parse it
* using the given context */
- parse_xml_contents (uri, "[Content_Types].xml", context);
+ tracker_gsf_parse_xml_in_zip (uri,
+ "[Content_Types].xml",
+ context);
if (info.content) {
gchar *content;
diff --git a/src/tracker-extract/tracker-extract-oasis.c b/src/tracker-extract/tracker-extract-oasis.c
index 3fea090..e2f482c 100644
--- a/src/tracker-extract/tracker-extract-oasis.c
+++ b/src/tracker-extract/tracker-extract-oasis.c
@@ -23,6 +23,7 @@
#include <libtracker-extract/tracker-extract.h>
#include "tracker-main.h"
+#include "tracker-gsf.h"
#include <unistd.h>
@@ -72,16 +73,19 @@ static TrackerExtractData extract_data[] = {
#define ODT_BUFFER_SIZE 8193 /* bytes */
static gchar *
-extract_content (const gchar *path,
- guint n_words,
- gsize n_bytes)
+extract_oasis_content (const gchar *uri,
+ guint n_words,
+ gsize n_bytes)
{
const gchar *argv[4];
gint fdz;
FILE *fz;
GError *error = NULL;
gchar *text = NULL;
+ gchar *path;
+ /* Newly allocated string with the file path */
+ path = g_filename_from_uri (uri, NULL, NULL);
/* Setup command to be executed */
argv[0] = "odt2txt";
@@ -164,6 +168,8 @@ extract_content (const gchar *path,
text = g_string_free (normalized, FALSE);
}
+ g_free (path);
+
return text;
}
@@ -172,71 +178,60 @@ extract_oasis (const gchar *uri,
TrackerSparqlBuilder *preupdate,
TrackerSparqlBuilder *metadata)
{
- gchar *argv[5];
- gchar *xml;
- gchar *filename;
gchar *content;
TrackerFTSConfig *fts_config;
guint n_words;
gsize n_bytes;
+ ODTParseInfo info;
+ GMarkupParseContext *context;
+ GMarkupParser parser = {
+ xml_start_element_handler,
+ xml_end_element_handler,
+ xml_text_handler,
+ NULL,
+ NULL
+ };
+
+ /* Setup conf */
+ fts_config = tracker_main_get_fts_config ();
- filename = g_filename_from_uri (uri, NULL, NULL);
-
- argv[0] = g_strdup ("unzip");
- argv[1] = g_strdup ("-p");
- argv[2] = filename;
- argv[3] = g_strdup ("meta.xml");
- argv[4] = NULL;
+ g_debug ("Extracting OASIS metadata and contents from '%s'", uri);
- /* No need to unlink meta.xml, as it goes to stdout of the
- * spawned child (-p option in unzip) */
+ /* First, parse metadata */
tracker_sparql_builder_predicate (metadata, "a");
tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
- if (tracker_spawn (argv, 10, &xml, NULL)) {
- ODTParseInfo info;
- GMarkupParseContext *context;
- GMarkupParser parser = {
- xml_start_element_handler,
- xml_end_element_handler,
- xml_text_handler,
- NULL,
- NULL
- };
-
- info.metadata = metadata;
- info.current = ODT_TAG_TYPE_UNKNOWN;
- info.uri = uri;
-
- context = g_markup_parse_context_new (&parser, 0, &info, NULL);
- g_markup_parse_context_parse (context, xml, -1, NULL);
-
- g_markup_parse_context_free (context);
- g_free (xml);
- }
+ /* Create parse info */
+ info.metadata = metadata;
+ info.current = ODT_TAG_TYPE_UNKNOWN;
+ info.uri = uri;
+
+ /* Create parsing context */
+ context = g_markup_parse_context_new (&parser, 0, &info, NULL);
+
+ /* Load the internal XML file from the Zip archive, and parse it
+ * using the given context */
+ tracker_gsf_parse_xml_in_zip (uri, "meta.xml", context);
+ g_markup_parse_context_free (context);
+
+ /* Next, parse contents */
- fts_config = tracker_main_get_fts_config ();
/* Set max words to read from content */
n_words = tracker_fts_config_get_max_words_to_index (fts_config);
+
/* Set max bytes to read from content.
* Assuming 3 bytes per unicode point in UTF-8, as 4-byte UTF-8 unicode
* points are really pretty rare */
n_bytes = 3 * n_words * tracker_fts_config_get_max_word_length(fts_config);
- content = extract_content (filename, n_words, n_bytes);
-
+ /* Extract content with the given limitations */
+ content = extract_oasis_content (uri, n_words, n_bytes);
if (content) {
tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
tracker_sparql_builder_object_unvalidated (metadata, content);
g_free (content);
}
-
- g_free (argv[3]);
- g_free (argv[1]);
- g_free (argv[0]);
-
- g_free (filename);
}
static void
diff --git a/src/tracker-extract/tracker-gsf.c b/src/tracker-extract/tracker-gsf.c
new file mode 100644
index 0000000..9bf1608
--- /dev/null
+++ b/src/tracker-extract/tracker-gsf.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright (C) 2010, Nokia <ivan frade nokia com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include <string.h>
+
+#include <glib.h>
+
+#include <gsf/gsf.h>
+#include <gsf/gsf-infile.h>
+#include <gsf/gsf-input-stdio.h>
+#include <gsf/gsf-infile-zip.h>
+
+#include "tracker-gsf.h"
+
+/* Size of the buffer to use */
+#define XML_BUFFER_SIZE 8192 /* bytes */
+/* Note: 20 MBytes of max size is really assumed to be a safe limit. */
+#define XML_MAX_BYTES_READ (20u << 20) /* bytes */
+
+/**
+ * based on find_member() from vsd_utils.c:
+ * http://vsdump.sourcearchive.com/documentation/0.0.44/vsd__utils_8c-source.html
+ */
+static GsfInput *
+find_member (GsfInfile *arch,
+ gchar const *name)
+{
+ gchar const *slash;
+
+ slash = strchr (name, '/');
+
+ if (slash) {
+ gchar *dirname;
+ GsfInput *member;
+
+ dirname = g_strndup (name, slash - name);
+
+ if ((member = gsf_infile_child_by_name (arch, dirname)) != NULL) {
+ GsfInfile *dir;
+
+ dir = GSF_INFILE (member);
+ member = find_member (dir, slash + 1);
+ g_object_unref (dir);
+ }
+
+ g_free (dirname);
+ return member;
+ } else {
+ return gsf_infile_child_by_name (arch, name);
+ }
+}
+
+/**
+ * tracker_gsf_parse_xml_in_zip:
+ * @zip_file_uri: URI of the ZIP archive
+ * @xml_filename: Name of the XML file stored inside the ZIP archive
+ * @context: Markup context to be used when parsing the XML
+ *
+ * This function reads and parses the contents of an XML file stored
+ * inside a ZIP compressed archive. Reading and parsing are buffered, and the
+ * maximum size of the uncompressed XML file is limited to 20 MBytes.
+ */
+void
+tracker_gsf_parse_xml_in_zip (const gchar *zip_file_uri,
+ const gchar *xml_filename,
+ GMarkupParseContext *context)
+{
+ gchar *filename;
+ GError *error = NULL;
+ GsfInfile *infile = NULL;;
+ GsfInput *src = NULL;
+ GsfInput *member = NULL;
+
+ g_debug ("Parsing '%s' XML file from '%s' zip archive...",
+ xml_filename, zip_file_uri);
+
+ /* Get filename from the given URI */
+ if ((filename = g_filename_from_uri (zip_file_uri,
+ NULL, &error)) == NULL) {
+ g_warning ("Can't get filename from uri '%s': %s",
+ zip_file_uri, error ? error->message : "no error given");
+ }
+ /* Create a new Input GSF object for the given file */
+ else if ((src = gsf_input_stdio_new (filename, &error)) == NULL) {
+ g_warning ("Failed creating a GSF Input object for '%s': %s",
+ zip_file_uri, error ? error->message : "no error given");
+ }
+ /* Input object is a Zip file */
+ else if ((infile = gsf_infile_zip_new (src, &error)) == NULL) {
+ g_warning ("'%s' Not a zip file: %s",
+ zip_file_uri, error ? error->message : "no error given");
+ }
+ /* Look for requested filename inside the ZIP file */
+ else if ((member = find_member (infile, xml_filename)) == NULL) {
+ g_warning ("No member '%s' in zip file '%s'",
+ xml_filename, zip_file_uri);
+ }
+ /* Load whole contents of the internal file in the xml buffer */
+ else {
+ guint8 buf[XML_BUFFER_SIZE];
+ size_t remaining_size, chunk_size, accum;
+
+ /* Get whole size of the contents to read */
+ remaining_size = (size_t) gsf_input_size (GSF_INPUT (member));
+
+ /* Note that gsf_input_read() needs to be able to read ALL specified
+ * number of bytes, or it will fail */
+ chunk_size = MIN (remaining_size, XML_BUFFER_SIZE);
+
+ accum = 0;
+ while (accum <= XML_MAX_BYTES_READ &&
+ chunk_size > 0 &&
+ gsf_input_read (GSF_INPUT (member), chunk_size, buf) != NULL) {
+
+ /* update accumulated count */
+ accum += chunk_size;
+
+ /* Pass the read stream to the context parser... */
+ g_markup_parse_context_parse (context, buf, chunk_size, NULL);
+
+ /* update bytes to be read */
+ remaining_size -= chunk_size;
+ chunk_size = MIN (remaining_size, XML_BUFFER_SIZE);
+ }
+ }
+
+ g_free (filename);
+ if (error)
+ g_error_free (error);
+ if (infile)
+ g_object_unref (infile);
+ if (src)
+ g_object_unref (src);
+ if (member)
+ g_object_unref (member);
+}
+
diff --git a/src/tracker-extract/tracker-gsf.h b/src/tracker-extract/tracker-gsf.h
new file mode 100644
index 0000000..26c34b3
--- /dev/null
+++ b/src/tracker-extract/tracker-gsf.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2010, Nokia <ivan frade nokia com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __TRACKER_GSF_H__
+#define __TRACKER_GSF_H__
+
+#include <glib.h>
+#include <gsf/gsf.h>
+
+G_BEGIN_DECLS
+
+void tracker_gsf_parse_xml_in_zip (const gchar *zip_file_uri,
+ const gchar *xml_filename,
+ GMarkupParseContext *context);
+
+G_END_DECLS
+
+#endif /* __TRACKER_GSF_H__ */
+
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]