[tracker/rss-enclosures] Fix for GB#615035 - Removing unzip command dependency from ms-office extractor

From: Roberto Guido <rguido src gnome org>
To: commits-list gnome org
Cc:
Subject: [tracker/rss-enclosures] Fix for GB#615035 - Removing unzip command dependency from ms-office extractor
Date: Mon, 3 May 2010 00:35:45 +0000 (UTC)
commit 0667b08bd831cde44d5d92d89c21bc6ea9118ff8
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Tue Apr 13 18:28:16 2010 +0200

    Fix for GB#615035 - Removing unzip command dependency from ms-office extractor
    
    Initial patch from Amil Aggarwal, updated patch from Aleksander Morgado.

 src/tracker-extract/tracker-extract-msoffice.c |  168 +++++++++++++++++-------
 1 files changed, 118 insertions(+), 50 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
index 335593b..8eb0705 100644
--- a/src/tracker-extract/tracker-extract-msoffice.c
+++ b/src/tracker-extract/tracker-extract-msoffice.c
@@ -32,6 +32,7 @@
 #include <gsf/gsf-input-stdio.h>
 #include <gsf/gsf-msole-utils.h>
 #include <gsf/gsf-utils.h>
+#include <gsf/gsf-infile-zip.h>
 
 #include <libtracker-common/tracker-utils.h>
 #include <libtracker-common/tracker-os-dependant.h>
@@ -730,6 +731,17 @@ fts_max_words (void)
 }
 
 /**
+ * @brief get min word length
+ * @return min_word_length
+ */
+static gint
+fts_min_word_length (void)
+{
+	TrackerFTSConfig *fts_config = tracker_main_get_fts_config ();
+	return tracker_fts_config_get_min_word_length (fts_config);
+}
+
+/**
  * @brief Open specified uri for reading and initialize gsf
  * @param uri URI of the file to open
  * @return GsfInFile of the opened file or NULL if failed to open file
@@ -1629,6 +1641,7 @@ xml_text_handler_document_data (GMarkupParseContext  *context,
 	MsOfficeXMLParserInfo *info = user_data;
 	static gboolean found = FALSE;
 	static gboolean added = FALSE;
+	guint min_word_length = fts_min_word_length();
 
 	switch (info->tag_type) {
 	case MS_OFFICE_XML_TAG_WORD_TEXT:
@@ -1640,8 +1653,7 @@ xml_text_handler_document_data (GMarkupParseContext  *context,
 
 		if (info->preserve_attribute_present) {
 			gchar *keywords = g_strdup (text);
-
-			if (found && (strlen (keywords) > 3)) {
+			if (found && (strlen (keywords) >= min_word_length)) {
 				g_string_append_printf (info->content, "%s ", text);
 				found = FALSE;
 			} else {
@@ -1665,13 +1677,13 @@ xml_text_handler_document_data (GMarkupParseContext  *context,
 		break;
 
 	case MS_OFFICE_XML_TAG_SLIDE_TEXT:
-		if (strlen (text) > 3) {
+		if (strlen (text) > min_word_length) {
 			g_string_append_printf (info->content, "%s ", text);
 		}
 		break;
 
 	case MS_OFFICE_XML_TAG_XLS_SHARED_TEXT:
-		if ((atoi (text) == 0) && (strlen (text) > 4))  {
+		if ((atoi (text) == 0) && (strlen (text) > min_word_length))  {
 			g_string_append_printf (info->content, "%s ", text);
 		}
 		break;
@@ -1773,37 +1785,109 @@ xml_text_handler_document_data (GMarkupParseContext  *context,
 	}
 }
 
-static gboolean
-xml_read (MsOfficeXMLParserInfo *parser_info,
-          const gchar           *xml_filename,
-          MsOfficeXMLTagType     type)
+/**
+ * based on find_member() from vsd_utils.c:
+ * http://vsdump.sourcearchive.com/documentation/0.0.44/vsd__utils_8c-source.html
+ */
+static GsfInput *
+find_member (GsfInfile *arch,
+	     char const *name)
 {
-	GMarkupParseContext *context;
-	MsOfficeXMLParserInfo info;
-	gchar *xml;
-	gchar *filename;
-	const gchar *argv[5];
-	gboolean success;
+	gchar const *slash = strchr (name, '/');
 
-	filename = g_filename_from_uri (parser_info->uri, NULL, NULL);
+	if (slash) {
+		gchar *dirname = g_strndup (name, slash - name);
+		GsfInput *member;
 
-	argv[0] = "unzip";
-	argv[1] = "-p";
-	argv[2] = filename;
-	argv[3] = xml_filename;
-	argv[4] = NULL;
+		if ((member = gsf_infile_child_by_name (arch, dirname)) != NULL) {
+			GsfInfile *dir = GSF_INFILE (member);
+			member = find_member (dir, slash + 1);
+			g_object_unref (dir);
+		}
 
-	g_debug ("Reading XML data '%s'", argv[3]);
+		g_free (dirname);
+		return member;
+	} else {
+		return gsf_infile_child_by_name (arch, name);
+	}
+}
 
-	xml = NULL;
 
-	success = tracker_spawn ((gchar**) argv, 10, &xml, NULL);
+static gchar *
+load_xml_contents (const gchar *file_uri,
+		   const gchar *xml_filename)
+{
+	gchar *filename;
+	gchar *xml = NULL;
+	GError *error = NULL;
+	GsfInfile *infile = NULL;;
+	GsfInput *src = NULL;
+	GsfInput *member = NULL;
+
+	/* Get filename from the given URI */
+	if ((filename = g_filename_from_uri (file_uri,
+					     NULL, &error)) == NULL) {
+		g_warning ("Can't get filename from uri '%s': %s",
+			   file_uri, error ? error->message : NULL);
+	}
+	/* Create a new Input GSF object for the given file */
+	else if ((src = gsf_input_stdio_new (filename, &error)) == NULL) {
+		g_warning ("Failed creating a GSF Input object for '%s': %s",
+			   filename, error ? error->message : NULL);
+	}
+	/* Input object is a Zip file */
+	else if ((infile = gsf_infile_zip_new (src, &error)) == NULL) {
+		g_warning ("'%s' Not a zip file: %s",
+			   filename, error ? error->message : NULL);
+	}
+	/* Look for requested filename inside the ZIP file */
+	else if ((member = find_member (infile, xml_filename)) == NULL) {
+		g_warning ("No member '%s' in zip file '%s'",
+			   xml_filename, filename);
+	}
+	/* Load whole contents of the internal file in the xml buffer */
+	else {
+		size_t size;
+		/* Get whole size of the contents to read */
+		size = (size_t) gsf_input_size (GSF_INPUT (member));
+
+		/* Allocate buffer to return, and make sure it will be
+		 *  NUL-terminated */
+		xml = g_malloc (size + 1);
+		xml [size] = '\0';
+
+		/* And read all the bytes in one operation */
+		if(gsf_input_read (GSF_INPUT (member), size, xml) == NULL) {
+			g_warning ("Couldn't read '%u' bytes from '%s'",
+				   size, xml_filename);
+			g_free (xml);
+			xml = NULL;
+		}
+	}
+
+	/* it's safe to call g_free on NULL pointers */
 	g_free (filename);
+	/* but g_object_unref and g_error_free must not be called on NULL */
+	if (error)
+		g_error_free (error);
+	if (infile)
+		g_object_unref (infile);
+	if (src)
+		g_object_unref (src);
+	if (member)
+		g_object_unref (member);
+
+	return xml;
+}
 
-	if (!success) {
-		g_free (xml);
-		return FALSE;
-	}
+
+static gboolean
+xml_read (MsOfficeXMLParserInfo *parser_info,
+          const gchar           *xml_filename,
+          MsOfficeXMLTagType     type)
+{
+	GMarkupParseContext *context;
+	MsOfficeXMLParserInfo info;
 
 	/* FIXME: Can we use the original info here? */
 	info.metadata = parser_info->metadata;
@@ -1853,11 +1937,14 @@ xml_read (MsOfficeXMLParserInfo *parser_info,
 	}
 
 	if (context) {
+		gchar *xml = load_xml_contents (parser_info->uri,
+						xml_filename);
+
 		g_markup_parse_context_parse (context, xml, -1, NULL);
 		g_markup_parse_context_free (context);
-	}
 
-	g_free (xml);
+		g_free (xml);
+	}
 
 	return TRUE;
 }
@@ -1943,11 +2030,8 @@ extract_msoffice_xml (const gchar          *uri,
 		NULL,
 		NULL
 	};
-	gchar *filename;
-	gchar *xml;
+	gchar *xml = NULL;
 	const gchar *mime_used;
-	const gchar *argv[5];
-	gboolean success;
 
 	file = g_file_new_for_uri (uri);
 
@@ -1985,29 +2069,12 @@ extract_msoffice_xml (const gchar          *uri,
 
 	g_object_unref (file_info);
 
-	filename = g_filename_from_uri (uri, NULL, NULL);
-
-	argv[0] = "unzip";
-	argv[1] = "-p";
-	argv[2] = filename;
-	argv[3] = "\\[Content_Types\\].xml";
-	argv[4] = NULL;
 
 	g_debug ("Extracting MsOffice XML format...");
 
 	tracker_sparql_builder_predicate (metadata, "a");
 	tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
 
-	xml = NULL;
-
-	success = tracker_spawn ((gchar**) argv, 10, &xml, NULL);
-	g_free (filename);
-
-	if (!success) {
-		g_free (xml);
-		return;
-	}
-
 	info.metadata = metadata;
 	info.file_type = file_type;
 	info.tag_type = MS_OFFICE_XML_TAG_INVALID;
@@ -2017,6 +2084,7 @@ extract_msoffice_xml (const gchar          *uri,
 	info.content = g_string_new ("");
 
 	context = g_markup_parse_context_new (&parser, 0, &info, NULL);
+	xml = load_xml_contents (uri, "[Content_Types].xml");
 	g_markup_parse_context_parse (context, xml, -1, NULL);
 	g_free (xml);
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]