[tracker] Changed MS document content extractor to use GSF instead of WV2.

From: Philip Van Hoof <pvanhoof src gnome org>
To: svn-commits-list gnome org
Cc:
Subject: [tracker] Changed MS document content extractor to use GSF instead of WV2.
Date: Wed, 20 Jan 2010 15:04:09 +0000 (UTC)
commit 8283d574fb35e7ec7ef99bd5ea7eae949586b396
Author: Tuomas Järvinen <tuomas jarvinen ixonos com>
Date:   Wed Jan 20 16:03:43 2010 +0100

    Changed MS document content extractor to use GSF instead of WV2.

 configure.ac                                   |   51 ------
 src/tracker-extract/Makefile.am                |    6 -
 src/tracker-extract/tracker-extract-msoffice.c |  205 ++++++++++++++++++++++--
 src/tracker-extract/tracker-msword.cpp         |  109 -------------
 src/tracker-extract/tracker-msword.h           |   32 ----
 5 files changed, 193 insertions(+), 210 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index 9b7c949..547af17 100644
--- a/configure.ac
+++ b/configure.ac
@@ -148,7 +148,6 @@ GDK_REQUIRED=1.0
 LIBVORBIS_REQUIRED=0.22
 LIBEXIF_REQUIRED=0.6
 LIBGSF_REQUIRED=1.13
-LIBWV2_REQUIRED=0.3.1
 EXEMPI_REQUIRED=2.1.0
 HILDON_THUMBNAIL_REQUIRED=3.0.10
 EVO_REQUIRED=2.25.5
@@ -1168,56 +1167,6 @@ fi
 AM_CONDITIONAL(HAVE_LIBGSF, test "x$have_libgsf" = "xyes")
 
 ##################################################################
-# Check for libwv2
-##################################################################
-
-# FIXME This should be package based. Unfortunately in several main
-# distros, it is not.
-
-AC_ARG_ENABLE(libwv2,
-              AS_HELP_STRING([--enable-libwv2],
-                             [enable content extractor for MS documents [[default=auto]]]),,
-                             [enable_libwv2=auto])
-
-if test "x$enable_libwv2" != "xno" ; then
-   AC_MSG_CHECKING(for wv2)
-
-   AC_LANG(C++)
-   OLD_LDFLAGS="$LDFLAGS"
-   OLD_CFLAGS="$CFLAGS"
-   CFLAGS="$CFLAGS"
-   LDFLAGS="-lwv2 $LDFLAGS"
-   AC_LINK_IFELSE(
-   [AC_LANG_PROGRAM([#include <wv2/wv2version.h>],
-                    [wvWare::version()])],
-                    [have_libwv2=yes],
-                    [have_libwv2=no])
-
-   LIBWV2_LIBS="-lwv2"
-   AC_SUBST(LIBWV2_LIBS)
-
-   CFLAGS="$OLD_CFLAGS"
-   LDFLAGS="$OLD_LDFLAGS"
-
-   if test "x$have_libwv2" = "xyes"; then
-      AC_DEFINE(HAVE_LIBWV2, [], [Define if we have libwv2])
-      AC_MSG_RESULT(yes)
-   else
-      AC_MSG_RESULT(no)
-   fi
-else
-   have_libwv2="no (disabled)"
-fi
-
-if test "x$enable_libwv2" = "xyes"; then
-  if test "x$have_libwv2" != "xyes"; then
-     AC_MSG_ERROR([Couldn't find libwv2 >= $LIBWV2_REQUIRED.])
-  fi
-fi
-
-AM_CONDITIONAL(HAVE_LIBWV2, test "x$have_libwv2" = "xyes")
-
-##################################################################
 # Check for libjpeg
 ##################################################################
 
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index c5d2316..77b3bff 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -21,7 +21,6 @@ INCLUDES = 								\
 	$(LIBEXIF_CFLAGS) 						\
 	$(LIBIPTCDATA_CFLAGS)						\
 	$(LIBGSF_CFLAGS) 						\
-	$(LIBWV2_CFLAGS) 						\
 	$(LIBXML2_CFLAGS) 						\
 	$(LIBPNG_CFLAGS) 						\
 	$(POPPLER_GLIB_CFLAGS) 						\
@@ -186,11 +185,6 @@ libextract_msoffice_la_LDFLAGS = $(module_flags)
 libextract_msoffice_la_LIBADD = $(GLIB2_LIBS) $(LIBGSF_LIBS) $(GCOV_LIBS) \
 	$(top_builddir)/src/libtracker-common/libtracker-common.la
 
-if HAVE_LIBWV2
-libextract_msoffice_la_SOURCES += tracker-msword.cpp tracker-msword.h
-libextract_msoffice_la_LIBADD += $(LIBWV2_LIBS)
-endif
-
 # PDF
 libextract_pdf_la_SOURCES = tracker-extract-pdf.c $(xmp_sources)
 libextract_pdf_la_LDFLAGS = $(module_flags)
diff --git a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
index cb69b79..c395bc5 100644
--- a/src/tracker-extract/tracker-extract-msoffice.c
+++ b/src/tracker-extract/tracker-extract-msoffice.c
@@ -39,10 +39,6 @@
 
 #include "tracker-main.h"
 
-#ifdef HAVE_LIBWV2
-#include "tracker-msword.h"
-#endif
-
 #define NIE_PREFIX                              TRACKER_NIE_PREFIX
 #define NFO_PREFIX                              TRACKER_NFO_PREFIX
 #define NCO_PREFIX                              TRACKER_NCO_PREFIX
@@ -690,6 +686,189 @@ read_powerpoint (GsfInfile            *infile,
 	g_object_unref (stream);
 }
 
+/**
+ * @brief Extract the text content of an MS Word binary document
+ *
+ * This function was programmed by using ideas and algorithms from
+ * b2xtranslator project (http://b2xtranslator.sourceforge.net/).
+ * @param infile compound file containing the Word streams
+ * @param n_words word limit handed to tracker_text_normalize()
+ * @param is_encrypted written only once the encryption flag has been
+ *        read from the FIB: TRUE if it is set, FALSE otherwise
+ * @return normalized UTF-8 text to be released with g_free(), or NULL
+ *         if the document is encrypted, unreadable or has no text
+ */
+static gchar *
+extract_msword_content (GsfInfile *infile,
+                        gint       n_words,
+                        gboolean  *is_encrypted)
+{
+	GsfInput *document_stream = NULL, *table_stream = NULL;
+	guint8 tmp_buffer[8] = {0};
+	guint8 *clx = NULL, *piece_table = NULL, *piece_descriptor = NULL;
+	guint8 *text_buffer = NULL;
+	gint fib_flags, fcClx, lcbClx, lcb_piece_table, remaining;
+	gint i, piece_count = 0, piece_start, piece_end, piece_size;
+	gint32 fc;
+	gboolean is_ansi;
+	gchar *converted_text = NULL, *normalized = NULL;
+	GString *content = NULL;
+
+	document_stream = gsf_infile_child_by_name (infile, "WordDocument");
+	if (document_stream == NULL) {
+		return NULL;
+	}
+
+	/* abort if FIB can't be found from beginning of WordDocument stream */
+	if (gsf_input_seek (document_stream, 0, G_SEEK_SET) ||
+	    !gsf_input_read (document_stream, 2, tmp_buffer) ||
+	    read_16bit (tmp_buffer) != 0xa5ec) {
+		goto out;
+	}
+
+	/* abort if document is encrypted */
+	if (gsf_input_seek (document_stream, 11, G_SEEK_SET) ||
+	    !gsf_input_read (document_stream, 1, tmp_buffer)) {
+		goto out;
+	}
+	*is_encrypted = ((tmp_buffer[0] & 0x1) == 0x1);
+	if (*is_encrypted) {
+		goto out;
+	}
+
+	/* document can have 0Table or 1Table or both. If flag 0x0200 is
+	 * set to true in word 0x000A of the FIB then 1Table is used */
+	if (gsf_input_seek (document_stream, 0x000A, G_SEEK_SET) ||
+	    !gsf_input_read (document_stream, 2, tmp_buffer)) {
+		goto out;
+	}
+	fib_flags = read_16bit (tmp_buffer);
+	table_stream = gsf_infile_child_by_name (infile,
+	                                         (fib_flags & 0x0200) ?
+	                                         "1Table" : "0Table");
+	if (table_stream == NULL) {
+		goto out;
+	}
+
+	/* find out location and length of piece table from FIB */
+	if (gsf_input_seek (document_stream, 418, G_SEEK_SET) ||
+	    !gsf_input_read (document_stream, 8, tmp_buffer)) {
+		goto out;
+	}
+	fcClx = read_32bit (tmp_buffer);
+	lcbClx = read_32bit (tmp_buffer + 4);
+
+	/* both values come straight from the file; reject the ones that
+	 * cannot describe a readable structure */
+	if (fcClx < 0 || lcbClx < 1) {
+		goto out;
+	}
+
+	/* copy the structure holding the piece table into the clx array.
+	 * clx must stay allocated until all pieces have been read because
+	 * piece_table points into it */
+	clx = g_try_malloc (lcbClx);
+	if (clx == NULL ||
+	    gsf_input_seek (table_stream, fcClx, G_SEEK_SET) ||
+	    !gsf_input_read (table_stream, lcbClx, clx)) {
+		goto out;
+	}
+
+	/* find out piece table from clx and set piece_table -pointer to it;
+	 * every access is checked against the bytes left in clx */
+	i = 0;
+	while (TRUE) {
+		remaining = lcbClx - i;
+		if (clx[i] == 2 && remaining >= 5) {
+			lcb_piece_table = read_32bit (clx + i + 1);
+			if (lcb_piece_table >= 4 &&
+			    lcb_piece_table <= remaining - 5) {
+				piece_table = clx + i + 5;
+				piece_count = (lcb_piece_table - 4) / 12;
+			}
+			break;
+		} else if (clx[i] == 1 && remaining >= 2 &&
+		           2 + clx[i + 1] < remaining) {
+			/* NOTE(review): the length is read as one byte;
+			 * confirm against the Prc layout in [MS-DOC] */
+			i = i + 2 + clx[i + 1];
+		} else {
+			break;
+		}
+	}
+
+	/* iterate over pieces and save text to the content -variable */
+	for (i = 0; i < piece_count; i++) {
+		/* logical position of the text piece in the document_stream */
+		piece_start = read_32bit (piece_table + (i * 4));
+		piece_end = read_32bit (piece_table + ((i + 1) * 4));
+
+		/* descriptor of single piece from piece table */
+		piece_descriptor = piece_table + ((piece_count + 1) * 4) + (i * 8);
+		/* file character position */
+		fc = read_32bit (piece_descriptor + 2);
+		/* second bit is set to 1 if text is saved in ANSI encoding */
+		is_ansi = ((fc & 0x40000000) == 0x40000000);
+
+		/* modify file character position according to text encoding */
+		fc = is_ansi ? (fc & 0xBFFFFFFF) >> 1 : (fc & 0xBFFFFFFF);
+
+		/* skip empty or nonsensical character ranges */
+		if (piece_start < 0 || piece_end <= piece_start) {
+			continue;
+		}
+
+		/* unicode uses twice as many bytes as CP1252 */
+		piece_size = piece_end - piece_start;
+		if (!is_ansi) {
+			if (piece_size > G_MAXINT / 2) {
+				continue;
+			}
+			piece_size *= 2;
+		}
+
+		/* read single text piece from document_stream */
+		text_buffer = g_try_malloc (piece_size);
+		if (text_buffer == NULL ||
+		    gsf_input_seek (document_stream, fc, G_SEEK_SET) ||
+		    !gsf_input_read (document_stream, piece_size, text_buffer)) {
+			g_free (text_buffer);
+			continue;
+		}
+
+		/* pieces can have different encoding; Word stores 16-bit text
+		 * little-endian, so name that byte order explicitly */
+		converted_text = g_convert ((const gchar *) text_buffer,
+		                            piece_size,
+		                            "UTF-8",
+		                            is_ansi ? "CP1252" : "UTF-16LE",
+		                            NULL, NULL, NULL);
+		g_free (text_buffer);
+
+		if (converted_text) {
+			if (!content) {
+				content = g_string_new (NULL);
+			}
+			g_string_append (content, converted_text);
+			g_free (converted_text);
+		}
+	}
+
+	if (content) {
+		normalized = tracker_text_normalize (content->str, n_words, NULL);
+		g_string_free (content, TRUE);
+	}
+
+out:
+	g_free (clx);
+	if (table_stream) {
+		g_object_unref (table_stream);
+	}
+	g_object_unref (document_stream);
+
+	return normalized;
+}
+
 /**
  * @brief get maximum number of words to index
  * @return maximum number of words to index
@@ -713,9 +892,8 @@ extract_summary (TrackerSparqlBuilder *metadata,
                  const gchar          *uri)
 {
 	GsfInput *stream;
-#ifdef HAVE_LIBWV2
 	gchar    *content;
-#endif
+	gboolean  is_encrypted = FALSE;
 
 	tracker_sparql_builder_subject_iri (metadata, uri);
 	tracker_sparql_builder_predicate (metadata, "a");
@@ -763,16 +941,20 @@ extract_summary (TrackerSparqlBuilder *metadata,
 		g_object_unref (stream);
 	}
 
-
-#ifdef HAVE_LIBWV2
-	content = extract_msword_content (uri, max_words ());
+	content = extract_msword_content(infile, max_words (), &is_encrypted);
 
 	if (content) {
-		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
+		tracker_sparql_builder_predicate (metadata,
+		                                  "nie:plainTextContent");
 		tracker_sparql_builder_object_unvalidated (metadata, content);
 		g_free (content);
 	}
-#endif
+
+	if (is_encrypted) {
+		tracker_sparql_builder_predicate (metadata,
+		                                  "nfo:isContentEncrypted");
+		tracker_sparql_builder_object_boolean (metadata, TRUE);
+	}
 }
 
 /**
@@ -829,7 +1011,6 @@ extract_msoffice (const gchar          *uri,
 	gsf_shutdown ();
 }
 
-
 /**
  * @brief Extract data from powerpoin files
  *
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]