[tracker] Refactored the MSWord extractor to use libwv2 for content extraction
- From: Philip Van Hoof <pvanhoof src gnome org>
- To: svn-commits-list gnome org
- Cc:
- Subject: [tracker] Refactored the MSWord extractor to use libwv2 for content extraction
- Date: Tue, 8 Dec 2009 20:09:38 +0000 (UTC)
commit 84c23b4da780abfe490638deffd807f6fa7582ad
Author: Philip Van Hoof <philip codeminded be>
Date: Tue Dec 8 21:08:00 2009 +0100
Refactored the MSWord extractor to use libwv2 for content extraction
configure.ac | 54 ++++++++++--
src/tracker-extract/Makefile.am | 7 ++-
src/tracker-extract/tracker-extract-msoffice.c | 53 ++----------
src/tracker-extract/tracker-msword.cpp | 113 ++++++++++++++++++++++++
src/tracker-extract/tracker-msword.h | 31 +++++++
5 files changed, 205 insertions(+), 53 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index 97116e2..2d6bbfd 100644
--- a/configure.ac
+++ b/configure.ac
@@ -147,6 +147,7 @@ GDK_REQUIRED=1.0
LIBVORBIS_REQUIRED=0.22
LIBEXIF_REQUIRED=0.6
LIBGSF_REQUIRED=1.13
+LIBWV2_REQUIRED=0.3.1
EXEMPI_REQUIRED=2.1.0
HILDON_THUMBNAIL_REQUIRED=3.0.10
EVO_REQUIRED=2.25.5
@@ -1109,13 +1110,6 @@ if test "x$enable_libgsf" != "xno" ; then
AC_SUBST(LIBGSF_CFLAGS)
AC_SUBST(LIBGSF_LIBS)
- AC_PATH_PROG(WVWAREBIN, wvWare, no)
- AC_SUBST(WVWAREBIN)
-
- if test "x$WVWAREBIN" != "xno"; then
- AC_DEFINE(HAVE_WVWARE, [], [Define if we have wvWare])
- fi
-
if test "x$have_libgsf" = "xyes"; then
AC_DEFINE(HAVE_LIBGSF, [], [Define if we have libgsf])
fi
@@ -1132,6 +1126,52 @@ fi
AM_CONDITIONAL(HAVE_LIBGSF, test "x$have_libgsf" = "xyes")
##################################################################
+# Check for libwv2
+##################################################################
+
+# FIXME This should be package based. Unfortunately in several main
+# distros, it is not.
+
+AC_ARG_ENABLE(libwv2,
+	AS_HELP_STRING([--enable-libwv2],
+		[enable content extractor for MS documents [[default=auto]]]),,
+	[enable_libwv2=auto])
+
+if test "x$enable_libwv2" != "xno" ; then
+	AC_MSG_CHECKING(for wv2)
+
+	# libwv2 only offers a C++ interface. Select C++ for this probe alone
+	# and restore the previous language afterwards so that the checks
+	# which follow keep running with the C compiler.
+	AC_LANG_PUSH([C++])
+
+	# The library is appended to LIBS rather than LDFLAGS so that it is
+	# placed after the test source on the link command line.
+	oldlibs=$LIBS
+	LIBS="$LIBS -lwv2"
+	AC_LINK_IFELSE(
+		[AC_LANG_PROGRAM([#include <wv2/wv2version.h>],
+			[wvWare::version()])],
+		[TEST_LIBS="$TEST_LIBS -lwv2"
+		 have_libwv2=yes],
+		[have_libwv2=no])
+	LIBS=$oldlibs
+
+	AC_LANG_POP([C++])
+
+	LIBWV2_LIBS="-lwv2"
+	AC_SUBST(LIBWV2_LIBS)
+
+	if test "x$have_libwv2" = "xyes"; then
+		AC_DEFINE(HAVE_LIBWV2, [], [Define if we have libwv2])
+		AC_MSG_RESULT(yes)
+	else
+		AC_MSG_RESULT(no)
+	fi
+else
+	have_libwv2="no (disabled)"
+fi
+
+# An explicit --enable-libwv2 turns a missing library into a hard error.
+if test "x$enable_libwv2" = "xyes"; then
+	if test "x$have_libwv2" != "xyes"; then
+		AC_MSG_ERROR([Couldn't find libwv2 >= $LIBWV2_REQUIRED.])
+	fi
+fi
+
+AM_CONDITIONAL(HAVE_LIBWV2, test "x$have_libwv2" = "xyes")
+
+##################################################################
# Check for libjpeg
##################################################################
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index 697a3f7..c5d2316 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -7,7 +7,6 @@ INCLUDES = \
-DLOCALEDIR=\""$(localedir)"\" \
-DMODULESDIR=\"$(modulesdir)\" \
-DG_LOG_DOMAIN=\"Tracker\" \
- -DWVWAREBIN=\"$(WVWAREBIN)\" \
-DTRACKER_COMPILATION \
-I$(top_srcdir)/src \
$(WARN_CFLAGS) \
@@ -22,6 +21,7 @@ INCLUDES = \
$(LIBEXIF_CFLAGS) \
$(LIBIPTCDATA_CFLAGS) \
$(LIBGSF_CFLAGS) \
+ $(LIBWV2_CFLAGS) \
$(LIBXML2_CFLAGS) \
$(LIBPNG_CFLAGS) \
$(POPPLER_GLIB_CFLAGS) \
@@ -186,6 +186,11 @@ libextract_msoffice_la_LDFLAGS = $(module_flags)
libextract_msoffice_la_LIBADD = $(GLIB2_LIBS) $(LIBGSF_LIBS) $(GCOV_LIBS) \
$(top_builddir)/src/libtracker-common/libtracker-common.la
+if HAVE_LIBWV2
+libextract_msoffice_la_SOURCES += tracker-msword.cpp tracker-msword.h
+libextract_msoffice_la_LIBADD += $(LIBWV2_LIBS)
+endif
+
# PDF
libextract_pdf_la_SOURCES = tracker-extract-pdf.c $(xmp_sources)
libextract_pdf_la_LDFLAGS = $(module_flags)
diff --git a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
index 465153e..ba1f0de 100644
--- a/src/tracker-extract/tracker-extract-msoffice.c
+++ b/src/tracker-extract/tracker-extract-msoffice.c
@@ -39,6 +39,7 @@
#include <libtracker-common/tracker-ontology.h>
#include "tracker-main.h"
+#include "tracker-msword.h"
#define NIE_PREFIX TRACKER_NIE_PREFIX
#define NFO_PREFIX TRACKER_NFO_PREFIX
@@ -253,49 +254,6 @@ doc_metadata_cb (gpointer key,
}
}
-static gchar *
-extract_content (const gchar *uri,
- guint n_words)
-{
-#ifdef HAVE_WVWARE
-
- /* TODO, question: can't we replace this command-calling with a function
- * in libwmf-dev or something? If yes and somebody wants to contribute
- * replacing this with libwmf-dev, go ahead */
-
- gchar *path, *command, *output, *text;
- GError *error = NULL;
-
- path = g_filename_from_uri (uri, NULL, NULL);
-
- if (!path) {
- return NULL;
- }
-
- command = g_strdup_printf (WVWAREBIN " --charset utf-8 -1 -x wvText.xml %s", path);
-
- g_free (path);
-
- if (!g_spawn_command_line_sync (command, &output, NULL, NULL, &error)) {
- g_warning ("Could not extract text from '%s': %s",
- uri, error->message);
- g_error_free (error);
- g_free (command);
-
- return NULL;
- }
-
- text = tracker_text_normalize (output, n_words, NULL);
-
- g_free (command);
- g_free (output);
-
- return text;
-#else
- return NULL;
-#endif
-}
-
/**
* @brief Read 16 bit unsigned integer
* @param buffer data to read integer from
@@ -752,8 +710,10 @@ extract_summary (TrackerSparqlBuilder *metadata,
GsfInfile *infile,
const gchar *uri)
{
- gchar *content;
GsfInput *stream;
+#ifdef HAVE_LIBWV2
+ gchar *content;
+#endif
tracker_sparql_builder_subject_iri (metadata, uri);
tracker_sparql_builder_predicate (metadata, "a");
@@ -801,13 +761,16 @@ extract_summary (TrackerSparqlBuilder *metadata,
g_object_unref (stream);
}
- content = extract_content (uri, max_words());
+
+#ifdef HAVE_LIBWV2
+ content = extract_msword_content (uri, max_words ());
if (content) {
tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
tracker_sparql_builder_object_unvalidated (metadata, content);
g_free (content);
}
+#endif
}
/**
diff --git a/src/tracker-extract/tracker-msword.cpp b/src/tracker-extract/tracker-msword.cpp
new file mode 100644
index 0000000..741860a
--- /dev/null
+++ b/src/tracker-extract/tracker-msword.cpp
@@ -0,0 +1,113 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * Copyright (C) 2009, Nokia
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include "config.h"
+#include "tracker-msword.h"
+
+#include <glib.h>
+#include <glib/gprintf.h>
+
+#include <wv2/wvlog.h>
+#include <wv2/parser.h>
+#include <wv2/handlers.h>
+#include <wv2/parserfactory.h>
+#include <wv2/word97_generated.h>
+#include <wv2/ustring.h>
+
+
+extern "C" {
+#include <libtracker-common/tracker-utils.h>
+}
+
+
+using namespace wvWare;
+
+
+/*
+ * Text handler that gathers every run of text it is given into one
+ * string, in the order the runs arrive.
+ */
+class TextExtractor : public TextHandler
+{
+public:
+	/* Concatenation of all runs of text received so far. */
+	UString content;
+
+	/* Appends @text to the collected content. The character
+	 * properties are not looked at, hence the unnamed parameter. */
+	virtual void runOfText (const UString &text,
+	                        SharedPtr<const Word97::CHP> /* chp */)
+	{
+		content += text;
+	}
+};
+
+
+/*
+ * Converts a wv2 UString into normalized UTF-8 text.
+ *
+ * @ustr:    text collected from the document
+ * @n_words: word limit handed on to tracker_text_normalize()
+ *
+ * Returns the result of tracker_text_normalize(), or NULL when the
+ * text could not be converted to UTF-8 at all.
+ */
+static gchar*
+ustring2utf (const UString& ustr, guint n_words)
+{
+	const int     len = ustr.length ();
+	const UChar  *units = ustr.data ();
+	gunichar2    *utf16 = g_new (gunichar2, len + 1);
+	gchar        *unicode_str;
+	gchar        *normalized;
+
+	/* Copy the 16 bit code units out one at a time so that nothing is
+	 * assumed about the in-memory layout of UChar. The extra slot keeps
+	 * the buffer non-NULL and terminated even for an empty string. */
+	for (int i = 0; i < len; i++) {
+		utf16[i] = units[i].unicode ();
+	}
+	utf16[len] = 0;
+
+	/* Converting from UTF-16 keeps characters outside Latin-1, which
+	 * the 8 bit cstring() representation cannot carry. */
+	unicode_str = g_utf16_to_utf8 (utf16, len, NULL, NULL, NULL);
+	g_free (utf16);
+
+	if (!unicode_str) {
+		/* The units were not valid UTF-16 (for instance an unpaired
+		 * surrogate). Fall back to the 8 bit conversion so that
+		 * such documents still yield some text. */
+		CString cstring = ustr.cstring ();
+
+		unicode_str = g_convert (cstring.c_str (), cstring.length (),
+		                         "UTF-8", "ISO-8859-1",
+		                         NULL, NULL, NULL);
+	}
+
+	if (!unicode_str) {
+		return NULL;
+	}
+
+	normalized = tracker_text_normalize (unicode_str, n_words, NULL);
+	g_free (unicode_str);
+
+	return normalized;
+}
+
+/*
+ * Extracts the text content of the MS Word document found at @uri.
+ *
+ * @uri:       URI of the document; it is turned into a local path with
+ *             g_filename_from_uri()
+ * @max_words: word limit passed on to the text normalization
+ *
+ * Returns the normalized UTF-8 text, or NULL when @uri has no local
+ * path, no parser could be created for the file, parsing failed or the
+ * text could not be converted.
+ */
+gchar*
+extract_msword_content (const gchar *uri, gint max_words)
+{
+	gchar *filename = g_filename_from_uri (uri, NULL, NULL);
+
+	if (!filename) {
+		return NULL;
+	}
+
+	SharedPtr<Parser> parser (ParserFactory::createParser (filename));
+
+	if (!parser) {
+		g_free (filename);
+		return NULL;
+	}
+
+	/* Automatic storage: the handler is released on every exit path
+	 * without an explicit delete. It is declared after the parser and
+	 * is therefore destroyed before it, as with the former delete. */
+	TextExtractor extractor;
+	gchar *str = NULL;
+
+	parser->setTextHandler (&extractor);
+
+	if (parser->parse ()) {
+		str = ustring2utf (extractor.content, max_words);
+	}
+
+	g_free (filename);
+
+	return str;
+}
diff --git a/src/tracker-extract/tracker-msword.h b/src/tracker-extract/tracker-msword.h
new file mode 100644
index 0000000..cd31044
--- /dev/null
+++ b/src/tracker-extract/tracker-msword.h
@@ -0,0 +1,31 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * Copyright (C) 2009, Nokia
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __TRACKERD_MSWORD_H__
+#define __TRACKERD_MSWORD_H__
+
+#include <glib.h>
+
+G_BEGIN_DECLS
+
+/*
+ * Extracts the text content of the MS Word document at @uri, limited
+ * by @max_words. Returns NULL when no text could be extracted;
+ * otherwise the caller releases the result with g_free().
+ *
+ * Declared with C linkage so that it can be called from C code.
+ */
+gchar *extract_msword_content (const gchar *uri,
+                               gint         max_words);
+
+G_END_DECLS
+
+#endif /* __TRACKERD_MSWORD_H__ */
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]