[tracker] Add FTS support for MS and ODF document formats.
- From: Carlos Garnacho <carlosg src gnome org>
- To: svn-commits-list gnome org
- Cc:
- Subject: [tracker] Add FTS support for MS and ODF document formats.
- Date: Thu, 8 Oct 2009 16:20:23 +0000 (UTC)
commit 0d83a247e921680d7972b3648fd94c277ee054d1
Author: Carlos Garnacho <carlos lanedo com>
Date: Thu Oct 8 18:15:37 2009 +0200
Add FTS support for MS and ODF document formats.
src/tracker-extract/tracker-extract-msoffice.c | 44 +++++++++++++++++++++++-
src/tracker-extract/tracker-extract-oasis.c | 35 +++++++++++++++++++
2 files changed, 78 insertions(+), 1 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
index b1eebd8..1b1b2c8 100644
--- a/src/tracker-extract/tracker-extract-msoffice.c
+++ b/src/tracker-extract/tracker-extract-msoffice.c
@@ -208,6 +208,39 @@ doc_metadata_cb (gpointer key,
}
}
+/*
+ * extract_content:
+ * @uri: URI of the document; converted to a local path with
+ *       g_filename_from_uri().
+ * @n_words: passed unchanged to tracker_text_normalize() (presumably the
+ *           maximum number of words to keep -- confirm against that helper).
+ *
+ * Runs the external "wvWare" tool on the document and normalizes whatever
+ * the tool wrote to its standard output.
+ *
+ * Returns: the string produced by tracker_text_normalize(), or NULL when
+ * the URI cannot be converted to a local path or the command could not be
+ * spawned.
+ */
+static gchar *
+extract_content (const gchar *uri,
+                 guint        n_words)
+{
+	gchar *path, *quoted_path, *command, *text;
+	gchar *output = NULL;
+	GError *error = NULL;
+
+	path = g_filename_from_uri (uri, NULL, NULL);
+
+	if (!path) {
+		return NULL;
+	}
+
+	/* g_spawn_command_line_sync() splits the command line with shell-style
+	 * parsing, so the path must be quoted: otherwise spaces, quotes or
+	 * other special characters in the file name break it into several
+	 * arguments (or make parsing fail altogether).
+	 */
+	quoted_path = g_shell_quote (path);
+	command = g_strdup_printf ("wvWare --charset utf-8 -1 -x wvText.xml %s", quoted_path);
+
+	g_free (quoted_path);
+	g_free (path);
+
+	if (!g_spawn_command_line_sync (command, &output, NULL, NULL, &error)) {
+		g_warning ("Could not extract text from '%s': %s", uri, error->message);
+		g_error_free (error);
+		g_free (command);
+
+		return NULL;
+	}
+
+	text = tracker_text_normalize (output, n_words, NULL);
+
+	g_free (command);
+	g_free (output);
+
+	return text;
+}
+
+
static void
extract_msoffice (const gchar *uri,
TrackerSparqlBuilder *metadata)
@@ -215,7 +248,7 @@ extract_msoffice (const gchar *uri,
GsfInput *input;
GsfInfile *infile;
GsfInput *stream;
- gchar *filename;
+ gchar *filename, *content;
gboolean rdf_type_added = FALSE;
gsf_init ();
@@ -294,7 +327,16 @@ extract_msoffice (const gchar *uri,
g_object_unref (stream);
}
+ content = extract_content (uri, 1000);
+
+ if (content) {
+ tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
+ tracker_sparql_builder_object_unvalidated (metadata, content);
+ g_free (content);
+ }
+
g_object_unref (infile);
+ g_free (filename);
gsf_shutdown ();
}
diff --git a/src/tracker-extract/tracker-extract-oasis.c b/src/tracker-extract/tracker-extract-oasis.c
index ed5b200..51111a2 100644
--- a/src/tracker-extract/tracker-extract-oasis.c
+++ b/src/tracker-extract/tracker-extract-oasis.c
@@ -27,6 +27,7 @@
#include <libtracker-common/tracker-os-dependant.h>
#include <libtracker-common/tracker-statement-list.h>
#include <libtracker-common/tracker-ontology.h>
+#include <libtracker-common/tracker-utils.h>
#include "tracker-main.h"
@@ -77,6 +78,31 @@ static TrackerExtractData extract_data[] = {
{ NULL, NULL }
};
+/*
+ * extract_content:
+ * @path: local file name of the document (may be NULL, in which case
+ *        nothing is extracted).
+ * @n_words: passed unchanged to tracker_text_normalize() (presumably the
+ *           maximum number of words to keep -- confirm against that helper).
+ *
+ * Runs the external "odt2txt" tool on the document and normalizes whatever
+ * the tool wrote to its standard output.
+ *
+ * Returns: the string produced by tracker_text_normalize(), or NULL when
+ * @path is NULL or the command could not be spawned.
+ */
+static gchar *
+extract_content (const gchar *path,
+                 guint        n_words)
+{
+	gchar *quoted_path, *command, *text;
+	gchar *output = NULL;
+	GError *error = NULL;
+
+	/* The caller passes the result of g_filename_from_uri() without
+	 * checking it, so a NULL path can reach this function.
+	 */
+	if (!path) {
+		return NULL;
+	}
+
+	/* g_spawn_command_line_sync() splits the command line with shell-style
+	 * parsing, so the path must be quoted: otherwise spaces, quotes or
+	 * other special characters in the file name break it into several
+	 * arguments (or make parsing fail altogether).
+	 */
+	quoted_path = g_shell_quote (path);
+	command = g_strdup_printf ("odt2txt --encoding=utf-8 %s", quoted_path);
+
+	g_free (quoted_path);
+
+	if (!g_spawn_command_line_sync (command, &output, NULL, NULL, &error)) {
+		g_warning ("Could not extract text from '%s': %s", path, error->message);
+		g_error_free (error);
+		g_free (command);
+
+		return NULL;
+	}
+
+	text = tracker_text_normalize (output, n_words, NULL);
+
+	g_free (command);
+	g_free (output);
+
+	return text;
+}
+
+
static void
extract_oasis (const gchar *uri,
TrackerSparqlBuilder *metadata)
@@ -84,6 +110,7 @@ extract_oasis (const gchar *uri,
gchar *argv[5];
gchar *xml;
gchar *filename = g_filename_from_uri (uri, NULL, NULL);
+ gchar *content;
ODTParseInfo info = {
metadata,
-1,
@@ -117,6 +144,14 @@ extract_oasis (const gchar *uri,
g_free (xml);
}
+ content = extract_content (filename, 1000);
+
+ if (content) {
+ tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
+ tracker_sparql_builder_object_unvalidated (metadata, content);
+ g_free (content);
+ }
+
g_free (argv[3]);
g_free (argv[1]);
g_free (argv[0]);
[Date Prev][Date Next] [Thread Prev][Thread Next]
[Thread Index] [Date Index] [Author Index]