[tracker] tracker-extract: Added FTS support for text files
- From: Martyn James Russell <mr src gnome org>
- To: svn-commits-list gnome org
- Cc:
- Subject: [tracker] tracker-extract: Added FTS support for text files
- Date: Tue, 8 Sep 2009 11:39:00 +0000 (UTC)
commit c53a2e5f7a329d937cbc2e43217c131b82dc4ab5
Author: Martyn Russell <martyn lanedo com>
Date: Tue Sep 8 12:37:40 2009 +0100
tracker-extract: Added FTS support for text files
src/tracker-extract/Makefile.am | 8 +-
src/tracker-extract/tracker-extract-text.c | 326 ++++++++++++++++++++++++++++
2 files changed, 333 insertions(+), 1 deletions(-)
---
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index 5e6f457..792e124 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -41,7 +41,8 @@ modules_LTLIBRARIES = \
libextract-mp3.la \
libextract-oasis.la \
libextract-png.la \
- libextract-ps.la
+ libextract-ps.la \
+ libextract-text.la
if HAVE_LIBVORBIS
modules_LTLIBRARIES += libextract-vorbis.la
@@ -230,6 +231,11 @@ libextract_playlist_la_LDFLAGS = $(module_flags)
libextract_playlist_la_LIBADD = $(GLIB2_LIBS) $(TOTEM_PL_PARSER_LIBS) $(GCOV_LIBS) \
$(top_builddir)/src/libtracker-common/libtracker-common.la
+# Text
+libextract_text_la_SOURCES = tracker-extract-text.c
+libextract_text_la_LDFLAGS = $(module_flags)
+libextract_text_la_LIBADD = $(GLIB2_LIBS) $(GIO_LIBS) $(GCOV_LIBS) \
+ $(top_builddir)/src/libtracker-common/libtracker-common.la
#
# Binaries
diff --git a/src/tracker-extract/tracker-extract-text.c b/src/tracker-extract/tracker-extract-text.c
new file mode 100644
index 0000000..95689a9
--- /dev/null
+++ b/src/tracker-extract/tracker-extract-text.c
@@ -0,0 +1,326 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * Copyright (C) 2008, Nokia
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include <glib.h>
+
+#include <libtracker-common/tracker-statement-list.h>
+
+#include "tracker-main.h"
+
+#undef TRY_LOCALE_TO_UTF8_CONVERSION
+
+#define TEXT_MAX_SIZE 1048576 /* bytes */
+#define TEXT_CHECK_SIZE 65535 /* bytes */
+
+#if 0
+
+typedef struct { /* Compiled out by the enclosing #if 0; only the likewise disabled part of extract_text() uses it. */
+ GMainLoop *main_loop; /* created, run and unreffed by the disabled code in extract_text() */
+ GString *data; /* its character data becomes the extracted content in that disabled code */
+ gchar *uri; /* g_strdup() copy of the URI passed to extract_text() */
+ TrackerSparqlBuilder *metadata; /* extra reference on the builder passed to extract_text() */
+} ContentData;
+
+#endif
+
+static void extract_text (const gchar *uri,
+ TrackerSparqlBuilder *metadata);
+
+static TrackerExtractData data[] = { /* MIME types handled by this module; every entry maps to extract_text() */
+ { "text/plain", extract_text },
+ { "text/x-authors", extract_text },
+ { "text/x-changelog", extract_text },
+ { "text/x-copying", extract_text },
+ { "text/x-credits", extract_text },
+ { "text/x-install", extract_text },
+ { "text/x-readme", extract_text },
+ { NULL, NULL } /* last entry of the table, returned as-is by tracker_get_extract_data() */
+};
+
+/*
+ * Checks whether @s holds UTF-8 text, tolerating a final multi-byte
+ * character that was cut short when the input was read.
+ *
+ * @s: buffer to inspect.
+ * @bytes_valid: set to the length, in bytes, of the longest valid
+ *               UTF-8 prefix of @s.
+ *
+ * Returns: TRUE when the whole buffer validates, or when the only
+ * unvalidated part is a tail of at most 4 bytes that
+ * g_utf8_get_char_validated() does not reject as invalid; FALSE
+ * otherwise.
+ */
+static gboolean
+get_file_is_utf8 (GString *s,
+                  gssize  *bytes_valid)
+{
+	const gchar *end;
+	gssize       bytes_left;
+
+	/* Check for UTF-8 validity, since we may
+	 * have cut off the end.
+	 */
+	if (g_utf8_validate (s->str, s->len, &end)) {
+		*bytes_valid = (gssize) s->len;
+		return TRUE;
+	}
+
+	*bytes_valid = end - s->str;
+
+	/* From here on only the unvalidated tail matters, not the
+	 * size of the valid prefix in front of it.
+	 */
+	bytes_left = (gssize) s->len - *bytes_valid;
+
+	/* 4 is the maximum bytes for a UTF-8 character, so a longer
+	 * tail cannot be a single truncated character.
+	 */
+	if (bytes_left > 4) {
+		return FALSE;
+	}
+
+	/* (gunichar) -1 means the tail is invalid rather than merely
+	 * incomplete.
+	 */
+	if (g_utf8_get_char_validated (end, bytes_left) == (gunichar) -1) {
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
+#ifdef TRY_LOCALE_TO_UTF8_CONVERSION
+
+/*
+ * Converts the contents of @s from the current locale's encoding to
+ * UTF-8 in place.
+ *
+ * On success the contents of @s are replaced by the converted text.
+ * On failure the problem is logged and @s is left untouched.
+ *
+ * Returns: @s, in both cases.
+ */
+static GString *
+get_file_in_locale (GString *s)
+{
+	GError *error = NULL;
+	gchar  *str;
+	/* Zeroed so the debug output below never prints indeterminate
+	 * values if the conversion fails before setting them.
+	 */
+	gsize   bytes_read = 0;
+	gsize   bytes_written = 0;
+
+	str = g_locale_to_utf8 (s->str,
+	                        s->len,
+	                        &bytes_read,
+	                        &bytes_written,
+	                        &error);
+	if (error) {
+		/* Both counters are gsize, so they need G_GSIZE_FORMAT. */
+		g_debug (" Conversion to UTF-8 read %" G_GSIZE_FORMAT " bytes, wrote %" G_GSIZE_FORMAT " bytes",
+		         bytes_read,
+		         bytes_written);
+		g_message ("Could not convert file from locale to UTF-8, %s",
+		           error->message);
+		g_error_free (error);
+		g_free (str);
+	} else {
+		g_string_assign (s, str);
+		g_free (str);
+	}
+
+	return s;
+}
+
+#endif /* TRY_LOCALE_TO_UTF8_CONVERSION */
+
+/*
+ * Reads the text of the file at @uri for indexing.
+ *
+ * The file is read in chunks of TEXT_CHECK_SIZE - 1 bytes until the
+ * end of the stream, or until at least TEXT_MAX_SIZE bytes have been
+ * collected. The file is rejected when its first chunk fills the
+ * whole buffer without containing a '\n', or when it holds fewer than
+ * 3 bytes. The collected data is cut back to its longest valid UTF-8
+ * prefix (unless TRY_LOCALE_TO_UTF8_CONVERSION is defined, in which
+ * case non UTF-8 data is converted from the locale encoding instead).
+ *
+ * Returns: a newly allocated NUL-terminated string to be released
+ * with g_free(), or NULL if the file could not be opened or read,
+ * was rejected, or yielded no text.
+ */
+static gchar *
+get_file_content (const gchar *uri)
+{
+	GFile            *file;
+	GFileInputStream *stream;
+	GError           *error = NULL;
+	GString          *s;
+	gssize            bytes;
+	gssize            bytes_valid;
+	gssize            bytes_read_total;
+	gssize            buf_size;
+	gchar             buf[TEXT_CHECK_SIZE];
+	gboolean          has_more_data;
+	gboolean          has_reached_max;
+	gboolean          is_utf8;
+
+	file = g_file_new_for_uri (uri);
+	stream = g_file_read (file, NULL, &error);
+
+	if (error) {
+		g_message ("Could not get read file:'%s', %s",
+		           uri,
+		           error->message);
+		g_error_free (error);
+		g_object_unref (file);
+
+		return NULL;
+	}
+
+	s = g_string_new ("");
+	has_reached_max = FALSE;
+	has_more_data = TRUE;
+	bytes_read_total = 0;
+
+	/* Leave space in buf for the terminating NUL byte. */
+	buf_size = TEXT_CHECK_SIZE - 1;
+
+	g_debug (" Starting read...");
+
+	while (has_more_data && !has_reached_max) {
+		gssize bytes_read;
+		gssize bytes_remaining;
+
+		bytes_remaining = buf_size;
+		bytes_read = 0;
+
+		/* Fill the buffer. A single read may return fewer bytes
+		 * than requested, so keep appending after what we
+		 * already have instead of overwriting it.
+		 */
+		while (bytes_remaining > 0) {
+			bytes = g_input_stream_read (G_INPUT_STREAM (stream),
+			                             buf + bytes_read,
+			                             bytes_remaining,
+			                             NULL,
+			                             &error);
+
+			g_debug (" Read %" G_GSSIZE_FORMAT " bytes", bytes);
+
+			/* 0 is end of stream, -1 is an error (with error
+			 * set); neither must be added to the counters.
+			 */
+			if (bytes <= 0) {
+				break;
+			}
+
+			bytes_read += bytes;
+			bytes_remaining -= bytes;
+		}
+
+		if (error) {
+			break;
+		}
+
+		/* Set the NUL termination after the last byte read so
+		 * strchr() below stays inside the data of this chunk.
+		 */
+		buf[bytes_read] = '\0';
+
+		/* First of all, check if this is the first time we
+		 * have tried to read the file up to the TEXT_CHECK_SIZE
+		 * limit. Then make sure that we read the maximum size
+		 * of the buffer. If we don't do this, there is the
+		 * case where we read 10 bytes in and it is just one
+		 * line with no '\n'. Once we have confirmed this we
+		 * check that the buffer has a '\n' to make sure the
+		 * file is worth indexing. Similarly if the file has
+		 * fewer than 3 bytes then we drop it.
+		 */
+		if (bytes_read_total == 0) {
+			if (bytes_read == buf_size &&
+			    strchr (buf, '\n') == NULL) {
+				g_debug (" No '\\n' in the first %" G_GSSIZE_FORMAT " bytes, not indexing file",
+				         buf_size);
+				break;
+			} else if (bytes_read <= 2) {
+				g_debug (" File has less than 3 characters in it, not indexing file");
+				break;
+			}
+		}
+
+		/* Here we increment the bytes read total to evaluate
+		 * the next states. We don't do this before the
+		 * previous condition so we can know when we have
+		 * iterated > 1.
+		 */
+		bytes_read_total += bytes_read;
+
+		/* A chunk that did not fill the buffer means the stream
+		 * is exhausted.
+		 */
+		if (bytes_read != buf_size) {
+			has_more_data = FALSE;
+		}
+
+		if (bytes_read_total >= TEXT_MAX_SIZE) {
+			has_reached_max = TRUE;
+		}
+
+		g_debug (" Read "
+		         "%" G_GSSIZE_FORMAT " bytes total, "
+		         "%" G_GSSIZE_FORMAT " bytes this time, "
+		         "more data:%s, reached max:%s",
+		         bytes_read_total,
+		         bytes_read,
+		         has_more_data ? "yes" : "no",
+		         has_reached_max ? "yes" : "no");
+
+		/* Append only the data itself. GString keeps its own
+		 * terminator, and a NUL embedded between chunks would
+		 * make the UTF-8 validation below stop at the end of
+		 * the first chunk.
+		 */
+		s = g_string_append_len (s, buf, bytes_read);
+	}
+
+	if (has_reached_max) {
+		g_debug (" Maximum indexable limit reached");
+	}
+
+	if (error) {
+		g_message ("Could not read input stream for:'%s', %s",
+		           uri,
+		           error->message);
+		g_error_free (error);
+		g_string_free (s, TRUE);
+		g_object_unref (stream);
+		g_object_unref (file);
+
+		return NULL;
+	}
+
+	/* Check for UTF-8 validity, if not try to convert it to the
+	 * locale we are in.
+	 */
+	is_utf8 = get_file_is_utf8 (s, &bytes_valid);
+
+	/* In the case where the string is valid UTF-8 up to the last
+	 * character which was cut off, truncate to the last valid
+	 * character.
+	 */
+#ifdef TRY_LOCALE_TO_UTF8_CONVERSION
+	if (!is_utf8) {
+		s = get_file_in_locale (s);
+	} else {
+		g_debug (" Truncating to last valid UTF-8 character (%" G_GSSIZE_FORMAT "/%" G_GSIZE_FORMAT " bytes)",
+		         bytes_valid,
+		         s->len);
+		s = g_string_truncate (s, bytes_valid);
+	}
+#else   /* TRY_LOCALE_TO_UTF8_CONVERSION */
+	/* s->len is a gsize, hence G_GSIZE_FORMAT for the second value. */
+	g_debug (" Truncating to last valid UTF-8 character (%" G_GSSIZE_FORMAT "/%" G_GSIZE_FORMAT " bytes)",
+	         bytes_valid,
+	         s->len);
+	s = g_string_truncate (s, bytes_valid);
+#endif  /* TRY_LOCALE_TO_UTF8_CONVERSION */
+
+	g_object_unref (stream);
+	g_object_unref (file);
+
+	if (s->len < 1) {
+		g_string_free (s, TRUE);
+		return NULL;
+	}
+
+	/* Hand the character data over to the caller. */
+	return g_string_free (s, FALSE);
+}
+
+/*
+ * Extractor callback registered in data[] for the text MIME types.
+ *
+ * Reads the file at @uri and, when it yields text, describes it in
+ * @metadata as the nie:plainTextContent of @uri. Nothing is added to
+ * @metadata when no content could be obtained.
+ *
+ * @uri: URI of the file to extract.
+ * @metadata: builder receiving the statements.
+ */
+static void
+extract_text (const gchar          *uri,
+              TrackerSparqlBuilder *metadata)
+{
+	gchar *content;
+
+	g_type_init ();
+
+#if 0
+	ContentData *cd;
+
+	cd = g_slice_new0 (ContentData);
+
+	cd->main_loop = g_main_loop_new (NULL, FALSE);
+	cd->data = g_string_new (NULL);
+	cd->uri = g_strdup (uri);
+	cd->metadata = g_object_ref (metadata);
+
+	g_main_loop_run (cd->main_loop);
+	g_main_loop_unref (cd->main_loop);
+
+	content = g_string_free (cd->data, FALSE);
+	g_slice_free (ContentData, cd);
+#endif
+
+	content = get_file_content (uri);
+
+	/* get_file_content() returns NULL for files it could not read
+	 * or decided not to index. Leave the builder untouched in that
+	 * case rather than emitting a predicate with a NULL object.
+	 * NOTE(review): assumes callers cope with a builder that
+	 * received no statements - confirm against tracker-extract.
+	 */
+	if (!content) {
+		return;
+	}
+
+	tracker_sparql_builder_subject_iri (metadata, uri);
+	tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
+	tracker_sparql_builder_object_unvalidated (metadata, content);
+	g_free (content);
+}
+
+TrackerExtractData *
+tracker_get_extract_data (void) /* Exposes this module's MIME type table. */
+{
+ return data; /* the file-static data[] array, whose last entry is { NULL, NULL } */
+}
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]