tracker r2121 - in branches/indexer-split: . src/libtracker-common
- From: jamiemcc svn gnome org
- To: svn-commits-list gnome org
- Subject: tracker r2121 - in branches/indexer-split: . src/libtracker-common
- Date: Thu, 21 Aug 2008 03:15:01 +0000 (UTC)
Author: jamiemcc
Date: Thu Aug 21 03:15:01 2008
New Revision: 2121
URL: http://svn.gnome.org/viewvc/tracker?rev=2121&view=rev
Log:
2008-08-20 Jamie McCracken <jamiemcc at gnome org>
* Added new API for tracker-parser so that it will be compatible with sqlite FTS3
Modified:
branches/indexer-split/ChangeLog
branches/indexer-split/src/libtracker-common/tracker-parser.c
branches/indexer-split/src/libtracker-common/tracker-parser.h
Modified: branches/indexer-split/src/libtracker-common/tracker-parser.c
==============================================================================
--- branches/indexer-split/src/libtracker-common/tracker-parser.c (original)
+++ branches/indexer-split/src/libtracker-common/tracker-parser.c Thu Aug 21 03:15:01 2008
@@ -21,11 +21,6 @@
#include <string.h>
-#ifdef HAVE_UNAC
-#include <unac.h>
-#endif
-
-#include <pango/pango.h>
#include "tracker-parser.h"
#include "tracker-log.h"
@@ -48,6 +43,7 @@
#define IS_ASCII_IGNORE(c) ((c) <= 0x002C)
#define IS_HYPHEN(c) ((c) == 0x002D)
#define IS_UNDERSCORE(c) ((c) == 0x005F)
+#define IS_NEWLINE(c) ((c) == 0x000D)
typedef enum {
TRACKER_PARSER_WORD_ASCII_HIGHER,
@@ -59,12 +55,14 @@
TRACKER_PARSER_WORD_ALPHA_LOWER,
TRACKER_PARSER_WORD_ALPHA,
TRACKER_PARSER_WORD_ALPHA_NUM,
- TRACKER_PARSER_WORD_IGNORE
+ TRACKER_PARSER_WORD_IGNORE,
+ TRACKER_PARSER_WORD_NEWLINE
} TrackerParserWordType;
static inline TrackerParserWordType
get_word_type (gunichar c)
{
+
/* Fast ascii handling */
if (IS_ASCII (c)) {
if (IS_ASCII_ALPHA_LOWER (c)) {
@@ -90,6 +88,10 @@
if (IS_UNDERSCORE (c)) {
return TRACKER_PARSER_WORD_UNDERSCORE;
}
+
+ if (IS_NEWLINE (c)) {
+ return TRACKER_PARSER_WORD_NEWLINE;
+ }
} else {
if (g_unichar_isalpha (c)) {
if (!g_unichar_isupper (c)) {
@@ -124,6 +126,8 @@
#endif
}
+
+
static gboolean
text_needs_pango (const gchar *text)
{
@@ -144,7 +148,37 @@
}
}
- return FALSE;
+ return FALSE;
+}
+
+
+
+static TrackerParserEncoding
+get_encoding (const char *txt)
+{
+	/* Classify @txt by the first non-ASCII character found while
+	 * scanning at most 255 non-whitespace characters. Text with no
+	 * such character in that window is reported as ASCII.
+	 */
+	const gchar *pos = txt;
+	gint         seen = 0;
+
+	while (*pos != '\0' && seen < 255) {
+		gunichar ch = g_utf8_get_char (pos);
+
+		if (!IS_ASCII (ch)) {
+			/* The first non-ASCII character decides the result. */
+			if (IS_LATIN (ch)) {
+				return TRACKER_PARSER_ENCODING_LATIN;
+			}
+
+			if (NEED_PANGO (ch)) {
+				return TRACKER_PARSER_ENCODING_CJK;
+			}
+
+			return TRACKER_PARSER_ENCODING_OTHER;
+		}
+
+		/* Whitespace does not count towards the 255 character budget. */
+		if (!g_unichar_isspace (ch)) {
+			seen++;
+		}
+
+		pos = g_utf8_next_char (pos);
+	}
+
+	return TRACKER_PARSER_ENCODING_ASCII;
}
static gboolean
@@ -153,6 +187,8 @@
{
GHashTable *stop_words;
+ if (!word) return FALSE;
+
stop_words = tracker_language_get_stop_words (language);
return g_hash_table_lookup (stop_words, word) != NULL;
@@ -197,7 +233,7 @@
c = g_utf8_get_char (p);
type = get_word_type (c);
- if (type == TRACKER_PARSER_WORD_IGNORE ||
+ if (type == TRACKER_PARSER_WORD_IGNORE || type == TRACKER_PARSER_WORD_NEWLINE ||
(delimit_hyphen &&
(type == TRACKER_PARSER_WORD_HYPHEN ||
type == TRACKER_PARSER_WORD_UNDERSCORE))) {
@@ -307,7 +343,8 @@
return p;
}
- if (do_strip) {
+ if (do_strip && get_encoding (utf8) == TRACKER_PARSER_ENCODING_LATIN) {
+
stripped_word = strip_word (utf8, bytes, &len);
} else {
stripped_word = NULL;
@@ -339,6 +376,489 @@
return p;
}
+
+
+/* Allocate a parser bound to @language.
+ *
+ * @max_word_length and @min_word_length are stored as given and applied
+ * by parser_next(). Every other field starts out zeroed, so a parser that
+ * has not yet been given text through tracker_parser_reset() has a NULL
+ * cursor and yields no words. Release with tracker_parser_free().
+ */
+TrackerParser *
+tracker_parser_new (TrackerLanguage *language,
+		    gint	     max_word_length,
+		    gint	     min_word_length)
+{
+	/* g_new0 rather than g_new: txt, cursor, encoding, the attr_*
+	 * members and the option flags must not hold indeterminate values
+	 * if the parser is used or freed before the first reset.
+	 */
+	TrackerParser *parser = g_new0 (TrackerParser, 1);
+
+	parser->language = language;
+	parser->max_word_length = max_word_length;
+	parser->min_word_length = min_word_length;
+
+	return parser;
+}
+
+/* Point @parser at a new text and rewind it to the first word.
+ *
+ * @txt is stored, not copied, so it must stay valid while the parser
+ * uses it. @txt_size is handed to g_utf8_strlen() and
+ * pango_get_log_attrs() as the byte length of @txt. The three flags are
+ * stored for the word-extraction functions. For text classified as CJK
+ * a PangoLogAttr array is computed up front; for anything else no
+ * attribute array is kept.
+ */
+void
+tracker_parser_reset (TrackerParser *parser,
+		      const gchar   *txt,
+		      gint	     txt_size,
+		      gboolean	     delimit_words,
+		      gboolean	     enable_stemmer,
+		      gboolean	     enable_stop_words)
+{
+	g_return_if_fail (parser != NULL);
+	g_return_if_fail (txt != NULL);
+
+	/* Drop the attributes of the previous text and clear the pointer.
+	 * Without this, a non-CJK reset after a CJK one would leave attrs
+	 * dangling and the next reset or free would release it twice.
+	 */
+	g_free (parser->attrs);
+	parser->attrs = NULL;
+	parser->attr_length = 0;
+	parser->attr_pos = 0;
+
+	parser->enable_stemmer = enable_stemmer;
+	parser->enable_stop_words = enable_stop_words;
+	parser->delimit_words = delimit_words;
+	parser->encoding = get_encoding (txt);
+	parser->txt_size = txt_size;
+	parser->txt = txt;
+
+	parser->word_position = 0;
+	parser->cursor = txt;
+
+	if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
+		/* pango_get_log_attrs() needs one entry per character
+		 * plus one for the position after the last character.
+		 */
+		parser->attr_length = g_utf8_strlen (parser->txt, parser->txt_size) + 1;
+		parser->attrs = g_new0 (PangoLogAttr, parser->attr_length);
+
+		pango_get_log_attrs (parser->txt,
+				     txt_size,
+				     0,
+				     pango_language_from_string ("C"),
+				     parser->attrs,
+				     parser->attr_length);
+	}
+}
+
+
+
+/* Return the next word of CJK text using the precomputed Pango
+ * attributes, or NULL when no further word is found.
+ *
+ * Scanning resumes at parser->attr_pos. The first @skip_words words are
+ * counted but not returned. The returned string is case-folded and NFC
+ * normalised and is owned by the caller (g_free). On success the byte
+ * range of the word inside parser->txt is stored in @byte_offset_start
+ * and @byte_offset_end; on the NULL path both are set to 0.
+ * @is_new_paragraph is set when the attribute entry just before the word
+ * is flagged as a sentence boundary.
+ */
+static gchar *
+pango_next (TrackerParser *parser,
+	    guint	   skip_words,
+	    guint	  *byte_offset_start,
+	    guint	  *byte_offset_end,
+	    gboolean	  *is_new_paragraph)
+{
+	/* CJK text does not need stemming or other treatment */
+	int	word_start = -1;
+	int	old_word_start = -1;
+	guint	words_parsed = 0;
+	guint32 i;
+
+	/* Give the out-parameters defined values on every path; the caller
+	 * copies them out even when no word is returned.
+	 */
+	*byte_offset_start = 0;
+	*byte_offset_end = 0;
+	*is_new_paragraph = FALSE;
+
+	for (i = parser->attr_pos; i < parser->attr_length; i++) {
+		gchar *start_word, *end_word;
+		gchar *str;
+		gchar *index_word;
+
+		if (parser->attrs[i].is_word_start) {
+			word_start = i;
+			continue;
+		}
+
+		/* Only act on a word end that closes a word start we have
+		 * not reported yet.
+		 */
+		if (!parser->attrs[i].is_word_end || word_start == old_word_start) {
+			continue;
+		}
+
+		old_word_start = word_start;
+		words_parsed++;
+
+		if (words_parsed <= skip_words) {
+			continue;
+		}
+
+		start_word = g_utf8_offset_to_pointer (parser->txt, word_start);
+		end_word = g_utf8_offset_to_pointer (parser->txt, i);
+
+		if (start_word == end_word) {
+			/* Empty span: treat this position as a new start. */
+			word_start = i;
+			continue;
+		}
+
+		/* Normalize word */
+		str = g_utf8_casefold (start_word, end_word - start_word);
+		if (!str) {
+			continue;
+		}
+
+		index_word = g_utf8_normalize (str, -1, G_NORMALIZE_NFC);
+		g_free (str);
+
+		if (!index_word) {
+			continue;
+		}
+
+		if (word_start > 1 && parser->attrs[word_start - 1].is_sentence_boundary) {
+			*is_new_paragraph = TRUE;
+		}
+
+		*byte_offset_start = (start_word - parser->txt);
+		*byte_offset_end = *byte_offset_start + (end_word - start_word);
+		parser->attr_pos = i;
+
+		return index_word;
+	}
+
+	parser->attr_pos = i;
+
+	return NULL;
+}
+
+/* Normalise one word for indexing or lookup.
+ *
+ * @word is @length bytes long, or NUL-terminated when @length is -1.
+ * When @do_strip is set and the word classifies as Latin it is first run
+ * through strip_word(). The result is NFC normalised and, if the parser
+ * has stemming enabled, replaced by its stem when the language provides
+ * one.
+ *
+ * Returns a newly allocated string (free with g_free), or NULL when
+ * @word is NULL or normalisation fails.
+ */
+static gchar *
+tracker_parser_process_word (TrackerParser *parser, const char *word, gint length, gboolean do_strip)
+{
+	guint	    bytes, len;
+	char	   *str;
+	char	   *stripped_word = NULL;
+	const char *stem_word;
+
+	if (!word) {
+		return NULL;
+	}
+
+	if (length == -1) {
+		bytes = strlen (word);
+	} else {
+		bytes = length;
+	}
+
+	if (do_strip && get_encoding (word) == TRACKER_PARSER_ENCODING_LATIN) {
+		stripped_word = strip_word (word, bytes, &len);
+	}
+
+	if (stripped_word) {
+		str = g_utf8_normalize (stripped_word, len, G_NORMALIZE_NFC);
+		g_free (stripped_word);
+	} else {
+		str = g_utf8_normalize (word, bytes, G_NORMALIZE_NFC);
+	}
+
+	/* g_utf8_normalize() yields NULL for invalid UTF-8; stop here so
+	 * strlen() below is never handed a NULL pointer.
+	 */
+	if (!str || !parser->enable_stemmer) {
+		return str;
+	}
+
+	len = strlen (str);
+	stem_word = tracker_language_stem_word (parser->language, str, len);
+
+	if (stem_word) {
+		char *result = g_strdup (stem_word);
+
+		g_free (str);
+
+		return result;
+	}
+
+	return str;
+}
+
+/* Return the next indexable word of non-CJK text, or NULL when the text
+ * is exhausted.
+ *
+ * Scanning resumes at parser->cursor. Characters classified as ignorable
+ * or newline end a word, as do hyphen and underscore when
+ * parser->delimit_words is set. At a word break a candidate is dropped if
+ * it started with a digit or hyphen, is shorter than
+ * parser->min_word_length, or consists only of digits. The first
+ * @skip_words acceptable words are counted and dropped as well. At most
+ * parser->max_word_length characters of a word are kept (and never more
+ * than the local buffer holds). Kept characters are lower-cased before
+ * the word goes through tracker_parser_process_word().
+ *
+ * On success @byte_offset_start / @byte_offset_end receive the byte
+ * range [start, end) of the word inside parser->txt; otherwise they keep
+ * the placeholder values 1 and 2. @is_new_paragraph reports whether a
+ * newline character was seen since the last dropped word.
+ *
+ * The returned string is owned by the caller (g_free).
+ *
+ * NOTE(review): a word that runs up to the end of the text is returned
+ * without the min_word_length / skip_words checks applied at a word
+ * break - confirm whether that is intended.
+ */
+static gchar *
+parser_next (TrackerParser *parser,
+	     guint	    skip_words,
+	     guint	   *byte_offset_start,
+	     guint	   *byte_offset_end,
+	     gboolean	   *is_new_paragraph)
+{
+	TrackerParserWordType word_type;
+	gunichar	      word[64];
+	gboolean	      is_valid;
+	gboolean	      filter_numbers = TRUE;
+	guint		      length;
+	glong		      bytes;
+	const char	     *p;
+	const char	     *start;
+	const char	     *end;
+	guint		      words_skipped = 0;
+	gboolean	      do_strip = FALSE;
+
+	*byte_offset_start = 1;
+	*byte_offset_end = 2;
+	*is_new_paragraph = FALSE;
+
+	g_return_val_if_fail (parser, NULL);
+
+	if (!parser->cursor) {
+		return NULL;
+	}
+
+	word_type = TRACKER_PARSER_WORD_IGNORE;
+	is_valid = TRUE;
+	length = 0;
+	bytes = 0;
+
+	start = NULL;
+	end = NULL;
+
+	for (p = parser->cursor; *p; p = g_utf8_next_char (p)) {
+		TrackerParserWordType type;
+		gunichar	      c;
+
+		c = g_utf8_get_char (p);
+		type = get_word_type (c);
+
+		if (type == TRACKER_PARSER_WORD_NEWLINE) {
+			*is_new_paragraph = TRUE;
+		}
+
+		if (type == TRACKER_PARSER_WORD_IGNORE ||
+		    type == TRACKER_PARSER_WORD_NEWLINE ||
+		    (parser->delimit_words &&
+		     (type == TRACKER_PARSER_WORD_HYPHEN ||
+		      type == TRACKER_PARSER_WORD_UNDERSCORE))) {
+			if (!start) {
+				/* Delimiter before any word: keep looking. */
+				continue;
+			}
+
+			/* word break */
+			if (!is_valid ||
+			    length < parser->min_word_length ||
+			    word_type == TRACKER_PARSER_WORD_NUM ||
+			    words_skipped < skip_words) {
+				*is_new_paragraph = FALSE;
+
+				/* Only words that would otherwise have been
+				 * returned count towards skip_words.
+				 */
+				if (is_valid &&
+				    length >= parser->min_word_length &&
+				    word_type != TRACKER_PARSER_WORD_NUM &&
+				    words_skipped < skip_words) {
+					words_skipped++;
+				}
+
+				word_type = TRACKER_PARSER_WORD_IGNORE;
+				is_valid = TRUE;
+				length = 0;
+				bytes = 0;
+				start = NULL;
+				end = NULL;
+				do_strip = FALSE;
+				continue;
+			}
+
+			/* The word ends where its delimiter begins. */
+			end = p;
+			break;
+		}
+
+		if (!is_valid) {
+			continue;
+		}
+
+		if (!start) {
+			/* First character of the word. This was previously
+			 * derived from an already incremented character
+			 * count and pointed one character too far.
+			 */
+			start = p;
+
+			/* Valid words must start with an alpha or
+			 * underscore if we are filtering.
+			 */
+			if (filter_numbers) {
+				if (type == TRACKER_PARSER_WORD_NUM ||
+				    type == TRACKER_PARSER_WORD_HYPHEN) {
+					is_valid = FALSE;
+					continue;
+				}
+			}
+		}
+
+		/* Truncate over-long words. The second test keeps writes
+		 * inside word[] even if max_word_length exceeds its size.
+		 */
+		if (length >= parser->max_word_length ||
+		    length >= G_N_ELEMENTS (word)) {
+			continue;
+		}
+
+		length++;
+
+		switch (type) {
+		case TRACKER_PARSER_WORD_ASCII_HIGHER:
+			c += 32;
+			/* fall through */
+
+		case TRACKER_PARSER_WORD_ASCII_LOWER:
+		case TRACKER_PARSER_WORD_HYPHEN:
+		case TRACKER_PARSER_WORD_UNDERSCORE:
+			if (word_type == TRACKER_PARSER_WORD_NUM ||
+			    word_type == TRACKER_PARSER_WORD_ALPHA_NUM) {
+				word_type = TRACKER_PARSER_WORD_ALPHA_NUM;
+			} else {
+				word_type = TRACKER_PARSER_WORD_ALPHA;
+			}
+
+			break;
+
+		case TRACKER_PARSER_WORD_NUM:
+			if (word_type == TRACKER_PARSER_WORD_ALPHA ||
+			    word_type == TRACKER_PARSER_WORD_ALPHA_NUM) {
+				word_type = TRACKER_PARSER_WORD_ALPHA_NUM;
+			} else {
+				word_type = TRACKER_PARSER_WORD_NUM;
+			}
+
+			break;
+
+		case TRACKER_PARSER_WORD_ALPHA_HIGHER:
+			c = g_unichar_tolower (c);
+			/* fall through */
+
+		case TRACKER_PARSER_WORD_ALPHA_LOWER:
+			/* Non-ASCII letters make the word a candidate for
+			 * strip_word() later on.
+			 */
+			do_strip = TRUE;
+
+			if (word_type == TRACKER_PARSER_WORD_NUM ||
+			    word_type == TRACKER_PARSER_WORD_ALPHA_NUM) {
+				word_type = TRACKER_PARSER_WORD_ALPHA_NUM;
+			} else {
+				word_type = TRACKER_PARSER_WORD_ALPHA;
+			}
+
+			break;
+
+		default:
+			break;
+		}
+
+		word[length - 1] = c;
+	}
+
+	if (!is_valid) {
+		parser->cursor = NULL;
+		return NULL;
+	}
+
+	if (end) {
+		/* Stopped on a delimiter: resume right after it. */
+		parser->cursor = g_utf8_next_char (end);
+	} else {
+		/* Ran off the end of the text: p sits on the terminator. */
+		end = p;
+		parser->cursor = p;
+	}
+
+	if (word_type == TRACKER_PARSER_WORD_ALPHA_NUM ||
+	    word_type == TRACKER_PARSER_WORD_ALPHA) {
+		gchar *utf8;
+		gchar *processed_word;
+
+		utf8 = g_ucs4_to_utf8 (word, length, NULL, &bytes, NULL);
+
+		if (!utf8) {
+			return NULL;
+		}
+
+		*byte_offset_start = start - parser->txt;
+		*byte_offset_end = end - parser->txt;
+
+		processed_word = tracker_parser_process_word (parser, utf8, bytes, do_strip);
+
+		g_free (utf8);
+
+		return processed_word;
+	}
+
+	return NULL;
+}
+
+/* Tell whether @word is a stop word in the parser's language.
+ *
+ * Words classified as CJK are never stop words. Any other word is put
+ * through tracker_parser_process_word() with stripping enabled before
+ * the lookup. Returns FALSE for a NULL @parser or @word.
+ */
+gboolean
+tracker_parser_is_stop_word (TrackerParser *parser, const gchar *word)
+{
+	gchar	 *processed_word;
+	gboolean  result;
+
+	g_return_val_if_fail (parser != NULL, FALSE);
+	g_return_val_if_fail (word != NULL, FALSE);
+
+	if (get_encoding (word) == TRACKER_PARSER_ENCODING_CJK) {
+		return FALSE;
+	}
+
+	processed_word = tracker_parser_process_word (parser, word, -1, TRUE);
+	result = is_stop_word (parser->language, processed_word);
+	g_free (processed_word);
+
+	return result;
+}
+
+
+/* Fetch the next word from the text given to tracker_parser_reset().
+ *
+ * CJK text is split with pango_next(), everything else with
+ * parser_next(). The word counter is advanced on every call and stored
+ * in @position. @byte_offset_start, @byte_offset_end and @new_paragraph
+ * receive what the splitter reported. @stop_word is TRUE only for
+ * non-CJK text when stop-word detection is enabled and the word is in
+ * the language's stop-word table.
+ *
+ * Returns a newly allocated word (free with g_free), or NULL when the
+ * splitter found no further word.
+ */
+gchar *
+tracker_parser_next (TrackerParser *parser,
+		     guint	   *position,
+		     guint	   *byte_offset_start,
+		     guint	   *byte_offset_end,
+		     gboolean	   *new_paragraph,
+		     gboolean	   *stop_word)
+{
+	/* Start from defined values so nothing indeterminate is copied to
+	 * the caller if a splitter returns without writing its outputs.
+	 */
+	guint	  byte_start = 0;
+	guint	  byte_end = 0;
+	gboolean  new_para = FALSE;
+	gchar	 *str;
+
+	g_return_val_if_fail (parser != NULL, NULL);
+
+	if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
+		str = pango_next (parser, 0, &byte_start, &byte_end, &new_para);
+		*stop_word = FALSE;
+	} else {
+		str = parser_next (parser, 0, &byte_start, &byte_end, &new_para);
+		*stop_word = parser->enable_stop_words &&
+			     is_stop_word (parser->language, str);
+	}
+
+	parser->word_position++;
+
+	*position = parser->word_position;
+	*byte_offset_start = byte_start;
+	*byte_offset_end = byte_end;
+	*new_paragraph = new_para;
+
+	return str;
+}
+
+
+
+/* Rewind @parser to the start of its text and advance past words.
+ *
+ * The splitter for the text's encoding is asked to skip @position words;
+ * the word it returns after skipping is discarded as well.
+ *
+ * NOTE(review): word_position is left at 0 after the skip rather than
+ * being set from @position - confirm that callers expect this.
+ */
+void
+tracker_parser_set_posititon (TrackerParser *parser,
+			      guint	     position)
+{
+	guint	  byte_start, byte_end;
+	gboolean  para;
+	gchar	 *skipped;
+
+	parser->word_position = 0;
+	parser->cursor = parser->txt;
+	parser->attr_pos = 0;
+
+	if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
+		skipped = pango_next (parser, position, &byte_start, &byte_end, &para);
+	} else {
+		skipped = parser_next (parser, position, &byte_start, &byte_end, &para);
+	}
+
+	g_free (skipped);
+}
+
+/* Release @parser and its Pango attribute array. The text and language
+ * it points at are not freed here. A NULL @parser is ignored.
+ */
+void
+tracker_parser_free (TrackerParser *parser)
+{
+	if (!parser) {
+		return;
+	}
+
+	/* g_free() accepts NULL, so attrs needs no separate check. */
+	g_free (parser->attrs);
+	g_free (parser);
+}
+
+
+
+
+
+
+/* old stuff */
+
+
gchar *
tracker_parser_text_to_string (const gchar *txt,
TrackerLanguage *language,
@@ -555,6 +1075,7 @@
return update_count;
}
+
GHashTable *
tracker_parser_text (GHashTable *word_table,
const gchar *txt,
Modified: branches/indexer-split/src/libtracker-common/tracker-parser.h
==============================================================================
--- branches/indexer-split/src/libtracker-common/tracker-parser.h (original)
+++ branches/indexer-split/src/libtracker-common/tracker-parser.h Thu Aug 21 03:15:01 2008
@@ -22,11 +22,79 @@
#define __TRACKERD_PARSER_H__
#include <glib.h>
+#include <pango/pango.h>
#include "tracker-language.h"
G_BEGIN_DECLS
+
+typedef enum { /* Text class chosen by get_encoding() from the first non-ASCII character of a sample */
+ TRACKER_PARSER_ENCODING_ASCII, /* no non-ASCII character within the sampled window */
+ TRACKER_PARSER_ENCODING_LATIN, /* first non-ASCII character satisfies IS_LATIN */
+ TRACKER_PARSER_ENCODING_CJK, /* first non-ASCII character satisfies NEED_PANGO; split with Pango */
+ TRACKER_PARSER_ENCODING_OTHER /* first non-ASCII character matches neither test */
+} TrackerParserEncoding;
+
+typedef struct { /* State of one word-by-word pass over a text */
+ const gchar *txt; /* text being parsed; stored by tracker_parser_reset() without copying */
+ gint txt_size; /* size passed to tracker_parser_reset(); used as the byte length of txt */
+ TrackerLanguage *language; /* set by tracker_parser_new(); used for stemming and stop-word lookup */
+ gboolean enable_stemmer; /* TRUE: processed words are replaced by their stem when one exists */
+ gboolean enable_stop_words; /* TRUE: tracker_parser_next() reports stop words */
+ guint max_words_to_index; /* NOTE(review): not set or read by the new parser functions in this revision */
+ guint max_word_length; /* characters kept per word; longer words are truncated */
+ guint min_word_length; /* words shorter than this are dropped at a word break */
+ gboolean delimit_words; /* TRUE: hyphen and underscore also end a word */
+
+ /* private members */
+ guint word_position; /* incremented by each tracker_parser_next() call; zeroed on reset */
+ TrackerParserEncoding encoding; /* get_encoding() result for txt */
+ const gchar *cursor; /* where parser_next() resumes; NULL means no more words */
+
+ /* pango members for CJK text parsing */
+ PangoLogAttr * attrs; /* allocated by tracker_parser_reset() for CJK text only */
+ guint attr_length; /* number of entries in attrs */
+ guint attr_pos; /* index in attrs where pango_next() resumes */
+
+} TrackerParser;
+
+
+
+/* Allocate a parser for @language with the given word-length limits.
+ * Free with tracker_parser_free().
+ */
+TrackerParser *tracker_parser_new	      (TrackerLanguage *language,
+					       gint		max_word_length,
+					       gint		min_word_length);
+
+/* Point @parser at @txt (stored, not copied) and rewind it. */
+void	       tracker_parser_reset	      (TrackerParser   *parser,
+					       const gchar     *txt,
+					       gint		txt_size,
+					       gboolean		delimit_words,
+					       gboolean		enable_stemmer,
+					       gboolean		enable_stop_words);
+
+/* Return the next word as a newly allocated string (free with g_free),
+ * or NULL when no further word is found. All out-parameters are written
+ * on every call.
+ */
+gchar *	       tracker_parser_next	      (TrackerParser   *parser,
+					       guint	       *position,
+					       guint	       *byte_offset_start,
+					       guint	       *byte_offset_end,
+					       gboolean	       *new_paragraph,
+					       gboolean	       *stop_word);
+
+/* Rewind @parser and skip @position words. */
+void	       tracker_parser_set_posititon   (TrackerParser   *parser,
+					       guint		position);
+
+/* TRUE if @word, after normalisation, is a stop word of the parser's
+ * language.
+ */
+gboolean       tracker_parser_is_stop_word    (TrackerParser   *parser,
+					       const gchar     *word);
+
+/* tracker_parser_process_word() is a static helper of tracker-parser.c
+ * and is deliberately not declared here: a static prototype in a public
+ * header gives every includer a declared-but-undefined function.
+ */
+
+/* Release @parser and its private attribute array. */
+void	       tracker_parser_free	      (TrackerParser   *parser);
+
+
+
+
/*
* Functions to parse supplied text and break into individual words and
* maintain a count of no of occurences of the word multiplied by a
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]