[tracker] Add tracker_text_normalize()
- From: Carlos Garnacho <carlosg src gnome org>
- To: svn-commits-list gnome org
- Cc:
- Subject: [tracker] Add tracker_text_normalize()
- Date: Thu, 8 Oct 2009 16:20:18 +0000 (UTC)
commit 0a436f923bd2a4486054c1ce9de0881e11400c88
Author: Carlos Garnacho <carlos lanedo com>
Date: Thu Oct 8 18:11:04 2009 +0200
Add tracker_text_normalize()
This function is meant for full-text search (FTS) in extractors: it receives
UTF-8 text and tries to strip non-text characters, extra spaces, carriage
returns and such, providing a suitable string for nie:plainTextContent.
src/libtracker-common/tracker-utils.c | 46 +++++++++++++++++++++++++++++++++
src/libtracker-common/tracker-utils.h | 4 +++
2 files changed, 50 insertions(+), 0 deletions(-)
---
diff --git a/src/libtracker-common/tracker-utils.c b/src/libtracker-common/tracker-utils.c
index 5006beb..f10bbec 100644
--- a/src/libtracker-common/tracker-utils.c
+++ b/src/libtracker-common/tracker-utils.c
@@ -563,3 +563,49 @@ tracker_merge (const gchar *delim, gint n_values,
return g_string_free (str, FALSE);
}
+
+/**
+ * tracker_text_normalize:
+ * @text: NUL-terminated UTF-8 text to normalize
+ * @max_words: maximum number of words to copy into the result
+ * @n_words: location to store the number of words copied, or %NULL
+ *
+ * Builds a plain-text version of @text: only characters in a Unicode
+ * letter category are kept, and every run of other characters (digits,
+ * punctuation, whitespace, control characters, invalid UTF-8 bytes) is
+ * treated as a single word break. Words in the result are separated by
+ * exactly one space, with no leading or trailing space.
+ *
+ * Processing stops once @max_words words have been copied.
+ *
+ * Returns: a newly allocated string, to be freed with g_free(), or
+ * %NULL if @text is %NULL.
+ */
+gchar *
+tracker_text_normalize (const gchar *text,
+                        guint        max_words,
+                        guint       *n_words)
+{
+	GString *string;
+	gboolean in_break = TRUE;
+	guint words = 0;
+
+	/* Give the out parameter a defined value on every return path */
+	if (n_words) {
+		*n_words = 0;
+	}
+
+	g_return_val_if_fail (text != NULL, NULL);
+
+	string = g_string_new (NULL);
+
+	while (*text != '\0' && words < max_words) {
+		gunichar ch;
+
+		ch = g_utf8_get_char_validated (text, -1);
+
+		/* gunichar is unsigned, so the (gunichar) -1 / -2 error
+		 * returns must be tested explicitly; an invalid sequence
+		 * is handled like any other non-letter.
+		 */
+		if (ch != (gunichar) -1 &&
+		    ch != (gunichar) -2 &&
+		    g_unichar_isalpha (ch)) {
+			/* Emit the separator lazily, right before the next
+			 * word, so the result never ends with a space.
+			 */
+			if (in_break && string->len > 0) {
+				g_string_append_c (string, ' ');
+			}
+
+			g_string_append_unichar (string, ch);
+			in_break = FALSE;
+		} else if (!in_break) {
+			/* First non-letter after a word: the word is complete */
+			in_break = TRUE;
+			words++;
+		}
+
+		text = g_utf8_find_next_char (text, NULL);
+	}
+
+	/* Text ended in the middle of a word: count that last word too */
+	if (!in_break) {
+		words++;
+	}
+
+	if (n_words) {
+		*n_words = words;
+	}
+
+	return g_string_free (string, FALSE);
+}
diff --git a/src/libtracker-common/tracker-utils.h b/src/libtracker-common/tracker-utils.h
index ce6a837..e0525d0 100644
--- a/src/libtracker-common/tracker-utils.h
+++ b/src/libtracker-common/tracker-utils.h
@@ -50,6 +50,10 @@ gchar * tracker_coalesce (gint n_values,
gchar * tracker_merge (const gchar *delim, gint n_values,
...);
+/* Returns a newly allocated string built from the letter characters of
+ * text, with runs of other characters collapsed into a single space.
+ * max_words limits how many words are copied; n_words, when non-NULL,
+ * receives the word count.
+ */
+gchar * tracker_text_normalize (const gchar *text,
+ guint max_words,
+ guint *n_words);
+
/* Temporary: Just here until we upgrade to GLib 2.18. */
G_CONST_RETURN gchar *
tracker_dngettext (const gchar *domain,
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]