[tracker/parser-unicode-libs-review: 78/85] Added libicu-based unac stripping
- From: Aleksander Morgado <aleksm src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/parser-unicode-libs-review: 78/85] Added libicu-based unac stripping
- Date: Tue, 4 May 2010 17:30:34 +0000 (UTC)
commit c905bf580a7b5b1a43e4e62b7e1c10b01ba2e1c8
Author: Aleksander Morgado <aleksander lanedo com>
Date: Tue May 4 09:40:51 2010 +0200
Added libicu-based unac stripping
configure.ac | 2 +-
src/libtracker-fts/tracker-parser-glib.c | 4 +-
src/libtracker-fts/tracker-parser-libunistring.c | 6 +-
src/libtracker-fts/tracker-parser-utils.c | 131 ++++++++++++++++++----
src/libtracker-fts/tracker-parser-utils.h | 21 +++-
5 files changed, 133 insertions(+), 31 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index c7abf64..5cb0413 100644
--- a/configure.ac
+++ b/configure.ac
@@ -859,7 +859,7 @@ else
unicode_library=libicu
AC_CHECK_HEADER(unicode/ubrk.h, [have_libicu=yes],[have_libicu=no])
- LIBICU_CFLAGS="-Iunicode"
+ LIBICU_CFLAGS=""
LIBICU_LIBS="-licuuc"
AC_SUBST(LIBICU_CFLAGS)
diff --git a/src/libtracker-fts/tracker-parser-glib.c b/src/libtracker-fts/tracker-parser-glib.c
index f685fd4..83a969b 100644
--- a/src/libtracker-fts/tracker-parser-glib.c
+++ b/src/libtracker-fts/tracker-parser-glib.c
@@ -538,7 +538,9 @@ tracker_parser_process_word (TrackerParser *parser,
word, bytes);
if (do_strip) {
- stripped_word = tracker_parser_unaccent_string (word, bytes, &len);
+ stripped_word = tracker_parser_unaccent_utf8_word (word,
+ bytes,
+ &len);
/* Log after UNAC stripping */
tracker_parser_message_hex (" After UNAC stripping",
diff --git a/src/libtracker-fts/tracker-parser-libunistring.c b/src/libtracker-fts/tracker-parser-libunistring.c
index efb05aa..6fec131 100644
--- a/src/libtracker-fts/tracker-parser-libunistring.c
+++ b/src/libtracker-fts/tracker-parser-libunistring.c
@@ -344,9 +344,9 @@ tracker_parser_process_word (TrackerParser *parser,
if (do_strip) {
gsize stripped_word_length;
- stripped = tracker_parser_unaccent_string (normalized,
- new_word_length,
- &stripped_word_length);
+ stripped = tracker_parser_unaccent_utf8_word (normalized,
+ new_word_length,
+ &stripped_word_length);
if (stripped) {
/* Log after UNAC stripping */
diff --git a/src/libtracker-fts/tracker-parser-utils.c b/src/libtracker-fts/tracker-parser-utils.c
index d2486ab..0a37440 100644
--- a/src/libtracker-fts/tracker-parser-utils.c
+++ b/src/libtracker-fts/tracker-parser-utils.c
@@ -25,55 +25,140 @@
#include <unac.h>
#endif
+#ifdef HAVE_LIBICU
+#include <unicode/utypes.h>
+#include <unicode/ucnv.h>
+#endif
+
#include <libtracker-common/tracker-common.h>
#include "tracker-parser-utils.h"
+
+/* Output is always UTF-8. */
gchar *
-tracker_parser_unaccent_string (const gchar *str,
- gsize ilength,
- gsize *p_olength)
+tracker_parser_unaccent_utf16be_word (const gchar *string,
+ gsize ilength,
+ gsize *p_olength)
{
-#ifdef HAVE_UNAC
GError *error = NULL;
- gchar *str_utf16;
- gsize utf16_len, unaccented_len, final_len;
gchar *unaccented_str = NULL;
- gchar *s = NULL;
+ gchar *str_utf8 = NULL;
+ gsize unaccented_len;
+ gsize utf8_len;
*p_olength = 0;
- /* unac_string() does roughly the same than below, plus it
- * corrupts memory in 64bit systems, so avoid it for now.
- */
- str_utf16 = g_convert (str, ilength, "UTF-16BE", "UTF-8", NULL, &utf16_len, &error);
+ if (unac_string_utf16 (string, ilength,
+ &unaccented_str, &unaccented_len) != 0) {
+ g_warning ("UNAC failed to strip accents");
+ return NULL;
+ }
+
+ /* Convert from UTF-16BE to UTF-8 */
+ str_utf8 = g_convert (unaccented_str,
+ unaccented_len,
+ "UTF-8",
+ "UTF-16BE",
+ NULL,
+ &utf8_len,
+ &error);
+ g_free (unaccented_str);
if (error) {
- g_warning ("Could not convert to UTF-16: %s", error->message);
+ g_warning ("Could not convert back to UTF-8: %s",
+ error->message);
g_error_free (error);
return NULL;
}
- if (unac_string_utf16 (str_utf16, utf16_len,
- &unaccented_str, &unaccented_len) != 0) {
- g_warning ("UNAC failed to strip accents");
- g_free (str_utf16);
- return NULL;
+ *p_olength = utf8_len;
+ return str_utf8;
+}
+
+
+#ifdef HAVE_LIBICU
+/* NOTE: Internally, UChars are UTF-16, but a conversion is still done just in
+ * case, as libunac requires UTF-16BE input. Output is always UTF-8. */
+gchar *
+tracker_parser_unaccent_UChar_word (const UChar *string,
+ gsize ilength,
+ gsize *p_olength)
+{
+#ifdef HAVE_UNAC
+ UErrorCode icu_error = U_ZERO_ERROR;
+ UConverter *converter;
+ gchar *str_utf16;
+ gchar *str_utf8 = NULL;
+ gsize utf16_len;
+
+ *p_olength = 0;
+
+ /* Open converter UChar to UTF-16BE */
+ converter = ucnv_open ("UTF-16BE", &icu_error);
+ if (!converter) {
+ g_warning ("Cannot open UTF-16BE converter: '%s'",
+ U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
+ return NULL;
}
+ /* Allocate buffer, same size as input string */
+ str_utf16 = g_malloc (ilength);
+
+ /* Convert from UChar to UTF-16BE */
+ utf16_len = ucnv_fromUChars (converter,
+ str_utf16,
+ ilength,
+ string,
+ ilength,
+ &icu_error);
+ if (U_FAILURE (icu_error)) {
+ g_warning ("Cannot convert from UChar to UTF-16BE: '%s'",
+ u_errorName (icu_error));
+ } else {
+ str_utf8 = tracker_parser_unaccent_utf16be_word (str_utf16,
+ utf16_len,
+ p_olength);
+ }
+ ucnv_close (converter);
g_free (str_utf16);
+ return str_utf8;
+#else
+ return NULL;
+#endif
+}
+#endif
- s = g_convert (unaccented_str, unaccented_len, "UTF-8", "UTF-16BE", NULL, &final_len, &error);
- g_free (unaccented_str);
+gchar *
+tracker_parser_unaccent_utf8_word (const gchar *str,
+ gsize ilength,
+ gsize *p_olength)
+{
+#ifdef HAVE_UNAC
+ GError *error = NULL;
+ gchar *str_utf16 = NULL;
+ gchar *str_utf8 = NULL;
+ gsize utf16_len;
+
+ *p_olength = 0;
+
+ /* unac_string() does roughly the same as the code below, but it
+ * corrupts memory on 64-bit systems, so avoid it for now.
+ */
+ str_utf16 = g_convert (str, ilength, "UTF-16BE", "UTF-8", NULL, &utf16_len, &error);
if (error) {
- g_warning ("Could not convert back to UTF-8: %s", error->message);
+ g_warning ("Could not convert to UTF-16: %s", error->message);
g_error_free (error);
return NULL;
- }
+ } else {
- *p_olength = final_len;
+ str_utf8 = tracker_parser_unaccent_utf16be_word (str_utf16,
+ utf16_len,
+ p_olength);
+ }
- return s;
+ g_free (str_utf16);
+ return str_utf8;
#else
return NULL;
#endif
diff --git a/src/libtracker-fts/tracker-parser-utils.h b/src/libtracker-fts/tracker-parser-utils.h
index 77eb662..2e7a2c6 100644
--- a/src/libtracker-fts/tracker-parser-utils.h
+++ b/src/libtracker-fts/tracker-parser-utils.h
@@ -20,14 +20,29 @@
#ifndef __TRACKER_PARSER_UTILS_H__
#define __TRACKER_PARSER_UTILS_H__
+#include "config.h"
+
#include <glib.h>
+#ifdef HAVE_LIBICU
+#include <unicode/utypes.h>
+#endif
+
G_BEGIN_DECLS
-gchar *tracker_parser_unaccent_string (const gchar *str,
- gsize ilength,
- gsize *p_olength);
+gchar *tracker_parser_unaccent_utf16be_word (const gchar *string,
+ gsize ilength,
+ gsize *p_olength);
+gchar *tracker_parser_unaccent_utf8_word (const gchar *string,
+ gsize ilength,
+ gsize *p_olength);
+
+#ifdef HAVE_LIBICU
+gchar *tracker_parser_unaccent_UChar_word (const UChar *string,
+ gsize ilength,
+ gsize *p_olength);
+#endif
/* Define to 1 if you want to enable debugging logs showing HEX contents
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]