[tracker/extractor-remove-word-counting-review] tracker_text_validate_utf8 can now return just the number of valid UTF-8 bytes (output string is optional)



commit 2641879c70301d56e4c8d7a6d1e818bfead396d4
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Tue May 11 11:42:08 2010 +0200

    tracker_text_validate_utf8 can now return just the number of valid UTF-8 bytes (output string is optional)

 src/libtracker-extract/tracker-utils.c         |   31 +++++++++++++++--------
 src/libtracker-extract/tracker-utils.h         |    3 +-
 src/tracker-extract/tracker-extract-html.c     |    3 +-
 src/tracker-extract/tracker-extract-msoffice.c |   11 ++++----
 src/tracker-extract/tracker-extract-oasis.c    |    3 +-
 src/tracker-extract/tracker-extract-pdf.cpp    |    3 +-
 6 files changed, 34 insertions(+), 20 deletions(-)
---
diff --git a/src/libtracker-extract/tracker-utils.c b/src/libtracker-extract/tracker-utils.c
index f9f1084..fe5eaec 100644
--- a/src/libtracker-extract/tracker-utils.c
+++ b/src/libtracker-extract/tracker-utils.c
@@ -361,25 +361,27 @@ tracker_text_normalize (const gchar *text,
  * tracker_text_validate_utf8:
  * @text: the text to validate
  * @text_len: length of @text, or -1 if NIL-terminated
- * @str: the string where to place the validated characters
+ * @str: the string where to place the validated UTF-8 characters, or %NULL if
+ *  not needed.
+ * @p_utf8_len: Output number of valid UTF-8 bytes found, or %NULL if not needed
  *
  * This function iterates through @text checking for UTF-8 validity
- * using g_utf8_validate(), and appends the first chunk of valid characters
- * to @str.
+ * using g_utf8_validate(), appends the first chunk of valid characters
+ * to @str, and gives the number of valid UTF-8 bytes in @p_utf8_len.
  *
- * Returns: %TRUE if valid UTF-8 in @text was appended to @str
+ * Returns: %TRUE if some bytes were found to be valid, %FALSE otherwise.
  *
  * Since: 0.9
  **/
 gboolean
 tracker_text_validate_utf8 (const gchar  *text,
                             gsize         text_len,
-                            GString     **str)
+                            GString     **str,
+                            gsize        *p_utf8_len)
 {
 	gsize len_to_validate;
 
 	g_return_val_if_fail (text, FALSE);
-	g_return_val_if_fail (str, FALSE);
 
 	len_to_validate = text_len >= 0 ? text_len : strlen (text);
 
@@ -390,12 +392,19 @@ tracker_text_validate_utf8 (const gchar  *text,
 		 *  (if any) or to the end of the string. */
 		g_utf8_validate (text, len_to_validate, &end);
 		if (end > text) {
-			/* Create string to output if not already as input */
-			if (*str == NULL) {
-				*str = g_string_new_len (text, end-text);
-			} else {
-				*str = g_string_append_len (*str, text, end-text);
+			/* If str output required... */
+			if (str) {
+				/* Create string to output if not already as input */
+				*str = (*str == NULL ?
+				        g_string_new_len (text, end - text) :
+				        g_string_append_len (*str, text, end - text));
+			}
+
+			/* If utf8 len output required... */
+			if (p_utf8_len) {
+				*p_utf8_len = end - text;
 			}
+
 			return TRUE;
 		}
 	}
diff --git a/src/libtracker-extract/tracker-utils.h b/src/libtracker-extract/tracker-utils.h
index 6003d36..760fc4b 100644
--- a/src/libtracker-extract/tracker-utils.h
+++ b/src/libtracker-extract/tracker-utils.h
@@ -41,7 +41,8 @@ gchar*       tracker_text_normalize         (const gchar *text,
 
 gboolean     tracker_text_validate_utf8     (const gchar  *text,
                                              gsize         text_len,
-                                             GString     **str);
+                                             GString     **str,
+                                             gsize        *p_utf8_len);
 gchar*       tracker_date_guess             (const gchar *date_string);
 gchar*       tracker_date_format_to_iso8601 (const gchar *date_string,
                                              const gchar *format);
diff --git a/src/tracker-extract/tracker-extract-html.c b/src/tracker-extract/tracker-extract-html.c
index a59b864..acd99b7 100644
--- a/src/tracker-extract/tracker-extract-html.c
+++ b/src/tracker-extract/tracker-extract-html.c
@@ -221,7 +221,8 @@ parser_characters (void          *data,
 			                                (pd->n_bytes_remaining < text_len ?
 			                                 pd->n_bytes_remaining :
 			                                 text_len),
-			                                &pd->plain_text)) {
+			                                &pd->plain_text,
+			                                NULL)) {
 				/* In the case of HTML, each string arriving this
 				 * callback is independent to any other previous
 				 * string, so need to add an explicit whitespace
diff --git a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
index d47a1c3..30c2046 100644
--- a/src/tracker-extract/tracker-extract-msoffice.c
+++ b/src/tracker-extract/tracker-extract-msoffice.c
@@ -434,7 +434,8 @@ msoffice_convert_and_normalize_chunk (guint8    *buffer,
 
 		if (tracker_text_validate_utf8 (converted_text,
 		                                len_to_validate,
-		                                p_content)) {
+		                                p_content,
+		                                NULL)) {
 			/* A whitespace is added to separate next strings appended */
 			g_string_append_c (*p_content, ' ');
 		}
@@ -1860,7 +1861,7 @@ xml_text_handler_document_data (GMarkupParseContext  *context,
 	case MS_OFFICE_XML_TAG_WORD_TEXT:
 		if (info->style_element_present) {
 			if (atoi (text) == 0) {
-				tracker_text_validate_utf8 (text, -1, &info->content);
+				tracker_text_validate_utf8 (text, -1, &info->content, NULL);
 				g_string_append_c (info->content, ' ');
 			}
 		}
@@ -1868,7 +1869,7 @@ xml_text_handler_document_data (GMarkupParseContext  *context,
 		if (info->preserve_attribute_present) {
 			gchar *keywords = g_strdup (text);
 			if (found) {
-				tracker_text_validate_utf8 (text, -1, &info->content);
+				tracker_text_validate_utf8 (text, -1, &info->content, NULL);
 				g_string_append_c (info->content, ' ');
 				found = FALSE;
 			} else {
@@ -1892,13 +1893,13 @@ xml_text_handler_document_data (GMarkupParseContext  *context,
 		break;
 
 	case MS_OFFICE_XML_TAG_SLIDE_TEXT:
-		tracker_text_validate_utf8 (text, -1, &info->content);
+		tracker_text_validate_utf8 (text, -1, &info->content, NULL);
 		g_string_append_c (info->content, ' ');
 		break;
 
 	case MS_OFFICE_XML_TAG_XLS_SHARED_TEXT:
 		if (atoi (text) == 0)  {
-			tracker_text_validate_utf8 (text, -1, &info->content);
+			tracker_text_validate_utf8 (text, -1, &info->content, NULL);
 			g_string_append_c (info->content, ' ');
 		}
 		break;
diff --git a/src/tracker-extract/tracker-extract-oasis.c b/src/tracker-extract/tracker-extract-oasis.c
index 573e0db..da21440 100644
--- a/src/tracker-extract/tracker-extract-oasis.c
+++ b/src/tracker-extract/tracker-extract-oasis.c
@@ -139,7 +139,8 @@ extract_oasis_content (const gchar *uri,
 
 			tracker_text_validate_utf8 (buf,
 			                            len_to_validate,
-			                            &validated);
+			                            &validated,
+			                            NULL);
 
 			/* Note that in this case we shouldn't add a whitespace
 			 * separator between chunks read */
diff --git a/src/tracker-extract/tracker-extract-pdf.cpp b/src/tracker-extract/tracker-extract-pdf.cpp
index 02d3441..a23379e 100644
--- a/src/tracker-extract/tracker-extract-pdf.cpp
+++ b/src/tracker-extract/tracker-extract-pdf.cpp
@@ -365,7 +365,8 @@ extract_content (PDFDoc *document,
 
 		if (tracker_text_validate_utf8 (sel_text->getCString (),
 		                                len_to_validate,
-		                                &string)) {
+		                                &string,
+		                                NULL)) {
 			/* A whitespace is added to separate next strings appended */
 			g_string_append_c (string, ' ');
 		}



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]