tracker r2121 - in branches/indexer-split: . src/libtracker-common



Author: jamiemcc
Date: Thu Aug 21 03:15:01 2008
New Revision: 2121
URL: http://svn.gnome.org/viewvc/tracker?rev=2121&view=rev

Log:
2008-08-20  Jamie McCracken <jamiemcc at gnome org>

	* Added new API for tracker-parser so that it will be compatible with sqlite FTS3



Modified:
   branches/indexer-split/ChangeLog
   branches/indexer-split/src/libtracker-common/tracker-parser.c
   branches/indexer-split/src/libtracker-common/tracker-parser.h

Modified: branches/indexer-split/src/libtracker-common/tracker-parser.c
==============================================================================
--- branches/indexer-split/src/libtracker-common/tracker-parser.c	(original)
+++ branches/indexer-split/src/libtracker-common/tracker-parser.c	Thu Aug 21 03:15:01 2008
@@ -21,11 +21,6 @@
 
 #include <string.h>
 
-#ifdef HAVE_UNAC
-#include <unac.h>
-#endif
-
-#include <pango/pango.h>
 
 #include "tracker-parser.h"
 #include "tracker-log.h"
@@ -48,6 +43,7 @@
 #define IS_ASCII_IGNORE(c)       ((c) <= 0x002C) 
 #define IS_HYPHEN(c)             ((c) == 0x002D)
 #define IS_UNDERSCORE(c)         ((c) == 0x005F)
+#define IS_NEWLINE(c)	         ((c) == 0x000D) /* NOTE(review): 0x000D is CR and is also matched by IS_ASCII_IGNORE; LF is 0x000A -- confirm */
 
 typedef enum {
 	TRACKER_PARSER_WORD_ASCII_HIGHER,
@@ -59,12 +55,14 @@
 	TRACKER_PARSER_WORD_ALPHA_LOWER,
 	TRACKER_PARSER_WORD_ALPHA,
 	TRACKER_PARSER_WORD_ALPHA_NUM,
-	TRACKER_PARSER_WORD_IGNORE
+	TRACKER_PARSER_WORD_IGNORE,
+	TRACKER_PARSER_WORD_NEWLINE	
 } TrackerParserWordType;
 
 static inline TrackerParserWordType
 get_word_type (gunichar c)
 {
+
 	/* Fast ascii handling */
 	if (IS_ASCII (c)) {
 		if (IS_ASCII_ALPHA_LOWER (c)) {
@@ -90,6 +88,10 @@
 		if (IS_UNDERSCORE (c)) {
 			return TRACKER_PARSER_WORD_UNDERSCORE;
 		}
+		
+		if (IS_NEWLINE (c)) {
+			return TRACKER_PARSER_WORD_NEWLINE;
+		}
 	} else 	{
 		if (g_unichar_isalpha (c)) {
 			if (!g_unichar_isupper (c)) {
@@ -124,6 +126,8 @@
 #endif
 }
 
+
+
 static gboolean
 text_needs_pango (const gchar *text)
 {
@@ -144,7 +148,37 @@
 		}
 	}
 
-	return FALSE;
+	return FALSE; 
+}
+
+
+
+static TrackerParserEncoding
+get_encoding (const char *txt)
+{
+	const gchar *cur = txt;
+	gint         seen = 0;
+
+	/* Classify by the first non-ASCII character; scanning stops at the
+	 * end of the string or after 255 non-whitespace characters.
+	 */
+	while (*cur && seen < 255) {
+		gunichar ch = g_utf8_get_char (cur);
+
+		if (!g_unichar_isspace (ch)) {
+			seen++;
+		}
+
+		if (!IS_ASCII (ch)) {
+			if (IS_LATIN (ch)) {
+				return TRACKER_PARSER_ENCODING_LATIN;
+			}
+			return NEED_PANGO (ch) ? TRACKER_PARSER_ENCODING_CJK
+			                       : TRACKER_PARSER_ENCODING_OTHER;
+		}
+		cur = g_utf8_next_char (cur);
+	}
+	return TRACKER_PARSER_ENCODING_ASCII;
+}
 
 static gboolean
@@ -153,6 +187,8 @@
 {
         GHashTable *stop_words;
         
+        if (!word) return FALSE;
+        
         stop_words = tracker_language_get_stop_words (language);
 
         return g_hash_table_lookup (stop_words, word) != NULL;
@@ -197,7 +233,7 @@
                 c = g_utf8_get_char (p);
                 type = get_word_type (c);
                 
-                if (type == TRACKER_PARSER_WORD_IGNORE || 
+                if (type == TRACKER_PARSER_WORD_IGNORE || type == TRACKER_PARSER_WORD_NEWLINE ||
                     (delimit_hyphen && 
                      (type == TRACKER_PARSER_WORD_HYPHEN || 
                       type == TRACKER_PARSER_WORD_UNDERSCORE))) {
@@ -307,7 +343,8 @@
                         return p;
                 }
 		
-                if (do_strip) {
+                if (do_strip && get_encoding (utf8) == TRACKER_PARSER_ENCODING_LATIN) {
+
                         stripped_word = strip_word (utf8, bytes, &len);
                 } else {
                         stripped_word = NULL;
@@ -339,6 +376,489 @@
         return p;	
 }
 
+
+
+TrackerParser *
+tracker_parser_new (TrackerLanguage *language,
+		    gint max_word_length,
+		    gint min_word_length)
+{
+	/* Returns a newly allocated parser; release it with
+	 * tracker_parser_free().  Zero-filled so that every field not set
+	 * below (text, cursor, Pango attributes, flags) starts out
+	 * NULL/0 until tracker_parser_reset() is called.
+	 */
+	TrackerParser *parser = g_new0 (TrackerParser, 1);
+
+	parser->language = language;
+	parser->max_word_length = max_word_length;
+	parser->min_word_length = min_word_length;
+	return parser;
+}
+				    
+void
+tracker_parser_reset (TrackerParser *parser,
+		      const gchar *txt,
+		      gint txt_size,
+		      gboolean delimit_words,
+		      gboolean enable_stemmer,
+		      gboolean enable_stop_words)
+{
+	/* Points @parser at a new text and rewinds it to the first word.
+	 * @txt is not copied, so it must stay valid while the parser is
+	 * in use; @txt_size is handed to g_utf8_strlen() and
+	 * pango_get_log_attrs() as the length of the text.
+	 */
+	g_return_if_fail (txt && parser);
+
+	/* Drop the attributes of the previous text and clear the pointer:
+	 * it is only re-assigned for CJK text, and a stale value would be
+	 * freed a second time by the next reset or by
+	 * tracker_parser_free().
+	 */
+	g_free (parser->attrs);
+	parser->attrs = NULL;
+	parser->attr_length = 0;
+	parser->attr_pos = 0;
+
+	parser->enable_stemmer = enable_stemmer;
+	parser->enable_stop_words = enable_stop_words;
+	parser->delimit_words = delimit_words;
+	parser->encoding = get_encoding (txt);
+	parser->txt_size = txt_size;
+	parser->txt = txt;
+	parser->word_position = 0;
+	parser->cursor = txt;
+
+	if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
+		/* One attribute per character plus one for the end position. */
+		parser->attr_length = g_utf8_strlen (parser->txt, parser->txt_size) + 1;
+		parser->attrs = g_new0 (PangoLogAttr, parser->attr_length);
+		pango_get_log_attrs (parser->txt,
+				     txt_size,
+				     0,
+				     pango_language_from_string ("C"),
+				     parser->attrs,
+				     parser->attr_length);
+	}
+}
+		
+		
+				    
+static gchar *
+pango_next (TrackerParser *parser,
+	    guint skip_words,
+	    guint *byte_offset_start,
+	    guint *byte_offset_end,
+	    gboolean *is_new_paragraph)
+{
+	/* Returns the next word of CJK text as a newly allocated, casefolded
+	 * and NFC-normalized string, or NULL when the Pango attributes are
+	 * exhausted.  The first @skip_words words found are counted but not
+	 * returned.  CJK text does not need stemming or other treatment.
+	 */
+	gint  word_start = -1;
+	guint words_parsed = 0;
+	guint i;
+
+	/* Give the outputs defined values even when no word is found. */
+	*byte_offset_start = 0;
+	*byte_offset_end = 0;
+	*is_new_paragraph = FALSE;
+
+	for (i = parser->attr_pos; i < parser->attr_length; i++) {
+		/* Handle a word end before a word start: between adjacent
+		 * words the same position carries both flags, and looking
+		 * at the start first would drop the word that ends here.
+		 * word_start < 0 means no word is open, which is the case
+		 * when resuming on the end of the word returned last time.
+		 */
+		if (parser->attrs[i].is_word_end && word_start >= 0) {
+			const gchar *start_word, *end_word;
+			gchar       *folded;
+			gchar       *index_word = NULL;
+			gint         first = word_start;
+
+			word_start = -1;
+			words_parsed++;
+
+			start_word = g_utf8_offset_to_pointer (parser->txt, first);
+			end_word = g_utf8_offset_to_pointer (parser->txt, i);
+
+			if (words_parsed > skip_words && start_word != end_word) {
+				/* Normalize word */
+				folded = g_utf8_casefold (start_word, end_word - start_word);
+
+				if (folded) {
+					index_word = g_utf8_normalize (folded, -1, G_NORMALIZE_NFC);
+					g_free (folded);
+				}
+			}
+
+			if (index_word) {
+				if (first > 1 && parser->attrs[first - 1].is_sentence_boundary) {
+					*is_new_paragraph = TRUE;
+				}
+
+				*byte_offset_start = start_word - parser->txt;
+				*byte_offset_end = end_word - parser->txt;
+
+				/* Resume from this position on the next call. */
+				parser->attr_pos = i;
+				return index_word;
+			}
+		}
+
+		if (parser->attrs[i].is_word_start) {
+			word_start = i;
+		}
+	}
+	parser->attr_pos = i;
+	return NULL;
+}
+				   
+static gchar *
+tracker_parser_process_word (TrackerParser *parser, const char *word, gint length, gboolean do_strip)
+{
+	/* Turns a raw word into the form that gets indexed: optionally
+	 * run through strip_word() (Latin text only), NFC-normalized and,
+	 * when the parser has stemming enabled, replaced by the stem
+	 * reported by tracker_language_stem_word().
+	 *
+	 * @length is the size of @word in bytes, or -1 if @word is
+	 * NUL-terminated.  Returns a newly allocated string the caller
+	 * must g_free(), or NULL if @word is NULL or cannot be normalized.
+	 */
+	guint       bytes, len;
+	gchar      *str;
+	gchar      *stripped_word = NULL;
+	const char *stem_word;
+
+	if (!word) {
+		return NULL;
+	}
+
+	bytes = (length == -1) ? strlen (word) : (guint) length;
+
+	if (do_strip && get_encoding (word) == TRACKER_PARSER_ENCODING_LATIN) {
+		stripped_word = strip_word (word, bytes, &len);
+	}
+
+	if (stripped_word) {
+		str = g_utf8_normalize (stripped_word, len, G_NORMALIZE_NFC);
+		g_free (stripped_word);
+	} else {
+		str = g_utf8_normalize (word, bytes, G_NORMALIZE_NFC);
+	}
+
+	/* g_utf8_normalize() returns NULL for invalid UTF-8; bail out
+	 * before the strlen() below can dereference it.
+	 */
+	if (!str || !parser->enable_stemmer) {
+		return str;
+	}
+
+	len = strlen (str);
+	stem_word = tracker_language_stem_word (parser->language, str, len);
+
+	if (stem_word) {
+		gchar *result = g_strdup (stem_word);
+
+		g_free (str);
+		return result;
+	}
+
+	return str;
+}
+				    
+static gchar *
+parser_next (TrackerParser *parser,
+	     guint skip_words,
+	     guint *byte_offset_start,
+	     guint *byte_offset_end,
+	     gboolean *is_new_paragraph)
+{
+	/* Scans forward from parser->cursor and returns the next word,
+	 * lower-cased and run through tracker_parser_process_word(), as a
+	 * newly allocated string, or NULL when none is found.  A word ended
+	 * by a delimiter is dropped when it is too short or starts with a
+	 * digit or hyphen; the first @skip_words accepted words are skipped.
+	 */
+	TrackerParserWordType word_type = TRACKER_PARSER_WORD_IGNORE;
+	gunichar              word[64];
+	gboolean              is_valid = TRUE;
+	gboolean              filter_numbers = TRUE;
+	gboolean              do_strip = FALSE;
+	guint                 length = 0;
+	guint                 words_skipped = 0;
+	gint                  char_count = 0;
+	glong                 bytes = 0;
+	const char           *p;
+	const char           *start = NULL;
+	const char           *end = NULL;
+
+	*byte_offset_start = 1;
+	*byte_offset_end = 2;
+	*is_new_paragraph = FALSE;
+
+	g_return_val_if_fail (parser, NULL);
+
+	if (!parser->cursor) {
+		return NULL;
+	}
+
+	for (p = parser->cursor; *p; p = g_utf8_next_char (p)) {
+		TrackerParserWordType type;
+		gunichar              c;
+
+		char_count++;
+		c = g_utf8_get_char (p);
+		type = get_word_type (c);
+
+		if (type == TRACKER_PARSER_WORD_NEWLINE) {
+			*is_new_paragraph = TRUE;
+		}
+
+		if (type == TRACKER_PARSER_WORD_IGNORE ||
+		    type == TRACKER_PARSER_WORD_NEWLINE ||
+		    (parser->delimit_words &&
+		     (type == TRACKER_PARSER_WORD_HYPHEN ||
+		      type == TRACKER_PARSER_WORD_UNDERSCORE))) {
+			gboolean acceptable;
+
+			if (!start) {
+				continue;
+			}
+
+			/* Word break */
+			acceptable = is_valid &&
+				     length >= parser->min_word_length &&
+				     word_type != TRACKER_PARSER_WORD_NUM;
+
+			if (acceptable && words_skipped >= skip_words) {
+				break;
+			}
+
+			/* Rejected or skipped: start over with the next word. */
+			*is_new_paragraph = FALSE;
+
+			if (acceptable) {
+				words_skipped++;
+			}
+
+			word_type = TRACKER_PARSER_WORD_IGNORE;
+			is_valid = TRUE;
+			length = 0;
+			bytes = 0;
+			start = NULL;
+			end = NULL;
+			do_strip = FALSE;
+			continue;
+		}
+
+		if (!is_valid) {
+			continue;
+		}
+
+		if (!start) {
+			/* NOTE(review): char_count was already incremented for
+			 * this character, so this points one character past p.
+			 * Confirm that callers expect that start offset.
+			 */
+			start = g_utf8_offset_to_pointer (parser->cursor, char_count);
+
+			/* Valid words must start with an alpha or
+			 * underscore if we are filtering.
+			 */
+			if (filter_numbers &&
+			    (type == TRACKER_PARSER_WORD_NUM ||
+			     type == TRACKER_PARSER_WORD_HYPHEN)) {
+				is_valid = FALSE;
+				continue;
+			}
+		}
+
+		/* Characters beyond the configured maximum, or beyond what
+		 * word[] can hold, are dropped so that the store below can
+		 * never run past the end of the buffer.
+		 */
+		if (length >= parser->max_word_length || length >= G_N_ELEMENTS (word)) {
+			continue;
+		}
+
+		length++;
+
+		switch (type) {
+		case TRACKER_PARSER_WORD_ASCII_HIGHER:
+			c += 32;
+			/* fall through */
+		case TRACKER_PARSER_WORD_ASCII_LOWER:
+		case TRACKER_PARSER_WORD_HYPHEN:
+		case TRACKER_PARSER_WORD_UNDERSCORE:
+			if (word_type == TRACKER_PARSER_WORD_NUM ||
+			    word_type == TRACKER_PARSER_WORD_ALPHA_NUM) {
+				word_type = TRACKER_PARSER_WORD_ALPHA_NUM;
+			} else {
+				word_type = TRACKER_PARSER_WORD_ALPHA;
+			}
+			break;
+
+		case TRACKER_PARSER_WORD_NUM:
+			if (word_type == TRACKER_PARSER_WORD_ALPHA ||
+			    word_type == TRACKER_PARSER_WORD_ALPHA_NUM) {
+				word_type = TRACKER_PARSER_WORD_ALPHA_NUM;
+			} else {
+				word_type = TRACKER_PARSER_WORD_NUM;
+			}
+			break;
+
+		case TRACKER_PARSER_WORD_ALPHA_HIGHER:
+			c = g_unichar_tolower (c);
+			/* fall through */
+		case TRACKER_PARSER_WORD_ALPHA_LOWER:
+			/* Ask for stripping when the word is processed. */
+			do_strip = TRUE;
+
+			if (word_type == TRACKER_PARSER_WORD_NUM ||
+			    word_type == TRACKER_PARSER_WORD_ALPHA_NUM) {
+				word_type = TRACKER_PARSER_WORD_ALPHA_NUM;
+			} else {
+				word_type = TRACKER_PARSER_WORD_ALPHA;
+			}
+			break;
+
+		default:
+			break;
+		}
+
+		word[length - 1] = c;
+	}
+
+	if (!is_valid) {
+		parser->cursor = NULL;
+		return NULL;
+	}
+
+	end = g_utf8_offset_to_pointer (parser->cursor, char_count);
+	parser->cursor = end;
+
+	if (word_type == TRACKER_PARSER_WORD_ALPHA_NUM ||
+	    word_type == TRACKER_PARSER_WORD_ALPHA) {
+		gchar *utf8;
+		gchar *processed_word;
+
+		utf8 = g_ucs4_to_utf8 (word, length, NULL, &bytes, NULL);
+
+		if (!utf8) {
+			return NULL;
+		}
+
+		*byte_offset_start = start - parser->txt;
+		*byte_offset_end = end - parser->txt;
+
+		processed_word = tracker_parser_process_word (parser, utf8, bytes, do_strip);
+		g_free (utf8);
+		return processed_word;
+	}
+
+	return NULL;
+}
+
+gboolean
+tracker_parser_is_stop_word (TrackerParser *parser, const gchar *word)
+{
+	/* NULL is never a stop word (and would crash get_encoding()); CJK
+	 * words are not looked up in the stop word list either. */
+	if (!word || get_encoding (word) == TRACKER_PARSER_ENCODING_CJK) return FALSE;
+	char *processed_word = tracker_parser_process_word (parser, word, -1, TRUE);
+	gboolean result = is_stop_word (parser->language, processed_word);
+	g_free (processed_word);
+	return result;
+}
+
+				    
+gchar *
+tracker_parser_next (TrackerParser *parser,
+		     guint *position,
+		     guint *byte_offset_start,
+		     guint *byte_offset_end,
+		     gboolean *new_paragraph,
+		     gboolean *stop_word)
+{
+	/* Returns the next word of the text given to tracker_parser_reset()
+	 * as a newly allocated string, or NULL when there are no more words.
+	 * @position receives the running word count; @stop_word is only
+	 * ever set for non-CJK text with stop words enabled.
+	 */
+	/* Initialized so that defined values are copied out even when the
+	 * scanner returns without setting them. */
+	guint     byte_start = 0;
+	guint     byte_end = 0;
+	gboolean  new_para = FALSE;
+	gchar    *str;
+
+	*stop_word = FALSE;
+
+	if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
+		str = pango_next (parser, 0, &byte_start, &byte_end, &new_para);
+	} else {
+		str = parser_next (parser, 0, &byte_start, &byte_end, &new_para);
+
+		if (parser->enable_stop_words && is_stop_word (parser->language, str)) {
+			*stop_word = TRUE;
+		}
+	}
+
+	parser->word_position++;
+	*position = parser->word_position;
+	*byte_offset_start = byte_start;
+	*byte_offset_end = byte_end;
+	*new_paragraph = new_para;
+
+	return str;
+}
+				 
+				     
+				     
+void
+tracker_parser_set_posititon (TrackerParser *parser,
+			      guint position)
+{
+	guint     start_offset, end_offset;
+	gboolean  paragraph;
+	gchar    *word;
+
+	/* Rewind to the start of the text. */
+	parser->cursor = parser->txt;
+	parser->attr_pos = 0;
+	parser->word_position = 0;
+	/* Scan past @position words; the word that the scanner returns
+	 * after them is discarded.  NOTE(review): word_position stays 0
+	 * afterwards -- confirm that is intended.
+	 */
+	word = (parser->encoding == TRACKER_PARSER_ENCODING_CJK)
+		? pango_next (parser, position, &start_offset, &end_offset, &paragraph)
+		: parser_next (parser, position, &start_offset, &end_offset, &paragraph);
+	g_free (word);
+}
+				     
+void
+tracker_parser_free (TrackerParser *parser)
+{
+	/* Like g_free(), accept NULL so cleanup paths need no guard. */
+	if (!parser) return;
+	g_free (parser->attrs);
+	g_free (parser);
+}
+
+
+
+
+
+
+/* old stuff */
+
+
 gchar *
 tracker_parser_text_to_string (const gchar     *txt, 
                                TrackerLanguage *language,
@@ -555,6 +1075,7 @@
         return update_count;
 }
 
+
 GHashTable *
 tracker_parser_text (GHashTable      *word_table, 
                      const gchar     *txt, 

Modified: branches/indexer-split/src/libtracker-common/tracker-parser.h
==============================================================================
--- branches/indexer-split/src/libtracker-common/tracker-parser.h	(original)
+++ branches/indexer-split/src/libtracker-common/tracker-parser.h	Thu Aug 21 03:15:01 2008
@@ -22,11 +22,79 @@
 #define __TRACKERD_PARSER_H__
 
 #include <glib.h>
+#include <pango/pango.h>
 
 #include "tracker-language.h"
 
 G_BEGIN_DECLS
 
+
+typedef enum {
+	TRACKER_PARSER_ENCODING_ASCII,	/* get_encoding() found no non-ASCII character */
+	TRACKER_PARSER_ENCODING_LATIN,	/* first non-ASCII character matches IS_LATIN */
+	TRACKER_PARSER_ENCODING_CJK,	/* first non-ASCII character matches NEED_PANGO; words are split with Pango */
+	TRACKER_PARSER_ENCODING_OTHER	/* first non-ASCII character matches neither */
+} TrackerParserEncoding;
+
+typedef struct {
+	const gchar     	*txt;	/* text being parsed; stored by tracker_parser_reset() without copying */
+	gint			txt_size;	/* length given to tracker_parser_reset() along with txt */
+	TrackerLanguage 	*language;	/* used for stemming and stop word lookup */
+	gboolean		enable_stemmer;	/* stem words in tracker_parser_process_word() */
+	gboolean		enable_stop_words;	/* report stop words from tracker_parser_next() */
+	guint             	max_words_to_index;	/* NOTE(review): not set or read by the parser functions added in this change */
+    	guint             	max_word_length;	/* characters beyond this count are left out of a word */
+	guint             	min_word_length;	/* shorter words are skipped */
+	gboolean         	delimit_words;	/* treat hyphen and underscore as word breaks */
+	
+	/* private members */
+	guint		 	word_position;	/* incremented by every tracker_parser_next() call */
+	TrackerParserEncoding	encoding;	/* result of get_encoding() on txt */
+	const gchar		*cursor;	/* where parser_next() resumes scanning */
+	
+	/* pango members for CJK text parsing */
+	PangoLogAttr *		attrs;	/* allocated by tracker_parser_reset() for CJK text */
+	guint			attr_length;	/* number of entries in attrs */
+	guint			attr_pos;	/* index in attrs where pango_next() resumes */
+	
+} TrackerParser;
+
+
+
+/* Allocates a parser; release it with tracker_parser_free(). */
+TrackerParser *	tracker_parser_new (TrackerLanguage *language,
+				    gint max_word_length,
+				    gint min_word_length);
+
+/* Points the parser at a new text and rewinds it; @txt is not copied. */
+void		tracker_parser_reset (TrackerParser *parser,
+				      const gchar *txt,
+				      gint txt_size,
+				      gboolean delimit_words,
+				      gboolean enable_stemmer,
+				      gboolean enable_stop_words);
+
+/* Returns the next word as a newly allocated string, or NULL at the end. */
+gchar *		tracker_parser_next (TrackerParser *parser,
+				     guint *position,
+				     guint *byte_offset_start,
+				     guint *byte_offset_end,
+				     gboolean *new_paragraph,
+				     gboolean *stop_word);
+
+/* Rewinds the parser and scans forward past @position words. */
+void		tracker_parser_set_posititon (TrackerParser *parser,
+					      guint position);
+
+/* TRUE if @word, processed the way indexed words are, is a stop word. */
+gboolean	tracker_parser_is_stop_word (TrackerParser *parser, const gchar *word);
+
+/* Frees the parser; the text given to tracker_parser_reset() is not freed. */
+void		tracker_parser_free (TrackerParser *parser);
+				     
+				    
+
+
 /* 
  * Functions to parse supplied text and break into individual words and
  * maintain a count of no of occurences of the word multiplied by a



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]