beagle r4574 - in branches/beagle-lucene2_1/beagled: . Snowball.Net/Lucene.Net/Analysis/Snowball Snowball.Net/upstream-changes
- From: dbera svn gnome org
- To: svn-commits-list gnome org
- Subject: beagle r4574 - in branches/beagle-lucene2_1/beagled: . Snowball.Net/Lucene.Net/Analysis/Snowball Snowball.Net/upstream-changes
- Date: Sun, 2 Mar 2008 18:26:36 +0000 (GMT)
Author: dbera
Date: Sun Mar 2 18:26:36 2008
New Revision: 4574
URL: http://svn.gnome.org/viewvc/beagle?rev=4574&view=rev
Log:
Snowball stemmers are also a bit expensive to create, and by default a new stemmer is created for each field of each document. This patch reuses the stemmers. It will also enable us to easily hook in language-based stemmers later.
Added:
branches/beagle-lucene2_1/beagled/Snowball.Net/upstream-changes/
branches/beagle-lucene2_1/beagled/Snowball.Net/upstream-changes/01_reuse-stemmer.patch
Modified:
branches/beagle-lucene2_1/beagled/LuceneCommon.cs
branches/beagle-lucene2_1/beagled/NoiseFilter.cs
branches/beagle-lucene2_1/beagled/Snowball.Net/Lucene.Net/Analysis/Snowball/SnowballFilter.cs
Modified: branches/beagle-lucene2_1/beagled/LuceneCommon.cs
==============================================================================
--- branches/beagle-lucene2_1/beagled/LuceneCommon.cs (original)
+++ branches/beagle-lucene2_1/beagled/LuceneCommon.cs Sun Mar 2 18:26:36 2008
@@ -26,6 +26,7 @@
using System;
using System.Collections;
+using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.IO;
@@ -42,6 +43,9 @@
using Lucene.Net.QueryParsers;
using LNS = Lucene.Net.Search;
+using SF.Snowball.Ext;
+using SnowballProgram = SF.Snowball.SnowballProgram;
+
using Beagle.Util;
namespace Beagle.Daemon {
@@ -477,12 +481,12 @@
}
// FIXME: This assumes everything being indexed is in English!
- internal class BeagleAnalyzer : StandardAnalyzer {
+ public class BeagleAnalyzer : StandardAnalyzer {
+ const string DEFAULT_STEMMER_LANGUAGE = "English";
private char [] buffer = new char [2];
private bool strip_extra_property_info = false;
private bool tokenize_email_hostname = false;
- const string DEFAULT_STEMMER = "English";
public BeagleAnalyzer (bool is_indexing_analyzer)
{
@@ -539,7 +543,10 @@
|| fieldName == "PropertyText"
|| is_text_prop) {
outstream = new NoiseEmailHostFilter (outstream, tokenize_email_hostname);
- outstream = new SnowballFilter (outstream, DEFAULT_STEMMER);
+ // Sharing Stemmer is not thread safe.
+ // Currently our underlying lucene indexing is not done in multiple threads.
+ StemmerInfo stemmer_info = GetStemmer (DEFAULT_STEMMER_LANGUAGE);
+ outstream = new SnowballFilter (outstream, stemmer_info.Stemmer, stemmer_info.StemMethod);
}
return outstream;
@@ -1039,17 +1046,42 @@
// Access to the stemmer and list of stop words
//
- static SF.Snowball.Ext.EnglishStemmer stemmer = new SF.Snowball.Ext.EnglishStemmer ();
+ private static Dictionary<string, StemmerInfo> stemmer_table = new Dictionary<string, StemmerInfo> ();
+
+ class StemmerInfo {
+ internal SnowballProgram Stemmer;
+ internal System.Reflection.MethodInfo StemMethod;
+ }
+
+ private static StemmerInfo GetStemmer (System.String name)
+ {
+ if (! stemmer_table.ContainsKey (name)) {
+ StemmerInfo stemmer_info = new StemmerInfo ();
+
+ // Taken from Snowball/SnowballFilter.cs
+ System.Type stemClass = System.Type.GetType ("SF.Snowball.Ext." + name + "Stemmer", true);
+ SnowballProgram stemmer = (SnowballProgram) System.Activator.CreateInstance (stemClass);
+ // why doesn't the SnowballProgram class have an (abstract?) stem method?
+ System.Reflection.MethodInfo stemMethod = stemClass.GetMethod ("Stem", (new System.Type [0] == null) ? new System.Type [0] : (System.Type []) new System.Type [0]);
+
+ stemmer_info.Stemmer = stemmer;
+ stemmer_info.StemMethod = stemMethod;
+ stemmer_table [name] = stemmer_info;
+ }
+
+ return stemmer_table [name];
+ }
+
+ private static SF.Snowball.Ext.EnglishStemmer default_stemmer = new SF.Snowball.Ext.EnglishStemmer ();
static public string Stem (string str)
{
string stemmed_str;
- lock (stemmer) {
- stemmer.SetCurrent (str);
- stemmer.Stem ();
- stemmed_str = stemmer.GetCurrent ();
- stemmer.SetCurrent (String.Empty);
+ lock (default_stemmer) {
+ default_stemmer.SetCurrent (str);
+ default_stemmer.Stem ();
+ stemmed_str = default_stemmer.GetCurrent ();
}
return stemmed_str;
Modified: branches/beagle-lucene2_1/beagled/NoiseFilter.cs
==============================================================================
--- branches/beagle-lucene2_1/beagled/NoiseFilter.cs (original)
+++ branches/beagle-lucene2_1/beagled/NoiseFilter.cs Sun Mar 2 18:26:36 2008
@@ -38,7 +38,7 @@
// 1. Removes words which are potential noise like dhyhy8ju7q9
// 2. Splits email addresses into meaningful tokens
// 3. Splits hostnames into subparts
- class NoiseEmailHostFilter : TokenFilter {
+ public class NoiseEmailHostFilter : TokenFilter {
private bool tokenize_email_hostname;
Modified: branches/beagle-lucene2_1/beagled/Snowball.Net/Lucene.Net/Analysis/Snowball/SnowballFilter.cs
==============================================================================
--- branches/beagle-lucene2_1/beagled/Snowball.Net/Lucene.Net/Analysis/Snowball/SnowballFilter.cs (original)
+++ branches/beagle-lucene2_1/beagled/Snowball.Net/Lucene.Net/Analysis/Snowball/SnowballFilter.cs Sun Mar 2 18:26:36 2008
@@ -60,7 +60,13 @@
throw new System.SystemException(e.ToString());
}
}
-
+
+ public SnowballFilter(TokenStream in_Renamed, SnowballProgram stemmer, System.Reflection.MethodInfo stemMethod) : base(in_Renamed)
+ {
+ this.stemmer = stemmer;
+ this.stemMethod = stemMethod;
+ }
+
/// <summary>Returns the next input Token, after being stemmed </summary>
public override Token Next()
{
@@ -81,5 +87,12 @@
newToken.SetPositionIncrement(token.GetPositionIncrement());
return newToken;
}
+
+ public override void Close()
+ {
+ // In case stemmer was shared
+ stemmer.SetCurrent(String.Empty);
+ base.Close();
+ }
}
-}
\ No newline at end of file
+}
Added: branches/beagle-lucene2_1/beagled/Snowball.Net/upstream-changes/01_reuse-stemmer.patch
==============================================================================
--- (empty file)
+++ branches/beagle-lucene2_1/beagled/Snowball.Net/upstream-changes/01_reuse-stemmer.patch Sun Mar 2 18:26:36 2008
@@ -0,0 +1,38 @@
+Reuse stemmers as much as possible.
+
+From: D Bera <dbera web gmail com>
+
+Index: Lucene.Net/Analysis/Snowball/SnowballFilter.cs
+===================================================================
+--- Lucene.Net/Analysis/Snowball/SnowballFilter.cs (revision 4503)
++++ Lucene.Net/Analysis/Snowball/SnowballFilter.cs (working copy)
+@@ -60,7 +60,13 @@
+ throw new System.SystemException(e.ToString());
+ }
+ }
+-
++
++ public SnowballFilter(TokenStream in_Renamed, SnowballProgram stemmer, System.Reflection.MethodInfo stemMethod) : base(in_Renamed)
++ {
++ this.stemmer = stemmer;
++ this.stemMethod = stemMethod;
++ }
++
+ /// <summary>Returns the next input Token, after being stemmed </summary>
+ public override Token Next()
+ {
+@@ -81,5 +87,12 @@
+ newToken.SetPositionIncrement(token.GetPositionIncrement());
+ return newToken;
+ }
++
++ public override void Close()
++ {
++ // In case stemmer was shared
++ stemmer.SetCurrent(String.Empty);
++ base.Close();
++ }
+ }
+-}
+\ No newline at end of file
++}
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]