From: dbera@svn.gnome.org
To: svn-commits-list@gnome.org
Subject: beagle r4579 - in branches/beagle-rdf: . Util beagled beagled/Lucene.Net/Search beagled/Lucene.Net/upstream-changes beagled/Snowball.Net/Lucene.Net/Analysis/Snowball beagled/Snowball.Net/upstream-changes
Date: Mon, 3 Mar 2008 23:22:43 +0000 (GMT)
Author: dbera
Date: Mon Mar 3 23:22:43 2008
New Revision: 4579
URL: http://svn.gnome.org/viewvc/beagle?rev=4579&view=rev
Log:
Merge from trunk (Lucene-2.1 changes) 4575-4577.
Added:
branches/beagle-rdf/beagled/Lucene.Net/upstream-changes/17_more-fieldselector.patch
branches/beagle-rdf/beagled/Snowball.Net/upstream-changes/
- copied from r4577, /trunk/beagle/beagled/Snowball.Net/upstream-changes/
Modified:
branches/beagle-rdf/ (props changed)
branches/beagle-rdf/Util/PullingReader.cs
branches/beagle-rdf/Util/StringFu.cs
branches/beagle-rdf/beagled/BuildIndex.cs
branches/beagle-rdf/beagled/Lucene.Net/Search/Hits.cs
branches/beagle-rdf/beagled/Lucene.Net/Search/IndexSearcher.cs
branches/beagle-rdf/beagled/Lucene.Net/Search/MultiSearcher.cs
branches/beagle-rdf/beagled/Lucene.Net/Search/Searcher.cs
branches/beagle-rdf/beagled/LuceneBitArray.cs
branches/beagle-rdf/beagled/LuceneCommon.cs
branches/beagle-rdf/beagled/LuceneIndexingDriver.cs
branches/beagle-rdf/beagled/LuceneQueryingDriver.cs
branches/beagle-rdf/beagled/Makefile.am
branches/beagle-rdf/beagled/NoiseFilter.cs
branches/beagle-rdf/beagled/Snowball.Net/Lucene.Net/Analysis/Snowball/SnowballFilter.cs
Modified: branches/beagle-rdf/Util/PullingReader.cs
==============================================================================
--- branches/beagle-rdf/Util/PullingReader.cs (original)
+++ branches/beagle-rdf/Util/PullingReader.cs Mon Mar 3 23:22:43 2008
@@ -56,6 +56,7 @@
done = ! pull (pullBuffer, neededSize - pullBuffer.Length);
} catch (Exception e) {
Logger.Log.Debug (e, "Caught exception pulling text from {0}", pull);
+ done = true;
}
}
}
@@ -88,8 +89,7 @@
if (done && pullBuffer.Length < count)
count = pullBuffer.Length;
- for (int i = 0; i < count; ++i)
- buffer [index + i] = pullBuffer [i];
+ pullBuffer.CopyTo (0, buffer, index, count);
pullBuffer.Remove (0, count);
return count;
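
The hunk above replaces a character-by-character copy with a single StringBuilder.CopyTo call (and marks the reader done when the pull delegate throws). A minimal standalone sketch of the same copy-and-consume pattern (hypothetical buffer contents, not part of this commit):

using System;
using System.Text;

class CopyToSketch {
    static void Main ()
    {
        StringBuilder pullBuffer = new StringBuilder ("hello world");
        char [] buffer = new char [5];

        // Copy the first `count` chars into the caller's buffer,
        // then drop them from the front of the pull buffer.
        int count = Math.Min (buffer.Length, pullBuffer.Length);
        pullBuffer.CopyTo (0, buffer, 0, count);
        pullBuffer.Remove (0, count);

        Console.WriteLine (new string (buffer));  // "hello"
        Console.WriteLine (pullBuffer);           // " world"
    }
}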
Modified: branches/beagle-rdf/Util/StringFu.cs
==============================================================================
--- branches/beagle-rdf/Util/StringFu.cs (original)
+++ branches/beagle-rdf/Util/StringFu.cs Mon Mar 3 23:22:43 2008
@@ -26,6 +26,7 @@
using System;
using System.Collections;
+using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Text;
@@ -459,9 +460,14 @@
static public string HexEscape (string str)
{
- StringBuilder builder = new StringBuilder ();
+ int index = -1;
+ if ((index = str.IndexOfAny (CharsToQuote)) == -1)
+ return str;
+
+ StringBuilder builder = new StringBuilder (str, 0, index, str.Length << 1);
- foreach (char c in str) {
+ for (; index < str.Length; ++ index) {
+ char c = str [index];
if (ArrayFu.IndexOfChar (CharsToQuote, c) != -1)
builder.Append (Uri.HexEscape (c));
@@ -491,23 +497,26 @@
/// </returns>
static public string HexUnescape (string str)
{
- ArrayList bytes = new ArrayList ();
- byte[] sub_bytes;
int i, pos = 0;
+ if ((i = str.IndexOf ('%')) == -1)
+ return str;
- while ((i = str.IndexOf ('%', pos)) != -1) {
+ List<byte> bytes = new List<byte> (str.Length);
+ byte[] sub_bytes;
+
+ do {
sub_bytes = Encoding.UTF8.GetBytes (str.Substring (pos, i - pos));
bytes.AddRange (sub_bytes);
pos = i;
char unescaped = Uri.HexUnescape (str, ref pos);
- bytes.Add ((byte) unescaped);
- }
+ bytes.Add (Convert.ToByte (unescaped));
+ } while ((i = str.IndexOf ('%', pos)) != -1);
sub_bytes = Encoding.UTF8.GetBytes (str.Substring (pos, str.Length - pos));
bytes.AddRange (sub_bytes);
- return Encoding.UTF8.GetString ((byte[]) bytes.ToArray (typeof (byte)));
+ return Encoding.UTF8.GetString (bytes.ToArray ());
}
// These strings should never be exposed to the user.
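
Both rewrites above short-circuit when the string needs no work: HexEscape returns the input untouched unless it contains a character from CharsToQuote, and HexUnescape bails out before allocating unless a '%' is present. A hedged sketch of the framework primitives they build on (not commit code):

using System;

class HexSketch {
    static void Main ()
    {
        // Uri.HexEscape turns a single char into its %XX form ...
        string escaped = Uri.HexEscape (' ');        // "%20"

        // ... and Uri.HexUnescape decodes one %XX sequence,
        // advancing `pos` past the consumed characters.
        int pos = 0;
        char c = Uri.HexUnescape ("%20", ref pos);   // c = ' ', pos = 3

        Console.WriteLine ("{0} -> '{1}' (pos {2})", escaped, c, pos);
    }
}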
Modified: branches/beagle-rdf/beagled/BuildIndex.cs
==============================================================================
--- branches/beagle-rdf/beagled/BuildIndex.cs (original)
+++ branches/beagle-rdf/beagled/BuildIndex.cs Mon Mar 3 23:22:43 2008
@@ -102,7 +102,7 @@
static Queue pending_directories = new Queue ();
static IndexerRequest pending_request;
- const int BATCH_SIZE = 30;
+ const int BATCH_SIZE = Lucene.Net.Index.IndexWriter.DEFAULT_MAX_BUFFERED_DOCS;
/////////////////////////////////////////////////////////
Modified: branches/beagle-rdf/beagled/Lucene.Net/Search/Hits.cs
==============================================================================
--- branches/beagle-rdf/beagled/Lucene.Net/Search/Hits.cs (original)
+++ branches/beagle-rdf/beagled/Lucene.Net/Search/Hits.cs Mon Mar 3 23:22:43 2008
@@ -18,6 +18,7 @@
using System;
using Document = Lucene.Net.Documents.Document;
+using FieldSelector = Lucene.Net.Documents.FieldSelector;
namespace Lucene.Net.Search
{
@@ -90,11 +91,17 @@
return length;
}
+ public Document Doc(int n)
+ {
+ return Doc(n, null);
+ }
+
/// <summary>Returns the stored fields of the n<sup>th</sup> document in this set.
/// <p>Documents are cached, so that repeated requests for the same element may
- /// return the same Document object.
+ /// return the same Document object. If the fieldselector is changed, then the new
+ /// fields will not be loaded.
/// </summary>
- public Document Doc(int n)
+ public Document Doc(int n, FieldSelector fieldSelector)
{
HitDoc hitDoc = HitDoc(n);
@@ -111,12 +118,15 @@
if (hitDoc.doc == null)
{
- hitDoc.doc = searcher.Doc(hitDoc.id); // cache miss: read document
+ if (fieldSelector == null)
+ hitDoc.doc = searcher.Doc(hitDoc.id); // cache miss: read document
+ else
+ hitDoc.doc = searcher.Doc(hitDoc.id, fieldSelector); // cache miss: read document
}
return hitDoc.doc;
}
-
+
/// <summary>Returns the score for the nth document in this set. </summary>
public float Score(int n)
{
@@ -222,4 +232,4 @@
id = i;
}
}
-}
\ No newline at end of file
+}
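
The new Doc (n, fieldSelector) overload only loads the selected stored fields on a cache miss; as the updated doc comment warns, the cached Document is then reused, so a later call with a different selector will not load additional fields. A hedged usage sketch (hypothetical field names):

using Lucene.Net.Documents;
using Lucene.Net.Search;

class HitsDocSketch {
    static void PrintUri (Hits hits)
    {
        // Materialize only the "Uri" stored field of the first hit.
        FieldSelector uri_only = new MapFieldSelector (new string [] { "Uri" });
        Document doc = hits.Doc (0, uri_only);
        System.Console.WriteLine (doc.Get ("Uri"));

        // Note: hits.Doc (0) now returns the same cached Document;
        // fields outside the selector were never read from disk.
    }
}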
Modified: branches/beagle-rdf/beagled/Lucene.Net/Search/IndexSearcher.cs
==============================================================================
--- branches/beagle-rdf/beagled/Lucene.Net/Search/IndexSearcher.cs (original)
+++ branches/beagle-rdf/beagled/Lucene.Net/Search/IndexSearcher.cs Mon Mar 3 23:22:43 2008
@@ -21,6 +21,7 @@
using Document = Lucene.Net.Documents.Document;
using IndexReader = Lucene.Net.Index.IndexReader;
using Term = Lucene.Net.Index.Term;
+using FieldSelector = Lucene.Net.Documents.FieldSelector;
namespace Lucene.Net.Search
{
@@ -126,6 +127,11 @@
return reader.Document(i);
}
+ public override Document Doc(int i, FieldSelector fieldSelector)
+ {
+ return reader.Document(i, fieldSelector);
+ }
+
// inherit javadoc
public override int MaxDoc()
{
@@ -185,4 +191,4 @@
return weight.Explain(reader, doc);
}
}
-}
\ No newline at end of file
+}
Modified: branches/beagle-rdf/beagled/Lucene.Net/Search/MultiSearcher.cs
==============================================================================
--- branches/beagle-rdf/beagled/Lucene.Net/Search/MultiSearcher.cs (original)
+++ branches/beagle-rdf/beagled/Lucene.Net/Search/MultiSearcher.cs Mon Mar 3 23:22:43 2008
@@ -19,6 +19,7 @@
using Document = Lucene.Net.Documents.Document;
using Term = Lucene.Net.Index.Term;
+using FieldSelector = Lucene.Net.Documents.FieldSelector;
namespace Lucene.Net.Search
{
@@ -121,6 +122,11 @@
throw new System.NotSupportedException();
}
+ public override Document Doc(int i, FieldSelector fieldSelector)
+ {
+ throw new System.NotSupportedException();
+ }
+
public override Explanation Explain(Weight weight, int doc)
{
throw new System.NotSupportedException();
@@ -195,6 +201,11 @@
return searchables[i].Doc(n - starts[i]); // dispatch to searcher
}
+ public override Document Doc(int n, FieldSelector fieldSelector)
+ {
+ throw new System.NotSupportedException();
+ }
+
/// <summary>Returns index of the searcher for document <code>n</code> in the array
/// used to construct this searcher.
@@ -389,4 +400,4 @@
return rewrittenQuery.Weight(cacheSim);
}
}
-}
\ No newline at end of file
+}
Modified: branches/beagle-rdf/beagled/Lucene.Net/Search/Searcher.cs
==============================================================================
--- branches/beagle-rdf/beagled/Lucene.Net/Search/Searcher.cs (original)
+++ branches/beagle-rdf/beagled/Lucene.Net/Search/Searcher.cs Mon Mar 3 23:22:43 2008
@@ -19,6 +19,7 @@
using Term = Lucene.Net.Index.Term;
using Document = Lucene.Net.Documents.Document;
+using FieldSelector = Lucene.Net.Documents.FieldSelector;
namespace Lucene.Net.Search
{
@@ -208,9 +209,10 @@
abstract public int MaxDoc();
abstract public TopDocs Search(Weight weight, Filter filter, int n);
abstract public Document Doc(int i);
+ abstract public Document Doc(int i, FieldSelector fieldSelector);
abstract public Query Rewrite(Query query);
abstract public Explanation Explain(Weight weight, int doc);
abstract public TopFieldDocs Search(Weight weight, Filter filter, int n, Sort sort);
/* End patch for GCJ bug #15411. */
}
-}
\ No newline at end of file
+}
Added: branches/beagle-rdf/beagled/Lucene.Net/upstream-changes/17_more-fieldselector.patch
==============================================================================
--- (empty file)
+++ branches/beagle-rdf/beagled/Lucene.Net/upstream-changes/17_more-fieldselector.patch Mon Mar 3 23:22:43 2008
@@ -0,0 +1,157 @@
+Index: Search/IndexSearcher.cs
+===================================================================
+--- Search/IndexSearcher.cs (revision 4576)
++++ Search/IndexSearcher.cs (working copy)
+@@ -21,6 +21,7 @@
+ using Document = Lucene.Net.Documents.Document;
+ using IndexReader = Lucene.Net.Index.IndexReader;
+ using Term = Lucene.Net.Index.Term;
++using FieldSelector = Lucene.Net.Documents.FieldSelector;
+
+ namespace Lucene.Net.Search
+ {
+@@ -126,6 +127,11 @@
+ return reader.Document(i);
+ }
+
++ public override Document Doc(int i, FieldSelector fieldSelector)
++ {
++ return reader.Document(i, fieldSelector);
++ }
++
+ // inherit javadoc
+ public override int MaxDoc()
+ {
+@@ -185,4 +191,4 @@
+ return weight.Explain(reader, doc);
+ }
+ }
+-}
+\ No newline at end of file
++}
+Index: Search/Searcher.cs
+===================================================================
+--- Search/Searcher.cs (revision 4576)
++++ Search/Searcher.cs (working copy)
+@@ -19,6 +19,7 @@
+
+ using Term = Lucene.Net.Index.Term;
+ using Document = Lucene.Net.Documents.Document;
++using FieldSelector = Lucene.Net.Documents.FieldSelector;
+
+ namespace Lucene.Net.Search
+ {
+@@ -208,9 +209,10 @@
+ abstract public int MaxDoc();
+ abstract public TopDocs Search(Weight weight, Filter filter, int n);
+ abstract public Document Doc(int i);
++ abstract public Document Doc(int i, FieldSelector fieldSelector);
+ abstract public Query Rewrite(Query query);
+ abstract public Explanation Explain(Weight weight, int doc);
+ abstract public TopFieldDocs Search(Weight weight, Filter filter, int n, Sort sort);
+ /* End patch for GCJ bug #15411. */
+ }
+-}
+\ No newline at end of file
++}
+Index: Search/Hits.cs
+===================================================================
+--- Search/Hits.cs (revision 4576)
++++ Search/Hits.cs (working copy)
+@@ -18,6 +18,7 @@
+ using System;
+
+ using Document = Lucene.Net.Documents.Document;
++using FieldSelector = Lucene.Net.Documents.FieldSelector;
+
+ namespace Lucene.Net.Search
+ {
+@@ -90,11 +91,17 @@
+ return length;
+ }
+
++ public Document Doc(int n)
++ {
++ return Doc(n, null);
++ }
++
+ /// <summary>Returns the stored fields of the n<sup>th</sup> document in this set.
+ /// <p>Documents are cached, so that repeated requests for the same element may
+- /// return the same Document object.
++ /// return the same Document object. If the fieldselector is changed, then the new
++ /// fields will not be loaded.
+ /// </summary>
+- public Document Doc(int n)
++ public Document Doc(int n, FieldSelector fieldSelector)
+ {
+ HitDoc hitDoc = HitDoc(n);
+
+@@ -111,12 +118,15 @@
+
+ if (hitDoc.doc == null)
+ {
+- hitDoc.doc = searcher.Doc(hitDoc.id); // cache miss: read document
++ if (fieldSelector == null)
++ hitDoc.doc = searcher.Doc(hitDoc.id); // cache miss: read document
++ else
++ hitDoc.doc = searcher.Doc(hitDoc.id, fieldSelector); // cache miss: read document
+ }
+
+ return hitDoc.doc;
+ }
+-
++
+ /// <summary>Returns the score for the nth document in this set. </summary>
+ public float Score(int n)
+ {
+@@ -222,4 +232,4 @@
+ id = i;
+ }
+ }
+-}
+\ No newline at end of file
++}
+Index: Search/MultiSearcher.cs
+===================================================================
+--- Search/MultiSearcher.cs (revision 4576)
++++ Search/MultiSearcher.cs (working copy)
+@@ -19,6 +19,7 @@
+
+ using Document = Lucene.Net.Documents.Document;
+ using Term = Lucene.Net.Index.Term;
++using FieldSelector = Lucene.Net.Documents.FieldSelector;
+
+ namespace Lucene.Net.Search
+ {
+@@ -121,6 +122,11 @@
+ throw new System.NotSupportedException();
+ }
+
++ public override Document Doc(int i, FieldSelector fieldSelector)
++ {
++ throw new System.NotSupportedException();
++ }
++
+ public override Explanation Explain(Weight weight, int doc)
+ {
+ throw new System.NotSupportedException();
+@@ -195,7 +201,12 @@
+ return searchables[i].Doc(n - starts[i]); // dispatch to searcher
+ }
+
++ public override Document Doc(int n, FieldSelector fieldSelector)
++ {
++ throw new System.NotSupportedException();
++ }
+
++
+ /// <summary>Returns index of the searcher for document <code>n</code> in the array
+ /// used to construct this searcher.
+ /// </summary>
+@@ -389,4 +400,4 @@
+ return rewrittenQuery.Weight(cacheSim);
+ }
+ }
+-}
+\ No newline at end of file
++}
Modified: branches/beagle-rdf/beagled/LuceneBitArray.cs
==============================================================================
--- branches/beagle-rdf/beagled/LuceneBitArray.cs (original)
+++ branches/beagle-rdf/beagled/LuceneBitArray.cs Mon Mar 3 23:22:43 2008
@@ -198,8 +198,6 @@
////////////////////////////////////////////////////////////
- static string[] fields_uri = { "Timestamp", "Uri" };
-
public void ProjectOnto (LuceneBitArray other)
{
int j = 0;
@@ -211,7 +209,7 @@
j = i+1;
Document doc;
- doc = searcher.Doc (i, fields_uri);
+ doc = searcher.Doc (i, LuceneQueryingDriver.fields_uri);
other.AddUri (doc.Get ("Uri"));
}
Modified: branches/beagle-rdf/beagled/LuceneCommon.cs
==============================================================================
--- branches/beagle-rdf/beagled/LuceneCommon.cs (original)
+++ branches/beagle-rdf/beagled/LuceneCommon.cs Mon Mar 3 23:22:43 2008
@@ -26,6 +26,7 @@
using System;
using System.Collections;
+using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.IO;
@@ -42,6 +43,9 @@
using Lucene.Net.QueryParsers;
using LNS = Lucene.Net.Search;
+using SF.Snowball.Ext;
+using SnowballProgram = SF.Snowball.SnowballProgram;
+
using Beagle.Util;
namespace Beagle.Daemon {
@@ -102,7 +106,7 @@
private Lucene.Net.Store.Directory secondary_store = null;
// Flush if more than this number of requests
- public const int RequestFlushThreshold = 37; // a total arbitrary magic number
+ public const int RequestFlushThreshold = Lucene.Net.Index.IndexWriter.DEFAULT_MAX_BUFFERED_DOCS; // Use same value as Lucene's flush threshold
//////////////////////////////////////////////////////////////////////////////
@@ -383,7 +387,7 @@
// Create a new store.
Lucene.Net.Store.Directory store;
- store = Lucene.Net.Store.FSDirectory.GetDirectory (path, LockDirectory, true);
+ store = Lucene.Net.Store.FSDirectory.GetDirectory (path, new Lucene.Net.Store.SimpleFSLockFactory (LockDirectory));
// Create an empty index in that store.
IndexWriter writer;
@@ -441,8 +445,14 @@
reader.Close ();
// Create stores for our indexes.
- primary_store = Lucene.Net.Store.FSDirectory.GetDirectory (PrimaryIndexDirectory, LockDirectory, false, read_only_mode);
- secondary_store = Lucene.Net.Store.FSDirectory.GetDirectory (SecondaryIndexDirectory, LockDirectory, false, read_only_mode);
+ // Use separate lock factories since each lock factory is tied to the index directory
+ if (read_only_mode) {
+ primary_store = Lucene.Net.Store.FSDirectory.GetDirectory (PrimaryIndexDirectory, Lucene.Net.Store.NoLockFactory.GetNoLockFactory ());
+ secondary_store = Lucene.Net.Store.FSDirectory.GetDirectory (SecondaryIndexDirectory, Lucene.Net.Store.NoLockFactory.GetNoLockFactory ());
+ } else {
+ primary_store = Lucene.Net.Store.FSDirectory.GetDirectory (PrimaryIndexDirectory, new Lucene.Net.Store.SimpleFSLockFactory (LockDirectory));
+ secondary_store = Lucene.Net.Store.FSDirectory.GetDirectory (SecondaryIndexDirectory, new Lucene.Net.Store.SimpleFSLockFactory (LockDirectory));
+ }
}
////////////////////////////////////////////////////////////////
@@ -475,12 +485,12 @@
}
// FIXME: This assumes everything being indexed is in English!
- internal class BeagleAnalyzer : StandardAnalyzer {
+ public class BeagleAnalyzer : StandardAnalyzer {
+ const string DEFAULT_STEMMER_LANGUAGE = "English";
private char [] buffer = new char [2];
private bool strip_extra_property_info = false;
private bool tokenize_email_hostname = false;
- const string DEFAULT_STEMMER = "English";
public BeagleAnalyzer (bool is_indexing_analyzer)
{
@@ -540,7 +550,10 @@
|| fieldName == "PropertyText"
|| is_text_prop) {
outstream = new NoiseEmailHostFilter (outstream, tokenize_email_hostname);
- outstream = new SnowballFilter (outstream, DEFAULT_STEMMER);
+ // Sharing Stemmer is not thread safe.
+ // Currently our underlying lucene indexing is not done in multiple threads.
+ StemmerInfo stemmer_info = GetStemmer (DEFAULT_STEMMER_LANGUAGE);
+ outstream = new SnowballFilter (outstream, stemmer_info.Stemmer, stemmer_info.StemMethod);
}
return outstream;
@@ -1077,17 +1090,42 @@
// Access to the stemmer and list of stop words
//
- static SF.Snowball.Ext.EnglishStemmer stemmer = new SF.Snowball.Ext.EnglishStemmer ();
+ private static Dictionary<string, StemmerInfo> stemmer_table = new Dictionary<string, StemmerInfo> ();
+
+ class StemmerInfo {
+ internal SnowballProgram Stemmer;
+ internal System.Reflection.MethodInfo StemMethod;
+ }
+
+ private static StemmerInfo GetStemmer (System.String name)
+ {
+ if (! stemmer_table.ContainsKey (name)) {
+ StemmerInfo stemmer_info = new StemmerInfo ();
+
+ // Taken from Snowball/SnowballFilter.cs
+ System.Type stemClass = System.Type.GetType ("SF.Snowball.Ext." + name + "Stemmer", true);
+ SnowballProgram stemmer = (SnowballProgram) System.Activator.CreateInstance (stemClass);
+ // why doesn't the SnowballProgram class have an (abstract?) stem method?
+ System.Reflection.MethodInfo stemMethod = stemClass.GetMethod ("Stem", (new System.Type [0] == null) ? new System.Type [0] : (System.Type []) new System.Type [0]);
+
+ stemmer_info.Stemmer = stemmer;
+ stemmer_info.StemMethod = stemMethod;
+ stemmer_table [name] = stemmer_info;
+ }
+
+ return stemmer_table [name];
+ }
+
+ private static SF.Snowball.Ext.EnglishStemmer default_stemmer = new SF.Snowball.Ext.EnglishStemmer ();
static public string Stem (string str)
{
string stemmed_str;
- lock (stemmer) {
- stemmer.SetCurrent (str);
- stemmer.Stem ();
- stemmed_str = stemmer.GetCurrent ();
- stemmer.SetCurrent (String.Empty);
+ lock (default_stemmer) {
+ default_stemmer.SetCurrent (str);
+ default_stemmer.Stem ();
+ stemmed_str = default_stemmer.GetCurrent ();
}
return stemmed_str;
@@ -1376,11 +1414,11 @@
if (d1 != 1 || d2 != DateTime.DaysInMonth (y2, m2)) {
LNS.BooleanQuery sub_query;
sub_query = new LNS.BooleanQuery ();
- sub_query.Add (ym_query, true, false);
- sub_query.Add (NewDayQuery (field_name, d1, d2), true, false);
- top_level_query.Add (sub_query, false, false);
+ sub_query.Add (ym_query, LNS.BooleanClause.Occur.MUST);
+ sub_query.Add (NewDayQuery (field_name, d1, d2), LNS.BooleanClause.Occur.MUST);
+ top_level_query.Add (sub_query, LNS.BooleanClause.Occur.SHOULD);
} else {
- top_level_query.Add (ym_query, false, false);
+ top_level_query.Add (ym_query, LNS.BooleanClause.Occur.SHOULD);
}
} else {
@@ -1389,9 +1427,9 @@
if (d1 > 1) {
LNS.BooleanQuery sub_query;
sub_query = new LNS.BooleanQuery ();
- sub_query.Add (NewYearMonthQuery (field_name, y1, m1), true, false);
- sub_query.Add (NewDayQuery (field_name, d1, DateTime.DaysInMonth (y1, m1)), true, false);
- top_level_query.Add (sub_query, false, false);
+ sub_query.Add (NewYearMonthQuery (field_name, y1, m1), LNS.BooleanClause.Occur.MUST);
+ sub_query.Add (NewDayQuery (field_name, d1, DateTime.DaysInMonth (y1, m1)), LNS.BooleanClause.Occur.MUST);
+ top_level_query.Add (sub_query, LNS.BooleanClause.Occur.SHOULD);
++m1;
if (m1 == 13) {
@@ -1404,9 +1442,9 @@
if (d2 < DateTime.DaysInMonth (y2, m2)) {
LNS.BooleanQuery sub_query;
sub_query = new LNS.BooleanQuery ();
- sub_query.Add (NewYearMonthQuery (field_name, y2, m2), true, false);
- sub_query.Add (NewDayQuery (field_name, 1, d2), true, false);
- top_level_query.Add (sub_query, false, false);
+ sub_query.Add (NewYearMonthQuery (field_name, y2, m2), LNS.BooleanClause.Occur.MUST);
+ sub_query.Add (NewDayQuery (field_name, 1, d2), LNS.BooleanClause.Occur.MUST);
+ top_level_query.Add (sub_query, LNS.BooleanClause.Occur.SHOULD);
--m2;
if (m2 == 0) {
@@ -1418,7 +1456,7 @@
// Generate the query for the "middle" of our period, if it is non-empty
if (y1 < y2 || ((y1 == y2) && m1 <= m2))
top_level_query.Add (NewYearMonthQuery (field_name, y1, m1, y2, m2),
- false, false);
+ LNS.BooleanClause.Occur.SHOULD);
}
return top_level_query;
@@ -1478,14 +1516,14 @@
LNS.Query subquery;
subquery = StringToQuery ("Text", part.Text, term_list);
if (subquery != null) {
- p_query.Add (subquery, false, false);
+ p_query.Add (subquery, LNS.BooleanClause.Occur.SHOULD);
added_subquery = true;
}
// FIXME: HotText is ignored for now!
// subquery = StringToQuery ("HotText", part.Text);
// if (subquery != null) {
- // p_query.Add (subquery, false, false);
+ // p_query.Add (subquery, LNS.BooleanClause.Occur.SHOULD);
// added_subquery = true;
// }
}
@@ -1494,10 +1532,10 @@
LNS.Query subquery;
subquery = StringToQuery ("PropertyText", part.Text, term_list);
if (subquery != null) {
- p_query.Add (subquery, false, false);
+ p_query.Add (subquery, LNS.BooleanClause.Occur.SHOULD);
// Properties can live in either index
if (! only_build_primary_query)
- s_query.Add (subquery.Clone () as LNS.Query, false, false);
+ s_query.Add (subquery.Clone () as LNS.Query, LNS.BooleanClause.Occur.SHOULD);
added_subquery = true;
}
@@ -1528,10 +1566,10 @@
if (term_list != null)
term_list.Add (term);
subquery = new LNS.TermQuery (term);
- p_query.Add (subquery, false, false);
+ p_query.Add (subquery, LNS.BooleanClause.Occur.SHOULD);
// Properties can live in either index
if (! only_build_primary_query)
- s_query.Add (subquery.Clone () as LNS.Query, false, false);
+ s_query.Add (subquery.Clone () as LNS.Query, LNS.BooleanClause.Occur.SHOULD);
} else {
// Reset these so we return a null query
p_query = null;
@@ -1561,26 +1599,26 @@
// Search text content
term = new Term ("Text", query_string_lower);
subquery = new LNS.WildcardQuery (term);
- p_query.Add (subquery, false, false);
+ p_query.Add (subquery, LNS.BooleanClause.Occur.SHOULD);
term_list.Add (term);
// Search text properties
term = new Term ("PropertyText", query_string_lower);
subquery = new LNS.WildcardQuery (term);
- p_query.Add (subquery, false, false);
+ p_query.Add (subquery, LNS.BooleanClause.Occur.SHOULD);
// Properties can live in either index
if (! only_build_primary_query)
- s_query.Add (subquery.Clone () as LNS.Query, false, false);
+ s_query.Add (subquery.Clone () as LNS.Query, LNS.BooleanClause.Occur.SHOULD);
term_list.Add (term);
// Search property keywords
term = new Term ("PropertyKeyword", query_string_lower);
term_list.Add (term);
subquery = new LNS.WildcardQuery (term);
- p_query.Add (subquery, false, false);
+ p_query.Add (subquery, LNS.BooleanClause.Occur.SHOULD);
// Properties can live in either index
if (! only_build_primary_query)
- s_query.Add (subquery.Clone () as LNS.Query, false, false);
+ s_query.Add (subquery.Clone () as LNS.Query, LNS.BooleanClause.Occur.SHOULD);
primary_query = p_query;
if (! only_build_primary_query)
@@ -1633,9 +1671,9 @@
term_list, query_part_hook,
out p_subq, out s_subq, out sub_hit_filter);
if (p_subq != null)
- p_query.Add (p_subq, false, false);
+ p_query.Add (p_subq, LNS.BooleanClause.Occur.SHOULD);
if (s_subq != null)
- s_query.Add (s_subq, false, false);
+ s_query.Add (s_subq, LNS.BooleanClause.Occur.SHOULD);
if (sub_hit_filter != null) {
if (or_hit_filter == null)
or_hit_filter = new OrHitFilter ();
@@ -1726,7 +1764,7 @@
int cursor = 0;
if (extra_requirement != null) {
- top_query.Add (extra_requirement, true, false);
+ top_query.Add (extra_requirement, LNS.BooleanClause.Occur.MUST);
++cursor;
}
@@ -1738,7 +1776,7 @@
LNS.BooleanQuery bq;
bq = new LNS.BooleanQuery ();
bottom_queries.Add (bq);
- top_query.Add (bq, false, false);
+ top_query.Add (bq, LNS.BooleanClause.Occur.SHOULD);
}
}
@@ -1756,7 +1794,7 @@
cursor = 0;
}
- target.Add (subquery, false, false);
+ target.Add (subquery, LNS.BooleanClause.Occur.SHOULD);
}
return top_query;
@@ -2051,7 +2089,7 @@
return GetHitsForUris (uris, null);
}
- public ICollection GetHitsForUris (ICollection uris, string[] fields)
+ public ICollection GetHitsForUris (ICollection uris, FieldSelector fields)
{
Hashtable hits_by_uri = UriFu.NewHashtable ();
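
Most of the churn in this file is a mechanical port from the Lucene 1.9 BooleanQuery.Add (query, required, prohibited) signature to the 2.x BooleanClause.Occur enum. The mapping, sketched (LNS = Lucene.Net.Search, as aliased above):

// Lucene 1.9                    ->  Lucene 2.x
// q.Add (sub, false, false);    ->  q.Add (sub, LNS.BooleanClause.Occur.SHOULD);
// q.Add (sub, true,  false);    ->  q.Add (sub, LNS.BooleanClause.Occur.MUST);
// q.Add (sub, false, true);     ->  q.Add (sub, LNS.BooleanClause.Occur.MUST_NOT);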
Modified: branches/beagle-rdf/beagled/LuceneIndexingDriver.cs
==============================================================================
--- branches/beagle-rdf/beagled/LuceneIndexingDriver.cs (original)
+++ branches/beagle-rdf/beagled/LuceneIndexingDriver.cs Mon Mar 3 23:22:43 2008
@@ -215,19 +215,19 @@
term = new Term ("Uri", uri_str);
// For property changes, only secondary index is modified
- secondary_reader.Delete (term);
+ secondary_reader.DeleteDocuments (term);
// Now remove from everywhere else (if asked to remove or if asked to add, in which case
// we first remove and then add)
// So we also need to remove child documents
if (indexable.Type != IndexableType.PropertyChange) {
- num_delete = primary_reader.Delete (term);
+ num_delete = primary_reader.DeleteDocuments (term);
// When we delete an indexable, also delete any children.
// FIXME: Shouldn't we also delete any children of children, etc.?
term = new Term ("ParentUri", uri_str);
- num_delete += primary_reader.Delete (term);
- secondary_reader.Delete (term);
+ num_delete += primary_reader.DeleteDocuments (term);
+ secondary_reader.DeleteDocuments (term);
}
// If this is a strict removal (and not a deletion that
@@ -270,6 +270,10 @@
text_cache.BeginTransaction ();
IndexWriter primary_writer, secondary_writer;
+ // FIXME: Lock obtain time-out can happen here; if that happens,
+ // an exception will be thrown and this method will break in the middle
+ // leaving IndexWriters unclosed! Same for any Lucene.Net-index modification
+ // methods.
primary_writer = new IndexWriter (PrimaryStore, IndexingAnalyzer, false);
secondary_writer = null;
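
The reader-side change in this file is the Lucene 2.x rename of IndexReader.Delete, visible in the hunks above:

// Lucene 1.9: int num_delete = reader.Delete (term);
// Lucene 2.x: int num_delete = reader.DeleteDocuments (term);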
Modified: branches/beagle-rdf/beagled/LuceneQueryingDriver.cs
==============================================================================
--- branches/beagle-rdf/beagled/LuceneQueryingDriver.cs (original)
+++ branches/beagle-rdf/beagled/LuceneQueryingDriver.cs Mon Mar 3 23:22:43 2008
@@ -193,12 +193,12 @@
case QueryPartLogic.Prohibited:
if (primary_prohibited_part_query == null)
primary_prohibited_part_query = new LNS.BooleanQuery ();
- primary_prohibited_part_query.Add (primary_part_query, false, false);
+ primary_prohibited_part_query.Add (primary_part_query, LNS.BooleanClause.Occur.SHOULD);
if (secondary_part_query != null) {
if (secondary_prohibited_part_query == null)
secondary_prohibited_part_query = new LNS.BooleanQuery ();
- secondary_prohibited_part_query.Add (secondary_part_query, false, false);
+ secondary_prohibited_part_query.Add (secondary_part_query, LNS.BooleanClause.Occur.SHOULD);
}
if (part_hit_filter != null) {
@@ -408,7 +408,7 @@
uri_list.Add (new Uri (subject));
string field_name = PropertyToFieldName (pred_type, predicate);
- string[] fields = { "Uri", "Timestamp", field_name };
+ FieldSelector fields = new MapFieldSelector (new string[] { "Uri", "Timestamp", field_name });
ICollection hits = GetHitsForUris (uri_list, fields);
return hits;
@@ -554,9 +554,9 @@
if (secondary_searcher != null)
secondary_term_docs = secondary_searcher.Reader.TermDocs ();
- string[] fields = (field_name != null) ?
- new string[] { "Uri", "Timestamp", field_name } :
- null;
+ FieldSelector fields = null;
+ if (field_name != null)
+ fields = new MapFieldSelector (new string[] { "Uri", "Timestamp", field_name });
for (int match_index = primary_matches.GetNextTrueIndex (0);
match_index < primary_matches.Count;
@@ -594,7 +594,17 @@
found_matching_predicate = true;
}
- if (secondary_searcher != null) {
+ // Now get the matching predicate from the secondary index
+ if (secondary_searcher == null) {
+ doc = null;
+ } else {
+ Term term = new Term ("Uri", doc.Get ("Uri"));
+ secondary_term_docs.Seek (term);
+ if (secondary_term_docs.Next ())
+ doc = secondary_searcher.Doc (secondary_term_docs.Doc ());
+ }
+
+ if (doc != null) {
foreach (Field field in doc.Fields ()) {
if (! FieldIsPredicate (field, field_value))
continue;
@@ -617,7 +627,7 @@
hits.Add (hit);
} else {
doc = primary_searcher.Doc (match_index, fields);
- hits.Add (CreateHit (doc, secondary_searcher, secondary_term_docs, fields));
+ hits.Add (CreateHit (doc, secondary_reader, secondary_term_docs, fields));
}
}
@@ -906,8 +916,7 @@
// Only generate results if we got some matches
if (primary_matches != null && primary_matches.ContainsTrue ()) {
GenerateQueryResults (primary_reader,
- primary_searcher,
- secondary_searcher,
+ secondary_reader,
primary_matches,
result,
term_list,
@@ -964,7 +973,7 @@
LNS.BooleanQuery combined_query;
combined_query = new LNS.BooleanQuery ();
foreach (LNS.Query query in primary_queries)
- combined_query.Add (query, true, false);
+ combined_query.Add (query, LNS.BooleanClause.Occur.MUST);
LuceneBitArray matches;
matches = new LuceneBitArray (primary_searcher, combined_query);
@@ -1100,7 +1109,7 @@
foreach (Term term in term_list) {
double idf;
- idf = similarity.Ldf (reader.DocFreq (term), reader.MaxDoc ());
+ idf = similarity.Idf (reader.DocFreq (term), reader.MaxDoc ());
int hit_count;
hit_count = hits_by_id.Count;
@@ -1136,11 +1145,11 @@
//
// Two arrays we need for quickly creating lucene documents and check if they are valid
- static string[] fields_timestamp_uri = { "Timestamp", "Uri" };
+ static FieldSelector fields_timestamp_uri = new MapFieldSelector (new string[] {"Uri", "Timestamp"});
+ static internal FieldSelector fields_uri = new MapFieldSelector (new string[] {"Uri"});
private static void GenerateQueryResults (IndexReader primary_reader,
- LNS.IndexSearcher primary_searcher,
- LNS.IndexSearcher secondary_searcher,
+ IndexReader secondary_reader,
BetterBitArray primary_matches,
IQueryResult result,
ICollection query_term_list,
@@ -1178,8 +1187,7 @@
if (primary_matches.TrueCount > max_results)
final_list_of_hits = ScanRecentDocs (primary_reader,
- primary_searcher,
- secondary_searcher,
+ secondary_reader,
primary_matches,
hits_by_id,
max_results,
@@ -1188,8 +1196,7 @@
if (final_list_of_hits == null)
final_list_of_hits = FindRecentResults (primary_reader,
- primary_searcher,
- secondary_searcher,
+ secondary_reader,
primary_matches,
hits_by_id,
max_results,
@@ -1280,8 +1287,7 @@
// for all of them.
private static ArrayList ScanRecentDocs (IndexReader primary_reader,
- LNS.IndexSearcher primary_searcher,
- LNS.IndexSearcher secondary_searcher,
+ IndexReader secondary_reader,
BetterBitArray primary_matches,
Dictionary<int, Hit> hits_by_id,
int max_results,
@@ -1300,8 +1306,8 @@
Term term;
TermDocs secondary_term_docs = null;
- if (secondary_searcher != null)
- secondary_term_docs = secondary_searcher.Reader.TermDocs ();
+ if (secondary_reader != null)
+ secondary_term_docs = secondary_reader.TermDocs ();
do {
term = enumerator.Term ();
@@ -1317,13 +1323,13 @@
int doc_id = docs.Doc ();
if (primary_matches.Get (doc_id)) {
- Document doc = primary_searcher.Doc (doc_id);
+ Document doc = primary_reader.Document (doc_id);
// If we have a UriFilter, apply it.
if (uri_filter != null) {
Uri uri;
uri = GetUriFromDocument (doc);
if (uri_filter (uri)) {
- Hit hit = CreateHit (doc, secondary_searcher, secondary_term_docs);
+ Hit hit = CreateHit (doc, secondary_reader, secondary_term_docs);
hits_by_id [doc_id] = hit;
// Add the result, last modified first
results.Add (hit);
@@ -1362,8 +1368,7 @@
}
private static ArrayList FindRecentResults (IndexReader primary_reader,
- LNS.IndexSearcher primary_searcher,
- LNS.IndexSearcher secondary_searcher,
+ IndexReader secondary_reader,
BetterBitArray primary_matches,
Dictionary<int, Hit> hits_by_id,
int max_results,
@@ -1385,8 +1390,8 @@
else
all_docs = new ArrayList (primary_matches.TrueCount);
- if (secondary_searcher != null)
- term_docs = secondary_searcher.Reader.TermDocs ();
+ if (secondary_reader != null)
+ term_docs = secondary_reader.TermDocs ();
for (int match_index = primary_matches.Count; ; match_index --) {
// Walk across the matches backwards, since newer
@@ -1398,7 +1403,7 @@
count++;
- doc = primary_searcher.Doc (match_index, fields_timestamp_uri);
+ doc = primary_reader.Document (match_index, fields_timestamp_uri);
// Check the timestamp --- if we have already reached our
// limit, we might be able to reject it immediately.
@@ -1424,7 +1429,7 @@
// Get the actual hit now
// doc was created with only 2 fields, so first get the complete lucene document for primary document
- Hit hit = CreateHit (primary_searcher.Doc (match_index), secondary_searcher, term_docs);
+ Hit hit = CreateHit (primary_reader.Document (match_index), secondary_reader, term_docs);
hits_by_id [match_index] = hit;
// Add the document to the appropriate data structure.
@@ -1454,23 +1459,23 @@
}
private static Hit CreateHit ( Document primary_doc,
- LNS.IndexSearcher secondary_searcher,
+ IndexReader secondary_reader,
TermDocs term_docs)
{
return CreateHit ( primary_doc,
- secondary_searcher,
+ secondary_reader,
term_docs,
null);
}
private static Hit CreateHit ( Document primary_doc,
- LNS.IndexSearcher secondary_searcher,
+ IndexReader secondary_reader,
TermDocs term_docs,
- string[] fields)
+ FieldSelector fields)
{
Hit hit = DocumentToHit (primary_doc);
- if (secondary_searcher == null)
+ if (secondary_reader == null)
return hit;
// Get the stringified version of the URI
@@ -1482,8 +1487,8 @@
term_docs.Next ();
Document secondary_doc =
(fields == null) ?
- secondary_searcher.Doc (term_docs.Doc ()) :
- secondary_searcher.Doc (term_docs.Doc (), fields);
+ secondary_reader.Document (term_docs.Doc ()) :
+ secondary_reader.Document (term_docs.Doc (), fields);
// If we are using the secondary index, now we need to
// merge the properties from the secondary index
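
Throughout this file the secondary LNS.IndexSearcher is replaced by a bare IndexReader, and document loads go through IndexReader.Document with a FieldSelector so only the needed stored fields are read. A hedged sketch of the pattern (hypothetical helper; field names taken from the diff):

using Lucene.Net.Documents;
using Lucene.Net.Index;

class FieldSelectorSketch {
    static string GetUri (IndexReader reader, int doc_id)
    {
        // Only "Uri" and "Timestamp" are materialized; other stored
        // fields are skipped, which is the point of dropping the
        // searcher indirection above.
        FieldSelector fields = new MapFieldSelector (new string [] { "Uri", "Timestamp" });
        Document doc = reader.Document (doc_id, fields);
        return doc.Get ("Uri");
    }
}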
Modified: branches/beagle-rdf/beagled/Makefile.am
==============================================================================
--- branches/beagle-rdf/beagled/Makefile.am (original)
+++ branches/beagle-rdf/beagled/Makefile.am Mon Mar 3 23:22:43 2008
@@ -87,208 +87,9 @@
############################################################
-lucenedir = $(srcdir)/Lucene.Net
+include $(srcdir)/Lucene.Net/Makefile.include
-LUCENE_1_9_CSFILES = \
- $(lucenedir)/Analysis/Standard/CharStream.cs \
- $(lucenedir)/Analysis/Standard/FastCharStream.cs \
- $(lucenedir)/Analysis/Standard/ParseException.cs \
- $(lucenedir)/Analysis/Standard/StandardAnalyzer.cs \
- $(lucenedir)/Analysis/Standard/StandardFilter.cs \
- $(lucenedir)/Analysis/Standard/StandardTokenizer.cs \
- $(lucenedir)/Analysis/Standard/StandardTokenizerConstants.cs \
- $(lucenedir)/Analysis/Standard/StandardTokenizerTokenManager.cs \
- $(lucenedir)/Analysis/Standard/Token.cs \
- $(lucenedir)/Analysis/Standard/TokenMgrError.cs \
- $(lucenedir)/Analysis/Analyzer.cs \
- $(lucenedir)/Analysis/CharTokenizer.cs \
- $(lucenedir)/Analysis/ISOLatin1AccentFilter.cs \
- $(lucenedir)/Analysis/KeywordAnalyzer.cs \
- $(lucenedir)/Analysis/KeywordTokenizer.cs \
- $(lucenedir)/Analysis/LengthFilter.cs \
- $(lucenedir)/Analysis/LetterTokenizer.cs \
- $(lucenedir)/Analysis/LowerCaseFilter.cs \
- $(lucenedir)/Analysis/LowerCaseTokenizer.cs \
- $(lucenedir)/Analysis/PerFieldAnalyzerWrapper.cs \
- $(lucenedir)/Analysis/PorterStemFilter.cs \
- $(lucenedir)/Analysis/PorterStemmer.cs \
- $(lucenedir)/Analysis/SimpleAnalyzer.cs \
- $(lucenedir)/Analysis/StopAnalyzer.cs \
- $(lucenedir)/Analysis/StopFilter.cs \
- $(lucenedir)/Analysis/Token.cs \
- $(lucenedir)/Analysis/TokenFilter.cs \
- $(lucenedir)/Analysis/Tokenizer.cs \
- $(lucenedir)/Analysis/TokenStream.cs \
- $(lucenedir)/Analysis/WhitespaceAnalyzer.cs \
- $(lucenedir)/Analysis/WhitespaceTokenizer.cs \
- $(lucenedir)/Analysis/WordlistLoader.cs \
- $(lucenedir)/Document/DateField.cs \
- $(lucenedir)/Document/DateTools.cs \
- $(lucenedir)/Document/Document.cs \
- $(lucenedir)/Document/Field.cs \
- $(lucenedir)/Document/NumberTools.cs \
- $(lucenedir)/Index/CompoundFileReader.cs \
- $(lucenedir)/Index/CompoundFileWriter.cs \
- $(lucenedir)/Index/DocumentWriter.cs \
- $(lucenedir)/Index/FieldInfo.cs \
- $(lucenedir)/Index/FieldInfos.cs \
- $(lucenedir)/Index/FieldsReader.cs \
- $(lucenedir)/Index/FieldsWriter.cs \
- $(lucenedir)/Index/FilterIndexReader.cs \
- $(lucenedir)/Index/IndexFileNameFilter.cs \
- $(lucenedir)/Index/IndexFileNames.cs \
- $(lucenedir)/Index/IndexModifier.cs \
- $(lucenedir)/Index/IndexReader.cs \
- $(lucenedir)/Index/IndexWriter.cs \
- $(lucenedir)/Index/MultipleTermPositions.cs \
- $(lucenedir)/Index/MultiReader.cs \
- $(lucenedir)/Index/ParallelReader.cs \
- $(lucenedir)/Index/SegmentInfo.cs \
- $(lucenedir)/Index/SegmentInfos.cs \
- $(lucenedir)/Index/SegmentMergeInfo.cs \
- $(lucenedir)/Index/SegmentMergeQueue.cs \
- $(lucenedir)/Index/SegmentMerger.cs \
- $(lucenedir)/Index/SegmentReader.cs \
- $(lucenedir)/Index/SegmentTermDocs.cs \
- $(lucenedir)/Index/SegmentTermEnum.cs \
- $(lucenedir)/Index/SegmentTermPositions.cs \
- $(lucenedir)/Index/SegmentTermPositionVector.cs \
- $(lucenedir)/Index/SegmentTermVector.cs \
- $(lucenedir)/Index/Term.cs \
- $(lucenedir)/Index/TermBuffer.cs \
- $(lucenedir)/Index/TermDocs.cs \
- $(lucenedir)/Index/TermEnum.cs \
- $(lucenedir)/Index/TermFreqVector.cs \
- $(lucenedir)/Index/TermInfo.cs \
- $(lucenedir)/Index/TermInfosReader.cs \
- $(lucenedir)/Index/TermInfosWriter.cs \
- $(lucenedir)/Index/TermPositions.cs \
- $(lucenedir)/Index/TermPositionVector.cs \
- $(lucenedir)/Index/TermVectorOffsetInfo.cs \
- $(lucenedir)/Index/TermVectorsReader.cs \
- $(lucenedir)/Index/TermVectorsWriter.cs \
- $(lucenedir)/QueryParser/CharStream.cs \
- $(lucenedir)/QueryParser/FastCharStream.cs \
- $(lucenedir)/QueryParser/MultiFieldQueryParser.cs \
- $(lucenedir)/QueryParser/ParseException.cs \
- $(lucenedir)/QueryParser/QueryParser.cs \
- $(lucenedir)/QueryParser/QueryParserConstants.cs \
- $(lucenedir)/QueryParser/QueryParserTokenManager.cs \
- $(lucenedir)/QueryParser/Token.cs \
- $(lucenedir)/QueryParser/TokenMgrError.cs \
- $(lucenedir)/Search/Regex/RegexQuery.cs \
- $(lucenedir)/Search/Regex/RegexTermEnum.cs \
- $(lucenedir)/Search/Regex/SpanRegexQuery.cs \
- $(lucenedir)/Search/Spans/NearSpans.cs \
- $(lucenedir)/Search/Spans/SpanFirstQuery.cs \
- $(lucenedir)/Search/Spans/SpanNearQuery.cs \
- $(lucenedir)/Search/Spans/SpanNotQuery.cs \
- $(lucenedir)/Search/Spans/SpanOrQuery.cs \
- $(lucenedir)/Search/Spans/SpanQuery.cs \
- $(lucenedir)/Search/Spans/Spans.cs \
- $(lucenedir)/Search/Spans/SpanScorer.cs \
- $(lucenedir)/Search/Spans/SpanTermQuery.cs \
- $(lucenedir)/Search/Spans/SpanWeight.cs \
- $(lucenedir)/Search/BooleanClause.cs \
- $(lucenedir)/Search/BooleanQuery.cs \
- $(lucenedir)/Search/BooleanScorer.cs \
- $(lucenedir)/Search/BooleanScorer2.cs \
- $(lucenedir)/Search/CachingWrapperFilter.cs \
- $(lucenedir)/Search/ConjunctionScorer.cs \
- $(lucenedir)/Search/ConstantScoreQuery.cs \
- $(lucenedir)/Search/ConstantScoreRangeQuery.cs \
- $(lucenedir)/Search/DateFilter.cs \
- $(lucenedir)/Search/DefaultSimilarity.cs \
- $(lucenedir)/Search/DisjunctionMaxQuery.cs \
- $(lucenedir)/Search/DisjunctionMaxScorer.cs \
- $(lucenedir)/Search/DisjunctionSumScorer.cs \
- $(lucenedir)/Search/ExactPhraseScorer.cs \
- $(lucenedir)/Search/Explanation.cs \
- $(lucenedir)/Search/FieldCache.cs \
- $(lucenedir)/Search/FieldCacheImpl.cs \
- $(lucenedir)/Search/FieldDoc.cs \
- $(lucenedir)/Search/FieldDocSortedHitQueue.cs \
- $(lucenedir)/Search/FieldSortedHitQueue.cs \
- $(lucenedir)/Search/Filter.cs \
- $(lucenedir)/Search/FilteredQuery.cs \
- $(lucenedir)/Search/FilteredTermEnum.cs \
- $(lucenedir)/Search/FuzzyQuery.cs \
- $(lucenedir)/Search/FuzzyTermEnum.cs \
- $(lucenedir)/Search/Hit.cs \
- $(lucenedir)/Search/HitCollector.cs \
- $(lucenedir)/Search/HitIterator.cs \
- $(lucenedir)/Search/HitQueue.cs \
- $(lucenedir)/Search/Hits.cs \
- $(lucenedir)/Search/IndexSearcher.cs \
- $(lucenedir)/Search/MatchAllDocsQuery.cs \
- $(lucenedir)/Search/MultiPhraseQuery.cs \
- $(lucenedir)/Search/MultiSearcher.cs \
- $(lucenedir)/Search/MultiTermQuery.cs \
- $(lucenedir)/Search/NonMatchingScorer.cs \
- $(lucenedir)/Search/ParallelMultiSearcher.cs \
- $(lucenedir)/Search/PhrasePositions.cs \
- $(lucenedir)/Search/PhrasePrefixQuery.cs \
- $(lucenedir)/Search/PhraseQuery.cs \
- $(lucenedir)/Search/PhraseQueue.cs \
- $(lucenedir)/Search/PhraseScorer.cs \
- $(lucenedir)/Search/PrefixQuery.cs \
- $(lucenedir)/Search/Query.cs \
- $(lucenedir)/Search/QueryFilter.cs \
- $(lucenedir)/Search/QueryTermVector.cs \
- $(lucenedir)/Search/RangeFilter.cs \
- $(lucenedir)/Search/RangeQuery.cs \
- $(lucenedir)/Search/ReqExclScorer.cs \
- $(lucenedir)/Search/ReqOptSumScorer.cs \
- $(lucenedir)/Search/ScoreDoc.cs \
- $(lucenedir)/Search/ScoreDocComparator.cs \
- $(lucenedir)/Search/Scorer.cs \
- $(lucenedir)/Search/Searchable.cs \
- $(lucenedir)/Search/Searcher.cs \
- $(lucenedir)/Search/Similarity.cs \
- $(lucenedir)/Search/SimilarityDelegator.cs \
- $(lucenedir)/Search/SloppyPhraseScorer.cs \
- $(lucenedir)/Search/Sort.cs \
- $(lucenedir)/Search/SortComparator.cs \
- $(lucenedir)/Search/SortComparatorSource.cs \
- $(lucenedir)/Search/SortField.cs \
- $(lucenedir)/Search/TermQuery.cs \
- $(lucenedir)/Search/TermScorer.cs \
- $(lucenedir)/Search/TopDocs.cs \
- $(lucenedir)/Search/TopFieldDocs.cs \
- $(lucenedir)/Search/Weight.cs \
- $(lucenedir)/Search/WildcardQuery.cs \
- $(lucenedir)/Search/WildcardTermEnum.cs \
- $(lucenedir)/Store/BufferedIndexInput.cs \
- $(lucenedir)/Store/BufferedIndexOutput.cs \
- $(lucenedir)/Store/Directory.cs \
- $(lucenedir)/Store/FSDirectory.cs \
- $(lucenedir)/Store/IndexInput.cs \
- $(lucenedir)/Store/IndexOutput.cs \
- $(lucenedir)/Store/InputStream.cs \
- $(lucenedir)/Store/Lock.cs \
- $(lucenedir)/Store/MMapDirectory.cs \
- $(lucenedir)/Store/OutputStream.cs \
- $(lucenedir)/Store/RAMDirectory.cs \
- $(lucenedir)/Store/RAMFile.cs \
- $(lucenedir)/Store/RAMInputStream.cs \
- $(lucenedir)/Store/RAMOutputStream.cs \
- $(lucenedir)/Util/BitVector.cs \
- $(lucenedir)/Util/Constants.cs \
- $(lucenedir)/Util/Parameter.cs \
- $(lucenedir)/Util/PriorityQueue.cs \
- $(lucenedir)/Util/SmallFloat.cs \
- $(lucenedir)/Util/StringHelper.cs \
- $(lucenedir)/Util/ToStringUtils.cs \
- $(lucenedir)/LucenePackage.cs \
- $(lucenedir)/SharpZipLibAdapter.cs \
- $(lucenedir)/SupportClass.cs
-
-# Stuff we don't build because we don't use it and it
-# introduces additional library dependencies.
-IGNORED_LUCENE_CSFILES = \
- $(lucenedir)/Search/RemoteSearchable.cs
-
-LUCENE_CSFILES = $(LUCENE_1_9_CSFILES)
+LUCENE_CSFILES = $(LUCENE_2_1_CSFILES)
############################################################
Modified: branches/beagle-rdf/beagled/NoiseFilter.cs
==============================================================================
--- branches/beagle-rdf/beagled/NoiseFilter.cs (original)
+++ branches/beagle-rdf/beagled/NoiseFilter.cs Mon Mar 3 23:22:43 2008
@@ -38,7 +38,7 @@
// 1. Removes words which are potential noise like dhyhy8ju7q9
// 2. Splits email addresses into meaningful tokens
// 3. Splits hostnames into subparts
- class NoiseEmailHostFilter : TokenFilter {
+ public class NoiseEmailHostFilter : TokenFilter {
private bool tokenize_email_hostname;
@@ -131,13 +131,13 @@
// Someone might like to search for emails, hostnames and
// phone numbers (which fall under type NUM)
private static readonly string tokentype_email
- = LNSA.StandardTokenizerConstants.tokenImage [LNSA.StandardTokenizerConstants.EMAIL];
+ = LNSA.StandardTokenizerImpl.TOKEN_TYPES [LNSA.StandardTokenizerImpl.EMAIL];
private static readonly string tokentype_host
- = LNSA.StandardTokenizerConstants.tokenImage [LNSA.StandardTokenizerConstants.HOST];
+ = LNSA.StandardTokenizerImpl.TOKEN_TYPES [LNSA.StandardTokenizerImpl.HOST];
private static readonly string tokentype_number
- = LNSA.StandardTokenizerConstants.tokenImage [LNSA.StandardTokenizerConstants.NUM];
+ = LNSA.StandardTokenizerImpl.TOKEN_TYPES [LNSA.StandardTokenizerImpl.NUM];
private static readonly string tokentype_alphanum
- = LNSA.StandardTokenizerConstants.tokenImage [LNSA.StandardTokenizerConstants.ALPHANUM];
+ = LNSA.StandardTokenizerImpl.TOKEN_TYPES [LNSA.StandardTokenizerImpl.ALPHANUM];
private bool ProcessToken (ref Lucene.Net.Analysis.Token token)
{
@@ -166,10 +166,10 @@
if (begin == 0)
return ! IsNoise (text);
token = new Lucene.Net.Analysis.Token (
- token.TermText ().Remove (0, begin),
- token.StartOffset (),
+ text.Remove (0, begin),
+ begin,
token.EndOffset (),
- token.Type ());
+ type);
return true;
} else if (type == tokentype_email) {
if (tokenize_email_hostname)
@@ -184,27 +184,46 @@
return ! IsNoise (token.TermText ());
}
- private Queue parts = new Queue ();
- private Lucene.Net.Analysis.Token token;
+ // State for creating smaller tokens from larger email/hostname tokens
+ private string[] parts = null;
+ private int parts_index = -1;
+ private int last_end_offset = -1;
+ private string token_type = null;
public override Lucene.Net.Analysis.Token Next ()
{
- if (parts.Count != 0) {
- string part = (string) parts.Dequeue ();
- Lucene.Net.Analysis.Token part_token;
- // FIXME: Searching for google.com will not match www.google.com.
- // If we decide to allow google-style "abcd.1234" which means
- // "abcd 1234" as a consequtive phrase, then adjusting
- // the startOffset and endOffset would enable matching
- // google.com to www.google.com
- part_token = new Lucene.Net.Analysis.Token (part,
- token.StartOffset (),
- token.EndOffset (),
- token.Type ());
- part_token.SetPositionIncrement (0);
- return part_token;
+ if (parts != null) {
+ if (++parts_index < parts.Length) {
+ string part = parts [parts_index];
+ Lucene.Net.Analysis.Token part_token;
+ // FIXME: Searching for google.com will not match www.google.com.
+ // If we decide to allow google-style "abcd.1234" which means
+ // "abcd 1234" as a consequtive phrase, then adjusting
+ // the startOffset and endOffset would enable matching
+ // google.com to www.google.com
+ int start_offset = (parts_index == 0 && token_type == tokentype_email ?
+ 0 :
+ last_end_offset + 1); // assuming only one separator
+ int end_offset = start_offset + part.Length;
+ part_token = new Lucene.Net.Analysis.Token (part,
+ start_offset,
+ end_offset,
+ token_type);
+ part_token.SetPositionIncrement (0);
+ last_end_offset = (parts_index == 0 && token_type == tokentype_email ?
+ -1 :
+ end_offset); // assuming only one separator
+ return part_token;
+ } else {
+ // clear the array
+ parts = null;
+ parts_index = -1;
+ last_end_offset = -1;
+ token_type = null;
+ }
}
+ Token token;
while ( (token = token_stream.Next ()) != null) {
//Console.WriteLine ("Found token: [{0}]", token.TermText ());
if (ProcessToken (ref token))
@@ -213,41 +232,46 @@
return null;
}
- char[] replace_array = { '@', '.', '-', '_', '+' };
+ private static readonly char[] replace_array = { '@', '.', '-', '_', '+' };
+
private void ProcessEmailToken (Lucene.Net.Analysis.Token token)
{
+ token_type = tokentype_email;
+
string email = token.TermText ();
- string[] tmp = email.Split (replace_array);
- int l = tmp.Length;
+ parts = email.Split (replace_array);
+ if (parts.Length == 1) // safety check
+ return;
- // store username part as a large token
int index_at = email.IndexOf ('@');
- tmp [l-1] = email.Substring (0, index_at);
-
- foreach (string s in tmp)
- parts.Enqueue (s);
-
+ // store username part as a large token
+ // and also remove the final tld part
+ Array.Copy (parts, 0, parts, 1, parts.Length - 1);
+ parts [0] = email.Substring (0, index_at);
}
private void ProcessURLToken (Lucene.Net.Analysis.Token token)
{
+ token_type = tokentype_host;
+
string hostname = token.TermText ();
- string[] host_parts = hostname.Split ('.');
+ parts = hostname.Split ('.');
+
+ if (parts [0] != "www")
+ return;
// remove initial www
- int begin_index = (host_parts [0] == "www" ? 1 : 0);
+ Array.Copy (parts, 1, parts, 0, parts.Length - 1);
+ Array.Resize (ref parts, parts.Length - 1);
// FIXME: Remove final tld
// Any string of form "<alnum> '.')+<alnum>" has type HOST
// Removing last token might remove important words from non-host
// string of that form. To fix that, we need to match against the
// huge list of TLDs.
- for (int i = begin_index; i < host_parts.Length; ++i)
- parts.Enqueue (host_parts [i]);
-
}
}
-#if false
+#if Noisefilter
// To build: gmcs NoiseFilter.cs LuceneCommon.cs -r:../Util/Util.dll -r:../BeagleClient/Beagle.dll -r:BeagleDaemonLib.dll
public class AnalyzerTest {
public static void Main ()
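
The rewritten filter emits sub-tokens with real offsets instead of reusing the parent token's span. Worked by hand from the code above (a hedged illustration, not output of the tool):

// "john.doe@example.org" splits on { '@', '.', '-', '_', '+' } into
//   [ "john", "doe", "example", "org" ]
// The array is then shifted right by one, dropping the final TLD
// ("org"), and slot 0 is replaced by the whole username:
//   [ "john.doe", "john", "doe", "example" ]
// Each part is emitted as a Token with SetPositionIncrement (0), so
// every sub-token occupies the same position as the original email.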
Modified: branches/beagle-rdf/beagled/Snowball.Net/Lucene.Net/Analysis/Snowball/SnowballFilter.cs
==============================================================================
--- branches/beagle-rdf/beagled/Snowball.Net/Lucene.Net/Analysis/Snowball/SnowballFilter.cs (original)
+++ branches/beagle-rdf/beagled/Snowball.Net/Lucene.Net/Analysis/Snowball/SnowballFilter.cs Mon Mar 3 23:22:43 2008
@@ -60,7 +60,13 @@
throw new System.SystemException(e.ToString());
}
}
-
+
+ public SnowballFilter(TokenStream in_Renamed, SnowballProgram stemmer, System.Reflection.MethodInfo stemMethod) : base(in_Renamed)
+ {
+ this.stemmer = stemmer;
+ this.stemMethod = stemMethod;
+ }
+
/// <summary>Returns the next input Token, after being stemmed </summary>
public override Token Next()
{
@@ -81,5 +87,12 @@
newToken.SetPositionIncrement(token.GetPositionIncrement());
return newToken;
}
+
+ public override void Close()
+ {
+ // In case stemmer was shared
+ stemmer.SetCurrent(String.Empty);
+ base.Close();
+ }
}
-}
\ No newline at end of file
+}
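
The new constructor pairs with LuceneCommon.GetStemmer above: the analyzer resolves the stemmer class once via reflection and hands the shared SnowballProgram (plus its Stem MethodInfo) to each filter, while the new Close override resets the shared stemmer's state. A hedged sketch of that wiring (types and names taken from the diff; usage assumed):

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Snowball;
using SnowballProgram = SF.Snowball.SnowballProgram;

class SnowballWiring {
    static TokenStream AddStemming (TokenStream outstream)
    {
        // Resolve "EnglishStemmer" once; see LuceneCommon.GetStemmer.
        System.Type stem_class = System.Type.GetType ("SF.Snowball.Ext.EnglishStemmer", true);
        SnowballProgram stemmer = (SnowballProgram) System.Activator.CreateInstance (stem_class);
        System.Reflection.MethodInfo stem_method = stem_class.GetMethod ("Stem", new System.Type [0]);

        // Sharing one stemmer is only safe while indexing stays
        // single-threaded, as the comment in LuceneCommon notes.
        return new SnowballFilter (outstream, stemmer, stem_method);
    }
}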