beagle r4493 - branches/beagle-rdf/beagled
- From: dbera svn gnome org
- To: svn-commits-list gnome org
- Subject: beagle r4493 - branches/beagle-rdf/beagled
- Date: Sun, 17 Feb 2008 00:34:20 +0000 (GMT)
Author: dbera
Date: Sun Feb 17 00:34:20 2008
New Revision: 4493
URL: http://svn.gnome.org/viewvc/beagle?rev=4493&view=rev
Log:
Remove the cumbersome GetDocsWithProperty method. Instead, store a field containing the names of all the other properties, whitespace-separated, and use that field to query. Interestingly, this increased the query time; however, the earlier method only searched the PrimaryIndex, while this one searches both indexes (and is of course much cleaner and a lot less code).
Use a FieldSelector in LuceneBitArray; again, no improvement in query time. Still, it is the right thing to do.
Several other minor fixes.
Modified:
branches/beagle-rdf/beagled/DumpIndex.cs
branches/beagle-rdf/beagled/LuceneBitArray.cs
branches/beagle-rdf/beagled/LuceneCommon.cs
branches/beagle-rdf/beagled/LuceneQueryingDriver.cs
branches/beagle-rdf/beagled/NoiseFilter.cs
Modified: branches/beagle-rdf/beagled/DumpIndex.cs
==============================================================================
--- branches/beagle-rdf/beagled/DumpIndex.cs (original)
+++ branches/beagle-rdf/beagled/DumpIndex.cs Sun Feb 17 00:34:20 2008
@@ -205,7 +205,7 @@
int freq;
freq = term_enum.DocFreq ();
- Console.WriteLine ("{0} {1} {2}", index_name, term_enum.Term ().Text (), freq);
+ Console.WriteLine ("{0} '{1}' {2}", index_name, term_enum.Term ().Text (), freq);
// FIXME: spew these as a count
++distinct_term_count;
Modified: branches/beagle-rdf/beagled/LuceneBitArray.cs
==============================================================================
--- branches/beagle-rdf/beagled/LuceneBitArray.cs (original)
+++ branches/beagle-rdf/beagled/LuceneBitArray.cs Sun Feb 17 00:34:20 2008
@@ -198,6 +198,8 @@
////////////////////////////////////////////////////////////
+ static string[] fields_uri = { "Timestamp", "Uri" };
+
public void ProjectOnto (LuceneBitArray other)
{
int j = 0;
@@ -209,7 +211,7 @@
j = i+1;
Document doc;
- doc = searcher.Doc (i);
+ doc = searcher.Doc (i, fields_uri);
other.AddUri (doc.Get ("Uri"));
}
Modified: branches/beagle-rdf/beagled/LuceneCommon.cs
==============================================================================
--- branches/beagle-rdf/beagled/LuceneCommon.cs (original)
+++ branches/beagle-rdf/beagled/LuceneCommon.cs Sun Feb 17 00:34:20 2008
@@ -83,7 +83,8 @@
// 18: add IsPersistent to properties, and adjust coded values
// in AddPropertyToDocument() and GetPropertyFromDocument();
// changed subdate field format rules for better readability
- private const int MAJOR_VERSION = 18;
+ // 19: store a list of current properties in a field
+ private const int MAJOR_VERSION = 19;
private int minor_version = 0;
private string index_name;
@@ -524,6 +525,9 @@
}
} else if (fieldName == "PropertyKeyword")
return new LowerCaseFilter (new SingletonTokenStream (reader.ReadToEnd ()));
+ else if (fieldName == "Properties")
+ return new WhitespaceTokenizer (new StringReader (reader.ReadToEnd ()));
+
TokenStream outstream;
outstream = base.TokenStream (fieldName, reader);
@@ -856,6 +860,11 @@
AddPropertyToDocument (prop, target_doc);
}
+
+ // Now add a field containing a whitespace separated list of other fields in the document
+ AddFieldProperies (primary_doc);
+ if (secondary_doc != null)
+ AddFieldProperies (secondary_doc);
}
static private Document CreateSecondaryDocument (Uri uri, Uri parent_uri)
@@ -928,6 +937,7 @@
}
}
+ AddFieldProperies (new_doc);
return new_doc;
}
@@ -949,9 +959,38 @@
}
}
+ AddFieldProperies (doc);
return doc;
}
+ // Add a new field with whitespace separated names of the existing fields
+ static protected void AddFieldProperies (Document doc)
+ {
+ const string Separator = " ";
+
+ StringBuilder sb = new StringBuilder ();
+ bool seen_properties = false;
+
+ foreach (Field f in doc.Fields ()) {
+ if (f.Name () == "Properties") {
+ seen_properties = true;
+ continue;
+ }
+
+ sb.Append (f.Name ());
+ sb.Append (Separator);
+ }
+
+ if (sb.Length > 0)
+ sb.Length -= Separator.Length;
+
+ if (seen_properties)
+ doc.RemoveFields ("Properties");
+
+ Field field = new Field ("Properties", sb.ToString (), Field.Store.YES, Field.Index.TOKENIZED); // FIXME: Field.Store.No
+ doc.Add (field);
+ }
+
static protected Uri GetUriFromDocument (Document doc)
{
string uri;
@@ -1633,11 +1672,13 @@
else
field_name = PropertyToFieldName (part.Type, part.Key);
+ // Details of the conversion here depends on BeagleAnalyzer::TokenStream
if (part.Type == PropertyType.Text)
primary_query = StringToQuery (field_name, part.Value, term_list);
else {
Term term;
- if (field_name.StartsWith ("prop:k:" + Property.PrivateNamespace))
+ // FIXME: Handle date queries for other date fields
+ if (part.Type == PropertyType.Internal || field_name.StartsWith ("prop:k:" + Property.PrivateNamespace))
term = new Term (field_name, part.Value);
else
term = new Term (field_name, part.Value.ToLower ());
Modified: branches/beagle-rdf/beagled/LuceneQueryingDriver.cs
==============================================================================
--- branches/beagle-rdf/beagled/LuceneQueryingDriver.cs (original)
+++ branches/beagle-rdf/beagled/LuceneQueryingDriver.cs Sun Feb 17 00:34:20 2008
@@ -355,7 +355,15 @@
// Return uris for all documents with this property
if (subject == String.Empty && predicate != String.Empty && _object == String.Empty) {
- return GetDocsWithProperty (predicate, pred_type);
+ string field_name = PropertyToFieldName (pred_type, predicate);
+
+ QueryPart_Property part = new QueryPart_Property ();
+ part.Type = PropertyType.Internal;
+ part.Key = "Properties";
+ part.Value = field_name;
+ query.AddPart (part);
+
+ return DoLowLevelRDFQuery (query, field_name, null);
}
// Property query
@@ -425,120 +433,6 @@
throw new Exception ("Never reaches");
}
- // FIXME FIXME FIXME: Rewrite this horrible method by keeping a field containing
- // the names of all properties in that document ?
- // What about SecondaryDocument ? Which index to store this field in ?
- private ICollection GetDocsWithProperty (string propname, PropertyType prop_type)
- {
- // This is the hardest!
- // Most of the times either all docs will have the property or
- // neither will, but we also have to cover the rare cases.
- // Possible approach: Do a term_enum with this property name.
- // Keep a Set of all Docs (rather Uris) which contain that term
- // (pretty expensive - since most probably all documents will contain that
- // property).
- //
- // Another approach: Get all hits from the driver, scan them one by one
- // and return URIs for the hits which contain the property *shudder*
- //
-
- // FIXME: Uses PrimaryIndex only!
- // Create a bitarray and mark all docs with that property by using a termenum
-
- IndexReader primary_reader;
- primary_reader = LuceneCommon.GetReader (PrimaryStore);
-
- BetterBitArray all_docs = new BetterBitArray (primary_reader.MaxDoc ());
-
- TermDocs docs = primary_reader.TermDocs ();
- string field_name = PropertyToFieldName (prop_type, propname);
- Console.WriteLine (field_name);
- TermEnum enumerator = primary_reader.Terms (new Term (field_name, String.Empty));
- Term term;
- bool field_present = false;
-
- do {
- // Find all terms with given field
- term = enumerator.Term ();
-
- if (term.Field () != field_name)
- break;
-
- field_present = true;
-
- docs.Seek (enumerator);
-
- // Find all docs with that term
- while (docs.Next ())
- all_docs [docs.Doc ()] = true;
- } while (enumerator.Next ());
- Console.WriteLine (field_present);
-
- enumerator.Close ();
-
- // Maxdoc could be millions!
- ArrayList hits = new ArrayList (primary_reader.MaxDoc ());
-
- // If field_present is false, preempt
- if (! field_present) {
- docs.Close ();
- LuceneCommon.ReleaseReader (primary_reader);
-
- return hits;
- }
-
- IndexReader secondary_reader = null;
- LNS.IndexSearcher secondary_searcher = null;
-
- if (SecondaryStore != null) {
- secondary_reader = LuceneCommon.GetReader (SecondaryStore);
- if (secondary_reader.NumDocs () == 0) {
- ReleaseReader (secondary_reader);
- secondary_reader = null;
- }
- }
-
- if (secondary_reader != null)
- secondary_searcher = new LNS.IndexSearcher (secondary_reader);
-
- TermDocs secondary_term_docs = null;
- if (secondary_searcher != null)
- secondary_term_docs = secondary_searcher.Reader.TermDocs ();
-
- string[] fields = { "Uri", "Timestamp", field_name };
-
- // Go through all Uris now
- enumerator = primary_reader.Terms (new Term ("Uri", String.Empty));
- Document doc;
-
- do {
- // Find all terms with
- term = enumerator.Term ();
-
- if (term.Field () != "Uri")
- break;
-
- docs.Seek (enumerator);
- // Assume only one doc with an uri.
- // Go to the doc with this uri
- // If this doc's id is present in bit_array, return the uri
- if (docs.Next () && all_docs [docs.Doc ()]) {
- doc = primary_reader.Document (docs.Doc (), fields);
- Hit hit = CreateHit (doc, secondary_searcher, secondary_term_docs, fields);
- hits.Add (hit);
- }
-
- } while (enumerator.Next ());
-
- // Traverse all docs in all_docs
-
- enumerator.Close ();
- docs.Close ();
- LuceneCommon.ReleaseReader (primary_reader);
-
- return hits;
- }
-
private ICollection DoLowLevelRDFQuery (Query query,
string field_name,
string field_value)
Modified: branches/beagle-rdf/beagled/NoiseFilter.cs
==============================================================================
--- branches/beagle-rdf/beagled/NoiseFilter.cs (original)
+++ branches/beagle-rdf/beagled/NoiseFilter.cs Sun Feb 17 00:34:20 2008
@@ -248,7 +248,13 @@
}
#if false
+ // To build: gmcs NoiseFilter.cs LuceneCommon.cs -r:../Util/Util.dll -r:../BeagleClient/Beagle.dll -r:BeagleDaemonLib.dll
public class AnalyzerTest {
+ public static void Main ()
+ {
+ Analyze (Console.In);
+ }
+
public static void Analyze (TextReader reader)
{
Lucene.Net.Analysis.Token lastToken = null;
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]