beagle r4572 - in branches/beagle-lucene2_1/beagled/Lucene.Net: Analysis/Standard Index upstream-changes
- From: dbera svn gnome org
- To: svn-commits-list gnome org
- Subject: beagle r4572 - in branches/beagle-lucene2_1/beagled/Lucene.Net: Analysis/Standard Index upstream-changes
- Date: Sun, 2 Mar 2008 14:41:58 +0000 (GMT)
Author: dbera
Date: Sun Mar 2 14:41:58 2008
New Revision: 4572
URL: http://svn.gnome.org/viewvc/beagle?rev=4572&view=rev
Log:
Reuse StandardTokenizerImpl instances, since they are expensive to create. Also raise the number of docs in an indexing batch to 50.
Added:
branches/beagle-lucene2_1/beagled/Lucene.Net/upstream-changes/16_reuse-std-analyzer.patch
Modified:
branches/beagle-lucene2_1/beagled/Lucene.Net/Analysis/Standard/StandardTokenizer.cs
branches/beagle-lucene2_1/beagled/Lucene.Net/Analysis/Standard/StandardTokenizerImpl.cs
branches/beagle-lucene2_1/beagled/Lucene.Net/Index/IndexWriter.cs
branches/beagle-lucene2_1/beagled/Lucene.Net/upstream-changes/14_default-max-buffered-docs.patch
Modified: branches/beagle-lucene2_1/beagled/Lucene.Net/Analysis/Standard/StandardTokenizer.cs
==============================================================================
--- branches/beagle-lucene2_1/beagled/Lucene.Net/Analysis/Standard/StandardTokenizer.cs (original)
+++ branches/beagle-lucene2_1/beagled/Lucene.Net/Analysis/Standard/StandardTokenizer.cs Sun Mar 2 14:41:58 2008
@@ -55,7 +55,7 @@
/// <summary>Constructs a tokenizer for this Reader. </summary>
public StandardTokenizer(System.IO.TextReader reader) : base(reader)
{
- this.scanner = new StandardTokenizerImpl(reader);
+ this.scanner = StandardTokenizerImpl.GetStandardTokenizerImpl(reader);
}
/// <summary>Returns the next token in the stream, or null at EOS.
Modified: branches/beagle-lucene2_1/beagled/Lucene.Net/Analysis/Standard/StandardTokenizerImpl.cs
==============================================================================
--- branches/beagle-lucene2_1/beagled/Lucene.Net/Analysis/Standard/StandardTokenizerImpl.cs (original)
+++ branches/beagle-lucene2_1/beagled/Lucene.Net/Analysis/Standard/StandardTokenizerImpl.cs Sun Mar 2 14:41:58 2008
@@ -366,7 +366,6 @@
}
/// <summary>the input device </summary>
- //UPGRADE_ISSUE: Class hierarchy differences between 'java.io.Reader' and 'System.IO.StreamReader' may cause compilation errors. "ms-help://MS.VSCC.v80/dv_commoner/local/redirect.htm?index='!DefaultContextWindowIndex'&keyword='jlca1186'"
private System.IO.TextReader zzReader;
/// <summary>the current state of the DFA </summary>
@@ -454,7 +453,6 @@
/// </summary>
/// <param name="in"> the java.io.Reader to read input from.
/// </param>
- //UPGRADE_ISSUE: Class hierarchy differences between 'java.io.Reader' and 'System.IO.StreamReader' may cause compilation errors. "ms-help://MS.VSCC.v80/dv_commoner/local/redirect.htm?index='!DefaultContextWindowIndex'&keyword='jlca1186'"
internal StandardTokenizerImpl(System.IO.TextReader in_Renamed)
{
this.zzReader = in_Renamed;
@@ -470,6 +468,22 @@
{
}
+ internal static StandardTokenizerImpl GetStandardTokenizerImpl(System.IO.TextReader reader)
+ {
+ if (impl==null)
+ {
+ impl = new StandardTokenizerImpl(reader);
+ }
+ else
+ {
+ impl.yyreset(reader);
+ }
+
+ return impl;
+ }
+
+ private static StandardTokenizerImpl impl = null;
+
/// <summary> Unpacks the compressed character translation table.
///
/// </summary>
@@ -528,7 +542,6 @@
}
/* finally: fill the buffer with new input */
- //UPGRADE_TODO: Method 'java.io.Reader.read' was converted to 'System.IO.StreamReader.Read' which has a different behavior. "ms-help://MS.VSCC.v80/dv_commoner/local/redirect.htm?index='!DefaultContextWindowIndex'&keyword='jlca1073_javaioReaderread_char[]_int_int'"
int lengthToRead = zzBuffer.Length - zzEndRead;
int numRead = zzReader.Read(zzBuffer, zzEndRead, lengthToRead);
@@ -557,7 +570,6 @@
zzReader.Close();
}
-
/// <summary> Resets the scanner to read from a new input stream.
/// Does not close the old reader.
///
@@ -568,8 +580,7 @@
/// </summary>
/// <param name="reader"> the new input stream
/// </param>
- //UPGRADE_ISSUE: Class hierarchy differences between 'java.io.Reader' and 'System.IO.StreamReader' may cause compilation errors. "ms-help://MS.VSCC.v80/dv_commoner/local/redirect.htm?index='!DefaultContextWindowIndex'&keyword='jlca1186'"
- public void yyreset(System.IO.StreamReader reader)
+ public void yyreset(System.IO.TextReader reader)
{
zzReader = reader;
zzAtBOL = true;
Modified: branches/beagle-lucene2_1/beagled/Lucene.Net/Index/IndexWriter.cs
==============================================================================
--- branches/beagle-lucene2_1/beagled/Lucene.Net/Index/IndexWriter.cs (original)
+++ branches/beagle-lucene2_1/beagled/Lucene.Net/Index/IndexWriter.cs Sun Mar 2 14:41:58 2008
@@ -95,7 +95,7 @@
public const int DEFAULT_MERGE_FACTOR = 10;
/// <summary> Default value is 10. Change using { link #SetMaxBufferedDocs(int)}.</summary>
- public const int DEFAULT_MAX_BUFFERED_DOCS = 30;
+ public const int DEFAULT_MAX_BUFFERED_DOCS = 50;
/// <summary> Default value is 1000. Change using
/// { link #SetMaxBufferedDeleteTerms(int)}.
Modified: branches/beagle-lucene2_1/beagled/Lucene.Net/upstream-changes/14_default-max-buffered-docs.patch
==============================================================================
--- branches/beagle-lucene2_1/beagled/Lucene.Net/upstream-changes/14_default-max-buffered-docs.patch (original)
+++ branches/beagle-lucene2_1/beagled/Lucene.Net/upstream-changes/14_default-max-buffered-docs.patch Sun Mar 2 14:41:58 2008
@@ -11,7 +11,7 @@
/// <summary> Default value is 10. Change using { link #SetMaxBufferedDocs(int)}.</summary>
- public const int DEFAULT_MAX_BUFFERED_DOCS = 10;
-+ public const int DEFAULT_MAX_BUFFERED_DOCS = 30;
++ public const int DEFAULT_MAX_BUFFERED_DOCS = 50;
/// <summary> Default value is 1000. Change using
/// { link #SetMaxBufferedDeleteTerms(int)}.
Added: branches/beagle-lucene2_1/beagled/Lucene.Net/upstream-changes/16_reuse-std-analyzer.patch
==============================================================================
--- (empty file)
+++ branches/beagle-lucene2_1/beagled/Lucene.Net/upstream-changes/16_reuse-std-analyzer.patch Sun Mar 2 14:41:58 2008
@@ -0,0 +1,86 @@
+Reuse StandardTokenizerImpl. There is one StandardAnalyzer created per field of every document; also, each StandardTokenizerImpl creates a 16K char array. It is pretty expensive in general to create them.
+
+From: D Bera <dbera web gmail com>
+
+Index: Analysis/Standard/StandardTokenizer.cs
+===================================================================
+--- Analysis/Standard/StandardTokenizer.cs (revision 4526)
++++ Analysis/Standard/StandardTokenizer.cs (working copy)
+@@ -55,7 +55,7 @@
+ /// <summary>Constructs a tokenizer for this Reader. </summary>
+ public StandardTokenizer(System.IO.TextReader reader) : base(reader)
+ {
+- this.scanner = new StandardTokenizerImpl(reader);
++ this.scanner = StandardTokenizerImpl.GetStandardTokenizerImpl(reader);
+ }
+
+ /// <summary>Returns the next token in the stream, or null at EOS.
+Index: Analysis/Standard/StandardTokenizerImpl.cs
+===================================================================
+--- Analysis/Standard/StandardTokenizerImpl.cs (revision 4526)
++++ Analysis/Standard/StandardTokenizerImpl.cs (working copy)
+@@ -366,7 +366,6 @@
+ }
+
+ /// <summary>the input device </summary>
+- //UPGRADE_ISSUE: Class hierarchy differences between 'java.io.Reader' and 'System.IO.StreamReader' may cause compilation errors. "ms-help://MS.VSCC.v80/dv_commoner/local/redirect.htm?index='!DefaultContextWindowIndex'&keyword='jlca1186'"
+ private System.IO.TextReader zzReader;
+
+ /// <summary>the current state of the DFA </summary>
+@@ -454,7 +453,6 @@
+ /// </summary>
+ /// <param name="in"> the java.io.Reader to read input from.
+ /// </param>
+- //UPGRADE_ISSUE: Class hierarchy differences between 'java.io.Reader' and 'System.IO.StreamReader' may cause compilation errors. "ms-help://MS.VSCC.v80/dv_commoner/local/redirect.htm?index='!DefaultContextWindowIndex'&keyword='jlca1186'"
+ internal StandardTokenizerImpl(System.IO.TextReader in_Renamed)
+ {
+ this.zzReader = in_Renamed;
+@@ -470,6 +468,22 @@
+ {
+ }
+
++ internal static StandardTokenizerImpl GetStandardTokenizerImpl(System.IO.TextReader reader)
++ {
++ if (impl==null)
++ {
++ impl = new StandardTokenizerImpl(reader);
++ }
++ else
++ {
++ impl.yyreset(reader);
++ }
++
++ return impl;
++ }
++
++ private static StandardTokenizerImpl impl = null;
++
+ /// <summary> Unpacks the compressed character translation table.
+ ///
+ /// </summary>
+@@ -528,7 +542,6 @@
+ }
+
+ /* finally: fill the buffer with new input */
+- //UPGRADE_TODO: Method 'java.io.Reader.read' was converted to 'System.IO.StreamReader.Read' which has a different behavior. "ms-help://MS.VSCC.v80/dv_commoner/local/redirect.htm?index='!DefaultContextWindowIndex'&keyword='jlca1073_javaioReaderread_char[]_int_int'"
+ int lengthToRead = zzBuffer.Length - zzEndRead;
+ int numRead = zzReader.Read(zzBuffer, zzEndRead, lengthToRead);
+
+@@ -557,7 +570,6 @@
+ zzReader.Close();
+ }
+
+-
+ /// <summary> Resets the scanner to read from a new input stream.
+ /// Does not close the old reader.
+ ///
+@@ -568,8 +580,7 @@
+ /// </summary>
+ /// <param name="reader"> the new input stream
+ /// </param>
+- //UPGRADE_ISSUE: Class hierarchy differences between 'java.io.Reader' and 'System.IO.StreamReader' may cause compilation errors. "ms-help://MS.VSCC.v80/dv_commoner/local/redirect.htm?index='!DefaultContextWindowIndex'&keyword='jlca1186'"
+- public void yyreset(System.IO.StreamReader reader)
++ public void yyreset(System.IO.TextReader reader)
+ {
+ zzReader = reader;
+ zzAtBOL = true;
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]