[geary/mjog/334-libstemmer: 1/2] ImapDb.SearchQuery: Use libstemmer for stemming




commit 8f62b2be9a9d53d6f5c03c40a76168d41d76beee
Author: Michael Gratton <mike vee net>
Date:   Sun Sep 13 16:27:51 2020 +1000

    ImapDb.SearchQuery: Use libstemmer for stemming
    
    Instead of shipping our own forked stemmer via SQLite, add a dependency
    on libstemmer and use that instead.

 bindings/vapi/libstemmer.vapi                |  38 ++++++++++
 meson.build                                  |   8 +++
 src/engine/imap-db/imap-db-search-query.vala | 102 ++++++++++++++++-----------
 src/engine/meson.build                       |   1 +
 src/meson.build                              |   1 +
 5 files changed, 109 insertions(+), 41 deletions(-)
---
diff --git a/bindings/vapi/libstemmer.vapi b/bindings/vapi/libstemmer.vapi
new file mode 100644
index 000000000..1d2e01c5c
--- /dev/null
+++ b/bindings/vapi/libstemmer.vapi
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 2010-2013 Giulio Paci <giuliopaci gmail com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+[CCode (cheader_filename = "libstemmer.h", lower_case_cprefix = "sb_", cprefix="sb_")]
+namespace SnowBall {
+       [Compact]
+       [CCode (cname = "struct sb_stemmer", free_function="sb_stemmer_delete")]
+       public class Stemmer {
+               public unowned string stem(string word, int size);
+               public int length();
+               public Stemmer(string language, string encoding = "UTF_8");
+               [CCode (array_length = false, array_null_terminated = true)]
+               public static unowned string[] list();
+       }
+}
\ No newline at end of file
diff --git a/meson.build b/meson.build
index 5f33411bf..7672c7d33 100644
--- a/meson.build
+++ b/meson.build
@@ -90,6 +90,7 @@ libmath = cc.find_library('m')
 libpeas = dependency('libpeas-1.0', version: '>= 1.24.0')
 libpeas_gtk = dependency('libpeas-gtk-1.0', version: '>= 1.24.0')
 libsecret = dependency('libsecret-1', version: '>= 0.11')
+libstemmer_dep = cc.find_library('libstemmer')
 libsoup = dependency('libsoup-2.4', version: '>= 2.48')
 libunwind_dep = dependency(
   'libunwind', version: '>= 1.1', required: not get_option('libunwind_optional')
@@ -119,6 +120,13 @@ if libunwind_dep.found()
    )
 endif
 
+libstemmer = declare_dependency(
+  dependencies: [
+    valac.find_library('libstemmer', dirs: [vapi_dir]),
+    libstemmer_dep,
+  ],
+)
+
 # Optional dependencies
 appstream_util = find_program('appstream-util', required: false)
 desktop_file_validate = find_program('desktop-file-validate', required: false)
diff --git a/src/engine/imap-db/imap-db-search-query.vala b/src/engine/imap-db/imap-db-search-query.vala
index 85f556fa8..968ac6eac 100644
--- a/src/engine/imap-db/imap-db-search-query.vala
+++ b/src/engine/imap-db/imap-db-search-query.vala
@@ -318,6 +318,9 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
     // A list of all search terms, regardless of search op field name
     private Gee.ArrayList<Term> all = new Gee.ArrayList<Term>();
 
+    private SnowBall.Stemmer stemmer;
+
+
     public async SearchQuery(Geary.Account owner,
                              ImapDB.Account local,
                              string query,
@@ -325,6 +328,7 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
                              GLib.Cancellable? cancellable) {
         base(owner, query, strategy);
         this.account = local;
+        this.stemmer = new SnowBall.Stemmer(find_appropriate_search_stemmer());
 
         switch (strategy) {
             case Strategy.EXACT:
@@ -611,20 +615,19 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
     }
 
     /**
-     * This method is used to convert an unquoted user-entered search terms into a stemmed search
-     * term.
+     * Converts unquoted search terms into a stemmed search term.
      *
-     * Prior experience with the Unicode Snowball stemmer indicates it's too aggressive for our
-     * tastes when coupled with prefix-matching of all unquoted terms (see
-     * https://bugzilla.gnome.org/show_bug.cgi?id=713179)   This method is part of a larger strategy
-     * designed to dampen that aggressiveness without losing the benefits of stemming entirely.
+     * Prior experience with the Snowball stemmer indicates it is too
+     * aggressive for our tastes when coupled with prefix-matching of
+     * all unquoted terms (see
+     * https://bugzilla.gnome.org/show_bug.cgi?id=713179).
      *
-     * Database upgrade 23 removes the old Snowball-stemmed FTS table and replaces it with one
-     * with no stemming (using only SQLite's "simple" tokenizer).  It also creates a "magic" SQLite
-     * table called TokenizerTable which allows for uniform queries to the Snowball stemmer, which
-     * is still installed in Geary.  Thus, we are now in the position to search for the original
-     * term and its stemmed variant, then do post-search processing to strip results which are
-     * too "greedy" due to prefix-matching the stemmed variant.
+     * This method is part of a larger strategy designed to dampen
+     * that aggressiveness without losing the benefits of stemming
+     * entirely: The database's FTS table uses no stemming, but
+     * libstemmer is used to generate stemmed search terms.
+     * Post-search processing is then used to strip results which are too
+     * "greedy" due to prefix-matching the stemmed variant.
      *
      * Some heuristics are in place simply to determine if stemming should occur:
      *
@@ -647,36 +650,9 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
         if (term_length < this.min_term_length_for_stemming)
             return null;
 
-        string? stemmed = null;
-        try {
-            yield this.account.db.exec_transaction_async(RO,
-                (cx, cancellable) => {
-                    Db.Statement stmt = cx.prepare("""
-                        SELECT token
-                        FROM TokenizerTable
-                        WHERE input=?
-                    """);
-                    stmt.bind_string(0, term);
-
-                    // get stemmed string; if no result, fall through
-                    Db.Result result = stmt.exec(cancellable);
-                    if (!result.finished) {
-                        stemmed = result.string_at(0);
-                    } else {
-                        debug("No stemmed term returned for \"%s\"", term);
-                    }
-                    return COMMIT;
-                }, cancellable
-            );
-        } catch (Error err) {
-            debug("Unable to query tokenizer table for stemmed term for \"%s\": %s", term, err.message);
-
-            // fall-through
-        }
-
+        string? stemmed = this.stemmer.stem(term, term.length);
         if (String.is_empty(stemmed)) {
             debug("Empty stemmed term returned for \"%s\"", term);
-
             return null;
         }
 
@@ -693,8 +669,52 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
         }
 
         debug("Search processing: term -> stem is \"%s\" -> \"%s\"", term, stemmed);
-
         return stemmed;
     }
 
+    private string find_appropriate_search_stemmer() {
+        // Unfortunately, the stemmer library only accepts the full language
+        // name for the stemming algorithm.  This translates between the user's
+        // preferred language ISO 639-1 code and our available stemmers.
+        // FIXME: this list is hardcoded and may not match the algorithms
+        // provided by the installed libstemmer.  We should check it against
+        // SnowBall.Stemmer.list() instead of hardcoding it here.
+        foreach (string l in Intl.get_language_names()) {
+            switch (l) {
+                case "ar": return "arabic";
+                case "eu": return "basque";
+                case "ca": return "catalan";
+                case "da": return "danish";
+                case "nl": return "dutch";
+                case "en": return "english";
+                case "fi": return "finnish";
+                case "fr": return "french";
+                case "de": return "german";
+                case "el": return "greek";
+                case "hi": return "hindi";
+                case "hu": return "hungarian";
+                case "id": return "indonesian";
+                case "ga": return "irish";
+                case "it": return "italian";
+                case "lt": return "lithuanian";
+                case "ne": return "nepali";
+                case "no": return "norwegian";
+                case "pt": return "portuguese";
+                case "ro": return "romanian";
+                case "ru": return "russian";
+                case "sr": return "serbian";
+                case "es": return "spanish";
+                case "sv": return "swedish";
+                case "ta": return "tamil";
+                case "tr": return "turkish";
+            }
+        }
+
+        // Default to English because it seems to be on average the language
+        // most likely to be present in emails, regardless of the user's
+        // language setting.  This is not an exact science, and search results
+        // should be ok either way in most cases.
+        return "english";
+    }
+
 }
diff --git a/src/engine/meson.build b/src/engine/meson.build
index 0efd773ea..226007d52 100644
--- a/src/engine/meson.build
+++ b/src/engine/meson.build
@@ -323,6 +323,7 @@ engine_dependencies = [
   glib,
   gmime,
   libmath,
+  libstemmer,
   libxml,
   posix,
   sqlite
diff --git a/src/meson.build b/src/meson.build
index 87760e741..f8e2fbeaf 100644
--- a/src/meson.build
+++ b/src/meson.build
@@ -150,6 +150,7 @@ foreach dep : valadoc_dependencies
   valadoc_dep_args += '--pkg'
   valadoc_dep_args += dep.name()
 endforeach
+valadoc_dep_args += [ '--pkg', 'libstemmer' ]
 valadoc_dep_args += [ '--pkg', 'posix' ]
 
 valadoc_vapidir_args = []


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]