[geary/mjog/334-libstemmer: 1/2] ImapDb.SearchQuery: Use libstemmer for stemming
- From: Michael Gratton <mjog src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [geary/mjog/334-libstemmer: 1/2] ImapDb.SearchQuery: Use libstemmer for stemming
- Date: Sun, 13 Sep 2020 14:47:33 +0000 (UTC)
commit 8f62b2be9a9d53d6f5c03c40a76168d41d76beee
Author: Michael Gratton <mike vee net>
Date: Sun Sep 13 16:27:51 2020 +1000
ImapDb.SearchQuery: Use libstemmer for stemming
Instead of shipping our own forked stemmer via SQLite, add a dependency
on libstemmer and use that instead.
bindings/vapi/libstemmer.vapi | 38 ++++++++++
meson.build | 8 +++
src/engine/imap-db/imap-db-search-query.vala | 102 ++++++++++++++++-----------
src/engine/meson.build | 1 +
src/meson.build | 1 +
5 files changed, 109 insertions(+), 41 deletions(-)
---
diff --git a/bindings/vapi/libstemmer.vapi b/bindings/vapi/libstemmer.vapi
new file mode 100644
index 000000000..1d2e01c5c
--- /dev/null
+++ b/bindings/vapi/libstemmer.vapi
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 2010-2013 Giulio Paci <giuliopaci gmail com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+[CCode (cheader_filename = "libstemmer.h", lower_case_cprefix = "sb_", cprefix="sb_")]
+namespace SnowBall {
+ [Compact]
+ [CCode (cname = "struct sb_stemmer", free_function="sb_stemmer_delete")]
+ public class Stemmer {
+ public unowned string stem(string word, int size);
+ public int length();
+ public Stemmer(string language, string encoding = "UTF_8");
+ [CCode (array_length = false, array_null_terminated = true)]
+ public static unowned string[] list();
+ }
+}
\ No newline at end of file
diff --git a/meson.build b/meson.build
index 5f33411bf..7672c7d33 100644
--- a/meson.build
+++ b/meson.build
@@ -90,6 +90,7 @@ libmath = cc.find_library('m')
libpeas = dependency('libpeas-1.0', version: '>= 1.24.0')
libpeas_gtk = dependency('libpeas-gtk-1.0', version: '>= 1.24.0')
libsecret = dependency('libsecret-1', version: '>= 0.11')
+libstemmer_dep = cc.find_library('libstemmer')
libsoup = dependency('libsoup-2.4', version: '>= 2.48')
libunwind_dep = dependency(
'libunwind', version: '>= 1.1', required: not get_option('libunwind_optional')
@@ -119,6 +120,13 @@ if libunwind_dep.found()
)
endif
+libstemmer = declare_dependency(
+ dependencies: [
+ valac.find_library('libstemmer', dirs: [vapi_dir]),
+ libstemmer_dep,
+ ],
+)
+
# Optional dependencies
appstream_util = find_program('appstream-util', required: false)
desktop_file_validate = find_program('desktop-file-validate', required: false)
diff --git a/src/engine/imap-db/imap-db-search-query.vala b/src/engine/imap-db/imap-db-search-query.vala
index 85f556fa8..968ac6eac 100644
--- a/src/engine/imap-db/imap-db-search-query.vala
+++ b/src/engine/imap-db/imap-db-search-query.vala
@@ -318,6 +318,9 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
// A list of all search terms, regardless of search op field name
private Gee.ArrayList<Term> all = new Gee.ArrayList<Term>();
+ private SnowBall.Stemmer stemmer;
+
+
public async SearchQuery(Geary.Account owner,
ImapDB.Account local,
string query,
@@ -325,6 +328,7 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
GLib.Cancellable? cancellable) {
base(owner, query, strategy);
this.account = local;
+ this.stemmer = new SnowBall.Stemmer(find_appropriate_search_stemmer());
switch (strategy) {
case Strategy.EXACT:
@@ -611,20 +615,19 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
}
/**
- * This method is used to convert an unquoted user-entered search terms into a stemmed search
- * term.
+ * Converts unquoted search terms into a stemmed search term.
*
- * Prior experience with the Unicode Snowball stemmer indicates it's too aggressive for our
- * tastes when coupled with prefix-matching of all unquoted terms (see
- * https://bugzilla.gnome.org/show_bug.cgi?id=713179) This method is part of a larger strategy
- * designed to dampen that aggressiveness without losing the benefits of stemming entirely.
+ * Prior experience with the Snowball stemmer indicates it is too
+ * aggressive for our tastes when coupled with prefix-matching of
+ * all unquoted terms (see
+ * https://bugzilla.gnome.org/show_bug.cgi?id=713179).
*
- * Database upgrade 23 removes the old Snowball-stemmed FTS table and replaces it with one
- * with no stemming (using only SQLite's "simple" tokenizer). It also creates a "magic" SQLite
- * table called TokenizerTable which allows for uniform queries to the Snowball stemmer, which
- * is still installed in Geary. Thus, we are now in the position to search for the original
- * term and its stemmed variant, then do post-search processing to strip results which are
- * too "greedy" due to prefix-matching the stemmed variant.
+ * This method is part of a larger strategy designed to dampen
+ * that aggressiveness without losing the benefits of stemming
+ * entirely: The database's FTS table uses no stemming, but
+ * libstemmer is used to generate stemmed search terms.
+ * Post-search processing is then to strip results which are too
+ * "greedy" due to prefix-matching the stemmed variant.
*
* Some heuristics are in place simply to determine if stemming should occur:
*
@@ -647,36 +650,9 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
if (term_length < this.min_term_length_for_stemming)
return null;
- string? stemmed = null;
- try {
- yield this.account.db.exec_transaction_async(RO,
- (cx, cancellable) => {
- Db.Statement stmt = cx.prepare("""
- SELECT token
- FROM TokenizerTable
- WHERE input=?
- """);
- stmt.bind_string(0, term);
-
- // get stemmed string; if no result, fall through
- Db.Result result = stmt.exec(cancellable);
- if (!result.finished) {
- stemmed = result.string_at(0);
- } else {
- debug("No stemmed term returned for \"%s\"", term);
- }
- return COMMIT;
- }, cancellable
- );
- } catch (Error err) {
- debug("Unable to query tokenizer table for stemmed term for \"%s\": %s", term, err.message);
-
- // fall-through
- }
-
+ string? stemmed = this.stemmer.stem(term, term.length);
if (String.is_empty(stemmed)) {
debug("Empty stemmed term returned for \"%s\"", term);
-
return null;
}
@@ -693,8 +669,52 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
}
debug("Search processing: term -> stem is \"%s\" -> \"%s\"", term, stemmed);
-
return stemmed;
}
+ private string find_appropriate_search_stemmer() {
+ // Unfortunately, the stemmer library only accepts the full language
+ // name for the stemming algorithm. This translates between the user's
+ // preferred language ISO 639-1 code and our available stemmers.
+ // FIXME: the available list here is determined by what's included in
+ // src/sqlite3-unicodesn/CMakeLists.txt. We should pass that list in
+ // instead of hardcoding it here.
+ foreach (string l in Intl.get_language_names()) {
+ switch (l) {
+ case "ar": return "arabic";
+ case "eu": return "basque";
+ case "ca": return "catalan";
+ case "da": return "danish";
+ case "nl": return "dutch";
+ case "en": return "english";
+ case "fi": return "finnish";
+ case "fr": return "french";
+ case "de": return "german";
+ case "el": return "greek";
+ case "hi": return "hindi";
+ case "hu": return "hungarian";
+ case "id": return "indonesian";
+ case "ga": return "irish";
+ case "it": return "italian";
+ case "lt": return "lithuanian";
+ case "ne": return "nepali";
+ case "no": return "norwegian";
+ case "pt": return "portuguese";
+ case "ro": return "romanian";
+ case "ru": return "russian";
+ case "sr": return "serbian";
+ case "es": return "spanish";
+ case "sv": return "swedish";
+ case "ta": return "tamil";
+ case "tr": return "turkish";
+ }
+ }
+
+ // Default to English because it seems to be on average the language
+ // most likely to be present in emails, regardless of the user's
+ // language setting. This is not an exact science, and search results
+ // should be ok either way in most cases.
+ return "english";
+ }
+
}
diff --git a/src/engine/meson.build b/src/engine/meson.build
index 0efd773ea..226007d52 100644
--- a/src/engine/meson.build
+++ b/src/engine/meson.build
@@ -323,6 +323,7 @@ engine_dependencies = [
glib,
gmime,
libmath,
+ libstemmer,
libxml,
posix,
sqlite
diff --git a/src/meson.build b/src/meson.build
index 87760e741..f8e2fbeaf 100644
--- a/src/meson.build
+++ b/src/meson.build
@@ -150,6 +150,7 @@ foreach dep : valadoc_dependencies
valadoc_dep_args += '--pkg'
valadoc_dep_args += dep.name()
endforeach
+valadoc_dep_args += [ '--pkg', 'libstemmer' ]
valadoc_dep_args += [ '--pkg', 'posix' ]
valadoc_vapidir_args = []
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]