[geary/mjog/search-update: 53/53] Geary.ImapDB.Account: Drop post-search stemmed term greedy match removal




commit e37f7a9d6e4c4bce2b07e0906e6da19624137dbf
Author: Michael Gratton <mike vee net>
Date:   Thu Jan 14 21:20:25 2021 +1100

    Geary.ImapDB.Account: Drop post-search stemmed term greedy match removal
    
    Stop post-processing search results to drop those that contain a
    matched term that is longer, by some criterion, than a stemmed term.
    
    Since this cannot be specified by SQLite's FTS queries, it has to be
    done outside of the search, which can have a substantial impact on
    performance, and either means running multiple queries outside of a
    transaction to get the required number of search results (potentially
    a large number of times), running the pruning within a transaction
    (potentially blocking the DB for a long time), or returning
    the wrong number of search results (potentially confusing the caller).
    
    Because of these disadvantages, and since SearchQuery's maximum
    difference in lengths between term and stemmed variant helps to prevent
    greedy matching anyway, just drop the post-processing.

 src/engine/api/geary-search-query.vala  | 26 --------------
 src/engine/imap-db/imap-db-account.vala | 62 ---------------------------------
 2 files changed, 88 deletions(-)
---
diff --git a/src/engine/api/geary-search-query.vala b/src/engine/api/geary-search-query.vala
index 62f7416d7..d9603d66a 100644
--- a/src/engine/api/geary-search-query.vala
+++ b/src/engine/api/geary-search-query.vala
@@ -121,32 +121,6 @@ public abstract class Geary.SearchQuery : BaseObject {
             return max;
         }
 
-        /**
-         * Maximum difference in lengths between a matched word and the stemmed variant it matched
-         * against.
-         *
-         * This prevents long words being matched to short stem
-         * variants (which creates opportunities for greedy matching).
-         */
-        internal int get_max_difference_match_stem_lengths() {
-            var max = 0;
-            switch (this) {
-            case EXACT:
-                max = 0;
-                break;
-            case CONSERVATIVE:
-                max = 2;
-                break;
-            case AGGRESSIVE:
-                max = 3;
-                break;
-            case HORIZON:
-                max = int.MAX;
-                break;
-            }
-            return max;
-        }
-
     }
 
 
diff --git a/src/engine/imap-db/imap-db-account.vala b/src/engine/imap-db/imap-db-account.vala
index e5998afbf..e9d3abc0d 100644
--- a/src/engine/imap-db/imap-db-account.vala
+++ b/src/engine/imap-db/imap-db-account.vala
@@ -633,67 +633,9 @@ private class Geary.ImapDB.Account : BaseObject {
         }, cancellable);
 
         debug("Matching emails found: %d", matching_ids.size);
-
-        if (query.has_stemmed_terms && search_matches != null) {
-            strip_greedy_results(query, matching_ids, search_matches);
-        }
-
-        debug("Final search matches: %d", matching_ids.size);
         return matching_ids.is_empty ? null : matching_ids;
     }
 
-    // Strip out from the given collection of matching ids and results
-    // for any search results that only contain a hit due to "greedy"
-    // matching of the stemmed variants on all search terms.
-    private void strip_greedy_results(SearchQuery query,
-                                      Gee.Collection<EmailIdentifier> matches,
-                                      Gee.Map<EmailIdentifier,Gee.Set<string>> results) {
-        int prestripped_results = matches.size;
-        // Gee.Iterator<EmailIdentifier> iter = matches.iterator();
-        // while (iter.next()) {
-        //     // For each matched string in this message, retain the message in the search results
-        //     // if it prefix-matches any of the straight-up parsed terms or matches a stemmed
-        //     // variant (with only max. difference in their lengths allowed, i.e. not a "greedy"
-        //     // match)
-        //     EmailIdentifier id = iter.get();
-        //     bool good_match_found = false;
-        //     Gee.Set<string>? result = results.get(id);
-        //     if (result != null) {
-        //         foreach (string match in result) {
-        //             foreach (SearchQuery.Term term in query.get_all_terms()) {
-        //                 // if prefix-matches parsed term, then don't strip
-        //                 if (match.has_prefix(term.parsed)) {
-        //                     good_match_found = true;
-        //                     break;
-        //                 }
-
-        //                 // if prefix-matches stemmed term w/o doing so
-        //                 // greedily, then don't strip
-        //                 if (term.stemmed != null && match.has_prefix(term.stemmed)) {
-        //                     int diff = match.length - term.stemmed.length;
-        //                     if (diff <= query.max_difference_match_stem_lengths) {
-        //                         good_match_found = true;
-        //                         break;
-        //                     }
-        //                 }
-        //             }
-        //         }
-
-        //         if (good_match_found) {
-        //             break;
-        //         }
-        //     }
-
-        //     if (!good_match_found) {
-        //         iter.remove();
-        //         matches.remove(id);
-        //     }
-        // }
-
-        debug("Stripped %d emails from search for [%s] due to greedy stem matching",
-              prestripped_results - matches.size, query.raw);
-    }
-
     public async Gee.Set<string>? get_search_matches_async(Geary.SearchQuery q,
         Gee.Collection<ImapDB.EmailIdentifier> ids, Cancellable? cancellable = null) throws Error {
         check_open();
@@ -714,10 +656,6 @@ private class Geary.ImapDB.Account : BaseObject {
                 return Db.TransactionOutcome.DONE;
             }
 
-            if (query.has_stemmed_terms) {
-                strip_greedy_results(query, ids, match_map);
-            }
-
             search_matches = new Gee.HashSet<string>();
             foreach (Gee.Set<string> matches in match_map.values)
                 search_matches.add_all(matches);


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]