[geary/wip/composer-folks] Ensure Geary.ContactStoreImpl handles non english searches
- From: Michael Gratton <mjog src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [geary/wip/composer-folks] Ensure Geary.ContactStoreImpl handles non english searches
- Date: Sat, 22 Jun 2019 04:14:44 +0000 (UTC)
commit 4e1aa3251402c4207b5bf3af446aa07642fe28a8
Author: Michael Gratton <mike vee net>
Date: Sat Jun 22 14:11:10 2019 +1000
Ensure Geary.ContactStoreImpl handles non english searches
Fix DB impl actually do UTF-8 case-insensitive search matching. Add
some unit tests.
src/engine/common/common-contact-store-impl.vala | 10 +--
src/engine/db/db.vala | 11 +++
src/engine/imap-db/imap-db-database.vala | 64 +++++++++++----
.../common/common-contact-store-impl-test.vala | 91 ++++++++++++++++++----
test/engine/imap-db/imap-db-database-test.vala | 4 +-
5 files changed, 146 insertions(+), 34 deletions(-)
---
diff --git a/src/engine/common/common-contact-store-impl.vala
b/src/engine/common/common-contact-store-impl.vala
index 433213bd..912fee2e 100644
--- a/src/engine/common/common-contact-store-impl.vala
+++ b/src/engine/common/common-contact-store-impl.vala
@@ -98,19 +98,19 @@ internal class Geary.ContactStoreImpl : BaseObject, Geary.ContactStore {
GLib.Cancellable? cancellable)
throws GLib.Error {
Gee.Collection<Contact> contacts = new Gee.LinkedList<Contact>();
- string normalised_query = query.make_valid().normalize().down();
+ string normalised_query = Geary.Db.normalise_case_insensitive_query(query);
if (!String.is_empty(normalised_query)) {
normalised_query = normalised_query + "%";
Db.Statement stmt = cx.prepare("""
SELECT * FROM ContactTable
WHERE highest_importance >= ? AND (
- real_name LIKE ? COLLATE UTF8ICASE OR
- normalized_email LIKE ? COLLATE UTF8ICASE
+ UTF8FOLD(real_name) LIKE ? OR
+ UTF8FOLD(email) LIKE ?
)
ORDER BY highest_importance DESC,
real_name IS NULL,
- real_name COLLATE UTF8ICASE,
- email COLLATE UTF8ICASE
+ real_name COLLATE UTF8COLL,
+ email COLLATE UTF8COLL
LIMIT ?
""");
stmt.bind_uint(0, min_importance);
diff --git a/src/engine/db/db.vala b/src/engine/db/db.vala
index ff362496..b062ee9a 100644
--- a/src/engine/db/db.vala
+++ b/src/engine/db/db.vala
@@ -81,6 +81,17 @@ public bool set_shared_cache_mode(bool enabled) {
return sqlite3_enable_shared_cache(enabled ? 1 : 0) == Sqlite.OK;
}
+/** Standard transformation for case-insensitive string values. */
+public inline string normalise_case_insensitive_query(string text) {
+ // This would be a place to do transliteration to improve query
+ // results, for example normalising `á` to `a`. The built-in GLib
+ // method `string.to_ascii()` does this but is too strong: It will
+ // convert e.g. CJK chars to `?`. The `string.tokenize_and_fold`
+ // function may work better but the calling interface is all
+ // wrong.
+ return text.normalize().casefold();
+}
+
private void check_cancelled(string? method, Cancellable? cancellable) throws IOError {
if (cancellable != null && cancellable.is_cancelled())
throw new IOError.CANCELLED("%s cancelled", !String.is_empty(method) ? method : "Operation");
diff --git a/src/engine/imap-db/imap-db-database.vala b/src/engine/imap-db/imap-db-database.vala
index cbf65d65..91e2f01c 100644
--- a/src/engine/imap-db/imap-db-database.vala
+++ b/src/engine/imap-db/imap-db-database.vala
@@ -5,22 +5,45 @@
* (version 2.1 or later). See the COPYING file in this distribution.
*/
-[CCode (cname = "g_utf8_casefold")]
-extern string utf8_casefold(string data, ssize_t len);
+[CCode (cname = "g_utf8_collate_key")]
+extern string utf8_collate_key(string data, ssize_t len);
extern int sqlite3_unicodesn_register_tokenizer(Sqlite.Database db);
private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
- /** Name of UTF-8 case-sensitive SQLite collation function name. */
- public const string UTF8_CASE_INSENSITIVE_COLLATION = "UTF8ICASE";
- private static int case_insensitive_collation(int a_len, void* a_bytes,
- int b_len, void* b_bytes) {
- string a_str = utf8_casefold((string) a_bytes, a_len).collate_key();
- string b_str = utf8_casefold((string) b_bytes, b_len).collate_key();
- return strcmp(a_str, b_str);
+ /** SQLite UTF-8 case-insensitive, transliterating function name. */
+ public const string UTF8_CASE_INSENSITIVE_FN = "UTF8FOLD";
+
+ /** SQLite UTF-8 collation name. */
+ public const string UTF8_COLLATE = "UTF8COLL";
+
+
+ private static void utf8_transliterate_fold(Sqlite.Context context,
+ Sqlite.Value[] values) {
+ string? text = values[0].to_text();
+ if (text != null) {
+ context.result_text(Geary.Db.normalise_case_insensitive_query(text));
+ } else {
+ context.result_value(values[0]);
+ }
+ }
+
+ private static int utf8_collate(int a_len, void* a_bytes,
+ int b_len, void* b_bytes) {
+ // Don't need to normalise, collate_key() will do it for us
+ string? a_str = null;
+ if (a_bytes != null) {
+ a_str = utf8_collate_key((string) a_bytes, a_len);
+ }
+ string? b_str = null;
+ if (b_bytes != null) {
+ b_str = utf8_collate_key((string) b_bytes, b_len);
+ }
+ return GLib.strcmp(a_str, b_str);
}
+
internal GLib.File attachments_path;
private const int OPEN_PUMP_EVENT_LOOP_MSEC = 100;
@@ -598,14 +621,29 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
cx.set_recursive_triggers(true);
cx.set_synchronous(Db.SynchronousMode.NORMAL);
sqlite3_unicodesn_register_tokenizer(cx.db);
+
+ if (cx.db.create_function(
+ UTF8_CASE_INSENSITIVE_FN,
+ 1, // n args
+ Sqlite.UTF8,
+ null,
+ Database.utf8_transliterate_fold,
+ null,
+ null
+ ) != Sqlite.OK) {
+ throw new DatabaseError.GENERAL(
+ "Failed to register function %s",
+ UTF8_CASE_INSENSITIVE_FN
+ );
+ }
+
if (cx.db.create_collation(
- UTF8_CASE_INSENSITIVE_COLLATION,
+ UTF8_COLLATE,
Sqlite.UTF8,
- Database.case_insensitive_collation
+ Database.utf8_collate
) != Sqlite.OK) {
throw new DatabaseError.GENERAL(
- "Failed to register collation function %s",
- UTF8_CASE_INSENSITIVE_COLLATION
+ "Failed to register collation %s", UTF8_COLLATE
);
}
}
diff --git a/test/engine/common/common-contact-store-impl-test.vala
b/test/engine/common/common-contact-store-impl-test.vala
index 99530c87..b0a323ae 100644
--- a/test/engine/common/common-contact-store-impl-test.vala
+++ b/test/engine/common/common-contact-store-impl-test.vala
@@ -20,6 +20,8 @@ class Geary.ContactStoreImplTest : TestCase {
add_test("search_no_match", search_no_match);
add_test("search_email_match", search_email_match);
add_test("search_name_match", search_name_match);
+ add_test("search_utf8_latin_names", search_utf8_latin_names);
+ add_test("search_utf8_multi_byte_names", search_utf8_multi_byte_names);
add_test("update_new_contact", update_new_contact);
add_test("update_existing_contact", update_existing_contact);
}
@@ -45,20 +47,20 @@ class Geary.ContactStoreImplTest : TestCase {
this.db.open.end(async_result());
this.db.exec("""
-INSERT INTO ContactTable (
- id,
- normalized_email,
- real_name,
- email,
- highest_importance
-) VALUES (
- 1,
- 'test example com',
- 'Test Name',
- 'Test example com',
- 50
-);
-""");
+ INSERT INTO ContactTable (
+ id,
+ normalized_email,
+ real_name,
+ email,
+ highest_importance
+ ) VALUES (
+ 1,
+ 'test example com',
+ 'Test Name',
+ 'Test example com',
+ 50
+ );
+ """);
this.test_article = new ContactStoreImpl(this.db);
}
@@ -152,6 +154,67 @@ INSERT INTO ContactTable (
assert_false(search_hit.flags.always_load_remote_images(), "Existing flags");
}
+ public void search_utf8_latin_names() throws GLib.Error {
+ this.db.exec("""
+ INSERT INTO ContactTable (
+ real_name,
+ email,
+ normalized_email,
+ highest_importance
+ ) VALUES (
+ 'Germán',
+ 'latin example com',
+ 'latin example com',
+ 50
+ );
+ """);
+ test_article.search.begin(
+ "germá",
+ 0,
+ 10,
+ null,
+ (obj, ret) => { async_complete(ret); }
+ );
+ Gee.Collection<Contact> results = test_article.search.end(
+ async_result()
+ );
+ assert_int(1, results.size, "results.size");
+
+ Contact search_hit = Collection.get_first(results);
+ assert_string("Germán", search_hit.real_name, "Existing real_name");
+ }
+
+ public void search_utf8_multi_byte_names() throws GLib.Error {
+ this.db.exec("""
+ INSERT INTO ContactTable (
+ real_name,
+ email,
+ normalized_email,
+ highest_importance
+ ) VALUES (
+ '年収1億円目指せ',
+ 'cjk example com',
+ 'cjk example com',
+ 50
+ );
+ """);
+
+ test_article.search.begin(
+ "年収",
+ 0,
+ 10,
+ null,
+ (obj, ret) => { async_complete(ret); }
+ );
+ Gee.Collection<Contact> results = test_article.search.end(
+ async_result()
+ );
+ assert_int(1, results.size, "results.size");
+
+ Contact search_hit = Collection.get_first(results);
+ assert_string("年収1億円目指せ", search_hit.real_name, "Existing real_name");
+ }
+
public void update_new_contact() throws GLib.Error {
Contact not_persisted = new Contact(
"New example com",
diff --git a/test/engine/imap-db/imap-db-database-test.vala b/test/engine/imap-db/imap-db-database-test.vala
index f937456e..44a4845f 100644
--- a/test/engine/imap-db/imap-db-database-test.vala
+++ b/test/engine/imap-db/imap-db-database-test.vala
@@ -147,10 +147,10 @@ class Geary.ImapDB.DatabaseTest : TestCase {
INSERT INTO Test (test_str) VALUES ('BB');
INSERT INTO Test (test_str) VALUES ('🤯');
""");
- string[] expected = { "🤯", "BB", "B", "a" };
+ string[] expected = { "a", "BB", "B", "🤯" };
Db.Result result = db.query(
- "SELECT test_str FROM Test ORDER BY test_str COLLATE UTF8ICASE DESC"
+ "SELECT test_str FROM Test ORDER BY test_str COLLATE UTF8COLL DESC"
);
int i = 0;
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]