From 5308d12239b9896781e0f293791a410fc29f8f68 Mon Sep 17 00:00:00 2001 From: Aleksander Morgado Date: Wed, 12 Dec 2012 17:04:27 +0100 Subject: [PATCH] search: skip combining diacritical marks in search operations https://bugzilla.gnome.org/show_bug.cgi?id=648587 --- src/shell-app-system.c | 3 +- src/shell-app.c | 8 ++--- src/shell-util.c | 78 ++++++++++++++++++++++++++++++++++++++++++ src/shell-util.h | 2 ++ 4 files changed, 86 insertions(+), 5 deletions(-) diff --git a/src/shell-app-system.c b/src/shell-app-system.c index 7f34c28b6..89a074cda 100644 --- a/src/shell-app-system.c +++ b/src/shell-app-system.c @@ -738,7 +738,8 @@ normalize_terms (GSList *terms) for (iter = terms; iter; iter = iter->next) { const char *term = iter->data; - normalized_terms = g_slist_prepend (normalized_terms, shell_util_normalize_and_casefold (term)); + normalized_terms = g_slist_prepend (normalized_terms, + shell_util_normalize_casefold_and_unaccent (term)); } return normalized_terms; } diff --git a/src/shell-app.c b/src/shell-app.c index 8ff53ec7d..1f8d6f01f 100644 --- a/src/shell-app.c +++ b/src/shell-app.c @@ -1319,16 +1319,16 @@ shell_app_init_search_data (ShellApp *app) appinfo = gmenu_tree_entry_get_app_info (app->entry); name = g_app_info_get_name (G_APP_INFO (appinfo)); - app->casefolded_name = shell_util_normalize_and_casefold (name); + app->casefolded_name = shell_util_normalize_casefold_and_unaccent (name); generic_name = g_desktop_app_info_get_generic_name (appinfo); if (generic_name) - app->casefolded_generic_name = shell_util_normalize_and_casefold (generic_name); + app->casefolded_generic_name = shell_util_normalize_casefold_and_unaccent (generic_name); else app->casefolded_generic_name = NULL; exec = g_app_info_get_executable (G_APP_INFO (appinfo)); - normalized_exec = shell_util_normalize_and_casefold (exec); + normalized_exec = shell_util_normalize_casefold_and_unaccent (exec); app->casefolded_exec = trim_exec_line (normalized_exec); g_free (normalized_exec); @@ 
-1343,7 +1343,7 @@ shell_app_init_search_data (ShellApp *app) i = 0; while (keywords[i]) { - app->casefolded_keywords[i] = shell_util_normalize_and_casefold (keywords[i]); + app->casefolded_keywords[i] = shell_util_normalize_casefold_and_unaccent (keywords[i]); ++i; } app->casefolded_keywords[i] = NULL; diff --git a/src/shell-util.c b/src/shell-util.c index 56ebd0219..3821b3aff 100644 --- a/src/shell-util.c +++ b/src/shell-util.c @@ -122,12 +122,90 @@ shell_util_normalize_and_casefold (const char *str) if (str == NULL) return NULL; + /* NOTE: 'ALL' is equivalent to 'NFKD'. If this is ever updated, please + * update the unaccenting mechanism as well. */ normalized = g_utf8_normalize (str, -1, G_NORMALIZE_ALL); result = g_utf8_casefold (normalized, -1); g_free (normalized); return result; } +/* Combining diacritical mark? Non-zero when the UCS-4 code point (c) falls in one of the four ranges below; (c) is expanded eight times, so pass an expression without side effects. + * Basic range: [0x0300,0x036F] + * Supplement: [0x1DC0,0x1DFF] + * For Symbols: [0x20D0,0x20FF] + * Half marks: [0xFE20,0xFE2F] + */ +#define IS_CDM_UCS4(c) (((c) >= 0x0300 && (c) <= 0x036F) || \ + ((c) >= 0x1DC0 && (c) <= 0x1DFF) || \ + ((c) >= 0x20D0 && (c) <= 0x20FF) || \ + ((c) >= 0xFE20 && (c) <= 0xFE2F)) + +/* Copied from tracker/src/libtracker-fts/tracker-parser-glib.c under the GPL + * Originally written by Aleksander Morgado. Takes the result of shell_util_normalize_and_casefold (str) and removes, in place, every code point matched by IS_CDM_UCS4; returns that same buffer, or NULL when str is NULL. Presumably the caller owns the result and releases it with g_free (), as shell_app_init_search_data does for normalized_exec. + */ +char * +shell_util_normalize_casefold_and_unaccent (const char *str) +{ + char *tmp; + gsize i = 0, j = 0, ilen; + + if (str == NULL) + return NULL; + + /* Get the NFKD-normalized and casefolded string */ + tmp = shell_util_normalize_and_casefold (str); /* NOTE(review): tmp is handed to strlen () unchecked; GLib documents g_utf8_normalize () as returning NULL for invalid UTF-8, so a NULL here looks possible - confirm and guard if so */ + ilen = strlen (tmp); + + while (i < ilen) + { + gunichar unichar; + gchar *next_utf8; + gint utf8_len; + + /* Get next character of the word as UCS4 */ + unichar = g_utf8_get_char_validated (&tmp[i], -1); + + /* Invalid UTF-8 character or end of original string. Stop scanning and keep what has been written so far. 
*/ + if (unichar == (gunichar) -1 || + unichar == (gunichar) -2) + { + break; + } + + /* Find next UTF-8 character; utf8_len is the size in bytes of the current one */ + next_utf8 = g_utf8_next_char (&tmp[i]); + utf8_len = next_utf8 - &tmp[i]; + + if (IS_CDM_UCS4 ((guint32) unichar)) + { + /* If the given unichar is a combining diacritical mark, + * just update the original index, not the output one */ + i += utf8_len; + continue; + } + + /* If already found a previous combining + * diacritical mark, indexes are different so + * need to copy characters. As output and input + * buffers may overlap, need to use memmove + * instead of memcpy */ + if (i != j) + { + memmove (&tmp[j], &tmp[i], utf8_len); + } + + /* Update both indexes */ + i += utf8_len; + j += utf8_len; + } + + /* Force proper string end: j is the byte length of the compacted output */ + tmp[j] = '\0'; + + return tmp; +} + /** * shell_util_format_date: * @format: a strftime-style string format, as parsed by diff --git a/src/shell-util.h b/src/shell-util.h index 9dbf7239b..41ba96f4a 100644 --- a/src/shell-util.h +++ b/src/shell-util.h @@ -20,6 +20,8 @@ int shell_util_get_week_start (void); char *shell_util_normalize_and_casefold (const char *str); +char *shell_util_normalize_casefold_and_unaccent (const char *str); + char *shell_util_format_date (const char *format, gint64 time_ms);