search: skip combining diacritical marks in search operations

https://bugzilla.gnome.org/show_bug.cgi?id=648587
2012-12-12 17:04:27 +01:00
parent 15cac0157c
commit 5308d12239
4 changed files with 86 additions and 5 deletions
--- a/src/shell-app-system.c
+++ b/src/shell-app-system.c
@ -738,7 +738,8 @@ normalize_terms (GSList *terms)
  for (iter = terms; iter; iter = iter->next)
    {
      const char *term = iter->data;
-      normalized_terms = g_slist_prepend (normalized_terms, shell_util_normalize_and_casefold (term));
+      normalized_terms = g_slist_prepend (normalized_terms,
                                          shell_util_normalize_casefold_and_unaccent (term));
    }
  return normalized_terms;
 }
--- a/src/shell-app.c
+++ b/src/shell-app.c
@ -1319,16 +1319,16 @@ shell_app_init_search_data (ShellApp *app)
  appinfo = gmenu_tree_entry_get_app_info (app->entry);
  name = g_app_info_get_name (G_APP_INFO (appinfo));
-  app->casefolded_name = shell_util_normalize_and_casefold (name);
+  app->casefolded_name = shell_util_normalize_casefold_and_unaccent (name);
  generic_name = g_desktop_app_info_get_generic_name (appinfo);
  if (generic_name)
-    app->casefolded_generic_name = shell_util_normalize_and_casefold (generic_name);
+    app->casefolded_generic_name = shell_util_normalize_casefold_and_unaccent (generic_name);
  else
    app->casefolded_generic_name = NULL;
  exec = g_app_info_get_executable (G_APP_INFO (appinfo));
-  normalized_exec = shell_util_normalize_and_casefold (exec);
+  normalized_exec = shell_util_normalize_casefold_and_unaccent (exec);
  app->casefolded_exec = trim_exec_line (normalized_exec);
  g_free (normalized_exec);
@ -1343,7 +1343,7 @@ shell_app_init_search_data (ShellApp *app)
      i = 0;
      while (keywords[i])
        {
-          app->casefolded_keywords[i] = shell_util_normalize_and_casefold (keywords[i]);
+          app->casefolded_keywords[i] = shell_util_normalize_casefold_and_unaccent (keywords[i]);
          ++i;
        }
      app->casefolded_keywords[i] = NULL;
--- a/src/shell-util.c
+++ b/src/shell-util.c
@ -122,12 +122,90 @@ shell_util_normalize_and_casefold (const char *str)
  if (str == NULL)
    return NULL;
  /* NOTE: 'ALL' is equivalent to 'NFKD'. If this is ever updated, please
   * update the unaccenting mechanism as well. */
  normalized = g_utf8_normalize (str, -1, G_NORMALIZE_ALL);
  result = g_utf8_casefold (normalized, -1);
  g_free (normalized);
  return result;
 }
 /* Combining diacritical mark?
 *  Basic range: [0x0300,0x036F]
 *  Extended:    [0x1AB0,0x1AFF]
 *  Supplement:  [0x1DC0,0x1DFF]
 *  For Symbols: [0x20D0,0x20FF]
 *  Half marks:  [0xFE20,0xFE2F]
 *
 * Evaluates to non-zero when the UCS-4 code point @c lies in one of the
 * ranges listed above. The argument is expanded several times, so it must
 * be free of side effects.
 */
 #define IS_CDM_UCS4(c) (((c) >= 0x0300 && (c) <= 0x036F)  || \
                        ((c) >= 0x1AB0 && (c) <= 0x1AFF)  || \
                        ((c) >= 0x1DC0 && (c) <= 0x1DFF)  || \
                        ((c) >= 0x20D0 && (c) <= 0x20FF)  || \
                        ((c) >= 0xFE20 && (c) <= 0xFE2F))
 /* Copied from tracker/src/libtracker-fts/tracker-parser-glib.c under the GPL
 * Originally written by Aleksander Morgado <aleksander@gnu.org>
 */
 /**
 * shell_util_normalize_casefold_and_unaccent:
 * @str: a UTF-8 string, or %NULL
 *
 * Normalizes and casefolds @str with shell_util_normalize_and_casefold(),
 * then removes every character matched by IS_CDM_UCS4() from the result.
 * The removal is done in place on the string returned by that helper.
 *
 * Returns: the processed string, to be released with g_free(), or %NULL
 *   if @str is %NULL or could not be normalized.
 */
 char *
 shell_util_normalize_casefold_and_unaccent (const char *str)
 {
  char *tmp;
  gsize i = 0, j = 0, ilen;
  if (str == NULL)
    return NULL;
  /* Get the NFKD-normalized and casefolded string */
  tmp = shell_util_normalize_and_casefold (str);
  /* The helper can come back empty-handed: g_utf8_normalize() yields NULL
   * for input that is not valid UTF-8. Bail out instead of handing NULL
   * to strlen() below. */
  if (tmp == NULL)
    return NULL;
  ilen = strlen (tmp);
  while (i < ilen)
    {
      gunichar unichar;
      gchar *next_utf8;
      gsize utf8_len;
      /* Get next character of the word as UCS4 */
      unichar = g_utf8_get_char_validated (&tmp[i], -1);
      /* Invalid UTF-8 character or end of original string: stop here and
       * keep whatever has been emitted so far. */
      if (unichar == (gunichar) -1 ||
          unichar == (gunichar) -2)
        {
          break;
        }
      /* Find next UTF-8 character; the distance to it is the byte length
       * of the current one. */
      next_utf8 = g_utf8_next_char (&tmp[i]);
      utf8_len = next_utf8 - &tmp[i];
      if (IS_CDM_UCS4 ((guint32) unichar))
        {
          /* If the given unichar is a combining diacritical mark,
           * just update the original index, not the output one */
          i += utf8_len;
          continue;
        }
      /* If already found a previous combining
       * diacritical mark, indexes are different so
       * need to copy characters. As output and input
       * buffers may overlap, need to use memmove
       * instead of memcpy */
      if (i != j)
        {
          memmove (&tmp[j], &tmp[i], utf8_len);
        }
      /* Update both indexes */
      i += utf8_len;
      j += utf8_len;
    }
  /* Force proper string end; j never exceeds ilen, so this stays inside
   * the buffer. */
  tmp[j] = '\0';
  return tmp;
 }
 /**
 * shell_util_format_date:
 * @format: a strftime-style string format, as parsed by
--- a/src/shell-util.h
+++ b/src/shell-util.h
@ -20,6 +20,8 @@ int      shell_util_get_week_start             (void);
 char    *shell_util_normalize_and_casefold     (const char       *str);
 char    *shell_util_normalize_casefold_and_unaccent (const char  *str);
 char    *shell_util_format_date                (const char       *format,
                                                gint64            time_ms);