search: skip combining diacritical marks in search operations

https://bugzilla.gnome.org/show_bug.cgi?id=648587
2012-12-12 17:04:27 +01:00
parent 15cac0157c
commit 5308d12239
4 changed files with 86 additions and 5 deletions
--- a/src/shell-util.c
+++ b/src/shell-util.c
@@ -122,12 +122,90 @@ shell_util_normalize_and_casefold (const char *str)
  if (str == NULL)
    return NULL;

+  /* NOTE: 'ALL' is equivalent to 'NFKD'. If this is ever updated, please
+   * update the unaccenting mechanism as well. */
  normalized = g_utf8_normalize (str, -1, G_NORMALIZE_ALL);
  result = g_utf8_casefold (normalized, -1);
  g_free (normalized);
  return result;
 }

+/* True when code point c is a combining diacritical mark, i.e. when it
+ * lies in one of these Unicode blocks:
+ *   Combining Diacritical Marks              U+0300..U+036F
+ *   Combining Diacritical Marks Supplement   U+1DC0..U+1DFF
+ *   Combining Diacritical Marks for Symbols  U+20D0..U+20FF
+ *   Combining Half Marks                     U+FE20..U+FE2F
+ * The argument is evaluated more than once: pass a side-effect-free value. */
+#define IS_CDM_UCS4(c) \
+  ((0x0300 <= (c) && (c) <= 0x036F) || (0x1DC0 <= (c) && (c) <= 0x1DFF) || \
+   (0x20D0 <= (c) && (c) <= 0x20FF) || (0xFE20 <= (c) && (c) <= 0xFE2F))
+
+/* Copied from tracker/src/libtracker-fts/tracker-parser-glib.c under the GPL
+ * Originally written by Aleksander Morgado <aleksander@gnu.org>
+ *
+ * Returns str normalized and casefolded by shell_util_normalize_and_casefold()
+ * with every combining diacritical mark (IS_CDM_UCS4) stripped, or NULL when
+ * str is NULL or normalization produced no string. The returned buffer is the
+ * one obtained from shell_util_normalize_and_casefold(), compacted in place.
+ */
+char *
+shell_util_normalize_casefold_and_unaccent (const char *str)
+{
+  char *tmp;
+  gsize i = 0, j = 0, ilen;
+
+  if (str == NULL)
+    return NULL;
+
+  /* Get the NFKD-normalized and casefolded string. g_utf8_normalize()
+   * returns NULL for invalid UTF-8, so check before calling strlen() */
+  tmp = shell_util_normalize_and_casefold (str);
+  if (tmp == NULL)
+    return NULL;
+  ilen = strlen (tmp);
+
+  while (i < ilen)
+    {
+      gunichar unichar;
+      gchar *next_utf8;
+      gint utf8_len;
+
+      /* Get next character of the word as UCS4 */
+      unichar = g_utf8_get_char_validated (&tmp[i], -1);
+
+      /* Invalid or incomplete UTF-8 sequence: keep only what was copied */
+      if (unichar == (gunichar) -1 || unichar == (gunichar) -2)
+        break;
+
+      /* Find next UTF-8 character */
+      next_utf8 = g_utf8_next_char (&tmp[i]);
+      utf8_len = next_utf8 - &tmp[i];
+
+      if (IS_CDM_UCS4 ((guint32) unichar))
+        {
+          /* Combining mark: skip it by advancing only the input index */
+          i += utf8_len;
+          continue;
+        }
+
+      /* Once a mark has been dropped the indexes differ and the character
+       * must be copied down. Source and destination may overlap, hence
+       * memmove instead of memcpy */
+      if (i != j)
+        memmove (&tmp[j], &tmp[i], utf8_len);
+
+      /* Update both indexes */
+      i += utf8_len;
+      j += utf8_len;
+    }
+
+  /* Force proper string end */
+  tmp[j] = '\0';
+
+  return tmp;
+}
+
 /**
 * shell_util_format_date:
 * @format: a strftime-style string format, as parsed by