Teach levenshtein() about multi-byte characters.

robertmhaas · robertmhaas · commit 57d9aefcaa1e · 2010-08-02T23:20:23.000Z
Based on a patch by, and further ideas from, Alexander Korotkov.
diff --git a/contrib/fuzzystrmatch/fuzzystrmatch.c b/contrib/fuzzystrmatch/fuzzystrmatch.c
@@ -5,7 +5,7 @@
  *
  * Joe Conway <mail@joeconway.com>
  *
- * $PostgreSQL: pgsql/contrib/fuzzystrmatch/fuzzystrmatch.c,v 1.33 2010/07/29 20:11:48 rhaas Exp $
+ * $PostgreSQL: pgsql/contrib/fuzzystrmatch/fuzzystrmatch.c,v 1.34 2010/08/02 23:20:23 rhaas Exp $
  * Copyright (c) 2001-2010, PostgreSQL Global Development Group
  * ALL RIGHTS RESERVED;
  *
@@ -50,6 +50,7 @@
 #include <ctype.h>
 
 #include "fmgr.h"
+#include "mb/pg_wchar.h"
 #include "utils/builtins.h"
 
 PG_MODULE_MAGIC;
@@ -183,6 +184,18 @@ getcode(char c)
 /* These prevent GH from becoming F */
 #define NOGHTOF(c)	(getcode(c) & 16)	/* BDH */
 
+/* Are the first len bytes equal?  Faster than memcmp(), for this use case. */
+static bool inline
+rest_of_char_same(const char *s1, const char *s2, int len)
+{
+	while (len > 0)
+	{
+		len--;					/* walk backward, from byte len-1 down to 0 */
+		if (s1[len] != s2[len])
+			return false;		/* first differing byte settles it */
+	}
+	return true;				/* no mismatch found (also when len <= 0) */
+}
 
 /*
  * levenshtein_internal - Calculates Levenshtein distance metric
@@ -195,16 +208,27 @@ levenshtein_internal(text *s, text *t,
 					 int ins_c, int del_c, int sub_c)
 {
 	int			m,
-				n;
+				n,
+				s_bytes,
+				t_bytes;
 	int		   *prev;
 	int		   *curr;
+	int		   *s_char_len = NULL;
 	int			i,
 				j;
-	const char *x;
+	const char *s_data;
+	const char *t_data;
 	const char *y;
 
-	m = VARSIZE_ANY_EXHDR(s);
-	n = VARSIZE_ANY_EXHDR(t);
+	/* Extract a pointer to the actual character data. */
+	s_data = VARDATA_ANY(s);
+	t_data = VARDATA_ANY(t);
+
+	/* Determine length of each string in bytes and characters. */
+	s_bytes = VARSIZE_ANY_EXHDR(s);
+	t_bytes = VARSIZE_ANY_EXHDR(t);
+	m = pg_mbstrlen_with_len(s_data, s_bytes);
+	n = pg_mbstrlen_with_len(t_data, t_bytes);
 
 	/*
 	 * We can transform an empty s into t with n insertions, or a non-empty t
@@ -226,6 +250,28 @@ levenshtein_internal(text *s, text *t,
 				 errmsg("argument exceeds the maximum length of %d bytes",
 						MAX_LEVENSHTEIN_STRLEN)));
 
+	/*
+	 * In order to avoid calling pg_mblen() repeatedly on each character in s,
+	 * we cache all the lengths before starting the main loop -- but if all the
+	 * characters in both strings are single byte, then we skip this and use
+	 * a fast-path in the main loop.  If only one string contains multi-byte
+	 * characters, we still build the array, so that the fast-path needn't
+	 * deal with the case where the array hasn't been initialized.
+	 */
+	if (m != s_bytes || n != t_bytes)
+	{
+		int		i;
+		const char *cp = s_data;
+
+		s_char_len = (int *) palloc((m + 1) * sizeof(int));
+		for (i = 0; i < m; ++i)
+		{
+			s_char_len[i] = pg_mblen(cp);
+			cp += s_char_len[i];
+		}
+		s_char_len[i] = 0;
+	}
+
 	/* One more cell for initialization column and row. */
 	++m;
 	++n;
@@ -244,36 +290,89 @@ levenshtein_internal(text *s, text *t,
 		prev[i] = i * del_c;
 
 	/* Loop through rows of the notional array */
-	for (y = VARDATA_ANY(t), j = 1; j < n; y++, j++)
+	for (y = t_data, j = 1; j < n; j++)
 	{
 		int		   *temp;
+		const char *x = s_data;
+		int			y_char_len = n != t_bytes + 1 ? pg_mblen(y) : 1;
 
 		/*
 		 * First cell must increment sequentially, as we're on the j'th row of
 		 * the (m+1)x(n+1) array.
 		 */
 		curr[0] = j * ins_c;
 
-		for (x = VARDATA_ANY(s), i = 1; i < m; x++, i++)
+		/*
+		 * This inner loop is critical to performance, so we include a
+		 * fast-path to handle the (fairly common) case where no multibyte
+		 * characters are in the mix.  The fast-path is entitled to assume
+		 * that if s_char_len is not initialized then BOTH strings contain
+		 * only single-byte characters.
+		 */
+		if (s_char_len != NULL)
 		{
-			int			ins;
-			int			del;
-			int			sub;
-
-			/* Calculate costs for probable operations. */
-			ins = prev[i] + ins_c;		/* Insertion	*/
-			del = curr[i - 1] + del_c;	/* Deletion		*/
-			sub = prev[i - 1] + ((*x == *y) ? 0 : sub_c);		/* Substitution */
-
-			/* Take the one with minimum cost. */
-			curr[i] = Min(ins, del);
-			curr[i] = Min(curr[i], sub);
+			for (i = 1; i < m; i++)
+			{
+				int			ins;
+				int			del;
+				int			sub;
+				int			x_char_len = s_char_len[i - 1];
+
+				/*
+				 * Calculate costs for insertion, deletion, and substitution.
+				 *
+				 * When calculating cost for substitution, we compare the last
+				 * byte of each possibly-multibyte character first,
+				 * because that's enough to rule out most mis-matches.  If we
+				 * get past that test, then we compare the lengths and the
+				 * remaining bytes.
+				 */
+				ins = prev[i] + ins_c;
+				del = curr[i - 1] + del_c;
+				if (x[x_char_len-1] == y[y_char_len-1]
+					&& x_char_len == y_char_len &&
+					(x_char_len == 1 || rest_of_char_same(x, y, x_char_len)))
+					sub = prev[i - 1];
+				else
+					sub = prev[i - 1] + sub_c;
+
+				/* Take the one with minimum cost. */
+				curr[i] = Min(ins, del);
+				curr[i] = Min(curr[i], sub);
+
+				/* Point to next character. */
+				x += x_char_len;
+			}
+		}
+		else
+		{
+			for (i = 1; i < m; i++)
+			{
+				int			ins;
+				int			del;
+				int			sub;
+
+				/* Calculate costs for insertion, deletion, and substitution. */
+				ins = prev[i] + ins_c;
+				del = curr[i - 1] + del_c;
+				sub = prev[i - 1] + ((*x == *y) ? 0 : sub_c);
+
+				/* Take the one with minimum cost. */
+				curr[i] = Min(ins, del);
+				curr[i] = Min(curr[i], sub);
+
+				/* Point to next character. */
+				x++;
+			}
 		}
 
 		/* Swap current row with previous row. */
 		temp = curr;
 		curr = prev;
 		prev = temp;
+
+		/* Point to next character. */
+		y += y_char_len;
 	}
 
 	/*
diff --git a/doc/src/sgml/fuzzystrmatch.sgml b/doc/src/sgml/fuzzystrmatch.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/fuzzystrmatch.sgml,v 1.6 2010/07/29 19:34:40 petere Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/fuzzystrmatch.sgml,v 1.7 2010/08/02 23:20:23 rhaas Exp $ -->
 
 <sect1 id="fuzzystrmatch">
  <title>fuzzystrmatch</title>
@@ -14,8 +14,9 @@
 
  <caution>
   <para>
-   At present, <filename>fuzzystrmatch</> does not work well with
-   multi-byte encodings (such as UTF-8).
+   At present, the <function>soundex</>, <function>metaphone</>,
+   <function>dmetaphone</>, and <function>dmetaphone_alt</> functions do
+   not work well with multi-byte encodings (such as UTF-8).
   </para>
  </caution>