Skip to content

Commit 57d9aef

Browse files
committed
Teach levenshtein() about multi-byte characters.
Based on a patch by, and further ideas from, Alexander Korotkov.
1 parent ad17ff9 commit 57d9aef

File tree

2 files changed

+122
-22
lines changed

2 files changed

+122
-22
lines changed

contrib/fuzzystrmatch/fuzzystrmatch.c

Lines changed: 118 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
*
66
* Joe Conway <mail@joeconway.com>
77
*
8-
* $PostgreSQL: pgsql/contrib/fuzzystrmatch/fuzzystrmatch.c,v 1.33 2010/07/29 20:11:48 rhaas Exp $
8+
* $PostgreSQL: pgsql/contrib/fuzzystrmatch/fuzzystrmatch.c,v 1.34 2010/08/02 23:20:23 rhaas Exp $
99
* Copyright (c) 2001-2010, PostgreSQL Global Development Group
1010
* ALL RIGHTS RESERVED;
1111
*
@@ -50,6 +50,7 @@
5050
#include <ctype.h>
5151

5252
#include "fmgr.h"
53+
#include "mb/pg_wchar.h"
5354
#include "utils/builtins.h"
5455

5556
PG_MODULE_MAGIC;
@@ -183,6 +184,18 @@ getcode(char c)
183184
/* These prevent GH from becoming F */
184185
#define NOGHTOF(c) (getcode(c) & 16) /* BDH */
185186

187+
/* Faster than memcmp(), for this use case. */
188+
static bool inline
189+
rest_of_char_same(const char *s1, const char *s2, int len)
190+
{
191+
while (len > 0)
192+
{
193+
len--;
194+
if (s1[len] != s2[len])
195+
return false;
196+
}
197+
return true;
198+
}
186199

187200
/*
188201
* levenshtein_internal - Calculates Levenshtein distance metric
@@ -195,16 +208,27 @@ levenshtein_internal(text *s, text *t,
195208
int ins_c, int del_c, int sub_c)
196209
{
197210
int m,
198-
n;
211+
n,
212+
s_bytes,
213+
t_bytes;
199214
int *prev;
200215
int *curr;
216+
int *s_char_len = NULL;
201217
int i,
202218
j;
203-
const char *x;
219+
const char *s_data;
220+
const char *t_data;
204221
const char *y;
205222

206-
m = VARSIZE_ANY_EXHDR(s);
207-
n = VARSIZE_ANY_EXHDR(t);
223+
/* Extract a pointer to the actual character data. */
224+
s_data = VARDATA_ANY(s);
225+
t_data = VARDATA_ANY(t);
226+
227+
/* Determine length of each string in bytes and characters. */
228+
s_bytes = VARSIZE_ANY_EXHDR(s);
229+
t_bytes = VARSIZE_ANY_EXHDR(t);
230+
m = pg_mbstrlen_with_len(s_data, s_bytes);
231+
n = pg_mbstrlen_with_len(t_data, t_bytes);
208232

209233
/*
210234
* We can transform an empty s into t with n insertions, or a non-empty t
@@ -226,6 +250,28 @@ levenshtein_internal(text *s, text *t,
226250
errmsg("argument exceeds the maximum length of %d bytes",
227251
MAX_LEVENSHTEIN_STRLEN)));
228252

253+
/*
254+
* In order to avoid calling pg_mblen() repeatedly on each character in s,
255+
* we cache all the lengths before starting the main loop -- but if all the
256+
* characters in both strings are single byte, then we skip this and use
257+
* a fast-path in the main loop. If only one string contains multi-byte
258+
* characters, we still build the array, so that the fast-path needn't
259+
* deal with the case where the array hasn't been initialized.
260+
*/
261+
if (m != s_bytes || n != t_bytes)
262+
{
263+
int i;
264+
const char *cp = s_data;
265+
266+
s_char_len = (int *) palloc((m + 1) * sizeof(int));
267+
for (i = 0; i < m; ++i)
268+
{
269+
s_char_len[i] = pg_mblen(cp);
270+
cp += s_char_len[i];
271+
}
272+
s_char_len[i] = 0;
273+
}
274+
229275
/* One more cell for initialization column and row. */
230276
++m;
231277
++n;
@@ -244,36 +290,89 @@ levenshtein_internal(text *s, text *t,
244290
prev[i] = i * del_c;
245291

246292
/* Loop through rows of the notional array */
247-
for (y = VARDATA_ANY(t), j = 1; j < n; y++, j++)
293+
for (y = t_data, j = 1; j < n; j++)
248294
{
249295
int *temp;
296+
const char *x = s_data;
297+
int y_char_len = n != t_bytes + 1 ? pg_mblen(y) : 1;
250298

251299
/*
252300
* First cell must increment sequentially, as we're on the j'th row of
253301
* the (m+1)x(n+1) array.
254302
*/
255303
curr[0] = j * ins_c;
256304

257-
for (x = VARDATA_ANY(s), i = 1; i < m; x++, i++)
305+
/*
306+
* This inner loop is critical to performance, so we include a
307+
* fast-path to handle the (fairly common) case where no multibyte
308+
* characters are in the mix. The fast-path is entitled to assume
309+
* that if s_char_len is not initialized then BOTH strings contain
310+
* only single-byte characters.
311+
*/
312+
if (s_char_len != NULL)
258313
{
259-
int ins;
260-
int del;
261-
int sub;
262-
263-
/* Calculate costs for probable operations. */
264-
ins = prev[i] + ins_c; /* Insertion */
265-
del = curr[i - 1] + del_c; /* Deletion */
266-
sub = prev[i - 1] + ((*x == *y) ? 0 : sub_c); /* Substitution */
267-
268-
/* Take the one with minimum cost. */
269-
curr[i] = Min(ins, del);
270-
curr[i] = Min(curr[i], sub);
314+
for (i = 1; i < m; i++)
315+
{
316+
int ins;
317+
int del;
318+
int sub;
319+
int x_char_len = s_char_len[i - 1];
320+
321+
/*
322+
* Calculate costs for insertion, deletion, and substitution.
323+
*
324+
* When calculating cost for substitution, we compare the last
325+
* character of each possibly-multibyte character first,
326+
* because that's enough to rule out most mis-matches. If we
327+
* get past that test, then we compare the lengths and the
328+
* remaining bytes.
329+
*/
330+
ins = prev[i] + ins_c;
331+
del = curr[i - 1] + del_c;
332+
if (x[x_char_len-1] == y[y_char_len-1]
333+
&& x_char_len == y_char_len &&
334+
(x_char_len == 1 || rest_of_char_same(x, y, x_char_len)))
335+
sub = prev[i - 1];
336+
else
337+
sub = prev[i - 1] + sub_c;
338+
339+
/* Take the one with minimum cost. */
340+
curr[i] = Min(ins, del);
341+
curr[i] = Min(curr[i], sub);
342+
343+
/* Point to next character. */
344+
x += x_char_len;
345+
}
346+
}
347+
else
348+
{
349+
for (i = 1; i < m; i++)
350+
{
351+
int ins;
352+
int del;
353+
int sub;
354+
355+
/* Calculate costs for insertion, deletion, and substitution. */
356+
ins = prev[i] + ins_c;
357+
del = curr[i - 1] + del_c;
358+
sub = prev[i - 1] + ((*x == *y) ? 0 : sub_c);
359+
360+
/* Take the one with minimum cost. */
361+
curr[i] = Min(ins, del);
362+
curr[i] = Min(curr[i], sub);
363+
364+
/* Point to next character. */
365+
x++;
366+
}
271367
}
272368

273369
/* Swap current row with previous row. */
274370
temp = curr;
275371
curr = prev;
276372
prev = temp;
373+
374+
/* Point to next character. */
375+
y += y_char_len;
277376
}
278377

279378
/*

doc/src/sgml/fuzzystrmatch.sgml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<!-- $PostgreSQL: pgsql/doc/src/sgml/fuzzystrmatch.sgml,v 1.6 2010/07/29 19:34:40 petere Exp $ -->
1+
<!-- $PostgreSQL: pgsql/doc/src/sgml/fuzzystrmatch.sgml,v 1.7 2010/08/02 23:20:23 rhaas Exp $ -->
22

33
<sect1 id="fuzzystrmatch">
44
<title>fuzzystrmatch</title>
@@ -14,8 +14,9 @@
1414

1515
<caution>
1616
<para>
17-
At present, <filename>fuzzystrmatch</> does not work well with
18-
multi-byte encodings (such as UTF-8).
17+
At present, the <function>soundex</>, <function>metaphone</>,
18+
<function>dmetaphone</>, and <function>dmetaphone_alt</> functions do
19+
not work well with multi-byte encodings (such as UTF-8).
1920
</para>
2021
</caution>
2122

0 commit comments

Comments
 (0)