5
5
*
6
6
* Joe Conway <mail@joeconway.com>
7
7
*
8
- * $PostgreSQL: pgsql/contrib/fuzzystrmatch/fuzzystrmatch.c,v 1.33 2010/07/29 20:11:48 rhaas Exp $
8
+ * $PostgreSQL: pgsql/contrib/fuzzystrmatch/fuzzystrmatch.c,v 1.34 2010/08/02 23: 20:23 rhaas Exp $
9
9
* Copyright (c) 2001-2010, PostgreSQL Global Development Group
10
10
* ALL RIGHTS RESERVED;
11
11
*
50
50
#include <ctype.h>
51
51
52
52
#include "fmgr.h"
53
+ #include "mb/pg_wchar.h"
53
54
#include "utils/builtins.h"
54
55
55
56
PG_MODULE_MAGIC ;
@@ -183,6 +184,18 @@ getcode(char c)
183
184
/* These prevent GH from becoming F */
184
185
#define NOGHTOF (c ) (getcode(c) & 16) /* BDH */
185
186
187
+ /* Faster than memcmp(), for this use case. */
188
+ static bool inline
189
+ rest_of_char_same (const char * s1 , const char * s2 , int len )
190
+ {
191
+ while (len > 0 )
192
+ {
193
+ len -- ;
194
+ if (s1 [len ] != s2 [len ])
195
+ return false;
196
+ }
197
+ return true;
198
+ }
186
199
187
200
/*
188
201
* levenshtein_internal - Calculates Levenshtein distance metric
@@ -195,16 +208,27 @@ levenshtein_internal(text *s, text *t,
195
208
int ins_c , int del_c , int sub_c )
196
209
{
197
210
int m ,
198
- n ;
211
+ n ,
212
+ s_bytes ,
213
+ t_bytes ;
199
214
int * prev ;
200
215
int * curr ;
216
+ int * s_char_len = NULL ;
201
217
int i ,
202
218
j ;
203
- const char * x ;
219
+ const char * s_data ;
220
+ const char * t_data ;
204
221
const char * y ;
205
222
206
- m = VARSIZE_ANY_EXHDR (s );
207
- n = VARSIZE_ANY_EXHDR (t );
223
+ /* Extract a pointer to the actual character data. */
224
+ s_data = VARDATA_ANY (s );
225
+ t_data = VARDATA_ANY (t );
226
+
227
+ /* Determine length of each string in bytes and characters. */
228
+ s_bytes = VARSIZE_ANY_EXHDR (s );
229
+ t_bytes = VARSIZE_ANY_EXHDR (t );
230
+ m = pg_mbstrlen_with_len (s_data , s_bytes );
231
+ n = pg_mbstrlen_with_len (t_data , t_bytes );
208
232
209
233
/*
210
234
* We can transform an empty s into t with n insertions, or a non-empty t
@@ -226,6 +250,28 @@ levenshtein_internal(text *s, text *t,
226
250
errmsg ("argument exceeds the maximum length of %d bytes" ,
227
251
MAX_LEVENSHTEIN_STRLEN )));
228
252
253
+ /*
254
+ * In order to avoid calling pg_mblen() repeatedly on each character in s,
255
+ * we cache all the lengths before starting the main loop -- but if all the
256
+ * characters in both strings are single byte, then we skip this and use
257
+ * a fast-path in the main loop. If only one string contains multi-byte
258
+ * characters, we still build the array, so that the fast-path needn't
259
+ * deal with the case where the array hasn't been initialized.
260
+ */
261
+ if (m != s_bytes || n != t_bytes )
262
+ {
263
+ int i ;
264
+ const char * cp = s_data ;
265
+
266
+ s_char_len = (int * ) palloc ((m + 1 ) * sizeof (int ));
267
+ for (i = 0 ; i < m ; ++ i )
268
+ {
269
+ s_char_len [i ] = pg_mblen (cp );
270
+ cp += s_char_len [i ];
271
+ }
272
+ s_char_len [i ] = 0 ;
273
+ }
274
+
229
275
/* One more cell for initialization column and row. */
230
276
++ m ;
231
277
++ n ;
@@ -244,36 +290,89 @@ levenshtein_internal(text *s, text *t,
244
290
prev [i ] = i * del_c ;
245
291
246
292
/* Loop through rows of the notional array */
247
- for (y = VARDATA_ANY ( t ) , j = 1 ; j < n ; y ++ , j ++ )
293
+ for (y = t_data , j = 1 ; j < n ; j ++ )
248
294
{
249
295
int * temp ;
296
+ const char * x = s_data ;
297
+ int y_char_len = n != t_bytes + 1 ? pg_mblen (y ) : 1 ;
250
298
251
299
/*
252
300
* First cell must increment sequentially, as we're on the j'th row of
253
301
* the (m+1)x(n+1) array.
254
302
*/
255
303
curr [0 ] = j * ins_c ;
256
304
257
- for (x = VARDATA_ANY (s ), i = 1 ; i < m ; x ++ , i ++ )
305
+ /*
306
+ * This inner loop is critical to performance, so we include a
307
+ * fast-path to handle the (fairly common) case where no multibyte
308
+ * characters are in the mix. The fast-path is entitled to assume
309
+ * that if s_char_len is not initialized then BOTH strings contain
310
+ * only single-byte characters.
311
+ */
312
+ if (s_char_len != NULL )
258
313
{
259
- int ins ;
260
- int del ;
261
- int sub ;
262
-
263
- /* Calculate costs for probable operations. */
264
- ins = prev [i ] + ins_c ; /* Insertion */
265
- del = curr [i - 1 ] + del_c ; /* Deletion */
266
- sub = prev [i - 1 ] + ((* x == * y ) ? 0 : sub_c ); /* Substitution */
267
-
268
- /* Take the one with minimum cost. */
269
- curr [i ] = Min (ins , del );
270
- curr [i ] = Min (curr [i ], sub );
314
+ for (i = 1 ; i < m ; i ++ )
315
+ {
316
+ int ins ;
317
+ int del ;
318
+ int sub ;
319
+ int x_char_len = s_char_len [i - 1 ];
320
+
321
+ /*
322
+ * Calculate costs for insertion, deletion, and substitution.
323
+ *
324
+ * When calculating cost for substitution, we compare the last
325
+ * character of each possibly-multibyte character first,
326
+ * because that's enough to rule out most mis-matches. If we
327
+ * get past that test, then we compare the lengths and the
328
+ * remaining bytes.
329
+ */
330
+ ins = prev [i ] + ins_c ;
331
+ del = curr [i - 1 ] + del_c ;
332
+ if (x [x_char_len - 1 ] == y [y_char_len - 1 ]
333
+ && x_char_len == y_char_len &&
334
+ (x_char_len == 1 || rest_of_char_same (x , y , x_char_len )))
335
+ sub = prev [i - 1 ];
336
+ else
337
+ sub = prev [i - 1 ] + sub_c ;
338
+
339
+ /* Take the one with minimum cost. */
340
+ curr [i ] = Min (ins , del );
341
+ curr [i ] = Min (curr [i ], sub );
342
+
343
+ /* Point to next character. */
344
+ x += x_char_len ;
345
+ }
346
+ }
347
+ else
348
+ {
349
+ for (i = 1 ; i < m ; i ++ )
350
+ {
351
+ int ins ;
352
+ int del ;
353
+ int sub ;
354
+
355
+ /* Calculate costs for insertion, deletion, and substitution. */
356
+ ins = prev [i ] + ins_c ;
357
+ del = curr [i - 1 ] + del_c ;
358
+ sub = prev [i - 1 ] + ((* x == * y ) ? 0 : sub_c );
359
+
360
+ /* Take the one with minimum cost. */
361
+ curr [i ] = Min (ins , del );
362
+ curr [i ] = Min (curr [i ], sub );
363
+
364
+ /* Point to next character. */
365
+ x ++ ;
366
+ }
271
367
}
272
368
273
369
/* Swap current row with previous row. */
274
370
temp = curr ;
275
371
curr = prev ;
276
372
prev = temp ;
373
+
374
+ /* Point to next character. */
375
+ y += y_char_len ;
277
376
}
278
377
279
378
/*
0 commit comments