Skip to content

Commit b80e106

Browse files
committed
Add mbverifystr() functions specific to each encoding.
This makes the pg_verify_mbstr() function faster by allowing more efficient encoding-specific implementations. All the implementations included in this commit are pretty naive: they just call the same encoding-specific verifychar functions that were used previously, but that already gives a performance boost because the tight character-at-a-time loop is simpler.
Reviewed-by: John Naylor
Discussion: https://www.postgresql.org/message-id/e7861509-3960-538a-9025-b75a61188e01@iki.fi
1 parent a3367aa commit b80e106

File tree

9 files changed

+493
-101
lines changed

9 files changed

+493
-101
lines changed

src/backend/commands/extension.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -682,7 +682,7 @@ read_extension_script_file(const ExtensionControlFile *control,
682682
src_encoding = control->encoding;
683683

684684
/* make sure that source string is valid in the expected encoding */
685-
pg_verify_mbstr_len(src_encoding, src_str, len, false);
685+
(void) pg_verify_mbstr(src_encoding, src_str, len, false);
686686

687687
/*
688688
* Convert the encoding to the database encoding. read_whole_file

src/backend/utils/mb/conv.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -653,7 +653,7 @@ LocalToUtf(const unsigned char *iso, int len,
653653
continue;
654654
}
655655

656-
l = pg_encoding_verifymb(encoding, (const char *) iso, len);
656+
l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
657657
if (l < 0)
658658
break;
659659

src/backend/utils/mb/conversion_procs/euc2004_sjis2004/euc2004_sjis2004.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ euc_jis_20042shift_jis_2004(const unsigned char *euc, unsigned char *p, int len)
8787
continue;
8888
}
8989

90-
l = pg_encoding_verifymb(PG_EUC_JIS_2004, (const char *) euc, len);
90+
l = pg_encoding_verifymbchar(PG_EUC_JIS_2004, (const char *) euc, len);
9191

9292
if (l < 0)
9393
report_invalid_encoding(PG_EUC_JIS_2004,
@@ -238,7 +238,7 @@ shift_jis_20042euc_jis_2004(const unsigned char *sjis, unsigned char *p, int len
238238
continue;
239239
}
240240

241-
l = pg_encoding_verifymb(PG_SHIFT_JIS_2004, (const char *) sjis, len);
241+
l = pg_encoding_verifymbchar(PG_SHIFT_JIS_2004, (const char *) sjis, len);
242242

243243
if (l < 0 || l > len)
244244
report_invalid_encoding(PG_SHIFT_JIS_2004,

src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,7 @@ mic2sjis(const unsigned char *mic, unsigned char *p, int len)
291291
len--;
292292
continue;
293293
}
294-
l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
294+
l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len);
295295
if (l < 0)
296296
report_invalid_encoding(PG_MULE_INTERNAL,
297297
(const char *) mic, len);
@@ -381,7 +381,7 @@ euc_jp2mic(const unsigned char *euc, unsigned char *p, int len)
381381
len--;
382382
continue;
383383
}
384-
l = pg_encoding_verifymb(PG_EUC_JP, (const char *) euc, len);
384+
l = pg_encoding_verifymbchar(PG_EUC_JP, (const char *) euc, len);
385385
if (l < 0)
386386
report_invalid_encoding(PG_EUC_JP,
387387
(const char *) euc, len);
@@ -431,7 +431,7 @@ mic2euc_jp(const unsigned char *mic, unsigned char *p, int len)
431431
len--;
432432
continue;
433433
}
434-
l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
434+
l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len);
435435
if (l < 0)
436436
report_invalid_encoding(PG_MULE_INTERNAL,
437437
(const char *) mic, len);
@@ -485,7 +485,7 @@ euc_jp2sjis(const unsigned char *euc, unsigned char *p, int len)
485485
len--;
486486
continue;
487487
}
488-
l = pg_encoding_verifymb(PG_EUC_JP, (const char *) euc, len);
488+
l = pg_encoding_verifymbchar(PG_EUC_JP, (const char *) euc, len);
489489
if (l < 0)
490490
report_invalid_encoding(PG_EUC_JP,
491491
(const char *) euc, len);
@@ -580,7 +580,7 @@ sjis2euc_jp(const unsigned char *sjis, unsigned char *p, int len)
580580
len--;
581581
continue;
582582
}
583-
l = pg_encoding_verifymb(PG_SJIS, (const char *) sjis, len);
583+
l = pg_encoding_verifymbchar(PG_SJIS, (const char *) sjis, len);
584584
if (l < 0)
585585
report_invalid_encoding(PG_SJIS,
586586
(const char *) sjis, len);

src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ euc_kr2mic(const unsigned char *euc, unsigned char *p, int len)
7676
c1 = *euc;
7777
if (IS_HIGHBIT_SET(c1))
7878
{
79-
l = pg_encoding_verifymb(PG_EUC_KR, (const char *) euc, len);
79+
l = pg_encoding_verifymbchar(PG_EUC_KR, (const char *) euc, len);
8080
if (l != 2)
8181
report_invalid_encoding(PG_EUC_KR,
8282
(const char *) euc, len);
@@ -122,7 +122,7 @@ mic2euc_kr(const unsigned char *mic, unsigned char *p, int len)
122122
len--;
123123
continue;
124124
}
125-
l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
125+
l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len);
126126
if (l < 0)
127127
report_invalid_encoding(PG_MULE_INTERNAL,
128128
(const char *) mic, len);

src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ euc_tw2mic(const unsigned char *euc, unsigned char *p, int len)
148148
c1 = *euc;
149149
if (IS_HIGHBIT_SET(c1))
150150
{
151-
l = pg_encoding_verifymb(PG_EUC_TW, (const char *) euc, len);
151+
l = pg_encoding_verifymbchar(PG_EUC_TW, (const char *) euc, len);
152152
if (l < 0)
153153
report_invalid_encoding(PG_EUC_TW,
154154
(const char *) euc, len);
@@ -213,7 +213,7 @@ mic2euc_tw(const unsigned char *mic, unsigned char *p, int len)
213213
len--;
214214
continue;
215215
}
216-
l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
216+
l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len);
217217
if (l < 0)
218218
report_invalid_encoding(PG_MULE_INTERNAL,
219219
(const char *) mic, len);
@@ -272,7 +272,7 @@ big52mic(const unsigned char *big5, unsigned char *p, int len)
272272
len--;
273273
continue;
274274
}
275-
l = pg_encoding_verifymb(PG_BIG5, (const char *) big5, len);
275+
l = pg_encoding_verifymbchar(PG_BIG5, (const char *) big5, len);
276276
if (l < 0)
277277
report_invalid_encoding(PG_BIG5,
278278
(const char *) big5, len);
@@ -321,7 +321,7 @@ mic2big5(const unsigned char *mic, unsigned char *p, int len)
321321
len--;
322322
continue;
323323
}
324-
l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
324+
l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len);
325325
if (l < 0)
326326
report_invalid_encoding(PG_MULE_INTERNAL,
327327
(const char *) mic, len);

src/backend/utils/mb/mbutils.c

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -519,7 +519,7 @@ pg_convert(PG_FUNCTION_ARGS)
519519
/* make sure that source string is valid */
520520
len = VARSIZE_ANY_EXHDR(string);
521521
src_str = VARDATA_ANY(string);
522-
pg_verify_mbstr_len(src_encoding, src_str, len, false);
522+
(void) pg_verify_mbstr(src_encoding, src_str, len, false);
523523

524524
/* perform conversion */
525525
dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str),
@@ -1215,10 +1215,10 @@ static bool
12151215
pg_generic_charinc(unsigned char *charptr, int len)
12161216
{
12171217
unsigned char *lastbyte = charptr + len - 1;
1218-
mbverifier mbverify;
1218+
mbchar_verifier mbverify;
12191219

12201220
/* We can just invoke the character verifier directly. */
1221-
mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
1221+
mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverifychar;
12221222

12231223
while (*lastbyte < (unsigned char) 255)
12241224
{
@@ -1445,8 +1445,7 @@ pg_database_encoding_max_length(void)
14451445
bool
14461446
pg_verifymbstr(const char *mbstr, int len, bool noError)
14471447
{
1448-
return
1449-
pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError) >= 0;
1448+
return pg_verify_mbstr(GetDatabaseEncoding(), mbstr, len, noError);
14501449
}
14511450

14521451
/*
@@ -1456,7 +1455,18 @@ pg_verifymbstr(const char *mbstr, int len, bool noError)
14561455
bool
14571456
pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
14581457
{
1459-
return pg_verify_mbstr_len(encoding, mbstr, len, noError) >= 0;
1458+
int oklen;
1459+
1460+
Assert(PG_VALID_ENCODING(encoding));
1461+
1462+
oklen = pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len);
1463+
if (oklen != len)
1464+
{
1465+
if (noError)
1466+
return false;
1467+
report_invalid_encoding(encoding, mbstr + oklen, len - oklen);
1468+
}
1469+
return true;
14601470
}
14611471

14621472
/*
@@ -1469,11 +1479,14 @@ pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
14691479
* If OK, return length of string in the encoding.
14701480
* If a problem is found, return -1 when noError is
14711481
* true; when noError is false, ereport() a descriptive message.
1482+
*
1483+
* Note: We cannot use the faster encoding-specific mbverifystr() function
1484+
* here, because we need to count the number of characters in the string.
14721485
*/
14731486
int
14741487
pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
14751488
{
1476-
mbverifier mbverify;
1489+
mbchar_verifier mbverifychar;
14771490
int mb_len;
14781491

14791492
Assert(PG_VALID_ENCODING(encoding));
@@ -1493,7 +1506,7 @@ pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
14931506
}
14941507

14951508
/* fetch function pointer just once */
1496-
mbverify = pg_wchar_table[encoding].mbverify;
1509+
mbverifychar = pg_wchar_table[encoding].mbverifychar;
14971510

14981511
mb_len = 0;
14991512

@@ -1516,7 +1529,7 @@ pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
15161529
report_invalid_encoding(encoding, mbstr, len);
15171530
}
15181531

1519-
l = (*mbverify) ((const unsigned char *) mbstr, len);
1532+
l = (*mbverifychar) ((const unsigned char *) mbstr, len);
15201533

15211534
if (l < 0)
15221535
{

0 commit comments

Comments
 (0)