Skip to content

Commit 33c697e

Browse files
committed
Make ts_locale.c's character-type functions cope with UTF-16.
On Windows, in UTF-8 database encoding, what char2wchar() produces is UTF-16, not UTF-32; i.e., characters above U+FFFF will be represented by surrogate pairs. t_isdigit() and siblings did not account for this and failed to provide a large enough result buffer. That in turn led to bogus "invalid multibyte character for locale" errors, because contrary to what you might think from char2wchar()'s documentation, its Windows code path doesn't cope sanely with buffer overflow.

The solution for t_isdigit() and siblings is pretty clear: provide a 3-wchar_t result buffer, not 2.

char2wchar() also needs some work to provide more consistent, and more accurately documented, buffer overrun behavior. But that's a bigger job and it doesn't actually have any immediate payoff, so leave it for later.

Per bug #15476 from Kenji Uno, who deserves credit for identifying the cause of the problem. Back-patch to all active branches.

Discussion: https://postgr.es/m/15476-4314f480acf0f114@postgresql.org
1 parent 1aad3a7 commit 33c697e

File tree

1 file changed

+19
-8
lines changed

1 file changed

+19
-8
lines changed

src/backend/tsearch/ts_locale.c

+19-8
Original file line numberDiff line numberDiff line change
@@ -23,18 +23,29 @@ static void tsearch_readline_callback(void *arg);
2323

2424
#ifdef USE_WIDE_UPPER_LOWER
2525

26+
/*
27+
* The reason these functions use a 3-wchar_t output buffer, not 2 as you
28+
* might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
29+
* getting from char2wchar() is UTF16 not UTF32. A single input character
30+
* may therefore produce a surrogate pair rather than just one wchar_t;
31+
* we also need room for a trailing null. When we do get a surrogate pair,
32+
* we pass just the first code to iswdigit() etc, so that these functions will
33+
* always return false for characters outside the Basic Multilingual Plane.
34+
*/
35+
#define WC_BUF_LEN 3
36+
2637
int
2738
t_isdigit(const char *ptr)
2839
{
2940
int clen = pg_mblen(ptr);
30-
wchar_t character[2];
41+
wchar_t character[WC_BUF_LEN];
3142
Oid collation = DEFAULT_COLLATION_OID; /* TODO */
3243
pg_locale_t mylocale = 0; /* TODO */
3344

3445
if (clen == 1 || lc_ctype_is_c(collation))
3546
return isdigit(TOUCHAR(ptr));
3647

37-
char2wchar(character, 2, ptr, clen, mylocale);
48+
char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
3849

3950
return iswdigit((wint_t) character[0]);
4051
}
@@ -43,14 +54,14 @@ int
4354
t_isspace(const char *ptr)
4455
{
4556
int clen = pg_mblen(ptr);
46-
wchar_t character[2];
57+
wchar_t character[WC_BUF_LEN];
4758
Oid collation = DEFAULT_COLLATION_OID; /* TODO */
4859
pg_locale_t mylocale = 0; /* TODO */
4960

5061
if (clen == 1 || lc_ctype_is_c(collation))
5162
return isspace(TOUCHAR(ptr));
5263

53-
char2wchar(character, 2, ptr, clen, mylocale);
64+
char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
5465

5566
return iswspace((wint_t) character[0]);
5667
}
@@ -59,14 +70,14 @@ int
5970
t_isalpha(const char *ptr)
6071
{
6172
int clen = pg_mblen(ptr);
62-
wchar_t character[2];
73+
wchar_t character[WC_BUF_LEN];
6374
Oid collation = DEFAULT_COLLATION_OID; /* TODO */
6475
pg_locale_t mylocale = 0; /* TODO */
6576

6677
if (clen == 1 || lc_ctype_is_c(collation))
6778
return isalpha(TOUCHAR(ptr));
6879

69-
char2wchar(character, 2, ptr, clen, mylocale);
80+
char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
7081

7182
return iswalpha((wint_t) character[0]);
7283
}
@@ -75,14 +86,14 @@ int
7586
t_isprint(const char *ptr)
7687
{
7788
int clen = pg_mblen(ptr);
78-
wchar_t character[2];
89+
wchar_t character[WC_BUF_LEN];
7990
Oid collation = DEFAULT_COLLATION_OID; /* TODO */
8091
pg_locale_t mylocale = 0; /* TODO */
8192

8293
if (clen == 1 || lc_ctype_is_c(collation))
8394
return isprint(TOUCHAR(ptr));
8495

85-
char2wchar(character, 2, ptr, clen, mylocale);
96+
char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
8697

8798
return iswprint((wint_t) character[0]);
8899
}

0 commit comments

Comments
 (0)