Skip to content

Commit 2ab0796

Browse files
committed
Fix char2wchar/wchar2char to support collations properly.
These functions should take a pg_locale_t, not a collation OID, and should call mbstowcs_l/wcstombs_l where available. Where those functions are not available, temporarily select the correct locale with uselocale(). This change removes the bogus assumption that all locales selectable in a given database have the same wide-character conversion method; in particular, the collate.linux.utf8 regression test now passes with LC_CTYPE=C, so long as the database encoding is UTF8. I decided to move the char2wchar/wchar2char functions out of mbutils.c and into pg_locale.c, because they work on wchar_t, not pg_wchar, and thus don't really belong with the mbutils.c functions. Keeping them where they were would have required importing pg_locale_t into pg_wchar.h somehow, which did not seem like a good plan.
1 parent bb85030 commit 2ab0796

File tree

12 files changed

+217
-144
lines changed

12 files changed

+217
-144
lines changed

configure

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18985,7 +18985,8 @@ fi
1898518985

1898618986

1898718987

18988-
for ac_func in cbrt dlopen fcvt fdatasync getifaddrs getpeereid getpeerucred getrlimit memmove poll pstat readlink scandir setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs
18988+
18989+
for ac_func in cbrt dlopen fcvt fdatasync getifaddrs getpeereid getpeerucred getrlimit memmove poll pstat readlink scandir setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs wcstombs_l
1898918990
do
1899018991
as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
1899118992
{ $as_echo "$as_me:$LINENO: checking for $ac_func" >&5

configure.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1187,7 +1187,7 @@ PGAC_VAR_INT_TIMEZONE
11871187
AC_FUNC_ACCEPT_ARGTYPES
11881188
PGAC_FUNC_GETTIMEOFDAY_1ARG
11891189

1190-
AC_CHECK_FUNCS([cbrt dlopen fcvt fdatasync getifaddrs getpeereid getpeerucred getrlimit memmove poll pstat readlink scandir setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs])
1190+
AC_CHECK_FUNCS([cbrt dlopen fcvt fdatasync getifaddrs getpeereid getpeerucred getrlimit memmove poll pstat readlink scandir setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs wcstombs_l])
11911191

11921192
AC_REPLACE_FUNCS(fseeko)
11931193
case $host_os in

src/backend/tsearch/ts_locale.c

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,12 @@ t_isdigit(const char *ptr)
2929
int clen = pg_mblen(ptr);
3030
wchar_t character[2];
3131
Oid collation = DEFAULT_COLLATION_OID; /* TODO */
32+
pg_locale_t mylocale = 0; /* TODO */
3233

3334
if (clen == 1 || lc_ctype_is_c(collation))
3435
return isdigit(TOUCHAR(ptr));
3536

36-
char2wchar(character, 2, ptr, clen, collation);
37+
char2wchar(character, 2, ptr, clen, mylocale);
3738

3839
return iswdigit((wint_t) character[0]);
3940
}
@@ -44,11 +45,12 @@ t_isspace(const char *ptr)
4445
int clen = pg_mblen(ptr);
4546
wchar_t character[2];
4647
Oid collation = DEFAULT_COLLATION_OID; /* TODO */
48+
pg_locale_t mylocale = 0; /* TODO */
4749

4850
if (clen == 1 || lc_ctype_is_c(collation))
4951
return isspace(TOUCHAR(ptr));
5052

51-
char2wchar(character, 2, ptr, clen, collation);
53+
char2wchar(character, 2, ptr, clen, mylocale);
5254

5355
return iswspace((wint_t) character[0]);
5456
}
@@ -59,11 +61,12 @@ t_isalpha(const char *ptr)
5961
int clen = pg_mblen(ptr);
6062
wchar_t character[2];
6163
Oid collation = DEFAULT_COLLATION_OID; /* TODO */
64+
pg_locale_t mylocale = 0; /* TODO */
6265

6366
if (clen == 1 || lc_ctype_is_c(collation))
6467
return isalpha(TOUCHAR(ptr));
6568

66-
char2wchar(character, 2, ptr, clen, collation);
69+
char2wchar(character, 2, ptr, clen, mylocale);
6770

6871
return iswalpha((wint_t) character[0]);
6972
}
@@ -74,11 +77,12 @@ t_isprint(const char *ptr)
7477
int clen = pg_mblen(ptr);
7578
wchar_t character[2];
7679
Oid collation = DEFAULT_COLLATION_OID; /* TODO */
80+
pg_locale_t mylocale = 0; /* TODO */
7781

7882
if (clen == 1 || lc_ctype_is_c(collation))
7983
return isprint(TOUCHAR(ptr));
8084

81-
char2wchar(character, 2, ptr, clen, collation);
85+
char2wchar(character, 2, ptr, clen, mylocale);
8286

8387
return iswprint((wint_t) character[0]);
8488
}
@@ -246,6 +250,7 @@ lowerstr_with_len(const char *str, int len)
246250

247251
#ifdef USE_WIDE_UPPER_LOWER
248252
Oid collation = DEFAULT_COLLATION_OID; /* TODO */
253+
pg_locale_t mylocale = 0; /* TODO */
249254
#endif
250255

251256
if (len == 0)
@@ -272,7 +277,7 @@ lowerstr_with_len(const char *str, int len)
272277
*/
273278
wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
274279

275-
wlen = char2wchar(wstr, len + 1, str, len, collation);
280+
wlen = char2wchar(wstr, len + 1, str, len, mylocale);
276281
Assert(wlen <= len);
277282

278283
while (*wptr)
@@ -287,7 +292,7 @@ lowerstr_with_len(const char *str, int len)
287292
len = pg_database_encoding_max_length() * wlen + 1;
288293
out = (char *) palloc(len);
289294

290-
wlen = wchar2char(out, wstr, len, collation);
295+
wlen = wchar2char(out, wstr, len, mylocale);
291296

292297
pfree(wstr);
293298

src/backend/tsearch/wparser_def.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -300,21 +300,23 @@ TParserInit(char *str, int len)
300300
if (prs->charmaxlen > 1)
301301
{
302302
Oid collation = DEFAULT_COLLATION_OID; /* TODO */
303+
pg_locale_t mylocale = 0; /* TODO */
303304

304305
prs->usewide = true;
305306
if (lc_ctype_is_c(collation))
306307
{
307308
/*
308309
* char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
309-
* be not equal to sizeof(wchar_t)
310+
* be different from sizeof(wchar_t)
310311
*/
311312
prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
312313
pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
313314
}
314315
else
315316
{
316317
prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
317-
char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr, collation);
318+
char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
319+
mylocale);
318320
}
319321
}
320322
else

src/backend/utils/adt/formatting.c

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1454,6 +1454,10 @@ str_numth(char *dest, char *num, int type)
14541454
return dest;
14551455
}
14561456

1457+
/*****************************************************************************
1458+
* upper/lower/initcap functions
1459+
*****************************************************************************/
1460+
14571461
/*
14581462
* If the system provides the needed functions for wide-character manipulation
14591463
* (which are all standardized by C99), then we implement upper/lower/initcap
@@ -1527,7 +1531,7 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
15271531
/* Output workspace cannot have more codes than input bytes */
15281532
workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
15291533

1530-
char2wchar(workspace, nbytes + 1, buff, nbytes, collid);
1534+
char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
15311535

15321536
for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
15331537
{
@@ -1543,7 +1547,7 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
15431547
result_size = curr_char * pg_database_encoding_max_length() + 1;
15441548
result = palloc(result_size);
15451549

1546-
wchar2char(result, workspace, result_size, collid);
1550+
wchar2char(result, workspace, result_size, mylocale);
15471551
pfree(workspace);
15481552
}
15491553
#endif /* USE_WIDE_UPPER_LOWER */
@@ -1648,7 +1652,7 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
16481652
/* Output workspace cannot have more codes than input bytes */
16491653
workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
16501654

1651-
char2wchar(workspace, nbytes + 1, buff, nbytes, collid);
1655+
char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
16521656

16531657
for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
16541658
{
@@ -1664,7 +1668,7 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
16641668
result_size = curr_char * pg_database_encoding_max_length() + 1;
16651669
result = palloc(result_size);
16661670

1667-
wchar2char(result, workspace, result_size, collid);
1671+
wchar2char(result, workspace, result_size, mylocale);
16681672
pfree(workspace);
16691673
}
16701674
#endif /* USE_WIDE_UPPER_LOWER */
@@ -1781,7 +1785,7 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
17811785
/* Output workspace cannot have more codes than input bytes */
17821786
workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
17831787

1784-
char2wchar(workspace, nbytes + 1, buff, nbytes, collid);
1788+
char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
17851789

17861790
for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
17871791
{
@@ -1809,7 +1813,7 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
18091813
result_size = curr_char * pg_database_encoding_max_length() + 1;
18101814
result = palloc(result_size);
18111815

1812-
wchar2char(result, workspace, result_size, collid);
1816+
wchar2char(result, workspace, result_size, mylocale);
18131817
pfree(workspace);
18141818
}
18151819
#endif /* USE_WIDE_UPPER_LOWER */

src/backend/utils/adt/pg_locale.c

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1030,3 +1030,176 @@ pg_newlocale_from_collation(Oid collid)
10301030

10311031
return cache_entry->locale;
10321032
}
1033+
1034+
1035+
/*
1036+
* These functions convert from/to libc's wchar_t, *not* pg_wchar.
1037+
* Therefore we keep them here rather than with the mbutils code.
1038+
*/
1039+
1040+
#ifdef USE_WIDE_UPPER_LOWER
1041+
1042+
/*
1043+
* wchar2char --- convert wide characters to multibyte format
1044+
*
1045+
* This has the same API as the standard wcstombs_l() function; in particular,
1046+
* tolen is the maximum number of bytes to store at *to, and *from must be
1047+
* zero-terminated. The output will be zero-terminated iff there is room.
1048+
*/
1049+
size_t
1050+
wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
1051+
{
1052+
size_t result;
1053+
1054+
if (tolen == 0)
1055+
return 0;
1056+
1057+
#ifdef WIN32
1058+
1059+
/*
1060+
* On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
1061+
* for some reason mbstowcs and wcstombs won't do this for us, so we use
1062+
* WideCharToMultiByte() here and MultiByteToWideChar() in char2wchar().
1063+
*/
1064+
if (GetDatabaseEncoding() == PG_UTF8)
1065+
{
1066+
result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
1067+
NULL, NULL);
1068+
/* A zero return is failure */
1069+
if (result <= 0)
1070+
result = -1;
1071+
else
1072+
{
1073+
Assert(result <= tolen);
1074+
/* Microsoft counts the zero terminator in the result */
1075+
result--;
1076+
}
1077+
}
1078+
else
1079+
#endif /* WIN32 */
1080+
if (locale == (pg_locale_t) 0)
1081+
{
1082+
/* Use wcstombs directly for the default locale */
1083+
result = wcstombs(to, from, tolen);
1084+
}
1085+
else
1086+
{
1087+
#ifdef HAVE_LOCALE_T
1088+
#ifdef HAVE_WCSTOMBS_L
1089+
/* Use wcstombs_l for nondefault locales */
1090+
result = wcstombs_l(to, from, tolen, locale);
1091+
#else /* !HAVE_WCSTOMBS_L */
1092+
/* We have to temporarily set the locale as current ... ugh */
1093+
locale_t save_locale = uselocale(locale);
1094+
1095+
result = wcstombs(to, from, tolen);
1096+
1097+
uselocale(save_locale);
1098+
#endif /* HAVE_WCSTOMBS_L */
1099+
#else /* !HAVE_LOCALE_T */
1100+
/* Can't have locale != 0 without HAVE_LOCALE_T */
1101+
elog(ERROR, "wcstombs_l is not available");
1102+
result = 0; /* keep compiler quiet */
1103+
#endif /* HAVE_LOCALE_T */
1104+
}
1105+
1106+
return result;
1107+
}
1108+
1109+
/*
1110+
* char2wchar --- convert multibyte characters to wide characters
1111+
*
1112+
* This has almost the API of mbstowcs_l(), except that *from need not be
1113+
* null-terminated; instead, the number of input bytes is specified as
1114+
* fromlen. Also, we ereport() rather than returning -1 for invalid
1115+
* input encoding. tolen is the maximum number of wchar_t's to store at *to.
1116+
* The output will be zero-terminated iff there is room.
1117+
*/
1118+
size_t
1119+
char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
1120+
pg_locale_t locale)
1121+
{
1122+
size_t result;
1123+
1124+
if (tolen == 0)
1125+
return 0;
1126+
1127+
#ifdef WIN32
1128+
/* See WIN32 "Unicode" comment above */
1129+
if (GetDatabaseEncoding() == PG_UTF8)
1130+
{
1131+
/* Win32 API does not work for zero-length input */
1132+
if (fromlen == 0)
1133+
result = 0;
1134+
else
1135+
{
1136+
result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
1137+
/* A zero return is failure */
1138+
if (result == 0)
1139+
result = -1;
1140+
}
1141+
1142+
if (result != -1)
1143+
{
1144+
Assert(result < tolen);
1145+
/* Append trailing null wchar (MultiByteToWideChar() does not) */
1146+
to[result] = 0;
1147+
}
1148+
}
1149+
else
1150+
#endif /* WIN32 */
1151+
{
1152+
/* mbstowcs requires ending '\0' */
1153+
char *str = pnstrdup(from, fromlen);
1154+
1155+
if (locale == (pg_locale_t) 0)
1156+
{
1157+
/* Use mbstowcs directly for the default locale */
1158+
result = mbstowcs(to, str, tolen);
1159+
}
1160+
else
1161+
{
1162+
#ifdef HAVE_LOCALE_T
1163+
#ifdef HAVE_WCSTOMBS_L
1164+
/* Use mbstowcs_l for nondefault locales */
1165+
result = mbstowcs_l(to, str, tolen, locale);
1166+
#else /* !HAVE_WCSTOMBS_L */
1167+
/* We have to temporarily set the locale as current ... ugh */
1168+
locale_t save_locale = uselocale(locale);
1169+
1170+
result = mbstowcs(to, str, tolen);
1171+
1172+
uselocale(save_locale);
1173+
#endif /* HAVE_WCSTOMBS_L */
1174+
#else /* !HAVE_LOCALE_T */
1175+
/* Can't have locale != 0 without HAVE_LOCALE_T */
1176+
elog(ERROR, "mbstowcs_l is not available");
1177+
result = 0; /* keep compiler quiet */
1178+
#endif /* HAVE_LOCALE_T */
1179+
}
1180+
1181+
pfree(str);
1182+
}
1183+
1184+
if (result == -1)
1185+
{
1186+
/*
1187+
* Invalid multibyte character encountered. We try to give a useful
1188+
* error message by letting pg_verifymbstr check the string. But it's
1189+
* possible that the string is OK to us, and not OK to mbstowcs ---
1190+
* this suggests that the LC_CTYPE locale is different from the
1191+
* database encoding. Give a generic error message if verifymbstr
1192+
* can't find anything wrong.
1193+
*/
1194+
pg_verifymbstr(from, fromlen, false); /* might not return */
1195+
/* but if it does ... */
1196+
ereport(ERROR,
1197+
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1198+
errmsg("invalid multibyte character for locale"),
1199+
errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
1200+
}
1201+
1202+
return result;
1203+
}
1204+
1205+
#endif /* USE_WIDE_UPPER_LOWER */

0 commit comments

Comments
 (0)