Skip to content

Commit 90260e2

Browse files
committed
Fix INITCAP() word boundaries for PG_UNICODE_FAST.
Word boundaries are based on whether a character is alphanumeric or not. For the PG_UNICODE_FAST collation, alphanumeric includes non-ASCII digits; whereas for the PG_C_UTF8 collation, it only includes digits 0-9. Pass down the right information from the pg_locale_t into initcap_wbnext to differentiate the behavior. Reported-by: Noah Misch <noah@leadboat.com> Reviewed-by: Noah Misch <noah@leadboat.com> Discussion: https://postgr.es/m/20250417135841.33.nmisch@google.com
1 parent 80b727e commit 90260e2

File tree

4 files changed

+23
-4
lines changed

4 files changed

+23
-4
lines changed

src/backend/utils/adt/pg_locale_builtin.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ struct WordBoundaryState
4040
const char *str;
4141
size_t len;
4242
size_t offset;
43+
bool posix;
4344
bool init;
4445
bool prev_alnum;
4546
};
@@ -58,7 +59,7 @@ initcap_wbnext(void *state)
5859
{
5960
pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
6061
wbstate->offset);
61-
bool curr_alnum = pg_u_isalnum(u, true);
62+
bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
6263

6364
if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
6465
{
@@ -92,6 +93,7 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
9293
.str = src,
9394
.len = srclen,
9495
.offset = 0,
96+
.posix = !locale->info.builtin.casemap_full,
9597
.init = false,
9698
.prev_alnum = false,
9799
};

src/common/unicode/case_test.c

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ struct WordBoundaryState
4141
const char *str;
4242
size_t len;
4343
size_t offset;
44+
bool posix;
4445
bool init;
4546
bool prev_alnum;
4647
};
@@ -55,7 +56,7 @@ initcap_wbnext(void *state)
5556
{
5657
pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
5758
wbstate->offset);
58-
bool curr_alnum = pg_u_isalnum(u, true);
59+
bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
5960

6061
if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
6162
{
@@ -112,10 +113,13 @@ icu_test_full(char *str)
112113
char icu_upper[BUFSZ];
113114
char icu_fold[BUFSZ];
114115
UErrorCode status;
116+
117+
/* full case mapping doesn't use posix semantics */
115118
struct WordBoundaryState wbstate = {
116119
.str = str,
117120
.len = strlen(str),
118121
.offset = 0,
122+
.posix = false,
119123
.init = false,
120124
.prev_alnum = false,
121125
};
@@ -344,6 +348,12 @@ test_convert_case()
344348
test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
345349
test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
346350
test_convert(tfunc_fold, "σςΣ ΣΣΣ", "σσσ σσσ");
351+
/* test that alphanumerics are word characters */
352+
test_convert(tfunc_title, "λλ", "Λλ");
353+
test_convert(tfunc_title, "1a", "1a");
354+
/* U+FF11 FULLWIDTH ONE is alphanumeric for full case mapping */
355+
test_convert(tfunc_title, "\uFF11a", "\uFF11a");
356+
347357

348358
#ifdef USE_ICU
349359
icu_test_full("");
@@ -354,6 +364,7 @@ test_convert_case()
354364
icu_test_full("abc 123xyz");
355365
icu_test_full("σςΣ ΣΣΣ");
356366
icu_test_full("ıiIİ");
367+
icu_test_full("\uFF11a");
357368
/* test <alpha><iota_subscript><acute> */
358369
icu_test_full("\u0391\u0345\u0301");
359370
#endif

src/test/regress/expected/collate.utf8.out

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ INSERT INTO test_pg_c_utf8 VALUES
5252
('abc DEF 123abc'),
5353
('ábc sßs ßss DÉF'),
5454
('ǄxxǄ ǆxxǅ ǅxxǆ'),
55+
(U&'Λλ 1a \FF11a'),
5556
('ȺȺȺ'),
5657
('ⱥⱥⱥ'),
5758
('ⱥȺ');
@@ -67,10 +68,11 @@ SELECT
6768
abc DEF 123abc | abc def 123abc | Abc Def 123abc | ABC DEF 123ABC | 14 | 14 | 14 | 14
6869
ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs ßss Déf | ÁBC SßS ßSS DÉF | 19 | 19 | 19 | 19
6970
ǄxxǄ ǆxxǅ ǅxxǆ | ǆxxǆ ǆxxǆ ǆxxǆ | Ǆxxǆ Ǆxxǆ Ǆxxǆ | ǄXXǄ ǄXXǄ ǄXXǄ | 20 | 20 | 20 | 20
71+
Λλ 1a １a | λλ 1a １a | Λλ 1a １A | ΛΛ 1A １A | 12 | 12 | 12 | 12
7072
ȺȺȺ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 6 | 9 | 8 | 6
7173
ⱥⱥⱥ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 9 | 9 | 8 | 6
7274
ⱥȺ | ⱥⱥ | Ⱥⱥ | ȺȺ | 5 | 6 | 5 | 4
73-
(6 rows)
75+
(7 rows)
7476

7577
DROP TABLE test_pg_c_utf8;
7678
-- negative test: Final_Sigma not used for builtin locale C.UTF-8
@@ -182,6 +184,7 @@ INSERT INTO test_pg_unicode_fast VALUES
182184
('abc DEF 123abc'),
183185
('ábc sßs ßss DÉF'),
184186
('ǄxxǄ ǆxxǅ ǅxxǆ'),
187+
(U&'Λλ 1a \FF11a'),
185188
('ȺȺȺ'),
186189
('ⱥⱥⱥ'),
187190
('ⱥȺ');
@@ -197,10 +200,11 @@ SELECT
197200
abc DEF 123abc | abc def 123abc | Abc Def 123abc | ABC DEF 123ABC | 14 | 14 | 14 | 14
198201
ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs Ssss Déf | ÁBC SSSS SSSS DÉF | 19 | 19 | 19 | 19
199202
ǄxxǄ ǆxxǅ ǅxxǆ | ǆxxǆ ǆxxǆ ǆxxǆ | ǅxxǆ ǅxxǆ ǅxxǆ | ǄXXǄ ǄXXǄ ǄXXǄ | 20 | 20 | 20 | 20
203+
Λλ 1a １a | λλ 1a １a | Λλ 1a １a | ΛΛ 1A １A | 12 | 12 | 12 | 12
200204
ȺȺȺ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 6 | 9 | 8 | 6
201205
ⱥⱥⱥ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 9 | 9 | 8 | 6
202206
ⱥȺ | ⱥⱥ | Ⱥⱥ | ȺȺ | 5 | 6 | 5 | 4
203-
(6 rows)
207+
(7 rows)
204208

205209
DROP TABLE test_pg_unicode_fast;
206210
-- test Final_Sigma

src/test/regress/sql/collate.utf8.sql

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ INSERT INTO test_pg_c_utf8 VALUES
4545
('abc DEF 123abc'),
4646
('ábc sßs ßss DÉF'),
4747
('ǄxxǄ ǆxxǅ ǅxxǆ'),
48+
(U&'Λλ 1a \FF11a'),
4849
('ȺȺȺ'),
4950
('ⱥⱥⱥ'),
5051
('ⱥȺ');
@@ -100,6 +101,7 @@ INSERT INTO test_pg_unicode_fast VALUES
100101
('abc DEF 123abc'),
101102
('ábc sßs ßss DÉF'),
102103
('ǄxxǄ ǆxxǅ ǅxxǆ'),
104+
(U&'Λλ 1a \FF11a'),
103105
('ȺȺȺ'),
104106
('ⱥⱥⱥ'),
105107
('ⱥȺ');

0 commit comments

Comments
 (0)