Skip to content

Commit 286a365

Browse files
committed
Support Unicode full case mapping and conversion.
Generate tables from Unicode SpecialCasing.txt to support more sophisticated case mapping behavior:

* support case mappings to multiple codepoints, such as "ß" uppercasing to "SS"
* support conditional case mappings, such as the "final sigma"
* support titlecase variants, such as "dž" uppercasing to "DŽ" but titlecasing to "Dž"

Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com
Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org
Reviewed-by: Peter Eisentraut, Daniel Verite
1 parent 6a9b2a6 commit 286a365

File tree

9 files changed

+3645
-2993
lines changed

9 files changed

+3645
-2993
lines changed

src/backend/utils/adt/pg_locale_builtin.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ size_t
7878
strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
7979
pg_locale_t locale)
8080
{
81-
return unicode_strlower(dest, destsize, src, srclen);
81+
return unicode_strlower(dest, destsize, src, srclen, false);
8282
}
8383

8484
size_t
@@ -93,15 +93,15 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
9393
.prev_alnum = false,
9494
};
9595

96-
return unicode_strtitle(dest, destsize, src, srclen,
96+
return unicode_strtitle(dest, destsize, src, srclen, false,
9797
initcap_wbnext, &wbstate);
9898
}
9999

100100
size_t
101101
strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
102102
pg_locale_t locale)
103103
{
104-
return unicode_strupper(dest, destsize, src, srclen);
104+
return unicode_strupper(dest, destsize, src, srclen, false);
105105
}
106106

107107
pg_locale_t

src/common/unicode/Makefile

+2-2
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ update-unicode: unicode_case_table.h unicode_category_table.h unicode_east_asian
3030
# These files are part of the Unicode Character Database. Download
3131
# them on demand. The dependency on Makefile.global is for
3232
# UNICODE_VERSION.
33-
CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
33+
CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
3434
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
3535

3636
unicode_version.h: generate-unicode_version.pl
@@ -91,4 +91,4 @@ clean:
9191
rm -f $(OBJS) case_test case_test.o category_test category_test.o norm_test norm_test.o
9292

9393
distclean: clean
94-
rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h
94+
rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h

src/common/unicode/case_test.c

+191-11
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,61 @@
1818
#include <wctype.h>
1919

2020
#ifdef USE_ICU
21+
#include <unicode/ucasemap.h>
2122
#include <unicode/uchar.h>
2223
#endif
2324
#include "common/unicode_case.h"
2425
#include "common/unicode_category.h"
2526
#include "common/unicode_version.h"
2627

28+
/*
 * Size in bytes of every fixed conversion buffer used by these tests.  It
 * must be large enough for the longest source or result string plus the
 * terminating NUL.
 */
#define BUFSZ 256

#ifdef USE_ICU
/* ICU case map shared by the ICU comparison tests; NULL until main() opens it */
static UCaseMap *casemap = NULL;
#endif

/*
 * Common signature for the conversion wrappers handed to test_convert(): the
 * converted form of src (srclen bytes) is written into dst (dstsize bytes)
 * and a byte count is returned.
 */
typedef size_t (*TestFunc) (char *dst, size_t dstsize, const char *src,
							ssize_t srclen);
37+
38+
/* simple boundary iterator copied from pg_locale_builtin.c */
39+
struct WordBoundaryState
{
	/* string being scanned for word boundaries */
	const char *str;
	/* upper bound on offset; also the value reported once scanning is done */
	size_t		len;
	/* byte offset of the next character to examine */
	size_t		offset;
	/* false until the first boundary has been reported */
	bool		init;
	/* alphanumeric status recorded at the most recently reported boundary */
	bool		prev_alnum;
};
47+
48+
static size_t
49+
initcap_wbnext(void *state)
50+
{
51+
struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
52+
53+
while (wbstate->offset < wbstate->len &&
54+
wbstate->str[wbstate->offset] != '\0')
55+
{
56+
pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
57+
wbstate->offset);
58+
bool curr_alnum = pg_u_isalnum(u, true);
59+
60+
if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
61+
{
62+
size_t prev_offset = wbstate->offset;
63+
64+
wbstate->init = true;
65+
wbstate->offset += unicode_utf8len(u);
66+
wbstate->prev_alnum = curr_alnum;
67+
return prev_offset;
68+
}
69+
70+
wbstate->offset += unicode_utf8len(u);
71+
}
72+
73+
return wbstate->len;
74+
}
75+
2776
#ifdef USE_ICU
2877

2978
static void
@@ -48,6 +97,54 @@ icu_test_simple(pg_wchar code)
4897
}
4998
}
5099

100+
static void
101+
icu_test_full(char *str)
102+
{
103+
char lower[BUFSZ];
104+
char title[BUFSZ];
105+
char upper[BUFSZ];
106+
char icu_lower[BUFSZ];
107+
char icu_title[BUFSZ];
108+
char icu_upper[BUFSZ];
109+
UErrorCode status;
110+
struct WordBoundaryState wbstate = {
111+
.str = str,
112+
.len = strlen(str),
113+
.offset = 0,
114+
.init = false,
115+
.prev_alnum = false,
116+
};
117+
118+
unicode_strlower(lower, BUFSZ, str, -1, true);
119+
unicode_strtitle(title, BUFSZ, str, -1, true, initcap_wbnext, &wbstate);
120+
unicode_strupper(upper, BUFSZ, str, -1, true);
121+
status = U_ZERO_ERROR;
122+
ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, -1, &status);
123+
status = U_ZERO_ERROR;
124+
ucasemap_utf8ToTitle(casemap, icu_title, BUFSZ, str, -1, &status);
125+
status = U_ZERO_ERROR;
126+
ucasemap_utf8ToUpper(casemap, icu_upper, BUFSZ, str, -1, &status);
127+
128+
if (strcmp(lower, icu_lower) != 0)
129+
{
130+
printf("case_test: str='%s' lower='%s' icu_lower='%s'\n", str, lower,
131+
icu_lower);
132+
exit(1);
133+
}
134+
if (strcmp(title, icu_title) != 0)
135+
{
136+
printf("case_test: str='%s' title='%s' icu_title='%s'\n", str, title,
137+
icu_title);
138+
exit(1);
139+
}
140+
if (strcmp(upper, icu_upper) != 0)
141+
{
142+
printf("case_test: str='%s' upper='%s' icu_upper='%s'\n", str, upper,
143+
icu_upper);
144+
exit(1);
145+
}
146+
}
147+
51148
/*
52149
* Exhaustively compare case mappings with the results from ICU.
53150
*/
@@ -64,6 +161,7 @@ test_icu(void)
64161
if (category != PG_U_UNASSIGNED)
65162
{
66163
uint8_t icu_category = u_charType(code);
164+
char code_str[5] = {0};
67165

68166
if (icu_category == PG_U_UNASSIGNED)
69167
{
@@ -72,6 +170,9 @@ test_icu(void)
72170
}
73171

74172
icu_test_simple(code);
173+
unicode_to_utf8(code, (unsigned char *) code_str);
174+
icu_test_full(code_str);
175+
75176
successful++;
76177
}
77178
}
@@ -86,7 +187,7 @@ test_icu(void)
86187
#endif
87188

88189
static void
89-
test_strlower(const char *test_string, const char *expected)
190+
test_convert(TestFunc tfunc, const char *test_string, const char *expected)
90191
{
91192
size_t src1len = strlen(test_string);
92193
size_t src2len = -1; /* NUL-terminated */
@@ -102,10 +203,11 @@ test_strlower(const char *test_string, const char *expected)
102203

103204
/* neither source nor destination are NUL-terminated */
104205
memset(dst1, 0x7F, dst1len);
105-
needed = unicode_strlower(dst1, dst1len, src1, src1len);
206+
needed = tfunc(dst1, dst1len, src1, src1len);
106207
if (needed != strlen(expected))
107208
{
108-
printf("case_test: convert_case test1 FAILURE: needed %zu\n", needed);
209+
printf("case_test: convert_case test1 FAILURE: '%s' needed %zu expected %zu\n",
210+
test_string, needed, strlen(expected));
109211
exit(1);
110212
}
111213
if (memcmp(dst1, expected, dst1len) != 0)
@@ -117,10 +219,11 @@ test_strlower(const char *test_string, const char *expected)
117219

118220
/* destination is NUL-terminated and source is not */
119221
memset(dst2, 0x7F, dst2len);
120-
needed = unicode_strlower(dst2, dst2len, src1, src1len);
222+
needed = tfunc(dst2, dst2len, src1, src1len);
121223
if (needed != strlen(expected))
122224
{
123-
printf("case_test: convert_case test2 FAILURE: needed %zu\n", needed);
225+
printf("case_test: convert_case test2 FAILURE: '%s' needed %zu expected %zu\n",
226+
test_string, needed, strlen(expected));
124227
exit(1);
125228
}
126229
if (strcmp(dst2, expected) != 0)
@@ -132,9 +235,11 @@ test_strlower(const char *test_string, const char *expected)
132235

133236
/* source is NUL-terminated and destination is not */
134237
memset(dst1, 0x7F, dst1len);
135-
needed = unicode_strlower(dst1, dst1len, src2, src2len);
238+
needed = tfunc(dst1, dst1len, src2, src2len);
136239
if (needed != strlen(expected))
137240
{
241+
printf("case_test: convert_case test3 FAILURE: '%s' needed %zu expected %zu\n",
242+
test_string, needed, strlen(expected));
138243
printf("case_test: convert_case test3 FAILURE: needed %zu\n", needed);
139244
exit(1);
140245
}
@@ -147,10 +252,11 @@ test_strlower(const char *test_string, const char *expected)
147252

148253
/* both source and destination are NUL-terminated */
149254
memset(dst2, 0x7F, dst2len);
150-
needed = unicode_strlower(dst2, dst2len, src2, src2len);
255+
needed = tfunc(dst2, dst2len, src2, src2len);
151256
if (needed != strlen(expected))
152257
{
153-
printf("case_test: convert_case test4 FAILURE: needed %zu\n", needed);
258+
printf("case_test: convert_case test4 FAILURE: '%s' needed %zu expected %zu\n",
259+
test_string, needed, strlen(expected));
154260
exit(1);
155261
}
156262
if (strcmp(dst2, expected) != 0)
@@ -166,22 +272,92 @@ test_strlower(const char *test_string, const char *expected)
166272
free(dst2);
167273
}
168274

275+
/*
 * TestFunc adapter for unicode_strlower().
 *
 * Forwards all arguments unchanged and passes true as the final argument
 * (presumably selecting full case mapping -- confirm against
 * common/unicode_case.h).  Returns whatever unicode_strlower() returns.
 */
static size_t
tfunc_lower(char *dst, size_t dstsize, const char *src,
			ssize_t srclen)
{
	size_t		needed = unicode_strlower(dst, dstsize, src, srclen, true);

	return needed;
}
281+
282+
static size_t
283+
tfunc_title(char *dst, size_t dstsize, const char *src,
284+
ssize_t srclen)
285+
{
286+
struct WordBoundaryState wbstate = {
287+
.str = src,
288+
.len = srclen,
289+
.offset = 0,
290+
.init = false,
291+
.prev_alnum = false,
292+
};
293+
294+
return unicode_strtitle(dst, dstsize, src, srclen, true, initcap_wbnext,
295+
&wbstate);
296+
}
297+
298+
/*
 * TestFunc adapter for unicode_strupper().
 *
 * Forwards all arguments unchanged and passes true as the final argument
 * (presumably selecting full case mapping -- confirm against
 * common/unicode_case.h).  Returns whatever unicode_strupper() returns.
 */
static size_t
tfunc_upper(char *dst, size_t dstsize, const char *src,
			ssize_t srclen)
{
	size_t		needed = unicode_strupper(dst, dstsize, src, srclen, true);

	return needed;
}
304+
305+
169306
static void
170307
test_convert_case()
171308
{
172309
/* test string with no case changes */
173-
test_strlower("√∞", "√∞");
310+
test_convert(tfunc_lower, "√∞", "√∞");
311+
/* test adjust-to-cased behavior */
312+
test_convert(tfunc_title, "abc 123xyz", "Abc 123xyz");
174313
/* test string with case changes */
175-
test_strlower("ABC", "abc");
314+
test_convert(tfunc_upper, "abc", "ABC");
176315
/* test string with case changes and byte length changes */
177-
test_strlower("ȺȺȺ", "ⱥⱥⱥ");
316+
test_convert(tfunc_lower, "ȺȺȺ", "ⱥⱥⱥ");
317+
/* test special case conversions */
318+
test_convert(tfunc_upper, "ß", "SS");
319+
test_convert(tfunc_lower, "ıiIİ", "ıiii\u0307");
320+
test_convert(tfunc_upper, "ıiIİ", "IIIİ");
321+
/* test final sigma */
322+
test_convert(tfunc_lower, "σςΣ ΣΣΣ", "σςς σσς");
323+
test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
324+
test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
325+
326+
#ifdef USE_ICU
327+
icu_test_full("");
328+
icu_test_full("ȺȺȺ");
329+
icu_test_full("ßßß");
330+
icu_test_full("√∞");
331+
icu_test_full("a b");
332+
icu_test_full("abc 123xyz");
333+
icu_test_full("σςΣ ΣΣΣ");
334+
icu_test_full("ıiIİ");
335+
/* test <alpha><iota_subscript><acute> */
336+
icu_test_full("\u0391\u0345\u0301");
337+
#endif
178338

179339
printf("case_test: convert_case: success\n");
180340
}
181341

182342
int
183343
main(int argc, char **argv)
184344
{
345+
#ifdef USE_ICU
346+
UErrorCode status = U_ZERO_ERROR;
347+
348+
/*
349+
* Disable ICU's word break adjustment for titlecase to match the expected
350+
* behavior of unicode_strtitle().
351+
*/
352+
casemap = ucasemap_open("und", U_TITLECASE_NO_BREAK_ADJUSTMENT, &status);
353+
if (U_FAILURE(status))
354+
{
355+
printf("case_test: failure opening UCaseMap: %s\n",
356+
u_errorName(status));
357+
exit(1);
358+
}
359+
#endif
360+
185361
printf("case_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);
186362
#ifdef USE_ICU
187363
printf("case_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
@@ -191,5 +367,9 @@ main(int argc, char **argv)
191367
#endif
192368

193369
test_convert_case();
370+
371+
#ifdef USE_ICU
372+
ucasemap_close(casemap);
373+
#endif
194374
exit(0);
195375
}

0 commit comments

Comments
 (0)