Skip to content

Commit ad49994

Browse files
committed
Add Unicode property tables.
Provide functions to test for Unicode properties, such as Alphabetic or Cased. These functions use tables derived from Unicode data files, similar to the tables for Unicode normalization or general category, and those tables can be updated with the 'update-unicode' build target. Use Unicode properties to provide functions to test for regex character classes, like 'punct' or 'alnum'. Infrastructure in preparation for a builtin collation provider, and may also be useful for other callers. Discussion: https://postgr.es/m/ff4c2f2f9c8fc7ca27c1c24ae37ecaeaeaff6b53.camel%40j-davis.com Reviewed-by: Daniel Verite, Peter Eisentraut, Jeremy Schneider
1 parent 2ed8f9a commit ad49994

File tree

8 files changed

+4604
-102
lines changed

8 files changed

+4604
-102
lines changed

src/common/unicode/Makefile

+3-3
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,13 @@ update-unicode: unicode_category_table.h unicode_east_asian_fw_table.h unicode_n
2929
# These files are part of the Unicode Character Database. Download
3030
# them on demand. The dependency on Makefile.global is for
3131
# UNICODE_VERSION.
32-
CompositionExclusions.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
32+
CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
3333
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
3434

3535
unicode_version.h: generate-unicode_version.pl
3636
$(PERL) $< --version $(UNICODE_VERSION)
3737

38-
unicode_category_table.h: generate-unicode_category_table.pl UnicodeData.txt
38+
unicode_category_table.h: generate-unicode_category_table.pl DerivedCoreProperties.txt PropList.txt UnicodeData.txt
3939
$(PERL) $<
4040

4141
# Generation of conversion tables used for string normalization with
@@ -82,4 +82,4 @@ clean:
8282
rm -f $(OBJS) category_test category_test.o norm_test norm_test.o
8383

8484
distclean: clean
85-
rm -f CompositionExclusions.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt UnicodeData.txt norm_test_table.h unicode_category_table.h unicode_norm_table.h
85+
rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt norm_test_table.h unicode_category_table.h unicode_norm_table.h

src/common/unicode/README

+35-10
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,35 @@
1-
This directory contains tools to generate the tables in
2-
src/include/common/unicode_norm.h, used for Unicode normalization. The
3-
generated .h file is included in the source tree, so these are normally not
4-
needed to build PostgreSQL, only if you need to re-generate the .h file
5-
from the Unicode data files for some reason, e.g. to update to a new version
6-
of Unicode.
1+
This directory contains tools to download new Unicode data files and
2+
generate static tables. These tables are used to normalize or
3+
determine various properties of Unicode data.
74

8-
Generating unicode_norm_table.h
9-
-------------------------------
5+
The generated header files are copied to src/include/common/, and
6+
included in the source tree, so these tools are not normally required
7+
to build PostgreSQL.
108

11-
Run
9+
Update Unicode Version
10+
----------------------
11+
12+
Edit src/Makefile.global.in and src/common/unicode/meson.build
13+
to update the UNICODE_VERSION.
14+
15+
Then, generate the new header files with:
1216

1317
make update-unicode
1418

15-
from the top level of the source tree and commit the result.
19+
or if using meson:
20+
21+
ninja update-unicode
22+
23+
from the top level of the source tree. Examine the result to make sure
24+
the changes look reasonable (that is, that the diff size and scope are
25+
comparable to the Unicode changes since the last update), and then
26+
commit it.
1627

1728
Tests
1829
-----
1930

31+
Normalization tests:
32+
2033
The Unicode consortium publishes a comprehensive test suite for the
2134
normalization algorithm, in a file called NormalizationTest.txt. This
2235
directory also contains a perl script and some C code, to run our
@@ -26,3 +39,15 @@ To download NormalizationTest.txt and run the tests:
2639
make normalization-check
2740

2841
This is also run as part of the update-unicode target.
42+
43+
Category & Property tests:
44+
45+
The file category_test.c exhaustively compares the category and
46+
properties of each code point as determined by the generated tables
47+
with the category and properties as reported by ICU. For this test to
48+
be effective, the version of the Unicode data files must be similar to
49+
the version of Unicode on which ICU is based, so attempt to match the
50+
versions as closely as possible. With mismatched versions, the test skips over
51+
codepoints that are assigned in one version and not the other, and may
52+
falsely report failures. This test is run as a part of the
53+
update-unicode target.

src/common/unicode/category_test.c

+179-43
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*-------------------------------------------------------------------------
22
* category_test.c
3-
* Program to test Unicode general category functions.
3+
* Program to test Unicode general category and character properties.
44
*
55
* Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group
66
*
@@ -14,17 +14,23 @@
1414
#include <stdio.h>
1515
#include <stdlib.h>
1616
#include <string.h>
17+
#include <wctype.h>
1718

1819
#ifdef USE_ICU
1920
#include <unicode/uchar.h>
2021
#endif
22+
2123
#include "common/unicode_category.h"
2224
#include "common/unicode_version.h"
2325

26+
static int pg_unicode_version = 0;
27+
#ifdef USE_ICU
28+
static int icu_unicode_version = 0;
29+
#endif
30+
2431
/*
2532
* Parse version into integer for easy comparison.
2633
*/
27-
#ifdef USE_ICU
2834
static int
2935
parse_unicode_version(const char *version)
3036
{
@@ -39,57 +45,175 @@ parse_unicode_version(const char *version)
3945

4046
return major * 100 + minor;
4147
}
42-
#endif
4348

49+
#ifdef USE_ICU
4450
/*
45-
* Exhaustively test that the Unicode category for each codepoint matches that
46-
* returned by ICU.
51+
* Test Postgres Unicode tables by comparing with ICU. Test the General
52+
* Category, the properties Alphabetic, Lowercase, Uppercase, Cased,
53+
* Case_Ignorable, White_Space, Hex_Digit, and Join_Control, and the regex character classes.
4754
*/
48-
int
49-
main(int argc, char **argv)
55+
static void
56+
icu_test()
5057
{
51-
#ifdef USE_ICU
52-
int pg_unicode_version = parse_unicode_version(PG_UNICODE_VERSION);
53-
int icu_unicode_version = parse_unicode_version(U_UNICODE_VERSION);
58+
int successful = 0;
5459
int pg_skipped_codepoints = 0;
5560
int icu_skipped_codepoints = 0;
5661

57-
printf("category_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);
58-
printf("category_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
59-
60-
for (UChar32 code = 0; code <= 0x10ffff; code++)
62+
for (pg_wchar code = 0; code <= 0x10ffff; code++)
6163
{
6264
uint8_t pg_category = unicode_category(code);
6365
uint8_t icu_category = u_charType(code);
6466

67+
/* Property tests */
68+
bool prop_alphabetic = pg_u_prop_alphabetic(code);
69+
bool prop_lowercase = pg_u_prop_lowercase(code);
70+
bool prop_uppercase = pg_u_prop_uppercase(code);
71+
bool prop_cased = pg_u_prop_cased(code);
72+
bool prop_case_ignorable = pg_u_prop_case_ignorable(code);
73+
bool prop_white_space = pg_u_prop_white_space(code);
74+
bool prop_hex_digit = pg_u_prop_hex_digit(code);
75+
bool prop_join_control = pg_u_prop_join_control(code);
76+
77+
bool icu_prop_alphabetic = u_hasBinaryProperty(
78+
code, UCHAR_ALPHABETIC);
79+
bool icu_prop_lowercase = u_hasBinaryProperty(
80+
code, UCHAR_LOWERCASE);
81+
bool icu_prop_uppercase = u_hasBinaryProperty(
82+
code, UCHAR_UPPERCASE);
83+
bool icu_prop_cased = u_hasBinaryProperty(
84+
code, UCHAR_CASED);
85+
bool icu_prop_case_ignorable = u_hasBinaryProperty(
86+
code, UCHAR_CASE_IGNORABLE);
87+
bool icu_prop_white_space = u_hasBinaryProperty(
88+
code, UCHAR_WHITE_SPACE);
89+
bool icu_prop_hex_digit = u_hasBinaryProperty(
90+
code, UCHAR_HEX_DIGIT);
91+
bool icu_prop_join_control = u_hasBinaryProperty(
92+
code, UCHAR_JOIN_CONTROL);
93+
94+
/*
95+
* Compare with ICU for character classes using:
96+
*
97+
* https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/uchar_8h.html#details
98+
*
99+
* which describes how to use ICU to test for membership in regex
100+
* character classes.
101+
*
102+
* NB: the document suggests testing for some properties such as
103+
* UCHAR_POSIX_ALNUM, but that doesn't mean that we're testing for the
104+
* "POSIX Compatible" character classes.
105+
*/
106+
bool isalpha = pg_u_isalpha(code);
107+
bool islower = pg_u_islower(code);
108+
bool isupper = pg_u_isupper(code);
109+
bool ispunct = pg_u_ispunct(code, false);
110+
bool isdigit = pg_u_isdigit(code, false);
111+
bool isxdigit = pg_u_isxdigit(code, false);
112+
bool isalnum = pg_u_isalnum(code, false);
113+
bool isspace = pg_u_isspace(code);
114+
bool isblank = pg_u_isblank(code);
115+
bool iscntrl = pg_u_iscntrl(code);
116+
bool isgraph = pg_u_isgraph(code);
117+
bool isprint = pg_u_isprint(code);
118+
119+
bool icu_isalpha = u_isUAlphabetic(code);
120+
bool icu_islower = u_isULowercase(code);
121+
bool icu_isupper = u_isUUppercase(code);
122+
bool icu_ispunct = u_ispunct(code);
123+
bool icu_isdigit = u_isdigit(code);
124+
bool icu_isxdigit = u_hasBinaryProperty(code,
125+
UCHAR_POSIX_XDIGIT);
126+
bool icu_isalnum = u_hasBinaryProperty(code,
127+
UCHAR_POSIX_ALNUM);
128+
bool icu_isspace = u_isUWhiteSpace(code);
129+
bool icu_isblank = u_isblank(code);
130+
bool icu_iscntrl = icu_category == PG_U_CONTROL;
131+
bool icu_isgraph = u_hasBinaryProperty(code,
132+
UCHAR_POSIX_GRAPH);
133+
bool icu_isprint = u_hasBinaryProperty(code,
134+
UCHAR_POSIX_PRINT);
135+
136+
/*
137+
* A version mismatch means that some assigned codepoints in the newer
138+
* version may be unassigned in the older version. That's OK, though
139+
* the test will not cover those codepoints marked unassigned in the
140+
* older version (that is, it will no longer be an exhaustive test).
141+
*/
142+
if (pg_category == PG_U_UNASSIGNED &&
143+
icu_category != PG_U_UNASSIGNED &&
144+
pg_unicode_version < icu_unicode_version)
145+
{
146+
pg_skipped_codepoints++;
147+
continue;
148+
}
149+
150+
if (icu_category == PG_U_UNASSIGNED &&
151+
pg_category != PG_U_UNASSIGNED &&
152+
icu_unicode_version < pg_unicode_version)
153+
{
154+
icu_skipped_codepoints++;
155+
continue;
156+
}
157+
65158
if (pg_category != icu_category)
66159
{
67-
/*
68-
* A version mismatch means that some assigned codepoints in the
69-
* newer version may be unassigned in the older version. That's
70-
* OK, though the test will not cover those codepoints marked
71-
* unassigned in the older version (that is, it will no longer be
72-
* an exhaustive test).
73-
*/
74-
if (pg_category == PG_U_UNASSIGNED &&
75-
pg_unicode_version < icu_unicode_version)
76-
pg_skipped_codepoints++;
77-
else if (icu_category == PG_U_UNASSIGNED &&
78-
icu_unicode_version < pg_unicode_version)
79-
icu_skipped_codepoints++;
80-
else
81-
{
82-
printf("category_test: FAILURE for codepoint 0x%06x\n", code);
83-
printf("category_test: Postgres category: %02d %s %s\n", pg_category,
84-
unicode_category_abbrev(pg_category),
85-
unicode_category_string(pg_category));
86-
printf("category_test: ICU category: %02d %s %s\n", icu_category,
87-
unicode_category_abbrev(icu_category),
88-
unicode_category_string(icu_category));
89-
printf("\n");
90-
exit(1);
91-
}
160+
printf("category_test: FAILURE for codepoint 0x%06x\n", code);
161+
printf("category_test: Postgres category: %02d %s %s\n", pg_category,
162+
unicode_category_abbrev(pg_category),
163+
unicode_category_string(pg_category));
164+
printf("category_test: ICU category: %02d %s %s\n", icu_category,
165+
unicode_category_abbrev(icu_category),
166+
unicode_category_string(icu_category));
167+
printf("\n");
168+
exit(1);
169+
}
170+
171+
if (prop_alphabetic != icu_prop_alphabetic ||
172+
prop_lowercase != icu_prop_lowercase ||
173+
prop_uppercase != icu_prop_uppercase ||
174+
prop_cased != icu_prop_cased ||
175+
prop_case_ignorable != icu_prop_case_ignorable ||
176+
prop_white_space != icu_prop_white_space ||
177+
prop_hex_digit != icu_prop_hex_digit ||
178+
prop_join_control != icu_prop_join_control)
179+
{
180+
printf("category_test: FAILURE for codepoint 0x%06x\n", code);
181+
printf("category_test: Postgres property alphabetic/lowercase/uppercase/cased/case_ignorable/white_space/hex_digit/join_control: %d/%d/%d/%d/%d/%d/%d/%d\n",
182+
prop_alphabetic, prop_lowercase, prop_uppercase,
183+
prop_cased, prop_case_ignorable,
184+
prop_white_space, prop_hex_digit, prop_join_control);
185+
printf("category_test: ICU property alphabetic/lowercase/uppercase/cased/case_ignorable/white_space/hex_digit/join_control: %d/%d/%d/%d/%d/%d/%d/%d\n",
186+
icu_prop_alphabetic, icu_prop_lowercase, icu_prop_uppercase,
187+
icu_prop_cased, icu_prop_case_ignorable,
188+
icu_prop_white_space, icu_prop_hex_digit, icu_prop_join_control);
189+
printf("\n");
190+
exit(1);
92191
}
192+
193+
if (isalpha != icu_isalpha ||
194+
islower != icu_islower ||
195+
isupper != icu_isupper ||
196+
ispunct != icu_ispunct ||
197+
isdigit != icu_isdigit ||
198+
isxdigit != icu_isxdigit ||
199+
isalnum != icu_isalnum ||
200+
isspace != icu_isspace ||
201+
isblank != icu_isblank ||
202+
iscntrl != icu_iscntrl ||
203+
isgraph != icu_isgraph ||
204+
isprint != icu_isprint)
205+
{
206+
printf("category_test: FAILURE for codepoint 0x%06x\n", code);
207+
printf("category_test: Postgres class alpha/lower/upper/punct/digit/xdigit/alnum/space/blank/cntrl/graph/print: %d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d\n",
208+
isalpha, islower, isupper, ispunct, isdigit, isxdigit, isalnum, isspace, isblank, iscntrl, isgraph, isprint);
209+
printf("category_test: ICU class alpha/lower/upper/punct/digit/xdigit/alnum/space/blank/cntrl/graph/print: %d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d\n",
210+
icu_isalpha, icu_islower, icu_isupper, icu_ispunct, icu_isdigit, icu_isxdigit, icu_isalnum, icu_isspace, icu_isblank, icu_iscntrl, icu_isgraph, icu_isprint);
211+
printf("\n");
212+
exit(1);
213+
}
214+
215+
if (pg_category != PG_U_UNASSIGNED)
216+
successful++;
93217
}
94218

95219
if (pg_skipped_codepoints > 0)
@@ -99,10 +223,22 @@ main(int argc, char **argv)
99223
printf("category_test: skipped %d codepoints unassigned in ICU due to Unicode version mismatch\n",
100224
icu_skipped_codepoints);
101225

102-
printf("category_test: success\n");
103-
exit(0);
226+
printf("category_test: ICU test: %d codepoints successful\n", successful);
227+
}
228+
#endif
229+
230+
int
231+
main(int argc, char **argv)
232+
{
233+
pg_unicode_version = parse_unicode_version(PG_UNICODE_VERSION);
234+
printf("category_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);
235+
236+
#ifdef USE_ICU
237+
icu_unicode_version = parse_unicode_version(U_UNICODE_VERSION);
238+
printf("category_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
239+
240+
icu_test();
104241
#else
105-
printf("category_test: ICU support required for test; skipping\n");
106-
exit(0);
242+
printf("category_test: ICU not available; skipping\n");
107243
#endif
108244
}

0 commit comments

Comments
 (0)