1
1
/*-------------------------------------------------------------------------
2
2
* category_test.c
3
- * Program to test Unicode general category functions .
3
+ * Program to test Unicode general category and character properties.
4
4
*
5
5
* Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group
6
6
*
14
14
#include <stdio.h>
15
15
#include <stdlib.h>
16
16
#include <string.h>
17
+ #include <wctype.h>
17
18
18
19
#ifdef USE_ICU
19
20
#include <unicode/uchar.h>
20
21
#endif
22
+
21
23
#include "common/unicode_category.h"
22
24
#include "common/unicode_version.h"
23
25
26
+ static int pg_unicode_version = 0 ;
27
+ #ifdef USE_ICU
28
+ static int icu_unicode_version = 0 ;
29
+ #endif
30
+
24
31
/*
25
32
* Parse version into integer for easy comparison.
26
33
*/
27
- #ifdef USE_ICU
28
34
static int
29
35
parse_unicode_version (const char * version )
30
36
{
@@ -39,57 +45,175 @@ parse_unicode_version(const char *version)
39
45
40
46
return major * 100 + minor ;
41
47
}
42
- #endif
43
48
49
+ #ifdef USE_ICU
44
50
/*
45
- * Exhaustively test that the Unicode category for each codepoint matches that
46
- * returned by ICU.
51
+ * Test Postgres Unicode tables by comparing with ICU. Test the General
52
+ * Category, the binary properties Alphabetic, Lowercase, Uppercase, Cased,
53
+ * Case_Ignorable, White_Space, Hex_Digit, Join_Control, and the pg_u_is*() classes.
47
54
*/
48
- int
49
- main ( int argc , char * * argv )
55
+ static void
56
+ icu_test ( )
50
57
{
51
- #ifdef USE_ICU
52
- int pg_unicode_version = parse_unicode_version (PG_UNICODE_VERSION );
53
- int icu_unicode_version = parse_unicode_version (U_UNICODE_VERSION );
58
+ int successful = 0 ;
54
59
int pg_skipped_codepoints = 0 ;
55
60
int icu_skipped_codepoints = 0 ;
56
61
57
- printf ("category_test: Postgres Unicode version:\t%s\n" , PG_UNICODE_VERSION );
58
- printf ("category_test: ICU Unicode version:\t\t%s\n" , U_UNICODE_VERSION );
59
-
60
- for (UChar32 code = 0 ; code <= 0x10ffff ; code ++ )
62
+ for (pg_wchar code = 0 ; code <= 0x10ffff ; code ++ )
61
63
{
62
64
uint8_t pg_category = unicode_category (code );
63
65
uint8_t icu_category = u_charType (code );
64
66
67
+ /* Property tests */
68
+ bool prop_alphabetic = pg_u_prop_alphabetic (code );
69
+ bool prop_lowercase = pg_u_prop_lowercase (code );
70
+ bool prop_uppercase = pg_u_prop_uppercase (code );
71
+ bool prop_cased = pg_u_prop_cased (code );
72
+ bool prop_case_ignorable = pg_u_prop_case_ignorable (code );
73
+ bool prop_white_space = pg_u_prop_white_space (code );
74
+ bool prop_hex_digit = pg_u_prop_hex_digit (code );
75
+ bool prop_join_control = pg_u_prop_join_control (code );
76
+
77
+ bool icu_prop_alphabetic = u_hasBinaryProperty (
78
+ code , UCHAR_ALPHABETIC );
79
+ bool icu_prop_lowercase = u_hasBinaryProperty (
80
+ code , UCHAR_LOWERCASE );
81
+ bool icu_prop_uppercase = u_hasBinaryProperty (
82
+ code , UCHAR_UPPERCASE );
83
+ bool icu_prop_cased = u_hasBinaryProperty (
84
+ code , UCHAR_CASED );
85
+ bool icu_prop_case_ignorable = u_hasBinaryProperty (
86
+ code , UCHAR_CASE_IGNORABLE );
87
+ bool icu_prop_white_space = u_hasBinaryProperty (
88
+ code , UCHAR_WHITE_SPACE );
89
+ bool icu_prop_hex_digit = u_hasBinaryProperty (
90
+ code , UCHAR_HEX_DIGIT );
91
+ bool icu_prop_join_control = u_hasBinaryProperty (
92
+ code , UCHAR_JOIN_CONTROL );
93
+
94
+ /*
95
+ * Compare with ICU for character classes using:
96
+ *
97
+ * https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/uchar_8h.html#details
98
+ *
99
+ * which describes how to use ICU to test for membership in regex
100
+ * character classes.
101
+ *
102
+ * NB: the document suggests testing for some properties such as
103
+ * UCHAR_POSIX_ALNUM, but that doesn't mean that we're testing for the
104
+ * "POSIX Compatible" character classes.
105
+ */
106
+ bool isalpha = pg_u_isalpha (code );
107
+ bool islower = pg_u_islower (code );
108
+ bool isupper = pg_u_isupper (code );
109
+ bool ispunct = pg_u_ispunct (code , false);
110
+ bool isdigit = pg_u_isdigit (code , false);
111
+ bool isxdigit = pg_u_isxdigit (code , false);
112
+ bool isalnum = pg_u_isalnum (code , false);
113
+ bool isspace = pg_u_isspace (code );
114
+ bool isblank = pg_u_isblank (code );
115
+ bool iscntrl = pg_u_iscntrl (code );
116
+ bool isgraph = pg_u_isgraph (code );
117
+ bool isprint = pg_u_isprint (code );
118
+
119
+ bool icu_isalpha = u_isUAlphabetic (code );
120
+ bool icu_islower = u_isULowercase (code );
121
+ bool icu_isupper = u_isUUppercase (code );
122
+ bool icu_ispunct = u_ispunct (code );
123
+ bool icu_isdigit = u_isdigit (code );
124
+ bool icu_isxdigit = u_hasBinaryProperty (code ,
125
+ UCHAR_POSIX_XDIGIT );
126
+ bool icu_isalnum = u_hasBinaryProperty (code ,
127
+ UCHAR_POSIX_ALNUM );
128
+ bool icu_isspace = u_isUWhiteSpace (code );
129
+ bool icu_isblank = u_isblank (code );
130
+ bool icu_iscntrl = icu_category == PG_U_CONTROL ;
131
+ bool icu_isgraph = u_hasBinaryProperty (code ,
132
+ UCHAR_POSIX_GRAPH );
133
+ bool icu_isprint = u_hasBinaryProperty (code ,
134
+ UCHAR_POSIX_PRINT );
135
+
136
+ /*
137
+ * A version mismatch means that some assigned codepoints in the newer
138
+ * version may be unassigned in the older version. That's OK, though
139
+ * the test will not cover those codepoints marked unassigned in the
140
+ * older version (that is, it will no longer be an exhaustive test).
141
+ */
142
+ if (pg_category == PG_U_UNASSIGNED &&
143
+ icu_category != PG_U_UNASSIGNED &&
144
+ pg_unicode_version < icu_unicode_version )
145
+ {
146
+ pg_skipped_codepoints ++ ;
147
+ continue ;
148
+ }
149
+
150
+ if (icu_category == PG_U_UNASSIGNED &&
151
+ pg_category != PG_U_UNASSIGNED &&
152
+ icu_unicode_version < pg_unicode_version )
153
+ {
154
+ icu_skipped_codepoints ++ ;
155
+ continue ;
156
+ }
157
+
65
158
if (pg_category != icu_category )
66
159
{
67
- /*
68
- * A version mismatch means that some assigned codepoints in the
69
- * newer version may be unassigned in the older version. That's
70
- * OK, though the test will not cover those codepoints marked
71
- * unassigned in the older version (that is, it will no longer be
72
- * an exhaustive test).
73
- */
74
- if (pg_category == PG_U_UNASSIGNED &&
75
- pg_unicode_version < icu_unicode_version )
76
- pg_skipped_codepoints ++ ;
77
- else if (icu_category == PG_U_UNASSIGNED &&
78
- icu_unicode_version < pg_unicode_version )
79
- icu_skipped_codepoints ++ ;
80
- else
81
- {
82
- printf ("category_test: FAILURE for codepoint 0x%06x\n" , code );
83
- printf ("category_test: Postgres category: %02d %s %s\n" , pg_category ,
84
- unicode_category_abbrev (pg_category ),
85
- unicode_category_string (pg_category ));
86
- printf ("category_test: ICU category: %02d %s %s\n" , icu_category ,
87
- unicode_category_abbrev (icu_category ),
88
- unicode_category_string (icu_category ));
89
- printf ("\n" );
90
- exit (1 );
91
- }
160
+ printf ("category_test: FAILURE for codepoint 0x%06x\n" , code );
161
+ printf ("category_test: Postgres category: %02d %s %s\n" , pg_category ,
162
+ unicode_category_abbrev (pg_category ),
163
+ unicode_category_string (pg_category ));
164
+ printf ("category_test: ICU category: %02d %s %s\n" , icu_category ,
165
+ unicode_category_abbrev (icu_category ),
166
+ unicode_category_string (icu_category ));
167
+ printf ("\n" );
168
+ exit (1 );
169
+ }
170
+
171
+ if (prop_alphabetic != icu_prop_alphabetic ||
172
+ prop_lowercase != icu_prop_lowercase ||
173
+ prop_uppercase != icu_prop_uppercase ||
174
+ prop_cased != icu_prop_cased ||
175
+ prop_case_ignorable != icu_prop_case_ignorable ||
176
+ prop_white_space != icu_prop_white_space ||
177
+ prop_hex_digit != icu_prop_hex_digit ||
178
+ prop_join_control != icu_prop_join_control )
179
+ {
180
+ printf ("category_test: FAILURE for codepoint 0x%06x\n" , code );
181
+ printf ("category_test: Postgres property alphabetic/lowercase/uppercase/cased/case_ignorable/white_space/hex_digit/join_control: %d/%d/%d/%d/%d/%d/%d/%d\n" ,
182
+ prop_alphabetic , prop_lowercase , prop_uppercase ,
183
+ prop_cased , prop_case_ignorable ,
184
+ prop_white_space , prop_hex_digit , prop_join_control );
185
+ printf ("category_test: ICU property alphabetic/lowercase/uppercase/cased/case_ignorable/white_space/hex_digit/join_control: %d/%d/%d/%d/%d/%d/%d/%d\n" ,
186
+ icu_prop_alphabetic , icu_prop_lowercase , icu_prop_uppercase ,
187
+ icu_prop_cased , icu_prop_case_ignorable ,
188
+ icu_prop_white_space , icu_prop_hex_digit , icu_prop_join_control );
189
+ printf ("\n" );
190
+ exit (1 );
92
191
}
192
+
193
+ if (isalpha != icu_isalpha ||
194
+ islower != icu_islower ||
195
+ isupper != icu_isupper ||
196
+ ispunct != icu_ispunct ||
197
+ isdigit != icu_isdigit ||
198
+ isxdigit != icu_isxdigit ||
199
+ isalnum != icu_isalnum ||
200
+ isspace != icu_isspace ||
201
+ isblank != icu_isblank ||
202
+ iscntrl != icu_iscntrl ||
203
+ isgraph != icu_isgraph ||
204
+ isprint != icu_isprint )
205
+ {
206
+ printf ("category_test: FAILURE for codepoint 0x%06x\n" , code );
207
+ printf ("category_test: Postgres class alpha/lower/upper/punct/digit/xdigit/alnum/space/blank/cntrl/graph/print: %d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d\n" ,
208
+ isalpha , islower , isupper , ispunct , isdigit , isxdigit , isalnum , isspace , isblank , iscntrl , isgraph , isprint );
209
+ printf ("category_test: ICU class alpha/lower/upper/punct/digit/xdigit/alnum/space/blank/cntrl/graph/print: %d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d\n" ,
210
+ icu_isalpha , icu_islower , icu_isupper , icu_ispunct , icu_isdigit , icu_isxdigit , icu_isalnum , icu_isspace , icu_isblank , icu_iscntrl , icu_isgraph , icu_isprint );
211
+ printf ("\n" );
212
+ exit (1 );
213
+ }
214
+
215
+ if (pg_category != PG_U_UNASSIGNED )
216
+ successful ++ ;
93
217
}
94
218
95
219
if (pg_skipped_codepoints > 0 )
@@ -99,10 +223,22 @@ main(int argc, char **argv)
99
223
printf ("category_test: skipped %d codepoints unassigned in ICU due to Unicode version mismatch\n" ,
100
224
icu_skipped_codepoints );
101
225
102
- printf ("category_test: success\n" );
103
- exit (0 );
226
+ printf ("category_test: ICU test: %d codepoints successful\n" , successful );
227
+ }
228
+ #endif
229
+
230
+ int
231
+ main (int argc , char * * argv )
232
+ {
233
+ pg_unicode_version = parse_unicode_version (PG_UNICODE_VERSION );
234
+ printf ("category_test: Postgres Unicode version:\t%s\n" , PG_UNICODE_VERSION );
235
+
236
+ #ifdef USE_ICU
237
+ icu_unicode_version = parse_unicode_version (U_UNICODE_VERSION );
238
+ printf ("category_test: ICU Unicode version:\t\t%s\n" , U_UNICODE_VERSION );
239
+
240
+ icu_test ();
104
241
#else
105
- printf ("category_test: ICU support required for test; skipping\n" );
106
- exit (0 );
242
+ printf ("category_test: ICU not available; skipping\n" );
107
243
#endif
108
244
}
0 commit comments