18
18
#include <wctype.h>
19
19
20
20
#ifdef USE_ICU
21
+ #include <unicode/ucasemap.h>
21
22
#include <unicode/uchar.h>
22
23
#endif
23
24
#include "common/unicode_case.h"
24
25
#include "common/unicode_category.h"
25
26
#include "common/unicode_version.h"
26
27
28
/*
 * Size of the fixed-length scratch buffers used by the ICU comparison test:
 * enough to hold largest source or result string, including NUL.
 */
#define BUFSZ 256

#ifdef USE_ICU
/*
 * ICU case-mapping handle used by the ucasemap_utf8To*() calls in this file.
 * It is opened once in main() and closed there before exit.
 */
static UCaseMap *casemap = NULL;
#endif

/*
 * Common signature for the conversion wrappers (tfunc_lower, tfunc_title,
 * tfunc_upper) so that test_convert() can exercise any of them.  The return
 * value is compared by test_convert() against the length of the expected
 * result string.
 */
typedef size_t (*TestFunc) (char *dst, size_t dstsize, const char *src,
							ssize_t srclen);
37
+
38
/*
 * Simple boundary iterator copied from pg_locale_builtin.c.
 *
 * Holds the cursor for initcap_wbnext(), which reports the byte offset at
 * which each run of alphanumeric or non-alphanumeric characters begins.
 */
struct WordBoundaryState
{
	const char *str;			/* text being scanned */
	size_t		len;			/* upper bound on bytes of str to scan */
	size_t		offset;			/* byte offset of the next character to look at */
	bool		init;			/* false until the first boundary is reported */
	bool		prev_alnum;		/* alnum status of the last boundary character */
};
47
+
48
+ static size_t
49
+ initcap_wbnext (void * state )
50
+ {
51
+ struct WordBoundaryState * wbstate = (struct WordBoundaryState * ) state ;
52
+
53
+ while (wbstate -> offset < wbstate -> len &&
54
+ wbstate -> str [wbstate -> offset ] != '\0' )
55
+ {
56
+ pg_wchar u = utf8_to_unicode ((unsigned char * ) wbstate -> str +
57
+ wbstate -> offset );
58
+ bool curr_alnum = pg_u_isalnum (u , true);
59
+
60
+ if (!wbstate -> init || curr_alnum != wbstate -> prev_alnum )
61
+ {
62
+ size_t prev_offset = wbstate -> offset ;
63
+
64
+ wbstate -> init = true;
65
+ wbstate -> offset += unicode_utf8len (u );
66
+ wbstate -> prev_alnum = curr_alnum ;
67
+ return prev_offset ;
68
+ }
69
+
70
+ wbstate -> offset += unicode_utf8len (u );
71
+ }
72
+
73
+ return wbstate -> len ;
74
+ }
75
+
27
76
#ifdef USE_ICU
28
77
29
78
static void
@@ -48,6 +97,54 @@ icu_test_simple(pg_wchar code)
48
97
}
49
98
}
50
99
100
+ static void
101
+ icu_test_full (char * str )
102
+ {
103
+ char lower [BUFSZ ];
104
+ char title [BUFSZ ];
105
+ char upper [BUFSZ ];
106
+ char icu_lower [BUFSZ ];
107
+ char icu_title [BUFSZ ];
108
+ char icu_upper [BUFSZ ];
109
+ UErrorCode status ;
110
+ struct WordBoundaryState wbstate = {
111
+ .str = str ,
112
+ .len = strlen (str ),
113
+ .offset = 0 ,
114
+ .init = false,
115
+ .prev_alnum = false,
116
+ };
117
+
118
+ unicode_strlower (lower , BUFSZ , str , -1 , true);
119
+ unicode_strtitle (title , BUFSZ , str , -1 , true, initcap_wbnext , & wbstate );
120
+ unicode_strupper (upper , BUFSZ , str , -1 , true);
121
+ status = U_ZERO_ERROR ;
122
+ ucasemap_utf8ToLower (casemap , icu_lower , BUFSZ , str , -1 , & status );
123
+ status = U_ZERO_ERROR ;
124
+ ucasemap_utf8ToTitle (casemap , icu_title , BUFSZ , str , -1 , & status );
125
+ status = U_ZERO_ERROR ;
126
+ ucasemap_utf8ToUpper (casemap , icu_upper , BUFSZ , str , -1 , & status );
127
+
128
+ if (strcmp (lower , icu_lower ) != 0 )
129
+ {
130
+ printf ("case_test: str='%s' lower='%s' icu_lower='%s'\n" , str , lower ,
131
+ icu_lower );
132
+ exit (1 );
133
+ }
134
+ if (strcmp (title , icu_title ) != 0 )
135
+ {
136
+ printf ("case_test: str='%s' title='%s' icu_title='%s'\n" , str , title ,
137
+ icu_title );
138
+ exit (1 );
139
+ }
140
+ if (strcmp (upper , icu_upper ) != 0 )
141
+ {
142
+ printf ("case_test: str='%s' upper='%s' icu_upper='%s'\n" , str , upper ,
143
+ icu_upper );
144
+ exit (1 );
145
+ }
146
+ }
147
+
51
148
/*
52
149
* Exhaustively compare case mappings with the results from ICU.
53
150
*/
@@ -64,6 +161,7 @@ test_icu(void)
64
161
if (category != PG_U_UNASSIGNED )
65
162
{
66
163
uint8_t icu_category = u_charType (code );
164
+ char code_str [5 ] = {0 };
67
165
68
166
if (icu_category == PG_U_UNASSIGNED )
69
167
{
@@ -72,6 +170,9 @@ test_icu(void)
72
170
}
73
171
74
172
icu_test_simple (code );
173
+ unicode_to_utf8 (code , (unsigned char * ) code_str );
174
+ icu_test_full (code_str );
175
+
75
176
successful ++ ;
76
177
}
77
178
}
@@ -86,7 +187,7 @@ test_icu(void)
86
187
#endif
87
188
88
189
static void
89
- test_strlower ( const char * test_string , const char * expected )
190
+ test_convert ( TestFunc tfunc , const char * test_string , const char * expected )
90
191
{
91
192
size_t src1len = strlen (test_string );
92
193
size_t src2len = -1 ; /* NUL-terminated */
@@ -102,10 +203,11 @@ test_strlower(const char *test_string, const char *expected)
102
203
103
204
/* neither source nor destination are NUL-terminated */
104
205
memset (dst1 , 0x7F , dst1len );
105
- needed = unicode_strlower (dst1 , dst1len , src1 , src1len );
206
+ needed = tfunc (dst1 , dst1len , src1 , src1len );
106
207
if (needed != strlen (expected ))
107
208
{
108
- printf ("case_test: convert_case test1 FAILURE: needed %zu\n" , needed );
209
+ printf ("case_test: convert_case test1 FAILURE: '%s' needed %zu expected %zu\n" ,
210
+ test_string , needed , strlen (expected ));
109
211
exit (1 );
110
212
}
111
213
if (memcmp (dst1 , expected , dst1len ) != 0 )
@@ -117,10 +219,11 @@ test_strlower(const char *test_string, const char *expected)
117
219
118
220
/* destination is NUL-terminated and source is not */
119
221
memset (dst2 , 0x7F , dst2len );
120
- needed = unicode_strlower (dst2 , dst2len , src1 , src1len );
222
+ needed = tfunc (dst2 , dst2len , src1 , src1len );
121
223
if (needed != strlen (expected ))
122
224
{
123
- printf ("case_test: convert_case test2 FAILURE: needed %zu\n" , needed );
225
+ printf ("case_test: convert_case test2 FAILURE: '%s' needed %zu expected %zu\n" ,
226
+ test_string , needed , strlen (expected ));
124
227
exit (1 );
125
228
}
126
229
if (strcmp (dst2 , expected ) != 0 )
@@ -132,9 +235,11 @@ test_strlower(const char *test_string, const char *expected)
132
235
133
236
/* source is NUL-terminated and destination is not */
134
237
memset (dst1 , 0x7F , dst1len );
135
- needed = unicode_strlower (dst1 , dst1len , src2 , src2len );
238
+ needed = tfunc (dst1 , dst1len , src2 , src2len );
136
239
if (needed != strlen (expected ))
137
240
{
241
+ printf ("case_test: convert_case test3 FAILURE: '%s' needed %zu expected %zu\n" ,
242
+ test_string , needed , strlen (expected ));
138
243
printf ("case_test: convert_case test3 FAILURE: needed %zu\n" , needed );
139
244
exit (1 );
140
245
}
@@ -147,10 +252,11 @@ test_strlower(const char *test_string, const char *expected)
147
252
148
253
/* both source and destination are NUL-terminated */
149
254
memset (dst2 , 0x7F , dst2len );
150
- needed = unicode_strlower (dst2 , dst2len , src2 , src2len );
255
+ needed = tfunc (dst2 , dst2len , src2 , src2len );
151
256
if (needed != strlen (expected ))
152
257
{
153
- printf ("case_test: convert_case test4 FAILURE: needed %zu\n" , needed );
258
+ printf ("case_test: convert_case test4 FAILURE: '%s' needed %zu expected %zu\n" ,
259
+ test_string , needed , strlen (expected ));
154
260
exit (1 );
155
261
}
156
262
if (strcmp (dst2 , expected ) != 0 )
@@ -166,22 +272,92 @@ test_strlower(const char *test_string, const char *expected)
166
272
free (dst2 );
167
273
}
168
274
275
/*
 * TestFunc adapter around unicode_strlower().
 *
 * Forwards the buffers and lengths unchanged and always passes true for the
 * final flag (presumably selecting full case mapping -- confirm against
 * common/unicode_case.h).  Returns whatever unicode_strlower() returns.
 */
static size_t
tfunc_lower(char *dst, size_t dstsize, const char *src,
			ssize_t srclen)
{
	size_t		needed;

	needed = unicode_strlower(dst, dstsize, src, srclen, true);

	return needed;
}
281
+
282
+ static size_t
283
+ tfunc_title (char * dst , size_t dstsize , const char * src ,
284
+ ssize_t srclen )
285
+ {
286
+ struct WordBoundaryState wbstate = {
287
+ .str = src ,
288
+ .len = srclen ,
289
+ .offset = 0 ,
290
+ .init = false,
291
+ .prev_alnum = false,
292
+ };
293
+
294
+ return unicode_strtitle (dst , dstsize , src , srclen , true, initcap_wbnext ,
295
+ & wbstate );
296
+ }
297
+
298
/*
 * TestFunc adapter around unicode_strupper().
 *
 * Forwards the buffers and lengths unchanged and always passes true for the
 * final flag (presumably selecting full case mapping -- confirm against
 * common/unicode_case.h).  Returns whatever unicode_strupper() returns.
 */
static size_t
tfunc_upper(char *dst, size_t dstsize, const char *src,
			ssize_t srclen)
{
	size_t		needed;

	needed = unicode_strupper(dst, dstsize, src, srclen, true);

	return needed;
}
304
+
305
+
169
306
static void
170
307
test_convert_case ()
171
308
{
172
309
/* test string with no case changes */
173
- test_strlower ("√∞" , "√∞" );
310
+ test_convert (tfunc_lower , "√∞" , "√∞" );
311
+ /* test adjust-to-cased behavior */
312
+ test_convert (tfunc_title , "abc 123xyz" , "Abc 123xyz" );
174
313
/* test string with case changes */
175
- test_strlower ( "ABC " , "abc " );
314
+ test_convert ( tfunc_upper , "abc " , "ABC " );
176
315
/* test string with case changes and byte length changes */
177
- test_strlower ("ȺȺȺ" , "ⱥⱥⱥ" );
316
+ test_convert (tfunc_lower , "ȺȺȺ" , "ⱥⱥⱥ" );
317
+ /* test special case conversions */
318
+ test_convert (tfunc_upper , "ß" , "SS" );
319
+ test_convert (tfunc_lower , "ıiIİ" , "ıiii\u0307" );
320
+ test_convert (tfunc_upper , "ıiIİ" , "IIIİ" );
321
+ /* test final sigma */
322
+ test_convert (tfunc_lower , "σςΣ ΣΣΣ" , "σςς σσς" );
323
+ test_convert (tfunc_lower , "σς'Σ' ΣΣ'Σ'" , "σς'ς' σσ'ς'" );
324
+ test_convert (tfunc_title , "σςΣ ΣΣΣ" , "Σςς Σσς" );
325
+
326
+ #ifdef USE_ICU
327
+ icu_test_full ("" );
328
+ icu_test_full ("ȺȺȺ" );
329
+ icu_test_full ("ßßß" );
330
+ icu_test_full ("√∞" );
331
+ icu_test_full ("a b" );
332
+ icu_test_full ("abc 123xyz" );
333
+ icu_test_full ("σςΣ ΣΣΣ" );
334
+ icu_test_full ("ıiIİ" );
335
+ /* test <alpha><iota_subscript><acute> */
336
+ icu_test_full ("\u0391\u0345\u0301" );
337
+ #endif
178
338
179
339
printf ("case_test: convert_case: success\n" );
180
340
}
181
341
182
342
int
183
343
main (int argc , char * * argv )
184
344
{
345
+ #ifdef USE_ICU
346
+ UErrorCode status = U_ZERO_ERROR ;
347
+
348
+ /*
349
+ * Disable ICU's word break adjustment for titlecase to match the expected
350
+ * behavior of unicode_strtitle().
351
+ */
352
+ casemap = ucasemap_open ("und" , U_TITLECASE_NO_BREAK_ADJUSTMENT , & status );
353
+ if (U_FAILURE (status ))
354
+ {
355
+ printf ("case_test: failure opening UCaseMap: %s\n" ,
356
+ u_errorName (status ));
357
+ exit (1 );
358
+ }
359
+ #endif
360
+
185
361
printf ("case_test: Postgres Unicode version:\t%s\n" , PG_UNICODE_VERSION );
186
362
#ifdef USE_ICU
187
363
printf ("case_test: ICU Unicode version:\t\t%s\n" , U_UNICODE_VERSION );
@@ -191,5 +367,9 @@ main(int argc, char **argv)
191
367
#endif
192
368
193
369
test_convert_case ();
370
+
371
+ #ifdef USE_ICU
372
+ ucasemap_close (casemap );
373
+ #endif
194
374
exit (0 );
195
375
}
0 commit comments