Skip to content

Commit 5c40364

Browse files
committed
Unicode case mapping tables and functions.
Implements Unicode simple case mapping, in which all code points map to exactly one other code point unconditionally. These tables are generated from UnicodeData.txt, which is already being used by other infrastructure in src/common/unicode. The tables are checked into the source tree, so they only need to be regenerated when we update the Unicode version. In preparation for the builtin collation provider, and possibly useful for other callers. Discussion: https://postgr.es/m/ff4c2f2f9c8fc7ca27c1c24ae37ecaeaeaff6b53.camel%40j-davis.com Reviewed-by: Peter Eisentraut, Daniel Verite, Jeremy Schneider
1 parent 6d47021 commit 5c40364

11 files changed

+3498
-5
lines changed

src/common/Makefile

+1
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ OBJS_COMMON = \
7878
scram-common.o \
7979
string.o \
8080
stringinfo.o \
81+
unicode_case.o \
8182
unicode_category.o \
8283
unicode_norm.o \
8384
username.o \

src/common/meson.build

+1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ common_sources = files(
3232
'scram-common.c',
3333
'string.c',
3434
'stringinfo.c',
35+
'unicode_case.c',
3536
'unicode_category.c',
3637
'unicode_norm.c',
3738
'username.c',

src/common/unicode/Makefile

+12-3
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,9 @@ CPPFLAGS += $(ICU_CFLAGS)
2121
# By default, do nothing.
2222
all:
2323

24-
update-unicode: unicode_category_table.h unicode_east_asian_fw_table.h unicode_nonspacing_table.h unicode_norm_hashfunc.h unicode_norm_table.h unicode_normprops_table.h unicode_version.h
24+
update-unicode: unicode_case_table.h unicode_category_table.h unicode_east_asian_fw_table.h unicode_nonspacing_table.h unicode_norm_hashfunc.h unicode_norm_table.h unicode_normprops_table.h unicode_version.h
2525
mv $^ $(top_srcdir)/src/include/common/
26+
$(MAKE) case-check
2627
$(MAKE) category-check
2728
$(MAKE) normalization-check
2829

@@ -35,6 +36,9 @@ CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.tx
3536
unicode_version.h: generate-unicode_version.pl
3637
$(PERL) $< --version $(UNICODE_VERSION)
3738

39+
unicode_case_table.h: generate-unicode_case_table.pl UnicodeData.txt
40+
$(PERL) $<
41+
3842
unicode_category_table.h: generate-unicode_category_table.pl DerivedCoreProperties.txt PropList.txt UnicodeData.txt
3943
$(PERL) $<
4044

@@ -55,12 +59,17 @@ unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizat
5559
$(PERL) $^ >$@
5660

5761
# Test suite
62+
case-check: case_test
63+
./case_test
64+
5865
category-check: category_test
5966
./category_test
6067

6168
normalization-check: norm_test
6269
./norm_test
6370

71+
case_test: case_test.o ../unicode_case.o | submake-common
72+
6473
category_test: category_test.o ../unicode_category.o | submake-common
6574

6675
norm_test: norm_test.o ../unicode_norm.o | submake-common
@@ -79,7 +88,7 @@ norm_test_table.h: generate-norm_test_table.pl NormalizationTest.txt
7988

8089

8190
clean:
82-
rm -f $(OBJS) category_test category_test.o norm_test norm_test.o
91+
rm -f $(OBJS) case_test case_test.o category_test category_test.o norm_test norm_test.o
8392

8493
distclean: clean
85-
rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt norm_test_table.h unicode_category_table.h unicode_norm_table.h
94+
rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h

src/common/unicode/case_test.c

+100
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
/*-------------------------------------------------------------------------
2+
* case_test.c
3+
* Program to test Unicode case mapping functions.
4+
*
5+
* Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
6+
*
7+
* IDENTIFICATION
8+
* src/common/unicode/case_test.c
9+
*
10+
*-------------------------------------------------------------------------
11+
*/
12+
#include "postgres_fe.h"
13+
14+
#include <locale.h>
15+
#include <stdio.h>
16+
#include <stdlib.h>
17+
#include <string.h>
18+
#include <wctype.h>
19+
20+
#ifdef USE_ICU
21+
#include <unicode/uchar.h>
22+
#endif
23+
#include "common/unicode_case.h"
24+
#include "common/unicode_category.h"
25+
#include "common/unicode_version.h"
26+
27+
#ifdef USE_ICU
28+
29+
/*
 * Check the simple case mappings of a single code point against ICU.
 *
 * Computes the lowercase, titlecase and uppercase mappings of "code" with
 * unicode_lowercase_simple(), unicode_titlecase_simple() and
 * unicode_uppercase_simple(), and the corresponding ICU results with
 * u_tolower(), u_totitle() and u_toupper().  If any of the three pairs
 * differs, both sets of results are printed and the program terminates
 * with exit status 1; otherwise the function returns silently.
 */
static void
30+
icu_test_simple(pg_wchar code)
31+
{
32+
pg_wchar lower = unicode_lowercase_simple(code);
33+
pg_wchar title = unicode_titlecase_simple(code);
34+
pg_wchar upper = unicode_uppercase_simple(code);
35+
pg_wchar iculower = u_tolower(code);
36+
pg_wchar icutitle = u_totitle(code);
37+
pg_wchar icuupper = u_toupper(code);
38+
39+
/* The first mismatch is fatal: report it and stop the whole test run. */
if (lower != iculower || title != icutitle || upper != icuupper)
40+
{
41+
printf("case_test: FAILURE for codepoint 0x%06x\n", code);
42+
printf("case_test: Postgres lower/title/upper: 0x%06x/0x%06x/0x%06x\n",
43+
lower, title, upper);
44+
printf("case_test: ICU lower/title/upper: 0x%06x/0x%06x/0x%06x\n",
45+
iculower, icutitle, icuupper);
46+
printf("\n");
47+
exit(1);
48+
}
49+
}
50+
51+
/*
 * Run icu_test_simple() over the whole code point range 0 .. 0x10ffff.
 *
 * Code points that unicode_category() reports as PG_U_UNASSIGNED are not
 * tested at all.  Code points that are assigned according to
 * unicode_category() but for which u_charType() yields the unassigned value
 * are counted as skipped rather than tested.  Every remaining code point is
 * checked with icu_test_simple(), which exits the program on a mismatch, so
 * reaching the final printf means all tested code points agreed.
 */
static void
52+
test_icu(void)
53+
{
54+
int successful = 0;
55+
int skipped_mismatch = 0;
56+
57+
for (pg_wchar code = 0; code <= 0x10ffff; code++)
58+
{
59+
pg_unicode_category category = unicode_category(code);
60+
61+
if (category != PG_U_UNASSIGNED)
62+
{
63+
uint8_t icu_category = u_charType(code);
64+
65+
/*
 * NOTE(review): this compares an ICU category value against a PG_U_*
 * constant; presumably both enumerations use the same numeric value for
 * "unassigned" -- confirm against the ICU headers.
 */
if (icu_category == PG_U_UNASSIGNED)
66+
{
67+
skipped_mismatch++;
68+
continue;
69+
}
70+
71+
icu_test_simple(code);
72+
successful++;
73+
}
74+
}
75+
76+
/* Only mention skipped code points when there actually were some. */
if (skipped_mismatch > 0)
77+
printf("case_test: skipped %d codepoints unassigned in ICU due to Unicode version mismatch\n",
78+
skipped_mismatch);
79+
80+
printf("case_test: ICU simple mapping test: %d codepoints successful\n",
81+
successful);
82+
}
83+
#endif
84+
85+
/*
86+
 * Exhaustively compare case mappings with the results from ICU.
 *
 * Prints the Unicode version of the built-in tables and, when built with
 * USE_ICU, the ICU Unicode version before running test_icu().  Without
 * USE_ICU nothing is compared and a "skipping" notice is printed instead.
 * The command-line arguments are not examined.  Always exits with status 0
 * when it gets this far; a mapping mismatch exits with status 1 from within
 * the test itself.
87+
 */
88+
int
89+
main(int argc, char **argv)
90+
{
91+
printf("case_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);
92+
#ifdef USE_ICU
93+
printf("case_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
94+
test_icu();
95+
#else
96+
printf("case_test: ICU not available; skipping\n");
97+
#endif
98+
99+
exit(0);
100+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
#!/usr/bin/perl
2+
#
3+
# Generate Unicode character case mappings. Does not include tailoring
4+
# or locale-specific mappings.
5+
#
6+
# Input: UnicodeData.txt
7+
# Output: unicode_case_table.h
8+
#
9+
# Copyright (c) 2000-2023, PostgreSQL Global Development Group
10+
11+
use strict;
12+
use warnings;
13+
use Getopt::Long;
14+
15+
use FindBin;
16+
use lib "$FindBin::RealBin/../../tools/";
17+
18+
my $output_path = '.';
19+
20+
GetOptions('outdir:s' => \$output_path);
21+
22+
my $output_table_file = "$output_path/unicode_case_table.h";
23+
24+
my $FH;
25+
26+
my %simple = ();
27+
28+
# Read UnicodeData.txt from the output directory and collect, in %simple,
# the simple case mappings of every code point that has at least one.
open($FH, '<', "$output_path/UnicodeData.txt")
29+
or die "Could not open $output_path/UnicodeData.txt: $!.";
30+
while (my $line = <$FH>)
31+
{
32+
# Fields are separated by ';'.  Field 0 is the code point; fields 12, 13
# and 14 are read as the simple uppercase, lowercase and titlecase
# mappings.  All are hexadecimal.  Surrounding whitespace is trimmed
# first (the last field still carries the line's newline); an empty
# field makes hex() return 0, which is treated below as "no mapping".
my @elts = split(';', $line);
33+
my $code = hex($elts[0]);
34+
my $simple_uppercase = hex($elts[12] =~ s/^\s+|\s+$//rg);
35+
my $simple_lowercase = hex($elts[13] =~ s/^\s+|\s+$//rg);
36+
my $simple_titlecase = hex($elts[14] =~ s/^\s+|\s+$//rg);
37+
38+
# Sanity checks: nothing may lie beyond the last Unicode code point.
die "codepoint $code out of range" if $code > 0x10FFFF;
39+
die "Simple_Lowercase $code out of range" if $simple_lowercase > 0x10FFFF;
40+
die "Simple_Titlecase $code out of range" if $simple_titlecase > 0x10FFFF;
41+
die "Simple_Uppercase $code out of range" if $simple_uppercase > 0x10FFFF;
42+
43+
# Record the code point only if at least one mapping is present; any
# mapping that is absent defaults to the code point itself.
if ($simple_lowercase || $simple_titlecase || $simple_uppercase)
44+
{
45+
$simple{$code} = {
46+
Simple_Lowercase => ($simple_lowercase || $code),
47+
Simple_Titlecase => ($simple_titlecase || $code),
48+
Simple_Uppercase => ($simple_uppercase || $code)
49+
};
50+
}
51+
}
52+
close $FH;
53+
54+
# Start writing out the output files
55+
open my $OT, '>', $output_table_file
56+
or die "Could not open output file $output_table_file: $!\n";
57+
58+
# determine size of array given that codepoints < 0x80 are dense and
59+
# the rest of the entries are sparse
60+
# Start with the 0x80 dense slots, then add one slot for every recorded
# code point at or above 0x80.
my $num_simple = 0x80;
61+
foreach my $code (sort { $a <=> $b } (keys %simple))
62+
{
63+
$num_simple++ unless $code < 0x80;
64+
}
65+
66+
print $OT <<"EOS";
67+
/*-------------------------------------------------------------------------
68+
*
69+
* unicode_case_table.h
70+
* Case mapping and information table.
71+
*
72+
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
73+
* Portions Copyright (c) 1994, Regents of the University of California
74+
*
75+
* src/include/common/unicode_case_table.h
76+
*
77+
*-------------------------------------------------------------------------
78+
*/
79+
80+
/*
81+
* File auto-generated by src/common/unicode/generate-unicode_case_table.pl,
82+
* do not edit. There is deliberately not an #ifndef PG_UNICODE_CASE_TABLE_H
83+
* here.
84+
*/
85+
86+
#include "common/unicode_case.h"
87+
#include "mb/pg_wchar.h"
88+
89+
typedef enum
90+
{
91+
CaseLower = 0,
92+
CaseTitle = 1,
93+
CaseUpper = 2,
94+
NCaseKind
95+
} CaseKind;
96+
97+
typedef struct
98+
{
99+
pg_wchar codepoint; /* Unicode codepoint */
100+
pg_wchar simplemap[NCaseKind];
101+
} pg_case_map;
102+
103+
/*
104+
* Case mapping table. Dense for codepoints < 0x80 (enabling fast lookup),
105+
* sparse for higher codepoints (requiring scan or binary search).
106+
*/
107+
static const pg_case_map case_map[$num_simple] =
108+
{
109+
EOS
110+
111+
# Emit the array initializer rows.  First the dense part: exactly one row
# for each code point below 0x80, whether or not it has a recorded mapping;
# missing mappings fall back to the code point itself.
printf $OT "\t/* begin dense entries for codepoints < 0x80 */\n";
112+
for (my $code = 0; $code < 0x80; $code++)
113+
{
114+
my $lc = ($simple{$code}{Simple_Lowercase} || $code);
115+
my $tc = ($simple{$code}{Simple_Titlecase} || $code);
116+
my $uc = ($simple{$code}{Simple_Uppercase} || $code);
117+
printf $OT
118+
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n",
119+
$code, $lc, $tc, $uc;
120+
}
121+
printf $OT "\n";
122+
123+
# Then the sparse part: only code points at or above 0x80 that have a
# recorded mapping, written in ascending code point order.
printf $OT "\t/* begin sparse entries for codepoints >= 0x80 */\n";
124+
foreach my $code (sort { $a <=> $b } (keys %simple))
125+
{
126+
next unless $code >= 0x80; # already output above
127+
128+
my $map = $simple{$code};
129+
printf $OT
130+
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n",
131+
$code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase},
132+
$map->{Simple_Uppercase};
133+
}
134+
# Close the array initializer opened by the header text written earlier.
print $OT "};\n";

src/common/unicode/meson.build

+31
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,16 @@ endforeach
2424

2525
update_unicode_targets = []
2626

27+
update_unicode_targets += \
28+
custom_target('unicode_case_table.h',
29+
input: [unicode_data['UnicodeData.txt']],
30+
output: ['unicode_case_table.h'],
31+
command: [
32+
perl, files('generate-unicode_case_table.pl'),
33+
'--outdir', '@OUTDIR@', '@INPUT@'],
34+
build_by_default: false,
35+
)
36+
2737
update_unicode_targets += \
2838
custom_target('unicode_category_table.h',
2939
input: [unicode_data['UnicodeData.txt'], unicode_data['DerivedCoreProperties.txt'], unicode_data['PropList.txt']],
@@ -92,6 +102,17 @@ norm_test_table = custom_target('norm_test_table.h',
92102

93103
inc = include_directories('.')
94104

105+
case_test = executable('case_test',
106+
['case_test.c'],
107+
dependencies: [frontend_port_code, icu],
108+
include_directories: inc,
109+
link_with: [common_static, pgport_static],
110+
build_by_default: false,
111+
kwargs: default_bin_args + {
112+
'install': false,
113+
}
114+
)
115+
95116
category_test = executable('category_test',
96117
['category_test.c'],
97118
dependencies: [frontend_port_code, icu],
@@ -116,6 +137,16 @@ norm_test = executable('norm_test',
116137

117138
update_unicode_dep = []
118139

140+
if not meson.is_cross_build()
141+
update_unicode_dep += custom_target('case_test.run',
142+
output: 'case_test.run',
143+
input: update_unicode_targets,
144+
command: [case_test, UNICODE_VERSION],
145+
build_by_default: false,
146+
build_always_stale: true,
147+
)
148+
endif
149+
119150
if not meson.is_cross_build()
120151
update_unicode_dep += custom_target('category_test.run',
121152
output: 'category_test.run',

0 commit comments

Comments
 (0)