|
1 | 1 | #!/usr/bin/perl
|
2 | 2 | #
|
3 |
| -# Generate a composition table, using Unicode data files as input |
| 3 | +# Generate a composition table and its lookup utilities, using Unicode data |
| 4 | +# files as input. |
4 | 5 | #
|
5 | 6 | # Input: UnicodeData.txt and CompositionExclusions.txt
|
6 |
| -# Output: unicode_norm_table.h |
| 7 | +# Output: unicode_norm_table.h and unicode_norm_hashfunc.h |
7 | 8 | #
|
8 | 9 | # Copyright (c) 2000-2020, PostgreSQL Global Development Group
|
9 | 10 |
|
10 | 11 | use strict;
|
11 | 12 | use warnings;
|
12 | 13 |
|
13 |
| -my $output_file = "unicode_norm_table.h"; |
| 14 | +use FindBin; |
| 15 | +use lib "$FindBin::RealBin/../../tools/"; |
| 16 | +use PerfectHash; |
| 17 | + |
| 18 | +my $output_table_file = "unicode_norm_table.h"; |
| 19 | +my $output_func_file = "unicode_norm_hashfunc.h"; |
14 | 20 |
|
15 | 21 | my $FH;
|
16 | 22 |
|
|
64 | 70 |
|
65 | 71 | my $num_characters = scalar @characters;
|
66 | 72 |
|
67 |
| -# Start writing out the output file |
68 |
| -open my $OUTPUT, '>', $output_file |
69 |
| - or die "Could not open output file $output_file: $!\n"; |
| 73 | +# Start writing out the output files |
| 74 | +open my $OT, '>', $output_table_file |
| 75 | + or die "Could not open output file $output_table_file: $!\n"; |
| 76 | +open my $OF, '>', $output_func_file |
| 77 | + or die "Could not open output file $output_func_file: $!\n"; |
70 | 78 |
|
71 |
| -print $OUTPUT <<HEADER; |
| 79 | +print $OT <<HEADER; |
72 | 80 | /*-------------------------------------------------------------------------
|
73 | 81 | *
|
74 | 82 | * unicode_norm_table.h
|
|
111 | 119 | {
|
112 | 120 | HEADER
|
113 | 121 |
|
| 122 | +print $OF <<HEADER; |
| 123 | +/*------------------------------------------------------------------------- |
| 124 | + * |
| 125 | + * unicode_norm_hashfunc.h |
| 126 | + * Perfect hash functions used for Unicode normalization |
| 127 | + * |
| 128 | + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group |
| 129 | + * Portions Copyright (c) 1994, Regents of the University of California |
| 130 | + * |
| 131 | + * src/include/common/unicode_norm_hashfunc.h |
| 132 | + * |
| 133 | + *------------------------------------------------------------------------- |
| 134 | + */ |
| 135 | +
|
| 136 | +/* |
| 137 | + * File auto-generated by src/common/unicode/generate-unicode_norm_table.pl, |
| 138 | + * do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_HASHFUNC_H |
| 139 | + * here. |
| 140 | + */ |
| 141 | +
|
| 142 | +#include "common/unicode_norm_table.h" |
| 143 | +
|
| 144 | +/* Typedef for perfect hash functions */ |
| 145 | +typedef int (*cp_hash_func) (const void *key); |
| 146 | +
|
| 147 | +/* Information for lookups with perfect hash functions */ |
| 148 | +typedef struct |
| 149 | +{ |
| 150 | + const pg_unicode_decomposition *decomps; |
| 151 | + cp_hash_func hash; |
| 152 | + int num_decomps; |
| 153 | +} pg_unicode_decompinfo; |
| 154 | +
|
| 155 | +typedef struct |
| 156 | +{ |
| 157 | + const uint16 *inverse_lookup; |
| 158 | + cp_hash_func hash; |
| 159 | + int num_recomps; |
| 160 | +} pg_unicode_recompinfo; |
| 161 | +
|
| 162 | +HEADER |
| 163 | + |
114 | 164 | my $decomp_index = 0;
|
115 | 165 | my $decomp_string = "";
|
| 166 | +my @dec_cp_packed; |
| 167 | +my $main_index = 0; |
| 168 | +my @rec_info; |
116 | 169 |
|
117 | 170 | my $last_code = $characters[-1]->{code};
|
118 | 171 | foreach my $char (@characters)
|
|
121 | 174 | my $class = $char->{class};
|
122 | 175 | my $decomp = $char->{decomp};
|
123 | 176 |
|
| 177 | + # Save the code point bytes as a string in network order. |
| 178 | + push @dec_cp_packed, pack('N', hex($char->{code})); |
| 179 | + |
124 | 180 | # The character decomposition mapping field in UnicodeData.txt is a list
|
125 | 181 | # of unicode codepoints, separated by space. But it can be prefixed with
|
126 | 182 | # so-called compatibility formatting tag, like "<compat>", or "<font>".
|
|
163 | 219 | {
|
164 | 220 | foreach my $lcode (@composition_exclusion_codes)
|
165 | 221 | {
|
166 |
| - if ($lcode eq $char->{code}) |
| 222 | + if ($lcode eq $code) |
167 | 223 | {
|
168 | 224 | $flags .= " | DECOMP_NO_COMPOSE";
|
169 | 225 | $comment = "in exclusion list";
|
170 | 226 | last;
|
171 | 227 | }
|
172 | 228 | }
|
173 | 229 | }
|
| 230 | + |
| 231 | + # Save info for recomposeable codepoints. |
| 232 | + # Note that this MUST match the macro DECOMPOSITION_NO_COMPOSE in C |
| 233 | + # above! See also the inverse lookup in recompose_code() found in |
| 234 | + # src/common/unicode_norm.c. |
| 235 | + if (!($flags =~ /DECOMP_COMPAT/ || $flags =~ /DECOMP_NO_COMPOSE/)) |
| 236 | + { |
| 237 | + push @rec_info, |
| 238 | + { |
| 239 | + code => $code, |
| 240 | + main_index => $main_index, |
| 241 | + first => $first_decomp, |
| 242 | + second => $decomp_elts[0] |
| 243 | + }; |
| 244 | + } |
174 | 245 | }
|
175 | 246 |
|
176 | 247 | if ($decomp_size == 0)
|
177 | 248 | {
|
178 |
| - print $OUTPUT "\t{0x$code, $class, 0$flags, 0}"; |
| 249 | + print $OT "\t{0x$code, $class, 0$flags, 0}"; |
179 | 250 | }
|
180 | 251 | elsif ($decomp_size == 1 && length($first_decomp) <= 4)
|
181 | 252 | {
|
182 | 253 |
|
183 | 254 | # The decomposition consists of a single codepoint, and it fits
|
184 | 255 | # in a uint16, so we can store it "inline" in the main table.
|
185 | 256 | $flags .= " | DECOMP_INLINE";
|
186 |
| - print $OUTPUT "\t{0x$code, $class, 1$flags, 0x$first_decomp}"; |
| 257 | + print $OT "\t{0x$code, $class, 1$flags, 0x$first_decomp}"; |
187 | 258 | }
|
188 | 259 | else
|
189 | 260 | {
|
190 |
| - print $OUTPUT |
191 |
| - "\t{0x$code, $class, $decomp_size$flags, $decomp_index}"; |
| 261 | + print $OT "\t{0x$code, $class, $decomp_size$flags, $decomp_index}"; |
192 | 262 |
|
193 | 263 | # Now save the decompositions into a dedicated area that will
|
194 | 264 | # be written afterwards. First build the entry dedicated to
|
|
205 | 275 | }
|
206 | 276 |
|
207 | 277 | # Print a comma after all items except the last one.
|
208 |
| - print $OUTPUT "," unless ($code eq $last_code); |
209 |
| - if ($comment ne "") |
210 |
| - { |
| 278 | + print $OT "," unless ($code eq $last_code); |
211 | 279 |
|
212 |
| - # If the line is wide already, indent the comment with one tab, |
213 |
| - # otherwise with two. This is to make the output match the way |
214 |
| - # pgindent would mangle it. (This is quite hacky. To do this |
215 |
| - # properly, we should actually track how long the line is so far, |
216 |
| - # but this works for now.) |
217 |
| - print $OUTPUT "\t" if ($decomp_index < 10); |
| 280 | + print $OT "\t/* $comment */" if ($comment ne ""); |
| 281 | + print $OT "\n"; |
218 | 282 |
|
219 |
| - print $OUTPUT "\t/* $comment */" if ($comment ne ""); |
220 |
| - } |
221 |
| - print $OUTPUT "\n"; |
| 283 | + $main_index++; |
222 | 284 | }
|
223 |
| -print $OUTPUT "\n};\n\n"; |
| 285 | +print $OT "\n};\n\n"; |
224 | 286 |
|
225 | 287 | # Print the array of decomposed codes.
|
226 |
| -print $OUTPUT <<HEADER; |
| 288 | +print $OT <<HEADER; |
227 | 289 | /* codepoints array */
|
228 | 290 | static const uint32 UnicodeDecomp_codepoints[$decomp_index] =
|
229 | 291 | {
|
230 | 292 | $decomp_string
|
231 | 293 | };
|
232 | 294 | HEADER
|
233 | 295 |
|
234 |
| -close $OUTPUT; |
| 296 | +# Emit the definition of the decomp hash function. |
| 297 | +my $dec_funcname = 'Decomp_hash_func'; |
| 298 | +my $dec_func = PerfectHash::generate_hash_function(\@dec_cp_packed, |
| 299 | + $dec_funcname, fixed_key_length => 4); |
| 300 | +print $OF "/* Perfect hash function for decomposition */\n"; |
| 301 | +print $OF "static $dec_func\n"; |
| 302 | + |
| 303 | +# Emit the structure that wraps the hash lookup information into |
| 304 | +# one variable. |
| 305 | +print $OF <<HEADER; |
| 306 | +/* Hash lookup information for decomposition */ |
| 307 | +static const pg_unicode_decompinfo UnicodeDecompInfo = |
| 308 | +{ |
| 309 | + UnicodeDecompMain, |
| 310 | + $dec_funcname, |
| 311 | + $num_characters |
| 312 | +}; |
| 313 | +
|
| 314 | +HEADER |
| 315 | + |
| 316 | +# Find the lowest codepoint that decomposes to each recomposeable |
| 317 | +# code pair and create a mapping to it. |
| 318 | +my $recomp_string = ""; |
| 319 | +my @rec_cp_packed; |
| 320 | +my %seenit; |
| 321 | +my $firstentry = 1; |
| 322 | +foreach my $rec (sort recomp_sort @rec_info) |
| 323 | +{ |
| 324 | + # The hash key is formed by concatenating the bytes of the two |
| 325 | + # codepoints. See also recompose_code() in common/unicode_norm.c. |
| 326 | + my $hashkey = (hex($rec->{first}) << 32) | hex($rec->{second}); |
| 327 | + |
| 328 | + # We are only interested in the lowest code point that decomposes |
| 329 | + # to the given code pair. |
| 330 | + next if $seenit{$hashkey}; |
| 331 | + |
| 332 | + # Save the hash key bytes in network order |
| 333 | + push @rec_cp_packed, pack('Q>', $hashkey); |
| 334 | + |
| 335 | + # Append inverse lookup element |
| 336 | + $recomp_string .= ",\n" if !$firstentry; |
| 337 | + $recomp_string .= sprintf "\t/* U+%s+%s -> U+%s */ %s", |
| 338 | + $rec->{first}, |
| 339 | + $rec->{second}, |
| 340 | + $rec->{code}, |
| 341 | + $rec->{main_index}; |
| 342 | + |
| 343 | + $seenit{$hashkey} = 1; |
| 344 | + $firstentry = 0; |
| 345 | +} |
| 346 | + |
| 347 | +# Emit the inverse lookup array containing indexes into UnicodeDecompMain. |
| 348 | +my $num_recomps = scalar @rec_cp_packed; |
| 349 | +print $OF <<HEADER; |
| 350 | +/* Inverse lookup array -- contains indexes into UnicodeDecompMain[] */ |
| 351 | +static const uint16 RecompInverseLookup[$num_recomps] = |
| 352 | +{ |
| 353 | +$recomp_string |
| 354 | +}; |
| 355 | +
|
| 356 | +HEADER |
| 357 | + |
| 358 | +# Emit the definition of the recomposition hash function. |
| 359 | +my $rec_funcname = 'Recomp_hash_func'; |
| 360 | +my $rec_func = |
| 361 | + PerfectHash::generate_hash_function(\@rec_cp_packed, $rec_funcname, |
| 362 | + fixed_key_length => 8); |
| 363 | +print $OF "/* Perfect hash function for recomposition */\n"; |
| 364 | +print $OF "static $rec_func\n"; |
| 365 | + |
| 366 | +# Emit the structure that wraps the hash lookup information into |
| 367 | +# one variable. |
| 368 | +print $OF <<HEADER; |
| 369 | +/* Hash lookup information for recomposition */ |
| 370 | +static const pg_unicode_recompinfo UnicodeRecompInfo = |
| 371 | +{ |
| 372 | + RecompInverseLookup, |
| 373 | + $rec_funcname, |
| 374 | + $num_recomps |
| 375 | +}; |
| 376 | +HEADER |
| 377 | + |
| 378 | +close $OT; |
| 379 | +close $OF; |
| 380 | + |
| 381 | +sub recomp_sort |
| 382 | +{ |
| 383 | + my $a1 = hex($a->{first}); |
| 384 | + my $b1 = hex($b->{first}); |
| 385 | + |
| 386 | + my $a2 = hex($a->{second}); |
| 387 | + my $b2 = hex($b->{second}); |
| 388 | + |
| 389 | + # First sort by the first code point |
| 390 | + return -1 if $a1 < $b1; |
| 391 | + return 1 if $a1 > $b1; |
| 392 | + |
| 393 | + # Then sort by the second code point |
| 394 | + return -1 if $a2 < $b2; |
| 395 | + return 1 if $a2 > $b2; |
| 396 | + |
| 397 | + # Finally sort by the code point that decomposes into first and |
| 398 | + # second ones. |
| 399 | + my $acode = hex($a->{code}); |
| 400 | + my $bcode = hex($b->{code}); |
| 401 | + |
| 402 | + return -1 if $acode < $bcode; |
| 403 | + return -1 if $acode > $bcode; |
| 404 | + |
| 405 | + die "found duplicate entries of recomposeable code pairs"; |
| 406 | +} |
0 commit comments