|
| 1 | +#!/usr/bin/perl |
| 2 | +# |
| 3 | +# Generate Unicode character case mappings. Does not include tailoring |
| 4 | +# or locale-specific mappings. |
| 5 | +# |
| 6 | +# Input: UnicodeData.txt |
| 7 | +# Output: unicode_case_table.h |
| 8 | +# |
| 9 | +# Copyright (c) 2000-2023, PostgreSQL Global Development Group |
| 10 | + |
| 11 | +use strict; |
| 12 | +use warnings; |
| 13 | +use Getopt::Long; |
| 14 | + |
| 15 | +use FindBin; |
| 16 | +use lib "$FindBin::RealBin/../../tools/"; |
| 17 | + |
# Directory that holds UnicodeData.txt and receives the generated header;
# overridable with --outdir.
my $output_path = '.';

# GetOptions returns false on unknown or malformed options; stop instead of
# silently running with defaults.
GetOptions('outdir:s' => \$output_path)
  or die "Usage: $0 [--outdir DIR]\n";

# 'outdir:s' makes the option value optional, so a bare --outdir assigns an
# empty string.  Fall back to the current directory rather than building
# paths rooted at "/".
$output_path = '.' if $output_path eq '';

my $output_table_file = "$output_path/unicode_case_table.h";

# Handle used to read UnicodeData.txt.
my $FH;

# Maps codepoint => { Simple_Lowercase, Simple_Titlecase, Simple_Uppercase }
# for every codepoint that has at least one simple case mapping.
my %simple = ();
| 27 | + |
# Read UnicodeData.txt and record, for every codepoint that has at least one
# simple case mapping, its lowercase/titlecase/uppercase targets.  A missing
# mapping is stored as the codepoint itself.
my $input_file = "$output_path/UnicodeData.txt";
open($FH, '<', $input_file)
  or die "Could not open $input_file: $!.";
while (my $line = <$FH>)
{
	chomp $line;

	# Tolerate blank lines; they carry no mapping.
	next if $line =~ /^\s*$/;

	# A negative limit keeps trailing empty fields, so a well-formed record
	# always yields 15 fields even when the case-mapping columns are empty
	# and regardless of whether the last line ends with a newline.
	my @elts = split(';', $line, -1);
	die "$input_file line $.: expected at least 15 fields, found "
	  . scalar(@elts)
	  if @elts < 15;

	my $code = hex($elts[0]);

	# Fields 12..14 are the simple uppercase, lowercase and titlecase
	# mappings; an empty field yields 0, meaning "no mapping".
	my $simple_uppercase = hex($elts[12] =~ s/^\s+|\s+$//rg);
	my $simple_lowercase = hex($elts[13] =~ s/^\s+|\s+$//rg);
	my $simple_titlecase = hex($elts[14] =~ s/^\s+|\s+$//rg);

	die "codepoint $code out of range" if $code > 0x10FFFF;
	die "Simple_Lowercase $code out of range" if $simple_lowercase > 0x10FFFF;
	die "Simple_Titlecase $code out of range" if $simple_titlecase > 0x10FFFF;
	die "Simple_Uppercase $code out of range" if $simple_uppercase > 0x10FFFF;

	# Only codepoints with at least one mapping get an entry.
	if ($simple_lowercase || $simple_titlecase || $simple_uppercase)
	{
		$simple{$code} = {
			Simple_Lowercase => ($simple_lowercase || $code),
			Simple_Titlecase => ($simple_titlecase || $code),
			Simple_Uppercase => ($simple_uppercase || $code)
		};
	}
}
close $FH;
| 53 | + |
# Start writing out the output files
open my $OT, '>', $output_table_file
  or die "Could not open output file $output_table_file: $!\n";

# Determine the size of the array: codepoints < 0x80 are dense (one entry
# each, whether or not they have a mapping) and the rest of the entries are
# sparse (one entry per codepoint recorded in %simple).
my $num_sparse = grep { $_ >= 0x80 } keys %simple;
my $num_simple = 0x80 + $num_sparse;
| 65 | + |
# Emit the fixed preamble of the generated header: file banner, includes,
# the CaseKind enum, the pg_case_map struct, and the opening of the case_map
# array, whose length is interpolated from $num_simple.
print {$OT} <<"EOS";
/*-------------------------------------------------------------------------
 *
 * unicode_case_table.h
 * Case mapping and information table.
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/common/unicode_case_table.h
 *
 *-------------------------------------------------------------------------
 */

/*
 * File auto-generated by src/common/unicode/generate-unicode_case_table.pl,
 * do not edit. There is deliberately not an #ifndef PG_UNICODE_CASE_TABLE_H
 * here.
 */

#include "common/unicode_case.h"
#include "mb/pg_wchar.h"

typedef enum
{
	CaseLower = 0,
	CaseTitle = 1,
	CaseUpper = 2,
	NCaseKind
} CaseKind;

typedef struct
{
	pg_wchar codepoint; /* Unicode codepoint */
	pg_wchar simplemap[NCaseKind];
} pg_case_map;

/*
 * Case mapping table. Dense for codepoints < 0x80 (enabling fast lookup),
 * sparse for higher codepoints (requiring scan or binary search).
 */
static const pg_case_map case_map[$num_simple] =
{
EOS
| 110 | + |
# Emit one entry for every codepoint < 0x80, whether or not it has a case
# mapping; codepoints without a mapping map to themselves.
print $OT "\t/* begin dense entries for codepoints < 0x80 */\n";
for my $code (0x00 .. 0x7F)
{
	# Look the entry up once and fall back to an empty hash: reading
	# $simple{$code}{...} directly would autovivify an empty entry in
	# %simple for every unmapped codepoint.
	my $map = $simple{$code} // {};
	my $lc = $map->{Simple_Lowercase} || $code;
	my $tc = $map->{Simple_Titlecase} || $code;
	my $uc = $map->{Simple_Uppercase} || $code;
	printf $OT
	  "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n",
	  $code, $lc, $tc, $uc;
}
print $OT "\n";
| 122 | + |
# Emit, in ascending codepoint order, one entry for each codepoint >= 0x80
# that has a simple case mapping, then close the array initializer.
print $OT "\t/* begin sparse entries for codepoints >= 0x80 */\n";
foreach my $code (sort { $a <=> $b } (keys %simple))
{
	next unless $code >= 0x80;    # already output above

	my $map = $simple{$code};
	printf $OT
	  "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n",
	  $code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase},
	  $map->{Simple_Uppercase};
}
print $OT "};\n";

# Output is buffered, so write failures (e.g. a full disk) may only surface
# at close; check it so a truncated header is never left behind silently.
close $OT
  or die "Could not close output file $output_table_file: $!\n";
0 commit comments