|
25 | 25 | # # and Unicode name (not used in this script)
|
26 | 26 |
|
27 | 27 |
|
28 |
| -require "ucs2utf.pl"; |
| 28 | +require "convutils.pm"; |
29 | 29 |
|
| 30 | +# Load BIG5.TXT |
| 31 | +my $all = &read_source("BIG5.TXT"); |
30 | 32 |
|
31 |
| -# |
32 |
| -# first, generate UTF8 --> BIG5 table |
33 |
| -# |
34 |
| -$in_file = "BIG5.TXT"; |
35 |
| - |
36 |
| -open(FILE, $in_file) || die("cannot open $in_file"); |
37 |
| - |
38 |
| -reset 'array'; |
| 33 | +# Load CP950.TXT |
| 34 | +my $cp950txt = &read_source("CP950.TXT"); |
39 | 35 |
|
40 |
| -while (<FILE>) |
41 |
| -{ |
42 |
| - chop; |
43 |
| - if (/^#/) |
44 |
| - { |
45 |
| - next; |
46 |
| - } |
47 |
| - ($c, $u, $rest) = split; |
48 |
| - $ucs = hex($u); |
49 |
| - $code = hex($c); |
50 |
| - if ($code >= 0x80 && $ucs >= 0x0080) |
51 |
| - { |
52 |
| - $utf = &ucs2utf($ucs); |
53 |
| - if ($array{$utf} ne "") |
54 |
| - { |
55 |
| - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; |
56 |
| - next; |
57 |
| - } |
58 |
| - $count++; |
59 |
| - $array{$utf} = $code; |
60 |
| - } |
61 |
| -} |
62 |
| -close(FILE); |
63 |
| - |
64 |
| -$in_file = "CP950.TXT"; |
65 |
| - |
66 |
| -open(FILE, $in_file) || die("cannot open $in_file"); |
67 |
| - |
68 |
| -while (<FILE>) |
69 |
| -{ |
70 |
| - chop; |
71 |
| - if (/^#/) |
72 |
| - { |
73 |
| - next; |
74 |
| - } |
75 |
| - ($c, $u, $rest) = split; |
76 |
| - $ucs = hex($u); |
77 |
| - $code = hex($c); |
| 36 | +foreach my $i (@$cp950txt) { |
| 37 | + my $code = $i->{code}; |
| 38 | + my $ucs = $i->{ucs}; |
78 | 39 |
|
79 | 40 | # Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc
|
80 | 41 | # from CP950.TXT
|
|
83 | 44 | && $code >= 0xf9d6
|
84 | 45 | && $code <= 0xf9dc)
|
85 | 46 | {
|
86 |
| - $utf = &ucs2utf($ucs); |
87 |
| - if ($array{$utf} ne "") |
88 |
| - { |
89 |
| - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; |
90 |
| - next; |
91 |
| - } |
92 |
| - $count++; |
93 |
| - $array{$utf} = $code; |
| 47 | + push @$all, {code => $code, |
| 48 | + ucs => $ucs, |
| 49 | + comment => $i->{comment}, |
| 50 | + direction => "both"}; |
94 | 51 | }
|
95 | 52 | }
|
96 |
| -close(FILE); |
97 |
| - |
98 |
| -$file = lc("utf8_to_big5.map"); |
99 |
| -open(FILE, "> $file") || die("cannot open $file"); |
100 |
| - |
101 |
| -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; |
102 |
| -print FILE "static const pg_utf_to_local ULmapBIG5[ $count ] = {\n"; |
103 |
| - |
104 |
| -for $index (sort { $a <=> $b } keys(%array)) |
105 |
| -{ |
106 |
| - $code = $array{$index}; |
107 |
| - $count--; |
108 |
| - if ($count == 0) |
109 |
| - { |
110 |
| - printf FILE " {0x%04x, 0x%04x}\n", $index, $code; |
111 |
| - } |
112 |
| - else |
113 |
| - { |
114 |
| - printf FILE " {0x%04x, 0x%04x},\n", $index, $code; |
115 |
| - } |
116 |
| -} |
117 |
| - |
118 |
| -print FILE "};\n"; |
119 |
| -close(FILE); |
120 |
| - |
121 |
| -# |
122 |
| -# then generate BIG5 --> UTF8 table |
123 |
| -# |
124 |
| -$in_file = "BIG5.TXT"; |
125 | 53 |
|
126 |
| -open(FILE, $in_file) || die("cannot open $in_file"); |
| 54 | +foreach my $i (@$all) { |
| 55 | + my $code = $i->{code}; |
| 56 | + my $ucs = $i->{ucs}; |
127 | 57 |
|
128 |
| -reset 'array'; |
129 |
| - |
130 |
| -while (<FILE>) |
131 |
| -{ |
132 |
| - chop; |
133 |
| - if (/^#/) |
134 |
| - { |
135 |
| - next; |
136 |
| - } |
137 |
| - ($c, $u, $rest) = split; |
138 |
| - $ucs = hex($u); |
139 |
| - $code = hex($c); |
140 |
| - if ($code >= 0x80 && $ucs >= 0x0080) |
141 |
| - { |
142 |
| - $utf = &ucs2utf($ucs); |
143 |
| - if ($array{$utf} ne "") |
144 |
| - { |
145 |
| - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; |
146 |
| - next; |
147 |
| - } |
148 |
| - $count++; |
149 |
| - $array{$code} = $utf; |
150 |
| - } |
151 |
| -} |
152 |
| -close(FILE); |
153 |
| - |
154 |
| -$in_file = "CP950.TXT"; |
155 |
| - |
156 |
| -open(FILE, $in_file) || die("cannot open $in_file"); |
157 |
| - |
158 |
| -while (<FILE>) |
159 |
| -{ |
160 |
| - chop; |
161 |
| - if (/^#/) |
162 |
| - { |
163 |
| - next; |
164 |
| - } |
165 |
| - ($c, $u, $rest) = split; |
166 |
| - $ucs = hex($u); |
167 |
| - $code = hex($c); |
168 |
| - |
169 |
| - # Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc |
170 |
| - # from CP950.TXT |
171 |
| - if ( $code >= 0x80 |
172 |
| - && $ucs >= 0x0080 |
173 |
| - && $code >= 0xf9d6 |
174 |
| - && $code <= 0xf9dc) |
175 |
| - { |
176 |
| - $utf = &ucs2utf($ucs); |
177 |
| - if ($array{$utf} ne "") |
178 |
| - { |
179 |
| - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; |
180 |
| - next; |
181 |
| - } |
182 |
| - $count++; |
183 |
| - $array{$code} = $utf; |
184 |
| - } |
185 |
| -} |
186 |
| -close(FILE); |
187 |
| - |
188 |
| -$file = lc("big5_to_utf8.map"); |
189 |
| -open(FILE, "> $file") || die("cannot open $file"); |
190 |
| - |
191 |
| -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; |
192 |
| -print FILE "static const pg_local_to_utf LUmapBIG5[ $count ] = {\n"; |
193 |
| -for $index (sort { $a <=> $b } keys(%array)) |
194 |
| -{ |
195 |
| - $utf = $array{$index}; |
196 |
| - $count--; |
197 |
| - if ($count == 0) |
198 |
| - { |
199 |
| - printf FILE " {0x%04x, 0x%04x}\n", $index, $utf; |
200 |
| - } |
201 |
| - else |
| 58 | + # BIG5.TXT maps several BIG5 characters to U+FFFD. The UTF-8 to BIG5 mapping can |
| 59 | + # contain only one of them. XXX: Doesn't really make sense to include any of them, |
| 60 | + # but for historical reasons, we map the first one of them. |
| 61 | + if ($i->{ucs} == 0xFFFD && $i->{code} != 0xA15A) |
202 | 62 | {
|
203 |
| - printf FILE " {0x%04x, 0x%04x},\n", $index, $utf; |
| 63 | + $i->{direction} = "to_unicode"; |
204 | 64 | }
|
205 | 65 | }
|
206 | 66 |
|
207 |
| -print FILE "};\n"; |
208 |
| -close(FILE); |
| 67 | +# Output |
| 68 | +print_tables("BIG5", $all); |
0 commit comments