Skip to content

Commit 1de9cc0

Browse files
committed
Rewrite the perl scripts to produce our Unicode conversion tables.
Generate EUC_CN mappings from gb-18030-2000.xml, because GB2312.TXT is no longer available. Get UHC from windows-949-2000.xml, it's more up-to-date. Plus tons more small changes. With these changes, the perl scripts faithfully produce the *.map files we have in the repository, from the external source files. In passing, fix the Makefile to also download CP932.TXT and CP950.TXT. Based on patches by Kyotaro Horiguchi, reviewed by Daniel Gustafsson. Discussion: https://postgr.es/m/08e7892a-d55c-eefe-76e6-7910bc8dd1f3@iki.fi
1 parent 6c30322 commit 1de9cc0

33 files changed

+791
-1541
lines changed

src/backend/utils/mb/Unicode/Makefile

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,6 @@ WINMAPS = win866_to_utf8.map utf8_to_win866.map \
3939
win1258_to_utf8.map utf8_to_win1258.map
4040

4141
GENERICMAPS = $(ISO8859MAPS) $(WINMAPS) \
42-
johab_to_utf8.map utf8_to_johab.map \
43-
uhc_to_utf8.map utf8_to_uhc.map \
4442
gbk_to_utf8.map utf8_to_gbk.map \
4543
koi8r_to_utf8.map utf8_to_koi8r.map
4644

@@ -51,6 +49,8 @@ SPECIALMAPS = euc_cn_to_utf8.map utf8_to_euc_cn.map \
5149
sjis_to_utf8.map utf8_to_sjis.map \
5250
gb18030_to_utf8.map utf8_to_gb18030.map \
5351
big5_to_utf8.map utf8_to_big5.map \
52+
johab_to_utf8.map utf8_to_johab.map \
53+
uhc_to_utf8.map utf8_to_uhc.map \
5454
euc_jis_2004_to_utf8.map euc_jis_2004_to_utf8_combined.map \
5555
utf8_to_euc_jis_2004.map utf8_to_euc_jis_2004_combined.map \
5656
shift_jis_2004_to_utf8.map shift_jis_2004_to_utf8_combined.map \
@@ -63,23 +63,29 @@ ISO8859TEXTS = 8859-2.TXT 8859-3.TXT 8859-4.TXT 8859-5.TXT \
6363
8859-10.TXT 8859-13.TXT 8859-14.TXT 8859-15.TXT \
6464
8859-16.TXT
6565

66-
WINTEXTS = CP866.TXT CP874.TXT CP936.TXT CP949.TXT \
66+
WINTEXTS = CP866.TXT CP874.TXT CP936.TXT \
6767
CP1250.TXT CP1251.TXT \
6868
CP1252.TXT CP1253.TXT CP1254.TXT CP1255.TXT \
6969
CP1256.TXT CP1257.TXT CP1258.TXT
7070

7171
GENERICTEXTS = $(ISO8859TEXTS) $(WINTEXTS) \
72-
KOI8-R.TXT KOI8-U.TXT JOHAB.TXT
72+
KOI8-R.TXT KOI8-U.TXT
7373

7474
all: $(MAPS)
7575

7676
$(GENERICMAPS): UCS_to_most.pl $(GENERICTEXTS)
7777
$(PERL) $<
7878

79-
euc_jp_to_utf8.map utf8_to_euc_jp.map: UCS_to_EUC_JP.pl JIS0201.TXT JIS0208.TXT JIS0212.TXT
79+
johab_to_utf8.map utf8_to_johab.map: UCS_to_JOHAB.pl JOHAB.TXT
80+
$(PERL) $<
81+
82+
uhc_to_utf8.map utf8_to_uhc.map: UCS_to_UHC.pl windows-949-2000.xml
83+
$(PERL) $<
84+
85+
euc_jp_to_utf8.map utf8_to_euc_jp.map: UCS_to_EUC_JP.pl CP932.TXT JIS0212.TXT
8086
$(PERL) $<
8187

82-
euc_cn_to_utf8.map utf8_to_euc_cn.map: UCS_to_EUC_CN.pl GB2312.TXT
88+
euc_cn_to_utf8.map utf8_to_euc_cn.map: UCS_to_EUC_CN.pl gb-18030-2000.xml
8389
$(PERL) $<
8490

8591
euc_kr_to_utf8.map utf8_to_euc_kr.map: UCS_to_EUC_KR.pl KSX1001.TXT
@@ -119,7 +125,7 @@ BIG5.TXT CNS11643.TXT:
119125
euc-jis-2004-std.txt sjis-0213-2004-std.txt:
120126
$(DOWNLOAD) http://x0213.org/codetable/$(@F)
121127

122-
gb-18030-2000.xml:
128+
gb-18030-2000.xml windows-949-2000.xml:
123129
$(DOWNLOAD) https://ssl.icu-project.org/repos/icu/data/trunk/charset/data/xml/$(@F)
124130

125131
GB2312.TXT:
@@ -137,7 +143,7 @@ KOI8-R.TXT KOI8-U.TXT:
137143
$(ISO8859TEXTS):
138144
$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/ISO8859/$(@F)
139145

140-
$(filter-out CP8%,$(WINTEXTS)):
146+
$(filter-out CP8%,$(WINTEXTS)) CP932.TXT CP950.TXT:
141147
$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/$(@F)
142148

143149
$(filter CP8%,$(WINTEXTS)):

src/backend/utils/mb/Unicode/UCS_to_BIG5.pl

Lines changed: 22 additions & 162 deletions
Original file line numberDiff line numberDiff line change
@@ -25,56 +25,17 @@
2525
# # and Unicode name (not used in this script)
2626

2727

28-
require "ucs2utf.pl";
28+
require "convutils.pm";
2929

30+
# Load BIG5.TXT
31+
my $all = &read_source("BIG5.TXT");
3032

31-
#
32-
# first, generate UTF8 --> BIG5 table
33-
#
34-
$in_file = "BIG5.TXT";
35-
36-
open(FILE, $in_file) || die("cannot open $in_file");
37-
38-
reset 'array';
33+
# Load CP950.TXT
34+
my $cp950txt = &read_source("CP950.TXT");
3935

40-
while (<FILE>)
41-
{
42-
chop;
43-
if (/^#/)
44-
{
45-
next;
46-
}
47-
($c, $u, $rest) = split;
48-
$ucs = hex($u);
49-
$code = hex($c);
50-
if ($code >= 0x80 && $ucs >= 0x0080)
51-
{
52-
$utf = &ucs2utf($ucs);
53-
if ($array{$utf} ne "")
54-
{
55-
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
56-
next;
57-
}
58-
$count++;
59-
$array{$utf} = $code;
60-
}
61-
}
62-
close(FILE);
63-
64-
$in_file = "CP950.TXT";
65-
66-
open(FILE, $in_file) || die("cannot open $in_file");
67-
68-
while (<FILE>)
69-
{
70-
chop;
71-
if (/^#/)
72-
{
73-
next;
74-
}
75-
($c, $u, $rest) = split;
76-
$ucs = hex($u);
77-
$code = hex($c);
36+
foreach my $i (@$cp950txt) {
37+
my $code = $i->{code};
38+
my $ucs = $i->{ucs};
7839

7940
# Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc
8041
# from CP950.TXT
@@ -83,126 +44,25 @@
8344
&& $code >= 0xf9d6
8445
&& $code <= 0xf9dc)
8546
{
86-
$utf = &ucs2utf($ucs);
87-
if ($array{$utf} ne "")
88-
{
89-
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
90-
next;
91-
}
92-
$count++;
93-
$array{$utf} = $code;
47+
push @$all, {code => $code,
48+
ucs => $ucs,
49+
comment => $i->{comment},
50+
direction => "both"};
9451
}
9552
}
96-
close(FILE);
97-
98-
$file = lc("utf8_to_big5.map");
99-
open(FILE, "> $file") || die("cannot open $file");
100-
101-
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
102-
print FILE "static const pg_utf_to_local ULmapBIG5[ $count ] = {\n";
103-
104-
for $index (sort { $a <=> $b } keys(%array))
105-
{
106-
$code = $array{$index};
107-
$count--;
108-
if ($count == 0)
109-
{
110-
printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
111-
}
112-
else
113-
{
114-
printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
115-
}
116-
}
117-
118-
print FILE "};\n";
119-
close(FILE);
120-
121-
#
122-
# then generate BIG5 --> UTF8 table
123-
#
124-
$in_file = "BIG5.TXT";
12553

126-
open(FILE, $in_file) || die("cannot open $in_file");
54+
foreach my $i (@$all) {
55+
my $code = $i->{code};
56+
my $ucs = $i->{ucs};
12757

128-
reset 'array';
129-
130-
while (<FILE>)
131-
{
132-
chop;
133-
if (/^#/)
134-
{
135-
next;
136-
}
137-
($c, $u, $rest) = split;
138-
$ucs = hex($u);
139-
$code = hex($c);
140-
if ($code >= 0x80 && $ucs >= 0x0080)
141-
{
142-
$utf = &ucs2utf($ucs);
143-
if ($array{$utf} ne "")
144-
{
145-
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
146-
next;
147-
}
148-
$count++;
149-
$array{$code} = $utf;
150-
}
151-
}
152-
close(FILE);
153-
154-
$in_file = "CP950.TXT";
155-
156-
open(FILE, $in_file) || die("cannot open $in_file");
157-
158-
while (<FILE>)
159-
{
160-
chop;
161-
if (/^#/)
162-
{
163-
next;
164-
}
165-
($c, $u, $rest) = split;
166-
$ucs = hex($u);
167-
$code = hex($c);
168-
169-
# Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc
170-
# from CP950.TXT
171-
if ( $code >= 0x80
172-
&& $ucs >= 0x0080
173-
&& $code >= 0xf9d6
174-
&& $code <= 0xf9dc)
175-
{
176-
$utf = &ucs2utf($ucs);
177-
if ($array{$utf} ne "")
178-
{
179-
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
180-
next;
181-
}
182-
$count++;
183-
$array{$code} = $utf;
184-
}
185-
}
186-
close(FILE);
187-
188-
$file = lc("big5_to_utf8.map");
189-
open(FILE, "> $file") || die("cannot open $file");
190-
191-
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
192-
print FILE "static const pg_local_to_utf LUmapBIG5[ $count ] = {\n";
193-
for $index (sort { $a <=> $b } keys(%array))
194-
{
195-
$utf = $array{$index};
196-
$count--;
197-
if ($count == 0)
198-
{
199-
printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
200-
}
201-
else
58+
# BIG5.TXT maps several BIG5 characters to U+FFFD. The UTF-8 to BIG5 mapping can
59+
# contain only one of them. XXX: Doesn't really make sense to include any of them,
60+
# but for historical reasons, we map the first one of them.
61+
if ($i->{ucs} == 0xFFFD && $i->{code} != 0xA15A)
20262
{
203-
printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
63+
$i->{direction} = "to_unicode";
20464
}
20565
}
20666

207-
print FILE "};\n";
208-
close(FILE);
67+
# Output
68+
print_tables("BIG5", $all);

0 commit comments

Comments
 (0)