postgres
diff --git a/src/backend/utils/mb/Unicode/Makefile b/src/backend/utils/mb/Unicode/Makefile
Lines changed: 14 additions & 8 deletions
diff --git a/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl b/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl
Lines changed: 22 additions & 162 deletions
@@ -39,8 +39,6 @@ WINMAPS = win866_to_utf8.map utf8_to_win866.map \
 	win1258_to_utf8.map utf8_to_win1258.map
 
 GENERICMAPS = $(ISO8859MAPS) $(WINMAPS) \
-	johab_to_utf8.map utf8_to_johab.map \
-	uhc_to_utf8.map utf8_to_uhc.map \
 	gbk_to_utf8.map utf8_to_gbk.map \
 	koi8r_to_utf8.map utf8_to_koi8r.map
 
@@ -51,6 +49,8 @@ SPECIALMAPS = euc_cn_to_utf8.map utf8_to_euc_cn.map \
 	sjis_to_utf8.map utf8_to_sjis.map \
 	gb18030_to_utf8.map utf8_to_gb18030.map \
 	big5_to_utf8.map utf8_to_big5.map \
+	johab_to_utf8.map utf8_to_johab.map \
+	uhc_to_utf8.map utf8_to_uhc.map \
 	euc_jis_2004_to_utf8.map euc_jis_2004_to_utf8_combined.map \
 	utf8_to_euc_jis_2004.map utf8_to_euc_jis_2004_combined.map \
 	shift_jis_2004_to_utf8.map shift_jis_2004_to_utf8_combined.map \
@@ -63,23 +63,29 @@ ISO8859TEXTS = 8859-2.TXT 8859-3.TXT 8859-4.TXT 8859-5.TXT \
 	8859-10.TXT 8859-13.TXT 8859-14.TXT 8859-15.TXT \
 	8859-16.TXT
 
-WINTEXTS = CP866.TXT CP874.TXT CP936.TXT CP949.TXT \
+WINTEXTS = CP866.TXT CP874.TXT CP936.TXT \
 	CP1250.TXT CP1251.TXT \
 	CP1252.TXT CP1253.TXT CP1254.TXT CP1255.TXT \
 	CP1256.TXT CP1257.TXT CP1258.TXT
 
 GENERICTEXTS = $(ISO8859TEXTS) $(WINTEXTS) \
-	KOI8-R.TXT KOI8-U.TXT JOHAB.TXT
+	KOI8-R.TXT KOI8-U.TXT
 
 all: $(MAPS)
 
 $(GENERICMAPS): UCS_to_most.pl $(GENERICTEXTS)
 	$(PERL) $<
 
-euc_jp_to_utf8.map utf8_to_euc_jp.map: UCS_to_EUC_JP.pl JIS0201.TXT JIS0208.TXT JIS0212.TXT
+johab_to_utf8.map utf8_to_johab.map: UCS_to_JOHAB.pl JOHAB.TXT
+	$(PERL) $<
+
+uhc_to_utf8.map utf8_to_uhc.map: UCS_to_UHC.pl windows-949-2000.xml
+	$(PERL) $<
+
+euc_jp_to_utf8.map utf8_to_euc_jp.map: UCS_to_EUC_JP.pl CP932.TXT JIS0212.TXT
 	$(PERL) $<
 
-euc_cn_to_utf8.map utf8_to_euc_cn.map: UCS_to_EUC_CN.pl GB2312.TXT
+euc_cn_to_utf8.map utf8_to_euc_cn.map: UCS_to_EUC_CN.pl gb-18030-2000.xml
 	$(PERL) $<
 
 euc_kr_to_utf8.map utf8_to_euc_kr.map: UCS_to_EUC_KR.pl KSX1001.TXT
@@ -119,7 +125,7 @@ BIG5.TXT CNS11643.TXT:
 euc-jis-2004-std.txt sjis-0213-2004-std.txt:
 	$(DOWNLOAD) http://x0213.org/codetable/$(@F)
 
-gb-18030-2000.xml:
+gb-18030-2000.xml windows-949-2000.xml:
 	$(DOWNLOAD) https://ssl.icu-project.org/repos/icu/data/trunk/charset/data/xml/$(@F)
 
 GB2312.TXT:
@@ -137,7 +143,7 @@ KOI8-R.TXT KOI8-U.TXT:
 $(ISO8859TEXTS):
 	$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/ISO8859/$(@F)
 
-$(filter-out CP8%,$(WINTEXTS)):
+$(filter-out CP8%,$(WINTEXTS)) CP932.TXT CP950.TXT:
 	$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/$(@F)
 
 $(filter CP8%,$(WINTEXTS)):
 
@@ -25,56 +25,17 @@
 #		 # and Unicode name (not used in this script)
 
 
-require "ucs2utf.pl";
+require "convutils.pm";
 
+# Load BIG5.TXT
+my $all = &read_source("BIG5.TXT");
 
-#
-# first, generate UTF8 --> BIG5 table
-#
-$in_file = "BIG5.TXT";
-
-open(FILE, $in_file) || die("cannot open $in_file");
-
-reset 'array';
+# Load CP950.TXT
+my $cp950txt = &read_source("CP950.TXT");
 
-while (<FILE>)
-{
-	chop;
-	if (/^#/)
-	{
-		next;
-	}
-	($c, $u, $rest) = split;
-	$ucs  = hex($u);
-	$code = hex($c);
-	if ($code >= 0x80 && $ucs >= 0x0080)
-	{
-		$utf = &ucs2utf($ucs);
-		if ($array{$utf} ne "")
-		{
-			printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
-			next;
-		}
-		$count++;
-		$array{$utf} = $code;
-	}
-}
-close(FILE);
-
-$in_file = "CP950.TXT";
-
-open(FILE, $in_file) || die("cannot open $in_file");
-
-while (<FILE>)
-{
-	chop;
-	if (/^#/)
-	{
-		next;
-	}
-	($c, $u, $rest) = split;
-	$ucs  = hex($u);
-	$code = hex($c);
+foreach my $i (@$cp950txt) {
+	my $code = $i->{code};
+	my $ucs = $i->{ucs};
 
 	# Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc
 	# from CP950.TXT
@@ -83,126 +44,25 @@
 		&& $code >= 0xf9d6
 		&& $code <= 0xf9dc)
 	{
-		$utf = &ucs2utf($ucs);
-		if ($array{$utf} ne "")
-		{
-			printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
-			next;
-		}
-		$count++;
-		$array{$utf} = $code;
+		push @$all, {code => $code,
+					 ucs => $ucs,
+					 comment => $i->{comment},
+					 direction => "both"};
 	}
 }
-close(FILE);
-
-$file = lc("utf8_to_big5.map");
-open(FILE, "> $file") || die("cannot open $file");
-
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_utf_to_local ULmapBIG5[ $count ] = {\n";
-
-for $index (sort { $a <=> $b } keys(%array))
-{
-	$code = $array{$index};
-	$count--;
-	if ($count == 0)
-	{
-		printf FILE "  {0x%04x, 0x%04x}\n", $index, $code;
-	}
-	else
-	{
-		printf FILE "  {0x%04x, 0x%04x},\n", $index, $code;
-	}
-}
-
-print FILE "};\n";
-close(FILE);
-
-#
-# then generate BIG5 --> UTF8 table
-#
-$in_file = "BIG5.TXT";
 
-open(FILE, $in_file) || die("cannot open $in_file");
+foreach my $i (@$all) {
+	my $code = $i->{code};
+	my $ucs = $i->{ucs};
 
-reset 'array';
-
-while (<FILE>)
-{
-	chop;
-	if (/^#/)
-	{
-		next;
-	}
-	($c, $u, $rest) = split;
-	$ucs  = hex($u);
-	$code = hex($c);
-	if ($code >= 0x80 && $ucs >= 0x0080)
-	{
-		$utf = &ucs2utf($ucs);
-		if ($array{$utf} ne "")
-		{
-			printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
-			next;
-		}
-		$count++;
-		$array{$code} = $utf;
-	}
-}
-close(FILE);
-
-$in_file = "CP950.TXT";
-
-open(FILE, $in_file) || die("cannot open $in_file");
-
-while (<FILE>)
-{
-	chop;
-	if (/^#/)
-	{
-		next;
-	}
-	($c, $u, $rest) = split;
-	$ucs  = hex($u);
-	$code = hex($c);
-
-	# Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc
-	# from CP950.TXT
-	if (   $code >= 0x80
-		&& $ucs >= 0x0080
-		&& $code >= 0xf9d6
-		&& $code <= 0xf9dc)
-	{
-		$utf = &ucs2utf($ucs);
-		if ($array{$utf} ne "")
-		{
-			printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
-			next;
-		}
-		$count++;
-		$array{$code} = $utf;
-	}
-}
-close(FILE);
-
-$file = lc("big5_to_utf8.map");
-open(FILE, "> $file") || die("cannot open $file");
-
-print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
-print FILE "static const pg_local_to_utf LUmapBIG5[ $count ] = {\n";
-for $index (sort { $a <=> $b } keys(%array))
-{
-	$utf = $array{$index};
-	$count--;
-	if ($count == 0)
-	{
-		printf FILE "  {0x%04x, 0x%04x}\n", $index, $utf;
-	}
-	else
+	# BIG5.TXT maps several BIG5 characters to U+FFFD. The UTF-8 to BIG5 mapping can
+	# contain only one of them. XXX: It doesn't really make sense to include any of them,
+	# but for historical reasons we keep the first one (0xA15A) and make the rest to_unicode only.
+	if ($i->{ucs} == 0xFFFD && $i->{code} != 0xA15A)
 	{
-		printf FILE "  {0x%04x, 0x%04x},\n", $index, $utf;
+		$i->{direction} = "to_unicode";
 	}
 }
 
-print FILE "};\n";
-close(FILE);
+# Output
+print_tables("BIG5", $all);