Skip to content

Commit aeed17d

Browse files
committed
Use radix tree for character encoding conversions.
Replace the mapping tables used to convert between UTF-8 and other character encodings with new radix tree-based maps. Looking up an entry in a radix tree is much faster than a binary search in the old maps. As a bonus, the radix tree representation is also more compact, making the binaries slightly smaller. The "combined" maps work the same as before, with binary search. They are much smaller than the main tables, so it doesn't matter so much. However, the "combined" maps are now stored in the same .map files as the main tables. This seems clearer, since they're always used together, and generated from the same source files. Patch by Kyotaro Horiguchi, with a lot of hacking by me at various stages. Reviewed by Michael Paquier and Daniel Gustafsson. Discussion: https://www.postgresql.org/message-id/20170306.171609.204324917.horiguchi.kyotaro%40lab.ntt.co.jp
1 parent 8489269 commit aeed17d

File tree

111 files changed

+147742
-367346
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

111 files changed

+147742
-367346
lines changed

src/backend/utils/mb/Unicode/Makefile

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,8 @@ SPECIALMAPS = euc_cn_to_utf8.map utf8_to_euc_cn.map \
5252
big5_to_utf8.map utf8_to_big5.map \
5353
johab_to_utf8.map utf8_to_johab.map \
5454
uhc_to_utf8.map utf8_to_uhc.map \
55-
euc_jis_2004_to_utf8.map euc_jis_2004_to_utf8_combined.map \
56-
utf8_to_euc_jis_2004.map utf8_to_euc_jis_2004_combined.map \
57-
shift_jis_2004_to_utf8.map shift_jis_2004_to_utf8_combined.map \
58-
utf8_to_shift_jis_2004.map utf8_to_shift_jis_2004_combined.map
55+
euc_jis_2004_to_utf8.map utf8_to_euc_jis_2004.map \
56+
shift_jis_2004_to_utf8.map utf8_to_shift_jis_2004.map
5957

6058
MAPS = $(GENERICMAPS) $(SPECIALMAPS)
6159

@@ -104,10 +102,10 @@ gb18030_to_utf8.map utf8_to_gb18030.map: UCS_to_GB18030.pl gb-18030-2000.xml
104102
big5_to_utf8.map utf8_to_big5.map: UCS_to_BIG5.pl BIG5.TXT CP950.TXT
105103
$(PERL) $<
106104

107-
euc_jis_2004_to_utf8.map euc_jis_2004_to_utf8_combined.map utf8_to_euc_jis_2004.map utf8_to_euc_jis_2004_combined.map: UCS_to_EUC_JIS_2004.pl euc-jis-2004-std.txt
105+
euc_jis_2004_to_utf8.map utf8_to_euc_jis_2004.map: UCS_to_EUC_JIS_2004.pl euc-jis-2004-std.txt
108106
$(PERL) $<
109107

110-
shift_jis_2004_to_utf8.map shift_jis_2004_to_utf8_combined.map utf8_to_shift_jis_2004.map utf8_to_shift_jis_2004_combined.map: UCS_to_SHIFT_JIS_2004.pl sjis-0213-2004-std.txt
108+
shift_jis_2004_to_utf8.map utf8_to_shift_jis_2004.map: UCS_to_SHIFT_JIS_2004.pl sjis-0213-2004-std.txt
111109
$(PERL) $<
112110

113111
distclean: clean

src/backend/utils/mb/Unicode/UCS_to_BIG5.pl

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,9 @@
2525
# # and Unicode name (not used in this script)
2626

2727
use strict;
28-
require convutils;
28+
use convutils;
29+
30+
my $this_script = $0;
2931

3032
# Load BIG5.TXT
3133
my $all = &read_source("BIG5.TXT");
@@ -47,7 +49,9 @@
4749
push @$all, {code => $code,
4850
ucs => $ucs,
4951
comment => $i->{comment},
50-
direction => "both"};
52+
direction => BOTH,
53+
f => $i->{f},
54+
l => $i->{l} };
5155
}
5256
}
5357

@@ -60,9 +64,9 @@
6064
# but for historical reasons, we map the first one of them.
6165
if ($i->{ucs} == 0xFFFD && $i->{code} != 0xA15A)
6266
{
63-
$i->{direction} = "to_unicode";
67+
$i->{direction} = TO_UNICODE;
6468
}
6569
}
6670

6771
# Output
68-
print_tables("BIG5", $all);
72+
print_conversion_tables($this_script, "BIG5", $all);

src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@
1414
# and the "b" field is the hex byte sequence for GB18030
1515

1616
use strict;
17-
require convutils;
17+
use convutils;
18+
19+
my $this_script = $0;
1820

1921
# Read the input
2022

@@ -68,9 +70,11 @@
6870
push @mapping, {
6971
ucs => $ucs,
7072
code => $code,
71-
direction => 'both'
73+
direction => BOTH,
74+
f => $in_file,
75+
l => $.
7276
};
7377
}
7478
close($in);
7579

76-
print_tables("EUC_CN", \@mapping);
80+
print_conversion_tables($this_script, "EUC_CN", \@mapping);

src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@
88
# "euc-jis-2004-std.txt" (http://x0213.org)
99

1010
use strict;
11-
require convutils;
11+
use convutils;
12+
13+
my $this_script = $0;
1214

1315
# first generate UTF-8 --> EUC_JIS_2004 table
1416

@@ -29,12 +31,14 @@
2931
my $ucs1 = hex($u1);
3032
my $ucs2 = hex($u2);
3133

32-
push @all, { direction => 'both',
34+
push @all, { direction => BOTH,
3335
ucs => $ucs1,
3436
ucs_second => $ucs2,
3537
code => $code,
36-
comment => $rest };
37-
next;
38+
comment => $rest,
39+
f => $in_file,
40+
l => $.
41+
};
3842
}
3943
elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
4044
{
@@ -45,9 +49,15 @@
4549

4650
next if ($code < 0x80 && $ucs < 0x80);
4751

48-
push @all, { direction => 'both', ucs => $ucs, code => $code, comment => $rest };
52+
push @all, { direction => BOTH,
53+
ucs => $ucs,
54+
code => $code,
55+
comment => $rest,
56+
f => $in_file,
57+
l => $.
58+
};
4959
}
5060
}
5161
close($in);
5262

53-
print_tables("EUC_JIS_2004", \@all, 1);
63+
print_conversion_tables($this_script, "EUC_JIS_2004", \@all);

src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl

Lines changed: 96 additions & 93 deletions
Large diffs are not rendered by default.

src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@
1717
# # and Unicode name (not used in this script)
1818

1919
use strict;
20-
require convutils;
20+
use convutils;
21+
22+
my $this_script = $0;
2123

2224
# Load the source file.
2325

@@ -29,10 +31,10 @@
2931
}
3032

3133
# Some extra characters that are not in KSX1001.TXT
32-
push @$mapping, (
33-
{direction => 'both', ucs => 0x20AC, code => 0xa2e6, comment => '# EURO SIGN'},
34-
{direction => 'both', ucs => 0x00AE, code => 0xa2e7, comment => '# REGISTERED SIGN'},
35-
{direction => 'both', ucs => 0x327E, code => 0xa2e8, comment => '# CIRCLED HANGUL IEUNG U'}
34+
push @$mapping,(
35+
{direction => BOTH, ucs => 0x20AC, code => 0xa2e6, comment => '# EURO SIGN', f => $this_script, l => __LINE__},
36+
{direction => BOTH, ucs => 0x00AE, code => 0xa2e7, comment => '# REGISTERED SIGN', f => $this_script, l => __LINE__ },
37+
{direction => BOTH, ucs => 0x327E, code => 0xa2e8, comment => '# CIRCLED HANGUL IEUNG U', f => $this_script, l => __LINE__ }
3638
);
3739

38-
print_tables("EUC_KR", $mapping);
40+
print_conversion_tables($this_script, "EUC_KR", $mapping);

src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@
1818
# # and Unicode name (not used in this script)
1919

2020
use strict;
21-
require convutils;
21+
use convutils;
22+
23+
my $this_script = $0;
2224

2325
my $mapping = &read_source("CNS11643.TXT");
2426

@@ -54,11 +56,13 @@
5456
ucs => $i->{ucs},
5557
code => ($i->{code} + 0x8ea10000),
5658
rest => $i->{rest},
57-
direction => 'to_unicode'
59+
direction => TO_UNICODE,
60+
f => $i->{f},
61+
l => $i->{l}
5862
};
5963
}
6064
}
6165

6266
push @$mapping, @extras;
6367

64-
print_tables("EUC_TW", $mapping);
68+
print_conversion_tables($this_script, "EUC_TW", $mapping);

src/backend/utils/mb/Unicode/UCS_to_GB18030.pl

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@
1414
# and the "b" field is the hex byte sequence for GB18030
1515

1616
use strict;
17-
require convutils;
17+
use convutils;
18+
19+
my $this_script = $0;
1820

1921
# Read the input
2022

@@ -36,10 +38,12 @@
3638
push @mapping, {
3739
ucs => $ucs,
3840
code => $code,
39-
direction => 'both'
41+
direction => BOTH,
42+
f => $in_file,
43+
l => $.
4044
};
4145
}
4246
}
4347
close($in);
4448

45-
print_tables("GB18030", \@mapping);
49+
print_conversion_tables($this_script, "GB18030", \@mapping);

src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,19 @@
1616
# # and Unicode name (not used in this script)
1717

1818
use strict;
19-
require convutils;
19+
use convutils;
20+
21+
my $this_script = $0;
2022

2123
# Load the source file.
2224

2325
my $mapping = &read_source("JOHAB.TXT");
2426

2527
# Some extra characters that are not in JOHAB.TXT
2628
push @$mapping, (
27-
{direction => 'both', ucs => 0x20AC, code => 0xd9e6, comment => '# EURO SIGN'},
28-
{direction => 'both', ucs => 0x00AE, code => 0xd9e7, comment => '# REGISTERED SIGN'},
29-
{direction => 'both', ucs => 0x327E, code => 0xd9e8, comment => '# CIRCLED HANGUL IEUNG U'}
29+
{direction => BOTH, ucs => 0x20AC, code => 0xd9e6, comment => '# EURO SIGN', f => $this_script, l => __LINE__ },
30+
{direction => BOTH, ucs => 0x00AE, code => 0xd9e7, comment => '# REGISTERED SIGN', f => $this_script, l => __LINE__ },
31+
{direction => BOTH, ucs => 0x327E, code => 0xd9e8, comment => '# CIRCLED HANGUL IEUNG U', f => $this_script, l => __LINE__ }
3032
);
3133

32-
print_tables("JOHAB", $mapping);
34+
print_conversion_tables($this_script, "JOHAB", $mapping);

src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,12 @@
88
# "sjis-0213-2004-std.txt" (http://x0213.org)
99

1010
use strict;
11-
require convutils;
11+
use convutils;
1212

1313
# first generate UTF-8 --> SHIFT_JIS_2004 table
1414

15+
my $this_script = $0;
16+
1517
my $in_file = "sjis-0213-2004-std.txt";
1618

1719
open(my $in, '<', $in_file) || die("cannot open $in_file");
@@ -34,9 +36,10 @@
3436
ucs => $ucs1,
3537
ucs_second => $ucs2,
3638
comment => $rest,
37-
direction => 'both'
39+
direction => BOTH,
40+
f => $in_file,
41+
l => $.
3842
};
39-
next;
4043
}
4144
elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
4245
{
@@ -52,25 +55,27 @@
5255
}
5356
elsif ($code < 0x80)
5457
{
55-
$direction = 'from_unicode';
58+
$direction = FROM_UNICODE;
5659
}
5760
elsif ($ucs < 0x80)
5861
{
59-
$direction = 'to_unicode';
62+
$direction = TO_UNICODE;
6063
}
6164
else
6265
{
63-
$direction = 'both';
66+
$direction = BOTH;
6467
}
6568

6669
push @mapping, {
6770
code => $code,
6871
ucs => $ucs,
6972
comment => $rest,
70-
direction => $direction
73+
direction => $direction,
74+
f => $in_file,
75+
l => $.
7176
};
7277
}
7378
}
7479
close($in);
7580

76-
print_tables("SHIFT_JIS_2004", \@mapping, 1);
81+
print_conversion_tables($this_script, "SHIFT_JIS_2004", \@mapping);

src/backend/utils/mb/Unicode/UCS_to_SJIS.pl

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,11 @@
1111
# ftp site.
1212

1313
use strict;
14-
require convutils;
14+
use convutils;
1515

16-
my $charset = read_source("CP932.TXT");
16+
my $this_script = $0;
17+
18+
my $mapping = read_source("CP932.TXT");
1719

1820
# Drop these SJIS codes from the source for UTF8=>SJIS conversion
1921
my @reject_sjis =(
@@ -22,27 +24,27 @@
2224
0x879a..0x879c
2325
);
2426

25-
foreach my $i (@$charset)
27+
foreach my $i (@$mapping)
2628
{
2729
my $code = $i->{code};
2830
my $ucs = $i->{ucs};
2931

3032
if (grep {$code == $_} @reject_sjis)
3133
{
32-
$i->{direction} = "to_unicode";
34+
$i->{direction} = TO_UNICODE;
3335
}
3436
}
3537

3638
# Add these UTF8->SJIS pairs to the table.
37-
push @$charset, (
38-
{direction => "from_unicode", ucs => 0x00a2, code => 0x8191, comment => '# CENT SIGN'},
39-
{direction => "from_unicode", ucs => 0x00a3, code => 0x8192, comment => '# POUND SIGN'},
40-
{direction => "from_unicode", ucs => 0x00a5, code => 0x5c, comment => '# YEN SIGN'},
41-
{direction => "from_unicode", ucs => 0x00ac, code => 0x81ca, comment => '# NOT SIGN'},
42-
{direction => "from_unicode", ucs => 0x2016, code => 0x8161, comment => '# DOUBLE VERTICAL LINE'},
43-
{direction => "from_unicode", ucs => 0x203e, code => 0x7e, comment => '# OVERLINE'},
44-
{direction => "from_unicode", ucs => 0x2212, code => 0x817c, comment => '# MINUS SIGN'},
45-
{direction => "from_unicode", ucs => 0x301c, code => 0x8160, comment => '# WAVE DASH'}
46-
);
39+
push @$mapping, (
40+
{direction => FROM_UNICODE, ucs => 0x00a2, code => 0x8191, comment => '# CENT SIGN', f => $this_script, l => __LINE__ },
41+
{direction => FROM_UNICODE, ucs => 0x00a3, code => 0x8192, comment => '# POUND SIGN', f => $this_script, l => __LINE__ },
42+
{direction => FROM_UNICODE, ucs => 0x00a5, code => 0x5c, comment => '# YEN SIGN', f => $this_script, l => __LINE__ },
43+
{direction => FROM_UNICODE, ucs => 0x00ac, code => 0x81ca, comment => '# NOT SIGN', f => $this_script, l => __LINE__ },
44+
{direction => FROM_UNICODE, ucs => 0x2016, code => 0x8161, comment => '# DOUBLE VERTICAL LINE', f => $this_script, l => __LINE__ },
45+
{direction => FROM_UNICODE, ucs => 0x203e, code => 0x7e, comment => '# OVERLINE', f => $this_script, l => __LINE__ },
46+
{direction => FROM_UNICODE, ucs => 0x2212, code => 0x817c, comment => '# MINUS SIGN', f => $this_script, l => __LINE__ },
47+
{direction => FROM_UNICODE, ucs => 0x301c, code => 0x8160, comment => '# WAVE DASH', f => $this_script, l => __LINE__ }
48+
);
4749

48-
print_tables("SJIS", $charset);
50+
print_conversion_tables($this_script, "SJIS", $mapping);

src/backend/utils/mb/Unicode/UCS_to_UHC.pl

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@
1414
# and the "b" field is the hex byte sequence for UHC
1515

1616
use strict;
17-
require convutils;
17+
use convutils;
18+
19+
my $this_script = $0;
1820

1921
# Read the input
2022

@@ -39,13 +41,15 @@
3941
push @mapping, {
4042
ucs => $ucs,
4143
code => $code,
42-
direction => 'both'
44+
direction => BOTH,
45+
f => $in_file,
46+
l => $.
4347
};
4448
}
4549
}
4650
close($in);
4751

4852
# One extra character that's not in the source file.
49-
push @mapping, { direction => 'both', code => 0xa2e8, ucs => 0x327e, comment => 'CIRCLED HANGUL IEUNG U' };
53+
push @mapping, { direction => BOTH, code => 0xa2e8, ucs => 0x327e, comment => 'CIRCLED HANGUL IEUNG U', f => $this_script, l => __LINE__ };
5054

51-
print_tables("UHC", \@mapping);
55+
print_conversion_tables($this_script, "UHC", \@mapping);

src/backend/utils/mb/Unicode/UCS_to_most.pl

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@
1616
# # and Unicode name (not used in this script)
1717

1818
use strict;
19-
require convutils;
19+
use convutils;
20+
21+
my $this_script = $0;
2022

2123
my %filename = (
2224
'WIN866' => 'CP866.TXT',
@@ -54,5 +56,5 @@
5456
{
5557
my $mapping = &read_source($filename{$charset});
5658

57-
print_tables($charset, $mapping);
59+
print_conversion_tables($this_script, $charset, $mapping);
5860
}

0 commit comments

Comments
 (0)