Skip to content

Commit 783f0cc

Browse files
committed
Improve performance of Unicode {de,re}composition in the backend
This replaces the existing binary search with two perfect hash functions for the composition and the decomposition in the backend code, at the cost of slightly-larger binaries there (35kB in libpgcommon_srv.a). Per the measurements done, this improves the speed of the recomposition and decomposition by up to 30~40 times for the NFC and NFKC conversions, while all other operations get at least 40% faster. This is not as "good" as what libicu has, but it closes the gap a lot as per the feedback from Daniel Verite. The decomposition table remains the same, getting used for the binary search in the frontend code, where we care more about the size of the libraries like libpq over performance as this gets involved only in code paths related to the SCRAM authentication. In consequence, note that the perfect hash function for the recomposition needs to use a new inverse lookup array back to the existing decomposition table. The size of all frontend deliverables remains unchanged, even with --enable-debug, including libpq. Author: John Naylor Reviewed-by: Michael Paquier, Tom Lane Discussion: https://postgr.es/m/CAFBsxsHUuMFCt6-pU+oG-F1==CmEp8wR+O+bRouXWu6i8kXuqA@mail.gmail.com
1 parent 7d6d6bc commit 783f0cc

File tree

5 files changed

+3227
-44
lines changed

5 files changed

+3227
-44
lines changed

src/common/unicode/Makefile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ LIBS += $(PTHREAD_LIBS)
1818
# By default, do nothing.
1919
all:
2020

21-
update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h
21+
update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
2222
mv $^ ../../../src/include/common/
2323
$(MAKE) normalization-check
2424

@@ -30,6 +30,8 @@ UnicodeData.txt DerivedNormalizationProps.txt CompositionExclusions.txt Normaliz
3030

3131
# Generation of conversion tables used for string normalization with
3232
# UTF-8 strings.
33+
unicode_norm_hashfunc.h: unicode_norm_table.h
34+
3335
unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt CompositionExclusions.txt
3436
$(PERL) generate-unicode_norm_table.pl
3537

src/common/unicode/generate-unicode_norm_table.pl

Lines changed: 199 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,22 @@
11
#!/usr/bin/perl
22
#
3-
# Generate a composition table, using Unicode data files as input
3+
# Generate a composition table and its lookup utilities, using Unicode data
4+
# files as input.
45
#
56
# Input: UnicodeData.txt and CompositionExclusions.txt
6-
# Output: unicode_norm_table.h
7+
# Output: unicode_norm_table.h and unicode_norm_hashfunc.h
78
#
89
# Copyright (c) 2000-2020, PostgreSQL Global Development Group
910

1011
use strict;
1112
use warnings;
1213

13-
my $output_file = "unicode_norm_table.h";
14+
use FindBin;
15+
use lib "$FindBin::RealBin/../../tools/";
16+
use PerfectHash;
17+
18+
my $output_table_file = "unicode_norm_table.h";
19+
my $output_func_file = "unicode_norm_hashfunc.h";
1420

1521
my $FH;
1622

@@ -64,11 +70,13 @@
6470

6571
my $num_characters = scalar @characters;
6672

67-
# Start writing out the output file
68-
open my $OUTPUT, '>', $output_file
69-
or die "Could not open output file $output_file: $!\n";
73+
# Start writing out the output files
74+
open my $OT, '>', $output_table_file
75+
or die "Could not open output file $output_table_file: $!\n";
76+
open my $OF, '>', $output_func_file
77+
or die "Could not open output file $output_func_file: $!\n";
7078

71-
print $OUTPUT <<HEADER;
79+
print $OT <<HEADER;
7280
/*-------------------------------------------------------------------------
7381
*
7482
* unicode_norm_table.h
@@ -111,8 +119,53 @@
111119
{
112120
HEADER
113121

122+
print $OF <<HEADER;
123+
/*-------------------------------------------------------------------------
124+
*
125+
* unicode_norm_hashfunc.h
126+
* Perfect hash functions used for Unicode normalization
127+
*
128+
* Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
129+
* Portions Copyright (c) 1994, Regents of the University of California
130+
*
131+
* src/include/common/unicode_norm_hashfunc.h
132+
*
133+
*-------------------------------------------------------------------------
134+
*/
135+
136+
/*
137+
* File auto-generated by src/common/unicode/generate-unicode_norm_table.pl,
138+
* do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_HASHFUNC_H
139+
* here.
140+
*/
141+
142+
#include "common/unicode_norm_table.h"
143+
144+
/* Typedef for perfect hash functions */
145+
typedef int (*cp_hash_func) (const void *key);
146+
147+
/* Information for lookups with perfect hash functions */
148+
typedef struct
149+
{
150+
const pg_unicode_decomposition *decomps;
151+
cp_hash_func hash;
152+
int num_decomps;
153+
} pg_unicode_decompinfo;
154+
155+
typedef struct
156+
{
157+
const uint16 *inverse_lookup;
158+
cp_hash_func hash;
159+
int num_recomps;
160+
} pg_unicode_recompinfo;
161+
162+
HEADER
163+
114164
my $decomp_index = 0;
115165
my $decomp_string = "";
166+
my @dec_cp_packed;
167+
my $main_index = 0;
168+
my @rec_info;
116169

117170
my $last_code = $characters[-1]->{code};
118171
foreach my $char (@characters)
@@ -121,6 +174,9 @@
121174
my $class = $char->{class};
122175
my $decomp = $char->{decomp};
123176

177+
# Save the code point bytes as a string in network order.
178+
push @dec_cp_packed, pack('N', hex($char->{code}));
179+
124180
# The character decomposition mapping field in UnicodeData.txt is a list
125181
# of unicode codepoints, separated by space. But it can be prefixed with
126182
# so-called compatibility formatting tag, like "<compat>", or "<font>".
@@ -163,32 +219,46 @@
163219
{
164220
foreach my $lcode (@composition_exclusion_codes)
165221
{
166-
if ($lcode eq $char->{code})
222+
if ($lcode eq $code)
167223
{
168224
$flags .= " | DECOMP_NO_COMPOSE";
169225
$comment = "in exclusion list";
170226
last;
171227
}
172228
}
173229
}
230+
231+
# Save info for recomposeable codepoints.
232+
# Note that this MUST match the macro DECOMPOSITION_NO_COMPOSE in C
233+
# above! See also the inverse lookup in recompose_code() found in
234+
# src/common/unicode_norm.c.
235+
if (!($flags =~ /DECOMP_COMPAT/ || $flags =~ /DECOMP_NO_COMPOSE/))
236+
{
237+
push @rec_info,
238+
{
239+
code => $code,
240+
main_index => $main_index,
241+
first => $first_decomp,
242+
second => $decomp_elts[0]
243+
};
244+
}
174245
}
175246

176247
if ($decomp_size == 0)
177248
{
178-
print $OUTPUT "\t{0x$code, $class, 0$flags, 0}";
249+
print $OT "\t{0x$code, $class, 0$flags, 0}";
179250
}
180251
elsif ($decomp_size == 1 && length($first_decomp) <= 4)
181252
{
182253

183254
# The decomposition consists of a single codepoint, and it fits
184255
# in a uint16, so we can store it "inline" in the main table.
185256
$flags .= " | DECOMP_INLINE";
186-
print $OUTPUT "\t{0x$code, $class, 1$flags, 0x$first_decomp}";
257+
print $OT "\t{0x$code, $class, 1$flags, 0x$first_decomp}";
187258
}
188259
else
189260
{
190-
print $OUTPUT
191-
"\t{0x$code, $class, $decomp_size$flags, $decomp_index}";
261+
print $OT "\t{0x$code, $class, $decomp_size$flags, $decomp_index}";
192262

193263
# Now save the decompositions into a dedicated area that will
194264
# be written afterwards. First build the entry dedicated to
@@ -205,30 +275,132 @@
205275
}
206276

207277
# Print a comma after all items except the last one.
208-
print $OUTPUT "," unless ($code eq $last_code);
209-
if ($comment ne "")
210-
{
278+
print $OT "," unless ($code eq $last_code);
211279

212-
# If the line is wide already, indent the comment with one tab,
213-
# otherwise with two. This is to make the output match the way
214-
# pgindent would mangle it. (This is quite hacky. To do this
215-
# properly, we should actually track how long the line is so far,
216-
# but this works for now.)
217-
print $OUTPUT "\t" if ($decomp_index < 10);
280+
print $OT "\t/* $comment */" if ($comment ne "");
281+
print $OT "\n";
218282

219-
print $OUTPUT "\t/* $comment */" if ($comment ne "");
220-
}
221-
print $OUTPUT "\n";
283+
$main_index++;
222284
}
223-
print $OUTPUT "\n};\n\n";
285+
print $OT "\n};\n\n";
224286

225287
# Print the array of decomposed codes.
226-
print $OUTPUT <<HEADER;
288+
print $OT <<HEADER;
227289
/* codepoints array */
228290
static const uint32 UnicodeDecomp_codepoints[$decomp_index] =
229291
{
230292
$decomp_string
231293
};
232294
HEADER
233295

234-
close $OUTPUT;
296+
# Emit the definition of the decomp hash function.
297+
my $dec_funcname = 'Decomp_hash_func';
298+
my $dec_func = PerfectHash::generate_hash_function(\@dec_cp_packed,
299+
$dec_funcname, fixed_key_length => 4);
300+
print $OF "/* Perfect hash function for decomposition */\n";
301+
print $OF "static $dec_func\n";
302+
303+
# Emit the structure that wraps the hash lookup information into
304+
# one variable.
305+
print $OF <<HEADER;
306+
/* Hash lookup information for decomposition */
307+
static const pg_unicode_decompinfo UnicodeDecompInfo =
308+
{
309+
UnicodeDecompMain,
310+
$dec_funcname,
311+
$num_characters
312+
};
313+
314+
HEADER
315+
316+
# Find the lowest codepoint that decomposes to each recomposeable
317+
# code pair and create a mapping to it.
318+
my $recomp_string = "";
319+
my @rec_cp_packed;
320+
my %seenit;
321+
my $firstentry = 1;
322+
foreach my $rec (sort recomp_sort @rec_info)
323+
{
324+
# The hash key is formed by concatenating the bytes of the two
325+
# codepoints. See also recompose_code() in common/unicode_norm.c.
326+
my $hashkey = (hex($rec->{first}) << 32) | hex($rec->{second});
327+
328+
# We are only interested in the lowest code point that decomposes
329+
# to the given code pair.
330+
next if $seenit{$hashkey};
331+
332+
# Save the hash key bytes in network order
333+
push @rec_cp_packed, pack('Q>', $hashkey);
334+
335+
# Append inverse lookup element
336+
$recomp_string .= ",\n" if !$firstentry;
337+
$recomp_string .= sprintf "\t/* U+%s+%s -> U+%s */ %s",
338+
$rec->{first},
339+
$rec->{second},
340+
$rec->{code},
341+
$rec->{main_index};
342+
343+
$seenit{$hashkey} = 1;
344+
$firstentry = 0;
345+
}
346+
347+
# Emit the inverse lookup array containing indexes into UnicodeDecompMain.
348+
my $num_recomps = scalar @rec_cp_packed;
349+
print $OF <<HEADER;
350+
/* Inverse lookup array -- contains indexes into UnicodeDecompMain[] */
351+
static const uint16 RecompInverseLookup[$num_recomps] =
352+
{
353+
$recomp_string
354+
};
355+
356+
HEADER
357+
358+
# Emit the definition of the recomposition hash function.
359+
my $rec_funcname = 'Recomp_hash_func';
360+
my $rec_func =
361+
PerfectHash::generate_hash_function(\@rec_cp_packed, $rec_funcname,
362+
fixed_key_length => 8);
363+
print $OF "/* Perfect hash function for recomposition */\n";
364+
print $OF "static $rec_func\n";
365+
366+
# Emit the structure that wraps the hash lookup information into
367+
# one variable.
368+
print $OF <<HEADER;
369+
/* Hash lookup information for recomposition */
370+
static const pg_unicode_recompinfo UnicodeRecompInfo =
371+
{
372+
RecompInverseLookup,
373+
$rec_funcname,
374+
$num_recomps
375+
};
376+
HEADER
377+
378+
close $OT;
379+
close $OF;
380+
381+
# Comparator for sorting @rec_info entries via Perl's sort(), which
# passes the two operands in the package globals $a and $b.
#
# Orders entries by first code point, then by second code point, and
# finally by the composed code point itself.  The caller walks the
# sorted list and keeps only the first (i.e. lowest) composed code
# point seen for each (first, second) pair, so this ordering is what
# guarantees "lowest code point wins".  Two entries with identical
# (first, second, code) would be a data bug, hence the die.
sub recomp_sort
{
	my $a1 = hex($a->{first});
	my $b1 = hex($b->{first});

	my $a2 = hex($a->{second});
	my $b2 = hex($b->{second});

	# First sort by the first code point
	return -1 if $a1 < $b1;
	return 1 if $a1 > $b1;

	# Then sort by the second code point
	return -1 if $a2 < $b2;
	return 1 if $a2 > $b2;

	# Finally sort by the code point that decomposes into first and
	# second ones.  (Bug fix: the greater-than branch previously
	# returned -1 as well, which violated the comparator contract and
	# could put the higher composed code point first for equal pairs.)
	my $acode = hex($a->{code});
	my $bcode = hex($b->{code});

	return -1 if $acode < $bcode;
	return 1 if $acode > $bcode;

	die "found duplicate entries of recomposeable code pairs";
}

0 commit comments

Comments
 (0)