Skip to content

Commit 783f0cc

Browse files
committed
Improve performance of Unicode {de,re}composition in the backend
This replaces the existing binary search with two perfect hash functions for the composition and the decomposition in the backend code, at the cost of slightly-larger binaries there (35kB in libpgcommon_srv.a). Per the measurements done, this improves the speed of the recomposition and decomposition by up to 30~40 times for the NFC and NFKC conversions, while all other operations get at least 40% faster. This is not as "good" as what libicu has, but it closes the gap a lot as per the feedback from Daniel Verite. The decomposition table remains the same, getting used for the binary search in the frontend code, where we care more about the size of the libraries like libpq over performance as this gets involved only in code paths related to the SCRAM authentication. In consequence, note that the perfect hash function for the recomposition needs to use a new inverse lookup array back to the existing decomposition table. The size of all frontend deliverables remains unchanged, even with --enable-debug, including libpq. Author: John Naylor Reviewed-by: Michael Paquier, Tom Lane Discussion: https://postgr.es/m/CAFBsxsHUuMFCt6-pU+oG-F1==CmEp8wR+O+bRouXWu6i8kXuqA@mail.gmail.com
1 parent 7d6d6bc commit 783f0cc

File tree

5 files changed

+3227
-44
lines changed

5 files changed

+3227
-44
lines changed

src/common/unicode/Makefile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ LIBS += $(PTHREAD_LIBS)
1818
# By default, do nothing.
1919
all:
2020

21-
update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h
21+
update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
2222
mv $^ ../../../src/include/common/
2323
$(MAKE) normalization-check
2424

@@ -30,6 +30,8 @@ UnicodeData.txt DerivedNormalizationProps.txt CompositionExclusions.txt Normaliz
3030

3131
# Generation of conversion tables used for string normalization with
3232
# UTF-8 strings.
33+
unicode_norm_hashfunc.h: unicode_norm_table.h
34+
3335
unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt CompositionExclusions.txt
3436
$(PERL) generate-unicode_norm_table.pl
3537

src/common/unicode/generate-unicode_norm_table.pl

Lines changed: 199 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,22 @@
11
#!/usr/bin/perl
22
#
3-
# Generate a composition table, using Unicode data files as input
3+
# Generate a composition table and its lookup utilities, using Unicode data
4+
# files as input.
45
#
56
# Input: UnicodeData.txt and CompositionExclusions.txt
6-
# Output: unicode_norm_table.h
7+
# Output: unicode_norm_table.h and unicode_norm_hashfunc.h
78
#
89
# Copyright (c) 2000-2020, PostgreSQL Global Development Group
910

1011
use strict;
1112
use warnings;
1213

13-
my $output_file = "unicode_norm_table.h";
14+
use FindBin;
15+
use lib "$FindBin::RealBin/../../tools/";
16+
use PerfectHash;
17+
18+
my $output_table_file = "unicode_norm_table.h";
19+
my $output_func_file = "unicode_norm_hashfunc.h";
1420

1521
my $FH;
1622

@@ -64,11 +70,13 @@
6470

6571
my $num_characters = scalar @characters;
6672

67-
# Start writing out the output file
68-
open my $OUTPUT, '>', $output_file
69-
or die "Could not open output file $output_file: $!\n";
73+
# Start writing out the output files
74+
open my $OT, '>', $output_table_file
75+
or die "Could not open output file $output_table_file: $!\n";
76+
open my $OF, '>', $output_func_file
77+
or die "Could not open output file $output_func_file: $!\n";
7078

71-
print $OUTPUT <<HEADER;
79+
print $OT <<HEADER;
7280
/*-------------------------------------------------------------------------
7381
*
7482
* unicode_norm_table.h
@@ -111,8 +119,53 @@
111119
{
112120
HEADER
113121

122+
print $OF <<HEADER;
123+
/*-------------------------------------------------------------------------
124+
*
125+
* unicode_norm_hashfunc.h
126+
* Perfect hash functions used for Unicode normalization
127+
*
128+
* Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
129+
* Portions Copyright (c) 1994, Regents of the University of California
130+
*
131+
* src/include/common/unicode_norm_hashfunc.h
132+
*
133+
*-------------------------------------------------------------------------
134+
*/
135+
136+
/*
137+
* File auto-generated by src/common/unicode/generate-unicode_norm_table.pl,
138+
* do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_HASHFUNC_H
139+
* here.
140+
*/
141+
142+
#include "common/unicode_norm_table.h"
143+
144+
/* Typedef for perfect hash functions */
145+
typedef int (*cp_hash_func) (const void *key);
146+
147+
/* Information for lookups with perfect hash functions */
148+
typedef struct
149+
{
150+
const pg_unicode_decomposition *decomps;
151+
cp_hash_func hash;
152+
int num_decomps;
153+
} pg_unicode_decompinfo;
154+
155+
typedef struct
156+
{
157+
const uint16 *inverse_lookup;
158+
cp_hash_func hash;
159+
int num_recomps;
160+
} pg_unicode_recompinfo;
161+
162+
HEADER
163+
114164
my $decomp_index = 0;
115165
my $decomp_string = "";
166+
my @dec_cp_packed;
167+
my $main_index = 0;
168+
my @rec_info;
116169

117170
my $last_code = $characters[-1]->{code};
118171
foreach my $char (@characters)
@@ -121,6 +174,9 @@
121174
my $class = $char->{class};
122175
my $decomp = $char->{decomp};
123176

177+
# Save the code point bytes as a string in network order.
178+
push @dec_cp_packed, pack('N', hex($char->{code}));
179+
124180
# The character decomposition mapping field in UnicodeData.txt is a list
125181
# of unicode codepoints, separated by space. But it can be prefixed with
126182
# so-called compatibility formatting tag, like "<compat>", or "<font>".
@@ -163,32 +219,46 @@
163219
{
164220
foreach my $lcode (@composition_exclusion_codes)
165221
{
166-
if ($lcode eq $char->{code})
222+
if ($lcode eq $code)
167223
{
168224
$flags .= " | DECOMP_NO_COMPOSE";
169225
$comment = "in exclusion list";
170226
last;
171227
}
172228
}
173229
}
230+
231+
# Save info for recomposeable codepoints.
232+
# Note that this MUST match the macro DECOMPOSITION_NO_COMPOSE in C
233+
# above! See also the inverse lookup in recompose_code() found in
234+
# src/common/unicode_norm.c.
235+
if (!($flags =~ /DECOMP_COMPAT/ || $flags =~ /DECOMP_NO_COMPOSE/))
236+
{
237+
push @rec_info,
238+
{
239+
code => $code,
240+
main_index => $main_index,
241+
first => $first_decomp,
242+
second => $decomp_elts[0]
243+
};
244+
}
174245
}
175246

176247
if ($decomp_size == 0)
177248
{
178-
print $OUTPUT "\t{0x$code, $class, 0$flags, 0}";
249+
print $OT "\t{0x$code, $class, 0$flags, 0}";
179250
}
180251
elsif ($decomp_size == 1 && length($first_decomp) <= 4)
181252
{
182253

183254
# The decomposition consists of a single codepoint, and it fits
184255
# in a uint16, so we can store it "inline" in the main table.
185256
$flags .= " | DECOMP_INLINE";
186-
print $OUTPUT "\t{0x$code, $class, 1$flags, 0x$first_decomp}";
257+
print $OT "\t{0x$code, $class, 1$flags, 0x$first_decomp}";
187258
}
188259
else
189260
{
190-
print $OUTPUT
191-
"\t{0x$code, $class, $decomp_size$flags, $decomp_index}";
261+
print $OT "\t{0x$code, $class, $decomp_size$flags, $decomp_index}";
192262

193263
# Now save the decompositions into a dedicated area that will
194264
# be written afterwards. First build the entry dedicated to
@@ -205,30 +275,132 @@
205275
}
206276

207277
# Print a comma after all items except the last one.
208-
print $OUTPUT "," unless ($code eq $last_code);
209-
if ($comment ne "")
210-
{
278+
print $OT "," unless ($code eq $last_code);
211279

212-
# If the line is wide already, indent the comment with one tab,
213-
# otherwise with two. This is to make the output match the way
214-
# pgindent would mangle it. (This is quite hacky. To do this
215-
# properly, we should actually track how long the line is so far,
216-
# but this works for now.)
217-
print $OUTPUT "\t" if ($decomp_index < 10);
280+
print $OT "\t/* $comment */" if ($comment ne "");
281+
print $OT "\n";
218282

219-
print $OUTPUT "\t/* $comment */" if ($comment ne "");
220-
}
221-
print $OUTPUT "\n";
283+
$main_index++;
222284
}
223-
print $OUTPUT "\n};\n\n";
285+
print $OT "\n};\n\n";
224286

225287
# Print the array of decomposed codes.
226-
print $OUTPUT <<HEADER;
288+
print $OT <<HEADER;
227289
/* codepoints array */
228290
static const uint32 UnicodeDecomp_codepoints[$decomp_index] =
229291
{
230292
$decomp_string
231293
};
232294
HEADER
233295

234-
close $OUTPUT;
296+
# Emit the definition of the decomp hash function.
297+
my $dec_funcname = 'Decomp_hash_func';
298+
my $dec_func = PerfectHash::generate_hash_function(\@dec_cp_packed,
299+
$dec_funcname, fixed_key_length => 4);
300+
print $OF "/* Perfect hash function for decomposition */\n";
301+
print $OF "static $dec_func\n";
302+
303+
# Emit the structure that wraps the hash lookup information into
304+
# one variable.
305+
print $OF <<HEADER;
306+
/* Hash lookup information for decomposition */
307+
static const pg_unicode_decompinfo UnicodeDecompInfo =
308+
{
309+
UnicodeDecompMain,
310+
$dec_funcname,
311+
$num_characters
312+
};
313+
314+
HEADER
315+
316+
# Find the lowest codepoint that decomposes to each recomposeable
317+
# code pair and create a mapping to it.
318+
my $recomp_string = "";
319+
my @rec_cp_packed;
320+
my %seenit;
321+
my $firstentry = 1;
322+
foreach my $rec (sort recomp_sort @rec_info)
323+
{
324+
# The hash key is formed by concatenating the bytes of the two
325+
# codepoints. See also recompose_code() in common/unicode_norm.c.
326+
my $hashkey = (hex($rec->{first}) << 32) | hex($rec->{second});
327+
328+
# We are only interested in the lowest code point that decomposes
329+
# to the given code pair.
330+
next if $seenit{$hashkey};
331+
332+
# Save the hash key bytes in network order
333+
push @rec_cp_packed, pack('Q>', $hashkey);
334+
335+
# Append inverse lookup element
336+
$recomp_string .= ",\n" if !$firstentry;
337+
$recomp_string .= sprintf "\t/* U+%s+%s -> U+%s */ %s",
338+
$rec->{first},
339+
$rec->{second},
340+
$rec->{code},
341+
$rec->{main_index};
342+
343+
$seenit{$hashkey} = 1;
344+
$firstentry = 0;
345+
}
346+
347+
# Emit the inverse lookup array containing indexes into UnicodeDecompMain.
348+
my $num_recomps = scalar @rec_cp_packed;
349+
print $OF <<HEADER;
350+
/* Inverse lookup array -- contains indexes into UnicodeDecompMain[] */
351+
static const uint16 RecompInverseLookup[$num_recomps] =
352+
{
353+
$recomp_string
354+
};
355+
356+
HEADER
357+
358+
# Emit the definition of the recomposition hash function.
359+
my $rec_funcname = 'Recomp_hash_func';
360+
my $rec_func =
361+
PerfectHash::generate_hash_function(\@rec_cp_packed, $rec_funcname,
362+
fixed_key_length => 8);
363+
print $OF "/* Perfect hash function for recomposition */\n";
364+
print $OF "static $rec_func\n";
365+
366+
# Emit the structure that wraps the hash lookup information into
367+
# one variable.
368+
print $OF <<HEADER;
369+
/* Hash lookup information for recomposition */
370+
static const pg_unicode_recompinfo UnicodeRecompInfo =
371+
{
372+
RecompInverseLookup,
373+
$rec_funcname,
374+
$num_recomps
375+
};
376+
HEADER
377+
378+
close $OT;
379+
close $OF;
380+
381+
# Comparator for sorting @rec_info entries via Perl's sort(), which
# passes the two operands in the package globals $a and $b.
#
# Orders entries by first code point, then by second code point, and
# finally by the composed code point itself.  The caller walks the
# sorted list and keeps only the first (i.e. lowest) composed code
# point seen for each (first, second) pair, so this ordering is what
# guarantees "lowest code point wins".  Two entries with identical
# (first, second, code) would be a data bug, hence the die.
sub recomp_sort
{
	my $a1 = hex($a->{first});
	my $b1 = hex($b->{first});

	my $a2 = hex($a->{second});
	my $b2 = hex($b->{second});

	# First sort by the first code point
	return -1 if $a1 < $b1;
	return 1 if $a1 > $b1;

	# Then sort by the second code point
	return -1 if $a2 < $b2;
	return 1 if $a2 > $b2;

	# Finally sort by the code point that decomposes into first and
	# second ones.  (Bug fix: the greater-than branch previously
	# returned -1 as well, which violated the comparator contract and
	# could put the higher composed code point first for equal pairs.)
	my $acode = hex($a->{code});
	my $bcode = hex($b->{code});

	return -1 if $acode < $bcode;
	return 1 if $acode > $bcode;

	die "found duplicate entries of recomposeable code pairs";
}

0 commit comments

Comments
 (0)