postgres
diff --git a/‎src/common/unicode/Makefile
+3-3 b/‎src/common/unicode/Makefile
+3-3
diff --git a/‎src/common/unicode/README
+35-10 b/‎src/common/unicode/README
+35-10
diff --git a/‎src/common/unicode/category_test.c
+179-43 b/‎src/common/unicode/category_test.c
+179-43
@@ -29,13 +29,13 @@ update-unicode: unicode_category_table.h unicode_east_asian_fw_table.h unicode_n
 # These files are part of the Unicode Character Database. Download
 # them on demand.  The dependency on Makefile.global is for
 # UNICODE_VERSION.
-CompositionExclusions.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
+CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
 	$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
 
 unicode_version.h: generate-unicode_version.pl
 	$(PERL) $< --version $(UNICODE_VERSION)
 
-unicode_category_table.h: generate-unicode_category_table.pl UnicodeData.txt
+unicode_category_table.h: generate-unicode_category_table.pl DerivedCoreProperties.txt PropList.txt UnicodeData.txt
 	$(PERL) $<
 
 # Generation of conversion tables used for string normalization with
@@ -82,4 +82,4 @@ clean:
 	rm -f $(OBJS) category_test category_test.o norm_test norm_test.o
 
 distclean: clean
-	rm -f CompositionExclusions.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt UnicodeData.txt norm_test_table.h unicode_category_table.h unicode_norm_table.h
+	rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt norm_test_table.h unicode_category_table.h unicode_norm_table.h
@@ -1,22 +1,35 @@
-This directory contains tools to generate the tables in
-src/include/common/unicode_norm.h, used for Unicode normalization. The
-generated .h file is included in the source tree, so these are normally not
-needed to build PostgreSQL, only if you need to re-generate the .h file
-from the Unicode data files for some reason, e.g. to update to a new version
-of Unicode.
+This directory contains tools to download new Unicode data files and
+generate static tables. These tables are used to normalize or
+determine various properties of Unicode data.
 
-Generating unicode_norm_table.h
--------------------------------
+The generated header files are copied to src/include/common/, and
+included in the source tree, so these tools are not normally required
+to build PostgreSQL.
 
-Run
+Update Unicode Version
+----------------------
+
+Edit src/Makefile.global.in and src/common/unicode/meson.build
+to update the UNICODE_VERSION.
+
+Then, generate the new header files with:
 
     make update-unicode
 
-from the top level of the source tree and commit the result.
+or if using meson:
+
+    ninja update-unicode
+
+from the top level of the source tree. Examine the result to make sure
+the changes look reasonable (that is, that the diff size and scope are
+comparable to the Unicode changes since the last update), and then
+commit it.
 
 Tests
 -----
 
+Normalization tests:
+
 The Unicode consortium publishes a comprehensive test suite for the
 normalization algorithm, in a file called NormalizationTest.txt. This
 directory also contains a perl script and some C code, to run our
@@ -26,3 +39,15 @@ To download NormalizationTest.txt and run the tests:
     make normalization-check
 
 This is also run as part of the update-unicode target.
+
+Category & Property tests:
+
+The file category_test.c exhaustively compares the category and
+properties of each code point as determined by the generated tables
+with the category and properties as reported by ICU. For this test to
+be effective, the version of the Unicode data files must be similar to
+the version of Unicode on which ICU is based, so attempt to match the
+versions as closely as possible. With mismatched Unicode versions, the
+test skips codepoints that are assigned in one version and not the
+other, and may falsely report failures. This test is run as a part of
+the update-unicode target.
@@ -1,6 +1,6 @@
 /*-------------------------------------------------------------------------
  * category_test.c
- *		Program to test Unicode general category functions.
+ *		Program to test Unicode general category and character properties.
  *
  * Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group
  *
@@ -14,17 +14,23 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <wctype.h>
 
 #ifdef USE_ICU
 #include <unicode/uchar.h>
 #endif
+
 #include "common/unicode_category.h"
 #include "common/unicode_version.h"
 
+static int	pg_unicode_version = 0;
+#ifdef USE_ICU
+static int	icu_unicode_version = 0;
+#endif
+
 /*
  * Parse version into integer for easy comparison.
  */
-#ifdef USE_ICU
 static int
 parse_unicode_version(const char *version)
 {
@@ -39,57 +45,175 @@ parse_unicode_version(const char *version)
 
 	return major * 100 + minor;
 }
-#endif
 
+#ifdef USE_ICU
 /*
- * Exhaustively test that the Unicode category for each codepoint matches that
- * returned by ICU.
+ * Test Postgres Unicode tables against ICU: the General Category, the
+ * properties Alphabetic, Lowercase, Uppercase, Cased, Case_Ignorable,
+ * White_Space, Hex_Digit, Join_Control, and the character classes.
  */
-int
-main(int argc, char **argv)
+static void
+icu_test()
 {
-#ifdef USE_ICU
-	int			pg_unicode_version = parse_unicode_version(PG_UNICODE_VERSION);
-	int			icu_unicode_version = parse_unicode_version(U_UNICODE_VERSION);
+	int			successful = 0;
 	int			pg_skipped_codepoints = 0;
 	int			icu_skipped_codepoints = 0;
 
-	printf("category_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);
-	printf("category_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
-
-	for (UChar32 code = 0; code <= 0x10ffff; code++)
+	for (pg_wchar code = 0; code <= 0x10ffff; code++)
 	{
 		uint8_t		pg_category = unicode_category(code);
 		uint8_t		icu_category = u_charType(code);
 
+		/* Property tests */
+		bool		prop_alphabetic = pg_u_prop_alphabetic(code);
+		bool		prop_lowercase = pg_u_prop_lowercase(code);
+		bool		prop_uppercase = pg_u_prop_uppercase(code);
+		bool		prop_cased = pg_u_prop_cased(code);
+		bool		prop_case_ignorable = pg_u_prop_case_ignorable(code);
+		bool		prop_white_space = pg_u_prop_white_space(code);
+		bool		prop_hex_digit = pg_u_prop_hex_digit(code);
+		bool		prop_join_control = pg_u_prop_join_control(code);
+
+		bool		icu_prop_alphabetic = u_hasBinaryProperty(
+															  code, UCHAR_ALPHABETIC);
+		bool		icu_prop_lowercase = u_hasBinaryProperty(
+															 code, UCHAR_LOWERCASE);
+		bool		icu_prop_uppercase = u_hasBinaryProperty(
+															 code, UCHAR_UPPERCASE);
+		bool		icu_prop_cased = u_hasBinaryProperty(
+														 code, UCHAR_CASED);
+		bool		icu_prop_case_ignorable = u_hasBinaryProperty(
+																  code, UCHAR_CASE_IGNORABLE);
+		bool		icu_prop_white_space = u_hasBinaryProperty(
+															   code, UCHAR_WHITE_SPACE);
+		bool		icu_prop_hex_digit = u_hasBinaryProperty(
+															 code, UCHAR_HEX_DIGIT);
+		bool		icu_prop_join_control = u_hasBinaryProperty(
+																code, UCHAR_JOIN_CONTROL);
+
+		/*
+		 * Compare with ICU for character classes using:
+		 *
+		 * https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/uchar_8h.html#details
+		 *
+		 * which describes how to use ICU to test for membership in regex
+		 * character classes.
+		 *
+		 * NB: the document suggests testing for some properties such as
+		 * UCHAR_POSIX_ALNUM, but that doesn't mean that we're testing for the
+		 * "POSIX Compatible" character classes.
+		 */
+		bool		isalpha = pg_u_isalpha(code);
+		bool		islower = pg_u_islower(code);
+		bool		isupper = pg_u_isupper(code);
+		bool		ispunct = pg_u_ispunct(code, false);
+		bool		isdigit = pg_u_isdigit(code, false);
+		bool		isxdigit = pg_u_isxdigit(code, false);
+		bool		isalnum = pg_u_isalnum(code, false);
+		bool		isspace = pg_u_isspace(code);
+		bool		isblank = pg_u_isblank(code);
+		bool		iscntrl = pg_u_iscntrl(code);
+		bool		isgraph = pg_u_isgraph(code);
+		bool		isprint = pg_u_isprint(code);
+
+		bool		icu_isalpha = u_isUAlphabetic(code);
+		bool		icu_islower = u_isULowercase(code);
+		bool		icu_isupper = u_isUUppercase(code);
+		bool		icu_ispunct = u_ispunct(code);
+		bool		icu_isdigit = u_isdigit(code);
+		bool		icu_isxdigit = u_hasBinaryProperty(code,
+													   UCHAR_POSIX_XDIGIT);
+		bool		icu_isalnum = u_hasBinaryProperty(code,
+													  UCHAR_POSIX_ALNUM);
+		bool		icu_isspace = u_isUWhiteSpace(code);
+		bool		icu_isblank = u_isblank(code);
+		bool		icu_iscntrl = icu_category == PG_U_CONTROL;
+		bool		icu_isgraph = u_hasBinaryProperty(code,
+													  UCHAR_POSIX_GRAPH);
+		bool		icu_isprint = u_hasBinaryProperty(code,
+													  UCHAR_POSIX_PRINT);
+
+		/*
+		 * A version mismatch means that some assigned codepoints in the newer
+		 * version may be unassigned in the older version. That's OK, though
+		 * the test will not cover those codepoints marked unassigned in the
+		 * older version (that is, it will no longer be an exhaustive test).
+		 */
+		if (pg_category == PG_U_UNASSIGNED &&
+			icu_category != PG_U_UNASSIGNED &&
+			pg_unicode_version < icu_unicode_version)
+		{
+			pg_skipped_codepoints++;
+			continue;
+		}
+
+		if (icu_category == PG_U_UNASSIGNED &&
+			pg_category != PG_U_UNASSIGNED &&
+			icu_unicode_version < pg_unicode_version)
+		{
+			icu_skipped_codepoints++;
+			continue;
+		}
+
 		if (pg_category != icu_category)
 		{
-			/*
-			 * A version mismatch means that some assigned codepoints in the
-			 * newer version may be unassigned in the older version. That's
-			 * OK, though the test will not cover those codepoints marked
-			 * unassigned in the older version (that is, it will no longer be
-			 * an exhaustive test).
-			 */
-			if (pg_category == PG_U_UNASSIGNED &&
-				pg_unicode_version < icu_unicode_version)
-				pg_skipped_codepoints++;
-			else if (icu_category == PG_U_UNASSIGNED &&
-					 icu_unicode_version < pg_unicode_version)
-				icu_skipped_codepoints++;
-			else
-			{
-				printf("category_test: FAILURE for codepoint 0x%06x\n", code);
-				printf("category_test: Postgres category:	%02d %s %s\n", pg_category,
-					   unicode_category_abbrev(pg_category),
-					   unicode_category_string(pg_category));
-				printf("category_test: ICU category:		%02d %s %s\n", icu_category,
-					   unicode_category_abbrev(icu_category),
-					   unicode_category_string(icu_category));
-				printf("\n");
-				exit(1);
-			}
+			printf("category_test: FAILURE for codepoint 0x%06x\n", code);
+			printf("category_test: Postgres category:	%02d %s %s\n", pg_category,
+				   unicode_category_abbrev(pg_category),
+				   unicode_category_string(pg_category));
+			printf("category_test: ICU category:		%02d %s %s\n", icu_category,
+				   unicode_category_abbrev(icu_category),
+				   unicode_category_string(icu_category));
+			printf("\n");
+			exit(1);
+		}
+
+		if (prop_alphabetic != icu_prop_alphabetic ||
+			prop_lowercase != icu_prop_lowercase ||
+			prop_uppercase != icu_prop_uppercase ||
+			prop_cased != icu_prop_cased ||
+			prop_case_ignorable != icu_prop_case_ignorable ||
+			prop_white_space != icu_prop_white_space ||
+			prop_hex_digit != icu_prop_hex_digit ||
+			prop_join_control != icu_prop_join_control)
+		{
+			printf("category_test: FAILURE for codepoint 0x%06x\n", code);
+			printf("category_test: Postgres	property	alphabetic/lowercase/uppercase/cased/case_ignorable/white_space/hex_digit/join_control: %d/%d/%d/%d/%d/%d/%d/%d\n",
+				   prop_alphabetic, prop_lowercase, prop_uppercase,
+				   prop_cased, prop_case_ignorable,
+				   prop_white_space, prop_hex_digit, prop_join_control);
+			printf("category_test: ICU	property	alphabetic/lowercase/uppercase/cased/case_ignorable/white_space/hex_digit/join_control: %d/%d/%d/%d/%d/%d/%d/%d\n",
+				   icu_prop_alphabetic, icu_prop_lowercase, icu_prop_uppercase,
+				   icu_prop_cased, icu_prop_case_ignorable,
+				   icu_prop_white_space, icu_prop_hex_digit, icu_prop_join_control);
+			printf("\n");
+			exit(1);
 		}
+
+		if (isalpha != icu_isalpha ||
+			islower != icu_islower ||
+			isupper != icu_isupper ||
+			ispunct != icu_ispunct ||
+			isdigit != icu_isdigit ||
+			isxdigit != icu_isxdigit ||
+			isalnum != icu_isalnum ||
+			isspace != icu_isspace ||
+			isblank != icu_isblank ||
+			iscntrl != icu_iscntrl ||
+			isgraph != icu_isgraph ||
+			isprint != icu_isprint)
+		{
+			printf("category_test: FAILURE for codepoint 0x%06x\n", code);
+			printf("category_test: Postgres	class	alpha/lower/upper/punct/digit/xdigit/alnum/space/blank/cntrl/graph/print: %d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d\n",
+				   isalpha, islower, isupper, ispunct, isdigit, isxdigit, isalnum, isspace, isblank, iscntrl, isgraph, isprint);
+			printf("category_test: ICU class	alpha/lower/upper/punct/digit/xdigit/alnum/space/blank/cntrl/graph/print: %d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d\n",
+				   icu_isalpha, icu_islower, icu_isupper, icu_ispunct, icu_isdigit, icu_isxdigit, icu_isalnum, icu_isspace, icu_isblank, icu_iscntrl, icu_isgraph, icu_isprint);
+			printf("\n");
+			exit(1);
+		}
+
+		if (pg_category != PG_U_UNASSIGNED)
+			successful++;
 	}
 
 	if (pg_skipped_codepoints > 0)
@@ -99,10 +223,22 @@ main(int argc, char **argv)
 		printf("category_test: skipped %d codepoints unassigned in ICU due to Unicode version mismatch\n",
 			   icu_skipped_codepoints);
 
-	printf("category_test: success\n");
-	exit(0);
+	printf("category_test: ICU test: %d codepoints successful\n", successful);
+}
+#endif
+
+int
+main(int argc, char **argv)
+{
+	pg_unicode_version = parse_unicode_version(PG_UNICODE_VERSION);
+	printf("category_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);
+
+#ifdef USE_ICU
+	icu_unicode_version = parse_unicode_version(U_UNICODE_VERSION);
+	printf("category_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
+
+	icu_test();
 #else
-	printf("category_test: ICU support required for test; skipping\n");
-	exit(0);
+	printf("category_test: ICU not available; skipping\n");
 #endif
 }