Skip to content

Commit a02b37f

Browse files
committed
Additional unicode primitive functions.
Introduce unicode_version(), icu_unicode_version(), and unicode_assigned(). The latter requires introducing a new lookup table for the Unicode General Category, which is generated along with the other Unicode lookup tables. Discussion: https://postgr.es/m/CA+TgmoYzYR-yhU6k1XFCADeyj=Oyz2PkVsa3iKv+keM8wp-F_A@mail.gmail.com Reviewed-by: Peter Eisentraut
1 parent 7021d3b commit a02b37f

18 files changed

+4924
-22
lines changed

doc/src/sgml/func.sgml

Lines changed: 90 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2859,6 +2859,22 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
28592859
</para></entry>
28602860
</row>
28612861

2862+
<row>
2863+
<entry role="func_table_entry"><para role="func_signature">
2864+
<indexterm>
2865+
<primary>unicode_assigned</primary>
2866+
</indexterm>
2867+
<function>unicode_assigned</function> ( <type>text</type> )
2868+
<returnvalue>boolean</returnvalue>
2869+
</para>
2870+
<para>
2871+
Returns <literal>true</literal> if all characters in the string are
2872+
assigned Unicode codepoints; <literal>false</literal> otherwise. This
2873+
function can only be used when the server encoding is
2874+
<literal>UTF8</literal>.
2875+
</para></entry>
2876+
</row>
2877+
28622878
<row>
28632879
<entry role="func_table_entry"><para role="func_signature">
28642880
<indexterm>
@@ -23427,25 +23443,6 @@ SELECT * FROM pg_ls_dir('.') WITH ORDINALITY AS t(ls,n);
2342723443
This is equivalent to <function>current_user</function>.
2342823444
</para></entry>
2342923445
</row>
23430-
23431-
<row>
23432-
<entry role="func_table_entry"><para role="func_signature">
23433-
<indexterm>
23434-
<primary>version</primary>
23435-
</indexterm>
23436-
<function>version</function> ()
23437-
<returnvalue>text</returnvalue>
23438-
</para>
23439-
<para>
23440-
Returns a string describing the <productname>PostgreSQL</productname>
23441-
server's version. You can also get this information from
23442-
<xref linkend="guc-server-version"/>, or for a machine-readable
23443-
version use <xref linkend="guc-server-version-num"/>. Software
23444-
developers should use <varname>server_version_num</varname> (available
23445-
since 8.2) or <xref linkend="libpq-PQserverVersion"/> instead of
23446-
parsing the text version.
23447-
</para></entry>
23448-
</row>
2344923446
</tbody>
2345023447
</tgroup>
2345123448
</table>
@@ -26332,6 +26329,80 @@ SELECT collation for ('foo' COLLATE "de_DE");
2633226329

2633326330
</sect2>
2633426331

26332+
<sect2 id="functions-info-version">
26333+
<title>Version Information Functions</title>
26334+
26335+
<para>
26336+
The functions shown in <xref linkend="functions-version"/>
26337+
print version information.
26338+
</para>
26339+
26340+
<table id="functions-version">
26341+
<title>Version Information Functions</title>
26342+
<tgroup cols="1">
26343+
<thead>
26344+
<row>
26345+
<entry role="func_table_entry"><para role="func_signature">
26346+
Function
26347+
</para>
26348+
<para>
26349+
Description
26350+
</para></entry>
26351+
</row>
26352+
</thead>
26353+
26354+
<tbody>
26355+
<row>
26356+
<entry role="func_table_entry"><para role="func_signature">
26357+
<indexterm>
26358+
<primary>version</primary>
26359+
</indexterm>
26360+
<function>version</function> ()
26361+
<returnvalue>text</returnvalue>
26362+
</para>
26363+
<para>
26364+
Returns a string describing the <productname>PostgreSQL</productname>
26365+
server's version. You can also get this information from
26366+
<xref linkend="guc-server-version"/>, or for a machine-readable
26367+
version use <xref linkend="guc-server-version-num"/>. Software
26368+
developers should use <varname>server_version_num</varname> (available
26369+
since 8.2) or <xref linkend="libpq-PQserverVersion"/> instead of
26370+
parsing the text version.
26371+
</para></entry>
26372+
</row>
26373+
26374+
<row>
26375+
<entry role="func_table_entry"><para role="func_signature">
26376+
<indexterm>
26377+
<primary>unicode_version</primary>
26378+
</indexterm>
26379+
<function>unicode_version</function> ()
26380+
<returnvalue>text</returnvalue>
26381+
</para>
26382+
<para>
26383+
Returns a string representing the version of Unicode used by
26384+
<productname>PostgreSQL</productname>.
26385+
</para></entry>
26386+
</row>
26387+
<row>
26388+
<entry role="func_table_entry"><para role="func_signature">
26389+
<indexterm>
26390+
<primary>icu_unicode_version</primary>
26391+
</indexterm>
26392+
<function>icu_unicode_version</function> ()
26393+
<returnvalue>text</returnvalue>
26394+
</para>
26395+
<para>
26396+
Returns a string representing the version of Unicode used by ICU, if
26397+
the server was built with ICU support; otherwise returns
26398+
<literal>NULL</literal>.</para></entry>
26399+
</row>
26400+
</tbody>
26401+
</tgroup>
26402+
</table>
26403+
26404+
</sect2>
26405+
2633526406
</sect1>
2633626407

2633726408
<sect1 id="functions-admin">

src/backend/utils/adt/varlena.c

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@
2323
#include "catalog/pg_type.h"
2424
#include "common/hashfn.h"
2525
#include "common/int.h"
26+
#include "common/unicode_category.h"
2627
#include "common/unicode_norm.h"
28+
#include "common/unicode_version.h"
2729
#include "funcapi.h"
2830
#include "lib/hyperloglog.h"
2931
#include "libpq/pqformat.h"
@@ -6237,6 +6239,65 @@ unicode_norm_form_from_string(const char *formstr)
62376239
return form;
62386240
}
62396241

6242+
/*
6243+
* Returns version of Unicode used by Postgres in "major.minor" format (the
6244+
* same format as the Unicode version reported by ICU). The third component
6245+
* ("update version") never involves additions to the character repertiore and
6246+
* is unimportant for most purposes.
6247+
*
6248+
* See: https://unicode.org/versions/
6249+
*/
6250+
Datum
6251+
unicode_version(PG_FUNCTION_ARGS)
6252+
{
6253+
PG_RETURN_TEXT_P(cstring_to_text(PG_UNICODE_VERSION));
6254+
}
6255+
6256+
/*
6257+
* Returns version of Unicode used by ICU, if enabled; otherwise NULL.
6258+
*/
6259+
Datum
6260+
icu_unicode_version(PG_FUNCTION_ARGS)
6261+
{
6262+
#ifdef USE_ICU
6263+
PG_RETURN_TEXT_P(cstring_to_text(U_UNICODE_VERSION));
6264+
#else
6265+
PG_RETURN_NULL();
6266+
#endif
6267+
}
6268+
6269+
/*
6270+
* Check whether the string contains only assigned Unicode code
6271+
* points. Requires that the database encoding is UTF-8.
6272+
*/
6273+
Datum
6274+
unicode_assigned(PG_FUNCTION_ARGS)
6275+
{
6276+
text *input = PG_GETARG_TEXT_PP(0);
6277+
unsigned char *p;
6278+
int size;
6279+
6280+
if (GetDatabaseEncoding() != PG_UTF8)
6281+
ereport(ERROR,
6282+
(errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
6283+
6284+
/* convert to pg_wchar */
6285+
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6286+
p = (unsigned char *) VARDATA_ANY(input);
6287+
for (int i = 0; i < size; i++)
6288+
{
6289+
pg_wchar uchar = utf8_to_unicode(p);
6290+
int category = unicode_category(uchar);
6291+
6292+
if (category == PG_U_UNASSIGNED)
6293+
PG_RETURN_BOOL(false);
6294+
6295+
p += pg_utf_mblen(p);
6296+
}
6297+
6298+
PG_RETURN_BOOL(true);
6299+
}
6300+
62406301
Datum
62416302
unicode_normalize_func(PG_FUNCTION_ARGS)
62426303
{

src/common/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ OBJS_COMMON = \
7878
scram-common.o \
7979
string.o \
8080
stringinfo.o \
81+
unicode_category.o \
8182
unicode_norm.o \
8283
username.o \
8384
wait_error.o \

src/common/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ common_sources = files(
3030
'scram-common.c',
3131
'string.c',
3232
'stringinfo.c',
33+
'unicode_category.c',
3334
'unicode_norm.c',
3435
'username.c',
3536
'wait_error.c',

src/common/unicode/Makefile

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,15 @@ include $(top_builddir)/src/Makefile.global
1515
override CPPFLAGS := -DFRONTEND -I. $(CPPFLAGS)
1616
LIBS += $(PTHREAD_LIBS)
1717

18+
LDFLAGS_INTERNAL += $(ICU_LIBS)
19+
CPPFLAGS += $(ICU_CFLAGS)
20+
1821
# By default, do nothing.
1922
all:
2023

21-
update-unicode: unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
24+
update-unicode: unicode_category_table.h unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h unicode_version.h
2225
mv $^ $(top_srcdir)/src/include/common/
26+
$(MAKE) category-check
2327
$(MAKE) normalization-check
2428

2529
# These files are part of the Unicode Character Database. Download
@@ -28,6 +32,12 @@ update-unicode: unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asi
2832
UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
2933
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
3034

35+
unicode_version.h: generate-unicode_version.pl
36+
$(PERL) $< --version $(UNICODE_VERSION)
37+
38+
unicode_category_table.h: generate-unicode_category_table.pl UnicodeData.txt
39+
$(PERL) $<
40+
3141
# Generation of conversion tables used for string normalization with
3242
# UTF-8 strings.
3343
unicode_norm_hashfunc.h: unicode_norm_table.h
@@ -45,9 +55,14 @@ unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizat
4555
$(PERL) $^ >$@
4656

4757
# Test suite
58+
category-check: category_test
59+
./category_test
60+
4861
normalization-check: norm_test
4962
./norm_test
5063

64+
category_test: category_test.o ../unicode_category.o | submake-common
65+
5166
norm_test: norm_test.o ../unicode_norm.o | submake-common
5267

5368
norm_test.o: norm_test_table.h
@@ -64,7 +79,7 @@ norm_test_table.h: generate-norm_test_table.pl NormalizationTest.txt
6479

6580

6681
clean:
67-
rm -f $(OBJS) norm_test norm_test.o
82+
rm -f $(OBJS) category_test category_test.o norm_test norm_test.o
6883

6984
distclean: clean
7085
rm -f UnicodeData.txt EastAsianWidth.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h

src/common/unicode/category_test.c

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
/*-------------------------------------------------------------------------
2+
* category_test.c
3+
* Program to test Unicode general category functions.
4+
*
5+
* Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
6+
*
7+
* IDENTIFICATION
8+
* src/common/unicode/category_test.c
9+
*
10+
*-------------------------------------------------------------------------
11+
*/
12+
#include "postgres_fe.h"
13+
14+
#include <stdio.h>
15+
#include <stdlib.h>
16+
#include <string.h>
17+
18+
#ifdef USE_ICU
19+
#include <unicode/uchar.h>
20+
#endif
21+
#include "common/unicode_category.h"
22+
#include "common/unicode_version.h"
23+
24+
/*
25+
* Parse version into integer for easy comparison.
26+
*/
27+
#ifdef USE_ICU
28+
/*
 * parse_unicode_version
 *
 * Convert a version string of the form "major.minor" into the integer
 * major * 100 + minor, so that two versions can be compared with plain
 * integer comparison.  Anything following the minor number (such as a third
 * "update" component) is ignored.
 *
 * The string is validated at run time rather than with Assert(): when
 * assertions are compiled out, an Assert-only check would leave the sscanf()
 * result unused and would let a malformed string feed uninitialized values
 * into the return expression.  On malformed input (fewer than two numeric
 * components, a negative component, or a minor number that does not fit in
 * two decimal digits) a message is written to stderr and the program exits
 * with status 1.
 */
static int
parse_unicode_version(const char *version)
{
	int			major = 0;
	int			minor = 0;

	if (sscanf(version, "%d.%d", &major, &minor) != 2 ||
		major < 0 || minor < 0 || minor >= 100)
	{
		fprintf(stderr, "could not parse Unicode version \"%s\"\n", version);
		exit(1);
	}

	return major * 100 + minor;
}
42+
#endif
43+
44+
/*
 * Test driver: for every code point from 0 through 0x10ffff, compare the
 * general category reported by unicode_category() with the one reported by
 * ICU's u_charType().  The first real disagreement is printed and the
 * program exits with status 1.
 *
 * Without ICU support there is nothing to compare against, so the test
 * reports that it is skipping and exits successfully.
 */
int
main(int argc, char **argv)
{
#ifndef USE_ICU
	printf("ICU support required for test; skipping.\n");
	exit(0);
#else
	int			pg_version = parse_unicode_version(PG_UNICODE_VERSION);
	int			icu_version = parse_unicode_version(U_UNICODE_VERSION);
	int			skipped_pg = 0;
	int			skipped_icu = 0;
	UChar32		cp;

	printf("Postgres Unicode Version:\t%s\n", PG_UNICODE_VERSION);
	printf("ICU Unicode Version:\t\t%s\n", U_UNICODE_VERSION);

	for (cp = 0; cp <= 0x10ffff; cp++)
	{
		uint8_t		ours = unicode_category(cp);
		uint8_t		theirs = u_charType(cp);

		if (ours == theirs)
			continue;

		/*
		 * When the two Unicode versions differ, a code point assigned in the
		 * newer version may still be unassigned in the older one.  Such
		 * disagreements are tolerated and merely counted; the price is that
		 * the test is no longer exhaustive for those code points.
		 */
		if (ours == PG_U_UNASSIGNED && pg_version < icu_version)
		{
			skipped_pg++;
			continue;
		}

		if (theirs == PG_U_UNASSIGNED && icu_version < pg_version)
		{
			skipped_icu++;
			continue;
		}

		/* Any other disagreement is a genuine failure. */
		printf("FAILURE for codepoint %06x\n", code_point_placeholder);
		printf("Postgres category:	%02d %s %s\n", ours,
			   unicode_category_abbrev(ours),
			   unicode_category_string(ours));
		printf("ICU category:		%02d %s %s\n", theirs,
			   unicode_category_abbrev(theirs),
			   unicode_category_string(theirs));
		printf("\n");
		exit(1);
	}

	if (skipped_pg > 0)
		printf("Skipped %d codepoints unassigned in Postgres due to Unicode version mismatch.\n",
			   skipped_pg);
	if (skipped_icu > 0)
		printf("Skipped %d codepoints unassigned in ICU due to Unicode version mismatch.\n",
			   skipped_icu);

	printf("category_test: All tests successful!\n");
	exit(0);
#endif
}

0 commit comments

Comments
 (0)