Skip to content

Commit f69319f

Browse files
committed
Support C.UTF-8 locale in the new builtin collation provider.
The builtin C.UTF-8 locale has similar semantics to the libc locale of the same name. That is, code point sort order (fast, memcmp-based) combined with Unicode semantics for character operations such as pattern matching, regular expressions, and LOWER()/INITCAP()/UPPER(). The character semantics are based on Unicode simple case mappings.

The builtin provider's C.UTF-8 offers several important advantages over libc:

* faster sorting -- benefits from additional optimizations such as abbreviated keys and varstrfastcmp_c
* faster case conversion, e.g. LOWER(), at least compared with some libc implementations
* available on all platforms with identical semantics, and the semantics are stable, testable, and documentable within a given Postgres major version

Being based on memcmp, the builtin C.UTF-8 locale does not offer natural language sort order. But it is an improvement for most use cases that might otherwise use libc's "C.UTF-8" locale, as well as many use cases that use libc's "C" locale.

Discussion: https://postgr.es/m/ff4c2f2f9c8fc7ca27c1c24ae37ecaeaeaff6b53.camel%40j-davis.com
Reviewed-by: Daniel Vérité, Peter Eisentraut, Jeremy Schneider
1 parent fd0398f commit f69319f

17 files changed

+494
-26
lines changed

doc/src/sgml/charset.sgml

+26-1
Original file line numberDiff line numberDiff line change
@@ -377,13 +377,21 @@ initdb --locale-provider=icu --icu-locale=en
377377
<listitem>
378378
<para>
379379
The <literal>builtin</literal> provider uses built-in operations. Only
380-
the <literal>C</literal> locale is supported for this provider.
380+
the <literal>C</literal> and <literal>C.UTF-8</literal> locales are
381+
supported for this provider.
381382
</para>
382383
<para>
383384
The <literal>C</literal> locale behavior is identical to the
384385
<literal>C</literal> locale in the libc provider. When using this
385386
locale, the behavior may depend on the database encoding.
386387
</para>
388+
<para>
389+
The <literal>C.UTF-8</literal> locale is available only when the
390+
database encoding is <literal>UTF-8</literal>, and the behavior is
391+
based on Unicode. The collation uses the code point values only. The
392+
regular expression character classes are based on the "POSIX
393+
Compatible" semantics, and the case mapping is the "simple" variant.
394+
</para>
387395
</listitem>
388396
</varlistentry>
389397

@@ -878,6 +886,23 @@ SELECT * FROM test1 ORDER BY a || b COLLATE "fr_FR";
878886
</listitem>
879887
</varlistentry>
880888

889+
<varlistentry>
890+
<term><literal>pg_c_utf8</literal></term>
891+
<listitem>
892+
<para>
893+
This collation sorts by Unicode code point values rather than natural
894+
language order. For the functions <function>lower</function>,
895+
<function>initcap</function>, and <function>upper</function>, it uses
896+
Unicode simple case mapping. For pattern matching (including regular
897+
expressions), it uses the POSIX Compatible variant of Unicode <ulink
898+
url="https://www.unicode.org/reports/tr18/#Compatibility_Properties">Compatibility
899+
Properties</ulink>. Behavior is efficient and stable within a
900+
<productname>PostgreSQL</productname> major version. This collation is
901+
only available for encoding <literal>UTF8</literal>.
902+
</para>
903+
</listitem>
904+
</varlistentry>
905+
881906
<varlistentry>
882907
<term><literal>C</literal> (equivalent to <literal>POSIX</literal>)</term>
883908
<listitem>

doc/src/sgml/ref/create_collation.sgml

+1-1
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ CREATE COLLATION [ IF NOT EXISTS ] <replaceable>name</replaceable> FROM <replace
9999
<para>
100100
If <replaceable>provider</replaceable> is <literal>builtin</literal>,
101101
then <replaceable>locale</replaceable> must be specified and set to
102-
<literal>C</literal>.
102+
either <literal>C</literal> or <literal>C.UTF-8</literal>.
103103
</para>
104104
</listitem>
105105
</varlistentry>

doc/src/sgml/ref/create_database.sgml

+8-5
Original file line numberDiff line numberDiff line change
@@ -166,8 +166,9 @@ CREATE DATABASE <replaceable class="parameter">name</replaceable>
166166
</para>
167167
<para>
168168
If <xref linkend="create-database-locale-provider"/> is
169-
<literal>builtin</literal>, then <replaceable>locale</replaceable>
170-
must be specified and set to <literal>C</literal>.
169+
<literal>builtin</literal>, then <replaceable>locale</replaceable> or
170+
<replaceable>builtin_locale</replaceable> must be specified and set to
171+
either <literal>C</literal> or <literal>C.UTF-8</literal>.
171172
</para>
172173
<tip>
173174
<para>
@@ -228,9 +229,11 @@ CREATE DATABASE <replaceable class="parameter">name</replaceable>
228229
linkend="create-database-locale-provider">locale provider</link> must
229230
be <literal>builtin</literal>. The default is the setting of <xref
230231
linkend="create-database-locale"/> if specified; otherwise the same
231-
setting as the template database. Currently, the only available
232-
locale for the <literal>builtin</literal> provider is
233-
<literal>C</literal>.
232+
setting as the template database.
233+
</para>
234+
<para>
235+
The locales available for the <literal>builtin</literal> provider are
236+
<literal>C</literal> and <literal>C.UTF-8</literal>.
234237
</para>
235238
</listitem>
236239
</varlistentry>

doc/src/sgml/ref/initdb.sgml

+3-2
Original file line numberDiff line numberDiff line change
@@ -288,8 +288,9 @@ PostgreSQL documentation
288288
</para>
289289
<para>
290290
If <option>--locale-provider</option> is <literal>builtin</literal>,
291-
<option>--locale</option> must be specified and set to
292-
<literal>C</literal>.
291+
<option>--locale</option> or <option>--builtin-locale</option> must be
292+
specified and set to <literal>C</literal> or
293+
<literal>C.UTF-8</literal>.
293294
</para>
294295
</listitem>
295296
</varlistentry>

src/backend/regex/regc_pg_locale.c

+35-1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
*/
1717

1818
#include "catalog/pg_collation.h"
19+
#include "common/unicode_case.h"
20+
#include "common/unicode_category.h"
1921
#include "utils/pg_locale.h"
2022

2123
/*
@@ -64,6 +66,7 @@
6466
typedef enum
6567
{
6668
PG_REGEX_LOCALE_C, /* C locale (encoding independent) */
69+
PG_REGEX_BUILTIN, /* built-in Unicode semantics */
6770
PG_REGEX_LOCALE_WIDE, /* Use <wctype.h> functions */
6871
PG_REGEX_LOCALE_1BYTE, /* Use <ctype.h> functions */
6972
PG_REGEX_LOCALE_WIDE_L, /* Use locale_t <wctype.h> functions */
@@ -266,7 +269,12 @@ pg_set_regex_collation(Oid collation)
266269
if (GetDatabaseEncoding() == PG_UTF8)
267270
{
268271
if (pg_regex_locale)
269-
pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L;
272+
{
273+
if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
274+
pg_regex_strategy = PG_REGEX_BUILTIN;
275+
else
276+
pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L;
277+
}
270278
else
271279
pg_regex_strategy = PG_REGEX_LOCALE_WIDE;
272280
}
@@ -290,6 +298,8 @@ pg_wc_isdigit(pg_wchar c)
290298
case PG_REGEX_LOCALE_C:
291299
return (c <= (pg_wchar) 127 &&
292300
(pg_char_properties[c] & PG_ISDIGIT));
301+
case PG_REGEX_BUILTIN:
302+
return pg_u_isdigit(c, true);
293303
case PG_REGEX_LOCALE_WIDE:
294304
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
295305
return iswdigit((wint_t) c);
@@ -322,6 +332,8 @@ pg_wc_isalpha(pg_wchar c)
322332
case PG_REGEX_LOCALE_C:
323333
return (c <= (pg_wchar) 127 &&
324334
(pg_char_properties[c] & PG_ISALPHA));
335+
case PG_REGEX_BUILTIN:
336+
return pg_u_isalpha(c);
325337
case PG_REGEX_LOCALE_WIDE:
326338
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
327339
return iswalpha((wint_t) c);
@@ -354,6 +366,8 @@ pg_wc_isalnum(pg_wchar c)
354366
case PG_REGEX_LOCALE_C:
355367
return (c <= (pg_wchar) 127 &&
356368
(pg_char_properties[c] & PG_ISALNUM));
369+
case PG_REGEX_BUILTIN:
370+
return pg_u_isalnum(c, true);
357371
case PG_REGEX_LOCALE_WIDE:
358372
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
359373
return iswalnum((wint_t) c);
@@ -395,6 +409,8 @@ pg_wc_isupper(pg_wchar c)
395409
case PG_REGEX_LOCALE_C:
396410
return (c <= (pg_wchar) 127 &&
397411
(pg_char_properties[c] & PG_ISUPPER));
412+
case PG_REGEX_BUILTIN:
413+
return pg_u_isupper(c);
398414
case PG_REGEX_LOCALE_WIDE:
399415
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
400416
return iswupper((wint_t) c);
@@ -427,6 +443,8 @@ pg_wc_islower(pg_wchar c)
427443
case PG_REGEX_LOCALE_C:
428444
return (c <= (pg_wchar) 127 &&
429445
(pg_char_properties[c] & PG_ISLOWER));
446+
case PG_REGEX_BUILTIN:
447+
return pg_u_islower(c);
430448
case PG_REGEX_LOCALE_WIDE:
431449
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
432450
return iswlower((wint_t) c);
@@ -459,6 +477,8 @@ pg_wc_isgraph(pg_wchar c)
459477
case PG_REGEX_LOCALE_C:
460478
return (c <= (pg_wchar) 127 &&
461479
(pg_char_properties[c] & PG_ISGRAPH));
480+
case PG_REGEX_BUILTIN:
481+
return pg_u_isgraph(c);
462482
case PG_REGEX_LOCALE_WIDE:
463483
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
464484
return iswgraph((wint_t) c);
@@ -491,6 +511,8 @@ pg_wc_isprint(pg_wchar c)
491511
case PG_REGEX_LOCALE_C:
492512
return (c <= (pg_wchar) 127 &&
493513
(pg_char_properties[c] & PG_ISPRINT));
514+
case PG_REGEX_BUILTIN:
515+
return pg_u_isprint(c);
494516
case PG_REGEX_LOCALE_WIDE:
495517
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
496518
return iswprint((wint_t) c);
@@ -523,6 +545,8 @@ pg_wc_ispunct(pg_wchar c)
523545
case PG_REGEX_LOCALE_C:
524546
return (c <= (pg_wchar) 127 &&
525547
(pg_char_properties[c] & PG_ISPUNCT));
548+
case PG_REGEX_BUILTIN:
549+
return pg_u_ispunct(c, true);
526550
case PG_REGEX_LOCALE_WIDE:
527551
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
528552
return iswpunct((wint_t) c);
@@ -555,6 +579,8 @@ pg_wc_isspace(pg_wchar c)
555579
case PG_REGEX_LOCALE_C:
556580
return (c <= (pg_wchar) 127 &&
557581
(pg_char_properties[c] & PG_ISSPACE));
582+
case PG_REGEX_BUILTIN:
583+
return pg_u_isspace(c);
558584
case PG_REGEX_LOCALE_WIDE:
559585
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
560586
return iswspace((wint_t) c);
@@ -588,6 +614,8 @@ pg_wc_toupper(pg_wchar c)
588614
if (c <= (pg_wchar) 127)
589615
return pg_ascii_toupper((unsigned char) c);
590616
return c;
617+
case PG_REGEX_BUILTIN:
618+
return unicode_uppercase_simple(c);
591619
case PG_REGEX_LOCALE_WIDE:
592620
/* force C behavior for ASCII characters, per comments above */
593621
if (c <= (pg_wchar) 127)
@@ -628,6 +656,8 @@ pg_wc_tolower(pg_wchar c)
628656
if (c <= (pg_wchar) 127)
629657
return pg_ascii_tolower((unsigned char) c);
630658
return c;
659+
case PG_REGEX_BUILTIN:
660+
return unicode_lowercase_simple(c);
631661
case PG_REGEX_LOCALE_WIDE:
632662
/* force C behavior for ASCII characters, per comments above */
633663
if (c <= (pg_wchar) 127)
@@ -792,6 +822,9 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
792822
max_chr = (pg_wchar) MAX_SIMPLE_CHR;
793823
#endif
794824
break;
825+
case PG_REGEX_BUILTIN:
826+
max_chr = (pg_wchar) MAX_SIMPLE_CHR;
827+
break;
795828
case PG_REGEX_LOCALE_WIDE:
796829
case PG_REGEX_LOCALE_WIDE_L:
797830
max_chr = (pg_wchar) MAX_SIMPLE_CHR;
@@ -809,6 +842,7 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
809842
max_chr = (pg_wchar) MAX_SIMPLE_CHR;
810843
break;
811844
default:
845+
Assert(false);
812846
max_chr = 0; /* can't get here, but keep compiler quiet */
813847
break;
814848
}

src/backend/utils/adt/formatting.c

+112
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@
7777

7878
#include "catalog/pg_collation.h"
7979
#include "catalog/pg_type.h"
80+
#include "common/unicode_case.h"
81+
#include "common/unicode_category.h"
8082
#include "mb/pg_wchar.h"
8183
#include "nodes/miscnodes.h"
8284
#include "parser/scansup.h"
@@ -1679,6 +1681,34 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
16791681
}
16801682
else
16811683
#endif
1684+
if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN)
1685+
{
1686+
const char *src = buff;
1687+
size_t srclen = nbytes;
1688+
size_t dstsize;
1689+
char *dst;
1690+
size_t needed;
1691+
1692+
Assert(GetDatabaseEncoding() == PG_UTF8);
1693+
1694+
/* first try buffer of equal size plus terminating NUL */
1695+
dstsize = srclen + 1;
1696+
dst = palloc(dstsize);
1697+
1698+
needed = unicode_strlower(dst, dstsize, src, srclen);
1699+
if (needed + 1 > dstsize)
1700+
{
1701+
/* grow buffer if needed and retry */
1702+
dstsize = needed + 1;
1703+
dst = repalloc(dst, dstsize);
1704+
needed = unicode_strlower(dst, dstsize, src, srclen);
1705+
Assert(needed + 1 == dstsize);
1706+
}
1707+
1708+
Assert(dst[needed] == '\0');
1709+
result = dst;
1710+
}
1711+
else
16821712
{
16831713
Assert(!mylocale || mylocale->provider == COLLPROVIDER_LIBC);
16841714

@@ -1799,6 +1829,34 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
17991829
}
18001830
else
18011831
#endif
1832+
if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN)
1833+
{
1834+
const char *src = buff;
1835+
size_t srclen = nbytes;
1836+
size_t dstsize;
1837+
char *dst;
1838+
size_t needed;
1839+
1840+
Assert(GetDatabaseEncoding() == PG_UTF8);
1841+
1842+
/* first try buffer of equal size plus terminating NUL */
1843+
dstsize = srclen + 1;
1844+
dst = palloc(dstsize);
1845+
1846+
needed = unicode_strupper(dst, dstsize, src, srclen);
1847+
if (needed + 1 > dstsize)
1848+
{
1849+
/* grow buffer if needed and retry */
1850+
dstsize = needed + 1;
1851+
dst = repalloc(dst, dstsize);
1852+
needed = unicode_strupper(dst, dstsize, src, srclen);
1853+
Assert(needed + 1 == dstsize);
1854+
}
1855+
1856+
Assert(dst[needed] == '\0');
1857+
result = dst;
1858+
}
1859+
else
18021860
{
18031861
Assert(!mylocale || mylocale->provider == COLLPROVIDER_LIBC);
18041862

@@ -1920,6 +1978,60 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
19201978
}
19211979
else
19221980
#endif
1981+
if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN)
1982+
{
1983+
const unsigned char *src = (unsigned char *) buff;
1984+
size_t srclen = nbytes;
1985+
unsigned char *dst;
1986+
size_t dstsize;
1987+
int srcoff = 0;
1988+
int dstoff = 0;
1989+
1990+
Assert(GetDatabaseEncoding() == PG_UTF8);
1991+
1992+
/* overflow paranoia */
1993+
if ((srclen + 1) > (INT_MAX / MAX_MULTIBYTE_CHAR_LEN))
1994+
ereport(ERROR,
1995+
(errcode(ERRCODE_OUT_OF_MEMORY),
1996+
errmsg("out of memory")));
1997+
1998+
/* result is at most srclen codepoints plus terminating NUL */
1999+
dstsize = srclen * MAX_MULTIBYTE_CHAR_LEN + 1;
2000+
dst = (unsigned char *) palloc(dstsize);
2001+
2002+
while (srcoff < nbytes)
2003+
{
2004+
pg_wchar u1 = utf8_to_unicode(src + srcoff);
2005+
pg_wchar u2;
2006+
int u1len = unicode_utf8len(u1);
2007+
int u2len;
2008+
2009+
if (wasalnum)
2010+
u2 = unicode_lowercase_simple(u1);
2011+
else
2012+
u2 = unicode_uppercase_simple(u1);
2013+
2014+
u2len = unicode_utf8len(u2);
2015+
2016+
Assert(dstoff + u2len + 1 <= dstsize);
2017+
2018+
wasalnum = pg_u_isalnum(u2, true);
2019+
2020+
unicode_to_utf8(u2, dst + dstoff);
2021+
srcoff += u1len;
2022+
dstoff += u2len;
2023+
}
2024+
2025+
Assert(dstoff + 1 <= dstsize);
2026+
*(dst + dstoff) = '\0';
2027+
dstoff++;
2028+
2029+
/* allocate result buffer of the right size and free workspace */
2030+
result = palloc(dstoff);
2031+
memcpy(result, dst, dstoff);
2032+
pfree(dst);
2033+
}
2034+
else
19232035
{
19242036
Assert(!mylocale || mylocale->provider == COLLPROVIDER_LIBC);
19252037

0 commit comments

Comments
 (0)