Skip to content

Commit bfc5992

Browse files
committed
Add SQL function CASEFOLD().
Useful for caseless matching. Similar to LOWER(), but avoids edge-case problems with using LOWER() for caseless matching. For collations that support it, CASEFOLD() handles characters with more than two case variations or multi-character case variations. Some characters may fold to uppercase. The results of case folding are also more stable across Unicode versions than LOWER() or UPPER(). Discussion: https://postgr.es/m/a1886ddfcd8f60cb3e905c93009b646b4cfb74c5.camel%40j-davis.com Reviewed-by: Ian Lawrence Barwick
1 parent f15538c commit bfc5992

14 files changed

+278
-3
lines changed

doc/src/sgml/func.sgml

+44-2
Original file line numberDiff line numberDiff line change
@@ -2596,7 +2596,7 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in
25962596

25972597
<row>
25982598
<entry role="func_table_entry"><para role="func_signature">
2599-
<indexterm>
2599+
<indexterm id="function-lower">
26002600
<primary>lower</primary>
26012601
</indexterm>
26022602
<function>lower</function> ( <type>text</type> )
@@ -2657,7 +2657,7 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in
26572657

26582658
<row>
26592659
<entry role="func_table_entry"><para role="func_signature">
2660-
<indexterm>
2660+
<indexterm id="function-normalize">
26612661
<primary>normalize</primary>
26622662
</indexterm>
26632663
<indexterm>
@@ -3109,6 +3109,48 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in
31093109
</para></entry>
31103110
</row>
31113111

3112+
<row>
3113+
<entry role="func_table_entry"><para role="func_signature">
3114+
<indexterm>
3115+
<primary>casefold</primary>
3116+
</indexterm>
3117+
<function>casefold</function> ( <type>text</type> )
3118+
<returnvalue>text</returnvalue>
3119+
</para>
3120+
<para>
3121+
Performs case folding of the input string according to the collation.
3122+
Case folding is similar to case conversion, but the purpose of case
3123+
folding is to facilitate case-insensitive comparison of strings,
3124+
whereas the purpose of case conversion is to convert to a particular
3125+
cased form. This function can only be used when the server encoding
3126+
is <literal>UTF8</literal>.
3127+
</para>
3128+
<para>
3129+
Ordinarily, case folding simply converts to lowercase, but there are a
3130+
few notable exceptions depending on the collation. For instance, the
3131+
character <literal>Σ</literal> (U+03A3) has two lowercase forms:
3132+
<literal>σ</literal> (U+03C3) and <literal>ς</literal> (U+03C2); case
3133+
folding in the <literal>PG_C_UTF8</literal> collation maps all three
3134+
forms to <literal>σ</literal>. Additionally, the result is not
3135+
necessarily lowercase; some characters may be folded to uppercase.
3136+
</para>
3137+
<para>
3138+
Case folding may change the length of the string. For instance, in
3139+
the <literal>PG_UNICODE_FAST</literal> collation, <literal>ß</literal>
3140+
(U+00DF) folds to <literal>ss</literal>.
3141+
</para>
3142+
<para>
3143+
<function>casefold</function> can be used for Unicode Default Caseless
3144+
Matching. It does not always preserve the normalized form of the
3145+
input string (see <xref linkend="function-normalize"/>).
3146+
</para>
3147+
<para>
3148+
The <literal>libc</literal> provider doesn't support case folding, so
3149+
<function>casefold</function> is identical to <xref
3150+
linkend="function-lower"/>.
3151+
</para></entry>
3152+
</row>
3153+
31123154
<row>
31133155
<entry role="func_table_entry"><para role="func_signature">
31143156
<indexterm>

src/backend/utils/adt/formatting.c

+69
Original file line numberDiff line numberDiff line change
@@ -1819,6 +1819,75 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
18191819
return result;
18201820
}
18211821

1822+
/*
1823+
* collation-aware, wide-character-aware case folding
1824+
*
1825+
* We pass the number of bytes so we can pass varlena and char*
1826+
* to this function. The result is a palloc'd, null-terminated string.
1827+
*/
1828+
char *
1829+
str_casefold(const char *buff, size_t nbytes, Oid collid)
1830+
{
1831+
char *result;
1832+
pg_locale_t mylocale;
1833+
1834+
if (!buff)
1835+
return NULL;
1836+
1837+
if (!OidIsValid(collid))
1838+
{
1839+
/*
1840+
* This typically means that the parser could not resolve a conflict
1841+
* of implicit collations, so report it that way.
1842+
*/
1843+
ereport(ERROR,
1844+
(errcode(ERRCODE_INDETERMINATE_COLLATION),
1845+
errmsg("could not determine which collation to use for %s function",
1846+
"lower()"),
1847+
errhint("Use the COLLATE clause to set the collation explicitly.")));
1848+
}
1849+
1850+
if (GetDatabaseEncoding() != PG_UTF8)
1851+
ereport(ERROR,
1852+
(errcode(ERRCODE_SYNTAX_ERROR),
1853+
errmsg("Unicode case folding can only be performed if server encoding is UTF8")));
1854+
1855+
mylocale = pg_newlocale_from_collation(collid);
1856+
1857+
/* C/POSIX collations use this path regardless of database encoding */
1858+
if (mylocale->ctype_is_c)
1859+
{
1860+
result = asc_tolower(buff, nbytes);
1861+
}
1862+
else
1863+
{
1864+
const char *src = buff;
1865+
size_t srclen = nbytes;
1866+
size_t dstsize;
1867+
char *dst;
1868+
size_t needed;
1869+
1870+
/* first try buffer of equal size plus terminating NUL */
1871+
dstsize = srclen + 1;
1872+
dst = palloc(dstsize);
1873+
1874+
needed = pg_strfold(dst, dstsize, src, srclen, mylocale);
1875+
if (needed + 1 > dstsize)
1876+
{
1877+
/* grow buffer if needed and retry */
1878+
dstsize = needed + 1;
1879+
dst = repalloc(dst, dstsize);
1880+
needed = pg_strfold(dst, dstsize, src, srclen, mylocale);
1881+
Assert(needed + 1 <= dstsize);
1882+
}
1883+
1884+
Assert(dst[needed] == '\0');
1885+
result = dst;
1886+
}
1887+
1888+
return result;
1889+
}
1890+
18221891
/*
18231892
* ASCII-only lower function
18241893
*

src/backend/utils/adt/oracle_compat.c

+16
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,22 @@ initcap(PG_FUNCTION_ARGS)
126126
PG_RETURN_TEXT_P(result);
127127
}
128128

129+
Datum
130+
casefold(PG_FUNCTION_ARGS)
131+
{
132+
text *in_string = PG_GETARG_TEXT_PP(0);
133+
char *out_string;
134+
text *result;
135+
136+
out_string = str_casefold(VARDATA_ANY(in_string),
137+
VARSIZE_ANY_EXHDR(in_string),
138+
PG_GET_COLLATION());
139+
result = cstring_to_text(out_string);
140+
pfree(out_string);
141+
142+
PG_RETURN_TEXT_P(result);
143+
}
144+
129145

130146
/********************************************************************
131147
*

src/backend/utils/adt/pg_locale.c

+24
Original file line numberDiff line numberDiff line change
@@ -106,13 +106,17 @@ extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src,
106106
ssize_t srclen, pg_locale_t locale);
107107
extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src,
108108
ssize_t srclen, pg_locale_t locale);
109+
extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src,
110+
ssize_t srclen, pg_locale_t locale);
109111

110112
extern size_t strlower_icu(char *dst, size_t dstsize, const char *src,
111113
ssize_t srclen, pg_locale_t locale);
112114
extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src,
113115
ssize_t srclen, pg_locale_t locale);
114116
extern size_t strupper_icu(char *dst, size_t dstsize, const char *src,
115117
ssize_t srclen, pg_locale_t locale);
118+
extern size_t strfold_icu(char *dst, size_t dstsize, const char *src,
119+
ssize_t srclen, pg_locale_t locale);
116120

117121
extern size_t strlower_libc(char *dst, size_t dstsize, const char *src,
118122
ssize_t srclen, pg_locale_t locale);
@@ -1447,6 +1451,26 @@ pg_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
14471451
return 0; /* keep compiler quiet */
14481452
}
14491453

1454+
size_t
1455+
pg_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
1456+
pg_locale_t locale)
1457+
{
1458+
if (locale->provider == COLLPROVIDER_BUILTIN)
1459+
return strfold_builtin(dst, dstsize, src, srclen, locale);
1460+
#ifdef USE_ICU
1461+
else if (locale->provider == COLLPROVIDER_ICU)
1462+
return strfold_icu(dst, dstsize, src, srclen, locale);
1463+
#endif
1464+
/* for libc, just use strlower */
1465+
else if (locale->provider == COLLPROVIDER_LIBC)
1466+
return strlower_libc(dst, dstsize, src, srclen, locale);
1467+
else
1468+
/* shouldn't happen */
1469+
PGLOCALE_SUPPORT_ERROR(locale->provider);
1470+
1471+
return 0; /* keep compiler quiet */
1472+
}
1473+
14501474
/*
14511475
* pg_strcoll
14521476
*

src/backend/utils/adt/pg_locale_builtin.c

+10
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src,
3131
ssize_t srclen, pg_locale_t locale);
3232
extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src,
3333
ssize_t srclen, pg_locale_t locale);
34+
extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src,
35+
ssize_t srclen, pg_locale_t locale);
3436

3537

3638
struct WordBoundaryState
@@ -107,6 +109,14 @@ strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
107109
locale->info.builtin.casemap_full);
108110
}
109111

112+
size_t
113+
strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
114+
pg_locale_t locale)
115+
{
116+
return unicode_strfold(dest, destsize, src, srclen,
117+
locale->info.builtin.casemap_full);
118+
}
119+
110120
pg_locale_t
111121
create_pg_locale_builtin(Oid collid, MemoryContext context)
112122
{

src/backend/utils/adt/pg_locale_icu.c

+58
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src,
5454
ssize_t srclen, pg_locale_t locale);
5555
extern size_t strupper_icu(char *dst, size_t dstsize, const char *src,
5656
ssize_t srclen, pg_locale_t locale);
57+
extern size_t strfold_icu(char *dst, size_t dstsize, const char *src,
58+
ssize_t srclen, pg_locale_t locale);
5759

5860
#ifdef USE_ICU
5961

@@ -117,6 +119,10 @@ static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
117119
const UChar *src, int32_t srcLength,
118120
const char *locale,
119121
UErrorCode *pErrorCode);
122+
static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
123+
const UChar *src, int32_t srcLength,
124+
const char *locale,
125+
UErrorCode *pErrorCode);
120126

121127
static const struct collate_methods collate_methods_icu = {
122128
.strncoll = strncoll_icu,
@@ -439,6 +445,26 @@ strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
439445
return result_len;
440446
}
441447

448+
size_t
449+
strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
450+
pg_locale_t locale)
451+
{
452+
int32_t len_uchar;
453+
int32_t len_conv;
454+
UChar *buff_uchar;
455+
UChar *buff_conv;
456+
size_t result_len;
457+
458+
len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
459+
len_conv = icu_convert_case(u_strFoldCase_default, locale,
460+
&buff_conv, buff_uchar, len_uchar);
461+
result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
462+
pfree(buff_uchar);
463+
pfree(buff_conv);
464+
465+
return result_len;
466+
}
467+
442468
/*
443469
* strncoll_icu_utf8
444470
*
@@ -673,6 +699,38 @@ u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
673699
NULL, locale, pErrorCode);
674700
}
675701

702+
static int32_t
703+
u_strFoldCase_default(UChar *dest, int32_t destCapacity,
704+
const UChar *src, int32_t srcLength,
705+
const char *locale,
706+
UErrorCode *pErrorCode)
707+
{
708+
uint32 options = U_FOLD_CASE_DEFAULT;
709+
char lang[3];
710+
UErrorCode status;
711+
712+
/*
713+
* Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
714+
* folding does not accept a locale. Instead it just supports a single
715+
* option relevant to Turkic languages 'az' and 'tr'; check for those
716+
* languages to enable the option.
717+
*/
718+
status = U_ZERO_ERROR;
719+
uloc_getLanguage(locale, lang, 3, &status);
720+
if (U_SUCCESS(status))
721+
{
722+
/*
723+
* The option name is confusing, but it causes u_strFoldCase to use
724+
* the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
725+
*/
726+
if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0)
727+
options = U_FOLD_CASE_EXCLUDE_SPECIAL_I;
728+
}
729+
730+
return u_strFoldCase(dest, destCapacity, src, srcLength,
731+
options, pErrorCode);
732+
}
733+
676734
/*
677735
* strncoll_icu
678736
*

src/include/catalog/catversion.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,6 @@
5757
*/
5858

5959
/* yyyymmddN */
60-
#define CATALOG_VERSION_NO 202501231
60+
#define CATALOG_VERSION_NO 202501232
6161

6262
#endif

src/include/catalog/pg_proc.dat

+3
Original file line numberDiff line numberDiff line change
@@ -3623,6 +3623,9 @@
36233623
{ oid => '872', descr => 'capitalize each word',
36243624
proname => 'initcap', prorettype => 'text', proargtypes => 'text',
36253625
prosrc => 'initcap' },
3626+
{ oid => '9569', descr => 'fold case',
3627+
proname => 'casefold', prorettype => 'text', proargtypes => 'text',
3628+
prosrc => 'casefold' },
36263629
{ oid => '873', descr => 'left-pad string to length',
36273630
proname => 'lpad', prorettype => 'text', proargtypes => 'text int4 text',
36283631
prosrc => 'lpad' },

src/include/utils/formatting.h

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
extern char *str_tolower(const char *buff, size_t nbytes, Oid collid);
2222
extern char *str_toupper(const char *buff, size_t nbytes, Oid collid);
2323
extern char *str_initcap(const char *buff, size_t nbytes, Oid collid);
24+
extern char *str_casefold(const char *buff, size_t nbytes, Oid collid);
2425

2526
extern char *asc_tolower(const char *buff, size_t nbytes);
2627
extern char *asc_toupper(const char *buff, size_t nbytes);

src/include/utils/pg_locale.h

+3
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,9 @@ extern size_t pg_strtitle(char *dest, size_t destsize,
134134
extern size_t pg_strupper(char *dest, size_t destsize,
135135
const char *src, ssize_t srclen,
136136
pg_locale_t locale);
137+
extern size_t pg_strfold(char *dest, size_t destsize,
138+
const char *src, ssize_t srclen,
139+
pg_locale_t locale);
137140
extern int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale);
138141
extern int pg_strncoll(const char *arg1, ssize_t len1,
139142
const char *arg2, ssize_t len2, pg_locale_t locale);

src/test/regress/expected/collate.icu.utf8.out

+24
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,30 @@ SELECT a, x, y FROM collate_test10 ORDER BY lower(y), a;
255255
1 | hij | hij
256256
(2 rows)
257257

258+
SELECT lower('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "en-x-icu");
259+
lower
260+
-------------------------------
261+
abcd 123 #$% ıiii̇ ß ß dždždž σσς
262+
(1 row)
263+
264+
SELECT casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "en-x-icu");
265+
casefold
266+
---------------------------------
267+
abcd 123 #$% ıiii̇ ss ss dždždž σσσ
268+
(1 row)
269+
270+
SELECT lower('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "tr-x-icu");
271+
lower
272+
-------------------------------
273+
abcd 123 #$% ıiıi ß ß dždždž σσς
274+
(1 row)
275+
276+
SELECT casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "tr-x-icu");
277+
casefold
278+
---------------------------------
279+
abcd 123 #$% ıiıi ss ss dždždž σσσ
280+
(1 row)
281+
258282
-- LIKE/ILIKE
259283
SELECT * FROM collate_test1 WHERE b LIKE 'abc';
260284
a | b

0 commit comments

Comments
 (0)