Skip to content

Commit 1e16a81

Browse files
committed
Teach regular expression operators to honor collations.
This involves getting the character classification and case-folding functions in the regex library to use the collations infrastructure. Most of this work had been done already in connection with the upper/lower and LIKE logic, so it was a simple matter of transposition. While at it, split out these functions into a separate source file regc_pg_locale.c, so that they can be correctly labeled with the Postgres project's license rather than the Scriptics license. These functions are 100% Postgres-written code whereas what remains in regc_locale.c is still mostly not ours, so lumping them both under the same copyright notice was getting more and more misleading.
1 parent 210f95f commit 1e16a81

File tree

12 files changed

+819
-192
lines changed

12 files changed

+819
-192
lines changed

doc/src/sgml/charset.sgml

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -221,17 +221,21 @@ initdb --locale=sv_SE
221221

222222
<listitem>
223223
<para>
224-
The ability to use indexes with <literal>LIKE</> clauses
225-
<indexterm><primary>LIKE</><secondary>and locales</></indexterm>
224+
The <function>upper</>, <function>lower</>, and <function>initcap</>
225+
functions
226+
<indexterm><primary>upper</><secondary>and locales</></indexterm>
227+
<indexterm><primary>lower</><secondary>and locales</></indexterm>
226228
</para>
227229
</listitem>
228230

229231
<listitem>
230232
<para>
231-
The <function>upper</>, <function>lower</>, and <function>initcap</>
232-
functions
233-
<indexterm><primary>upper</><secondary>and locales</></indexterm>
234-
<indexterm><primary>lower</><secondary>and locales</></indexterm>
233+
Pattern matching operators (<literal>LIKE</>, <literal>SIMILAR TO</>,
234+
and POSIX-style regular expressions); locales affect both
235+
case-insensitive matching and the classification of characters by
236+
character-class regular expressions
237+
<indexterm><primary>LIKE</><secondary>and locales</></indexterm>
238+
<indexterm><primary>regular expressions</><secondary>and locales</></indexterm>
235239
</para>
236240
</listitem>
237241

@@ -241,6 +245,12 @@ initdb --locale=sv_SE
241245
<indexterm><primary>to_char</><secondary>and locales</></indexterm>
242246
</para>
243247
</listitem>
248+
249+
<listitem>
250+
<para>
251+
The ability to use indexes with <literal>LIKE</> clauses
252+
</para>
253+
</listitem>
244254
</itemizedlist>
245255
</para>
246256

@@ -319,8 +329,8 @@ initdb --locale=sv_SE
319329
<indexterm zone="collation"><primary>collation</></>
320330

321331
<para>
322-
The collation feature allows specifying the sort order and certain
323-
other locale aspects of data per-column, or even per-operation.
332+
The collation feature allows specifying the sort order and character
333+
classification behavior of data per-column, or even per-operation.
324334
This alleviates the restriction that the
325335
<symbol>LC_COLLATE</symbol> and <symbol>LC_CTYPE</symbol> settings
326336
of a database cannot be changed after its creation.
@@ -351,8 +361,8 @@ initdb --locale=sv_SE
351361
</para>
352362

353363
<para>
354-
When the database system has to perform an ordering or a
355-
comparison, it uses the collation of the input expression. This
364+
When the database system has to perform an ordering or a character
365+
classification, it uses the collation of the input expression. This
356366
happens, for example, with <literal>ORDER BY</literal> clauses
357367
and function or operator calls such as <literal>&lt;</literal>.
358368
The collation to apply for an <literal>ORDER BY</literal> clause
@@ -361,7 +371,8 @@ initdb --locale=sv_SE
361371
below. In addition to comparison operators, collations are taken into
362372
account by functions that convert between lower and upper case
363373
letters, such as <function>lower</>, <function>upper</>, and
364-
<function>initcap</>.
374+
<function>initcap</>; by pattern matching operators; and by
375+
<function>to_char</> and related functions.
365376
</para>
366377

367378
<para>

src/backend/libpq/hba.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include <arpa/inet.h>
2626
#include <unistd.h>
2727

28+
#include "catalog/pg_collation.h"
2829
#include "libpq/ip.h"
2930
#include "libpq/libpq.h"
3031
#include "regex/regex.h"
@@ -1781,7 +1782,7 @@ parse_ident_usermap(List *line, int line_number, const char *usermap_name,
17811782
* XXX: Major room for optimization: regexps could be compiled when
17821783
* the file is loaded and then re-used in every connection.
17831784
*/
1784-
r = pg_regcomp(&re, wstr, wlen, REG_ADVANCED);
1785+
r = pg_regcomp(&re, wstr, wlen, REG_ADVANCED, C_COLLATION_OID);
17851786
if (r)
17861787
{
17871788
char errstr[100];

src/backend/regex/Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ OBJS = regcomp.o regerror.o regexec.o regfree.o
1717
include $(top_srcdir)/src/backend/common.mk
1818

1919
# mark inclusion dependencies between .c files explicitly
20-
regcomp.o: regcomp.c regc_lex.c regc_color.c regc_nfa.c regc_cvec.c regc_locale.c
20+
regcomp.o: regcomp.c regc_lex.c regc_color.c regc_nfa.c regc_cvec.c \
21+
regc_locale.c regc_pg_locale.c
2122

2223
regexec.o: regexec.c rege_dfa.c

src/backend/regex/regc_locale.c

Lines changed: 0 additions & 165 deletions
Original file line numberDiff line numberDiff line change
@@ -350,171 +350,6 @@ static const struct cname
350350
};
351351

352352

353-
/*
354-
* ctype functions adapted to work on pg_wchar (a/k/a chr)
355-
*
356-
* When working in UTF8 encoding, we use the <wctype.h> functions if
357-
* available. This assumes that every platform uses Unicode codepoints
358-
* directly as the wchar_t representation of Unicode. On some platforms
359-
* wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
360-
*
361-
* In all other encodings, we use the <ctype.h> functions for pg_wchar
362-
* values up to 255, and punt for values above that. This is only 100%
363-
* correct in single-byte encodings such as LATINn. However, non-Unicode
364-
* multibyte encodings are mostly Far Eastern character sets for which the
365-
* properties being tested here aren't relevant for higher code values anyway.
366-
*
367-
* NB: the coding here assumes pg_wchar is an unsigned type.
368-
*/
369-
370-
static int
371-
pg_wc_isdigit(pg_wchar c)
372-
{
373-
#ifdef USE_WIDE_UPPER_LOWER
374-
if (GetDatabaseEncoding() == PG_UTF8)
375-
{
376-
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
377-
return iswdigit((wint_t) c);
378-
}
379-
#endif
380-
return (c <= (pg_wchar) UCHAR_MAX && isdigit((unsigned char) c));
381-
}
382-
383-
static int
384-
pg_wc_isalpha(pg_wchar c)
385-
{
386-
#ifdef USE_WIDE_UPPER_LOWER
387-
if (GetDatabaseEncoding() == PG_UTF8)
388-
{
389-
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
390-
return iswalpha((wint_t) c);
391-
}
392-
#endif
393-
return (c <= (pg_wchar) UCHAR_MAX && isalpha((unsigned char) c));
394-
}
395-
396-
static int
397-
pg_wc_isalnum(pg_wchar c)
398-
{
399-
#ifdef USE_WIDE_UPPER_LOWER
400-
if (GetDatabaseEncoding() == PG_UTF8)
401-
{
402-
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
403-
return iswalnum((wint_t) c);
404-
}
405-
#endif
406-
return (c <= (pg_wchar) UCHAR_MAX && isalnum((unsigned char) c));
407-
}
408-
409-
static int
410-
pg_wc_isupper(pg_wchar c)
411-
{
412-
#ifdef USE_WIDE_UPPER_LOWER
413-
if (GetDatabaseEncoding() == PG_UTF8)
414-
{
415-
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
416-
return iswupper((wint_t) c);
417-
}
418-
#endif
419-
return (c <= (pg_wchar) UCHAR_MAX && isupper((unsigned char) c));
420-
}
421-
422-
static int
423-
pg_wc_islower(pg_wchar c)
424-
{
425-
#ifdef USE_WIDE_UPPER_LOWER
426-
if (GetDatabaseEncoding() == PG_UTF8)
427-
{
428-
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
429-
return iswlower((wint_t) c);
430-
}
431-
#endif
432-
return (c <= (pg_wchar) UCHAR_MAX && islower((unsigned char) c));
433-
}
434-
435-
static int
436-
pg_wc_isgraph(pg_wchar c)
437-
{
438-
#ifdef USE_WIDE_UPPER_LOWER
439-
if (GetDatabaseEncoding() == PG_UTF8)
440-
{
441-
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
442-
return iswgraph((wint_t) c);
443-
}
444-
#endif
445-
return (c <= (pg_wchar) UCHAR_MAX && isgraph((unsigned char) c));
446-
}
447-
448-
static int
449-
pg_wc_isprint(pg_wchar c)
450-
{
451-
#ifdef USE_WIDE_UPPER_LOWER
452-
if (GetDatabaseEncoding() == PG_UTF8)
453-
{
454-
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
455-
return iswprint((wint_t) c);
456-
}
457-
#endif
458-
return (c <= (pg_wchar) UCHAR_MAX && isprint((unsigned char) c));
459-
}
460-
461-
static int
462-
pg_wc_ispunct(pg_wchar c)
463-
{
464-
#ifdef USE_WIDE_UPPER_LOWER
465-
if (GetDatabaseEncoding() == PG_UTF8)
466-
{
467-
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
468-
return iswpunct((wint_t) c);
469-
}
470-
#endif
471-
return (c <= (pg_wchar) UCHAR_MAX && ispunct((unsigned char) c));
472-
}
473-
474-
static int
475-
pg_wc_isspace(pg_wchar c)
476-
{
477-
#ifdef USE_WIDE_UPPER_LOWER
478-
if (GetDatabaseEncoding() == PG_UTF8)
479-
{
480-
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
481-
return iswspace((wint_t) c);
482-
}
483-
#endif
484-
return (c <= (pg_wchar) UCHAR_MAX && isspace((unsigned char) c));
485-
}
486-
487-
static pg_wchar
488-
pg_wc_toupper(pg_wchar c)
489-
{
490-
#ifdef USE_WIDE_UPPER_LOWER
491-
if (GetDatabaseEncoding() == PG_UTF8)
492-
{
493-
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
494-
return towupper((wint_t) c);
495-
}
496-
#endif
497-
if (c <= (pg_wchar) UCHAR_MAX)
498-
return toupper((unsigned char) c);
499-
return c;
500-
}
501-
502-
static pg_wchar
503-
pg_wc_tolower(pg_wchar c)
504-
{
505-
#ifdef USE_WIDE_UPPER_LOWER
506-
if (GetDatabaseEncoding() == PG_UTF8)
507-
{
508-
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
509-
return towlower((wint_t) c);
510-
}
511-
#endif
512-
if (c <= (pg_wchar) UCHAR_MAX)
513-
return tolower((unsigned char) c);
514-
return c;
515-
}
516-
517-
518353
/*
519354
* element - map collating-element name to celt
520355
*/

0 commit comments

Comments
 (0)