Skip to content

Commit e9e535d

Browse files
committed
Fix conversion of SIMILAR TO regexes for character classes
The code that translates SIMILAR TO pattern matching expressions to POSIX-style regular expressions did not consider that square brackets can be nested. For example, in an expression like [[:alpha:]%_], the logic replaced the placeholders '_' and '%' but it should not. This commit fixes the conversion logic by tracking the nesting level of square brackets marking character class areas, while considering that in expressions like []] or [^]] the first closing square bracket is a regular character. Multiple tests are added to show how the conversions should or should not apply applied while in a character class area, with specific cases added for all the characters converted outside character classes like an opening parenthesis '(', dollar sign '$', etc. Author: Laurenz Albe <laurenz.albe@cybertec.at> Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Reviewed-by: Michael Paquier <michael@paquier.xyz> Discussion: https://postgr.es/m/16ab039d1af455652bdf4173402ddda145f2c73b.camel@cybertec.at Backpatch-through: 13
1 parent a7d3e32 commit e9e535d

File tree

3 files changed

+114
-6
lines changed

3 files changed

+114
-6
lines changed

src/backend/utils/adt/regexp.c

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -774,8 +774,11 @@ similar_escape_internal(text *pat_text, text *esc_text)
774774
int plen,
775775
elen;
776776
bool afterescape = false;
777-
bool incharclass = false;
778777
int nquotes = 0;
778+
int charclass_depth = 0; /* Nesting level of character classes,
779+
* encompassed by square brackets */
780+
int charclass_start = 0; /* State of the character class start,
781+
* for carets */
779782

780783
p = VARDATA_ANY(pat_text);
781784
plen = VARSIZE_ANY_EXHDR(pat_text);
@@ -905,7 +908,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
905908
/* fast path */
906909
if (afterescape)
907910
{
908-
if (pchar == '"' && !incharclass) /* escape-double-quote? */
911+
if (pchar == '"' && charclass_depth < 1) /* escape-double-quote? */
909912
{
910913
/* emit appropriate part separator, per notes above */
911914
if (nquotes == 0)
@@ -954,18 +957,41 @@ similar_escape_internal(text *pat_text, text *esc_text)
954957
/* SQL escape character; do not send to output */
955958
afterescape = true;
956959
}
957-
else if (incharclass)
960+
else if (charclass_depth > 0)
958961
{
959962
if (pchar == '\\')
960963
*r++ = '\\';
961964
*r++ = pchar;
962-
if (pchar == ']')
963-
incharclass = false;
965+
966+
/*
967+
* Ignore a closing bracket at the start of a character class.
968+
* Such a bracket is taken literally rather than closing the
969+
* class. "charclass_start" is 1 right at the beginning of a
970+
* class and 2 after an initial caret.
971+
*/
972+
if (pchar == ']' && charclass_start > 2)
973+
charclass_depth--;
974+
else if (pchar == '[')
975+
charclass_depth++;
976+
977+
/*
978+
* If there is a caret right after the opening bracket, it negates
979+
* the character class, but a following closing bracket should
980+
* still be treated as a normal character. That holds only for
981+
* the first caret, so only the values 1 and 2 mean that closing
982+
* brackets should be taken literally.
983+
*/
984+
if (pchar == '^')
985+
charclass_start++;
986+
else
987+
charclass_start = 3; /* definitely past the start */
964988
}
965989
else if (pchar == '[')
966990
{
991+
/* start of a character class */
967992
*r++ = pchar;
968-
incharclass = true;
993+
charclass_depth++;
994+
charclass_start = 1;
969995
}
970996
else if (pchar == '%')
971997
{

src/test/regress/expected/strings.out

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -596,6 +596,68 @@ SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
596596
SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
597597
ERROR: invalid escape string
598598
HINT: Escape string must be empty or one character.
599+
-- Characters that should be left alone in character classes when a
600+
-- SIMILAR TO regexp pattern is converted to POSIX style.
601+
-- Underscore "_"
602+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_';
603+
QUERY PLAN
604+
------------------------------------------------
605+
Seq Scan on text_tbl
606+
Filter: (f1 ~ '^(?:.[_[:alpha:]_].)$'::text)
607+
(2 rows)
608+
609+
-- Percentage "%"
610+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%';
611+
QUERY PLAN
612+
--------------------------------------------------
613+
Seq Scan on text_tbl
614+
Filter: (f1 ~ '^(?:.*[%[:alnum:]%].*)$'::text)
615+
(2 rows)
616+
617+
-- Dot "."
618+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].';
619+
QUERY PLAN
620+
--------------------------------------------------
621+
Seq Scan on text_tbl
622+
Filter: (f1 ~ '^(?:\.[.[:alnum:].]\.)$'::text)
623+
(2 rows)
624+
625+
-- Dollar "$"
626+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$';
627+
QUERY PLAN
628+
--------------------------------------------------
629+
Seq Scan on text_tbl
630+
Filter: (f1 ~ '^(?:\$[$[:alnum:]$]\$)$'::text)
631+
(2 rows)
632+
633+
-- Opening parenthesis "("
634+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](](';
635+
ERROR: invalid regular expression: parentheses () not balanced
636+
-- Caret "^"
637+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^';
638+
QUERY PLAN
639+
------------------------------------------------------------------------
640+
Seq Scan on text_tbl
641+
Filter: (f1 ~ '^(?:\^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]\^)$'::text)
642+
(2 rows)
643+
644+
-- Closing square bracket "]" at the beginning of character class
645+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%';
646+
QUERY PLAN
647+
------------------------------------------------
648+
Seq Scan on text_tbl
649+
Filter: (f1 ~ '^(?:[]%][^]%][^%].*)$'::text)
650+
(2 rows)
651+
652+
-- Closing square bracket effective after two carets at the beginning
653+
-- of character class.
654+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
655+
QUERY PLAN
656+
---------------------------------------
657+
Seq Scan on text_tbl
658+
Filter: (f1 ~ '^(?:[^^]\^)$'::text)
659+
(2 rows)
660+
599661
-- Test backslash escapes in regexp_replace's replacement string
600662
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
601663
regexp_replace

src/test/regress/sql/strings.sql

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,26 @@ SELECT 'abcd\efg' SIMILAR TO '_bcd\%' ESCAPE '' AS true;
193193
SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
194194
SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
195195

196+
-- Characters that should be left alone in character classes when a
197+
-- SIMILAR TO regexp pattern is converted to POSIX style.
198+
-- Underscore "_"
199+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_';
200+
-- Percentage "%"
201+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%';
202+
-- Dot "."
203+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].';
204+
-- Dollar "$"
205+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$';
206+
-- Opening parenthesis "("
207+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](](';
208+
-- Caret "^"
209+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^';
210+
-- Closing square bracket "]" at the beginning of character class
211+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%';
212+
-- Closing square bracket effective after two carets at the beginning
213+
-- of character class.
214+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
215+
196216
-- Test backslash escapes in regexp_replace's replacement string
197217
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
198218
SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\&Y', 'g');

0 commit comments

Comments
 (0)