Skip to content

Commit 244dd79

Browse files
committed
Fix regexp misbehavior with capturing parens inside "{0}".
Regexps like "(.){0}...\1" drew an "invalid backreference number". That's not unreasonable on its face, since the capture group will never be matched if it's iterated zero times. However, other engines such as Perl's don't complain about this, nor do we throw an error for related cases such as "(.)|\1", even though that backref can never succeed either. Also, if the zero-iterations case happens at runtime rather than compile time --- say, "(x)*...\1" when there's no "x" to be found --- that's not an error, we just deem the backref to not match. Making this even less defensible, no error was thrown for nested cases such as "((.)){0}...\2"; and to add insult to injury, those cases could result in assertion failures instead. (It seems that nothing especially bad happened in non-assert builds, though.) Let's just fix it so that no error is thrown and instead the backref is deemed to never match, so that compile-time detection of no iterations behaves the same as run-time detection. Per report from Mark Dilger. This appears to be an aboriginal error in Spencer's library, so back-patch to all supported versions. Pre-v14, it turns out to also be necessary to back-patch one aspect of commits cb76fbd/00116dee5, namely to create capture-node subREs with the begin/end states of their subexpressions, not the current lp/rp of the outer parseqatom invocation. Otherwise delsub complains that we're trying to disconnect a state from itself. This is a bit scary but code examination shows that it's safe: in the pre-v14 code, if we want to wrap iteration around the subexpression, the first thing we do is overwrite the atom's begin/end fields with new states. So the bogus values didn't survive long enough to be used for anything, except if no iteration is required, in which case it doesn't matter. Discussion: https://postgr.es/m/A099E4A8-4377-4C64-A98C-3DEDDC075502@enterprisedb.com
1 parent 5cfcd46 commit 244dd79

File tree

5 files changed

+69
-5
lines changed

5 files changed

+69
-5
lines changed

src/backend/regex/regcomp.c

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1086,11 +1086,23 @@ parseqatom(struct vars *v,
10861086
/* annoying special case: {0} or {0,0} cancels everything */
10871087
if (m == 0 && n == 0)
10881088
{
1089-
if (atom != NULL)
1090-
freesubre(v, atom);
1091-
if (atomtype == '(')
1092-
v->subs[subno] = NULL;
1093-
delsub(v->nfa, lp, rp);
1089+
/*
1090+
* If we had capturing subexpression(s) within the atom, we don't want
1091+
* to destroy them, because it's legal (if useless) to back-ref them
1092+
* later. Hence, just unlink the atom from lp/rp and then ignore it.
1093+
*/
1094+
if (atom != NULL && (atom->flags & CAP))
1095+
{
1096+
delsub(v->nfa, lp, atom->begin);
1097+
delsub(v->nfa, atom->end, rp);
1098+
}
1099+
else
1100+
{
1101+
/* Otherwise, we can clean up any subre infrastructure we made */
1102+
if (atom != NULL)
1103+
freesubre(v, atom);
1104+
delsub(v->nfa, lp, rp);
1105+
}
10941106
EMPTYARC(lp, rp);
10951107
return top;
10961108
}

src/test/modules/test_regex/expected/test_regex.out

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3490,6 +3490,28 @@ select * from test_regex('((.))(\2){0}', 'xy', 'RPQ');
34903490
{x,x,x,NULL}
34913491
(2 rows)
34923492

3493+
-- expectNomatch 21.39 PQR {(.){0}(\1)} xxx
3494+
select * from test_regex('(.){0}(\1)', 'xxx', 'PQR');
3495+
test_regex
3496+
--------------------------------------------
3497+
{2,REG_UBACKREF,REG_UBOUNDS,REG_UNONPOSIX}
3498+
(1 row)
3499+
3500+
-- expectNomatch 21.40 PQR {((.)){0}(\2)} xxx
3501+
select * from test_regex('((.)){0}(\2)', 'xxx', 'PQR');
3502+
test_regex
3503+
--------------------------------------------
3504+
{3,REG_UBACKREF,REG_UBOUNDS,REG_UNONPOSIX}
3505+
(1 row)
3506+
3507+
-- expectMatch 21.41 NPQR {((.)){0}(\2){0}} xyz {} {} {} {}
3508+
select * from test_regex('((.)){0}(\2){0}', 'xyz', 'NPQR');
3509+
test_regex
3510+
------------------------------------------------------------
3511+
{3,REG_UBACKREF,REG_UBOUNDS,REG_UNONPOSIX,REG_UEMPTYMATCH}
3512+
{"",NULL,NULL,NULL}
3513+
(2 rows)
3514+
34933515
-- doing 22 "multicharacter collating elements"
34943516
-- # again ugh
34953517
-- MCCEs are not implemented in Postgres, so we skip all these tests

src/test/modules/test_regex/sql/test_regex.sql

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1015,6 +1015,12 @@ select * from test_regex('(a*)*', 'bc', 'N');
10151015
select * from test_regex(' TO (([a-z0-9._]+|"([^"]+|"")+")+)', 'asd TO foo', 'M');
10161016
-- expectMatch 21.36 RPQ ((.))(\2){0} xy x x x {}
10171017
select * from test_regex('((.))(\2){0}', 'xy', 'RPQ');
1018+
-- expectNomatch 21.39 PQR {(.){0}(\1)} xxx
1019+
select * from test_regex('(.){0}(\1)', 'xxx', 'PQR');
1020+
-- expectNomatch 21.40 PQR {((.)){0}(\2)} xxx
1021+
select * from test_regex('((.)){0}(\2)', 'xxx', 'PQR');
1022+
-- expectMatch 21.41 NPQR {((.)){0}(\2){0}} xyz {} {} {} {}
1023+
select * from test_regex('((.)){0}(\2){0}', 'xyz', 'NPQR');
10181024

10191025
-- doing 22 "multicharacter collating elements"
10201026
-- # again ugh

src/test/regress/expected/regex.out

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -567,6 +567,25 @@ select 'a' ~ '()+\1';
567567
t
568568
(1 row)
569569

570+
-- Test incorrect removal of capture groups within {0}
571+
select 'xxx' ~ '(.){0}(\1)' as f;
572+
f
573+
---
574+
f
575+
(1 row)
576+
577+
select 'xxx' ~ '((.)){0}(\2)' as f;
578+
f
579+
---
580+
f
581+
(1 row)
582+
583+
select 'xyz' ~ '((.)){0}(\2){0}' as t;
584+
t
585+
---
586+
t
587+
(1 row)
588+
570589
-- Test ancient oversight in when to apply zaptreesubs
571590
select 'abcdef' ~ '^(.)\1|\1.' as f;
572591
f

src/test/regress/sql/regex.sql

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,11 @@ select 'a' ~ '.. ()|\1';
135135
select 'a' ~ '()*\1';
136136
select 'a' ~ '()+\1';
137137

138+
-- Test incorrect removal of capture groups within {0}
139+
select 'xxx' ~ '(.){0}(\1)' as f;
140+
select 'xxx' ~ '((.)){0}(\2)' as f;
141+
select 'xyz' ~ '((.)){0}(\2){0}' as t;
142+
138143
-- Test ancient oversight in when to apply zaptreesubs
139144
select 'abcdef' ~ '^(.)\1|\1.' as f;
140145
select 'abadef' ~ '^((.)\2|..)\2' as f;

0 commit comments

Comments
 (0)