Skip to content

Commit 779557b

Browse files
committed
Prevent regexp back-refs from sometimes matching when they shouldn't.
The recursion in cdissect() was careless about clearing match data for capturing parentheses after rejecting a partial match. This could allow a later back-reference to succeed when by rights it should fail for lack of a defined referent. To fix, think a little more rigorously about what the contract between different levels of cdissect's recursion needs to be. With the right spec, we can fix this using fewer rather than more resets of the match data; the key decision being that a failed sub-match is now explicitly responsible for clearing any matches it may have set. There are enough other cross-checks and optimizations in the code that it's not especially easy to exhibit this problem; usually, the match will fail as-expected. Plus, regexps that are even potentially vulnerable are most likely user errors, since there's just not much point in writing a back-ref that doesn't always have a referent. These facts perhaps explain why the issue hasn't been detected, even though it's almost certainly a couple of decades old. Discussion: https://postgr.es/m/151435.1629733387@sss.pgh.pa.us
1 parent e3fb617 commit 779557b

File tree

5 files changed

+69
-8
lines changed

5 files changed

+69
-8
lines changed

src/backend/regex/regexec.c

Lines changed: 34 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,8 @@ pg_regexec(regex_t *re,
230230
}
231231
else
232232
v->pmatch = pmatch;
233+
if (v->nmatch > 0)
234+
zapallsubs(v->pmatch, v->nmatch);
233235
v->details = details;
234236
v->start = (chr *) string;
235237
v->search_start = (chr *) string + search_start;
@@ -473,7 +475,6 @@ find(struct vars *v,
473475
return REG_OKAY;
474476

475477
/* find submatches */
476-
zapallsubs(v->pmatch, v->nmatch);
477478
return cdissect(v, v->g->tree, begin, end);
478479
}
479480

@@ -584,7 +585,6 @@ cfindloop(struct vars *v,
584585
break; /* no match with this begin point, try next */
585586
MDEBUG(("tentative end %ld\n", LOFF(end)));
586587
/* Dissect the potential match to see if it really matches */
587-
zapallsubs(v->pmatch, v->nmatch);
588588
er = cdissect(v, v->g->tree, begin, end);
589589
if (er == REG_OKAY)
590590
{
@@ -632,6 +632,8 @@ cfindloop(struct vars *v,
632632

633633
/*
634634
* zapallsubs - initialize all subexpression matches to "no match"
635+
*
636+
* Note that p[0], the overall-match location, is not touched.
635637
*/
636638
static void
637639
zapallsubs(regmatch_t *p,
@@ -701,8 +703,30 @@ subset(struct vars *v,
701703
* DFA and found that the proposed substring satisfies the DFA. (We make
702704
* the caller do that because in concatenation and iteration nodes, it's
703705
* much faster to check all the substrings against the child DFAs before we
704-
* recurse.) Also, caller must have cleared subexpression match data via
705-
* zaptreesubs (or zapallsubs at the top level).
706+
* recurse.)
707+
*
708+
* A side-effect of a successful match is to save match locations for
709+
* capturing subexpressions in v->pmatch[]. This is a little bit tricky,
710+
* so we make the following rules:
711+
* 1. Before initial entry to cdissect, all match data must have been
712+
* cleared (this is seen to by zapallsubs).
713+
* 2. Before any recursive entry to cdissect, the match data for that
714+
* subexpression tree must be guaranteed clear (see zaptreesubs).
715+
* 3. When returning REG_OKAY, each level of cdissect will have saved
716+
* any relevant match locations.
717+
* 4. When returning REG_NOMATCH, each level of cdissect will guarantee
718+
* that its subexpression match locations are again clear.
719+
* 5. No guarantees are made for error cases (i.e., other result codes).
720+
* 6. When a level of cdissect abandons a successful sub-match, it will
721+
* clear that subtree's match locations with zaptreesubs before trying
722+
* any new DFA match or cdissect call for that subtree or any subtree
723+
* to its right (that is, any subtree that could have a backref into the
724+
* abandoned match).
725+
* This may seem overly complicated, but it's difficult to simplify it
726+
* because of the provision that match locations must be reset before
727+
* any fresh DFA match (a rule that is needed to make dfa_backref safe).
728+
* That means it won't work to just reset relevant match locations at the
729+
* start of each cdissect level.
706730
*/
707731
static int /* regexec return code */
708732
cdissect(struct vars *v,
@@ -827,6 +851,8 @@ ccondissect(struct vars *v,
827851
MDEBUG(("%d: successful\n", t->id));
828852
return REG_OKAY;
829853
}
854+
/* Reset left's matches (right should have done so itself) */
855+
zaptreesubs(v, left);
830856
}
831857
if (er != REG_NOMATCH)
832858
return er;
@@ -849,8 +875,6 @@ ccondissect(struct vars *v,
849875
return REG_NOMATCH;
850876
}
851877
MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
852-
zaptreesubs(v, left);
853-
zaptreesubs(v, right);
854878
}
855879

856880
/* can't get here */
@@ -908,6 +932,8 @@ crevcondissect(struct vars *v,
908932
MDEBUG(("%d: successful\n", t->id));
909933
return REG_OKAY;
910934
}
935+
/* Reset left's matches (right should have done so itself) */
936+
zaptreesubs(v, left);
911937
}
912938
if (er != REG_NOMATCH)
913939
return er;
@@ -930,8 +956,6 @@ crevcondissect(struct vars *v,
930956
return REG_NOMATCH;
931957
}
932958
MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
933-
zaptreesubs(v, left);
934-
zaptreesubs(v, right);
935959
}
936960

937961
/* can't get here */
@@ -1200,6 +1224,7 @@ citerdissect(struct vars *v,
12001224

12011225
for (i = nverified + 1; i <= k; i++)
12021226
{
1227+
/* zap any match data from a non-last iteration */
12031228
zaptreesubs(v, t->child);
12041229
er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
12051230
if (er == REG_OKAY)
@@ -1412,6 +1437,7 @@ creviterdissect(struct vars *v,
14121437

14131438
for (i = nverified + 1; i <= k; i++)
14141439
{
1440+
/* zap any match data from a non-last iteration */
14151441
zaptreesubs(v, t->child);
14161442
er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
14171443
if (er == REG_OKAY)

src/test/modules/test_regex/expected/test_regex.out

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2636,6 +2636,20 @@ select * from test_regex('^(.+)( \1)+$', 'abc abc abd', 'RP');
26362636
{2,REG_UBACKREF,REG_UNONPOSIX}
26372637
(1 row)
26382638

2639+
-- expectNomatch 14.30 RP {^(.)\1|\1.} {abcdef}
2640+
select * from test_regex('^(.)\1|\1.', 'abcdef', 'RP');
2641+
test_regex
2642+
--------------------------------
2643+
{1,REG_UBACKREF,REG_UNONPOSIX}
2644+
(1 row)
2645+
2646+
-- expectNomatch 14.31 RP {^((.)\2|..)\2} {abadef}
2647+
select * from test_regex('^((.)\2|..)\2', 'abadef', 'RP');
2648+
test_regex
2649+
--------------------------------
2650+
{2,REG_UBACKREF,REG_UNONPOSIX}
2651+
(1 row)
2652+
26392653
-- back reference only matches the string, not any constraints
26402654
select * from test_regex('(^\w+).*\1', 'abc abc abc', 'LRP');
26412655
test_regex

src/test/modules/test_regex/sql/test_regex.sql

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -769,6 +769,10 @@ select * from test_regex('^(.+)( \1)+$', 'abc abc abc', 'RP');
769769
select * from test_regex('^(.+)( \1)+$', 'abc abd abc', 'RP');
770770
-- expectNomatch 14.29 RP {^(.+)( \1)+$} {abc abc abd}
771771
select * from test_regex('^(.+)( \1)+$', 'abc abc abd', 'RP');
772+
-- expectNomatch 14.30 RP {^(.)\1|\1.} {abcdef}
773+
select * from test_regex('^(.)\1|\1.', 'abcdef', 'RP');
774+
-- expectNomatch 14.31 RP {^((.)\2|..)\2} {abadef}
775+
select * from test_regex('^((.)\2|..)\2', 'abadef', 'RP');
772776

773777
-- back reference only matches the string, not any constraints
774778
select * from test_regex('(^\w+).*\1', 'abc abc abc', 'LRP');

src/test/regress/expected/regex.out

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -567,6 +567,19 @@ select 'a' ~ '()+\1';
567567
t
568568
(1 row)
569569

570+
-- Test ancient oversight in when to apply zaptreesubs
571+
select 'abcdef' ~ '^(.)\1|\1.' as f;
572+
f
573+
---
574+
f
575+
(1 row)
576+
577+
select 'abadef' ~ '^((.)\2|..)\2' as f;
578+
f
579+
---
580+
f
581+
(1 row)
582+
570583
-- Add coverage for some cases in checkmatchall
571584
select regexp_match('xy', '.|...');
572585
regexp_match

src/test/regress/sql/regex.sql

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,10 @@ select 'a' ~ '.. ()|\1';
135135
select 'a' ~ '()*\1';
136136
select 'a' ~ '()+\1';
137137

138+
-- Test ancient oversight in when to apply zaptreesubs
139+
select 'abcdef' ~ '^(.)\1|\1.' as f;
140+
select 'abadef' ~ '^((.)\2|..)\2' as f;
141+
138142
-- Add coverage for some cases in checkmatchall
139143
select regexp_match('xy', '.|...');
140144
select regexp_match('xyz', '.|...');

0 commit comments

Comments
 (0)