Skip to content

Commit 9a32717

Browse files
committed
Prevent regexp back-refs from sometimes matching when they shouldn't.
The recursion in cdissect() was careless about clearing match data for capturing parentheses after rejecting a partial match. This could allow a later back-reference to succeed when by rights it should fail for lack of a defined referent. To fix, think a little more rigorously about what the contract between different levels of cdissect's recursion needs to be. With the right spec, we can fix this using fewer rather than more resets of the match data; the key decision being that a failed sub-match is now explicitly responsible for clearing any matches it may have set. There are enough other cross-checks and optimizations in the code that it's not especially easy to exhibit this problem; usually, the match will fail as-expected. Plus, regexps that are even potentially vulnerable are most likely user errors, since there's just not much point in writing a back-ref that doesn't always have a referent. These facts perhaps explain why the issue hasn't been detected, even though it's almost certainly a couple of decades old. Discussion: https://postgr.es/m/151435.1629733387@sss.pgh.pa.us
1 parent ad12311 commit 9a32717

File tree

3 files changed

+51
-8
lines changed

3 files changed

+51
-8
lines changed

src/backend/regex/regexec.c

Lines changed: 34 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,8 @@ pg_regexec(regex_t *re,
226226
}
227227
else
228228
v->pmatch = pmatch;
229+
if (v->nmatch > 0)
230+
zapallsubs(v->pmatch, v->nmatch);
229231
v->details = details;
230232
v->start = (chr *) string;
231233
v->search_start = (chr *) string + search_start;
@@ -455,7 +457,6 @@ find(struct vars *v,
455457
return REG_OKAY;
456458

457459
/* find submatches */
458-
zapallsubs(v->pmatch, v->nmatch);
459460
return cdissect(v, v->g->tree, begin, end);
460461
}
461462

@@ -566,7 +567,6 @@ cfindloop(struct vars *v,
566567
break; /* no match with this begin point, try next */
567568
MDEBUG(("tentative end %ld\n", LOFF(end)));
568569
/* Dissect the potential match to see if it really matches */
569-
zapallsubs(v->pmatch, v->nmatch);
570570
er = cdissect(v, v->g->tree, begin, end);
571571
if (er == REG_OKAY)
572572
{
@@ -614,6 +614,8 @@ cfindloop(struct vars *v,
614614

615615
/*
616616
* zapallsubs - initialize all subexpression matches to "no match"
617+
*
618+
* Note that p[0], the overall-match location, is not touched.
617619
*/
618620
static void
619621
zapallsubs(regmatch_t *p,
@@ -685,8 +687,30 @@ subset(struct vars *v,
685687
* DFA and found that the proposed substring satisfies the DFA. (We make
686688
* the caller do that because in concatenation and iteration nodes, it's
687689
* much faster to check all the substrings against the child DFAs before we
688-
* recurse.) Also, caller must have cleared subexpression match data via
689-
* zaptreesubs (or zapallsubs at the top level).
690+
* recurse.)
691+
*
692+
* A side-effect of a successful match is to save match locations for
693+
* capturing subexpressions in v->pmatch[]. This is a little bit tricky,
694+
* so we make the following rules:
695+
* 1. Before initial entry to cdissect, all match data must have been
696+
* cleared (this is seen to by zapallsubs).
697+
* 2. Before any recursive entry to cdissect, the match data for that
698+
* subexpression tree must be guaranteed clear (see zaptreesubs).
699+
* 3. When returning REG_OKAY, each level of cdissect will have saved
700+
* any relevant match locations.
701+
* 4. When returning REG_NOMATCH, each level of cdissect will guarantee
702+
* that its subexpression match locations are again clear.
703+
* 5. No guarantees are made for error cases (i.e., other result codes).
704+
* 6. When a level of cdissect abandons a successful sub-match, it will
705+
* clear that subtree's match locations with zaptreesubs before trying
706+
* any new DFA match or cdissect call for that subtree or any subtree
707+
* to its right (that is, any subtree that could have a backref into the
708+
* abandoned match).
709+
* This may seem overly complicated, but it's difficult to simplify it
710+
* because of the provision that match locations must be reset before
711+
* any fresh DFA match (a rule that is needed to make dfa_backref safe).
712+
* That means it won't work to just reset relevant match locations at the
713+
* start of each cdissect level.
690714
*/
691715
static int /* regexec return code */
692716
cdissect(struct vars *v,
@@ -804,6 +828,8 @@ ccondissect(struct vars *v,
804828
MDEBUG(("successful\n"));
805829
return REG_OKAY;
806830
}
831+
/* Reset left's matches (right should have done so itself) */
832+
zaptreesubs(v, t->left);
807833
}
808834
if (er != REG_NOMATCH)
809835
return er;
@@ -826,8 +852,6 @@ ccondissect(struct vars *v,
826852
return REG_NOMATCH;
827853
}
828854
MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
829-
zaptreesubs(v, t->left);
830-
zaptreesubs(v, t->right);
831855
}
832856

833857
/* can't get here */
@@ -882,6 +906,8 @@ crevcondissect(struct vars *v,
882906
MDEBUG(("successful\n"));
883907
return REG_OKAY;
884908
}
909+
/* Reset left's matches (right should have done so itself) */
910+
zaptreesubs(v, t->left);
885911
}
886912
if (er != REG_NOMATCH)
887913
return er;
@@ -904,8 +930,6 @@ crevcondissect(struct vars *v,
904930
return REG_NOMATCH;
905931
}
906932
MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
907-
zaptreesubs(v, t->left);
908-
zaptreesubs(v, t->right);
909933
}
910934

911935
/* can't get here */
@@ -1165,6 +1189,7 @@ citerdissect(struct vars *v,
11651189

11661190
for (i = nverified + 1; i <= k; i++)
11671191
{
1192+
/* zap any match data from a non-last iteration */
11681193
zaptreesubs(v, t->left);
11691194
er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
11701195
if (er == REG_OKAY)
@@ -1373,6 +1398,7 @@ creviterdissect(struct vars *v,
13731398

13741399
for (i = nverified + 1; i <= k; i++)
13751400
{
1401+
/* zap any match data from a non-last iteration */
13761402
zaptreesubs(v, t->left);
13771403
er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
13781404
if (er == REG_OKAY)

src/test/regress/expected/regex.out

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -567,6 +567,19 @@ select 'a' ~ '()+\1';
567567
t
568568
(1 row)
569569

570+
-- Test ancient oversight in when to apply zaptreesubs
571+
select 'abcdef' ~ '^(.)\1|\1.' as f;
572+
f
573+
---
574+
f
575+
(1 row)
576+
577+
select 'abadef' ~ '^((.)\2|..)\2' as f;
578+
f
579+
---
580+
f
581+
(1 row)
582+
570583
-- Error conditions
571584
select 'xyz' ~ 'x(\w)(?=\1)'; -- no backrefs in LACONs
572585
ERROR: invalid regular expression: invalid backreference number

src/test/regress/sql/regex.sql

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,10 @@ select 'a' ~ '.. ()|\1';
135135
select 'a' ~ '()*\1';
136136
select 'a' ~ '()+\1';
137137

138+
-- Test ancient oversight in when to apply zaptreesubs
139+
select 'abcdef' ~ '^(.)\1|\1.' as f;
140+
select 'abadef' ~ '^((.)\2|..)\2' as f;
141+
138142
-- Error conditions
139143
select 'xyz' ~ 'x(\w)(?=\1)'; -- no backrefs in LACONs
140144
select 'xyz' ~ 'x(\w)(?=(\1))';

0 commit comments

Comments
 (0)