Skip to content

Commit 173e29a

Browse files
committed
Fix the general case of quantified regex back-references.
Cases where a back-reference is part of a larger subexpression that is quantified have never worked in Spencer's regex engine, because he used a compile-time transformation that neglected the need to check the back-reference match in iterations before the last one. (That was okay for capturing parens, and we still do it if the regex has *only* capturing parens ... but it's not okay for backrefs.) To make this work properly, we have to add an "iteration" node type to the regex engine's vocabulary of sub-regex nodes. Since this is a moderately large change with a fair risk of introducing new bugs of its own, apply to HEAD only, even though it's a fix for a longstanding bug.
1 parent 0c9e5d5 commit 173e29a

File tree

6 files changed

+884
-55
lines changed

6 files changed

+884
-55
lines changed

src/backend/regex/README

+9-9
Original file line numberDiff line numberDiff line change
@@ -102,15 +102,15 @@ consists of a tree of sub-expressions ("subre"s). Leaf tree nodes are
102102
either plain regular expressions (which are executed as DFAs in the manner
103103
described above) or back-references (which try to match the input to some
104104
previous substring). Non-leaf nodes are capture nodes (which save the
105-
location of the substring currently matching their child node) or
106-
concatenation or alternation nodes. At execution time, the executor
107-
recursively scans the tree. At concatenation or alternation nodes,
108-
it considers each possible alternative way of matching the input string,
109-
ie each place where the string could be split for a concatenation, or each
110-
child node for an alternation. It tries the next alternative if the match
111-
fails according to the child nodes. This is exactly the sort of
112-
backtracking search done by a traditional NFA regex engine. If there are
113-
many tree levels it can get very slow.
105+
location of the substring currently matching their child node),
106+
concatenation, alternation, or iteration nodes. At execution time, the
107+
executor recursively scans the tree. At concatenation, alternation, or
108+
iteration nodes, it considers each possible alternative way of matching the
109+
input string, that is each place where the string could be split for a
110+
concatenation or iteration, or each child node for an alternation. It
111+
tries the next alternative if the match fails according to the child nodes.
112+
This is exactly the sort of backtracking search done by a traditional NFA
113+
regex engine. If there are many tree levels it can get very slow.
114114

115115
But all is not lost: we can still be smarter than the average pure NFA
116116
engine. To do this, each subre node has an associated DFA, which

src/backend/regex/regcomp.c

+52-36
Original file line numberDiff line numberDiff line change
@@ -1036,11 +1036,17 @@ parseqatom(struct vars * v,
10361036
/*----------
10371037
* Prepare a general-purpose state skeleton.
10381038
*
1039-
* ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
1040-
* / /
1041-
* [lp] ----> [s2] ----bypass---------------------
1039+
* In the no-backrefs case, we want this:
10421040
*
1043-
* where bypass is an empty, and prefix is some repetitions of atom
1041+
* [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
1042+
*
1043+
* where prefix is some repetitions of atom. In the general case we need
1044+
*
1045+
* [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
1046+
*
1047+
* where the iterator wraps around [begin] ---atom---> [end]
1048+
*
1049+
* We make the s state here for both cases; s2 is made below if needed
10441050
*----------
10451051
*/
10461052
s = newstate(v->nfa); /* first, new endpoints for the atom */
@@ -1051,11 +1057,9 @@ parseqatom(struct vars * v,
10511057
NOERR();
10521058
atom->begin = s;
10531059
atom->end = s2;
1054-
s = newstate(v->nfa); /* and spots for prefix and bypass */
1055-
s2 = newstate(v->nfa);
1060+
s = newstate(v->nfa); /* set up starting state */
10561061
NOERR();
10571062
EMPTYARC(lp, s);
1058-
EMPTYARC(lp, s2);
10591063
NOERR();
10601064

10611065
/* break remaining subRE into x{...} and what follows */
@@ -1089,28 +1093,9 @@ parseqatom(struct vars * v,
10891093
}
10901094

10911095
/*
1092-
* It's quantifier time. If the atom is just a BACKREF, we'll let it deal
1093-
* with quantifiers internally. Otherwise, the first step is to turn
1094-
* x{0,...} into x{1,...}|empty
1096+
* It's quantifier time. If the atom is just a backref, we'll let it deal
1097+
* with quantifiers internally.
10951098
*/
1096-
if (m == 0 && atomtype != BACKREF)
1097-
{
1098-
EMPTYARC(s2, atom->end); /* the bypass */
1099-
assert(PREF(qprefer) != 0);
1100-
f = COMBINE(qprefer, atom->flags);
1101-
t = subre(v, '|', f, lp, atom->end);
1102-
NOERR();
1103-
t->left = atom;
1104-
t->right = subre(v, '|', PREF(f), s2, atom->end);
1105-
NOERR();
1106-
t->right->left = subre(v, '=', 0, s2, atom->end);
1107-
NOERR();
1108-
*atomp = t;
1109-
atomp = &t->left;
1110-
m = 1;
1111-
}
1112-
1113-
/* deal with the rest of the quantifier */
11141099
if (atomtype == BACKREF)
11151100
{
11161101
/* special case: backrefs have internal quantifiers */
@@ -1120,17 +1105,25 @@ parseqatom(struct vars * v,
11201105
atom->min = (short) m;
11211106
atom->max = (short) n;
11221107
atom->flags |= COMBINE(qprefer, atom->flags);
1108+
/* rest of branch can be strung starting from atom->end */
1109+
s2 = atom->end;
11231110
}
11241111
else if (m == 1 && n == 1)
11251112
{
11261113
/* no/vacuous quantifier: done */
11271114
EMPTYARC(s, atom->begin); /* empty prefix */
1115+
/* rest of branch can be strung starting from atom->end */
1116+
s2 = atom->end;
11281117
}
1129-
else
1118+
else if (m > 0 && !(atom->flags & BACKR))
11301119
{
11311120
/*
1132-
* Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only the
1133-
* second x
1121+
* If there's no backrefs involved, we can turn x{m,n} into
1122+
* x{m-1,n-1}x, with capturing parens in only the second x. This
1123+
* is valid because we only care about capturing matches from the
1124+
* final iteration of the quantifier. It's a win because we can
1125+
* implement the backref-free left side as a plain DFA node, since
1126+
* we don't really care where its submatches are.
11341127
*/
11351128
dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
11361129
assert(m >= 1 && m != INFINITY && n >= 1);
@@ -1142,16 +1135,36 @@ parseqatom(struct vars * v,
11421135
NOERR();
11431136
t->right = atom;
11441137
*atomp = t;
1138+
/* rest of branch can be strung starting from atom->end */
1139+
s2 = atom->end;
1140+
}
1141+
else
1142+
{
1143+
/* general case: need an iteration node */
1144+
s2 = newstate(v->nfa);
1145+
NOERR();
1146+
moveouts(v->nfa, atom->end, s2);
1147+
NOERR();
1148+
dupnfa(v->nfa, atom->begin, atom->end, s, s2);
1149+
repeat(v, s, s2, m, n);
1150+
f = COMBINE(qprefer, atom->flags);
1151+
t = subre(v, '*', f, s, s2);
1152+
NOERR();
1153+
t->min = (short) m;
1154+
t->max = (short) n;
1155+
t->left = atom;
1156+
*atomp = t;
1157+
/* rest of branch is to be strung from iteration's end state */
11451158
}
11461159

11471160
/* and finally, look after that postponed recursion */
11481161
t = top->right;
11491162
if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
1150-
t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
1163+
t->right = parsebranch(v, stopper, type, s2, rp, 1);
11511164
else
11521165
{
1153-
EMPTYARC(atom->end, rp);
1154-
t->right = subre(v, '=', 0, atom->end, rp);
1166+
EMPTYARC(s2, rp);
1167+
t->right = subre(v, '=', 0, s2, rp);
11551168
}
11561169
assert(SEE('|') || SEE(stopper) || SEE(EOS));
11571170
t->flags |= COMBINE(t->flags, t->right->flags);
@@ -1214,6 +1227,9 @@ scannum(struct vars * v)
12141227
/*
12151228
* repeat - replicate subNFA for quantifiers
12161229
*
1230+
* The sub-NFA strung from lp to rp is modified to represent m to n
1231+
* repetitions of its initial contents.
1232+
*
12171233
* The duplication sequences used here are chosen carefully so that any
12181234
* pointers starting out pointing into the subexpression end up pointing into
12191235
* the last occurrence. (Note that it may not be strung between the same
@@ -1229,7 +1245,7 @@ repeat(struct vars * v,
12291245
int n)
12301246
{
12311247
#define SOME 2
1232-
#define INF 3
1248+
#define INF 3
12331249
#define PAIR(x, y) ((x)*4 + (y))
12341250
#define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
12351251
const int rm = REDUCE(m);
@@ -1603,7 +1619,7 @@ subre(struct vars * v,
16031619
v->treechain = ret;
16041620
}
16051621

1606-
assert(strchr("|.b(=", op) != NULL);
1622+
assert(strchr("=b|.*(", op) != NULL);
16071623

16081624
ret->op = op;
16091625
ret->flags = flags;

0 commit comments

Comments
 (0)