Skip to content

Commit 2bdd3b2

Browse files
committed
Avoid assertion due to disconnected NFA sub-graphs in regex parsing.
In commit 08c0d6a which introduced "rainbow" arcs in regex NFAs, I didn't think terribly hard about what to do when creating the color complement of a rainbow arc. Clearly, the complement cannot match any characters, and I took the easy way out by just not building any arcs at all in the complement arc set. That mostly works, but Nikolay Shaplov found a case where it doesn't: if we decide to delete that sub-NFA later because it's inside a "{0}" quantifier, delsub() suffered an assertion failure. That's because delsub() relies on the target sub-NFA being fully connected. That was always true before, and the best fix seems to be to restore that property. Hence, invent a new arc type CANTMATCH that can be generated in place of an empty color complement, and drop it again later when we start NFA optimization. (At that point we don't need to do delsub() any more, and besides there are other cases where NFA optimization can lead to disconnected subgraphs.) It appears that this bug has no consequences in a non-assert-enabled build: there will be some transiently leaked NFA states/arcs, but they'll get cleaned up eventually. Still, we don't like assertion failures, so back-patch to v14 where rainbow arcs were introduced. Per bug #18708 from Nikolay Shaplov. Discussion: https://postgr.es/m/18708-f94f2599c9d2c005@postgresql.org
1 parent ba25358 commit 2bdd3b2

File tree

6 files changed

+79
-1
lines changed

6 files changed

+79
-1
lines changed

src/backend/regex/regc_color.c

+17-1
Original file line numberDiff line numberDiff line change
@@ -1075,9 +1075,19 @@ colorcomplement(struct nfa *nfa,
10751075

10761076
assert(of != from);
10771077

1078-
/* A RAINBOW arc matches all colors, making the complement empty */
1078+
/*
1079+
* A RAINBOW arc matches all colors, making the complement empty. But we
1080+
* can't just return without making any arcs, because that would leave the
1081+
* NFA disconnected which would break any future delsub(). Instead, make
1082+
* a CANTMATCH arc. Also set the HASCANTMATCH flag so we know we need to
1083+
* clean that up at the start of NFA optimization.
1084+
*/
10791085
if (findarc(of, PLAIN, RAINBOW) != NULL)
1086+
{
1087+
newarc(nfa, CANTMATCH, 0, from, to);
1088+
nfa->flags |= HASCANTMATCH;
10801089
return;
1090+
}
10811091

10821092
/* Otherwise, transiently mark the colors that appear in of's out-arcs */
10831093
for (a = of->outs; a != NULL; a = a->outchain)
@@ -1089,6 +1099,12 @@ colorcomplement(struct nfa *nfa,
10891099
assert(!UNUSEDCOLOR(cd));
10901100
cd->flags |= COLMARK;
10911101
}
1102+
1103+
/*
1104+
* There's no syntax for re-complementing a color set, so we cannot
1105+
* see CANTMATCH arcs here.
1106+
*/
1107+
assert(a->type != CANTMATCH);
10921108
}
10931109

10941110
/* Scan colors, clear transient marks, add arcs for unmarked colors */

src/backend/regex/regc_nfa.c

+40
Original file line numberDiff line numberDiff line change
@@ -1435,6 +1435,7 @@ removetraverse(struct nfa *nfa,
14351435
{
14361436
case PLAIN:
14371437
case EMPTY:
1438+
case CANTMATCH:
14381439
/* nothing to do */
14391440
break;
14401441
case AHEAD:
@@ -1572,6 +1573,12 @@ optimize(struct nfa *nfa,
15721573
if (verbose)
15731574
fprintf(f, "\ninitial cleanup:\n");
15741575
#endif
1576+
/* If we have any CANTMATCH arcs, drop them; but this is uncommon */
1577+
if (nfa->flags & HASCANTMATCH)
1578+
{
1579+
removecantmatch(nfa);
1580+
nfa->flags &= ~HASCANTMATCH;
1581+
}
15751582
cleanup(nfa); /* may simplify situation */
15761583
#ifdef REG_DEBUG
15771584
if (verbose)
@@ -2893,6 +2900,34 @@ clonesuccessorstates(struct nfa *nfa,
28932900
}
28942901
}
28952902

2903+
/*
2904+
* removecantmatch - remove CANTMATCH arcs, which are no longer useful
2905+
* once we are done with the parsing phase. (We need them only to
2906+
* preserve connectedness of NFA subgraphs during parsing.)
2907+
*/
2908+
static void
2909+
removecantmatch(struct nfa *nfa)
2910+
{
2911+
struct state *s;
2912+
2913+
for (s = nfa->states; s != NULL; s = s->next)
2914+
{
2915+
struct arc *a;
2916+
struct arc *nexta;
2917+
2918+
for (a = s->outs; a != NULL; a = nexta)
2919+
{
2920+
nexta = a->outchain;
2921+
if (a->type == CANTMATCH)
2922+
{
2923+
freearc(nfa, a);
2924+
if (NISERR())
2925+
return;
2926+
}
2927+
}
2928+
}
2929+
}
2930+
28962931
/*
28972932
* cleanup - clean up NFA after optimizations
28982933
*/
@@ -3601,6 +3636,8 @@ dumpnfa(struct nfa *nfa,
36013636
fprintf(f, ", eol [%ld]", (long) nfa->eos[1]);
36023637
if (nfa->flags & HASLACONS)
36033638
fprintf(f, ", haslacons");
3639+
if (nfa->flags & HASCANTMATCH)
3640+
fprintf(f, ", hascantmatch");
36043641
if (nfa->flags & MATCHALL)
36053642
{
36063643
fprintf(f, ", minmatchall %d", nfa->minmatchall);
@@ -3723,6 +3760,9 @@ dumparc(struct arc *a,
37233760
break;
37243761
case EMPTY:
37253762
break;
3763+
case CANTMATCH:
3764+
fprintf(f, "X");
3765+
break;
37263766
default:
37273767
fprintf(f, "0x%x/0%lo", a->type, (long) a->co);
37283768
break;

src/backend/regex/regcomp.c

+3
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,7 @@ static void breakconstraintloop(struct nfa *, struct state *);
177177
static void clonesuccessorstates(struct nfa *, struct state *, struct state *,
178178
struct state *, struct arc *,
179179
char *, char *, int);
180+
static void removecantmatch(struct nfa *);
180181
static void cleanup(struct nfa *);
181182
static void markreachable(struct nfa *, struct state *, struct state *, struct state *);
182183
static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);
@@ -299,6 +300,7 @@ struct vars
299300
#define BEHIND 'r' /* color-lookbehind arc */
300301
#define WBDRY 'w' /* word boundary constraint */
301302
#define NWBDRY 'W' /* non-word-boundary constraint */
303+
#define CANTMATCH 'x' /* arc that cannot match anything */
302304
#define SBEGIN 'A' /* beginning of string (even if not BOL) */
303305
#define SEND 'Z' /* end of string (even if not EOL) */
304306

@@ -2288,6 +2290,7 @@ nfanode(struct vars *v,
22882290
nfa = newnfa(v, v->cm, v->nfa);
22892291
NOERRZ();
22902292
dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
2293+
nfa->flags = v->nfa->flags;
22912294
if (!ISERR())
22922295
specialcolors(nfa);
22932296
if (!ISERR())

src/include/regex/regguts.h

+2
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,8 @@ struct cnfa
405405
int flags; /* bitmask of the following flags: */
406406
#define HASLACONS 01 /* uses lookaround constraints */
407407
#define MATCHALL 02 /* matches all strings of a range of lengths */
408+
#define HASCANTMATCH 04 /* contains CANTMATCH arcs */
409+
/* Note: HASCANTMATCH appears in nfa structs' flags, but never in cnfas */
408410
int pre; /* setup state number */
409411
int post; /* teardown state number */
410412
color bos[2]; /* colors, if any, assigned to BOS and BOL */

src/test/modules/test_regex/expected/test_regex.out

+14
Original file line numberDiff line numberDiff line change
@@ -2043,6 +2043,20 @@ select * from test_regex('[\s\S]*', '012 3456789abc_*', 'LNPE');
20432043
{"012 3456789abc_*"}
20442044
(2 rows)
20452045

2046+
-- bug #18708:
2047+
select * from test_regex('(?:[^\d\D]){0}', '0123456789abc*', 'LNPQE');
2048+
test_regex
2049+
--------------------------------------------------------------------
2050+
{0,REG_UBOUNDS,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE,REG_UEMPTYMATCH}
2051+
{""}
2052+
(2 rows)
2053+
2054+
select * from test_regex('[^\d\D]', '0123456789abc*', 'ILPE');
2055+
test_regex
2056+
--------------------------------------------------------
2057+
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE,REG_UIMPOSSIBLE}
2058+
(1 row)
2059+
20462060
-- check char classes' handling of newlines
20472061
select * from test_regex('\s+', E'abc \n def', 'LP');
20482062
test_regex

src/test/modules/test_regex/sql/test_regex.sql

+3
Original file line numberDiff line numberDiff line change
@@ -613,6 +613,9 @@ select * from test_regex('[^1\D0]', 'abc0123456789*', 'LPE');
613613
select * from test_regex('\W', '0123456789abc_*', 'LP');
614614
select * from test_regex('[\W]', '0123456789abc_*', 'LPE');
615615
select * from test_regex('[\s\S]*', '012 3456789abc_*', 'LNPE');
616+
-- bug #18708:
617+
select * from test_regex('(?:[^\d\D]){0}', '0123456789abc*', 'LNPQE');
618+
select * from test_regex('[^\d\D]', '0123456789abc*', 'ILPE');
616619

617620
-- check char classes' handling of newlines
618621
select * from test_regex('\s+', E'abc \n def', 'LP');

0 commit comments

Comments
 (0)