Skip to content

Commit 6cfaffc

Browse files
committed
Fix regexport.c to behave sanely with lookaround constraints.
regexport.c thought it could just ignore LACON arcs, but the correct behavior is to treat them as satisfiable while consuming zero input (rather reminiscently of commit 9f1e642). Otherwise, the emitted simplified-NFA representation may contain no paths leading from initial to final state, which unsurprisingly confuses pg_trgm, as seen in bug #14623 from Jeff Janes. Since regexport's output representation has no concept of an arc that consumes zero input, recurse internally to find the next normal arc(s) after any LACON transitions. We'd be forced into changing that representation if a LACON could be the last arc reaching the final state, but fortunately the regex library never builds NFAs with such a configuration, so there always is a next normal arc. Back-patch to 9.3 where this logic was introduced. Discussion: https://postgr.es/m/20170413180503.25948.94871@wrigleys.postgresql.org
1 parent 885fea5 commit 6cfaffc

File tree

3 files changed

+80
-25
lines changed

3 files changed

+80
-25
lines changed

contrib/pg_trgm/expected/pg_trgm.out

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3684,6 +3684,12 @@ select * from test2 where t ~ ' z foo';
36843684
z foo bar
36853685
(1 row)
36863686

3687+
select * from test2 where t ~ 'qua(?!foo)';
3688+
t
3689+
-------
3690+
quark
3691+
(1 row)
3692+
36873693
drop index test2_idx_gin;
36883694
create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
36893695
set enable_seqscan=off;
@@ -3864,6 +3870,12 @@ select * from test2 where t ~ ' z foo';
38643870
z foo bar
38653871
(1 row)
38663872

3873+
select * from test2 where t ~ 'qua(?!foo)';
3874+
t
3875+
-------
3876+
quark
3877+
(1 row)
3878+
38673879
-- Check similarity threshold (bug #14202)
38683880
CREATE TEMP TABLE restaurants (city text);
38693881
INSERT INTO restaurants SELECT 'Warsaw' FROM generate_series(1, 10000);

contrib/pg_trgm/sql/pg_trgm.sql

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,9 @@ select * from test2 where t ~ 'z foo bar';
8686
select * from test2 where t ~ ' z foo bar';
8787
select * from test2 where t ~ ' z foo bar';
8888
select * from test2 where t ~ ' z foo';
89+
select * from test2 where t ~ 'qua(?!foo)';
8990
drop index test2_idx_gin;
91+
9092
create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
9193
set enable_seqscan=off;
9294
explain (costs off)
@@ -121,6 +123,7 @@ select * from test2 where t ~ 'z foo bar';
121123
select * from test2 where t ~ ' z foo bar';
122124
select * from test2 where t ~ ' z foo bar';
123125
select * from test2 where t ~ ' z foo';
126+
select * from test2 where t ~ 'qua(?!foo)';
124127

125128
-- Check similarity threshold (bug #14202)
126129

src/backend/regex/regexport.c

Lines changed: 65 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66
* In this implementation, the NFA defines a necessary but not sufficient
77
* condition for a string to match the regex: that is, there can be strings
88
* that match the NFA but don't match the full regex, but not vice versa.
9-
* Thus, for example, it is okay for the functions below to ignore lookaround
10-
* constraints, which merely constrain the string some more.
9+
* Thus, for example, it is okay for the functions below to treat lookaround
10+
* constraints as no-ops, since they merely constrain the string some more.
1111
*
1212
* Notice that these functions return info into caller-provided arrays
1313
* rather than doing their own malloc's. This simplifies the APIs by
@@ -72,29 +72,78 @@ pg_reg_getfinalstate(const regex_t *regex)
7272
}
7373

7474
/*
75-
* Get number of outgoing NFA arcs of state number "st".
75+
* pg_reg_getnumoutarcs() and pg_reg_getoutarcs() mask the existence of LACON
76+
* arcs from the caller, treating any LACON as being automatically satisfied.
77+
* Since the output representation does not support arcs that consume no
78+
* character when traversed, we have to recursively traverse LACON arcs here,
79+
* and report whatever normal arcs are reachable by traversing LACON arcs.
80+
* Note that this wouldn't work if it were possible to reach the final state
81+
* via LACON traversal, but the regex library never builds NFAs that have
82+
* LACON arcs leading directly to the final state. (This is because the
83+
* regex executor is designed to consume one character beyond the nominal
84+
* match end --- possibly an EOS indicator --- so there is always a set of
85+
* ordinary arcs leading to the final state.)
7686
*
77-
* Note: LACON arcs are ignored, both here and in pg_reg_getoutarcs().
87+
* traverse_lacons is a recursive subroutine used by both exported functions
88+
* to count and then emit the reachable regular arcs. *arcs_count is
89+
* incremented by the number of reachable arcs, and as many as will fit in
90+
* arcs_len (possibly 0) are emitted into arcs[].
91+
*/
92+
static void
93+
traverse_lacons(struct cnfa * cnfa, int st,
94+
int *arcs_count,
95+
regex_arc_t *arcs, int arcs_len)
96+
{
97+
struct carc *ca;
98+
99+
/*
100+
* Since this function recurses, it could theoretically be driven to stack
101+
* overflow. In practice, this is mostly useful to backstop against a
102+
* failure of the regex compiler to remove a loop of LACON arcs.
103+
*/
104+
check_stack_depth();
105+
106+
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
107+
{
108+
if (ca->co < cnfa->ncolors)
109+
{
110+
/* Ordinary arc, so count and possibly emit it */
111+
int ndx = (*arcs_count)++;
112+
113+
if (ndx < arcs_len)
114+
{
115+
arcs[ndx].co = ca->co;
116+
arcs[ndx].to = ca->to;
117+
}
118+
}
119+
else
120+
{
121+
/* LACON arc --- assume it's satisfied and recurse... */
122+
/* ... but first, assert it doesn't lead directly to post state */
123+
Assert(ca->to != cnfa->post);
124+
125+
traverse_lacons(cnfa, ca->to, arcs_count, arcs, arcs_len);
126+
}
127+
}
128+
}
129+
130+
/*
131+
* Get number of outgoing NFA arcs of state number "st".
78132
*/
79133
int
80134
pg_reg_getnumoutarcs(const regex_t *regex, int st)
81135
{
82136
struct cnfa *cnfa;
83-
struct carc *ca;
84-
int count;
137+
int arcs_count;
85138

86139
assert(regex != NULL && regex->re_magic == REMAGIC);
87140
cnfa = &((struct guts *) regex->re_guts)->search;
88141

89142
if (st < 0 || st >= cnfa->nstates)
90143
return 0;
91-
count = 0;
92-
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
93-
{
94-
if (ca->co < cnfa->ncolors)
95-
count++;
96-
}
97-
return count;
144+
arcs_count = 0;
145+
traverse_lacons(cnfa, st, &arcs_count, NULL, 0);
146+
return arcs_count;
98147
}
99148

100149
/*
@@ -107,24 +156,15 @@ pg_reg_getoutarcs(const regex_t *regex, int st,
107156
regex_arc_t *arcs, int arcs_len)
108157
{
109158
struct cnfa *cnfa;
110-
struct carc *ca;
159+
int arcs_count;
111160

112161
assert(regex != NULL && regex->re_magic == REMAGIC);
113162
cnfa = &((struct guts *) regex->re_guts)->search;
114163

115164
if (st < 0 || st >= cnfa->nstates || arcs_len <= 0)
116165
return;
117-
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
118-
{
119-
if (ca->co < cnfa->ncolors)
120-
{
121-
arcs->co = ca->co;
122-
arcs->to = ca->to;
123-
arcs++;
124-
if (--arcs_len == 0)
125-
break;
126-
}
127-
}
166+
arcs_count = 0;
167+
traverse_lacons(cnfa, st, &arcs_count, arcs, arcs_len);
128168
}
129169

130170
/*

0 commit comments

Comments
 (0)