Skip to content

Commit 4e703d6

Browse files
committed
Make some minor improvements in the regex code.
Push some hopefully-uncontroversial bits extracted from an upcoming patch series, to remove non-relevant clutter from the main patches. In compact(), return immediately after setting REG_ASSERT error; continuing the loop would just lead to assertion failure below. (Ask me how I know.) In parseqatom(), remove assertion that moresubs() did its job. When moresubs actually did its job, this is redundant with that function's final assert; but when it failed on OOM, this is an assertion crash. We could avoid the crash by adding a NOERR() check before the assertion, but it seems better to subtract code than add it. (Note that there's a NOERR exit a few lines further down, and nothing else between here and there requires moresubs to have succeeded. So we don't really need an extra error exit.) This is a live bug in assert-enabled builds, but given the very low likelihood of OOM in moresub's tiny allocation, I don't think it's worth back-patching. On the other hand, it seems worthwhile to add an assertion that our intended v->subs[subno] target is still null by the time we are ready to insert into it, since there's a recursion in between. In pg_regexec, ensure we fflush any debug output on the way out, and try to make MDEBUG messages more uniform and helpful. (In particular, ensure that all of them are prefixed with the subre's id number, so one can match up entry and exit reports.) Add some test cases in test_regex to improve coverage of lookahead and lookbehind constraints. Adding these now is mainly to establish that this is indeed the existing behavior. Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us
1 parent f40c696 commit 4e703d6

File tree

5 files changed

+99
-27
lines changed

5 files changed

+99
-27
lines changed

src/backend/regex/regc_nfa.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2902,7 +2902,7 @@ compact(struct nfa *nfa,
29022902
break;
29032903
default:
29042904
NERR(REG_ASSERT);
2905-
break;
2905+
return;
29062906
}
29072907
carcsort(first, ca - first);
29082908
ca->co = COLORLESS;

src/backend/regex/regcomp.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -942,7 +942,6 @@ parseqatom(struct vars *v,
942942
subno = v->nsubexp;
943943
if ((size_t) subno >= v->nsubs)
944944
moresubs(v, subno);
945-
assert((size_t) subno < v->nsubs);
946945
}
947946
else
948947
atomtype = PLAIN; /* something that's not '(' */
@@ -960,6 +959,7 @@ parseqatom(struct vars *v,
960959
NOERR();
961960
if (cap)
962961
{
962+
assert(v->subs[subno] == NULL);
963963
v->subs[subno] = atom;
964964
t = subre(v, '(', atom->flags | CAP, lp, rp);
965965
NOERR();

src/backend/regex/regexec.c

Lines changed: 36 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,11 @@ pg_regexec(regex_t *re,
324324
if (v->lblastcp != NULL)
325325
FREE(v->lblastcp);
326326

327+
#ifdef REG_DEBUG
328+
if (v->eflags & (REG_FTRACE | REG_MTRACE))
329+
fflush(stdout);
330+
#endif
331+
327332
return st;
328333
}
329334

@@ -668,7 +673,7 @@ subset(struct vars *v,
668673
if ((size_t) n >= v->nmatch)
669674
return;
670675

671-
MDEBUG(("setting %d\n", n));
676+
MDEBUG(("%d: setting %d = %ld-%ld\n", sub->id, n, LOFF(begin), LOFF(end)));
672677
v->pmatch[n].rm_so = OFF(begin);
673678
v->pmatch[n].rm_eo = OFF(end);
674679
}
@@ -697,7 +702,7 @@ cdissect(struct vars *v,
697702
int er;
698703

699704
assert(t != NULL);
700-
MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op));
705+
MDEBUG(("%d: cdissect %c %ld-%ld\n", t->id, t->op, LOFF(begin), LOFF(end)));
701706

702707
/* handy place to check for operation cancel */
703708
if (CANCEL_REQUESTED(v->re))
@@ -779,14 +784,14 @@ ccondissect(struct vars *v,
779784
NOERR();
780785
d2 = getsubdfa(v, t->right);
781786
NOERR();
782-
MDEBUG(("cconcat %d\n", t->id));
787+
MDEBUG(("%d: ccondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
783788

784789
/* pick a tentative midpoint */
785790
mid = longest(v, d, begin, end, (int *) NULL);
786791
NOERR();
787792
if (mid == NULL)
788793
return REG_NOMATCH;
789-
MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
794+
MDEBUG(("%d: tentative midpoint %ld\n", t->id, LOFF(mid)));
790795

791796
/* iterate until satisfaction or failure */
792797
for (;;)
@@ -801,7 +806,7 @@ ccondissect(struct vars *v,
801806
if (er == REG_OKAY)
802807
{
803808
/* satisfaction */
804-
MDEBUG(("successful\n"));
809+
MDEBUG(("%d: successful\n", t->id));
805810
return REG_OKAY;
806811
}
807812
}
@@ -814,15 +819,15 @@ ccondissect(struct vars *v,
814819
if (mid == begin)
815820
{
816821
/* all possibilities exhausted */
817-
MDEBUG(("%d no midpoint\n", t->id));
822+
MDEBUG(("%d: no midpoint\n", t->id));
818823
return REG_NOMATCH;
819824
}
820825
mid = longest(v, d, begin, mid - 1, (int *) NULL);
821826
NOERR();
822827
if (mid == NULL)
823828
{
824829
/* failed to find a new one */
825-
MDEBUG(("%d failed midpoint\n", t->id));
830+
MDEBUG(("%d: failed midpoint\n", t->id));
826831
return REG_NOMATCH;
827832
}
828833
MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
@@ -857,14 +862,14 @@ crevcondissect(struct vars *v,
857862
NOERR();
858863
d2 = getsubdfa(v, t->right);
859864
NOERR();
860-
MDEBUG(("crevcon %d\n", t->id));
865+
MDEBUG(("%d: crevcondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
861866

862867
/* pick a tentative midpoint */
863868
mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL);
864869
NOERR();
865870
if (mid == NULL)
866871
return REG_NOMATCH;
867-
MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
872+
MDEBUG(("%d: tentative midpoint %ld\n", t->id, LOFF(mid)));
868873

869874
/* iterate until satisfaction or failure */
870875
for (;;)
@@ -879,7 +884,7 @@ crevcondissect(struct vars *v,
879884
if (er == REG_OKAY)
880885
{
881886
/* satisfaction */
882-
MDEBUG(("successful\n"));
887+
MDEBUG(("%d: successful\n", t->id));
883888
return REG_OKAY;
884889
}
885890
}
@@ -892,15 +897,15 @@ crevcondissect(struct vars *v,
892897
if (mid == end)
893898
{
894899
/* all possibilities exhausted */
895-
MDEBUG(("%d no midpoint\n", t->id));
900+
MDEBUG(("%d: no midpoint\n", t->id));
896901
return REG_NOMATCH;
897902
}
898903
mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, (int *) NULL);
899904
NOERR();
900905
if (mid == NULL)
901906
{
902907
/* failed to find a new one */
903-
MDEBUG(("%d failed midpoint\n", t->id));
908+
MDEBUG(("%d: failed midpoint\n", t->id));
904909
return REG_NOMATCH;
905910
}
906911
MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
@@ -935,7 +940,8 @@ cbrdissect(struct vars *v,
935940
assert(n >= 0);
936941
assert((size_t) n < v->nmatch);
937942

938-
MDEBUG(("cbackref n%d %d{%d-%d}\n", t->id, n, min, max));
943+
MDEBUG(("%d: cbrdissect %d{%d-%d} %ld-%ld\n", t->id, n, min, max,
944+
LOFF(begin), LOFF(end)));
939945

940946
/* get the backreferenced string */
941947
if (v->pmatch[n].rm_so == -1)
@@ -952,7 +958,7 @@ cbrdissect(struct vars *v,
952958
*/
953959
if (begin == end && min <= max)
954960
{
955-
MDEBUG(("cbackref matched trivially\n"));
961+
MDEBUG(("%d: backref matched trivially\n", t->id));
956962
return REG_OKAY;
957963
}
958964
return REG_NOMATCH;
@@ -962,7 +968,7 @@ cbrdissect(struct vars *v,
962968
/* matches only if zero repetitions are okay */
963969
if (min == 0)
964970
{
965-
MDEBUG(("cbackref matched trivially\n"));
971+
MDEBUG(("%d: backref matched trivially\n", t->id));
966972
return REG_OKAY;
967973
}
968974
return REG_NOMATCH;
@@ -989,7 +995,7 @@ cbrdissect(struct vars *v,
989995
p += brlen;
990996
}
991997

992-
MDEBUG(("cbackref matched\n"));
998+
MDEBUG(("%d: backref matched\n", t->id));
993999
return REG_OKAY;
9941000
}
9951001

@@ -1011,13 +1017,13 @@ caltdissect(struct vars *v,
10111017
assert(t->op == '|');
10121018
assert(t->left != NULL && t->left->cnfa.nstates > 0);
10131019

1014-
MDEBUG(("calt n%d\n", t->id));
1020+
MDEBUG(("%d: caltdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
10151021

10161022
d = getsubdfa(v, t->left);
10171023
NOERR();
10181024
if (longest(v, d, begin, end, (int *) NULL) == end)
10191025
{
1020-
MDEBUG(("calt matched\n"));
1026+
MDEBUG(("%d: caltdissect matched\n", t->id));
10211027
er = cdissect(v, t->left, begin, end);
10221028
if (er != REG_NOMATCH)
10231029
return er;
@@ -1054,6 +1060,8 @@ citerdissect(struct vars *v,
10541060
assert(!(t->left->flags & SHORTER));
10551061
assert(begin <= end);
10561062

1063+
MDEBUG(("%d: citerdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
1064+
10571065
/*
10581066
* For the moment, assume the minimum number of matches is 1. If zero
10591067
* matches are allowed, and the target string is empty, we are allowed to
@@ -1092,7 +1100,6 @@ citerdissect(struct vars *v,
10921100
FREE(endpts);
10931101
return v->err;
10941102
}
1095-
MDEBUG(("citer %d\n", t->id));
10961103

10971104
/*
10981105
* Our strategy is to first find a set of sub-match endpoints that are
@@ -1182,7 +1189,7 @@ citerdissect(struct vars *v,
11821189
if (i > k)
11831190
{
11841191
/* satisfaction */
1185-
MDEBUG(("%d successful\n", t->id));
1192+
MDEBUG(("%d: successful\n", t->id));
11861193
FREE(endpts);
11871194
return REG_OKAY;
11881195
}
@@ -1223,11 +1230,11 @@ citerdissect(struct vars *v,
12231230
*/
12241231
if (t->min == 0 && begin == end)
12251232
{
1226-
MDEBUG(("%d allowing zero matches\n", t->id));
1233+
MDEBUG(("%d: allowing zero matches\n", t->id));
12271234
return REG_OKAY;
12281235
}
12291236

1230-
MDEBUG(("%d failed\n", t->id));
1237+
MDEBUG(("%d: failed\n", t->id));
12311238
return REG_NOMATCH;
12321239
}
12331240

@@ -1255,6 +1262,8 @@ creviterdissect(struct vars *v,
12551262
assert(t->left->flags & SHORTER);
12561263
assert(begin <= end);
12571264

1265+
MDEBUG(("%d: creviterdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
1266+
12581267
/*
12591268
* If zero matches are allowed, and target string is empty, just declare
12601269
* victory. OTOH, if target string isn't empty, zero matches can't work
@@ -1264,7 +1273,10 @@ creviterdissect(struct vars *v,
12641273
if (min_matches <= 0)
12651274
{
12661275
if (begin == end)
1276+
{
1277+
MDEBUG(("%d: allowing zero matches\n", t->id));
12671278
return REG_OKAY;
1279+
}
12681280
min_matches = 1;
12691281
}
12701282

@@ -1293,7 +1305,6 @@ creviterdissect(struct vars *v,
12931305
FREE(endpts);
12941306
return v->err;
12951307
}
1296-
MDEBUG(("creviter %d\n", t->id));
12971308

12981309
/*
12991310
* Our strategy is to first find a set of sub-match endpoints that are
@@ -1389,7 +1400,7 @@ creviterdissect(struct vars *v,
13891400
if (i > k)
13901401
{
13911402
/* satisfaction */
1392-
MDEBUG(("%d successful\n", t->id));
1403+
MDEBUG(("%d: successful\n", t->id));
13931404
FREE(endpts);
13941405
return REG_OKAY;
13951406
}
@@ -1415,7 +1426,7 @@ creviterdissect(struct vars *v,
14151426
}
14161427

14171428
/* all possibilities exhausted */
1418-
MDEBUG(("%d failed\n", t->id));
1429+
MDEBUG(("%d: failed\n", t->id));
14191430
FREE(endpts);
14201431
return REG_NOMATCH;
14211432
}

src/test/modules/test_regex/expected/test_regex.out

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3315,6 +3315,21 @@ select * from test_regex('(?=b)b', 'a', 'HP');
33153315
{0,REG_ULOOKAROUND,REG_UNONPOSIX}
33163316
(1 row)
33173317

3318+
-- expectMatch 23.9 HP ...(?!.) abcde cde
3319+
select * from test_regex('...(?!.)', 'abcde', 'HP');
3320+
test_regex
3321+
-----------------------------------
3322+
{0,REG_ULOOKAROUND,REG_UNONPOSIX}
3323+
{cde}
3324+
(2 rows)
3325+
3326+
-- expectNomatch 23.10 HP ...(?=.) abc
3327+
select * from test_regex('...(?=.)', 'abc', 'HP');
3328+
test_regex
3329+
-----------------------------------
3330+
{0,REG_ULOOKAROUND,REG_UNONPOSIX}
3331+
(1 row)
3332+
33183333
-- Postgres addition: lookbehind constraints
33193334
-- expectMatch 23.11 HPN (?<=a)b* ab b
33203335
select * from test_regex('(?<=a)b*', 'ab', 'HPN');
@@ -3376,6 +3391,39 @@ select * from test_regex('(?<=b)b', 'b', 'HP');
33763391
{0,REG_ULOOKAROUND,REG_UNONPOSIX}
33773392
(1 row)
33783393

3394+
-- expectMatch 23.19 HP (?<=.).. abcde bc
3395+
select * from test_regex('(?<=.)..', 'abcde', 'HP');
3396+
test_regex
3397+
-----------------------------------
3398+
{0,REG_ULOOKAROUND,REG_UNONPOSIX}
3399+
{bc}
3400+
(2 rows)
3401+
3402+
-- expectMatch 23.20 HP (?<=..)a* aaabb a
3403+
select * from test_regex('(?<=..)a*', 'aaabb', 'HP');
3404+
test_regex
3405+
-----------------------------------
3406+
{0,REG_ULOOKAROUND,REG_UNONPOSIX}
3407+
{a}
3408+
(2 rows)
3409+
3410+
-- expectMatch 23.21 HP (?<=..)b* aaabb {}
3411+
-- Note: empty match here is correct, it matches after the first 2 characters
3412+
select * from test_regex('(?<=..)b*', 'aaabb', 'HP');
3413+
test_regex
3414+
-----------------------------------
3415+
{0,REG_ULOOKAROUND,REG_UNONPOSIX}
3416+
{""}
3417+
(2 rows)
3418+
3419+
-- expectMatch 23.22 HP (?<=..)b+ aaabb bb
3420+
select * from test_regex('(?<=..)b+', 'aaabb', 'HP');
3421+
test_regex
3422+
-----------------------------------
3423+
{0,REG_ULOOKAROUND,REG_UNONPOSIX}
3424+
{bb}
3425+
(2 rows)
3426+
33793427
-- doing 24 "non-greedy quantifiers"
33803428
-- expectMatch 24.1 PT ab+? abb ab
33813429
select * from test_regex('ab+?', 'abb', 'PT');

src/test/modules/test_regex/sql/test_regex.sql

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1049,6 +1049,10 @@ select * from test_regex('a(?!b)b*', 'a', 'HP');
10491049
select * from test_regex('(?=b)b', 'b', 'HP');
10501050
-- expectNomatch 23.8 HP (?=b)b a
10511051
select * from test_regex('(?=b)b', 'a', 'HP');
1052+
-- expectMatch 23.9 HP ...(?!.) abcde cde
1053+
select * from test_regex('...(?!.)', 'abcde', 'HP');
1054+
-- expectNomatch 23.10 HP ...(?=.) abc
1055+
select * from test_regex('...(?=.)', 'abc', 'HP');
10521056

10531057
-- Postgres addition: lookbehind constraints
10541058

@@ -1068,6 +1072,15 @@ select * from test_regex('a(?<!b)b*', 'a', 'HP');
10681072
select * from test_regex('(?<=b)b', 'bb', 'HP');
10691073
-- expectNomatch 23.18 HP (?<=b)b b
10701074
select * from test_regex('(?<=b)b', 'b', 'HP');
1075+
-- expectMatch 23.19 HP (?<=.).. abcde bc
1076+
select * from test_regex('(?<=.)..', 'abcde', 'HP');
1077+
-- expectMatch 23.20 HP (?<=..)a* aaabb a
1078+
select * from test_regex('(?<=..)a*', 'aaabb', 'HP');
1079+
-- expectMatch 23.21 HP (?<=..)b* aaabb {}
1080+
-- Note: empty match here is correct, it matches after the first 2 characters
1081+
select * from test_regex('(?<=..)b*', 'aaabb', 'HP');
1082+
-- expectMatch 23.22 HP (?<=..)b+ aaabb bb
1083+
select * from test_regex('(?<=..)b+', 'aaabb', 'HP');
10711084

10721085
-- doing 24 "non-greedy quantifiers"
10731086

0 commit comments

Comments
 (0)