Skip to content

Commit dea7fc6

Browse files
committed
Repair bug in regexp split performance improvements.
Commit c8ea87e introduced a temporary conversion buffer for substrings extracted during regexp splits. Unfortunately, the code that sized it measured each unmatched portion from the end of the previous match even when that match was a degenerate one that gets ignored, rather than from the end of the previous valid match, so for regexp_split_* calls it could under-size the buffer in such cases. Fix, and add some regression test cases (though those will only catch the bug if run in a multibyte encoding). Backpatch to 9.3, as the faulty code was. Thanks to the PostGIS project, Regina Obe and Paul Ramsey for the report (via IRC) and assistance in analysis. Patch by me.
1 parent 520711d commit dea7fc6

File tree

3 files changed

+31
-6
lines changed

3 files changed

+31
-6
lines changed

src/backend/utils/adt/regexp.c

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -882,6 +882,7 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
882882
int array_len;
883883
int array_idx;
884884
int prev_match_end;
885+
int prev_valid_match_end;
885886
int start_search;
886887
int maxlen = 0; /* largest fetch length in characters */
887888

@@ -937,6 +938,7 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
937938

938939
/* search for the pattern, perhaps repeatedly */
939940
prev_match_end = 0;
941+
prev_valid_match_end = 0;
940942
start_search = 0;
941943
while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search,
942944
pmatch_len, pmatch))
@@ -989,13 +991,15 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
989991
matchctx->nmatches++;
990992

991993
/*
992-
* check length of unmatched portion between end of previous match
993-
* and start of current one
994+
* check length of unmatched portion between end of previous valid
995+
* (nondegenerate, or degenerate but not ignored) match and start
996+
* of current one
994997
*/
995998
if (fetching_unmatched &&
996999
pmatch[0].rm_so >= 0 &&
997-
(pmatch[0].rm_so - prev_match_end) > maxlen)
998-
maxlen = (pmatch[0].rm_so - prev_match_end);
1000+
(pmatch[0].rm_so - prev_valid_match_end) > maxlen)
1001+
maxlen = (pmatch[0].rm_so - prev_valid_match_end);
1002+
prev_valid_match_end = pmatch[0].rm_eo;
9991003
}
10001004
prev_match_end = pmatch[0].rm_eo;
10011005

@@ -1021,8 +1025,8 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
10211025
* input string
10221026
*/
10231027
if (fetching_unmatched &&
1024-
(wide_len - prev_match_end) > maxlen)
1025-
maxlen = (wide_len - prev_match_end);
1028+
(wide_len - prev_valid_match_end) > maxlen)
1029+
maxlen = (wide_len - prev_valid_match_end);
10261030

10271031
/*
10281032
* Keep a note of the end position of the string for the benefit of

src/test/regress/expected/strings.out

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -674,6 +674,24 @@ SELECT regexp_split_to_array('123456','.');
674674
{"","","","","","",""}
675675
(1 row)
676676

677+
SELECT regexp_split_to_array('123456','');
678+
regexp_split_to_array
679+
-----------------------
680+
{1,2,3,4,5,6}
681+
(1 row)
682+
683+
SELECT regexp_split_to_array('123456','(?:)');
684+
regexp_split_to_array
685+
-----------------------
686+
{1,2,3,4,5,6}
687+
(1 row)
688+
689+
SELECT regexp_split_to_array('1','');
690+
regexp_split_to_array
691+
-----------------------
692+
{1}
693+
(1 row)
694+
677695
-- errors
678696
SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'zippy') AS foo;
679697
ERROR: invalid regexp option: "z"

src/test/regress/sql/strings.sql

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,9 @@ SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', 'nom
188188
SELECT regexp_split_to_array('123456','1');
189189
SELECT regexp_split_to_array('123456','6');
190190
SELECT regexp_split_to_array('123456','.');
191+
SELECT regexp_split_to_array('123456','');
192+
SELECT regexp_split_to_array('123456','(?:)');
193+
SELECT regexp_split_to_array('1','');
191194
-- errors
192195
SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'zippy') AS foo;
193196
SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'iz');

0 commit comments

Comments
 (0)