Skip to content

Commit 2ba7c4e

Browse files
committed
Avoid quadratic slowdown in regexp match/split functions.
regexp_matches, regexp_split_to_table and regexp_split_to_array all work by compiling a list of match positions as character offsets (NOT byte positions) in the source string. Formerly, they then used text_substr to extract the matched text; but in a multi-byte encoding, that counts the characters in the string, and the characters needed to reach the starting byte position, on every call. Accordingly, the performance degraded as the product of the input string length and the number of match positions, such that splitting a string of a few hundred kbytes could take many minutes. Repair by keeping the wide-character copy of the input string available (only in the case where encoding_max_length is not 1) after performing the match operation, and extracting substrings from that instead. This reduces the complexity to being linear in the number of result bytes, discounting the actual regexp match itself (which is not affected by this patch). In passing, remove cleanup using retail pfree() which was obsoleted by commit ff428cd (Feb 2008) which made cleanup of SRF multi-call contexts automatic. Also increase (to ~134 million) the maximum number of matches and provide an error message when it is reached. Backpatch all the way because this has been wrong forever. Analysis and patch by me; review by Kaiting Chen. Discussion: https://postgr.es/m/87pnyn55qh.fsf@news-spur.riddles.org.uk see also https://postgr.es/m/87lg996g4r.fsf@news-spur.riddles.org.uk
1 parent 48bc1a5 commit 2ba7c4e

File tree

1 file changed

+131
-54
lines changed

1 file changed

+131
-54
lines changed

src/backend/utils/adt/regexp.c

Lines changed: 131 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#include "regex/regex.h"
3636
#include "utils/array.h"
3737
#include "utils/builtins.h"
38+
#include "utils/memutils.h"
3839

3940
#define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \
4041
(PG_NARGS() > (_n) ? PG_GETARG_TEXT_PP(_n) : NULL)
@@ -60,6 +61,9 @@ typedef struct regexp_matches_ctx
6061
/* workspace for build_regexp_matches_result() */
6162
Datum *elems; /* has npatterns elements */
6263
bool *nulls; /* has npatterns elements */
64+
pg_wchar *wide_str; /* wide-char version of original string */
65+
char *conv_buf; /* conversion buffer */
66+
int conv_bufsiz; /* size thereof */
6367
} regexp_matches_ctx;
6468

6569
/*
@@ -111,8 +115,8 @@ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
111115
Oid collation,
112116
bool force_glob,
113117
bool use_subpatterns,
114-
bool ignore_degenerate);
115-
static void cleanup_regexp_matches(regexp_matches_ctx *matchctx);
118+
bool ignore_degenerate,
119+
bool fetching_unmatched);
116120
static ArrayType *build_regexp_matches_result(regexp_matches_ctx *matchctx);
117121
static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
118122

@@ -809,7 +813,7 @@ regexp_matches(PG_FUNCTION_ARGS)
809813
matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
810814
flags,
811815
PG_GET_COLLATION(),
812-
false, true, false);
816+
false, true, false, false);
813817

814818
/* Pre-create workspace that build_regexp_matches_result needs */
815819
matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
@@ -831,9 +835,6 @@ regexp_matches(PG_FUNCTION_ARGS)
831835
SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
832836
}
833837

834-
/* release space in multi-call ctx to avoid intraquery memory leak */
835-
cleanup_regexp_matches(matchctx);
836-
837838
SRF_RETURN_DONE(funcctx);
838839
}
839840

@@ -852,17 +853,25 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)
852853
* all the matching in one swoop. The returned regexp_matches_ctx contains
853854
* the locations of all the substrings matching the pattern.
854855
*
855-
* The three bool parameters have only two patterns (one for each caller)
856-
* but it seems clearer to distinguish the functionality this way than to
857-
* key it all off one "is_split" flag.
856+
* The four bool parameters have only two patterns (one for matching, one for
857+
* splitting) but it seems clearer to distinguish the functionality this way
858+
* than to key it all off one "is_split" flag. We don't currently assume that
859+
* fetching_unmatched is exclusive of fetching the matched text too; if it's
860+
* set, the conversion buffer is large enough to fetch any single matched or
861+
* unmatched string, but not any larger substring. (In practice, when splitting
862+
* the matches are usually small anyway, and it didn't seem worth complicating
863+
* the code further.)
858864
*/
859865
static regexp_matches_ctx *
860866
setup_regexp_matches(text *orig_str, text *pattern, text *flags,
861867
Oid collation,
862-
bool force_glob, bool use_subpatterns,
863-
bool ignore_degenerate)
868+
bool force_glob,
869+
bool use_subpatterns,
870+
bool ignore_degenerate,
871+
bool fetching_unmatched)
864872
{
865873
regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
874+
int eml = pg_database_encoding_max_length();
866875
int orig_len;
867876
pg_wchar *wide_str;
868877
int wide_len;
@@ -874,6 +883,7 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
874883
int array_idx;
875884
int prev_match_end;
876885
int start_search;
886+
int maxlen = 0; /* largest fetch length in characters */
877887

878888
/* save original string --- we'll extract result substrings from it */
879889
matchctx->orig_str = orig_str;
@@ -915,8 +925,13 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
915925
/* temporary output space for RE package */
916926
pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
917927

918-
/* the real output space (grown dynamically if needed) */
919-
array_len = re_flags.glob ? 256 : 32;
928+
/*
929+
* the real output space (grown dynamically if needed)
930+
*
931+
* use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather
932+
* than at 2^27
933+
*/
934+
array_len = re_flags.glob ? 255 : 31;
920935
matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
921936
array_idx = 0;
922937

@@ -936,9 +951,13 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
936951
pmatch[0].rm_eo > prev_match_end))
937952
{
938953
/* enlarge output space if needed */
939-
while (array_idx + matchctx->npatterns * 2 > array_len)
954+
while (array_idx + matchctx->npatterns * 2 + 1 > array_len)
940955
{
941-
array_len *= 2;
956+
array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */
957+
if (array_len > MaxAllocSize/sizeof(int))
958+
ereport(ERROR,
959+
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
960+
errmsg("too many regular expression matches")));
942961
matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
943962
sizeof(int) * array_len);
944963
}
@@ -950,16 +969,33 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
950969

951970
for (i = 1; i <= matchctx->npatterns; i++)
952971
{
953-
matchctx->match_locs[array_idx++] = pmatch[i].rm_so;
954-
matchctx->match_locs[array_idx++] = pmatch[i].rm_eo;
972+
int so = pmatch[i].rm_so;
973+
int eo = pmatch[i].rm_eo;
974+
matchctx->match_locs[array_idx++] = so;
975+
matchctx->match_locs[array_idx++] = eo;
976+
if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
977+
maxlen = (eo - so);
955978
}
956979
}
957980
else
958981
{
959-
matchctx->match_locs[array_idx++] = pmatch[0].rm_so;
960-
matchctx->match_locs[array_idx++] = pmatch[0].rm_eo;
982+
int so = pmatch[0].rm_so;
983+
int eo = pmatch[0].rm_eo;
984+
matchctx->match_locs[array_idx++] = so;
985+
matchctx->match_locs[array_idx++] = eo;
986+
if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
987+
maxlen = (eo - so);
961988
}
962989
matchctx->nmatches++;
990+
991+
/*
992+
* check length of unmatched portion between end of previous match
993+
* and start of current one
994+
*/
995+
if (fetching_unmatched &&
996+
pmatch[0].rm_so >= 0 &&
997+
(pmatch[0].rm_so - prev_match_end) > maxlen)
998+
maxlen = (pmatch[0].rm_so - prev_match_end);
963999
}
9641000
prev_match_end = pmatch[0].rm_eo;
9651001

@@ -980,34 +1016,67 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
9801016
break;
9811017
}
9821018

1019+
/*
1020+
* check length of unmatched portion between end of last match and end of
1021+
* input string
1022+
*/
1023+
if (fetching_unmatched &&
1024+
(wide_len - prev_match_end) > maxlen)
1025+
maxlen = (wide_len - prev_match_end);
1026+
1027+
/*
1028+
* Keep a note of the end position of the string for the benefit of
1029+
* splitting code.
1030+
*/
1031+
matchctx->match_locs[array_idx] = wide_len;
1032+
1033+
if (eml > 1)
1034+
{
1035+
int64 maxsiz = eml * (int64) maxlen;
1036+
int conv_bufsiz;
1037+
1038+
/*
1039+
* Make the conversion buffer large enough for any substring of
1040+
* interest.
1041+
*
1042+
* Worst case: assume we need the maximum size (maxlen*eml), but take
1043+
* advantage of the fact that the original string length in bytes is an
1044+
* upper bound on the byte length of any fetched substring (and we know
1045+
* that len+1 is safe to allocate because the varlena header is longer
1046+
* than 1 byte).
1047+
*/
1048+
if (maxsiz > orig_len)
1049+
conv_bufsiz = orig_len + 1;
1050+
else
1051+
conv_bufsiz = maxsiz + 1; /* safe since maxsiz < 2^30 */
1052+
1053+
matchctx->conv_buf = palloc(conv_bufsiz);
1054+
matchctx->conv_bufsiz = conv_bufsiz;
1055+
matchctx->wide_str = wide_str;
1056+
}
1057+
else
1058+
{
1059+
/* No need to keep the wide string if we're in a single-byte charset. */
1060+
pfree(wide_str);
1061+
matchctx->wide_str = NULL;
1062+
matchctx->conv_buf = NULL;
1063+
matchctx->conv_bufsiz = 0;
1064+
}
1065+
9831066
/* Clean up temp storage */
984-
pfree(wide_str);
9851067
pfree(pmatch);
9861068

9871069
return matchctx;
9881070
}
9891071

990-
/*
991-
* cleanup_regexp_matches - release memory of a regexp_matches_ctx
992-
*/
993-
static void
994-
cleanup_regexp_matches(regexp_matches_ctx *matchctx)
995-
{
996-
pfree(matchctx->orig_str);
997-
pfree(matchctx->match_locs);
998-
if (matchctx->elems)
999-
pfree(matchctx->elems);
1000-
if (matchctx->nulls)
1001-
pfree(matchctx->nulls);
1002-
pfree(matchctx);
1003-
}
1004-
10051072
/*
10061073
* build_regexp_matches_result - build output array for current match
10071074
*/
10081075
static ArrayType *
10091076
build_regexp_matches_result(regexp_matches_ctx *matchctx)
10101077
{
1078+
char *buf = matchctx->conv_buf;
1079+
int bufsiz PG_USED_FOR_ASSERTS_ONLY = matchctx->conv_bufsiz;
10111080
Datum *elems = matchctx->elems;
10121081
bool *nulls = matchctx->nulls;
10131082
int dims[1];
@@ -1027,6 +1096,15 @@ build_regexp_matches_result(regexp_matches_ctx *matchctx)
10271096
elems[i] = (Datum) 0;
10281097
nulls[i] = true;
10291098
}
1099+
else if (buf)
1100+
{
1101+
int len = pg_wchar2mb_with_len(matchctx->wide_str + so,
1102+
buf,
1103+
eo - so);
1104+
Assert(len < bufsiz);
1105+
elems[i] = PointerGetDatum(cstring_to_text_with_len(buf, len));
1106+
nulls[i] = false;
1107+
}
10301108
else
10311109
{
10321110
elems[i] = DirectFunctionCall3(text_substr,
@@ -1069,7 +1147,7 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
10691147
splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
10701148
flags,
10711149
PG_GET_COLLATION(),
1072-
true, false, true);
1150+
true, false, true, true);
10731151

10741152
MemoryContextSwitchTo(oldcontext);
10751153
funcctx->user_fctx = (void *) splitctx;
@@ -1086,9 +1164,6 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
10861164
SRF_RETURN_NEXT(funcctx, result);
10871165
}
10881166

1089-
/* release space in multi-call ctx to avoid intraquery memory leak */
1090-
cleanup_regexp_matches(splitctx);
1091-
10921167
SRF_RETURN_DONE(funcctx);
10931168
}
10941169

@@ -1114,7 +1189,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
11141189
PG_GETARG_TEXT_PP(1),
11151190
PG_GETARG_TEXT_PP_IF_EXISTS(2),
11161191
PG_GET_COLLATION(),
1117-
true, false, true);
1192+
true, false, true, true);
11181193

11191194
while (splitctx->next_match <= splitctx->nmatches)
11201195
{
@@ -1126,12 +1201,6 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
11261201
splitctx->next_match++;
11271202
}
11281203

1129-
/*
1130-
* We don't call cleanup_regexp_matches here; it would try to pfree the
1131-
* input string, which we didn't copy. The space is not in a long-lived
1132-
* memory context anyway.
1133-
*/
1134-
11351204
PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, CurrentMemoryContext));
11361205
}
11371206

@@ -1151,6 +1220,7 @@ regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
11511220
static Datum
11521221
build_regexp_split_result(regexp_matches_ctx *splitctx)
11531222
{
1223+
char *buf = splitctx->conv_buf;
11541224
int startpos;
11551225
int endpos;
11561226

@@ -1161,22 +1231,29 @@ build_regexp_split_result(regexp_matches_ctx *splitctx)
11611231
if (startpos < 0)
11621232
elog(ERROR, "invalid match ending position");
11631233

1164-
if (splitctx->next_match < splitctx->nmatches)
1234+
if (buf)
11651235
{
1236+
int bufsiz PG_USED_FOR_ASSERTS_ONLY = splitctx->conv_bufsiz;
1237+
int len;
1238+
11661239
endpos = splitctx->match_locs[splitctx->next_match * 2];
11671240
if (endpos < startpos)
11681241
elog(ERROR, "invalid match starting position");
1169-
return DirectFunctionCall3(text_substr,
1170-
PointerGetDatum(splitctx->orig_str),
1171-
Int32GetDatum(startpos + 1),
1172-
Int32GetDatum(endpos - startpos));
1242+
len = pg_wchar2mb_with_len(splitctx->wide_str + startpos,
1243+
buf,
1244+
endpos-startpos);
1245+
Assert(len < bufsiz);
1246+
return PointerGetDatum(cstring_to_text_with_len(buf, len));
11731247
}
11741248
else
11751249
{
1176-
/* no more matches, return rest of string */
1177-
return DirectFunctionCall2(text_substr_no_len,
1250+
endpos = splitctx->match_locs[splitctx->next_match * 2];
1251+
if (endpos < startpos)
1252+
elog(ERROR, "invalid match starting position");
1253+
return DirectFunctionCall3(text_substr,
11781254
PointerGetDatum(splitctx->orig_str),
1179-
Int32GetDatum(startpos + 1));
1255+
Int32GetDatum(startpos + 1),
1256+
Int32GetDatum(endpos - startpos));
11801257
}
11811258
}
11821259

0 commit comments

Comments
 (0)