35
35
#include "regex/regex.h"
36
36
#include "utils/array.h"
37
37
#include "utils/builtins.h"
38
+ #include "utils/memutils.h"
38
39
39
40
#define PG_GETARG_TEXT_PP_IF_EXISTS (_n ) \
40
41
(PG_NARGS() > (_n) ? PG_GETARG_TEXT_PP(_n) : NULL)
@@ -60,6 +61,9 @@ typedef struct regexp_matches_ctx
60
61
/* workspace for build_regexp_matches_result() */
61
62
Datum * elems ; /* has npatterns elements */
62
63
bool * nulls ; /* has npatterns elements */
64
+ pg_wchar * wide_str ; /* wide-char version of original string */
65
+ char * conv_buf ; /* conversion buffer */
66
+ int conv_bufsiz ; /* size thereof */
63
67
} regexp_matches_ctx ;
64
68
65
69
/*
@@ -111,8 +115,8 @@ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
111
115
Oid collation ,
112
116
bool force_glob ,
113
117
bool use_subpatterns ,
114
- bool ignore_degenerate );
115
- static void cleanup_regexp_matches ( regexp_matches_ctx * matchctx );
118
+ bool ignore_degenerate ,
119
+ bool fetching_unmatched );
116
120
static ArrayType * build_regexp_matches_result (regexp_matches_ctx * matchctx );
117
121
static Datum build_regexp_split_result (regexp_matches_ctx * splitctx );
118
122
@@ -809,7 +813,7 @@ regexp_matches(PG_FUNCTION_ARGS)
809
813
matchctx = setup_regexp_matches (PG_GETARG_TEXT_P_COPY (0 ), pattern ,
810
814
flags ,
811
815
PG_GET_COLLATION (),
812
- false, true, false);
816
+ false, true, false, false );
813
817
814
818
/* Pre-create workspace that build_regexp_matches_result needs */
815
819
matchctx -> elems = (Datum * ) palloc (sizeof (Datum ) * matchctx -> npatterns );
@@ -831,9 +835,6 @@ regexp_matches(PG_FUNCTION_ARGS)
831
835
SRF_RETURN_NEXT (funcctx , PointerGetDatum (result_ary ));
832
836
}
833
837
834
- /* release space in multi-call ctx to avoid intraquery memory leak */
835
- cleanup_regexp_matches (matchctx );
836
-
837
838
SRF_RETURN_DONE (funcctx );
838
839
}
839
840
@@ -852,17 +853,25 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)
852
853
* all the matching in one swoop. The returned regexp_matches_ctx contains
853
854
* the locations of all the substrings matching the pattern.
854
855
*
855
- * The three bool parameters have only two patterns (one for each caller)
856
- * but it seems clearer to distinguish the functionality this way than to
857
- * key it all off one "is_split" flag.
856
+ * The four bool parameters have only two patterns (one for matching, one for
857
+ * splitting) but it seems clearer to distinguish the functionality this way
858
+ * than to key it all off one "is_split" flag. We don't currently assume that
859
+ * fetching_unmatched is exclusive of fetching the matched text too; if it's
860
+ * set, the conversion buffer is large enough to fetch any single matched or
861
+ * unmatched string, but not any larger substring. (In practice, when splitting
862
+ * the matches are usually small anyway, and it didn't seem worth complicating
863
+ * the code further.)
858
864
*/
859
865
static regexp_matches_ctx *
860
866
setup_regexp_matches (text * orig_str , text * pattern , text * flags ,
861
867
Oid collation ,
862
- bool force_glob , bool use_subpatterns ,
863
- bool ignore_degenerate )
868
+ bool force_glob ,
869
+ bool use_subpatterns ,
870
+ bool ignore_degenerate ,
871
+ bool fetching_unmatched )
864
872
{
865
873
regexp_matches_ctx * matchctx = palloc0 (sizeof (regexp_matches_ctx ));
874
+ int eml = pg_database_encoding_max_length ();
866
875
int orig_len ;
867
876
pg_wchar * wide_str ;
868
877
int wide_len ;
@@ -874,6 +883,7 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
874
883
int array_idx ;
875
884
int prev_match_end ;
876
885
int start_search ;
886
+ int maxlen = 0 ; /* largest fetch length in characters */
877
887
878
888
/* save original string --- we'll extract result substrings from it */
879
889
matchctx -> orig_str = orig_str ;
@@ -915,8 +925,13 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
915
925
/* temporary output space for RE package */
916
926
pmatch = palloc (sizeof (regmatch_t ) * pmatch_len );
917
927
918
- /* the real output space (grown dynamically if needed) */
919
- array_len = re_flags .glob ? 256 : 32 ;
928
+ /*
929
+ * the real output space (grown dynamically if needed)
930
+ *
931
+ * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather
932
+ * than at 2^27
933
+ */
934
+ array_len = re_flags .glob ? 255 : 31 ;
920
935
matchctx -> match_locs = (int * ) palloc (sizeof (int ) * array_len );
921
936
array_idx = 0 ;
922
937
@@ -936,9 +951,13 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
936
951
pmatch [0 ].rm_eo > prev_match_end ))
937
952
{
938
953
/* enlarge output space if needed */
939
- while (array_idx + matchctx -> npatterns * 2 > array_len )
954
+ while (array_idx + matchctx -> npatterns * 2 + 1 > array_len )
940
955
{
941
- array_len *= 2 ;
956
+ array_len += array_len + 1 ; /* 2^n-1 => 2^(n+1)-1 */
957
+ if (array_len > MaxAllocSize /sizeof (int ))
958
+ ereport (ERROR ,
959
+ (errcode (ERRCODE_PROGRAM_LIMIT_EXCEEDED ),
960
+ errmsg ("too many regular expression matches" )));
942
961
matchctx -> match_locs = (int * ) repalloc (matchctx -> match_locs ,
943
962
sizeof (int ) * array_len );
944
963
}
@@ -950,16 +969,33 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
950
969
951
970
for (i = 1 ; i <= matchctx -> npatterns ; i ++ )
952
971
{
953
- matchctx -> match_locs [array_idx ++ ] = pmatch [i ].rm_so ;
954
- matchctx -> match_locs [array_idx ++ ] = pmatch [i ].rm_eo ;
972
+ int so = pmatch [i ].rm_so ;
973
+ int eo = pmatch [i ].rm_eo ;
974
+ matchctx -> match_locs [array_idx ++ ] = so ;
975
+ matchctx -> match_locs [array_idx ++ ] = eo ;
976
+ if (so >= 0 && eo >= 0 && (eo - so ) > maxlen )
977
+ maxlen = (eo - so );
955
978
}
956
979
}
957
980
else
958
981
{
959
- matchctx -> match_locs [array_idx ++ ] = pmatch [0 ].rm_so ;
960
- matchctx -> match_locs [array_idx ++ ] = pmatch [0 ].rm_eo ;
982
+ int so = pmatch [0 ].rm_so ;
983
+ int eo = pmatch [0 ].rm_eo ;
984
+ matchctx -> match_locs [array_idx ++ ] = so ;
985
+ matchctx -> match_locs [array_idx ++ ] = eo ;
986
+ if (so >= 0 && eo >= 0 && (eo - so ) > maxlen )
987
+ maxlen = (eo - so );
961
988
}
962
989
matchctx -> nmatches ++ ;
990
+
991
+ /*
992
+ * check length of unmatched portion between end of previous match
993
+ * and start of current one
994
+ */
995
+ if (fetching_unmatched &&
996
+ pmatch [0 ].rm_so >= 0 &&
997
+ (pmatch [0 ].rm_so - prev_match_end ) > maxlen )
998
+ maxlen = (pmatch [0 ].rm_so - prev_match_end );
963
999
}
964
1000
prev_match_end = pmatch [0 ].rm_eo ;
965
1001
@@ -980,34 +1016,67 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
980
1016
break ;
981
1017
}
982
1018
1019
+ /*
1020
+ * check length of unmatched portion between end of last match and end of
1021
+ * input string
1022
+ */
1023
+ if (fetching_unmatched &&
1024
+ (wide_len - prev_match_end ) > maxlen )
1025
+ maxlen = (wide_len - prev_match_end );
1026
+
1027
+ /*
1028
+ * Keep a note of the end position of the string for the benefit of
1029
+ * splitting code.
1030
+ */
1031
+ matchctx -> match_locs [array_idx ] = wide_len ;
1032
+
1033
+ if (eml > 1 )
1034
+ {
1035
+ int64 maxsiz = eml * (int64 ) maxlen ;
1036
+ int conv_bufsiz ;
1037
+
1038
+ /*
1039
+ * Make the conversion buffer large enough for any substring of
1040
+ * interest.
1041
+ *
1042
+ * Worst case: assume we need the maximum size (maxlen*eml), but take
1043
+ * advantage of the fact that the original string length in bytes is an
1044
+ * upper bound on the byte length of any fetched substring (and we know
1045
+ * that len+1 is safe to allocate because the varlena header is longer
1046
+ * than 1 byte).
1047
+ */
1048
+ if (maxsiz > orig_len )
1049
+ conv_bufsiz = orig_len + 1 ;
1050
+ else
1051
+ conv_bufsiz = maxsiz + 1 ; /* safe since maxsiz < 2^30 */
1052
+
1053
+ matchctx -> conv_buf = palloc (conv_bufsiz );
1054
+ matchctx -> conv_bufsiz = conv_bufsiz ;
1055
+ matchctx -> wide_str = wide_str ;
1056
+ }
1057
+ else
1058
+ {
1059
+ /* No need to keep the wide string if we're in a single-byte charset. */
1060
+ pfree (wide_str );
1061
+ matchctx -> wide_str = NULL ;
1062
+ matchctx -> conv_buf = NULL ;
1063
+ matchctx -> conv_bufsiz = 0 ;
1064
+ }
1065
+
983
1066
/* Clean up temp storage */
984
- pfree (wide_str );
985
1067
pfree (pmatch );
986
1068
987
1069
return matchctx ;
988
1070
}
989
1071
990
- /*
991
- * cleanup_regexp_matches - release memory of a regexp_matches_ctx
992
- */
993
- static void
994
- cleanup_regexp_matches (regexp_matches_ctx * matchctx )
995
- {
996
- pfree (matchctx -> orig_str );
997
- pfree (matchctx -> match_locs );
998
- if (matchctx -> elems )
999
- pfree (matchctx -> elems );
1000
- if (matchctx -> nulls )
1001
- pfree (matchctx -> nulls );
1002
- pfree (matchctx );
1003
- }
1004
-
1005
1072
/*
1006
1073
* build_regexp_matches_result - build output array for current match
1007
1074
*/
1008
1075
static ArrayType *
1009
1076
build_regexp_matches_result (regexp_matches_ctx * matchctx )
1010
1077
{
1078
+ char * buf = matchctx -> conv_buf ;
1079
+ int bufsiz PG_USED_FOR_ASSERTS_ONLY = matchctx -> conv_bufsiz ;
1011
1080
Datum * elems = matchctx -> elems ;
1012
1081
bool * nulls = matchctx -> nulls ;
1013
1082
int dims [1 ];
@@ -1027,6 +1096,15 @@ build_regexp_matches_result(regexp_matches_ctx *matchctx)
1027
1096
elems [i ] = (Datum ) 0 ;
1028
1097
nulls [i ] = true;
1029
1098
}
1099
+ else if (buf )
1100
+ {
1101
+ int len = pg_wchar2mb_with_len (matchctx -> wide_str + so ,
1102
+ buf ,
1103
+ eo - so );
1104
+ Assert (len < bufsiz );
1105
+ elems [i ] = PointerGetDatum (cstring_to_text_with_len (buf , len ));
1106
+ nulls [i ] = false;
1107
+ }
1030
1108
else
1031
1109
{
1032
1110
elems [i ] = DirectFunctionCall3 (text_substr ,
@@ -1069,7 +1147,7 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
1069
1147
splitctx = setup_regexp_matches (PG_GETARG_TEXT_P_COPY (0 ), pattern ,
1070
1148
flags ,
1071
1149
PG_GET_COLLATION (),
1072
- true, false, true);
1150
+ true, false, true, true );
1073
1151
1074
1152
MemoryContextSwitchTo (oldcontext );
1075
1153
funcctx -> user_fctx = (void * ) splitctx ;
@@ -1086,9 +1164,6 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
1086
1164
SRF_RETURN_NEXT (funcctx , result );
1087
1165
}
1088
1166
1089
- /* release space in multi-call ctx to avoid intraquery memory leak */
1090
- cleanup_regexp_matches (splitctx );
1091
-
1092
1167
SRF_RETURN_DONE (funcctx );
1093
1168
}
1094
1169
@@ -1114,7 +1189,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
1114
1189
PG_GETARG_TEXT_PP (1 ),
1115
1190
PG_GETARG_TEXT_PP_IF_EXISTS (2 ),
1116
1191
PG_GET_COLLATION (),
1117
- true, false, true);
1192
+ true, false, true, true );
1118
1193
1119
1194
while (splitctx -> next_match <= splitctx -> nmatches )
1120
1195
{
@@ -1126,12 +1201,6 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
1126
1201
splitctx -> next_match ++ ;
1127
1202
}
1128
1203
1129
- /*
1130
- * We don't call cleanup_regexp_matches here; it would try to pfree the
1131
- * input string, which we didn't copy. The space is not in a long-lived
1132
- * memory context anyway.
1133
- */
1134
-
1135
1204
PG_RETURN_ARRAYTYPE_P (makeArrayResult (astate , CurrentMemoryContext ));
1136
1205
}
1137
1206
@@ -1151,6 +1220,7 @@ regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
1151
1220
static Datum
1152
1221
build_regexp_split_result (regexp_matches_ctx * splitctx )
1153
1222
{
1223
+ char * buf = splitctx -> conv_buf ;
1154
1224
int startpos ;
1155
1225
int endpos ;
1156
1226
@@ -1161,22 +1231,29 @@ build_regexp_split_result(regexp_matches_ctx *splitctx)
1161
1231
if (startpos < 0 )
1162
1232
elog (ERROR , "invalid match ending position" );
1163
1233
1164
- if (splitctx -> next_match < splitctx -> nmatches )
1234
+ if (buf )
1165
1235
{
1236
+ int bufsiz PG_USED_FOR_ASSERTS_ONLY = splitctx -> conv_bufsiz ;
1237
+ int len ;
1238
+
1166
1239
endpos = splitctx -> match_locs [splitctx -> next_match * 2 ];
1167
1240
if (endpos < startpos )
1168
1241
elog (ERROR , "invalid match starting position" );
1169
- return DirectFunctionCall3 (text_substr ,
1170
- PointerGetDatum (splitctx -> orig_str ),
1171
- Int32GetDatum (startpos + 1 ),
1172
- Int32GetDatum (endpos - startpos ));
1242
+ len = pg_wchar2mb_with_len (splitctx -> wide_str + startpos ,
1243
+ buf ,
1244
+ endpos - startpos );
1245
+ Assert (len < bufsiz );
1246
+ return PointerGetDatum (cstring_to_text_with_len (buf , len ));
1173
1247
}
1174
1248
else
1175
1249
{
1176
- /* no more matches, return rest of string */
1177
- return DirectFunctionCall2 (text_substr_no_len ,
1250
+ endpos = splitctx -> match_locs [splitctx -> next_match * 2 ];
1251
+ if (endpos < startpos )
1252
+ elog (ERROR , "invalid match starting position" );
1253
+ return DirectFunctionCall3 (text_substr ,
1178
1254
PointerGetDatum (splitctx -> orig_str ),
1179
- Int32GetDatum (startpos + 1 ));
1255
+ Int32GetDatum (startpos + 1 ),
1256
+ Int32GetDatum (endpos - startpos ));
1180
1257
}
1181
1258
}
1182
1259
0 commit comments