Skip to content

Commit b31e3f5

Browse files
committed
Improve worst-case performance of text_position_get_match_pos()
This function converts a byte position to a character position after a successful string match. Rather than calling pg_mblen() in a loop, use pg_mbstrlen_with_len() since the latter can inline its own call to pg_mblen(). When the string match is at the end of the haystack text, this change results in a 10-20% performance improvement, depending on platform and typical character length in bytes. This also simplifies the code a little.

Specializing for UTF-8 could result in further improvement, but the performance gain was not found to be reliable between platforms. The modest gain in this commit is stable between platforms and usable by all server encodings.

Discussion: https://www.postgresql.org/message-id/CAFBsxsH1Yutrmu+6LLHKK8iXY+vG--Do6zN+2900spHXQNNQKQ@mail.gmail.com
1 parent 807fee1 commit b31e3f5

File tree

1 file changed

+5
-23
lines changed

1 file changed

+5
-23
lines changed

src/backend/utils/adt/varlena.c

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@ typedef struct varlena VarString;
5151
*/
5252
typedef struct
5353
{
54-
bool is_multibyte; /* T if multibyte encoding */
5554
bool is_multibyte_char_in_char; /* need to check char boundaries? */
5655

5756
char *str1; /* haystack string */
@@ -1221,20 +1220,11 @@ text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
12211220
* and continue the search if it was a false match.
12221221
*/
12231222
if (pg_database_encoding_max_length() == 1)
1224-
{
1225-
state->is_multibyte = false;
12261223
state->is_multibyte_char_in_char = false;
1227-
}
12281224
else if (GetDatabaseEncoding() == PG_UTF8)
1229-
{
1230-
state->is_multibyte = true;
12311225
state->is_multibyte_char_in_char = false;
1232-
}
12331226
else
1234-
{
1235-
state->is_multibyte = true;
12361227
state->is_multibyte_char_in_char = true;
1237-
}
12381228

12391229
state->str1 = VARDATA_ANY(t1);
12401230
state->str2 = VARDATA_ANY(t2);
@@ -1466,19 +1456,11 @@ text_position_get_match_ptr(TextPositionState *state)
14661456
static int
14671457
text_position_get_match_pos(TextPositionState *state)
14681458
{
1469-
if (!state->is_multibyte)
1470-
return state->last_match - state->str1 + 1;
1471-
else
1472-
{
1473-
/* Convert the byte position to char position. */
1474-
while (state->refpoint < state->last_match)
1475-
{
1476-
state->refpoint += pg_mblen(state->refpoint);
1477-
state->refpos++;
1478-
}
1479-
Assert(state->refpoint == state->last_match);
1480-
return state->refpos + 1;
1481-
}
1459+
/* Convert the byte position to char position. */
1460+
state->refpos += pg_mbstrlen_with_len(state->refpoint,
1461+
state->last_match - state->refpoint);
1462+
state->refpoint = state->last_match;
1463+
return state->refpos + 1;
14821464
}
14831465

14841466
/*

0 commit comments

Comments
 (0)