Skip to content

Commit ca6fde9

Browse files
committed
Optimize JSON escaping using SIMD
Here we adjust escape_json_with_len() to make use of SIMD to allow processing of up to 16 bytes at a time rather than processing a single byte at a time. This has been shown to speed up escaping of JSON strings significantly.

Escaping is required for both JSON string properties and also the property names themselves, so this should also help improve the speed of the conversion from JSON into text for JSON objects that have property names 16 or more bytes long.

Escaping JSON strings was often a significant bottleneck for longer strings. With these changes, some benchmarking has shown a query performing nearly 4 times faster when escaping a JSON object with a 1MB text property. Tests with shorter text properties saw smaller but still significant performance improvements. For example, a test outputting 1024 JSON strings with a text property length ranging from 1 char to 1024 chars became around 2 times faster.

Author: David Rowley
Reviewed-by: Melih Mutlu
Discussion: https://postgr.es/m/CAApHDvpLXwMZvbCKcdGfU9XQjGCDm7tFpRdTXuB9PVgpNUYfEQ@mail.gmail.com
1 parent b5df24e commit ca6fde9

File tree

3 files changed

+157
-2
lines changed

3 files changed

+157
-2
lines changed

src/backend/utils/adt/json.c

+102-2
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "funcapi.h"
2020
#include "libpq/pqformat.h"
2121
#include "miscadmin.h"
22+
#include "port/simd.h"
2223
#include "utils/array.h"
2324
#include "utils/builtins.h"
2425
#include "utils/date.h"
@@ -1594,6 +1595,18 @@ escape_json(StringInfo buf, const char *str)
15941595
appendStringInfoCharMacro(buf, '"');
15951596
}
15961597

1598+
/*
1599+
* Define the number of bytes that escape_json_with_len will look ahead in the
1600+
* input string before flushing the input string to the destination buffer.
1601+
* Looking ahead too far could result in cachelines being evicted that will
1602+
* need to be reloaded in order to perform the appendBinaryStringInfo call.
1603+
* Smaller values will result in a larger number of calls to
1604+
* appendBinaryStringInfo and introduce additional function call overhead.
1605+
* Values larger than the size of L1d cache will likely result in worse
1606+
* performance.
1607+
*/
1608+
#define ESCAPE_JSON_FLUSH_AFTER 512
1609+
15971610
/*
15981611
* escape_json_with_len
15991612
* Produce a JSON string literal, properly escaping the possibly not
@@ -1603,11 +1616,98 @@ escape_json(StringInfo buf, const char *str)
16031616
void
16041617
escape_json_with_len(StringInfo buf, const char *str, int len)
16051618
{
1619+
int vlen;
1620+
1621+
Assert(len >= 0);
1622+
1623+
/*
1624+
* Since we know the minimum length we'll need to append, let's just
1625+
* enlarge the buffer now rather than incrementally making more space when
1626+
* we run out. Add two extra bytes for the enclosing quotes.
1627+
*/
1628+
enlargeStringInfo(buf, len + 2);
1629+
1630+
/*
1631+
* Figure out how many bytes to process using SIMD. Round 'len' down to
1632+
* the previous multiple of sizeof(Vector8), assuming that's a power-of-2.
1633+
*/
1634+
vlen = len & (int) (~(sizeof(Vector8) - 1));
1635+
16061636
appendStringInfoCharMacro(buf, '"');
16071637

1608-
for (int i = 0; i < len; i++)
1609-
escape_json_char(buf, str[i]);
1638+
for (int i = 0, copypos = 0;;)
1639+
{
1640+
/*
1641+
* To speed this up, try searching sizeof(Vector8) bytes at once for
1642+
* special characters that we need to escape. When we find one, we
1643+
* fall out of the Vector8 loop and copy the portion we've vector
1644+
* searched and then we process sizeof(Vector8) bytes one byte at a
1645+
* time. Once done, come back and try doing vector searching again.
1646+
* We'll also process any remaining bytes at the tail end of the
1647+
* string byte-by-byte. This optimization assumes that most chunks of
1648+
* sizeof(Vector8) bytes won't contain any special characters.
1649+
*/
1650+
for (; i < vlen; i += sizeof(Vector8))
1651+
{
1652+
Vector8 chunk;
1653+
1654+
vector8_load(&chunk, (const uint8 *) &str[i]);
1655+
1656+
/*
1657+
* Break on anything less than ' ' or if we find a '"' or '\\'.
1658+
* Those need special handling. That's done in the per-byte loop.
1659+
*/
1660+
if (vector8_has_le(chunk, (unsigned char) 0x1F) ||
1661+
vector8_has(chunk, (unsigned char) '"') ||
1662+
vector8_has(chunk, (unsigned char) '\\'))
1663+
break;
1664+
1665+
#ifdef ESCAPE_JSON_FLUSH_AFTER
1666+
1667+
/*
1668+
* Flush what's been checked so far out to the destination buffer
1669+
* every so often to avoid having to re-read cachelines when
1670+
* escaping large strings.
1671+
*/
1672+
if (i - copypos >= ESCAPE_JSON_FLUSH_AFTER)
1673+
{
1674+
appendBinaryStringInfo(buf, &str[copypos], i - copypos);
1675+
copypos = i;
1676+
}
1677+
#endif
1678+
}
1679+
1680+
/*
1681+
* Write to the destination up to the point that we've vector searched
1682+
* so far. Do this only when switching into per-byte mode rather than
1683+
* once every sizeof(Vector8) bytes.
1684+
*/
1685+
if (copypos < i)
1686+
{
1687+
appendBinaryStringInfo(buf, &str[copypos], i - copypos);
1688+
copypos = i;
1689+
}
1690+
1691+
/*
1692+
* Per-byte loop for Vector8s containing special chars and for
1693+
* processing the tail of the string.
1694+
*/
1695+
for (int b = 0; b < sizeof(Vector8); b++)
1696+
{
1697+
/* check if we've finished */
1698+
if (i == len)
1699+
goto done;
1700+
1701+
Assert(i < len);
1702+
1703+
escape_json_char(buf, str[i++]);
1704+
}
1705+
1706+
copypos = i;
1707+
/* We're not done yet. Try the vector search again. */
1708+
}
16101709

1710+
done:
16111711
appendStringInfoCharMacro(buf, '"');
16121712
}
16131713

src/test/regress/expected/json.out

+48
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,54 @@ SELECT ('"'||repeat('.', 12)||'abc\n"')::json; -- OK, legal escapes
5555
"............abc\n"
5656
(1 row)
5757

58+
-- Test various lengths of strings to validate SIMD processing to escape
59+
-- special chars in the JSON.
60+
SELECT row_to_json(j)::jsonb FROM (
61+
SELECT left(E'abcdefghijklmnopqrstuv"\twxyz012345678', a) AS a
62+
FROM generate_series(0,37) a
63+
) j;
64+
row_to_json
65+
--------------------------------------------------
66+
{"a": ""}
67+
{"a": "a"}
68+
{"a": "ab"}
69+
{"a": "abc"}
70+
{"a": "abcd"}
71+
{"a": "abcde"}
72+
{"a": "abcdef"}
73+
{"a": "abcdefg"}
74+
{"a": "abcdefgh"}
75+
{"a": "abcdefghi"}
76+
{"a": "abcdefghij"}
77+
{"a": "abcdefghijk"}
78+
{"a": "abcdefghijkl"}
79+
{"a": "abcdefghijklm"}
80+
{"a": "abcdefghijklmn"}
81+
{"a": "abcdefghijklmno"}
82+
{"a": "abcdefghijklmnop"}
83+
{"a": "abcdefghijklmnopq"}
84+
{"a": "abcdefghijklmnopqr"}
85+
{"a": "abcdefghijklmnopqrs"}
86+
{"a": "abcdefghijklmnopqrst"}
87+
{"a": "abcdefghijklmnopqrstu"}
88+
{"a": "abcdefghijklmnopqrstuv"}
89+
{"a": "abcdefghijklmnopqrstuv\""}
90+
{"a": "abcdefghijklmnopqrstuv\"\t"}
91+
{"a": "abcdefghijklmnopqrstuv\"\tw"}
92+
{"a": "abcdefghijklmnopqrstuv\"\twx"}
93+
{"a": "abcdefghijklmnopqrstuv\"\twxy"}
94+
{"a": "abcdefghijklmnopqrstuv\"\twxyz"}
95+
{"a": "abcdefghijklmnopqrstuv\"\twxyz0"}
96+
{"a": "abcdefghijklmnopqrstuv\"\twxyz01"}
97+
{"a": "abcdefghijklmnopqrstuv\"\twxyz012"}
98+
{"a": "abcdefghijklmnopqrstuv\"\twxyz0123"}
99+
{"a": "abcdefghijklmnopqrstuv\"\twxyz01234"}
100+
{"a": "abcdefghijklmnopqrstuv\"\twxyz012345"}
101+
{"a": "abcdefghijklmnopqrstuv\"\twxyz0123456"}
102+
{"a": "abcdefghijklmnopqrstuv\"\twxyz01234567"}
103+
{"a": "abcdefghijklmnopqrstuv\"\twxyz012345678"}
104+
(38 rows)
105+
58106
-- see json_encoding test for input with unicode escapes
59107
-- Numbers.
60108
SELECT '1'::json; -- OK

src/test/regress/sql/json.sql

+7
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,13 @@ SELECT '"\v"'::json; -- ERROR, not a valid JSON escape
1212
SELECT ('"'||repeat('.', 12)||'abc"')::json; -- OK
1313
SELECT ('"'||repeat('.', 12)||'abc\n"')::json; -- OK, legal escapes
1414

15+
-- Test various lengths of strings to validate SIMD processing to escape
16+
-- special chars in the JSON.
17+
SELECT row_to_json(j)::jsonb FROM (
18+
SELECT left(E'abcdefghijklmnopqrstuv"\twxyz012345678', a) AS a
19+
FROM generate_series(0,37) a
20+
) j;
21+
1522
-- see json_encoding test for input with unicode escapes
1623

1724
-- Numbers.

0 commit comments

Comments
 (0)