Skip to content

Commit 880e892

Browse files
committed
Reorder and unroll+interleave stage 1 loop
1 parent 060b622 commit 880e892

File tree

5 files changed

+116
-70
lines changed

5 files changed

+116
-70
lines changed

include/simdjson/common_defs.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,17 @@
1717
#define SIMDJSON_PADDING 32
1818
#endif
1919

20+
#if defined(__GNUC__)
21+
// Marks a block with a name so that MCA analysis can see it.
22+
#define BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name);
23+
#define END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name);
24+
#define DEBUG_BLOCK(name, block) BEGIN_DEBUG_BLOCK(name); block; END_DEBUG_BLOCK(name);
25+
#else
26+
#define BEGIN_DEBUG_BLOCK(name)
27+
#define END_DEBUG_BLOCK(name)
28+
#define DEBUG_BLOCK(name, block)
29+
#endif
30+
2031
#ifndef _MSC_VER
2132
// Implemented using Labels as Values which works in GCC and CLANG (and maybe
2233
// also in Intel's compiler), but won't work in MSVC.

src/arm64/simd_input.h

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -40,21 +40,21 @@ using namespace simdjson::arm64;
4040

4141
template <>
4242
struct simd_input<Architecture::ARM64> {
43-
uint8x16_t chunks[4];
43+
const uint8x16_t chunks[4];
4444

45-
really_inline simd_input(const uint8_t *ptr) {
46-
this->chunks[0] = vld1q_u8(ptr + 0*16);
47-
this->chunks[1] = vld1q_u8(ptr + 1*16);
48-
this->chunks[2] = vld1q_u8(ptr + 2*16);
49-
this->chunks[3] = vld1q_u8(ptr + 3*16);
50-
}
45+
really_inline simd_input()
46+
: chunks{uint8x16_t(), uint8x16_t(), uint8x16_t(), uint8x16_t() } {}
5147

52-
really_inline simd_input(uint8x16_t chunk0, uint8x16_t chunk1, uint8x16_t chunk2, uint8x16_t chunk3) {
53-
this->chunks[0] = chunk0;
54-
this->chunks[1] = chunk1;
55-
this->chunks[2] = chunk2;
56-
this->chunks[3] = chunk3;
57-
}
48+
really_inline simd_input(const uint8x16_t chunk0, const uint8x16_t chunk1, const uint8x16_t chunk2, const uint8x16_t chunk3)
49+
: chunks{chunk0, chunk1, chunk2, chunk3 } {}
50+
51+
really_inline simd_input(const uint8_t *ptr)
52+
: chunks{
53+
vld1q_u8(ptr + 0*16),
54+
vld1q_u8(ptr + 1*16),
55+
vld1q_u8(ptr + 2*16),
56+
vld1q_u8(ptr + 3*16)
57+
} {}
5858

5959
template <typename F>
6060
really_inline void each(F const& each_chunk) const {

src/generic/stage1_find_marks.h

Lines changed: 70 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ really_inline ErrorValues detect_errors_on_eof(
8989
//
9090
// Backslash sequences outside of quotes will be detected in stage 2.
9191
//
92-
really_inline uint64_t find_in_string(const simd_input<ARCHITECTURE> in, uint64_t &prev_escaped, uint64_t &prev_in_string) {
92+
really_inline uint64_t find_strings(const simd_input<ARCHITECTURE> in, uint64_t &prev_escaped, uint64_t &prev_in_string) {
9393
const uint64_t backslash = in.eq('\\');
9494
const uint64_t escaped = follows_odd_sequence_of(backslash, prev_escaped);
9595
const uint64_t quote = in.eq('"') & ~escaped;
@@ -103,13 +103,12 @@ really_inline uint64_t find_in_string(const simd_input<ARCHITECTURE> in, uint64_
103103
return in_string ^ quote;
104104
}
105105

106-
really_inline uint64_t invalid_string_bytes(const simd_input<ARCHITECTURE> in, const uint64_t quote_mask) {
106+
really_inline uint64_t invalid_string_bytes(const uint64_t unescaped, const uint64_t quote_mask) {
107107
/* All Unicode characters may be placed within the
108108
* quotation marks, except for the characters that MUST be escaped:
109109
* quotation mark, reverse solidus, and the control characters (U+0000
110110
* through U+001F).
111111
* https://tools.ietf.org/html/rfc8259 */
112-
const uint64_t unescaped = in.lteq(0x1F);
113112
return quote_mask & unescaped;
114113
}
115114

@@ -129,7 +128,7 @@ really_inline uint64_t invalid_string_bytes(const simd_input<ARCHITECTURE> in, c
129128
// contents of a string the same as content outside. Errors and structurals inside the string or on
130129
// the trailing quote will need to be removed later when the correct string information is known.
131130
//
132-
really_inline uint64_t find_structurals(const simd_input<ARCHITECTURE> in, uint64_t &prev_primitive) {
131+
really_inline uint64_t find_potential_structurals(const simd_input<ARCHITECTURE> in, uint64_t &prev_primitive) {
133132
// These use SIMD so let's kick them off before running the regular 64-bit stuff ...
134133
uint64_t whitespace, op;
135134
find_whitespace_and_operators(in, whitespace, op);
@@ -144,28 +143,69 @@ really_inline uint64_t find_structurals(const simd_input<ARCHITECTURE> in, uint6
144143
return op | start_primitive;
145144
}
146145

147-
// Find structural bits in a 64-byte chunk.
148-
really_inline void find_structural_bits_64(
146+
static const size_t STEP_SIZE = 128;
147+
148+
//
149+
// Find the important bits of JSON in a 128-byte chunk, and add them to :
150+
//
151+
//
152+
//
153+
// PERF NOTES:
154+
// We pipe 2 inputs through these stages:
155+
// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
156+
// 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available.
157+
// 2. Scan the JSON for critical data: strings, primitives and operators. This is the critical path.
158+
// The output of step 1 depends entirely on this information. These functions don't quite use
159+
// up enough CPU: the second half of the functions is highly serial, only using 1 execution core
160+
// at a time. The second input's scans has some dependency on the first ones finishing it, but
161+
// they can make a lot of progress before they need that information.
162+
// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
163+
// to finish: utf-8 checks and generating the output from the last iteration.
164+
//
165+
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
166+
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
167+
// workout.
168+
//
169+
really_inline void find_structural_bits_128(
149170
const uint8_t *buf, const size_t idx, uint32_t *&base_ptr,
150171
uint64_t &prev_escaped, uint64_t &prev_in_string,
151172
uint64_t &prev_primitive,
152-
uint64_t &structurals,
173+
uint64_t &prev_structurals,
153174
uint64_t &unescaped_chars_error,
154175
utf8_checker<ARCHITECTURE> &utf8_state) {
155-
// Validate UTF-8
156-
const simd_input<ARCHITECTURE> in(buf);
157-
utf8_state.check_next_input(in);
176+
//
177+
// Load up all 128 bytes into SIMD registers
178+
//
179+
simd_input<ARCHITECTURE> in_1(buf);
180+
simd_input<ARCHITECTURE> in_2(buf+64);
158181

159-
// Detect values in strings
160-
const uint64_t in_string = find_in_string(in, prev_escaped, prev_in_string);
161-
unescaped_chars_error |= invalid_string_bytes(in, in_string);
182+
//
183+
// Find the strings and potential structurals (operators / primitives).
184+
//
185+
// This will include false structurals that are *inside* strings--we'll filter strings out
186+
// before we return.
187+
//
188+
uint64_t string_1 = find_strings(in_1, prev_escaped, prev_in_string);
189+
uint64_t structurals_1 = find_potential_structurals(in_1, prev_primitive);
190+
uint64_t string_2 = find_strings(in_2, prev_escaped, prev_in_string);
191+
uint64_t structurals_2 = find_potential_structurals(in_2, prev_primitive);
162192

163-
/* take the previous iterations structural bits, not our current
164-
* iteration, and flatten */
165-
flatten_bits(base_ptr, idx, structurals);
193+
//
194+
// Do miscellaneous work while the processor is busy calculating strings and structurals.
195+
//
196+
// After that, weed out structurals that are inside strings and find invalid string characters.
197+
//
198+
uint64_t unescaped_1 = in_1.lteq(0x1F);
199+
utf8_state.check_next_input(in_1);
200+
flatten_bits(base_ptr, idx, prev_structurals); // Output *last* iteration's structurals to ParsedJson
201+
prev_structurals = structurals_1 & ~string_1;
202+
unescaped_chars_error |= unescaped_1 & string_1;
166203

167-
// find_structurals doesn't use in_string; we filter that out here.
168-
structurals = find_structurals(in, prev_primitive) & ~in_string;
204+
uint64_t unescaped_2 = in_2.lteq(0x1F);
205+
utf8_state.check_next_input(in_2);
206+
flatten_bits(base_ptr, idx+64, prev_structurals); // Output *last* iteration's structurals to ParsedJson
207+
prev_structurals = structurals_2 & ~string_2;
208+
unescaped_chars_error |= unescaped_2 & string_2;
169209
}
170210

171211
int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &pj) {
@@ -178,10 +218,6 @@ int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &p
178218
uint32_t *base_ptr = pj.structural_indexes;
179219
utf8_checker<ARCHITECTURE> utf8_state;
180220

181-
/* we have padded the input out to 64 byte multiple with the remainder
182-
* being zeros persistent state across loop does the last iteration end
183-
* with an odd-length sequence of backslashes? */
184-
185221
// Whether the first character of the next iteration is escaped.
186222
uint64_t prev_escaped = 0ULL;
187223
// Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
@@ -194,27 +230,28 @@ int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &p
194230
// CPU capacity while the next iteration is busy with an expensive clmul in compute_quote_mask.
195231
uint64_t structurals = 0;
196232

197-
size_t lenminus64 = len < 64 ? 0 : len - 64;
233+
size_t lenminusstep = len < STEP_SIZE ? 0 : len - STEP_SIZE;
198234
size_t idx = 0;
199235
// Errors with unescaped characters in strings (ASCII codepoints < 0x20)
200236
uint64_t unescaped_chars_error = 0;
201237

202-
for (; idx < lenminus64; idx += 64) {
203-
find_structural_bits_64(&buf[idx], idx, base_ptr,
204-
prev_escaped, prev_in_string, prev_primitive,
205-
structurals, unescaped_chars_error, utf8_state);
238+
for (; idx < lenminusstep; idx += STEP_SIZE) {
239+
find_structural_bits_128(&buf[idx], idx, base_ptr,
240+
prev_escaped, prev_in_string, prev_primitive,
241+
structurals, unescaped_chars_error, utf8_state);
206242
}
243+
207244
/* If we have a final chunk of less than 64 bytes, pad it to 64 with
208245
* spaces before processing it (otherwise, we risk invalidating the UTF-8
209246
* checks). */
210247
if (likely(idx < len)) {
211-
uint8_t tmp_buf[64];
212-
memset(tmp_buf, 0x20, 64);
248+
uint8_t tmp_buf[STEP_SIZE];
249+
memset(tmp_buf, 0x20, STEP_SIZE);
213250
memcpy(tmp_buf, buf + idx, len - idx);
214-
find_structural_bits_64(&tmp_buf[0], idx, base_ptr,
215-
prev_escaped, prev_in_string, prev_primitive,
216-
structurals, unescaped_chars_error, utf8_state);
217-
idx += 64;
251+
find_structural_bits_128(&tmp_buf[0], idx, base_ptr,
252+
prev_escaped, prev_in_string, prev_primitive,
253+
structurals, unescaped_chars_error, utf8_state);
254+
idx += STEP_SIZE;
218255
}
219256

220257
/* finally, flatten out the remaining structurals from the last iteration */

src/haswell/simd_input.h

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,19 +10,18 @@ namespace simdjson {
1010

1111
template <>
1212
struct simd_input<Architecture::HASWELL> {
13-
__m256i chunks[2];
13+
const __m256i chunks[2];
1414

15-
really_inline simd_input(const uint8_t *ptr)
16-
{
17-
this->chunks[0] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0*32));
18-
this->chunks[1] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 1*32));
19-
}
15+
really_inline simd_input() : chunks{__m256i(), __m256i()} {}
2016

2117
really_inline simd_input(const __m256i chunk0, const __m256i chunk1)
22-
{
23-
this->chunks[0] = chunk0;
24-
this->chunks[1] = chunk1;
25-
}
18+
: chunks{chunk0, chunk1} {}
19+
20+
really_inline simd_input(const uint8_t *ptr)
21+
: chunks{
22+
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0*32)),
23+
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 1*32))
24+
} {}
2625

2726
template <typename F>
2827
really_inline void each(F const& each_chunk) const

src/westmere/simd_input.h

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,22 +10,21 @@ namespace simdjson {
1010

1111
template <>
1212
struct simd_input<Architecture::WESTMERE> {
13-
__m128i chunks[4];
13+
const __m128i chunks[4];
1414

15-
really_inline simd_input(const uint8_t *ptr) {
16-
this->chunks[0] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 0));
17-
this->chunks[1] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16));
18-
this->chunks[2] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 32));
19-
this->chunks[3] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 48));
20-
}
15+
really_inline simd_input()
16+
: chunks { __m128i(), __m128i(), __m128i(), __m128i() } {}
2117

22-
really_inline simd_input(__m128i i0, __m128i i1, __m128i i2, __m128i i3)
23-
{
24-
this->chunks[0] = i0;
25-
this->chunks[1] = i1;
26-
this->chunks[2] = i2;
27-
this->chunks[3] = i3;
28-
}
18+
really_inline simd_input(const __m128i chunk0, const __m128i chunk1, const __m128i chunk2, const __m128i chunk3)
19+
: chunks{chunk0, chunk1, chunk2, chunk3} {}
20+
21+
really_inline simd_input(const uint8_t *ptr)
22+
: simd_input(
23+
_mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 0)),
24+
_mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16)),
25+
_mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 32)),
26+
_mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 48))
27+
) {}
2928

3029
template <typename F>
3130
really_inline void each(F const& each_chunk) const {

0 commit comments

Comments
 (0)