@@ -89,7 +89,7 @@ really_inline ErrorValues detect_errors_on_eof(
89
89
//
90
90
// Backslash sequences outside of quotes will be detected in stage 2.
91
91
//
92
- really_inline uint64_t find_in_string (const simd_input<ARCHITECTURE> in, uint64_t &prev_escaped, uint64_t &prev_in_string) {
92
+ really_inline uint64_t find_strings (const simd_input<ARCHITECTURE> in, uint64_t &prev_escaped, uint64_t &prev_in_string) {
93
93
const uint64_t backslash = in.eq (' \\ ' );
94
94
const uint64_t escaped = follows_odd_sequence_of (backslash, prev_escaped);
95
95
const uint64_t quote = in.eq (' "' ) & ~escaped;
@@ -103,13 +103,12 @@ really_inline uint64_t find_in_string(const simd_input<ARCHITECTURE> in, uint64_
103
103
return in_string ^ quote;
104
104
}
105
105
106
- really_inline uint64_t invalid_string_bytes (const simd_input<ARCHITECTURE> in , const uint64_t quote_mask) {
106
+ really_inline uint64_t invalid_string_bytes (const uint64_t unescaped , const uint64_t quote_mask) {
107
107
/* All Unicode characters may be placed within the
108
108
* quotation marks, except for the characters that MUST be escaped:
109
109
* quotation mark, reverse solidus, and the control characters (U+0000
110
110
* through U+001F).
111
111
* https://tools.ietf.org/html/rfc8259 */
112
- const uint64_t unescaped = in.lteq (0x1F );
113
112
return quote_mask & unescaped;
114
113
}
115
114
@@ -129,7 +128,7 @@ really_inline uint64_t invalid_string_bytes(const simd_input<ARCHITECTURE> in, c
129
128
// contents of a string the same as content outside. Errors and structurals inside the string or on
130
129
// the trailing quote will need to be removed later when the correct string information is known.
131
130
//
132
- really_inline uint64_t find_structurals (const simd_input<ARCHITECTURE> in, uint64_t &prev_primitive) {
131
+ really_inline uint64_t find_potential_structurals (const simd_input<ARCHITECTURE> in, uint64_t &prev_primitive) {
133
132
// These use SIMD so let's kick them off before running the regular 64-bit stuff ...
134
133
uint64_t whitespace, op;
135
134
find_whitespace_and_operators (in, whitespace, op);
@@ -144,28 +143,69 @@ really_inline uint64_t find_structurals(const simd_input<ARCHITECTURE> in, uint6
144
143
return op | start_primitive;
145
144
}
146
145
147
- // Find structural bits in a 64-byte chunk.
148
- really_inline void find_structural_bits_64 (
146
+ static const size_t STEP_SIZE = 128 ;
147
+
148
+ //
149
+ // Find the important bits of JSON in a 128-byte chunk, and add them to :
150
+ //
151
+ //
152
+ //
153
+ // PERF NOTES:
154
+ // We pipe 2 inputs through these stages:
155
+ // 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
156
+ // 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available.
157
+ // 2. Scan the JSON for critical data: strings, primitives and operators. This is the critical path.
158
+ // The output of step 1 depends entirely on this information. These functions don't quite use
159
+ // up enough CPU: the second half of the functions is highly serial, only using 1 execution core
160
+ // at a time. The second input's scans has some dependency on the first ones finishing it, but
161
+ // they can make a lot of progress before they need that information.
162
+ // 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
163
+ // to finish: utf-8 checks and generating the output from the last iteration.
164
+ //
165
+ // The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
166
+ // available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
167
+ // workout.
168
+ //
169
+ really_inline void find_structural_bits_128 (
149
170
const uint8_t *buf, const size_t idx, uint32_t *&base_ptr,
150
171
uint64_t &prev_escaped, uint64_t &prev_in_string,
151
172
uint64_t &prev_primitive,
152
- uint64_t &structurals ,
173
+ uint64_t &prev_structurals ,
153
174
uint64_t &unescaped_chars_error,
154
175
utf8_checker<ARCHITECTURE> &utf8_state) {
155
- // Validate UTF-8
156
- const simd_input<ARCHITECTURE> in (buf);
157
- utf8_state.check_next_input (in);
176
+ //
177
+ // Load up all 128 bytes into SIMD registers
178
+ //
179
+ simd_input<ARCHITECTURE> in_1 (buf);
180
+ simd_input<ARCHITECTURE> in_2 (buf+64 );
158
181
159
- // Detect values in strings
160
- const uint64_t in_string = find_in_string (in, prev_escaped, prev_in_string);
161
- unescaped_chars_error |= invalid_string_bytes (in, in_string);
182
+ //
183
+ // Find the strings and potential structurals (operators / primitives).
184
+ //
185
+ // This will include false structurals that are *inside* strings--we'll filter strings out
186
+ // before we return.
187
+ //
188
+ uint64_t string_1 = find_strings (in_1, prev_escaped, prev_in_string);
189
+ uint64_t structurals_1 = find_potential_structurals (in_1, prev_primitive);
190
+ uint64_t string_2 = find_strings (in_2, prev_escaped, prev_in_string);
191
+ uint64_t structurals_2 = find_potential_structurals (in_2, prev_primitive);
162
192
163
- /* take the previous iterations structural bits, not our current
164
- * iteration, and flatten */
165
- flatten_bits (base_ptr, idx, structurals);
193
+ //
194
+ // Do miscellaneous work while the processor is busy calculating strings and structurals.
195
+ //
196
+ // After that, weed out structurals that are inside strings and find invalid string characters.
197
+ //
198
+ uint64_t unescaped_1 = in_1.lteq (0x1F );
199
+ utf8_state.check_next_input (in_1);
200
+ flatten_bits (base_ptr, idx, prev_structurals); // Output *last* iteration's structurals to ParsedJson
201
+ prev_structurals = structurals_1 & ~string_1;
202
+ unescaped_chars_error |= unescaped_1 & string_1;
166
203
167
- // find_structurals doesn't use in_string; we filter that out here.
168
- structurals = find_structurals (in, prev_primitive) & ~in_string;
204
+ uint64_t unescaped_2 = in_2.lteq (0x1F );
205
+ utf8_state.check_next_input (in_2);
206
+ flatten_bits (base_ptr, idx+64 , prev_structurals); // Output *last* iteration's structurals to ParsedJson
207
+ prev_structurals = structurals_2 & ~string_2;
208
+ unescaped_chars_error |= unescaped_2 & string_2;
169
209
}
170
210
171
211
int find_structural_bits (const uint8_t *buf, size_t len, simdjson::ParsedJson &pj) {
@@ -178,10 +218,6 @@ int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &p
178
218
uint32_t *base_ptr = pj.structural_indexes ;
179
219
utf8_checker<ARCHITECTURE> utf8_state;
180
220
181
- /* we have padded the input out to 64 byte multiple with the remainder
182
- * being zeros persistent state across loop does the last iteration end
183
- * with an odd-length sequence of backslashes? */
184
-
185
221
// Whether the first character of the next iteration is escaped.
186
222
uint64_t prev_escaped = 0ULL ;
187
223
// Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
@@ -194,27 +230,28 @@ int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &p
194
230
// CPU capacity while the next iteration is busy with an expensive clmul in compute_quote_mask.
195
231
uint64_t structurals = 0 ;
196
232
197
- size_t lenminus64 = len < 64 ? 0 : len - 64 ;
233
+ size_t lenminusstep = len < STEP_SIZE ? 0 : len - STEP_SIZE ;
198
234
size_t idx = 0 ;
199
235
// Errors with unescaped characters in strings (ASCII codepoints < 0x20)
200
236
uint64_t unescaped_chars_error = 0 ;
201
237
202
- for (; idx < lenminus64 ; idx += 64 ) {
203
- find_structural_bits_64 (&buf[idx], idx, base_ptr,
204
- prev_escaped, prev_in_string, prev_primitive,
205
- structurals, unescaped_chars_error, utf8_state);
238
+ for (; idx < lenminusstep ; idx += STEP_SIZE ) {
239
+ find_structural_bits_128 (&buf[idx], idx, base_ptr,
240
+ prev_escaped, prev_in_string, prev_primitive,
241
+ structurals, unescaped_chars_error, utf8_state);
206
242
}
243
+
207
244
/* If we have a final chunk of less than 64 bytes, pad it to 64 with
208
245
* spaces before processing it (otherwise, we risk invalidating the UTF-8
209
246
* checks). */
210
247
if (likely (idx < len)) {
211
- uint8_t tmp_buf[64 ];
212
- memset (tmp_buf, 0x20 , 64 );
248
+ uint8_t tmp_buf[STEP_SIZE ];
249
+ memset (tmp_buf, 0x20 , STEP_SIZE );
213
250
memcpy (tmp_buf, buf + idx, len - idx);
214
- find_structural_bits_64 (&tmp_buf[0 ], idx, base_ptr,
215
- prev_escaped, prev_in_string, prev_primitive,
216
- structurals, unescaped_chars_error, utf8_state);
217
- idx += 64 ;
251
+ find_structural_bits_128 (&tmp_buf[0 ], idx, base_ptr,
252
+ prev_escaped, prev_in_string, prev_primitive,
253
+ structurals, unescaped_chars_error, utf8_state);
254
+ idx += STEP_SIZE ;
218
255
}
219
256
220
257
/* finally, flatten out the remaining structurals from the last iteration */
0 commit comments