@@ -81,37 +81,6 @@ really_inline ErrorValues detect_errors_on_eof(
81
81
return SUCCESS;
82
82
}
83
83
84
- //
85
- // Return a mask of all string characters plus end quotes.
86
- //
87
- // prev_escaped is overflow saying whether the next character is escaped.
88
- // prev_in_string is overflow saying whether we're still in a string.
89
- //
90
- // Backslash sequences outside of quotes will be detected in stage 2.
91
- //
92
- really_inline uint64_t find_strings (const simd_input<ARCHITECTURE> in, uint64_t &prev_escaped, uint64_t &prev_in_string) {
93
- const uint64_t backslash = in.eq (' \\ ' );
94
- const uint64_t escaped = follows_odd_sequence_of (backslash, prev_escaped);
95
- const uint64_t quote = in.eq (' "' ) & ~escaped;
96
- // compute_quote_mask returns start quote plus string contents.
97
- const uint64_t in_string = compute_quote_mask (quote) ^ prev_in_string;
98
- /* right shift of a signed value expected to be well-defined and standard
99
- * compliant as of C++20,
100
- * John Regher from Utah U. says this is fine code */
101
- prev_in_string = static_cast <uint64_t >(static_cast <int64_t >(in_string) >> 63 );
102
- // Use ^ to turn the beginning quote off, and the end quote on.
103
- return in_string ^ quote;
104
- }
105
-
106
- really_inline uint64_t invalid_string_bytes (const uint64_t unescaped, const uint64_t quote_mask) {
107
- /* All Unicode characters may be placed within the
108
- * quotation marks, except for the characters that MUST be escaped:
109
- * quotation mark, reverse solidus, and the control characters (U+0000
110
- * through U+001F).
111
- * https://tools.ietf.org/html/rfc8259 */
112
- return quote_mask & unescaped;
113
- }
114
-
115
84
//
116
85
// Determine which characters are *structural*:
117
86
// - braces: [] and {}
@@ -128,11 +97,7 @@ really_inline uint64_t invalid_string_bytes(const uint64_t unescaped, const uint
128
97
// contents of a string the same as content outside. Errors and structurals inside the string or on
129
98
// the trailing quote will need to be removed later when the correct string information is known.
130
99
//
131
- really_inline uint64_t find_potential_structurals (const simd_input<ARCHITECTURE> in, uint64_t &prev_primitive) {
132
- // These use SIMD so let's kick them off before running the regular 64-bit stuff ...
133
- uint64_t whitespace, op;
134
- find_whitespace_and_operators (in, whitespace, op);
135
-
100
+ really_inline uint64_t find_potential_structurals (uint64_t whitespace, uint64_t op, uint64_t &prev_primitive) {
136
101
// Detect the start of a run of primitive characters. Includes numbers, booleans, and strings (").
137
102
// Everything except whitespace, braces, colon and comma.
138
103
const uint64_t primitive = ~(op | whitespace);
@@ -143,7 +108,61 @@ really_inline uint64_t find_potential_structurals(const simd_input<ARCHITECTURE>
143
108
return op | start_primitive;
144
109
}
145
110
146
- static const size_t STEP_SIZE = 128 ;
111
+ // All independent tasks that scan the SIMD input go here.
112
+ really_inline void stage2_scan_input (
113
+ const simd_input<ARCHITECTURE> in,
114
+ uint64_t &prev_escaped, uint64_t &prev_primitive, utf8_checker<ARCHITECTURE> utf8_state,
115
+ uint64_t "e, uint64_t &potential_structurals, uint64_t &invalid_string_bytes
116
+ ) {
117
+ // Finding real quotes, and finding potential structurals, are interleaved here.
118
+ // First we read the input we need from SIMD for them ...
119
+ uint64_t backslash = in.eq (' \\ ' );
120
+ uint64_t whitespace, op;
121
+ find_whitespace_and_operators (in, whitespace, op);
122
+
123
+ // Then we do the actual work. follows_odd_sequence_of takes longer than find_potential_structurals.
124
+ const uint64_t escaped = follows_odd_sequence_of (backslash, prev_escaped);
125
+ potential_structurals = find_potential_structurals (whitespace, op, prev_primitive);
126
+ uint64_t raw_quote = in.eq (' "' );
127
+ quote = raw_quote & ~escaped;
128
+ invalid_string_bytes = in.lteq (0x1F );
129
+
130
+ // The UTF-8 scan is pretty much independent of anything else, so we stick it at the end here to
131
+ // soak up some CPU.
132
+ utf8_state.check_next_input (in);
133
+ }
134
+
135
+ //
136
+ // Return a mask of all string characters plus end quotes.
137
+ //
138
+ // prev_escaped is overflow saying whether the next character is escaped.
139
+ // prev_in_string is overflow saying whether we're still in a string.
140
+ //
141
+ // Backslash sequences outside of quotes will be detected in stage 2.
142
+ //
143
+ really_inline uint64_t find_strings (uint64_t quote, uint64_t &prev_in_string) {
144
+ // compute_quote_mask returns start quote plus string contents.
145
+ const uint64_t in_string = compute_quote_mask (quote) ^ prev_in_string;
146
+ /* right shift of a signed value expected to be well-defined and standard
147
+ * compliant as of C++20,
148
+ * John Regher from Utah U. says this is fine code */
149
+ prev_in_string = static_cast <uint64_t >(static_cast <int64_t >(in_string) >> 63 );
150
+ // Use ^ to turn the beginning quote off, and the end quote on (i.e. everything except the
151
+ // starting quote is the string's value).
152
+ return in_string ^ quote;
153
+ }
154
+
155
+ really_inline void stage3_find_strings (
156
+ uint64_t quote,
157
+ uint64_t &prev_in_string,
158
+ uint64_t &string,
159
+ const uint64_t potential_structurals, uint64_t &potential_structurals_out, const uint64_t invalid_string_bytes, uint64_t &invalid_string_bytes_out
160
+ ) {
161
+ string = find_strings (quote, prev_in_string);
162
+ potential_structurals_out = potential_structurals;
163
+ invalid_string_bytes_out = invalid_string_bytes;
164
+ }
165
+
147
166
148
167
//
149
168
// Find the important bits of JSON in a 128-byte chunk, and add them to :
@@ -166,67 +185,39 @@ static const size_t STEP_SIZE = 128;
166
185
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
167
186
// workout.
168
187
//
169
- really_inline void find_structural_bits_start (
170
- const uint8_t *buf,
188
+ really_inline bool stage1_read_input (
189
+ const uint8_t *&buf,
190
+ const uint8_t *buf_end,
171
191
simd_input<ARCHITECTURE> &in
172
192
) {
173
- in = simd_input<ARCHITECTURE>(buf);
174
- }
193
+ // If we have a full input, load it up and return it.
194
+ const uint8_t *next_buf = buf+64 ;
195
+ if (likely (next_buf < buf_end)) {
196
+ in = simd_input<ARCHITECTURE>(buf);
197
+ buf = next_buf;
198
+ return true ;
199
+ }
175
200
176
- really_inline void find_structural_bits_middle (
177
- simd_input<ARCHITECTURE> in,
178
- uint64_t &prev_escaped, uint64_t &prev_in_string, uint64_t &prev_primitive,
179
- uint64_t &string, uint64_t &structurals
180
- ) {
181
- string = find_strings (in, prev_escaped, prev_in_string);
182
- structurals = find_potential_structurals (in, prev_primitive);
201
+ // If we have a *partial* (< 64 byte) input, load it up and return it.
202
+ if (likely (buf_end > buf)) {
203
+ uint8_t tmp_buf[64 ];
204
+ memset (tmp_buf, 0x20 , 64 );
205
+ memcpy (tmp_buf, buf, buf_end-buf);
206
+ in = simd_input<ARCHITECTURE>(tmp_buf);
207
+ buf = next_buf;
208
+ return true ;
209
+ }
210
+
211
+ return false ;
183
212
}
184
213
185
- really_inline void find_structural_bits_end (
186
- simd_input<ARCHITECTURE> in, uint64_t idx, uint64_t string, uint64_t structurals ,
187
- uint32_t *&base_ptr, uint64_t &prev_structurals, utf8_checker<ARCHITECTURE> &utf8_state ,
214
+ really_inline void stage4_output_structurals (
215
+ const uint64_t string, const uint64_t structurals, const uint64_t invalid_string_bytes ,
216
+ uint32_t *&base_ptr, size_t &idx ,
188
217
uint64_t &unescaped_chars_error
189
218
) {
190
- uint64_t unescaped = in.lteq (0x1F );
191
- utf8_state.check_next_input (in);
192
- flatten_bits (base_ptr, idx, prev_structurals); // Output *last* iteration's structurals to ParsedJson
193
- prev_structurals = structurals & ~string;
194
- unescaped_chars_error |= unescaped & string;
195
- idx += 64 ;
196
- }
197
-
198
- really_inline void find_structural_bits_128 (
199
- const uint8_t *buf, size_t &idx, uint32_t *&base_ptr,
200
- uint64_t &prev_escaped, uint64_t &prev_in_string,
201
- uint64_t &prev_primitive,
202
- uint64_t &prev_structurals,
203
- uint64_t &unescaped_chars_error,
204
- utf8_checker<ARCHITECTURE> &utf8_state) {
205
- //
206
- // Load up all 128 bytes into SIMD registers
207
- //
208
- simd_input<ARCHITECTURE> in_1, in_2;
209
- find_structural_bits_start (buf, in_1);
210
- find_structural_bits_start (buf+64 , in_2);
211
-
212
- //
213
- // Find the strings and potential structurals (operators / primitives).
214
- //
215
- // This will include false structurals that are *inside* strings--we'll filter strings out
216
- // before we return.
217
- //
218
- uint64_t string_1, structurals_1, string_2, structurals_2;
219
- find_structural_bits_middle (in_1, prev_escaped, prev_in_string, prev_primitive, string_1, structurals_1);
220
- find_structural_bits_middle (in_2, prev_escaped, prev_in_string, prev_primitive, string_2, structurals_2);
221
-
222
- //
223
- // Do miscellaneous work while the processor is busy calculating strings and structurals.
224
- //
225
- // After that, weed out structurals that are inside strings and find invalid string characters.
226
- //
227
- find_structural_bits_end (in_1, idx, string_1, structurals_1, base_ptr, prev_structurals, utf8_state, unescaped_chars_error);
228
- find_structural_bits_end (in_2, idx+64 , string_2, structurals_2, base_ptr, prev_structurals, utf8_state, unescaped_chars_error);
229
- idx += 128 ;
219
+ flatten_bits (base_ptr, idx, structurals & ~string);
220
+ unescaped_chars_error |= invalid_string_bytes & string;
230
221
}
231
222
232
223
int find_structural_bits (const uint8_t *buf, size_t len, simdjson::ParsedJson &pj) {
@@ -236,49 +227,56 @@ int find_structural_bits(const uint8_t *buf, size_t len, simdjson::ParsedJson &p
236
227
<< len << " bytes" << std::endl;
237
228
return simdjson::CAPACITY;
238
229
}
239
- if (unlikely (len == 0 )) {
240
- return simdjson::EMPTY;
241
- }
242
- uint32_t *base_ptr = pj.structural_indexes ;
243
- utf8_checker<ARCHITECTURE> utf8_state;
244
230
231
+ // Stage 1 state and output
232
+ const uint8_t *buf_end = buf + len;
233
+ simd_input<ARCHITECTURE> in;
234
+
235
+ // Stage 2 state and output
236
+ uint64_t quote, potential_structurals, invalid_string_bytes;
245
237
// Whether the first character of the next iteration is escaped.
246
- uint64_t prev_escaped = 0ULL ;
247
- // Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
248
- uint64_t prev_in_string = 0ULL ;
238
+ uint64_t prev_escaped = 0 ;
249
239
// Whether the last character of the previous iteration is a primitive value character
250
240
// (anything except whitespace, braces, comma or colon).
251
- uint64_t prev_primitive = 0ULL ;
252
- // Mask of structural characters from the last iteration.
253
- // Kept around for performance reasons, so we can call flatten_bits to soak up some unused
254
- // CPU capacity while the next iteration is busy with an expensive clmul in compute_quote_mask.
255
- uint64_t structurals = 0 ;
241
+ uint64_t prev_primitive = 0 ;
242
+ utf8_checker<ARCHITECTURE> utf8_state;
243
+
244
+ // Stage 3 state and output
245
+ uint64_t prev_in_string = 0 ;
246
+ uint64_t string, potential_structurals2, invalid_string_bytes2;
256
247
257
- size_t last_buf_size = (len % STEP_SIZE == 0 ) ? STEP_SIZE : (len % STEP_SIZE);
258
- const uint8_t *last_buf = buf + len - last_buf_size ;
259
- size_t idx = 0 ;
248
+ // Stage 4 state and output
249
+ uint32_t *base_ptr = pj. structural_indexes ;
250
+ uint64_t idx = 0 ;
260
251
// Errors with unescaped characters in strings (ASCII codepoints < 0x20)
261
252
uint64_t unescaped_chars_error = 0 ;
262
253
263
- while (buf < last_buf) {
264
- find_structural_bits_128 (buf, idx, base_ptr,
265
- prev_escaped, prev_in_string, prev_primitive,
266
- structurals, unescaped_chars_error, utf8_state);
267
- buf += 128 ;
268
- }
254
+ // Read the first input (if any)
255
+ if (stage1_read_input (buf, buf_end, in)) {
256
+ // Stage 2 is ready. Push it so stage 3 is ready.
257
+ stage2_scan_input (in, prev_escaped, prev_primitive, utf8_state, quote, potential_structurals, invalid_string_bytes);
269
258
270
- /* If we have a final chunk of less than 64 bytes, pad it to 64 with
271
- * spaces before processing it (otherwise, we risk invalidating the UTF-8
272
- * checks). */
273
- uint8_t tmp_buf[STEP_SIZE];
274
- memset (tmp_buf, 0x20 , STEP_SIZE);
275
- memcpy (tmp_buf, last_buf, last_buf_size);
276
- find_structural_bits_128 (&tmp_buf[0 ], idx, base_ptr,
277
- prev_escaped, prev_in_string, prev_primitive,
278
- structurals, unescaped_chars_error, utf8_state);
259
+ // Read the second input (if any)
260
+ if (stage1_read_input (buf, buf_end, in)) {
261
+ // stages 2 and 3 are ready. Push them so 3 and 4 are ready.
262
+ stage3_find_strings (quote, prev_in_string, string, potential_structurals, potential_structurals2, invalid_string_bytes, invalid_string_bytes2);
263
+ stage2_scan_input (in, prev_escaped, prev_primitive, utf8_state, quote, potential_structurals, invalid_string_bytes);
279
264
280
- /* finally, flatten out the remaining structurals from the last iteration */
281
- flatten_bits (base_ptr, idx, structurals);
265
+ // Read remaining inputs
266
+ while (stage1_read_input (buf, buf_end, in)) {
267
+ // stages 2, 3 and 4 are ready. Push them so 3 and 4 are ready.
268
+ stage4_output_structurals (string, potential_structurals2, invalid_string_bytes2, base_ptr, idx, unescaped_chars_error);
269
+ stage3_find_strings (quote, prev_in_string, string, potential_structurals, potential_structurals2, invalid_string_bytes, invalid_string_bytes2);
270
+ stage2_scan_input (in, prev_escaped, prev_primitive, utf8_state, quote, potential_structurals, invalid_string_bytes);
271
+ }
272
+ // stages 3 and 4 are ready. Push 4 so only 3 is ready.
273
+ stage4_output_structurals (string, potential_structurals2, invalid_string_bytes2, base_ptr, idx, unescaped_chars_error);
274
+ }
275
+
276
+ // stage 3 is ready. Finish it.
277
+ stage3_find_strings (quote, prev_in_string, string, potential_structurals, potential_structurals2, invalid_string_bytes, invalid_string_bytes2);
278
+ stage4_output_structurals (string, potential_structurals2, invalid_string_bytes2, base_ptr, idx, unescaped_chars_error);
279
+ }
282
280
283
281
simdjson::ErrorValues error = detect_errors_on_eof (unescaped_chars_error, prev_in_string);
284
282
if (unlikely (error != simdjson::SUCCESS)) {
0 commit comments