Skip to content

Commit 71d3746

Browse files
committed
Store current buf in parser
1 parent 856d362 commit 71d3746

File tree

5 files changed

+70
-81
lines changed

5 files changed

+70
-81
lines changed

include/simdjson/dom/parser.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -351,6 +351,10 @@ class parser {
351351
uint32_t current_loc{0};
352352
/** @private Current location in string buffer */
353353
uint8_t *current_string_buf_loc;
354+
/** @private Buffer currently being parsed */
355+
const uint8_t *parsing_buf;
356+
/** @private Index (into structural_indexes) of the next structural to parse */
357+
size_t next_structural;
354358

355359
/** @private Number of structural indices passed from stage 1 to stage 2 */
356360
uint32_t n_structural_indexes{0};

src/generic/stage2/logger.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ namespace logger {
5555
}
5656
printf("| %c ", printable_char(structurals.at_beginning() ? ' ' : structurals.current_char()));
5757
printf("| %c ", printable_char(structurals.peek_char()));
58-
printf("| %5zd ", structurals.next_structural);
58+
printf("| %5zd ", structurals.doc_parser.next_structural);
5959
printf("| %-*s ", LOG_DETAIL_LEN, detail);
6060
printf("|\n");
6161
}

src/generic/stage2/streaming_structural_parser.h

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,8 @@
11
namespace stage2 {
22

33
struct streaming_structural_parser: structural_parser {
4-
really_inline streaming_structural_parser(
5-
const uint8_t *buf,
6-
size_t len,
7-
parser &_doc_parser,
8-
size_t &next_structural
9-
) : structural_parser(buf, len, _doc_parser, next_structural) {}
4+
really_inline streaming_structural_parser(size_t len, parser &_doc_parser)
5+
: structural_parser(len, _doc_parser) {}
106

117
// override to add streaming
128
WARN_UNUSED really_inline error_code start(UNUSED size_t len) {
@@ -24,7 +20,7 @@ struct streaming_structural_parser: structural_parser {
2420

2521
// override to add streaming
2622
WARN_UNUSED really_inline error_code finish() {
27-
if ( structurals.past_end(doc_parser.n_structural_indexes) ) {
23+
if ( structurals.past_end(doc_parser().n_structural_indexes) ) {
2824
log_error("IMPOSSIBLE: past the end of the JSON!");
2925
return on_error(TAPE_ERROR);
3026
}
@@ -33,7 +29,7 @@ struct streaming_structural_parser: structural_parser {
3329
log_error("Unclosed objects or arrays!");
3430
return on_error(TAPE_ERROR);
3531
}
36-
bool finished = structurals.at_end(doc_parser.n_structural_indexes);
32+
bool finished = structurals.at_end(doc_parser().n_structural_indexes);
3733
if (!finished) { log_value("(and has more)"); }
3834
return on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE);
3935
}
@@ -46,12 +42,15 @@ struct streaming_structural_parser: structural_parser {
4642
* for documentation.
4743
***********/
4844
WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser, size_t &next_json) const noexcept {
49-
stage2::streaming_structural_parser parser(buf, len, doc_parser, next_json);
45+
doc_parser.parsing_buf = buf;
46+
doc_parser.next_structural = next_json;
47+
stage2::streaming_structural_parser parser(len, doc_parser);
5048
error_code result = parser.start(len);
5149
if (result) { return result; }
5250

5351
if (parser.parse_root_value()) {
5452
return parser.error();
5553
}
54+
next_json = doc_parser.next_structural;
5655
return parser.finish();
5756
}

src/generic/stage2/structural_iterator.h

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,23 +2,23 @@ namespace stage2 {
22

33
class structural_iterator {
44
public:
5-
really_inline structural_iterator(const uint8_t* _buf, size_t _len, const uint32_t *_structural_indexes, size_t &_next_structural)
6-
: buf{_buf},
7-
len{_len},
8-
structural_indexes{_structural_indexes},
9-
next_structural{_next_structural}
5+
really_inline structural_iterator(parser &_doc_parser, size_t _len)
6+
: doc_parser{_doc_parser},
7+
buf{_doc_parser.parsing_buf},
8+
len{_len},
9+
structural_indexes{_doc_parser.structural_indexes.get()}
1010
{}
1111
really_inline char advance_char() {
12-
idx = structural_indexes[next_structural];
13-
next_structural++;
12+
idx = structural_indexes[doc_parser.next_structural];
13+
doc_parser.next_structural++;
1414
c = *current();
1515
return c;
1616
}
1717
really_inline char current_char() {
1818
return c;
1919
}
2020
really_inline char peek_char() {
21-
return buf[structural_indexes[next_structural]];
21+
return buf[structural_indexes[doc_parser.next_structural]];
2222
}
2323
really_inline const uint8_t* current() {
2424
return &buf[idx];
@@ -52,22 +52,22 @@ class structural_iterator {
5252
return result;
5353
}
5454
really_inline bool past_end(uint32_t n_structural_indexes) {
55-
return next_structural+1 > n_structural_indexes;
55+
return doc_parser.next_structural+1 > n_structural_indexes;
5656
}
5757
really_inline bool at_end(uint32_t n_structural_indexes) {
58-
return next_structural+1 == n_structural_indexes;
58+
return doc_parser.next_structural+1 == n_structural_indexes;
5959
}
6060
really_inline bool at_beginning() {
61-
return next_structural == 0;
61+
return doc_parser.next_structural == 0;
6262
}
6363
really_inline size_t next_structural_index() {
64-
return next_structural;
64+
return doc_parser.next_structural;
6565
}
6666

67+
parser &doc_parser;
6768
const uint8_t* const buf;
6869
const size_t len;
6970
const uint32_t* const structural_indexes;
70-
size_t &next_structural; // next structural index
7171
size_t idx{0}; // location of the structural character in the input (buf)
7272
uint8_t c{0}; // used to track the (structural) character we are looking at
7373
};

src/generic/stage2/structural_parser.h

Lines changed: 44 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -30,31 +30,28 @@ struct number_writer {
3030

3131
struct structural_parser {
3232
structural_iterator structurals;
33-
parser &doc_parser;
3433
uint32_t depth;
3534

3635
really_inline structural_parser(
37-
const uint8_t *buf,
3836
size_t len,
39-
parser &_doc_parser,
40-
size_t &next_structural
41-
) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural),
42-
doc_parser{_doc_parser},
43-
depth{0} {
44-
}
37+
parser &_doc_parser
38+
) : structurals(_doc_parser, len), depth{0} {}
4539

4640
really_inline structural_parser(
47-
const uint8_t *buf,
4841
parser &_doc_parser,
49-
size_t &next_structural,
5042
uint32_t _depth
51-
) : structurals(buf, 0, _doc_parser.structural_indexes.get(), next_structural),
52-
doc_parser{_doc_parser},
53-
depth{_depth} {
43+
) : structurals(_doc_parser, 0), depth{_depth} {}
44+
45+
really_inline parser &doc_parser() {
46+
return structurals.doc_parser;
47+
}
48+
49+
really_inline document &doc() {
50+
return doc_parser().doc;
5451
}
5552

5653
WARN_UNUSED really_inline bool start_scope(internal::tape_type type) {
57-
bool exceeded_max_depth = depth >= doc_parser.max_depth();
54+
bool exceeded_max_depth = depth >= doc_parser().max_depth();
5855
if (exceeded_max_depth) { log_error("Exceeded max depth!"); return true; }
5956
write_tape(0, type); // if the document is correct, this gets rewritten later
6057
return false;
@@ -84,7 +81,7 @@ struct structural_parser {
8481
// the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff).
8582
const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
8683
// This is a load and an OR. It would be possible to just write once at doc.tape[d.tape_index]
87-
doc_parser.doc.tape[start_loc] |= doc_parser.current_loc | (uint64_t(cntsat) << 32);
84+
doc().tape[start_loc] |= doc_parser().current_loc | (uint64_t(cntsat) << 32);
8885
}
8986

9087
really_inline void end_object(uint32_t start_loc, uint32_t count) {
@@ -101,25 +98,25 @@ struct structural_parser {
10198
}
10299

103100
really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept {
104-
doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56);
101+
doc().tape[doc_parser().current_loc++] = val | ((uint64_t(char(t))) << 56);
105102
}
106103

107104
really_inline uint8_t *on_start_string() noexcept {
108105
// we advance the pointer past the uint32_t length prefix; the NULL termination is written in on_end_string
109-
write_tape(doc_parser.current_string_buf_loc - doc_parser.doc.string_buf.get(), internal::tape_type::STRING);
110-
return doc_parser.current_string_buf_loc + sizeof(uint32_t);
106+
write_tape(doc_parser().current_string_buf_loc - doc().string_buf.get(), internal::tape_type::STRING);
107+
return doc_parser().current_string_buf_loc + sizeof(uint32_t);
111108
}
112109

113110
really_inline void on_end_string(uint8_t *dst) noexcept {
114-
uint32_t str_length = uint32_t(dst - (doc_parser.current_string_buf_loc + sizeof(uint32_t)));
111+
uint32_t str_length = uint32_t(dst - (doc_parser().current_string_buf_loc + sizeof(uint32_t)));
115112
// TODO check for overflow in case someone has a crazy string (>=4GB?)
116113
// But only add the overflow check when the document itself exceeds 4GB
117114
// Currently unneeded because we refuse to parse docs larger or equal to 4GB.
118-
memcpy(doc_parser.current_string_buf_loc, &str_length, sizeof(uint32_t));
115+
memcpy(doc_parser().current_string_buf_loc, &str_length, sizeof(uint32_t));
119116
// NULL termination is still handy if you expect all your strings to
120117
// be NULL terminated? It comes at a small cost
121118
*dst = 0;
122-
doc_parser.current_string_buf_loc = dst + 1;
119+
doc_parser().current_string_buf_loc = dst + 1;
123120
}
124121

125122
WARN_UNUSED really_inline bool parse_string(bool key = false) {
@@ -136,7 +133,7 @@ struct structural_parser {
136133

137134
WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) {
138135
log_value("number");
139-
number_writer writer{doc_parser};
136+
number_writer writer{doc_parser()};
140137
bool succeeded = numberparsing::parse_number(src, found_minus, writer);
141138
if (!succeeded) { log_error("Invalid number"); }
142139
return !succeeded;
@@ -244,22 +241,16 @@ struct structural_parser {
244241
}
245242

246243
WARN_UNUSED really_inline bool parse_object() {
247-
return parse_object(structurals.buf, doc_parser, structurals.next_structural, depth+1);
244+
return parse_object(doc_parser(), depth+1);
248245
}
249246

250-
WARN_UNUSED static bool parse_object(
251-
const uint8_t *buf,
252-
parser &doc_parser,
253-
size_t &next_structural,
254-
uint32_t depth) {
255-
structural_parser parser(buf, doc_parser, next_structural, depth);
256-
bool result = parser.parse_object_inline();
257-
next_structural = parser.structurals.next_structural;
258-
return result;
247+
WARN_UNUSED static bool parse_object(parser &doc_parser, uint32_t depth) {
248+
structural_parser parser(doc_parser, depth);
249+
return parser.parse_object_inline();
259250
}
260251

261252
WARN_UNUSED really_inline bool parse_object_inline() {
262-
uint32_t start_loc = doc_parser.current_loc;
253+
uint32_t start_loc = doc_parser().current_loc;
263254
if (start_object()) { return true; }
264255
switch (advance_char()) {
265256
case '"':
@@ -302,22 +293,16 @@ struct structural_parser {
302293
}
303294

304295
WARN_UNUSED really_inline bool parse_array() {
305-
return parse_array(structurals.buf, doc_parser, structurals.next_structural, depth+1);
296+
return parse_array(doc_parser(), depth+1);
306297
}
307298

308-
WARN_UNUSED static bool parse_array(
309-
const uint8_t *buf,
310-
parser &doc_parser,
311-
size_t &next_structural,
312-
uint32_t depth) {
313-
structural_parser parser(buf, doc_parser, next_structural, depth);
314-
bool result = parser.parse_array_inline();
315-
next_structural = parser.structurals.next_structural;
316-
return result;
299+
WARN_UNUSED static bool parse_array(parser &doc_parser, uint32_t depth) {
300+
structural_parser parser(doc_parser, depth);
301+
return parser.parse_array_inline();
317302
}
318303

319304
WARN_UNUSED really_inline bool parse_array_inline() {
320-
uint32_t start_loc = doc_parser.current_loc;
305+
uint32_t start_loc = doc_parser().current_loc;
321306
if (start_array()) { return true; }
322307

323308
if (advance_char() == ']') {
@@ -346,7 +331,7 @@ struct structural_parser {
346331

347332
WARN_UNUSED really_inline error_code finish() {
348333
// the string might not be NULL terminated.
349-
if ( !structurals.at_end(doc_parser.n_structural_indexes) ) {
334+
if ( !structurals.at_end(doc_parser().n_structural_indexes) ) {
350335
log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
351336
return on_error(TAPE_ERROR);
352337
}
@@ -356,27 +341,27 @@ struct structural_parser {
356341
}
357342

358343
really_inline error_code on_error(error_code new_error_code) noexcept {
359-
doc_parser.error = new_error_code;
344+
doc_parser().error = new_error_code;
360345
return new_error_code;
361346
}
362347
really_inline error_code on_success(error_code success_code) noexcept {
363-
doc_parser.error = success_code;
364-
doc_parser.valid = true;
348+
doc_parser().error = success_code;
349+
doc_parser().valid = true;
365350
return success_code;
366351
}
367352

368353
WARN_UNUSED really_inline error_code error() {
369-
/* We do not need the next line because this is done by doc_parser.init_stage2(),
354+
/* We do not need the next line because this is done by doc_parser().init_stage2(),
370355
* pessimistically.
371-
* doc_parser.is_valid = false;
356+
* doc_parser().is_valid = false;
372357
* At this point in the code, we have all the time in the world.
373358
* Note that we know exactly where we are in the document so we could,
374359
* without any overhead on the processing code, report a specific
375360
* location.
376361
* We could even trigger special code paths to assess what happened
377362
* carefully,
378363
* all without any added cost. */
379-
if (depth >= doc_parser.max_depth()) {
364+
if (depth >= doc_parser().max_depth()) {
380365
return on_error(DEPTH_ERROR);
381366
}
382367
switch (structurals.current_char()) {
@@ -406,16 +391,16 @@ struct structural_parser {
406391
}
407392

408393
really_inline void init() {
409-
doc_parser.current_string_buf_loc = doc_parser.doc.string_buf.get();
410-
doc_parser.current_loc = 0;
411-
doc_parser.valid = false;
412-
doc_parser.error = UNINITIALIZED;
394+
doc_parser().current_string_buf_loc = doc().string_buf.get();
395+
doc_parser().current_loc = 0;
396+
doc_parser().valid = false;
397+
doc_parser().error = UNINITIALIZED;
413398
}
414399

415400
WARN_UNUSED really_inline error_code start(size_t len) {
416401
log_start();
417402
init(); // sets is_valid to false
418-
if (len > doc_parser.capacity()) {
403+
if (len > doc_parser().capacity()) {
419404
return CAPACITY;
420405
}
421406
// Advance to the first character as soon as possible
@@ -461,8 +446,9 @@ struct structural_parser {
461446
* for documentation.
462447
***********/
463448
WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept {
464-
size_t next_structural = 0;
465-
stage2::structural_parser parser(buf, len, doc_parser, next_structural);
449+
doc_parser.parsing_buf = buf;
450+
doc_parser.next_structural = 0;
451+
stage2::structural_parser parser(len, doc_parser);
466452
error_code result = parser.start(len);
467453
if (result) { return result; }
468454

0 commit comments

Comments
 (0)