Commit 90a6c1f

Use json_iterator in array/object

1 parent: be30b77

15 files changed: +719 -180 lines changed
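The gist of the change, as the diffs below show for array: instead of each array/object recording a depth and mutating doc->iter.depth itself, it now holds an opaque json_iterator::container token and delegates advancing, emptiness checks and completion checks to json_iterator (start_array(), is_empty_array(), next_element(), in_container()). A condensed sketch of the resulting iteration pattern follows. Every call in it appears in this commit's benchmark code, but the document shape ({"points": [{"x": ..., "y": ...}, ...]}), the helper name read_points and the std::pair output are made up for illustration, and the snippet assumes the same surrounding context as the new iter_bench namespace (SIMDJSON_TARGET_HASWELL region, using namespace simdjson / haswell).

simdjson_really_inline void read_points(ondemand::parser &parser, padded_string &json,
                                        std::vector<std::pair<double, double>> &out) {
  auto doc = parser.parse(json);
  ondemand::json_iterator &iter = doc.iterate();
  // { "points": [ ...
  iter.start_object().value();
  if (!iter.find_first_field_raw("points")) { throw "No points field"; }
  auto points_array = iter.start_array().value();          // container token for this array
  if (iter.is_empty_array()) { return; }
  do {
    auto point_object = iter.start_object().value();       // container token for each element
    if (!iter.find_first_field_raw("x")) { throw "Could not find x"; }
    double x = iter.get_double();
    if (!iter.find_next_field_raw("y", point_object)) { throw "Could not find y"; }
    double y = iter.get_double();
    out.emplace_back(x, y);
  } while (iter.next_element(points_array));               // false once the closing ] is consumed
}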

benchmark/bench_sax.cpp

Lines changed: 147 additions & 8 deletions
@@ -60,7 +60,7 @@ simdjson_really_inline void read_tweets(ondemand::parser &parser, padded_string
   }
 }
 
-static void bench_tweets(State &state) {
+static void ondemand_tweets(State &state) {
   // Load twitter.json to a buffer
   padded_string json;
   if (auto error = padded_string::load(TWITTER_JSON).get(json)) { cerr << error << endl; return; }
@@ -87,13 +87,98 @@ static void bench_tweets(State &state) {
   state.counters["tweets"] = Counter(double(tweet_count), benchmark::Counter::kIsRate);
 }
 
-BENCHMARK(bench_tweets)->Repetitions(REPETITIONS)->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
+BENCHMARK(ondemand_tweets)->Repetitions(REPETITIONS)->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
   return *(std::max_element(std::begin(v), std::end(v)));
 })->DisplayAggregatesOnly(true);
 
 } // namespace ondemand_bench
 
 
+SIMDJSON_UNTARGET_REGION
+
+SIMDJSON_TARGET_HASWELL
+
+namespace iter_bench {
+
+using namespace simdjson;
+using namespace haswell;
+
+simdjson_really_inline void read_tweets(ondemand::parser &parser, padded_string &json, std::vector<twitter::tweet> &tweets) {
+  // Walk the document, parsing the tweets as we go
+
+  // { "statuses":
+  auto doc = parser.parse(json);
+  ondemand::json_iterator &iter = doc.iterate();
+  iter.start_object().value();
+  if (!iter.find_first_field_raw("statuses")) { throw "No statuses field"; }
+  // { "statuses": [
+  auto tweets_array = iter.start_array().value();
+  if (iter.is_empty_array()) { return; }
+
+  do {
+    auto tweet_object = iter.start_object().value();
+    twitter::tweet tweet;
+    if (!iter.find_first_field_raw("created_at")) { throw "Could not find created_at"; }
+    tweet.created_at = iter.get_raw_json_string().value().unescape(parser);
+    if (!iter.find_next_field_raw("id", tweet_object)) { throw "Could not find id"; }
+    tweet.id = iter.get_uint64();
+    if (!iter.find_next_field_raw("text", tweet_object)) { throw "Could not find text"; }
+    tweet.text = iter.get_raw_json_string().value().unescape(parser);
+    if (!iter.find_next_field_raw("in_reply_to_status_id", tweet_object)) { throw "Could not find in_reply_to_status_id"; }
+    if (!iter.is_null()) {
+      tweet.in_reply_to_status_id = iter.get_uint64();
+    }
+    if (!iter.find_next_field_raw("user", tweet_object)) { throw "Could not find user"; }
+    {
+      auto user_object = iter.start_object().value();
+      if (!iter.find_first_field_raw("id")) { throw "Could not find user.id"; }
+      tweet.user.id = iter.get_uint64();
+      if (!iter.find_next_field_raw("screen_name", user_object)) { throw "Could not find user.screen_name"; }
+      tweet.user.screen_name = iter.get_raw_json_string().value().unescape(parser);
+    }
+    if (!iter.find_next_field_raw("retweet_count", tweet_object)) { throw "Could not find retweet_count"; }
+    tweet.retweet_count = iter.get_uint64();
+    if (!iter.find_next_field_raw("favorite_count", tweet_object)) { throw "Could not find favorite_count"; }
+    tweet.favorite_count = iter.get_uint64();
+
+    tweets.push_back(tweet);
+  } while (iter.next_element(tweets_array));
+}
+
+static void iter_tweets(State &state) {
+  // Load twitter.json to a buffer
+  padded_string json;
+  if (auto error = padded_string::load(TWITTER_JSON).get(json)) { cerr << error << endl; return; }
+
+  // Allocate and warm the vector
+  std::vector<twitter::tweet> tweets;
+  ondemand::parser parser;
+  read_tweets(parser, json, tweets);
+
+  // Read tweets
+  size_t byte_count = 0;
+  size_t tweet_count = 0;
+  for (SIMDJSON_UNUSED auto _ : state) {
+    tweets.clear();
+    read_tweets(parser, json, tweets);
+    byte_count += json.size();
+    tweet_count += tweets.size();
+  }
+  // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
+  state.counters["Gigabytes"] = benchmark::Counter(
+        double(byte_count), benchmark::Counter::kIsRate,
+        benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024
+  state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate);
+  state.counters["tweets"] = Counter(double(tweet_count), benchmark::Counter::kIsRate);
+}
+
+BENCHMARK(iter_tweets)->Repetitions(REPETITIONS)->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
+  return *(std::max_element(std::begin(v), std::end(v)));
+})->DisplayAggregatesOnly(true);
+
+} // namespace iter_bench
+
+
 SIMDJSON_UNTARGET_REGION
 
 #include "twitter/sax_tweet_reader.h"
@@ -251,7 +336,7 @@ struct my_point {
 /***
  * We start with the naive DOM-based approach.
 **/
-static void dom_parse_largerandom(State &state) {
+static void dom_largerandom(State &state) {
   // Load twitter.json to a buffer
   const padded_string& json = get_my_json_str();
 
@@ -283,7 +368,7 @@ static void dom_parse_largerandom(State &state) {
   state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate);
 }
 
-BENCHMARK(dom_parse_largerandom)->Repetitions(REPETITIONS)->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
+BENCHMARK(dom_largerandom)->Repetitions(REPETITIONS)->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
   return *(std::max_element(std::begin(v), std::end(v)));
 })->DisplayAggregatesOnly(true);
 
@@ -294,7 +379,7 @@ SIMDJSON_TARGET_HASWELL
 /***
  * On Demand approach.
 **/
-static void ondemand_parse_largerandom(State &state) {
+static void ondemand_largerandom(State &state) {
   using namespace haswell;
   // Load twitter.json to a buffer
   const padded_string& json = get_my_json_str();
@@ -324,7 +409,61 @@ static void ondemand_parse_largerandom(State &state) {
 
 SIMDJSON_UNTARGET_REGION
 
-BENCHMARK(ondemand_parse_largerandom)->Repetitions(REPETITIONS)->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
+BENCHMARK(ondemand_largerandom)->Repetitions(REPETITIONS)->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
+  return *(std::max_element(std::begin(v), std::end(v)));
+})->DisplayAggregatesOnly(true);
+
+SIMDJSON_TARGET_HASWELL
+
+static simdjson_really_inline double first_double(haswell::ondemand::json_iterator &iter) {
+  if (iter.start_object().error() || iter.field_key().error() || iter.field_value()) { throw "Invalid field"; }
+  return iter.get_double();
+}
+
+static simdjson_really_inline double next_double(haswell::ondemand::json_iterator &iter) {
+  if (!iter.has_next_field() || iter.field_key().error() || iter.field_value()) { throw "Invalid field"; }
+  return iter.get_double();
+}
+
+/***
+ * On Demand Iterator approach.
+**/
+static void iter_largerandom(State &state) {
+  using namespace haswell;
+  // Load twitter.json to a buffer
+  const padded_string& json = get_my_json_str();
+
+  // Allocate
+  ondemand::parser parser;
+  error_code error;
+  if ((error = parser.allocate(json.size()))) { throw error; };
+
+  // Read
+  size_t bytes = 0;
+  for (SIMDJSON_UNUSED auto _ : state) {
+    std::vector<my_point> container;
+    auto doc = parser.parse(json);
+    ondemand::json_iterator &iter = doc.iterate();
+    iter.start_array().value();
+    if (!iter.is_empty_array()) {
+      do {
+        container.emplace_back(my_point{first_double(iter), next_double(iter), next_double(iter)});
+        if (iter.has_next_field()) { throw "Too many fields"; }
+      } while (iter.has_next_element());
+    }
+    bytes += json.size();
+    benchmark::DoNotOptimize(container.data());
+  }
+  // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
+  state.counters["Gigabytes"] = benchmark::Counter(
+        double(bytes), benchmark::Counter::kIsRate,
+        benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024
+  state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate);
+}
+
+SIMDJSON_UNTARGET_REGION
+
+BENCHMARK(iter_largerandom)->Repetitions(REPETITIONS)->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
   return *(std::max_element(std::begin(v), std::end(v)));
 })->DisplayAggregatesOnly(true);
 
@@ -430,7 +569,7 @@ SIMDJSON_UNTARGET_REGION
 
 
 // ./benchmark/bench_sax --benchmark_filter=largerandom
-static void sax_parse_largerandom(State &state) {
+static void sax_largerandom(State &state) {
   // Load twitter.json to a buffer
   const padded_string& json = get_my_json_str();
 
@@ -455,7 +594,7 @@ static void sax_parse_largerandom(State &state) {
         benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024
   state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate);
 }
-BENCHMARK(sax_parse_largerandom)->Repetitions(REPETITIONS)->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
+BENCHMARK(sax_largerandom)->Repetitions(REPETITIONS)->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
   return *(std::max_element(std::begin(v), std::end(v)));
 })->DisplayAggregatesOnly(true);

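A couple of notes on the benchmark changes above. iter_largerandom reads each point positionally: first_double/next_double consume the first three field values, and the following has_next_field() check throws "Too many fields" if a point carries anything extra. Also, with the renames, the filter shown in the comment (./benchmark/bench_sax --benchmark_filter=largerandom) still selects all four variants (dom_largerandom, ondemand_largerandom, iter_largerandom, sax_largerandom); presumably a filter such as --benchmark_filter=iter_ would isolate the new json_iterator benchmarks, since Google Benchmark filters match against benchmark names.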
src/generic/ondemand/array-inl.h

Lines changed: 28 additions & 32 deletions
@@ -41,21 +41,27 @@ namespace ondemand {
 //
 
 simdjson_really_inline array::array() noexcept = default;
+simdjson_really_inline array::array(document *_doc, json_iterator::container _container) noexcept
+  : doc{_doc}, container{_container}, at_start{true}, error{SUCCESS}
+{
+}
 simdjson_really_inline array::array(document *_doc, error_code _error) noexcept
-  : doc{_doc}, depth{_doc->iter.depth}, at_start{!_error}, error{_error}
+  : doc{_doc}, container{_doc->iter.current_container()}, at_start{false}, error{_error}
 {
+  SIMDJSON_ASSUME(_error);
 }
 
 simdjson_really_inline bool array::finished() const noexcept {
-  return doc->iter.depth < depth;
-}
-simdjson_really_inline void array::finish(bool log_end) noexcept {
-  doc->iter.depth = depth - 1;
-  if (log_end) { logger::log_end_value(doc->iter, "array"); }
+  return !doc->iter.in_container(container);
 }
 
-simdjson_really_inline array array::begin(document *doc, error_code error) noexcept {
-  doc->iter.depth++;
+simdjson_really_inline array array::start(document *doc) noexcept {
+  return array(doc, doc->iter.start_array());
+}
+simdjson_really_inline array array::started(document *doc) noexcept {
+  return array(doc, doc->iter.started_array());
+}
+simdjson_really_inline array array::error_chain(document *doc, error_code error) noexcept {
   return array(doc, error);
 }
 simdjson_really_inline array array::begin() noexcept {
@@ -65,8 +71,15 @@ simdjson_really_inline array array::end() noexcept {
   return {};
 }
 
+simdjson_really_inline error_code array::report_error() noexcept {
+  container = doc->iter.current_container().child(); // Make it so we'll stop
+  auto result = error;
+  error = SUCCESS;
+  return result;
+}
+
 simdjson_really_inline simdjson_result<value> array::operator*() noexcept {
-  if (error) { finish(); return { doc, error }; }
+  if (error) { return { doc, report_error() }; }
   return value::start(doc);
 }
 simdjson_really_inline bool array::operator==(const array &other) noexcept {
@@ -75,33 +88,16 @@ simdjson_really_inline bool array::operator==(const array &other) noexcept {
 simdjson_really_inline bool array::operator!=(const array &) noexcept {
   // If we're at the start, check for empty array.
   if (at_start) {
-    if (*doc->iter.peek() == ']') {
-      doc->iter.advance();
-      logger::log_value(doc->iter, "empty array");
-      finish();
-    } else {
-      logger::log_start_value(doc->iter, "array");
-    }
+    at_start = false;
+    return !doc->iter.is_empty_array();
   }
   return !finished();
 }
 simdjson_really_inline array &array::operator++() noexcept {
-  if (!finished()) {
-    SIMDJSON_ASSUME(!error);
-    SIMDJSON_ASSUME(!at_start);
-    doc->iter.skip_unfinished_children(depth);
-    switch (*doc->iter.advance()) {
-      case ',':
-        break;
-      case ']':
-        finish(true);
-        break;
-      default:
-        logger::log_error(doc->iter, "Missing comma between array elements");
-        finish();
-        error = TAPE_ERROR;
-    }
-  }
+  SIMDJSON_ASSUME(!finished());
+  SIMDJSON_ASSUME(!at_start);
+
+  error = doc->iter.next_element(container).error();
   return *this;
 }
 
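For orientation, the operators rewritten above are exactly what a range-for over an array drives. A minimal sketch of the desugared loop follows (hypothetical caller code; how arr is obtained from a document is outside this diff):

// for (auto element : arr) { ... } expands to roughly:
for (auto it = arr.begin(); it != arr.end(); ++it) {   // operator!= asks iter.is_empty_array() on the first pass, !finished() after
  simdjson_result<value> element = *it;                // operator* forwards a pending error via report_error(), else starts the value
  // ... use element, or inspect element.error() ...
}                                                      // operator++ advances with doc->iter.next_element(container)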
src/generic/ondemand/array.h

Lines changed: 30 additions & 14 deletions
@@ -27,24 +27,43 @@ class array {
   simdjson_really_inline array &operator++() noexcept;
 
 protected:
+  /**
+   * Begin array iteration.
+   *
+   * @param doc The document containing the array.
+   * @error INCORRECT_TYPE if the iterator is not at [.
+   */
+  static simdjson_really_inline array start(document *doc) noexcept;
   /**
    * Begin array iteration.
    *
    * @param doc The document containing the array. The iterator must be just after the opening `[`.
-   *            doc->iter.depth will be incremented automatically to reflect the nesting level.
-   * @param error If this is not SUCCESS, creates an error chained array.
    */
-  static simdjson_really_inline array begin(document *doc, error_code error=SUCCESS) noexcept;
+  static simdjson_really_inline array started(document *doc) noexcept;
+  /**
+   * Created an error chained array iterator.
+   *
+   * @param doc The document containing the array.
+   */
+  static simdjson_really_inline array error_chain(document *doc, error_code error) noexcept;
+
+  simdjson_really_inline error_code report_error() noexcept;
 
   /**
-   * Internal array creation. Call array::begin(doc[, error]) instead of this.
+   * Internal array creation. Call array::start() or array::started() instead of this.
    *
    * @param doc The document containing the array. doc->iter.depth must already be incremented to
-   *            reflect the array's depth. If there is no error, the iterator must be just after
-   *            the opening `[`.
-   * @param error The error to report. If the error is not SUCCESS, this is an error chained object.
+   *            reflect the array's depth. The iterator must be just after the opening `[`.
+   * @param container The container returned from iter.start_array() / iter.started_array().
+   */
+  simdjson_really_inline array(document *_doc, json_iterator::container _container) noexcept;
+  /**
+   * Internal array creation. Call array::error_chain() instead of this.
+   *
+   * @param doc The document containing the array.
+   * @param error The error to report. If it is not SUCCESS, this is an error chained object.
    */
-  simdjson_really_inline array(document *doc, error_code error) noexcept;
+  simdjson_really_inline array(document *_doc, error_code error) noexcept;
 
   /** Check whether iteration is complete. */
   bool finished() const noexcept;
@@ -59,14 +78,11 @@ class array {
    */
  document *doc{};
   /**
-   * Depth of the array.
-   *
-   * If doc->iter.depth < json.depth, we have finished.
+   * Container value for this array, obtained from json_iterator::started_array().
    *
-   * PERF NOTE: expected to be elided entirely, as any individual array's depth is a constant
-   *            knowable at compile time, incremented each time we nest an object or array.
+   * PERF NOTE: expected to be elided entirely, as this is a constant knowable at compile time.
    */
-  uint32_t depth{};
+  json_iterator::container container{};
   /**
   * Whether we're at the beginning of the array, or after.
   *
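Summarizing the three factories documented above: array::start(doc) is for when the iterator is still positioned on the opening [ (start_array() checks the type, hence the INCORRECT_TYPE note); array::started(doc) is for when the caller has already consumed the [ (via started_array()); and array::error_chain(doc, error) wraps an earlier error so that iteration yields it once through report_error() and then stops.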
