Skip to content

Commit 638f1de

Browse files
committed
Add DOM tweet reader for comparison
1 parent 7e74d30 commit 638f1de

File tree

9 files changed

+727
-226
lines changed

9 files changed

+727
-226
lines changed

benchmark/bench_dom_api.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ static void recover_one_string(State& state) {
2222
return;
2323
}
2424
dom::element doc;
25-
if (error = parser.parse(docdata).get(doc)) {
25+
if ((error = parser.parse(docdata).get(doc))) {
2626
cerr << "could not parse string" << error << endl;
2727
return;
2828
}
@@ -48,8 +48,7 @@ static void serialize_twitter(State& state) {
4848
return;
4949
}
5050
// we do not want mem. alloc. in the loop.
51-
error = parser.allocate(docdata.size());
52-
if(error) {
51+
if((error = parser.allocate(docdata.size()))) {
5352
cout << error << endl;
5453
return;
5554
}

benchmark/bench_sax.cpp

Lines changed: 89 additions & 217 deletions
Original file line numberDiff line numberDiff line change
@@ -3,261 +3,133 @@
33
#define SIMDJSON_IMPLEMENTATION_AMD64 0
44

55
#include "simdjson.h"
6-
#include "simdjson.cpp"
7-
using namespace simdjson;
8-
9-
using namespace haswell;
10-
using namespace haswell::stage2;
11-
12-
SIMDJSON_TARGET_HASWELL
13-
14-
namespace twitter {
15-
16-
#define KEY_IS(KEY, MATCH) (!strncmp((const char *)KEY, "\"" MATCH "\"", strlen("\"" MATCH "\"")))
176

18-
struct twitter_user {
19-
uint64_t id{};
20-
std::string_view screen_name{};
21-
};
22-
struct tweet {
23-
uint64_t id{};
24-
std::string_view text{};
25-
std::string_view created_at{};
26-
uint64_t in_reply_to_status_id{};
27-
uint64_t retweet_count{};
28-
uint64_t favorite_count{};
29-
twitter_user user{};
30-
};
31-
struct sax_tweet_reader {
32-
std::vector<tweet> tweets;
33-
std::unique_ptr<uint8_t[]> string_buf;
34-
size_t capacity;
35-
dom_parser_implementation dom_parser;
36-
37-
sax_tweet_reader();
38-
error_code set_capacity(size_t new_capacity);
39-
error_code read_tweets(padded_string &json);
40-
}; // struct tweet_reader
41-
42-
} // namespace twitter
7+
SIMDJSON_PUSH_DISABLE_ALL_WARNINGS
8+
#include <benchmark/benchmark.h>
9+
SIMDJSON_POP_DISABLE_WARNINGS
4310

44-
namespace twitter {
11+
#include "simdjson.cpp"
12+
#include "twitter/sax_tweet_reader.h"
4513

46-
struct sax_tweet_reader_visitor {
47-
bool in_statuses{false};
48-
bool in_user{false};
49-
std::vector<tweet> &tweets;
50-
uint8_t *current_string_buf_loc;
51-
uint64_t *expect_int{};
52-
std::string_view *expect_string{};
14+
using namespace benchmark;
15+
using namespace simdjson;
16+
using std::cerr;
17+
using std::endl;
5318

54-
sax_tweet_reader_visitor(std::vector<tweet> &_tweets, uint8_t *string_buf);
19+
const char *TWITTER_JSON = SIMDJSON_BENCHMARK_DATA_DIR "twitter.json";
20+
const int REPETITIONS = 10;
5521

56-
simdjson_really_inline error_code visit_document_start(json_iterator &iter);
57-
simdjson_really_inline error_code visit_object_start(json_iterator &iter);
58-
simdjson_really_inline error_code visit_key(json_iterator &iter, const uint8_t *key);
59-
simdjson_really_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value);
60-
simdjson_really_inline error_code visit_array_start(json_iterator &iter);
61-
simdjson_really_inline error_code visit_array_end(json_iterator &iter);
62-
simdjson_really_inline error_code visit_object_end(json_iterator &iter);
63-
simdjson_really_inline error_code visit_document_end(json_iterator &iter);
64-
simdjson_really_inline error_code visit_empty_array(json_iterator &iter);
65-
simdjson_really_inline error_code visit_empty_object(json_iterator &iter);
66-
simdjson_really_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value);
67-
simdjson_really_inline error_code increment_count(json_iterator &iter);
68-
}; // sax_tweet_reader_visitor
22+
static void sax_tweets(State &state) {
23+
// Load twitter.json to a buffer
24+
padded_string json;
25+
if (auto error = padded_string::load(TWITTER_JSON).get(json)) { cerr << error << endl; return; }
6926

70-
sax_tweet_reader::sax_tweet_reader() : tweets{}, string_buf{}, capacity{0}, dom_parser() {}
27+
// Allocate
28+
twitter::sax_tweet_reader reader;
29+
if (auto error = reader.set_capacity(json.size())) { cerr << error << endl; return; }
7130

72-
error_code sax_tweet_reader::set_capacity(size_t new_capacity) {
73-
// string_capacity copied from document::allocate
74-
size_t string_capacity = SIMDJSON_ROUNDUP_N(5 * new_capacity / 3 + 32, 64);
75-
string_buf.reset(new (std::nothrow) uint8_t[string_capacity]);
76-
if (auto error = dom_parser.set_capacity(new_capacity)) { return error; }
77-
if (capacity == 0) { // set max depth the first time only
78-
if (auto error = dom_parser.set_max_depth(DEFAULT_MAX_DEPTH)) { return error; }
79-
}
80-
capacity = new_capacity;
81-
return SUCCESS;
82-
}
31+
// Warm the vector
32+
if (auto error = reader.read_tweets(json)) { throw error; }
8333

84-
// NOTE: this assumes the dom_parser is already allocated
85-
error_code sax_tweet_reader::read_tweets(padded_string &json) {
86-
// Allocate capacity if needed
87-
tweets.clear();
88-
if (capacity < json.size()) {
89-
if (auto error = set_capacity(capacity)) { return error; }
34+
// Read tweets
35+
size_t bytes = 0;
36+
size_t tweets = 0;
37+
for (SIMDJSON_UNUSED auto _ : state) {
38+
if (auto error = reader.read_tweets(json)) { throw error; }
39+
bytes += json.size();
40+
tweets += reader.tweets.size();
9041
}
91-
92-
// Run stage 1 first.
93-
if (auto error = dom_parser.stage1((uint8_t *)json.data(), json.size(), false)) { return error; }
94-
95-
// Then walk the document, parsing the tweets as we go
96-
json_iterator iter(dom_parser, 0);
97-
sax_tweet_reader_visitor visitor(tweets, string_buf.get());
98-
if (auto error = iter.walk_document<false>(visitor)) { return error; }
99-
return SUCCESS;
42+
// Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
43+
state.counters["Gigabytes"] = benchmark::Counter(
44+
double(bytes), benchmark::Counter::kIsRate,
45+
benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024
46+
state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate);
47+
state.counters["tweets"] = Counter(double(tweets), benchmark::Counter::kIsRate);
10048
}
49+
BENCHMARK(sax_tweets)->Repetitions(REPETITIONS)->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
50+
return *(std::max_element(std::begin(v), std::end(v)));
51+
})->DisplayAggregatesOnly(true);
10152

102-
sax_tweet_reader_visitor::sax_tweet_reader_visitor(std::vector<tweet> &_tweets, uint8_t *string_buf)
103-
: tweets{_tweets},
104-
current_string_buf_loc{string_buf} {
105-
}
53+
#if SIMDJSON_EXCEPTIONS
10654

107-
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_document_start(json_iterator &iter) {
108-
iter.log_start_value("document");
109-
return SUCCESS;
55+
simdjson_really_inline uint64_t nullable_int(dom::element element) {
56+
if (element.is_null()) { return 0; }
57+
return element;
11058
}
111-
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_array_start(json_iterator &iter) {
112-
// iter.log_start_value("array");
113-
// if we expected an int or string and got an array or object, it's an error
114-
if (expect_int || expect_string) { iter.log_error("expected int/string"); return TAPE_ERROR; }
115-
return SUCCESS;
116-
}
117-
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_object_start(json_iterator &iter) {
118-
// iter.log_start_value("object");
119-
120-
// if we expected an int or string and got an array or object, it's an error
121-
if (expect_int || expect_string) { iter.log_error("expected int/string"); return TAPE_ERROR; }
122-
123-
// { "statuses": [ {
124-
if (in_statuses && iter.depth == 3) {
125-
iter.log_start_value("tweet");
126-
tweets.push_back({});
59+
simdjson_really_inline void read_dom_tweets(dom::parser &parser, padded_string &json, std::vector<twitter::tweet> &tweets) {
60+
for (dom::element tweet : parser.parse(json)["statuses"]) {
61+
auto user = tweet["user"];
62+
tweets.push_back(
63+
{
64+
tweet["id"],
65+
tweet["text"],
66+
tweet["created_at"],
67+
nullable_int(tweet["in_reply_to_status_id"]),
68+
tweet["retweet_count"],
69+
tweet["favorite_count"],
70+
{ user["id"], user["screen_name"] }
71+
}
72+
);
12773
}
128-
return SUCCESS;
12974
}
130-
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_key(json_iterator &iter, const uint8_t *key) {
131-
// iter.log_value("key");
132-
if (in_statuses) {
133-
switch (iter.depth) {
134-
case 3: // in tweet: { "statuses": [ { <key>
135-
// NOTE: the way we're comparing key (fairly naturally) means the caller doesn't have to check " for us at all
136-
if (KEY_IS(key, "user")) { iter.log_start_value("user"); in_user = true; }
13775

138-
else if (KEY_IS(key, "id")) { iter.log_value("id"); expect_int = &tweets.back().id; }
139-
else if (KEY_IS(key, "in_reply_to_status_id")) { iter.log_value("in_reply_to_status_id"); expect_int = &tweets.back().in_reply_to_status_id; }
140-
else if (KEY_IS(key, "retweet_count")) { iter.log_value("retweet_count"); expect_int = &tweets.back().retweet_count; }
141-
else if (KEY_IS(key, "favorite_count")) { iter.log_value("favorite_count"); expect_int = &tweets.back().favorite_count; }
76+
static void dom_tweets(State &state) {
77+
// Load twitter.json to a buffer
78+
padded_string json;
79+
if (auto error = padded_string::load(TWITTER_JSON).get(json)) { cerr << error << endl; return; }
14280

143-
else if (KEY_IS(key, "text")) { iter.log_value("text"); expect_string = &tweets.back().text; }
144-
else if (KEY_IS(key, "created_at")) { iter.log_value("created_at"); expect_string = &tweets.back().created_at; }
145-
break;
146-
case 4:
147-
if (in_user) { // in user: { "statuses": [ { "user": { <key>
148-
if (KEY_IS(key, "id")) { iter.log_value("id"); expect_int = &tweets.back().user.id; }
149-
else if (KEY_IS(key, "screen_name")) { iter.log_value("screen_name"); expect_string = &tweets.back().user.screen_name; }
150-
}
151-
break;
152-
default: break;
153-
}
154-
} else {
155-
if (iter.depth == 1 && KEY_IS(key, "statuses")) {
156-
iter.log_start_value("statuses");
157-
in_statuses = true;
158-
}
159-
}
160-
return SUCCESS;
161-
}
162-
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_primitive(json_iterator &iter, const uint8_t *value) {
163-
// iter.log_value("primitive");
164-
if (expect_int) {
165-
iter.log_value("int");
166-
if (auto error = numberparsing::parse_unsigned(value).get(*expect_int)) {
167-
// If number parsing failed, check if it's null before returning the error
168-
if (!atomparsing::is_valid_null_atom(value)) { iter.log_error("expected number or null"); return error; }
169-
}
170-
expect_int = nullptr;
171-
} else if (expect_string) {
172-
iter.log_value("string");
173-
// Must be a string!
174-
if (value[0] != '"') { iter.log_error("expected string"); return STRING_ERROR; }
175-
auto end = stringparsing::parse_string(value, current_string_buf_loc);
176-
if (!end) { iter.log_error("error parsing string"); return STRING_ERROR; }
177-
*expect_string = std::string_view((const char *)current_string_buf_loc, end-current_string_buf_loc);
178-
current_string_buf_loc = end;
179-
expect_string = nullptr;
180-
}
181-
return SUCCESS;
182-
}
183-
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_array_end(json_iterator &iter) {
184-
// iter.log_end_value("array");
185-
// When we hit the end of { "statuses": [ ... ], we're done with statuses.
186-
if (in_statuses && iter.depth == 2) { iter.log_end_value("statuses"); in_statuses = false; }
187-
return SUCCESS;
188-
}
189-
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_object_end(json_iterator &iter) {
190-
// iter.log_end_value("object");
191-
// When we hit the end of { "statuses": [ { "user": { ... }, we're done with the user
192-
if (in_user && iter.depth == 4) { iter.log_end_value("user"); in_user = false; }
193-
if (in_statuses && iter.depth == 3) { iter.log_end_value("tweet"); }
194-
return SUCCESS;
195-
}
81+
// Allocate
82+
dom::parser parser;
83+
if (auto error = parser.allocate(json.size())) { cerr << error << endl; return; };
19684

197-
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_document_end(json_iterator &iter) {
198-
iter.log_end_value("document");
199-
return SUCCESS;
200-
}
85+
// Warm the vector
86+
std::vector<twitter::tweet> tweets;
87+
read_dom_tweets(parser, json, tweets);
20188

202-
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_empty_array(json_iterator &iter) {
203-
// if we expected an int or string and got an array or object, it's an error
204-
// iter.log_value("empty array");
205-
if (expect_int || expect_string) { iter.log_error("expected int/string"); return TAPE_ERROR; }
206-
return SUCCESS;
207-
}
208-
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_empty_object(json_iterator &iter) {
209-
// if we expected an int or string and got an array or object, it's an error
210-
// iter.log_value("empty object");
211-
if (expect_int || expect_string) { iter.log_error("expected int/string"); return TAPE_ERROR; }
212-
return SUCCESS;
213-
}
214-
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_root_primitive(json_iterator &iter, const uint8_t *) {
215-
// iter.log_value("root primitive");
216-
iter.log_error("unexpected root primitive");
217-
return TAPE_ERROR;
89+
// Read tweets
90+
size_t bytes = 0;
91+
size_t num_tweets = 0;
92+
for (SIMDJSON_UNUSED auto _ : state) {
93+
tweets.clear();
94+
read_dom_tweets(parser, json, tweets);
95+
bytes += json.size();
96+
num_tweets += tweets.size();
97+
}
98+
// Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
99+
state.counters["Gigabytes"] = benchmark::Counter(
100+
double(bytes), benchmark::Counter::kIsRate,
101+
benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024
102+
state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate);
103+
state.counters["tweets"] = Counter(double(num_tweets), benchmark::Counter::kIsRate);
218104
}
105+
BENCHMARK(dom_tweets)->Repetitions(REPETITIONS)->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
106+
return *(std::max_element(std::begin(v), std::end(v)));
107+
})->DisplayAggregatesOnly(true);
219108

220-
simdjson_really_inline error_code sax_tweet_reader_visitor::increment_count(json_iterator &) { return SUCCESS; }
221-
222-
} // namespace twitter
223-
224-
SIMDJSON_UNTARGET_REGION
225-
226-
227-
SIMDJSON_PUSH_DISABLE_ALL_WARNINGS
228-
#include <benchmark/benchmark.h>
229-
SIMDJSON_POP_DISABLE_WARNINGS
230-
231-
using namespace benchmark;
232-
using namespace std;
233-
234-
const char *TWITTER_JSON = SIMDJSON_BENCHMARK_DATA_DIR "twitter.json";
109+
#endif // SIMDJSON_EXCEPTIONS
235110

236-
static void sax_tweets(State& state) {
111+
static void dom_parse(State &state) {
237112
// Load twitter.json to a buffer
238113
padded_string json;
239114
if (auto error = padded_string::load(TWITTER_JSON).get(json)) { cerr << error << endl; return; }
240115

241116
// Allocate
242-
twitter::sax_tweet_reader reader;
243-
if (auto error = reader.set_capacity(json.size())) { cerr << error << endl; return; }
117+
dom::parser parser;
118+
if (auto error = parser.allocate(json.size())) { cerr << error << endl; return; };
244119

245-
// Make the tweet_reader
120+
// Read tweets
246121
size_t bytes = 0;
247-
size_t tweets = 0;
248122
for (SIMDJSON_UNUSED auto _ : state) {
249-
if (auto error = reader.read_tweets(json)) { throw error; }
123+
if (parser.parse(json).error()) { throw "Parsing failed"; };
250124
bytes += json.size();
251-
tweets += reader.tweets.size();
252125
}
253126
// Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
254127
state.counters["Gigabytes"] = benchmark::Counter(
255128
double(bytes), benchmark::Counter::kIsRate,
256129
benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024
257130
state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate);
258-
state.counters["tweets"] = Counter(double(tweets), benchmark::Counter::kIsRate);
259131
}
260-
BENCHMARK(sax_tweets)->Repetitions(10)->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
132+
BENCHMARK(dom_parse)->Repetitions(REPETITIONS)->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
261133
return *(std::max_element(std::begin(v), std::end(v)));
262134
})->DisplayAggregatesOnly(true);
263135

0 commit comments

Comments
 (0)