Skip to content

Commit 7e74d30

Browse files
committed
[WIP] tweet reader SAX benchmark
1 parent ce8d0f8 commit 7e74d30

File tree

3 files changed

+500
-7
lines changed

3 files changed

+500
-7
lines changed

benchmark/CMakeLists.txt

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
11
include_directories( . linux )
2-
link_libraries(simdjson simdjson-flags simdjson-windows-headers test-data)
2+
link_libraries(simdjson-windows-headers test-data)
3+
4+
5+
if (TARGET benchmark::benchmark)
6+
add_executable(bench_sax bench_sax.cpp)
7+
target_link_libraries(bench_sax simdjson-internal-flags simdjson-include-source benchmark::benchmark)
8+
endif (TARGET benchmark::benchmark)
9+
10+
link_libraries(simdjson simdjson-flags)
311
add_executable(benchfeatures benchfeatures.cpp)
412
add_executable(get_corpus_benchmark get_corpus_benchmark.cpp)
513
add_executable(perfdiff perfdiff.cpp)
@@ -14,12 +22,6 @@ target_compile_definitions(parse_nonumberparsing PRIVATE SIMDJSON_SKIPNUMBERPARS
1422
add_executable(parse_nostringparsing parse.cpp)
1523
target_compile_definitions(parse_nostringparsing PRIVATE SIMDJSON_SKIPSTRINGPARSING)
1624

17-
if (TARGET benchmark::benchmark)
18-
link_libraries(benchmark::benchmark)
19-
add_executable(bench_parse_call bench_parse_call.cpp)
20-
add_executable(bench_dom_api bench_dom_api.cpp)
21-
endif()
22-
2325
if (TARGET competition-all)
2426
add_executable(distinctuseridcompetition distinctuseridcompetition.cpp)
2527
target_link_libraries(distinctuseridcompetition competition-core)
@@ -34,4 +36,10 @@ if (TARGET competition-all)
3436
target_compile_definitions(allparsingcompetition PRIVATE ALLPARSER)
3537
endif()
3638

39+
if (TARGET benchmark::benchmark)
40+
link_libraries(benchmark::benchmark)
41+
add_executable(bench_parse_call bench_parse_call.cpp)
42+
add_executable(bench_dom_api bench_dom_api.cpp)
43+
endif()
44+
3745
include(checkperf.cmake)

benchmark/bench_sax.cpp

Lines changed: 264 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,264 @@
1+
#define SIMDJSON_IMPLEMENTATION_FALLBACK 0
2+
#define SIMDJSON_IMPLEMENTATION_WESTMERE 0
3+
#define SIMDJSON_IMPLEMENTATION_AMD64 0
4+
5+
#include "simdjson.h"
6+
#include "simdjson.cpp"
7+
using namespace simdjson;
8+
9+
using namespace haswell;
10+
using namespace haswell::stage2;
11+
12+
SIMDJSON_TARGET_HASWELL
13+
14+
namespace twitter {
15+
16+
#define KEY_IS(KEY, MATCH) (!strncmp((const char *)KEY, "\"" MATCH "\"", strlen("\"" MATCH "\"")))
17+
18+
/** The fields of a tweet's "user" object that the reader captures. */
struct twitter_user {
  /** Value of the user's "id" key; zero until one is read. */
  uint64_t id = 0;
  /** Value of the user's "screen_name" key; empty until one is read. */
  std::string_view screen_name;
};
22+
struct tweet {
23+
uint64_t id{};
24+
std::string_view text{};
25+
std::string_view created_at{};
26+
uint64_t in_reply_to_status_id{};
27+
uint64_t retweet_count{};
28+
uint64_t favorite_count{};
29+
twitter_user user{};
30+
};
31+
struct sax_tweet_reader {
32+
std::vector<tweet> tweets;
33+
std::unique_ptr<uint8_t[]> string_buf;
34+
size_t capacity;
35+
dom_parser_implementation dom_parser;
36+
37+
sax_tweet_reader();
38+
error_code set_capacity(size_t new_capacity);
39+
error_code read_tweets(padded_string &json);
40+
}; // struct tweet_reader
41+
42+
} // namespace twitter
43+
44+
namespace twitter {
45+
46+
struct sax_tweet_reader_visitor {
47+
bool in_statuses{false};
48+
bool in_user{false};
49+
std::vector<tweet> &tweets;
50+
uint8_t *current_string_buf_loc;
51+
uint64_t *expect_int{};
52+
std::string_view *expect_string{};
53+
54+
sax_tweet_reader_visitor(std::vector<tweet> &_tweets, uint8_t *string_buf);
55+
56+
simdjson_really_inline error_code visit_document_start(json_iterator &iter);
57+
simdjson_really_inline error_code visit_object_start(json_iterator &iter);
58+
simdjson_really_inline error_code visit_key(json_iterator &iter, const uint8_t *key);
59+
simdjson_really_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value);
60+
simdjson_really_inline error_code visit_array_start(json_iterator &iter);
61+
simdjson_really_inline error_code visit_array_end(json_iterator &iter);
62+
simdjson_really_inline error_code visit_object_end(json_iterator &iter);
63+
simdjson_really_inline error_code visit_document_end(json_iterator &iter);
64+
simdjson_really_inline error_code visit_empty_array(json_iterator &iter);
65+
simdjson_really_inline error_code visit_empty_object(json_iterator &iter);
66+
simdjson_really_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value);
67+
simdjson_really_inline error_code increment_count(json_iterator &iter);
68+
}; // sax_tweet_reader_visitor
69+
70+
/** Create a reader with no tweets and no buffers; capacity starts at zero. */
sax_tweet_reader::sax_tweet_reader()
  : tweets{},
    string_buf{},
    capacity{0},
    dom_parser() {
}
71+
72+
/**
 * Size the string buffer and the parser for documents of up to new_capacity bytes.
 *
 * The new string buffer is installed only after every step has succeeded, so a failed call
 * never leaves the reader with a null or mismatched string buffer.
 *
 * @param new_capacity Largest document size, in bytes, the reader must handle.
 * @return SUCCESS; MEMALLOC if the string buffer cannot be allocated; otherwise the error
 *         reported by dom_parser.
 */
error_code sax_tweet_reader::set_capacity(size_t new_capacity) {
  // string_capacity copied from document::allocate
  size_t string_capacity = SIMDJSON_ROUNDUP_N(5 * new_capacity / 3 + 32, 64);

  // new (std::nothrow) yields nullptr instead of throwing, so the result must be checked.
  std::unique_ptr<uint8_t[]> new_string_buf(new (std::nothrow) uint8_t[string_capacity]);
  if (!new_string_buf) { return MEMALLOC; }

  if (auto error = dom_parser.set_capacity(new_capacity)) { return error; }
  if (capacity == 0) { // set max depth the first time only
    if (auto error = dom_parser.set_max_depth(DEFAULT_MAX_DEPTH)) { return error; }
  }

  string_buf.swap(new_string_buf);
  capacity = new_capacity;
  return SUCCESS;
}
83+
84+
// NOTE: this assumes the dom_parser is already allocated
85+
error_code sax_tweet_reader::read_tweets(padded_string &json) {
86+
// Allocate capacity if needed
87+
tweets.clear();
88+
if (capacity < json.size()) {
89+
if (auto error = set_capacity(capacity)) { return error; }
90+
}
91+
92+
// Run stage 1 first.
93+
if (auto error = dom_parser.stage1((uint8_t *)json.data(), json.size(), false)) { return error; }
94+
95+
// Then walk the document, parsing the tweets as we go
96+
json_iterator iter(dom_parser, 0);
97+
sax_tweet_reader_visitor visitor(tweets, string_buf.get());
98+
if (auto error = iter.walk_document<false>(visitor)) { return error; }
99+
return SUCCESS;
100+
}
101+
102+
sax_tweet_reader_visitor::sax_tweet_reader_visitor(std::vector<tweet> &_tweets, uint8_t *string_buf)
103+
: tweets{_tweets},
104+
current_string_buf_loc{string_buf} {
105+
}
106+
107+
/** Start of the document: nothing to record, only logged. */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_document_start(json_iterator &iter) {
  iter.log_start_value("document");
  return SUCCESS;
}
111+
/** Start of a non-empty array: an error if the preceding key called for an int or string. */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_array_start(json_iterator &iter) {
  // iter.log_start_value("array");
  const bool scalar_pending = expect_int || expect_string;
  if (scalar_pending) {
    iter.log_error("expected int/string");
    return TAPE_ERROR;
  }
  return SUCCESS;
}
117+
/**
 * Start of a non-empty object. An object where an int or string was expected is an error;
 * an object at depth 3 inside "statuses" begins a new tweet.
 */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_object_start(json_iterator &iter) {
  // iter.log_start_value("object");

  const bool scalar_pending = expect_int || expect_string;
  if (scalar_pending) {
    iter.log_error("expected int/string");
    return TAPE_ERROR;
  }

  // { "statuses": [ {
  const bool starts_tweet = in_statuses && iter.depth == 3;
  if (starts_tweet) {
    iter.log_start_value("tweet");
    tweets.push_back({});
  }
  return SUCCESS;
}
130+
/**
 * Handle an object key.
 *
 * Outside "statuses" only the depth-1 "statuses" key matters. Inside it, tweet keys
 * (depth 3) and user keys (depth 4, while in_user) record where the upcoming value
 * should be stored. Every other key is ignored.
 */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_key(json_iterator &iter, const uint8_t *key) {
  // iter.log_value("key");
  if (!in_statuses) {
    if (iter.depth == 1 && KEY_IS(key, "statuses")) {
      iter.log_start_value("statuses");
      in_statuses = true;
    }
    return SUCCESS;
  }

  if (iter.depth == 3) {
    // in tweet: { "statuses": [ { <key>
    // NOTE: KEY_IS compares against the quoted key, closing quote included, so the caller
    // does not have to check " for us at all
    if (KEY_IS(key, "user")) {
      iter.log_start_value("user");
      in_user = true;
    } else if (KEY_IS(key, "id")) {
      iter.log_value("id");
      expect_int = &tweets.back().id;
    } else if (KEY_IS(key, "in_reply_to_status_id")) {
      iter.log_value("in_reply_to_status_id");
      expect_int = &tweets.back().in_reply_to_status_id;
    } else if (KEY_IS(key, "retweet_count")) {
      iter.log_value("retweet_count");
      expect_int = &tweets.back().retweet_count;
    } else if (KEY_IS(key, "favorite_count")) {
      iter.log_value("favorite_count");
      expect_int = &tweets.back().favorite_count;
    } else if (KEY_IS(key, "text")) {
      iter.log_value("text");
      expect_string = &tweets.back().text;
    } else if (KEY_IS(key, "created_at")) {
      iter.log_value("created_at");
      expect_string = &tweets.back().created_at;
    }
  } else if (iter.depth == 4 && in_user) {
    // in user: { "statuses": [ { "user": { <key>
    if (KEY_IS(key, "id")) {
      iter.log_value("id");
      expect_int = &tweets.back().user.id;
    } else if (KEY_IS(key, "screen_name")) {
      iter.log_value("screen_name");
      expect_string = &tweets.back().user.screen_name;
    }
  }
  return SUCCESS;
}
162+
/**
 * Handle a primitive value.
 *
 * If the preceding key asked for an integer, parse one (a null is accepted and leaves the
 * field untouched). If it asked for a string, parse it into the string buffer and point the
 * field at the parsed bytes. Primitives nobody asked for are ignored.
 */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_primitive(json_iterator &iter, const uint8_t *value) {
  // iter.log_value("primitive");
  if (expect_int) {
    iter.log_value("int");
    auto error = numberparsing::parse_unsigned(value).get(*expect_int);
    // If number parsing failed, check if it's null before returning the error
    if (error && !atomparsing::is_valid_null_atom(value)) {
      iter.log_error("expected number or null");
      return error;
    }
    expect_int = nullptr;
    return SUCCESS;
  }

  if (expect_string) {
    iter.log_value("string");
    // Must be a string!
    if (value[0] != '"') {
      iter.log_error("expected string");
      return STRING_ERROR;
    }
    auto end = stringparsing::parse_string(value, current_string_buf_loc);
    if (!end) {
      iter.log_error("error parsing string");
      return STRING_ERROR;
    }
    // The parsed bytes occupy [current_string_buf_loc, end); the next string starts at end.
    *expect_string = std::string_view((const char *)current_string_buf_loc, end-current_string_buf_loc);
    current_string_buf_loc = end;
    expect_string = nullptr;
  }
  return SUCCESS;
}
183+
/** End of an array: closing the array at depth 2 ends the "statuses" section. */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_array_end(json_iterator &iter) {
  // iter.log_end_value("array");
  // When we hit the end of { "statuses": [ ... ], we're done with statuses.
  const bool closes_statuses = in_statuses && iter.depth == 2;
  if (closes_statuses) {
    iter.log_end_value("statuses");
    in_statuses = false;
  }
  return SUCCESS;
}
189+
/** End of an object: leaves the user at depth 4, and logs the end of a tweet at depth 3. */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_object_end(json_iterator &iter) {
  // iter.log_end_value("object");
  // When we hit the end of { "statuses": [ { "user": { ... }, we're done with the user
  if (in_user && iter.depth == 4) {
    iter.log_end_value("user");
    in_user = false;
  }
  if (in_statuses && iter.depth == 3) {
    iter.log_end_value("tweet");
  }
  return SUCCESS;
}
196+
197+
/** End of the document: nothing to record, only logged. */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_document_end(json_iterator &iter) {
  iter.log_end_value("document");
  return SUCCESS;
}
201+
202+
/** An empty array: an error if the preceding key called for an int or string. */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_empty_array(json_iterator &iter) {
  // iter.log_value("empty array");
  const bool scalar_pending = expect_int || expect_string;
  if (scalar_pending) {
    iter.log_error("expected int/string");
    return TAPE_ERROR;
  }
  return SUCCESS;
}
208+
/** An empty object: an error if the preceding key called for an int or string. */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_empty_object(json_iterator &iter) {
  // iter.log_value("empty object");
  const bool scalar_pending = expect_int || expect_string;
  if (scalar_pending) {
    iter.log_error("expected int/string");
    return TAPE_ERROR;
  }
  return SUCCESS;
}
214+
/** A document whose root is a primitive is always rejected; the value is not inspected. */
simdjson_really_inline error_code sax_tweet_reader_visitor::visit_root_primitive(json_iterator &iter, const uint8_t *) {
  // iter.log_value("root primitive");
  iter.log_error("unexpected root primitive");
  return TAPE_ERROR;
}
219+
220+
/** Does nothing and always succeeds; this visitor keeps no counts. */
simdjson_really_inline error_code sax_tweet_reader_visitor::increment_count(json_iterator &) {
  return SUCCESS;
}
221+
222+
} // namespace twitter
223+
224+
SIMDJSON_UNTARGET_REGION
225+
226+
227+
SIMDJSON_PUSH_DISABLE_ALL_WARNINGS
228+
#include <benchmark/benchmark.h>
229+
SIMDJSON_POP_DISABLE_WARNINGS
230+
231+
using namespace benchmark;
232+
using namespace std;
233+
234+
const char *TWITTER_JSON = SIMDJSON_BENCHMARK_DATA_DIR "twitter.json";
235+
236+
/**
 * Benchmark: read every tweet out of twitter.json with the SAX tweet reader, once per
 * iteration, and report bytes, documents and tweets as rate counters.
 *
 * Failures are reported through State::SkipWithError so the framework records the run as
 * failed rather than the process terminating on an uncaught throw.
 */
static void sax_tweets(State& state) {
  // Load twitter.json to a buffer
  padded_string json;
  if (auto error = padded_string::load(TWITTER_JSON).get(json)) {
    state.SkipWithError(error_message(error));
    return;
  }

  // Allocate
  twitter::sax_tweet_reader reader;
  if (auto error = reader.set_capacity(json.size())) {
    state.SkipWithError(error_message(error));
    return;
  }

  // Read the tweets repeatedly, tallying the work done
  size_t bytes = 0;
  size_t tweets = 0;
  for (SIMDJSON_UNUSED auto _ : state) {
    if (auto error = reader.read_tweets(json)) {
      state.SkipWithError(error_message(error));
      return;
    }
    bytes += json.size();
    tweets += reader.tweets.size();
  }
  // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
  state.counters["Gigabytes"] = benchmark::Counter(
    double(bytes), benchmark::Counter::kIsRate,
    benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024
  state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate);
  state.counters["tweets"] = Counter(double(tweets), benchmark::Counter::kIsRate);
}
260+
// Register sax_tweets: 10 repetitions, aggregates only, plus a custom "max" statistic that
// reports the largest sample across the repetitions.
BENCHMARK(sax_tweets)
  ->Repetitions(10)
  ->ComputeStatistics("max", [](const std::vector<double>& samples) -> double {
    return *std::max_element(samples.begin(), samples.end());
  })
  ->DisplayAggregatesOnly(true);
263+
264+
BENCHMARK_MAIN();

0 commit comments

Comments
 (0)