|
3 | 3 | #define SIMDJSON_IMPLEMENTATION_AMD64 0
|
4 | 4 |
|
5 | 5 | #include "simdjson.h"
|
6 |
| -#include "simdjson.cpp" |
7 |
| -using namespace simdjson; |
8 |
| - |
9 |
| -using namespace haswell; |
10 |
| -using namespace haswell::stage2; |
11 |
| - |
12 |
| -SIMDJSON_TARGET_HASWELL |
13 |
| - |
14 |
| -namespace twitter { |
15 |
| - |
16 |
| -#define KEY_IS(KEY, MATCH) (!strncmp((const char *)KEY, "\"" MATCH "\"", strlen("\"" MATCH "\""))) |
17 | 6 |
|
18 |
| -struct twitter_user { |
19 |
| - uint64_t id{}; |
20 |
| - std::string_view screen_name{}; |
21 |
| -}; |
22 |
| -struct tweet { |
23 |
| - uint64_t id{}; |
24 |
| - std::string_view text{}; |
25 |
| - std::string_view created_at{}; |
26 |
| - uint64_t in_reply_to_status_id{}; |
27 |
| - uint64_t retweet_count{}; |
28 |
| - uint64_t favorite_count{}; |
29 |
| - twitter_user user{}; |
30 |
| -}; |
31 |
| -struct sax_tweet_reader { |
32 |
| - std::vector<tweet> tweets; |
33 |
| - std::unique_ptr<uint8_t[]> string_buf; |
34 |
| - size_t capacity; |
35 |
| - dom_parser_implementation dom_parser; |
36 |
| - |
37 |
| - sax_tweet_reader(); |
38 |
| - error_code set_capacity(size_t new_capacity); |
39 |
| - error_code read_tweets(padded_string &json); |
40 |
| -}; // struct tweet_reader |
41 |
| - |
42 |
| -} // namespace twitter |
| 7 | +SIMDJSON_PUSH_DISABLE_ALL_WARNINGS |
| 8 | +#include <benchmark/benchmark.h> |
| 9 | +SIMDJSON_POP_DISABLE_WARNINGS |
43 | 10 |
|
44 |
| -namespace twitter { |
| 11 | +#include "simdjson.cpp" |
| 12 | +#include "twitter/sax_tweet_reader.h" |
45 | 13 |
|
46 |
| -struct sax_tweet_reader_visitor { |
47 |
| - bool in_statuses{false}; |
48 |
| - bool in_user{false}; |
49 |
| - std::vector<tweet> &tweets; |
50 |
| - uint8_t *current_string_buf_loc; |
51 |
| - uint64_t *expect_int{}; |
52 |
| - std::string_view *expect_string{}; |
| 14 | +using namespace benchmark; |
| 15 | +using namespace simdjson; |
| 16 | +using std::cerr; |
| 17 | +using std::endl; |
53 | 18 |
|
54 |
| - sax_tweet_reader_visitor(std::vector<tweet> &_tweets, uint8_t *string_buf); |
| 19 | +const char *TWITTER_JSON = SIMDJSON_BENCHMARK_DATA_DIR "twitter.json"; |
| 20 | +const int REPETITIONS = 10; |
55 | 21 |
|
56 |
| - simdjson_really_inline error_code visit_document_start(json_iterator &iter); |
57 |
| - simdjson_really_inline error_code visit_object_start(json_iterator &iter); |
58 |
| - simdjson_really_inline error_code visit_key(json_iterator &iter, const uint8_t *key); |
59 |
| - simdjson_really_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value); |
60 |
| - simdjson_really_inline error_code visit_array_start(json_iterator &iter); |
61 |
| - simdjson_really_inline error_code visit_array_end(json_iterator &iter); |
62 |
| - simdjson_really_inline error_code visit_object_end(json_iterator &iter); |
63 |
| - simdjson_really_inline error_code visit_document_end(json_iterator &iter); |
64 |
| - simdjson_really_inline error_code visit_empty_array(json_iterator &iter); |
65 |
| - simdjson_really_inline error_code visit_empty_object(json_iterator &iter); |
66 |
| - simdjson_really_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value); |
67 |
| - simdjson_really_inline error_code increment_count(json_iterator &iter); |
68 |
| -}; // sax_tweet_reader_visitor |
| 22 | +static void sax_tweets(State &state) { |
| 23 | + // Load twitter.json to a buffer |
| 24 | + padded_string json; |
| 25 | + if (auto error = padded_string::load(TWITTER_JSON).get(json)) { cerr << error << endl; return; } |
69 | 26 |
|
70 |
| -sax_tweet_reader::sax_tweet_reader() : tweets{}, string_buf{}, capacity{0}, dom_parser() {} |
| 27 | + // Allocate |
| 28 | + twitter::sax_tweet_reader reader; |
| 29 | + if (auto error = reader.set_capacity(json.size())) { cerr << error << endl; return; } |
71 | 30 |
|
72 |
| -error_code sax_tweet_reader::set_capacity(size_t new_capacity) { |
73 |
| - // string_capacity copied from document::allocate |
74 |
| - size_t string_capacity = SIMDJSON_ROUNDUP_N(5 * new_capacity / 3 + 32, 64); |
75 |
| - string_buf.reset(new (std::nothrow) uint8_t[string_capacity]); |
76 |
| - if (auto error = dom_parser.set_capacity(new_capacity)) { return error; } |
77 |
| - if (capacity == 0) { // set max depth the first time only |
78 |
| - if (auto error = dom_parser.set_max_depth(DEFAULT_MAX_DEPTH)) { return error; } |
79 |
| - } |
80 |
| - capacity = new_capacity; |
81 |
| - return SUCCESS; |
82 |
| -} |
| 31 | + // Warm the vector |
| 32 | + if (auto error = reader.read_tweets(json)) { throw error; } |
83 | 33 |
|
84 |
| -// NOTE: this assumes the dom_parser is already allocated |
85 |
| -error_code sax_tweet_reader::read_tweets(padded_string &json) { |
86 |
| - // Allocate capacity if needed |
87 |
| - tweets.clear(); |
88 |
| - if (capacity < json.size()) { |
89 |
| - if (auto error = set_capacity(capacity)) { return error; } |
| 34 | + // Read tweets |
| 35 | + size_t bytes = 0; |
| 36 | + size_t tweets = 0; |
| 37 | + for (SIMDJSON_UNUSED auto _ : state) { |
| 38 | + if (auto error = reader.read_tweets(json)) { throw error; } |
| 39 | + bytes += json.size(); |
| 40 | + tweets += reader.tweets.size(); |
90 | 41 | }
|
91 |
| - |
92 |
| - // Run stage 1 first. |
93 |
| - if (auto error = dom_parser.stage1((uint8_t *)json.data(), json.size(), false)) { return error; } |
94 |
| - |
95 |
| - // Then walk the document, parsing the tweets as we go |
96 |
| - json_iterator iter(dom_parser, 0); |
97 |
| - sax_tweet_reader_visitor visitor(tweets, string_buf.get()); |
98 |
| - if (auto error = iter.walk_document<false>(visitor)) { return error; } |
99 |
| - return SUCCESS; |
| 42 | + // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte |
| 43 | + state.counters["Gigabytes"] = benchmark::Counter( |
| 44 | + double(bytes), benchmark::Counter::kIsRate, |
| 45 | + benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024 |
| 46 | + state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate); |
| 47 | + state.counters["tweets"] = Counter(double(tweets), benchmark::Counter::kIsRate); |
100 | 48 | }
|
| 49 | +BENCHMARK(sax_tweets)->Repetitions(REPETITIONS)->ComputeStatistics("max", [](const std::vector<double>& v) -> double { |
| 50 | + return *(std::max_element(std::begin(v), std::end(v))); |
| 51 | + })->DisplayAggregatesOnly(true); |
101 | 52 |
|
102 |
| -sax_tweet_reader_visitor::sax_tweet_reader_visitor(std::vector<tweet> &_tweets, uint8_t *string_buf) |
103 |
| - : tweets{_tweets}, |
104 |
| - current_string_buf_loc{string_buf} { |
105 |
| -} |
| 53 | +#if SIMDJSON_EXCEPTIONS |
106 | 54 |
|
107 |
| -simdjson_really_inline error_code sax_tweet_reader_visitor::visit_document_start(json_iterator &iter) { |
108 |
| - iter.log_start_value("document"); |
109 |
| - return SUCCESS; |
| 55 | +simdjson_really_inline uint64_t nullable_int(dom::element element) { |
| 56 | + if (element.is_null()) { return 0; } |
| 57 | + return element; |
110 | 58 | }
|
111 |
| -simdjson_really_inline error_code sax_tweet_reader_visitor::visit_array_start(json_iterator &iter) { |
112 |
| - // iter.log_start_value("array"); |
113 |
| - // if we expected an int or string and got an array or object, it's an error |
114 |
| - if (expect_int || expect_string) { iter.log_error("expected int/string"); return TAPE_ERROR; } |
115 |
| - return SUCCESS; |
116 |
| -} |
117 |
| -simdjson_really_inline error_code sax_tweet_reader_visitor::visit_object_start(json_iterator &iter) { |
118 |
| - // iter.log_start_value("object"); |
119 |
| - |
120 |
| - // if we expected an int or string and got an array or object, it's an error |
121 |
| - if (expect_int || expect_string) { iter.log_error("expected int/string"); return TAPE_ERROR; } |
122 |
| - |
123 |
| - // { "statuses": [ { |
124 |
| - if (in_statuses && iter.depth == 3) { |
125 |
| - iter.log_start_value("tweet"); |
126 |
| - tweets.push_back({}); |
| 59 | +simdjson_really_inline void read_dom_tweets(dom::parser &parser, padded_string &json, std::vector<twitter::tweet> &tweets) { |
| 60 | + for (dom::element tweet : parser.parse(json)["statuses"]) { |
| 61 | + auto user = tweet["user"]; |
| 62 | + tweets.push_back( |
| 63 | + { |
| 64 | + tweet["id"], |
| 65 | + tweet["text"], |
| 66 | + tweet["created_at"], |
| 67 | + nullable_int(tweet["in_reply_to_status_id"]), |
| 68 | + tweet["retweet_count"], |
| 69 | + tweet["favorite_count"], |
| 70 | + { user["id"], user["screen_name"] } |
| 71 | + } |
| 72 | + ); |
127 | 73 | }
|
128 |
| - return SUCCESS; |
129 | 74 | }
|
130 |
| -simdjson_really_inline error_code sax_tweet_reader_visitor::visit_key(json_iterator &iter, const uint8_t *key) { |
131 |
| - // iter.log_value("key"); |
132 |
| - if (in_statuses) { |
133 |
| - switch (iter.depth) { |
134 |
| - case 3: // in tweet: { "statuses": [ { <key> |
135 |
| - // NOTE: the way we're comparing key (fairly naturally) means the caller doesn't have to check " for us at all |
136 |
| - if (KEY_IS(key, "user")) { iter.log_start_value("user"); in_user = true; } |
137 | 75 |
|
138 |
| - else if (KEY_IS(key, "id")) { iter.log_value("id"); expect_int = &tweets.back().id; } |
139 |
| - else if (KEY_IS(key, "in_reply_to_status_id")) { iter.log_value("in_reply_to_status_id"); expect_int = &tweets.back().in_reply_to_status_id; } |
140 |
| - else if (KEY_IS(key, "retweet_count")) { iter.log_value("retweet_count"); expect_int = &tweets.back().retweet_count; } |
141 |
| - else if (KEY_IS(key, "favorite_count")) { iter.log_value("favorite_count"); expect_int = &tweets.back().favorite_count; } |
| 76 | +static void dom_tweets(State &state) { |
| 77 | + // Load twitter.json to a buffer |
| 78 | + padded_string json; |
| 79 | + if (auto error = padded_string::load(TWITTER_JSON).get(json)) { cerr << error << endl; return; } |
142 | 80 |
|
143 |
| - else if (KEY_IS(key, "text")) { iter.log_value("text"); expect_string = &tweets.back().text; } |
144 |
| - else if (KEY_IS(key, "created_at")) { iter.log_value("created_at"); expect_string = &tweets.back().created_at; } |
145 |
| - break; |
146 |
| - case 4: |
147 |
| - if (in_user) { // in user: { "statuses": [ { "user": { <key> |
148 |
| - if (KEY_IS(key, "id")) { iter.log_value("id"); expect_int = &tweets.back().user.id; } |
149 |
| - else if (KEY_IS(key, "screen_name")) { iter.log_value("screen_name"); expect_string = &tweets.back().user.screen_name; } |
150 |
| - } |
151 |
| - break; |
152 |
| - default: break; |
153 |
| - } |
154 |
| - } else { |
155 |
| - if (iter.depth == 1 && KEY_IS(key, "statuses")) { |
156 |
| - iter.log_start_value("statuses"); |
157 |
| - in_statuses = true; |
158 |
| - } |
159 |
| - } |
160 |
| - return SUCCESS; |
161 |
| -} |
162 |
| -simdjson_really_inline error_code sax_tweet_reader_visitor::visit_primitive(json_iterator &iter, const uint8_t *value) { |
163 |
| - // iter.log_value("primitive"); |
164 |
| - if (expect_int) { |
165 |
| - iter.log_value("int"); |
166 |
| - if (auto error = numberparsing::parse_unsigned(value).get(*expect_int)) { |
167 |
| - // If number parsing failed, check if it's null before returning the error |
168 |
| - if (!atomparsing::is_valid_null_atom(value)) { iter.log_error("expected number or null"); return error; } |
169 |
| - } |
170 |
| - expect_int = nullptr; |
171 |
| - } else if (expect_string) { |
172 |
| - iter.log_value("string"); |
173 |
| - // Must be a string! |
174 |
| - if (value[0] != '"') { iter.log_error("expected string"); return STRING_ERROR; } |
175 |
| - auto end = stringparsing::parse_string(value, current_string_buf_loc); |
176 |
| - if (!end) { iter.log_error("error parsing string"); return STRING_ERROR; } |
177 |
| - *expect_string = std::string_view((const char *)current_string_buf_loc, end-current_string_buf_loc); |
178 |
| - current_string_buf_loc = end; |
179 |
| - expect_string = nullptr; |
180 |
| - } |
181 |
| - return SUCCESS; |
182 |
| -} |
183 |
| -simdjson_really_inline error_code sax_tweet_reader_visitor::visit_array_end(json_iterator &iter) { |
184 |
| - // iter.log_end_value("array"); |
185 |
| - // When we hit the end of { "statuses": [ ... ], we're done with statuses. |
186 |
| - if (in_statuses && iter.depth == 2) { iter.log_end_value("statuses"); in_statuses = false; } |
187 |
| - return SUCCESS; |
188 |
| -} |
189 |
| -simdjson_really_inline error_code sax_tweet_reader_visitor::visit_object_end(json_iterator &iter) { |
190 |
| - // iter.log_end_value("object"); |
191 |
| - // When we hit the end of { "statuses": [ { "user": { ... }, we're done with the user |
192 |
| - if (in_user && iter.depth == 4) { iter.log_end_value("user"); in_user = false; } |
193 |
| - if (in_statuses && iter.depth == 3) { iter.log_end_value("tweet"); } |
194 |
| - return SUCCESS; |
195 |
| -} |
| 81 | + // Allocate |
| 82 | + dom::parser parser; |
| 83 | + if (auto error = parser.allocate(json.size())) { cerr << error << endl; return; }; |
196 | 84 |
|
197 |
| -simdjson_really_inline error_code sax_tweet_reader_visitor::visit_document_end(json_iterator &iter) { |
198 |
| - iter.log_end_value("document"); |
199 |
| - return SUCCESS; |
200 |
| -} |
| 85 | + // Warm the vector |
| 86 | + std::vector<twitter::tweet> tweets; |
| 87 | + read_dom_tweets(parser, json, tweets); |
201 | 88 |
|
202 |
| -simdjson_really_inline error_code sax_tweet_reader_visitor::visit_empty_array(json_iterator &iter) { |
203 |
| - // if we expected an int or string and got an array or object, it's an error |
204 |
| - // iter.log_value("empty array"); |
205 |
| - if (expect_int || expect_string) { iter.log_error("expected int/string"); return TAPE_ERROR; } |
206 |
| - return SUCCESS; |
207 |
| -} |
208 |
| -simdjson_really_inline error_code sax_tweet_reader_visitor::visit_empty_object(json_iterator &iter) { |
209 |
| - // if we expected an int or string and got an array or object, it's an error |
210 |
| - // iter.log_value("empty object"); |
211 |
| - if (expect_int || expect_string) { iter.log_error("expected int/string"); return TAPE_ERROR; } |
212 |
| - return SUCCESS; |
213 |
| -} |
214 |
| -simdjson_really_inline error_code sax_tweet_reader_visitor::visit_root_primitive(json_iterator &iter, const uint8_t *) { |
215 |
| - // iter.log_value("root primitive"); |
216 |
| - iter.log_error("unexpected root primitive"); |
217 |
| - return TAPE_ERROR; |
| 89 | + // Read tweets |
| 90 | + size_t bytes = 0; |
| 91 | + size_t num_tweets = 0; |
| 92 | + for (SIMDJSON_UNUSED auto _ : state) { |
| 93 | + tweets.clear(); |
| 94 | + read_dom_tweets(parser, json, tweets); |
| 95 | + bytes += json.size(); |
| 96 | + num_tweets += tweets.size(); |
| 97 | + } |
| 98 | + // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte |
| 99 | + state.counters["Gigabytes"] = benchmark::Counter( |
| 100 | + double(bytes), benchmark::Counter::kIsRate, |
| 101 | + benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024 |
| 102 | + state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate); |
| 103 | + state.counters["tweets"] = Counter(double(num_tweets), benchmark::Counter::kIsRate); |
218 | 104 | }
|
| 105 | +BENCHMARK(dom_tweets)->Repetitions(REPETITIONS)->ComputeStatistics("max", [](const std::vector<double>& v) -> double { |
| 106 | + return *(std::max_element(std::begin(v), std::end(v))); |
| 107 | + })->DisplayAggregatesOnly(true); |
219 | 108 |
|
220 |
| -simdjson_really_inline error_code sax_tweet_reader_visitor::increment_count(json_iterator &) { return SUCCESS; } |
221 |
| - |
222 |
| -} // namespace twitter |
223 |
| - |
224 |
| -SIMDJSON_UNTARGET_REGION |
225 |
| - |
226 |
| - |
227 |
| -SIMDJSON_PUSH_DISABLE_ALL_WARNINGS |
228 |
| -#include <benchmark/benchmark.h> |
229 |
| -SIMDJSON_POP_DISABLE_WARNINGS |
230 |
| - |
231 |
| -using namespace benchmark; |
232 |
| -using namespace std; |
233 |
| - |
234 |
| -const char *TWITTER_JSON = SIMDJSON_BENCHMARK_DATA_DIR "twitter.json"; |
| 109 | +#endif // SIMDJSON_EXCEPTIONS |
235 | 110 |
|
236 |
| -static void sax_tweets(State& state) { |
| 111 | +static void dom_parse(State &state) { |
237 | 112 | // Load twitter.json to a buffer
|
238 | 113 | padded_string json;
|
239 | 114 | if (auto error = padded_string::load(TWITTER_JSON).get(json)) { cerr << error << endl; return; }
|
240 | 115 |
|
241 | 116 | // Allocate
|
242 |
| - twitter::sax_tweet_reader reader; |
243 |
| - if (auto error = reader.set_capacity(json.size())) { cerr << error << endl; return; } |
| 117 | + dom::parser parser; |
| 118 | + if (auto error = parser.allocate(json.size())) { cerr << error << endl; return; }; |
244 | 119 |
|
245 |
| - // Make the tweet_reader |
| 120 | + // Read tweets |
246 | 121 | size_t bytes = 0;
|
247 |
| - size_t tweets = 0; |
248 | 122 | for (SIMDJSON_UNUSED auto _ : state) {
|
249 |
| - if (auto error = reader.read_tweets(json)) { throw error; } |
| 123 | + if (parser.parse(json).error()) { throw "Parsing failed"; }; |
250 | 124 | bytes += json.size();
|
251 |
| - tweets += reader.tweets.size(); |
252 | 125 | }
|
253 | 126 | // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
|
254 | 127 | state.counters["Gigabytes"] = benchmark::Counter(
|
255 | 128 | double(bytes), benchmark::Counter::kIsRate,
|
256 | 129 | benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024
|
257 | 130 | state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate);
|
258 |
| - state.counters["tweets"] = Counter(double(tweets), benchmark::Counter::kIsRate); |
259 | 131 | }
|
260 |
| -BENCHMARK(sax_tweets)->Repetitions(10)->ComputeStatistics("max", [](const std::vector<double>& v) -> double { |
| 132 | +BENCHMARK(dom_parse)->Repetitions(REPETITIONS)->ComputeStatistics("max", [](const std::vector<double>& v) -> double { |
261 | 133 | return *(std::max_element(std::begin(v), std::end(v)));
|
262 | 134 | })->DisplayAggregatesOnly(true);
|
263 | 135 |
|
|
0 commit comments