|
| 1 | +#define SIMDJSON_IMPLEMENTATION_FALLBACK 0 |
| 2 | +#define SIMDJSON_IMPLEMENTATION_WESTMERE 0 |
| 3 | +#define SIMDJSON_IMPLEMENTATION_AMD64 0 |
| 4 | + |
| 5 | +#include "simdjson.h" |
| 6 | +#include "simdjson.cpp" |
| 7 | +using namespace simdjson; |
| 8 | + |
| 9 | +using namespace haswell; |
| 10 | +using namespace haswell::stage2; |
| 11 | + |
| 12 | +SIMDJSON_TARGET_HASWELL |
| 13 | + |
| 14 | +namespace twitter { |
| 15 | + |
// True when the raw JSON bytes at KEY begin with the quoted key "MATCH".
// Both quote characters take part in the comparison, so a longer key that merely
// shares the prefix (e.g. "id_str" vs "id") does not match.
// MATCH must be a string literal: it is glued to the surrounding quotes at compile time.
#define KEY_IS(KEY, MATCH) (strncmp(reinterpret_cast<const char *>(KEY), "\"" MATCH "\"", sizeof("\"" MATCH "\"") - 1) == 0)
| 17 | + |
// The subset of a tweet's "user" object that the reader extracts.
struct twitter_user {
  uint64_t id = 0;               // "id" key of the user object; 0 until one is read
  std::string_view screen_name;  // "screen_name" key; empty until one is read
};
| 22 | +struct tweet { |
| 23 | + uint64_t id{}; |
| 24 | + std::string_view text{}; |
| 25 | + std::string_view created_at{}; |
| 26 | + uint64_t in_reply_to_status_id{}; |
| 27 | + uint64_t retweet_count{}; |
| 28 | + uint64_t favorite_count{}; |
| 29 | + twitter_user user{}; |
| 30 | +}; |
| 31 | +struct sax_tweet_reader { |
| 32 | + std::vector<tweet> tweets; |
| 33 | + std::unique_ptr<uint8_t[]> string_buf; |
| 34 | + size_t capacity; |
| 35 | + dom_parser_implementation dom_parser; |
| 36 | + |
| 37 | + sax_tweet_reader(); |
| 38 | + error_code set_capacity(size_t new_capacity); |
| 39 | + error_code read_tweets(padded_string &json); |
| 40 | +}; // struct tweet_reader |
| 41 | + |
| 42 | +} // namespace twitter |
| 43 | + |
| 44 | +namespace twitter { |
| 45 | + |
| 46 | +struct sax_tweet_reader_visitor { |
| 47 | + bool in_statuses{false}; |
| 48 | + bool in_user{false}; |
| 49 | + std::vector<tweet> &tweets; |
| 50 | + uint8_t *current_string_buf_loc; |
| 51 | + uint64_t *expect_int{}; |
| 52 | + std::string_view *expect_string{}; |
| 53 | + |
| 54 | + sax_tweet_reader_visitor(std::vector<tweet> &_tweets, uint8_t *string_buf); |
| 55 | + |
| 56 | + simdjson_really_inline error_code visit_document_start(json_iterator &iter); |
| 57 | + simdjson_really_inline error_code visit_object_start(json_iterator &iter); |
| 58 | + simdjson_really_inline error_code visit_key(json_iterator &iter, const uint8_t *key); |
| 59 | + simdjson_really_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value); |
| 60 | + simdjson_really_inline error_code visit_array_start(json_iterator &iter); |
| 61 | + simdjson_really_inline error_code visit_array_end(json_iterator &iter); |
| 62 | + simdjson_really_inline error_code visit_object_end(json_iterator &iter); |
| 63 | + simdjson_really_inline error_code visit_document_end(json_iterator &iter); |
| 64 | + simdjson_really_inline error_code visit_empty_array(json_iterator &iter); |
| 65 | + simdjson_really_inline error_code visit_empty_object(json_iterator &iter); |
| 66 | + simdjson_really_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value); |
| 67 | + simdjson_really_inline error_code increment_count(json_iterator &iter); |
| 68 | +}; // sax_tweet_reader_visitor |
| 69 | + |
| 70 | +sax_tweet_reader::sax_tweet_reader() : tweets{}, string_buf{}, capacity{0}, dom_parser() {} |
| 71 | + |
| 72 | +error_code sax_tweet_reader::set_capacity(size_t new_capacity) { |
| 73 | + // string_capacity copied from document::allocate |
| 74 | + size_t string_capacity = SIMDJSON_ROUNDUP_N(5 * new_capacity / 3 + 32, 64); |
| 75 | + string_buf.reset(new (std::nothrow) uint8_t[string_capacity]); |
| 76 | + if (auto error = dom_parser.set_capacity(new_capacity)) { return error; } |
| 77 | + if (capacity == 0) { // set max depth the first time only |
| 78 | + if (auto error = dom_parser.set_max_depth(DEFAULT_MAX_DEPTH)) { return error; } |
| 79 | + } |
| 80 | + capacity = new_capacity; |
| 81 | + return SUCCESS; |
| 82 | +} |
| 83 | + |
| 84 | +// NOTE: this assumes the dom_parser is already allocated |
| 85 | +error_code sax_tweet_reader::read_tweets(padded_string &json) { |
| 86 | + // Allocate capacity if needed |
| 87 | + tweets.clear(); |
| 88 | + if (capacity < json.size()) { |
| 89 | + if (auto error = set_capacity(capacity)) { return error; } |
| 90 | + } |
| 91 | + |
| 92 | + // Run stage 1 first. |
| 93 | + if (auto error = dom_parser.stage1((uint8_t *)json.data(), json.size(), false)) { return error; } |
| 94 | + |
| 95 | + // Then walk the document, parsing the tweets as we go |
| 96 | + json_iterator iter(dom_parser, 0); |
| 97 | + sax_tweet_reader_visitor visitor(tweets, string_buf.get()); |
| 98 | + if (auto error = iter.walk_document<false>(visitor)) { return error; } |
| 99 | + return SUCCESS; |
| 100 | +} |
| 101 | + |
| 102 | +sax_tweet_reader_visitor::sax_tweet_reader_visitor(std::vector<tweet> &_tweets, uint8_t *string_buf) |
| 103 | + : tweets{_tweets}, |
| 104 | + current_string_buf_loc{string_buf} { |
| 105 | +} |
| 106 | + |
| 107 | +simdjson_really_inline error_code sax_tweet_reader_visitor::visit_document_start(json_iterator &iter) { |
| 108 | + iter.log_start_value("document"); |
| 109 | + return SUCCESS; |
| 110 | +} |
| 111 | +simdjson_really_inline error_code sax_tweet_reader_visitor::visit_array_start(json_iterator &iter) { |
| 112 | + // iter.log_start_value("array"); |
| 113 | + // if we expected an int or string and got an array or object, it's an error |
| 114 | + if (expect_int || expect_string) { iter.log_error("expected int/string"); return TAPE_ERROR; } |
| 115 | + return SUCCESS; |
| 116 | +} |
| 117 | +simdjson_really_inline error_code sax_tweet_reader_visitor::visit_object_start(json_iterator &iter) { |
| 118 | + // iter.log_start_value("object"); |
| 119 | + |
| 120 | + // if we expected an int or string and got an array or object, it's an error |
| 121 | + if (expect_int || expect_string) { iter.log_error("expected int/string"); return TAPE_ERROR; } |
| 122 | + |
| 123 | + // { "statuses": [ { |
| 124 | + if (in_statuses && iter.depth == 3) { |
| 125 | + iter.log_start_value("tweet"); |
| 126 | + tweets.push_back({}); |
| 127 | + } |
| 128 | + return SUCCESS; |
| 129 | +} |
| 130 | +simdjson_really_inline error_code sax_tweet_reader_visitor::visit_key(json_iterator &iter, const uint8_t *key) { |
| 131 | + // iter.log_value("key"); |
| 132 | + if (in_statuses) { |
| 133 | + switch (iter.depth) { |
| 134 | + case 3: // in tweet: { "statuses": [ { <key> |
| 135 | + // NOTE: the way we're comparing key (fairly naturally) means the caller doesn't have to check " for us at all |
| 136 | + if (KEY_IS(key, "user")) { iter.log_start_value("user"); in_user = true; } |
| 137 | + |
| 138 | + else if (KEY_IS(key, "id")) { iter.log_value("id"); expect_int = &tweets.back().id; } |
| 139 | + else if (KEY_IS(key, "in_reply_to_status_id")) { iter.log_value("in_reply_to_status_id"); expect_int = &tweets.back().in_reply_to_status_id; } |
| 140 | + else if (KEY_IS(key, "retweet_count")) { iter.log_value("retweet_count"); expect_int = &tweets.back().retweet_count; } |
| 141 | + else if (KEY_IS(key, "favorite_count")) { iter.log_value("favorite_count"); expect_int = &tweets.back().favorite_count; } |
| 142 | + |
| 143 | + else if (KEY_IS(key, "text")) { iter.log_value("text"); expect_string = &tweets.back().text; } |
| 144 | + else if (KEY_IS(key, "created_at")) { iter.log_value("created_at"); expect_string = &tweets.back().created_at; } |
| 145 | + break; |
| 146 | + case 4: |
| 147 | + if (in_user) { // in user: { "statuses": [ { "user": { <key> |
| 148 | + if (KEY_IS(key, "id")) { iter.log_value("id"); expect_int = &tweets.back().user.id; } |
| 149 | + else if (KEY_IS(key, "screen_name")) { iter.log_value("screen_name"); expect_string = &tweets.back().user.screen_name; } |
| 150 | + } |
| 151 | + break; |
| 152 | + default: break; |
| 153 | + } |
| 154 | + } else { |
| 155 | + if (iter.depth == 1 && KEY_IS(key, "statuses")) { |
| 156 | + iter.log_start_value("statuses"); |
| 157 | + in_statuses = true; |
| 158 | + } |
| 159 | + } |
| 160 | + return SUCCESS; |
| 161 | +} |
| 162 | +simdjson_really_inline error_code sax_tweet_reader_visitor::visit_primitive(json_iterator &iter, const uint8_t *value) { |
| 163 | + // iter.log_value("primitive"); |
| 164 | + if (expect_int) { |
| 165 | + iter.log_value("int"); |
| 166 | + if (auto error = numberparsing::parse_unsigned(value).get(*expect_int)) { |
| 167 | + // If number parsing failed, check if it's null before returning the error |
| 168 | + if (!atomparsing::is_valid_null_atom(value)) { iter.log_error("expected number or null"); return error; } |
| 169 | + } |
| 170 | + expect_int = nullptr; |
| 171 | + } else if (expect_string) { |
| 172 | + iter.log_value("string"); |
| 173 | + // Must be a string! |
| 174 | + if (value[0] != '"') { iter.log_error("expected string"); return STRING_ERROR; } |
| 175 | + auto end = stringparsing::parse_string(value, current_string_buf_loc); |
| 176 | + if (!end) { iter.log_error("error parsing string"); return STRING_ERROR; } |
| 177 | + *expect_string = std::string_view((const char *)current_string_buf_loc, end-current_string_buf_loc); |
| 178 | + current_string_buf_loc = end; |
| 179 | + expect_string = nullptr; |
| 180 | + } |
| 181 | + return SUCCESS; |
| 182 | +} |
| 183 | +simdjson_really_inline error_code sax_tweet_reader_visitor::visit_array_end(json_iterator &iter) { |
| 184 | + // iter.log_end_value("array"); |
| 185 | + // When we hit the end of { "statuses": [ ... ], we're done with statuses. |
| 186 | + if (in_statuses && iter.depth == 2) { iter.log_end_value("statuses"); in_statuses = false; } |
| 187 | + return SUCCESS; |
| 188 | +} |
| 189 | +simdjson_really_inline error_code sax_tweet_reader_visitor::visit_object_end(json_iterator &iter) { |
| 190 | + // iter.log_end_value("object"); |
| 191 | + // When we hit the end of { "statuses": [ { "user": { ... }, we're done with the user |
| 192 | + if (in_user && iter.depth == 4) { iter.log_end_value("user"); in_user = false; } |
| 193 | + if (in_statuses && iter.depth == 3) { iter.log_end_value("tweet"); } |
| 194 | + return SUCCESS; |
| 195 | +} |
| 196 | + |
| 197 | +simdjson_really_inline error_code sax_tweet_reader_visitor::visit_document_end(json_iterator &iter) { |
| 198 | + iter.log_end_value("document"); |
| 199 | + return SUCCESS; |
| 200 | +} |
| 201 | + |
| 202 | +simdjson_really_inline error_code sax_tweet_reader_visitor::visit_empty_array(json_iterator &iter) { |
| 203 | + // if we expected an int or string and got an array or object, it's an error |
| 204 | + // iter.log_value("empty array"); |
| 205 | + if (expect_int || expect_string) { iter.log_error("expected int/string"); return TAPE_ERROR; } |
| 206 | + return SUCCESS; |
| 207 | +} |
| 208 | +simdjson_really_inline error_code sax_tweet_reader_visitor::visit_empty_object(json_iterator &iter) { |
| 209 | + // if we expected an int or string and got an array or object, it's an error |
| 210 | + // iter.log_value("empty object"); |
| 211 | + if (expect_int || expect_string) { iter.log_error("expected int/string"); return TAPE_ERROR; } |
| 212 | + return SUCCESS; |
| 213 | +} |
| 214 | +simdjson_really_inline error_code sax_tweet_reader_visitor::visit_root_primitive(json_iterator &iter, const uint8_t *) { |
| 215 | + // iter.log_value("root primitive"); |
| 216 | + iter.log_error("unexpected root primitive"); |
| 217 | + return TAPE_ERROR; |
| 218 | +} |
| 219 | + |
| 220 | +simdjson_really_inline error_code sax_tweet_reader_visitor::increment_count(json_iterator &) { return SUCCESS; } |
| 221 | + |
| 222 | +} // namespace twitter |
| 223 | + |
| 224 | +SIMDJSON_UNTARGET_REGION |
| 225 | + |
| 226 | + |
| 227 | +SIMDJSON_PUSH_DISABLE_ALL_WARNINGS |
| 228 | +#include <benchmark/benchmark.h> |
| 229 | +SIMDJSON_POP_DISABLE_WARNINGS |
| 230 | + |
| 231 | +using namespace benchmark; |
| 232 | +using namespace std; |
| 233 | + |
| 234 | +const char *TWITTER_JSON = SIMDJSON_BENCHMARK_DATA_DIR "twitter.json"; |
| 235 | + |
| 236 | +static void sax_tweets(State& state) { |
| 237 | + // Load twitter.json to a buffer |
| 238 | + padded_string json; |
| 239 | + if (auto error = padded_string::load(TWITTER_JSON).get(json)) { cerr << error << endl; return; } |
| 240 | + |
| 241 | + // Allocate |
| 242 | + twitter::sax_tweet_reader reader; |
| 243 | + if (auto error = reader.set_capacity(json.size())) { cerr << error << endl; return; } |
| 244 | + |
| 245 | + // Make the tweet_reader |
| 246 | + size_t bytes = 0; |
| 247 | + size_t tweets = 0; |
| 248 | + for (SIMDJSON_UNUSED auto _ : state) { |
| 249 | + if (auto error = reader.read_tweets(json)) { throw error; } |
| 250 | + bytes += json.size(); |
| 251 | + tweets += reader.tweets.size(); |
| 252 | + } |
| 253 | + // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte |
| 254 | + state.counters["Gigabytes"] = benchmark::Counter( |
| 255 | + double(bytes), benchmark::Counter::kIsRate, |
| 256 | + benchmark::Counter::OneK::kIs1000); // For GiB : kIs1024 |
| 257 | + state.counters["docs"] = Counter(double(state.iterations()), benchmark::Counter::kIsRate); |
| 258 | + state.counters["tweets"] = Counter(double(tweets), benchmark::Counter::kIsRate); |
| 259 | +} |
| 260 | +BENCHMARK(sax_tweets)->Repetitions(10)->ComputeStatistics("max", [](const std::vector<double>& v) -> double { |
| 261 | + return *(std::max_element(std::begin(v), std::end(v))); |
| 262 | + })->DisplayAggregatesOnly(true); |
| 263 | + |
| 264 | +BENCHMARK_MAIN(); |
0 commit comments