2
2
#define SIMDJSON_IMPLEMENTATION_WESTMERE 0
3
3
#define SIMDJSON_IMPLEMENTATION_AMD64 0
4
4
5
+ #include < iostream>
6
+ #include < sstream>
7
+ #include < random>
8
+
5
9
#include " simdjson.h"
6
10
7
11
SIMDJSON_PUSH_DISABLE_ALL_WARNINGS
8
12
#include < benchmark/benchmark.h>
9
13
SIMDJSON_POP_DISABLE_WARNINGS
10
14
11
15
#include " simdjson.cpp"
12
- #include " twitter/sax_tweet_reader.h"
16
+
17
+ #if SIMDJSON_EXCEPTIONS
13
18
14
19
using namespace benchmark ;
15
20
using namespace simdjson ;
@@ -19,6 +24,10 @@ using std::endl;
19
24
const char *TWITTER_JSON = SIMDJSON_BENCHMARK_DATA_DIR " twitter.json" ;
20
25
const int REPETITIONS = 10 ;
21
26
27
+ #if SIMDJSON_IMPLEMENTATION_HASWELL
28
+
29
+ #include " twitter/sax_tweet_reader.h"
30
+
22
31
static void sax_tweets (State &state) {
23
32
// Load twitter.json to a buffer
24
33
padded_string json;
@@ -50,7 +59,9 @@ BENCHMARK(sax_tweets)->Repetitions(REPETITIONS)->ComputeStatistics("max", [](con
50
59
return *(std::max_element (std::begin (v), std::end (v)));
51
60
})->DisplayAggregatesOnly (true );
52
61
53
- #if SIMDJSON_EXCEPTIONS
62
+ #endif // SIMDJSON_IMPLEMENTATION_HASWELL
63
+
64
+ #include " twitter/tweet.h"
54
65
55
66
simdjson_really_inline uint64_t nullable_int (dom::element element) {
56
67
if (element.is_null ()) { return 0 ; }
@@ -106,8 +117,6 @@ BENCHMARK(dom_tweets)->Repetitions(REPETITIONS)->ComputeStatistics("max", [](con
106
117
return *(std::max_element (std::begin (v), std::end (v)));
107
118
})->DisplayAggregatesOnly (true );
108
119
109
- #endif // SIMDJSON_EXCEPTIONS
110
-
111
120
static void dom_parse (State &state) {
112
121
// Load twitter.json to a buffer
113
122
padded_string json;
@@ -133,4 +142,218 @@ BENCHMARK(dom_parse)->Repetitions(REPETITIONS)->ComputeStatistics("max", [](cons
133
142
return *(std::max_element (std::begin (v), std::end (v)));
134
143
})->DisplayAggregatesOnly (true );
135
144
145
+
146
+ /* *******************
147
+ * Large file parsing benchmarks:
148
+ ********************/
149
+
150
/**
 * Build a JSON array of N objects of the form { "x":<r>, "y":<r>, "z":<r> }
 * where each <r> is a pseudo-random double in [0, 1).
 * Deterministic: the engine is default-seeded on every call, so repeated
 * calls with the same N produce the same document.
 */
static std::string build_json_array(size_t N) {
  std::default_random_engine engine;
  std::uniform_real_distribution<> dist(0, 1);
  std::stringstream out;
  out << "[" << std::endl;
  // First element has no leading comma.
  if (N > 0) {
    out << "{ \"x\":" << dist(engine) << ", \"y\":" << dist(engine) << ", \"z\":" << dist(engine) << " }" << std::endl;
  }
  for (size_t i = 1; i < N; i++) {
    out << "," << std::endl;
    out << "{ \"x\":" << dist(engine) << ", \"y\":" << dist(engine) << ", \"z\":" << dist(engine) << " }";
  }
  out << std::endl;
  out << "]" << std::endl;
  std::string answer = out.str();
  std::cout << "Creating a source file spanning " << (answer.size() + 512) / 1024 << " KB " << std::endl;
  return answer;
}
168
+
169
+ static const simdjson::padded_string& get_my_json_str () {
170
+ static simdjson::padded_string s = build_json_array (1000000 );
171
+ return s;
172
+ }
173
+
174
// A plain 3-D point mirroring the { "x", "y", "z" } objects in the
// generated JSON document.
struct my_point {
  double x; // "x" field
  double y; // "y" field
  double z; // "z" field
};
179
+
180
+ // ./benchmark/bench_sax --benchmark_filter=largerandom
181
+
182
+
183
+ /* **
184
+ * We start with the naive DOM-based approach.
185
+ **/
186
+ static void dom_parse_largerandom (State &state) {
187
+ // Load twitter.json to a buffer
188
+ const padded_string& json = get_my_json_str ();
189
+
190
+ // Allocate
191
+ dom::parser parser;
192
+ if (auto error = parser.allocate (json.size ())) { cerr << error << endl; return ; };
193
+
194
+ // Read
195
+ size_t bytes = 0 ;
196
+ simdjson::error_code error;
197
+ for (SIMDJSON_UNUSED auto _ : state) {
198
+ std::vector<my_point> container;
199
+ dom::element doc;
200
+ if ((error = parser.parse (json).get (doc))) {
201
+ std::cerr << " failure: " << error << std::endl;
202
+ throw " Parsing failed" ;
203
+ };
204
+ for (auto p : doc) {
205
+ container.emplace_back (my_point{p[" x" ], p[" y" ], p[" z" ]});
206
+ }
207
+ bytes += json.size ();
208
+ benchmark::DoNotOptimize (container.data ());
209
+
210
+ }
211
+ // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
212
+ state.counters [" Gigabytes" ] = benchmark::Counter (
213
+ double (bytes), benchmark::Counter::kIsRate ,
214
+ benchmark::Counter::OneK::kIs1000 ); // For GiB : kIs1024
215
+ state.counters [" docs" ] = Counter (double (state.iterations ()), benchmark::Counter::kIsRate );
216
+ }
217
+
218
+ BENCHMARK (dom_parse_largerandom)->Repetitions(REPETITIONS)->ComputeStatistics(" max" , [](const std::vector<double >& v) -> double {
219
+ return *(std::max_element (std::begin (v), std::end (v)));
220
+ })->DisplayAggregatesOnly (true );
221
+
222
+ #if SIMDJSON_IMPLEMENTATION_HASWELL
223
+
224
+ /* **
225
+ * Next we are going to code the SAX approach.
226
+ **/
227
+
228
+ SIMDJSON_TARGET_HASWELL
229
+
230
+ namespace largerandom {
231
+ namespace {
232
+
233
+ using namespace simdjson ;
234
+ using namespace haswell ;
235
+ using namespace haswell ::stage2;
236
+ struct sax_point_reader_visitor {
237
+ public:
238
+ sax_point_reader_visitor (std::vector<my_point> &_points) : points(_points) {
239
+ }
240
+
241
+ simdjson_really_inline error_code visit_document_start (json_iterator &) { return SUCCESS; }
242
+ simdjson_really_inline error_code visit_object_start (json_iterator &) { return SUCCESS; }
243
+ simdjson_really_inline error_code visit_key (json_iterator &, const uint8_t *key) {
244
+ switch (key[0 ]) {
245
+ case ' x' :
246
+ idx = 0 ;
247
+ break ;
248
+ case ' y' :
249
+ idx = 2 ;
250
+ break ;
251
+ case ' z' :
252
+ idx = 3 ;
253
+ break ;
254
+ }
255
+ return SUCCESS;
256
+ }
257
+ simdjson_really_inline error_code visit_primitive (json_iterator &, const uint8_t *value) {
258
+ return numberparsing::parse_double (value).get (buffer[idx]);
259
+ }
260
+ simdjson_really_inline error_code visit_array_start (json_iterator &) { return SUCCESS; }
261
+ simdjson_really_inline error_code visit_array_end (json_iterator &) { return SUCCESS; }
262
+ simdjson_really_inline error_code visit_object_end (json_iterator &) { return SUCCESS; }
263
+ simdjson_really_inline error_code visit_document_end (json_iterator &) { return SUCCESS; }
264
+ simdjson_really_inline error_code visit_empty_array (json_iterator &) { return SUCCESS; }
265
+ simdjson_really_inline error_code visit_empty_object (json_iterator &) { return SUCCESS; }
266
+ simdjson_really_inline error_code visit_root_primitive (json_iterator &, const uint8_t *) { return SUCCESS; }
267
+ simdjson_really_inline error_code increment_count (json_iterator &) { return SUCCESS; }
268
+ std::vector<my_point> &points;
269
+ size_t idx{0 };
270
+ double buffer[3 ];
271
+ };
272
+
273
+ struct sax_point_reader {
274
+ std::vector<my_point> points;
275
+ std::unique_ptr<uint8_t []> string_buf;
276
+ size_t capacity;
277
+ dom_parser_implementation dom_parser;
278
+
279
+ sax_point_reader ();
280
+ error_code set_capacity (size_t new_capacity);
281
+ error_code read_points (const padded_string &json);
282
+ }; // struct sax_point_reader
283
+
284
+ sax_point_reader::sax_point_reader () : points{}, string_buf{}, capacity{0 }, dom_parser() {
285
+ }
286
+
287
+ error_code sax_point_reader::set_capacity (size_t new_capacity) {
288
+ // string_capacity copied from document::allocate
289
+ size_t string_capacity = SIMDJSON_ROUNDUP_N (5 * new_capacity / 3 + 32 , 64 );
290
+ string_buf.reset (new (std::nothrow) uint8_t [string_capacity]);
291
+ if (auto error = dom_parser.set_capacity (new_capacity)) { return error; }
292
+ if (capacity == 0 ) { // set max depth the first time only
293
+ if (auto error = dom_parser.set_max_depth (DEFAULT_MAX_DEPTH)) { return error; }
294
+ }
295
+ capacity = new_capacity;
296
+ return SUCCESS;
297
+ }
298
+
299
+ error_code sax_point_reader::read_points (const padded_string &json) {
300
+ // Allocate capacity if needed
301
+ points.clear ();
302
+ if (capacity < json.size ()) {
303
+ if (auto error = set_capacity (capacity)) { return error; }
304
+ }
305
+
306
+ // Run stage 1 first.
307
+ if (auto error = dom_parser.stage1 ((uint8_t *)json.data (), json.size (), false )) { return error; }
308
+
309
+ // Then walk the document, parsing the tweets as we go
310
+ json_iterator iter (dom_parser, 0 );
311
+ sax_point_reader_visitor visitor (points);
312
+ if (auto error = iter.walk_document <false >(visitor)) { return error; }
313
+ return SUCCESS;
314
+ }
315
+
316
+ } // unnamed namespace
317
+ } // namespace largerandom
318
+
319
+ SIMDJSON_UNTARGET_REGION
320
+
321
+
322
+
323
+
324
+
325
+ // ./benchmark/bench_sax --benchmark_filter=largerandom
326
+ static void sax_parse_largerandom (State &state) {
327
+ // Load twitter.json to a buffer
328
+ const padded_string& json = get_my_json_str ();
329
+
330
+ // Allocate
331
+ largerandom::sax_point_reader reader;
332
+ if (auto error = reader.set_capacity (json.size ())) { throw error; }
333
+ // warming
334
+ for (size_t i = 0 ; i < 10 ; i++) {
335
+ if (auto error = reader.read_points (json)) { throw error; }
336
+ }
337
+
338
+ // Read
339
+ size_t bytes = 0 ;
340
+ for (SIMDJSON_UNUSED auto _ : state) {
341
+ if (auto error = reader.read_points (json)) { throw error; }
342
+ bytes += json.size ();
343
+ benchmark::DoNotOptimize (reader.points .data ());
344
+ }
345
+ // Gigabyte: https://en.wikipedia.org/wiki/Gigabyte
346
+ state.counters [" Gigabytes" ] = benchmark::Counter (
347
+ double (bytes), benchmark::Counter::kIsRate ,
348
+ benchmark::Counter::OneK::kIs1000 ); // For GiB : kIs1024
349
+ state.counters [" docs" ] = Counter (double (state.iterations ()), benchmark::Counter::kIsRate );
350
+ }
351
+ BENCHMARK (sax_parse_largerandom)->Repetitions(REPETITIONS)->ComputeStatistics(" max" , [](const std::vector<double >& v) -> double {
352
+ return *(std::max_element (std::begin (v), std::end (v)));
353
+ })->DisplayAggregatesOnly (true );
354
+
355
+ #endif // SIMDJSON_IMPLEMENTATION_HASWELL
356
+
357
+ #endif // SIMDJSON_EXCEPTIONS
358
+
136
359
BENCHMARK_MAIN ();
0 commit comments