
Commit 21dce6c

Displaying the number of documents parsed per second (simdjson#652)
* Some users are interested, as a metric, in the number of documents parsed per second. Obviously, this means reusing the same parser again and again.
* Adding a sentence
* This updates the parsingcompetition benchmark so that it displays the number of documents parsed per second.
1 parent 56bc8a7 commit 21dce6c
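
For readers who want the gist of the pattern being measured, here is a minimal standalone sketch (not the benchmark code itself): one dom::parser is created once and reused for every parse, and the document rate is the iteration count divided by elapsed wall-clock time. The sample document, iteration count, and chrono-based timing are illustrative placeholders, not part of this commit.

#include <chrono>
#include <cstdio>
#include <string>
#include "simdjson.h"

int main() {
  // Stand-in document; the real benchmark loads a file from disk.
  simdjson::padded_string json(std::string("{\"values\":[1,2,3]}"));
  simdjson::dom::parser parser;        // one parser, reused for every parse
  const size_t iterations = 1000000;   // placeholder repeat count

  auto start = std::chrono::steady_clock::now();
  for (size_t i = 0; i < iterations; i++) {
    // Reusing the parser avoids reallocating its internal buffers each time.
    auto result = parser.parse(json);
    if (result.error()) { return 1; }
  }
  std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - start;
  std::printf("%.1f documents parsed per second\n", iterations / elapsed.count());
  return 0;
}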

File tree

5 files changed (+47, -13 lines)


README.md

Lines changed: 2 additions & 1 deletion
@@ -74,7 +74,8 @@ Performance results
 
 The simdjson library uses three-quarters less instructions than state-of-the-art parser RapidJSON and
 fifty percent less than sajson. To our knowledge, simdjson is the first fully-validating JSON parser
-to run at gigabytes per second on commodity processors.
+to run at gigabytes per second on commodity processors. It can parse millions of JSON documents
+per second on a single core.
 
 The following figure represents parsing speed in GB/s for parsing various files
 on an Intel Skylake processor (3.4 GHz) using the GNU GCC 9 compiler (with the -O3 flag).

benchmark/benchmark.h

Lines changed: 7 additions & 3 deletions
@@ -130,12 +130,16 @@ double diff(timespec start, timespec end) {
     if (verbose) \
       printf(" %7.3f %s per input byte (best) ", cycle_per_op, unitname); \
     if (verbose) \
-      printf(" %7.3f %s per input byte (avg) ", avg_cycle_per_op, unitname); \
+      printf(" %7.3f %s (avg) ", avg_cycle_per_op, unitname); \
     if (verbose) \
-      printf(" %7.3f GB/s (error margin: %.3f GB/s)", max_gb_per_s, \
+      printf(" %7.3f GB/s (error margin: %5.3f GB/s)", max_gb_per_s, \
              -avg_gb_per_s + max_gb_per_s); \
+    if (verbose) \
+      printf(" %13.0f documents/s (best)", 1.0/min_sumclockdiff); \
+    if (verbose) \
+      printf(" %13.0f documents/s (avg)", 1.0/(sumclockdiff/repeat)); \
     if (!verbose) \
-      printf(" %20.3f %20.3f %20.3f %20.3f ", cycle_per_op, \
+      printf(" %20.3f %20.3f %20.3f %20.3f", cycle_per_op, \
              avg_cycle_per_op - cycle_per_op, max_gb_per_s, \
              -avg_gb_per_s + max_gb_per_s); \
     printf("\n"); \

benchmark/benchmarker.h

Lines changed: 26 additions & 3 deletions
@@ -258,6 +258,8 @@ struct benchmarker {
   event_aggregate stage2;
   // Speed and event summary for allocation
   event_aggregate allocate_stage;
+  // Speed and event summary for the repeated-parsing mode
+  event_aggregate loop;
 
   benchmarker(const char *_filename, event_collector& _collector)
     : filename(_filename), collector(_collector), stats(NULL) {
@@ -346,10 +348,30 @@ struct benchmarker {
     }
   }
 
+  void run_loop(size_t iterations) {
+    dom::parser parser;
+    auto firstresult = parser.parse((const uint8_t *)json.data(), json.size());
+    if (firstresult.error()) {
+      exit_error(string("Failed to parse ") + filename + string(":") + error_message(firstresult.error()));
+    }
+
+    collector.start();
+    // some users want something closer to "number of documents per second"
+    for(size_t i = 0; i < iterations; i++) {
+      auto result = parser.parse((const uint8_t *)json.data(), json.size());
+      if (result.error()) {
+        exit_error(string("Failed to parse ") + filename + string(":") + error_message(result.error()));
+      }
+    }
+    event_count all_loop_count = collector.end();
+    loop << all_loop_count;
+  }
+
   really_inline void run_iterations(size_t iterations, bool stage1_only, bool hotbuffers=false) {
     for (size_t i = 0; i<iterations; i++) {
       run_iteration(stage1_only, hotbuffers);
     }
+    run_loop(iterations);
   }
 
   template<typename T>
@@ -397,7 +419,7 @@ struct benchmarker {
     }
   }
 
-  void print(bool tabbed_output) const {
+  void print(bool tabbed_output, size_t iterations) const {
     if (tabbed_output) {
       char* filename_copy = (char*)malloc(strlen(filename)+1);
       strcpy(filename_copy, filename);
@@ -458,9 +480,9 @@ struct benchmarker {
       printf("|- Allocation\n");
       print_aggregate("| ", allocate_stage.best);
     }
-    printf("|- Stage 1\n");
+    printf("|- Stage 1\n");
     print_aggregate("| ", stage1.best);
-    printf("|- Stage 2\n");
+    printf("|- Stage 2\n");
     print_aggregate("| ", stage2.best);
     if (collector.has_events()) {
       double freq1 = (stage1.best.cycles() / stage1.best.elapsed_sec()) / 1000000000.0;
@@ -475,6 +497,7 @@ struct benchmarker {
               freqmin, freqmax, freqall);
       }
     }
+    printf("\n%.1f documents parsed per second\n", iterations/loop.best.elapsed_sec());
   }
  }
};
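
Concretely, the new summary line reports iterations / loop.best.elapsed_sec(): if, say, the benchmark is asked for 1000 iterations and the best run of the reuse loop takes 0.002 s, the output reads "500000.0 documents parsed per second". (These numbers are made up for illustration.)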

benchmark/parse.cpp

Lines changed: 1 addition & 1 deletion
@@ -212,7 +212,7 @@ int main(int argc, char *argv[]) {
   if (!options.verbose) { progress.erase(); }
 
   for (size_t i=0; i<options.files.size(); i++) {
-    benchmarkers[i]->print(options.tabbed_output);
+    benchmarkers[i]->print(options.tabbed_output, options.iterations);
     delete benchmarkers[i];
   }
 
benchmark/parsingcompetition.cpp

Lines changed: 11 additions & 5 deletions
@@ -65,7 +65,7 @@ bool fastjson_parse(const char *input) {
 // end of fastjson stuff
 #endif
 
-size_t sum_line_lengths(char * data, size_t length) {
+never_inline size_t sum_line_lengths(char * data, size_t length) {
   std::stringstream is;
   is.rdbuf()->pubsetbuf(data, length);
   std::string line;
@@ -124,19 +124,25 @@ bool bench(const char *filename, bool verbose, bool just_data, int repeat_multip
 #ifndef ALLPARSER
   if (!just_data)
 #endif
+  {
+    memcpy(buffer, p.data(), p.size());
   BEST_TIME("RapidJSON ",
             d.Parse<kParseValidateEncodingFlag>((const char *)buffer)
                 .HasParseError(),
-            false, memcpy(buffer, p.data(), p.size()), repeat, volume,
+            false, , repeat, volume,
             !just_data);
+  }
 #ifndef ALLPARSER
   if (!just_data)
 #endif
+  {
+    memcpy(buffer, p.data(), p.size());
   BEST_TIME("RapidJSON (accurate number parsing) ",
             d.Parse<kParseValidateEncodingFlag|kParseFullPrecisionFlag>((const char *)buffer)
                 .HasParseError(),
-            false, memcpy(buffer, p.data(), p.size()), repeat, volume,
+            false, , repeat, volume,
             !just_data);
+  }
   BEST_TIME("RapidJSON (insitu)",
             d.ParseInsitu<kParseValidateEncodingFlag>(buffer).HasParseError(),
             false,
@@ -167,10 +173,10 @@ bool bench(const char *filename, bool verbose, bool just_data, int repeat_multip
               .is_valid(),
           true, memcpy(buffer, p.data(), p.size()), repeat, volume, !just_data);
 
-
+  memcpy(buffer, p.data(), p.size());
   size_t expected = json::parse(p.data(), p.data() + p.size()).size();
   BEST_TIME("nlohmann-json", json::parse(buffer, buffer + p.size()).size(),
-            expected, memcpy(buffer, p.data(), p.size()), repeat, volume,
+            expected, , repeat, volume,
             !just_data);
 
 #ifdef ALLPARSER
