Add option to make buffers hot and remove recent benchmarking changes (simdjson#443)

lemire · web-flow · commit f87e64f988fc · 2020-01-15T19:48:00.000-05:00
* This reverts the code back to how it was prior to the silly "run two stages" routine and instead
adds an option to benchmark the code over hot buffers. It turns out that allocating the pages
can be expensive when the files are large.
diff --git a/benchmark/benchmarker.h b/benchmark/benchmarker.h
@@ -280,13 +280,19 @@ struct benchmarker {
     return all_stages.iterations;
   }
 
-  really_inline void run_iteration(bool stage1_only, bool rerunbothstages) {
+  really_inline void run_iteration(bool stage1_only, bool hotbuffers) {
     // Allocate ParsedJson
     collector.start();
     ParsedJson pj;
     bool allocok = pj.allocate_capacity(json.size());
     event_count allocate_count = collector.end();
     allocate_stage << allocate_count;
+    if(hotbuffers) {
+      int result = parser.parse((const uint8_t *)json.data(), json.size(), pj);
+      if (result != simdjson::SUCCESS) {
+        exit_error(string("Failed to parse ") + filename + string(":") + pj.get_error_message());
+      }
+    }
 
     if (!allocok) {
       exit_error(string("Unable to allocate_stage ") + to_string(json.size()) + " bytes for the JSON result.");
@@ -316,21 +322,7 @@ struct benchmarker {
       }
       stage2_count = collector.end();
       stage2 << stage2_count;
-      if(rerunbothstages) {
-        // You would think that the entire processing is just stage 1 + stage 2, but
-        // empirically, that's not true! Not even close to be true in some instances.
-        event_count allstages_count;
-        collector.start();
-        result = parser.parse((const uint8_t *)json.data(), json.size(), pj);
-        if (result != simdjson::SUCCESS) {
-          exit_error(string("Failed to parse ") + filename + " during overall parsing " + pj.get_error_message());
-        }
-        allstages_count = collector.end();
-        all_stages << allstages_count;
-      } else {
-        // we are optimistic
-        all_stages << stage1_count + stage2_count;
-      }
+      all_stages << allocate_count + stage1_count + stage2_count;
     }
     // Calculate stats the first time we parse
     if (stats == NULL) {
@@ -344,9 +336,9 @@ struct benchmarker {
     }
   }
 
-  really_inline void run_iterations(size_t iterations, bool stage1_only, bool rerunbothstages) {
+  really_inline void run_iterations(size_t iterations, bool stage1_only, bool hotbuffers) {
     for (size_t i = 0; i<iterations; i++) {
-      run_iteration(stage1_only, rerunbothstages);
+      run_iteration(stage1_only, hotbuffers);
     }
   }
 
@@ -449,8 +441,11 @@ struct benchmarker {
       printf("\n");
       printf("All Stages\n");
       print_aggregate("|    "   , all_stages.best);
-      //          printf("|- Allocation\n");
-      // print_aggregate("|    ", allocate_stage.best);
+      // frequently, allocation is a tiny fraction of the running time so we omit it
+      if(allocate_stage.best.elapsed_sec() > 0.01 * all_stages.best.elapsed_sec()) {
+        printf("|- Allocation\n");
+        print_aggregate("|    ", allocate_stage.best);
+      }
               printf("|- Stage 1\n");
       print_aggregate("|    ", stage1.best);
               printf("|- Stage 2\n");
diff --git a/benchmark/parse.cpp b/benchmark/parse.cpp
@@ -70,12 +70,10 @@ void print_usage(ostream& out) {
   out << "-s STAGE   - Stop after the given stage." << endl;
   out << "             -s stage1  - Stop after find_structural_bits." << endl;
   out << "             -s all     - Run all stages." << endl;
-  out << "             -s allfast - Run all stages." << endl;
+  out << "-H         - Make the buffers hot (reduce page allocation during parsing)" << endl;
 
   out << "-a ARCH    - Use the parser with the designated architecture (HASWELL, WESTMERE" << endl;
   out << "             or ARM64). By default, detects best supported architecture." << endl;
-  out << "-o         - Estimate the overall speed as stage 1 + stage 2 instead of a rerun of both" << endl;
-
 }
 
 void exit_usage(string message) {
@@ -95,13 +93,13 @@ struct option_struct {
 
   bool verbose = false;
   bool tabbed_output = false;
-  bool rerunbothstages = true;
+  bool hotbuffers = false;
 
   option_struct(int argc, char **argv) {
     #ifndef _MSC_VER
       int c;
 
-      while ((c = getopt(argc, argv, "vtn:i:a:s:")) != -1) {
+      while ((c = getopt(argc, argv, "vtn:i:a:s:H")) != -1) {
         switch (c) {
         case 'n':
           iterations = atoi(optarg);
@@ -121,15 +119,14 @@ struct option_struct {
             exit_usage(string("Unsupported option value -a ") + optarg + ": expected -a HASWELL, WESTMERE or ARM64");
           }
           break;
+        case 'H':
+          hotbuffers = true;
+          break;
         case 's':
           if (!strcmp(optarg, "stage1")) {
             stage1_only = true;
           } else if (!strcmp(optarg, "all")) {
             stage1_only = false;
-            rerunbothstages = true; // for safety
-          } else if (!strcmp(optarg, "allfast")) {
-            stage1_only = false;
-            rerunbothstages = false;
           } else {
             exit_usage(string("Unsupported option value -s ") + optarg + ": expected -s stage1 or all");
           }
@@ -204,7 +201,7 @@ int main(int argc, char *argv[]) {
       // Benchmark each file once per iteration
       for (size_t f=0; f<options.files.size(); f++) {
         verbose() << "[verbose] " << benchmarkers[f]->filename << " iterations #" << iteration << "-" << (iteration+options.iteration_step-1) << endl;
-        benchmarkers[f]->run_iterations(options.iteration_step, true, false);
+        benchmarkers[f]->run_iterations(options.iteration_step, true, options.hotbuffers);
       }
     }
   } else {
@@ -213,7 +210,7 @@ int main(int argc, char *argv[]) {
       // Benchmark each file once per iteration
       for (size_t f=0; f<options.files.size(); f++) {
         verbose() << "[verbose] " << benchmarkers[f]->filename << " iterations #" << iteration << "-" << (iteration+options.iteration_step-1) << endl;
-        benchmarkers[f]->run_iterations(options.iteration_step, false, options.rerunbothstages);
+        benchmarkers[f]->run_iterations(options.iteration_step, false, options.hotbuffers);
       }
     }
   }