@@ -63,22 +63,9 @@ double readThroughput(std::string parseOutput) {
63
63
}
64
64
65
65
// Number of interleaved (reference vs. new) measurement pairs taken per trial.
// Integral count: was `const double`, but it is only ever used as a loop bound,
// so `constexpr int` avoids a float comparison on every iteration.
constexpr int INTERLEAVED_ATTEMPTS = 7;
// Upper bound on how many times the whole test battery is rerun before we
// commit to a degradation/gain verdict (see the reproducibility note in main).
constexpr int MAX_TRIAL_COUNT = 5;
66
67
67
- int main (int argc, const char *argv[]) {
68
- if (argc < 3 ) {
69
- std::cerr << " Usage: " << argv[0 ] << " <old parse exe> <new parse exe> [<parse arguments>]" << std::endl;
70
- return 1 ;
71
- }
72
-
73
- std::string newCommand = argv[1 ];
74
- std::string refCommand = argv[2 ];
75
- for (int i=3 ; i<argc; i++) {
76
- newCommand += " " ;
77
- newCommand += argv[i];
78
- refCommand += " " ;
79
- refCommand += argv[i];
80
- }
81
-
68
+ void run_tests (const std::string refCommand, const std::string newCommand, double &worseref, double &bestref, double &worsenewcode, double &bestnewcode) {
82
69
std::vector<double > ref;
83
70
std::vector<double > newcode;
84
71
for (int attempt=0 ; attempt < INTERLEAVED_ATTEMPTS; attempt++) {
@@ -95,20 +82,79 @@ int main(int argc, const char *argv[]) {
95
82
ref.push_back (referenceThroughput);
96
83
}
97
84
// we check if the maximum of newcode is lower than minimum of ref, if so we have a problem so fail!
98
- double worseref = *std::min_element (ref.begin (), ref.end ());
99
- double bestnewcode = *std::max_element (newcode.begin (), newcode.end ());
100
- double bestref = *std::max_element (ref.begin (), ref.end ());
101
- double worsenewcode = *std::min_element (newcode.begin (), newcode.end ());
85
+ worseref = *std::min_element (ref.begin (), ref.end ());
86
+ bestnewcode = *std::max_element (newcode.begin (), newcode.end ());
87
+ bestref = *std::max_element (ref.begin (), ref.end ());
88
+ worsenewcode = *std::min_element (newcode.begin (), newcode.end ());
102
89
std::cout << " The new code has a throughput in " << worsenewcode << " -- " << bestnewcode << std::endl;
103
90
std::cout << " The reference code has a throughput in " << worseref << " -- " << bestref << std::endl;
104
- if (bestnewcode < worseref) {
105
- std::cerr << " You probably have a performance degradation." << std::endl;
106
- return EXIT_FAILURE;
91
+ }
92
+
93
+
94
+ int main (int argc, const char *argv[]) {
95
+ if (argc < 3 ) {
96
+ std::cerr << " Usage: " << argv[0 ] << " <old parse exe> <new parse exe> [<parse arguments>]" << std::endl;
97
+ return 1 ;
98
+ }
99
+
100
+ std::string newCommand = argv[1 ];
101
+ std::string refCommand = argv[2 ];
102
+ for (int i=3 ; i<argc; i++) {
103
+ newCommand += " " ;
104
+ newCommand += argv[i];
105
+ refCommand += " " ;
106
+ refCommand += argv[i];
107
107
}
108
- if (bestnewcode < worseref) {
109
- std::cout << " You probably have a performance gain." << std::endl;
110
- return EXIT_SUCCESS;
108
+ double worseref, bestref, worsenewcode, bestnewcode;
109
+ /* *
110
+ * We take performance degradation seriously. When it occurs, we want
111
+ * to investigate it thoroughly. Theoretically, if INTERLEAVED_ATTEMPTS
112
+ * samples from one distribution are distinct from INTERLEAVED_ATTEMPTS
113
+ * from another distribution, then there should be a real difference.
114
+ * Unfortunately, in practice, we can get the impression that there are
115
+ * false positives. So the tool should make absolutely sure that the
116
+ * difference is entirely reproducible. So we require that it be
117
+ * able to reproduce it consistently MAX_TRIAL_COUNT times. Then it
118
+ * will be hard to argue with.
119
+ */
120
+ int degradation = 0 ;
121
+ int gain = 0 ;
122
+ int neutral = 0 ;
123
+
124
+ // at most, we will rerun the tests MAX_TRIAL_COUNT times
125
+ for (size_t trial = 0 ; trial < MAX_TRIAL_COUNT; trial++) {
126
+ run_tests (refCommand, newCommand, worseref, bestref, worsenewcode, bestnewcode);
127
+ if (bestnewcode < worseref) {
128
+ printf (" Possible degradation detected (%f %%)\n " , (worseref - bestnewcode) * 100.0 / worseref);
129
+ degradation++;
130
+ if (gain > 0 ) {
131
+ break ; // mixed results
132
+ }
133
+ // otherwise, continue to make sure that the bad result is not a fluke
134
+ } else if (bestref < worsenewcode) {
135
+ printf (" Possible gain detected (%f %%)\n " , (bestref - bestref) * 100.0 / bestref);
136
+ gain++;
137
+ if (degradation > 0 ) {
138
+ break ; // mixed results
139
+ }
140
+ // otherwise, continue to make sure that the good result is not a fluke
141
+ } else {
142
+ // Whenever no difference is detected, we cut short.
143
+ neutral++;
144
+ break ;
145
+ }
146
+ }
147
+ // If we have at least one neutral, we conclude that there is no difference.
148
+ // If we have mixed results, we conclude that there is no difference.
149
+ if (neutral > 0 || ((gain > 0 ) && (degradation > 0 )) ){
150
+ std::cout << " There may not be performance difference. A manual check might be needed." << std::endl;
151
+ return EXIT_SUCCESS;
111
152
}
112
- std::cout << " There is no obvious performance difference. A manual check might be needed." << std::endl;
113
- return EXIT_SUCCESS;
153
+ if (gain > 0 ) {
154
+ std::cout << " You may have a performance gain." << std::endl;
155
+ return EXIT_SUCCESS;
156
+ }
157
+
158
+ std::cerr << " You probably have a performance degradation." << std::endl;
159
+ return EXIT_FAILURE;
114
160
}
0 commit comments