
Commit cc701e6

Making checkperf a lot more robust.
1 parent: 3c3a4db

File tree: 3 files changed (+78, −32 lines)


.circleci/config.yml

Lines changed: 2 additions & 2 deletions
@@ -94,12 +94,12 @@ jobs:
     description: Build and run tests on GCC 7 and AVX 2 with a cmake static build
     executor: gcc7
     environment: { CMAKE_FLAGS: -DSIMDJSON_GOOGLE_BENCHMARKS=ON }
-    steps: [ install_cmake, cmake_test_all, cmake_install_test ]
+    steps: [ install_cmake, cmake_test, cmake_install_test ]
   clang6:
     description: Build and run tests on clang 6 and AVX 2 with a cmake static build
     executor: clang6
     environment: { CMAKE_FLAGS: -DSIMDJSON_GOOGLE_BENCHMARKS=ON }
-    steps: [ cmake_test_all, cmake_install_test ]
+    steps: [ cmake_test, cmake_install_test ]
   # libcpp
   libcpp-clang9:
     description: Build and run tests on clang 6 and AVX 2 with a cmake static build and libc++

.drone.yml

Lines changed: 3 additions & 3 deletions
@@ -33,7 +33,7 @@ steps:
     CC: clang-6.0
     CXX: clang++-6.0
     BUILD_FLAGS: -- -j
-    CTEST_FLAGS: -j4 --output-on-failure
+    CTEST_FLAGS: -j4 --output-on-failure -E checkperf
   commands:
   - mkdir build
   - cd build
@@ -168,7 +168,7 @@ steps:
     CXX: clang++-6.0
     CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=OFF
     BUILD_FLAGS: -- -j
-    CTEST_FLAGS: -j4 --output-on-failure
+    CTEST_FLAGS: -j4 --output-on-failure -E checkperf
   commands:
   - apt-get update -qq
   - apt-get install -y clang cmake git
@@ -210,7 +210,7 @@ steps:
     CXX: clang++-6.0
     CMAKE_FLAGS: -DSIMDJSON_BUILD_STATIC=OFF
     BUILD_FLAGS: -- -j
-    CTEST_FLAGS: -j4 --output-on-failure
+    CTEST_FLAGS: -j4 --output-on-failure -E checkperf
   commands:
   - apt-get update -qq
   - apt-get install -y clang cmake git
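
The `-E checkperf` flag added to CTEST_FLAGS makes ctest exclude any test whose name matches the regular expression `checkperf`, so these Drone jobs no longer run the noisy performance comparison at all. As a rough illustration of that exclude semantics (a behavioral sketch only, not ctest's actual implementation; the test names below are made up):

// Sketch: approximate how a "-E <regex>" exclude filter behaves.
#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main() {
  const std::vector<std::string> tests = {"basictests", "jsoncheck", "checkperf"};
  const std::regex exclude("checkperf"); // the pattern passed via -E
  for (const auto &name : tests) {
    if (std::regex_search(name, exclude)) {
      std::cout << name << ": excluded\n"; // skipped, as with ctest -E
    } else {
      std::cout << name << ": run\n";
    }
  }
  return 0;
}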

benchmark/perfdiff.cpp

Lines changed: 73 additions & 27 deletions
@@ -63,22 +63,9 @@ double readThroughput(std::string parseOutput) {
 }
 
 const double INTERLEAVED_ATTEMPTS = 7;
+const double MAX_TRIAL_COUNT = 5;
 
-int main(int argc, const char *argv[]) {
-  if (argc < 3) {
-    std::cerr << "Usage: " << argv[0] << " <old parse exe> <new parse exe> [<parse arguments>]" << std::endl;
-    return 1;
-  }
-
-  std::string newCommand = argv[1];
-  std::string refCommand = argv[2];
-  for (int i=3; i<argc; i++) {
-    newCommand += " ";
-    newCommand += argv[i];
-    refCommand += " ";
-    refCommand += argv[i];
-  }
-
+void run_tests(const std::string refCommand, const std::string newCommand, double &worseref, double &bestref, double &worsenewcode, double &bestnewcode) {
   std::vector<double> ref;
   std::vector<double> newcode;
   for (int attempt=0; attempt < INTERLEAVED_ATTEMPTS; attempt++) {
@@ -95,20 +82,79 @@ int main(int argc, const char *argv[]) {
     ref.push_back(referenceThroughput);
   }
   // we check if the maximum of newcode is lower than minimum of ref, if so we have a problem so fail!
-  double worseref = *std::min_element(ref.begin(), ref.end());
-  double bestnewcode = *std::max_element(newcode.begin(), newcode.end());
-  double bestref = *std::max_element(ref.begin(), ref.end());
-  double worsenewcode = *std::min_element(newcode.begin(), newcode.end());
+  worseref = *std::min_element(ref.begin(), ref.end());
+  bestnewcode = *std::max_element(newcode.begin(), newcode.end());
+  bestref = *std::max_element(ref.begin(), ref.end());
+  worsenewcode = *std::min_element(newcode.begin(), newcode.end());
   std::cout << "The new code has a throughput in " << worsenewcode << " -- " << bestnewcode << std::endl;
   std::cout << "The reference code has a throughput in " << worseref << " -- " << bestref << std::endl;
-  if(bestnewcode < worseref) {
-    std::cerr << "You probably have a performance degradation." << std::endl;
-    return EXIT_FAILURE;
+}
+
+int main(int argc, const char *argv[]) {
+  if (argc < 3) {
+    std::cerr << "Usage: " << argv[0] << " <old parse exe> <new parse exe> [<parse arguments>]" << std::endl;
+    return 1;
+  }
+
+  std::string newCommand = argv[1];
+  std::string refCommand = argv[2];
+  for (int i=3; i<argc; i++) {
+    newCommand += " ";
+    newCommand += argv[i];
+    refCommand += " ";
+    refCommand += argv[i];
   }
-  if(bestnewcode < worseref) {
-    std::cout << "You probably have a performance gain." << std::endl;
-    return EXIT_SUCCESS;
+  double worseref, bestref, worsenewcode, bestnewcode;
+  /**
+   * We take performance degradation seriously. When it occurs, we want
+   * to investigate it thoroughly. Theoretically, if INTERLEAVED_ATTEMPTS
+   * samples from one distribution are distinct from INTERLEAVED_ATTEMPTS
+   * samples from another distribution, then there should be a real difference.
+   * Unfortunately, in practice, we can get the impression that there are
+   * false positives. So the tool should make absolutely sure that the
+   * difference is entirely reproducible: we require that it be reproduced
+   * consistently over MAX_TRIAL_COUNT trials. Then it will be hard to
+   * argue with.
+   */
+  int degradation = 0;
+  int gain = 0;
+  int neutral = 0;
+
+  // at most, we will rerun the tests MAX_TRIAL_COUNT times
+  for(size_t trial = 0; trial < MAX_TRIAL_COUNT; trial++) {
+    run_tests(refCommand, newCommand, worseref, bestref, worsenewcode, bestnewcode);
+    if(bestnewcode < worseref) {
+      printf("Possible degradation detected (%f %%)\n", (worseref - bestnewcode) * 100.0 / worseref);
+      degradation++;
+      if(gain > 0) {
+        break; // mixed results
+      }
+      // otherwise, continue to make sure that the bad result is not a fluke
+    } else if(bestref < worsenewcode) {
+      printf("Possible gain detected (%f %%)\n", (worsenewcode - bestref) * 100.0 / bestref);
+      gain++;
+      if(degradation > 0) {
+        break; // mixed results
+      }
+      // otherwise, continue to make sure that the good result is not a fluke
+    } else {
+      // Whenever no difference is detected, we cut short.
+      neutral++;
+      break;
+    }
+  }
+  // If we have at least one neutral result, we conclude that there is no difference.
+  // If we have mixed results, we also conclude that there is no difference.
+  if(neutral > 0 || ((gain > 0) && (degradation > 0))) {
+    std::cout << "There may not be a performance difference. A manual check might be needed." << std::endl;
+    return EXIT_SUCCESS;
   }
-  std::cout << "There is no obvious performance difference. A manual check might be needed." << std::endl;
-  return EXIT_SUCCESS;
+  if(gain > 0) {
+    std::cout << "You may have a performance gain." << std::endl;
+    return EXIT_SUCCESS;
+  }
+
+  std::cerr << "You probably have a performance degradation." << std::endl;
+  return EXIT_FAILURE;
 }
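
For context on why the retry loop makes false alarms rare: with INTERLEAVED_ATTEMPTS = 7, a single trial flags a degradation only when all 7 new-code samples fall below all 7 reference samples. If the two binaries behaved identically, so that all 14 throughput measurements were (hypothetically) independent draws from one continuous distribution, the chance of that ordering would be 1/C(14,7) = 1/3432, roughly 0.03%; requiring the result to repeat over up to MAX_TRIAL_COUNT = 5 trials drives it lower still. In practice timing samples are correlated, which is exactly the false-positive risk the comment in the diff describes.

The diff elides how run_tests() collects each throughput sample; presumably it shells out to the two parse binaries and feeds their output to readThroughput(). A minimal sketch of such a helper, assuming POSIX popen() is available (the name exec and the details here are illustrative assumptions, not taken from this diff):

// Illustrative helper (not shown in this diff): run a command and
// capture its standard output so readThroughput() can parse it.
#include <cstdio>   // popen/pclose are POSIX, declared via <cstdio> on such systems
#include <memory>
#include <stdexcept>
#include <string>

std::string exec(const std::string &cmd) {
  std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd.c_str(), "r"), pclose);
  if (!pipe) {
    throw std::runtime_error("popen() failed for: " + cmd);
  }
  char buffer[128];
  std::string result;
  while (fgets(buffer, sizeof(buffer), pipe.get()) != nullptr) {
    result += buffer; // accumulate stdout from the benchmark run
  }
  return result;
}

Inside the measurement loop, each attempt would then alternate exec(newCommand) and exec(refCommand) so both binaries see similar machine conditions within each attempt, which is what the "interleaved" in INTERLEAVED_ATTEMPTS refers to.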
