@@ -63,22 +63,9 @@ double readThroughput(std::string parseOutput) {
63
63
}
64
64
65
65
// Number of interleaved (reference vs. new) measurement pairs taken per trial.
// Integral count: was `const double`, but it is only ever used as a loop bound,
// so `constexpr int` avoids a float comparison on every iteration.
constexpr int INTERLEAVED_ATTEMPTS = 7;
// Upper bound on how many times the whole test battery is rerun before we
// commit to a degradation/gain verdict (see the reproducibility note in main).
constexpr int MAX_TRIAL_COUNT = 5;
66
67
67
- int main (int argc, const char *argv[]) {
68
- if (argc < 3 ) {
69
- std::cerr << " Usage: " << argv[0 ] << " <old parse exe> <new parse exe> [<parse arguments>]" << std::endl;
70
- return 1 ;
71
- }
72
-
73
- std::string newCommand = argv[1 ];
74
- std::string refCommand = argv[2 ];
75
- for (int i=3 ; i<argc; i++) {
76
- newCommand += " " ;
77
- newCommand += argv[i];
78
- refCommand += " " ;
79
- refCommand += argv[i];
80
- }
81
-
68
+ void run_tests (const std::string refCommand, const std::string newCommand, double &worseref, double &bestref, double &worsenewcode, double &bestnewcode) {
82
69
std::vector<double > ref;
83
70
std::vector<double > newcode;
84
71
for (int attempt=0 ; attempt < INTERLEAVED_ATTEMPTS; attempt++) {
@@ -95,20 +82,79 @@ int main(int argc, const char *argv[]) {
95
82
ref.push_back (referenceThroughput);
96
83
}
97
84
// we check if the maximum of newcode is lower than minimum of ref, if so we have a problem so fail!
98
- double worseref = *std::min_element (ref.begin (), ref.end ());
99
- double bestnewcode = *std::max_element (newcode.begin (), newcode.end ());
100
- double bestref = *std::max_element (ref.begin (), ref.end ());
101
- double worsenewcode = *std::min_element (newcode.begin (), newcode.end ());
85
+ worseref = *std::min_element (ref.begin (), ref.end ());
86
+ bestnewcode = *std::max_element (newcode.begin (), newcode.end ());
87
+ bestref = *std::max_element (ref.begin (), ref.end ());
88
+ worsenewcode = *std::min_element (newcode.begin (), newcode.end ());
102
89
std::cout << " The new code has a throughput in " << worsenewcode << " -- " << bestnewcode << std::endl;
103
90
std::cout << " The reference code has a throughput in " << worseref << " -- " << bestref << std::endl;
104
- if (bestnewcode < worseref) {
105
- std::cerr << " You probably have a performance degradation." << std::endl;
106
- return EXIT_FAILURE;
91
+ }
92
+
93
+
94
+ int main (int argc, const char *argv[]) {
95
+ if (argc < 3 ) {
96
+ std::cerr << " Usage: " << argv[0 ] << " <old parse exe> <new parse exe> [<parse arguments>]" << std::endl;
97
+ return 1 ;
98
+ }
99
+
100
+ std::string newCommand = argv[1 ];
101
+ std::string refCommand = argv[2 ];
102
+ for (int i=3 ; i<argc; i++) {
103
+ newCommand += " " ;
104
+ newCommand += argv[i];
105
+ refCommand += " " ;
106
+ refCommand += argv[i];
107
107
}
108
- if (bestnewcode < worseref) {
109
- std::cout << " You probably have a performance gain." << std::endl;
110
- return EXIT_SUCCESS;
108
+ double worseref, bestref, worsenewcode, bestnewcode;
109
+ /* *
110
+ * We take performance degradation seriously. When it occurs, we want
111
+ * to investigate it thoroughly. Theoretically, if INTERLEAVED_ATTEMPTS
112
+ * samples from one distribution are distinct from INTERLEAVED_ATTEMPTS
113
+ * from another distribution, then there should be a real difference.
114
+ * Unfortunately, in practice, we can get the impression that there are
115
+ * false positives. So the tool should make absolutely sure that the
116
+ * difference is entirely reproducible. So we require that it be
117
+ * able to reproduce it consistently MAX_TRIAL_COUNT times. Then it
118
+ * will be hard to argue with.
119
+ */
120
+ int degradation = 0 ;
121
+ int gain = 0 ;
122
+ int neutral = 0 ;
123
+
124
+ // at most, we will rerun the tests MAX_TRIAL_COUNT times
125
+ for (size_t trial = 0 ; trial < MAX_TRIAL_COUNT; trial++) {
126
+ run_tests (refCommand, newCommand, worseref, bestref, worsenewcode, bestnewcode);
127
+ if (bestnewcode < worseref) {
128
+ printf (" Possible degradation detected (%f %%)\n " , (worseref - bestnewcode) * 100.0 / worseref);
129
+ degradation++;
130
+ if (gain > 0 ) {
131
+ break ; // mixed results
132
+ }
133
+ // otherwise, continue to make sure that the bad result is not a fluke
134
+ } else if (bestref < worsenewcode) {
135
+ printf (" Possible gain detected (%f %%)\n " , (bestref - bestref) * 100.0 / bestref);
136
+ gain++;
137
+ if (degradation > 0 ) {
138
+ break ; // mixed results
139
+ }
140
+ // otherwise, continue to make sure that the good result is not a fluke
141
+ } else {
142
+ // Whenever no difference is detected, we cut short.
143
+ neutral++;
144
+ break ;
145
+ }
146
+ }
147
+ // If we have at least one neutral, we conclude that there is no difference.
148
+ // If we have mixed results, we conclude that there is no difference.
149
+ if (neutral > 0 || ((gain > 0 ) && (degradation > 0 )) ){
150
+ std::cout << " There may not be performance difference. A manual check might be needed." << std::endl;
151
+ return EXIT_SUCCESS;
111
152
}
112
- std::cout << " There is no obvious performance difference. A manual check might be needed." << std::endl;
113
- return EXIT_SUCCESS;
153
+ if (gain > 0 ) {
154
+ std::cout << " You may have a performance gain." << std::endl;
155
+ return EXIT_SUCCESS;
156
+ }
157
+
158
+ std::cerr << " You probably have a performance degradation." << std::endl;
159
+ return EXIT_FAILURE;
114
160
}
0 commit comments