22
22
PROGRESS_UPDATE = 30.0 # seconds
23
23
assert PROGRESS_UPDATE >= PROGRESS_MIN_TIME
24
24
25
+ # Kill the main process after 5 minutes. It is supposed to write an update
26
+ # every PROGRESS_UPDATE seconds. Tolerate 5 minutes for Python slowest
27
+ # buildbot workers.
28
+ MAIN_PROCESS_TIMEOUT = 5 * 60.0
29
+ assert MAIN_PROCESS_TIMEOUT >= PROGRESS_UPDATE
30
+
25
31
# Time to wait until a worker completes: should be immediate
26
32
JOIN_TIMEOUT = 30.0 # seconds
27
33
@@ -160,20 +166,6 @@ def mp_result_error(self, test_name, error_type, stdout='', stderr='',
160
166
result = TestResult (test_name , error_type , test_time , None )
161
167
return MultiprocessResult (result , stdout , stderr , err_msg )
162
168
163
- def _timedout (self , test_name ):
164
- self ._kill ()
165
-
166
- stdout = stderr = ''
167
- popen = self ._popen
168
- try :
169
- stdout , stderr = popen .communicate (timeout = JOIN_TIMEOUT )
170
- except (subprocess .TimeoutExpired , OSError ) as exc :
171
- print_warning (f"Failed to read { self } output "
172
- f"(timeout={ format_duration (JOIN_TIMEOUT )} ): "
173
- f"{ exc !r} " )
174
-
175
- return self .mp_result_error (test_name , TIMEOUT , stdout , stderr )
176
-
177
169
def _run_process (self , test_name ):
178
170
self .start_time = time .monotonic ()
179
171
@@ -197,23 +189,32 @@ def _run_process(self, test_name):
197
189
198
190
try :
199
191
stdout , stderr = popen .communicate (timeout = self .timeout )
192
+ retcode = popen .returncode
193
+ assert retcode is not None
200
194
except subprocess .TimeoutExpired :
201
195
if self ._stopped :
202
196
# kill() has been called: communicate() fails
203
197
# on reading closed stdout/stderr
204
198
raise ExitThread
205
199
206
- return self ._timedout (test_name )
200
+ # On timeout, kill the process
201
+ self ._kill ()
202
+
203
+ # None means TIMEOUT for the caller
204
+ retcode = None
205
+ # bpo-38207: Don't attempt to call communicate() again: on it
206
+ # can hang until all child processes using stdout and stderr
207
+ # pipes completes.
208
+ stdout = stderr = ''
207
209
except OSError :
208
210
if self ._stopped :
209
211
# kill() has been called: communicate() fails
210
212
# on reading closed stdout/stderr
211
213
raise ExitThread
212
214
raise
213
-
214
- retcode = popen .returncode
215
- stdout = stdout .strip ()
216
- stderr = stderr .rstrip ()
215
+ else :
216
+ stdout = stdout .strip ()
217
+ stderr = stderr .rstrip ()
217
218
218
219
return (retcode , stdout , stderr )
219
220
except :
@@ -225,13 +226,10 @@ def _run_process(self, test_name):
225
226
self .current_test_name = None
226
227
227
228
def _runtest (self , test_name ):
228
- result = self ._run_process (test_name )
229
-
230
- if isinstance (result , MultiprocessResult ):
231
- # _timedout() case
232
- return result
229
+ retcode , stdout , stderr = self ._run_process (test_name )
233
230
234
- retcode , stdout , stderr = result
231
+ if retcode is None :
232
+ return self .mp_result_error (test_name , TIMEOUT , stdout , stderr )
235
233
236
234
err_msg = None
237
235
if retcode != 0 :
@@ -250,7 +248,8 @@ def _runtest(self, test_name):
250
248
err_msg = "Failed to parse worker JSON: %s" % exc
251
249
252
250
if err_msg is not None :
253
- return self .mp_result_error (test_name , CHILD_ERROR , stdout , stderr , err_msg )
251
+ return self .mp_result_error (test_name , CHILD_ERROR ,
252
+ stdout , stderr , err_msg )
254
253
255
254
return MultiprocessResult (result , stdout , stderr , err_msg )
256
255
@@ -289,6 +288,16 @@ def _wait_completed(self):
289
288
f"{ exc !r} " )
290
289
291
290
def wait_stopped (self , start_time ):
291
+ # bpo-38207: MultiprocessTestRunner.stop_workers() called self.stop()
292
+ # which killed the process. Sometimes, killing the process from the
293
+ # main thread does not interrupt popen.communicate() in
294
+ # TestWorkerProcess thread. This loop with a timeout is a workaround
295
+ # for that.
296
+ #
297
+ # Moreover, if this method fails to join the thread, it is likely
298
+ # that Python will hang at exit while calling threading._shutdown()
299
+ # which tries again to join the blocked thread. Regrtest.main()
300
+ # uses EXIT_TIMEOUT to workaround this second bug.
292
301
while True :
293
302
# Write a message every second
294
303
self .join (1.0 )
@@ -355,7 +364,8 @@ def _get_result(self):
355
364
timeout = PROGRESS_UPDATE
356
365
while True :
357
366
if use_faulthandler :
358
- faulthandler .dump_traceback_later (timeout * 2.0 , exit = True )
367
+ faulthandler .dump_traceback_later (MAIN_PROCESS_TIMEOUT ,
368
+ exit = True )
359
369
360
370
# wait for a thread
361
371
try :
0 commit comments