Skip to content

Commit 0ec618a

Browse files
authored
bpo-37531: regrtest ignores output on timeout (GH-16659)
bpo-37531, bpo-38207: On timeout, regrtest no longer attempts to call `popen.communicate() again: it can hang until all child processes using stdout and stderr pipes completes. Kill the worker process and ignores its output. Reenable test_regrtest.test_multiprocessing_timeout(). bpo-37531: Change also the faulthandler timeout of the main process from 1 minute to 5 minutes, for Python slowest buildbots.
1 parent e53c580 commit 0ec618a

File tree

3 files changed

+42
-28
lines changed

3 files changed

+42
-28
lines changed

Lib/test/libregrtest/runtest_mp.py

Lines changed: 37 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,12 @@
2222
PROGRESS_UPDATE = 30.0 # seconds
2323
assert PROGRESS_UPDATE >= PROGRESS_MIN_TIME
2424

25+
# Kill the main process after 5 minutes. It is supposed to write an update
26+
# every PROGRESS_UPDATE seconds. Tolerate 5 minutes for Python slowest
27+
# buildbot workers.
28+
MAIN_PROCESS_TIMEOUT = 5 * 60.0
29+
assert MAIN_PROCESS_TIMEOUT >= PROGRESS_UPDATE
30+
2531
# Time to wait until a worker completes: should be immediate
2632
JOIN_TIMEOUT = 30.0 # seconds
2733

@@ -160,20 +166,6 @@ def mp_result_error(self, test_name, error_type, stdout='', stderr='',
160166
result = TestResult(test_name, error_type, test_time, None)
161167
return MultiprocessResult(result, stdout, stderr, err_msg)
162168

163-
def _timedout(self, test_name):
164-
self._kill()
165-
166-
stdout = stderr = ''
167-
popen = self._popen
168-
try:
169-
stdout, stderr = popen.communicate(timeout=JOIN_TIMEOUT)
170-
except (subprocess.TimeoutExpired, OSError) as exc:
171-
print_warning(f"Failed to read {self} output "
172-
f"(timeout={format_duration(JOIN_TIMEOUT)}): "
173-
f"{exc!r}")
174-
175-
return self.mp_result_error(test_name, TIMEOUT, stdout, stderr)
176-
177169
def _run_process(self, test_name):
178170
self.start_time = time.monotonic()
179171

@@ -197,23 +189,32 @@ def _run_process(self, test_name):
197189

198190
try:
199191
stdout, stderr = popen.communicate(timeout=self.timeout)
192+
retcode = popen.returncode
193+
assert retcode is not None
200194
except subprocess.TimeoutExpired:
201195
if self._stopped:
202196
# kill() has been called: communicate() fails
203197
# on reading closed stdout/stderr
204198
raise ExitThread
205199

206-
return self._timedout(test_name)
200+
# On timeout, kill the process
201+
self._kill()
202+
203+
# None means TIMEOUT for the caller
204+
retcode = None
205+
# bpo-38207: Don't attempt to call communicate() again: on it
206+
# can hang until all child processes using stdout and stderr
207+
# pipes completes.
208+
stdout = stderr = ''
207209
except OSError:
208210
if self._stopped:
209211
# kill() has been called: communicate() fails
210212
# on reading closed stdout/stderr
211213
raise ExitThread
212214
raise
213-
214-
retcode = popen.returncode
215-
stdout = stdout.strip()
216-
stderr = stderr.rstrip()
215+
else:
216+
stdout = stdout.strip()
217+
stderr = stderr.rstrip()
217218

218219
return (retcode, stdout, stderr)
219220
except:
@@ -225,13 +226,10 @@ def _run_process(self, test_name):
225226
self.current_test_name = None
226227

227228
def _runtest(self, test_name):
228-
result = self._run_process(test_name)
229-
230-
if isinstance(result, MultiprocessResult):
231-
# _timedout() case
232-
return result
229+
retcode, stdout, stderr = self._run_process(test_name)
233230

234-
retcode, stdout, stderr = result
231+
if retcode is None:
232+
return self.mp_result_error(test_name, TIMEOUT, stdout, stderr)
235233

236234
err_msg = None
237235
if retcode != 0:
@@ -250,7 +248,8 @@ def _runtest(self, test_name):
250248
err_msg = "Failed to parse worker JSON: %s" % exc
251249

252250
if err_msg is not None:
253-
return self.mp_result_error(test_name, CHILD_ERROR, stdout, stderr, err_msg)
251+
return self.mp_result_error(test_name, CHILD_ERROR,
252+
stdout, stderr, err_msg)
254253

255254
return MultiprocessResult(result, stdout, stderr, err_msg)
256255

@@ -289,6 +288,16 @@ def _wait_completed(self):
289288
f"{exc!r}")
290289

291290
def wait_stopped(self, start_time):
291+
# bpo-38207: MultiprocessTestRunner.stop_workers() called self.stop()
292+
# which killed the process. Sometimes, killing the process from the
293+
# main thread does not interrupt popen.communicate() in
294+
# TestWorkerProcess thread. This loop with a timeout is a workaround
295+
# for that.
296+
#
297+
# Moreover, if this method fails to join the thread, it is likely
298+
# that Python will hang at exit while calling threading._shutdown()
299+
# which tries again to join the blocked thread. Regrtest.main()
300+
# uses EXIT_TIMEOUT to workaround this second bug.
292301
while True:
293302
# Write a message every second
294303
self.join(1.0)
@@ -355,7 +364,8 @@ def _get_result(self):
355364
timeout = PROGRESS_UPDATE
356365
while True:
357366
if use_faulthandler:
358-
faulthandler.dump_traceback_later(timeout * 2.0, exit=True)
367+
faulthandler.dump_traceback_later(MAIN_PROCESS_TIMEOUT,
368+
exit=True)
359369

360370
# wait for a thread
361371
try:

Lib/test/test_regrtest.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1153,7 +1153,6 @@ def test_garbage(self):
11531153
env_changed=[testname],
11541154
fail_env_changed=True)
11551155

1156-
@unittest.skipIf(True, 'bpo-37531, bpo-38207: test hangs randomly')
11571156
def test_multiprocessing_timeout(self):
11581157
code = textwrap.dedent(r"""
11591158
import time
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
On timeout, regrtest no longer attempts to call ``popen.communicate()``
2+
again: it can hang until all child processes using stdout and stderr pipes
3+
completes. Kill the worker process and ignores its output. Change also the
4+
faulthandler timeout of the main process from 1 minute to 5 minutes, for Python
5+
slowest buildbots.

0 commit comments

Comments
 (0)