Skip to content

Commit 34010ac

Browse files
committed
Improve detection of child-process SIGPIPE failures.
Commit ffa4cbd added logic to detect SIGPIPE failure of a COPY child process, but it only worked correctly if the SIGPIPE occurred in the immediate child process. Depending on the shell in use and the complexity of the shell command string, we might instead get back an exit code of 128 + SIGPIPE, representing a shell error exit reporting SIGPIPE in the child process. We could just hack up ClosePipeToProgram() to add the extra case, but it seems like this is a fairly general issue deserving a more general and better-documented solution. I chose to add a couple of functions in src/common/wait_error.c, which is a natural place to know about wait-result encodings, that will test for either a specific child-process signal type or any child-process signal failure. Then, adjust other places that were doing ad-hoc tests of this type to use the common functions. In RestoreArchivedFile, this fixes a race condition affecting whether the process will report an error or just silently proc_exit(1): before, that depended on whether the intermediate shell got SIGTERM'd itself or reported a child process failing on SIGTERM. Like the previous patch, back-patch to v10; we could go further but there seems no real need to. Per report from Erik Rijkers. Discussion: https://postgr.es/m/f3683f87ab1701bea5d86a7742b22432@xs4all.nl
1 parent 8b90174 commit 34010ac

File tree

5 files changed

+55
-23
lines changed

5 files changed

+55
-23
lines changed

src/backend/access/transam/xlogarchive.c

Lines changed: 5 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@ RestoreArchivedFile(char *path, const char *xlogfname,
5959
char *endp;
6060
const char *sp;
6161
int rc;
62-
bool signaled;
6362
struct stat stat_buf;
6463
XLogSegNo restartSegNo;
6564
XLogRecPtr restartRedoPtr;
@@ -288,17 +287,12 @@ RestoreArchivedFile(char *path, const char *xlogfname,
288287
* will perform an immediate shutdown when it sees us exiting
289288
* unexpectedly.
290289
*
291-
* Per the Single Unix Spec, shells report exit status > 128 when a called
292-
* command died on a signal. Also, 126 and 127 are used to report
293-
* problems such as an unfindable command; treat those as fatal errors
294-
* too.
290+
* We treat hard shell errors such as "command not found" as fatal, too.
295291
*/
296-
if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM)
292+
if (wait_result_is_signal(rc, SIGTERM))
297293
proc_exit(1);
298294

299-
signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
300-
301-
ereport(signaled ? FATAL : DEBUG2,
295+
ereport(wait_result_is_any_signal(rc, true) ? FATAL : DEBUG2,
302296
(errmsg("could not restore file \"%s\" from archive: %s",
303297
xlogfname, wait_result_to_str(rc))));
304298

@@ -334,7 +328,6 @@ ExecuteRecoveryCommand(char *command, char *commandName, bool failOnSignal)
334328
char *endp;
335329
const char *sp;
336330
int rc;
337-
bool signaled;
338331
XLogSegNo restartSegNo;
339332
XLogRecPtr restartRedoPtr;
340333
TimeLineID restartTli;
@@ -401,12 +394,9 @@ ExecuteRecoveryCommand(char *command, char *commandName, bool failOnSignal)
401394
{
402395
/*
403396
* If the failure was due to any sort of signal, it's best to punt and
404-
* abort recovery. See also detailed comments on signals in
405-
* RestoreArchivedFile().
397+
* abort recovery. See comments in RestoreArchivedFile().
406398
*/
407-
signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
408-
409-
ereport((signaled && failOnSignal) ? FATAL : WARNING,
399+
ereport((failOnSignal && wait_result_is_any_signal(rc, true)) ? FATAL : WARNING,
410400
/*------
411401
translator: First %s represents a recovery.conf parameter name like
412402
"recovery_end_command", the 2nd is the value of that parameter, the

src/backend/commands/copy.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
#include <ctype.h>
1818
#include <unistd.h>
1919
#include <sys/stat.h>
20-
#include <sys/wait.h>
2120
#include <netinet/in.h>
2221
#include <arpa/inet.h>
2322

@@ -1713,7 +1712,7 @@ ClosePipeToProgram(CopyState cstate)
17131712
* problem.
17141713
*/
17151714
if (cstate->is_copy_from && !cstate->reached_eof &&
1716-
WIFSIGNALED(pclose_rc) && WTERMSIG(pclose_rc) == SIGPIPE)
1715+
wait_result_is_signal(pclose_rc, SIGPIPE))
17171716
return;
17181717

17191718
ereport(ERROR,

src/backend/postmaster/pgarch.c

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -573,13 +573,11 @@ pgarch_archiveXlog(char *xlog)
573573
* If either the shell itself, or a called command, died on a signal,
574574
* abort the archiver. We do this because system() ignores SIGINT and
575575
* SIGQUIT while waiting; so a signal is very likely something that
576-
* should have interrupted us too. If we overreact it's no big deal,
577-
* the postmaster will just start the archiver again.
578-
*
579-
* Per the Single Unix Spec, shells report exit status > 128 when a
580-
* called command died on a signal.
576+
* should have interrupted us too. Also die if the shell got a hard
577+
* "command not found" type of error. If we overreact it's no big
578+
* deal, the postmaster will just start the archiver again.
581579
*/
582-
int lev = (WIFSIGNALED(rc) || WEXITSTATUS(rc) > 128) ? FATAL : LOG;
580+
int lev = wait_result_is_any_signal(rc, true) ? FATAL : LOG;
583581

584582
if (WIFEXITED(rc))
585583
{

src/common/wait_error.c

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,3 +82,46 @@ wait_result_to_str(int exitstatus)
8282

8383
return pstrdup(str);
8484
}
85+
86+
/*
87+
* Return true if a wait(2) result indicates that the child process
88+
* died due to the specified signal.
89+
*
90+
* The reason this is worth having a wrapper function for is that
91+
* there are two cases: the signal might have been received by our
92+
* immediate child process, or there might've been a shell process
93+
* between us and the child that died. The shell will, per POSIX,
94+
* report the child death using exit code 128 + signal number.
95+
*
96+
* If there is no possibility of an intermediate shell, this function
97+
* need not (and probably should not) be used.
98+
*/
99+
bool
100+
wait_result_is_signal(int exit_status, int signum)
101+
{
102+
if (WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum)
103+
return true;
104+
if (WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == 128 + signum)
105+
return true;
106+
return false;
107+
}
108+
109+
/*
110+
* Return true if a wait(2) result indicates that the child process
111+
* died due to any signal. We consider either direct child death
112+
* or a shell report of child process death as matching the condition.
113+
*
114+
* If include_command_not_found is true, also return true for shell
115+
* exit codes indicating "command not found" and the like
116+
* (specifically, exit codes 126 and 127; see above).
117+
*/
118+
bool
119+
wait_result_is_any_signal(int exit_status, bool include_command_not_found)
120+
{
121+
if (WIFSIGNALED(exit_status))
122+
return true;
123+
if (WIFEXITED(exit_status) &&
124+
WEXITSTATUS(exit_status) > (include_command_not_found ? 125 : 128))
125+
return true;
126+
return false;
127+
}

src/include/port.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -483,5 +483,7 @@ extern char *escape_single_quotes_ascii(const char *src);
483483

484484
/* port/wait_error.c */
485485
extern char *wait_result_to_str(int exit_status);
486+
extern bool wait_result_is_signal(int exit_status, int signum);
487+
extern bool wait_result_is_any_signal(int exit_status, bool include_command_not_found);
486488

487489
#endif /* PG_PORT_H */

0 commit comments

Comments
 (0)