Skip to content

Commit 6718f07

Browse files
committed
Fix postmaster's handling of a startup-process crash.
Ordinarily, a failure (unexpected exit status) of the startup subprocess should be considered fatal, so the postmaster should just close up shop and quit. However, if we sent the startup process a SIGQUIT or SIGKILL signal, the failure is hardly "unexpected", and we should attempt restart; this is necessary for recovery from ordinary backend crashes in hot-standby scenarios.

I attempted to implement the latter rule with a two-line patch in commit 442231d, but it now emerges that that patch was a few bricks shy of a load: it failed to distinguish the case of a signaled startup process from the case where the new startup process crashes before reaching database consistency. That resulted in infinitely respawning a new startup process only to have it crash again.

To handle this properly, we really must track whether we have sent the *current* startup process a kill signal. Rather than add yet another ad-hoc boolean to the postmaster's state, I chose to unify this with the existing RecoveryError flag into an enum tracking the startup process's state. That seems more consistent with the postmaster's general state machine design.

Back-patch to 9.0, like the previous patch.
1 parent 3386399 commit 6718f07

File tree

1 file changed

+31
-13
lines changed

1 file changed

+31
-13
lines changed

src/backend/postmaster/postmaster.c

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,17 @@ static pid_t StartupPID = 0,
218218
PgStatPID = 0,
219219
SysLoggerPID = 0;
220220

221+
/* Startup process's status */
222+
typedef enum
223+
{
224+
STARTUP_NOT_RUNNING,
225+
STARTUP_RUNNING,
226+
STARTUP_SIGNALED, /* we sent it a SIGQUIT or SIGKILL */
227+
STARTUP_CRASHED
228+
} StartupStatusEnum;
229+
230+
static StartupStatusEnum StartupStatus = STARTUP_NOT_RUNNING;
231+
221232
/* Startup/shutdown state */
222233
#define NoShutdown 0
223234
#define SmartShutdown 1
@@ -226,7 +237,6 @@ static pid_t StartupPID = 0,
226237
static int Shutdown = NoShutdown;
227238

228239
static bool FatalError = false; /* T if recovering from backend crash */
229-
static bool RecoveryError = false; /* T if WAL recovery failed */
230240

231241
/*
232242
* We use a simple state machine to control startup, shutdown, and
@@ -269,8 +279,6 @@ static bool RecoveryError = false; /* T if WAL recovery failed */
269279
* states, nor in PM_SHUTDOWN states (because we don't enter those states
270280
* when trying to recover from a crash). It can be true in PM_STARTUP state,
271281
* because we don't clear it until we've successfully started WAL redo.
272-
* Similarly, RecoveryError means that we have crashed during recovery, and
273-
* should not try to restart.
274282
*/
275283
typedef enum
276284
{
@@ -1115,6 +1123,7 @@ PostmasterMain(int argc, char *argv[])
11151123
*/
11161124
StartupPID = StartupDataBase();
11171125
Assert(StartupPID != 0);
1126+
StartupStatus = STARTUP_RUNNING;
11181127
pmState = PM_STARTUP;
11191128

11201129
status = ServerLoop();
@@ -2381,6 +2390,7 @@ reaper(SIGNAL_ARGS)
23812390
if (Shutdown > NoShutdown &&
23822391
(EXIT_STATUS_0(exitstatus) || EXIT_STATUS_1(exitstatus)))
23832392
{
2393+
StartupStatus = STARTUP_NOT_RUNNING;
23842394
pmState = PM_WAIT_BACKENDS;
23852395
/* PostmasterStateMachine logic does the rest */
23862396
continue;
@@ -2403,16 +2413,18 @@ reaper(SIGNAL_ARGS)
24032413
/*
24042414
* After PM_STARTUP, any unexpected exit (including FATAL exit) of
24052415
* the startup process is catastrophic, so kill other children,
2406-
* and set RecoveryError so we don't try to reinitialize after
2407-
* they're gone. Exception: if FatalError is already set, that
2408-
* implies we previously sent the startup process a SIGQUIT, so
2416+
* and set StartupStatus so we don't try to reinitialize after
2417+
* they're gone. Exception: if StartupStatus is STARTUP_SIGNALED,
2418+
* then we previously sent the startup process a SIGQUIT; so
24092419
* that's probably the reason it died, and we do want to try to
24102420
* restart in that case.
24112421
*/
24122422
if (!EXIT_STATUS_0(exitstatus))
24132423
{
2414-
if (!FatalError)
2415-
RecoveryError = true;
2424+
if (StartupStatus == STARTUP_SIGNALED)
2425+
StartupStatus = STARTUP_NOT_RUNNING;
2426+
else
2427+
StartupStatus = STARTUP_CRASHED;
24162428
HandleChildCrash(pid, exitstatus,
24172429
_("startup process"));
24182430
continue;
@@ -2421,6 +2433,7 @@ reaper(SIGNAL_ARGS)
24212433
/*
24222434
* Startup succeeded, commence normal operations
24232435
*/
2436+
StartupStatus = STARTUP_NOT_RUNNING;
24242437
FatalError = false;
24252438
ReachedNormalRunning = true;
24262439
pmState = PM_RUN;
@@ -2756,14 +2769,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
27562769

27572770
/* Take care of the startup process too */
27582771
if (pid == StartupPID)
2772+
{
27592773
StartupPID = 0;
2774+
StartupStatus = STARTUP_CRASHED;
2775+
}
27602776
else if (StartupPID != 0 && !FatalError)
27612777
{
27622778
ereport(DEBUG2,
27632779
(errmsg_internal("sending %s to process %d",
27642780
(SendStop ? "SIGSTOP" : "SIGQUIT"),
27652781
(int) StartupPID)));
27662782
signal_child(StartupPID, (SendStop ? SIGSTOP : SIGQUIT));
2783+
StartupStatus = STARTUP_SIGNALED;
27672784
}
27682785

27692786
/* Take care of the bgwriter too */
@@ -3110,12 +3127,12 @@ PostmasterStateMachine(void)
31103127
}
31113128

31123129
/*
3113-
* If recovery failed, wait for all non-syslogger children to exit, and
3114-
* then exit postmaster. We don't try to reinitialize when recovery fails,
3115-
* because more than likely it will just fail again and we will keep
3116-
* trying forever.
3130+
* If the startup process failed, wait for all non-syslogger children to
3131+
* exit, and then exit postmaster. We don't try to reinitialize when the
3132+
* startup process fails, because more than likely it will just fail again
3133+
* and we will keep trying forever.
31173134
*/
3118-
if (RecoveryError && pmState == PM_NO_CHILDREN)
3135+
if (pmState == PM_NO_CHILDREN && StartupStatus == STARTUP_CRASHED)
31193136
ExitPostmaster(1);
31203137

31213138
/*
@@ -3132,6 +3149,7 @@ PostmasterStateMachine(void)
31323149

31333150
StartupPID = StartupDataBase();
31343151
Assert(StartupPID != 0);
3152+
StartupStatus = STARTUP_RUNNING;
31353153
pmState = PM_STARTUP;
31363154
}
31373155
}

0 commit comments

Comments
 (0)