Skip to content

Commit 1e8c5cf

Browse files
committed
Make pg_ctl stop/restart/promote recheck postmaster aliveness.
"pg_ctl stop/restart" checked that the postmaster PID is valid just once, as a side-effect of sending the stop signal, and then would wait-till-timeout for the postmaster.pid file to go away. This neglects the case wherein the postmaster dies uncleanly after we signal it. Similarly, once "pg_ctl promote" has sent the signal, it'd wait for the corresponding on-disk state change to occur even if the postmaster dies. I'm not sure how we've managed not to notice this problem, but it seems to explain slow execution of the 017_shm.pl test script on AIX since commit 4fdbf9a, which added a speculative "pg_ctl stop" with the idea of making real sure that the postmaster isn't there. In the test steps that kill-9 and then restart the postmaster, it's possible to get past the initial signal attempt before kill() stops working for the doomed postmaster. If that happens, pg_ctl waited till PGCTLTIMEOUT before giving up ... and the buildfarm's AIX members have that set very high. To fix, include a "kill(pid, 0)" test (similar to what postmaster_is_alive uses) in these wait loops, so that we'll give up immediately if the postmaster PID disappears. While here, I chose to refactor those loops out of where they were. do_stop() and do_restart() can perfectly well share one copy of the wait-for-stop loop, and it seems desirable to put a similar function beside that for wait-for-promote. Back-patch to all supported versions, since pg_ctl's wait logic is substantially identical in all, and we're seeing the slow test behavior in all branches. Discussion: https://postgr.es/m/20220210023537.GA3222837@rfd.leadboat.com
1 parent 92f60f5 commit 1e8c5cf

File tree

1 file changed

+80
-48
lines changed

1 file changed

+80
-48
lines changed

src/bin/pg_ctl/pg_ctl.c

Lines changed: 80 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,9 @@ static void free_readfile(char **optlines);
155155
static pgpid_t start_postmaster(void);
156156
static void read_post_opts(void);
157157

158-
static WaitPMResult wait_for_postmaster(pgpid_t pm_pid, bool do_checkpoint);
158+
static WaitPMResult wait_for_postmaster_start(pgpid_t pm_pid, bool do_checkpoint);
159+
static bool wait_for_postmaster_stop(void);
160+
static bool wait_for_postmaster_promote(void);
159161
static bool postmaster_is_alive(pid_t pid);
160162

161163
#if defined(HAVE_GETRLIMIT) && defined(RLIMIT_CORE)
@@ -590,7 +592,7 @@ start_postmaster(void)
590592
* manager checkpoint, it's got nothing to do with database checkpoints!!
591593
*/
592594
static WaitPMResult
593-
wait_for_postmaster(pgpid_t pm_pid, bool do_checkpoint)
595+
wait_for_postmaster_start(pgpid_t pm_pid, bool do_checkpoint)
594596
{
595597
int i;
596598

@@ -700,6 +702,76 @@ wait_for_postmaster(pgpid_t pm_pid, bool do_checkpoint)
700702
}
701703

702704

705+
/*
706+
* Wait for the postmaster to stop.
707+
*
708+
* Returns true if the postmaster stopped cleanly (i.e., removed its pidfile).
709+
* Returns false if the postmaster dies uncleanly, or if we time out.
710+
*/
711+
static bool
712+
wait_for_postmaster_stop(void)
713+
{
714+
int cnt;
715+
716+
for (cnt = 0; cnt < wait_seconds * WAITS_PER_SEC; cnt++)
717+
{
718+
pgpid_t pid;
719+
720+
if ((pid = get_pgpid(false)) == 0)
721+
return true; /* pid file is gone */
722+
723+
if (kill((pid_t) pid, 0) != 0)
724+
{
725+
/*
726+
* Postmaster seems to have died. Check the pid file once more to
727+
* avoid a race condition, but give up waiting.
728+
*/
729+
if (get_pgpid(false) == 0)
730+
return true; /* pid file is gone */
731+
return false; /* postmaster died untimely */
732+
}
733+
734+
if (cnt % WAITS_PER_SEC == 0)
735+
print_msg(".");
736+
pg_usleep(USEC_PER_SEC / WAITS_PER_SEC);
737+
}
738+
return false; /* timeout reached */
739+
}
740+
741+
742+
/*
743+
* Wait for the postmaster to promote.
744+
*
745+
* Returns true on success, else false.
746+
* To avoid waiting uselessly, we check for postmaster death here too.
747+
*/
748+
static bool
749+
wait_for_postmaster_promote(void)
750+
{
751+
int cnt;
752+
753+
for (cnt = 0; cnt < wait_seconds * WAITS_PER_SEC; cnt++)
754+
{
755+
pgpid_t pid;
756+
DBState state;
757+
758+
if ((pid = get_pgpid(false)) == 0)
759+
return false; /* pid file is gone */
760+
if (kill((pid_t) pid, 0) != 0)
761+
return false; /* postmaster died */
762+
763+
state = get_control_dbstate();
764+
if (state == DB_IN_PRODUCTION)
765+
return true; /* successful promotion */
766+
767+
if (cnt % WAITS_PER_SEC == 0)
768+
print_msg(".");
769+
pg_usleep(USEC_PER_SEC / WAITS_PER_SEC);
770+
}
771+
return false; /* timeout reached */
772+
}
773+
774+
703775
#if defined(HAVE_GETRLIMIT) && defined(RLIMIT_CORE)
704776
static void
705777
unlimit_core_size(void)
@@ -913,7 +985,7 @@ do_start(void)
913985

914986
print_msg(_("waiting for server to start..."));
915987

916-
switch (wait_for_postmaster(pm_pid, false))
988+
switch (wait_for_postmaster_start(pm_pid, false))
917989
{
918990
case POSTMASTER_READY:
919991
print_msg(_(" done\n"));
@@ -948,7 +1020,6 @@ do_start(void)
9481020
static void
9491021
do_stop(void)
9501022
{
951-
int cnt;
9521023
pgpid_t pid;
9531024
struct stat statbuf;
9541025

@@ -999,19 +1070,7 @@ do_stop(void)
9991070

10001071
print_msg(_("waiting for server to shut down..."));
10011072

1002-
for (cnt = 0; cnt < wait_seconds * WAITS_PER_SEC; cnt++)
1003-
{
1004-
if ((pid = get_pgpid(false)) != 0)
1005-
{
1006-
if (cnt % WAITS_PER_SEC == 0)
1007-
print_msg(".");
1008-
pg_usleep(USEC_PER_SEC / WAITS_PER_SEC);
1009-
}
1010-
else
1011-
break;
1012-
}
1013-
1014-
if (pid != 0) /* pid file still exists */
1073+
if (!wait_for_postmaster_stop())
10151074
{
10161075
print_msg(_(" failed\n"));
10171076

@@ -1035,7 +1094,6 @@ do_stop(void)
10351094
static void
10361095
do_restart(void)
10371096
{
1038-
int cnt;
10391097
pgpid_t pid;
10401098
struct stat statbuf;
10411099

@@ -1089,20 +1147,7 @@ do_restart(void)
10891147
print_msg(_("waiting for server to shut down..."));
10901148

10911149
/* always wait for restart */
1092-
1093-
for (cnt = 0; cnt < wait_seconds * WAITS_PER_SEC; cnt++)
1094-
{
1095-
if ((pid = get_pgpid(false)) != 0)
1096-
{
1097-
if (cnt % WAITS_PER_SEC == 0)
1098-
print_msg(".");
1099-
pg_usleep(USEC_PER_SEC / WAITS_PER_SEC);
1100-
}
1101-
else
1102-
break;
1103-
}
1104-
1105-
if (pid != 0) /* pid file still exists */
1150+
if (!wait_for_postmaster_stop())
11061151
{
11071152
print_msg(_(" failed\n"));
11081153

@@ -1222,21 +1267,8 @@ do_promote(void)
12221267

12231268
if (do_wait)
12241269
{
1225-
DBState state = DB_STARTUP;
1226-
int cnt;
1227-
12281270
print_msg(_("waiting for server to promote..."));
1229-
for (cnt = 0; cnt < wait_seconds * WAITS_PER_SEC; cnt++)
1230-
{
1231-
state = get_control_dbstate();
1232-
if (state == DB_IN_PRODUCTION)
1233-
break;
1234-
1235-
if (cnt % WAITS_PER_SEC == 0)
1236-
print_msg(".");
1237-
pg_usleep(USEC_PER_SEC / WAITS_PER_SEC);
1238-
}
1239-
if (state == DB_IN_PRODUCTION)
1271+
if (wait_for_postmaster_promote())
12401272
{
12411273
print_msg(_(" done\n"));
12421274
print_msg(_("server promoted\n"));
@@ -1655,7 +1687,7 @@ pgwin32_ServiceMain(DWORD argc, LPTSTR *argv)
16551687
if (do_wait)
16561688
{
16571689
write_eventlog(EVENTLOG_INFORMATION_TYPE, _("Waiting for server startup...\n"));
1658-
if (wait_for_postmaster(postmasterPID, true) != POSTMASTER_READY)
1690+
if (wait_for_postmaster_start(postmasterPID, true) != POSTMASTER_READY)
16591691
{
16601692
write_eventlog(EVENTLOG_ERROR_TYPE, _("Timed out waiting for server startup\n"));
16611693
pgwin32_SetServiceStatus(SERVICE_STOPPED);
@@ -1676,7 +1708,7 @@ pgwin32_ServiceMain(DWORD argc, LPTSTR *argv)
16761708
{
16771709
/*
16781710
* status.dwCheckPoint can be incremented by
1679-
* wait_for_postmaster(), so it might not start from 0.
1711+
* wait_for_postmaster_start(), so it might not start from 0.
16801712
*/
16811713
int maxShutdownCheckPoint = status.dwCheckPoint + 12;
16821714

0 commit comments

Comments
 (0)