Skip to content

Commit c61559e

Browse files
committed
Reduce pg_ctl's reaction time when waiting for postmaster start/stop.
pg_ctl has traditionally waited one second between probes for whether the start or stop request has completed. That behavior was embodied in the original shell script written in 1999 (commit 5b912b0) and I doubt anyone's questioned it since. Nowadays, machines are a lot faster, and the shell script is long since replaced by C code, so it's fair to reconsider how long we ought to wait. This patch adjusts the coding so that the wait time can be any even divisor of 1 second, and sets the actual probe rate to 10 per second. That's based on experimentation with the src/test/recovery TAP tests, which include a lot of postmaster starts and stops. This patch alone reduces the (non-parallelized) runtime of those tests from ~4m30s to ~3m5s on my machine. Increasing the probe rate further doesn't help much, so this seems like a good number. In the real world this probably won't have much impact, since people don't start/stop production postmasters often, and the shutdown checkpoint usually takes nontrivial time too. But it makes development work and testing noticeably snappier, and that's good enough reason for me. Also, by reducing the dead time in postmaster restart sequences, this change has made it easier to reproduce some bugs that have been lurking for awhile. Patches for those will follow. Discussion: https://postgr.es/m/18444.1498428798@sss.pgh.pa.us
1 parent 5c77690 commit c61559e

File tree

1 file changed

+37
-26
lines changed

1 file changed

+37
-26
lines changed

src/bin/pg_ctl/pg_ctl.c

Lines changed: 37 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,10 @@ typedef enum
6868

6969
#define DEFAULT_WAIT 60
7070

71+
#define USEC_PER_SEC 1000000
72+
73+
#define WAITS_PER_SEC 10 /* should divide USEC_PER_SEC evenly */
74+
7175
static bool do_wait = true;
7276
static int wait_seconds = DEFAULT_WAIT;
7377
static bool wait_seconds_arg = false;
@@ -531,7 +535,7 @@ test_postmaster_connection(pgpid_t pm_pid, bool do_checkpoint)
531535

532536
connstr[0] = '\0';
533537

534-
for (i = 0; i < wait_seconds; i++)
538+
for (i = 0; i < wait_seconds * WAITS_PER_SEC; i++)
535539
{
536540
/* Do we need a connection string? */
537541
if (connstr[0] == '\0')
@@ -701,24 +705,28 @@ test_postmaster_connection(pgpid_t pm_pid, bool do_checkpoint)
701705
#endif
702706

703707
/* No response, or startup still in process; wait */
704-
#ifdef WIN32
705-
if (do_checkpoint)
708+
if (i % WAITS_PER_SEC == 0)
706709
{
707-
/*
708-
* Increment the wait hint by 6 secs (connection timeout + sleep)
709-
* We must do this to indicate to the SCM that our startup time is
710-
* changing, otherwise it'll usually send a stop signal after 20
711-
* seconds, despite incrementing the checkpoint counter.
712-
*/
713-
status.dwWaitHint += 6000;
714-
status.dwCheckPoint++;
715-
SetServiceStatus(hStatus, (LPSERVICE_STATUS) &status);
716-
}
717-
else
710+
#ifdef WIN32
711+
if (do_checkpoint)
712+
{
713+
/*
714+
* Increment the wait hint by 6 secs (connection timeout +
715+
* sleep). We must do this to indicate to the SCM that our
716+
* startup time is changing, otherwise it'll usually send a
717+
* stop signal after 20 seconds, despite incrementing the
718+
* checkpoint counter.
719+
*/
720+
status.dwWaitHint += 6000;
721+
status.dwCheckPoint++;
722+
SetServiceStatus(hStatus, (LPSERVICE_STATUS) &status);
723+
}
724+
else
718725
#endif
719-
print_msg(".");
726+
print_msg(".");
727+
}
720728

721-
pg_usleep(1000000); /* 1 sec */
729+
pg_usleep(USEC_PER_SEC / WAITS_PER_SEC);
722730
}
723731

724732
/* return result of last call to PQping */
@@ -998,12 +1006,13 @@ do_stop(void)
9981006

9991007
print_msg(_("waiting for server to shut down..."));
10001008

1001-
for (cnt = 0; cnt < wait_seconds; cnt++)
1009+
for (cnt = 0; cnt < wait_seconds * WAITS_PER_SEC; cnt++)
10021010
{
10031011
if ((pid = get_pgpid(false)) != 0)
10041012
{
1005-
print_msg(".");
1006-
pg_usleep(1000000); /* 1 sec */
1013+
if (cnt % WAITS_PER_SEC == 0)
1014+
print_msg(".");
1015+
pg_usleep(USEC_PER_SEC / WAITS_PER_SEC);
10071016
}
10081017
else
10091018
break;
@@ -1088,12 +1097,13 @@ do_restart(void)
10881097

10891098
/* always wait for restart */
10901099

1091-
for (cnt = 0; cnt < wait_seconds; cnt++)
1100+
for (cnt = 0; cnt < wait_seconds * WAITS_PER_SEC; cnt++)
10921101
{
10931102
if ((pid = get_pgpid(false)) != 0)
10941103
{
1095-
print_msg(".");
1096-
pg_usleep(1000000); /* 1 sec */
1104+
if (cnt % WAITS_PER_SEC == 0)
1105+
print_msg(".");
1106+
pg_usleep(USEC_PER_SEC / WAITS_PER_SEC);
10971107
}
10981108
else
10991109
break;
@@ -1225,17 +1235,18 @@ do_promote(void)
12251235
if (do_wait)
12261236
{
12271237
DBState state = DB_STARTUP;
1238+
int cnt;
12281239

12291240
print_msg(_("waiting for server to promote..."));
1230-
while (wait_seconds > 0)
1241+
for (cnt = 0; cnt < wait_seconds * WAITS_PER_SEC; cnt++)
12311242
{
12321243
state = get_control_dbstate();
12331244
if (state == DB_IN_PRODUCTION)
12341245
break;
12351246

1236-
print_msg(".");
1237-
pg_usleep(1000000); /* 1 sec */
1238-
wait_seconds--;
1247+
if (cnt % WAITS_PER_SEC == 0)
1248+
print_msg(".");
1249+
pg_usleep(USEC_PER_SEC / WAITS_PER_SEC);
12391250
}
12401251
if (state == DB_IN_PRODUCTION)
12411252
{

0 commit comments

Comments
 (0)