Skip to content

Commit c6d9012

Browse files
committed
Fix incorrect order of lock file removal and failure to close() sockets.
Commit c9b0cbe accidentally broke the order of operations during postmaster shutdown: it resulted in removing the per-socket lockfiles after, not before, postmaster.pid. This creates a race-condition hazard for a new postmaster that's started immediately after observing that postmaster.pid has disappeared; if it sees the socket lockfile still present, it will quite properly refuse to start. This error appears to be the explanation for at least some of the intermittent buildfarm failures we've seen in the pg_upgrade test. Another problem, which has been there all along, is that the postmaster has never bothered to close() its listen sockets, but has just allowed them to close at process death. This creates a different race condition for an incoming postmaster: it might be unable to bind to the desired listen address because the old postmaster is still incumbent. This might explain some odd failures we've seen in the past, too. (Note: this is not related to the fact that individual backends don't close their client communication sockets. That behavior is intentional and is not changed by this patch.) Fix by adding an on_proc_exit function that closes the postmaster's ports explicitly, and (in 9.3 and up) reshuffling the responsibility for where to unlink the Unix socket files. Lock file unlinking can stay where it is, but teach it to unlink the lock files in reverse order of creation.
1 parent bab9599 commit c6d9012

File tree

4 files changed

+75
-30
lines changed

4 files changed

+75
-30
lines changed

src/backend/libpq/pqcomm.c

Lines changed: 22 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -247,28 +247,6 @@ pq_close(int code, Datum arg)
247247
*/
248248

249249

250-
/* StreamDoUnlink()
251-
* Shutdown routine for backend connection
252-
* If any Unix sockets are used for communication, explicitly close them.
253-
*/
254-
#ifdef HAVE_UNIX_SOCKETS
255-
static void
256-
StreamDoUnlink(int code, Datum arg)
257-
{
258-
ListCell *l;
259-
260-
/* Loop through all created sockets... */
261-
foreach(l, sock_paths)
262-
{
263-
char *sock_path = (char *) lfirst(l);
264-
265-
unlink(sock_path);
266-
}
267-
/* Since we're about to exit, no need to reclaim storage */
268-
sock_paths = NIL;
269-
}
270-
#endif /* HAVE_UNIX_SOCKETS */
271-
272250
/*
273251
* StreamServerPort -- open a "listening" port to accept connections.
274252
*
@@ -550,16 +528,11 @@ Lock_AF_UNIX(char *unixSocketDir, char *unixSocketPath)
550528
* Once we have the interlock, we can safely delete any pre-existing
551529
* socket file to avoid failure at bind() time.
552530
*/
553-
unlink(unixSocketPath);
531+
(void) unlink(unixSocketPath);
554532

555533
/*
556-
* Arrange to unlink the socket file(s) at proc_exit. If this is the
557-
* first one, set up the on_proc_exit function to do it; then add this
558-
* socket file to the list of files to unlink.
534+
* Remember socket file pathnames for later maintenance.
559535
*/
560-
if (sock_paths == NIL)
561-
on_proc_exit(StreamDoUnlink, 0);
562-
563536
sock_paths = lappend(sock_paths, pstrdup(unixSocketPath));
564537

565538
return STATUS_OK;
@@ -788,6 +761,26 @@ TouchSocketFiles(void)
788761
}
789762
}
790763

764+
/*
765+
* RemoveSocketFiles -- unlink socket files at postmaster shutdown
766+
*/
767+
void
768+
RemoveSocketFiles(void)
769+
{
770+
ListCell *l;
771+
772+
/* Loop through all created sockets... */
773+
foreach(l, sock_paths)
774+
{
775+
char *sock_path = (char *) lfirst(l);
776+
777+
/* Ignore any error. */
778+
(void) unlink(sock_path);
779+
}
780+
/* Since we're about to exit, no need to reclaim storage */
781+
sock_paths = NIL;
782+
}
783+
791784

792785
/* --------------------------------
793786
* Low-level I/O routines begin here.

src/backend/postmaster/postmaster.c

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,7 @@ static DNSServiceRef bonjour_sdref = NULL;
370370
/*
371371
* postmaster.c - function prototypes
372372
*/
373+
static void CloseServerPorts(int status, Datum arg);
373374
static void unlink_external_pid_file(int status, Datum arg);
374375
static void getInstallationPaths(const char *argv0);
375376
static void checkDataDir(void);
@@ -892,6 +893,11 @@ PostmasterMain(int argc, char *argv[])
892893
* interlock (thanks to whoever decided to put socket files in /tmp :-().
893894
* For the same reason, it's best to grab the TCP socket(s) before the
894895
* Unix socket(s).
896+
*
897+
* Also note that this internally sets up the on_proc_exit function that
898+
* is responsible for removing both data directory and socket lockfiles;
899+
* so it must happen before opening sockets so that at exit, the socket
900+
* lockfiles go away after CloseServerPorts runs.
895901
*/
896902
CreateDataDirLockFile(true);
897903

@@ -916,10 +922,15 @@ PostmasterMain(int argc, char *argv[])
916922

917923
/*
918924
* Establish input sockets.
925+
*
926+
* First, mark them all closed, and set up an on_proc_exit function that's
927+
* charged with closing the sockets again at postmaster shutdown.
919928
*/
920929
for (i = 0; i < MAXLISTEN; i++)
921930
ListenSocket[i] = PGINVALID_SOCKET;
922931

932+
on_proc_exit(CloseServerPorts, 0);
933+
923934
if (ListenAddresses)
924935
{
925936
char *rawstring;
@@ -1263,6 +1274,42 @@ PostmasterMain(int argc, char *argv[])
12631274
}
12641275

12651276

1277+
/*
1278+
* on_proc_exit callback to close server's listen sockets
1279+
*/
1280+
static void
1281+
CloseServerPorts(int status, Datum arg)
1282+
{
1283+
int i;
1284+
1285+
/*
1286+
* First, explicitly close all the socket FDs. We used to just let this
1287+
* happen implicitly at postmaster exit, but it's better to close them
1288+
* before we remove the postmaster.pid lockfile; otherwise there's a race
1289+
* condition if a new postmaster wants to re-use the TCP port number.
1290+
*/
1291+
for (i = 0; i < MAXLISTEN; i++)
1292+
{
1293+
if (ListenSocket[i] != PGINVALID_SOCKET)
1294+
{
1295+
StreamClose(ListenSocket[i]);
1296+
ListenSocket[i] = PGINVALID_SOCKET;
1297+
}
1298+
}
1299+
1300+
/*
1301+
* Next, remove any filesystem entries for Unix sockets. To avoid race
1302+
* conditions against incoming postmasters, this must happen after closing
1303+
* the sockets and before removing lock files.
1304+
*/
1305+
RemoveSocketFiles();
1306+
1307+
/*
1308+
* We don't do anything about socket lock files here; those will be
1309+
* removed in a later on_proc_exit callback.
1310+
*/
1311+
}
1312+
12661313
/*
12671314
* on_proc_exit callback to delete external_pid_file
12681315
*/

src/backend/utils/init/miscinit.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -884,7 +884,11 @@ CreateLockFile(const char *filename, bool amPostmaster,
884884
if (lock_files == NIL)
885885
on_proc_exit(UnlinkLockFiles, 0);
886886

887-
lock_files = lappend(lock_files, pstrdup(filename));
887+
/*
888+
* Use lcons so that the lock files are unlinked in reverse order of
889+
* creation; this is critical!
890+
*/
891+
lock_files = lcons(pstrdup(filename), lock_files);
888892
}
889893

890894
/*

src/include/libpq/libpq.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ extern int StreamServerPort(int family, char *hostName,
5050
extern int StreamConnection(pgsocket server_fd, Port *port);
5151
extern void StreamClose(pgsocket sock);
5252
extern void TouchSocketFiles(void);
53+
extern void RemoveSocketFiles(void);
5354
extern void pq_init(void);
5455
extern void pq_comm_reset(void);
5556
extern int pq_getbytes(char *s, size_t len);

0 commit comments

Comments
 (0)