Skip to content

Commit 61c21dd

Browse files
committed
Remove select(2) backed latch implementation.
poll(2) is required by the Single Unix Specification v2, the usual baseline for PostgreSQL (leaving Windows aside). There have not been any buildfarm animals without poll(2) for a long while, leaving the select(2) implementation largely untested. On Windows, including MinGW, poll() is not available, but we have a special-case implementation for Windows anyway. Author: Andres Freund Discussion: https://postgr.es/m/20170420003611.7r2sdvehesdyiz2i@alap3.anarazel.de
1 parent 546c13e commit 61c21dd

File tree

1 file changed

+28
-193
lines changed

1 file changed

+28
-193
lines changed

src/backend/storage/ipc/latch.c

Lines changed: 28 additions & 193 deletions
Original file line numberDiff line numberDiff line change
@@ -3,27 +3,24 @@
33
* latch.c
44
* Routines for inter-process latches
55
*
6-
* The Unix implementation uses the so-called self-pipe trick to overcome
7-
* the race condition involved with select() and setting a global flag
8-
* in the signal handler. When a latch is set and the current process
9-
* is waiting for it, the signal handler wakes up the select() in
10-
* WaitLatch by writing a byte to a pipe. A signal by itself doesn't
11-
* interrupt select() on all platforms, and even on platforms where it
12-
* does, a signal that arrives just before the select() call does not
13-
* prevent the select() from entering sleep. An incoming byte on a pipe
14-
* however reliably interrupts the sleep, and causes select() to return
15-
* immediately even if the signal arrives before select() begins.
16-
*
17-
* (Actually, we prefer epoll_wait() over poll() over select() where
18-
* available, but the same comments apply.)
6+
* The Unix implementation uses the so-called self-pipe trick to overcome the
7+
* race condition involved with poll() (or epoll_wait() on Linux) and setting
8+
* a global flag in the signal handler. When a latch is set and the current
9+
* process is waiting for it, the signal handler wakes up the poll() in
10+
* WaitLatch by writing a byte to a pipe. A signal by itself doesn't interrupt
11+
* poll() on all platforms, and even on platforms where it does, a signal that
12+
* arrives just before the poll() call does not prevent poll() from entering
13+
* sleep. An incoming byte on a pipe however reliably interrupts the sleep,
14+
* and causes poll() to return immediately even if the signal arrives before
15+
* poll() begins.
1916
*
2017
* When SetLatch is called from the same process that owns the latch,
2118
* SetLatch writes the byte directly to the pipe. If it's owned by another
2219
* process, SIGUSR1 is sent and the signal handler in the waiting process
2320
* writes the byte to the pipe on behalf of the signaling process.
2421
*
25-
* The Windows implementation uses Windows events that are inherited by
26-
* all postmaster child processes.
22+
* The Windows implementation uses Windows events that are inherited by all
23+
* postmaster child processes. There's no need for the self-pipe trick there.
2724
*
2825
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
2926
* Portions Copyright (c) 1994, Regents of the University of California
@@ -39,7 +36,6 @@
3936
#include <limits.h>
4037
#include <signal.h>
4138
#include <unistd.h>
42-
#include <sys/time.h>
4339
#ifdef HAVE_SYS_EPOLL_H
4440
#include <sys/epoll.h>
4541
#endif
@@ -49,9 +45,6 @@
4945
#ifdef HAVE_SYS_POLL_H
5046
#include <sys/poll.h>
5147
#endif
52-
#ifdef HAVE_SYS_SELECT_H
53-
#include <sys/select.h>
54-
#endif
5548

5649
#include "miscadmin.h"
5750
#include "pgstat.h"
@@ -69,14 +62,12 @@
6962
* define somewhere before this block.
7063
*/
7164
#if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \
72-
defined(WAIT_USE_SELECT) || defined(WAIT_USE_WIN32)
65+
defined(WAIT_USE_WIN32)
7366
/* don't overwrite manual choice */
7467
#elif defined(HAVE_SYS_EPOLL_H)
7568
#define WAIT_USE_EPOLL
7669
#elif defined(HAVE_POLL)
7770
#define WAIT_USE_POLL
78-
#elif HAVE_SYS_SELECT_H
79-
#define WAIT_USE_SELECT
8071
#elif WIN32
8172
#define WAIT_USE_WIN32
8273
#else
@@ -162,8 +153,8 @@ InitializeLatchSupport(void)
162153

163154
/*
164155
* Set up the self-pipe that allows a signal handler to wake up the
165-
* select() in WaitLatch. Make the write-end non-blocking, so that
166-
* SetLatch won't block if the event has already been set many times
156+
* poll()/epoll_wait() in WaitLatch. Make the write-end non-blocking, so
157+
* that SetLatch won't block if the event has already been set many times
167158
* filling the kernel buffer. Make the read-end non-blocking too, so that
168159
* we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK.
169160
*/
@@ -401,8 +392,9 @@ SetLatch(volatile Latch *latch)
401392

402393
/*
403394
* See if anyone's waiting for the latch. It can be the current process if
404-
* we're in a signal handler. We use the self-pipe to wake up the select()
405-
* in that case. If it's another process, send a signal.
395+
* we're in a signal handler. We use the self-pipe to wake up the
396+
* poll()/epoll_wait() in that case. If it's another process, send a
397+
* signal.
406398
*
407399
* Fetch owner_pid only once, in case the latch is concurrently getting
408400
* owned or disowned. XXX: This assumes that pid_t is atomic, which isn't
@@ -666,8 +658,6 @@ AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
666658
WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
667659
#elif defined(WAIT_USE_POLL)
668660
WaitEventAdjustPoll(set, event);
669-
#elif defined(WAIT_USE_SELECT)
670-
/* nothing to do */
671661
#elif defined(WAIT_USE_WIN32)
672662
WaitEventAdjustWin32(set, event);
673663
#endif
@@ -724,8 +714,6 @@ ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
724714
WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD);
725715
#elif defined(WAIT_USE_POLL)
726716
WaitEventAdjustPoll(set, event);
727-
#elif defined(WAIT_USE_SELECT)
728-
/* nothing to do */
729717
#elif defined(WAIT_USE_WIN32)
730718
WaitEventAdjustWin32(set, event);
731719
#endif
@@ -1055,9 +1043,11 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
10551043
* because we don't expect the pipe to become readable or to have
10561044
* any errors either, treat those cases as postmaster death, too.
10571045
*
1058-
* As explained in the WAIT_USE_SELECT implementation, select(2)
1059-
* may spuriously return. Be paranoid about that here too, a
1060-
* spurious WL_POSTMASTER_DEATH would be painful.
1046+
* Be paranoid about a spurious event signalling the postmaster as
1047+
* being dead. There have been reports about that happening with
1048+
* older primitives (select(2) to be specific), and a spurious
1049+
* WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
1050+
* cost much.
10611051
*/
10621052
if (!PostmasterIsAlive())
10631053
{
@@ -1171,9 +1161,11 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
11711161
* we don't expect the pipe to become readable or to have any
11721162
* errors either, treat those cases as postmaster death, too.
11731163
*
1174-
* As explained in the WAIT_USE_SELECT implementation, select(2)
1175-
* may spuriously return. Be paranoid about that here too, a
1176-
* spurious WL_POSTMASTER_DEATH would be painful.
1164+
* Be paranoid about a spurious event signalling the postmaster as
1165+
* being dead. There have been reports about that happening with
1166+
* older primitives (select(2) to be specific), and a spurious
1167+
* WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
1168+
* cost much.
11771169
*/
11781170
if (!PostmasterIsAlive())
11791171
{
@@ -1214,163 +1206,6 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
12141206
return returned_events;
12151207
}
12161208

1217-
#elif defined(WAIT_USE_SELECT)
1218-
1219-
/*
1220-
* Wait using select(2).
1221-
*
1222-
* XXX: On at least older Linux kernels, select(), in violation of POSIX,
1223-
* doesn't reliably return a socket as writable if closed - but we rely on
1224-
* that. So far all the known cases of this problem are on platforms that also
1225-
* provide a poll() implementation without that bug. If we find one where
1226-
* that's not the case, we'll need to add a workaround.
1227-
*/
1228-
static inline int
1229-
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1230-
WaitEvent *occurred_events, int nevents)
1231-
{
1232-
int returned_events = 0;
1233-
int rc;
1234-
WaitEvent *cur_event;
1235-
fd_set input_mask;
1236-
fd_set output_mask;
1237-
int hifd;
1238-
struct timeval tv;
1239-
struct timeval *tvp = NULL;
1240-
1241-
FD_ZERO(&input_mask);
1242-
FD_ZERO(&output_mask);
1243-
1244-
/*
1245-
* Prepare input/output masks. We do so every loop iteration as there's no
1246-
* entirely portable way to copy fd_sets.
1247-
*/
1248-
for (cur_event = set->events;
1249-
cur_event < (set->events + set->nevents);
1250-
cur_event++)
1251-
{
1252-
if (cur_event->events == WL_LATCH_SET)
1253-
FD_SET(cur_event->fd, &input_mask);
1254-
else if (cur_event->events == WL_POSTMASTER_DEATH)
1255-
FD_SET(cur_event->fd, &input_mask);
1256-
else
1257-
{
1258-
Assert(cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
1259-
if (cur_event->events == WL_SOCKET_READABLE)
1260-
FD_SET(cur_event->fd, &input_mask);
1261-
else if (cur_event->events == WL_SOCKET_WRITEABLE)
1262-
FD_SET(cur_event->fd, &output_mask);
1263-
}
1264-
1265-
if (cur_event->fd > hifd)
1266-
hifd = cur_event->fd;
1267-
}
1268-
1269-
/* Sleep */
1270-
if (cur_timeout >= 0)
1271-
{
1272-
tv.tv_sec = cur_timeout / 1000L;
1273-
tv.tv_usec = (cur_timeout % 1000L) * 1000L;
1274-
tvp = &tv;
1275-
}
1276-
rc = select(hifd + 1, &input_mask, &output_mask, NULL, tvp);
1277-
1278-
/* Check return code */
1279-
if (rc < 0)
1280-
{
1281-
/* EINTR is okay, otherwise complain */
1282-
if (errno != EINTR)
1283-
{
1284-
waiting = false;
1285-
ereport(ERROR,
1286-
(errcode_for_socket_access(),
1287-
errmsg("select() failed: %m")));
1288-
}
1289-
return 0; /* retry */
1290-
}
1291-
else if (rc == 0)
1292-
{
1293-
/* timeout exceeded */
1294-
return -1;
1295-
}
1296-
1297-
/*
1298-
* To associate events with select's masks, we have to check the status of
1299-
* the file descriptors associated with an event; by looping through all
1300-
* events.
1301-
*/
1302-
for (cur_event = set->events;
1303-
cur_event < (set->events + set->nevents)
1304-
&& returned_events < nevents;
1305-
cur_event++)
1306-
{
1307-
occurred_events->pos = cur_event->pos;
1308-
occurred_events->user_data = cur_event->user_data;
1309-
occurred_events->events = 0;
1310-
1311-
if (cur_event->events == WL_LATCH_SET &&
1312-
FD_ISSET(cur_event->fd, &input_mask))
1313-
{
1314-
/* There's data in the self-pipe, clear it. */
1315-
drainSelfPipe();
1316-
1317-
if (set->latch->is_set)
1318-
{
1319-
occurred_events->fd = PGINVALID_SOCKET;
1320-
occurred_events->events = WL_LATCH_SET;
1321-
occurred_events++;
1322-
returned_events++;
1323-
}
1324-
}
1325-
else if (cur_event->events == WL_POSTMASTER_DEATH &&
1326-
FD_ISSET(cur_event->fd, &input_mask))
1327-
{
1328-
/*
1329-
* According to the select(2) man page on Linux, select(2) may
1330-
* spuriously return and report a file descriptor as readable,
1331-
* when it's not; and presumably so can poll(2). It's not clear
1332-
* that the relevant cases would ever apply to the postmaster
1333-
* pipe, but since the consequences of falsely returning
1334-
* WL_POSTMASTER_DEATH could be pretty unpleasant, we take the
1335-
* trouble to positively verify EOF with PostmasterIsAlive().
1336-
*/
1337-
if (!PostmasterIsAlive())
1338-
{
1339-
occurred_events->fd = PGINVALID_SOCKET;
1340-
occurred_events->events = WL_POSTMASTER_DEATH;
1341-
occurred_events++;
1342-
returned_events++;
1343-
}
1344-
}
1345-
else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
1346-
{
1347-
Assert(cur_event->fd != PGINVALID_SOCKET);
1348-
1349-
if ((cur_event->events & WL_SOCKET_READABLE) &&
1350-
FD_ISSET(cur_event->fd, &input_mask))
1351-
{
1352-
/* data available in socket, or EOF */
1353-
occurred_events->events |= WL_SOCKET_READABLE;
1354-
}
1355-
1356-
if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1357-
FD_ISSET(cur_event->fd, &output_mask))
1358-
{
1359-
/* socket is writeable, or EOF */
1360-
occurred_events->events |= WL_SOCKET_WRITEABLE;
1361-
}
1362-
1363-
if (occurred_events->events != 0)
1364-
{
1365-
occurred_events->fd = cur_event->fd;
1366-
occurred_events++;
1367-
returned_events++;
1368-
}
1369-
}
1370-
}
1371-
return returned_events;
1372-
}
1373-
13741209
#elif defined(WAIT_USE_WIN32)
13751210

13761211
/*

0 commit comments

Comments
 (0)