Skip to content

Commit f5de090

Browse files
committed
Fix a recently-introduced race condition in LISTEN/NOTIFY handling.
Commit 566372b3d fixed some race conditions involving concurrent SimpleLruTruncate calls, but it introduced new ones in async.c. A newly-listening backend could attempt to read Notify SLRU pages that were in process of being truncated, possibly causing an error. Also, the QUEUE_TAIL pointer could become set to a value that's not equal to the queue position of any backend. While that's fairly harmless in v13 and up (thanks to commit 51004c7), in older branches it resulted in near-permanent disabling of the queue truncation logic, so that continued use of NOTIFY led to queue-fill warnings and eventual inability to send any more notifies. (A server restart is enough to make that go away, but it's still pretty unpleasant.)

The core of the problem is confusion about whether QUEUE_TAIL represents the "logical" tail of the queue (i.e., the oldest still-interesting data) or the "physical" tail (the oldest data we've not yet truncated away). To fix, split that into two variables. QUEUE_TAIL regains its definition as the logical tail, and we introduce a new variable to track the oldest un-truncated page.

Per report from Mikael Gustavsson. Like the previous patch, back-patch to all supported branches.

Discussion: https://postgr.es/m/1b8561412e8a4f038d7a491c8b922788@smhi.se
1 parent dcc2094 commit f5de090

File tree

1 file changed

+39
-13
lines changed

src/backend/commands/async.c

Lines changed: 39 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,7 @@ typedef struct QueueBackendStatus
255255
* When holding NotifyQueueLock in EXCLUSIVE mode, backends can inspect the
256256
* entries of other backends and also change the head pointer. When holding
257257
* both NotifyQueueLock and NotifyQueueTailLock in EXCLUSIVE mode, backends
258-
* can change the tail pointer.
258+
* can change the tail pointers.
259259
*
260260
* NotifySLRULock is used as the control lock for the pg_notify SLRU buffers.
261261
* In order to avoid deadlocks, whenever we need multiple locks, we first get
@@ -276,6 +276,8 @@ typedef struct AsyncQueueControl
276276
QueuePosition head; /* head points to the next free location */
277277
QueuePosition tail; /* tail must be <= the queue position of every
278278
* listening backend */
279+
int stopPage; /* oldest unrecycled page; must be <=
280+
* tail.page */
279281
BackendId firstListener; /* id of first listener, or InvalidBackendId */
280282
TimestampTz lastQueueFillWarn; /* time of last queue-full msg */
281283
QueueBackendStatus backend[FLEXIBLE_ARRAY_MEMBER];
@@ -286,6 +288,7 @@ static AsyncQueueControl *asyncQueueControl;
286288

287289
#define QUEUE_HEAD (asyncQueueControl->head)
288290
#define QUEUE_TAIL (asyncQueueControl->tail)
291+
#define QUEUE_STOP_PAGE (asyncQueueControl->stopPage)
289292
#define QUEUE_FIRST_LISTENER (asyncQueueControl->firstListener)
290293
#define QUEUE_BACKEND_PID(i) (asyncQueueControl->backend[i].pid)
291294
#define QUEUE_BACKEND_DBOID(i) (asyncQueueControl->backend[i].dboid)
@@ -540,6 +543,7 @@ AsyncShmemInit(void)
540543
/* First time through, so initialize it */
541544
SET_QUEUE_POS(QUEUE_HEAD, 0, 0);
542545
SET_QUEUE_POS(QUEUE_TAIL, 0, 0);
546+
QUEUE_STOP_PAGE = 0;
543547
QUEUE_FIRST_LISTENER = InvalidBackendId;
544548
asyncQueueControl->lastQueueFillWarn = 0;
545549
/* zero'th entry won't be used, but let's initialize it anyway */
@@ -1362,7 +1366,7 @@ asyncQueueIsFull(void)
13621366
nexthead = QUEUE_POS_PAGE(QUEUE_HEAD) + 1;
13631367
if (nexthead > QUEUE_MAX_PAGE)
13641368
nexthead = 0; /* wrap around */
1365-
boundary = QUEUE_POS_PAGE(QUEUE_TAIL);
1369+
boundary = QUEUE_STOP_PAGE;
13661370
boundary -= boundary % SLRU_PAGES_PER_SEGMENT;
13671371
return asyncQueuePagePrecedes(nexthead, boundary);
13681372
}
@@ -1576,6 +1580,11 @@ pg_notification_queue_usage(PG_FUNCTION_ARGS)
15761580
* Return the fraction of the queue that is currently occupied.
15771581
*
15781582
* The caller must hold NotifyQueueLock in (at least) shared mode.
1583+
*
1584+
* Note: we measure the distance to the logical tail page, not the physical
1585+
* tail page. In some sense that's wrong, but the relative position of the
1586+
* physical tail is affected by details such as SLRU segment boundaries,
1587+
* so that a result based on that is unpleasantly unstable.
15791588
*/
15801589
static double
15811590
asyncQueueUsage(void)
@@ -2183,15 +2192,32 @@ asyncQueueAdvanceTail(void)
21832192
/* Restrict task to one backend per cluster; see SimpleLruTruncate(). */
21842193
LWLockAcquire(NotifyQueueTailLock, LW_EXCLUSIVE);
21852194

2186-
/* Compute the new tail. */
2195+
/*
2196+
* Compute the new tail. Pre-v13, it's essential that QUEUE_TAIL be exact
2197+
* (ie, exactly match at least one backend's queue position), so it must
2198+
* be updated atomically with the actual computation. Since v13, we could
2199+
* get away with not doing it like that, but it seems prudent to keep it
2200+
* so.
2201+
*
2202+
* Also, because incoming backends will scan forward from QUEUE_TAIL, that
2203+
* must be advanced before we can truncate any data. Thus, QUEUE_TAIL is
2204+
* the logical tail, while QUEUE_STOP_PAGE is the physical tail, or oldest
2205+
* un-truncated page. When QUEUE_STOP_PAGE != QUEUE_POS_PAGE(QUEUE_TAIL),
2206+
* there are pages we can truncate but haven't yet finished doing so.
2207+
*
2208+
* For concurrency's sake, we don't want to hold NotifyQueueLock while
2209+
* performing SimpleLruTruncate. This is OK because no backend will try
2210+
* to access the pages we are in the midst of truncating.
2211+
*/
21872212
LWLockAcquire(NotifyQueueLock, LW_EXCLUSIVE);
21882213
min = QUEUE_HEAD;
21892214
for (BackendId i = QUEUE_FIRST_LISTENER; i > 0; i = QUEUE_NEXT_LISTENER(i))
21902215
{
21912216
Assert(QUEUE_BACKEND_PID(i) != InvalidPid);
21922217
min = QUEUE_POS_MIN(min, QUEUE_BACKEND_POS(i));
21932218
}
2194-
oldtailpage = QUEUE_POS_PAGE(QUEUE_TAIL);
2219+
QUEUE_TAIL = min;
2220+
oldtailpage = QUEUE_STOP_PAGE;
21952221
LWLockRelease(NotifyQueueLock);
21962222

21972223
/*
@@ -2210,16 +2236,16 @@ asyncQueueAdvanceTail(void)
22102236
* release the lock again.
22112237
*/
22122238
SimpleLruTruncate(NotifyCtl, newtailpage);
2213-
}
22142239

2215-
/*
2216-
* Advertise the new tail. This changes asyncQueueIsFull()'s verdict for
2217-
* the segment immediately prior to the new tail, allowing fresh data into
2218-
* that segment.
2219-
*/
2220-
LWLockAcquire(NotifyQueueLock, LW_EXCLUSIVE);
2221-
QUEUE_TAIL = min;
2222-
LWLockRelease(NotifyQueueLock);
2240+
/*
2241+
* Update QUEUE_STOP_PAGE. This changes asyncQueueIsFull()'s verdict
2242+
* for the segment immediately prior to the old tail, allowing fresh
2243+
* data into that segment.
2244+
*/
2245+
LWLockAcquire(NotifyQueueLock, LW_EXCLUSIVE);
2246+
QUEUE_STOP_PAGE = newtailpage;
2247+
LWLockRelease(NotifyQueueLock);
2248+
}
22232249

22242250
LWLockRelease(NotifyQueueTailLock);
22252251
}

0 commit comments

Comments (0)