Skip to content

Commit 9c83b54

Browse files
committed
Fix a recently-introduced race condition in LISTEN/NOTIFY handling.
Commit 566372b fixed some race conditions involving concurrent SimpleLruTruncate calls, but it introduced new ones in async.c. A newly-listening backend could attempt to read Notify SLRU pages that were in process of being truncated, possibly causing an error. Also, the QUEUE_TAIL pointer could become set to a value that's not equal to the queue position of any backend. While that's fairly harmless in v13 and up (thanks to commit 51004c7), in older branches it resulted in near-permanent disabling of the queue truncation logic, so that continued use of NOTIFY led to queue-fill warnings and eventual inability to send any more notifies. (A server restart is enough to make that go away, but it's still pretty unpleasant.)

The core of the problem is confusion about whether QUEUE_TAIL represents the "logical" tail of the queue (i.e., the oldest still-interesting data) or the "physical" tail (the oldest data we've not yet truncated away). To fix, split that into two variables. QUEUE_TAIL regains its definition as the logical tail, and we introduce a new variable to track the oldest un-truncated page.

Per report from Mikael Gustavsson. Like the previous patch, back-patch to all supported branches.

Discussion: https://postgr.es/m/1b8561412e8a4f038d7a491c8b922788@smhi.se
1 parent 3df51ca commit 9c83b54

File tree

1 file changed

+39
-13
lines changed

1 file changed

+39
-13
lines changed

src/backend/commands/async.c

Lines changed: 39 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -255,7 +255,7 @@ typedef struct QueueBackendStatus
255255
* When holding NotifyQueueLock in EXCLUSIVE mode, backends can inspect the
256256
* entries of other backends and also change the head pointer. When holding
257257
* both NotifyQueueLock and NotifyQueueTailLock in EXCLUSIVE mode, backends
258-
* can change the tail pointer.
258+
* can change the tail pointers.
259259
*
260260
* NotifySLRULock is used as the control lock for the pg_notify SLRU buffers.
261261
* In order to avoid deadlocks, whenever we need multiple locks, we first get
@@ -276,6 +276,8 @@ typedef struct AsyncQueueControl
276276
QueuePosition head; /* head points to the next free location */
277277
QueuePosition tail; /* tail must be <= the queue position of every
278278
* listening backend */
279+
int stopPage; /* oldest unrecycled page; must be <=
280+
* tail.page */
279281
BackendId firstListener; /* id of first listener, or InvalidBackendId */
280282
TimestampTz lastQueueFillWarn; /* time of last queue-full msg */
281283
QueueBackendStatus backend[FLEXIBLE_ARRAY_MEMBER];
@@ -286,6 +288,7 @@ static AsyncQueueControl *asyncQueueControl;
286288

287289
#define QUEUE_HEAD (asyncQueueControl->head)
288290
#define QUEUE_TAIL (asyncQueueControl->tail)
291+
#define QUEUE_STOP_PAGE (asyncQueueControl->stopPage)
289292
#define QUEUE_FIRST_LISTENER (asyncQueueControl->firstListener)
290293
#define QUEUE_BACKEND_PID(i) (asyncQueueControl->backend[i].pid)
291294
#define QUEUE_BACKEND_DBOID(i) (asyncQueueControl->backend[i].dboid)
@@ -537,6 +540,7 @@ AsyncShmemInit(void)
537540
/* First time through, so initialize it */
538541
SET_QUEUE_POS(QUEUE_HEAD, 0, 0);
539542
SET_QUEUE_POS(QUEUE_TAIL, 0, 0);
543+
QUEUE_STOP_PAGE = 0;
540544
QUEUE_FIRST_LISTENER = InvalidBackendId;
541545
asyncQueueControl->lastQueueFillWarn = 0;
542546
/* zero'th entry won't be used, but let's initialize it anyway */
@@ -1358,7 +1362,7 @@ asyncQueueIsFull(void)
13581362
nexthead = QUEUE_POS_PAGE(QUEUE_HEAD) + 1;
13591363
if (nexthead > QUEUE_MAX_PAGE)
13601364
nexthead = 0; /* wrap around */
1361-
boundary = QUEUE_POS_PAGE(QUEUE_TAIL);
1365+
boundary = QUEUE_STOP_PAGE;
13621366
boundary -= boundary % SLRU_PAGES_PER_SEGMENT;
13631367
return asyncQueuePagePrecedes(nexthead, boundary);
13641368
}
@@ -1572,6 +1576,11 @@ pg_notification_queue_usage(PG_FUNCTION_ARGS)
15721576
* Return the fraction of the queue that is currently occupied.
15731577
*
15741578
* The caller must hold NotifyQueueLock in (at least) shared mode.
1579+
*
1580+
* Note: we measure the distance to the logical tail page, not the physical
1581+
* tail page. In some sense that's wrong, but the relative position of the
1582+
* physical tail is affected by details such as SLRU segment boundaries,
1583+
* so that a result based on that is unpleasantly unstable.
15751584
*/
15761585
static double
15771586
asyncQueueUsage(void)
@@ -2178,15 +2187,32 @@ asyncQueueAdvanceTail(void)
21782187
/* Restrict task to one backend per cluster; see SimpleLruTruncate(). */
21792188
LWLockAcquire(NotifyQueueTailLock, LW_EXCLUSIVE);
21802189

2181-
/* Compute the new tail. */
2190+
/*
2191+
* Compute the new tail. Pre-v13, it's essential that QUEUE_TAIL be exact
2192+
* (ie, exactly match at least one backend's queue position), so it must
2193+
* be updated atomically with the actual computation. Since v13, we could
2194+
* get away with not doing it like that, but it seems prudent to keep it
2195+
* so.
2196+
*
2197+
* Also, because incoming backends will scan forward from QUEUE_TAIL, that
2198+
* must be advanced before we can truncate any data. Thus, QUEUE_TAIL is
2199+
* the logical tail, while QUEUE_STOP_PAGE is the physical tail, or oldest
2200+
* un-truncated page. When QUEUE_STOP_PAGE != QUEUE_POS_PAGE(QUEUE_TAIL),
2201+
* there are pages we can truncate but haven't yet finished doing so.
2202+
*
2203+
* For concurrency's sake, we don't want to hold NotifyQueueLock while
2204+
* performing SimpleLruTruncate. This is OK because no backend will try
2205+
* to access the pages we are in the midst of truncating.
2206+
*/
21822207
LWLockAcquire(NotifyQueueLock, LW_EXCLUSIVE);
21832208
min = QUEUE_HEAD;
21842209
for (BackendId i = QUEUE_FIRST_LISTENER; i > 0; i = QUEUE_NEXT_LISTENER(i))
21852210
{
21862211
Assert(QUEUE_BACKEND_PID(i) != InvalidPid);
21872212
min = QUEUE_POS_MIN(min, QUEUE_BACKEND_POS(i));
21882213
}
2189-
oldtailpage = QUEUE_POS_PAGE(QUEUE_TAIL);
2214+
QUEUE_TAIL = min;
2215+
oldtailpage = QUEUE_STOP_PAGE;
21902216
LWLockRelease(NotifyQueueLock);
21912217

21922218
/*
@@ -2205,16 +2231,16 @@ asyncQueueAdvanceTail(void)
22052231
* release the lock again.
22062232
*/
22072233
SimpleLruTruncate(NotifyCtl, newtailpage);
2208-
}
22092234

2210-
/*
2211-
* Advertise the new tail. This changes asyncQueueIsFull()'s verdict for
2212-
* the segment immediately prior to the new tail, allowing fresh data into
2213-
* that segment.
2214-
*/
2215-
LWLockAcquire(NotifyQueueLock, LW_EXCLUSIVE);
2216-
QUEUE_TAIL = min;
2217-
LWLockRelease(NotifyQueueLock);
2235+
/*
2236+
* Update QUEUE_STOP_PAGE. This changes asyncQueueIsFull()'s verdict
2237+
* for the segment immediately prior to the old tail, allowing fresh
2238+
* data into that segment.
2239+
*/
2240+
LWLockAcquire(NotifyQueueLock, LW_EXCLUSIVE);
2241+
QUEUE_STOP_PAGE = newtailpage;
2242+
LWLockRelease(NotifyQueueLock);
2243+
}
22182244

22192245
LWLockRelease(NotifyQueueTailLock);
22202246
}

0 commit comments

Comments
 (0)