Skip to content

Commit 8fc23a9

Browse files
committed
Fix race condition between hot standby and restoring a full-page image.
There was a window in RestoreBackupBlock where a page would be zeroed out, but not yet locked. If a backend pinned and locked the page in that window, it saw the zeroed page instead of the old page or new page contents, which could lead to missing rows in a result set, or errors. To fix, replace RBM_ZERO with RBM_ZERO_AND_LOCK, which atomically pins, zeroes, and locks the page, if it's not in the buffer cache already. In stable branches, the old RBM_ZERO constant is renamed to RBM_DO_NOT_USE, to avoid breaking any 3rd party extensions that might use RBM_ZERO. More importantly, this avoids renumbering the other enum values, which would cause even bigger confusion in extensions that use ReadBufferExtended, but haven't been recompiled. Backpatch to all supported versions; this has been racy since hot standby was introduced.
1 parent 955b4ba commit 8fc23a9

File tree

6 files changed

+66
-25
lines changed

6 files changed

+66
-25
lines changed

src/backend/access/hash/hashpage.c

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -155,9 +155,8 @@ _hash_getinitbuf(Relation rel, BlockNumber blkno)
155155
if (blkno == P_NEW)
156156
elog(ERROR, "hash AM does not use P_NEW");
157157

158-
buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO, NULL);
159-
160-
LockBuffer(buf, HASH_WRITE);
158+
buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK,
159+
NULL);
161160

162161
/* ref count and lock type are correct */
163162

@@ -198,11 +197,13 @@ _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum)
198197
if (BufferGetBlockNumber(buf) != blkno)
199198
elog(ERROR, "unexpected hash relation size: %u, should be %u",
200199
BufferGetBlockNumber(buf), blkno);
200+
LockBuffer(buf, HASH_WRITE);
201201
}
202202
else
203-
buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO, NULL);
204-
205-
LockBuffer(buf, HASH_WRITE);
203+
{
204+
buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO_AND_LOCK,
205+
NULL);
206+
}
206207

207208
/* ref count and lock type are correct */
208209

src/backend/access/heap/heapam.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7527,9 +7527,8 @@ heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record)
75277527
* not do anything that assumes we are touching a heap.
75287528
*/
75297529
buffer = XLogReadBufferExtended(xlrec->node, xlrec->forknum, xlrec->blkno,
7530-
RBM_ZERO);
7530+
RBM_ZERO_AND_LOCK);
75317531
Assert(BufferIsValid(buffer));
7532-
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
75337532
page = (Page) BufferGetPage(buffer);
75347533

75357534
if (xlrec->hole_length == 0)

src/backend/access/transam/xlog.c

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4054,12 +4054,8 @@ RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, char *blk,
40544054
Page page;
40554055

40564056
buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
4057-
RBM_ZERO);
4057+
get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK);
40584058
Assert(BufferIsValid(buffer));
4059-
if (get_cleanup_lock)
4060-
LockBufferForCleanup(buffer);
4061-
else
4062-
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
40634059

40644060
page = (Page) BufferGetPage(buffer);
40654061

src/backend/access/transam/xlogutils.c

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -257,16 +257,17 @@ XLogCheckInvalidPages(void)
257257
* The returned buffer is exclusively-locked.
258258
*
259259
* For historical reasons, instead of a ReadBufferMode argument, this only
260-
* supports RBM_ZERO (init == true) and RBM_NORMAL (init == false) modes.
260+
* supports RBM_ZERO_AND_LOCK (init == true) and RBM_NORMAL (init == false)
261+
* modes.
261262
*/
262263
Buffer
263264
XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
264265
{
265266
Buffer buf;
266267

267268
buf = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno,
268-
init ? RBM_ZERO : RBM_NORMAL);
269-
if (BufferIsValid(buf))
269+
init ? RBM_ZERO_AND_LOCK : RBM_NORMAL);
270+
if (BufferIsValid(buf) && !init)
270271
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
271272

272273
return buf;
@@ -285,8 +286,8 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
285286
* dropped or truncated. If we don't see evidence of that later in the WAL
286287
* sequence, we'll complain at the end of WAL replay.)
287288
*
288-
* In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
289-
* relation is extended with all-zeroes pages up to the given block number.
289+
* In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended
290+
* with all-zeroes pages up to the given block number.
290291
*
291292
* In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
292293
* exist, and we don't check for all-zeroes. Thus, no log entry is made
@@ -340,14 +341,20 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
340341
do
341342
{
342343
if (buffer != InvalidBuffer)
344+
{
345+
if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
346+
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
343347
ReleaseBuffer(buffer);
348+
}
344349
buffer = ReadBufferWithoutRelcache(rnode, forknum,
345350
P_NEW, mode, NULL);
346351
}
347352
while (BufferGetBlockNumber(buffer) < blkno);
348353
/* Handle the corner case that P_NEW returns non-consecutive pages */
349354
if (BufferGetBlockNumber(buffer) != blkno)
350355
{
356+
if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
357+
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
351358
ReleaseBuffer(buffer);
352359
buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
353360
mode, NULL);

src/backend/storage/buffer/bufmgr.c

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -213,14 +213,19 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
213213
* valid, the page is zeroed instead of throwing an error. This is intended
214214
* for non-critical data, where the caller is prepared to repair errors.
215215
*
216-
* In RBM_ZERO mode, if the page isn't in buffer cache already, it's filled
217-
* with zeros instead of reading it from disk. Useful when the caller is
218-
* going to fill the page from scratch, since this saves I/O and avoids
216+
* In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
217+
* filled with zeros instead of reading it from disk. Useful when the caller
218+
* is going to fill the page from scratch, since this saves I/O and avoids
219219
* unnecessary failure if the page-on-disk has corrupt page headers.
220+
* The page is returned locked to ensure that the caller has a chance to
221+
* initialize the page before it's made visible to others.
220222
* Caution: do not use this mode to read a page that is beyond the relation's
221223
* current physical EOF; that is likely to cause problems in md.c when
222224
* the page is modified and written out. P_NEW is OK, though.
223225
*
226+
* RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
227+
* a cleanup-strength lock on the page.
228+
*
224229
* RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
225230
*
226231
* If strategy is not NULL, a nondefault buffer access strategy is used.
@@ -362,6 +367,18 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
362367
isExtend,
363368
found);
364369

370+
/*
371+
* In RBM_ZERO_AND_LOCK mode, the caller expects the buffer to
372+
* be already locked on return.
373+
*/
374+
if (!isLocalBuf)
375+
{
376+
if (mode == RBM_ZERO_AND_LOCK)
377+
LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
378+
else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
379+
LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
380+
}
381+
365382
return BufferDescriptorGetBuffer(bufHdr);
366383
}
367384

@@ -443,8 +460,11 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
443460
* Read in the page, unless the caller intends to overwrite it and
444461
* just wants us to allocate a buffer.
445462
*/
446-
if (mode == RBM_ZERO)
463+
if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK ||
464+
mode == RBM_DO_NOT_USE)
465+
{
447466
MemSet((char *) bufBlock, 0, BLCKSZ);
467+
}
448468
else
449469
{
450470
instr_time io_start,
@@ -485,6 +505,19 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
485505
}
486506
}
487507

508+
/*
509+
* In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
510+
* the page as valid, to make sure that no other backend sees the zeroed
511+
* page before the caller has had a chance to initialize it.
512+
*
513+
* Since no-one else can be looking at the page contents yet, there is no
514+
* difference between an exclusive lock and a cleanup-strength lock.
515+
* (Note that we cannot use LockBuffer() or LockBufferForCleanup() here,
516+
* because they assert that the buffer is already valid.)
517+
*/
518+
if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
519+
LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
520+
488521
if (isLocalBuf)
489522
{
490523
/* Only need to adjust flags */

src/include/storage/bufmgr.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,16 @@ typedef enum BufferAccessStrategyType
3636
typedef enum
3737
{
3838
RBM_NORMAL, /* Normal read */
39-
RBM_ZERO, /* Don't read from disk, caller will
40-
* initialize */
39+
RBM_DO_NOT_USE, /* This used to be RBM_ZERO. Only kept for
40+
* binary compatibility with 3rd party
41+
* extensions. */
4142
RBM_ZERO_ON_ERROR, /* Read, but return an all-zeros page on error */
42-
RBM_NORMAL_NO_LOG /* Don't log page as invalid during WAL
43+
RBM_NORMAL_NO_LOG, /* Don't log page as invalid during WAL
4344
* replay; otherwise same as RBM_NORMAL */
45+
RBM_ZERO_AND_LOCK, /* Don't read from disk, caller will
46+
* initialize. Also locks the page. */
47+
RBM_ZERO_AND_CLEANUP_LOCK /* Like RBM_ZERO_AND_LOCK, but locks the page
48+
* in "cleanup" mode */
4449
} ReadBufferMode;
4550

4651
/* in globals.c ... this duplicates miscadmin.h */

0 commit comments

Comments
 (0)