Skip to content

Commit 38c579b

Browse files
committedDec 20, 2024
Fix corruption when relation truncation fails.
RelationTruncate() does three things, while holding an AccessExclusiveLock and preventing checkpoints: 1. Logs the truncation. 2. Drops buffers, even if they're dirty. 3. Truncates some number of files. Step 2 could previously be canceled if it had to wait for I/O, and step 3 could and still can fail in file APIs. All orderings of these operations have data corruption hazards if interrupted, so we can't give up until the whole operation is done. When dirty pages were discarded but the corresponding blocks were left on disk due to ERROR, old page versions could come back from disk, reviving deleted data (see pgsql-bugs #18146 and several like it). When primary and standby were allowed to disagree on relation size, standbys could panic (see pgsql-bugs #18426) or revive data unknown to visibility management on the primary (theorized). Changes: * WAL is now unconditionally flushed first * smgrtruncate() is now called in a critical section, preventing interrupts and causing PANIC on file API failure * smgrtruncate() has a new parameter for existing fork sizes, because it can't call smgrnblocks() itself inside a critical section The changes apply to RelationTruncate(), smgr_redo() and pg_truncate_visibility_map(). That last is also brought up to date with other evolutions of the truncation protocol. The VACUUM FileTruncate() failure mode had been discussed in older reports than the ones referenced below, with independent analysis from many people, but earlier theories on how to fix it were too complicated to back-patch. The more recently invented cancellation bug was diagnosed by Alexander Lakhin. Other corruption scenarios were spotted by me while iterating on this patch and earlier commit 75818b3. Back-patch to all supported releases. Reviewed-by: Michael Paquier <michael@paquier.xyz> Reviewed-by: Robert Haas <robertmhaas@gmail.com> Reported-by: rootcause000@gmail.com Reported-by: Alexander Lakhin <exclusion@gmail.com> Discussion: https://postgr.es/m/18146-04e908c662113ad5%40postgresql.org Discussion: https://postgr.es/m/18426-2d18da6586f152d6%40postgresql.org
1 parent 02a8d0c commit 38c579b

File tree

6 files changed

+89
-33
lines changed

6 files changed

+89
-33
lines changed
 

‎contrib/pg_visibility/pg_visibility.c

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "funcapi.h"
2020
#include "miscadmin.h"
2121
#include "storage/bufmgr.h"
22+
#include "storage/proc.h"
2223
#include "storage/procarray.h"
2324
#include "storage/read_stream.h"
2425
#include "storage/smgr.h"
@@ -390,6 +391,7 @@ pg_truncate_visibility_map(PG_FUNCTION_ARGS)
390391
Relation rel;
391392
ForkNumber fork;
392393
BlockNumber block;
394+
BlockNumber old_block;
393395

394396
rel = relation_open(relid, AccessExclusiveLock);
395397

@@ -399,15 +401,22 @@ pg_truncate_visibility_map(PG_FUNCTION_ARGS)
399401
/* Forcibly reset cached file size */
400402
RelationGetSmgr(rel)->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = InvalidBlockNumber;
401403

404+
/* Compute new and old size before entering critical section. */
405+
fork = VISIBILITYMAP_FORKNUM;
402406
block = visibilitymap_prepare_truncate(rel, 0);
403-
if (BlockNumberIsValid(block))
404-
{
405-
fork = VISIBILITYMAP_FORKNUM;
406-
smgrtruncate(RelationGetSmgr(rel), &fork, 1, &block);
407-
}
407+
old_block = BlockNumberIsValid(block) ? smgrnblocks(RelationGetSmgr(rel), fork) : 0;
408+
409+
/*
410+
* WAL-logging, buffer dropping, file truncation must be atomic and all on
411+
* one side of a checkpoint. See RelationTruncate() for discussion.
412+
*/
413+
Assert((MyProc->delayChkptFlags & (DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE)) == 0);
414+
MyProc->delayChkptFlags |= DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE;
415+
START_CRIT_SECTION();
408416

409417
if (RelationNeedsWAL(rel))
410418
{
419+
XLogRecPtr lsn;
411420
xl_smgr_truncate xlrec;
412421

413422
xlrec.blkno = 0;
@@ -417,9 +426,17 @@ pg_truncate_visibility_map(PG_FUNCTION_ARGS)
417426
XLogBeginInsert();
418427
XLogRegisterData((char *) &xlrec, sizeof(xlrec));
419428

420-
XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
429+
lsn = XLogInsert(RM_SMGR_ID,
430+
XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
431+
XLogFlush(lsn);
421432
}
422433

434+
if (BlockNumberIsValid(block))
435+
smgrtruncate(RelationGetSmgr(rel), &fork, 1, &old_block, &block);
436+
437+
END_CRIT_SECTION();
438+
MyProc->delayChkptFlags &= ~(DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE);
439+
423440
/*
424441
* Release the lock right away, not at commit time.
425442
*

‎src/backend/catalog/storage.c

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
291291
bool vm;
292292
bool need_fsm_vacuum = false;
293293
ForkNumber forks[MAX_FORKNUM];
294+
BlockNumber old_blocks[MAX_FORKNUM];
294295
BlockNumber blocks[MAX_FORKNUM];
295296
int nforks = 0;
296297
SMgrRelation reln;
@@ -306,6 +307,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
306307

307308
/* Prepare for truncation of MAIN fork of the relation */
308309
forks[nforks] = MAIN_FORKNUM;
310+
old_blocks[nforks] = smgrnblocks(reln, MAIN_FORKNUM);
309311
blocks[nforks] = nblocks;
310312
nforks++;
311313

@@ -317,6 +319,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
317319
if (BlockNumberIsValid(blocks[nforks]))
318320
{
319321
forks[nforks] = FSM_FORKNUM;
322+
old_blocks[nforks] = smgrnblocks(reln, FSM_FORKNUM);
320323
nforks++;
321324
need_fsm_vacuum = true;
322325
}
@@ -330,6 +333,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
330333
if (BlockNumberIsValid(blocks[nforks]))
331334
{
332335
forks[nforks] = VISIBILITYMAP_FORKNUM;
336+
old_blocks[nforks] = smgrnblocks(reln, VISIBILITYMAP_FORKNUM);
333337
nforks++;
334338
}
335339
}
@@ -366,14 +370,20 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
366370
MyProc->delayChkptFlags |= DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE;
367371

368372
/*
369-
* We WAL-log the truncation before actually truncating, which means
370-
* trouble if the truncation fails. If we then crash, the WAL replay
371-
* likely isn't going to succeed in the truncation either, and cause a
372-
* PANIC. It's tempting to put a critical section here, but that cure
373-
* would be worse than the disease. It would turn a usually harmless
374-
* failure to truncate, that might spell trouble at WAL replay, into a
375-
* certain PANIC.
373+
* We WAL-log the truncation first and then truncate in a critical
374+
* section. Truncation drops buffers, even if dirty, and then truncates
375+
* disk files. All of that work needs to complete before the lock is
376+
* released, or else old versions of pages on disk that are missing recent
377+
* changes would become accessible again. We'll try the whole operation
378+
* again in crash recovery if we panic, but even then we can't give up
379+
* because we don't want standbys' relation sizes to diverge and break
380+
* replay or visibility invariants downstream. The critical section also
381+
* suppresses interrupts.
382+
*
383+
* (See also pg_visibilitymap.c if changing this code.)
376384
*/
385+
START_CRIT_SECTION();
386+
377387
if (RelationNeedsWAL(rel))
378388
{
379389
/*
@@ -397,18 +407,20 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
397407
* hit the disk before the WAL record, and the truncation of the FSM
398408
* or visibility map. If we crashed during that window, we'd be left
399409
* with a truncated heap, but the FSM or visibility map would still
400-
* contain entries for the non-existent heap pages.
410+
* contain entries for the non-existent heap pages, and standbys would
411+
* also never replay the truncation.
401412
*/
402-
if (fsm || vm)
403-
XLogFlush(lsn);
413+
XLogFlush(lsn);
404414
}
405415

406416
/*
407417
* This will first remove any buffers from the buffer pool that should no
408418
* longer exist after truncation is complete, and then truncate the
409419
* corresponding files on disk.
410420
*/
411-
smgrtruncate(RelationGetSmgr(rel), forks, nforks, blocks);
421+
smgrtruncate(RelationGetSmgr(rel), forks, nforks, old_blocks, blocks);
422+
423+
END_CRIT_SECTION();
412424

413425
/* We've done all the critical work, so checkpoints are OK now. */
414426
MyProc->delayChkptFlags &= ~(DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE);
@@ -973,6 +985,7 @@ smgr_redo(XLogReaderState *record)
973985
Relation rel;
974986
ForkNumber forks[MAX_FORKNUM];
975987
BlockNumber blocks[MAX_FORKNUM];
988+
BlockNumber old_blocks[MAX_FORKNUM];
976989
int nforks = 0;
977990
bool need_fsm_vacuum = false;
978991

@@ -1007,6 +1020,7 @@ smgr_redo(XLogReaderState *record)
10071020
if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0)
10081021
{
10091022
forks[nforks] = MAIN_FORKNUM;
1023+
old_blocks[nforks] = smgrnblocks(reln, MAIN_FORKNUM);
10101024
blocks[nforks] = xlrec->blkno;
10111025
nforks++;
10121026

@@ -1024,6 +1038,7 @@ smgr_redo(XLogReaderState *record)
10241038
if (BlockNumberIsValid(blocks[nforks]))
10251039
{
10261040
forks[nforks] = FSM_FORKNUM;
1041+
old_blocks[nforks] = smgrnblocks(reln, FSM_FORKNUM);
10271042
nforks++;
10281043
need_fsm_vacuum = true;
10291044
}
@@ -1035,13 +1050,18 @@ smgr_redo(XLogReaderState *record)
10351050
if (BlockNumberIsValid(blocks[nforks]))
10361051
{
10371052
forks[nforks] = VISIBILITYMAP_FORKNUM;
1053+
old_blocks[nforks] = smgrnblocks(reln, VISIBILITYMAP_FORKNUM);
10381054
nforks++;
10391055
}
10401056
}
10411057

10421058
/* Do the real work to truncate relation forks */
10431059
if (nforks > 0)
1044-
smgrtruncate(reln, forks, nforks, blocks);
1060+
{
1061+
START_CRIT_SECTION();
1062+
smgrtruncate(reln, forks, nforks, old_blocks, blocks);
1063+
END_CRIT_SECTION();
1064+
}
10451065

10461066
/*
10471067
* Update upper-level FSM pages to account for the truncation. This is

‎src/backend/storage/smgr/md.c

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1154,19 +1154,21 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
11541154

11551155
/*
11561156
* mdtruncate() -- Truncate relation to specified number of blocks.
1157+
*
1158+
* Guaranteed not to allocate memory, so it can be used in a critical section.
1159+
* Caller must have called smgrnblocks() to obtain curnblk while holding a
1160+
* sufficient lock to prevent a change in relation size, and not used any smgr
1161+
* functions for this relation or handled interrupts in between. This makes
1162+
* sure we have opened all active segments, so that truncate loop will get
1163+
* them all!
11571164
*/
11581165
void
1159-
mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
1166+
mdtruncate(SMgrRelation reln, ForkNumber forknum,
1167+
BlockNumber curnblk, BlockNumber nblocks)
11601168
{
1161-
BlockNumber curnblk;
11621169
BlockNumber priorblocks;
11631170
int curopensegs;
11641171

1165-
/*
1166-
* NOTE: mdnblocks makes sure we have opened all active segments, so that
1167-
* truncation loop will get them all!
1168-
*/
1169-
curnblk = mdnblocks(reln, forknum);
11701172
if (nblocks > curnblk)
11711173
{
11721174
/* Bogus request ... but no complaint if InRecovery */
@@ -1505,7 +1507,7 @@ _fdvec_resize(SMgrRelation reln,
15051507
reln->md_seg_fds[forknum] =
15061508
MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
15071509
}
1508-
else
1510+
else if (nseg > reln->md_num_open_segs[forknum])
15091511
{
15101512
/*
15111513
* It doesn't seem worthwhile complicating the code to amortize
@@ -1517,6 +1519,16 @@ _fdvec_resize(SMgrRelation reln,
15171519
repalloc(reln->md_seg_fds[forknum],
15181520
sizeof(MdfdVec) * nseg);
15191521
}
1522+
else
1523+
{
1524+
/*
1525+
* We don't reallocate a smaller array, because we want mdtruncate()
1526+
* to be able to promise that it won't allocate memory, so that it is
1527+
* allowed in a critical section. This means that a bit of space in
1528+
* the array is now wasted, until the next time we add a segment and
1529+
* reallocate.
1530+
*/
1531+
}
15201532

15211533
reln->md_num_open_segs[forknum] = nseg;
15221534
}

‎src/backend/storage/smgr/smgr.c

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ typedef struct f_smgr
101101
BlockNumber blocknum, BlockNumber nblocks);
102102
BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
103103
void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
104-
BlockNumber nblocks);
104+
BlockNumber old_blocks, BlockNumber nblocks);
105105
void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
106106
void (*smgr_registersync) (SMgrRelation reln, ForkNumber forknum);
107107
} f_smgr;
@@ -719,10 +719,15 @@ smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
719719
*
720720
* The caller must hold AccessExclusiveLock on the relation, to ensure that
721721
* other backends receive the smgr invalidation event that this function sends
722-
* before they access any forks of the relation again.
722+
* before they access any forks of the relation again. The current size of
723+
* the forks should be provided in old_nblocks. This function should normally
724+
* be called in a critical section, but the current size must be checked
725+
* outside the critical section, and no interrupts or smgr functions relating
726+
* to this relation should be called in between.
723727
*/
724728
void
725-
smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nblocks)
729+
smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks,
730+
BlockNumber *old_nblocks, BlockNumber *nblocks)
726731
{
727732
int i;
728733

@@ -750,7 +755,8 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb
750755
/* Make the cached size is invalid if we encounter an error. */
751756
reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber;
752757

753-
smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], nblocks[i]);
758+
smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i],
759+
old_nblocks[i], nblocks[i]);
754760

755761
/*
756762
* We might as well update the local smgr_cached_nblocks values. The

‎src/include/storage/md.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ extern void mdwriteback(SMgrRelation reln, ForkNumber forknum,
4343
BlockNumber blocknum, BlockNumber nblocks);
4444
extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
4545
extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
46-
BlockNumber nblocks);
46+
BlockNumber old_blocks, BlockNumber nblocks);
4747
extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
4848
extern void mdregistersync(SMgrRelation reln, ForkNumber forknum);
4949

‎src/include/storage/smgr.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,8 +105,9 @@ extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum,
105105
BlockNumber blocknum, BlockNumber nblocks);
106106
extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
107107
extern BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum);
108-
extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum,
109-
int nforks, BlockNumber *nblocks);
108+
extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks,
109+
BlockNumber *old_nblocks,
110+
BlockNumber *nblocks);
110111
extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
111112
extern void smgrregistersync(SMgrRelation reln, ForkNumber forknum);
112113
extern void AtEOXact_SMgr(void);

0 commit comments

Comments
 (0)