Skip to content

Commit 3821d66

Browse files
committed
Fix possible recovery trouble if TRUNCATE overlaps a checkpoint.
If TRUNCATE causes some buffers to be invalidated and thus the checkpoint does not flush them, TRUNCATE must also ensure that the corresponding files are truncated on disk. Otherwise, a replay from the checkpoint might find that the buffers exist but have the wrong contents, which may cause replay to fail. Report by Teja Mupparti. Patch by Kyotaro Horiguchi, per a design suggestion from Heikki Linnakangas, with some changes to the comments by me. Review of this and a prior patch that approached the issue differently by Heikki Linnakangas, Andres Freund, Álvaro Herrera, Masahiko Sawada, and Tom Lane. Discussion: http://postgr.es/m/BYAPR06MB6373BF50B469CA393C614257ABF00@BYAPR06MB6373.namprd06.prod.outlook.com
1 parent 61a007f commit 3821d66

File tree

11 files changed

+117
-29
lines changed

11 files changed

+117
-29
lines changed

src/backend/access/transam/multixact.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3069,8 +3069,8 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
30693069
* crash/basebackup, even though the state of the data directory would
30703070
* require it.
30713071
*/
3072-
Assert(!MyPgXact->delayChkpt);
3073-
MyPgXact->delayChkpt = true;
3072+
Assert((MyPgXact->delayChkpt & DELAY_CHKPT_START) == 0);
3073+
MyPgXact->delayChkpt |= DELAY_CHKPT_START;
30743074

30753075
/* WAL log truncation */
30763076
WriteMTruncateXlogRec(newOldestMultiDB,
@@ -3096,7 +3096,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
30963096
/* Then offsets */
30973097
PerformOffsetsTruncation(oldestMulti, newOldestMulti);
30983098

3099-
MyPgXact->delayChkpt = false;
3099+
MyPgXact->delayChkpt &= ~DELAY_CHKPT_START;
31003100

31013101
END_CRIT_SECTION();
31023102
LWLockRelease(MultiXactTruncationLock);

src/backend/access/transam/twophase.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -477,7 +477,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid,
477477
}
478478
pgxact->xid = xid;
479479
pgxact->xmin = InvalidTransactionId;
480-
pgxact->delayChkpt = false;
480+
pgxact->delayChkpt = 0;
481481
pgxact->vacuumFlags = 0;
482482
proc->pid = 0;
483483
proc->databaseId = databaseid;
@@ -1187,7 +1187,8 @@ EndPrepare(GlobalTransaction gxact)
11871187

11881188
START_CRIT_SECTION();
11891189

1190-
MyPgXact->delayChkpt = true;
1190+
Assert((MyPgXact->delayChkpt & DELAY_CHKPT_START) == 0);
1191+
MyPgXact->delayChkpt |= DELAY_CHKPT_START;
11911192

11921193
XLogBeginInsert();
11931194
for (record = records.head; record != NULL; record = record->next)
@@ -1230,7 +1231,7 @@ EndPrepare(GlobalTransaction gxact)
12301231
* checkpoint starting after this will certainly see the gxact as a
12311232
* candidate for fsyncing.
12321233
*/
1233-
MyPgXact->delayChkpt = false;
1234+
MyPgXact->delayChkpt &= ~DELAY_CHKPT_START;
12341235

12351236
/*
12361237
* Remember that we have this GlobalTransaction entry locked for us. If
@@ -2337,7 +2338,8 @@ RecordTransactionCommitPrepared(TransactionId xid,
23372338
START_CRIT_SECTION();
23382339

23392340
/* See notes in RecordTransactionCommit */
2340-
MyPgXact->delayChkpt = true;
2341+
Assert((MyPgXact->delayChkpt & DELAY_CHKPT_START) == 0);
2342+
MyPgXact->delayChkpt |= DELAY_CHKPT_START;
23412343

23422344
/*
23432345
* Emit the XLOG commit record. Note that we mark 2PC commits as
@@ -2385,7 +2387,7 @@ RecordTransactionCommitPrepared(TransactionId xid,
23852387
TransactionIdCommitTree(xid, nchildren, children);
23862388

23872389
/* Checkpoint can proceed now */
2388-
MyPgXact->delayChkpt = false;
2390+
MyPgXact->delayChkpt &= ~DELAY_CHKPT_START;
23892391

23902392
END_CRIT_SECTION();
23912393

src/backend/access/transam/xact.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1306,8 +1306,9 @@ RecordTransactionCommit(void)
13061306
* This makes checkpoint's determination of which xacts are delayChkpt
13071307
* a bit fuzzy, but it doesn't matter.
13081308
*/
1309+
Assert((MyPgXact->delayChkpt & DELAY_CHKPT_START) == 0);
13091310
START_CRIT_SECTION();
1310-
MyPgXact->delayChkpt = true;
1311+
MyPgXact->delayChkpt |= DELAY_CHKPT_START;
13111312

13121313
SetCurrentTransactionStopTimestamp();
13131314

@@ -1408,7 +1409,7 @@ RecordTransactionCommit(void)
14081409
*/
14091410
if (markXidCommitted)
14101411
{
1411-
MyPgXact->delayChkpt = false;
1412+
MyPgXact->delayChkpt &= ~DELAY_CHKPT_START;
14121413
END_CRIT_SECTION();
14131414
}
14141415

src/backend/access/transam/xlog.c

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8920,18 +8920,30 @@ CreateCheckPoint(int flags)
89208920
* and we will correctly flush the update below. So we cannot miss any
89218921
* xacts we need to wait for.
89228922
*/
8923-
vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
8923+
vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START);
89248924
if (nvxids > 0)
89258925
{
89268926
do
89278927
{
89288928
pg_usleep(10000L); /* wait for 10 msec */
8929-
} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
8929+
} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
8930+
DELAY_CHKPT_START));
89308931
}
89318932
pfree(vxids);
89328933

89338934
CheckPointGuts(checkPoint.redo, flags);
89348935

8936+
vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE);
8937+
if (nvxids > 0)
8938+
{
8939+
do
8940+
{
8941+
pg_usleep(10000L); /* wait for 10 msec */
8942+
} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
8943+
DELAY_CHKPT_COMPLETE));
8944+
}
8945+
pfree(vxids);
8946+
89358947
/*
89368948
* Take a snapshot of running transactions and write this to WAL. This
89378949
* allows us to reconstruct the state of running transactions during

src/backend/access/transam/xloginsert.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -899,7 +899,7 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
899899
/*
900900
* Ensure no checkpoint can change our view of RedoRecPtr.
901901
*/
902-
Assert(MyPgXact->delayChkpt);
902+
Assert((MyPgXact->delayChkpt & DELAY_CHKPT_START) != 0);
903903

904904
/*
905905
* Update RedoRecPtr so that we can make the right decision

src/backend/catalog/storage.c

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "catalog/storage.h"
3030
#include "catalog/storage_xlog.h"
3131
#include "storage/freespace.h"
32+
#include "storage/proc.h"
3233
#include "storage/smgr.h"
3334
#include "utils/memutils.h"
3435
#include "utils/rel.h"
@@ -252,6 +253,22 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
252253
if (vm)
253254
visibilitymap_truncate(rel, nblocks);
254255

256+
/*
257+
* Make sure that a concurrent checkpoint can't complete while truncation
258+
* is in progress.
259+
*
260+
* The truncation operation might drop buffers that the checkpoint
261+
* otherwise would have flushed. If it does, then it's essential that
262+
* the files actually get truncated on disk before the checkpoint record
263+
* is written. Otherwise, if replay begins from that checkpoint, the
264+
* to-be-truncated blocks might still exist on disk but have older
265+
* contents than expected, which can cause replay to fail. It's OK for
266+
* the blocks to not exist on disk at all, but not for them to have the
267+
* wrong contents.
268+
*/
269+
Assert((MyPgXact->delayChkpt & DELAY_CHKPT_COMPLETE) == 0);
270+
MyPgXact->delayChkpt |= DELAY_CHKPT_COMPLETE;
271+
255272
/*
256273
* We WAL-log the truncation before actually truncating, which means
257274
* trouble if the truncation fails. If we then crash, the WAL replay
@@ -290,8 +307,15 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
290307
XLogFlush(lsn);
291308
}
292309

293-
/* Do the real work */
310+
/*
311+
* This will first remove any buffers from the buffer pool that should no
312+
* longer exist after truncation is complete, and then truncate the
313+
* corresponding files on disk.
314+
*/
294315
smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks);
316+
317+
/* We've done all the critical work, so checkpoints are OK now. */
318+
MyPgXact->delayChkpt &= ~DELAY_CHKPT_COMPLETE;
295319
}
296320

297321
/*

src/backend/storage/buffer/bufmgr.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3514,7 +3514,9 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
35143514
* essential that CreateCheckpoint waits for virtual transactions
35153515
* rather than full transactionids.
35163516
*/
3517-
MyPgXact->delayChkpt = delayChkpt = true;
3517+
Assert((MyPgXact->delayChkpt & DELAY_CHKPT_START) == 0);
3518+
MyPgXact->delayChkpt |= DELAY_CHKPT_START;
3519+
delayChkpt = true;
35183520
lsn = XLogSaveBufferForHint(buffer, buffer_std);
35193521
}
35203522

@@ -3547,7 +3549,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
35473549
UnlockBufHdr(bufHdr, buf_state);
35483550

35493551
if (delayChkpt)
3550-
MyPgXact->delayChkpt = false;
3552+
MyPgXact->delayChkpt &= ~DELAY_CHKPT_START;
35513553

35523554
if (dirtied)
35533555
{

src/backend/storage/ipc/procarray.c

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -434,7 +434,10 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
434434
pgxact->xmin = InvalidTransactionId;
435435
/* must be cleared with xid/xmin: */
436436
pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
437-
pgxact->delayChkpt = false; /* be sure this is cleared in abort */
437+
438+
/* be sure this is cleared in abort */
439+
pgxact->delayChkpt = 0;
440+
438441
proc->recoveryConflictPending = false;
439442

440443
Assert(pgxact->nxids == 0);
@@ -456,7 +459,10 @@ ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact,
456459
pgxact->xmin = InvalidTransactionId;
457460
/* must be cleared with xid/xmin: */
458461
pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
459-
pgxact->delayChkpt = false; /* be sure this is cleared in abort */
462+
463+
/* be sure this is cleared in abort */
464+
pgxact->delayChkpt = 0;
465+
460466
proc->recoveryConflictPending = false;
461467

462468
/* Clear the subtransaction-XID cache too while holding the lock */
@@ -2261,7 +2267,8 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly)
22612267
* delaying checkpoint because they have critical actions in progress.
22622268
*
22632269
* Constructs an array of VXIDs of transactions that are currently in commit
2264-
* critical sections, as shown by having delayChkpt set in their PGXACT.
2270+
* critical sections, as shown by having specified delayChkpt bits set in their
2271+
* PGXACT.
22652272
*
22662273
* Returns a palloc'd array that should be freed by the caller.
22672274
* *nvxids is the number of valid entries.
@@ -2275,13 +2282,15 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly)
22752282
* for clearing of delayChkpt to propagate is unimportant for correctness.
22762283
*/
22772284
VirtualTransactionId *
2278-
GetVirtualXIDsDelayingChkpt(int *nvxids)
2285+
GetVirtualXIDsDelayingChkpt(int *nvxids, int type)
22792286
{
22802287
VirtualTransactionId *vxids;
22812288
ProcArrayStruct *arrayP = procArray;
22822289
int count = 0;
22832290
int index;
22842291

2292+
Assert(type != 0);
2293+
22852294
/* allocate what's certainly enough result space */
22862295
vxids = (VirtualTransactionId *)
22872296
palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs);
@@ -2294,7 +2303,7 @@ GetVirtualXIDsDelayingChkpt(int *nvxids)
22942303
PGPROC *proc = &allProcs[pgprocno];
22952304
PGXACT *pgxact = &allPgXact[pgprocno];
22962305

2297-
if (pgxact->delayChkpt)
2306+
if ((pgxact->delayChkpt & type) != 0)
22982307
{
22992308
VirtualTransactionId vxid;
23002309

@@ -2320,12 +2329,14 @@ GetVirtualXIDsDelayingChkpt(int *nvxids)
23202329
* those numbers should be small enough for it not to be a problem.
23212330
*/
23222331
bool
2323-
HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids)
2332+
HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids, int type)
23242333
{
23252334
bool result = false;
23262335
ProcArrayStruct *arrayP = procArray;
23272336
int index;
23282337

2338+
Assert(type != 0);
2339+
23292340
LWLockAcquire(ProcArrayLock, LW_SHARED);
23302341

23312342
for (index = 0; index < arrayP->numProcs; index++)
@@ -2337,7 +2348,8 @@ HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids)
23372348

23382349
GET_VXID_FROM_PGPROC(vxid, *proc);
23392350

2340-
if (pgxact->delayChkpt && VirtualTransactionIdIsValid(vxid))
2351+
if ((pgxact->delayChkpt & type) != 0 &&
2352+
VirtualTransactionIdIsValid(vxid))
23412353
{
23422354
int i;
23432355

src/backend/storage/lmgr/proc.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -397,7 +397,7 @@ InitProcess(void)
397397
MyProc->roleId = InvalidOid;
398398
MyProc->tempNamespaceId = InvalidOid;
399399
MyProc->isBackgroundWorker = IsBackgroundWorker;
400-
MyPgXact->delayChkpt = false;
400+
MyPgXact->delayChkpt = 0;
401401
MyPgXact->vacuumFlags = 0;
402402
/* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */
403403
if (IsAutoVacuumWorkerProcess())
@@ -579,7 +579,7 @@ InitAuxiliaryProcess(void)
579579
MyProc->roleId = InvalidOid;
580580
MyProc->tempNamespaceId = InvalidOid;
581581
MyProc->isBackgroundWorker = IsBackgroundWorker;
582-
MyPgXact->delayChkpt = false;
582+
MyPgXact->delayChkpt = 0;
583583
MyPgXact->vacuumFlags = 0;
584584
MyProc->lwWaiting = false;
585585
MyProc->lwWaitMode = 0;

src/include/storage/proc.h

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,41 @@ struct XidCache
7676
*/
7777
#define INVALID_PGPROCNO PG_INT32_MAX
7878

79+
/*
80+
* Flags for PGXACT.delayChkpt
81+
*
82+
* These flags can be used to delay the start or completion of a checkpoint
83+
* for short periods. A flag is in effect if the corresponding bit is set in
84+
* the PGXACT of any backend.
85+
*
86+
* For our purposes here, a checkpoint has three phases: (1) determine the
87+
* location to which the redo pointer will be moved, (2) write all the
88+
* data durably to disk, and (3) WAL-log the checkpoint.
89+
*
90+
* Setting DELAY_CHKPT_START prevents the system from moving from phase 1
91+
* to phase 2. This is useful when we are performing a WAL-logged modification
92+
* of data that will be flushed to disk in phase 2. By setting this flag
93+
* before writing WAL and clearing it after we've both written WAL and
94+
* performed the corresponding modification, we ensure that if the WAL record
95+
* is inserted prior to the new redo point, the corresponding data changes will
96+
* also be flushed to disk before the checkpoint can complete. (In the
97+
* extremely common case where the data being modified is in shared buffers
98+
* and we acquire an exclusive content lock on the relevant buffers before
99+
* writing WAL, this mechanism is not needed, because phase 2 will block
100+
* until we release the content lock and then flush the modified data to
101+
* disk.)
102+
*
103+
* Setting DELAY_CHKPT_COMPLETE prevents the system from moving from phase 2
104+
* to phase 3. This is useful if we are performing a WAL-logged operation that
105+
* might invalidate buffers, such as relation truncation. In this case, we need
106+
* to ensure that any buffers which were invalidated and thus not flushed by
107+
* the checkpoint are actually destroyed on disk. Replay can cope with a file
108+
* or block that doesn't exist, but not with a block that has the wrong
109+
* contents.
110+
*/
111+
#define DELAY_CHKPT_START (1<<0)
112+
#define DELAY_CHKPT_COMPLETE (1<<1)
113+
79114
/*
80115
* Each backend has a PGPROC struct in shared memory. There is also a list of
81116
* currently-unused PGPROC structs that will be reallocated to new backends.
@@ -232,8 +267,7 @@ typedef struct PGXACT
232267

233268
uint8 vacuumFlags; /* vacuum-related flags, see above */
234269
bool overflowed;
235-
bool delayChkpt; /* true if this proc delays checkpoint start;
236-
* previously called InCommit */
270+
int delayChkpt; /* for DELAY_CHKPT_* flags */
237271

238272
uint8 nxids;
239273
} PGXACT;

src/include/storage/procarray.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,9 @@ extern TransactionId GetOldestXmin(Relation rel, int flags);
9292
extern TransactionId GetOldestActiveTransactionId(void);
9393
extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly);
9494

95-
extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids);
96-
extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids);
95+
extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids, int type);
96+
extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids,
97+
int nvxids, int type);
9798

9899
extern PGPROC *BackendPidGetProc(int pid);
99100
extern PGPROC *BackendPidGetProcWithLock(int pid);

0 commit comments

Comments
 (0)