Skip to content

Commit afd1277

Browse files
committed
During online checkpoints, insert XLOG_CHECKPOINT_REDO at redo point.
This allows tools that read the WAL sequentially to identify (possible) redo points when they're reached, rather than only being able to detect them in retrospect when XLOG_CHECKPOINT_ONLINE is found, possibly much later in the WAL stream. There are other possible applications as well; see the discussion links below. Any redo location that precedes the checkpoint location should now point to an XLOG_CHECKPOINT_REDO record, so add a cross-check to verify this. While adjusting the code in CreateCheckPoint() for this patch, I made it call WALInsertLockAcquireExclusive a bit later than before, since there appears to be no need for it to be held while checking whether the system is idle, whether this is an end-of-recovery checkpoint, or what the current timeline is. Bump XLOG_PAGE_MAGIC. Patch by me, based in part on earlier work from Dilip Kumar. Review by Dilip Kumar, Amit Kapila, Andres Freund, and Michael Paquier. Discussion: http://postgr.es/m/CA+TgmoYy-Vc6G9QKcAKNksCa29cv__czr+N9X_QCxEfQVpp_8w@mail.gmail.com Discussion: http://postgr.es/m/20230614194717.jyuw3okxup4cvtbt%40awork3.anarazel.de Discussion: http://postgr.es/m/CA+hUKG+b2ego8=YNW2Ohe9QmSiReh1-ogrv8V_WZpJTqP3O+2w@mail.gmail.com
1 parent 8483a54 commit afd1277

File tree

9 files changed

+178
-61
lines changed

9 files changed

+178
-61
lines changed

contrib/pg_walinspect/expected/pg_walinspect.out

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,9 +127,20 @@ SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn3', :'wal_lsn4')
127127
t
128128
(1 row)
129129

130-
-- Force full-page image on the next update.
130+
-- Force a checkpoint so that the next update will log a full-page image.
131131
SELECT pg_current_wal_lsn() AS wal_lsn5 \gset
132132
CHECKPOINT;
133+
-- Verify that an XLOG_CHECKPOINT_REDO record begins at precisely the redo LSN
134+
-- of the checkpoint we just performed.
135+
SELECT redo_lsn FROM pg_control_checkpoint() \gset
136+
SELECT start_lsn = :'redo_lsn'::pg_lsn AS same_lsn, resource_manager,
137+
record_type FROM pg_get_wal_record_info(:'redo_lsn');
138+
same_lsn | resource_manager | record_type
139+
----------+------------------+-----------------
140+
t | XLOG | CHECKPOINT_REDO
141+
(1 row)
142+
143+
-- This update should produce a full-page image because of the checkpoint.
133144
UPDATE sample_tbl SET col1 = col1 + 1 WHERE col1 = 2;
134145
SELECT pg_current_wal_lsn() AS wal_lsn6 \gset
135146
-- Check if we get FPI from WAL record.

contrib/pg_walinspect/sql/pg_walinspect.sql

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,9 +80,17 @@ SELECT pg_current_wal_lsn() AS wal_lsn4 \gset
8080
SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_block_info(:'wal_lsn3', :'wal_lsn4')
8181
WHERE relfilenode = :'sample_tbl_oid' AND block_data IS NOT NULL;
8282

83-
-- Force full-page image on the next update.
83+
-- Force a checkpoint so that the next update will log a full-page image.
8484
SELECT pg_current_wal_lsn() AS wal_lsn5 \gset
8585
CHECKPOINT;
86+
87+
-- Verify that an XLOG_CHECKPOINT_REDO record begins at precisely the redo LSN
88+
-- of the checkpoint we just performed.
89+
SELECT redo_lsn FROM pg_control_checkpoint() \gset
90+
SELECT start_lsn = :'redo_lsn'::pg_lsn AS same_lsn, resource_manager,
91+
record_type FROM pg_get_wal_record_info(:'redo_lsn');
92+
93+
-- This update should produce a full-page image because of the checkpoint.
8694
UPDATE sample_tbl SET col1 = col1 + 1 WHERE col1 = 2;
8795
SELECT pg_current_wal_lsn() AS wal_lsn6 \gset
8896
-- Check if we get FPI from WAL record.

src/backend/access/rmgrdesc/xlogdesc.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,10 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
148148
LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
149149
timestamptz_to_str(xlrec.overwrite_time));
150150
}
151+
else if (info == XLOG_CHECKPOINT_REDO)
152+
{
153+
/* No details to write out */
154+
}
151155
}
152156

153157
const char *
@@ -196,6 +200,9 @@ xlog_identify(uint8 info)
196200
case XLOG_FPI_FOR_HINT:
197201
id = "FPI_FOR_HINT";
198202
break;
203+
case XLOG_CHECKPOINT_REDO:
204+
id = "CHECKPOINT_REDO";
205+
break;
199206
}
200207

201208
return id;

src/backend/access/transam/xlog.c

Lines changed: 135 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -559,6 +559,16 @@ typedef struct XLogCtlData
559559
slock_t info_lck; /* locks shared variables shown above */
560560
} XLogCtlData;
561561

562+
/*
563+
* Classification of XLogRecordInsert operations.
564+
*/
565+
typedef enum
566+
{
567+
WALINSERT_NORMAL,
568+
WALINSERT_SPECIAL_SWITCH,
569+
WALINSERT_SPECIAL_CHECKPOINT
570+
} WalInsertClass;
571+
562572
static XLogCtlData *XLogCtl = NULL;
563573

564574
/* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
@@ -739,13 +749,21 @@ XLogInsertRecord(XLogRecData *rdata,
739749
bool inserted;
740750
XLogRecord *rechdr = (XLogRecord *) rdata->data;
741751
uint8 info = rechdr->xl_info & ~XLR_INFO_MASK;
742-
bool isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
743-
info == XLOG_SWITCH);
752+
WalInsertClass class = WALINSERT_NORMAL;
744753
XLogRecPtr StartPos;
745754
XLogRecPtr EndPos;
746755
bool prevDoPageWrites = doPageWrites;
747756
TimeLineID insertTLI;
748757

758+
/* Does this record type require special handling? */
759+
if (unlikely(rechdr->xl_rmid == RM_XLOG_ID))
760+
{
761+
if (info == XLOG_SWITCH)
762+
class = WALINSERT_SPECIAL_SWITCH;
763+
else if (info == XLOG_CHECKPOINT_REDO)
764+
class = WALINSERT_SPECIAL_CHECKPOINT;
765+
}
766+
749767
/* we assume that all of the record header is in the first chunk */
750768
Assert(rdata->len >= SizeOfXLogRecord);
751769

@@ -793,7 +811,7 @@ XLogInsertRecord(XLogRecData *rdata,
793811
*/
794812
START_CRIT_SECTION();
795813

796-
if (likely(!isLogSwitch))
814+
if (likely(class == WALINSERT_NORMAL))
797815
{
798816
WALInsertLockAcquire();
799817

@@ -843,7 +861,7 @@ XLogInsertRecord(XLogRecData *rdata,
843861
/* Normal records are always inserted. */
844862
inserted = true;
845863
}
846-
else
864+
else if (class == WALINSERT_SPECIAL_SWITCH)
847865
{
848866
/*
849867
* In order to insert an XLOG_SWITCH record, we need to hold all of
@@ -852,14 +870,32 @@ XLogInsertRecord(XLogRecData *rdata,
852870
* remains in the current WAL segment and claimed all of it.
853871
*
854872
* Nonetheless, this case is simpler than the normal cases handled
855-
* above, which must check for changes in doPageWrites and RedoRecPtr.
856-
* Those checks are only needed for records that can contain
857-
* full-pages images, and an XLOG_SWITCH record never does.
873+
* above, which must check for changes in doPageWrites and RedoRecPtr.
874+
* Those checks are only needed for records that can contain buffer
875+
* references, and an XLOG_SWITCH record never does.
858876
*/
859877
Assert(fpw_lsn == InvalidXLogRecPtr);
860878
WALInsertLockAcquireExclusive();
861879
inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
862880
}
881+
else
882+
{
883+
Assert(class == WALINSERT_SPECIAL_CHECKPOINT);
884+
885+
/*
886+
* We need to update both the local and shared copies of RedoRecPtr,
887+
* which means that we need to hold all the WAL insertion locks.
888+
* However, there can't be any buffer references, so as above, we need
889+
* not check RedoRecPtr before inserting the record; we just need to
890+
* update it afterwards.
891+
*/
892+
Assert(fpw_lsn == InvalidXLogRecPtr);
893+
WALInsertLockAcquireExclusive();
894+
ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
895+
&rechdr->xl_prev);
896+
RedoRecPtr = Insert->RedoRecPtr = StartPos;
897+
inserted = true;
898+
}
863899

864900
if (inserted)
865901
{
@@ -876,7 +912,8 @@ XLogInsertRecord(XLogRecData *rdata,
876912
* All the record data, including the header, is now ready to be
877913
* inserted. Copy the record in the space reserved.
878914
*/
879-
CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
915+
CopyXLogRecordToWAL(rechdr->xl_tot_len,
916+
class == WALINSERT_SPECIAL_SWITCH, rdata,
880917
StartPos, EndPos, insertTLI);
881918

882919
/*
@@ -935,7 +972,7 @@ XLogInsertRecord(XLogRecData *rdata,
935972
* padding space that fills the rest of the segment, and perform
936973
* end-of-segment actions (eg, notifying archiver).
937974
*/
938-
if (isLogSwitch)
975+
if (class == WALINSERT_SPECIAL_SWITCH)
939976
{
940977
TRACE_POSTGRESQL_WAL_SWITCH();
941978
XLogFlush(EndPos);
@@ -1054,8 +1091,12 @@ XLogInsertRecord(XLogRecData *rdata,
10541091
*
10551092
* NB: The space calculation here must match the code in CopyXLogRecordToWAL,
10561093
* where we actually copy the record to the reserved space.
1094+
*
1095+
* NB: Testing shows that XLogInsertRecord runs faster if this code is inlined;
1096+
* however, because there are two call sites, the compiler is reluctant to
1097+
* inline. We use pg_attribute_always_inline here to try to convince it.
10571098
*/
1058-
static void
1099+
static pg_attribute_always_inline void
10591100
ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
10601101
XLogRecPtr *PrevPtr)
10611102
{
@@ -6475,17 +6516,22 @@ update_checkpoint_display(int flags, bool restartpoint, bool reset)
64756516
* In particular note that this routine is synchronous and does not pay
64766517
* attention to CHECKPOINT_WAIT.
64776518
*
6478-
* If !shutdown then we are writing an online checkpoint. This is a very special
6479-
* kind of operation and WAL record because the checkpoint action occurs over
6480-
* a period of time yet logically occurs at just a single LSN. The logical
6481-
* position of the WAL record (redo ptr) is the same or earlier than the
6482-
* physical position. When we replay WAL we locate the checkpoint via its
6483-
* physical position then read the redo ptr and actually start replay at the
6484-
* earlier logical position. Note that we don't write *anything* to WAL at
6485-
* the logical position, so that location could be any other kind of WAL record.
6486-
* All of this mechanism allows us to continue working while we checkpoint.
6487-
* As a result, timing of actions is critical here and be careful to note that
6488-
* this function will likely take minutes to execute on a busy system.
6519+
* If !shutdown then we are writing an online checkpoint. An XLOG_CHECKPOINT_REDO
6520+
* record is inserted into WAL at the logical location of the checkpoint, before
6521+
* flushing anything to disk; once the checkpoint is eventually completed,
6522+
* it is from this point that WAL replay will begin in the case of a recovery
6523+
* from this checkpoint. Once everything is written to disk, an
6524+
* XLOG_CHECKPOINT_ONLINE record is written to complete the checkpoint, and
6525+
* points back to the earlier XLOG_CHECKPOINT_REDO record. This mechanism allows
6526+
* other write-ahead log records to be written while the checkpoint is in
6527+
* progress, but we must be very careful about order of operations. This function
6528+
* may take many minutes to execute on a busy system.
6529+
*
6530+
* On the other hand, when shutdown is true, concurrent insertion into the
6531+
* write-ahead log is impossible, so there is no need for two separate records.
6532+
* In this case, we only insert an XLOG_CHECKPOINT_SHUTDOWN record, and it's
6533+
* both the record marking the completion of the checkpoint and the location
6534+
* from which WAL replay would begin if needed.
64896535
*/
64906536
void
64916537
CreateCheckPoint(int flags)
@@ -6497,7 +6543,6 @@ CreateCheckPoint(int flags)
64976543
XLogCtlInsert *Insert = &XLogCtl->Insert;
64986544
uint32 freespace;
64996545
XLogRecPtr PriorRedoPtr;
6500-
XLogRecPtr curInsert;
65016546
XLogRecPtr last_important_lsn;
65026547
VirtualTransactionId *vxids;
65036548
int nvxids;
@@ -6567,13 +6612,6 @@ CreateCheckPoint(int flags)
65676612
*/
65686613
last_important_lsn = GetLastImportantRecPtr();
65696614

6570-
/*
6571-
* We must block concurrent insertions while examining insert state to
6572-
* determine the checkpoint REDO pointer.
6573-
*/
6574-
WALInsertLockAcquireExclusive();
6575-
curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
6576-
65776615
/*
65786616
* If this isn't a shutdown or forced checkpoint, and if there has been no
65796617
* WAL activity requiring a checkpoint, skip it. The idea here is to
@@ -6584,7 +6622,6 @@ CreateCheckPoint(int flags)
65846622
{
65856623
if (last_important_lsn == ControlFile->checkPoint)
65866624
{
6587-
WALInsertLockRelease();
65886625
END_CRIT_SECTION();
65896626
ereport(DEBUG1,
65906627
(errmsg_internal("checkpoint skipped because system is idle")));
@@ -6606,45 +6643,81 @@ CreateCheckPoint(int flags)
66066643
else
66076644
checkPoint.PrevTimeLineID = checkPoint.ThisTimeLineID;
66086645

6609-
checkPoint.fullPageWrites = Insert->fullPageWrites;
6610-
66116646
/*
6612-
* Compute new REDO record ptr = location of next XLOG record.
6613-
*
6614-
* NB: this is NOT necessarily where the checkpoint record itself will be,
6615-
* since other backends may insert more XLOG records while we're off doing
6616-
* the buffer flush work. Those XLOG records are logically after the
6617-
* checkpoint, even though physically before it. Got that?
6647+
* We must block concurrent insertions while examining insert state.
66186648
*/
6619-
freespace = INSERT_FREESPACE(curInsert);
6620-
if (freespace == 0)
6649+
WALInsertLockAcquireExclusive();
6650+
6651+
checkPoint.fullPageWrites = Insert->fullPageWrites;
6652+
6653+
if (shutdown)
66216654
{
6622-
if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
6623-
curInsert += SizeOfXLogLongPHD;
6624-
else
6625-
curInsert += SizeOfXLogShortPHD;
6626-
}
6627-
checkPoint.redo = curInsert;
6655+
XLogRecPtr curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
66286656

6629-
/*
6630-
* Here we update the shared RedoRecPtr for future XLogInsert calls; this
6631-
* must be done while holding all the insertion locks.
6632-
*
6633-
* Note: if we fail to complete the checkpoint, RedoRecPtr will be left
6634-
* pointing past where it really needs to point. This is okay; the only
6635-
* consequence is that XLogInsert might back up whole buffers that it
6636-
* didn't really need to. We can't postpone advancing RedoRecPtr because
6637-
* XLogInserts that happen while we are dumping buffers must assume that
6638-
* their buffer changes are not included in the checkpoint.
6639-
*/
6640-
RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6657+
/*
6658+
* Compute new REDO record ptr = location of next XLOG record.
6659+
*
6660+
* Since this is a shutdown checkpoint, there can't be any concurrent
6661+
* WAL insertion.
6662+
*/
6663+
freespace = INSERT_FREESPACE(curInsert);
6664+
if (freespace == 0)
6665+
{
6666+
if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
6667+
curInsert += SizeOfXLogLongPHD;
6668+
else
6669+
curInsert += SizeOfXLogShortPHD;
6670+
}
6671+
checkPoint.redo = curInsert;
6672+
6673+
/*
6674+
* Here we update the shared RedoRecPtr for future XLogInsert calls;
6675+
* this must be done while holding all the insertion locks.
6676+
*
6677+
* Note: if we fail to complete the checkpoint, RedoRecPtr will be
6678+
* left pointing past where it really needs to point. This is okay;
6679+
* the only consequence is that XLogInsert might back up whole buffers
6680+
* that it didn't really need to. We can't postpone advancing
6681+
* RedoRecPtr because XLogInserts that happen while we are dumping
6682+
* buffers must assume that their buffer changes are not included in
6683+
* the checkpoint.
6684+
*/
6685+
RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6686+
}
66416687

66426688
/*
66436689
* Now we can release the WAL insertion locks, allowing other xacts to
66446690
* proceed while we are flushing disk buffers.
66456691
*/
66466692
WALInsertLockRelease();
66476693

6694+
/*
6695+
* If this is an online checkpoint, we have not yet determined the redo
6696+
* point. We do so now by inserting the special XLOG_CHECKPOINT_REDO
6697+
* record; the LSN at which it starts becomes the new redo pointer. We
6698+
* don't do this for a shutdown checkpoint, because in that case no WAL
6699+
* can be written between the redo point and the insertion of the
6700+
* checkpoint record itself, so the checkpoint record itself serves to
6701+
* mark the redo point.
6702+
*/
6703+
if (!shutdown)
6704+
{
6705+
int dummy = 0;
6706+
6707+
/* Record must have payload to avoid assertion failure. */
6708+
XLogBeginInsert();
6709+
XLogRegisterData((char *) &dummy, sizeof(dummy));
6710+
(void) XLogInsert(RM_XLOG_ID, XLOG_CHECKPOINT_REDO);
6711+
6712+
/*
6713+
* XLogInsertRecord will have updated XLogCtl->Insert.RedoRecPtr in
6714+
* shared memory and RedoRecPtr in backend-local memory, but we need
6715+
* to copy that into the record that will be inserted when the
6716+
* checkpoint is complete.
6717+
*/
6718+
checkPoint.redo = RedoRecPtr;
6719+
}
6720+
66486721
/* Update the info_lck-protected copy of RedoRecPtr as well */
66496722
SpinLockAcquire(&XLogCtl->info_lck);
66506723
XLogCtl->RedoRecPtr = checkPoint.redo;
@@ -8105,6 +8178,10 @@ xlog_redo(XLogReaderState *record)
81058178
/* Keep track of full_page_writes */
81068179
lastFullPageWrites = fpw;
81078180
}
8181+
else if (info == XLOG_CHECKPOINT_REDO)
8182+
{
8183+
/* nothing to do here, just for informational purposes */
8184+
}
81088185
}
81098186

81108187
/*

src/backend/access/transam/xlogrecovery.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1638,6 +1638,17 @@ PerformWalRecovery(void)
16381638
replayTLI = RedoStartTLI;
16391639
XLogPrefetcherBeginRead(xlogprefetcher, RedoStartLSN);
16401640
record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1641+
1642+
/*
1643+
* If a checkpoint record's redo pointer points back to an earlier
1644+
* LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
1645+
* record.
1646+
*/
1647+
if (record->xl_rmid != RM_XLOG_ID ||
1648+
(record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO)
1649+
ereport(FATAL,
1650+
(errmsg("unexpected record type found at redo point %X/%X",
1651+
LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))));
16411652
}
16421653
else
16431654
{

0 commit comments

Comments
 (0)