@@ -559,6 +559,16 @@ typedef struct XLogCtlData
     slock_t     info_lck;       /* locks shared variables shown above */
 } XLogCtlData;
 
+/*
+ * Classification of XLogInsertRecord operations.
+ */
+typedef enum
+{
+    WALINSERT_NORMAL,
+    WALINSERT_SPECIAL_SWITCH,
+    WALINSERT_SPECIAL_CHECKPOINT
+} WalInsertClass;
+
 static XLogCtlData *XLogCtl = NULL;
 
 /* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
@@ -739,13 +749,21 @@ XLogInsertRecord(XLogRecData *rdata,
     bool        inserted;
     XLogRecord *rechdr = (XLogRecord *) rdata->data;
     uint8       info = rechdr->xl_info & ~XLR_INFO_MASK;
-    bool        isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
-                               info == XLOG_SWITCH);
+    WalInsertClass class = WALINSERT_NORMAL;
     XLogRecPtr  StartPos;
     XLogRecPtr  EndPos;
     bool        prevDoPageWrites = doPageWrites;
     TimeLineID  insertTLI;
 
+    /* Does this record type require special handling? */
+    if (unlikely(rechdr->xl_rmid == RM_XLOG_ID))
+    {
+        if (info == XLOG_SWITCH)
+            class = WALINSERT_SPECIAL_SWITCH;
+        else if (info == XLOG_CHECKPOINT_REDO)
+            class = WALINSERT_SPECIAL_CHECKPOINT;
+    }
+
     /* we assume that all of the record header is in the first chunk */
     Assert(rdata->len >= SizeOfXLogRecord);
 
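Note: the dispatch added above is just a classification of the record header. As a rough standalone sketch of that step, with simplified stand-in types and constants rather than the real PostgreSQL headers (the XLOG_CHECKPOINT_REDO value here is illustrative only):

#include <stdio.h>
#include <stdint.h>

/* Simplified stand-ins for PostgreSQL types and constants. */
#define RM_XLOG_ID              0
#define XLOG_SWITCH             0x40
#define XLOG_CHECKPOINT_REDO    0xD0    /* illustrative value */

typedef enum
{
    WALINSERT_NORMAL,
    WALINSERT_SPECIAL_SWITCH,
    WALINSERT_SPECIAL_CHECKPOINT
} WalInsertClass;

typedef struct
{
    uint8_t     xl_rmid;
    uint8_t     xl_info;
} MiniRecordHeader;           /* stand-in for XLogRecord's fields of interest */

static WalInsertClass
classify(const MiniRecordHeader *rechdr)
{
    WalInsertClass class = WALINSERT_NORMAL;

    /* Only records owned by the XLOG resource manager can need special handling. */
    if (rechdr->xl_rmid == RM_XLOG_ID)
    {
        if (rechdr->xl_info == XLOG_SWITCH)
            class = WALINSERT_SPECIAL_SWITCH;
        else if (rechdr->xl_info == XLOG_CHECKPOINT_REDO)
            class = WALINSERT_SPECIAL_CHECKPOINT;
    }
    return class;
}

int
main(void)
{
    MiniRecordHeader heap = {.xl_rmid = 10, .xl_info = 0x00};
    MiniRecordHeader sw = {.xl_rmid = RM_XLOG_ID, .xl_info = XLOG_SWITCH};

    printf("heap record   -> %d\n", classify(&heap));   /* WALINSERT_NORMAL */
    printf("switch record -> %d\n", classify(&sw));     /* WALINSERT_SPECIAL_SWITCH */
    return 0;
}

The point of the enum is simply to make the two special paths explicit instead of carrying a single isLogSwitch boolean.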
@@ -793,7 +811,7 @@ XLogInsertRecord(XLogRecData *rdata,
      */
     START_CRIT_SECTION();
 
-    if (likely(!isLogSwitch))
+    if (likely(class == WALINSERT_NORMAL))
     {
         WALInsertLockAcquire();
 
@@ -843,7 +861,7 @@ XLogInsertRecord(XLogRecData *rdata,
         /* Normal records are always inserted. */
         inserted = true;
     }
-    else
+    else if (class == WALINSERT_SPECIAL_SWITCH)
     {
         /*
          * In order to insert an XLOG_SWITCH record, we need to hold all of
@@ -852,14 +870,32 @@ XLogInsertRecord(XLogRecData *rdata,
          * remains in the current WAL segment and claimed all of it.
          *
          * Nonetheless, this case is simpler than the normal cases handled
-         * above, which must check for changes in doPageWrites and RedoRecPtr.
-         * Those checks are only needed for records that can contain
-         * full-pages images, and an XLOG_SWITCH record never does.
+         * below, which must check for changes in doPageWrites and RedoRecPtr.
+         * Those checks are only needed for records that can contain buffer
+         * references, and an XLOG_SWITCH record never does.
          */
         Assert(fpw_lsn == InvalidXLogRecPtr);
         WALInsertLockAcquireExclusive();
         inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
     }
+    else
+    {
+        Assert(class == WALINSERT_SPECIAL_CHECKPOINT);
+
+        /*
+         * We need to update both the local and shared copies of RedoRecPtr,
+         * which means that we need to hold all the WAL insertion locks.
+         * However, there can't be any buffer references, so as above, we need
+         * not check RedoRecPtr before inserting the record; we just need to
+         * update it afterwards.
+         */
+        Assert(fpw_lsn == InvalidXLogRecPtr);
+        WALInsertLockAcquireExclusive();
+        ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
+                                  &rechdr->xl_prev);
+        RedoRecPtr = Insert->RedoRecPtr = StartPos;
+        inserted = true;
+    }
 
     if (inserted)
     {
@@ -876,7 +912,8 @@ XLogInsertRecord(XLogRecData *rdata,
      * All the record data, including the header, is now ready to be
      * inserted. Copy the record in the space reserved.
      */
-    CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
+    CopyXLogRecordToWAL(rechdr->xl_tot_len,
+                        class == WALINSERT_SPECIAL_SWITCH, rdata,
                         StartPos, EndPos, insertTLI);
 
     /*
@@ -935,7 +972,7 @@ XLogInsertRecord(XLogRecData *rdata,
      * padding space that fills the rest of the segment, and perform
      * end-of-segment actions (eg, notifying archiver).
      */
-    if (isLogSwitch)
+    if (class == WALINSERT_SPECIAL_SWITCH)
     {
         TRACE_POSTGRESQL_WAL_SWITCH();
         XLogFlush(EndPos);
@@ -1054,8 +1091,12 @@ XLogInsertRecord(XLogRecData *rdata,
  *
  * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
  * where we actually copy the record to the reserved space.
+ *
+ * NB: Testing shows that XLogInsertRecord runs faster if this code is inlined;
+ * however, because there are two call sites, the compiler is reluctant to
+ * inline. We use pg_attribute_always_inline here to try to convince it.
  */
-static void
+static pg_attribute_always_inline void
 ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
                           XLogRecPtr *PrevPtr)
 {
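Note: for readers unfamiliar with the macro, a forced-inline attribute of this kind is usually a thin wrapper over compiler-specific keywords. The sketch below is illustrative only; PostgreSQL's own pg_attribute_always_inline is defined in src/include/c.h and may differ in detail (my_always_inline and add_one are made-up names for the example):

#include <stdio.h>

/* Typical compiler-specific spellings of a forced-inline attribute. */
#if defined(__GNUC__) || defined(__clang__)
#define my_always_inline  __attribute__((always_inline)) inline
#elif defined(_MSC_VER)
#define my_always_inline  __forceinline
#else
#define my_always_inline  inline
#endif

/* The attribute asks the compiler to inline even when it would otherwise decline,
 * e.g. because the function has more than one call site. */
static my_always_inline int
add_one(int x)
{
    return x + 1;
}

int
main(void)
{
    printf("%d\n", add_one(41));    /* prints 42 */
    return 0;
}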
@@ -6475,17 +6516,22 @@ update_checkpoint_display(int flags, bool restartpoint, bool reset)
  * In particular note that this routine is synchronous and does not pay
  * attention to CHECKPOINT_WAIT.
  *
- * If !shutdown then we are writing an online checkpoint. This is a very special
- * kind of operation and WAL record because the checkpoint action occurs over
- * a period of time yet logically occurs at just a single LSN. The logical
- * position of the WAL record (redo ptr) is the same or earlier than the
- * physical position. When we replay WAL we locate the checkpoint via its
- * physical position then read the redo ptr and actually start replay at the
- * earlier logical position. Note that we don't write *anything* to WAL at
- * the logical position, so that location could be any other kind of WAL record.
- * All of this mechanism allows us to continue working while we checkpoint.
- * As a result, timing of actions is critical here and be careful to note that
- * this function will likely take minutes to execute on a busy system.
+ * If !shutdown then we are writing an online checkpoint. An XLOG_CHECKPOINT_REDO
+ * record is inserted into WAL at the logical location of the checkpoint, before
+ * anything is flushed to disk; when the checkpoint eventually completes, it is
+ * from this point that WAL replay will begin in the case of a recovery from
+ * this checkpoint. Once everything is written to disk, an
+ * XLOG_CHECKPOINT_ONLINE record is written to complete the checkpoint, and it
+ * points back to the earlier XLOG_CHECKPOINT_REDO record. This mechanism allows
+ * other write-ahead log records to be written while the checkpoint is in
+ * progress, but we must be very careful about the order of operations. This
+ * function may take many minutes to execute on a busy system.
+ *
+ * On the other hand, when shutdown is true, concurrent insertion into the
+ * write-ahead log is impossible, so there is no need for two separate records.
+ * In this case, we only insert an XLOG_CHECKPOINT_SHUTDOWN record, and it is
+ * both the record marking the completion of the checkpoint and the location
+ * from which WAL replay would begin if needed.
  */
 void
 CreateCheckPoint(int flags)
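Note: the recovery-side consequence of the comment above is that replay starts at the redo LSN recorded inside the checkpoint, not at the checkpoint record itself. A minimal standalone sketch of that distinction, using simplified stand-in types (MiniCheckpoint and the LSN values are invented for illustration, not PostgreSQL's actual structures):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t Lsn;               /* stand-in for XLogRecPtr */

/* Stand-in for the two fields of interest in a checkpoint record. */
typedef struct
{
    Lsn     self;                   /* where the completed checkpoint record was written */
    Lsn     redo;                   /* redo pointer stored inside it */
} MiniCheckpoint;

/* Recovery starts reading WAL at the redo LSN, not at the checkpoint record. */
static Lsn
replay_start(const MiniCheckpoint *cp)
{
    return cp->redo;
}

int
main(void)
{
    /* Online checkpoint: the redo point (XLOG_CHECKPOINT_REDO) precedes the
     * XLOG_CHECKPOINT_ONLINE completion record. */
    MiniCheckpoint online = {.self = 0x1B000200, .redo = 0x1A0000F8};

    /* Shutdown checkpoint: the single XLOG_CHECKPOINT_SHUTDOWN record is also
     * the redo point. */
    MiniCheckpoint shutdown = {.self = 0x1C000060, .redo = 0x1C000060};

    printf("online:   replay from %#llx\n", (unsigned long long) replay_start(&online));
    printf("shutdown: replay from %#llx\n", (unsigned long long) replay_start(&shutdown));
    return 0;
}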
@@ -6497,7 +6543,6 @@ CreateCheckPoint(int flags)
     XLogCtlInsert *Insert = &XLogCtl->Insert;
     uint32      freespace;
     XLogRecPtr  PriorRedoPtr;
-    XLogRecPtr  curInsert;
     XLogRecPtr  last_important_lsn;
     VirtualTransactionId *vxids;
     int         nvxids;
@@ -6567,13 +6612,6 @@ CreateCheckPoint(int flags)
      */
     last_important_lsn = GetLastImportantRecPtr();
 
-    /*
-     * We must block concurrent insertions while examining insert state to
-     * determine the checkpoint REDO pointer.
-     */
-    WALInsertLockAcquireExclusive();
-    curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
-
     /*
      * If this isn't a shutdown or forced checkpoint, and if there has been no
      * WAL activity requiring a checkpoint, skip it. The idea here is to
@@ -6584,7 +6622,6 @@ CreateCheckPoint(int flags)
     {
         if (last_important_lsn == ControlFile->checkPoint)
         {
-            WALInsertLockRelease();
             END_CRIT_SECTION();
             ereport(DEBUG1,
                     (errmsg_internal("checkpoint skipped because system is idle")));
@@ -6606,45 +6643,81 @@ CreateCheckPoint(int flags)
     else
         checkPoint.PrevTimeLineID = checkPoint.ThisTimeLineID;
 
-    checkPoint.fullPageWrites = Insert->fullPageWrites;
-
     /*
-     * Compute new REDO record ptr = location of next XLOG record.
-     *
-     * NB: this is NOT necessarily where the checkpoint record itself will be,
-     * since other backends may insert more XLOG records while we're off doing
-     * the buffer flush work. Those XLOG records are logically after the
-     * checkpoint, even though physically before it. Got that?
+     * We must block concurrent insertions while examining insert state.
      */
-    freespace = INSERT_FREESPACE(curInsert);
-    if (freespace == 0)
+    WALInsertLockAcquireExclusive();
+
+    checkPoint.fullPageWrites = Insert->fullPageWrites;
+
+    if (shutdown)
     {
-        if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
-            curInsert += SizeOfXLogLongPHD;
-        else
-            curInsert += SizeOfXLogShortPHD;
-    }
-    checkPoint.redo = curInsert;
+        XLogRecPtr  curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
 
-    /*
-     * Here we update the shared RedoRecPtr for future XLogInsert calls; this
-     * must be done while holding all the insertion locks.
-     *
-     * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
-     * pointing past where it really needs to point. This is okay; the only
-     * consequence is that XLogInsert might back up whole buffers that it
-     * didn't really need to. We can't postpone advancing RedoRecPtr because
-     * XLogInserts that happen while we are dumping buffers must assume that
-     * their buffer changes are not included in the checkpoint.
-     */
-    RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
+        /*
+         * Compute new REDO record ptr = location of next XLOG record.
+         *
+         * Since this is a shutdown checkpoint, there can't be any concurrent
+         * WAL insertion.
+         */
+        freespace = INSERT_FREESPACE(curInsert);
+        if (freespace == 0)
+        {
+            if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
+                curInsert += SizeOfXLogLongPHD;
+            else
+                curInsert += SizeOfXLogShortPHD;
+        }
+        checkPoint.redo = curInsert;
+
+        /*
+         * Here we update the shared RedoRecPtr for future XLogInsert calls;
+         * this must be done while holding all the insertion locks.
+         *
+         * Note: if we fail to complete the checkpoint, RedoRecPtr will be
+         * left pointing past where it really needs to point. This is okay;
+         * the only consequence is that XLogInsert might back up whole buffers
+         * that it didn't really need to. We can't postpone advancing
+         * RedoRecPtr because XLogInserts that happen while we are dumping
+         * buffers must assume that their buffer changes are not included in
+         * the checkpoint.
+         */
+        RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
+    }
 
     /*
      * Now we can release the WAL insertion locks, allowing other xacts to
      * proceed while we are flushing disk buffers.
      */
     WALInsertLockRelease();
 
+    /*
+     * If this is an online checkpoint, we have not yet determined the redo
+     * point. We do so now by inserting the special XLOG_CHECKPOINT_REDO
+     * record; the LSN at which it starts becomes the new redo pointer. We
+     * don't do this for a shutdown checkpoint, because in that case no WAL
+     * can be written between the redo point and the insertion of the
+     * checkpoint record itself, so the checkpoint record itself serves to
+     * mark the redo point.
+     */
+    if (!shutdown)
+    {
+        int         dummy = 0;
+
+        /* Record must have payload to avoid assertion failure. */
+        XLogBeginInsert();
+        XLogRegisterData((char *) &dummy, sizeof(dummy));
+        (void) XLogInsert(RM_XLOG_ID, XLOG_CHECKPOINT_REDO);
+
+        /*
+         * XLogInsertRecord will have updated XLogCtl->Insert.RedoRecPtr in
+         * shared memory and RedoRecPtr in backend-local memory, but we need
+         * to copy that into the record that will be inserted when the
+         * checkpoint is complete.
+         */
+        checkPoint.redo = RedoRecPtr;
+    }
+
     /* Update the info_lck-protected copy of RedoRecPtr as well */
     SpinLockAcquire(&XLogCtl->info_lck);
     XLogCtl->RedoRecPtr = checkPoint.redo;
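Note: the invariant this block establishes is that the start LSN of the XLOG_CHECKPOINT_REDO record becomes the new redo pointer, so any record inserted afterwards necessarily begins at or beyond it. A small standalone sketch of that ordering, with made-up names (reserve, insert_pos, redo_ptr) standing in for the real reservation machinery:

#include <assert.h>
#include <stdio.h>
#include <stdint.h>

typedef uint64_t Lsn;                   /* stand-in for XLogRecPtr */

static Lsn insert_pos = 0x2A000028;     /* stand-in for the shared insert position */
static Lsn redo_ptr;                    /* stand-in for RedoRecPtr */

/* Reserve space for one record and return its start LSN (greatly simplified). */
static Lsn
reserve(uint32_t len)
{
    Lsn     start = insert_pos;

    insert_pos += len;
    return start;
}

int
main(void)
{
    /* Inserting the redo record: its start LSN becomes the redo pointer. */
    Lsn     redo_rec_start = reserve(32);

    redo_ptr = redo_rec_start;

    /* Any record reserved afterwards starts at or beyond the redo pointer. */
    Lsn     later_rec = reserve(128);

    assert(later_rec >= redo_ptr);
    printf("redo pointer %#llx, later record %#llx\n",
           (unsigned long long) redo_ptr, (unsigned long long) later_rec);
    return 0;
}

The dummy payload in the real code exists only because XLogInsert expects the record to register some data; the record's content is never examined at redo time.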
@@ -8105,6 +8178,10 @@ xlog_redo(XLogReaderState *record)
         /* Keep track of full_page_writes */
         lastFullPageWrites = fpw;
     }
+    else if (info == XLOG_CHECKPOINT_REDO)
+    {
+        /* nothing to do here, just for informational purposes */
+    }
 }
 
 /*