Skip to content

Commit 1df0a91

Browse files
committed
Fix WAL replay in presence of an incomplete record
Physical replication always ships WAL segment files to replicas once they are complete. This is a problem if one WAL record is split across a segment boundary and the primary server crashes before writing down the segment with the next portion of the WAL record: WAL writing after crash recovery would happily resume at the point where the broken record started, overwriting that record ... but any standby or backup may have already received a copy of that segment, and they are not rewinding. This causes standbys to stop following the primary after the latter crashes: LOG: invalid contrecord length 7262 at A8/D9FFFBC8 because the standby is still trying to read the continuation record (contrecord) for the original long WAL record, but it is not there and it will never be. A workaround is to stop the replica, delete the WAL file, and restart it -- at which point a fresh copy is brought over from the primary. But that's pretty labor intensive, and I bet many users would just give up and re-clone the standby instead. A fix for this problem was already attempted in commit 515e3d8, but it only addressed the case for the scenario of WAL archiving, so streaming replication would still be a problem (as well as other things such as taking a filesystem-level backup while the server is down after having crashed), and it had performance scalability problems too; so it had to be reverted. This commit fixes the problem using an approach suggested by Andres Freund, whereby the initial portion(s) of the split-up WAL record are kept, and a special type of WAL record is written where the contrecord was lost, so that WAL replay in the replica knows to skip the broken parts. With this approach, we can continue to stream/archive segment files as soon as they are complete, and replay of the broken records will proceed across the crash point without a hitch. Because a new type of WAL record is added, users should be careful to upgrade standbys first, primaries later. Otherwise they risk the standby being unable to start if the primary happens to write such a record. A new TAP test that exercises this is added, but the portability of it is yet to be seen. This has been wrong since the introduction of physical replication, so backpatch all the way back. In stable branches, keep the new XLogReaderState members at the end of the struct, to avoid an ABI break. Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com> Reviewed-by: Nathan Bossart <bossartn@amazon.com> Discussion: https://postgr.es/m/202108232252.dh7uxf6oxwcy@alvherre.pgsql
1 parent c5f7e70 commit 1df0a91

File tree

8 files changed

+323
-7
lines changed

8 files changed

+323
-7
lines changed

src/backend/access/rmgrdesc/xlogdesc.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,16 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
140140
xlrec.ThisTimeLineID, xlrec.PrevTimeLineID,
141141
timestamptz_to_str(xlrec.end_time));
142142
}
143+
else if (info == XLOG_OVERWRITE_CONTRECORD)
144+
{
145+
xl_overwrite_contrecord xlrec;
146+
147+
memcpy(&xlrec, rec, sizeof(xl_overwrite_contrecord));
148+
appendStringInfo(buf, "lsn %X/%X; time %s",
149+
(uint32) (xlrec.overwritten_lsn >> 32),
150+
(uint32) xlrec.overwritten_lsn,
151+
timestamptz_to_str(xlrec.overwrite_time));
152+
}
143153
}
144154

145155
const char *
@@ -179,6 +189,9 @@ xlog_identify(uint8 info)
179189
case XLOG_END_OF_RECOVERY:
180190
id = "END_OF_RECOVERY";
181191
break;
192+
case XLOG_OVERWRITE_CONTRECORD:
193+
id = "OVERWRITE_CONTRECORD";
194+
break;
182195
case XLOG_FPI:
183196
id = "FPI";
184197
break;

src/backend/access/transam/xlog.c

Lines changed: 152 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,15 @@ static XLogRecPtr LastRec;
208208
static XLogRecPtr receivedUpto = 0;
209209
static TimeLineID receiveTLI = 0;
210210

211+
/*
212+
* abortedRecPtr is the start pointer of a broken record at end of WAL when
213+
* recovery completes; missingContrecPtr is the location of the first
214+
* contrecord that went missing. See CreateOverwriteContrecordRecord for
215+
* details.
216+
*/
217+
static XLogRecPtr abortedRecPtr;
218+
static XLogRecPtr missingContrecPtr;
219+
211220
/*
212221
* During recovery, lastFullPageWrites keeps track of full_page_writes that
213222
* the replayed WAL records indicate. It's initialized with full_page_writes
@@ -871,8 +880,11 @@ static void CheckRequiredParameterValues(void);
871880
static void XLogReportParameters(void);
872881
static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
873882
TimeLineID prevTLI);
883+
static void VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec,
884+
XLogReaderState *state);
874885
static void LocalSetXLogInsertAllowed(void);
875886
static void CreateEndOfRecoveryRecord(void);
887+
static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn);
876888
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
877889
static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
878890
static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
@@ -2212,6 +2224,18 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
22122224
if (!Insert->forcePageWrites)
22132225
NewPage->xlp_info |= XLP_BKP_REMOVABLE;
22142226

2227+
/*
2228+
* If a record was found to be broken at the end of recovery, and
2229+
* we're going to write on the page where its first contrecord was
2230+
* lost, set the XLP_FIRST_IS_OVERWRITE_CONTRECORD flag on the page
2231+
* header. See CreateOverwriteContrecordRecord().
2232+
*/
2233+
if (missingContrecPtr == NewPageBeginPtr)
2234+
{
2235+
NewPage->xlp_info |= XLP_FIRST_IS_OVERWRITE_CONTRECORD;
2236+
missingContrecPtr = InvalidXLogRecPtr;
2237+
}
2238+
22152239
/*
22162240
* If first page of an XLOG segment file, make it a long header.
22172241
*/
@@ -4293,6 +4317,19 @@ ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
42934317
EndRecPtr = xlogreader->EndRecPtr;
42944318
if (record == NULL)
42954319
{
4320+
/*
4321+
* When not in standby mode we find that WAL ends in an incomplete
4322+
* record, keep track of that record. After recovery is done,
4323+
* we'll write a record to indicate downstream WAL readers that
4324+
* that portion is to be ignored.
4325+
*/
4326+
if (!StandbyMode &&
4327+
!XLogRecPtrIsInvalid(xlogreader->abortedRecPtr))
4328+
{
4329+
abortedRecPtr = xlogreader->abortedRecPtr;
4330+
missingContrecPtr = xlogreader->missingContrecPtr;
4331+
}
4332+
42964333
if (readFile >= 0)
42974334
{
42984335
close(readFile);
@@ -6789,6 +6826,12 @@ StartupXLOG(void)
67896826
InRecovery = true;
67906827
}
67916828

6829+
/*
6830+
* Start recovery assuming that the final record isn't lost.
6831+
*/
6832+
abortedRecPtr = InvalidXLogRecPtr;
6833+
missingContrecPtr = InvalidXLogRecPtr;
6834+
67926835
/* REDO */
67936836
if (InRecovery)
67946837
{
@@ -7369,8 +7412,9 @@ StartupXLOG(void)
73697412

73707413
/*
73717414
* Kill WAL receiver, if it's still running, before we continue to write
7372-
* the startup checkpoint record. It will trump over the checkpoint and
7373-
* subsequent records if it's still alive when we start writing WAL.
7415+
* the startup checkpoint and aborted-contrecord records. It will trump
7416+
* over these records and subsequent ones if it's still alive when we
7417+
* start writing WAL.
73747418
*/
73757419
ShutdownWalRcv();
73767420

@@ -7403,8 +7447,12 @@ StartupXLOG(void)
74037447
StandbyMode = false;
74047448

74057449
/*
7406-
* Re-fetch the last valid or last applied record, so we can identify the
7407-
* exact endpoint of what we consider the valid portion of WAL.
7450+
* Determine where to start writing WAL next.
7451+
*
7452+
* When recovery ended in an incomplete record, write a WAL record about
7453+
* that and continue after it. In all other cases, re-fetch the last
7454+
* valid or last applied record, so we can identify the exact endpoint of
7455+
* what we consider the valid portion of WAL.
74087456
*/
74097457
record = ReadRecord(xlogreader, LastRec, PANIC, false);
74107458
EndOfLog = EndRecPtr;
@@ -7553,6 +7601,18 @@ StartupXLOG(void)
75537601
XLogCtl->ThisTimeLineID = ThisTimeLineID;
75547602
XLogCtl->PrevTimeLineID = PrevTimeLineID;
75557603

7604+
/*
7605+
* Actually, if WAL ended in an incomplete record, skip the parts that
7606+
* made it through and start writing after the portion that persisted.
7607+
* (It's critical to first write an OVERWRITE_CONTRECORD message, which
7608+
* we'll do as soon as we're open for writing new WAL.)
7609+
*/
7610+
if (!XLogRecPtrIsInvalid(missingContrecPtr))
7611+
{
7612+
Assert(!XLogRecPtrIsInvalid(abortedRecPtr));
7613+
EndOfLog = missingContrecPtr;
7614+
}
7615+
75567616
/*
75577617
* Prepare to write WAL starting at EndOfLog location, and init xlog
75587618
* buffer cache using the block containing the last record from the
@@ -7605,13 +7665,23 @@ StartupXLOG(void)
76057665
XLogCtl->LogwrtRqst.Write = EndOfLog;
76067666
XLogCtl->LogwrtRqst.Flush = EndOfLog;
76077667

7668+
LocalSetXLogInsertAllowed();
7669+
7670+
/* If necessary, write overwrite-contrecord before doing anything else */
7671+
if (!XLogRecPtrIsInvalid(abortedRecPtr))
7672+
{
7673+
Assert(!XLogRecPtrIsInvalid(missingContrecPtr));
7674+
CreateOverwriteContrecordRecord(abortedRecPtr);
7675+
abortedRecPtr = InvalidXLogRecPtr;
7676+
missingContrecPtr = InvalidXLogRecPtr;
7677+
}
7678+
76087679
/*
76097680
* Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
76107681
* record before resource manager writes cleanup WAL records or checkpoint
76117682
* record is written.
76127683
*/
76137684
Insert->fullPageWrites = lastFullPageWrites;
7614-
LocalSetXLogInsertAllowed();
76157685
UpdateFullPageWrites();
76167686
LocalXLogInsertAllowed = -1;
76177687

@@ -9047,6 +9117,53 @@ CreateEndOfRecoveryRecord(void)
90479117
LocalXLogInsertAllowed = -1; /* return to "check" state */
90489118
}
90499119

9120+
/*
9121+
* Write an OVERWRITE_CONTRECORD message.
9122+
*
9123+
* When on WAL replay we expect a continuation record at the start of a page
9124+
* that is not there, recovery ends and WAL writing resumes at that point.
9125+
* But it's wrong to resume writing new WAL back at the start of the record
9126+
* that was broken, because downstream consumers of that WAL (physical
9127+
* replicas) are not prepared to "rewind". So the first action after
9128+
* finishing replay of all valid WAL must be to write a record of this type
9129+
* at the point where the contrecord was missing; to support xlogreader
9130+
* detecting the special case, XLP_FIRST_IS_OVERWRITE_CONTRECORD is also added
9131+
* to the page header where the record occurs. xlogreader has an ad-hoc
9132+
* mechanism to report metadata about the broken record, which is what we
9133+
* use here.
9134+
*
9135+
* At replay time, XLP_FIRST_IS_OVERWRITE_CONTRECORD instructs xlogreader to
9136+
* skip the record it was reading, and pass back the LSN of the skipped
9137+
* record, so that its caller can verify (on "replay" of that record) that the
9138+
* XLOG_OVERWRITE_CONTRECORD matches what was effectively overwritten.
9139+
*/
9140+
static XLogRecPtr
9141+
CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn)
9142+
{
9143+
xl_overwrite_contrecord xlrec;
9144+
XLogRecPtr recptr;
9145+
9146+
/* sanity check */
9147+
if (!RecoveryInProgress())
9148+
elog(ERROR, "can only be used at end of recovery");
9149+
9150+
xlrec.overwritten_lsn = aborted_lsn;
9151+
xlrec.overwrite_time = GetCurrentTimestamp();
9152+
9153+
START_CRIT_SECTION();
9154+
9155+
XLogBeginInsert();
9156+
XLogRegisterData((char *) &xlrec, sizeof(xl_overwrite_contrecord));
9157+
9158+
recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD);
9159+
9160+
XLogFlush(recptr);
9161+
9162+
END_CRIT_SECTION();
9163+
9164+
return recptr;
9165+
}
9166+
90509167
/*
90519168
* Flush all data in shared memory to disk, and fsync
90529169
*
@@ -9846,6 +9963,13 @@ xlog_redo(XLogReaderState *record)
98469963

98479964
RecoveryRestartPoint(&checkPoint);
98489965
}
9966+
else if (info == XLOG_OVERWRITE_CONTRECORD)
9967+
{
9968+
xl_overwrite_contrecord xlrec;
9969+
9970+
memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
9971+
VerifyOverwriteContrecord(&xlrec, record);
9972+
}
98499973
else if (info == XLOG_END_OF_RECOVERY)
98509974
{
98519975
xl_end_of_recovery xlrec;
@@ -10006,6 +10130,29 @@ xlog_redo(XLogReaderState *record)
1000610130
}
1000710131
}
1000810132

10133+
/*
10134+
* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record.
10135+
*/
10136+
static void
10137+
VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec, XLogReaderState *state)
10138+
{
10139+
if (xlrec->overwritten_lsn != state->overwrittenRecPtr)
10140+
elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
10141+
(uint32) (xlrec->overwritten_lsn >> 32),
10142+
(uint32) xlrec->overwritten_lsn,
10143+
(uint32) (state->overwrittenRecPtr >> 32),
10144+
(uint32) state->overwrittenRecPtr);
10145+
10146+
ereport(LOG,
10147+
(errmsg("sucessfully skipped missing contrecord at %X/%X, overwritten at %s",
10148+
(uint32) (xlrec->overwritten_lsn >> 32),
10149+
(uint32) xlrec->overwritten_lsn,
10150+
timestamptz_to_str(xlrec->overwrite_time))));
10151+
10152+
/* Verifying the record should only happen once */
10153+
state->overwrittenRecPtr = InvalidXLogRecPtr;
10154+
}
10155+
1000910156
#ifdef WAL_DEBUG
1001010157

1001110158
static void

src/backend/access/transam/xlogreader.c

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,7 @@ XLogReadRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg)
222222
total_len;
223223
uint32 targetRecOff;
224224
uint32 pageHeaderSize;
225+
bool assembled;
225226
bool gotheader;
226227
int readOff;
227228

@@ -237,6 +238,8 @@ XLogReadRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg)
237238
state->errormsg_buf[0] = '\0';
238239

239240
ResetDecoder(state);
241+
state->abortedRecPtr = InvalidXLogRecPtr;
242+
state->missingContrecPtr = InvalidXLogRecPtr;
240243

241244
if (RecPtr == InvalidXLogRecPtr)
242245
{
@@ -265,7 +268,9 @@ XLogReadRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg)
265268
randAccess = true;
266269
}
267270

271+
restart:
268272
state->currRecPtr = RecPtr;
273+
assembled = false;
269274

270275
targetPagePtr = RecPtr - (RecPtr % XLOG_BLCKSZ);
271276
targetRecOff = RecPtr % XLOG_BLCKSZ;
@@ -362,6 +367,8 @@ XLogReadRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg)
362367
char *buffer;
363368
uint32 gotlen;
364369

370+
assembled = true;
371+
365372
/*
366373
* Enlarge readRecordBuf as needed.
367374
*/
@@ -396,8 +403,25 @@ XLogReadRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg)
396403

397404
Assert(SizeOfXLogShortPHD <= readOff);
398405

399-
/* Check that the continuation on next page looks valid */
400406
pageHeader = (XLogPageHeader) state->readBuf;
407+
408+
/*
409+
* If we were expecting a continuation record and got an
410+
* "overwrite contrecord" flag, that means the continuation record
411+
* was overwritten with a different record. Restart the read by
412+
* assuming the address to read is the location where we found
413+
* this flag; but keep track of the LSN of the record we were
414+
* reading, for later verification.
415+
*/
416+
if (pageHeader->xlp_info & XLP_FIRST_IS_OVERWRITE_CONTRECORD)
417+
{
418+
state->overwrittenRecPtr = state->currRecPtr;
419+
ResetDecoder(state);
420+
RecPtr = targetPagePtr;
421+
goto restart;
422+
}
423+
424+
/* Check that the continuation on next page looks valid */
401425
if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD))
402426
{
403427
report_invalid_record(state,
@@ -498,6 +522,20 @@ XLogReadRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg)
498522
return NULL;
499523

500524
err:
525+
if (assembled)
526+
{
527+
/*
528+
* We get here when a record that spans multiple pages needs to be
529+
* assembled, but something went wrong -- perhaps a contrecord piece
530+
* was lost. If caller is WAL replay, it will know where the aborted
531+
* record was and where to direct followup WAL to be written, marking
532+
* the next piece with XLP_FIRST_IS_OVERWRITE_CONTRECORD, which will
533+
* in turn signal downstream WAL consumers that the broken WAL record
534+
* is to be ignored.
535+
*/
536+
state->abortedRecPtr = RecPtr;
537+
state->missingContrecPtr = targetPagePtr;
538+
}
501539

502540
/*
503541
* Invalidate the read state. We might read from a different source after

src/include/access/xlog_internal.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,10 @@ typedef XLogLongPageHeaderData *XLogLongPageHeader;
7979
#define XLP_LONG_HEADER 0x0002
8080
/* This flag indicates backup blocks starting in this page are optional */
8181
#define XLP_BKP_REMOVABLE 0x0004
82+
/* Replaces a missing contrecord; see CreateOverwriteContrecordRecord */
83+
#define XLP_FIRST_IS_OVERWRITE_CONTRECORD 0x0008
8284
/* All defined flag bits in xlp_info (used for validity checking of header) */
83-
#define XLP_ALL_FLAGS 0x0007
85+
#define XLP_ALL_FLAGS 0x000F
8486

8587
#define XLogPageHeaderSize(hdr) \
8688
(((hdr)->xlp_info & XLP_LONG_HEADER) ? SizeOfXLogLongPHD : SizeOfXLogShortPHD)
@@ -241,6 +243,13 @@ typedef struct xl_restore_point
241243
char rp_name[MAXFNAMELEN];
242244
} xl_restore_point;
243245

246+
/* Overwrite of prior contrecord */
247+
typedef struct xl_overwrite_contrecord
248+
{
249+
XLogRecPtr overwritten_lsn;
250+
TimestampTz overwrite_time;
251+
} xl_overwrite_contrecord;
252+
244253
/* End of recovery mark, when we don't do an END_OF_RECOVERY checkpoint */
245254
typedef struct xl_end_of_recovery
246255
{

0 commit comments

Comments
 (0)