Skip to content

Commit dda8b87

Browse files
committed
Avoid bogus "out-of-sequence timeline ID" errors in standby-mode.
When the startup process opens a WAL segment after replaying part of it, it validates the first page of the WAL segment, even though the page it's really interested in is later in the file. As part of the validation, it checks that the TLI on the page header is >= the TLI it saw on the last page it read. If the segment contains a timeline switch, and we have already replayed it, and then re-open the WAL segment (because streaming replication got disconnected and reconnected, for example), the TLI check will fail when the first page is validated. Fix that by relaxing the TLI check when re-opening a WAL segment. Backpatch to 9.0. Earlier versions had the same code, but before standby mode was introduced in 9.0, recovery never tried to re-read a segment after partially replaying it. Reported by Amit Kapila, while testing a new feature.
1 parent 8af60da commit dda8b87

File tree

1 file changed

+23
-9
lines changed
  • src/backend/access/transam

1 file changed

+23
-9
lines changed

src/backend/access/transam/xlog.c

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -572,6 +572,7 @@ static uint32 readRecordBufSize = 0;
572572
static XLogRecPtr ReadRecPtr; /* start of last record read */
573573
static XLogRecPtr EndRecPtr; /* end+1 of last record read */
574574
static TimeLineID lastPageTLI = 0;
575+
static TimeLineID lastSegmentTLI = 0;
575576

576577
static XLogRecPtr minRecoveryPoint; /* local copy of
577578
* ControlFile->minRecoveryPoint */
@@ -655,7 +656,7 @@ static void CleanupBackupHistory(void);
655656
static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
656657
static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt);
657658
static void CheckRecoveryConsistency(void);
658-
static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
659+
static bool ValidXLOGHeader(XLogPageHeader hdr, int emode, bool segmentonly);
659660
static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
660661
static List *readTimeLineHistory(TimeLineID targetTLI);
661662
static bool existsTimeLineHistory(TimeLineID probeTLI);
@@ -3949,7 +3950,7 @@ ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt)
39493950
* to go backwards (but we can't reset that variable right here, since
39503951
* we might not change files at all).
39513952
*/
3952-
lastPageTLI = 0; /* see comment in ValidXLOGHeader */
3953+
lastPageTLI = lastSegmentTLI = 0; /* see comment in ValidXLOGHeader */
39533954
randAccess = true; /* allow curFileTLI to go backwards too */
39543955
}
39553956

@@ -4212,7 +4213,7 @@ ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt)
42124213
* ReadRecord. It's not intended for use from anywhere else.
42134214
*/
42144215
static bool
4215-
ValidXLOGHeader(XLogPageHeader hdr, int emode)
4216+
ValidXLOGHeader(XLogPageHeader hdr, int emode, bool segmentonly)
42164217
{
42174218
XLogRecPtr recaddr;
42184219

@@ -4307,18 +4308,31 @@ ValidXLOGHeader(XLogPageHeader hdr, int emode)
43074308
* successive pages of a consistent WAL sequence.
43084309
*
43094310
* Of course this check should only be applied when advancing sequentially
4310-
* across pages; therefore ReadRecord resets lastPageTLI to zero when
4311-
* going to a random page.
4311+
* across pages; therefore ReadRecord resets lastPageTLI and
4312+
* lastSegmentTLI to zero when going to a random page.
4313+
*
4314+
* Sometimes we re-open a segment that's already been partially replayed.
4315+
* In that case we cannot perform the normal TLI check: if there is a
4316+
* timeline switch within the segment, the first page has a smaller TLI
4317+
* than later pages following the timeline switch, and we might've read
4318+
* them already. As a weaker test, we still check that it's not smaller
4319+
* than the TLI we last saw at the beginning of a segment. Pass
4320+
* segmentonly = true when re-validating the first page like that, and the
4321+
* page you're actually interested in comes later.
43124322
*/
4313-
if (hdr->xlp_tli < lastPageTLI)
4323+
if (hdr->xlp_tli < (segmentonly ? lastSegmentTLI : lastPageTLI))
43144324
{
43154325
ereport(emode_for_corrupt_record(emode, recaddr),
43164326
(errmsg("out-of-sequence timeline ID %u (after %u) in log file %u, segment %u, offset %u",
4317-
hdr->xlp_tli, lastPageTLI,
4327+
hdr->xlp_tli,
4328+
segmentonly ? lastSegmentTLI : lastPageTLI,
43184329
readId, readSeg, readOff)));
43194330
return false;
43204331
}
43214332
lastPageTLI = hdr->xlp_tli;
4333+
if (readOff == 0)
4334+
lastSegmentTLI = hdr->xlp_tli;
4335+
43224336
return true;
43234337
}
43244338

@@ -10462,7 +10476,7 @@ XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
1046210476
readId, readSeg, readOff)));
1046310477
goto next_record_is_invalid;
1046410478
}
10465-
if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
10479+
if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode, true))
1046610480
goto next_record_is_invalid;
1046710481
}
1046810482

@@ -10484,7 +10498,7 @@ XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
1048410498
readId, readSeg, readOff)));
1048510499
goto next_record_is_invalid;
1048610500
}
10487-
if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
10501+
if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode, false))
1048810502
goto next_record_is_invalid;
1048910503

1049010504
Assert(targetId == readId);

0 commit comments

Comments
 (0)