Skip to content

Commit c06380e

Browse files
committed
Fix scenario where streaming standby gets stuck at a continuation record.
If a continuation record is split so that its first half has already been removed from the master, and is only present in the standby's pg_wal, and there is a recycled WAL segment in the standby server that looks like it would contain the second half, recovery would get stuck. The code in XLogPageRead() incorrectly started streaming at the beginning of the WAL record, even if we had already read the first page. Backpatch to 9.4. In principle, older versions have the same problem, but without replication slots, there was no straightforward mechanism to prevent the master from recycling old WAL that was still needed by a standby. Without such a mechanism, I think it's reasonable to assume that there's enough slack in how many old segments are kept around to not run into this, or you have a WAL archive. Reported by Jonathon Nelson. Analysis and patch by Kyotaro HORIGUCHI, with some extra comments by me. Discussion: https://www.postgresql.org/message-id/CACJqAM3xVz0JY1XFDKPP%2BJoJAjoGx%3DGNuOAshEDWCext7BFvCQ%40mail.gmail.com
1 parent 134db37 commit c06380e

File tree

3 files changed

+62
-13
lines changed

3 files changed

+62
-13
lines changed

src/backend/access/transam/xlog.c

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10988,6 +10988,40 @@ XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
1098810988
Assert(reqLen <= readLen);
1098910989

1099010990
*readTLI = curFileTLI;
10991+
10992+
/*
10993+
* Check the page header immediately, so that we can retry immediately if
10994+
* it's not valid. This may seem unnecessary, because XLogReadRecord()
10995+
* validates the page header anyway, and would propagate the failure up to
10996+
* ReadRecord(), which would retry. However, there's a corner case with
10997+
* continuation records, if a record is split across two pages such that
10998+
* we would need to read the two pages from different sources. For
10999+
* example, imagine a scenario where a streaming replica is started up,
11000+
* and replay reaches a record that's split across two WAL segments. The
11001+
* first page is only available locally, in pg_wal, because it's already
11002+
* been recycled in the master. The second page, however, is not present
11003+
* in pg_wal, and we should stream it from the master. There is a recycled
11004+
* WAL segment present in pg_wal, with garbage contents, however. We would
11005+
* read the first page from the local WAL segment, but when reading the
11006+
* second page, we would read the bogus, recycled, WAL segment. If we
11007+
* didn't catch that case here, we would never recover, because
11008+
* ReadRecord() would retry reading the whole record from the beginning.
11009+
*
11010+
* Of course, this only catches errors in the page header, which is what
11011+
* happens in the case of a recycled WAL segment. Other kinds of errors or
11012+
* corruption still have the same problem. But this at least fixes the
11013+
* common case, which can happen as part of normal operation.
11014+
*
11015+
* Validating the page header is cheap enough that doing it twice
11016+
* shouldn't be a big deal from a performance point of view.
11017+
*/
11018+
if (!XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
11019+
{
11020+
/* reset any error XLogReaderValidatePageHeader() might have set */
11021+
xlogreader->errormsg_buf[0] = '\0';
11022+
goto next_record_is_invalid;
11023+
}
11024+
1099111025
return readLen;
1099211026

1099311027
next_record_is_invalid:
@@ -11121,12 +11155,18 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
1112111155
}
1112211156
else
1112311157
{
11124-
ptr = tliRecPtr;
11158+
ptr = RecPtr;
11159+
11160+
/*
11161+
* Use the record begin position to determine the
11162+
* TLI, rather than the position we're reading.
11163+
*/
1112511164
tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
1112611165

1112711166
if (curFileTLI > 0 && tli < curFileTLI)
1112811167
elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
11129-
(uint32) (ptr >> 32), (uint32) ptr,
11168+
(uint32) (tliRecPtr >> 32),
11169+
(uint32) tliRecPtr,
1113011170
tli, curFileTLI);
1113111171
}
1113211172
curFileTLI = tli;

src/backend/access/transam/xlogreader.c

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,6 @@
2323

2424
static bool allocate_recordbuf(XLogReaderState *state, uint32 reclength);
2525

26-
static bool ValidXLogPageHeader(XLogReaderState *state, XLogRecPtr recptr,
27-
XLogPageHeader hdr);
2826
static bool ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr,
2927
XLogRecPtr PrevRecPtr, XLogRecord *record, bool randAccess);
3028
static bool ValidXLogRecord(XLogReaderState *state, XLogRecord *record,
@@ -500,7 +498,6 @@ ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen)
500498
*/
501499
if (targetSegNo != state->readSegNo && targetPageOff != 0)
502500
{
503-
XLogPageHeader hdr;
504501
XLogRecPtr targetSegmentPtr = pageptr - targetPageOff;
505502

506503
readLen = state->read_page(state, targetSegmentPtr, XLOG_BLCKSZ,
@@ -512,9 +509,8 @@ ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen)
512509
/* we can be sure to have enough WAL available, we scrolled back */
513510
Assert(readLen == XLOG_BLCKSZ);
514511

515-
hdr = (XLogPageHeader) state->readBuf;
516-
517-
if (!ValidXLogPageHeader(state, targetSegmentPtr, hdr))
512+
if (!XLogReaderValidatePageHeader(state, targetSegmentPtr,
513+
state->readBuf))
518514
goto err;
519515
}
520516

@@ -551,7 +547,7 @@ ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen)
551547
/*
552548
* Now that we know we have the full header, validate it.
553549
*/
554-
if (!ValidXLogPageHeader(state, pageptr, hdr))
550+
if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr))
555551
goto err;
556552

557553
/* update cache information */
@@ -751,15 +747,19 @@ ValidXLogRecord(XLogReaderState *state, XLogRecord *record, XLogRecPtr recptr)
751747
}
752748

753749
/*
754-
* Validate a page header
750+
* Validate a page header.
751+
*
752+
* Check if 'phdr' is valid as the header of the XLog page at position
753+
* 'recptr'.
755754
*/
756-
static bool
757-
ValidXLogPageHeader(XLogReaderState *state, XLogRecPtr recptr,
758-
XLogPageHeader hdr)
755+
bool
756+
XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr,
757+
char *phdr)
759758
{
760759
XLogRecPtr recaddr;
761760
XLogSegNo segno;
762761
int32 offset;
762+
XLogPageHeader hdr = (XLogPageHeader) phdr;
763763

764764
Assert((recptr % XLOG_BLCKSZ) == 0);
765765

@@ -847,6 +847,11 @@ ValidXLogPageHeader(XLogReaderState *state, XLogRecPtr recptr,
847847
return false;
848848
}
849849

850+
/*
851+
* Check that the address on the page agrees with what we expected.
852+
* This check typically fails when an old WAL segment is recycled,
853+
* and hasn't been overwritten with new data yet.
854+
*/
850855
if (hdr->xlp_pageaddr != recaddr)
851856
{
852857
char fname[MAXFNAMELEN];

src/include/access/xlogreader.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,10 @@ extern void XLogReaderFree(XLogReaderState *state);
119119
extern struct XLogRecord *XLogReadRecord(XLogReaderState *state,
120120
XLogRecPtr recptr, char **errormsg);
121121

122+
/* Validate a page header */
123+
extern bool XLogReaderValidatePageHeader(XLogReaderState *state,
124+
XLogRecPtr recptr, char *phdr);
125+
122126
#ifdef FRONTEND
123127
extern XLogRecPtr XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr);
124128
#endif /* FRONTEND */

0 commit comments

Comments
 (0)