Skip to content

Commit 014a508

Browse files
nmisch authored and michaelpq committed
Skip WAL recycling and preallocation during archive recovery.
The previous commit addressed the chief consequences of a race condition between InstallXLogFileSegment() and KeepFileRestoredFromArchive(). Fix three lesser consequences. A spurious durable_rename_excl() LOG message remained possible. KeepFileRestoredFromArchive() wasted the proceeds of WAL recycling and preallocation. Finally, XLogFileInitInternal() could return a descriptor for a file that KeepFileRestoredFromArchive() had already unlinked. That felt like a recipe for future bugs. This commit has been applied as of cc2c7d6 in v15 and newer versions. This is required on stable branches of v13 and v14 to fix a regression reported by Noah Misch, introduced by 1f95181, causing spurious failures in archive recovery (neither streaming nor archive recovery) with concurrent restartpoints. The backpatched versions of the patches have been aligned on these branches by me; Noah Misch is the author. Tests have been conducted by both of us. Note that this commit is known to have introduced a regression of its own. This is fixed by the commit following this one, and not grouped in a single commit to keep the commit history consistent across all branches. Reported-by: Arun Thirupathi Author: Noah Misch <noah@leadboat.com> Discussion: https://postgr.es/m/20210202151416.GB3304930@rfd.leadboat.com Discussion: https://postgr.es/m/20250306193013.36.nmisch@google.com Backpatch-through: 13
1 parent 675b771 commit 014a508

File tree

1 file changed

+57
-8
lines changed
  • src/backend/access/transam

1 file changed

+57
-8
lines changed

src/backend/access/transam/xlog.c

Lines changed: 57 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -673,6 +673,16 @@ typedef struct XLogCtlData
673673
*/
674674
bool SharedHotStandbyActive;
675675

676+
/*
677+
* InstallXLogFileSegmentActive indicates whether the checkpointer should
678+
* arrange for future segments by recycling and/or PreallocXlogFiles().
679+
* Protected by ControlFileLock. Only the startup process changes it. If
680+
* true, anyone can use InstallXLogFileSegment(). If false, the startup
681+
* process owns the exclusive right to install segments, by reading from
682+
* the archive and possibly replacing existing files.
683+
*/
684+
bool InstallXLogFileSegmentActive;
685+
676686
/*
677687
* SharedPromoteIsTriggered indicates if a standby promotion has been
678688
* triggered. Protected by info_lck.
@@ -935,6 +945,7 @@ static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
935945
int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
936946
static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
937947
bool fetching_ckpt, XLogRecPtr tliRecPtr);
948+
static void XLogShutdownWalRcv(void);
938949
static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
939950
static void XLogFileClose(void);
940951
static void PreallocXlogFiles(XLogRecPtr endptr);
@@ -3653,8 +3664,8 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
36533664
* is false.)
36543665
*
36553666
* Returns true if the file was installed successfully. false indicates that
3656-
* max_segno limit was exceeded, or an error occurred while renaming the
3657-
* file into place.
3667+
* max_segno limit was exceeded, the startup process has disabled this
3668+
* function for now, or an error occurred while renaming the file into place.
36583669
*/
36593670
static bool
36603671
InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
@@ -3666,6 +3677,11 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
36663677
XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);
36673678

36683679
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3680+
if (!XLogCtl->InstallXLogFileSegmentActive)
3681+
{
3682+
LWLockRelease(ControlFileLock);
3683+
return false;
3684+
}
36693685

36703686
if (!find_free)
36713687
{
@@ -3770,6 +3786,7 @@ XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
37703786
*/
37713787
if (source == XLOG_FROM_ARCHIVE)
37723788
{
3789+
Assert(!XLogCtl->InstallXLogFileSegmentActive);
37733790
KeepFileRestoredFromArchive(path, xlogfname);
37743791

37753792
/*
@@ -3971,6 +3988,9 @@ PreallocXlogFiles(XLogRecPtr endptr)
39713988
char path[MAXPGPATH];
39723989
uint64 offset;
39733990

3991+
if (!XLogCtl->InstallXLogFileSegmentActive)
3992+
return; /* unlocked check says no */
3993+
39743994
XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
39753995
offset = XLogSegmentOffset(endptr - 1, wal_segment_size);
39763996
if (offset >= (uint32) (0.75 * wal_segment_size))
@@ -4252,6 +4272,7 @@ RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo,
42524272
*/
42534273
if (wal_recycle &&
42544274
*endlogSegNo <= recycleSegNo &&
4275+
XLogCtl->InstallXLogFileSegmentActive && /* callee rechecks this */
42554276
lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
42564277
InstallXLogFileSegment(endlogSegNo, path,
42574278
true, recycleSegNo))
@@ -4265,7 +4286,7 @@ RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo,
42654286
}
42664287
else
42674288
{
4268-
/* No need for any more future segments... */
4289+
/* No need for any more future segments, or recycling failed ... */
42694290
int rc;
42704291

42714292
ereport(DEBUG2,
@@ -5270,6 +5291,7 @@ XLOGShmemInit(void)
52705291
XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
52715292
XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
52725293
XLogCtl->SharedHotStandbyActive = false;
5294+
XLogCtl->InstallXLogFileSegmentActive = false;
52735295
XLogCtl->SharedPromoteIsTriggered = false;
52745296
XLogCtl->WalWriterSleeping = false;
52755297

@@ -5297,6 +5319,11 @@ BootStrapXLOG(void)
52975319
struct timeval tv;
52985320
pg_crc32c crc;
52995321

5322+
/* allow ordinary WAL segment creation, like StartupXLOG() would */
5323+
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
5324+
XLogCtl->InstallXLogFileSegmentActive = true;
5325+
LWLockRelease(ControlFileLock);
5326+
53005327
/*
53015328
* Select a hopefully-unique system identifier code for this installation.
53025329
* We use the result of gettimeofday(), including the fractional seconds
@@ -7685,7 +7712,7 @@ StartupXLOG(void)
76857712
* over these records and subsequent ones if it's still alive when we
76867713
* start writing WAL.
76877714
*/
7688-
ShutdownWalRcv();
7715+
XLogShutdownWalRcv();
76897716

76907717
/*
76917718
* Reset unlogged relations to the contents of their INIT fork. This is
@@ -7710,7 +7737,7 @@ StartupXLOG(void)
77107737
* recovery, e.g., timeline history file) from archive or pg_wal.
77117738
*
77127739
* Note that standby mode must be turned off after killing WAL receiver,
7713-
* i.e., calling ShutdownWalRcv().
7740+
* i.e., calling XLogShutdownWalRcv().
77147741
*/
77157742
Assert(!WalRcvStreaming());
77167743
StandbyMode = false;
@@ -7779,6 +7806,14 @@ StartupXLOG(void)
77797806
*/
77807807
oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
77817808

7809+
/*
7810+
* Allow ordinary WAL segment creation before any exitArchiveRecovery(),
7811+
* which sometimes creates a segment, and after the last ReadRecord().
7812+
*/
7813+
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7814+
XLogCtl->InstallXLogFileSegmentActive = true;
7815+
LWLockRelease(ControlFileLock);
7816+
77827817
/*
77837818
* Consider whether we need to assign a new timeline ID.
77847819
*
@@ -12717,7 +12752,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
1271712752
*/
1271812753
if (StandbyMode && CheckForStandbyTrigger())
1271912754
{
12720-
ShutdownWalRcv();
12755+
XLogShutdownWalRcv();
1272112756
return false;
1272212757
}
1272312758

@@ -12765,7 +12800,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
1276512800
* WAL that we restore from archive.
1276612801
*/
1276712802
if (WalRcvStreaming())
12768-
ShutdownWalRcv();
12803+
XLogShutdownWalRcv();
1276912804

1277012805
/*
1277112806
* Before we sleep, re-scan for possible new timelines if
@@ -12895,7 +12930,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
1289512930
*/
1289612931
if (pendingWalRcvRestart && !startWalReceiver)
1289712932
{
12898-
ShutdownWalRcv();
12933+
XLogShutdownWalRcv();
1289912934

1290012935
/*
1290112936
* Re-scan for possible new timelines if we were
@@ -12945,6 +12980,9 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
1294512980
tli, curFileTLI);
1294612981
}
1294712982
curFileTLI = tli;
12983+
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
12984+
XLogCtl->InstallXLogFileSegmentActive = true;
12985+
LWLockRelease(ControlFileLock);
1294812986
RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
1294912987
PrimarySlotName,
1295012988
wal_receiver_create_temp_slot);
@@ -13115,6 +13153,17 @@ StartupRequestWalReceiverRestart(void)
1311513153
}
1311613154
}
1311713155

13156+
/*
 * Thin wrapper around ShutdownWalRcv().
 *
 * After ShutdownWalRcv() returns, clear
 * XLogCtl->InstallXLogFileSegmentActive while holding ControlFileLock in
 * exclusive mode.  While the flag is false, InstallXLogFileSegment()
 * returns false without installing anything.
 */
13157+
static void
13158+
XLogShutdownWalRcv(void)
13159+
{
13160+
ShutdownWalRcv();
13161+
13162+
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
13163+
XLogCtl->InstallXLogFileSegmentActive = false;
13164+
LWLockRelease(ControlFileLock);
13165+
}
13166+
1311813167
/*
1311913168
* Determine what log level should be used to report a corrupt WAL record
1312013169
* in the current WAL page, previously read by XLogPageRead().

0 commit comments

Comments
 (0)