Skip to content

Commit 5065aea

Browse files
committed
Avoid creating archive status ".ready" files too early
WAL records may span multiple segments, but XLogWrite() does not wait for the entire record to be written out to disk before creating archive status files. Instead, as soon as the last WAL page of the segment is written, the archive status file is created, and the archiver may process it. If PostgreSQL crashes before it is able to write and flush the rest of the record (in the next WAL segment), the wrong version of the first segment file lingers in the archive, which causes operations such as point-in-time restores to fail.

To fix this, keep track of records that span across segments and ensure that segments are only marked ready-for-archival once such records have been completely written to disk.

This has always been wrong, so backpatch all the way back.

Author: Nathan Bossart <bossartn@amazon.com>
Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Reviewed-by: Ryo Matsumura <matsumura.ryo@fujitsu.com>
Reviewed-by: Andrey Borodin <x4mmm@yandex-team.ru>
Discussion: https://postgr.es/m/CBDDFA01-6E40-46BB-9F98-9340F4379505@amazon.com
1 parent adbfde3 commit 5065aea

File tree

4 files changed

+215
-10
lines changed

4 files changed

+215
-10
lines changed

src/backend/access/transam/xlog.c

Lines changed: 206 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -709,6 +709,18 @@ typedef struct XLogCtlData
709709
XLogRecPtr lastFpwDisableRecPtr;
710710

711711
slock_t info_lck; /* locks shared variables shown above */
712+
713+
/*
714+
* Variables used to track segment-boundary-crossing WAL records. See
715+
* RegisterSegmentBoundary. Protected by segtrack_lck.
716+
*/
717+
XLogSegNo lastNotifiedSeg;
718+
XLogSegNo earliestSegBoundary;
719+
XLogRecPtr earliestSegBoundaryEndPtr;
720+
XLogSegNo latestSegBoundary;
721+
XLogRecPtr latestSegBoundaryEndPtr;
722+
723+
slock_t segtrack_lck; /* locks shared variables shown above */
712724
} XLogCtlData;
713725

714726
static XLogCtlData *XLogCtl = NULL;
@@ -899,6 +911,7 @@ static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecP
899911
static void RemoveXlogFile(const char *segname, XLogRecPtr lastredoptr, XLogRecPtr endptr);
900912
static void UpdateLastRemovedPtr(char *filename);
901913
static void ValidateXLOGDirectoryStructure(void);
914+
/* seg must be the segment containing endpos (asserted in the definition) */
static void RegisterSegmentBoundary(XLogSegNo seg, XLogRecPtr endpos);
902915
static void CleanupBackupHistory(void);
903916
static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
904917
static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
@@ -1129,23 +1142,56 @@ XLogInsertRecord(XLogRecData *rdata,
11291142
END_CRIT_SECTION();
11301143

11311144
/*
1132-
* Update shared LogwrtRqst.Write, if we crossed page boundary.
1145+
* If we crossed page boundary, update LogwrtRqst.Write; if we crossed
1146+
* segment boundary, register that and wake up walwriter.
11331147
*/
11341148
if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
11351149
{
1150+
XLogSegNo StartSeg;
1151+
XLogSegNo EndSeg;
1152+
1153+
XLByteToSeg(StartPos, StartSeg, wal_segment_size);
1154+
XLByteToSeg(EndPos, EndSeg, wal_segment_size);
1155+
1156+
/*
1157+
* Register our crossing the segment boundary if that occurred.
1158+
*
1159+
* Note that we did not use XLByteToPrevSeg() for determining the
1160+
* ending segment. This is so that a record that fits perfectly into
1161+
* the end of the segment causes the latter to get marked ready for
1162+
* archival immediately.
1163+
*/
1164+
if (StartSeg != EndSeg && XLogArchivingActive())
1165+
RegisterSegmentBoundary(EndSeg, EndPos);
1166+
1167+
/*
1168+
* Advance LogwrtRqst.Write so that it includes new block(s).
1169+
*
1170+
* We do this after registering the segment boundary so that the
1171+
* comparison with the flushed pointer below can use the latest value
1172+
* known globally.
1173+
*/
11361174
SpinLockAcquire(&XLogCtl->info_lck);
1137-
/* advance global request to include new block(s) */
11381175
if (XLogCtl->LogwrtRqst.Write < EndPos)
11391176
XLogCtl->LogwrtRqst.Write = EndPos;
11401177
/* update local result copy while I have the chance */
11411178
LogwrtResult = XLogCtl->LogwrtResult;
11421179
SpinLockRelease(&XLogCtl->info_lck);
1180+
1181+
/*
1182+
* There's a chance that the record was already flushed to disk and we
1183+
* missed marking segments as ready for archive. If this happens, we
1184+
* nudge the WALWriter, which will take care of notifying segments as
1185+
* needed.
1186+
*/
1187+
if (StartSeg != EndSeg && XLogArchivingActive() &&
1188+
LogwrtResult.Flush >= EndPos && ProcGlobal->walwriterLatch)
1189+
SetLatch(ProcGlobal->walwriterLatch);
11431190
}
11441191

11451192
/*
11461193
* If this was an XLOG_SWITCH record, flush the record and the empty
1147-
* padding space that fills the rest of the segment, and perform
1148-
* end-of-segment actions (eg, notifying archiver).
1194+
* padding space that fills the rest of the segment.
11491195
*/
11501196
if (isLogSwitch)
11511197
{
@@ -2388,6 +2434,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
23882434

23892435
/* We should always be inside a critical section here */
23902436
Assert(CritSectionCount > 0);
2437+
Assert(LWLockHeldByMe(WALWriteLock));
23912438

23922439
/*
23932440
* Update local LogwrtResult (caller probably did this already, but...)
@@ -2524,11 +2571,12 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
25242571
* later. Doing it here ensures that one and only one backend will
25252572
* perform this fsync.
25262573
*
2527-
* This is also the right place to notify the Archiver that the
2528-
* segment is ready to copy to archival storage, and to update the
2529-
* timer for archive_timeout, and to signal for a checkpoint if
2530-
* too many logfile segments have been used since the last
2531-
* checkpoint.
2574+
* If WAL archiving is active, we attempt to notify the archiver
2575+
* of any segments that are now ready for archival.
2576+
*
2577+
* This is also the right place to update the timer for
2578+
* archive_timeout and to signal for a checkpoint if too many
2579+
* logfile segments have been used since the last checkpoint.
25322580
*/
25332581
if (finishing_seg)
25342582
{
@@ -2540,7 +2588,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
25402588
LogwrtResult.Flush = LogwrtResult.Write; /* end of page */
25412589

25422590
if (XLogArchivingActive())
2543-
XLogArchiveNotifySeg(openLogSegNo);
2591+
NotifySegmentsReadyForArchive(LogwrtResult.Flush);
25442592

25452593
XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
25462594
XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;
@@ -2627,6 +2675,9 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
26272675
XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
26282676
SpinLockRelease(&XLogCtl->info_lck);
26292677
}
2678+
2679+
if (XLogArchivingActive())
2680+
NotifySegmentsReadyForArchive(LogwrtResult.Flush);
26302681
}
26312682

26322683
/*
@@ -4227,6 +4278,129 @@ ValidateXLOGDirectoryStructure(void)
42274278
}
42284279
}
42294280

4281+
/*
4282+
* RegisterSegmentBoundary
4283+
*
4284+
* WAL records that are split across a segment boundary require special
4285+
* treatment for archiving: the initial segment must not be archived until
4286+
* the end segment has been flushed, in case we crash before we have
4287+
* the chance to flush the end segment (because after recovery we would
4288+
* overwrite that WAL record with a different one, and so the file we
4289+
* archived no longer represents truth.) This also applies to streaming
4290+
* physical replication.
4291+
*
4292+
* To handle this, we keep track of the LSN of WAL records that cross
4293+
* segment boundaries. Two such are sufficient: the ones with the
4294+
* earliest and the latest end pointers we know about, since the flush
4295+
* position advances monotonically. WAL record writers register
4296+
* boundary-crossing records here, which is used by .ready file creation
4297+
* to delay until the end segment is known flushed.
4298+
*/
4299+
static void
4300+
RegisterSegmentBoundary(XLogSegNo seg, XLogRecPtr endpos)
4301+
{
4302+
XLogSegNo segno PG_USED_FOR_ASSERTS_ONLY;
4303+
4304+
/* verify caller computed segment number correctly */
4305+
AssertArg((XLByteToSeg(endpos, segno, wal_segment_size), segno == seg));
4306+
4307+
SpinLockAcquire(&XLogCtl->segtrack_lck);
4308+
4309+
/*
4310+
* If no segment boundaries are registered, store the new segment boundary
4311+
* in earliestSegBoundary. Otherwise, store the greater segment
4312+
* boundaries in latestSegBoundary.
4313+
*/
4314+
if (XLogCtl->earliestSegBoundary == MaxXLogSegNo)
4315+
{
4316+
XLogCtl->earliestSegBoundary = seg;
4317+
XLogCtl->earliestSegBoundaryEndPtr = endpos;
4318+
}
4319+
else if (seg > XLogCtl->earliestSegBoundary &&
4320+
(XLogCtl->latestSegBoundary == MaxXLogSegNo ||
4321+
seg > XLogCtl->latestSegBoundary))
4322+
{
4323+
XLogCtl->latestSegBoundary = seg;
4324+
XLogCtl->latestSegBoundaryEndPtr = endpos;
4325+
}
4326+
4327+
SpinLockRelease(&XLogCtl->segtrack_lck);
4328+
}
4329+
4330+
/*
4331+
* NotifySegmentsReadyForArchive
4332+
*
4333+
* Mark segments as ready for archival, given that it is safe to do so.
4334+
* This function is idempotent.
4335+
*/
4336+
void
4337+
NotifySegmentsReadyForArchive(XLogRecPtr flushRecPtr)
4338+
{
4339+
XLogSegNo latest_boundary_seg;
4340+
XLogSegNo last_notified;
4341+
XLogSegNo flushed_seg;
4342+
XLogSegNo seg;
4343+
bool keep_latest;
4344+
4345+
XLByteToSeg(flushRecPtr, flushed_seg, wal_segment_size);
4346+
4347+
SpinLockAcquire(&XLogCtl->segtrack_lck);
4348+
4349+
if (XLogCtl->latestSegBoundary <= flushed_seg &&
4350+
XLogCtl->latestSegBoundaryEndPtr <= flushRecPtr)
4351+
{
4352+
latest_boundary_seg = XLogCtl->latestSegBoundary;
4353+
keep_latest = false;
4354+
}
4355+
else if (XLogCtl->earliestSegBoundary <= flushed_seg &&
4356+
XLogCtl->earliestSegBoundaryEndPtr <= flushRecPtr)
4357+
{
4358+
latest_boundary_seg = XLogCtl->earliestSegBoundary;
4359+
keep_latest = true;
4360+
}
4361+
else
4362+
{
4363+
SpinLockRelease(&XLogCtl->segtrack_lck);
4364+
return;
4365+
}
4366+
4367+
last_notified = XLogCtl->lastNotifiedSeg;
4368+
4369+
/*
4370+
* Update shared memory and discard segment boundaries that are no longer
4371+
* needed.
4372+
*
4373+
* It is safe to update shared memory before we attempt to create the
4374+
* .ready files. If our calls to XLogArchiveNotifySeg() fail,
4375+
* RemoveOldXlogFiles() will retry it as needed.
4376+
*/
4377+
if (last_notified < latest_boundary_seg - 1)
4378+
XLogCtl->lastNotifiedSeg = latest_boundary_seg - 1;
4379+
4380+
if (keep_latest)
4381+
{
4382+
XLogCtl->earliestSegBoundary = XLogCtl->latestSegBoundary;
4383+
XLogCtl->earliestSegBoundaryEndPtr = XLogCtl->latestSegBoundaryEndPtr;
4384+
}
4385+
else
4386+
{
4387+
XLogCtl->earliestSegBoundary = MaxXLogSegNo;
4388+
XLogCtl->earliestSegBoundaryEndPtr = InvalidXLogRecPtr;
4389+
}
4390+
4391+
XLogCtl->latestSegBoundary = MaxXLogSegNo;
4392+
XLogCtl->latestSegBoundaryEndPtr = InvalidXLogRecPtr;
4393+
4394+
SpinLockRelease(&XLogCtl->segtrack_lck);
4395+
4396+
/*
4397+
* Notify archiver about segments that are ready for archival (by creating
4398+
* the corresponding .ready files).
4399+
*/
4400+
for (seg = last_notified + 1; seg < latest_boundary_seg; seg++)
4401+
XLogArchiveNotifySeg(seg);
4402+
}
4403+
42304404
/*
42314405
* Remove previous backup history files. This also retries creation of
42324406
* .ready files for any backup history files for which XLogArchiveNotify
@@ -5112,8 +5286,16 @@ XLOGShmemInit(void)
51125286

51135287
SpinLockInit(&XLogCtl->Insert.insertpos_lck);
51145288
SpinLockInit(&XLogCtl->info_lck);
5289+
SpinLockInit(&XLogCtl->segtrack_lck);
51155290
SpinLockInit(&XLogCtl->ulsn_lck);
51165291
InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
5292+
5293+
/* Initialize stuff for marking segments as ready for archival. */
5294+
XLogCtl->lastNotifiedSeg = MaxXLogSegNo;
5295+
XLogCtl->earliestSegBoundary = MaxXLogSegNo;
5296+
XLogCtl->earliestSegBoundaryEndPtr = InvalidXLogRecPtr;
5297+
XLogCtl->latestSegBoundary = MaxXLogSegNo;
5298+
XLogCtl->latestSegBoundaryEndPtr = InvalidXLogRecPtr;
51175299
}
51185300

51195301
/*
@@ -7605,6 +7787,20 @@ StartupXLOG(void)
76057787
XLogCtl->LogwrtRqst.Write = EndOfLog;
76067788
XLogCtl->LogwrtRqst.Flush = EndOfLog;
76077789

7790+
/*
7791+
* Initialize XLogCtl->lastNotifiedSeg to the previous WAL file.
7792+
*/
7793+
if (XLogArchivingActive())
7794+
{
7795+
XLogSegNo EndOfLogSeg;
7796+
7797+
XLByteToSeg(EndOfLog, EndOfLogSeg, wal_segment_size);
7798+
7799+
SpinLockAcquire(&XLogCtl->segtrack_lck);
7800+
XLogCtl->lastNotifiedSeg = EndOfLogSeg - 1;
7801+
SpinLockRelease(&XLogCtl->segtrack_lck);
7802+
}
7803+
76087804
/*
76097805
* Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
76107806
* record before resource manager writes cleanup WAL records or checkpoint

src/backend/postmaster/walwriter.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,13 @@ WalWriterMain(void)
256256
proc_exit(0); /* done */
257257
}
258258

259+
/*
260+
* Notify the archiver of any WAL segments that are ready. We do this
261+
* here to handle a race condition where WAL is flushed to disk prior
262+
* to registering the segment boundary.
263+
*/
264+
NotifySegmentsReadyForArchive(GetFlushRecPtr());
265+
259266
/*
260267
* Do what we're here for; then, if XLogBackgroundFlush() found useful
261268
* work to do, reset hibernation counter.

src/include/access/xlog.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,7 @@ extern XLogRecPtr GetInsertRecPtr(void);
322322
extern XLogRecPtr GetFlushRecPtr(void);
323323
extern XLogRecPtr GetLastImportantRecPtr(void);
324324
extern void RemovePromoteSignalFiles(void);
325+
extern void NotifySegmentsReadyForArchive(XLogRecPtr flushRecPtr);
325326

326327
extern bool CheckPromoteSignal(void);
327328
extern void WakeupRecovery(void);

src/include/access/xlogdefs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ typedef uint64 XLogRecPtr;
3939
* XLogSegNo - physical log file sequence number.
4040
*/
4141
typedef uint64 XLogSegNo;
42+
/*
 * MaxXLogSegNo - the largest value an XLogSegNo can hold (all 64 bits set).
 * xlog.c initializes its segment-boundary tracking fields to this value and
 * treats it as meaning "no segment boundary registered".
 */
#define MaxXLogSegNo ((XLogSegNo) 0xFFFFFFFFFFFFFFFF)
4243

4344
/*
4445
* TimeLineID (TLI) - identifies different database histories to prevent

0 commit comments

Comments
 (0)