Skip to content

Commit 8a53539

Browse files
committed
Wait for WAL summarization to catch up before creating .partial file.
When a standby is promoted, CleanupAfterArchiveRecovery() may decide to rename the final WAL file from the old timeline by adding ".partial" to the name. If WAL summarization is enabled and this file is renamed before its partial contents are summarized, WAL summarization breaks: the summarizer gets stuck at that point in the WAL stream and just errors out. To fix that, first make the startup process wait for WAL summarization to catch up before renaming the file. Generally, this should be quick, and if it's not, the user can shut off summarize_wal and try again. To make this fix work, also teach the WAL summarizer that after a promotion has occurred, no more WAL can appear on the previous timeline: previously, the WAL summarizer wouldn't switch to the new timeline until we actually started writing WAL there, but that meant that when the startup process was waiting for the WAL summarizer, it was waiting for an action that the summarizer wasn't yet prepared to take. In the process of fixing these bugs, I realized that the logic to wait for WAL summarization to catch up was spread out in a way that made it difficult to reuse properly, so this code refactors things to make it easier. Finally, add a test case that would have caught this bug and the previously-fixed bug that WAL summarization sometimes needs to back up when the timeline changes. Discussion: https://postgr.es/m/CA+TgmoZGEsZodXC4f=XZNkAeyuDmWTSkpkjCEOcF19Am0mt_OA@mail.gmail.com
1 parent 454aab4 commit 8a53539

File tree

7 files changed

+241
-110
lines changed

7 files changed

+241
-110
lines changed

src/backend/access/transam/xlog.c

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -498,6 +498,11 @@ typedef struct XLogCtlData
498498
* If we create a new timeline when the system was started up,
499499
* PrevTimeLineID is the old timeline's ID that we forked off from.
500500
* Otherwise it's equal to InsertTimeLineID.
501+
*
502+
* We set these fields while holding info_lck. Most that reads these
503+
* values knows that recovery is no longer in progress and so can safely
504+
* read the value without a lock, but code that could be run either during
505+
* or after recovery can take info_lck while reading these values.
501506
*/
502507
TimeLineID InsertTimeLineID;
503508
TimeLineID PrevTimeLineID;
@@ -5315,6 +5320,13 @@ CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog,
53155320
char partialfname[MAXFNAMELEN];
53165321
char partialpath[MAXPGPATH];
53175322

5323+
/*
5324+
* If we're summarizing WAL, we can't rename the partial file
5325+
* until the summarizer finishes with it, else it will fail.
5326+
*/
5327+
if (summarize_wal)
5328+
WaitForWalSummarization(EndOfLog);
5329+
53185330
XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
53195331
snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
53205332
snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
@@ -5945,8 +5957,10 @@ StartupXLOG(void)
59455957
}
59465958

59475959
/* Save the selected TimeLineID in shared memory, too */
5960+
SpinLockAcquire(&XLogCtl->info_lck);
59485961
XLogCtl->InsertTimeLineID = newTLI;
59495962
XLogCtl->PrevTimeLineID = endOfRecoveryInfo->lastRecTLI;
5963+
SpinLockRelease(&XLogCtl->info_lck);
59505964

59515965
/*
59525966
* Actually, if WAL ended in an incomplete record, skip the parts that
@@ -6481,6 +6495,25 @@ GetWALInsertionTimeLine(void)
64816495
return XLogCtl->InsertTimeLineID;
64826496
}
64836497

6498+
/*
6499+
* GetWALInsertionTimeLineIfSet -- If the system is not in recovery, returns
6500+
* the WAL insertion timeline; else, returns 0. Wherever possible, use
6501+
* GetWALInsertionTimeLine() instead, since it's cheaper. Note that this
6502+
* function decides recovery has ended as soon as the insert TLI is set, which
6503+
* happens before we set XLogCtl->SharedRecoveryState to RECOVERY_STATE_DONE.
6504+
*/
6505+
TimeLineID
6506+
GetWALInsertionTimeLineIfSet(void)
6507+
{
6508+
TimeLineID insertTLI;
6509+
6510+
SpinLockAcquire(&XLogCtl->info_lck);
6511+
insertTLI = XLogCtl->InsertTimeLineID;
6512+
SpinLockRelease(&XLogCtl->info_lck);
6513+
6514+
return insertTLI;
6515+
}
6516+
64846517
/*
64856518
* GetLastImportantRecPtr -- Returns the LSN of the last important record
64866519
* inserted. All records not explicitly marked as unimportant are considered

src/backend/backup/basebackup_incremental.c

Lines changed: 6 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -277,12 +277,6 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib,
277277
TimeLineID earliest_wal_range_tli = 0;
278278
XLogRecPtr earliest_wal_range_start_lsn = InvalidXLogRecPtr;
279279
TimeLineID latest_wal_range_tli = 0;
280-
XLogRecPtr summarized_lsn;
281-
XLogRecPtr pending_lsn;
282-
XLogRecPtr prior_pending_lsn = InvalidXLogRecPtr;
283-
int deadcycles = 0;
284-
TimestampTz initial_time,
285-
current_time;
286280

287281
Assert(ib->buf.data == NULL);
288282

@@ -458,85 +452,13 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib,
458452
}
459453

460454
/*
461-
* Wait for WAL summarization to catch up to the backup start LSN (but
462-
* time out if it doesn't do so quickly enough).
455+
* Wait for WAL summarization to catch up to the backup start LSN. This
456+
* will throw an error if the WAL summarizer appears to be stuck. If WAL
457+
* summarization gets disabled while we're waiting, this will return
458+
* immediately, and we'll error out further down if the WAL summaries are
459+
* incomplete.
463460
*/
464-
initial_time = current_time = GetCurrentTimestamp();
465-
while (1)
466-
{
467-
long timeout_in_ms = 10000;
468-
long elapsed_seconds;
469-
470-
/*
471-
* Align the wait time to prevent drift. This doesn't really matter,
472-
* but we'd like the warnings about how long we've been waiting to say
473-
* 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without ever
474-
* drifting to something that is not a multiple of ten.
475-
*/
476-
timeout_in_ms -=
477-
TimestampDifferenceMilliseconds(initial_time, current_time) %
478-
timeout_in_ms;
479-
480-
/* Wait for up to 10 seconds. */
481-
summarized_lsn = WaitForWalSummarization(backup_state->startpoint,
482-
timeout_in_ms, &pending_lsn);
483-
484-
/* If WAL summarization has progressed sufficiently, stop waiting. */
485-
if (summarized_lsn >= backup_state->startpoint)
486-
break;
487-
488-
/*
489-
* Keep track of the number of cycles during which there has been no
490-
* progression of pending_lsn. If pending_lsn is not advancing, that
491-
* means that not only are no new files appearing on disk, but we're
492-
* not even incorporating new records into the in-memory state.
493-
*/
494-
if (pending_lsn > prior_pending_lsn)
495-
{
496-
prior_pending_lsn = pending_lsn;
497-
deadcycles = 0;
498-
}
499-
else
500-
++deadcycles;
501-
502-
/*
503-
* If we've managed to wait for an entire minute without the WAL
504-
* summarizer absorbing a single WAL record, error out; probably
505-
* something is wrong.
506-
*
507-
* We could consider also erroring out if the summarizer is taking too
508-
* long to catch up, but it's not clear what rate of progress would be
509-
* acceptable and what would be too slow. So instead, we just try to
510-
* error out in the case where there's no progress at all. That seems
511-
* likely to catch a reasonable number of the things that can go wrong
512-
* in practice (e.g. the summarizer process is completely hung, say
513-
* because somebody hooked up a debugger to it or something) without
514-
* giving up too quickly when the system is just slow.
515-
*/
516-
if (deadcycles >= 6)
517-
ereport(ERROR,
518-
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
519-
errmsg("WAL summarization is not progressing"),
520-
errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.",
521-
LSN_FORMAT_ARGS(backup_state->startpoint),
522-
LSN_FORMAT_ARGS(summarized_lsn),
523-
LSN_FORMAT_ARGS(pending_lsn))));
524-
525-
/*
526-
* Otherwise, just let the user know what's happening.
527-
*/
528-
current_time = GetCurrentTimestamp();
529-
elapsed_seconds =
530-
TimestampDifferenceMilliseconds(initial_time, current_time) / 1000;
531-
ereport(WARNING,
532-
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
533-
errmsg("still waiting for WAL summarization through %X/%X after %ld seconds",
534-
LSN_FORMAT_ARGS(backup_state->startpoint),
535-
elapsed_seconds),
536-
errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.",
537-
LSN_FORMAT_ARGS(summarized_lsn),
538-
LSN_FORMAT_ARGS(pending_lsn))));
539-
}
461+
WaitForWalSummarization(backup_state->startpoint);
540462

541463
/*
542464
* Retrieve a list of all WAL summaries on any timeline that overlap with

src/backend/postmaster/walsummarizer.c

Lines changed: 118 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -650,54 +650,132 @@ SetWalSummarizerLatch(void)
650650
}
651651

652652
/*
653-
* Wait until WAL summarization reaches the given LSN, but not longer than
654-
* the given timeout.
653+
* Wait until WAL summarization reaches the given LSN, but time out with an
654+
* error if the summarizer seems to be stick.
655655
*
656-
* The return value is the first still-unsummarized LSN. If it's greater than
657-
* or equal to the passed LSN, then that LSN was reached. If not, we timed out.
658-
*
659-
* Either way, *pending_lsn is set to the value taken from WalSummarizerCtl.
656+
* Returns immediately if summarize_wal is turned off while we wait. Caller
657+
* is expected to handle this case, if necessary.
660658
*/
661-
XLogRecPtr
662-
WaitForWalSummarization(XLogRecPtr lsn, long timeout, XLogRecPtr *pending_lsn)
659+
void
660+
WaitForWalSummarization(XLogRecPtr lsn)
663661
{
664-
TimestampTz start_time = GetCurrentTimestamp();
665-
TimestampTz deadline = TimestampTzPlusMilliseconds(start_time, timeout);
666-
XLogRecPtr summarized_lsn;
662+
TimestampTz initial_time,
663+
cycle_time,
664+
current_time;
665+
XLogRecPtr prior_pending_lsn = InvalidXLogRecPtr;
666+
int deadcycles = 0;
667667

668-
Assert(!XLogRecPtrIsInvalid(lsn));
669-
Assert(timeout > 0);
668+
initial_time = cycle_time = GetCurrentTimestamp();
670669

671670
while (1)
672671
{
673-
TimestampTz now;
674-
long remaining_timeout;
672+
long timeout_in_ms = 10000;
673+
XLogRecPtr summarized_lsn;
674+
XLogRecPtr pending_lsn;
675+
676+
CHECK_FOR_INTERRUPTS();
677+
678+
/* If WAL summarization is disabled while we're waiting, give up. */
679+
if (!summarize_wal)
680+
return;
675681

676682
/*
677683
* If the LSN summarized on disk has reached the target value, stop.
678684
*/
679685
LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
680686
summarized_lsn = WalSummarizerCtl->summarized_lsn;
681-
*pending_lsn = WalSummarizerCtl->pending_lsn;
687+
pending_lsn = WalSummarizerCtl->pending_lsn;
682688
LWLockRelease(WALSummarizerLock);
689+
690+
/* If WAL summarization has progressed sufficiently, stop waiting. */
683691
if (summarized_lsn >= lsn)
684692
break;
685693

686-
/* Timeout reached? If yes, stop. */
687-
now = GetCurrentTimestamp();
688-
remaining_timeout = TimestampDifferenceMilliseconds(now, deadline);
689-
if (remaining_timeout <= 0)
690-
break;
694+
/* Recheck current time. */
695+
current_time = GetCurrentTimestamp();
696+
697+
/* Have we finished the current cycle of waiting? */
698+
if (TimestampDifferenceMilliseconds(cycle_time,
699+
current_time) >= timeout_in_ms)
700+
{
701+
long elapsed_seconds;
702+
703+
/* Begin new wait cycle. */
704+
cycle_time = TimestampTzPlusMilliseconds(cycle_time,
705+
timeout_in_ms);
706+
707+
/*
708+
* Keep track of the number of cycles during which there has been
709+
* no progression of pending_lsn. If pending_lsn is not advancing,
710+
* that means that not only are no new files appearing on disk,
711+
* but we're not even incorporating new records into the in-memory
712+
* state.
713+
*/
714+
if (pending_lsn > prior_pending_lsn)
715+
{
716+
prior_pending_lsn = pending_lsn;
717+
deadcycles = 0;
718+
}
719+
else
720+
++deadcycles;
721+
722+
/*
723+
* If we've managed to wait for an entire minute without the WAL
724+
* summarizer absorbing a single WAL record, error out; probably
725+
* something is wrong.
726+
*
727+
* We could consider also erroring out if the summarizer is taking
728+
* too long to catch up, but it's not clear what rate of progress
729+
* would be acceptable and what would be too slow. So instead, we
730+
* just try to error out in the case where there's no progress at
731+
* all. That seems likely to catch a reasonable number of the
732+
* things that can go wrong in practice (e.g. the summarizer
733+
* process is completely hung, say because somebody hooked up a
734+
* debugger to it or something) without giving up too quickly when
735+
* the system is just slow.
736+
*/
737+
if (deadcycles >= 6)
738+
ereport(ERROR,
739+
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
740+
errmsg("WAL summarization is not progressing"),
741+
errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.",
742+
LSN_FORMAT_ARGS(lsn),
743+
LSN_FORMAT_ARGS(summarized_lsn),
744+
LSN_FORMAT_ARGS(pending_lsn))));
745+
746+
747+
/*
748+
* Otherwise, just let the user know what's happening.
749+
*/
750+
elapsed_seconds =
751+
TimestampDifferenceMilliseconds(initial_time,
752+
current_time) / 1000;
753+
ereport(WARNING,
754+
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
755+
errmsg("still waiting for WAL summarization through %X/%X after %ld seconds",
756+
LSN_FORMAT_ARGS(lsn),
757+
elapsed_seconds),
758+
errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.",
759+
LSN_FORMAT_ARGS(summarized_lsn),
760+
LSN_FORMAT_ARGS(pending_lsn))));
761+
}
762+
763+
/*
764+
* Align the wait time to prevent drift. This doesn't really matter,
765+
* but we'd like the warnings about how long we've been waiting to say
766+
* 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without ever
767+
* drifting to something that is not a multiple of ten.
768+
*/
769+
timeout_in_ms -=
770+
TimestampDifferenceMilliseconds(cycle_time, current_time);
691771

692772
/* Wait and see. */
693773
ConditionVariableTimedSleep(&WalSummarizerCtl->summary_file_cv,
694-
remaining_timeout,
774+
timeout_in_ms,
695775
WAIT_EVENT_WAL_SUMMARY_READY);
696776
}
697777

698778
ConditionVariableCancelSleep();
699-
700-
return summarized_lsn;
701779
}
702780

703781
/*
@@ -730,6 +808,22 @@ GetLatestLSN(TimeLineID *tli)
730808
TimeLineID flush_tli;
731809
XLogRecPtr replay_lsn;
732810
TimeLineID replay_tli;
811+
TimeLineID insert_tli;
812+
813+
/*
814+
* After the insert TLI has been set and before the control file has
815+
* been updated to show the DB in production, RecoveryInProgress()
816+
* will return true, because it's not yet safe for all backends to
817+
* begin writing WAL. However, replay has already ceased, so from our
818+
* point of view, recovery is already over. We should summarize up to
819+
* where replay stopped and then prepare to resume at the start of the
820+
* insert timeline.
821+
*/
822+
if ((insert_tli = GetWALInsertionTimeLineIfSet()) != 0)
823+
{
824+
*tli = insert_tli;
825+
return GetXLogReplayRecPtr(NULL);
826+
}
733827

734828
/*
735829
* What we really want to know is how much WAL has been flushed to

src/bin/pg_combinebackup/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ tests += {
3535
't/005_integrity.pl',
3636
't/006_db_file_copy.pl',
3737
't/007_wal_level_minimal.pl',
38+
't/008_promote.pl',
3839
],
3940
}
4041
}

0 commit comments

Comments
 (0)