Skip to content

Commit 065583c

Browse files
committed
Prevent summarizer hang when summarize_wal turned off and back on.
Before this commit, when the WAL summarizer started up or recovered from an error, it would resume summarization from wherever it left off. That was OK normally, but wrong if summarize_wal had been turned off temporarily, allowing some WAL to be removed, and then turned back on again. In such cases, the WAL summarizer would simply hang forever. This commit changes the reinitialization sequence for the WAL summarizer to rederive the starting position in the way we were already doing at initial startup, fixing the problem. Per report from Israel Barth Rubio. Reviewed by Tom Lane. Discussion: http://postgr.es/m/CA+TgmoYN6x=YS+FoFOS6=nr6=qkXZFWhdiL7k0oatGwug2hcuA@mail.gmail.com
1 parent 55e56c8 commit 065583c

File tree

3 files changed

+45
-43
lines changed

3 files changed

+45
-43
lines changed

src/backend/access/transam/xlog.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7916,7 +7916,7 @@ KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
79167916
* If WAL summarization is in use, don't remove WAL that has yet to be
79177917
* summarized.
79187918
*/
7919-
keep = GetOldestUnsummarizedLSN(NULL, NULL, false);
7919+
keep = GetOldestUnsummarizedLSN(NULL, NULL);
79207920
if (keep != InvalidXLogRecPtr)
79217921
{
79227922
XLogSegNo unsummarized_segno;

src/backend/postmaster/walsummarizer.c

Lines changed: 43 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -337,7 +337,7 @@ WalSummarizerMain(char *startup_data, size_t startup_data_len)
337337
*
338338
* If we discover that WAL summarization is not enabled, just exit.
339339
*/
340-
current_lsn = GetOldestUnsummarizedLSN(&current_tli, &exact, true);
340+
current_lsn = GetOldestUnsummarizedLSN(&current_tli, &exact);
341341
if (XLogRecPtrIsInvalid(current_lsn))
342342
proc_exit(0);
343343

@@ -479,46 +479,40 @@ GetWalSummarizerState(TimeLineID *summarized_tli, XLogRecPtr *summarized_lsn,
479479

480480
/*
481481
* Get the oldest LSN in this server's timeline history that has not yet been
482-
* summarized.
482+
* summarized, and update shared memory state as appropriate.
483483
*
484484
* If tli != NULL, *tli will be set to the TLI for the LSN that is returned.
485485
*
486486
* If lsn_is_exact != NULL, *lsn_is_exact will be set to true if the returned LSN is
487487
* necessarily the start of a WAL record and false if it's just the beginning
488488
* of a WAL segment.
489-
*
490-
* If reset_pending_lsn is true, resets the pending_lsn in shared memory to
491-
* be equal to the summarized_lsn.
492489
*/
493490
XLogRecPtr
494-
GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact,
495-
bool reset_pending_lsn)
491+
GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact)
496492
{
497493
TimeLineID latest_tli;
498-
LWLockMode mode = reset_pending_lsn ? LW_EXCLUSIVE : LW_SHARED;
499494
int n;
500495
List *tles;
501496
XLogRecPtr unsummarized_lsn = InvalidXLogRecPtr;
502497
TimeLineID unsummarized_tli = 0;
503498
bool should_make_exact = false;
504499
List *existing_summaries;
505500
ListCell *lc;
501+
bool am_wal_summarizer = AmWalSummarizerProcess();
506502

507503
/* If not summarizing WAL, do nothing. */
508504
if (!summarize_wal)
509505
return InvalidXLogRecPtr;
510506

511507
/*
512-
* Unless we need to reset the pending_lsn, we initially acquire the lock
513-
* in shared mode and try to fetch the required information. If we acquire
514-
* in shared mode and find that the data structure hasn't been
515-
* initialized, we reacquire the lock in exclusive mode so that we can
516-
* initialize it. However, if someone else does that first before we get
517-
* the lock, then we can just return the requested information after all.
508+
* If we are not the WAL summarizer process, then we normally just want
509+
* to read the values from shared memory. However, as an exception, if
510+
* shared memory hasn't been initialized yet, then we need to do that so
511+
* that we can read legal values and not remove any WAL too early.
518512
*/
519-
while (1)
513+
if (!am_wal_summarizer)
520514
{
521-
LWLockAcquire(WALSummarizerLock, mode);
515+
LWLockAcquire(WALSummarizerLock, LW_SHARED);
522516

523517
if (WalSummarizerCtl->initialized)
524518
{
@@ -527,27 +521,22 @@ GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact,
527521
*tli = WalSummarizerCtl->summarized_tli;
528522
if (lsn_is_exact != NULL)
529523
*lsn_is_exact = WalSummarizerCtl->lsn_is_exact;
530-
if (reset_pending_lsn)
531-
WalSummarizerCtl->pending_lsn =
532-
WalSummarizerCtl->summarized_lsn;
533524
LWLockRelease(WALSummarizerLock);
534525
return unsummarized_lsn;
535526
}
536527

537-
if (mode == LW_EXCLUSIVE)
538-
break;
539-
540528
LWLockRelease(WALSummarizerLock);
541-
mode = LW_EXCLUSIVE;
542529
}
543530

544531
/*
545-
* The data structure needs to be initialized, and we are the first to
546-
* obtain the lock in exclusive mode, so it's our job to do that
547-
* initialization.
532+
* Find the oldest timeline on which WAL still exists, and the earliest
533+
* segment for which it exists.
548534
*
549-
* So, find the oldest timeline on which WAL still exists, and the
550-
* earliest segment for which it exists.
535+
* Note that we do this every time the WAL summarizer process restarts
536+
* or recovers from an error, in case the contents of pg_wal have changed
537+
* under us e.g. if some files were removed, either manually - which
538+
* shouldn't really happen, but might - or by postgres itself, if
539+
* summarize_wal was turned off and then back on again.
551540
*/
552541
(void) GetLatestLSN(&latest_tli);
553542
tles = readTimeLineHistory(latest_tli);
@@ -568,12 +557,6 @@ GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact,
568557
}
569558
}
570559

571-
/* It really should not be possible for us to find no WAL. */
572-
if (unsummarized_tli == 0)
573-
ereport(ERROR,
574-
errcode(ERRCODE_INTERNAL_ERROR),
575-
errmsg_internal("no WAL found on timeline %u", latest_tli));
576-
577560
/*
578561
* Don't try to summarize anything older than the end LSN of the newest
579562
* summary file that exists for this timeline.
@@ -592,12 +575,32 @@ GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact,
592575
}
593576
}
594577

595-
/* Update shared memory with the discovered values. */
596-
WalSummarizerCtl->initialized = true;
597-
WalSummarizerCtl->summarized_lsn = unsummarized_lsn;
598-
WalSummarizerCtl->summarized_tli = unsummarized_tli;
599-
WalSummarizerCtl->lsn_is_exact = should_make_exact;
600-
WalSummarizerCtl->pending_lsn = unsummarized_lsn;
578+
/* It really should not be possible for us to find no WAL. */
579+
if (unsummarized_tli == 0)
580+
ereport(ERROR,
581+
errcode(ERRCODE_INTERNAL_ERROR),
582+
errmsg_internal("no WAL found on timeline %u", latest_tli));
583+
584+
/*
585+
* If we're the WAL summarizer, we always want to store the values we
586+
* just computed into shared memory, because those are the values we're
587+
* going to use to drive our operation, and so they are the authoritative
588+
* values. Otherwise, we only store values into shared memory if shared
589+
* memory is uninitialized. Our values are not canonical in such a case,
590+
* but it's better to have something than nothing, to guide WAL
591+
* retention.
592+
*/
593+
LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
594+
if (am_wal_summarizer|| !WalSummarizerCtl->initialized)
595+
{
596+
WalSummarizerCtl->initialized = true;
597+
WalSummarizerCtl->summarized_lsn = unsummarized_lsn;
598+
WalSummarizerCtl->summarized_tli = unsummarized_tli;
599+
WalSummarizerCtl->lsn_is_exact = should_make_exact;
600+
WalSummarizerCtl->pending_lsn = unsummarized_lsn;
601+
}
602+
else
603+
unsummarized_lsn = WalSummarizerCtl->summarized_lsn;
601604

602605
/* Also return the values to the caller as required. */
603606
if (tli != NULL)

src/include/postmaster/walsummarizer.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,7 @@ extern void GetWalSummarizerState(TimeLineID *summarized_tli,
2828
XLogRecPtr *pending_lsn,
2929
int *summarizer_pid);
3030
extern XLogRecPtr GetOldestUnsummarizedLSN(TimeLineID *tli,
31-
bool *lsn_is_exact,
32-
bool reset_pending_lsn);
31+
bool *lsn_is_exact);
3332
extern void SetWalSummarizerLatch(void);
3433
extern XLogRecPtr WaitForWalSummarization(XLogRecPtr lsn, long timeout,
3534
XLogRecPtr *pending_lsn);

0 commit comments

Comments
 (0)