Skip to content

Commit 009eeee

Browse files
committed
pg_rewind: Fix determining TLI when server was just promoted.
If the source server was just promoted, and it hasn't written the checkpoint record yet, pg_rewind considered the server to be still on the old timeline. Because of that, it would claim incorrectly that no rewind is required. Fix that by looking at minRecoveryPointTLI in the control file in addition to the ThisTimeLineID on the checkpoint. This has been a known issue since forever, and we had worked around it in the regression tests by issuing a checkpoint after each promotion, before running pg_rewind. But that was always quite hacky, so better to fix this properly. This doesn't add any new tests for this, but removes the previously-added workarounds from the existing tests, so that they should occasionally hit this codepath again. This is arguably a bug fix, but don't backpatch because we haven't really treated it as a bug so far. Also, the patch didn't apply cleanly to v13 and below. I'm sure it could be made to work on v13, but it doesn't seem worth the risk and effort. Reviewed-by: Kyotaro Horiguchi, Ibrar Ahmed, Aleksander Alekseev Discussion: https://www.postgresql.org/message-id/9f568c97-87fe-a716-bd39-65299b8a60f4%40iki.fi
1 parent 75c7376 commit 009eeee

File tree

4 files changed

+64
-59
lines changed

4 files changed

+64
-59
lines changed

src/bin/pg_rewind/pg_rewind.c

Lines changed: 64 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,13 @@ static void digestControlFile(ControlFileData *ControlFile,
4545
const char *content, size_t size);
4646
static void getRestoreCommand(const char *argv0);
4747
static void sanityChecks(void);
48-
static void findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex);
48+
static TimeLineHistoryEntry *getTimelineHistory(TimeLineID tli, bool is_source,
49+
int *nentries);
50+
static void findCommonAncestorTimeline(TimeLineHistoryEntry *a_history,
51+
int a_nentries,
52+
TimeLineHistoryEntry *b_history,
53+
int b_nentries,
54+
XLogRecPtr *recptr, int *tliIndex);
4955
static void ensureCleanShutdown(const char *argv0);
5056
static void disconnect_atexit(void);
5157

@@ -134,6 +140,8 @@ main(int argc, char **argv)
134140
XLogRecPtr chkptrec;
135141
TimeLineID chkpttli;
136142
XLogRecPtr chkptredo;
143+
TimeLineID source_tli;
144+
TimeLineID target_tli;
137145
XLogRecPtr target_wal_endrec;
138146
size_t size;
139147
char *buffer;
@@ -332,14 +340,28 @@ main(int argc, char **argv)
332340

333341
sanityChecks();
334342

343+
/*
344+
* Usually, the TLI can be found in the latest checkpoint record. But if
345+
* the source server is just being promoted (or it's a standby that's
346+
* following a primary that's just being promoted), and the checkpoint
347+
* requested by the promotion hasn't completed yet, the latest timeline is
348+
* in minRecoveryPoint. So we check which is later, the TLI of the
349+
* minRecoveryPoint or the latest checkpoint.
350+
*/
351+
source_tli = Max(ControlFile_source.minRecoveryPointTLI,
352+
ControlFile_source.checkPointCopy.ThisTimeLineID);
353+
354+
/* Similarly for the target. */
355+
target_tli = Max(ControlFile_target.minRecoveryPointTLI,
356+
ControlFile_target.checkPointCopy.ThisTimeLineID);
357+
335358
/*
336359
* Find the common ancestor timeline between the clusters.
337360
*
338361
* If both clusters are already on the same timeline, there's nothing to
339362
* do.
340363
*/
341-
if (ControlFile_target.checkPointCopy.ThisTimeLineID ==
342-
ControlFile_source.checkPointCopy.ThisTimeLineID)
364+
if (target_tli == source_tli)
343365
{
344366
pg_log_info("source and target cluster are on the same timeline");
345367
rewind_needed = false;
@@ -348,12 +370,31 @@ main(int argc, char **argv)
348370
else
349371
{
350372
XLogRecPtr chkptendrec;
373+
TimeLineHistoryEntry *sourceHistory;
374+
int sourceNentries;
375+
376+
/*
377+
* Retrieve timelines for both source and target, and find the point
378+
* where they diverged.
379+
*/
380+
sourceHistory = getTimelineHistory(source_tli, true, &sourceNentries);
381+
targetHistory = getTimelineHistory(target_tli, false, &targetNentries);
382+
383+
findCommonAncestorTimeline(sourceHistory, sourceNentries,
384+
targetHistory, targetNentries,
385+
&divergerec, &lastcommontliIndex);
351386

352-
findCommonAncestorTimeline(&divergerec, &lastcommontliIndex);
353387
pg_log_info("servers diverged at WAL location %X/%X on timeline %u",
354388
LSN_FORMAT_ARGS(divergerec),
355389
targetHistory[lastcommontliIndex].tli);
356390

391+
/*
392+
* Don't need the source history anymore. The target history is still
393+
* needed by the routines in parsexlog.c, when we read the target WAL.
394+
*/
395+
pfree(sourceHistory);
396+
397+
357398
/*
358399
* Determine the end-of-WAL on the target.
359400
*
@@ -654,7 +695,8 @@ perform_rewind(filemap_t *filemap, rewind_source *source,
654695
pg_fatal("source system was in unexpected state at end of rewind");
655696

656697
endrec = source->get_current_wal_insert_lsn(source);
657-
endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID;
698+
endtli = Max(ControlFile_source_after.checkPointCopy.ThisTimeLineID,
699+
ControlFile_source_after.minRecoveryPointTLI);
658700
}
659701
}
660702
else
@@ -796,16 +838,12 @@ MinXLogRecPtr(XLogRecPtr a, XLogRecPtr b)
796838
}
797839

798840
/*
799-
* Retrieve timeline history for given control file which should behold
800-
* either source or target.
841+
* Retrieve timeline history for the source or target system.
801842
*/
802843
static TimeLineHistoryEntry *
803-
getTimelineHistory(ControlFileData *controlFile, int *nentries)
844+
getTimelineHistory(TimeLineID tli, bool is_source, int *nentries)
804845
{
805846
TimeLineHistoryEntry *history;
806-
TimeLineID tli;
807-
808-
tli = controlFile->checkPointCopy.ThisTimeLineID;
809847

810848
/*
811849
* Timeline 1 does not have a history file, so there is no need to check
@@ -826,12 +864,10 @@ getTimelineHistory(ControlFileData *controlFile, int *nentries)
826864
TLHistoryFilePath(path, tli);
827865

828866
/* Get history file from appropriate source */
829-
if (controlFile == &ControlFile_source)
867+
if (is_source)
830868
histfile = source->fetch_file(source, path, NULL);
831-
else if (controlFile == &ControlFile_target)
832-
histfile = slurpFile(datadir_target, path, NULL);
833869
else
834-
pg_fatal("invalid control file");
870+
histfile = slurpFile(datadir_target, path, NULL);
835871

836872
history = rewind_parseTimeLineHistory(histfile, tli, nentries);
837873
pg_free(histfile);
@@ -841,12 +877,10 @@ getTimelineHistory(ControlFileData *controlFile, int *nentries)
841877
{
842878
int i;
843879

844-
if (controlFile == &ControlFile_source)
880+
if (is_source)
845881
pg_log_debug("Source timeline history:");
846-
else if (controlFile == &ControlFile_target)
847-
pg_log_debug("Target timeline history:");
848882
else
849-
Assert(false);
883+
pg_log_debug("Target timeline history:");
850884

851885
/*
852886
* Print the target timeline history.
@@ -866,28 +900,19 @@ getTimelineHistory(ControlFileData *controlFile, int *nentries)
866900
}
867901

868902
/*
869-
* Determine the TLI of the last common timeline in the timeline history of the
870-
* two clusters. targetHistory is filled with target timeline history and
871-
* targetNentries is number of items in targetHistory. *tliIndex is set to the
872-
* index of last common timeline in targetHistory array, and *recptr is set to
873-
* the position where the timeline history diverged (ie. the first WAL record
874-
* that's not the same in both clusters).
875-
*
876-
* Control files of both clusters must be read into ControlFile_target/source
877-
* before calling this routine.
903+
* Determine the TLI of the last common timeline in the timeline history of
904+
* two clusters. *tliIndex is set to the index of last common timeline in
905+
* the arrays, and *recptr is set to the position where the timeline history
906+
* diverged (ie. the first WAL record that's not the same in both clusters).
878907
*/
879908
static void
880-
findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex)
909+
findCommonAncestorTimeline(TimeLineHistoryEntry *a_history, int a_nentries,
910+
TimeLineHistoryEntry *b_history, int b_nentries,
911+
XLogRecPtr *recptr, int *tliIndex)
881912
{
882-
TimeLineHistoryEntry *sourceHistory;
883-
int sourceNentries;
884913
int i,
885914
n;
886915

887-
/* Retrieve timelines for both source and target */
888-
sourceHistory = getTimelineHistory(&ControlFile_source, &sourceNentries);
889-
targetHistory = getTimelineHistory(&ControlFile_target, &targetNentries);
890-
891916
/*
892917
* Trace the history forward, until we hit the timeline diverge. It may
893918
* still be possible that the source and target nodes used the same
@@ -896,21 +921,19 @@ findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex)
896921
* recovery processes. Hence check the start position of the new timeline
897922
* as well and move down by one extra timeline entry if they do not match.
898923
*/
899-
n = Min(sourceNentries, targetNentries);
924+
n = Min(a_nentries, b_nentries);
900925
for (i = 0; i < n; i++)
901926
{
902-
if (sourceHistory[i].tli != targetHistory[i].tli ||
903-
sourceHistory[i].begin != targetHistory[i].begin)
927+
if (a_history[i].tli != b_history[i].tli ||
928+
a_history[i].begin != b_history[i].begin)
904929
break;
905930
}
906931

907932
if (i > 0)
908933
{
909934
i--;
910-
*recptr = MinXLogRecPtr(sourceHistory[i].end, targetHistory[i].end);
935+
*recptr = MinXLogRecPtr(a_history[i].end, b_history[i].end);
911936
*tliIndex = i;
912-
913-
pg_free(sourceHistory);
914937
return;
915938
}
916939
else

src/bin/pg_rewind/t/007_standby_source.pl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,6 @@
8383
# A (primary) <--- B (standby) C (primary)
8484

8585
$node_c->promote;
86-
$node_c->safe_psql('postgres', "checkpoint");
8786

8887

8988
# Insert a row in A. This causes A/B and C to have "diverged", so that it's

src/bin/pg_rewind/t/008_min_recovery_point.pl

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -76,13 +76,6 @@
7676
#
7777
$node_1->stop('fast');
7878
$node_3->promote;
79-
# Force a checkpoint after the promotion. pg_rewind looks at the control
80-
# file to determine what timeline the server is on, and that isn't updated
81-
# immediately at promotion, but only at the next checkpoint. When running
82-
# pg_rewind in remote mode, it's possible that we complete the test steps
83-
# after promotion so quickly that when pg_rewind runs, the standby has not
84-
# performed a checkpoint after promotion yet.
85-
$node_3->safe_psql('postgres', "checkpoint");
8679

8780
# reconfigure node_1 as a standby following node_3
8881
my $node_3_connstr = $node_3->connstr;
@@ -108,8 +101,6 @@
108101
$node_3->wait_for_catchup('node_1');
109102

110103
$node_1->promote;
111-
# Force a checkpoint after promotion, like earlier.
112-
$node_1->safe_psql('postgres', "checkpoint");
113104

114105
#
115106
# We now have a split-brain with two primaries. Insert a row on both to

src/bin/pg_rewind/t/RewindTest.pm

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -198,14 +198,6 @@ sub promote_standby
198198
# the primary out-of-sync with the standby.
199199
$node_standby->promote;
200200

201-
# Force a checkpoint after the promotion. pg_rewind looks at the control
202-
# file to determine what timeline the server is on, and that isn't updated
203-
# immediately at promotion, but only at the next checkpoint. When running
204-
# pg_rewind in remote mode, it's possible that we complete the test steps
205-
# after promotion so quickly that when pg_rewind runs, the standby has not
206-
# performed a checkpoint after promotion yet.
207-
standby_psql("checkpoint");
208-
209201
return;
210202
}
211203

0 commit comments

Comments
 (0)