Skip to content

Commit 81e3c86

Browse files
committed
Fix pg_rewind bugs when rewinding a standby server.
If the target is a standby server, its WAL doesn't end at the last checkpoint record, but at minRecoveryPoint. We must scan all the WAL from the last common checkpoint all the way up to minRecoveryPoint for modified pages, and also consider that portion when determining whether the server needs rewinding.

Backpatch to all supported versions.

Author: Ian Barwick and me

Discussion: https://www.postgresql.org/message-id/CABvVfJU-LDWvoz4-Yow3Ay5LZYTuPD7eSjjE4kGyNZpXC6FrVQ%40mail.gmail.com
1 parent fb50029 commit 81e3c86

File tree

3 files changed

+211
-25
lines changed

3 files changed

+211
-25
lines changed

src/bin/pg_rewind/parsexlog.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@ static int SimpleXLogPageRead(XLogReaderState *xlogreader,
5757
* Read WAL from the datadir/pg_xlog, starting from 'startpoint' on timeline
5858
* 'tli', until 'endpoint'. Make note of the data blocks touched by the WAL
5959
* records, and return them in a page map.
60+
*
61+
* 'endpoint' is the end of the last record to read. The record starting at
62+
* 'endpoint' is the first one that is not read.
6063
*/
6164
void
6265
extractPageMap(const char *datadir, XLogRecPtr startpoint, TimeLineID tli,
@@ -96,7 +99,13 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, TimeLineID tli,
9699

97100
startpoint = InvalidXLogRecPtr; /* continue reading at next record */
98101

99-
} while (xlogreader->ReadRecPtr != endpoint);
102+
} while (xlogreader->EndRecPtr < endpoint);
103+
104+
/*
105+
* If 'endpoint' didn't point exactly at a record boundary, the caller
106+
* messed up.
107+
*/
108+
Assert(xlogreader->EndRecPtr == endpoint);
100109

101110
XLogReaderFree(xlogreader);
102111
if (xlogreadfd != -1)

src/bin/pg_rewind/pg_rewind.c

Lines changed: 36 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ main(int argc, char **argv)
9393
XLogRecPtr chkptrec;
9494
TimeLineID chkpttli;
9595
XLogRecPtr chkptredo;
96+
XLogRecPtr target_wal_endrec;
9697
size_t size;
9798
char *buffer;
9899
bool rewind_needed;
@@ -223,41 +224,54 @@ main(int argc, char **argv)
223224
{
224225
printf(_("source and target cluster are on the same timeline\n"));
225226
rewind_needed = false;
227+
target_wal_endrec = 0;
226228
}
227229
else
228230
{
231+
XLogRecPtr chkptendrec;
232+
229233
findCommonAncestorTimeline(&divergerec, &lastcommontli);
230234
printf(_("servers diverged at WAL position %X/%X on timeline %u\n"),
231235
(uint32) (divergerec >> 32), (uint32) divergerec, lastcommontli);
232236

233237
/*
234-
* Check for the possibility that the target is in fact a direct ancestor
235-
* of the source. In that case, there is no divergent history in the
236-
* target that needs rewinding.
238+
* Determine the end-of-WAL on the target.
239+
*
240+
* The WAL ends at the last shutdown checkpoint, or at
241+
* minRecoveryPoint if it was a standby. (If we supported rewinding a
242+
* server that was not shut down cleanly, we would need to replay
243+
* until we reach the first invalid record, like crash recovery does.)
237244
*/
238-
if (ControlFile_target.checkPoint >= divergerec)
245+
246+
/* read the checkpoint record on the target to see where it ends. */
247+
chkptendrec = readOneRecord(datadir_target,
248+
ControlFile_target.checkPoint,
249+
ControlFile_target.checkPointCopy.ThisTimeLineID);
250+
251+
if (ControlFile_target.minRecoveryPoint > chkptendrec)
239252
{
240-
rewind_needed = true;
253+
target_wal_endrec = ControlFile_target.minRecoveryPoint;
241254
}
242255
else
243256
{
244-
XLogRecPtr chkptendrec;
257+
target_wal_endrec = chkptendrec;
258+
}
245259

246-
/* Read the checkpoint record on the target to see where it ends. */
247-
chkptendrec = readOneRecord(datadir_target,
248-
ControlFile_target.checkPoint,
249-
ControlFile_target.checkPointCopy.ThisTimeLineID);
260+
/*
261+
* Check for the possibility that the target is in fact a direct
262+
* ancestor of the source. In that case, there is no divergent history
263+
* in the target that needs rewinding.
264+
*/
265+
if (target_wal_endrec > divergerec)
266+
{
267+
rewind_needed = true;
268+
}
269+
else
270+
{
271+
/* the last common checkpoint record must be part of target WAL */
272+
Assert(target_wal_endrec == divergerec);
250273

251-
/*
252-
* If the histories diverged exactly at the end of the shutdown
253-
* checkpoint record on the target, there are no WAL records in the
254-
* target that don't belong in the source's history, and no rewind is
255-
* needed.
256-
*/
257-
if (chkptendrec == divergerec)
258-
rewind_needed = false;
259-
else
260-
rewind_needed = true;
274+
rewind_needed = false;
261275
}
262276
}
263277

@@ -285,13 +299,11 @@ main(int argc, char **argv)
285299
/*
286300
* Read the target WAL from last checkpoint before the point of fork, to
287301
* extract all the pages that were modified on the target cluster after
288-
* the fork. We can stop reading after reaching the final shutdown record.
289-
* XXX: If we supported rewinding a server that was not shut down cleanly,
290-
* we would need to replay until the end of WAL here.
302+
* the fork.
291303
*/
292304
pg_log(PG_PROGRESS, "reading WAL in target\n");
293305
extractPageMap(datadir_target, chkptrec, lastcommontli,
294-
ControlFile_target.checkPoint);
306+
target_wal_endrec);
295307
filemap_finalize();
296308

297309
if (showprogress)
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
#
# Test situation where a target data directory contains
# WAL records beyond both the last checkpoint and the divergence
# point:
#
# Target WAL (TLI 2):
#
# backup ... Checkpoint A ... INSERT 'rewind this'
#            (TLI 1 -> 2)
#
#            ^ last common    ^ minRecoveryPoint
#              checkpoint
#
# Source WAL (TLI 3):
#
# backup ... Checkpoint A ... Checkpoint B ... INSERT 'keep this'
#            (TLI 1 -> 2)     (TLI 2 -> 3)
#
#
# The last common checkpoint is Checkpoint A. But there is WAL on TLI 2
# after the last common checkpoint that needs to be rewound. We used to
# have a bug where minRecoveryPoint was ignored, and pg_rewind concluded
# that the target doesn't need rewinding in this scenario, because the
# last checkpoint on the target TLI was an ancestor of the source TLI.
#
#
# This test does not make use of RewindTest as it requires three
# nodes.

use strict;
use warnings;
use PostgresNode;
use TestLib;
use Test::More tests => 3;

use File::Copy;

my $tmp_folder = TestLib::tempdir;

my $node_1 = get_new_node('node_1');
$node_1->init(allows_streaming => 1);
$node_1->append_conf('postgresql.conf', qq(
wal_keep_segments='5'
));

$node_1->start;

# Create a couple of test tables
$node_1->safe_psql('postgres', 'CREATE TABLE public.foo (t TEXT)');
$node_1->safe_psql('postgres', 'CREATE TABLE public.bar (t TEXT)');
$node_1->safe_psql('postgres', "INSERT INTO public.bar VALUES ('in both')");

# Take backup
my $backup_name = 'my_backup';
$node_1->backup($backup_name);

# Create streaming standby from backup
my $node_2 = get_new_node('node_2');
$node_2->init_from_backup($node_1, $backup_name,
	has_streaming => 1);
$node_2->start;

# Create streaming standby from backup
my $node_3 = get_new_node('node_3');
$node_3->init_from_backup($node_1, $backup_name,
	has_streaming => 1);
$node_3->start;

# Stop node_1

$node_1->stop('fast');

# Promote node_3
$node_3->promote;

# node_1 rejoins node_3

my $node_3_connstr = $node_3->connstr;

# Fix: remove node_1's old recovery.conf before reconfiguring it.
# (Previously this unlinked node_2's file by mistake — a copy-paste
# slip from the node_2 section below — leaving node_1's stale
# recovery.conf in place when node_1 was restarted.)
unlink($node_1->data_dir . '/recovery.conf');
$node_1->append_conf('recovery.conf', qq(
standby_mode=on
primary_conninfo='$node_3_connstr'
recovery_target_timeline='latest'
));
$node_1->start();

# node_2 follows node_3

unlink($node_2->data_dir . '/recovery.conf');
$node_2->append_conf('recovery.conf', qq(
standby_mode=on
primary_conninfo='$node_3_connstr'
recovery_target_timeline='latest'
));
$node_2->restart();

# Promote node_1

$node_1->promote;

# Wait until nodes 1 and 3 have been fully promoted.
$node_1->poll_query_until('postgres', "SELECT pg_is_in_recovery() <> true");
$node_3->poll_query_until('postgres', "SELECT pg_is_in_recovery() <> true");

# We now have a split-brain with two primaries. Insert a row on both to
# demonstratively create a split brain. After the rewind, we should only
# see the insert on 1, as the insert on node 3 is rewound away.
$node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('keep this')");

# Insert more rows in node 1, to bump up the XID counter. Otherwise, if
# rewind doesn't correctly rewind the changes made on the other node,
# we might fail to notice if the inserts are invisible because the XIDs
# are not marked as committed.
$node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('and this')");
$node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('and this too')");

# Also insert a row in 'bar' on node 3. It is unmodified in node 1, so it won't get
# overwritten by replaying the WAL from node 1.
$node_3->safe_psql('postgres', "INSERT INTO public.bar (t) VALUES ('rewind this')");

# Wait for node 2 to catch up
$node_2->poll_query_until('postgres',
	q|SELECT COUNT(*) > 1 FROM public.bar|, 't');

# At this point node_2 will shut down without a shutdown checkpoint,
# but with WAL entries beyond the preceding shutdown checkpoint.
$node_2->stop('fast');
$node_3->stop('fast');

my $node_2_pgdata = $node_2->data_dir;
my $node_1_connstr = $node_1->connstr;

# Keep a temporary postgresql.conf or it would be overwritten during the rewind.
copy(
	"$node_2_pgdata/postgresql.conf",
	"$tmp_folder/node_2-postgresql.conf.tmp");

command_ok(
	[
		'pg_rewind',
		"--source-server=$node_1_connstr",
		"--target-pgdata=$node_2_pgdata"
	],
	'pg_rewind detects rewind needed');

# Now move back postgresql.conf with old settings
move(
	"$tmp_folder/node_2-postgresql.conf.tmp",
	"$node_2_pgdata/postgresql.conf");

$node_2->start;

# Check contents of the test tables after rewind. The rows inserted in node 3
# before rewind should've been overwritten with the data from node 1.
my $result;
# Force a checkpoint; the return value of 'checkpoint' is not meaningful,
# so don't capture it.
$node_2->safe_psql('postgres', 'checkpoint');
$result = $node_2->safe_psql('postgres', 'SELECT * FROM public.foo');
is($result, qq(keep this
and this
and this too), 'table foo after rewind');

$result = $node_2->safe_psql('postgres', 'SELECT * FROM public.bar');
is($result, qq(in both), 'table bar after rewind');

0 commit comments

Comments
 (0)