Skip to content

Commit ba25358

Browse files
alvherrehughcapetCyberDem0n
committed
Avoid deleting critical WAL segments during pg_rewind
Previously, in unlucky cases, it was possible for pg_rewind to remove certain WAL segments from the rewound demoted primary. In particular this happens if those files have been marked for archival (i.e., their .ready files were created) but not yet archived; the newly promoted node no longer has such files because of them having been recycled, but they are likely critical for recovery in the demoted node. If pg_rewind removes them, recovery is not possible anymore. Fix this by maintaining a hash table of files in this situation in the scan that looks for a checkpoint, which the decide_file_actions phase can consult so that it knows to preserve them. Backpatch to 14. The problem also exists in 13, but that branch was not blessed with commit eb00f1d, so this patch is difficult to apply there. Users of older releases will just have to continue to be extra careful when rewinding. Co-authored-by: Полина Бунгина (Polina Bungina) <bungina@gmail.com> Co-authored-by: Alexander Kukushkin <cyberdemn@gmail.com> Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com> Reviewed-by: Atsushi Torikoshi <torikoshia@oss.nttdata.com> Discussion: https://postgr.es/m/CAAtGL4AhzmBRsEsaDdz7065T+k+BscNadfTqP1NcPmsqwA5HBw@mail.gmail.com
1 parent 2a30b68 commit ba25358

File tree

6 files changed

+168
-7
lines changed

6 files changed

+168
-7
lines changed

src/bin/pg_rewind/filemap.c

+77-7
Original file line numberDiff line numberDiff line change
@@ -39,14 +39,14 @@
3939
* appearing in source and target systems.
4040
*/
4141
static uint32 hash_string_pointer(const char *s);
42-
#define SH_PREFIX filehash
43-
#define SH_ELEMENT_TYPE file_entry_t
44-
#define SH_KEY_TYPE const char *
45-
#define SH_KEY path
42+
#define SH_PREFIX filehash
43+
#define SH_ELEMENT_TYPE file_entry_t
44+
#define SH_KEY_TYPE const char *
45+
#define SH_KEY path
4646
#define SH_HASH_KEY(tb, key) hash_string_pointer(key)
4747
#define SH_EQUAL(tb, a, b) (strcmp(a, b) == 0)
48-
#define SH_SCOPE static inline
49-
#define SH_RAW_ALLOCATOR pg_malloc0
48+
#define SH_SCOPE static inline
49+
#define SH_RAW_ALLOCATOR pg_malloc0
5050
#define SH_DECLARE
5151
#define SH_DEFINE
5252
#include "lib/simplehash.h"
@@ -61,7 +61,36 @@ static char *datasegpath(RelFileNode rnode, ForkNumber forknum,
6161

6262
static file_entry_t *insert_filehash_entry(const char *path);
6363
static file_entry_t *lookup_filehash_entry(const char *path);
64+
65+
/*
66+
* A separate hash table which tracks WAL files that must not be deleted.
67+
*/
68+
typedef struct keepwal_entry
69+
{
70+
const char *path;
71+
uint32 status;
72+
} keepwal_entry;
73+
74+
#define SH_PREFIX keepwal
75+
#define SH_ELEMENT_TYPE keepwal_entry
76+
#define SH_KEY_TYPE const char *
77+
#define SH_KEY path
78+
#define SH_HASH_KEY(tb, key) hash_string_pointer(key)
79+
#define SH_EQUAL(tb, a, b) (strcmp(a, b) == 0)
80+
#define SH_SCOPE static inline
81+
#define SH_RAW_ALLOCATOR pg_malloc0
82+
#define SH_DECLARE
83+
#define SH_DEFINE
84+
#include "lib/simplehash.h"
85+
86+
#define KEEPWAL_INITIAL_SIZE 1000
87+
88+
89+
static keepwal_hash *keepwal = NULL;
90+
static bool keepwal_entry_exists(const char *path);
91+
6492
static int final_filemap_cmp(const void *a, const void *b);
93+
6594
static bool check_file_excluded(const char *path, bool is_source);
6695

6796
/*
@@ -208,6 +237,39 @@ lookup_filehash_entry(const char *path)
208237
return filehash_lookup(filehash, path);
209238
}
210239

240+
/*
241+
* Initialize a hash table to store WAL file names that must be kept.
242+
*/
243+
void
244+
keepwal_init(void)
245+
{
246+
/* An initial hash size out of thin air */
247+
keepwal = keepwal_create(KEEPWAL_INITIAL_SIZE, NULL);
248+
}
249+
250+
/* Mark the given file to prevent its removal */
251+
void
252+
keepwal_add_entry(const char *path)
253+
{
254+
keepwal_entry *entry;
255+
bool found;
256+
257+
/* Should only be called with keepwal initialized */
258+
Assert(keepwal != NULL);
259+
260+
entry = keepwal_insert(keepwal, path, &found);
261+
262+
if (!found)
263+
entry->path = pg_strdup(path);
264+
}
265+
266+
/* Return true if file is marked as not to be removed, false otherwise */
267+
static bool
268+
keepwal_entry_exists(const char *path)
269+
{
270+
return keepwal_lookup(keepwal, path) != NULL;
271+
}
272+
211273
/*
212274
* Callback for processing source file list.
213275
*
@@ -683,7 +745,15 @@ decide_file_action(file_entry_t *entry)
683745
}
684746
else if (entry->target_exists && !entry->source_exists)
685747
{
686-
/* File exists in target, but not source. Remove it. */
748+
/*
749+
* For files that exist in target but not in source, we check the
750+
* keepwal hash table; any files listed therein must not be removed.
751+
*/
752+
if (keepwal_entry_exists(path))
753+
{
754+
pg_log_debug("Not removing file \"%s\" because it is required for recovery", path);
755+
return FILE_ACTION_NONE;
756+
}
687757
return FILE_ACTION_REMOVE;
688758
}
689759
else if (!entry->target_exists && !entry->source_exists)

src/bin/pg_rewind/filemap.h

+3
Original file line numberDiff line numberDiff line change
@@ -110,4 +110,7 @@ extern filemap_t *decide_file_actions(void);
110110
extern void calculate_totals(filemap_t *filemap);
111111
extern void print_filemap(filemap_t *filemap);
112112

113+
extern void keepwal_init(void);
114+
extern void keepwal_add_entry(const char *path);
115+
113116
#endif /* FILEMAP_H */

src/bin/pg_rewind/parsexlog.c

+21
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,8 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex,
173173
XLogReaderState *xlogreader;
174174
char *errormsg;
175175
XLogPageReadPrivate private;
176+
XLogSegNo current_segno = 0;
177+
TimeLineID current_tli = 0;
176178

177179
/*
178180
* The given fork pointer points to the end of the last common record,
@@ -215,6 +217,25 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex,
215217
LSN_FORMAT_ARGS(searchptr));
216218
}
217219

220+
/* Detect if a new WAL file has been opened */
221+
if (xlogreader->seg.ws_tli != current_tli ||
222+
xlogreader->seg.ws_segno != current_segno)
223+
{
224+
char xlogfname[MAXFNAMELEN];
225+
226+
snprintf(xlogfname, MAXFNAMELEN, XLOGDIR "/");
227+
228+
/* update curent values */
229+
current_tli = xlogreader->seg.ws_tli;
230+
current_segno = xlogreader->seg.ws_segno;
231+
232+
XLogFileName(xlogfname + sizeof(XLOGDIR),
233+
current_tli, current_segno, WalSegSz);
234+
235+
/* Track this filename as one to not remove */
236+
keepwal_add_entry(xlogfname);
237+
}
238+
218239
/*
219240
* Check if it is a checkpoint record. This checkpoint record needs to
220241
* be the latest checkpoint before WAL forked and not the checkpoint

src/bin/pg_rewind/pg_rewind.c

+3
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,9 @@ main(int argc, char **argv)
398398
exit(0);
399399
}
400400

401+
/* Initialize hashtable that tracks WAL files protected from removal */
402+
keepwal_init();
403+
401404
findLastCheckpoint(datadir_target, divergerec, lastcommontliIndex,
402405
&chkptrec, &chkpttli, &chkptredo, restore_command);
403406
pg_log_info("rewinding from last common checkpoint at %X/%X on timeline %u",
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# Copyright (c) 2021-2024, PostgreSQL Global Development Group
2+
#
3+
# Test situation where a target data directory contains
4+
# WAL files that were already recycled by the new primary.
5+
#
6+
7+
use strict;
8+
use warnings FATAL => 'all';
9+
use PostgreSQL::Test::Utils;
10+
use Test::More;
11+
12+
use FindBin;
13+
use lib $FindBin::RealBin;
14+
use RewindTest;
15+
16+
RewindTest::setup_cluster();
17+
$node_primary->enable_archiving();
18+
RewindTest::start_primary();
19+
20+
RewindTest::create_standby();
21+
$node_standby->enable_restoring($node_primary, 0);
22+
$node_standby->reload();
23+
24+
RewindTest::primary_psql("CHECKPOINT"); # last common checkpoint
25+
26+
# We use "perl -e 'exit(1)'" as an alternative to "false", because the latter
27+
# might not be available on Windows.
28+
my $false = "$^X -e 'exit(1)'";
29+
$node_primary->append_conf(
30+
'postgresql.conf', qq(
31+
archive_command = '$false'
32+
));
33+
$node_primary->reload();
34+
35+
# advance WAL on primary; this WAL segment will never make it to the archive
36+
RewindTest::primary_psql("CREATE TABLE t(a int)");
37+
RewindTest::primary_psql("INSERT INTO t VALUES(0)");
38+
RewindTest::primary_psql("SELECT pg_switch_wal()");
39+
40+
RewindTest::promote_standby;
41+
42+
# new primary loses diverging WAL segment
43+
RewindTest::standby_psql("INSERT INTO t values(0)");
44+
RewindTest::standby_psql("SELECT pg_switch_wal()");
45+
46+
$node_standby->stop();
47+
$node_primary->stop();
48+
49+
my ($stdout, $stderr) = run_command(
50+
[
51+
'pg_rewind', '--debug',
52+
'--source-pgdata', $node_standby->data_dir,
53+
'--target-pgdata', $node_primary->data_dir,
54+
'--no-sync',
55+
]);
56+
57+
like(
58+
$stderr,
59+
qr/Not removing file .* because it is required for recovery/,
60+
"some WAL files were skipped");
61+
62+
done_testing();

src/tools/pgindent/typedefs.list

+2
Original file line numberDiff line numberDiff line change
@@ -3232,6 +3232,8 @@ json_manifest_perwalrange_callback
32323232
json_ofield_action
32333233
json_scalar_action
32343234
json_struct_action
3235+
keepwal_entry
3236+
keepwal_hash
32353237
keyEntryData
32363238
key_t
32373239
lclContext

0 commit comments

Comments
 (0)