
Commit e297ddc

Test restartpoints in archive recovery.
v14 commit 1f95181 and its v13 equivalent caused timing-dependent failures in archive recovery, at restartpoints. The symptom was "invalid magic number 0000 in log segment X, offset 0", "unexpected pageaddr X in log segment Y, offset 0" [X < Y], or an assertion failure. Commit 3635a0a and predecessors back-patched v15 changes to fix that. This test reproduces the problem probabilistically, typically in less than 1000 iterations of the test. Hence, buildfarm and CI runs would have surfaced enough failures to get attention within a day.

Reported-by: Arun Thirupathi <arunth@google.com>
Discussion: https://postgr.es/m/20250306193013.36.nmisch@google.com
Backpatch-through: 13
1 parent d0a0499 commit e297ddc
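
The failure described above is timing-dependent, so confirming the test's value on an unfixed branch means rerunning it until it trips. A minimal sketch of doing that from the source-tree root of an autoconf/make build configured with --enable-tap-tests (the loop, the iteration bound, and the use of PROVE_TESTS here are illustrative, not part of the commit):

# Rerun only the new recovery test until it fails; the 1000-iteration bound
# mirrors the commit message's estimate of how often the bug reproduces.
for i in $(seq 1 1000); do
	make -s -C src/test/recovery check PROVE_TESTS=t/045_archive_restartpoint.pl || break
done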

File tree

2 files changed, +58 -0 lines


src/test/recovery/meson.build

Lines changed: 1 addition & 0 deletions
@@ -52,6 +52,7 @@ tests += {
       't/041_checkpoint_at_promote.pl',
       't/042_low_level_backup.pl',
       't/043_no_contrecord_switch.pl',
+      't/045_archive_restartpoint.pl',
     ],
   },
 }
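
With the suite list updated as above, a meson build should pick the new script up automatically. A hedged example of running just this test from a meson build directory (the build directory name and the exact meson test identifier are assumptions and can differ by version and setup):

# Run only the new recovery TAP test by name; use `meson test -C build --list`
# first if the identifier differs in your tree.
meson test -C build recovery/045_archive_restartpoint
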
src/test/recovery/t/045_archive_restartpoint.pl

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+
+# Copyright (c) 2024-2025, PostgreSQL Global Development Group
+
+# Test restartpoints during archive recovery.
+use strict;
+use warnings;
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+my $archive_max_mb = 320;
+my $wal_segsize = 1;
+
+# Initialize primary node
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+$node_primary->init(
+	has_archiving => 1,
+	allows_streaming => 1,
+	extra => [ '--wal-segsize' => $wal_segsize ]);
+$node_primary->start;
+my $backup_name = 'my_backup';
+$node_primary->backup($backup_name);
+
+$node_primary->safe_psql('postgres',
+	('DO $$BEGIN FOR i IN 1..' . $archive_max_mb / $wal_segsize)
+	. ' LOOP CHECKPOINT; PERFORM pg_switch_wal(); END LOOP; END$$;');
+
+# Force archiving of WAL file containing recovery target
+my $until_lsn = $node_primary->lsn('write');
+$node_primary->safe_psql('postgres', "SELECT pg_switch_wal()");
+$node_primary->stop;
+
+# Archive recovery
+my $node_restore = PostgreSQL::Test::Cluster->new('restore');
+$node_restore->init_from_backup($node_primary, $backup_name,
+	has_restoring => 1);
+$node_restore->append_conf('postgresql.conf',
+	"recovery_target_lsn = '$until_lsn'");
+$node_restore->append_conf('postgresql.conf',
+	'recovery_target_action = pause');
+$node_restore->append_conf('postgresql.conf',
+	'max_wal_size = ' . 2 * $wal_segsize);
+$node_restore->append_conf('postgresql.conf', 'log_checkpoints = on');
+
+$node_restore->start;
+
+# Wait until restore has replayed enough data
+my $caughtup_query =
+	"SELECT '$until_lsn'::pg_lsn <= pg_last_wal_replay_lsn()";
+$node_restore->poll_query_until('postgres', $caughtup_query)
+	or die "Timed out while waiting for restore to catch up";
+
+$node_restore->stop;
+ok(1, 'restore caught up');
+
+done_testing();
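
Because the restore node runs with log_checkpoints = on, its server log is where the restartpoints this test exercises should appear. A rough way to confirm they actually occurred after a run, assuming the default PostgreSQL::Test::Cluster log directory and node-log naming (both paths are assumptions):

# Look for restartpoint activity in the restore node's log.
grep 'restartpoint' \
	src/test/recovery/tmp_check/log/045_archive_restartpoint_restore.log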
