Skip to content

Commit eb124c3

Browse files
committed
Add TAP tests to check replication slot advance during the checkpoint
The new tests verify that logical and physical replication slots are still valid after an immediate restart on checkpoint completion when the slot was advanced during the checkpoint. This commit introduces two new injection points to make these tests possible: * checkpoint-before-old-wal-removal - triggered in the checkpointer process just before old WAL segments cleanup; * logical-replication-slot-advance-segment - triggered in LogicalConfirmReceivedLocation() when restart_lsn was changed enough to point to the next WAL segment. Discussion: https://postgr.es/m/flat/1d12d2-67235980-35-19a406a0%4063439497 Author: Vitaly Davydov <v.davydov@postgrespro.ru> Author: Tomas Vondra <tomas@vondra.me> Reviewed-by: Alexander Korotkov <aekorotkov@gmail.com> Reviewed-by: Amit Kapila <amit.kapila16@gmail.com> Backpatch-through: 17
1 parent ca307d5 commit eb124c3

File tree

5 files changed

+296
-0
lines changed

5 files changed

+296
-0
lines changed

src/backend/access/transam/xlog.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7498,6 +7498,10 @@ CreateCheckPoint(int flags)
74987498
if (PriorRedoPtr != InvalidXLogRecPtr)
74997499
UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
75007500

7501+
#ifdef USE_INJECTION_POINTS
7502+
INJECTION_POINT("checkpoint-before-old-wal-removal", NULL);
7503+
#endif
7504+
75017505
/*
75027506
* Delete old log files, those no longer needed for last checkpoint to
75037507
* prevent the disk holding the xlog from growing full.

src/backend/replication/logical/logical.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "postgres.h"
3030

3131
#include "access/xact.h"
32+
#include "access/xlog_internal.h"
3233
#include "access/xlogutils.h"
3334
#include "fmgr.h"
3435
#include "miscadmin.h"
@@ -41,6 +42,7 @@
4142
#include "storage/proc.h"
4243
#include "storage/procarray.h"
4344
#include "utils/builtins.h"
45+
#include "utils/injection_point.h"
4446
#include "utils/inval.h"
4547
#include "utils/memutils.h"
4648

@@ -1825,9 +1827,13 @@ LogicalConfirmReceivedLocation(XLogRecPtr lsn)
18251827
{
18261828
bool updated_xmin = false;
18271829
bool updated_restart = false;
1830+
XLogRecPtr restart_lsn pg_attribute_unused();
18281831

18291832
SpinLockAcquire(&MyReplicationSlot->mutex);
18301833

1834+
/* remember the old restart lsn */
1835+
restart_lsn = MyReplicationSlot->data.restart_lsn;
1836+
18311837
/*
18321838
* Prevent moving the confirmed_flush backwards, as this could lead to
18331839
* data duplication issues caused by replicating already replicated
@@ -1881,6 +1887,18 @@ LogicalConfirmReceivedLocation(XLogRecPtr lsn)
18811887
/* first write new xmin to disk, so we know what's up after a crash */
18821888
if (updated_xmin || updated_restart)
18831889
{
1890+
#ifdef USE_INJECTION_POINTS
1891+
XLogSegNo seg1,
1892+
seg2;
1893+
1894+
XLByteToSeg(restart_lsn, seg1, wal_segment_size);
1895+
XLByteToSeg(MyReplicationSlot->data.restart_lsn, seg2, wal_segment_size);
1896+
1897+
/* trigger injection point, but only if segment changes */
1898+
if (seg1 != seg2)
1899+
INJECTION_POINT("logical-replication-slot-advance-segment", NULL);
1900+
#endif
1901+
18841902
ReplicationSlotMarkDirty();
18851903
ReplicationSlotSave();
18861904
elog(DEBUG1, "updated xmin: %u restart: %u", updated_xmin, updated_restart);

src/test/recovery/meson.build

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ tests += {
5454
't/043_no_contrecord_switch.pl',
5555
't/044_invalidate_inactive_slots.pl',
5656
't/045_archive_restartpoint.pl',
57+
't/046_checkpoint_logical_slot.pl',
58+
't/047_checkpoint_physical_slot.pl'
5759
],
5860
},
5961
}
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
# Copyright (c) 2025, PostgreSQL Global Development Group
2+
#
3+
# This test verifies the case when the logical slot is advanced during
4+
# checkpoint. The test checks that the logical slot's restart_lsn still refers
5+
# to an existing WAL segment after an immediate restart.
6+
#
7+
use strict;
8+
use warnings FATAL => 'all';
9+
10+
use PostgreSQL::Test::Cluster;
11+
use PostgreSQL::Test::Utils;
12+
13+
use Test::More;
14+
15+
if ($ENV{enable_injection_points} ne 'yes')
16+
{
17+
plan skip_all => 'Injection points not supported by this build';
18+
}
19+
20+
my ($node, $result);
21+
22+
$node = PostgreSQL::Test::Cluster->new('mike');
23+
$node->init;
24+
$node->append_conf('postgresql.conf',
25+
"shared_preload_libraries = 'injection_points'");
26+
$node->append_conf('postgresql.conf', "wal_level = 'logical'");
27+
$node->start;
28+
$node->safe_psql('postgres', q(CREATE EXTENSION injection_points));
29+
30+
# Create a simple table to generate data into.
31+
$node->safe_psql('postgres',
32+
q{create table t (id serial primary key, b text)});
33+
34+
# Create the two slots we'll need.
35+
$node->safe_psql('postgres',
36+
q{select pg_create_logical_replication_slot('slot_logical', 'test_decoding')}
37+
);
38+
$node->safe_psql('postgres',
39+
q{select pg_create_physical_replication_slot('slot_physical', true)});
40+
41+
# Advance both slots to the current position just to have everything "valid".
42+
$node->safe_psql('postgres',
43+
q{select count(*) from pg_logical_slot_get_changes('slot_logical', null, null)}
44+
);
45+
$node->safe_psql('postgres',
46+
q{select pg_replication_slot_advance('slot_physical', pg_current_wal_lsn())}
47+
);
48+
49+
# Run checkpoint to flush current state to disk and set a baseline.
50+
$node->safe_psql('postgres', q{checkpoint});
51+
52+
# Generate some transactions to get RUNNING_XACTS.
53+
my $xacts = $node->background_psql('postgres');
54+
$xacts->query_until(
55+
qr/run_xacts/,
56+
q(\echo run_xacts
57+
SELECT 1 \watch 0.1
58+
\q
59+
));
60+
61+
# Insert 1M rows to generate enough WAL to span multiple segments.
62+
$node->safe_psql('postgres',
63+
q{insert into t (b) select md5(i::text) from generate_series(1,1000000) s(i)}
64+
);
65+
66+
# Run another checkpoint to set a new restore LSN.
67+
$node->safe_psql('postgres', q{checkpoint});
68+
69+
# Another 1M rows, again to generate enough WAL to span multiple segments.
70+
$node->safe_psql('postgres',
71+
q{insert into t (b) select md5(i::text) from generate_series(1,1000000) s(i)}
72+
);
73+
74+
# Run another checkpoint, this time in the background, and make it wait
75+
# on the injection point, so that the checkpoint stops right before
76+
# removing old WAL segments.
77+
note('starting checkpoint\n');
78+
79+
my $checkpoint = $node->background_psql('postgres');
80+
$checkpoint->query_safe(
81+
q(select injection_points_attach('checkpoint-before-old-wal-removal','wait'))
82+
);
83+
$checkpoint->query_until(
84+
qr/starting_checkpoint/,
85+
q(\echo starting_checkpoint
86+
checkpoint;
87+
\q
88+
));
89+
90+
# Wait until the checkpoint stops right before removing WAL segments.
91+
note('waiting for injection_point\n');
92+
$node->wait_for_event('checkpointer', 'checkpoint-before-old-wal-removal');
93+
note('injection_point is reached');
94+
95+
# Try to advance the logical slot, but make it stop when it moves to the next
96+
# WAL segment (this has to happen in the background, too).
97+
my $logical = $node->background_psql('postgres');
98+
$logical->query_safe(
99+
q{select injection_points_attach('logical-replication-slot-advance-segment','wait');}
100+
);
101+
$logical->query_until(
102+
qr/get_changes/,
103+
q(
104+
\echo get_changes
105+
select count(*) from pg_logical_slot_get_changes('slot_logical', null, null) \watch 1
106+
\q
107+
));
108+
109+
# Wait until the slot's restart_lsn points to the next WAL segment.
110+
note('waiting for injection_point\n');
111+
$node->wait_for_event('client backend',
112+
'logical-replication-slot-advance-segment');
113+
note('injection_point is reached');
114+
115+
# OK, we're in the right situation: time to advance the physical slot, which
116+
# recalculates the required LSN, and then unblock the checkpoint, which
117+
# removes the WAL still needed by the logical slot.
118+
$node->safe_psql('postgres',
119+
q{select pg_replication_slot_advance('slot_physical', pg_current_wal_lsn())}
120+
);
121+
122+
# Continue the checkpoint.
123+
$node->safe_psql('postgres',
124+
q{select injection_points_wakeup('checkpoint-before-old-wal-removal')});
125+
126+
# Abruptly stop the server. Nothing here waits for the resumed checkpoint
127+
# to finish; explicitly waiting for its completion would be better.
128+
$node->stop('immediate');
129+
130+
$node->start;
131+
132+
eval {
133+
$node->safe_psql('postgres',
134+
q{select count(*) from pg_logical_slot_get_changes('slot_logical', null, null);}
135+
);
136+
};
137+
is($@, '', "Logical slot still valid");
138+
139+
done_testing();
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
# Copyright (c) 2025, PostgreSQL Global Development Group
2+
#
3+
# This test verifies the case when the physical slot is advanced during
4+
# checkpoint. The test checks that the physical slot's restart_lsn still refers
5+
# to an existing WAL segment after an immediate restart.
6+
#
7+
use strict;
8+
use warnings FATAL => 'all';
9+
10+
use PostgreSQL::Test::Cluster;
11+
use PostgreSQL::Test::Utils;
12+
13+
use Test::More;
14+
15+
if ($ENV{enable_injection_points} ne 'yes')
16+
{
17+
plan skip_all => 'Injection points not supported by this build';
18+
}
19+
20+
my ($node, $result);
21+
22+
$node = PostgreSQL::Test::Cluster->new('mike');
23+
$node->init;
24+
$node->append_conf('postgresql.conf',
25+
"shared_preload_libraries = 'injection_points'");
26+
$node->append_conf('postgresql.conf', "wal_level = 'replica'");
27+
$node->start;
28+
$node->safe_psql('postgres', q(CREATE EXTENSION injection_points));
29+
30+
# Create a simple table to generate data into.
31+
$node->safe_psql('postgres',
32+
q{create table t (id serial primary key, b text)});
33+
34+
# Create a physical replication slot.
35+
$node->safe_psql('postgres',
36+
q{select pg_create_physical_replication_slot('slot_physical', true)});
37+
38+
# Advance slot to the current position, just to have everything "valid".
39+
$node->safe_psql('postgres',
40+
q{select pg_replication_slot_advance('slot_physical', pg_current_wal_lsn())}
41+
);
42+
43+
# Run checkpoint to flush current state to disk and set a baseline.
44+
$node->safe_psql('postgres', q{checkpoint});
45+
46+
# Insert 100K rows to generate some WAL.
47+
$node->safe_psql('postgres',
48+
q{insert into t (b) select md5(i::text) from generate_series(1,100000) s(i)}
49+
);
50+
51+
# Advance slot to the current position, just to have everything "valid".
52+
$node->safe_psql('postgres',
53+
q{select pg_replication_slot_advance('slot_physical', pg_current_wal_lsn())}
54+
);
55+
56+
# Run another checkpoint to set a new restore LSN.
57+
$node->safe_psql('postgres', q{checkpoint});
58+
59+
# Another 1M rows, to generate enough WAL to span multiple segments.
60+
$node->safe_psql('postgres',
61+
q{insert into t (b) select md5(i::text) from generate_series(1,1000000) s(i)}
62+
);
63+
64+
my $restart_lsn_init = $node->safe_psql('postgres',
65+
q{select restart_lsn from pg_replication_slots where slot_name = 'slot_physical'}
66+
);
67+
chomp($restart_lsn_init);
68+
note("restart lsn before checkpoint: $restart_lsn_init");
69+
70+
# Run another checkpoint, this time in the background, and make it wait
71+
# on the injection point, so that the checkpoint stops right before
72+
# removing old WAL segments.
73+
note('starting checkpoint');
74+
75+
my $checkpoint = $node->background_psql('postgres');
76+
$checkpoint->query_safe(
77+
q{select injection_points_attach('checkpoint-before-old-wal-removal','wait')}
78+
);
79+
$checkpoint->query_until(
80+
qr/starting_checkpoint/,
81+
q(\echo starting_checkpoint
82+
checkpoint;
83+
\q
84+
));
85+
86+
# Wait until the checkpoint stops right before removing WAL segments.
87+
note('waiting for injection_point');
88+
$node->wait_for_event('checkpointer', 'checkpoint-before-old-wal-removal');
89+
note('injection_point is reached');
90+
91+
# OK, we're in the right situation: time to advance the physical slot, which
92+
# recalculates the required LSN, and then unblock the checkpoint, which
93+
# removes the WAL still needed by the physical slot.
94+
$node->safe_psql('postgres',
95+
q{select pg_replication_slot_advance('slot_physical', pg_current_wal_lsn())}
96+
);
97+
98+
# Continue the checkpoint.
99+
$node->safe_psql('postgres',
100+
q{select injection_points_wakeup('checkpoint-before-old-wal-removal')});
101+
102+
my $restart_lsn_old = $node->safe_psql('postgres',
103+
q{select restart_lsn from pg_replication_slots where slot_name = 'slot_physical'}
104+
);
105+
chomp($restart_lsn_old);
106+
note("restart lsn before stop: $restart_lsn_old");
107+
108+
# Abruptly stop the server. Nothing here waits for the resumed checkpoint
109+
# to finish; explicitly waiting for its completion would be better.
110+
$node->stop('immediate');
111+
112+
$node->start;
113+
114+
# Get the restart_lsn of the slot right after restarting.
115+
my $restart_lsn = $node->safe_psql('postgres',
116+
q{select restart_lsn from pg_replication_slots where slot_name = 'slot_physical'}
117+
);
118+
chomp($restart_lsn);
119+
note("restart lsn: $restart_lsn");
120+
121+
# Get the WAL segment name for the slot's restart_lsn.
122+
my $restart_lsn_segment = $node->safe_psql('postgres',
123+
"SELECT pg_walfile_name('$restart_lsn'::pg_lsn)");
124+
chomp($restart_lsn_segment);
125+
126+
# Check if the required wal segment exists.
127+
note("required by slot segment name: $restart_lsn_segment");
128+
my $datadir = $node->data_dir;
129+
ok( -f "$datadir/pg_wal/$restart_lsn_segment",
130+
"WAL segment $restart_lsn_segment for physical slot's restart_lsn $restart_lsn exists"
131+
);
132+
133+
done_testing();

0 commit comments

Comments
 (0)