Skip to content

Commit 6782709

Browse files
committed
Add regression test for restart points during promotion
This test serves as a way to demonstrate how to use the features introduced in 37b369d, while providing coverage for 7863ee4, which fixed a problem that caused the startup process to throw "PANIC: could not locate a valid checkpoint record" when starting recovery. The test checks that a node is able to properly restart following a crash when a restart point was finishing across a promotion, with an injection point added in the middle of CreateRestartPoint() to stop the restartpoint in flight. Note that this test fails when 7863ee4 is reverted. Kyotaro Horiguchi is the original author of this test, which was originally posted on the thread where 7863ee4 was discussed. I have just upgraded and polished it to rely on injection points, making it much cheaper to reproduce the failure. This test requires injection points to be enabled in the builds, hence meson and ./configure need an update to pass this knowledge down to the test. The name of the new injection point follows the same naming convention as 6a1ea02. The Makefile's EXTRA_INSTALL of recovery TAP tests is updated to include modules/injection_points. Author: Kyotaro Horiguchi, Michael Paquier Reviewed-by: Andrey Borodin, Bertrand Drouvot Discussion: https://postgr.es/m/ZdLuxBk5hGpol91B@paquier.xyz
1 parent 37b369d commit 6782709

File tree

4 files changed

+187
-1
lines changed

4 files changed

+187
-1
lines changed

src/backend/access/transam/xlog.c

+7
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@
100100
#include "storage/sync.h"
101101
#include "utils/guc_hooks.h"
102102
#include "utils/guc_tables.h"
103+
#include "utils/injection_point.h"
103104
#include "utils/memutils.h"
104105
#include "utils/ps_status.h"
105106
#include "utils/relmapper.h"
@@ -7528,6 +7529,12 @@ CreateRestartPoint(int flags)
75287529

75297530
CheckPointGuts(lastCheckPoint.redo, flags);
75307531

7532+
/*
7533+
* This location needs to be after CheckPointGuts() to ensure that some
7534+
* work has already happened during this checkpoint.
7535+
*/
7536+
INJECTION_POINT("create-restart-point");
7537+
75317538
/*
75327539
* Remember the prior checkpoint's redo ptr for
75337540
* UpdateCheckPointDistanceEstimate()

src/test/recovery/Makefile

+6-1
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,17 @@
99
#
1010
#-------------------------------------------------------------------------
1111

12-
EXTRA_INSTALL=contrib/pg_prewarm contrib/pg_stat_statements contrib/test_decoding
12+
EXTRA_INSTALL=contrib/pg_prewarm \
13+
contrib/pg_stat_statements \
14+
contrib/test_decoding \
15+
src/test/modules/injection_points
1316

1417
subdir = src/test/recovery
1518
top_builddir = ../../..
1619
include $(top_builddir)/src/Makefile.global
1720

21+
# required for 041_checkpoint_at_promote.pl, which skips itself unless this is 'yes'
export enable_injection_points
22+
1823
# required for 017_shm.pl and 027_stream_regress.pl
1924
REGRESS_SHLIB=$(abs_top_builddir)/src/test/regress/regress$(DLSUFFIX)
2025
export REGRESS_SHLIB

src/test/recovery/meson.build

+4
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ tests += {
66
'bd': meson.current_build_dir(),
77
'tap': {
88
'test_kwargs': {'priority': 40}, # recovery tests are slow, start early
9+
'env': {
10+
'enable_injection_points': get_option('injection_points') ? 'yes' : 'no',
11+
},
912
'tests': [
1013
't/001_stream_rep.pl',
1114
't/002_archiving.pl',
@@ -46,6 +49,7 @@ tests += {
4649
't/038_save_logical_slots_shutdown.pl',
4750
't/039_end_of_wal.pl',
4851
't/040_standby_failover_slots_sync.pl',
52+
't/041_checkpoint_at_promote.pl',
4953
],
5054
},
5155
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
2+
# Copyright (c) 2024, PostgreSQL Global Development Group
3+
4+
use strict;
5+
use warnings;
6+
use PostgreSQL::Test::Cluster;
7+
use PostgreSQL::Test::Utils;
8+
use Time::HiRes qw(usleep);
9+
use Test::More;
10+
11+
##################################################
12+
# Test race condition when a restart point is running during a promotion,
13+
# checking that WAL segments are correctly removed in the restart point
14+
# while the promotion finishes.
15+
#
16+
# This test relies on an injection point that causes the checkpointer to
17+
# wait in the middle of a restart point on a standby. The checkpointer
18+
# is woken up to finish its restart point only once the promotion of the
19+
# standby is completed, and the node should be able to restart properly.
20+
##################################################
21+
22+
# Skip unless the build enables injection points.  The variable may be unset
# when the test is run outside of make/meson, so default it to an empty string.
if (($ENV{enable_injection_points} // '') ne 'yes')
23+
{
24+
plan skip_all => 'Injection points not supported by this build';
25+
}
26+
27+
# Initialize primary node. log_checkpoints is required as the checkpoint
28+
# activity is monitored based on the contents of the logs.
29+
my $node_primary = PostgreSQL::Test::Cluster->new('master');
30+
$node_primary->init(allows_streaming => 1);
31+
$node_primary->append_conf(
32+
'postgresql.conf', q[
33+
log_checkpoints = on
34+
restart_after_crash = on
35+
]);
36+
$node_primary->start;
37+
38+
my $backup_name = 'my_backup';
39+
$node_primary->backup($backup_name);
40+
41+
# Setup a standby.
42+
my $node_standby = PostgreSQL::Test::Cluster->new('standby1');
43+
$node_standby->init_from_backup($node_primary, $backup_name,
44+
has_streaming => 1);
45+
$node_standby->start;
46+
47+
# Dummy table for the upcoming tests.
48+
$node_primary->safe_psql('postgres', 'checkpoint');
49+
$node_primary->safe_psql('postgres', 'CREATE TABLE prim_tab (a int);');
50+
51+
# Register an injection point on the standby so that the follow-up
52+
# restart point will wait on it.
53+
$node_primary->safe_psql('postgres', 'CREATE EXTENSION injection_points;');
54+
# Wait until the extension has been created on the standby
55+
$node_primary->wait_for_replay_catchup($node_standby);
56+
57+
# Note that from this point the checkpointer will wait in the middle of
58+
# a restart point on the standby.
59+
$node_standby->safe_psql('postgres',
60+
"SELECT injection_points_attach('create-restart-point', 'wait');");
61+
62+
# Execute a restart point on the standby, that we will now be waiting on.
63+
# This needs to be in the background.
64+
my $logstart = -s $node_standby->logfile;
65+
my $psql_session =
66+
$node_standby->background_psql('postgres', on_error_stop => 0);
67+
$psql_session->query_until(
68+
qr/starting_checkpoint/, q(
69+
\echo starting_checkpoint
70+
CHECKPOINT;
71+
));
72+
73+
# Switch one WAL segment to make the previous restart point remove the
74+
# segment once the restart point completes.
75+
$node_primary->safe_psql('postgres', 'INSERT INTO prim_tab VALUES (1);');
76+
$node_primary->safe_psql('postgres', 'SELECT pg_switch_wal();');
77+
$node_primary->wait_for_replay_catchup($node_standby);
78+
79+
# Wait until the checkpointer is in the middle of the restart point
80+
# processing.
81+
ok( $node_standby->poll_query_until(
82+
'postgres',
83+
qq[SELECT count(*) FROM pg_stat_activity
84+
WHERE backend_type = 'checkpointer' AND wait_event = 'create-restart-point' ;],
85+
'1'),
86+
'checkpointer is waiting in restart point'
87+
) or die "Timed out while waiting for checkpointer to run restart point";
88+
89+
# Check the logs that the restart point has started on standby. This is
90+
# optional, but let's be sure.
91+
ok( $node_standby->log_contains(
92+
"restartpoint starting: immediate wait", $logstart),
93+
"restartpoint has started");
94+
95+
# Trigger promotion during the restart point.
96+
$node_primary->stop;
97+
$node_standby->promote;
98+
99+
# Update the start position before waking up the checkpointer!
100+
$logstart = -s $node_standby->logfile;
101+
102+
# Now wake up the checkpointer.
103+
$node_standby->safe_psql('postgres',
104+
"SELECT injection_points_wakeup('create-restart-point');");
105+
106+
# Wait until the previous restart point completes on the newly-promoted
107+
# standby, checking the logs for that.
108+
my $checkpoint_complete = 0;
109+
# Poll every 100ms, scanning only the log output written after $logstart.
foreach my $i (0 .. 10 * $PostgreSQL::Test::Utils::timeout_default)
110+
{
111+
if ($node_standby->log_contains("restartpoint complete", $logstart))
112+
{
113+
$checkpoint_complete = 1;
114+
last;
115+
}
116+
usleep(100_000);
117+
}
118+
is($checkpoint_complete, 1, 'restart point has completed');
119+
120+
# Kill with SIGKILL, forcing all the backends to restart.
121+
my $psql_timeout = IPC::Run::timer(3600);
122+
my ($killme_stdin, $killme_stdout, $killme_stderr) = ('', '', '');
123+
my $killme = IPC::Run::start(
124+
[
125+
'psql', '-XAtq', '-v', 'ON_ERROR_STOP=1', '-f', '-', '-d',
126+
$node_standby->connstr('postgres')
127+
],
128+
'<',
129+
\$killme_stdin,
130+
'>',
131+
\$killme_stdout,
132+
'2>',
133+
\$killme_stderr,
134+
$psql_timeout);
135+
$killme_stdin .= q[
136+
SELECT pg_backend_pid();
137+
];
138+
$killme->pump until $killme_stdout =~ /[[:digit:]]+[\r\n]$/;
139+
my $pid = $killme_stdout;
140+
chomp($pid);
141+
$killme_stdout = '';
142+
$killme_stderr = '';
143+
144+
my $ret = PostgreSQL::Test::Utils::system_log('pg_ctl', 'kill', 'KILL', $pid);
145+
is($ret, 0, 'killed process with KILL');
146+
147+
# Wait until the server restarts, finish consuming output.
148+
$killme_stdin .= q[
149+
SELECT 1;
150+
];
151+
ok( pump_until(
152+
$killme,
153+
$psql_timeout,
154+
\$killme_stderr,
155+
qr/server closed the connection unexpectedly|connection to server was lost|could not send data to server/m
156+
),
157+
"psql query died successfully after SIGKILL");
158+
$killme->finish;
159+
160+
# Wait till server finishes restarting.
161+
$node_standby->poll_query_until('postgres', undef, '');
162+
163+
# After recovery, the server should be able to start.
164+
my $stdout;
165+
my $stderr;
166+
($ret, $stdout, $stderr) = $node_standby->psql('postgres', 'select 1');
167+
is($ret, 0, "psql connect success");
168+
is($stdout, 1, "psql select 1");
169+
170+
done_testing();

0 commit comments

Comments
 (0)