# Copyright (c) 2024, PostgreSQL Global Development Group

use strict;
use warnings;
use PostgreSQL::Test::Cluster;
use PostgreSQL::Test::Utils;
use Time::HiRes qw(usleep);
use Test::More;

##################################################
# Test race condition when a restart point is running during a promotion,
# checking that WAL segments are correctly removed in the restart point
# while the promotion finishes.
#
# This test relies on an injection point that causes the checkpointer to
# wait in the middle of a restart point on a standby.  The checkpointer
# is woken up to finish its restart point only once the promotion of the
# standby is completed, and the node should be able to restart properly.
##################################################

# The variable may be entirely unset, so check definedness first to avoid
# an "uninitialized value" warning before skipping.
if (!defined($ENV{enable_injection_points})
	|| $ENV{enable_injection_points} ne 'yes')
{
	plan skip_all => 'Injection points not supported by this build';
}

# Initialize primary node.  log_checkpoints is required as the checkpoint
# activity is monitored based on the contents of the logs.
my $node_primary = PostgreSQL::Test::Cluster->new('master');
$node_primary->init(allows_streaming => 1);
$node_primary->append_conf(
	'postgresql.conf', q[
log_checkpoints = on
restart_after_crash = on
]);
$node_primary->start;

my $backup_name = 'my_backup';
$node_primary->backup($backup_name);

# Setup a standby.
my $node_standby = PostgreSQL::Test::Cluster->new('standby1');
$node_standby->init_from_backup($node_primary, $backup_name,
	has_streaming => 1);
$node_standby->start;

# Dummy table for the upcoming tests.
$node_primary->safe_psql('postgres', 'checkpoint');
$node_primary->safe_psql('postgres', 'CREATE TABLE prim_tab (a int);');

# Register an injection point on the standby so as the follow-up
# restart point will wait on it.
$node_primary->safe_psql('postgres', 'CREATE EXTENSION injection_points;');
# Wait until the extension has been created on the standby.
$node_primary->wait_for_replay_catchup($node_standby);

# Note that from this point the checkpointer will wait in the middle of
# a restart point on the standby.
$node_standby->safe_psql('postgres',
	"SELECT injection_points_attach('create-restart-point', 'wait');");

# Execute a restart point on the standby, that we will now be waiting on.
# This needs to be in the background.  Remember the current size of the
# standby's log file so that later log checks only look at newer entries.
my $logstart = -s $node_standby->logfile;
my $psql_session =
  $node_standby->background_psql('postgres', on_error_stop => 0);
$psql_session->query_until(
	qr/starting_checkpoint/, q(
   \echo starting_checkpoint
   CHECKPOINT;
));

# Switch one WAL segment to make the previous restart point remove the
# segment once the restart point completes.
$node_primary->safe_psql('postgres', 'INSERT INTO prim_tab VALUES (1);');
$node_primary->safe_psql('postgres', 'SELECT pg_switch_wal();');
$node_primary->wait_for_replay_catchup($node_standby);

# Wait until the checkpointer is in the middle of the restart point
# processing.
ok( $node_standby->poll_query_until(
		'postgres',
		qq[SELECT count(*) FROM pg_stat_activity
           WHERE backend_type = 'checkpointer' AND wait_event = 'create-restart-point' ;],
		'1'),
	'checkpointer is waiting in restart point'
) or die "Timed out while waiting for checkpointer to run restart point";

# Check the logs that the restart point has started on standby.  This is
# optional, but let's be sure.
ok( $node_standby->log_contains(
		"restartpoint starting: immediate wait", $logstart),
	"restartpoint has started");

# Trigger promotion during the restart point.
$node_primary->stop;
$node_standby->promote;

# Update the start position before waking up the checkpointer!
$logstart = -s $node_standby->logfile;

# Now wake up the checkpointer.
$node_standby->safe_psql('postgres',
	"SELECT injection_points_wakeup('create-restart-point');");

# Wait until the previous restart point completes on the newly-promoted
# standby, checking the logs for that.  Poll every 100ms, for up to
# 10 * timeout_default iterations.
my $checkpoint_complete = 0;
foreach my $i (0 .. 10 * $PostgreSQL::Test::Utils::timeout_default)
{
	# The offset has to be an argument of log_contains() so that only log
	# output written after $logstart is searched.  Leaving it outside the
	# call turns the condition into a comma expression that is always true.
	if ($node_standby->log_contains("restartpoint complete", $logstart))
	{
		$checkpoint_complete = 1;
		last;
	}
	usleep(100_000);
}
is($checkpoint_complete, 1, 'restart point has completed');

# Kill with SIGKILL, forcing all the backends to restart.
my $psql_timeout = IPC::Run::timer(3600);
my ($killme_stdin, $killme_stdout, $killme_stderr) = ('', '', '');
my $killme = IPC::Run::start(
	[
		'psql', '-XAtq', '-v', 'ON_ERROR_STOP=1', '-f', '-', '-d',
		$node_standby->connstr('postgres')
	],
	'<',
	\$killme_stdin,
	'>',
	\$killme_stdout,
	'2>',
	\$killme_stderr,
	$psql_timeout);
$killme_stdin .= q[
SELECT pg_backend_pid();
];
$killme->pump until $killme_stdout =~ /[[:digit:]]+[\r\n]$/;
# Extract only the digits: the output may end with "\r\n", and chomp()
# alone would leave the carriage return attached to the PID.
my ($pid) = $killme_stdout =~ /([[:digit:]]+)/;
$killme_stdout = '';
$killme_stderr = '';

my $ret = PostgreSQL::Test::Utils::system_log('pg_ctl', 'kill', 'KILL', $pid);
is($ret, 0, 'killed process with KILL');

# Wait until the server restarts, finish consuming output.
$killme_stdin .= q[
SELECT 1;
];
ok( pump_until(
		$killme,
		$psql_timeout,
		\$killme_stderr,
		qr/server closed the connection unexpectedly|connection to server was lost|could not send data to server/m
	),
	"psql query died successfully after SIGKILL");
$killme->finish;

# Wait till server finishes restarting.
$node_standby->poll_query_until('postgres', undef, '');

# After recovery, the server should be able to start.
my $stdout;
my $stderr;
($ret, $stdout, $stderr) = $node_standby->psql('postgres', 'select 1');
is($ret, 0, "psql connect success");
is($stdout, 1, "psql select 1");

done_testing();