# Copyright (c) 2022, PostgreSQL Global Development Group

# Test recovering to a point-in-time using WAL archive, such that the
# target point is physically in a WAL segment with a higher TLI than
# the target point's TLI. For example, imagine that the following WAL
# segments exist in the WAL archive:
#
# 000000010000000000000001
# 000000010000000000000002
# 000000020000000000000003
#
# The timeline switch happened in the middle of WAL segment 3, but it
# was never archived on timeline 1. The first half of
# 000000020000000000000003 contains the WAL from timeline 1 up to the
# point where the timeline switch happened. If you now perform
# archive recovery with a recovery target in that first half of
# segment 3, recovery will find the WAL up to that point in segment
# 000000020000000000000003, but it will not follow the timeline
# switch to timeline 2; instead it creates a timeline-switching
# end-of-recovery record with TLI 1 -> 3. That is what this test
# case exercises.
#
# The comments below list the contents of the WAL archive at various
# points in the test, to make it easier to follow along. They are
# correct as of this writing, but the exact WAL segment numbers could
# change if the backend logic for deciding when to switch to a new
# segment changes. The actual checks are not sensitive to that.
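#
# A note on the naming, for readers following along: a WAL segment file
# name is 24 hex digits, TTTTTTTTXXXXXXXXYYYYYYYY, i.e. the timeline ID
# followed by the high and low 32-bit halves of the segment number. A
# minimal Perl sketch of decoding a name (illustration only, not used by
# this test):
#
#   my ($tli, $hi, $lo) =
#       map { hex } unpack('(A8)3', '000000020000000000000003');
#   # $tli == 2, $hi == 0, $lo == 3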

use strict;
use warnings;
use PostgreSQL::Test::Cluster;
use PostgreSQL::Test::Utils;
use Test::More;
use File::Compare;

# Initialize and start primary node with WAL archiving
my $node_primary = PostgreSQL::Test::Cluster->new('primary');
$node_primary->init(has_archiving => 1, allows_streaming => 1);
$node_primary->start;

# Take a backup.
my $backup_name = 'my_backup';
$node_primary->backup($backup_name);

# Workload with some transactions, and the target restore point.
$node_primary->psql(
    'postgres', qq{
CREATE TABLE foo(i int);
INSERT INTO foo VALUES(1);
SELECT pg_create_restore_point('rp');
INSERT INTO foo VALUES(2);
});
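
# As an aside, pg_create_restore_point() returns the LSN of the restore
# point record. This test only needs the name 'rp', but the LSN could be
# captured like this ($rp_lsn is a hypothetical variable, shown as a
# sketch only):
#
#   my $rp_lsn = $node_primary->safe_psql('postgres',
#       "SELECT pg_create_restore_point('rp')");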

# Contents of the WAL archive at this point:
#
# 000000010000000000000001
# 000000010000000000000002
# 000000010000000000000002.00000028.backup
#
# The operations on the test table and the restore point went into WAL
# segment 3, but it hasn't been archived yet.
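#
# When hacking on this test, the archive can be eyeballed with a sketch
# like the following (assuming the Cluster module's archive_dir method;
# not part of the test's assertions):
#
#   note(join("\n", sort glob($node_primary->archive_dir . '/*')));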

# Start a standby node, and wait for it to catch up.
my $node_standby = PostgreSQL::Test::Cluster->new('standby');
$node_standby->init_from_backup(
    $node_primary, $backup_name,
    standby => 1,
    has_streaming => 1,
    has_archiving => 1,
    has_restoring => 0);
$node_standby->append_conf('postgresql.conf', 'archive_mode = always');
$node_standby->start;
$node_primary->wait_for_catchup($node_standby);

# Check that it's really caught up.
my $result = $node_standby->safe_psql('postgres', "SELECT max(i) FROM foo;");
is($result, qq{2}, "check table contents on streaming standby");

# Kill the old primary before it archives the most recent WAL segment,
# which contains all the INSERTs.
$node_primary->stop('immediate');

# Promote the standby, and switch WAL so that it archives a WAL segment
# that contains all the INSERTs, on a new timeline.
$node_standby->promote;

# Find next WAL segment to be archived.
my $walfile_to_be_archived = $node_standby->safe_psql('postgres',
    "SELECT pg_walfile_name(pg_current_wal_lsn());");

# Make WAL segment eligible for archival
$node_standby->safe_psql('postgres', 'SELECT pg_switch_wal()');

# Wait until the WAL segment has been archived.
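# The text comparison below is sound because WAL segment names are
# fixed-width, zero-padded hex, so lexicographic order matches WAL
# order within a timeline.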
my $archive_wait_query =
    "SELECT '$walfile_to_be_archived' <= last_archived_wal FROM pg_stat_archiver;";
$node_standby->poll_query_until('postgres', $archive_wait_query)
    or die "Timed out while waiting for WAL segment to be archived";
my $last_archived_wal_file = $walfile_to_be_archived;

# Ok, the standby has now archived the WAL on timeline 2. We don't
# need the standby anymore.
$node_standby->stop;

# Contents of the WAL archive at this point:
#
# 000000010000000000000001
# 000000010000000000000002
# 000000010000000000000002.00000028.backup
# 000000010000000000000003.partial
# 000000020000000000000003
# 00000002.history
#
# The operations on the test table and the restore point are in
# segment 3. They are part of timeline 1, but were not archived by
# the primary yet. However, they were copied into the beginning of
# segment 000000020000000000000003, before the timeline switching
# record. (They are also present in the
# 000000010000000000000003.partial file, but .partial files are not
# used automatically.)
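# To use a .partial file, it would have to be manually renamed to drop
# the .partial suffix.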

# Now test PITR to the recovery target. It should find the WAL in
# segment 000000020000000000000003, but not follow the timeline switch
# to timeline 2.
my $node_pitr = PostgreSQL::Test::Cluster->new('node_pitr');
$node_pitr->init_from_backup(
    $node_primary, $backup_name,
    standby => 0,
    has_restoring => 1);
$node_pitr->append_conf(
    'postgresql.conf', qq{
recovery_target_name = 'rp'
recovery_target_action = 'promote'
});
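
# (For context: has_restoring => 1 above makes init_from_backup configure
# a restore_command that fetches WAL from the archive, so the settings
# just appended drive recovery to the 'rp' restore point and then promote.)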

$node_pitr->start;

# Wait until recovery finishes.
$node_pitr->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';")
    or die "Timed out while waiting for PITR promotion";

# Check that we see the data we expect.
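# Only the row inserted before the restore point should be visible: the
# INSERT of 2 was logged after 'rp', so it must not have been replayed.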
$result = $node_pitr->safe_psql('postgres', "SELECT max(i) FROM foo;");
is($result, qq{1}, "check table contents after point-in-time recovery");

# Insert a row so that we can check later that we successfully recover
# back to this timeline.
$node_pitr->safe_psql('postgres', "INSERT INTO foo VALUES(3);");

# Stop the node. This archives the last segment.
$node_pitr->stop();

# Test archive recovery on the timeline created by the PITR. This
# replays the end-of-recovery record that switches from timeline 1 to
# 3.
my $node_pitr2 = PostgreSQL::Test::Cluster->new('node_pitr2');
$node_pitr2->init_from_backup(
    $node_primary, $backup_name,
    standby => 0,
    has_restoring => 1);
$node_pitr2->append_conf(
    'postgresql.conf', qq{
recovery_target_action = 'promote'
});

$node_pitr2->start;

# Wait until recovery finishes.
$node_pitr2->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';")
    or die "Timed out while waiting for PITR promotion";

# Verify that we can see the row inserted after the PITR.
$result = $node_pitr2->safe_psql('postgres', "SELECT max(i) FROM foo;");
is($result, qq{3}, "check table contents after recovery on the PITR timeline");
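
# To additionally confirm which timeline this node ended up on, a sketch
# like the following could be used (illustration only, not one of this
# test's assertions):
#
#   my $tli = $node_pitr2->safe_psql('postgres',
#       "SELECT timeline_id FROM pg_control_checkpoint()");
#   # The PITR above created timeline 3; this node's promotion at the end
#   # of recovery then switches to a new timeline of its own.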

done_testing();