Skip to content

Commit 3c6a05b

Browse files
committed
amcheck: Distinguish interrupted page deletion from corruption.
This prevents false-positive reports about "the first child of leftmost target page is not leftmost of its level", "block %u is not leftmost" and "left link/right link pair". They appeared if amcheck ran before VACUUM cleaned things, after a cluster exited recovery between the first-stage and second-stage WAL records of a deletion. Back-patch to v11 (all supported versions). Reviewed by Peter Geoghegan. Discussion: https://postgr.es/m/20231005025232.c7.nmisch@google.com
1 parent 67738db commit 3c6a05b

File tree

4 files changed

+163
-4
lines changed

4 files changed

+163
-4
lines changed

contrib/amcheck/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ PGFILEDESC = "amcheck - function for verifying relation integrity"
1212

1313
REGRESS = check check_btree check_heap
1414

15+
EXTRA_INSTALL = contrib/pg_walinspect
1516
TAP_TESTS = 1
1617

1718
ifdef USE_PGXS

contrib/amcheck/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ tests += {
4242
't/001_verify_heapam.pl',
4343
't/002_cic.pl',
4444
't/003_cic_2pc.pl',
45+
't/005_pitr.pl',
4546
],
4647
},
4748
}

contrib/amcheck/t/005_pitr.pl

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
# Copyright (c) 2021-2023, PostgreSQL Global Development Group
2+
3+
# Test integrity of intermediate states by PITR to those states
4+
use strict;
5+
use warnings;
6+
use PostgreSQL::Test::Cluster;
7+
use PostgreSQL::Test::Utils;
8+
use Test::More;
9+
10+
# origin node: generate WAL records of interest.
11+
my $origin = PostgreSQL::Test::Cluster->new('origin');
12+
$origin->init(has_archiving => 1, allows_streaming => 1);
13+
$origin->append_conf('postgresql.conf', 'autovacuum = off');
14+
$origin->start;
15+
$origin->backup('my_backup');
16+
# Create a table with each of 6 PK values spanning 1/4 of a block. Delete the
17+
# first four, so one index leaf is eligible for deletion. Make a replication
18+
# slot just so pg_walinspect will always have access to later WAL.
19+
my $setup = <<EOSQL;
20+
BEGIN;
21+
CREATE EXTENSION amcheck;
22+
CREATE EXTENSION pg_walinspect;
23+
CREATE TABLE not_leftmost (c text STORAGE PLAIN);
24+
INSERT INTO not_leftmost
25+
SELECT repeat(n::text, database_block_size / 4)
26+
FROM generate_series(1,6) t(n), pg_control_init();
27+
ALTER TABLE not_leftmost ADD CONSTRAINT not_leftmost_pk PRIMARY KEY (c);
28+
DELETE FROM not_leftmost WHERE c ~ '^[1-4]';
29+
SELECT pg_create_physical_replication_slot('for_walinspect', true, false);
30+
COMMIT;
31+
EOSQL
32+
$origin->safe_psql('postgres', $setup);
33+
my $before_vacuum_lsn =
34+
$origin->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
35+
# VACUUM to delete the aforementioned leaf page. Force an XLogFlush() by
36+
# dropping a permanent table. That way, the XLogReader infrastructure can
37+
# always see VACUUM's records, even under synchronous_commit=off. Finally,
38+
# find the LSN of that VACUUM's last UNLINK_PAGE record.
39+
my $vacuum = <<EOSQL;
40+
SET synchronous_commit = off;
41+
VACUUM (VERBOSE, INDEX_CLEANUP ON) not_leftmost;
42+
CREATE TABLE XLogFlush ();
43+
DROP TABLE XLogFlush;
44+
SELECT max(start_lsn)
45+
FROM pg_get_wal_records_info('$before_vacuum_lsn', 'FFFFFFFF/FFFFFFFF')
46+
WHERE resource_manager = 'Btree' AND record_type = 'UNLINK_PAGE';
47+
EOSQL
48+
my $unlink_lsn = $origin->safe_psql('postgres', $vacuum);
49+
$origin->stop;
50+
die "did not find UNLINK_PAGE record" unless $unlink_lsn;
51+
52+
# replica node: amcheck at notable points in the WAL stream
53+
my $replica = PostgreSQL::Test::Cluster->new('replica');
54+
$replica->init_from_backup($origin, 'my_backup', has_restoring => 1);
55+
$replica->append_conf('postgresql.conf',
56+
"recovery_target_lsn = '$unlink_lsn'");
57+
$replica->append_conf('postgresql.conf', 'recovery_target_inclusive = off');
58+
$replica->append_conf('postgresql.conf', 'recovery_target_action = promote');
59+
$replica->start;
60+
$replica->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';")
61+
or die "Timed out while waiting for PITR promotion";
62+
# recovery done; run amcheck
63+
my $debug = "SET client_min_messages = 'debug1'";
64+
my ($rc, $stderr);
65+
$rc = $replica->psql(
66+
'postgres',
67+
"$debug; SELECT bt_index_parent_check('not_leftmost_pk', true)",
68+
stderr => \$stderr);
69+
print STDERR $stderr, "\n";
70+
is($rc, 0, "bt_index_parent_check passes");
71+
like(
72+
$stderr,
73+
qr/interrupted page deletion detected/,
74+
"bt_index_parent_check: interrupted page deletion detected");
75+
$rc = $replica->psql(
76+
'postgres',
77+
"$debug; SELECT bt_index_check('not_leftmost_pk', true)",
78+
stderr => \$stderr);
79+
print STDERR $stderr, "\n";
80+
is($rc, 0, "bt_index_check passes");
81+
82+
done_testing();

contrib/amcheck/verify_nbtree.c

Lines changed: 79 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,9 @@ static void bt_check_every_level(Relation rel, Relation heaprel,
148148
bool rootdescend);
149149
static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state,
150150
BtreeLevel level);
151+
static bool bt_leftmost_ignoring_half_dead(BtreeCheckState *state,
152+
BlockNumber start,
153+
BTPageOpaque start_opaque);
151154
static void bt_recheck_sibling_links(BtreeCheckState *state,
152155
BlockNumber btpo_prev_from_target,
153156
BlockNumber leftcurrent);
@@ -776,7 +779,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
776779
*/
777780
if (state->readonly)
778781
{
779-
if (!P_LEFTMOST(opaque))
782+
if (!bt_leftmost_ignoring_half_dead(state, current, opaque))
780783
ereport(ERROR,
781784
(errcode(ERRCODE_INDEX_CORRUPTED),
782785
errmsg("block %u is not leftmost in index \"%s\"",
@@ -830,8 +833,16 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
830833
*/
831834
}
832835

833-
/* Sibling links should be in mutual agreement */
834-
if (opaque->btpo_prev != leftcurrent)
836+
/*
837+
* Sibling links should be in mutual agreement. There arises
838+
* leftcurrent == P_NONE && btpo_prev != P_NONE when the left sibling
839+
* of the parent's low-key downlink is half-dead. (A half-dead page
840+
* has no downlink from its parent.) Under heavyweight locking, the
841+
* last bt_leftmost_ignoring_half_dead() validated this btpo_prev.
842+
* Without heavyweight locking, validation of the P_NONE case remains
843+
* unimplemented.
844+
*/
845+
if (opaque->btpo_prev != leftcurrent && leftcurrent != P_NONE)
835846
bt_recheck_sibling_links(state, opaque->btpo_prev, leftcurrent);
836847

837848
/* Check level */
@@ -912,6 +923,66 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
912923
return nextleveldown;
913924
}
914925

926+
/*
927+
* Like P_LEFTMOST(start_opaque), but accept an arbitrarily-long chain of
928+
* half-dead, sibling-linked pages to the left. If a half-dead page appears
929+
* under state->readonly, the database exited recovery between the first-stage
930+
* and second-stage WAL records of a deletion.
931+
*/
932+
static bool
933+
bt_leftmost_ignoring_half_dead(BtreeCheckState *state,
934+
BlockNumber start,
935+
BTPageOpaque start_opaque)
936+
{
937+
BlockNumber reached = start_opaque->btpo_prev,
938+
reached_from = start;
939+
bool all_half_dead = true;
940+
941+
/*
942+
* To handle the !readonly case, we'd need to accept BTP_DELETED pages and
943+
* potentially observe nbtree/README "Page deletion and backwards scans".
944+
*/
945+
Assert(state->readonly);
946+
947+
while (reached != P_NONE && all_half_dead)
948+
{
949+
Page page = palloc_btree_page(state, reached);
950+
BTPageOpaque reached_opaque = BTPageGetOpaque(page);
951+
952+
CHECK_FOR_INTERRUPTS();
953+
954+
/*
955+
* Try to detect btpo_prev circular links. _bt_unlink_halfdead_page()
956+
* writes that side-links will continue to point to the siblings.
957+
* Check btpo_next for that property.
958+
*/
959+
all_half_dead = P_ISHALFDEAD(reached_opaque) &&
960+
reached != start &&
961+
reached != reached_from &&
962+
reached_opaque->btpo_next == reached_from;
963+
if (all_half_dead)
964+
{
965+
XLogRecPtr pagelsn = PageGetLSN(page);
966+
967+
/* pagelsn should point to an XLOG_BTREE_MARK_PAGE_HALFDEAD */
968+
ereport(DEBUG1,
969+
(errcode(ERRCODE_NO_DATA),
970+
errmsg_internal("harmless interrupted page deletion detected in index \"%s\"",
971+
RelationGetRelationName(state->rel)),
972+
errdetail_internal("Block=%u right block=%u page lsn=%X/%X.",
973+
reached, reached_from,
974+
LSN_FORMAT_ARGS(pagelsn))));
975+
976+
reached_from = reached;
977+
reached = reached_opaque->btpo_prev;
978+
}
979+
980+
pfree(page);
981+
}
982+
983+
return all_half_dead;
984+
}
985+
915986
/*
916987
* Raise an error when target page's left link does not point back to the
917988
* previous target page, called leftcurrent here. The leftcurrent page's
@@ -952,6 +1023,9 @@ bt_recheck_sibling_links(BtreeCheckState *state,
9521023
BlockNumber btpo_prev_from_target,
9531024
BlockNumber leftcurrent)
9541025
{
1026+
/* passing metapage to BTPageGetOpaque() would give irrelevant findings */
1027+
Assert(leftcurrent != P_NONE);
1028+
9551029
if (!state->readonly)
9561030
{
9571031
Buffer lbuf;
@@ -1935,7 +2009,8 @@ bt_child_highkey_check(BtreeCheckState *state,
19352009
opaque = BTPageGetOpaque(page);
19362010

19372011
/* The first page we visit at the level should be leftmost */
1938-
if (first && !BlockNumberIsValid(state->prevrightlink) && !P_LEFTMOST(opaque))
2012+
if (first && !BlockNumberIsValid(state->prevrightlink) &&
2013+
!bt_leftmost_ignoring_half_dead(state, blkno, opaque))
19392014
ereport(ERROR,
19402015
(errcode(ERRCODE_INDEX_CORRUPTED),
19412016
errmsg("the first child of leftmost target page is not leftmost of its level in index \"%s\"",

0 commit comments

Comments
 (0)