Skip to content

Commit e5d8a99

Browse files
Use full 64-bit XIDs in deleted nbtree pages.
Otherwise we risk "leaking" deleted pages by making them non-recyclable indefinitely. Commit 6655a729 did the same thing for deleted pages in GiST indexes. That work was used as a starting point here. Stop storing an XID indicating the oldest bpto.xact across all deleted though unrecycled pages in nbtree metapages. There is no longer any reason to care about that condition/the oldest XID. It only ever made sense when wraparound was something _bt_vacuum_needs_cleanup() had to consider. The btm_oldest_btpo_xact metapage field has been repurposed and renamed. It is now btm_last_cleanup_num_delpages, which is used to remember how many non-recycled deleted pages remain from the last VACUUM (in practice its value is usually the precise number of pages that were _newly deleted_ during the specific VACUUM operation that last set the field). The general idea behind storing btm_last_cleanup_num_delpages is to use it to give _some_ consideration to non-recycled deleted pages inside _bt_vacuum_needs_cleanup() -- though never too much. We only really need to avoid leaving a truly excessive number of deleted pages in an unrecycled state forever. We only do this to cover certain narrow cases where no other factor makes VACUUM do a full scan, and yet the index continues to grow (and so actually misses out on recycling existing deleted pages). These metapage changes result in a clear user-visible benefit: We no longer trigger full index scans during VACUUM operations solely due to the presence of only 1 or 2 known deleted (though unrecycled) blocks from a very large index. All that matters now is keeping the costs and benefits in balance over time. Fix an issue that has been around since commit 857f9c3, which added the "skip full scan of index" mechanism (i.e. the _bt_vacuum_needs_cleanup() logic). The accuracy of btm_last_cleanup_num_heap_tuples accidentally hinged upon _when_ the source value gets stored. We now always store btm_last_cleanup_num_heap_tuples in btvacuumcleanup(). This fixes the issue because IndexVacuumInfo.num_heap_tuples (the source field) is expected to accurately indicate the state of the table _after_ the VACUUM completes inside btvacuumcleanup(). A backpatchable fix cannot easily be extracted from this commit. A targeted fix for the issue will follow in a later commit, though that won't happen today. I (pgeoghegan) have chosen to remove any mention of deleted pages in the documentation of the vacuum_cleanup_index_scale_factor GUC/param, since the presence of deleted (though unrecycled) pages is no longer of much concern to users. The vacuum_cleanup_index_scale_factor description in the docs now seems rather unclear in any case, and it should probably be rewritten in the near future. Perhaps some passing mention of page deletion will be added back at the same time. Bump XLOG_PAGE_MAGIC due to nbtree WAL records using full XIDs now. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Discussion: https://postgr.es/m/CAH2-WznpdHvujGUwYZ8sihX=d5u-tRYhi-F4wnV2uN2zHpMUXw@mail.gmail.com
1 parent 8a4f952 commit e5d8a99

File tree

20 files changed

+623
-407
lines changed

20 files changed

+623
-407
lines changed

contrib/amcheck/verify_nbtree.c

Lines changed: 56 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -769,7 +769,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
769769
P_FIRSTDATAKEY(opaque));
770770
itup = (IndexTuple) PageGetItem(state->target, itemid);
771771
nextleveldown.leftmost = BTreeTupleGetDownLink(itup);
772-
nextleveldown.level = opaque->btpo.level - 1;
772+
nextleveldown.level = opaque->btpo_level - 1;
773773
}
774774
else
775775
{
@@ -794,14 +794,14 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
794794
if (opaque->btpo_prev != leftcurrent)
795795
bt_recheck_sibling_links(state, opaque->btpo_prev, leftcurrent);
796796

797-
/* Check level, which must be valid for non-ignorable page */
798-
if (level.level != opaque->btpo.level)
797+
/* Check level */
798+
if (level.level != opaque->btpo_level)
799799
ereport(ERROR,
800800
(errcode(ERRCODE_INDEX_CORRUPTED),
801801
errmsg("leftmost down link for level points to block in index \"%s\" whose level is not one level down",
802802
RelationGetRelationName(state->rel)),
803803
errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
804-
current, level.level, opaque->btpo.level)));
804+
current, level.level, opaque->btpo_level)));
805805

806806
/* Verify invariants for page */
807807
bt_target_page_check(state);
@@ -1164,7 +1164,7 @@ bt_target_page_check(BtreeCheckState *state)
11641164
bt_child_highkey_check(state,
11651165
offset,
11661166
NULL,
1167-
topaque->btpo.level);
1167+
topaque->btpo_level);
11681168
}
11691169
continue;
11701170
}
@@ -1520,7 +1520,7 @@ bt_target_page_check(BtreeCheckState *state)
15201520
if (!P_ISLEAF(topaque) && P_RIGHTMOST(topaque) && state->readonly)
15211521
{
15221522
bt_child_highkey_check(state, InvalidOffsetNumber,
1523-
NULL, topaque->btpo.level);
1523+
NULL, topaque->btpo_level);
15241524
}
15251525
}
15261526

@@ -1597,7 +1597,7 @@ bt_right_page_check_scankey(BtreeCheckState *state)
15971597
ereport(DEBUG1,
15981598
(errcode(ERRCODE_NO_DATA),
15991599
errmsg_internal("level %u leftmost page of index \"%s\" was found deleted or half dead",
1600-
opaque->btpo.level, RelationGetRelationName(state->rel)),
1600+
opaque->btpo_level, RelationGetRelationName(state->rel)),
16011601
errdetail_internal("Deleted page found when building scankey from right sibling.")));
16021602

16031603
/* Be slightly more pro-active in freeing this memory, just in case */
@@ -1900,14 +1900,15 @@ bt_child_highkey_check(BtreeCheckState *state,
19001900
state->targetblock, blkno,
19011901
LSN_FORMAT_ARGS(state->targetlsn))));
19021902

1903-
/* Check level for non-ignorable page */
1904-
if (!P_IGNORE(opaque) && opaque->btpo.level != target_level - 1)
1903+
/* Do level sanity check */
1904+
if ((!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque)) &&
1905+
opaque->btpo_level != target_level - 1)
19051906
ereport(ERROR,
19061907
(errcode(ERRCODE_INDEX_CORRUPTED),
19071908
errmsg("block found while following rightlinks from child of index \"%s\" has invalid level",
19081909
RelationGetRelationName(state->rel)),
19091910
errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
1910-
blkno, target_level - 1, opaque->btpo.level)));
1911+
blkno, target_level - 1, opaque->btpo_level)));
19111912

19121913
/* Try to detect circular links */
19131914
if ((!first && blkno == state->prevrightlink) || blkno == opaque->btpo_prev)
@@ -2132,7 +2133,7 @@ bt_child_check(BtreeCheckState *state, BTScanInsert targetkey,
21322133
* check for downlink connectivity.
21332134
*/
21342135
bt_child_highkey_check(state, downlinkoffnum,
2135-
child, topaque->btpo.level);
2136+
child, topaque->btpo_level);
21362137

21372138
/*
21382139
* Since there cannot be a concurrent VACUUM operation in readonly mode,
@@ -2275,7 +2276,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
22752276
errmsg_internal("harmless interrupted page split detected in index %s",
22762277
RelationGetRelationName(state->rel)),
22772278
errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.",
2278-
blkno, opaque->btpo.level,
2279+
blkno, opaque->btpo_level,
22792280
opaque->btpo_prev,
22802281
LSN_FORMAT_ARGS(pagelsn))));
22812282
return;
@@ -2304,7 +2305,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
23042305
elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"",
23052306
RelationGetRelationName(state->rel));
23062307

2307-
level = opaque->btpo.level;
2308+
level = opaque->btpo_level;
23082309
itemid = PageGetItemIdCareful(state, blkno, page, P_FIRSTDATAKEY(opaque));
23092310
itup = (IndexTuple) PageGetItem(page, itemid);
23102311
childblk = BTreeTupleGetDownLink(itup);
@@ -2319,16 +2320,16 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
23192320
break;
23202321

23212322
/* Do an extra sanity check in passing on internal pages */
2322-
if (copaque->btpo.level != level - 1)
2323+
if (copaque->btpo_level != level - 1)
23232324
ereport(ERROR,
23242325
(errcode(ERRCODE_INDEX_CORRUPTED),
23252326
errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down",
23262327
RelationGetRelationName(state->rel)),
23272328
errdetail_internal("Top parent/under check block=%u block pointed to=%u expected level=%u level in pointed to block=%u.",
23282329
blkno, childblk,
2329-
level - 1, copaque->btpo.level)));
2330+
level - 1, copaque->btpo_level)));
23302331

2331-
level = copaque->btpo.level;
2332+
level = copaque->btpo_level;
23322333
itemid = PageGetItemIdCareful(state, childblk, child,
23332334
P_FIRSTDATAKEY(copaque));
23342335
itup = (IndexTuple) PageGetItem(child, itemid);
@@ -2389,7 +2390,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
23892390
errmsg("internal index block lacks downlink in index \"%s\"",
23902391
RelationGetRelationName(state->rel)),
23912392
errdetail_internal("Block=%u level=%u page lsn=%X/%X.",
2392-
blkno, opaque->btpo.level,
2393+
blkno, opaque->btpo_level,
23932394
LSN_FORMAT_ARGS(pagelsn))));
23942395
}
23952396

@@ -2983,21 +2984,28 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
29832984
}
29842985

29852986
/*
2986-
* Deleted pages have no sane "level" field, so can only check non-deleted
2987-
* page level
2987+
* Deleted pages that still use the old 32-bit XID representation have no
2988+
* sane "level" field because they type pun the field, but all other pages
2989+
* (including pages deleted on Postgres 14+) have a valid value.
29882990
*/
2989-
if (P_ISLEAF(opaque) && !P_ISDELETED(opaque) && opaque->btpo.level != 0)
2990-
ereport(ERROR,
2991-
(errcode(ERRCODE_INDEX_CORRUPTED),
2992-
errmsg("invalid leaf page level %u for block %u in index \"%s\"",
2993-
opaque->btpo.level, blocknum, RelationGetRelationName(state->rel))));
2991+
if (!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque))
2992+
{
2993+
/* Okay, no reason not to trust btpo_level field from page */
29942994

2995-
if (!P_ISLEAF(opaque) && !P_ISDELETED(opaque) &&
2996-
opaque->btpo.level == 0)
2997-
ereport(ERROR,
2998-
(errcode(ERRCODE_INDEX_CORRUPTED),
2999-
errmsg("invalid internal page level 0 for block %u in index \"%s\"",
3000-
blocknum, RelationGetRelationName(state->rel))));
2995+
if (P_ISLEAF(opaque) && opaque->btpo_level != 0)
2996+
ereport(ERROR,
2997+
(errcode(ERRCODE_INDEX_CORRUPTED),
2998+
errmsg_internal("invalid leaf page level %u for block %u in index \"%s\"",
2999+
opaque->btpo_level, blocknum,
3000+
RelationGetRelationName(state->rel))));
3001+
3002+
if (!P_ISLEAF(opaque) && opaque->btpo_level == 0)
3003+
ereport(ERROR,
3004+
(errcode(ERRCODE_INDEX_CORRUPTED),
3005+
errmsg_internal("invalid internal page level 0 for block %u in index \"%s\"",
3006+
blocknum,
3007+
RelationGetRelationName(state->rel))));
3008+
}
30013009

30023010
/*
30033011
* Sanity checks for number of items on page.
@@ -3044,8 +3052,6 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
30443052
* state. This state is nonetheless treated as corruption by VACUUM on
30453053
* from version 9.4 on, so do the same here. See _bt_pagedel() for full
30463054
* details.
3047-
*
3048-
* Internal pages should never have garbage items, either.
30493055
*/
30503056
if (!P_ISLEAF(opaque) && P_ISHALFDEAD(opaque))
30513057
ereport(ERROR,
@@ -3054,11 +3060,27 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
30543060
blocknum, RelationGetRelationName(state->rel)),
30553061
errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
30563062

3063+
/*
3064+
* Check that internal pages have no garbage items, and that no page has
3065+
* an invalid combination of deletion-related page level flags
3066+
*/
30573067
if (!P_ISLEAF(opaque) && P_HAS_GARBAGE(opaque))
30583068
ereport(ERROR,
30593069
(errcode(ERRCODE_INDEX_CORRUPTED),
3060-
errmsg("internal page block %u in index \"%s\" has garbage items",
3061-
blocknum, RelationGetRelationName(state->rel))));
3070+
errmsg_internal("internal page block %u in index \"%s\" has garbage items",
3071+
blocknum, RelationGetRelationName(state->rel))));
3072+
3073+
if (P_HAS_FULLXID(opaque) && !P_ISDELETED(opaque))
3074+
ereport(ERROR,
3075+
(errcode(ERRCODE_INDEX_CORRUPTED),
3076+
errmsg_internal("full transaction id page flag appears in non-deleted block %u in index \"%s\"",
3077+
blocknum, RelationGetRelationName(state->rel))));
3078+
3079+
if (P_ISDELETED(opaque) && P_ISHALFDEAD(opaque))
3080+
ereport(ERROR,
3081+
(errcode(ERRCODE_INDEX_CORRUPTED),
3082+
errmsg_internal("deleted page block %u in index \"%s\" is half-dead",
3083+
blocknum, RelationGetRelationName(state->rel))));
30623084

30633085
return page;
30643086
}

contrib/pageinspect/btreefuncs.c

Lines changed: 49 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -75,11 +75,7 @@ typedef struct BTPageStat
7575
/* opaque data */
7676
BlockNumber btpo_prev;
7777
BlockNumber btpo_next;
78-
union
79-
{
80-
uint32 level;
81-
TransactionId xact;
82-
} btpo;
78+
uint32 btpo_level;
8379
uint16 btpo_flags;
8480
BTCycleId btpo_cycleid;
8581
} BTPageStat;
@@ -112,9 +108,33 @@ GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat)
112108
/* page type (flags) */
113109
if (P_ISDELETED(opaque))
114110
{
115-
stat->type = 'd';
116-
stat->btpo.xact = opaque->btpo.xact;
117-
return;
111+
/* We divide deleted pages into leaf ('d') or internal ('D') */
112+
if (P_ISLEAF(opaque) || !P_HAS_FULLXID(opaque))
113+
stat->type = 'd';
114+
else
115+
stat->type = 'D';
116+
117+
/*
118+
* Report safexid in a deleted page.
119+
*
120+
* Handle pg_upgrade'd deleted pages that used the previous safexid
121+
* representation in btpo_level field (this used to be a union type
122+
* called "bpto").
123+
*/
124+
if (P_HAS_FULLXID(opaque))
125+
{
126+
FullTransactionId safexid = BTPageGetDeleteXid(page);
127+
128+
elog(NOTICE, "deleted page from block %u has safexid %u:%u",
129+
blkno, EpochFromFullTransactionId(safexid),
130+
XidFromFullTransactionId(safexid));
131+
}
132+
else
133+
elog(NOTICE, "deleted page from block %u has safexid %u",
134+
blkno, opaque->btpo_level);
135+
136+
/* Don't interpret BTDeletedPageData as index tuples */
137+
maxoff = InvalidOffsetNumber;
118138
}
119139
else if (P_IGNORE(opaque))
120140
stat->type = 'e';
@@ -128,7 +148,7 @@ GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat)
128148
/* btpage opaque data */
129149
stat->btpo_prev = opaque->btpo_prev;
130150
stat->btpo_next = opaque->btpo_next;
131-
stat->btpo.level = opaque->btpo.level;
151+
stat->btpo_level = opaque->btpo_level;
132152
stat->btpo_flags = opaque->btpo_flags;
133153
stat->btpo_cycleid = opaque->btpo_cycleid;
134154

@@ -237,7 +257,7 @@ bt_page_stats_internal(PG_FUNCTION_ARGS, enum pageinspect_version ext_version)
237257
values[j++] = psprintf("%u", stat.free_size);
238258
values[j++] = psprintf("%u", stat.btpo_prev);
239259
values[j++] = psprintf("%u", stat.btpo_next);
240-
values[j++] = psprintf("%u", (stat.type == 'd') ? stat.btpo.xact : stat.btpo.level);
260+
values[j++] = psprintf("%u", stat.btpo_level);
241261
values[j++] = psprintf("%d", stat.btpo_flags);
242262

243263
tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
@@ -503,10 +523,14 @@ bt_page_items_internal(PG_FUNCTION_ARGS, enum pageinspect_version ext_version)
503523

504524
opaque = (BTPageOpaque) PageGetSpecialPointer(uargs->page);
505525

506-
if (P_ISDELETED(opaque))
507-
elog(NOTICE, "page is deleted");
508-
509-
fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
526+
if (!P_ISDELETED(opaque))
527+
fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
528+
else
529+
{
530+
/* Don't interpret BTDeletedPageData as index tuples */
531+
elog(NOTICE, "page from block " INT64_FORMAT " is deleted", blkno);
532+
fctx->max_calls = 0;
533+
}
510534
uargs->leafpage = P_ISLEAF(opaque);
511535
uargs->rightmost = P_RIGHTMOST(opaque);
512536

@@ -603,7 +627,14 @@ bt_page_items_bytea(PG_FUNCTION_ARGS)
603627
if (P_ISDELETED(opaque))
604628
elog(NOTICE, "page is deleted");
605629

606-
fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
630+
if (!P_ISDELETED(opaque))
631+
fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
632+
else
633+
{
634+
/* Don't interpret BTDeletedPageData as index tuples */
635+
elog(NOTICE, "page from block is deleted");
636+
fctx->max_calls = 0;
637+
}
607638
uargs->leafpage = P_ISLEAF(opaque);
608639
uargs->rightmost = P_RIGHTMOST(opaque);
609640

@@ -692,10 +723,7 @@ bt_metap(PG_FUNCTION_ARGS)
692723

693724
/*
694725
* We need a kluge here to detect API versions prior to 1.8. Earlier
695-
* versions incorrectly used int4 for certain columns. This caused
696-
* various problems. For example, an int4 version of the "oldest_xact"
697-
* column would not work with TransactionId values that happened to exceed
698-
* PG_INT32_MAX.
726+
* versions incorrectly used int4 for certain columns.
699727
*
700728
* There is no way to reliably avoid the problems created by the old
701729
* function definition at this point, so insist that the user update the
@@ -723,7 +751,8 @@ bt_metap(PG_FUNCTION_ARGS)
723751
*/
724752
if (metad->btm_version >= BTREE_NOVAC_VERSION)
725753
{
726-
values[j++] = psprintf("%u", metad->btm_oldest_btpo_xact);
754+
values[j++] = psprintf(INT64_FORMAT,
755+
(int64) metad->btm_last_cleanup_num_delpages);
727756
values[j++] = psprintf("%f", metad->btm_last_cleanup_num_heap_tuples);
728757
values[j++] = metad->btm_allequalimage ? "t" : "f";
729758
}

contrib/pageinspect/expected/btree.out

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,16 @@ INSERT INTO test1 VALUES (72057594037927937, 'text');
33
CREATE INDEX test1_a_idx ON test1 USING btree (a);
44
\x
55
SELECT * FROM bt_metap('test1_a_idx');
6-
-[ RECORD 1 ]-----------+-------
7-
magic | 340322
8-
version | 4
9-
root | 1
10-
level | 0
11-
fastroot | 1
12-
fastlevel | 0
13-
oldest_xact | 0
14-
last_cleanup_num_tuples | -1
15-
allequalimage | t
6+
-[ RECORD 1 ]-------------+-------
7+
magic | 340322
8+
version | 4
9+
root | 1
10+
level | 0
11+
fastroot | 1
12+
fastlevel | 0
13+
last_cleanup_num_delpages | 0
14+
last_cleanup_num_tuples | -1
15+
allequalimage | t
1616

1717
SELECT * FROM bt_page_stats('test1_a_idx', -1);
1818
ERROR: invalid block number
@@ -29,7 +29,7 @@ page_size | 8192
2929
free_size | 8128
3030
btpo_prev | 0
3131
btpo_next | 0
32-
btpo | 0
32+
btpo_level | 0
3333
btpo_flags | 3
3434

3535
SELECT * FROM bt_page_stats('test1_a_idx', 2);

0 commit comments

Comments
 (0)