Skip to content

Commit bf78f50

Browse files
Don't leave behind junk nbtree pages during split.
Commit 8fa30f9 reduced the elevel of a number of "can't happen" _bt_split() errors from PANIC to ERROR. At the same time, the new right page buffer for the split could continue to be acquired well before the critical section. This was possible because it was relatively straightforward to make sure that _bt_split() could not throw an error, with a few specific exceptions. The exceptional cases were safe because they involved specific, well understood errors, making it possible to consistently zero the right page before actually raising an error using elog(). There was no danger of leaving around a junk page, provided _bt_split() stuck to this coding rule. Commit 8224de4, which introduced INCLUDE indexes, added code to make _bt_split() truncate away non-key attributes. This happened at a point that broke the rule around zeroing the right page in _bt_split(). If truncation failed (perhaps due to palloc() failure), that would result in an errant right page buffer with junk contents. This could confuse VACUUM when it attempted to delete the page, and should be avoided on general principle. To fix, reorganize _bt_split() so that truncation occurs before the new right page buffer is even acquired. A junk page/buffer will not be left behind if _bt_nonkey_truncate()/_bt_truncate() raise an error. Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com Backpatch: 11-, where INCLUDE indexes were introduced.
1 parent 6b0e941 commit bf78f50

File tree

1 file changed

+123
-84
lines changed

1 file changed

+123
-84
lines changed

src/backend/access/nbtree/nbtinsert.c

Lines changed: 123 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,7 @@ static void _bt_insertonpg(Relation rel, Buffer buf, Buffer cbuf,
7171
OffsetNumber newitemoff,
7272
bool split_only_page);
7373
static Buffer _bt_split(Relation rel, Buffer buf, Buffer cbuf,
74-
OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz,
75-
IndexTuple newitem, bool newitemonleft);
74+
OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem);
7675
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
7776
BTStack stack, bool is_root, bool is_only);
7877
static OffsetNumber _bt_findsplitloc(Relation rel, Page page,
@@ -842,7 +841,6 @@ _bt_insertonpg(Relation rel,
842841
{
843842
Page page;
844843
BTPageOpaque lpageop;
845-
OffsetNumber firstright = InvalidOffsetNumber;
846844
Size itemsz;
847845

848846
page = BufferGetPage(buf);
@@ -878,7 +876,6 @@ _bt_insertonpg(Relation rel,
878876
{
879877
bool is_root = P_ISROOT(lpageop);
880878
bool is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop);
881-
bool newitemonleft;
882879
Buffer rbuf;
883880

884881
/*
@@ -899,14 +896,8 @@ _bt_insertonpg(Relation rel,
899896
Assert(!(P_ISLEAF(lpageop) &&
900897
BlockNumberIsValid(RelationGetTargetBlock(rel))));
901898

902-
/* Choose the split point */
903-
firstright = _bt_findsplitloc(rel, page,
904-
newitemoff, itemsz,
905-
&newitemonleft);
906-
907899
/* split the buffer into left and right halves */
908-
rbuf = _bt_split(rel, buf, cbuf, firstright,
909-
newitemoff, itemsz, itup, newitemonleft);
900+
rbuf = _bt_split(rel, buf, cbuf, newitemoff, itemsz, itup);
910901
PredicateLockPageSplit(rel,
911902
BufferGetBlockNumber(buf),
912903
BufferGetBlockNumber(rbuf));
@@ -1106,9 +1097,8 @@ _bt_insertonpg(Relation rel,
11061097
* _bt_split() -- split a page in the btree.
11071098
*
11081099
* On entry, buf is the page to split, and is pinned and write-locked.
1109-
* firstright is the item index of the first item to be moved to the
1110-
* new right page. newitemoff etc. tell us about the new item that
1111-
* must be inserted along with the data from the old page.
1100+
* newitemoff etc. tell us about the new item that must be inserted
1101+
* along with the data from the original page.
11121102
*
11131103
* When splitting a non-leaf page, 'cbuf' is the left-sibling of the
11141104
* page we're inserting the downlink for. This function will clear the
@@ -1118,9 +1108,8 @@ _bt_insertonpg(Relation rel,
11181108
* The pin and lock on buf are maintained.
11191109
*/
11201110
static Buffer
1121-
_bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
1122-
OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
1123-
bool newitemonleft)
1111+
_bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber newitemoff,
1112+
Size newitemsz, IndexTuple newitem)
11241113
{
11251114
Buffer rbuf;
11261115
Page origpage;
@@ -1139,99 +1128,81 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
11391128
IndexTuple item;
11401129
OffsetNumber leftoff,
11411130
rightoff;
1131+
OffsetNumber firstright;
11421132
OffsetNumber maxoff;
11431133
OffsetNumber i;
1144-
bool isleaf;
1134+
bool newitemonleft,
1135+
isleaf;
11451136
IndexTuple lefthikey;
11461137
int indnatts = IndexRelationGetNumberOfAttributes(rel);
11471138
int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
11481139

1149-
/* Acquire a new page to split into */
1150-
rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
1151-
11521140
/*
11531141
* origpage is the original page to be split. leftpage is a temporary
11541142
* buffer that receives the left-sibling data, which will be copied back
1155-
* into origpage on success. rightpage is the new page that receives the
1156-
* right-sibling data. If we fail before reaching the critical section,
1157-
* origpage hasn't been modified and leftpage is only workspace. In
1158-
* principle we shouldn't need to worry about rightpage either, because it
1159-
* hasn't been linked into the btree page structure; but to avoid leaving
1160-
* possibly-confusing junk behind, we are careful to rewrite rightpage as
1161-
* zeroes before throwing any error.
1143+
* into origpage on success. rightpage is the new page that will receive
1144+
* the right-sibling data.
1145+
*
1146+
* leftpage is allocated after choosing a split point. rightpage's new
1147+
* buffer isn't acquired until after leftpage is initialized and has new
1148+
* high key, the last point where splitting the page may fail (barring
1149+
* corruption). Failing before acquiring new buffer won't have lasting
1150+
* consequences, since origpage won't have been modified and leftpage is
1151+
* only workspace.
11621152
*/
11631153
origpage = BufferGetPage(buf);
1164-
leftpage = PageGetTempPage(origpage);
1165-
rightpage = BufferGetPage(rbuf);
1166-
1154+
oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
11671155
origpagenumber = BufferGetBlockNumber(buf);
1168-
rightpagenumber = BufferGetBlockNumber(rbuf);
1169-
1170-
_bt_pageinit(leftpage, BufferGetPageSize(buf));
1171-
/* rightpage was already initialized by _bt_getbuf */
11721156

11731157
/*
1174-
* Copy the original page's LSN into leftpage, which will become the
1175-
* updated version of the page. We need this because XLogInsert will
1176-
* examine the LSN and possibly dump it in a page image.
1158+
* Choose a point to split origpage at.
1159+
*
1160+
* A split point can be thought of as a point _between_ two existing
1161+
* tuples on origpage (lastleft and firstright tuples), provided you
1162+
* pretend that the new item that didn't fit is already on origpage.
1163+
*
1164+
* Since origpage does not actually contain newitem, the representation of
1165+
* split points needs to work with two boundary cases: splits where
1166+
* newitem is lastleft, and splits where newitem is firstright.
1167+
* newitemonleft resolves the ambiguity that would otherwise exist when
1168+
* newitemoff == firstright. In all other cases it's clear which side of
1169+
* the split every tuple goes on from context. newitemonleft is usually
1170+
* (but not always) redundant information.
11771171
*/
1178-
PageSetLSN(leftpage, PageGetLSN(origpage));
1172+
firstright = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
1173+
&newitemonleft);
11791174

1180-
/* init btree private data */
1181-
oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
1175+
/* Allocate temp buffer for leftpage */
1176+
leftpage = PageGetTempPage(origpage);
1177+
_bt_pageinit(leftpage, BufferGetPageSize(buf));
11821178
lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
1183-
ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
1184-
1185-
isleaf = P_ISLEAF(oopaque);
11861179

1187-
/* if we're splitting this page, it won't be the root when we're done */
1188-
/* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
1180+
/*
1181+
* leftpage won't be the root when we're done. Also, clear the SPLIT_END
1182+
* and HAS_GARBAGE flags.
1183+
*/
11891184
lopaque->btpo_flags = oopaque->btpo_flags;
11901185
lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1191-
ropaque->btpo_flags = lopaque->btpo_flags;
1192-
/* set flag in left page indicating that the right page has no downlink */
1186+
/* set flag in leftpage indicating that rightpage has no downlink yet */
11931187
lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
11941188
lopaque->btpo_prev = oopaque->btpo_prev;
1195-
lopaque->btpo_next = rightpagenumber;
1196-
ropaque->btpo_prev = origpagenumber;
1197-
ropaque->btpo_next = oopaque->btpo_next;
1198-
lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level;
1199-
/* Since we already have write-lock on both pages, ok to read cycleid */
1200-
lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1201-
ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1189+
/* handle btpo_next after rightpage buffer acquired */
1190+
lopaque->btpo.level = oopaque->btpo.level;
1191+
/* handle btpo_cycleid after rightpage buffer acquired */
12021192

12031193
/*
1204-
* If the page we're splitting is not the rightmost page at its level in
1205-
* the tree, then the first entry on the page is the high key for the
1206-
* page. We need to copy that to the right half. Otherwise (meaning the
1207-
* rightmost page case), all the items on the right half will be user
1208-
* data.
1194+
* Copy the original page's LSN into leftpage, which will become the
1195+
* updated version of the page. We need this because XLogInsert will
1196+
* examine the LSN and possibly dump it in a page image.
12091197
*/
1210-
rightoff = P_HIKEY;
1211-
1212-
if (!P_RIGHTMOST(oopaque))
1213-
{
1214-
itemid = PageGetItemId(origpage, P_HIKEY);
1215-
itemsz = ItemIdGetLength(itemid);
1216-
item = (IndexTuple) PageGetItem(origpage, itemid);
1217-
Assert(BTreeTupleGetNAtts(item, rel) == indnkeyatts);
1218-
if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
1219-
false, false) == InvalidOffsetNumber)
1220-
{
1221-
memset(rightpage, 0, BufferGetPageSize(rbuf));
1222-
elog(ERROR, "failed to add hikey to the right sibling"
1223-
" while splitting block %u of index \"%s\"",
1224-
origpagenumber, RelationGetRelationName(rel));
1225-
}
1226-
rightoff = OffsetNumberNext(rightoff);
1227-
}
1198+
PageSetLSN(leftpage, PageGetLSN(origpage));
1199+
isleaf = P_ISLEAF(oopaque);
12281200

12291201
/*
12301202
* The "high key" for the new left page will be the first key that's going
12311203
* to go into the new right page. This might be either the existing data
12321204
* item at position firstright, or the incoming tuple.
12331205
*/
1234-
leftoff = P_HIKEY;
12351206
if (!newitemonleft && newitemoff == firstright)
12361207
{
12371208
/* incoming tuple will become first on right page */
@@ -1265,22 +1236,89 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
12651236
else
12661237
lefthikey = item;
12671238

1239+
/*
1240+
* Add new high key to leftpage
1241+
*/
1242+
leftoff = P_HIKEY;
1243+
12681244
Assert(BTreeTupleGetNAtts(lefthikey, rel) == indnkeyatts);
12691245
if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff,
12701246
false, false) == InvalidOffsetNumber)
1271-
{
1272-
memset(rightpage, 0, BufferGetPageSize(rbuf));
12731247
elog(ERROR, "failed to add hikey to the left sibling"
12741248
" while splitting block %u of index \"%s\"",
12751249
origpagenumber, RelationGetRelationName(rel));
1276-
}
12771250
leftoff = OffsetNumberNext(leftoff);
12781251
/* be tidy */
12791252
if (lefthikey != item)
12801253
pfree(lefthikey);
12811254

12821255
/*
1283-
* Now transfer all the data items to the appropriate page.
1256+
* Acquire a new right page to split into, now that left page has a new
1257+
* high key. From here on, it's not okay to throw an error without
1258+
* zeroing rightpage first. This coding rule ensures that we won't
1259+
* confuse future VACUUM operations, which might otherwise try to re-find
1260+
* a downlink to a leftover junk page as the page undergoes deletion.
1261+
*
1262+
* It would be reasonable to start the critical section just after the new
1263+
* rightpage buffer is acquired instead; that would allow us to avoid
1264+
* leftover junk pages without bothering to zero rightpage. We do it this
1265+
* way because it avoids an unnecessary PANIC when either origpage or its
1266+
* existing sibling page are corrupt.
1267+
*/
1268+
rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
1269+
rightpage = BufferGetPage(rbuf);
1270+
rightpagenumber = BufferGetBlockNumber(rbuf);
1271+
/* rightpage was initialized by _bt_getbuf */
1272+
ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
1273+
1274+
/*
1275+
* Finish off remaining leftpage special area fields. They cannot be set
1276+
* before both origpage (leftpage) and rightpage buffers are acquired and
1277+
* locked.
1278+
*/
1279+
lopaque->btpo_next = rightpagenumber;
1280+
lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1281+
1282+
/*
1283+
* rightpage won't be the root when we're done. Also, clear the SPLIT_END
1284+
* and HAS_GARBAGE flags.
1285+
*/
1286+
ropaque->btpo_flags = oopaque->btpo_flags;
1287+
ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1288+
ropaque->btpo_prev = origpagenumber;
1289+
ropaque->btpo_next = oopaque->btpo_next;
1290+
ropaque->btpo.level = oopaque->btpo.level;
1291+
ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1292+
1293+
/*
1294+
* Add new high key to rightpage where necessary.
1295+
*
1296+
* If the page we're splitting is not the rightmost page at its level in
1297+
* the tree, then the first entry on the page is the high key from
1298+
* origpage.
1299+
*/
1300+
rightoff = P_HIKEY;
1301+
1302+
if (!P_RIGHTMOST(oopaque))
1303+
{
1304+
itemid = PageGetItemId(origpage, P_HIKEY);
1305+
itemsz = ItemIdGetLength(itemid);
1306+
item = (IndexTuple) PageGetItem(origpage, itemid);
1307+
Assert(BTreeTupleGetNAtts(item, rel) == indnkeyatts);
1308+
if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
1309+
false, false) == InvalidOffsetNumber)
1310+
{
1311+
memset(rightpage, 0, BufferGetPageSize(rbuf));
1312+
elog(ERROR, "failed to add hikey to the right sibling"
1313+
" while splitting block %u of index \"%s\"",
1314+
origpagenumber, RelationGetRelationName(rel));
1315+
}
1316+
rightoff = OffsetNumberNext(rightoff);
1317+
}
1318+
1319+
/*
1320+
* Now transfer all the data items (non-pivot tuples in isleaf case, or
1321+
* additional pivot tuples in !isleaf case) to the appropriate page.
12841322
*
12851323
* Note: we *must* insert at least the right page's items in item-number
12861324
* order, for the benefit of _bt_restore_page().
@@ -1298,6 +1336,7 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
12981336
{
12991337
if (newitemonleft)
13001338
{
1339+
Assert(newitemoff <= firstright);
13011340
if (!_bt_pgaddtup(leftpage, newitemsz, newitem, leftoff))
13021341
{
13031342
memset(rightpage, 0, BufferGetPageSize(rbuf));
@@ -1309,6 +1348,7 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
13091348
}
13101349
else
13111350
{
1351+
Assert(newitemoff >= firstright);
13121352
if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
13131353
{
13141354
memset(rightpage, 0, BufferGetPageSize(rbuf));
@@ -1371,7 +1411,6 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
13711411
* all readers release locks on a page before trying to fetch its
13721412
* neighbors.
13731413
*/
1374-
13751414
if (!P_RIGHTMOST(oopaque))
13761415
{
13771416
sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);

0 commit comments

Comments
 (0)