Skip to content

Commit 8fa30f9

Browse files
committed
Reduce PANIC to ERROR in some occasionally-reported btree failure cases.
This patch changes _bt_split() and _bt_pagedel() to throw a plain ERROR, rather than PANIC, for several cases that are reported from the field from time to time: * right sibling's left-link doesn't match; * PageAddItem failure during _bt_split(); * parent page's next child isn't right sibling during _bt_pagedel(). In addition the error messages for these cases have been made a bit more verbose, with additional values included. The original motivation for PANIC here was to capture core dumps for subsequent analysis. But with so many users whose platforms don't capture core dumps by default, or who are unprepared to analyze them anyway, it's hard to justify a forced database restart when we can fairly easily detect the problems before we've reached the critical sections where PANIC would be necessary. It is not currently known whether the reports of these messages indicate well-hidden bugs in Postgres, or are a result of storage-level malfeasance; the latter possibility suggests that we ought to try to be more robust even if there is a bug here that's ultimately found. Backpatch to 8.2. The code before that is sufficiently different that it doesn't seem worth the trouble to back-port further.
1 parent a9a999b commit 8fa30f9

File tree

2 files changed

+135
-52
lines changed

2 files changed

+135
-52
lines changed

src/backend/access/nbtree/nbtinsert.c

Lines changed: 93 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.178 2010/03/28 09:27:01 sriggs Exp $
11+
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.179 2010/08/29 19:33:14 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -74,9 +74,8 @@ static OffsetNumber _bt_findsplitloc(Relation rel, Page page,
7474
static void _bt_checksplitloc(FindSplitData *state,
7575
OffsetNumber firstoldonright, bool newitemonleft,
7676
int dataitemstoleft, Size firstoldonrightsz);
77-
static void _bt_pgaddtup(Relation rel, Page page,
78-
Size itemsize, IndexTuple itup,
79-
OffsetNumber itup_off, const char *where);
77+
static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
78+
OffsetNumber itup_off);
8079
static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
8180
int keysz, ScanKey scankey);
8281
static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
@@ -753,7 +752,9 @@ _bt_insertonpg(Relation rel,
753752
/* Do the update. No ereport(ERROR) until changes are logged */
754753
START_CRIT_SECTION();
755754

756-
_bt_pgaddtup(rel, page, itemsz, itup, newitemoff, "page");
755+
if (!_bt_pgaddtup(page, itemsz, itup, newitemoff))
756+
elog(PANIC, "failed to add new item to block %u in index \"%s\"",
757+
itup_blkno, RelationGetRelationName(rel));
757758

758759
MarkBufferDirty(buf);
759760

@@ -879,6 +880,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
879880
Page origpage;
880881
Page leftpage,
881882
rightpage;
883+
BlockNumber origpagenumber,
884+
rightpagenumber;
882885
BTPageOpaque ropaque,
883886
lopaque,
884887
oopaque;
@@ -894,11 +897,27 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
894897
OffsetNumber i;
895898
bool isroot;
896899

900+
/* Acquire a new page to split into */
897901
rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
902+
903+
/*
904+
* origpage is the original page to be split. leftpage is a temporary
905+
* buffer that receives the left-sibling data, which will be copied back
906+
* into origpage on success. rightpage is the new page that receives
907+
* the right-sibling data. If we fail before reaching the critical
908+
* section, origpage hasn't been modified and leftpage is only workspace.
909+
* In principle we shouldn't need to worry about rightpage either,
910+
* because it hasn't been linked into the btree page structure; but to
911+
* avoid leaving possibly-confusing junk behind, we are careful to rewrite
912+
* rightpage as zeroes before throwing any error.
913+
*/
898914
origpage = BufferGetPage(buf);
899915
leftpage = PageGetTempPage(origpage);
900916
rightpage = BufferGetPage(rbuf);
901917

918+
origpagenumber = BufferGetBlockNumber(buf);
919+
rightpagenumber = BufferGetBlockNumber(rbuf);
920+
902921
_bt_pageinit(leftpage, BufferGetPageSize(buf));
903922
/* rightpage was already initialized by _bt_getbuf */
904923

@@ -923,8 +942,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
923942
lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
924943
ropaque->btpo_flags = lopaque->btpo_flags;
925944
lopaque->btpo_prev = oopaque->btpo_prev;
926-
lopaque->btpo_next = BufferGetBlockNumber(rbuf);
927-
ropaque->btpo_prev = BufferGetBlockNumber(buf);
945+
lopaque->btpo_next = rightpagenumber;
946+
ropaque->btpo_prev = origpagenumber;
928947
ropaque->btpo_next = oopaque->btpo_next;
929948
lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level;
930949
/* Since we already have write-lock on both pages, ok to read cycleid */
@@ -947,9 +966,12 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
947966
item = (IndexTuple) PageGetItem(origpage, itemid);
948967
if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
949968
false, false) == InvalidOffsetNumber)
950-
elog(PANIC, "failed to add hikey to the right sibling"
969+
{
970+
memset(rightpage, 0, BufferGetPageSize(rbuf));
971+
elog(ERROR, "failed to add hikey to the right sibling"
951972
" while splitting block %u of index \"%s\"",
952-
BufferGetBlockNumber(buf), RelationGetRelationName(rel));
973+
origpagenumber, RelationGetRelationName(rel));
974+
}
953975
rightoff = OffsetNumberNext(rightoff);
954976
}
955977

@@ -974,9 +996,12 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
974996
}
975997
if (PageAddItem(leftpage, (Item) item, itemsz, leftoff,
976998
false, false) == InvalidOffsetNumber)
977-
elog(PANIC, "failed to add hikey to the left sibling"
999+
{
1000+
memset(rightpage, 0, BufferGetPageSize(rbuf));
1001+
elog(ERROR, "failed to add hikey to the left sibling"
9781002
" while splitting block %u of index \"%s\"",
979-
BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1003+
origpagenumber, RelationGetRelationName(rel));
1004+
}
9801005
leftoff = OffsetNumberNext(leftoff);
9811006

9821007
/*
@@ -998,29 +1023,49 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
9981023
{
9991024
if (newitemonleft)
10001025
{
1001-
_bt_pgaddtup(rel, leftpage, newitemsz, newitem, leftoff,
1002-
"left sibling");
1026+
if (!_bt_pgaddtup(leftpage, newitemsz, newitem, leftoff))
1027+
{
1028+
memset(rightpage, 0, BufferGetPageSize(rbuf));
1029+
elog(ERROR, "failed to add new item to the left sibling"
1030+
" while splitting block %u of index \"%s\"",
1031+
origpagenumber, RelationGetRelationName(rel));
1032+
}
10031033
leftoff = OffsetNumberNext(leftoff);
10041034
}
10051035
else
10061036
{
1007-
_bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff,
1008-
"right sibling");
1037+
if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
1038+
{
1039+
memset(rightpage, 0, BufferGetPageSize(rbuf));
1040+
elog(ERROR, "failed to add new item to the right sibling"
1041+
" while splitting block %u of index \"%s\"",
1042+
origpagenumber, RelationGetRelationName(rel));
1043+
}
10091044
rightoff = OffsetNumberNext(rightoff);
10101045
}
10111046
}
10121047

10131048
/* decide which page to put it on */
10141049
if (i < firstright)
10151050
{
1016-
_bt_pgaddtup(rel, leftpage, itemsz, item, leftoff,
1017-
"left sibling");
1051+
if (!_bt_pgaddtup(leftpage, itemsz, item, leftoff))
1052+
{
1053+
memset(rightpage, 0, BufferGetPageSize(rbuf));
1054+
elog(ERROR, "failed to add old item to the left sibling"
1055+
" while splitting block %u of index \"%s\"",
1056+
origpagenumber, RelationGetRelationName(rel));
1057+
}
10181058
leftoff = OffsetNumberNext(leftoff);
10191059
}
10201060
else
10211061
{
1022-
_bt_pgaddtup(rel, rightpage, itemsz, item, rightoff,
1023-
"right sibling");
1062+
if (!_bt_pgaddtup(rightpage, itemsz, item, rightoff))
1063+
{
1064+
memset(rightpage, 0, BufferGetPageSize(rbuf));
1065+
elog(ERROR, "failed to add old item to the right sibling"
1066+
" while splitting block %u of index \"%s\"",
1067+
origpagenumber, RelationGetRelationName(rel));
1068+
}
10241069
rightoff = OffsetNumberNext(rightoff);
10251070
}
10261071
}
@@ -1034,8 +1079,13 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
10341079
* not be splitting the page).
10351080
*/
10361081
Assert(!newitemonleft);
1037-
_bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff,
1038-
"right sibling");
1082+
if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
1083+
{
1084+
memset(rightpage, 0, BufferGetPageSize(rbuf));
1085+
elog(ERROR, "failed to add new item to the right sibling"
1086+
" while splitting block %u of index \"%s\"",
1087+
origpagenumber, RelationGetRelationName(rel));
1088+
}
10391089
rightoff = OffsetNumberNext(rightoff);
10401090
}
10411091

@@ -1047,16 +1097,19 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
10471097
* neighbors.
10481098
*/
10491099

1050-
if (!P_RIGHTMOST(ropaque))
1100+
if (!P_RIGHTMOST(oopaque))
10511101
{
1052-
sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE);
1102+
sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
10531103
spage = BufferGetPage(sbuf);
10541104
sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
1055-
if (sopaque->btpo_prev != ropaque->btpo_prev)
1056-
elog(PANIC, "right sibling's left-link doesn't match: "
1057-
"block %u links to %u instead of expected %u in index \"%s\"",
1058-
ropaque->btpo_next, sopaque->btpo_prev, ropaque->btpo_prev,
1105+
if (sopaque->btpo_prev != origpagenumber)
1106+
{
1107+
memset(rightpage, 0, BufferGetPageSize(rbuf));
1108+
elog(ERROR, "right sibling's left-link doesn't match: "
1109+
"block %u links to %u instead of expected %u in index \"%s\"",
1110+
oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
10591111
RelationGetRelationName(rel));
1112+
}
10601113

10611114
/*
10621115
* Check to see if we can set the SPLIT_END flag in the right-hand
@@ -1081,8 +1134,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
10811134
*
10821135
* NO EREPORT(ERROR) till right sibling is updated. We can get away with
10831136
* not starting the critical section till here because we haven't been
1084-
* scribbling on the original page yet, and we don't care about the new
1085-
* sibling until it's linked into the btree.
1137+
* scribbling on the original page yet; see comments above.
10861138
*/
10871139
START_CRIT_SECTION();
10881140

@@ -1094,19 +1146,21 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
10941146
* (in the page management code) that the center of a page always be
10951147
* clean, and the most efficient way to guarantee this is just to compact
10961148
* the data by reinserting it into a new left page. (XXX the latter
1097-
* comment is probably obsolete.)
1149+
* comment is probably obsolete; but in any case it's good to not scribble
1150+
* on the original page until we enter the critical section.)
10981151
*
10991152
* We need to do this before writing the WAL record, so that XLogInsert
11001153
* can WAL log an image of the page if necessary.
11011154
*/
11021155
PageRestoreTempPage(leftpage, origpage);
1156+
/* leftpage, lopaque must not be used below here */
11031157

11041158
MarkBufferDirty(buf);
11051159
MarkBufferDirty(rbuf);
11061160

11071161
if (!P_RIGHTMOST(ropaque))
11081162
{
1109-
sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
1163+
sopaque->btpo_prev = rightpagenumber;
11101164
MarkBufferDirty(sbuf);
11111165
}
11121166

@@ -1120,8 +1174,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
11201174
XLogRecData *lastrdata;
11211175

11221176
xlrec.node = rel->rd_node;
1123-
xlrec.leftsib = BufferGetBlockNumber(buf);
1124-
xlrec.rightsib = BufferGetBlockNumber(rbuf);
1177+
xlrec.leftsib = origpagenumber;
1178+
xlrec.rightsib = rightpagenumber;
11251179
xlrec.rnext = ropaque->btpo_next;
11261180
xlrec.level = ropaque->btpo.level;
11271181
xlrec.firstright = firstright;
@@ -1920,13 +1974,11 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
19201974
* we insert the tuples in order, so that the given itup_off does
19211975
* represent the final position of the tuple!
19221976
*/
1923-
static void
1924-
_bt_pgaddtup(Relation rel,
1925-
Page page,
1977+
static bool
1978+
_bt_pgaddtup(Page page,
19261979
Size itemsize,
19271980
IndexTuple itup,
1928-
OffsetNumber itup_off,
1929-
const char *where)
1981+
OffsetNumber itup_off)
19301982
{
19311983
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
19321984
IndexTupleData trunctuple;
@@ -1941,8 +1993,9 @@ _bt_pgaddtup(Relation rel,
19411993

19421994
if (PageAddItem(page, (Item) itup, itemsize, itup_off,
19431995
false, false) == InvalidOffsetNumber)
1944-
elog(PANIC, "failed to add item to the %s in index \"%s\"",
1945-
where, RelationGetRelationName(rel));
1996+
return false;
1997+
1998+
return true;
19461999
}
19472000

19482001
/*

src/backend/access/nbtree/nbtpage.c

Lines changed: 42 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
*
1010
*
1111
* IDENTIFICATION
12-
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.123 2010/07/06 19:18:55 momjian Exp $
12+
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.124 2010/08/29 19:33:14 tgl Exp $
1313
*
1414
* NOTES
1515
* Postgres btree pages look like ordinary relation pages. The opaque
@@ -1175,6 +1175,13 @@ _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
11751175
*/
11761176
rightsib = opaque->btpo_next;
11771177
rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
1178+
page = BufferGetPage(rbuf);
1179+
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1180+
if (opaque->btpo_prev != target)
1181+
elog(ERROR, "right sibling's left-link doesn't match: "
1182+
"block %u links to %u instead of expected %u in index \"%s\"",
1183+
rightsib, opaque->btpo_prev, target,
1184+
RelationGetRelationName(rel));
11781185

11791186
/*
11801187
* Next find and write-lock the current parent of the target page. This is
@@ -1252,6 +1259,38 @@ _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
12521259
}
12531260
}
12541261

1262+
/*
1263+
* Check that the parent-page index items we're about to delete/overwrite
1264+
* contain what we expect. This can fail if the index has become
1265+
* corrupt for some reason. We want to throw any error before entering
1266+
* the critical section --- otherwise it'd be a PANIC.
1267+
*
1268+
* The test on the target item is just an Assert because _bt_getstackbuf
1269+
* should have guaranteed it has the expected contents. The test on the
1270+
* next-child downlink is known to sometimes fail in the field, though.
1271+
*/
1272+
page = BufferGetPage(pbuf);
1273+
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1274+
1275+
#ifdef USE_ASSERT_CHECKING
1276+
itemid = PageGetItemId(page, poffset);
1277+
itup = (IndexTuple) PageGetItem(page, itemid);
1278+
Assert(ItemPointerGetBlockNumber(&(itup->t_tid)) == target);
1279+
#endif
1280+
1281+
if (!parent_half_dead)
1282+
{
1283+
OffsetNumber nextoffset;
1284+
1285+
nextoffset = OffsetNumberNext(poffset);
1286+
itemid = PageGetItemId(page, nextoffset);
1287+
itup = (IndexTuple) PageGetItem(page, itemid);
1288+
if (ItemPointerGetBlockNumber(&(itup->t_tid)) != rightsib)
1289+
elog(ERROR, "right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
1290+
rightsib, target, ItemPointerGetBlockNumber(&(itup->t_tid)),
1291+
parent, RelationGetRelationName(rel));
1292+
}
1293+
12551294
/*
12561295
* Here we begin doing the deletion.
12571296
*/
@@ -1265,8 +1304,6 @@ _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
12651304
* to copy the right sibling's downlink over the target downlink, and then
12661305
* delete the following item.
12671306
*/
1268-
page = BufferGetPage(pbuf);
1269-
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
12701307
if (parent_half_dead)
12711308
{
12721309
PageIndexTupleDelete(page, poffset);
@@ -1278,23 +1315,16 @@ _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
12781315

12791316
itemid = PageGetItemId(page, poffset);
12801317
itup = (IndexTuple) PageGetItem(page, itemid);
1281-
Assert(ItemPointerGetBlockNumber(&(itup->t_tid)) == target);
12821318
ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);
12831319

12841320
nextoffset = OffsetNumberNext(poffset);
1285-
/* This part is just for double-checking */
1286-
itemid = PageGetItemId(page, nextoffset);
1287-
itup = (IndexTuple) PageGetItem(page, itemid);
1288-
if (ItemPointerGetBlockNumber(&(itup->t_tid)) != rightsib)
1289-
elog(PANIC, "right sibling %u of block %u is not next child of %u in index \"%s\"",
1290-
rightsib, target, BufferGetBlockNumber(pbuf),
1291-
RelationGetRelationName(rel));
12921321
PageIndexTupleDelete(page, nextoffset);
12931322
}
12941323

12951324
/*
12961325
* Update siblings' side-links. Note the target page's side-links will
1297-
* continue to point to the siblings.
1326+
* continue to point to the siblings. Asserts here are just rechecking
1327+
* things we already verified above.
12981328
*/
12991329
if (BufferIsValid(lbuf))
13001330
{

0 commit comments

Comments
 (0)