@@ -71,8 +71,7 @@ static void _bt_insertonpg(Relation rel, Buffer buf, Buffer cbuf,
71
71
OffsetNumber newitemoff ,
72
72
bool split_only_page );
73
73
static Buffer _bt_split (Relation rel , Buffer buf , Buffer cbuf ,
74
- OffsetNumber firstright , OffsetNumber newitemoff , Size newitemsz ,
75
- IndexTuple newitem , bool newitemonleft );
74
+ OffsetNumber newitemoff , Size newitemsz , IndexTuple newitem );
76
75
static void _bt_insert_parent (Relation rel , Buffer buf , Buffer rbuf ,
77
76
BTStack stack , bool is_root , bool is_only );
78
77
static OffsetNumber _bt_findsplitloc (Relation rel , Page page ,
@@ -842,7 +841,6 @@ _bt_insertonpg(Relation rel,
842
841
{
843
842
Page page ;
844
843
BTPageOpaque lpageop ;
845
- OffsetNumber firstright = InvalidOffsetNumber ;
846
844
Size itemsz ;
847
845
848
846
page = BufferGetPage (buf );
@@ -878,7 +876,6 @@ _bt_insertonpg(Relation rel,
878
876
{
879
877
bool is_root = P_ISROOT (lpageop );
880
878
bool is_only = P_LEFTMOST (lpageop ) && P_RIGHTMOST (lpageop );
881
- bool newitemonleft ;
882
879
Buffer rbuf ;
883
880
884
881
/*
@@ -899,14 +896,8 @@ _bt_insertonpg(Relation rel,
899
896
Assert (!(P_ISLEAF (lpageop ) &&
900
897
BlockNumberIsValid (RelationGetTargetBlock (rel ))));
901
898
902
- /* Choose the split point */
903
- firstright = _bt_findsplitloc (rel , page ,
904
- newitemoff , itemsz ,
905
- & newitemonleft );
906
-
907
899
/* split the buffer into left and right halves */
908
- rbuf = _bt_split (rel , buf , cbuf , firstright ,
909
- newitemoff , itemsz , itup , newitemonleft );
900
+ rbuf = _bt_split (rel , buf , cbuf , newitemoff , itemsz , itup );
910
901
PredicateLockPageSplit (rel ,
911
902
BufferGetBlockNumber (buf ),
912
903
BufferGetBlockNumber (rbuf ));
@@ -1106,9 +1097,8 @@ _bt_insertonpg(Relation rel,
1106
1097
* _bt_split() -- split a page in the btree.
1107
1098
*
1108
1099
* On entry, buf is the page to split, and is pinned and write-locked.
1109
- * firstright is the item index of the first item to be moved to the
1110
- * new right page. newitemoff etc. tell us about the new item that
1111
- * must be inserted along with the data from the old page.
1100
+ * newitemoff etc. tell us about the new item that must be inserted
1101
+ * along with the data from the original page.
1112
1102
*
1113
1103
* When splitting a non-leaf page, 'cbuf' is the left-sibling of the
1114
1104
* page we're inserting the downlink for. This function will clear the
@@ -1118,9 +1108,8 @@ _bt_insertonpg(Relation rel,
1118
1108
* The pin and lock on buf are maintained.
1119
1109
*/
1120
1110
static Buffer
1121
- _bt_split (Relation rel , Buffer buf , Buffer cbuf , OffsetNumber firstright ,
1122
- OffsetNumber newitemoff , Size newitemsz , IndexTuple newitem ,
1123
- bool newitemonleft )
1111
+ _bt_split (Relation rel , Buffer buf , Buffer cbuf , OffsetNumber newitemoff ,
1112
+ Size newitemsz , IndexTuple newitem )
1124
1113
{
1125
1114
Buffer rbuf ;
1126
1115
Page origpage ;
@@ -1139,99 +1128,81 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
1139
1128
IndexTuple item ;
1140
1129
OffsetNumber leftoff ,
1141
1130
rightoff ;
1131
+ OffsetNumber firstright ;
1142
1132
OffsetNumber maxoff ;
1143
1133
OffsetNumber i ;
1144
- bool isleaf ;
1134
+ bool newitemonleft ,
1135
+ isleaf ;
1145
1136
IndexTuple lefthikey ;
1146
1137
int indnatts = IndexRelationGetNumberOfAttributes (rel );
1147
1138
int indnkeyatts = IndexRelationGetNumberOfKeyAttributes (rel );
1148
1139
1149
- /* Acquire a new page to split into */
1150
- rbuf = _bt_getbuf (rel , P_NEW , BT_WRITE );
1151
-
1152
1140
/*
1153
1141
* origpage is the original page to be split. leftpage is a temporary
1154
1142
* buffer that receives the left-sibling data, which will be copied back
1155
- * into origpage on success. rightpage is the new page that receives the
1156
- * right-sibling data. If we fail before reaching the critical section,
1157
- * origpage hasn't been modified and leftpage is only workspace. In
1158
- * principle we shouldn't need to worry about rightpage either, because it
1159
- * hasn't been linked into the btree page structure; but to avoid leaving
1160
- * possibly-confusing junk behind, we are careful to rewrite rightpage as
1161
- * zeroes before throwing any error.
1143
+ * into origpage on success. rightpage is the new page that will receive
1144
+ * the right-sibling data.
1145
+ *
1146
+ * leftpage is allocated after choosing a split point. rightpage's new
1147
+ * buffer isn't acquired until after leftpage is initialized and has new
1148
+ * high key, the last point where splitting the page may fail (barring
1149
+ * corruption). Failing before acquiring new buffer won't have lasting
1150
+ * consequences, since origpage won't have been modified and leftpage is
1151
+ * only workspace.
1162
1152
*/
1163
1153
origpage = BufferGetPage (buf );
1164
- leftpage = PageGetTempPage (origpage );
1165
- rightpage = BufferGetPage (rbuf );
1166
-
1154
+ oopaque = (BTPageOpaque ) PageGetSpecialPointer (origpage );
1167
1155
origpagenumber = BufferGetBlockNumber (buf );
1168
- rightpagenumber = BufferGetBlockNumber (rbuf );
1169
-
1170
- _bt_pageinit (leftpage , BufferGetPageSize (buf ));
1171
- /* rightpage was already initialized by _bt_getbuf */
1172
1156
1173
1157
/*
1174
- * Copy the original page's LSN into leftpage, which will become the
1175
- * updated version of the page. We need this because XLogInsert will
1176
- * examine the LSN and possibly dump it in a page image.
1158
+ * Choose a point to split origpage at.
1159
+ *
1160
+ * A split point can be thought of as a point _between_ two existing
1161
+ * tuples on origpage (lastleft and firstright tuples), provided you
1162
+ * pretend that the new item that didn't fit is already on origpage.
1163
+ *
1164
+ * Since origpage does not actually contain newitem, the representation of
1165
+ * split points needs to work with two boundary cases: splits where
1166
+ * newitem is lastleft, and splits where newitem is firstright.
1167
+ * newitemonleft resolves the ambiguity that would otherwise exist when
1168
+ * newitemoff == firstright. In all other cases it's clear which side of
1169
+ * the split every tuple goes on from context. newitemonleft is usually
1170
+ * (but not always) redundant information.
1177
1171
*/
1178
- PageSetLSN (leftpage , PageGetLSN (origpage ));
1172
+ firstright = _bt_findsplitloc (rel , origpage , newitemoff , newitemsz ,
1173
+ & newitemonleft );
1179
1174
1180
- /* init btree private data */
1181
- oopaque = (BTPageOpaque ) PageGetSpecialPointer (origpage );
1175
+ /* Allocate temp buffer for leftpage */
1176
+ leftpage = PageGetTempPage (origpage );
1177
+ _bt_pageinit (leftpage , BufferGetPageSize (buf ));
1182
1178
lopaque = (BTPageOpaque ) PageGetSpecialPointer (leftpage );
1183
- ropaque = (BTPageOpaque ) PageGetSpecialPointer (rightpage );
1184
-
1185
- isleaf = P_ISLEAF (oopaque );
1186
1179
1187
- /* if we're splitting this page, it won't be the root when we're done */
1188
- /* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
1180
+ /*
1181
+ * leftpage won't be the root when we're done. Also, clear the SPLIT_END
1182
+ * and HAS_GARBAGE flags.
1183
+ */
1189
1184
lopaque -> btpo_flags = oopaque -> btpo_flags ;
1190
1185
lopaque -> btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE );
1191
- ropaque -> btpo_flags = lopaque -> btpo_flags ;
1192
- /* set flag in left page indicating that the right page has no downlink */
1186
+ /* set flag in leftpage indicating that rightpage has no downlink yet */
1193
1187
lopaque -> btpo_flags |= BTP_INCOMPLETE_SPLIT ;
1194
1188
lopaque -> btpo_prev = oopaque -> btpo_prev ;
1195
- lopaque -> btpo_next = rightpagenumber ;
1196
- ropaque -> btpo_prev = origpagenumber ;
1197
- ropaque -> btpo_next = oopaque -> btpo_next ;
1198
- lopaque -> btpo .level = ropaque -> btpo .level = oopaque -> btpo .level ;
1199
- /* Since we already have write-lock on both pages, ok to read cycleid */
1200
- lopaque -> btpo_cycleid = _bt_vacuum_cycleid (rel );
1201
- ropaque -> btpo_cycleid = lopaque -> btpo_cycleid ;
1189
+ /* handle btpo_next after rightpage buffer acquired */
1190
+ lopaque -> btpo .level = oopaque -> btpo .level ;
1191
+ /* handle btpo_cycleid after rightpage buffer acquired */
1202
1192
1203
1193
/*
1204
- * If the page we're splitting is not the rightmost page at its level in
1205
- * the tree, then the first entry on the page is the high key for the
1206
- * page. We need to copy that to the right half. Otherwise (meaning the
1207
- * rightmost page case), all the items on the right half will be user
1208
- * data.
1194
+ * Copy the original page's LSN into leftpage, which will become the
1195
+ * updated version of the page. We need this because XLogInsert will
1196
+ * examine the LSN and possibly dump it in a page image.
1209
1197
*/
1210
- rightoff = P_HIKEY ;
1211
-
1212
- if (!P_RIGHTMOST (oopaque ))
1213
- {
1214
- itemid = PageGetItemId (origpage , P_HIKEY );
1215
- itemsz = ItemIdGetLength (itemid );
1216
- item = (IndexTuple ) PageGetItem (origpage , itemid );
1217
- Assert (BTreeTupleGetNAtts (item , rel ) == indnkeyatts );
1218
- if (PageAddItem (rightpage , (Item ) item , itemsz , rightoff ,
1219
- false, false) == InvalidOffsetNumber )
1220
- {
1221
- memset (rightpage , 0 , BufferGetPageSize (rbuf ));
1222
- elog (ERROR , "failed to add hikey to the right sibling"
1223
- " while splitting block %u of index \"%s\"" ,
1224
- origpagenumber , RelationGetRelationName (rel ));
1225
- }
1226
- rightoff = OffsetNumberNext (rightoff );
1227
- }
1198
+ PageSetLSN (leftpage , PageGetLSN (origpage ));
1199
+ isleaf = P_ISLEAF (oopaque );
1228
1200
1229
1201
/*
1230
1202
* The "high key" for the new left page will be the first key that's going
1231
1203
* to go into the new right page. This might be either the existing data
1232
1204
* item at position firstright, or the incoming tuple.
1233
1205
*/
1234
- leftoff = P_HIKEY ;
1235
1206
if (!newitemonleft && newitemoff == firstright )
1236
1207
{
1237
1208
/* incoming tuple will become first on right page */
@@ -1265,22 +1236,89 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
1265
1236
else
1266
1237
lefthikey = item ;
1267
1238
1239
+ /*
1240
+ * Add new high key to leftpage
1241
+ */
1242
+ leftoff = P_HIKEY ;
1243
+
1268
1244
Assert (BTreeTupleGetNAtts (lefthikey , rel ) == indnkeyatts );
1269
1245
if (PageAddItem (leftpage , (Item ) lefthikey , itemsz , leftoff ,
1270
1246
false, false) == InvalidOffsetNumber )
1271
- {
1272
- memset (rightpage , 0 , BufferGetPageSize (rbuf ));
1273
1247
elog (ERROR , "failed to add hikey to the left sibling"
1274
1248
" while splitting block %u of index \"%s\"" ,
1275
1249
origpagenumber , RelationGetRelationName (rel ));
1276
- }
1277
1250
leftoff = OffsetNumberNext (leftoff );
1278
1251
/* be tidy */
1279
1252
if (lefthikey != item )
1280
1253
pfree (lefthikey );
1281
1254
1282
1255
/*
1283
- * Now transfer all the data items to the appropriate page.
1256
+ * Acquire a new right page to split into, now that left page has a new
1257
+ * high key. From here on, it's not okay to throw an error without
1258
+ * zeroing rightpage first. This coding rule ensures that we won't
1259
+ * confuse future VACUUM operations, which might otherwise try to re-find
1260
+ * a downlink to a leftover junk page as the page undergoes deletion.
1261
+ *
1262
+ * It would be reasonable to start the critical section just after the new
1263
+ * rightpage buffer is acquired instead; that would allow us to avoid
1264
+ * leftover junk pages without bothering to zero rightpage. We do it this
1265
+ * way because it avoids an unnecessary PANIC when either origpage or its
1266
+ * existing sibling page are corrupt.
1267
+ */
1268
+ rbuf = _bt_getbuf (rel , P_NEW , BT_WRITE );
1269
+ rightpage = BufferGetPage (rbuf );
1270
+ rightpagenumber = BufferGetBlockNumber (rbuf );
1271
+ /* rightpage was initialized by _bt_getbuf */
1272
+ ropaque = (BTPageOpaque ) PageGetSpecialPointer (rightpage );
1273
+
1274
+ /*
1275
+ * Finish off remaining leftpage special area fields. They cannot be set
1276
+ * before both origpage (leftpage) and rightpage buffers are acquired and
1277
+ * locked.
1278
+ */
1279
+ lopaque -> btpo_next = rightpagenumber ;
1280
+ lopaque -> btpo_cycleid = _bt_vacuum_cycleid (rel );
1281
+
1282
+ /*
1283
+ * rightpage won't be the root when we're done. Also, clear the SPLIT_END
1284
+ * and HAS_GARBAGE flags.
1285
+ */
1286
+ ropaque -> btpo_flags = oopaque -> btpo_flags ;
1287
+ ropaque -> btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE );
1288
+ ropaque -> btpo_prev = origpagenumber ;
1289
+ ropaque -> btpo_next = oopaque -> btpo_next ;
1290
+ ropaque -> btpo .level = oopaque -> btpo .level ;
1291
+ ropaque -> btpo_cycleid = lopaque -> btpo_cycleid ;
1292
+
1293
+ /*
1294
+ * Add new high key to rightpage where necessary.
1295
+ *
1296
+ * If the page we're splitting is not the rightmost page at its level in
1297
+ * the tree, then the first entry on the page is the high key from
1298
+ * origpage.
1299
+ */
1300
+ rightoff = P_HIKEY ;
1301
+
1302
+ if (!P_RIGHTMOST (oopaque ))
1303
+ {
1304
+ itemid = PageGetItemId (origpage , P_HIKEY );
1305
+ itemsz = ItemIdGetLength (itemid );
1306
+ item = (IndexTuple ) PageGetItem (origpage , itemid );
1307
+ Assert (BTreeTupleGetNAtts (item , rel ) == indnkeyatts );
1308
+ if (PageAddItem (rightpage , (Item ) item , itemsz , rightoff ,
1309
+ false, false) == InvalidOffsetNumber )
1310
+ {
1311
+ memset (rightpage , 0 , BufferGetPageSize (rbuf ));
1312
+ elog (ERROR , "failed to add hikey to the right sibling"
1313
+ " while splitting block %u of index \"%s\"" ,
1314
+ origpagenumber , RelationGetRelationName (rel ));
1315
+ }
1316
+ rightoff = OffsetNumberNext (rightoff );
1317
+ }
1318
+
1319
+ /*
1320
+ * Now transfer all the data items (non-pivot tuples in isleaf case, or
1321
+ * additional pivot tuples in !isleaf case) to the appropriate page.
1284
1322
*
1285
1323
* Note: we *must* insert at least the right page's items in item-number
1286
1324
* order, for the benefit of _bt_restore_page().
@@ -1298,6 +1336,7 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
1298
1336
{
1299
1337
if (newitemonleft )
1300
1338
{
1339
+ Assert (newitemoff <= firstright );
1301
1340
if (!_bt_pgaddtup (leftpage , newitemsz , newitem , leftoff ))
1302
1341
{
1303
1342
memset (rightpage , 0 , BufferGetPageSize (rbuf ));
@@ -1309,6 +1348,7 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
1309
1348
}
1310
1349
else
1311
1350
{
1351
+ Assert (newitemoff >= firstright );
1312
1352
if (!_bt_pgaddtup (rightpage , newitemsz , newitem , rightoff ))
1313
1353
{
1314
1354
memset (rightpage , 0 , BufferGetPageSize (rbuf ));
@@ -1371,7 +1411,6 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
1371
1411
* all readers release locks on a page before trying to fetch its
1372
1412
* neighbors.
1373
1413
*/
1374
-
1375
1414
if (!P_RIGHTMOST (oopaque ))
1376
1415
{
1377
1416
sbuf = _bt_getbuf (rel , oopaque -> btpo_next , BT_WRITE );
0 commit comments