34
34
#include "utils/snapmgr.h"
35
35
36
36
static BTMetaPageData * _bt_getmeta (Relation rel , Buffer metabuf );
37
- static bool _bt_mark_page_halfdead (Relation rel , Buffer buf , BTStack stack );
37
+ static bool _bt_mark_page_halfdead (Relation rel , Buffer leafbuf ,
38
+ BTStack stack );
38
39
static bool _bt_unlink_halfdead_page (Relation rel , Buffer leafbuf ,
39
- bool * rightsib_empty );
40
+ bool * rightsib_empty ,
41
+ TransactionId * oldestBtpoXact );
40
42
static bool _bt_lock_branch_parent (Relation rel , BlockNumber child ,
41
43
BTStack stack , Buffer * topparent , OffsetNumber * topoff ,
42
44
BlockNumber * target , BlockNumber * rightsib );
@@ -1282,27 +1284,35 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
1282
1284
}
1283
1285
1284
1286
/*
1285
- * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so.
1287
+ * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so.
1286
1288
*
1287
- * This action unlinks the page from the b-tree structure, removing all
1289
+ * This action unlinks the leaf page from the b-tree structure, removing all
1288
1290
* pointers leading to it --- but not touching its own left and right links.
1289
1291
* The page cannot be physically reclaimed right away, since other processes
1290
1292
* may currently be trying to follow links leading to the page; they have to
1291
1293
* be allowed to use its right-link to recover. See nbtree/README.
1292
1294
*
1293
1295
* On entry, the target buffer must be pinned and locked (either read or write
1294
- * lock is OK). This lock and pin will be dropped before exiting.
1296
+ * lock is OK). The page must be an empty leaf page, which may be half-dead
1297
+ * already (a half-dead page should only be passed to us when an earlier
1298
+ * VACUUM operation was interrupted, though). Note in particular that caller
1299
+ * should never pass a buffer containing an existing deleted page here. The
1300
+ * lock and pin on caller's buffer will be dropped before we return.
1295
1301
*
1296
1302
* Returns the number of pages successfully deleted (zero if page cannot
1297
- * be deleted now; could be more than one if parent or sibling pages were
1298
- * deleted too).
1303
+ * be deleted now; could be more than one if parent or right sibling pages
1304
+ * were deleted too).
1305
+ *
1306
+ * Maintains *oldestBtpoXact for any pages that get deleted. Caller is
1307
+ * responsible for maintaining *oldestBtpoXact in the case of pages that were
1308
+ * deleted by a previous VACUUM.
1299
1309
*
1300
1310
* NOTE: this leaks memory. Rather than trying to clean up everything
1301
1311
* carefully, it's better to run it in a temp context that can be reset
1302
1312
* frequently.
1303
1313
*/
1304
1314
int
1305
- _bt_pagedel (Relation rel , Buffer buf )
1315
+ _bt_pagedel (Relation rel , Buffer leafbuf , TransactionId * oldestBtpoXact )
1306
1316
{
1307
1317
int ndeleted = 0 ;
1308
1318
BlockNumber rightsib ;
@@ -1323,14 +1333,21 @@ _bt_pagedel(Relation rel, Buffer buf)
1323
1333
1324
1334
for (;;)
1325
1335
{
1326
- page = BufferGetPage (buf );
1336
+ page = BufferGetPage (leafbuf );
1327
1337
opaque = (BTPageOpaque ) PageGetSpecialPointer (page );
1328
1338
1329
1339
/*
1330
1340
* Internal pages are never deleted directly, only as part of deleting
1331
1341
* the whole branch all the way down to leaf level.
1342
+ *
1343
+ * Also check for deleted pages here. Caller never passes us a fully
1344
+ * deleted page. Only VACUUM can delete pages, so there can't have
1345
+ * been a concurrent deletion. Assume that we reached any deleted
1346
+ * page encountered here by following a sibling link, and that the
1347
+ * index is corrupt.
1332
1348
*/
1333
- if (!P_ISLEAF (opaque ))
1349
+ Assert (!P_ISDELETED (opaque ));
1350
+ if (!P_ISLEAF (opaque ) || P_ISDELETED (opaque ))
1334
1351
{
1335
1352
/*
1336
1353
* Pre-9.4 page deletion only marked internal pages as half-dead,
@@ -1349,13 +1366,22 @@ _bt_pagedel(Relation rel, Buffer buf)
1349
1366
errmsg ("index \"%s\" contains a half-dead internal page" ,
1350
1367
RelationGetRelationName (rel )),
1351
1368
errhint ("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it." )));
1352
- _bt_relbuf (rel , buf );
1369
+
1370
+ if (P_ISDELETED (opaque ))
1371
+ ereport (LOG ,
1372
+ (errcode (ERRCODE_INDEX_CORRUPTED ),
1373
+ errmsg_internal ("found deleted block %u while following right link in index \"%s\"" ,
1374
+ BufferGetBlockNumber (leafbuf ),
1375
+ RelationGetRelationName (rel ))));
1376
+
1377
+ _bt_relbuf (rel , leafbuf );
1353
1378
return ndeleted ;
1354
1379
}
1355
1380
1356
1381
/*
1357
1382
* We can never delete rightmost pages nor root pages. While at it,
1358
- * check that page is not already deleted and is empty.
1383
+ * check that page is empty, since it's possible that the leafbuf page
1384
+ * was empty a moment ago, but has since had some inserts.
1359
1385
*
1360
1386
* To keep the algorithm simple, we also never delete an incompletely
1361
1387
* split page (they should be rare enough that this doesn't make any
@@ -1370,14 +1396,14 @@ _bt_pagedel(Relation rel, Buffer buf)
1370
1396
* to. On subsequent iterations, we know we stepped right from a page
1371
1397
* that passed these tests, so it's OK.
1372
1398
*/
1373
- if (P_RIGHTMOST (opaque ) || P_ISROOT (opaque ) || P_ISDELETED ( opaque ) ||
1399
+ if (P_RIGHTMOST (opaque ) || P_ISROOT (opaque ) ||
1374
1400
P_FIRSTDATAKEY (opaque ) <= PageGetMaxOffsetNumber (page ) ||
1375
1401
P_INCOMPLETE_SPLIT (opaque ))
1376
1402
{
1377
1403
/* Should never fail to delete a half-dead page */
1378
1404
Assert (!P_ISHALFDEAD (opaque ));
1379
1405
1380
- _bt_relbuf (rel , buf );
1406
+ _bt_relbuf (rel , leafbuf );
1381
1407
return ndeleted ;
1382
1408
}
1383
1409
@@ -1415,7 +1441,7 @@ _bt_pagedel(Relation rel, Buffer buf)
1415
1441
* To avoid deadlocks, we'd better drop the leaf page lock
1416
1442
* before going further.
1417
1443
*/
1418
- LockBuffer (buf , BUFFER_LOCK_UNLOCK );
1444
+ LockBuffer (leafbuf , BUFFER_LOCK_UNLOCK );
1419
1445
1420
1446
/*
1421
1447
* Fetch the left sibling, to check that it's not marked with
@@ -1439,10 +1465,10 @@ _bt_pagedel(Relation rel, Buffer buf)
1439
1465
* incompletely-split page to be split again. So we don't
1440
1466
* need to walk right here.
1441
1467
*/
1442
- if (lopaque -> btpo_next == BufferGetBlockNumber (buf ) &&
1468
+ if (lopaque -> btpo_next == BufferGetBlockNumber (leafbuf ) &&
1443
1469
P_INCOMPLETE_SPLIT (lopaque ))
1444
1470
{
1445
- ReleaseBuffer (buf );
1471
+ ReleaseBuffer (leafbuf );
1446
1472
_bt_relbuf (rel , lbuf );
1447
1473
return ndeleted ;
1448
1474
}
@@ -1458,40 +1484,59 @@ _bt_pagedel(Relation rel, Buffer buf)
1458
1484
_bt_relbuf (rel , lbuf );
1459
1485
1460
1486
/*
1461
- * Re-lock the leaf page, and start over, to re-check that the
1462
- * page can still be deleted.
1487
+ * Re-lock the leaf page, and start over to use our stack
1488
+ * within _bt_mark_page_halfdead. We must do it that way
1489
+ * because it's possible that leafbuf can no longer be
1490
+ * deleted. We need to recheck.
1463
1491
*/
1464
- LockBuffer (buf , BT_WRITE );
1492
+ LockBuffer (leafbuf , BT_WRITE );
1465
1493
continue ;
1466
1494
}
1467
1495
1468
- if (!_bt_mark_page_halfdead (rel , buf , stack ))
1496
+ /*
1497
+ * See if it's safe to delete the leaf page, and determine how
1498
+ * many parent/internal pages above the leaf level will be
1499
+ * deleted. If it's safe then _bt_mark_page_halfdead will also
1500
+ * perform the first phase of deletion, which includes marking the
1501
+ * leafbuf page half-dead.
1502
+ */
1503
+ Assert (P_ISLEAF (opaque ) && !P_IGNORE (opaque ));
1504
+ if (!_bt_mark_page_halfdead (rel , leafbuf , stack ))
1469
1505
{
1470
- _bt_relbuf (rel , buf );
1506
+ _bt_relbuf (rel , leafbuf );
1471
1507
return ndeleted ;
1472
1508
}
1473
1509
}
1474
1510
1475
1511
/*
1476
1512
* Then unlink it from its siblings. Each call to
1477
1513
* _bt_unlink_halfdead_page unlinks the topmost page from the branch,
1478
- * making it shallower. Iterate until the leaf page is gone.
1514
+ * making it shallower. Iterate until the leafbuf page is deleted.
1515
+ *
1516
+ * _bt_unlink_halfdead_page should never fail, since we established
1517
+ * that deletion is generally safe in _bt_mark_page_halfdead.
1479
1518
*/
1480
1519
rightsib_empty = false;
1520
+ Assert (P_ISLEAF (opaque ) && P_ISHALFDEAD (opaque ));
1481
1521
while (P_ISHALFDEAD (opaque ))
1482
1522
{
1483
- /* will check for interrupts, once lock is released */
1484
- if (!_bt_unlink_halfdead_page (rel , buf , & rightsib_empty ))
1523
+ /* Check for interrupts in _bt_unlink_halfdead_page */
1524
+ if (!_bt_unlink_halfdead_page (rel , leafbuf , & rightsib_empty ,
1525
+ oldestBtpoXact ))
1485
1526
{
1486
- /* _bt_unlink_halfdead_page already released buffer */
1527
+ /* _bt_unlink_halfdead_page failed, released buffer */
1487
1528
return ndeleted ;
1488
1529
}
1489
1530
ndeleted ++ ;
1490
1531
}
1491
1532
1533
+ Assert (P_ISLEAF (opaque ) && P_ISDELETED (opaque ));
1534
+ Assert (TransactionIdFollowsOrEquals (opaque -> btpo .xact ,
1535
+ * oldestBtpoXact ));
1536
+
1492
1537
rightsib = opaque -> btpo_next ;
1493
1538
1494
- _bt_relbuf (rel , buf );
1539
+ _bt_relbuf (rel , leafbuf );
1495
1540
1496
1541
/*
1497
1542
* Check here, as calling loops will have locks held, preventing
@@ -1511,7 +1556,7 @@ _bt_pagedel(Relation rel, Buffer buf)
1511
1556
if (!rightsib_empty )
1512
1557
break ;
1513
1558
1514
- buf = _bt_getbuf (rel , rightsib , BT_WRITE );
1559
+ leafbuf = _bt_getbuf (rel , rightsib , BT_WRITE );
1515
1560
}
1516
1561
1517
1562
return ndeleted ;
@@ -1714,17 +1759,28 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
1714
1759
* of the whole branch, including the leaf page itself, iterate until the
1715
1760
* leaf page is deleted.
1716
1761
*
1717
- * Returns 'false' if the page could not be unlinked (shouldn't happen).
1718
- * If the (new) right sibling of the page is empty, *rightsib_empty is set
1719
- * to true.
1762
+ * Returns 'false' if the page could not be unlinked (shouldn't happen). If
1763
+ * the right sibling of the current target page is empty, *rightsib_empty is
1764
+ * set to true, allowing caller to delete the target's right sibling page in
1765
+ * passing. Note that *rightsib_empty is only actually used by caller when
1766
+ * target page is leafbuf, following last call here for leafbuf/the subtree
1767
+ * containing leafbuf. (We always set *rightsib_empty for caller, just to be
1768
+ * consistent.)
1769
+ *
1770
+ * We maintain *oldestBtpoXact for pages that are deleted by the current
1771
+ * VACUUM operation here. This must be handled here because we conservatively
1772
+ * assume that there needs to be a new call to ReadNewTransactionId() each
1773
+ * time a page gets deleted. See comments about the underlying assumption
1774
+ * below.
1720
1775
*
1721
1776
* Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
1722
1777
* On success exit, we'll be holding pin and write lock. On failure exit,
1723
1778
* we'll release both pin and lock before returning (we define it that way
1724
1779
* to avoid having to reacquire a lock we already released).
1725
1780
*/
1726
1781
static bool
1727
- _bt_unlink_halfdead_page (Relation rel , Buffer leafbuf , bool * rightsib_empty )
1782
+ _bt_unlink_halfdead_page (Relation rel , Buffer leafbuf , bool * rightsib_empty ,
1783
+ TransactionId * oldestBtpoXact )
1728
1784
{
1729
1785
BlockNumber leafblkno = BufferGetBlockNumber (leafbuf );
1730
1786
BlockNumber leafleftsib ;
@@ -1861,9 +1917,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
1861
1917
lbuf = InvalidBuffer ;
1862
1918
1863
1919
/*
1864
- * Next write-lock the target page itself. It should be okay to take just
1865
- * a write lock not a superexclusive lock, since no scans would stop on an
1866
- * empty page.
1920
+ * Next write-lock the target page itself. It's okay to take a write lock
1921
+ * rather than a superexclusive lock, since no scan will stop on an empty
1922
+ * page.
1867
1923
*/
1868
1924
LockBuffer (buf , BT_WRITE );
1869
1925
page = BufferGetPage (buf );
@@ -2001,6 +2057,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
2001
2057
*/
2002
2058
page = BufferGetPage (buf );
2003
2059
opaque = (BTPageOpaque ) PageGetSpecialPointer (page );
2060
+ Assert (P_ISHALFDEAD (opaque ) || !P_ISLEAF (opaque ));
2004
2061
opaque -> btpo_flags &= ~BTP_HALF_DEAD ;
2005
2062
opaque -> btpo_flags |= BTP_DELETED ;
2006
2063
opaque -> btpo .xact = ReadNewTransactionId ();
@@ -2105,6 +2162,10 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
2105
2162
_bt_relbuf (rel , lbuf );
2106
2163
_bt_relbuf (rel , rbuf );
2107
2164
2165
+ if (!TransactionIdIsValid (* oldestBtpoXact ) ||
2166
+ TransactionIdPrecedes (opaque -> btpo .xact , * oldestBtpoXact ))
2167
+ * oldestBtpoXact = opaque -> btpo .xact ;
2168
+
2108
2169
/*
2109
2170
* Release the target, if it was not the leaf block. The leaf is always
2110
2171
* kept locked.
0 commit comments