Skip to content

Commit 29b64d1

Browse files
Add nbtree high key "continuescan" optimization.
Teach nbtree forward index scans to check the high key before moving to the right sibling page in the hope of finding that it isn't actually necessary to do so. The new check may indicate that the scan definitely cannot find matching tuples to the right, ending the scan immediately. We already opportunistically force a similar "continuescan orientated" key check of the final non-pivot tuple when it's clear that it cannot be returned to the scan due to being dead-to-all. The new high key check is complementary. The new approach for forward scans is more effective than checking the final non-pivot tuple, especially with composite indexes and non-unique indexes. The improvements to the logic for picking a split point added by commit fab2502 make it likely that relatively dissimilar high keys will appear on a page. A distinguishing key value that can only appear on non-pivot tuples on the right sibling page will often be present in leaf page high keys. Since forcing the final item to be key checked no longer makes any difference in the case of forward scans, the existing extra key check is now only used for backwards scans. Backward scans continue to opportunistically check the final non-pivot tuple, which is actually the first non-pivot tuple on the page (not the last). Note that even pg_upgrade'd v3 indexes make use of this optimization. Author: Peter Geoghegan, Heikki Linnakangas Reviewed-By: Heikki Linnakangas Discussion: https://postgr.es/m/CAH2-WzkOmUduME31QnuTFpimejuQoiZ-HOf0pOWeFZNhTMctvA@mail.gmail.com
1 parent 4ba96d1 commit 29b64d1

File tree

3 files changed

+128
-77
lines changed

3 files changed

+128
-77
lines changed

src/backend/access/nbtree/nbtsearch.c

Lines changed: 78 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1406,8 +1406,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
14061406
OffsetNumber minoff;
14071407
OffsetNumber maxoff;
14081408
int itemIndex;
1409-
IndexTuple itup;
14101409
bool continuescan;
1410+
int indnatts;
14111411

14121412
/*
14131413
* We must have the buffer pinned and locked, but the usual macro can't be
@@ -1427,6 +1427,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
14271427
_bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
14281428
}
14291429

1430+
continuescan = true; /* default assumption */
1431+
indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
14301432
minoff = P_FIRSTDATAKEY(opaque);
14311433
maxoff = PageGetMaxOffsetNumber(page);
14321434

@@ -1468,23 +1470,58 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
14681470

14691471
while (offnum <= maxoff)
14701472
{
1471-
itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
1472-
if (itup != NULL)
1473+
ItemId iid = PageGetItemId(page, offnum);
1474+
IndexTuple itup;
1475+
1476+
/*
1477+
* If the scan specifies not to return killed tuples, then we
1478+
* treat a killed tuple as not passing the qual
1479+
*/
1480+
if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1481+
{
1482+
offnum = OffsetNumberNext(offnum);
1483+
continue;
1484+
}
1485+
1486+
itup = (IndexTuple) PageGetItem(page, iid);
1487+
1488+
if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
14731489
{
14741490
/* tuple passes all scan key conditions, so remember it */
14751491
_bt_saveitem(so, itemIndex, offnum, itup);
14761492
itemIndex++;
14771493
}
1494+
/* When !continuescan, there can't be any more matches, so stop */
14781495
if (!continuescan)
1479-
{
1480-
/* there can't be any more matches, so stop */
1481-
so->currPos.moreRight = false;
14821496
break;
1483-
}
14841497

14851498
offnum = OffsetNumberNext(offnum);
14861499
}
14871500

1501+
/*
1502+
* We don't need to visit page to the right when the high key
1503+
* indicates that no more matches will be found there.
1504+
*
1505+
* Checking the high key like this works out more often than you might
1506+
* think. Leaf page splits pick a split point between the two most
1507+
* dissimilar tuples (this is weighed against the need to evenly share
1508+
* free space). Leaf pages with high key attribute values that can
1509+
* only appear on non-pivot tuples on the right sibling page are
1510+
* common.
1511+
*/
1512+
if (continuescan && !P_RIGHTMOST(opaque))
1513+
{
1514+
ItemId iid = PageGetItemId(page, P_HIKEY);
1515+
IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
1516+
int truncatt;
1517+
1518+
truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
1519+
_bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
1520+
}
1521+
1522+
if (!continuescan)
1523+
so->currPos.moreRight = false;
1524+
14881525
Assert(itemIndex <= MaxIndexTuplesPerPage);
14891526
so->currPos.firstItem = 0;
14901527
so->currPos.lastItem = itemIndex - 1;
@@ -1499,8 +1536,40 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
14991536

15001537
while (offnum >= minoff)
15011538
{
1502-
itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
1503-
if (itup != NULL)
1539+
ItemId iid = PageGetItemId(page, offnum);
1540+
IndexTuple itup;
1541+
bool tuple_alive;
1542+
bool passes_quals;
1543+
1544+
/*
1545+
* If the scan specifies not to return killed tuples, then we
1546+
* treat a killed tuple as not passing the qual. Most of the
1547+
* time, it's a win to not bother examining the tuple's index
1548+
* keys, but just skip to the next tuple (previous, actually,
1549+
* since we're scanning backwards). However, if this is the first
1550+
* tuple on the page, we do check the index keys, to prevent
1551+
* uselessly advancing to the page to the left. This is similar
1552+
* to the high key optimization used by forward scans.
1553+
*/
1554+
if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1555+
{
1556+
Assert(offnum >= P_FIRSTDATAKEY(opaque));
1557+
if (offnum > P_FIRSTDATAKEY(opaque))
1558+
{
1559+
offnum = OffsetNumberPrev(offnum);
1560+
continue;
1561+
}
1562+
1563+
tuple_alive = false;
1564+
}
1565+
else
1566+
tuple_alive = true;
1567+
1568+
itup = (IndexTuple) PageGetItem(page, iid);
1569+
1570+
passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
1571+
&continuescan);
1572+
if (passes_quals && tuple_alive)
15041573
{
15051574
/* tuple passes all scan key conditions, so remember it */
15061575
itemIndex--;

src/backend/access/nbtree/nbtutils.c

Lines changed: 48 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
4848
static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption);
4949
static void _bt_mark_scankey_required(ScanKey skey);
5050
static bool _bt_check_rowcompare(ScanKey skey,
51-
IndexTuple tuple, TupleDesc tupdesc,
51+
IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
5252
ScanDirection dir, bool *continuescan);
5353
static int _bt_keep_natts(Relation rel, IndexTuple lastleft,
5454
IndexTuple firstright, BTScanInsert itup_key);
@@ -1333,72 +1333,34 @@ _bt_mark_scankey_required(ScanKey skey)
13331333
/*
13341334
* Test whether an indextuple satisfies all the scankey conditions.
13351335
*
1336-
* If so, return the address of the index tuple on the index page.
1337-
* If not, return NULL.
1336+
* Return true if so, false if not. If the tuple fails to pass the qual,
1337+
* we also determine whether there's any need to continue the scan beyond
1338+
* this tuple, and set *continuescan accordingly. See comments for
1339+
* _bt_preprocess_keys(), above, about how this is done.
13381340
*
1339-
* If the tuple fails to pass the qual, we also determine whether there's
1340-
* any need to continue the scan beyond this tuple, and set *continuescan
1341-
* accordingly. See comments for _bt_preprocess_keys(), above, about how
1342-
* this is done.
1341+
* Forward scan callers can pass a high key tuple in the hopes of having
1342+
* us set *continuescanthat to false, and avoiding an unnecessary visit to
1343+
* the page to the right.
13431344
*
13441345
* scan: index scan descriptor (containing a search-type scankey)
1345-
* page: buffer page containing index tuple
1346-
* offnum: offset number of index tuple (must be a valid item!)
1346+
* tuple: index tuple to test
1347+
* tupnatts: number of attributes in tupnatts (high key may be truncated)
13471348
* dir: direction we are scanning in
13481349
* continuescan: output parameter (will be set correctly in all cases)
1349-
*
1350-
* Caller must hold pin and lock on the index page.
13511350
*/
1352-
IndexTuple
1353-
_bt_checkkeys(IndexScanDesc scan,
1354-
Page page, OffsetNumber offnum,
1351+
bool
1352+
_bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
13551353
ScanDirection dir, bool *continuescan)
13561354
{
1357-
ItemId iid = PageGetItemId(page, offnum);
1358-
bool tuple_alive;
1359-
IndexTuple tuple;
13601355
TupleDesc tupdesc;
13611356
BTScanOpaque so;
13621357
int keysz;
13631358
int ikey;
13641359
ScanKey key;
13651360

1366-
*continuescan = true; /* default assumption */
1367-
1368-
/*
1369-
* If the scan specifies not to return killed tuples, then we treat a
1370-
* killed tuple as not passing the qual. Most of the time, it's a win to
1371-
* not bother examining the tuple's index keys, but just return
1372-
* immediately with continuescan = true to proceed to the next tuple.
1373-
* However, if this is the last tuple on the page, we should check the
1374-
* index keys to prevent uselessly advancing to the next page.
1375-
*/
1376-
if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1377-
{
1378-
/* return immediately if there are more tuples on the page */
1379-
if (ScanDirectionIsForward(dir))
1380-
{
1381-
if (offnum < PageGetMaxOffsetNumber(page))
1382-
return NULL;
1383-
}
1384-
else
1385-
{
1386-
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1387-
1388-
if (offnum > P_FIRSTDATAKEY(opaque))
1389-
return NULL;
1390-
}
1391-
1392-
/*
1393-
* OK, we want to check the keys so we can set continuescan correctly,
1394-
* but we'll return NULL even if the tuple passes the key tests.
1395-
*/
1396-
tuple_alive = false;
1397-
}
1398-
else
1399-
tuple_alive = true;
1361+
Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);
14001362

1401-
tuple = (IndexTuple) PageGetItem(page, iid);
1363+
*continuescan = true; /* default assumption */
14021364

14031365
tupdesc = RelationGetDescr(scan->indexRelation);
14041366
so = (BTScanOpaque) scan->opaque;
@@ -1410,13 +1372,25 @@ _bt_checkkeys(IndexScanDesc scan,
14101372
bool isNull;
14111373
Datum test;
14121374

1413-
Assert(key->sk_attno <= BTreeTupleGetNAtts(tuple, scan->indexRelation));
1375+
if (key->sk_attno > tupnatts)
1376+
{
1377+
/*
1378+
* This attribute is truncated (must be high key). The value for
1379+
* this attribute in the first non-pivot tuple on the page to the
1380+
* right could be any possible value. Assume that truncated
1381+
* attribute passes the qual.
1382+
*/
1383+
Assert(ScanDirectionIsForward(dir));
1384+
continue;
1385+
}
1386+
14141387
/* row-comparison keys need special processing */
14151388
if (key->sk_flags & SK_ROW_HEADER)
14161389
{
1417-
if (_bt_check_rowcompare(key, tuple, tupdesc, dir, continuescan))
1390+
if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir,
1391+
continuescan))
14181392
continue;
1419-
return NULL;
1393+
return false;
14201394
}
14211395

14221396
datum = index_getattr(tuple,
@@ -1454,7 +1428,7 @@ _bt_checkkeys(IndexScanDesc scan,
14541428
/*
14551429
* In any case, this indextuple doesn't match the qual.
14561430
*/
1457-
return NULL;
1431+
return false;
14581432
}
14591433

14601434
if (isNull)
@@ -1495,7 +1469,7 @@ _bt_checkkeys(IndexScanDesc scan,
14951469
/*
14961470
* In any case, this indextuple doesn't match the qual.
14971471
*/
1498-
return NULL;
1472+
return false;
14991473
}
15001474

15011475
test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
@@ -1523,16 +1497,12 @@ _bt_checkkeys(IndexScanDesc scan,
15231497
/*
15241498
* In any case, this indextuple doesn't match the qual.
15251499
*/
1526-
return NULL;
1500+
return false;
15271501
}
15281502
}
15291503

1530-
/* Check for failure due to it being a killed tuple. */
1531-
if (!tuple_alive)
1532-
return NULL;
1533-
15341504
/* If we get here, the tuple passes all index quals. */
1535-
return tuple;
1505+
return true;
15361506
}
15371507

15381508
/*
@@ -1545,8 +1515,8 @@ _bt_checkkeys(IndexScanDesc scan,
15451515
* This is a subroutine for _bt_checkkeys, which see for more info.
15461516
*/
15471517
static bool
1548-
_bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
1549-
ScanDirection dir, bool *continuescan)
1518+
_bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
1519+
TupleDesc tupdesc, ScanDirection dir, bool *continuescan)
15501520
{
15511521
ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
15521522
int32 cmpresult = 0;
@@ -1563,6 +1533,19 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
15631533

15641534
Assert(subkey->sk_flags & SK_ROW_MEMBER);
15651535

1536+
if (subkey->sk_attno > tupnatts)
1537+
{
1538+
/*
1539+
* This attribute is truncated (must be high key). The value for
1540+
* this attribute in the first non-pivot tuple on the page to the
1541+
* right could be any possible value. Assume that truncated
1542+
* attribute passes the qual.
1543+
*/
1544+
Assert(ScanDirectionIsForward(dir));
1545+
cmpresult = 0;
1546+
continue;
1547+
}
1548+
15661549
datum = index_getattr(tuple,
15671550
subkey->sk_attno,
15681551
tupdesc,

src/include/access/nbtree.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -772,9 +772,8 @@ extern bool _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir);
772772
extern void _bt_mark_array_keys(IndexScanDesc scan);
773773
extern void _bt_restore_array_keys(IndexScanDesc scan);
774774
extern void _bt_preprocess_keys(IndexScanDesc scan);
775-
extern IndexTuple _bt_checkkeys(IndexScanDesc scan,
776-
Page page, OffsetNumber offnum,
777-
ScanDirection dir, bool *continuescan);
775+
extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
776+
int tupnatts, ScanDirection dir, bool *continuescan);
778777
extern void _bt_killitems(IndexScanDesc scan);
779778
extern BTCycleId _bt_vacuum_cycleid(Relation rel);
780779
extern BTCycleId _bt_start_vacuum(Relation rel);

0 commit comments

Comments
 (0)