Skip to content

Commit 9a2e2a2

Browse files
Improve nbtree array primitive scan scheduling.
Add a new scheduling heuristic: don't end the ongoing primitive index scan immediately (at the point where _bt_advance_array_keys notices that the next set of matching tuples must be on a later page) if the primscan already managed to step right/left from its first leaf page. Schedule a recheck against the next sibling leaf page's finaltup instead. The new heuristic tends to avoid scenarios where the top-level scan repeatedly starts and ends primitive index scans that each read only one leaf page from a group of neighboring leaf pages. Affected top-level scans will now tend to step forward (or backward) through the index instead, without wasting cycles on descending the index anew. The recheck mechanism isn't exactly new. But up until now it has only been used to deal with edge cases involving high key finaltups with one or more truncated -inf attributes that _bt_advance_array_keys deemed "provisionally satisfied" (satisfied for the purposes of allowing the scan to step onto the next page, subject to recheck once on that page). The mechanism was added by commit 5bf748b, which invented the general concept of primitive scan scheduling. It was later enhanced by commit 79fa7b3, which taught it about cases involving -inf attributes that satisfy inequality scan keys required in the opposite-to-scan direction only (arguably, they should have been covered by the earliest version). Now the recheck mechanism can be applied based on scan-level heuristics, which have nothing to do with truncated high keys. Now rechecks might be performed by _bt_readpage when scanning in _either_ scan direction. The theory behind the new heuristic is that any primitive scan that makes it past its first leaf page is one that is already likely to have arrays whose key values match index tuples that are closely clustered together in the index. The rules that determine whether we ever get past the first page are still conservative (that'll still only happen when pstate.finaltup strongly suggests that it's the right thing to do). Surviving past the first leaf page is a strong signal in itself. Preparation for an upcoming patch that will add skip scan optimizations to nbtree. That'll work by adding skip arrays, which behave similarly to SAOP arrays, but generate their elements procedurally and on-demand. Note that this commit isn't specifically concerned with skip arrays; the scheduling logic doesn't (and won't) condition anything on whether the scan uses skip arrays, SAOP arrays, or some combination of the two (which seems like a good general principle for _bt_advance_array_keys). While the problems that this commit ameliorates are more likely with skip arrays (at least in practice), SAOP arrays (or those with very dense, contiguous array elements) are also affected. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Matthias van de Meent <boekewurm+postgres@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzkz0wPe6+02kr+hC+JJNKfGtjGTzpG3CFVTQmKwWNrXNw@mail.gmail.com
1 parent e215166 commit 9a2e2a2

File tree

3 files changed

+162
-138
lines changed

3 files changed

+162
-138
lines changed

src/backend/access/nbtree/nbtsearch.c

Lines changed: 39 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
3333
static int _bt_binsrch_posting(BTScanInsert key, Page page,
3434
OffsetNumber offnum);
3535
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
36-
OffsetNumber offnum, bool firstPage);
36+
OffsetNumber offnum, bool firstpage);
3737
static void _bt_saveitem(BTScanOpaque so, int itemIndex,
3838
OffsetNumber offnum, IndexTuple itup);
3939
static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
@@ -1500,7 +1500,7 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
15001500
*/
15011501
static bool
15021502
_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
1503-
bool firstPage)
1503+
bool firstpage)
15041504
{
15051505
Relation rel = scan->indexRelation;
15061506
BTScanOpaque so = (BTScanOpaque) scan->opaque;
@@ -1556,6 +1556,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
15561556
pstate.maxoff = maxoff;
15571557
pstate.finaltup = NULL;
15581558
pstate.page = page;
1559+
pstate.firstpage = firstpage;
15591560
pstate.offnum = InvalidOffsetNumber;
15601561
pstate.skip = InvalidOffsetNumber;
15611562
pstate.continuescan = true; /* default assumption */
@@ -1604,7 +1605,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
16041605
* required < or <= strategy scan keys) during the precheck, we can safely
16051606
* assume that this must also be true of all earlier tuples from the page.
16061607
*/
1607-
if (!firstPage && !so->scanBehind && minoff < maxoff)
1608+
if (!pstate.firstpage && !so->scanBehind && minoff < maxoff)
16081609
{
16091610
ItemId iid;
16101611
IndexTuple itup;
@@ -1621,36 +1622,28 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
16211622
if (ScanDirectionIsForward(dir))
16221623
{
16231624
/* SK_SEARCHARRAY forward scans must provide high key up front */
1624-
if (arrayKeys && !P_RIGHTMOST(opaque))
1625+
if (arrayKeys)
16251626
{
1626-
ItemId iid = PageGetItemId(page, P_HIKEY);
1627-
1628-
pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1629-
1630-
if (unlikely(so->oppositeDirCheck))
1627+
if (!P_RIGHTMOST(opaque))
16311628
{
1632-
Assert(so->scanBehind);
1629+
ItemId iid = PageGetItemId(page, P_HIKEY);
16331630

1634-
/*
1635-
* Last _bt_readpage call scheduled a recheck of finaltup for
1636-
* required scan keys up to and including a > or >= scan key.
1637-
*
1638-
* _bt_checkkeys won't consider the scanBehind flag unless the
1639-
* scan is stopped by a scan key required in the current scan
1640-
* direction. We need this recheck so that we'll notice when
1641-
* all tuples on this page are still before the _bt_first-wise
1642-
* start of matches for the current set of array keys.
1643-
*/
1644-
if (!_bt_oppodir_checkkeys(scan, dir, pstate.finaltup))
1631+
pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1632+
1633+
if (so->scanBehind &&
1634+
!_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
16451635
{
16461636
/* Schedule another primitive index scan after all */
16471637
so->currPos.moreRight = false;
16481638
so->needPrimScan = true;
1639+
if (scan->parallel_scan)
1640+
_bt_parallel_primscan_schedule(scan,
1641+
so->currPos.currPage);
16491642
return false;
16501643
}
1651-
1652-
/* Deliberately don't unset scanBehind flag just yet */
16531644
}
1645+
1646+
so->scanBehind = so->oppositeDirCheck = false; /* reset */
16541647
}
16551648

16561649
/* load items[] in ascending order */
@@ -1746,7 +1739,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
17461739
* only appear on non-pivot tuples on the right sibling page are
17471740
* common.
17481741
*/
1749-
if (pstate.continuescan && !P_RIGHTMOST(opaque))
1742+
if (pstate.continuescan && !so->scanBehind && !P_RIGHTMOST(opaque))
17501743
{
17511744
ItemId iid = PageGetItemId(page, P_HIKEY);
17521745
IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
@@ -1768,11 +1761,28 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
17681761
else
17691762
{
17701763
/* SK_SEARCHARRAY backward scans must provide final tuple up front */
1771-
if (arrayKeys && minoff <= maxoff && !P_LEFTMOST(opaque))
1764+
if (arrayKeys)
17721765
{
1773-
ItemId iid = PageGetItemId(page, minoff);
1766+
if (minoff <= maxoff && !P_LEFTMOST(opaque))
1767+
{
1768+
ItemId iid = PageGetItemId(page, minoff);
1769+
1770+
pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1771+
1772+
if (so->scanBehind &&
1773+
!_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
1774+
{
1775+
/* Schedule another primitive index scan after all */
1776+
so->currPos.moreLeft = false;
1777+
so->needPrimScan = true;
1778+
if (scan->parallel_scan)
1779+
_bt_parallel_primscan_schedule(scan,
1780+
so->currPos.currPage);
1781+
return false;
1782+
}
1783+
}
17741784

1775-
pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1785+
so->scanBehind = so->oppositeDirCheck = false; /* reset */
17761786
}
17771787

17781788
/* load items[] in descending order */
@@ -2276,14 +2286,14 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
22762286
if (ScanDirectionIsForward(dir))
22772287
{
22782288
/* note that this will clear moreRight if we can stop */
2279-
if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), false))
2289+
if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), seized))
22802290
break;
22812291
blkno = so->currPos.nextPage;
22822292
}
22832293
else
22842294
{
22852295
/* note that this will clear moreLeft if we can stop */
2286-
if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), false))
2296+
if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), seized))
22872297
break;
22882298
blkno = so->currPos.prevPage;
22892299
}

0 commit comments

Comments
 (0)