Skip to content

Commit 79fa7b3

Browse files
Normalize nbtree truncated high key array behavior.
Commit 5bf748b taught nbtree ScalarArrayOp index scans to decide when and how to start the next primitive index scan based on physical index characteristics. This included rules for deciding whether to start a new primitive index scan (or whether to move onto the right sibling leaf page instead) that specifically consider truncated lower-order columns (-inf columns) from leaf page high keys. These omitted columns were treated as satisfying the scan's required scan keys, though only for scan keys marked required in the current scan direction (forward). Scan keys that didn't get this behavior (those marked required in the backwards direction only) usually didn't give the scan reasonable cause to reposition itself to a later leaf page (via another descent of the index in _bt_first), but _bt_advance_array_keys would nevertheless always give up by forcing another call to _bt_first. _bt_advance_array_keys was unwilling to allow the scan to continue onto the next leaf page, to reconsider whether we really should start another primitive scan based on the details of the sibling page's tuples. This didn't match its behavior with similar cases involving keys required in the current scan direction (forward), which seems unprincipled. It led to an excessive number of primitive scans/index descents for queries with a higher-order = array scan key (with dense, contiguous values) mixed with a lower-order required > or >= scan key. Bring > and >= strategy scan keys in line with other required scan key types: treat truncated -inf scan keys as having satisfied scan keys required in either scan direction (forwards and backwards alike) during array advancement. That way affected scans can continue to the right sibling leaf page. Advancement must now schedule an explicit recheck of the right sibling page's high key in cases involving > or >= scan keys. The recheck gives the scan a way to back out and start another primitive index scan (we can't just rely on _bt_checkkeys with > or >= scan keys). This work can be considered a stand alone optimization on top of the work from commit 5bf748b. But it was written in preparation for an upcoming patch that will add skip scan to nbtree. In practice scans that use "skip arrays" will tend to be much more sensitive to any implementation deficiencies in this area. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Tomas Vondra <tomas@vondra.me> Discussion: https://postgr.es/m/CAH2-Wz=9A_UtM7HzUThSkQ+BcrQsQZuNhWOvQWK06PRkEp=SKQ@mail.gmail.com
1 parent c259b15 commit 79fa7b3

File tree

4 files changed

+95
-56
lines changed

4 files changed

+95
-56
lines changed

src/backend/access/nbtree/nbtree.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,7 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
332332

333333
so->needPrimScan = false;
334334
so->scanBehind = false;
335+
so->oppositeDirCheck = false;
335336
so->arrayKeys = NULL;
336337
so->orderProcs = NULL;
337338
so->arrayContext = NULL;
@@ -375,6 +376,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
375376
so->markItemIndex = -1;
376377
so->needPrimScan = false;
377378
so->scanBehind = false;
379+
so->oppositeDirCheck = false;
378380
BTScanPosUnpinIfPinned(so->markPos);
379381
BTScanPosInvalidate(so->markPos);
380382

@@ -618,6 +620,7 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
618620
*/
619621
so->needPrimScan = false;
620622
so->scanBehind = false;
623+
so->oppositeDirCheck = false;
621624
}
622625
else
623626
{
@@ -676,6 +679,7 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
676679
*/
677680
so->needPrimScan = true;
678681
so->scanBehind = false;
682+
so->oppositeDirCheck = false;
679683
}
680684
else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
681685
{

src/backend/access/nbtree/nbtsearch.c

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1703,6 +1703,31 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
17031703
ItemId iid = PageGetItemId(page, P_HIKEY);
17041704

17051705
pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1706+
1707+
if (unlikely(so->oppositeDirCheck))
1708+
{
1709+
Assert(so->scanBehind);
1710+
1711+
/*
1712+
* Last _bt_readpage call scheduled a recheck of finaltup for
1713+
* required scan keys up to and including a > or >= scan key.
1714+
*
1715+
* _bt_checkkeys won't consider the scanBehind flag unless the
1716+
* scan is stoppped by a scan key required in the current scan
1717+
* direction. We need this recheck so that we'll notice when
1718+
* all tuples on this page are still before the _bt_first-wise
1719+
* start of matches for the current set of array keys.
1720+
*/
1721+
if (!_bt_oppodir_checkkeys(scan, dir, pstate.finaltup))
1722+
{
1723+
/* Schedule another primitive index scan after all */
1724+
so->currPos.moreRight = false;
1725+
so->needPrimScan = true;
1726+
return false;
1727+
}
1728+
1729+
/* Deliberately don't unset scanBehind flag just yet */
1730+
}
17061731
}
17071732

17081733
/* load items[] in ascending order */

src/backend/access/nbtree/nbtutils.c

Lines changed: 63 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1371,7 +1371,7 @@ _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
13711371
curArrayKey->cur_elem = 0;
13721372
skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem];
13731373
}
1374-
so->scanBehind = false;
1374+
so->scanBehind = so->oppositeDirCheck = false; /* reset */
13751375
}
13761376

13771377
/*
@@ -1680,8 +1680,7 @@ _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir)
16801680

16811681
Assert(so->numArrayKeys);
16821682

1683-
/* scanBehind flag doesn't persist across primitive index scans - reset */
1684-
so->scanBehind = false;
1683+
so->scanBehind = so->oppositeDirCheck = false; /* reset */
16851684

16861685
/*
16871686
* Array keys are advanced within _bt_checkkeys when the scan reaches the
@@ -1817,7 +1816,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
18171816
Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc,
18181817
tupnatts, false, 0, NULL));
18191818

1820-
so->scanBehind = false; /* reset */
1819+
so->scanBehind = so->oppositeDirCheck = false; /* reset */
18211820

18221821
/*
18231822
* Required scan key wasn't satisfied, so required arrays will have to
@@ -2302,19 +2301,22 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
23022301
if (so->scanBehind && has_required_opposite_direction_only)
23032302
{
23042303
/*
2305-
* However, we avoid this behavior whenever the scan involves a scan
2304+
* However, we need to work harder whenever the scan involves a scan
23062305
* key required in the opposite direction to the scan only, along with
23072306
* a finaltup with at least one truncated attribute that's associated
23082307
* with a scan key marked required (required in either direction).
23092308
*
23102309
* _bt_check_compare simply won't stop the scan for a scan key that's
23112310
* marked required in the opposite scan direction only. That leaves
2312-
* us without any reliable way of reconsidering any opposite-direction
2311+
* us without an automatic way of reconsidering any opposite-direction
23132312
* inequalities if it turns out that starting a new primitive index
2314-
* scan will allow _bt_first to skip ahead by a great many leaf pages
2315-
* (see next section for details of how that works).
2313+
* scan will allow _bt_first to skip ahead by a great many leaf pages.
2314+
*
2315+
* We deal with this by explicitly scheduling a finaltup recheck on
2316+
* the right sibling page. _bt_readpage calls _bt_oppodir_checkkeys
2317+
* for next page's finaltup (and we skip it for this page's finaltup).
23162318
*/
2317-
goto new_prim_scan;
2319+
so->oppositeDirCheck = true; /* recheck next page's high key */
23182320
}
23192321

23202322
/*
@@ -2345,61 +2347,24 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
23452347
* similar "deduce NOT NULL" rule, where it finishes its insertion scan
23462348
* key by consing up an explicit SK_SEARCHNOTNULL key.)
23472349
*
2348-
* Apply a test against finaltup to detect and recover from these problem:
2350+
* Apply a test against finaltup to detect and recover from the problem:
23492351
* if even finaltup doesn't satisfy such an inequality, we just skip by
23502352
* starting a new primitive index scan. When we skip, we know for sure
23512353
* that all of the tuples on the current page following caller's tuple are
23522354
* also before the _bt_first-wise start of tuples for our new qual. That
23532355
* at least suggests many more skippable pages beyond the current page.
2356+
* (when so->oppositeDirCheck was set, this'll happen on the next page.)
23542357
*/
2355-
if (has_required_opposite_direction_only && pstate->finaltup &&
2356-
(all_required_satisfied || oppodir_inequality_sktrig))
2358+
else if (has_required_opposite_direction_only && pstate->finaltup &&
2359+
(all_required_satisfied || oppodir_inequality_sktrig) &&
2360+
unlikely(!_bt_oppodir_checkkeys(scan, dir, pstate->finaltup)))
23572361
{
2358-
int nfinaltupatts = BTreeTupleGetNAtts(pstate->finaltup, rel);
2359-
ScanDirection flipped;
2360-
bool continuescanflip;
2361-
int opsktrig;
2362-
23632362
/*
2364-
* We're checking finaltup (which is usually not caller's tuple), so
2365-
* cannot reuse work from caller's earlier _bt_check_compare call.
2366-
*
2367-
* Flip the scan direction when calling _bt_check_compare this time,
2368-
* so that it will set continuescanflip=false when it encounters an
2369-
* inequality required in the opposite scan direction.
2363+
* Make sure that any non-required arrays are set to the first array
2364+
* element for the current scan direction
23702365
*/
2371-
Assert(!so->scanBehind);
2372-
opsktrig = 0;
2373-
flipped = -dir;
2374-
_bt_check_compare(scan, flipped,
2375-
pstate->finaltup, nfinaltupatts, tupdesc,
2376-
false, false, false,
2377-
&continuescanflip, &opsktrig);
2378-
2379-
/*
2380-
* Only start a new primitive index scan when finaltup has a required
2381-
* unsatisfied inequality (unsatisfied in the opposite direction)
2382-
*/
2383-
Assert(all_required_satisfied != oppodir_inequality_sktrig);
2384-
if (unlikely(!continuescanflip &&
2385-
so->keyData[opsktrig].sk_strategy != BTEqualStrategyNumber))
2386-
{
2387-
/*
2388-
* It's possible for the same inequality to be unsatisfied by both
2389-
* caller's tuple (in scan's direction) and finaltup (in the
2390-
* opposite direction) due to _bt_check_compare's behavior with
2391-
* NULLs
2392-
*/
2393-
Assert(opsktrig >= sktrig); /* not opsktrig > sktrig due to NULLs */
2394-
2395-
/*
2396-
* Make sure that any non-required arrays are set to the first
2397-
* array element for the current scan direction
2398-
*/
2399-
_bt_rewind_nonrequired_arrays(scan, dir);
2400-
2401-
goto new_prim_scan;
2402-
}
2366+
_bt_rewind_nonrequired_arrays(scan, dir);
2367+
goto new_prim_scan;
24032368
}
24042369

24052370
/*
@@ -3511,7 +3476,8 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
35113476
*
35123477
* Assert that the scan isn't in danger of becoming confused.
35133478
*/
3514-
Assert(!so->scanBehind && !pstate->prechecked && !pstate->firstmatch);
3479+
Assert(!so->scanBehind && !so->oppositeDirCheck);
3480+
Assert(!pstate->prechecked && !pstate->firstmatch);
35153481
Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc,
35163482
tupnatts, false, 0, NULL));
35173483
}
@@ -3623,6 +3589,47 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
36233589
ikey, true);
36243590
}
36253591

3592+
/*
3593+
* Test whether an indextuple fails to satisfy an inequality required in the
3594+
* opposite direction only.
3595+
*
3596+
* Caller's finaltup tuple is the page high key (for forwards scans), or the
3597+
* first non-pivot tuple (for backwards scans). Called during scans with
3598+
* required array keys and required opposite-direction inequalities.
3599+
*
3600+
* Returns false if an inequality scan key required in the opposite direction
3601+
* only isn't satisfied (and any earlier required scan keys are satisfied).
3602+
* Otherwise returns true.
3603+
*
3604+
* An unsatisfied inequality required in the opposite direction only might
3605+
* well enable skipping over many leaf pages, provided another _bt_first call
3606+
* takes place. This type of unsatisfied inequality won't usually cause
3607+
* _bt_checkkeys to stop the scan to consider array advancement/starting a new
3608+
* primitive index scan.
3609+
*/
3610+
bool
3611+
_bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
3612+
IndexTuple finaltup)
3613+
{
3614+
Relation rel = scan->indexRelation;
3615+
TupleDesc tupdesc = RelationGetDescr(rel);
3616+
BTScanOpaque so = (BTScanOpaque) scan->opaque;
3617+
int nfinaltupatts = BTreeTupleGetNAtts(finaltup, rel);
3618+
bool continuescan;
3619+
ScanDirection flipped = -dir;
3620+
int ikey = 0;
3621+
3622+
Assert(so->numArrayKeys);
3623+
3624+
_bt_check_compare(scan, flipped, finaltup, nfinaltupatts, tupdesc,
3625+
false, false, false, &continuescan, &ikey);
3626+
3627+
if (!continuescan && so->keyData[ikey].sk_strategy != BTEqualStrategyNumber)
3628+
return false;
3629+
3630+
return true;
3631+
}
3632+
36263633
/*
36273634
* Test whether an indextuple satisfies current scan condition.
36283635
*

src/include/access/nbtree.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1048,6 +1048,7 @@ typedef struct BTScanOpaqueData
10481048
int numArrayKeys; /* number of equality-type array keys */
10491049
bool needPrimScan; /* New prim scan to continue in current dir? */
10501050
bool scanBehind; /* Last array advancement matched -inf attr? */
1051+
bool oppositeDirCheck; /* explicit scanBehind recheck needed? */
10511052
BTArrayKeyInfo *arrayKeys; /* info about each equality-type array key */
10521053
FmgrInfo *orderProcs; /* ORDER procs for required equality keys */
10531054
MemoryContext arrayContext; /* scan-lifespan context for array data */
@@ -1289,6 +1290,8 @@ extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir);
12891290
extern void _bt_preprocess_keys(IndexScanDesc scan);
12901291
extern bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
12911292
IndexTuple tuple, int tupnatts);
1293+
extern bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
1294+
IndexTuple finaltup);
12921295
extern void _bt_killitems(IndexScanDesc scan);
12931296
extern BTCycleId _bt_vacuum_cycleid(Relation rel);
12941297
extern BTCycleId _bt_start_vacuum(Relation rel);

0 commit comments

Comments
 (0)