Skip to content

Commit 21a152b

Browse files
Improve nbtree skip scan primitive scan scheduling.
Don't allow nbtree scans with skip arrays to end any primitive scan on its first leaf page without giving some consideration to how many times the scan's arrays advanced while changing at least one skip array (though continue not caring about the number of array advancements that only affected SAOP arrays, even during skip scans with SAOP arrays). Now when a scan performs more than 3 such array advancements in the course of reading a single leaf page, it is taken as a signal that the next page is unlikely to be skippable. We'll therefore continue the ongoing primitive index scan, at least until we can perform a recheck against the next page's finaltup. Testing has shown that this new heuristic occasionally makes all the difference with skip scans that were expected to rely on the "passed first page" heuristic added by commit 9a2e2a2. Without it, there is a remaining risk that certain kinds of skip scans will never quite manage to clear the initial hurdle of performing a primitive scan that lasts beyond its first leaf page (or that such a skip scan will only clear that initial hurdle when it has already wasted noticeably-many cycles due to inefficient primitive scan scheduling). Follow-up to commits 92fe23d and 9a2e2a2. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Matthias van de Meent <boekewurm+postgres@gmail.com> Discussion: https://postgr.es/m/CAH2-Wz=RVdG3zWytFWBsyW7fWH7zveFvTHed5JKEsuTT0RCO_A@mail.gmail.com
1 parent cf2655a commit 21a152b

File tree

3 files changed

+78
-31
lines changed

3 files changed

+78
-31
lines changed

src/backend/access/nbtree/nbtsearch.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1655,6 +1655,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
16551655
pstate.continuescan = true; /* default assumption */
16561656
pstate.rechecks = 0;
16571657
pstate.targetdistance = 0;
1658+
pstate.nskipadvances = 0;
16581659

16591660
if (ScanDirectionIsForward(dir))
16601661
{
@@ -1884,6 +1885,21 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
18841885
passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
18851886
itup, indnatts);
18861887

1888+
if (arrayKeys && so->scanBehind)
1889+
{
1890+
/*
1891+
* Done scanning this page, but not done with the current
1892+
* primscan.
1893+
*
1894+
* Note: Forward scans don't check this explicitly, since they
1895+
* prefer to reuse pstate.skip for this instead.
1896+
*/
1897+
Assert(!passes_quals && pstate.continuescan);
1898+
Assert(!pstate.forcenonrequired);
1899+
1900+
break;
1901+
}
1902+
18871903
/*
18881904
* Check if we need to skip ahead to a later tuple (only possible
18891905
* when the scan uses array keys)

src/backend/access/nbtree/nbtutils.c

Lines changed: 60 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
#define LOOK_AHEAD_REQUIRED_RECHECKS 3
2828
#define LOOK_AHEAD_DEFAULT_DISTANCE 5
29+
#define NSKIPADVANCES_THRESHOLD 3
2930

3031
static inline int32 _bt_compare_array_skey(FmgrInfo *orderproc,
3132
Datum tupdatum, bool tupnull,
@@ -41,7 +42,8 @@ static void _bt_array_set_low_or_high(Relation rel, ScanKey skey,
4142
BTArrayKeyInfo *array, bool low_not_high);
4243
static bool _bt_array_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array);
4344
static bool _bt_array_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array);
44-
static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir);
45+
static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir,
46+
bool *skip_array_set);
4547
static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir);
4648
static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir,
4749
IndexTuple tuple, TupleDesc tupdesc, int tupnatts,
@@ -970,7 +972,8 @@ _bt_array_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array)
970972
* advanced (every array remains at its final element for scan direction).
971973
*/
972974
static bool
973-
_bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir)
975+
_bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir,
976+
bool *skip_array_set)
974977
{
975978
Relation rel = scan->indexRelation;
976979
BTScanOpaque so = (BTScanOpaque) scan->opaque;
@@ -985,6 +988,9 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir)
985988
BTArrayKeyInfo *array = &so->arrayKeys[i];
986989
ScanKey skey = &so->keyData[array->scan_key];
987990

991+
if (array->num_elems == -1)
992+
*skip_array_set = true;
993+
988994
if (ScanDirectionIsForward(dir))
989995
{
990996
if (_bt_array_increment(rel, skey, array))
@@ -1460,6 +1466,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
14601466
ScanDirection dir = so->currPos.dir;
14611467
int arrayidx = 0;
14621468
bool beyond_end_advance = false,
1469+
skip_array_advanced = false,
14631470
has_required_opposite_direction_only = false,
14641471
all_required_satisfied = true,
14651472
all_satisfied = true;
@@ -1756,6 +1763,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
17561763
/* Skip array's new element is tupdatum (or MINVAL/MAXVAL) */
17571764
_bt_skiparray_set_element(rel, cur, array, result,
17581765
tupdatum, tupnull);
1766+
skip_array_advanced = true;
17591767
}
17601768
else if (array->cur_elem != set_elem)
17611769
{
@@ -1772,11 +1780,19 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
17721780
* higher-order arrays (might exhaust all the scan's arrays instead, which
17731781
* ends the top-level scan).
17741782
*/
1775-
if (beyond_end_advance && !_bt_advance_array_keys_increment(scan, dir))
1783+
if (beyond_end_advance &&
1784+
!_bt_advance_array_keys_increment(scan, dir, &skip_array_advanced))
17761785
goto end_toplevel_scan;
17771786

17781787
Assert(_bt_verify_keys_with_arraykeys(scan));
17791788

1789+
/*
1790+
* Maintain a page-level count of the number of times the scan's array
1791+
* keys advanced in a way that affected at least one skip array
1792+
*/
1793+
if (sktrig_required && skip_array_advanced)
1794+
pstate->nskipadvances++;
1795+
17801796
/*
17811797
* Does tuple now satisfy our new qual? Recheck with _bt_check_compare.
17821798
*
@@ -1946,26 +1962,12 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
19461962
* Being pessimistic would also give some scans with non-required arrays a
19471963
* perverse advantage over similar scans that use required arrays instead.
19481964
*
1949-
* You can think of this as a speculative bet on what the scan is likely
1950-
* to find on the next page. It's not much of a gamble, though, since the
1951-
* untruncated prefix of attributes must strictly satisfy the new qual.
1965+
* This is similar to our scan-level heuristics, below. They also set
1966+
* scanBehind to speculatively continue the primscan onto the next page.
19521967
*/
19531968
if (so->scanBehind)
19541969
{
1955-
/*
1956-
* Truncated high key -- _bt_scanbehind_checkkeys recheck scheduled.
1957-
*
1958-
* Remember if recheck needs to call _bt_oppodir_checkkeys for next
1959-
* page's finaltup (see below comments about "Handle inequalities
1960-
* marked required in the opposite scan direction" for why).
1961-
*/
1962-
so->oppositeDirCheck = has_required_opposite_direction_only;
1963-
1964-
/*
1965-
* Make sure that any SAOP arrays that were not marked required by
1966-
* preprocessing are reset to their first element for this direction
1967-
*/
1968-
_bt_rewind_nonrequired_arrays(scan, dir);
1970+
/* Truncated high key -- _bt_scanbehind_checkkeys recheck scheduled */
19691971
}
19701972

19711973
/*
@@ -2006,6 +2008,10 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
20062008
else if (has_required_opposite_direction_only && pstate->finaltup &&
20072009
unlikely(!_bt_oppodir_checkkeys(scan, dir, pstate->finaltup)))
20082010
{
2011+
/*
2012+
* Make sure that any SAOP arrays that were not marked required by
2013+
* preprocessing are reset to their first element for this direction
2014+
*/
20092015
_bt_rewind_nonrequired_arrays(scan, dir);
20102016
goto new_prim_scan;
20112017
}
@@ -2032,11 +2038,21 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
20322038

20332039
if (so->scanBehind)
20342040
{
2035-
/* Optimization: skip by setting "look ahead" mechanism's offnum */
2041+
/*
2042+
* Remember if recheck needs to call _bt_oppodir_checkkeys for next
2043+
* page's finaltup (see above comments about "Handle inequalities
2044+
* marked required in the opposite scan direction" for why).
2045+
*/
2046+
so->oppositeDirCheck = has_required_opposite_direction_only;
2047+
2048+
_bt_rewind_nonrequired_arrays(scan, dir);
2049+
2050+
/*
2051+
* skip by setting "look ahead" mechanism's offnum for forwards scans
2052+
* (backwards scans check scanBehind flag directly instead)
2053+
*/
20362054
if (ScanDirectionIsForward(dir))
20372055
pstate->skip = pstate->maxoff + 1;
2038-
else
2039-
pstate->skip = pstate->minoff - 1;
20402056
}
20412057

20422058
/* Caller's tuple doesn't match the new qual */
@@ -2059,19 +2075,31 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
20592075
* This will in turn encourage _bt_readpage to apply the pstate.startikey
20602076
* optimization more often.
20612077
*
2062-
* Note: This heuristic isn't as aggressive as you might think. We're
2078+
* Also continue the ongoing primitive index scan when it is still on the
2079+
* first page if there have been more than NSKIPADVANCES_THRESHOLD calls
2080+
* here that each advanced at least one of the scan's skip arrays
2081+
* (deliberately ignore advancements that only affected SAOP arrays here).
2082+
* A page that cycles through this many skip array elements is quite
2083+
* likely to neighbor similar pages, that we'll also need to read.
2084+
*
2085+
* Note: These heuristics aren't as aggressive as you might think. We're
20632086
* conservative about allowing a primitive scan to step from the first
20642087
* leaf page it reads to the page's sibling page (we only allow it on
2065-
* first pages whose finaltup strongly suggests that it'll work out).
2088+
* first pages whose finaltup strongly suggests that it'll work out, as
2089+
* well as first pages that have a large number of skip array advances).
20662090
* Clearing this first page finaltup hurdle is a strong signal in itself.
2091+
*
2092+
* Note: The NSKIPADVANCES_THRESHOLD heuristic exists only to avoid
2093+
* pathological cases. Specifically, cases where a skip scan should just
2094+
* behave like a traditional full index scan, but ends up "skipping" again
2095+
* and again, descending to the prior leaf page's direct sibling leaf page
2096+
* each time. This misbehavior would otherwise be possible during scans
2097+
* that never quite manage to "clear the first page finaltup hurdle".
20672098
*/
2068-
if (!pstate->firstpage)
2099+
if (!pstate->firstpage || pstate->nskipadvances > NSKIPADVANCES_THRESHOLD)
20692100
{
20702101
/* Schedule a recheck once on the next (or previous) page */
20712102
so->scanBehind = true;
2072-
so->oppositeDirCheck = has_required_opposite_direction_only;
2073-
2074-
_bt_rewind_nonrequired_arrays(scan, dir);
20752103

20762104
/* Continue the current primitive scan after all */
20772105
goto continue_scan;
@@ -2441,7 +2469,9 @@ _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
24412469
* the first page (first for the current primitive scan) avoids wasting cycles
24422470
* during selective point queries. They typically don't stand to gain as much
24432471
* when we can set pstate.startikey, and are likely to notice the overhead of
2444-
* calling here.
2472+
* calling here. (Also, allowing pstate.forcenonrequired to be set on a
2473+
* primscan's first page would mislead _bt_advance_array_keys, which expects
2474+
* pstate.nskipadvances to be representative of every first page's key space.)
24452475
*
24462476
* Caller must reset startikey and forcenonrequired ahead of the _bt_checkkeys
24472477
* call for pstate.finaltup iff we set forcenonrequired=true. This will give

src/include/access/nbtree.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1118,10 +1118,11 @@ typedef struct BTReadPageState
11181118

11191119
/*
11201120
* Private _bt_checkkeys state used to manage "look ahead" optimization
1121-
* (only used during scans with array keys)
1121+
* and primscan scheduling (only used during scans with array keys)
11221122
*/
11231123
int16 rechecks;
11241124
int16 targetdistance;
1125+
int16 nskipadvances;
11251126

11261127
} BTReadPageState;
11271128

0 commit comments

Comments
 (0)