Skip to content

Commit 7c70996

Browse files
committed
Allow bitmap scans to operate as index-only scans when possible.
If we don't have to return any columns from heap tuples, and there's no need to recheck qual conditions, and the heap page is all-visible, then we can skip fetching the heap page altogether. Skip prefetching pages too, when possible, on the assumption that the recheck flag will remain the same from one page to the next. While that assumption is hardly bulletproof, it seems like a good bet most of the time, and better than prefetching pages we don't need. This commit installs the executor infrastructure, but doesn't change any planner cost estimates, thus possibly causing bitmap scans to not be chosen in cases where this change renders them the best choice. I (tgl) am not entirely convinced that we need to account for this behavior in the planner, because I think typically the bitmap scan would get chosen anyway if it's the best bet. In any case the submitted patch took way too many shortcuts, resulting in too many clearly-bad choices, to be committable. Alexander Kuzmenkov, reviewed by Alexey Chernyshov, and whacked around rather heavily by me. Discussion: https://postgr.es/m/239a8955-c0fc-f506-026d-c837e86c827b@postgrespro.ru
1 parent ec7ce54 commit 7c70996

File tree

5 files changed

+150
-40
lines changed

5 files changed

+150
-40
lines changed

src/backend/executor/nodeBitmapHeapscan.c

Lines changed: 126 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939

4040
#include "access/relscan.h"
4141
#include "access/transam.h"
42+
#include "access/visibilitymap.h"
4243
#include "executor/execdebug.h"
4344
#include "executor/nodeBitmapHeapscan.h"
4445
#include "miscadmin.h"
@@ -225,9 +226,31 @@ BitmapHeapNext(BitmapHeapScanState *node)
225226
}
226227

227228
/*
228-
* Fetch the current heap page and identify candidate tuples.
229+
* We can skip fetching the heap page if we don't need any fields
230+
* from the heap, and the bitmap entries don't need rechecking,
231+
* and all tuples on the page are visible to our transaction.
229232
*/
230-
bitgetpage(scan, tbmres);
233+
node->skip_fetch = (node->can_skip_fetch &&
234+
!tbmres->recheck &&
235+
VM_ALL_VISIBLE(node->ss.ss_currentRelation,
236+
tbmres->blockno,
237+
&node->vmbuffer));
238+
239+
if (node->skip_fetch)
240+
{
241+
/*
242+
* The number of tuples on this page is put into
243+
* scan->rs_ntuples; note we don't fill scan->rs_vistuples.
244+
*/
245+
scan->rs_ntuples = tbmres->ntuples;
246+
}
247+
else
248+
{
249+
/*
250+
* Fetch the current heap page and identify candidate tuples.
251+
*/
252+
bitgetpage(scan, tbmres);
253+
}
231254

232255
if (tbmres->ntuples >= 0)
233256
node->exact_pages++;
@@ -289,45 +312,55 @@ BitmapHeapNext(BitmapHeapScanState *node)
289312
*/
290313
BitmapPrefetch(node, scan);
291314

292-
/*
293-
* Okay to fetch the tuple
294-
*/
295-
targoffset = scan->rs_vistuples[scan->rs_cindex];
296-
dp = (Page) BufferGetPage(scan->rs_cbuf);
297-
lp = PageGetItemId(dp, targoffset);
298-
Assert(ItemIdIsNormal(lp));
315+
if (node->skip_fetch)
316+
{
317+
/*
318+
* If we don't have to fetch the tuple, just return nulls.
319+
*/
320+
ExecStoreAllNullTuple(slot);
321+
}
322+
else
323+
{
324+
/*
325+
* Okay to fetch the tuple.
326+
*/
327+
targoffset = scan->rs_vistuples[scan->rs_cindex];
328+
dp = (Page) BufferGetPage(scan->rs_cbuf);
329+
lp = PageGetItemId(dp, targoffset);
330+
Assert(ItemIdIsNormal(lp));
299331

300-
scan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
301-
scan->rs_ctup.t_len = ItemIdGetLength(lp);
302-
scan->rs_ctup.t_tableOid = scan->rs_rd->rd_id;
303-
ItemPointerSet(&scan->rs_ctup.t_self, tbmres->blockno, targoffset);
332+
scan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
333+
scan->rs_ctup.t_len = ItemIdGetLength(lp);
334+
scan->rs_ctup.t_tableOid = scan->rs_rd->rd_id;
335+
ItemPointerSet(&scan->rs_ctup.t_self, tbmres->blockno, targoffset);
304336

305-
pgstat_count_heap_fetch(scan->rs_rd);
337+
pgstat_count_heap_fetch(scan->rs_rd);
306338

307-
/*
308-
* Set up the result slot to point to this tuple. Note that the slot
309-
* acquires a pin on the buffer.
310-
*/
311-
ExecStoreTuple(&scan->rs_ctup,
312-
slot,
313-
scan->rs_cbuf,
314-
false);
315-
316-
/*
317-
* If we are using lossy info, we have to recheck the qual conditions
318-
* at every tuple.
319-
*/
320-
if (tbmres->recheck)
321-
{
322-
econtext->ecxt_scantuple = slot;
323-
ResetExprContext(econtext);
339+
/*
340+
* Set up the result slot to point to this tuple. Note that the
341+
* slot acquires a pin on the buffer.
342+
*/
343+
ExecStoreTuple(&scan->rs_ctup,
344+
slot,
345+
scan->rs_cbuf,
346+
false);
324347

325-
if (!ExecQual(node->bitmapqualorig, econtext))
348+
/*
349+
* If we are using lossy info, we have to recheck the qual
350+
* conditions at every tuple.
351+
*/
352+
if (tbmres->recheck)
326353
{
327-
/* Fails recheck, so drop it and loop back for another */
328-
InstrCountFiltered2(node, 1);
329-
ExecClearTuple(slot);
330-
continue;
354+
econtext->ecxt_scantuple = slot;
355+
ResetExprContext(econtext);
356+
357+
if (!ExecQual(node->bitmapqualorig, econtext))
358+
{
359+
/* Fails recheck, so drop it and loop back for another */
360+
InstrCountFiltered2(node, 1);
361+
ExecClearTuple(slot);
362+
continue;
363+
}
331364
}
332365
}
333366

@@ -582,6 +615,7 @@ BitmapPrefetch(BitmapHeapScanState *node, HeapScanDesc scan)
582615
while (node->prefetch_pages < node->prefetch_target)
583616
{
584617
TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
618+
bool skip_fetch;
585619

586620
if (tbmpre == NULL)
587621
{
@@ -591,7 +625,26 @@ BitmapPrefetch(BitmapHeapScanState *node, HeapScanDesc scan)
591625
break;
592626
}
593627
node->prefetch_pages++;
594-
PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
628+
629+
/*
630+
* If we expect not to have to actually read this heap page,
631+
* skip this prefetch call, but continue to run the prefetch
632+
* logic normally. (Would it be better not to increment
633+
* prefetch_pages?)
634+
*
635+
* This depends on the assumption that the index AM will
636+
* report the same recheck flag for this future heap page as
637+
* it did for the current heap page; which is not a certainty
638+
* but is true in many cases.
639+
*/
640+
skip_fetch = (node->can_skip_fetch &&
641+
(node->tbmres ? !node->tbmres->recheck : false) &&
642+
VM_ALL_VISIBLE(node->ss.ss_currentRelation,
643+
tbmpre->blockno,
644+
&node->pvmbuffer));
645+
646+
if (!skip_fetch)
647+
PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
595648
}
596649
}
597650

@@ -608,6 +661,7 @@ BitmapPrefetch(BitmapHeapScanState *node, HeapScanDesc scan)
608661
{
609662
TBMIterateResult *tbmpre;
610663
bool do_prefetch = false;
664+
bool skip_fetch;
611665

612666
/*
613667
* Recheck under the mutex. If some other process has already
@@ -633,7 +687,15 @@ BitmapPrefetch(BitmapHeapScanState *node, HeapScanDesc scan)
633687
break;
634688
}
635689

636-
PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
690+
/* As above, skip prefetch if we expect not to need page */
691+
skip_fetch = (node->can_skip_fetch &&
692+
(node->tbmres ? !node->tbmres->recheck : false) &&
693+
VM_ALL_VISIBLE(node->ss.ss_currentRelation,
694+
tbmpre->blockno,
695+
&node->pvmbuffer));
696+
697+
if (!skip_fetch)
698+
PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
637699
}
638700
}
639701
}
@@ -687,6 +749,7 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
687749
/* rescan to release any page pin */
688750
heap_rescan(node->ss.ss_currentScanDesc, NULL);
689751

752+
/* release bitmaps and buffers if any */
690753
if (node->tbmiterator)
691754
tbm_end_iterate(node->tbmiterator);
692755
if (node->prefetch_iterator)
@@ -697,13 +760,19 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
697760
tbm_end_shared_iterate(node->shared_prefetch_iterator);
698761
if (node->tbm)
699762
tbm_free(node->tbm);
763+
if (node->vmbuffer != InvalidBuffer)
764+
ReleaseBuffer(node->vmbuffer);
765+
if (node->pvmbuffer != InvalidBuffer)
766+
ReleaseBuffer(node->pvmbuffer);
700767
node->tbm = NULL;
701768
node->tbmiterator = NULL;
702769
node->tbmres = NULL;
703770
node->prefetch_iterator = NULL;
704771
node->initialized = false;
705772
node->shared_tbmiterator = NULL;
706773
node->shared_prefetch_iterator = NULL;
774+
node->vmbuffer = InvalidBuffer;
775+
node->pvmbuffer = InvalidBuffer;
707776

708777
ExecScanReScan(&node->ss);
709778

@@ -748,7 +817,7 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node)
748817
ExecEndNode(outerPlanState(node));
749818

750819
/*
751-
* release bitmap if any
820+
* release bitmaps and buffers if any
752821
*/
753822
if (node->tbmiterator)
754823
tbm_end_iterate(node->tbmiterator);
@@ -760,6 +829,10 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node)
760829
tbm_end_shared_iterate(node->shared_tbmiterator);
761830
if (node->shared_prefetch_iterator)
762831
tbm_end_shared_iterate(node->shared_prefetch_iterator);
832+
if (node->vmbuffer != InvalidBuffer)
833+
ReleaseBuffer(node->vmbuffer);
834+
if (node->pvmbuffer != InvalidBuffer)
835+
ReleaseBuffer(node->pvmbuffer);
763836

764837
/*
765838
* close heap scan
@@ -805,6 +878,9 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
805878
scanstate->tbm = NULL;
806879
scanstate->tbmiterator = NULL;
807880
scanstate->tbmres = NULL;
881+
scanstate->skip_fetch = false;
882+
scanstate->vmbuffer = InvalidBuffer;
883+
scanstate->pvmbuffer = InvalidBuffer;
808884
scanstate->exact_pages = 0;
809885
scanstate->lossy_pages = 0;
810886
scanstate->prefetch_iterator = NULL;
@@ -815,8 +891,19 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
815891
scanstate->pscan_len = 0;
816892
scanstate->initialized = false;
817893
scanstate->shared_tbmiterator = NULL;
894+
scanstate->shared_prefetch_iterator = NULL;
818895
scanstate->pstate = NULL;
819896

897+
/*
898+
* We can potentially skip fetching heap pages if we do not need any
899+
* columns of the table, either for checking non-indexable quals or for
900+
* returning data. This test is a bit simplistic, as it checks the
901+
* stronger condition that there's no qual or return tlist at all. But in
902+
* most cases it's probably not worth working harder than that.
903+
*/
904+
scanstate->can_skip_fetch = (node->scan.plan.qual == NIL &&
905+
node->scan.plan.targetlist == NIL);
906+
820907
/*
821908
* Miscellaneous initialization
822909
*

src/backend/optimizer/plan/createplan.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -807,6 +807,15 @@ use_physical_tlist(PlannerInfo *root, Path *path, int flags)
807807
if (IsA(path, CustomPath))
808808
return false;
809809

810+
/*
811+
* If a bitmap scan's tlist is empty, keep it as-is. This may allow the
812+
* executor to skip heap page fetches, and in any case, the benefit of
813+
* using a physical tlist instead would be minimal.
814+
*/
815+
if (IsA(path, BitmapHeapPath) &&
816+
path->pathtarget->exprs == NIL)
817+
return false;
818+
810819
/*
811820
* Can't do it if any system columns or whole-row Vars are requested.
812821
* (This could possibly be fixed but would take some fragile assumptions

src/include/nodes/execnodes.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -507,7 +507,7 @@ typedef struct EState
507507
bool *es_epqTupleSet; /* true if EPQ tuple is provided */
508508
bool *es_epqScanDone; /* true if EPQ tuple has been fetched */
509509

510-
bool es_use_parallel_mode; /* can we use parallel workers? */
510+
bool es_use_parallel_mode; /* can we use parallel workers? */
511511

512512
/* The per-query shared memory area to use for parallel execution. */
513513
struct dsa_area *es_query_dsa;
@@ -1331,6 +1331,10 @@ typedef struct ParallelBitmapHeapState
13311331
* tbm bitmap obtained from child index scan(s)
13321332
* tbmiterator iterator for scanning current pages
13331333
* tbmres current-page data
1334+
* can_skip_fetch can we potentially skip tuple fetches in this scan?
1335+
* skip_fetch are we skipping tuple fetches on this page?
1336+
* vmbuffer buffer for visibility-map lookups
1337+
* pvmbuffer ditto, for prefetched pages
13341338
* exact_pages total number of exact pages retrieved
13351339
* lossy_pages total number of lossy pages retrieved
13361340
* prefetch_iterator iterator for prefetching ahead of current page
@@ -1351,6 +1355,10 @@ typedef struct BitmapHeapScanState
13511355
TIDBitmap *tbm;
13521356
TBMIterator *tbmiterator;
13531357
TBMIterateResult *tbmres;
1358+
bool can_skip_fetch;
1359+
bool skip_fetch;
1360+
Buffer vmbuffer;
1361+
Buffer pvmbuffer;
13541362
long exact_pages;
13551363
long lossy_pages;
13561364
TBMIterator *prefetch_iterator;

src/test/regress/expected/stats.out

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,12 +136,15 @@ SELECT count(*) FROM tenk2;
136136
(1 row)
137137

138138
-- do an indexscan
139+
-- make sure it is not a bitmap scan, which might skip fetching heap tuples
140+
SET enable_bitmapscan TO off;
139141
SELECT count(*) FROM tenk2 WHERE unique1 = 1;
140142
count
141143
-------
142144
1
143145
(1 row)
144146

147+
RESET enable_bitmapscan;
145148
-- We can't just call wait_for_stats() at this point, because we only
146149
-- transmit stats when the session goes idle, and we probably didn't
147150
-- transmit the last couple of counts yet thanks to the rate-limiting logic

src/test/regress/sql/stats.sql

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,10 @@ ROLLBACK;
138138
-- do a seqscan
139139
SELECT count(*) FROM tenk2;
140140
-- do an indexscan
141+
-- make sure it is not a bitmap scan, which might skip fetching heap tuples
142+
SET enable_bitmapscan TO off;
141143
SELECT count(*) FROM tenk2 WHERE unique1 = 1;
144+
RESET enable_bitmapscan;
142145

143146
-- We can't just call wait_for_stats() at this point, because we only
144147
-- transmit stats when the session goes idle, and we probably didn't

0 commit comments

Comments
 (0)