Skip to content

Commit bfe56cd

Browse files
Delay extraction of TIDBitmap per page offsets
Pages from the bitmap created by the TIDBitmap API can be exact or lossy. The TIDBitmap API extracts the tuple offsets from exact pages into an array for the convenience of the caller. This was done in tbm_private|shared_iterate() right after advancing the iterator. However, as long as tbm_private|shared_iterate() set a reference to the PagetableEntry in the TBMIterateResult, the offset extraction can be done later. Waiting to extract the tuple offsets has a few benefits. For the shared iterator case, it allows us to extract the offsets after dropping the shared iterator state lock, reducing time spent holding a contended lock. Separating the iteration step and extracting the offsets later also allows us to avoid extracting the offsets for prefetched blocks. Those offsets were never used, so the overhead of extracting and storing them was wasted. The real motivation for this change, however, is that future commits will make bitmap heap scan use the read stream API. This requires a TBMIterateResult per issued block. By removing the array of tuple offsets from the TBMIterateResult and only extracting the offsets when they are used, we reduce the memory required for per buffer data substantially. Suggested-by: Thomas Munro <thomas.munro@gmail.com> Reviewed-by: Thomas Munro <thomas.munro@gmail.com> Discussion: https://postgr.es/m/CA%2BhUKGLHbKP3jwJ6_%2BhnGi37Pw3BD5j2amjV3oSk7j-KyCnY7Q%40mail.gmail.com
1 parent b8778c4 commit bfe56cd

File tree

6 files changed

+87
-50
lines changed

6 files changed

+87
-50
lines changed

src/backend/access/gin/ginget.c

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,7 @@ startScanEntry(GinState *ginstate, GinScanEntry entry, Snapshot snapshot)
333333
entry->nlist = 0;
334334
entry->matchBitmap = NULL;
335335
entry->matchResult = NULL;
336+
entry->matchNtuples = -1;
336337
entry->reduceResult = false;
337338
entry->predictNumberResult = 0;
338339

@@ -828,7 +829,7 @@ entryGetItem(GinState *ginstate, GinScanEntry entry,
828829
*/
829830
while (entry->matchResult == NULL ||
830831
(!entry->matchResult->lossy &&
831-
entry->offset >= entry->matchResult->ntuples) ||
832+
entry->offset >= entry->matchNtuples) ||
832833
entry->matchResult->blockno < advancePastBlk ||
833834
(ItemPointerIsLossyPage(&advancePast) &&
834835
entry->matchResult->blockno == advancePastBlk))
@@ -845,9 +846,15 @@ entryGetItem(GinState *ginstate, GinScanEntry entry,
845846
break;
846847
}
847848

849+
/* Exact pages need their tuple offsets extracted. */
850+
if (!entry->matchResult->lossy)
851+
entry->matchNtuples = tbm_extract_page_tuple(entry->matchResult,
852+
entry->matchOffsets,
853+
TBM_MAX_TUPLES_PER_PAGE);
854+
848855
/*
849856
* Reset counter to the beginning of entry->matchResult. Note:
850-
* entry->offset is still greater than matchResult->ntuples if
857+
* entry->offset is still greater than entry->matchNtuples if
851858
* matchResult is lossy. So, on next call we will get next
852859
* result from TIDBitmap.
853860
*/
@@ -874,32 +881,35 @@ entryGetItem(GinState *ginstate, GinScanEntry entry,
874881
}
875882

876883
/*
877-
* Not a lossy page. Skip over any offsets <= advancePast, and
878-
* return that.
884+
* Not a lossy page. If tuple offsets were extracted,
885+
* entry->matchNtuples must be > -1
879886
*/
887+
Assert(entry->matchNtuples > -1);
888+
889+
/* Skip over any offsets <= advancePast, and return that. */
880890
if (entry->matchResult->blockno == advancePastBlk)
881891
{
882-
Assert(entry->matchResult->ntuples > 0);
892+
Assert(entry->matchNtuples > 0);
883893

884894
/*
885895
* First, do a quick check against the last offset on the
886896
* page. If that's > advancePast, so are all the other
887897
* offsets, so just go back to the top to get the next page.
888898
*/
889-
if (entry->matchResult->offsets[entry->matchResult->ntuples - 1] <= advancePastOff)
899+
if (entry->matchOffsets[entry->matchNtuples - 1] <= advancePastOff)
890900
{
891-
entry->offset = entry->matchResult->ntuples;
901+
entry->offset = entry->matchNtuples;
892902
continue;
893903
}
894904

895905
/* Otherwise scan to find the first item > advancePast */
896-
while (entry->matchResult->offsets[entry->offset] <= advancePastOff)
906+
while (entry->matchOffsets[entry->offset] <= advancePastOff)
897907
entry->offset++;
898908
}
899909

900910
ItemPointerSet(&entry->curItem,
901911
entry->matchResult->blockno,
902-
entry->matchResult->offsets[entry->offset]);
912+
entry->matchOffsets[entry->offset]);
903913
entry->offset++;
904914

905915
/* Done unless we need to reduce the result */

src/backend/access/gin/ginscan.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ ginFillScanEntry(GinScanOpaque so, OffsetNumber attnum,
107107
scanEntry->matchBitmap = NULL;
108108
scanEntry->matchIterator = NULL;
109109
scanEntry->matchResult = NULL;
110+
scanEntry->matchNtuples = -1;
110111
scanEntry->list = NULL;
111112
scanEntry->nlist = 0;
112113
scanEntry->offset = InvalidOffsetNumber;

src/backend/access/heap/heapam_handler.c

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2127,6 +2127,8 @@ heapam_scan_bitmap_next_block(TableScanDesc scan,
21272127
Snapshot snapshot;
21282128
int ntup;
21292129
TBMIterateResult *tbmres;
2130+
OffsetNumber offsets[TBM_MAX_TUPLES_PER_PAGE];
2131+
int noffsets = -1;
21302132

21312133
Assert(scan->rs_flags & SO_TYPE_BITMAPSCAN);
21322134

@@ -2145,6 +2147,11 @@ heapam_scan_bitmap_next_block(TableScanDesc scan,
21452147
if (tbmres == NULL)
21462148
return false;
21472149

2150+
/* Exact pages need their tuple offsets extracted. */
2151+
if (!tbmres->lossy)
2152+
noffsets = tbm_extract_page_tuple(tbmres, offsets,
2153+
TBM_MAX_TUPLES_PER_PAGE);
2154+
21482155
/*
21492156
* Ignore any claimed entries past what we think is the end of the
21502157
* relation. It may have been extended after the start of our scan (we
@@ -2172,8 +2179,9 @@ heapam_scan_bitmap_next_block(TableScanDesc scan,
21722179
/* can't be lossy in the skip_fetch case */
21732180
Assert(!tbmres->lossy);
21742181
Assert(bscan->rs_empty_tuples_pending >= 0);
2182+
Assert(noffsets > -1);
21752183

2176-
bscan->rs_empty_tuples_pending += tbmres->ntuples;
2184+
bscan->rs_empty_tuples_pending += noffsets;
21772185

21782186
return true;
21792187
}
@@ -2216,9 +2224,12 @@ heapam_scan_bitmap_next_block(TableScanDesc scan,
22162224
*/
22172225
int curslot;
22182226

2219-
for (curslot = 0; curslot < tbmres->ntuples; curslot++)
2227+
/* We must have extracted the tuple offsets by now */
2228+
Assert(noffsets > -1);
2229+
2230+
for (curslot = 0; curslot < noffsets; curslot++)
22202231
{
2221-
OffsetNumber offnum = tbmres->offsets[curslot];
2232+
OffsetNumber offnum = offsets[curslot];
22222233
ItemPointerData tid;
22232234
HeapTupleData heapTuple;
22242235

src/backend/nodes/tidbitmap.c

Lines changed: 24 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -40,22 +40,13 @@
4040

4141
#include <limits.h>
4242

43-
#include "access/htup_details.h"
4443
#include "common/hashfn.h"
4544
#include "common/int.h"
4645
#include "nodes/bitmapset.h"
4746
#include "nodes/tidbitmap.h"
4847
#include "storage/lwlock.h"
4948
#include "utils/dsa.h"
5049

51-
/*
52-
* The maximum number of tuples per page is not large (typically 256 with
53-
* 8K pages, or 1024 with 32K pages). So there's not much point in making
54-
* the per-page bitmaps variable size. We just legislate that the size
55-
* is this:
56-
*/
57-
#define MAX_TUPLES_PER_PAGE MaxHeapTuplesPerPage
58-
5950
/*
6051
* When we have to switch over to lossy storage, we use a data structure
6152
* with one bit per page, where all pages having the same number DIV
@@ -67,7 +58,7 @@
6758
* table, using identical data structures. (This is because the memory
6859
* management for hashtables doesn't easily/efficiently allow space to be
6960
* transferred easily from one hashtable to another.) Therefore it's best
70-
* if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
61+
* if PAGES_PER_CHUNK is the same as TBM_MAX_TUPLES_PER_PAGE, or at least not
7162
* too different. But we also want PAGES_PER_CHUNK to be a power of 2 to
7263
* avoid expensive integer remainder operations. So, define it like this:
7364
*/
@@ -79,7 +70,7 @@
7970
#define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)
8071

8172
/* number of active words for an exact page: */
82-
#define WORDS_PER_PAGE ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
73+
#define WORDS_PER_PAGE ((TBM_MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
8374
/* number of active words for a lossy chunk: */
8475
#define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
8576

@@ -181,7 +172,7 @@ struct TBMPrivateIterator
181172
int spageptr; /* next spages index */
182173
int schunkptr; /* next schunks index */
183174
int schunkbit; /* next bit to check in current schunk */
184-
TBMIterateResult output; /* MUST BE LAST (because variable-size) */
175+
TBMIterateResult output;
185176
};
186177

187178
/*
@@ -222,7 +213,7 @@ struct TBMSharedIterator
222213
PTEntryArray *ptbase; /* pagetable element array */
223214
PTIterationArray *ptpages; /* sorted exact page index list */
224215
PTIterationArray *ptchunks; /* sorted lossy page index list */
225-
TBMIterateResult output; /* MUST BE LAST (because variable-size) */
216+
TBMIterateResult output;
226217
};
227218

228219
/* Local function prototypes */
@@ -390,7 +381,7 @@ tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
390381
bitnum;
391382

392383
/* safety check to ensure we don't overrun bit array bounds */
393-
if (off < 1 || off > MAX_TUPLES_PER_PAGE)
384+
if (off < 1 || off > TBM_MAX_TUPLES_PER_PAGE)
394385
elog(ERROR, "tuple offset out of range: %u", off);
395386

396387
/*
@@ -696,9 +687,7 @@ tbm_begin_private_iterate(TIDBitmap *tbm)
696687
* Create the TBMPrivateIterator struct, with enough trailing space to
697688
* serve the needs of the TBMIterateResult sub-struct.
698689
*/
699-
iterator = (TBMPrivateIterator *) palloc(sizeof(TBMPrivateIterator) +
700-
MAX_TUPLES_PER_PAGE *
701-
sizeof(OffsetNumber));
690+
iterator = (TBMPrivateIterator *) palloc(sizeof(TBMPrivateIterator));
702691
iterator->tbm = tbm;
703692

704693
/*
@@ -906,11 +895,16 @@ tbm_prepare_shared_iterate(TIDBitmap *tbm)
906895
/*
907896
* tbm_extract_page_tuple - extract the tuple offsets from a page
908897
*
909-
* The extracted offsets are stored into TBMIterateResult.
898+
* Returns the number of offsets it filled in if <= max_offsets. Otherwise,
899+
* fills in as many offsets as fit and returns the total number of offsets in
900+
* the page.
910901
*/
911-
static inline int
912-
tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
902+
int
903+
tbm_extract_page_tuple(TBMIterateResult *iteritem,
904+
OffsetNumber *offsets,
905+
uint32 max_offsets)
913906
{
907+
PagetableEntry *page = iteritem->internal_page;
914908
int wordnum;
915909
int ntuples = 0;
916910

@@ -925,7 +919,11 @@ tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
925919
while (w != 0)
926920
{
927921
if (w & 1)
928-
output->offsets[ntuples++] = (OffsetNumber) off;
922+
{
923+
if (ntuples < max_offsets)
924+
offsets[ntuples] = (OffsetNumber) off;
925+
ntuples++;
926+
}
929927
off++;
930928
w >>= 1;
931929
}
@@ -1012,9 +1010,9 @@ tbm_private_iterate(TBMPrivateIterator *iterator)
10121010
{
10131011
/* Return a lossy page indicator from the chunk */
10141012
output->blockno = chunk_blockno;
1015-
output->ntuples = -1;
10161013
output->lossy = true;
10171014
output->recheck = true;
1015+
output->internal_page = NULL;
10181016
iterator->schunkbit++;
10191017
return output;
10201018
}
@@ -1023,18 +1021,15 @@ tbm_private_iterate(TBMPrivateIterator *iterator)
10231021
if (iterator->spageptr < tbm->npages)
10241022
{
10251023
PagetableEntry *page;
1026-
int ntuples;
10271024

10281025
/* In TBM_ONE_PAGE state, we don't allocate an spages[] array */
10291026
if (tbm->status == TBM_ONE_PAGE)
10301027
page = &tbm->entry1;
10311028
else
10321029
page = tbm->spages[iterator->spageptr];
10331030

1034-
/* scan bitmap to extract individual offset numbers */
1035-
ntuples = tbm_extract_page_tuple(page, output);
1031+
output->internal_page = page;
10361032
output->blockno = page->blockno;
1037-
output->ntuples = ntuples;
10381033
output->lossy = false;
10391034
output->recheck = page->recheck;
10401035
iterator->spageptr++;
@@ -1107,9 +1102,9 @@ tbm_shared_iterate(TBMSharedIterator *iterator)
11071102
{
11081103
/* Return a lossy page indicator from the chunk */
11091104
output->blockno = chunk_blockno;
1110-
output->ntuples = -1;
11111105
output->lossy = true;
11121106
output->recheck = true;
1107+
output->internal_page = NULL;
11131108
istate->schunkbit++;
11141109

11151110
LWLockRelease(&istate->lock);
@@ -1120,12 +1115,9 @@ tbm_shared_iterate(TBMSharedIterator *iterator)
11201115
if (istate->spageptr < istate->npages)
11211116
{
11221117
PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
1123-
int ntuples;
11241118

1125-
/* scan bitmap to extract individual offset numbers */
1126-
ntuples = tbm_extract_page_tuple(page, output);
1119+
output->internal_page = page;
11271120
output->blockno = page->blockno;
1128-
output->ntuples = ntuples;
11291121
output->lossy = false;
11301122
output->recheck = page->recheck;
11311123
istate->spageptr++;
@@ -1473,8 +1465,7 @@ tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
14731465
* Create the TBMSharedIterator struct, with enough trailing space to
14741466
* serve the needs of the TBMIterateResult sub-struct.
14751467
*/
1476-
iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) +
1477-
MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
1468+
iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator));
14781469

14791470
istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);
14801471

src/include/access/gin_private.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,8 @@ typedef struct GinScanEntryData
354354
TIDBitmap *matchBitmap;
355355
TBMPrivateIterator *matchIterator;
356356
TBMIterateResult *matchResult;
357+
OffsetNumber matchOffsets[TBM_MAX_TUPLES_PER_PAGE];
358+
int matchNtuples;
357359

358360
/* used for Posting list and one page in Posting tree */
359361
ItemPointerData *list;

src/include/nodes/tidbitmap.h

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,17 @@
2222
#ifndef TIDBITMAP_H
2323
#define TIDBITMAP_H
2424

25+
#include "access/htup_details.h"
2526
#include "storage/itemptr.h"
2627
#include "utils/dsa.h"
2728

29+
/*
30+
* The maximum number of tuples per page is not large (typically 256 with
31+
* 8K pages, or 1024 with 32K pages). So there's not much point in making
32+
* the per-page bitmaps variable size. We just legislate that the size
33+
* is this:
34+
*/
35+
#define TBM_MAX_TUPLES_PER_PAGE MaxHeapTuplesPerPage
2836

2937
/*
3038
* Actual bitmap representation is private to tidbitmap.c. Callers can
@@ -53,12 +61,22 @@ typedef struct TBMIterator
5361
/* Result structure for tbm_iterate */
5462
typedef struct TBMIterateResult
5563
{
56-
BlockNumber blockno; /* page number containing tuples */
57-
int ntuples; /* -1 when lossy */
64+
BlockNumber blockno; /* block number containing tuples */
65+
5866
bool lossy;
59-
bool recheck; /* should the tuples be rechecked? */
60-
/* Note: recheck is always true if lossy */
61-
OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER];
67+
68+
/*
69+
* Whether or not the tuples should be rechecked. This is always true if
70+
* the page is lossy but may also be true if the query requires recheck.
71+
*/
72+
bool recheck;
73+
74+
/*
75+
* Pointer to the page containing the bitmap for this block. It is a void *
76+
* to avoid exposing the details of the tidbitmap PagetableEntry to API
77+
* users.
78+
*/
79+
void *internal_page;
6280
} TBMIterateResult;
6381

6482
/* function prototypes in nodes/tidbitmap.c */
@@ -75,6 +93,10 @@ extern void tbm_add_page(TIDBitmap *tbm, BlockNumber pageno);
7593
extern void tbm_union(TIDBitmap *a, const TIDBitmap *b);
7694
extern void tbm_intersect(TIDBitmap *a, const TIDBitmap *b);
7795

96+
extern int tbm_extract_page_tuple(TBMIterateResult *iteritem,
97+
OffsetNumber *offsets,
98+
uint32 max_offsets);
99+
78100
extern bool tbm_is_empty(const TIDBitmap *tbm);
79101

80102
extern TBMPrivateIterator *tbm_begin_private_iterate(TIDBitmap *tbm);

0 commit comments

Comments
 (0)