Skip to content

Commit 0fe5f64

Browse files
committed
Teach radix tree to embed values at runtime
Previously, the decision to store values in leaves or within the child pointer was made at compile time, with variable length values using leaves by necessity. This commit allows introspecting the length of variable length values at runtime for that decision. This requires the ability to tell whether the last-level child pointer is actually a value, so we use a pointer tag in the lowest level bit. Use this in TID store. This entails adding a byte to the header to reserve space for the tag. Commit f35bd9b stores up to three offsets within the header with no bitmap, and now the header can be embedded as above. This reduces worst-case memory usage when TIDs are sparse. Reviewed (in an earlier version) by Masahiko Sawada Discussion: https://postgr.es/m/CANWCAZYw+_KAaUNruhJfE=h6WgtBKeDG32St8vBJBEY82bGVRQ@mail.gmail.com Discussion: https://postgr.es/m/CAD21AoBci3Hujzijubomo1tdwH3XtQ9F89cTNQ4bsQijOmqnEw@mail.gmail.com
1 parent f35bd9b commit 0fe5f64

File tree

2 files changed

+89
-24
lines changed

2 files changed

+89
-24
lines changed

src/backend/access/common/tidstore.c

Lines changed: 57 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -35,27 +35,58 @@
3535
#define WORDS_PER_PAGE(n) ((n) / BITS_PER_BITMAPWORD + 1)
3636

3737
/* number of offsets we can store in the header of a BlocktableEntry */
38-
#define NUM_FULL_OFFSETS ((sizeof(bitmapword) - sizeof(uint16)) / sizeof(OffsetNumber))
38+
#define NUM_FULL_OFFSETS ((sizeof(uintptr_t) - sizeof(uint8) - sizeof(int8)) / sizeof(OffsetNumber))
3939

4040
/*
4141
* This is named similarly to PagetableEntry in tidbitmap.c
4242
* because the two have a similar function.
4343
*/
4444
typedef struct BlocktableEntry
4545
{
46-
uint16 nwords;
47-
48-
/*
49-
* We can store a small number of offsets here to avoid wasting space with
50-
* a sparse bitmap.
51-
*/
52-
OffsetNumber full_offsets[NUM_FULL_OFFSETS];
46+
union
47+
{
48+
struct
49+
{
50+
#ifndef WORDS_BIGENDIAN
51+
/*
52+
* We need to position this member so that the backing radix tree
53+
* can use the lowest bit for a pointer tag. In particular, it
54+
* must be placed within 'header' so that it corresponds to the
55+
* lowest byte in 'ptr'. We position 'nwords' along with it to
56+
* avoid struct padding.
57+
*/
58+
uint8 flags;
59+
60+
int8 nwords;
61+
#endif
62+
63+
/*
64+
* We can store a small number of offsets here to avoid wasting
65+
* space with a sparse bitmap.
66+
*/
67+
OffsetNumber full_offsets[NUM_FULL_OFFSETS];
68+
69+
#ifdef WORDS_BIGENDIAN
70+
int8 nwords;
71+
uint8 flags;
72+
#endif
73+
};
74+
uintptr_t ptr;
75+
} header;
5376

5477
bitmapword words[FLEXIBLE_ARRAY_MEMBER];
5578
} BlocktableEntry;
79+
80+
/*
81+
* The type of 'nwords' limits the max number of words in the 'words' array.
82+
* This computes the max offset we can actually store in the bitmap. In
83+
* practice, it's almost always the same as MaxOffsetNumber.
84+
*/
85+
#define MAX_OFFSET_IN_BITMAP Min(BITS_PER_BITMAPWORD * PG_INT8_MAX - 1, MaxOffsetNumber)
86+
5687
#define MaxBlocktableEntrySize \
5788
offsetof(BlocktableEntry, words) + \
58-
(sizeof(bitmapword) * WORDS_PER_PAGE(MaxOffsetNumber))
89+
(sizeof(bitmapword) * WORDS_PER_PAGE(MAX_OFFSET_IN_BITMAP))
5990

6091
#define RT_PREFIX local_ts
6192
#define RT_SCOPE static
@@ -64,7 +95,8 @@ typedef struct BlocktableEntry
6495
#define RT_VALUE_TYPE BlocktableEntry
6596
#define RT_VARLEN_VALUE_SIZE(page) \
6697
(offsetof(BlocktableEntry, words) + \
67-
sizeof(bitmapword) * (page)->nwords)
98+
sizeof(bitmapword) * (page)->header.nwords)
99+
#define RT_RUNTIME_EMBEDDABLE_VALUE
68100
#include "lib/radixtree.h"
69101

70102
#define RT_PREFIX shared_ts
@@ -75,7 +107,8 @@ typedef struct BlocktableEntry
75107
#define RT_VALUE_TYPE BlocktableEntry
76108
#define RT_VARLEN_VALUE_SIZE(page) \
77109
(offsetof(BlocktableEntry, words) + \
78-
sizeof(bitmapword) * (page)->nwords)
110+
sizeof(bitmapword) * (page)->header.nwords)
111+
#define RT_RUNTIME_EMBEDDABLE_VALUE
79112
#include "lib/radixtree.h"
80113

81114
/* Per-backend state for a TidStore */
@@ -350,13 +383,13 @@ TidStoreSetBlockOffsets(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets,
350383
OffsetNumber off = offsets[i];
351384

352385
/* safety check to ensure we don't overrun bit array bounds */
353-
if (!OffsetNumberIsValid(off))
386+
if (off == InvalidOffsetNumber || off > MAX_OFFSET_IN_BITMAP)
354387
elog(ERROR, "tuple offset out of range: %u", off);
355388

356-
page->full_offsets[i] = off;
389+
page->header.full_offsets[i] = off;
357390
}
358391

359-
page->nwords = 0;
392+
page->header.nwords = 0;
360393
}
361394
else
362395
{
@@ -371,7 +404,7 @@ TidStoreSetBlockOffsets(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets,
371404
OffsetNumber off = offsets[idx];
372405

373406
/* safety check to ensure we don't overrun bit array bounds */
374-
if (!OffsetNumberIsValid(off))
407+
if (off == InvalidOffsetNumber || off > MAX_OFFSET_IN_BITMAP)
375408
elog(ERROR, "tuple offset out of range: %u", off);
376409

377410
if (off >= next_word_threshold)
@@ -385,8 +418,8 @@ TidStoreSetBlockOffsets(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets,
385418
page->words[wordnum] = word;
386419
}
387420

388-
page->nwords = wordnum;
389-
Assert(page->nwords == WORDS_PER_PAGE(offsets[num_offsets - 1]));
421+
page->header.nwords = wordnum;
422+
Assert(page->header.nwords == WORDS_PER_PAGE(offsets[num_offsets - 1]));
390423
}
391424

392425
if (TidStoreIsShared(ts))
@@ -414,12 +447,12 @@ TidStoreIsMember(TidStore *ts, ItemPointer tid)
414447
if (page == NULL)
415448
return false;
416449

417-
if (page->nwords == 0)
450+
if (page->header.nwords == 0)
418451
{
419452
/* we have offsets in the header */
420453
for (int i = 0; i < NUM_FULL_OFFSETS; i++)
421454
{
422-
if (page->full_offsets[i] == off)
455+
if (page->header.full_offsets[i] == off)
423456
return true;
424457
}
425458
return false;
@@ -430,7 +463,7 @@ TidStoreIsMember(TidStore *ts, ItemPointer tid)
430463
bitnum = BITNUM(off);
431464

432465
/* no bitmap for the off */
433-
if (wordnum >= page->nwords)
466+
if (wordnum >= page->header.nwords)
434467
return false;
435468

436469
return (page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0;
@@ -554,18 +587,18 @@ tidstore_iter_extract_tids(TidStoreIter *iter, BlockNumber blkno,
554587
result->num_offsets = 0;
555588
result->blkno = blkno;
556589

557-
if (page->nwords == 0)
590+
if (page->header.nwords == 0)
558591
{
559592
/* we have offsets in the header */
560593
for (int i = 0; i < NUM_FULL_OFFSETS; i++)
561594
{
562-
if (page->full_offsets[i] != InvalidOffsetNumber)
563-
result->offsets[result->num_offsets++] = page->full_offsets[i];
595+
if (page->header.full_offsets[i] != InvalidOffsetNumber)
596+
result->offsets[result->num_offsets++] = page->header.full_offsets[i];
564597
}
565598
}
566599
else
567600
{
568-
for (wordnum = 0; wordnum < page->nwords; wordnum++)
601+
for (wordnum = 0; wordnum < page->header.nwords; wordnum++)
569602
{
570603
bitmapword w = page->words[wordnum];
571604
int off = wordnum * BITS_PER_BITMAPWORD;

src/include/lib/radixtree.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,10 @@
105105
* involving a pointer to the value type, to calculate size.
106106
* NOTE: implies that the value is in fact variable-length,
107107
* so do not set for fixed-length values.
108+
* - RT_RUNTIME_EMBEDDABLE_VALUE - for variable length values, allows
109+
* storing the value in a child pointer slot, rather than as a single-
110+
* value leaf, if small enough. This requires that the value, when
111+
* read as a child pointer, can be tagged in the lowest bit.
108112
*
109113
* Optional parameters:
110114
* - RT_SHMEM - if defined, the radix tree is created in the DSA area
@@ -437,7 +441,13 @@ static inline bool
437441
RT_VALUE_IS_EMBEDDABLE(RT_VALUE_TYPE * value_p)
438442
{
439443
#ifdef RT_VARLEN_VALUE_SIZE
444+
445+
#ifdef RT_RUNTIME_EMBEDDABLE_VALUE
446+
return RT_GET_VALUE_SIZE(value_p) <= sizeof(RT_PTR_ALLOC);
447+
#else
440448
return false;
449+
#endif
450+
441451
#else
442452
return RT_GET_VALUE_SIZE(value_p) <= sizeof(RT_PTR_ALLOC);
443453
#endif
@@ -451,7 +461,19 @@ static inline bool
451461
RT_CHILDPTR_IS_VALUE(RT_PTR_ALLOC child)
452462
{
453463
#ifdef RT_VARLEN_VALUE_SIZE
464+
465+
#ifdef RT_RUNTIME_EMBEDDABLE_VALUE
466+
/* check for pointer tag */
467+
#ifdef RT_SHMEM
468+
return child & 1;
469+
#else
470+
return ((uintptr_t) child) & 1;
471+
#endif
472+
473+
#else
454474
return false;
475+
#endif
476+
455477
#else
456478
return sizeof(RT_VALUE_TYPE) <= sizeof(RT_PTR_ALLOC);
457479
#endif
@@ -1729,6 +1751,15 @@ RT_SET(RT_RADIX_TREE * tree, uint64 key, RT_VALUE_TYPE * value_p)
17291751
{
17301752
/* store value directly in child pointer slot */
17311753
memcpy(slot, value_p, value_sz);
1754+
1755+
#ifdef RT_RUNTIME_EMBEDDABLE_VALUE
1756+
/* tag child pointer */
1757+
#ifdef RT_SHMEM
1758+
*slot |= 1;
1759+
#else
1760+
*((uintptr_t *) slot) |= 1;
1761+
#endif
1762+
#endif
17321763
}
17331764
else
17341765
{
@@ -2888,6 +2919,7 @@ RT_DUMP_NODE(RT_NODE * node)
28882919
#undef RT_DEFINE
28892920
#undef RT_VALUE_TYPE
28902921
#undef RT_VARLEN_VALUE_SIZE
2922+
#undef RT_RUNTIME_EMBEDDABLE_VALUE
28912923
#undef RT_SHMEM
28922924
#undef RT_USE_DELETE
28932925
#undef RT_DEBUG

0 commit comments

Comments
 (0)