Skip to content

Commit 8224de4

Browse files
committed
Indexes with INCLUDE columns and their support in B-tree
This patch introduces INCLUDE clause to index definition. This clause specifies a list of columns which will be included as a non-key part in the index. The INCLUDE columns exist solely to allow more queries to benefit from index-only scans. Also, such columns don't need to have appropriate operator classes. Expressions are not supported as INCLUDE columns since they cannot be used in index-only scans. Index access methods supporting INCLUDE are indicated by amcaninclude flag in IndexAmRoutine. For now, only B-tree indexes support INCLUDE clause. In B-tree indexes INCLUDE columns are truncated from pivot index tuples (tuples located in non-leaf pages and high keys). Therefore, B-tree indexes now might have variable number of attributes. This patch also provides generic facility to support that: pivot tuples contain number of their attributes in t_tid.ip_posid. Free 13th bit of t_info is used for indicating that. This facility will simplify further support of index suffix truncation. The changes of above are backward-compatible, pg_upgrade doesn't need special handling of B-tree indexes for that. Bump catalog version Author: Anastasia Lubennikova with contribition by Alexander Korotkov and me Reviewed by: Peter Geoghegan, Tomas Vondra, Antonin Houska, Jeff Janes, David Rowley, Alexander Korotkov Discussion: https://www.postgresql.org/message-id/flat/56168952.4010101@postgrespro.ru
1 parent 01bb851 commit 8224de4

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

89 files changed

+2115
-470
lines changed

contrib/amcheck/expected/check_btree.out

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
11
-- minimal test, basically just verifying that amcheck
22
CREATE TABLE bttest_a(id int8);
33
CREATE TABLE bttest_b(id int8);
4+
CREATE TABLE bttest_multi(id int8, data int8);
45
INSERT INTO bttest_a SELECT * FROM generate_series(1, 100000);
56
INSERT INTO bttest_b SELECT * FROM generate_series(100000, 1, -1);
7+
INSERT INTO bttest_multi SELECT i, i%2 FROM generate_series(1, 100000) as i;
68
CREATE INDEX bttest_a_idx ON bttest_a USING btree (id);
79
CREATE INDEX bttest_b_idx ON bttest_b USING btree (id);
10+
CREATE UNIQUE INDEX bttest_multi_idx ON bttest_multi
11+
USING btree (id) INCLUDE (data);
812
CREATE ROLE bttest_role;
913
-- verify permissions are checked (error due to function not callable)
1014
SET ROLE bttest_role;
@@ -93,8 +97,50 @@ WHERE relation = ANY(ARRAY['bttest_a', 'bttest_a_idx', 'bttest_b', 'bttest_b_idx
9397
(0 rows)
9498

9599
COMMIT;
100+
-- normal check outside of xact for index with included columns
101+
SELECT bt_index_check('bttest_multi_idx');
102+
bt_index_check
103+
----------------
104+
105+
(1 row)
106+
107+
-- more expansive test for index with included columns
108+
SELECT bt_index_parent_check('bttest_multi_idx', true);
109+
bt_index_parent_check
110+
-----------------------
111+
112+
(1 row)
113+
114+
SELECT bt_index_parent_check('bttest_multi_idx', true);
115+
bt_index_parent_check
116+
-----------------------
117+
118+
(1 row)
119+
120+
-- repeat same checks with index made by insertions
121+
TRUNCATE bttest_multi;
122+
INSERT INTO bttest_multi SELECT i, i%2 FROM generate_series(1, 100000) as i;
123+
SELECT bt_index_check('bttest_multi_idx');
124+
bt_index_check
125+
----------------
126+
127+
(1 row)
128+
129+
SELECT bt_index_parent_check('bttest_multi_idx', true);
130+
bt_index_parent_check
131+
-----------------------
132+
133+
(1 row)
134+
135+
SELECT bt_index_parent_check('bttest_multi_idx', true);
136+
bt_index_parent_check
137+
-----------------------
138+
139+
(1 row)
140+
96141
-- cleanup
97142
DROP TABLE bttest_a;
98143
DROP TABLE bttest_b;
144+
DROP TABLE bttest_multi;
99145
DROP OWNED BY bttest_role; -- permissions
100146
DROP ROLE bttest_role;

contrib/amcheck/sql/check_btree.sql

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
11
-- minimal test, basically just verifying that amcheck
22
CREATE TABLE bttest_a(id int8);
33
CREATE TABLE bttest_b(id int8);
4+
CREATE TABLE bttest_multi(id int8, data int8);
45

56
INSERT INTO bttest_a SELECT * FROM generate_series(1, 100000);
67
INSERT INTO bttest_b SELECT * FROM generate_series(100000, 1, -1);
8+
INSERT INTO bttest_multi SELECT i, i%2 FROM generate_series(1, 100000) as i;
79

810
CREATE INDEX bttest_a_idx ON bttest_a USING btree (id);
911
CREATE INDEX bttest_b_idx ON bttest_b USING btree (id);
12+
CREATE UNIQUE INDEX bttest_multi_idx ON bttest_multi
13+
USING btree (id) INCLUDE (data);
1014

1115
CREATE ROLE bttest_role;
1216

@@ -57,8 +61,23 @@ WHERE relation = ANY(ARRAY['bttest_a', 'bttest_a_idx', 'bttest_b', 'bttest_b_idx
5761
AND pid = pg_backend_pid();
5862
COMMIT;
5963

64+
-- normal check outside of xact for index with included columns
65+
SELECT bt_index_check('bttest_multi_idx');
66+
-- more expansive test for index with included columns
67+
SELECT bt_index_parent_check('bttest_multi_idx', true);
68+
SELECT bt_index_parent_check('bttest_multi_idx', true);
69+
70+
-- repeat same checks with index made by insertions
71+
TRUNCATE bttest_multi;
72+
INSERT INTO bttest_multi SELECT i, i%2 FROM generate_series(1, 100000) as i;
73+
SELECT bt_index_check('bttest_multi_idx');
74+
SELECT bt_index_parent_check('bttest_multi_idx', true);
75+
SELECT bt_index_parent_check('bttest_multi_idx', true);
76+
77+
6078
-- cleanup
6179
DROP TABLE bttest_a;
6280
DROP TABLE bttest_b;
81+
DROP TABLE bttest_multi;
6382
DROP OWNED BY bttest_role; -- permissions
6483
DROP ROLE bttest_role;

contrib/amcheck/verify_nbtree.c

Lines changed: 82 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -617,7 +617,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
617617
/* Internal page -- downlink gets leftmost on next level */
618618
itemid = PageGetItemId(state->target, P_FIRSTDATAKEY(opaque));
619619
itup = (IndexTuple) PageGetItem(state->target, itemid);
620-
nextleveldown.leftmost = ItemPointerGetBlockNumber(&(itup->t_tid));
620+
nextleveldown.leftmost = ItemPointerGetBlockNumberNoCheck(&(itup->t_tid));
621621
nextleveldown.level = opaque->btpo.level - 1;
622622
}
623623
else
@@ -722,6 +722,39 @@ bt_target_page_check(BtreeCheckState *state)
722722
elog(DEBUG2, "verifying %u items on %s block %u", max,
723723
P_ISLEAF(topaque) ? "leaf" : "internal", state->targetblock);
724724

725+
726+
/* Check the number of attributes in high key if any */
727+
if (!P_RIGHTMOST(topaque))
728+
{
729+
if (!_bt_check_natts(state->rel, state->target, P_HIKEY))
730+
{
731+
ItemId itemid;
732+
IndexTuple itup;
733+
char *itid,
734+
*htid;
735+
736+
itemid = PageGetItemId(state->target, P_HIKEY);
737+
itup = (IndexTuple) PageGetItem(state->target, itemid);
738+
itid = psprintf("(%u,%u)", state->targetblock, P_HIKEY);
739+
htid = psprintf("(%u,%u)",
740+
ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
741+
ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
742+
743+
ereport(ERROR,
744+
(errcode(ERRCODE_INDEX_CORRUPTED),
745+
errmsg("wrong number of index tuple attributes for index \"%s\"",
746+
RelationGetRelationName(state->rel)),
747+
errdetail_internal("Index tid=%s natts=%u points to %s tid=%s page lsn=%X/%X.",
748+
itid,
749+
BTreeTupGetNAtts(itup, state->rel),
750+
P_ISLEAF(topaque) ? "heap" : "index",
751+
htid,
752+
(uint32) (state->targetlsn >> 32),
753+
(uint32) state->targetlsn)));
754+
}
755+
}
756+
757+
725758
/*
726759
* Loop over page items, starting from first non-highkey item, not high
727760
* key (if any). Also, immediately skip "negative infinity" real item (if
@@ -760,6 +793,30 @@ bt_target_page_check(BtreeCheckState *state)
760793
(uint32) state->targetlsn),
761794
errhint("This could be a torn page problem")));
762795

796+
/* Check the number of index tuple attributes */
797+
if (!_bt_check_natts(state->rel, state->target, offset))
798+
{
799+
char *itid,
800+
*htid;
801+
802+
itid = psprintf("(%u,%u)", state->targetblock, offset);
803+
htid = psprintf("(%u,%u)",
804+
ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
805+
ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
806+
807+
ereport(ERROR,
808+
(errcode(ERRCODE_INDEX_CORRUPTED),
809+
errmsg("wrong number of index tuple attributes for index \"%s\"",
810+
RelationGetRelationName(state->rel)),
811+
errdetail_internal("Index tid=%s natts=%u points to %s tid=%s page lsn=%X/%X.",
812+
itid,
813+
BTreeTupGetNAtts(itup, state->rel),
814+
P_ISLEAF(topaque) ? "heap" : "index",
815+
htid,
816+
(uint32) (state->targetlsn >> 32),
817+
(uint32) state->targetlsn)));
818+
}
819+
763820
/*
764821
* Don't try to generate scankey using "negative infinity" garbage
765822
* data on internal pages
@@ -802,8 +859,8 @@ bt_target_page_check(BtreeCheckState *state)
802859

803860
itid = psprintf("(%u,%u)", state->targetblock, offset);
804861
htid = psprintf("(%u,%u)",
805-
ItemPointerGetBlockNumber(&(itup->t_tid)),
806-
ItemPointerGetOffsetNumber(&(itup->t_tid)));
862+
ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
863+
ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
807864

808865
ereport(ERROR,
809866
(errcode(ERRCODE_INDEX_CORRUPTED),
@@ -834,17 +891,17 @@ bt_target_page_check(BtreeCheckState *state)
834891

835892
itid = psprintf("(%u,%u)", state->targetblock, offset);
836893
htid = psprintf("(%u,%u)",
837-
ItemPointerGetBlockNumber(&(itup->t_tid)),
838-
ItemPointerGetOffsetNumber(&(itup->t_tid)));
894+
ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
895+
ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
839896
nitid = psprintf("(%u,%u)", state->targetblock,
840897
OffsetNumberNext(offset));
841898

842899
/* Reuse itup to get pointed-to heap location of second item */
843900
itemid = PageGetItemId(state->target, OffsetNumberNext(offset));
844901
itup = (IndexTuple) PageGetItem(state->target, itemid);
845902
nhtid = psprintf("(%u,%u)",
846-
ItemPointerGetBlockNumber(&(itup->t_tid)),
847-
ItemPointerGetOffsetNumber(&(itup->t_tid)));
903+
ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
904+
ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
848905

849906
ereport(ERROR,
850907
(errcode(ERRCODE_INDEX_CORRUPTED),
@@ -932,7 +989,7 @@ bt_target_page_check(BtreeCheckState *state)
932989
*/
933990
if (!P_ISLEAF(topaque) && state->readonly)
934991
{
935-
BlockNumber childblock = ItemPointerGetBlockNumber(&(itup->t_tid));
992+
BlockNumber childblock = ItemPointerGetBlockNumberNoCheck(&(itup->t_tid));
936993

937994
bt_downlink_check(state, childblock, skey);
938995
}
@@ -1326,6 +1383,11 @@ bt_tuple_present_callback(Relation index, HeapTuple htup, Datum *values,
13261383
* or otherwise varied when or how compression was applied, our assumption
13271384
* would break, leading to false positive reports of corruption. For now,
13281385
* we don't decompress/normalize toasted values as part of fingerprinting.
1386+
*
1387+
* In future, non-pivot index tuples might get use of
1388+
* BT_N_KEYS_OFFSET_MASK. Then binary representation of index tuple linked
1389+
* to particular heap tuple might vary and meeds to be normalized before
1390+
* bloom filter lookup.
13291391
*/
13301392
itup = index_form_tuple(RelationGetDescr(index), values, isnull);
13311393
itup->t_tid = htup->t_self;
@@ -1336,8 +1398,8 @@ bt_tuple_present_callback(Relation index, HeapTuple htup, Datum *values,
13361398
ereport(ERROR,
13371399
(errcode(ERRCODE_DATA_CORRUPTED),
13381400
errmsg("heap tuple (%u,%u) from table \"%s\" lacks matching index tuple within index \"%s\"",
1339-
ItemPointerGetBlockNumber(&(itup->t_tid)),
1340-
ItemPointerGetOffsetNumber(&(itup->t_tid)),
1401+
ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
1402+
ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)),
13411403
RelationGetRelationName(state->heaprel),
13421404
RelationGetRelationName(state->rel)),
13431405
!state->readonly
@@ -1368,6 +1430,10 @@ offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset)
13681430
* infinity item is either first or second line item, or there is none
13691431
* within page.
13701432
*
1433+
* "Negative infinity" tuple is a special corner case of pivot tuples,
1434+
* it has zero attributes while rest of pivot tuples have nkeyatts number
1435+
* of attributes.
1436+
*
13711437
* Right-most pages don't have a high key, but could be said to
13721438
* conceptually have a "positive infinity" high key. Thus, there is a
13731439
* symmetry between down link items in parent pages, and high keys in
@@ -1391,10 +1457,10 @@ static inline bool
13911457
invariant_leq_offset(BtreeCheckState *state, ScanKey key,
13921458
OffsetNumber upperbound)
13931459
{
1394-
int16 natts = state->rel->rd_rel->relnatts;
1460+
int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
13951461
int32 cmp;
13961462

1397-
cmp = _bt_compare(state->rel, natts, key, state->target, upperbound);
1463+
cmp = _bt_compare(state->rel, nkeyatts, key, state->target, upperbound);
13981464

13991465
return cmp <= 0;
14001466
}
@@ -1410,10 +1476,10 @@ static inline bool
14101476
invariant_geq_offset(BtreeCheckState *state, ScanKey key,
14111477
OffsetNumber lowerbound)
14121478
{
1413-
int16 natts = state->rel->rd_rel->relnatts;
1479+
int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
14141480
int32 cmp;
14151481

1416-
cmp = _bt_compare(state->rel, natts, key, state->target, lowerbound);
1482+
cmp = _bt_compare(state->rel, nkeyatts, key, state->target, lowerbound);
14171483

14181484
return cmp >= 0;
14191485
}
@@ -1433,10 +1499,10 @@ invariant_leq_nontarget_offset(BtreeCheckState *state,
14331499
Page nontarget, ScanKey key,
14341500
OffsetNumber upperbound)
14351501
{
1436-
int16 natts = state->rel->rd_rel->relnatts;
1502+
int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
14371503
int32 cmp;
14381504

1439-
cmp = _bt_compare(state->rel, natts, key, nontarget, upperbound);
1505+
cmp = _bt_compare(state->rel, nkeyatts, key, nontarget, upperbound);
14401506

14411507
return cmp <= 0;
14421508
}

contrib/bloom/blutils.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ blhandler(PG_FUNCTION_ARGS)
120120
amroutine->amclusterable = false;
121121
amroutine->ampredlocks = false;
122122
amroutine->amcanparallel = false;
123+
amroutine->amcaninclude = false;
123124
amroutine->amkeytype = InvalidOid;
124125

125126
amroutine->ambuild = blbuild;

contrib/dblink/dblink.c

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ static remoteConn *getConnectionByName(const char *name);
100100
static HTAB *createConnHash(void);
101101
static void createNewConnection(const char *name, remoteConn *rconn);
102102
static void deleteConnection(const char *name);
103-
static char **get_pkey_attnames(Relation rel, int16 *numatts);
103+
static char **get_pkey_attnames(Relation rel, int16 *indnkeyatts);
104104
static char **get_text_array_contents(ArrayType *array, int *numitems);
105105
static char *get_sql_insert(Relation rel, int *pkattnums, int pknumatts, char **src_pkattvals, char **tgt_pkattvals);
106106
static char *get_sql_delete(Relation rel, int *pkattnums, int pknumatts, char **tgt_pkattvals);
@@ -1493,7 +1493,7 @@ PG_FUNCTION_INFO_V1(dblink_get_pkey);
14931493
Datum
14941494
dblink_get_pkey(PG_FUNCTION_ARGS)
14951495
{
1496-
int16 numatts;
1496+
int16 indnkeyatts;
14971497
char **results;
14981498
FuncCallContext *funcctx;
14991499
int32 call_cntr;
@@ -1519,7 +1519,7 @@ dblink_get_pkey(PG_FUNCTION_ARGS)
15191519
rel = get_rel_from_relname(PG_GETARG_TEXT_PP(0), AccessShareLock, ACL_SELECT);
15201520

15211521
/* get the array of attnums */
1522-
results = get_pkey_attnames(rel, &numatts);
1522+
results = get_pkey_attnames(rel, &indnkeyatts);
15231523

15241524
relation_close(rel, AccessShareLock);
15251525

@@ -1539,9 +1539,9 @@ dblink_get_pkey(PG_FUNCTION_ARGS)
15391539
attinmeta = TupleDescGetAttInMetadata(tupdesc);
15401540
funcctx->attinmeta = attinmeta;
15411541

1542-
if ((results != NULL) && (numatts > 0))
1542+
if ((results != NULL) && (indnkeyatts > 0))
15431543
{
1544-
funcctx->max_calls = numatts;
1544+
funcctx->max_calls = indnkeyatts;
15451545

15461546
/* got results, keep track of them */
15471547
funcctx->user_fctx = results;
@@ -2029,10 +2029,10 @@ dblink_fdw_validator(PG_FUNCTION_ARGS)
20292029
* get_pkey_attnames
20302030
*
20312031
* Get the primary key attnames for the given relation.
2032-
* Return NULL, and set numatts = 0, if no primary key exists.
2032+
* Return NULL, and set indnkeyatts = 0, if no primary key exists.
20332033
*/
20342034
static char **
2035-
get_pkey_attnames(Relation rel, int16 *numatts)
2035+
get_pkey_attnames(Relation rel, int16 *indnkeyatts)
20362036
{
20372037
Relation indexRelation;
20382038
ScanKeyData skey;
@@ -2042,8 +2042,8 @@ get_pkey_attnames(Relation rel, int16 *numatts)
20422042
char **result = NULL;
20432043
TupleDesc tupdesc;
20442044

2045-
/* initialize numatts to 0 in case no primary key exists */
2046-
*numatts = 0;
2045+
/* initialize indnkeyatts to 0 in case no primary key exists */
2046+
*indnkeyatts = 0;
20472047

20482048
tupdesc = rel->rd_att;
20492049

@@ -2064,12 +2064,12 @@ get_pkey_attnames(Relation rel, int16 *numatts)
20642064
/* we're only interested if it is the primary key */
20652065
if (index->indisprimary)
20662066
{
2067-
*numatts = index->indnatts;
2068-
if (*numatts > 0)
2067+
*indnkeyatts = index->indnkeyatts;
2068+
if (*indnkeyatts > 0)
20692069
{
2070-
result = (char **) palloc(*numatts * sizeof(char *));
2070+
result = (char **) palloc(*indnkeyatts * sizeof(char *));
20712071

2072-
for (i = 0; i < *numatts; i++)
2072+
for (i = 0; i < *indnkeyatts; i++)
20732073
result[i] = SPI_fname(tupdesc, index->indkey.values[i]);
20742074
}
20752075
break;

0 commit comments

Comments
 (0)