Skip to content

Commit 68d3585

Browse files
committed
Ensure we allocate NAMEDATALEN bytes for names in Index Only Scans
As an optimization, we store "name" columns as cstrings in btree indexes. Here we modify it so that Index Only Scans convert these cstrings back to names with NAMEDATALEN bytes rather than storing the cstring in the tuple slot, as was happening previously. Bug: #17855 Reported-by: Alexander Lakhin Reviewed-by: Alexander Lakhin, Tom Lane Discussion: https://postgr.es/m/17855-5f523e0f9769a566@postgresql.org Backpatch-through: 12, all supported versions
1 parent 92685c3 commit 68d3585

File tree

5 files changed

+141
-9
lines changed

5 files changed

+141
-9
lines changed

src/backend/executor/nodeIndexonlyscan.c

Lines changed: 88 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -35,19 +35,21 @@
3535
#include "access/tableam.h"
3636
#include "access/tupdesc.h"
3737
#include "access/visibilitymap.h"
38+
#include "catalog/pg_type.h"
3839
#include "executor/execdebug.h"
3940
#include "executor/nodeIndexonlyscan.h"
4041
#include "executor/nodeIndexscan.h"
4142
#include "miscadmin.h"
4243
#include "storage/bufmgr.h"
4344
#include "storage/predicate.h"
45+
#include "utils/builtins.h"
4446
#include "utils/memutils.h"
4547
#include "utils/rel.h"
4648

4749

4850
static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node);
49-
static void StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup,
50-
TupleDesc itupdesc);
51+
static void StoreIndexTuple(IndexOnlyScanState *node, TupleTableSlot *slot,
52+
IndexTuple itup, TupleDesc itupdesc);
5153

5254

5355
/* ----------------------------------------------------------------
@@ -206,7 +208,7 @@ IndexOnlyNext(IndexOnlyScanState *node)
206208
ExecForceStoreHeapTuple(scandesc->xs_hitup, slot, false);
207209
}
208210
else if (scandesc->xs_itup)
209-
StoreIndexTuple(slot, scandesc->xs_itup, scandesc->xs_itupdesc);
211+
StoreIndexTuple(node, slot, scandesc->xs_itup, scandesc->xs_itupdesc);
210212
else
211213
elog(ERROR, "no data returned for index-only scan");
212214

@@ -264,7 +266,8 @@ IndexOnlyNext(IndexOnlyScanState *node)
264266
* right now we don't need it elsewhere.
265267
*/
266268
static void
267-
StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup, TupleDesc itupdesc)
269+
StoreIndexTuple(IndexOnlyScanState *node, TupleTableSlot *slot,
270+
IndexTuple itup, TupleDesc itupdesc)
268271
{
269272
/*
270273
* Note: we must use the tupdesc supplied by the AM in index_deform_tuple,
@@ -277,6 +280,37 @@ StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup, TupleDesc itupdesc)
277280

278281
ExecClearTuple(slot);
279282
index_deform_tuple(itup, itupdesc, slot->tts_values, slot->tts_isnull);
283+
284+
/*
285+
* Copy all name columns stored as cstrings back into a NAMEDATALEN byte
286+
* sized allocation. We mark this branch as unlikely as generally "name"
287+
* is used only for the system catalogs and this would have to be a user
288+
* query running on those or some other user table with an index on a name
289+
* column.
290+
*/
291+
if (unlikely(node->ioss_NameCStringAttNums != NULL))
292+
{
293+
int attcount = node->ioss_NameCStringCount;
294+
295+
for (int idx = 0; idx < attcount; idx++)
296+
{
297+
int attnum = node->ioss_NameCStringAttNums[idx];
298+
Name name;
299+
300+
/* skip null Datums */
301+
if (slot->tts_isnull[attnum])
302+
continue;
303+
304+
/* allocate the NAMEDATALEN and copy the datum into that memory */
305+
name = (Name) MemoryContextAlloc(node->ss.ps.ps_ExprContext->ecxt_per_tuple_memory,
306+
NAMEDATALEN);
307+
308+
/* use namestrcpy to zero-pad all trailing bytes */
309+
namestrcpy(name, DatumGetCString(slot->tts_values[attnum]));
310+
slot->tts_values[attnum] = NameGetDatum(name);
311+
}
312+
}
313+
280314
ExecStoreVirtualTuple(slot);
281315
}
282316

@@ -490,8 +524,11 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
490524
{
491525
IndexOnlyScanState *indexstate;
492526
Relation currentRelation;
527+
Relation indexRelation;
493528
LOCKMODE lockmode;
494529
TupleDesc tupDesc;
530+
int indnkeyatts;
531+
int namecount;
495532

496533
/*
497534
* create state structure
@@ -564,7 +601,8 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
564601

565602
/* Open the index relation. */
566603
lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode;
567-
indexstate->ioss_RelationDesc = index_open(node->indexid, lockmode);
604+
indexRelation = index_open(node->indexid, lockmode);
605+
indexstate->ioss_RelationDesc = indexRelation;
568606

569607
/*
570608
* Initialize index-specific scan state
@@ -577,7 +615,7 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
577615
* build the index scan keys from the index qualification
578616
*/
579617
ExecIndexBuildScanKeys((PlanState *) indexstate,
580-
indexstate->ioss_RelationDesc,
618+
indexRelation,
581619
node->indexqual,
582620
false,
583621
&indexstate->ioss_ScanKeys,
@@ -591,7 +629,7 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
591629
* any ORDER BY exprs have to be turned into scankeys in the same way
592630
*/
593631
ExecIndexBuildScanKeys((PlanState *) indexstate,
594-
indexstate->ioss_RelationDesc,
632+
indexRelation,
595633
node->indexorderby,
596634
true,
597635
&indexstate->ioss_OrderByKeys,
@@ -620,6 +658,49 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
620658
indexstate->ioss_RuntimeContext = NULL;
621659
}
622660

661+
indexstate->ioss_NameCStringAttNums = NULL;
662+
indnkeyatts = indexRelation->rd_index->indnkeyatts;
663+
namecount = 0;
664+
665+
/*
666+
* The "name" type for btree uses text_ops which results in storing
667+
* cstrings in the indexed keys rather than names. Here we detect that in
668+
* a generic way in case other index AMs want to do the same optimization.
669+
* Check for opclasses with an opcintype of NAMEOID and an index tuple
670+
* descriptor with CSTRINGOID. If any of these are found, create an array
671+
* marking the index attribute number of each of them. StoreIndexTuple()
672+
* handles copying the name Datums into a NAMEDATALEN-byte allocation.
673+
*/
674+
675+
/* First, count the number of such index keys */
676+
for (int attnum = 0; attnum < indnkeyatts; attnum++)
677+
{
678+
if (indexRelation->rd_att->attrs[attnum].atttypid == CSTRINGOID &&
679+
indexRelation->rd_opcintype[attnum] == NAMEOID)
680+
namecount++;
681+
}
682+
683+
if (namecount > 0)
684+
{
685+
int idx = 0;
686+
687+
/*
688+
* Now create an array to mark the attribute numbers of the keys that
689+
* need to be converted from cstring to name.
690+
*/
691+
indexstate->ioss_NameCStringAttNums = (AttrNumber *)
692+
palloc(sizeof(AttrNumber) * namecount);
693+
694+
for (int attnum = 0; attnum < indnkeyatts; attnum++)
695+
{
696+
if (indexRelation->rd_att->attrs[attnum].atttypid == CSTRINGOID &&
697+
indexRelation->rd_opcintype[attnum] == NAMEOID)
698+
indexstate->ioss_NameCStringAttNums[idx++] = (AttrNumber) attnum;
699+
}
700+
}
701+
702+
indexstate->ioss_NameCStringCount = namecount;
703+
623704
/*
624705
* all done.
625706
*/

src/include/catalog/pg_opclass.dat

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,11 @@
9191
# Here's an ugly little hack to save space in the system catalog indexes.
9292
# btree doesn't ordinarily allow a storage type different from input type;
9393
# but cstring and name are the same thing except for trailing padding,
94-
# and we can safely omit that within an index entry. So we declare the
95-
# btree opclass for name as using cstring storage type.
94+
# so we choose to omit that within an index entry. Here we declare the
95+
# btree opclass for name as using cstring storage type. This does require
96+
# that we pad the cstring out with the full NAMEDATALEN bytes when performing
97+
# index-only scans. See corresponding hacks in ExecInitIndexOnlyScan() and
98+
# StoreIndexTuple().
9699
{ opcmethod => 'btree', opcname => 'name_ops', opcfamily => 'btree/text_ops',
97100
opcintype => 'name', opckeytype => 'cstring' },
98101

src/include/nodes/execnodes.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1600,6 +1600,8 @@ typedef struct IndexScanState
16001600
* TableSlot slot for holding tuples fetched from the table
16011601
* VMBuffer buffer in use for visibility map testing, if any
16021602
* PscanLen size of parallel index-only scan descriptor
1603+
* NameCStringAttNums attnums of name typed columns to pad to NAMEDATALEN
1604+
* NameCStringCount number of elements in the NameCStringAttNums array
16031605
* ----------------
16041606
*/
16051607
typedef struct IndexOnlyScanState
@@ -1619,6 +1621,8 @@ typedef struct IndexOnlyScanState
16191621
TupleTableSlot *ioss_TableSlot;
16201622
Buffer ioss_VMBuffer;
16211623
Size ioss_PscanLen;
1624+
AttrNumber *ioss_NameCStringAttNums;
1625+
int ioss_NameCStringCount;
16221626
} IndexOnlyScanState;
16231627

16241628
/* ----------------

src/test/regress/expected/index_including.out

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,3 +398,28 @@ Indexes:
398398
"tbl_c1_c2_c3_c4_key" UNIQUE CONSTRAINT, btree (c1, c2) INCLUDE (c3, c4)
399399

400400
DROP TABLE tbl;
401+
/*
402+
* 10. Test coverage for names stored as cstrings in indexes
403+
*/
404+
CREATE TABLE nametbl (c1 int, c2 name, c3 float);
405+
CREATE INDEX nametbl_c1_c2_idx ON nametbl (c2, c1) INCLUDE (c3);
406+
INSERT INTO nametbl VALUES(1, 'two', 3.0);
407+
VACUUM nametbl;
408+
SET enable_seqscan = 0;
409+
-- Ensure we get an index only scan plan
410+
EXPLAIN (COSTS OFF) SELECT c2, c1, c3 FROM nametbl WHERE c2 = 'two' AND c1 = 1;
411+
QUERY PLAN
412+
----------------------------------------------------
413+
Index Only Scan using nametbl_c1_c2_idx on nametbl
414+
Index Cond: ((c2 = 'two'::name) AND (c1 = 1))
415+
(2 rows)
416+
417+
-- Validate the results look sane
418+
SELECT c2, c1, c3 FROM nametbl WHERE c2 = 'two' AND c1 = 1;
419+
c2 | c1 | c3
420+
-----+----+----
421+
two | 1 | 3
422+
(1 row)
423+
424+
RESET enable_seqscan;
425+
DROP TABLE nametbl;

src/test/regress/sql/index_including.sql

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,3 +217,22 @@ ALTER TABLE tbl ALTER c1 TYPE bigint;
217217
ALTER TABLE tbl ALTER c3 TYPE bigint;
218218
\d tbl
219219
DROP TABLE tbl;
220+
221+
/*
222+
* 10. Test coverage for names stored as cstrings in indexes
223+
*/
224+
CREATE TABLE nametbl (c1 int, c2 name, c3 float);
225+
CREATE INDEX nametbl_c1_c2_idx ON nametbl (c2, c1) INCLUDE (c3);
226+
INSERT INTO nametbl VALUES(1, 'two', 3.0);
227+
VACUUM nametbl;
228+
SET enable_seqscan = 0;
229+
230+
-- Ensure we get an index only scan plan
231+
EXPLAIN (COSTS OFF) SELECT c2, c1, c3 FROM nametbl WHERE c2 = 'two' AND c1 = 1;
232+
233+
-- Validate the results look sane
234+
SELECT c2, c1, c3 FROM nametbl WHERE c2 = 'two' AND c1 = 1;
235+
236+
RESET enable_seqscan;
237+
238+
DROP TABLE nametbl;

0 commit comments

Comments
 (0)