Skip to content

Commit 2ac3ef7

Browse files
committed
Fix tuple routing in cases where tuple descriptors don't match.
The previous coding failed to work correctly when we have a multi-level partitioned hierarchy where tables at successive levels have different attribute numbers for the partition key attributes. To fix, have each PartitionDispatch object store a standalone TupleTableSlot initialized with the TupleDesc of the corresponding partitioned table, along with a TupleConversionMap to map tuples from the its parent's rowtype to own rowtype. After tuple routing chooses a leaf partition, we must use the leaf partition's tuple descriptor, not the root table's. To that end, a dedicated TupleTableSlot for tuple routing is now allocated in EState. Amit Langote
1 parent 12bd7dd commit 2ac3ef7

File tree

7 files changed

+190
-15
lines changed

7 files changed

+190
-15
lines changed

src/backend/catalog/partition.c

+61-13
Original file line numberDiff line numberDiff line change
@@ -923,13 +923,19 @@ RelationGetPartitionQual(Relation rel, bool recurse)
923923
return generate_partition_qual(rel, recurse);
924924
}
925925

926-
/* Turn an array of OIDs with N elements into a list */
927-
#define OID_ARRAY_TO_LIST(arr, N, list) \
926+
/*
927+
* Append OIDs of rel's partitions to the list 'partoids' and for each OID,
928+
* append pointer rel to the list 'parents'.
929+
*/
930+
#define APPEND_REL_PARTITION_OIDS(rel, partoids, parents) \
928931
do\
929932
{\
930933
int i;\
931-
for (i = 0; i < (N); i++)\
932-
(list) = lappend_oid((list), (arr)[i]);\
934+
for (i = 0; i < (rel)->rd_partdesc->nparts; i++)\
935+
{\
936+
(partoids) = lappend_oid((partoids), (rel)->rd_partdesc->oids[i]);\
937+
(parents) = lappend((parents), (rel));\
938+
}\
933939
} while(0)
934940

935941
/*
@@ -944,11 +950,13 @@ PartitionDispatch *
944950
RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
945951
int *num_parted, List **leaf_part_oids)
946952
{
947-
PartitionDesc rootpartdesc = RelationGetPartitionDesc(rel);
948953
PartitionDispatchData **pd;
949954
List *all_parts = NIL,
950-
*parted_rels;
951-
ListCell *lc;
955+
*all_parents = NIL,
956+
*parted_rels,
957+
*parted_rel_parents;
958+
ListCell *lc1,
959+
*lc2;
952960
int i,
953961
k,
954962
offset;
@@ -965,10 +973,13 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
965973
*/
966974
*num_parted = 1;
967975
parted_rels = list_make1(rel);
968-
OID_ARRAY_TO_LIST(rootpartdesc->oids, rootpartdesc->nparts, all_parts);
969-
foreach(lc, all_parts)
976+
/* Root partitioned table has no parent, so NULL for parent */
977+
parted_rel_parents = list_make1(NULL);
978+
APPEND_REL_PARTITION_OIDS(rel, all_parts, all_parents);
979+
forboth(lc1, all_parts, lc2, all_parents)
970980
{
971-
Relation partrel = heap_open(lfirst_oid(lc), lockmode);
981+
Relation partrel = heap_open(lfirst_oid(lc1), lockmode);
982+
Relation parent = lfirst(lc2);
972983
PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
973984

974985
/*
@@ -979,7 +990,8 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
979990
{
980991
(*num_parted)++;
981992
parted_rels = lappend(parted_rels, partrel);
982-
OID_ARRAY_TO_LIST(partdesc->oids, partdesc->nparts, all_parts);
993+
parted_rel_parents = lappend(parted_rel_parents, parent);
994+
APPEND_REL_PARTITION_OIDS(partrel, all_parts, all_parents);
983995
}
984996
else
985997
heap_close(partrel, NoLock);
@@ -1004,10 +1016,12 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
10041016
sizeof(PartitionDispatchData *));
10051017
*leaf_part_oids = NIL;
10061018
i = k = offset = 0;
1007-
foreach(lc, parted_rels)
1019+
forboth(lc1, parted_rels, lc2, parted_rel_parents)
10081020
{
1009-
Relation partrel = lfirst(lc);
1021+
Relation partrel = lfirst(lc1);
1022+
Relation parent = lfirst(lc2);
10101023
PartitionKey partkey = RelationGetPartitionKey(partrel);
1024+
TupleDesc tupdesc = RelationGetDescr(partrel);
10111025
PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
10121026
int j,
10131027
m;
@@ -1017,6 +1031,27 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
10171031
pd[i]->key = partkey;
10181032
pd[i]->keystate = NIL;
10191033
pd[i]->partdesc = partdesc;
1034+
if (parent != NULL)
1035+
{
1036+
/*
1037+
* For every partitioned table other than root, we must store
1038+
* a tuple table slot initialized with its tuple descriptor and
1039+
* a tuple conversion map to convert a tuple from its parent's
1040+
* rowtype to its own. That is to make sure that we are looking
1041+
* at the correct row using the correct tuple descriptor when
1042+
* computing its partition key for tuple routing.
1043+
*/
1044+
pd[i]->tupslot = MakeSingleTupleTableSlot(tupdesc);
1045+
pd[i]->tupmap = convert_tuples_by_name(RelationGetDescr(parent),
1046+
tupdesc,
1047+
gettext_noop("could not convert row type"));
1048+
}
1049+
else
1050+
{
1051+
/* Not required for the root partitioned table */
1052+
pd[i]->tupslot = NULL;
1053+
pd[i]->tupmap = NULL;
1054+
}
10201055
pd[i]->indexes = (int *) palloc(partdesc->nparts * sizeof(int));
10211056

10221057
/*
@@ -1610,6 +1645,8 @@ get_partition_for_tuple(PartitionDispatch *pd,
16101645
{
16111646
PartitionKey key = parent->key;
16121647
PartitionDesc partdesc = parent->partdesc;
1648+
TupleTableSlot *myslot = parent->tupslot;
1649+
TupleConversionMap *map = parent->tupmap;
16131650

16141651
/* Quick exit */
16151652
if (partdesc->nparts == 0)
@@ -1618,6 +1655,17 @@ get_partition_for_tuple(PartitionDispatch *pd,
16181655
return -1;
16191656
}
16201657

1658+
if (myslot != NULL)
1659+
{
1660+
HeapTuple tuple = ExecFetchSlotTuple(slot);
1661+
1662+
ExecClearTuple(myslot);
1663+
Assert(map != NULL);
1664+
tuple = do_convert_tuple(tuple, map);
1665+
ExecStoreTuple(tuple, myslot, InvalidBuffer, true);
1666+
slot = myslot;
1667+
}
1668+
16211669
/* Extract partition key from tuple */
16221670
FormPartitionKeyDatum(parent, slot, estate, values, isnull);
16231671

src/backend/commands/copy.c

+29-2
Original file line numberDiff line numberDiff line change
@@ -2435,6 +2435,15 @@ CopyFrom(CopyState cstate)
24352435
/* Triggers might need a slot as well */
24362436
estate->es_trig_tuple_slot = ExecInitExtraTupleSlot(estate);
24372437

2438+
/*
2439+
* Initialize a dedicated slot to manipulate tuples of any given
2440+
* partition's rowtype.
2441+
*/
2442+
if (cstate->partition_dispatch_info)
2443+
estate->es_partition_tuple_slot = ExecInitExtraTupleSlot(estate);
2444+
else
2445+
estate->es_partition_tuple_slot = NULL;
2446+
24382447
/*
24392448
* It's more efficient to prepare a bunch of tuples for insertion, and
24402449
* insert them in one heap_multi_insert() call, than call heap_insert()
@@ -2484,7 +2493,8 @@ CopyFrom(CopyState cstate)
24842493

24852494
for (;;)
24862495
{
2487-
TupleTableSlot *slot;
2496+
TupleTableSlot *slot,
2497+
*oldslot = NULL;
24882498
bool skip_tuple;
24892499
Oid loaded_oid = InvalidOid;
24902500

@@ -2571,7 +2581,19 @@ CopyFrom(CopyState cstate)
25712581
map = cstate->partition_tupconv_maps[leaf_part_index];
25722582
if (map)
25732583
{
2584+
Relation partrel = resultRelInfo->ri_RelationDesc;
2585+
25742586
tuple = do_convert_tuple(tuple, map);
2587+
2588+
/*
2589+
* We must use the partition's tuple descriptor from this
2590+
* point on. Use a dedicated slot from this point on until
2591+
* we're finished dealing with the partition.
2592+
*/
2593+
oldslot = slot;
2594+
slot = estate->es_partition_tuple_slot;
2595+
Assert(slot != NULL);
2596+
ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
25752597
ExecStoreTuple(tuple, slot, InvalidBuffer, true);
25762598
}
25772599

@@ -2667,6 +2689,10 @@ CopyFrom(CopyState cstate)
26672689
{
26682690
resultRelInfo = saved_resultRelInfo;
26692691
estate->es_result_relation_info = resultRelInfo;
2692+
2693+
/* Switch back to the slot corresponding to the root table */
2694+
Assert(oldslot != NULL);
2695+
slot = oldslot;
26702696
}
26712697
}
26722698
}
@@ -2714,13 +2740,14 @@ CopyFrom(CopyState cstate)
27142740
* Remember cstate->partition_dispatch_info[0] corresponds to the root
27152741
* partitioned table, which we must not try to close, because it is
27162742
* the main target table of COPY that will be closed eventually by
2717-
* DoCopy().
2743+
* DoCopy(). Also, tupslot is NULL for the root partitioned table.
27182744
*/
27192745
for (i = 1; i < cstate->num_dispatch; i++)
27202746
{
27212747
PartitionDispatch pd = cstate->partition_dispatch_info[i];
27222748

27232749
heap_close(pd->reldesc, NoLock);
2750+
ExecDropSingleTupleTableSlot(pd->tupslot);
27242751
}
27252752
for (i = 0; i < cstate->num_partitions; i++)
27262753
{

src/backend/executor/nodeModifyTable.c

+27
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,7 @@ ExecInsert(ModifyTableState *mtstate,
262262
Relation resultRelationDesc;
263263
Oid newId;
264264
List *recheckIndexes = NIL;
265+
TupleTableSlot *oldslot = NULL;
265266

266267
/*
267268
* get the heap tuple out of the tuple table slot, making sure we have a
@@ -318,7 +319,19 @@ ExecInsert(ModifyTableState *mtstate,
318319
map = mtstate->mt_partition_tupconv_maps[leaf_part_index];
319320
if (map)
320321
{
322+
Relation partrel = resultRelInfo->ri_RelationDesc;
323+
321324
tuple = do_convert_tuple(tuple, map);
325+
326+
/*
327+
* We must use the partition's tuple descriptor from this
328+
* point on, until we're finished dealing with the partition.
329+
* Use the dedicated slot for that.
330+
*/
331+
oldslot = slot;
332+
slot = estate->es_partition_tuple_slot;
333+
Assert(slot != NULL);
334+
ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
322335
ExecStoreTuple(tuple, slot, InvalidBuffer, true);
323336
}
324337
}
@@ -566,6 +579,10 @@ ExecInsert(ModifyTableState *mtstate,
566579
{
567580
resultRelInfo = saved_resultRelInfo;
568581
estate->es_result_relation_info = resultRelInfo;
582+
583+
/* Switch back to the slot corresponding to the root table */
584+
Assert(oldslot != NULL);
585+
slot = oldslot;
569586
}
570587

571588
/*
@@ -1734,7 +1751,15 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
17341751
mtstate->mt_partitions = partitions;
17351752
mtstate->mt_num_partitions = num_partitions;
17361753
mtstate->mt_partition_tupconv_maps = partition_tupconv_maps;
1754+
1755+
/*
1756+
* Initialize a dedicated slot to manipulate tuples of any given
1757+
* partition's rowtype.
1758+
*/
1759+
estate->es_partition_tuple_slot = ExecInitExtraTupleSlot(estate);
17371760
}
1761+
else
1762+
estate->es_partition_tuple_slot = NULL;
17381763

17391764
/*
17401765
* Initialize any WITH CHECK OPTION constraints if needed.
@@ -2058,12 +2083,14 @@ ExecEndModifyTable(ModifyTableState *node)
20582083
* Remember node->mt_partition_dispatch_info[0] corresponds to the root
20592084
* partitioned table, which we must not try to close, because it is the
20602085
* main target table of the query that will be closed by ExecEndPlan().
2086+
* Also, tupslot is NULL for the root partitioned table.
20612087
*/
20622088
for (i = 1; i < node->mt_num_dispatch; i++)
20632089
{
20642090
PartitionDispatch pd = node->mt_partition_dispatch_info[i];
20652091

20662092
heap_close(pd->reldesc, NoLock);
2093+
ExecDropSingleTupleTableSlot(pd->tupslot);
20672094
}
20682095
for (i = 0; i < node->mt_num_partitions; i++)
20692096
{

src/include/catalog/partition.h

+7
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,11 @@ typedef struct PartitionDescData *PartitionDesc;
4747
* key Partition key information of the table
4848
* keystate Execution state required for expressions in the partition key
4949
* partdesc Partition descriptor of the table
50+
* tupslot A standalone TupleTableSlot initialized with this table's tuple
51+
* descriptor
52+
* tupmap TupleConversionMap to convert from the parent's rowtype to
53+
* this table's rowtype (when extracting the partition key of a
54+
* tuple just before routing it through this table)
5055
* indexes Array with partdesc->nparts members (for details on what
5156
* individual members represent, see how they are set in
5257
* RelationGetPartitionDispatchInfo())
@@ -58,6 +63,8 @@ typedef struct PartitionDispatchData
5863
PartitionKey key;
5964
List *keystate; /* list of ExprState */
6065
PartitionDesc partdesc;
66+
TupleTableSlot *tupslot;
67+
TupleConversionMap *tupmap;
6168
int *indexes;
6269
} PartitionDispatchData;
6370

src/include/nodes/execnodes.h

+3
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,9 @@ typedef struct EState
384384
TupleTableSlot *es_trig_oldtup_slot; /* for TriggerEnabled */
385385
TupleTableSlot *es_trig_newtup_slot; /* for TriggerEnabled */
386386

387+
/* Slot used to manipulate a tuple after it is routed to a partition */
388+
TupleTableSlot *es_partition_tuple_slot;
389+
387390
/* Parameter info: */
388391
ParamListInfo es_param_list_info; /* values of external params */
389392
ParamExecData *es_param_exec_vals; /* values of internal params */

src/test/regress/expected/insert.out

+37
Original file line numberDiff line numberDiff line change
@@ -300,3 +300,40 @@ drop cascades to table part_null
300300
drop cascades to table part_ee_ff
301301
drop cascades to table part_ee_ff1
302302
drop cascades to table part_ee_ff2
303+
-- more tests for certain multi-level partitioning scenarios
304+
create table p (a int, b int) partition by range (a, b);
305+
create table p1 (b int, a int not null) partition by range (b);
306+
create table p11 (like p1);
307+
alter table p11 drop a;
308+
alter table p11 add a int;
309+
alter table p11 drop a;
310+
alter table p11 add a int not null;
311+
-- attnum for key attribute 'a' is different in p, p1, and p11
312+
select attrelid::regclass, attname, attnum
313+
from pg_attribute
314+
where attname = 'a'
315+
and (attrelid = 'p'::regclass
316+
or attrelid = 'p1'::regclass
317+
or attrelid = 'p11'::regclass);
318+
attrelid | attname | attnum
319+
----------+---------+--------
320+
p | a | 1
321+
p1 | a | 2
322+
p11 | a | 4
323+
(3 rows)
324+
325+
alter table p1 attach partition p11 for values from (2) to (5);
326+
alter table p attach partition p1 for values from (1, 2) to (1, 10);
327+
-- check that "(1, 2)" is correctly routed to p11.
328+
insert into p values (1, 2);
329+
select tableoid::regclass, * from p;
330+
tableoid | a | b
331+
----------+---+---
332+
p11 | 1 | 2
333+
(1 row)
334+
335+
-- cleanup
336+
drop table p cascade;
337+
NOTICE: drop cascades to 2 other objects
338+
DETAIL: drop cascades to table p1
339+
drop cascades to table p11

src/test/regress/sql/insert.sql

+26
Original file line numberDiff line numberDiff line change
@@ -170,3 +170,29 @@ select tableoid::regclass, * from list_parted;
170170
-- cleanup
171171
drop table range_parted cascade;
172172
drop table list_parted cascade;
173+
174+
-- more tests for certain multi-level partitioning scenarios
175+
create table p (a int, b int) partition by range (a, b);
176+
create table p1 (b int, a int not null) partition by range (b);
177+
create table p11 (like p1);
178+
alter table p11 drop a;
179+
alter table p11 add a int;
180+
alter table p11 drop a;
181+
alter table p11 add a int not null;
182+
-- attnum for key attribute 'a' is different in p, p1, and p11
183+
select attrelid::regclass, attname, attnum
184+
from pg_attribute
185+
where attname = 'a'
186+
and (attrelid = 'p'::regclass
187+
or attrelid = 'p1'::regclass
188+
or attrelid = 'p11'::regclass);
189+
190+
alter table p1 attach partition p11 for values from (2) to (5);
191+
alter table p attach partition p1 for values from (1, 2) to (1, 10);
192+
193+
-- check that "(1, 2)" is correctly routed to p11.
194+
insert into p values (1, 2);
195+
select tableoid::regclass, * from p;
196+
197+
-- cleanup
198+
drop table p cascade;

0 commit comments

Comments
 (0)