Skip to content

Commit 3c56904

Browse files
committed
Allow left join removals and unique joins on partitioned tables
This allows left join removals and unique joins to work with partitioned tables. The planner just lacked sufficient proofs that a given join would not cause any row duplication. Unique indexes currently serve as that proof, so have get_relation_info() populate the indexlist for partitioned tables too. Author: Arne Roland Reviewed-by: Alvaro Herrera, Zhihong Yu, Amit Langote, David Rowley Discussion: https://postgr.es/m/c3b2408b7a39433b8230bbcd02e9f302@index.de
1 parent 216a784 commit 3c56904

File tree

7 files changed

+179
-124
lines changed

7 files changed

+179
-124
lines changed

src/backend/optimizer/util/plancat.c

Lines changed: 149 additions & 115 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,9 @@ static void set_baserel_partition_constraint(Relation relation,
109109
* If inhparent is true, all we need to do is set up the attr arrays:
110110
* the RelOptInfo actually represents the appendrel formed by an inheritance
111111
* tree, and so the parent rel's physical size and index information isn't
112-
* important for it.
112+
* important for it, however, for partitioned tables, we do populate the
113+
* indexlist as the planner uses unique indexes as unique proofs for certain
114+
* optimizations.
113115
*/
114116
void
115117
get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
@@ -175,10 +177,14 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
175177

176178
/*
177179
* Make list of indexes. Ignore indexes on system catalogs if told to.
178-
* Don't bother with indexes for an inheritance parent, either.
180+
* Don't bother with indexes from traditional inheritance parents. For
181+
* partitioned tables, we need a list of at least unique indexes as these
182+
* serve as unique proofs for certain planner optimizations. However,
183+
* let's not discriminate here and just record all partitioned indexes
184+
* whether they're unique indexes or not.
179185
*/
180-
if (inhparent ||
181-
(IgnoreSystemIndexes && IsSystemRelation(relation)))
186+
if ((inhparent && relation->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
187+
|| (IgnoreSystemIndexes && IsSystemRelation(relation)))
182188
hasindex = false;
183189
else
184190
hasindex = relation->rd_rel->relhasindex;
@@ -231,16 +237,6 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
231237
continue;
232238
}
233239

234-
/*
235-
* Ignore partitioned indexes, since they are not usable for
236-
* queries.
237-
*/
238-
if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
239-
{
240-
index_close(indexRelation, NoLock);
241-
continue;
242-
}
243-
244240
/*
245241
* If the index is valid, but cannot yet be used, ignore it; but
246242
* mark the plan we are generating as transient. See
@@ -285,105 +281,129 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
285281

286282
info->relam = indexRelation->rd_rel->relam;
287283

288-
/* We copy just the fields we need, not all of rd_indam */
289-
amroutine = indexRelation->rd_indam;
290-
info->amcanorderbyop = amroutine->amcanorderbyop;
291-
info->amoptionalkey = amroutine->amoptionalkey;
292-
info->amsearcharray = amroutine->amsearcharray;
293-
info->amsearchnulls = amroutine->amsearchnulls;
294-
info->amcanparallel = amroutine->amcanparallel;
295-
info->amhasgettuple = (amroutine->amgettuple != NULL);
296-
info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
297-
relation->rd_tableam->scan_bitmap_next_block != NULL;
298-
info->amcanmarkpos = (amroutine->ammarkpos != NULL &&
299-
amroutine->amrestrpos != NULL);
300-
info->amcostestimate = amroutine->amcostestimate;
301-
Assert(info->amcostestimate != NULL);
302-
303-
/* Fetch index opclass options */
304-
info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
305-
306284
/*
307-
* Fetch the ordering information for the index, if any.
285+
* We don't have an AM for partitioned indexes, so we'll just
286+
* NULLify the AM related fields for those.
308287
*/
309-
if (info->relam == BTREE_AM_OID)
288+
if (indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX)
310289
{
290+
/* We copy just the fields we need, not all of rd_indam */
291+
amroutine = indexRelation->rd_indam;
292+
info->amcanorderbyop = amroutine->amcanorderbyop;
293+
info->amoptionalkey = amroutine->amoptionalkey;
294+
info->amsearcharray = amroutine->amsearcharray;
295+
info->amsearchnulls = amroutine->amsearchnulls;
296+
info->amcanparallel = amroutine->amcanparallel;
297+
info->amhasgettuple = (amroutine->amgettuple != NULL);
298+
info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
299+
relation->rd_tableam->scan_bitmap_next_block != NULL;
300+
info->amcanmarkpos = (amroutine->ammarkpos != NULL &&
301+
amroutine->amrestrpos != NULL);
302+
info->amcostestimate = amroutine->amcostestimate;
303+
Assert(info->amcostestimate != NULL);
304+
305+
/* Fetch index opclass options */
306+
info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
307+
311308
/*
312-
* If it's a btree index, we can use its opfamily OIDs
313-
* directly as the sort ordering opfamily OIDs.
309+
* Fetch the ordering information for the index, if any.
314310
*/
315-
Assert(amroutine->amcanorder);
316-
317-
info->sortopfamily = info->opfamily;
318-
info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
319-
info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
320-
321-
for (i = 0; i < nkeycolumns; i++)
311+
if (info->relam == BTREE_AM_OID)
322312
{
323-
int16 opt = indexRelation->rd_indoption[i];
313+
/*
314+
* If it's a btree index, we can use its opfamily OIDs
315+
* directly as the sort ordering opfamily OIDs.
316+
*/
317+
Assert(amroutine->amcanorder);
324318

325-
info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
326-
info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
327-
}
328-
}
329-
else if (amroutine->amcanorder)
330-
{
331-
/*
332-
* Otherwise, identify the corresponding btree opfamilies by
333-
* trying to map this index's "<" operators into btree. Since
334-
* "<" uniquely defines the behavior of a sort order, this is
335-
* a sufficient test.
336-
*
337-
* XXX This method is rather slow and also requires the
338-
* undesirable assumption that the other index AM numbers its
339-
* strategies the same as btree. It'd be better to have a way
340-
* to explicitly declare the corresponding btree opfamily for
341-
* each opfamily of the other index type. But given the lack
342-
* of current or foreseeable amcanorder index types, it's not
343-
* worth expending more effort on now.
344-
*/
345-
info->sortopfamily = (Oid *) palloc(sizeof(Oid) * nkeycolumns);
346-
info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
347-
info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
319+
info->sortopfamily = info->opfamily;
320+
info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
321+
info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
348322

349-
for (i = 0; i < nkeycolumns; i++)
350-
{
351-
int16 opt = indexRelation->rd_indoption[i];
352-
Oid ltopr;
353-
Oid btopfamily;
354-
Oid btopcintype;
355-
int16 btstrategy;
356-
357-
info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
358-
info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
359-
360-
ltopr = get_opfamily_member(info->opfamily[i],
361-
info->opcintype[i],
362-
info->opcintype[i],
363-
BTLessStrategyNumber);
364-
if (OidIsValid(ltopr) &&
365-
get_ordering_op_properties(ltopr,
366-
&btopfamily,
367-
&btopcintype,
368-
&btstrategy) &&
369-
btopcintype == info->opcintype[i] &&
370-
btstrategy == BTLessStrategyNumber)
323+
for (i = 0; i < nkeycolumns; i++)
371324
{
372-
/* Successful mapping */
373-
info->sortopfamily[i] = btopfamily;
325+
int16 opt = indexRelation->rd_indoption[i];
326+
327+
info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
328+
info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
374329
}
375-
else
330+
}
331+
else if (amroutine->amcanorder)
332+
{
333+
/*
334+
* Otherwise, identify the corresponding btree opfamilies
335+
* by trying to map this index's "<" operators into btree.
336+
* Since "<" uniquely defines the behavior of a sort
337+
* order, this is a sufficient test.
338+
*
339+
* XXX This method is rather slow and also requires the
340+
* undesirable assumption that the other index AM numbers
341+
* its strategies the same as btree. It'd be better to
342+
* have a way to explicitly declare the corresponding
343+
* btree opfamily for each opfamily of the other index
344+
* type. But given the lack of current or foreseeable
345+
* amcanorder index types, it's not worth expending more
346+
* effort on now.
347+
*/
348+
info->sortopfamily = (Oid *) palloc(sizeof(Oid) * nkeycolumns);
349+
info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
350+
info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
351+
352+
for (i = 0; i < nkeycolumns; i++)
376353
{
377-
/* Fail ... quietly treat index as unordered */
378-
info->sortopfamily = NULL;
379-
info->reverse_sort = NULL;
380-
info->nulls_first = NULL;
381-
break;
354+
int16 opt = indexRelation->rd_indoption[i];
355+
Oid ltopr;
356+
Oid btopfamily;
357+
Oid btopcintype;
358+
int16 btstrategy;
359+
360+
info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
361+
info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
362+
363+
ltopr = get_opfamily_member(info->opfamily[i],
364+
info->opcintype[i],
365+
info->opcintype[i],
366+
BTLessStrategyNumber);
367+
if (OidIsValid(ltopr) &&
368+
get_ordering_op_properties(ltopr,
369+
&btopfamily,
370+
&btopcintype,
371+
&btstrategy) &&
372+
btopcintype == info->opcintype[i] &&
373+
btstrategy == BTLessStrategyNumber)
374+
{
375+
/* Successful mapping */
376+
info->sortopfamily[i] = btopfamily;
377+
}
378+
else
379+
{
380+
/* Fail ... quietly treat index as unordered */
381+
info->sortopfamily = NULL;
382+
info->reverse_sort = NULL;
383+
info->nulls_first = NULL;
384+
break;
385+
}
382386
}
383387
}
388+
else
389+
{
390+
info->sortopfamily = NULL;
391+
info->reverse_sort = NULL;
392+
info->nulls_first = NULL;
393+
}
384394
}
385395
else
386396
{
397+
info->amcanorderbyop = false;
398+
info->amoptionalkey = false;
399+
info->amsearcharray = false;
400+
info->amsearchnulls = false;
401+
info->amcanparallel = false;
402+
info->amhasgettuple = false;
403+
info->amhasgetbitmap = false;
404+
info->amcanmarkpos = false;
405+
info->amcostestimate = NULL;
406+
387407
info->sortopfamily = NULL;
388408
info->reverse_sort = NULL;
389409
info->nulls_first = NULL;
@@ -416,31 +436,45 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
416436
* the number-of-tuples estimate to equal the parent table; if it
417437
* is partial then we have to use the same methods as we would for
418438
* a table, except we can be sure that the index is not larger
419-
* than the table.
439+
* than the table. We must ignore partitioned indexes here as as
440+
* there are not physical indexes.
420441
*/
421-
if (info->indpred == NIL)
442+
if (indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX)
422443
{
423-
info->pages = RelationGetNumberOfBlocks(indexRelation);
424-
info->tuples = rel->tuples;
425-
}
426-
else
427-
{
428-
double allvisfrac; /* dummy */
429-
430-
estimate_rel_size(indexRelation, NULL,
431-
&info->pages, &info->tuples, &allvisfrac);
432-
if (info->tuples > rel->tuples)
444+
if (info->indpred == NIL)
445+
{
446+
info->pages = RelationGetNumberOfBlocks(indexRelation);
433447
info->tuples = rel->tuples;
434-
}
448+
}
449+
else
450+
{
451+
double allvisfrac; /* dummy */
435452

436-
if (info->relam == BTREE_AM_OID)
437-
{
438-
/* For btrees, get tree height while we have the index open */
439-
info->tree_height = _bt_getrootheight(indexRelation);
453+
estimate_rel_size(indexRelation, NULL,
454+
&info->pages, &info->tuples, &allvisfrac);
455+
if (info->tuples > rel->tuples)
456+
info->tuples = rel->tuples;
457+
}
458+
459+
if (info->relam == BTREE_AM_OID)
460+
{
461+
/*
462+
* For btrees, get tree height while we have the index
463+
* open
464+
*/
465+
info->tree_height = _bt_getrootheight(indexRelation);
466+
}
467+
else
468+
{
469+
/* For other index types, just set it to "unknown" for now */
470+
info->tree_height = -1;
471+
}
440472
}
441473
else
442474
{
443-
/* For other index types, just set it to "unknown" for now */
475+
/* Zero these out for partitioned indexes */
476+
info->pages = 0;
477+
info->tuples = 0.0;
444478
info->tree_height = -1;
445479
}
446480

src/backend/utils/adt/selfuncs.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5994,6 +5994,10 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
59945994
rte = root->simple_rte_array[rel->relid];
59955995
Assert(rte->rtekind == RTE_RELATION);
59965996

5997+
/* ignore partitioned tables. Any indexes here are not real indexes */
5998+
if (rte->relkind == RELKIND_PARTITIONED_TABLE)
5999+
return false;
6000+
59976001
/* Search through the indexes to see if any match our problem */
59986002
foreach(lc, rel->indexlist)
59996003
{

src/include/nodes/pathnodes.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -653,7 +653,7 @@ typedef struct PartitionSchemeData *PartitionScheme;
653653
* lateral_referencers - relids of rels that reference this one laterally
654654
* (includes both direct and indirect lateral references)
655655
* indexlist - list of IndexOptInfo nodes for relation's indexes
656-
* (always NIL if it's not a table)
656+
* (always NIL if it's not a table or partitioned table)
657657
* pages - number of disk pages in relation (zero if not a table)
658658
* tuples - number of tuples in relation (not considering restrictions)
659659
* allvisfrac - fraction of disk pages that are marked all-visible
@@ -1097,11 +1097,11 @@ struct IndexOptInfo
10971097
Oid *opfamily pg_node_attr(array_size(nkeycolumns));
10981098
/* OIDs of opclass declared input data types */
10991099
Oid *opcintype pg_node_attr(array_size(nkeycolumns));
1100-
/* OIDs of btree opfamilies, if orderable */
1100+
/* OIDs of btree opfamilies, if orderable. NULL if partitioned index */
11011101
Oid *sortopfamily pg_node_attr(array_size(nkeycolumns));
1102-
/* is sort order descending? */
1102+
/* is sort order descending? or NULL if partitioned index */
11031103
bool *reverse_sort pg_node_attr(array_size(nkeycolumns));
1104-
/* do NULLs come first in the sort order? */
1104+
/* do NULLs come first in the sort order? or NULL if partitioned index */
11051105
bool *nulls_first pg_node_attr(array_size(nkeycolumns));
11061106
/* opclass-specific options for columns */
11071107
bytea **opclassoptions pg_node_attr(read_write_ignore);
@@ -1139,7 +1139,7 @@ struct IndexOptInfo
11391139

11401140
/*
11411141
* Remaining fields are copied from the index AM's API struct
1142-
* (IndexAmRoutine).
1142+
* (IndexAmRoutine). These fields are not set for partitioned indexes.
11431143
*/
11441144
bool amcanorderbyop;
11451145
bool amoptionalkey;

src/test/regress/expected/join.out

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4860,6 +4860,16 @@ select 1 from (select a.id FROM a left join b on a.b_id = b.id) q,
48604860
Filter: (a.id = i)
48614861
(4 rows)
48624862

4863+
CREATE TEMP TABLE parted_b (id int PRIMARY KEY) partition by range(id);
4864+
CREATE TEMP TABLE parted_b1 partition of parted_b for values from (0) to (10);
4865+
-- test join removals on a partitioned table
4866+
explain (costs off)
4867+
select a.* from a left join parted_b pb on a.b_id = pb.id;
4868+
QUERY PLAN
4869+
---------------
4870+
Seq Scan on a
4871+
(1 row)
4872+
48634873
rollback;
48644874
create temp table parent (k int primary key, pd int);
48654875
create temp table child (k int unique, cd int);

0 commit comments

Comments
 (0)