Skip to content

Commit db632fb

Browse files
committed
Allow ordered partition scans in more cases
959d00e added the ability to make use of an Append node instead of a MergeAppend when we wanted to perform a scan of a partitioned table and the required sort order was the same as the partitioned keys and the partitioned table was defined in such a way that earlier partitions were guaranteed to only contain lower-order values than later partitions. However, previously we didn't allow these ordered partition scans for LIST partitioned table when there were any partitions that allowed multiple Datums. This was a very cheap check to make and we could likely have done a little better by checking if there were interleaved partitions, but at the time we didn't have visibility about which partitions were pruned, so we still may have disallowed cases where all interleaved partitions were pruned. Since 475dbd0, we now have knowledge of pruned partitions, we can do a much better job inside partitions_are_ordered(). Here we pass which partitions survived partition pruning into partitions_are_ordered() and, for LIST partitioning, have it check to see if any live partitions exist that are also in the new "interleaved_parts" field defined in PartitionBoundInfo. For RANGE partitioning we can relax the code which caused the partitions to be unordered if a DEFAULT partition existed. Since we now know which partitions were pruned, partitions_are_ordered() now returns true when the DEFAULT partition was pruned. Reviewed-by: Amit Langote, Zhihong Yu Discussion: https://postgr.es/m/CAApHDvrdoN_sXU52i=QDXe2k3WAo=EVry29r2+Tq2WYcn2xhEA@mail.gmail.com
1 parent 475dbd0 commit db632fb

File tree

6 files changed

+235
-27
lines changed

6 files changed

+235
-27
lines changed

src/backend/optimizer/path/allpaths.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1689,7 +1689,7 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel,
16891689
* for both forward and reverse scans.
16901690
*/
16911691
if (rel->part_scheme != NULL && IS_SIMPLE_REL(rel) &&
1692-
partitions_are_ordered(rel->boundinfo, rel->nparts))
1692+
partitions_are_ordered(rel->boundinfo, rel->live_parts))
16931693
{
16941694
partition_pathkeys = build_partition_pathkeys(root, rel,
16951695
ForwardScanDirection,

src/backend/optimizer/path/pathkeys.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -704,7 +704,7 @@ build_partition_pathkeys(PlannerInfo *root, RelOptInfo *partrel,
704704
int i;
705705

706706
Assert(partscheme != NULL);
707-
Assert(partitions_are_ordered(partrel->boundinfo, partrel->nparts));
707+
Assert(partitions_are_ordered(partrel->boundinfo, partrel->live_parts));
708708
/* For now, we can only cope with baserels */
709709
Assert(IS_SIMPLE_REL(partrel));
710710

src/backend/partitioning/partbounds.c

Lines changed: 81 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,7 @@ create_hash_bounds(PartitionBoundSpec **boundspecs, int nparts,
395395
boundinfo->ndatums = nparts;
396396
boundinfo->datums = (Datum **) palloc0(nparts * sizeof(Datum *));
397397
boundinfo->kind = NULL;
398+
boundinfo->interleaved_parts = NULL;
398399
boundinfo->nindexes = greatest_modulus;
399400
boundinfo->indexes = (int *) palloc(greatest_modulus * sizeof(int));
400401
for (i = 0; i < greatest_modulus; i++)
@@ -543,6 +544,7 @@ create_list_bounds(PartitionBoundSpec **boundspecs, int nparts,
543544
boundinfo->ndatums = ndatums;
544545
boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *));
545546
boundinfo->kind = NULL;
547+
boundinfo->interleaved_parts = NULL;
546548
boundinfo->nindexes = ndatums;
547549
boundinfo->indexes = (int *) palloc(ndatums * sizeof(int));
548550

@@ -607,6 +609,69 @@ create_list_bounds(PartitionBoundSpec **boundspecs, int nparts,
607609
boundinfo->default_index = (*mapping)[default_index];
608610
}
609611

612+
/*
613+
* Calculate interleaved partitions. Here we look for partitions which
614+
* might be interleaved with other partitions and set a bit in
615+
* interleaved_parts for any partitions which may be interleaved with
616+
* another partition.
617+
*/
618+
619+
/*
620+
* There must be multiple partitions to have any interleaved partitions,
621+
* otherwise there's nothing to interleave with.
622+
*/
623+
if (nparts > 1)
624+
{
625+
/*
626+
* Short-circuit check to see if only 1 Datum is allowed per
627+
* partition. When this is true there's no need to do the more
628+
* expensive checks to look for interleaved values.
629+
*/
630+
if (boundinfo->ndatums +
631+
partition_bound_accepts_nulls(boundinfo) +
632+
partition_bound_has_default(boundinfo) != nparts)
633+
{
634+
int last_index = -1;
635+
636+
/*
637+
* Since the indexes array is sorted in Datum order, if any
638+
* partitions are interleaved then it will show up by the
639+
* partition indexes not being in ascending order. Here we check
640+
* for that and record all partitions that are out of order.
641+
*/
642+
for (i = 0; i < boundinfo->nindexes; i++)
643+
{
644+
int index = boundinfo->indexes[i];
645+
646+
if (index < last_index)
647+
boundinfo->interleaved_parts = bms_add_member(boundinfo->interleaved_parts,
648+
index);
649+
650+
/*
651+
* Mark the NULL partition as interleaved if we find that it
652+
* allows some other non-NULL Datum.
653+
*/
654+
if (partition_bound_accepts_nulls(boundinfo) &&
655+
index == boundinfo->null_index)
656+
boundinfo->interleaved_parts = bms_add_member(boundinfo->interleaved_parts,
657+
boundinfo->null_index);
658+
659+
last_index = index;
660+
}
661+
}
662+
663+
/*
664+
* The DEFAULT partition is the "catch-all" partition that can contain
665+
* anything that does not belong to any other partition. If there are
666+
* any other partitions then the DEFAULT partition must be marked as
667+
* interleaved.
668+
*/
669+
if (partition_bound_has_default(boundinfo))
670+
boundinfo->interleaved_parts = bms_add_member(boundinfo->interleaved_parts,
671+
boundinfo->default_index);
672+
}
673+
674+
610675
/* All partitions must now have been assigned canonical indexes. */
611676
Assert(next_index == nparts);
612677
return boundinfo;
@@ -750,6 +815,7 @@ create_range_bounds(PartitionBoundSpec **boundspecs, int nparts,
750815
boundinfo->kind = (PartitionRangeDatumKind **)
751816
palloc(ndatums *
752817
sizeof(PartitionRangeDatumKind *));
818+
boundinfo->interleaved_parts = NULL;
753819

754820
/*
755821
* For range partitioning, an additional value of -1 is stored as the last
@@ -993,6 +1059,9 @@ partition_bounds_copy(PartitionBoundInfo src,
9931059
else
9941060
dest->kind = NULL;
9951061

1062+
/* copy interleaved partitions for LIST partitioned tables */
1063+
dest->interleaved_parts = bms_copy(src->interleaved_parts);
1064+
9961065
/*
9971066
* For hash partitioning, datums array will have two elements - modulus
9981067
* and remainder.
@@ -2780,13 +2849,15 @@ add_merged_range_bounds(int partnatts, FmgrInfo *partsupfuncs,
27802849
* that is partitions appearing earlier in the PartitionDesc sequence
27812850
* contain partition keys strictly less than those appearing later.
27822851
* Also, if NULL values are possible, they must come in the last
2783-
* partition defined in the PartitionDesc.
2852+
* partition defined in the PartitionDesc. 'live_parts' marks which
2853+
* partitions we should include when checking the ordering. Partitions
2854+
* that do not appear in 'live_parts' are ignored.
27842855
*
27852856
* If out of order, or there is insufficient info to know the order,
27862857
* then we return false.
27872858
*/
27882859
bool
2789-
partitions_are_ordered(PartitionBoundInfo boundinfo, int nparts)
2860+
partitions_are_ordered(PartitionBoundInfo boundinfo, Bitmapset *live_parts)
27902861
{
27912862
Assert(boundinfo != NULL);
27922863

@@ -2798,38 +2869,24 @@ partitions_are_ordered(PartitionBoundInfo boundinfo, int nparts)
27982869
* RANGE-type partitioning guarantees that the partitions can be
27992870
* scanned in the order that they're defined in the PartitionDesc
28002871
* to provide sequential, non-overlapping ranges of tuples.
2801-
* However, if a DEFAULT partition exists then it doesn't work, as
2802-
* that could contain tuples from either below or above the
2803-
* defined range, or tuples belonging to gaps between partitions.
2872+
* However, if a DEFAULT partition exists and it's contained
2873+
* within live_parts, then the partitions are not ordered.
28042874
*/
2805-
if (!partition_bound_has_default(boundinfo))
2875+
if (!partition_bound_has_default(boundinfo) ||
2876+
!bms_is_member(boundinfo->default_index, live_parts))
28062877
return true;
28072878
break;
28082879

28092880
case PARTITION_STRATEGY_LIST:
28102881

28112882
/*
2812-
* LIST partitioning can also guarantee ordering, but only if the
2813-
* partitions don't accept interleaved values. We could likely
2814-
* check for this by looping over the PartitionBound's indexes
2815-
* array to check that the indexes are in order. For now, let's
2816-
* just keep it simple and just accept LIST partitioning when
2817-
* there's no DEFAULT partition, exactly one value per partition,
2818-
* and optionally a NULL partition that does not accept any other
2819-
* values. Such a NULL partition will come last in the
2820-
* PartitionDesc, and the other partitions will be properly
2821-
* ordered. This is a cheap test to make as it does not require
2822-
* any per-partition processing. Maybe we'd like to handle more
2823-
* complex cases in the future.
2883+
* LIST partitioned are ordered providing none of live_parts
2884+
* overlap with the partitioned table's interleaved partitions.
28242885
*/
2825-
if (partition_bound_has_default(boundinfo))
2826-
return false;
2827-
2828-
if (boundinfo->ndatums + partition_bound_accepts_nulls(boundinfo)
2829-
== nparts)
2886+
if (!bms_overlap(live_parts, boundinfo->interleaved_parts))
28302887
return true;
2831-
break;
28322888

2889+
break;
28332890
default:
28342891
/* HASH, or some other strategy */
28352892
break;

src/include/partitioning/partbounds.h

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,18 @@ struct RelOptInfo; /* avoid including pathnodes.h here */
6161
* The indexes array is indexed according to the hash key's remainder modulo
6262
* the greatest modulus, and it contains either the partition index accepting
6363
* that remainder, or -1 if there is no partition for that remainder.
64+
*
65+
* For LIST partitioned tables, we track the partition indexes of partitions
66+
* which are possibly "interleaved" partitions. A partition is considered
67+
* interleaved if it allows multiple values and there exists at least one
68+
* other partition which could contain a value that lies between those values.
69+
* For example, if a partition exists FOR VALUES IN(3,5) and another partition
70+
* exists FOR VALUES IN (4), then the IN(3,5) partition is an interleaved
71+
* partition. The same is possible with DEFAULT partitions since they can
72+
* contain any value that does not belong in another partition. This field
73+
* only serves as proof that a particular partition is not interleaved, not
74+
* proof that it is interleaved. When we're uncertain, we marked the
75+
* partition as interleaved.
6476
*/
6577
typedef struct PartitionBoundInfoData
6678
{
@@ -70,6 +82,9 @@ typedef struct PartitionBoundInfoData
7082
PartitionRangeDatumKind **kind; /* The kind of each range bound datum;
7183
* NULL for hash and list partitioned
7284
* tables */
85+
Bitmapset *interleaved_parts; /* Partition indexes of partitions which
86+
* may be interleaved. See above. This is
87+
* only set for LIST partitioned tables */
7388
int nindexes; /* Length of the indexes[] array */
7489
int *indexes; /* Partition indexes */
7590
int null_index; /* Index of the null-accepting partition; -1
@@ -102,7 +117,8 @@ extern PartitionBoundInfo partition_bounds_merge(int partnatts,
102117
JoinType jointype,
103118
List **outer_parts,
104119
List **inner_parts);
105-
extern bool partitions_are_ordered(PartitionBoundInfo boundinfo, int nparts);
120+
extern bool partitions_are_ordered(PartitionBoundInfo boundinfo,
121+
Bitmapset *live_parts);
106122
extern void check_new_partition_bound(char *relname, Relation parent,
107123
PartitionBoundSpec *spec,
108124
ParseState *pstate);

src/test/regress/expected/inherit.out

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2180,6 +2180,8 @@ explain (costs off) select * from mcrparted where a < 20 order by a, abs(b), c;
21802180
Index Cond: (a < 20)
21812181
(9 rows)
21822182

2183+
set enable_bitmapscan to off;
2184+
set enable_sort to off;
21832185
create table mclparted (a int) partition by list(a);
21842186
create table mclparted1 partition of mclparted for values in(1);
21852187
create table mclparted2 partition of mclparted for values in(2);
@@ -2208,7 +2210,113 @@ explain (costs off) select * from mclparted order by a;
22082210
-> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_4
22092211
(6 rows)
22102212

2213+
explain (costs off) select * from mclparted where a in(3,4,5) order by a;
2214+
QUERY PLAN
2215+
----------------------------------------------------------------------------
2216+
Merge Append
2217+
Sort Key: mclparted.a
2218+
-> Index Only Scan using mclparted3_5_a_idx on mclparted3_5 mclparted_1
2219+
Index Cond: (a = ANY ('{3,4,5}'::integer[]))
2220+
-> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_2
2221+
Index Cond: (a = ANY ('{3,4,5}'::integer[]))
2222+
(6 rows)
2223+
2224+
-- Introduce a NULL and DEFAULT partition so we can test more complex cases
2225+
create table mclparted_null partition of mclparted for values in(null);
2226+
create table mclparted_def partition of mclparted default;
2227+
-- Append can be used providing we don't scan the interleaved partition
2228+
explain (costs off) select * from mclparted where a in(1,2,4) order by a;
2229+
QUERY PLAN
2230+
------------------------------------------------------------------------
2231+
Append
2232+
-> Index Only Scan using mclparted1_a_idx on mclparted1 mclparted_1
2233+
Index Cond: (a = ANY ('{1,2,4}'::integer[]))
2234+
-> Index Only Scan using mclparted2_a_idx on mclparted2 mclparted_2
2235+
Index Cond: (a = ANY ('{1,2,4}'::integer[]))
2236+
-> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_3
2237+
Index Cond: (a = ANY ('{1,2,4}'::integer[]))
2238+
(7 rows)
2239+
2240+
explain (costs off) select * from mclparted where a in(1,2,4) or a is null order by a;
2241+
QUERY PLAN
2242+
--------------------------------------------------------------------------------
2243+
Append
2244+
-> Index Only Scan using mclparted1_a_idx on mclparted1 mclparted_1
2245+
Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL))
2246+
-> Index Only Scan using mclparted2_a_idx on mclparted2 mclparted_2
2247+
Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL))
2248+
-> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_3
2249+
Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL))
2250+
-> Index Only Scan using mclparted_null_a_idx on mclparted_null mclparted_4
2251+
Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL))
2252+
(9 rows)
2253+
2254+
-- Test a more complex case where the NULL partition allows some other value
2255+
drop table mclparted_null;
2256+
create table mclparted_0_null partition of mclparted for values in(0,null);
2257+
-- Ensure MergeAppend is used since 0 and NULLs are in the same partition.
2258+
explain (costs off) select * from mclparted where a in(1,2,4) or a is null order by a;
2259+
QUERY PLAN
2260+
------------------------------------------------------------------------------------
2261+
Merge Append
2262+
Sort Key: mclparted.a
2263+
-> Index Only Scan using mclparted_0_null_a_idx on mclparted_0_null mclparted_1
2264+
Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL))
2265+
-> Index Only Scan using mclparted1_a_idx on mclparted1 mclparted_2
2266+
Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL))
2267+
-> Index Only Scan using mclparted2_a_idx on mclparted2 mclparted_3
2268+
Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL))
2269+
-> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_4
2270+
Filter: ((a = ANY ('{1,2,4}'::integer[])) OR (a IS NULL))
2271+
(10 rows)
2272+
2273+
explain (costs off) select * from mclparted where a in(0,1,2,4) order by a;
2274+
QUERY PLAN
2275+
------------------------------------------------------------------------------------
2276+
Merge Append
2277+
Sort Key: mclparted.a
2278+
-> Index Only Scan using mclparted_0_null_a_idx on mclparted_0_null mclparted_1
2279+
Index Cond: (a = ANY ('{0,1,2,4}'::integer[]))
2280+
-> Index Only Scan using mclparted1_a_idx on mclparted1 mclparted_2
2281+
Index Cond: (a = ANY ('{0,1,2,4}'::integer[]))
2282+
-> Index Only Scan using mclparted2_a_idx on mclparted2 mclparted_3
2283+
Index Cond: (a = ANY ('{0,1,2,4}'::integer[]))
2284+
-> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_4
2285+
Index Cond: (a = ANY ('{0,1,2,4}'::integer[]))
2286+
(10 rows)
2287+
2288+
-- Ensure Append is used when the null partition is pruned
2289+
explain (costs off) select * from mclparted where a in(1,2,4) order by a;
2290+
QUERY PLAN
2291+
------------------------------------------------------------------------
2292+
Append
2293+
-> Index Only Scan using mclparted1_a_idx on mclparted1 mclparted_1
2294+
Index Cond: (a = ANY ('{1,2,4}'::integer[]))
2295+
-> Index Only Scan using mclparted2_a_idx on mclparted2 mclparted_2
2296+
Index Cond: (a = ANY ('{1,2,4}'::integer[]))
2297+
-> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_3
2298+
Index Cond: (a = ANY ('{1,2,4}'::integer[]))
2299+
(7 rows)
2300+
2301+
-- Ensure MergeAppend is used when the default partition is not pruned
2302+
explain (costs off) select * from mclparted where a in(1,2,4,100) order by a;
2303+
QUERY PLAN
2304+
------------------------------------------------------------------------------
2305+
Merge Append
2306+
Sort Key: mclparted.a
2307+
-> Index Only Scan using mclparted1_a_idx on mclparted1 mclparted_1
2308+
Index Cond: (a = ANY ('{1,2,4,100}'::integer[]))
2309+
-> Index Only Scan using mclparted2_a_idx on mclparted2 mclparted_2
2310+
Index Cond: (a = ANY ('{1,2,4,100}'::integer[]))
2311+
-> Index Only Scan using mclparted4_a_idx on mclparted4 mclparted_3
2312+
Index Cond: (a = ANY ('{1,2,4,100}'::integer[]))
2313+
-> Index Only Scan using mclparted_def_a_idx on mclparted_def mclparted_4
2314+
Index Cond: (a = ANY ('{1,2,4,100}'::integer[]))
2315+
(10 rows)
2316+
22112317
drop table mclparted;
2318+
reset enable_sort;
2319+
reset enable_bitmapscan;
22122320
-- Ensure subplans which don't have a path with the correct pathkeys get
22132321
-- sorted correctly.
22142322
drop index mcrparted_a_abs_c_idx;

src/test/regress/sql/inherit.sql

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -775,6 +775,8 @@ explain (costs off) select a, abs(b) from mcrparted order by a, abs(b), c;
775775
-- during planning.
776776
explain (costs off) select * from mcrparted where a < 20 order by a, abs(b), c;
777777

778+
set enable_bitmapscan to off;
779+
set enable_sort to off;
778780
create table mclparted (a int) partition by list(a);
779781
create table mclparted1 partition of mclparted for values in(1);
780782
create table mclparted2 partition of mclparted for values in(2);
@@ -789,8 +791,33 @@ create table mclparted3_5 partition of mclparted for values in(3,5);
789791
create table mclparted4 partition of mclparted for values in(4);
790792

791793
explain (costs off) select * from mclparted order by a;
794+
explain (costs off) select * from mclparted where a in(3,4,5) order by a;
795+
796+
-- Introduce a NULL and DEFAULT partition so we can test more complex cases
797+
create table mclparted_null partition of mclparted for values in(null);
798+
create table mclparted_def partition of mclparted default;
799+
800+
-- Append can be used providing we don't scan the interleaved partition
801+
explain (costs off) select * from mclparted where a in(1,2,4) order by a;
802+
explain (costs off) select * from mclparted where a in(1,2,4) or a is null order by a;
803+
804+
-- Test a more complex case where the NULL partition allows some other value
805+
drop table mclparted_null;
806+
create table mclparted_0_null partition of mclparted for values in(0,null);
807+
808+
-- Ensure MergeAppend is used since 0 and NULLs are in the same partition.
809+
explain (costs off) select * from mclparted where a in(1,2,4) or a is null order by a;
810+
explain (costs off) select * from mclparted where a in(0,1,2,4) order by a;
811+
812+
-- Ensure Append is used when the null partition is pruned
813+
explain (costs off) select * from mclparted where a in(1,2,4) order by a;
814+
815+
-- Ensure MergeAppend is used when the default partition is not pruned
816+
explain (costs off) select * from mclparted where a in(1,2,4,100) order by a;
792817

793818
drop table mclparted;
819+
reset enable_sort;
820+
reset enable_bitmapscan;
794821

795822
-- Ensure subplans which don't have a path with the correct pathkeys get
796823
-- sorted correctly.

0 commit comments

Comments
 (0)