Skip to content

Commit 6f94531

Browse files
committed
Fix hash partition pruning with asymmetric partition sets.
perform_pruning_combine_step() was not taught about the number of partition indexes used in hash partitioning; more embarrassingly, get_matching_hash_bounds() also had it wrong. These errors are masked in the common case where all the partitions have the same modulus and no partition is missing. However, with missing or unequal-size partitions, we could erroneously prune some partitions that need to be scanned, leading to silently wrong query answers. While a minimal-footprint fix for this could be to export get_partition_bound_num_indexes and make the incorrect functions use it, I'm of the opinion that that function should never have existed in the first place. It's not reasonable data structure design that PartitionBoundInfoData lacks any explicit record of the length of its indexes[] array. Perhaps that was all right when it could always be assumed equal to ndatums, but something should have been done about it as soon as that stopped being true. Putting in an explicit "nindexes" field makes both partition_bounds_equal() and partition_bounds_copy() simpler, safer, and faster than before, and removes explicit knowledge of the number-of-partition-indexes rules from some other places too. This change also makes get_hash_partition_greatest_modulus obsolete. I left that in place in case any external code uses it, but no core code does anymore. Per bug #16840 from Michał Albrycht. Back-patch to v11 where the hash partitioning code came in. (In the back branches, add the new field at the end of PartitionBoundInfoData to minimize ABI risks.) Discussion: https://postgr.es/m/16840-571a22976f829ad4@postgresql.org
1 parent ee62fca commit 6f94531

File tree

7 files changed

+124
-140
lines changed

7 files changed

+124
-140
lines changed

src/backend/executor/execPartition.c

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1159,15 +1159,13 @@ get_partition_for_tuple(Relation relation, Datum *values, bool *isnull)
11591159
{
11601160
case PARTITION_STRATEGY_HASH:
11611161
{
1162-
int greatest_modulus;
11631162
uint64 rowHash;
11641163

1165-
greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
11661164
rowHash = compute_partition_hash_value(key->partnatts,
11671165
key->partsupfunc,
11681166
values, isnull);
11691167

1170-
part_index = boundinfo->indexes[rowHash % greatest_modulus];
1168+
part_index = boundinfo->indexes[rowHash % boundinfo->nindexes];
11711169
}
11721170
break;
11731171

src/backend/partitioning/partbounds.c

Lines changed: 26 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@
3636
#include "utils/ruleutils.h"
3737
#include "utils/syscache.h"
3838

39-
static int get_partition_bound_num_indexes(PartitionBoundInfo b);
4039
static Expr *make_partition_op_expr(PartitionKey key, int keynum,
4140
uint16 strategy, Expr *arg1, Expr *arg2);
4241
static Oid get_partition_operator(PartitionKey key, int col,
@@ -112,45 +111,41 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval,
112111
if (b1->ndatums != b2->ndatums)
113112
return false;
114113

114+
if (b1->nindexes != b2->nindexes)
115+
return false;
116+
115117
if (b1->null_index != b2->null_index)
116118
return false;
117119

118120
if (b1->default_index != b2->default_index)
119121
return false;
120122

121-
if (b1->strategy == PARTITION_STRATEGY_HASH)
123+
/* For all partition strategies, the indexes[] arrays have to match */
124+
for (i = 0; i < b1->nindexes; i++)
122125
{
123-
int greatest_modulus = get_hash_partition_greatest_modulus(b1);
124-
125-
/*
126-
* If two hash partitioned tables have different greatest moduli,
127-
* their partition schemes don't match.
128-
*/
129-
if (greatest_modulus != get_hash_partition_greatest_modulus(b2))
126+
if (b1->indexes[i] != b2->indexes[i])
130127
return false;
128+
}
131129

130+
/* Finally, compare the datums[] arrays */
131+
if (b1->strategy == PARTITION_STRATEGY_HASH)
132+
{
132133
/*
133134
* We arrange the partitions in the ascending order of their moduli
134135
* and remainders. Also every modulus is factor of next larger
135136
* modulus. Therefore we can safely store index of a given partition
136137
* in indexes array at remainder of that partition. Also entries at
137138
* (remainder + N * modulus) positions in indexes array are all same
138-
* for (modulus, remainder) specification for any partition. Thus
139-
* datums array from both the given bounds are same, if and only if
140-
* their indexes array will be same. So, it suffices to compare
141-
* indexes array.
142-
*/
143-
for (i = 0; i < greatest_modulus; i++)
144-
if (b1->indexes[i] != b2->indexes[i])
145-
return false;
146-
147-
#ifdef USE_ASSERT_CHECKING
148-
149-
/*
150-
* Nonetheless make sure that the bounds are indeed same when the
139+
* for (modulus, remainder) specification for any partition. Thus the
140+
* datums arrays from the given bounds are the same, if and only if
141+
* their indexes arrays are the same. So, it suffices to compare the
142+
* indexes arrays.
143+
*
144+
* Nonetheless make sure that the bounds are indeed the same when the
151145
* indexes match. Hash partition bound stores modulus and remainder
152146
* at b1->datums[i][0] and b1->datums[i][1] position respectively.
153147
*/
148+
#ifdef USE_ASSERT_CHECKING
154149
for (i = 0; i < b1->ndatums; i++)
155150
Assert((b1->datums[i][0] == b2->datums[i][0] &&
156151
b1->datums[i][1] == b2->datums[i][1]));
@@ -196,15 +191,7 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval,
196191
parttypbyval[j], parttyplen[j]))
197192
return false;
198193
}
199-
200-
if (b1->indexes[i] != b2->indexes[i])
201-
return false;
202194
}
203-
204-
/* There are ndatums+1 indexes in case of range partitions */
205-
if (b1->strategy == PARTITION_STRATEGY_RANGE &&
206-
b1->indexes[i] != b2->indexes[i])
207-
return false;
208195
}
209196
return true;
210197
}
@@ -220,17 +207,16 @@ partition_bounds_copy(PartitionBoundInfo src,
220207
PartitionBoundInfo dest;
221208
int i;
222209
int ndatums;
210+
int nindexes;
223211
int partnatts;
224-
int num_indexes;
225212

226213
dest = (PartitionBoundInfo) palloc(sizeof(PartitionBoundInfoData));
227214

228215
dest->strategy = src->strategy;
229216
ndatums = dest->ndatums = src->ndatums;
217+
nindexes = dest->nindexes = src->nindexes;
230218
partnatts = key->partnatts;
231219

232-
num_indexes = get_partition_bound_num_indexes(src);
233-
234220
/* List partitioned tables have only a single partition key. */
235221
Assert(key->strategy != PARTITION_STRATEGY_LIST || partnatts == 1);
236222

@@ -288,8 +274,8 @@ partition_bounds_copy(PartitionBoundInfo src,
288274
}
289275
}
290276

291-
dest->indexes = (int *) palloc(sizeof(int) * num_indexes);
292-
memcpy(dest->indexes, src->indexes, sizeof(int) * num_indexes);
277+
dest->indexes = (int *) palloc(sizeof(int) * nindexes);
278+
memcpy(dest->indexes, src->indexes, sizeof(int) * nindexes);
293279

294280
dest->null_index = src->null_index;
295281
dest->default_index = src->default_index;
@@ -389,7 +375,7 @@ check_new_partition_bound(char *relname, Relation parent,
389375
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
390376
errmsg("every hash partition modulus must be a factor of the next larger modulus")));
391377

392-
greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
378+
greatest_modulus = boundinfo->nindexes;
393379
remainder = spec->remainder;
394380

395381
/*
@@ -756,18 +742,15 @@ check_default_partition_contents(Relation parent, Relation default_rel,
756742
/*
757743
* get_hash_partition_greatest_modulus
758744
*
759-
* Returns the greatest modulus of the hash partition bound. The greatest
760-
* modulus will be at the end of the datums array because hash partitions are
761-
* arranged in the ascending order of their moduli and remainders.
745+
* Returns the greatest modulus of the hash partition bound.
746+
* This is no longer used in the core code, but we keep it around
747+
* in case external modules are using it.
762748
*/
763749
int
764750
get_hash_partition_greatest_modulus(PartitionBoundInfo bound)
765751
{
766752
Assert(bound && bound->strategy == PARTITION_STRATEGY_HASH);
767-
Assert(bound->datums && bound->ndatums > 0);
768-
Assert(DatumGetInt32(bound->datums[bound->ndatums - 1][0]) > 0);
769-
770-
return DatumGetInt32(bound->datums[bound->ndatums - 1][0]);
753+
return bound->nindexes;
771754
}
772755

773756
/*
@@ -1115,46 +1098,6 @@ partition_hash_bsearch(PartitionBoundInfo boundinfo,
11151098
return lo;
11161099
}
11171100

1118-
/*
1119-
* get_partition_bound_num_indexes
1120-
*
1121-
* Returns the number of the entries in the partition bound indexes array.
1122-
*/
1123-
static int
1124-
get_partition_bound_num_indexes(PartitionBoundInfo bound)
1125-
{
1126-
int num_indexes;
1127-
1128-
Assert(bound);
1129-
1130-
switch (bound->strategy)
1131-
{
1132-
case PARTITION_STRATEGY_HASH:
1133-
1134-
/*
1135-
* The number of the entries in the indexes array is same as the
1136-
* greatest modulus.
1137-
*/
1138-
num_indexes = get_hash_partition_greatest_modulus(bound);
1139-
break;
1140-
1141-
case PARTITION_STRATEGY_LIST:
1142-
num_indexes = bound->ndatums;
1143-
break;
1144-
1145-
case PARTITION_STRATEGY_RANGE:
1146-
/* Range partitioned table has an extra index. */
1147-
num_indexes = bound->ndatums + 1;
1148-
break;
1149-
1150-
default:
1151-
elog(ERROR, "unexpected partition strategy: %d",
1152-
(int) bound->strategy);
1153-
}
1154-
1155-
return num_indexes;
1156-
}
1157-
11581101
/*
11591102
* get_partition_operator
11601103
*

src/backend/partitioning/partprune.c

Lines changed: 11 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -769,7 +769,10 @@ get_matching_partitions(PartitionPruneContext *context, List *pruning_steps)
769769
scan_default = final_result->scan_default;
770770
while ((i = bms_next_member(final_result->bound_offsets, i)) >= 0)
771771
{
772-
int partindex = context->boundinfo->indexes[i];
772+
int partindex;
773+
774+
Assert(i < context->boundinfo->nindexes);
775+
partindex = context->boundinfo->indexes[i];
773776

774777
if (partindex < 0)
775778
{
@@ -2497,20 +2500,19 @@ get_matching_hash_bounds(PartitionPruneContext *context,
24972500
for (i = 0; i < partnatts; i++)
24982501
isnull[i] = bms_is_member(i, nullkeys);
24992502

2500-
greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
25012503
rowHash = compute_partition_hash_value(partnatts, partsupfunc,
25022504
values, isnull);
25032505

2506+
greatest_modulus = boundinfo->nindexes;
25042507
if (partindices[rowHash % greatest_modulus] >= 0)
25052508
result->bound_offsets =
25062509
bms_make_singleton(rowHash % greatest_modulus);
25072510
}
25082511
else
25092512
{
2510-
/* Getting here means at least one hash partition exists. */
2511-
Assert(boundinfo->ndatums > 0);
2513+
/* Report all valid offsets into the boundinfo->indexes array. */
25122514
result->bound_offsets = bms_add_range(NULL, 0,
2513-
boundinfo->ndatums - 1);
2515+
boundinfo->nindexes - 1);
25142516
}
25152517

25162518
/*
@@ -3371,30 +3373,20 @@ perform_pruning_combine_step(PartitionPruneContext *context,
33713373
PartitionPruneStepCombine *cstep,
33723374
PruneStepResult **step_results)
33733375
{
3374-
ListCell *lc1;
3375-
PruneStepResult *result = NULL;
3376+
PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult));
33763377
bool firststep;
3378+
ListCell *lc1;
33773379

33783380
/*
33793381
* A combine step without any source steps is an indication to not perform
33803382
* any partition pruning. Return all datum indexes in that case.
33813383
*/
3382-
result = (PruneStepResult *) palloc0(sizeof(PruneStepResult));
3383-
if (list_length(cstep->source_stepids) == 0)
3384+
if (cstep->source_stepids == NIL)
33843385
{
33853386
PartitionBoundInfo boundinfo = context->boundinfo;
3386-
int rangemax;
3387-
3388-
/*
3389-
* Add all valid offsets into the boundinfo->indexes array. For range
3390-
* partitioning, boundinfo->indexes contains (boundinfo->ndatums + 1)
3391-
* valid entries; otherwise there are boundinfo->ndatums.
3392-
*/
3393-
rangemax = context->strategy == PARTITION_STRATEGY_RANGE ?
3394-
boundinfo->ndatums : boundinfo->ndatums - 1;
33953387

33963388
result->bound_offsets =
3397-
bms_add_range(result->bound_offsets, 0, rangemax);
3389+
bms_add_range(NULL, 0, boundinfo->nindexes - 1);
33983390
result->scan_default = partition_bound_has_default(boundinfo);
33993391
result->scan_null = partition_bound_accepts_nulls(boundinfo);
34003392
return result;

src/backend/utils/cache/partcache.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -622,6 +622,7 @@ RelationBuildPartitionDesc(Relation rel)
622622
/* Moduli are stored in ascending order */
623623
int greatest_modulus = hbounds[ndatums - 1]->modulus;
624624

625+
boundinfo->nindexes = greatest_modulus;
625626
boundinfo->indexes = (int *) palloc(greatest_modulus *
626627
sizeof(int));
627628

@@ -655,6 +656,7 @@ RelationBuildPartitionDesc(Relation rel)
655656

656657
case PARTITION_STRATEGY_LIST:
657658
{
659+
boundinfo->nindexes = ndatums;
658660
boundinfo->indexes = (int *) palloc(ndatums * sizeof(int));
659661

660662
/*
@@ -719,6 +721,7 @@ RelationBuildPartitionDesc(Relation rel)
719721
boundinfo->kind = (PartitionRangeDatumKind **)
720722
palloc(ndatums *
721723
sizeof(PartitionRangeDatumKind *));
724+
boundinfo->nindexes = ndatums + 1;
722725
boundinfo->indexes = (int *) palloc((ndatums + 1) *
723726
sizeof(int));
724727

src/include/partitioning/partbounds.h

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
* In the case of range partitioning, ndatums will typically be far less than
3131
* 2 * nparts, because a partition's upper bound and the next partition's lower
3232
* bound are the same in most common cases, and we only store one of them (the
33-
* upper bound). In case of hash partitioning, ndatums will be same as the
33+
* upper bound). In case of hash partitioning, ndatums will be the same as the
3434
* number of partitions.
3535
*
3636
* For range and list partitioned tables, datums is an array of datum-tuples
@@ -46,20 +46,26 @@
4646
* the partition key's operator classes and collations.
4747
*
4848
* In the case of list partitioning, the indexes array stores one entry for
49-
* every datum, which is the index of the partition that accepts a given datum.
50-
* In case of range partitioning, it stores one entry per distinct range
51-
* datum, which is the index of the partition for which a given datum
52-
* is an upper bound. In the case of hash partitioning, the number of the
53-
* entries in the indexes array is same as the greatest modulus amongst all
54-
* partitions. For a given partition key datum-tuple, the index of the
55-
* partition which would accept that datum-tuple would be given by the entry
56-
* pointed by remainder produced when hash value of the datum-tuple is divided
57-
* by the greatest modulus.
49+
* each datum-array entry, which is the index of the partition that accepts
50+
* rows matching that datum. So nindexes == ndatums.
51+
*
52+
* In the case of range partitioning, the indexes array stores one entry per
53+
* distinct range datum, which is the index of the partition for which that
54+
* datum is an upper bound (or -1 for a "gap" that has no partition). It is
55+
* convenient to have an extra -1 entry representing values above the last
56+
* range datum, so nindexes == ndatums + 1.
57+
*
58+
* In the case of hash partitioning, the number of entries in the indexes
59+
* array is the same as the greatest modulus amongst all partitions (which
60+
* is a multiple of all partition moduli), so nindexes == greatest modulus.
61+
* The indexes array is indexed according to the hash key's remainder modulo
62+
* the greatest modulus, and it contains either the partition index accepting
63+
* that remainder, or -1 if there is no partition for that remainder.
5864
*/
5965
typedef struct PartitionBoundInfoData
6066
{
6167
char strategy; /* hash, list or range? */
62-
int ndatums; /* Length of the datums following array */
68+
int ndatums; /* Length of the datums[] array */
6369
Datum **datums;
6470
PartitionRangeDatumKind **kind; /* The kind of each range bound datum;
6571
* NULL for hash and list partitioned
@@ -69,6 +75,7 @@ typedef struct PartitionBoundInfoData
6975
* if there isn't one */
7076
int default_index; /* Index of the default partition; -1 if there
7177
* isn't one */
78+
int nindexes; /* Length of the indexes[] array */
7279
} PartitionBoundInfoData;
7380

7481
#define partition_bound_accepts_nulls(bi) ((bi)->null_index != -1)

0 commit comments

Comments
 (0)