Skip to content

Commit b154ee6

Browse files
committed
Get rid of artificial restriction on hash table sizes on Windows.
The point of introducing the hash_mem_multiplier GUC was to let users reproduce the old behavior of hash aggregation, i.e. that it could use more than work_mem at need. However, the implementation failed to get the job done on Win64, where work_mem is clamped to 2GB to protect various places that calculate memory sizes using "long int". As written, the same clamp was applied to hash_mem. This resulted in severe performance regressions for queries requiring a bit more than 2GB for hash aggregation, as they now spill to disk and there's no way to stop that.

Getting rid of the work_mem restriction seems like a good idea, but it's a big job and could not conceivably be back-patched. However, there's only a fairly small number of places that are concerned with the hash_mem value, and it turns out to be possible to remove the restriction there without too much code churn or any ABI breaks. So, let's do that for now to fix the regression, and leave the larger task for another day.

This patch does introduce a bit more infrastructure that should help with the larger task, namely pg_bitutils.h support for working with size_t values.

Per gripe from Laurent Hasson. Back-patch to v13 where the behavior change came in.

Discussion: https://postgr.es/m/997817.1627074924@sss.pgh.pa.us
Discussion: https://postgr.es/m/MN2PR15MB25601E80A9B6D1BA6F592B1985E39@MN2PR15MB2560.namprd15.prod.outlook.com
1 parent 3d0a463 commit b154ee6

File tree

11 files changed

+172
-103
lines changed

11 files changed

+172
-103
lines changed

src/backend/executor/execGrouping.c

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -165,14 +165,16 @@ BuildTupleHashTableExt(PlanState *parent,
165165
{
166166
TupleHashTable hashtable;
167167
Size entrysize = sizeof(TupleHashEntryData) + additionalsize;
168-
int hash_mem = get_hash_mem();
168+
Size hash_mem_limit;
169169
MemoryContext oldcontext;
170170
bool allow_jit;
171171

172172
Assert(nbuckets > 0);
173173

174174
/* Limit initial table size request to not more than hash_mem */
175-
nbuckets = Min(nbuckets, (long) ((hash_mem * 1024L) / entrysize));
175+
hash_mem_limit = get_hash_memory_limit() / entrysize;
176+
if (nbuckets > hash_mem_limit)
177+
nbuckets = hash_mem_limit;
176178

177179
oldcontext = MemoryContextSwitchTo(metacxt);
178180

src/backend/executor/nodeAgg.c

Lines changed: 23 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -1801,15 +1801,15 @@ hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
18011801
{
18021802
int npartitions;
18031803
Size partition_mem;
1804-
int hash_mem = get_hash_mem();
1804+
Size hash_mem_limit = get_hash_memory_limit();
18051805

18061806
/* if not expected to spill, use all of hash_mem */
1807-
if (input_groups * hashentrysize < hash_mem * 1024L)
1807+
if (input_groups * hashentrysize <= hash_mem_limit)
18081808
{
18091809
if (num_partitions != NULL)
18101810
*num_partitions = 0;
1811-
*mem_limit = hash_mem * 1024L;
1812-
*ngroups_limit = *mem_limit / hashentrysize;
1811+
*mem_limit = hash_mem_limit;
1812+
*ngroups_limit = hash_mem_limit / hashentrysize;
18131813
return;
18141814
}
18151815

@@ -1834,10 +1834,10 @@ hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
18341834
* minimum number of partitions, so we aren't going to dramatically exceed
18351835
* work mem anyway.
18361836
*/
1837-
if (hash_mem * 1024L > 4 * partition_mem)
1838-
*mem_limit = hash_mem * 1024L - partition_mem;
1837+
if (hash_mem_limit > 4 * partition_mem)
1838+
*mem_limit = hash_mem_limit - partition_mem;
18391839
else
1840-
*mem_limit = hash_mem * 1024L * 0.75;
1840+
*mem_limit = hash_mem_limit * 0.75;
18411841

18421842
if (*mem_limit > hashentrysize)
18431843
*ngroups_limit = *mem_limit / hashentrysize;
@@ -1991,32 +1991,36 @@ static int
19911991
hash_choose_num_partitions(double input_groups, double hashentrysize,
19921992
int used_bits, int *log2_npartitions)
19931993
{
1994-
Size mem_wanted;
1995-
int partition_limit;
1994+
Size hash_mem_limit = get_hash_memory_limit();
1995+
double partition_limit;
1996+
double mem_wanted;
1997+
double dpartitions;
19961998
int npartitions;
19971999
int partition_bits;
1998-
int hash_mem = get_hash_mem();
19992000

20002001
/*
20012002
* Avoid creating so many partitions that the memory requirements of the
20022003
* open partition files are greater than 1/4 of hash_mem.
20032004
*/
20042005
partition_limit =
2005-
(hash_mem * 1024L * 0.25 - HASHAGG_READ_BUFFER_SIZE) /
2006+
(hash_mem_limit * 0.25 - HASHAGG_READ_BUFFER_SIZE) /
20062007
HASHAGG_WRITE_BUFFER_SIZE;
20072008

20082009
mem_wanted = HASHAGG_PARTITION_FACTOR * input_groups * hashentrysize;
20092010

20102011
/* make enough partitions so that each one is likely to fit in memory */
2011-
npartitions = 1 + (mem_wanted / (hash_mem * 1024L));
2012+
dpartitions = 1 + (mem_wanted / hash_mem_limit);
2013+
2014+
if (dpartitions > partition_limit)
2015+
dpartitions = partition_limit;
20122016

2013-
if (npartitions > partition_limit)
2014-
npartitions = partition_limit;
2017+
if (dpartitions < HASHAGG_MIN_PARTITIONS)
2018+
dpartitions = HASHAGG_MIN_PARTITIONS;
2019+
if (dpartitions > HASHAGG_MAX_PARTITIONS)
2020+
dpartitions = HASHAGG_MAX_PARTITIONS;
20152021

2016-
if (npartitions < HASHAGG_MIN_PARTITIONS)
2017-
npartitions = HASHAGG_MIN_PARTITIONS;
2018-
if (npartitions > HASHAGG_MAX_PARTITIONS)
2019-
npartitions = HASHAGG_MAX_PARTITIONS;
2022+
/* HASHAGG_MAX_PARTITIONS limit makes this safe */
2023+
npartitions = (int) dpartitions;
20202024

20212025
/* ceil(log2(npartitions)) */
20222026
partition_bits = my_log2(npartitions);
@@ -2029,7 +2033,7 @@ hash_choose_num_partitions(double input_groups, double hashentrysize,
20292033
*log2_npartitions = partition_bits;
20302034

20312035
/* number of partitions will be a power of two */
2032-
npartitions = 1L << partition_bits;
2036+
npartitions = 1 << partition_bits;
20332037

20342038
return npartitions;
20352039
}

src/backend/executor/nodeHash.c

Lines changed: 84 additions & 60 deletions
Original file line number | Diff line number | Diff line change
@@ -675,15 +675,12 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
675675
{
676676
int tupsize;
677677
double inner_rel_bytes;
678-
long bucket_bytes;
679-
long hash_table_bytes;
680-
long skew_table_bytes;
681-
long max_pointers;
682-
long mppow2;
678+
size_t hash_table_bytes;
679+
size_t bucket_bytes;
680+
size_t max_pointers;
683681
int nbatch = 1;
684682
int nbuckets;
685683
double dbuckets;
686-
int hash_mem = get_hash_mem();
687684

688685
/* Force a plausible relation size if no info */
689686
if (ntuples <= 0.0)
@@ -700,17 +697,24 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
700697
inner_rel_bytes = ntuples * tupsize;
701698

702699
/*
703-
* Target in-memory hashtable size is hash_mem kilobytes.
700+
* Compute in-memory hashtable size limit from GUCs.
704701
*/
705-
hash_table_bytes = hash_mem * 1024L;
702+
hash_table_bytes = get_hash_memory_limit();
706703

707704
/*
708705
* Parallel Hash tries to use the combined hash_mem of all workers to
709706
* avoid the need to batch. If that won't work, it falls back to hash_mem
710707
* per worker and tries to process batches in parallel.
711708
*/
712709
if (try_combined_hash_mem)
713-
hash_table_bytes += hash_table_bytes * parallel_workers;
710+
{
711+
/* Careful, this could overflow size_t */
712+
double newlimit;
713+
714+
newlimit = (double) hash_table_bytes * (double) (parallel_workers + 1);
715+
newlimit = Min(newlimit, (double) SIZE_MAX);
716+
hash_table_bytes = (size_t) newlimit;
717+
}
714718

715719
*space_allowed = hash_table_bytes;
716720

@@ -730,53 +734,68 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
730734
*/
731735
if (useskew)
732736
{
733-
skew_table_bytes = hash_table_bytes * SKEW_HASH_MEM_PERCENT / 100;
737+
size_t bytes_per_mcv;
738+
size_t skew_mcvs;
734739

735740
/*----------
741+
* Compute number of MCVs we could hold in hash_table_bytes
742+
*
736743
* Divisor is:
737744
* size of a hash tuple +
738745
* worst-case size of skewBucket[] per MCV +
739746
* size of skewBucketNums[] entry +
740747
* size of skew bucket struct itself
741748
*----------
742749
*/
743-
*num_skew_mcvs = skew_table_bytes / (tupsize +
744-
(8 * sizeof(HashSkewBucket *)) +
745-
sizeof(int) +
746-
SKEW_BUCKET_OVERHEAD);
747-
if (*num_skew_mcvs > 0)
748-
hash_table_bytes -= skew_table_bytes;
750+
bytes_per_mcv = tupsize +
751+
(8 * sizeof(HashSkewBucket *)) +
752+
sizeof(int) +
753+
SKEW_BUCKET_OVERHEAD;
754+
skew_mcvs = hash_table_bytes / bytes_per_mcv;
755+
756+
/*
757+
* Now scale by SKEW_HASH_MEM_PERCENT (we do it in this order so as
758+
* not to worry about size_t overflow in the multiplication)
759+
*/
760+
skew_mcvs = (skew_mcvs * SKEW_HASH_MEM_PERCENT) / 100;
761+
762+
/* Now clamp to integer range */
763+
skew_mcvs = Min(skew_mcvs, INT_MAX);
764+
765+
*num_skew_mcvs = (int) skew_mcvs;
766+
767+
/* Reduce hash_table_bytes by the amount needed for the skew table */
768+
if (skew_mcvs > 0)
769+
hash_table_bytes -= skew_mcvs * bytes_per_mcv;
749770
}
750771
else
751772
*num_skew_mcvs = 0;
752773

753774
/*
754775
* Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when
755776
* memory is filled, assuming a single batch; but limit the value so that
756-
* the pointer arrays we'll try to allocate do not exceed hash_mem nor
757-
* MaxAllocSize.
777+
* the pointer arrays we'll try to allocate do not exceed hash_table_bytes
778+
* nor MaxAllocSize.
758779
*
759780
* Note that both nbuckets and nbatch must be powers of 2 to make
760781
* ExecHashGetBucketAndBatch fast.
761782
*/
762-
max_pointers = *space_allowed / sizeof(HashJoinTuple);
783+
max_pointers = hash_table_bytes / sizeof(HashJoinTuple);
763784
max_pointers = Min(max_pointers, MaxAllocSize / sizeof(HashJoinTuple));
764785
/* If max_pointers isn't a power of 2, must round it down to one */
765-
mppow2 = 1L << my_log2(max_pointers);
766-
if (max_pointers != mppow2)
767-
max_pointers = mppow2 / 2;
786+
max_pointers = pg_prevpower2_size_t(max_pointers);
768787

769788
/* Also ensure we avoid integer overflow in nbatch and nbuckets */
770789
/* (this step is redundant given the current value of MaxAllocSize) */
771-
max_pointers = Min(max_pointers, INT_MAX / 2);
790+
max_pointers = Min(max_pointers, INT_MAX / 2 + 1);
772791

773792
dbuckets = ceil(ntuples / NTUP_PER_BUCKET);
774793
dbuckets = Min(dbuckets, max_pointers);
775794
nbuckets = (int) dbuckets;
776795
/* don't let nbuckets be really small, though ... */
777796
nbuckets = Max(nbuckets, 1024);
778797
/* ... and force it to be a power of 2. */
779-
nbuckets = 1 << my_log2(nbuckets);
798+
nbuckets = pg_nextpower2_32(nbuckets);
780799

781800
/*
782801
* If there's not enough space to store the projected number of tuples and
@@ -786,10 +805,10 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
786805
if (inner_rel_bytes + bucket_bytes > hash_table_bytes)
787806
{
788807
/* We'll need multiple batches */
789-
long lbuckets;
808+
size_t sbuckets;
790809
double dbatch;
791810
int minbatch;
792-
long bucket_size;
811+
size_t bucket_size;
793812

794813
/*
795814
* If Parallel Hash with combined hash_mem would still need multiple
@@ -813,10 +832,10 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
813832
* overhead for the hash code, pointer to the next tuple, etc.
814833
*/
815834
bucket_size = (tupsize * NTUP_PER_BUCKET + sizeof(HashJoinTuple));
816-
lbuckets = 1L << my_log2(hash_table_bytes / bucket_size);
817-
lbuckets = Min(lbuckets, max_pointers);
818-
nbuckets = (int) lbuckets;
819-
nbuckets = 1 << my_log2(nbuckets);
835+
sbuckets = pg_nextpower2_size_t(hash_table_bytes / bucket_size);
836+
sbuckets = Min(sbuckets, max_pointers);
837+
nbuckets = (int) sbuckets;
838+
nbuckets = pg_nextpower2_32(nbuckets);
820839
bucket_bytes = nbuckets * sizeof(HashJoinTuple);
821840

822841
/*
@@ -1097,14 +1116,12 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable)
10971116
/* Figure out how many batches to use. */
10981117
if (hashtable->nbatch == 1)
10991118
{
1100-
int hash_mem = get_hash_mem();
1101-
11021119
/*
11031120
* We are going from single-batch to multi-batch. We need
11041121
* to switch from one large combined memory budget to the
11051122
* regular hash_mem budget.
11061123
*/
1107-
pstate->space_allowed = hash_mem * 1024L;
1124+
pstate->space_allowed = get_hash_memory_limit();
11081125

11091126
/*
11101127
* The combined hash_mem of all participants wasn't
@@ -1113,7 +1130,7 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable)
11131130
* insufficient. So try two batches per participant,
11141131
* rounded up to a power of two.
11151132
*/
1116-
new_nbatch = 1 << my_log2(pstate->nparticipants * 2);
1133+
new_nbatch = pg_nextpower2_32(pstate->nparticipants * 2);
11171134
}
11181135
else
11191136
{
@@ -1152,7 +1169,7 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable)
11521169
MaxAllocSize / sizeof(dsa_pointer_atomic));
11531170
new_nbuckets = (int) dbuckets;
11541171
new_nbuckets = Max(new_nbuckets, 1024);
1155-
new_nbuckets = 1 << my_log2(new_nbuckets);
1172+
new_nbuckets = pg_nextpower2_32(new_nbuckets);
11561173
dsa_free(hashtable->area, old_batch0->buckets);
11571174
hashtable->batches[0].shared->buckets =
11581175
dsa_allocate(hashtable->area,
@@ -3372,39 +3389,46 @@ ExecParallelHashTuplePrealloc(HashJoinTable hashtable, int batchno, size_t size)
33723389
}
33733390

33743391
/*
3375-
* Get a hash_mem value by multiplying the work_mem GUC's value by the
3376-
* hash_mem_multiplier GUC's value.
3377-
*
3378-
* Returns a work_mem style KB value that hash-based nodes (including but not
3379-
* limited to hash join) use in place of work_mem. This is subject to the
3380-
* same restrictions as work_mem itself. (There is no such thing as the
3381-
* hash_mem GUC, but it's convenient for our callers to pretend that there
3382-
* is.)
3392+
* Calculate the limit on how much memory can be used by Hash and similar
3393+
* plan types. This is work_mem times hash_mem_multiplier, and is
3394+
* expressed in bytes.
33833395
*
3384-
* Exported for use by the planner, as well as other hash-based executor
3396+
* Exported for use by the planner, as well as other hash-like executor
33853397
* nodes. This is a rather random place for this, but there is no better
33863398
* place.
33873399
*/
3400+
size_t
3401+
get_hash_memory_limit(void)
3402+
{
3403+
double mem_limit;
3404+
3405+
/* Do initial calculation in double arithmetic */
3406+
mem_limit = (double) work_mem * hash_mem_multiplier * 1024.0;
3407+
3408+
/* Clamp in case it doesn't fit in size_t */
3409+
mem_limit = Min(mem_limit, (double) SIZE_MAX);
3410+
3411+
return (size_t) mem_limit;
3412+
}
3413+
3414+
/*
3415+
* Convert the hash memory limit to an integer number of kilobytes,
3416+
* that is something comparable to work_mem. Like work_mem, we clamp
3417+
* the result to ensure that multiplying it by 1024 fits in a long int.
3418+
*
3419+
* This is deprecated since it may understate the actual memory limit.
3420+
* It is unused in core and will eventually be removed.
3421+
*/
33883422
int
33893423
get_hash_mem(void)
33903424
{
3391-
double hash_mem;
3392-
3393-
Assert(hash_mem_multiplier >= 1.0);
3425+
size_t mem_limit = get_hash_memory_limit();
33943426

3395-
hash_mem = (double) work_mem * hash_mem_multiplier;
3427+
/* Remove the kilobyte factor */
3428+
mem_limit /= 1024;
33963429

3397-
/*
3398-
* guc.c enforces a MAX_KILOBYTES limitation on work_mem in order to
3399-
* support the assumption that raw derived byte values can be stored in
3400-
* 'long' variables. The returned hash_mem value must also meet this
3401-
* assumption.
3402-
*
3403-
* We clamp the final value rather than throw an error because it should
3404-
* be possible to set work_mem and hash_mem_multiplier independently.
3405-
*/
3406-
if (hash_mem < MAX_KILOBYTES)
3407-
return (int) hash_mem;
3430+
/* Clamp to MAX_KILOBYTES, like work_mem */
3431+
mem_limit = Min(mem_limit, (size_t) MAX_KILOBYTES);
34083432

3409-
return MAX_KILOBYTES;
3433+
return (int) mem_limit;
34103434
}

src/backend/executor/nodeMemoize.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -905,7 +905,7 @@ ExecInitMemoize(Memoize *node, EState *estate, int eflags)
905905
mstate->mem_used = 0;
906906

907907
/* Limit the total memory consumed by the cache to this */
908-
mstate->mem_limit = get_hash_mem() * 1024L;
908+
mstate->mem_limit = get_hash_memory_limit();
909909

910910
/* A memory context dedicated for the cache */
911911
mstate->tableContext = AllocSetContextCreate(CurrentMemoryContext,

0 commit comments

Comments (0)