Skip to content

Commit b154ee6

Browse files
committed
Get rid of artificial restriction on hash table sizes on Windows.
The point of introducing the hash_mem_multiplier GUC was to let users reproduce the old behavior of hash aggregation, i.e. that it could use more than work_mem at need. However, the implementation failed to get the job done on Win64, where work_mem is clamped to 2GB to protect various places that calculate memory sizes using "long int". As written, the same clamp was applied to hash_mem. This resulted in severe performance regressions for queries requiring a bit more than 2GB for hash aggregation, as they now spill to disk and there's no way to stop that.

Getting rid of the work_mem restriction seems like a good idea, but it's a big job and could not conceivably be back-patched. However, there's only a fairly small number of places that are concerned with the hash_mem value, and it turns out to be possible to remove the restriction there without too much code churn or any ABI breaks. So, let's do that for now to fix the regression, and leave the larger task for another day.

This patch does introduce a bit more infrastructure that should help with the larger task, namely pg_bitutils.h support for working with size_t values.

Per gripe from Laurent Hasson. Back-patch to v13 where the behavior change came in.

Discussion: https://postgr.es/m/997817.1627074924@sss.pgh.pa.us
Discussion: https://postgr.es/m/MN2PR15MB25601E80A9B6D1BA6F592B1985E39@MN2PR15MB2560.namprd15.prod.outlook.com
1 parent 3d0a463 commit b154ee6

File tree

11 files changed

+172
-103
lines changed

11 files changed

+172
-103
lines changed

src/backend/executor/execGrouping.c

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -165,14 +165,16 @@ BuildTupleHashTableExt(PlanState *parent,
165165
{
166166
TupleHashTable hashtable;
167167
Size entrysize = sizeof(TupleHashEntryData) + additionalsize;
168-
int hash_mem = get_hash_mem();
168+
Size hash_mem_limit;
169169
MemoryContext oldcontext;
170170
bool allow_jit;
171171

172172
Assert(nbuckets > 0);
173173

174174
/* Limit initial table size request to not more than hash_mem */
175-
nbuckets = Min(nbuckets, (long) ((hash_mem * 1024L) / entrysize));
175+
hash_mem_limit = get_hash_memory_limit() / entrysize;
176+
if (nbuckets > hash_mem_limit)
177+
nbuckets = hash_mem_limit;
176178

177179
oldcontext = MemoryContextSwitchTo(metacxt);
178180

src/backend/executor/nodeAgg.c

Lines changed: 23 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -1801,15 +1801,15 @@ hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
18011801
{
18021802
int npartitions;
18031803
Size partition_mem;
1804-
int hash_mem = get_hash_mem();
1804+
Size hash_mem_limit = get_hash_memory_limit();
18051805

18061806
/* if not expected to spill, use all of hash_mem */
1807-
if (input_groups * hashentrysize < hash_mem * 1024L)
1807+
if (input_groups * hashentrysize <= hash_mem_limit)
18081808
{
18091809
if (num_partitions != NULL)
18101810
*num_partitions = 0;
1811-
*mem_limit = hash_mem * 1024L;
1812-
*ngroups_limit = *mem_limit / hashentrysize;
1811+
*mem_limit = hash_mem_limit;
1812+
*ngroups_limit = hash_mem_limit / hashentrysize;
18131813
return;
18141814
}
18151815

@@ -1834,10 +1834,10 @@ hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
18341834
* minimum number of partitions, so we aren't going to dramatically exceed
18351835
* work mem anyway.
18361836
*/
1837-
if (hash_mem * 1024L > 4 * partition_mem)
1838-
*mem_limit = hash_mem * 1024L - partition_mem;
1837+
if (hash_mem_limit > 4 * partition_mem)
1838+
*mem_limit = hash_mem_limit - partition_mem;
18391839
else
1840-
*mem_limit = hash_mem * 1024L * 0.75;
1840+
*mem_limit = hash_mem_limit * 0.75;
18411841

18421842
if (*mem_limit > hashentrysize)
18431843
*ngroups_limit = *mem_limit / hashentrysize;
@@ -1991,32 +1991,36 @@ static int
19911991
hash_choose_num_partitions(double input_groups, double hashentrysize,
19921992
int used_bits, int *log2_npartitions)
19931993
{
1994-
Size mem_wanted;
1995-
int partition_limit;
1994+
Size hash_mem_limit = get_hash_memory_limit();
1995+
double partition_limit;
1996+
double mem_wanted;
1997+
double dpartitions;
19961998
int npartitions;
19971999
int partition_bits;
1998-
int hash_mem = get_hash_mem();
19992000

20002001
/*
20012002
* Avoid creating so many partitions that the memory requirements of the
20022003
* open partition files are greater than 1/4 of hash_mem.
20032004
*/
20042005
partition_limit =
2005-
(hash_mem * 1024L * 0.25 - HASHAGG_READ_BUFFER_SIZE) /
2006+
(hash_mem_limit * 0.25 - HASHAGG_READ_BUFFER_SIZE) /
20062007
HASHAGG_WRITE_BUFFER_SIZE;
20072008

20082009
mem_wanted = HASHAGG_PARTITION_FACTOR * input_groups * hashentrysize;
20092010

20102011
/* make enough partitions so that each one is likely to fit in memory */
2011-
npartitions = 1 + (mem_wanted / (hash_mem * 1024L));
2012+
dpartitions = 1 + (mem_wanted / hash_mem_limit);
2013+
2014+
if (dpartitions > partition_limit)
2015+
dpartitions = partition_limit;
20122016

2013-
if (npartitions > partition_limit)
2014-
npartitions = partition_limit;
2017+
if (dpartitions < HASHAGG_MIN_PARTITIONS)
2018+
dpartitions = HASHAGG_MIN_PARTITIONS;
2019+
if (dpartitions > HASHAGG_MAX_PARTITIONS)
2020+
dpartitions = HASHAGG_MAX_PARTITIONS;
20152021

2016-
if (npartitions < HASHAGG_MIN_PARTITIONS)
2017-
npartitions = HASHAGG_MIN_PARTITIONS;
2018-
if (npartitions > HASHAGG_MAX_PARTITIONS)
2019-
npartitions = HASHAGG_MAX_PARTITIONS;
2022+
/* HASHAGG_MAX_PARTITIONS limit makes this safe */
2023+
npartitions = (int) dpartitions;
20202024

20212025
/* ceil(log2(npartitions)) */
20222026
partition_bits = my_log2(npartitions);
@@ -2029,7 +2033,7 @@ hash_choose_num_partitions(double input_groups, double hashentrysize,
20292033
*log2_npartitions = partition_bits;
20302034

20312035
/* number of partitions will be a power of two */
2032-
npartitions = 1L << partition_bits;
2036+
npartitions = 1 << partition_bits;
20332037

20342038
return npartitions;
20352039
}

src/backend/executor/nodeHash.c

Lines changed: 84 additions & 60 deletions
Original file line number | Diff line number | Diff line change
@@ -675,15 +675,12 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
675675
{
676676
int tupsize;
677677
double inner_rel_bytes;
678-
long bucket_bytes;
679-
long hash_table_bytes;
680-
long skew_table_bytes;
681-
long max_pointers;
682-
long mppow2;
678+
size_t hash_table_bytes;
679+
size_t bucket_bytes;
680+
size_t max_pointers;
683681
int nbatch = 1;
684682
int nbuckets;
685683
double dbuckets;
686-
int hash_mem = get_hash_mem();
687684

688685
/* Force a plausible relation size if no info */
689686
if (ntuples <= 0.0)
@@ -700,17 +697,24 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
700697
inner_rel_bytes = ntuples * tupsize;
701698

702699
/*
703-
* Target in-memory hashtable size is hash_mem kilobytes.
700+
* Compute in-memory hashtable size limit from GUCs.
704701
*/
705-
hash_table_bytes = hash_mem * 1024L;
702+
hash_table_bytes = get_hash_memory_limit();
706703

707704
/*
708705
* Parallel Hash tries to use the combined hash_mem of all workers to
709706
* avoid the need to batch. If that won't work, it falls back to hash_mem
710707
* per worker and tries to process batches in parallel.
711708
*/
712709
if (try_combined_hash_mem)
713-
hash_table_bytes += hash_table_bytes * parallel_workers;
710+
{
711+
/* Careful, this could overflow size_t */
712+
double newlimit;
713+
714+
newlimit = (double) hash_table_bytes * (double) (parallel_workers + 1);
715+
newlimit = Min(newlimit, (double) SIZE_MAX);
716+
hash_table_bytes = (size_t) newlimit;
717+
}
714718

715719
*space_allowed = hash_table_bytes;
716720

@@ -730,53 +734,68 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
730734
*/
731735
if (useskew)
732736
{
733-
skew_table_bytes = hash_table_bytes * SKEW_HASH_MEM_PERCENT / 100;
737+
size_t bytes_per_mcv;
738+
size_t skew_mcvs;
734739

735740
/*----------
741+
* Compute number of MCVs we could hold in hash_table_bytes
742+
*
736743
* Divisor is:
737744
* size of a hash tuple +
738745
* worst-case size of skewBucket[] per MCV +
739746
* size of skewBucketNums[] entry +
740747
* size of skew bucket struct itself
741748
*----------
742749
*/
743-
*num_skew_mcvs = skew_table_bytes / (tupsize +
744-
(8 * sizeof(HashSkewBucket *)) +
745-
sizeof(int) +
746-
SKEW_BUCKET_OVERHEAD);
747-
if (*num_skew_mcvs > 0)
748-
hash_table_bytes -= skew_table_bytes;
750+
bytes_per_mcv = tupsize +
751+
(8 * sizeof(HashSkewBucket *)) +
752+
sizeof(int) +
753+
SKEW_BUCKET_OVERHEAD;
754+
skew_mcvs = hash_table_bytes / bytes_per_mcv;
755+
756+
/*
757+
* Now scale by SKEW_HASH_MEM_PERCENT (we do it in this order so as
758+
* not to worry about size_t overflow in the multiplication)
759+
*/
760+
skew_mcvs = (skew_mcvs * SKEW_HASH_MEM_PERCENT) / 100;
761+
762+
/* Now clamp to integer range */
763+
skew_mcvs = Min(skew_mcvs, INT_MAX);
764+
765+
*num_skew_mcvs = (int) skew_mcvs;
766+
767+
/* Reduce hash_table_bytes by the amount needed for the skew table */
768+
if (skew_mcvs > 0)
769+
hash_table_bytes -= skew_mcvs * bytes_per_mcv;
749770
}
750771
else
751772
*num_skew_mcvs = 0;
752773

753774
/*
754775
* Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when
755776
* memory is filled, assuming a single batch; but limit the value so that
756-
* the pointer arrays we'll try to allocate do not exceed hash_mem nor
757-
* MaxAllocSize.
777+
* the pointer arrays we'll try to allocate do not exceed hash_table_bytes
778+
* nor MaxAllocSize.
758779
*
759780
* Note that both nbuckets and nbatch must be powers of 2 to make
760781
* ExecHashGetBucketAndBatch fast.
761782
*/
762-
max_pointers = *space_allowed / sizeof(HashJoinTuple);
783+
max_pointers = hash_table_bytes / sizeof(HashJoinTuple);
763784
max_pointers = Min(max_pointers, MaxAllocSize / sizeof(HashJoinTuple));
764785
/* If max_pointers isn't a power of 2, must round it down to one */
765-
mppow2 = 1L << my_log2(max_pointers);
766-
if (max_pointers != mppow2)
767-
max_pointers = mppow2 / 2;
786+
max_pointers = pg_prevpower2_size_t(max_pointers);
768787

769788
/* Also ensure we avoid integer overflow in nbatch and nbuckets */
770789
/* (this step is redundant given the current value of MaxAllocSize) */
771-
max_pointers = Min(max_pointers, INT_MAX / 2);
790+
max_pointers = Min(max_pointers, INT_MAX / 2 + 1);
772791

773792
dbuckets = ceil(ntuples / NTUP_PER_BUCKET);
774793
dbuckets = Min(dbuckets, max_pointers);
775794
nbuckets = (int) dbuckets;
776795
/* don't let nbuckets be really small, though ... */
777796
nbuckets = Max(nbuckets, 1024);
778797
/* ... and force it to be a power of 2. */
779-
nbuckets = 1 << my_log2(nbuckets);
798+
nbuckets = pg_nextpower2_32(nbuckets);
780799

781800
/*
782801
* If there's not enough space to store the projected number of tuples and
@@ -786,10 +805,10 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
786805
if (inner_rel_bytes + bucket_bytes > hash_table_bytes)
787806
{
788807
/* We'll need multiple batches */
789-
long lbuckets;
808+
size_t sbuckets;
790809
double dbatch;
791810
int minbatch;
792-
long bucket_size;
811+
size_t bucket_size;
793812

794813
/*
795814
* If Parallel Hash with combined hash_mem would still need multiple
@@ -813,10 +832,10 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
813832
* overhead for the hash code, pointer to the next tuple, etc.
814833
*/
815834
bucket_size = (tupsize * NTUP_PER_BUCKET + sizeof(HashJoinTuple));
816-
lbuckets = 1L << my_log2(hash_table_bytes / bucket_size);
817-
lbuckets = Min(lbuckets, max_pointers);
818-
nbuckets = (int) lbuckets;
819-
nbuckets = 1 << my_log2(nbuckets);
835+
sbuckets = pg_nextpower2_size_t(hash_table_bytes / bucket_size);
836+
sbuckets = Min(sbuckets, max_pointers);
837+
nbuckets = (int) sbuckets;
838+
nbuckets = pg_nextpower2_32(nbuckets);
820839
bucket_bytes = nbuckets * sizeof(HashJoinTuple);
821840

822841
/*
@@ -1097,14 +1116,12 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable)
10971116
/* Figure out how many batches to use. */
10981117
if (hashtable->nbatch == 1)
10991118
{
1100-
int hash_mem = get_hash_mem();
1101-
11021119
/*
11031120
* We are going from single-batch to multi-batch. We need
11041121
* to switch from one large combined memory budget to the
11051122
* regular hash_mem budget.
11061123
*/
1107-
pstate->space_allowed = hash_mem * 1024L;
1124+
pstate->space_allowed = get_hash_memory_limit();
11081125

11091126
/*
11101127
* The combined hash_mem of all participants wasn't
@@ -1113,7 +1130,7 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable)
11131130
* insufficient. So try two batches per participant,
11141131
* rounded up to a power of two.
11151132
*/
1116-
new_nbatch = 1 << my_log2(pstate->nparticipants * 2);
1133+
new_nbatch = pg_nextpower2_32(pstate->nparticipants * 2);
11171134
}
11181135
else
11191136
{
@@ -1152,7 +1169,7 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable)
11521169
MaxAllocSize / sizeof(dsa_pointer_atomic));
11531170
new_nbuckets = (int) dbuckets;
11541171
new_nbuckets = Max(new_nbuckets, 1024);
1155-
new_nbuckets = 1 << my_log2(new_nbuckets);
1172+
new_nbuckets = pg_nextpower2_32(new_nbuckets);
11561173
dsa_free(hashtable->area, old_batch0->buckets);
11571174
hashtable->batches[0].shared->buckets =
11581175
dsa_allocate(hashtable->area,
@@ -3372,39 +3389,46 @@ ExecParallelHashTuplePrealloc(HashJoinTable hashtable, int batchno, size_t size)
33723389
}
33733390

33743391
/*
3375-
* Get a hash_mem value by multiplying the work_mem GUC's value by the
3376-
* hash_mem_multiplier GUC's value.
3377-
*
3378-
* Returns a work_mem style KB value that hash-based nodes (including but not
3379-
* limited to hash join) use in place of work_mem. This is subject to the
3380-
* same restrictions as work_mem itself. (There is no such thing as the
3381-
* hash_mem GUC, but it's convenient for our callers to pretend that there
3382-
* is.)
3392+
* Calculate the limit on how much memory can be used by Hash and similar
3393+
* plan types. This is work_mem times hash_mem_multiplier, and is
3394+
* expressed in bytes.
33833395
*
3384-
* Exported for use by the planner, as well as other hash-based executor
3396+
* Exported for use by the planner, as well as other hash-like executor
33853397
* nodes. This is a rather random place for this, but there is no better
33863398
* place.
33873399
*/
3400+
size_t
3401+
get_hash_memory_limit(void)
3402+
{
3403+
double mem_limit;
3404+
3405+
/* Do initial calculation in double arithmetic */
3406+
mem_limit = (double) work_mem * hash_mem_multiplier * 1024.0;
3407+
3408+
/* Clamp in case it doesn't fit in size_t */
3409+
mem_limit = Min(mem_limit, (double) SIZE_MAX);
3410+
3411+
return (size_t) mem_limit;
3412+
}
3413+
3414+
/*
3415+
* Convert the hash memory limit to an integer number of kilobytes,
3416+
* that is something comparable to work_mem. Like work_mem, we clamp
3417+
* the result to ensure that multiplying it by 1024 fits in a long int.
3418+
*
3419+
* This is deprecated since it may understate the actual memory limit.
3420+
* It is unused in core and will eventually be removed.
3421+
*/
33883422
int
33893423
get_hash_mem(void)
33903424
{
3391-
double hash_mem;
3392-
3393-
Assert(hash_mem_multiplier >= 1.0);
3425+
size_t mem_limit = get_hash_memory_limit();
33943426

3395-
hash_mem = (double) work_mem * hash_mem_multiplier;
3427+
/* Remove the kilobyte factor */
3428+
mem_limit /= 1024;
33963429

3397-
/*
3398-
* guc.c enforces a MAX_KILOBYTES limitation on work_mem in order to
3399-
* support the assumption that raw derived byte values can be stored in
3400-
* 'long' variables. The returned hash_mem value must also meet this
3401-
* assumption.
3402-
*
3403-
* We clamp the final value rather than throw an error because it should
3404-
* be possible to set work_mem and hash_mem_multiplier independently.
3405-
*/
3406-
if (hash_mem < MAX_KILOBYTES)
3407-
return (int) hash_mem;
3430+
/* Clamp to MAX_KILOBYTES, like work_mem */
3431+
mem_limit = Min(mem_limit, (size_t) MAX_KILOBYTES);
34083432

3409-
return MAX_KILOBYTES;
3433+
return (int) mem_limit;
34103434
}

src/backend/executor/nodeMemoize.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -905,7 +905,7 @@ ExecInitMemoize(Memoize *node, EState *estate, int eflags)
905905
mstate->mem_used = 0;
906906

907907
/* Limit the total memory consumed by the cache to this */
908-
mstate->mem_limit = get_hash_mem() * 1024L;
908+
mstate->mem_limit = get_hash_memory_limit();
909909

910910
/* A memory context dedicated for the cache */
911911
mstate->tableContext = AllocSetContextCreate(CurrentMemoryContext,

0 commit comments

Comments (0)