Skip to content

Commit ca5b42d

Browse files
committed
Fix some issues in new hashtable size calculations in nodeHash.c.
Limit the size of the hashtable pointer array to not more than MaxAllocSize, per reports from Kouhei Kaigai and others of "invalid memory alloc request size" failures. There was discussion of allowing the array to get larger than that by using the "huge" palloc API, but so far no proof that that is actually a good idea, and at this point in the 9.5 cycle major changes from old behavior don't seem like the way to go. Fix a rather serious secondary bug in the new code, which was that it didn't ensure nbuckets remained a power of 2 when recomputing it for the multiple-batch case. Clean up sloppy division of labor between ExecHashIncreaseNumBuckets and its sole call site.
1 parent 544ccf6 commit ca5b42d

File tree

2 files changed

+31
-31
lines changed

2 files changed

+31
-31
lines changed

src/backend/executor/nodeHash.c

+30-30
Original file line number | Diff line number | Diff line change
@@ -131,17 +131,7 @@ MultiExecHash(HashState *node)
131131

132132
/* resize the hash table if needed (NTUP_PER_BUCKET exceeded) */
133133
if (hashtable->nbuckets != hashtable->nbuckets_optimal)
134-
{
135-
/* We never decrease the number of buckets. */
136-
Assert(hashtable->nbuckets_optimal > hashtable->nbuckets);
137-
138-
#ifdef HJDEBUG
139-
printf("Increasing nbuckets %d => %d\n",
140-
hashtable->nbuckets, hashtable->nbuckets_optimal);
141-
#endif
142-
143134
ExecHashIncreaseNumBuckets(hashtable);
144-
}
145135

146136
/* Account for the buckets in spaceUsed (reported in EXPLAIN ANALYZE) */
147137
hashtable->spaceUsed += hashtable->nbuckets * sizeof(HashJoinTuple);
@@ -486,23 +476,31 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
486476

487477
/*
488478
* Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when
489-
* memory is filled, assuming a single batch. The Min() step limits the
490-
* results so that the pointer arrays we'll try to allocate do not exceed
491-
* work_mem.
479+
* memory is filled, assuming a single batch; but limit the value so that
480+
* the pointer arrays we'll try to allocate do not exceed work_mem nor
481+
* MaxAllocSize.
482+
*
483+
* Note that both nbuckets and nbatch must be powers of 2 to make
484+
* ExecHashGetBucketAndBatch fast.
492485
*/
493-
max_pointers = (work_mem * 1024L) / sizeof(void *);
486+
max_pointers = (work_mem * 1024L) / sizeof(HashJoinTuple);
487+
max_pointers = Min(max_pointers, MaxAllocSize / sizeof(HashJoinTuple));
494488
/* also ensure we avoid integer overflow in nbatch and nbuckets */
489+
/* (this step is redundant given the current value of MaxAllocSize) */
495490
max_pointers = Min(max_pointers, INT_MAX / 2);
491+
496492
dbuckets = ceil(ntuples / NTUP_PER_BUCKET);
497493
dbuckets = Min(dbuckets, max_pointers);
494+
/* don't let nbuckets be really small, though ... */
498495
nbuckets = Max((int) dbuckets, 1024);
496+
/* ... and force it to be a power of 2. */
499497
nbuckets = 1 << my_log2(nbuckets);
500-
bucket_bytes = sizeof(HashJoinTuple) * nbuckets;
501498

502499
/*
503500
* If there's not enough space to store the projected number of tuples and
504501
* the required bucket headers, we will need multiple batches.
505502
*/
503+
bucket_bytes = sizeof(HashJoinTuple) * nbuckets;
506504
if (inner_rel_bytes + bucket_bytes > hash_table_bytes)
507505
{
508506
/* We'll need multiple batches */
@@ -521,6 +519,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
521519
lbuckets = 1L << my_log2(hash_table_bytes / bucket_size);
522520
lbuckets = Min(lbuckets, max_pointers);
523521
nbuckets = (int) lbuckets;
522+
nbuckets = 1 << my_log2(nbuckets);
524523
bucket_bytes = nbuckets * sizeof(HashJoinTuple);
525524

526525
/*
@@ -760,21 +759,18 @@ ExecHashIncreaseNumBuckets(HashJoinTable hashtable)
760759
if (hashtable->nbuckets >= hashtable->nbuckets_optimal)
761760
return;
762761

763-
/*
764-
* We already know the optimal number of buckets, so let's just compute
765-
* the log2_nbuckets for it.
766-
*/
762+
#ifdef HJDEBUG
763+
printf("Increasing nbuckets %d => %d\n",
764+
hashtable->nbuckets, hashtable->nbuckets_optimal);
765+
#endif
766+
767767
hashtable->nbuckets = hashtable->nbuckets_optimal;
768-
hashtable->log2_nbuckets = my_log2(hashtable->nbuckets_optimal);
768+
hashtable->log2_nbuckets = hashtable->log2_nbuckets_optimal;
769769

770770
Assert(hashtable->nbuckets > 1);
771771
Assert(hashtable->nbuckets <= (INT_MAX / 2));
772772
Assert(hashtable->nbuckets == (1 << hashtable->log2_nbuckets));
773773

774-
#ifdef HJDEBUG
775-
printf("Increasing nbuckets to %d\n", hashtable->nbuckets);
776-
#endif
777-
778774
/*
779775
* Just reallocate the proper number of buckets - we don't need to walk
780776
* through them - we can walk the dense-allocated chunks (just like in
@@ -785,7 +781,7 @@ ExecHashIncreaseNumBuckets(HashJoinTable hashtable)
785781
(HashJoinTuple *) repalloc(hashtable->buckets,
786782
hashtable->nbuckets * sizeof(HashJoinTuple));
787783

788-
memset(hashtable->buckets, 0, sizeof(void *) * hashtable->nbuckets);
784+
memset(hashtable->buckets, 0, hashtable->nbuckets * sizeof(HashJoinTuple));
789785

790786
/* scan through all tuples in all chunks to rebuild the hash table */
791787
for (chunk = hashtable->chunks; chunk != NULL; chunk = chunk->next)
@@ -878,12 +874,16 @@ ExecHashTableInsert(HashJoinTable hashtable,
878874
* NTUP_PER_BUCKET threshold, but only when there's still a single
879875
* batch.
880876
*/
881-
if ((hashtable->nbatch == 1) &&
882-
(hashtable->nbuckets_optimal <= INT_MAX / 2) && /* overflow protection */
883-
(ntuples >= (hashtable->nbuckets_optimal * NTUP_PER_BUCKET)))
877+
if (hashtable->nbatch == 1 &&
878+
ntuples > (hashtable->nbuckets_optimal * NTUP_PER_BUCKET))
884879
{
885-
hashtable->nbuckets_optimal *= 2;
886-
hashtable->log2_nbuckets_optimal += 1;
880+
/* Guard against integer overflow and alloc size overflow */
881+
if (hashtable->nbuckets_optimal <= INT_MAX / 2 &&
882+
hashtable->nbuckets_optimal * 2 <= MaxAllocSize / sizeof(HashJoinTuple))
883+
{
884+
hashtable->nbuckets_optimal *= 2;
885+
hashtable->log2_nbuckets_optimal += 1;
886+
}
887887
}
888888

889889
/* Account for space used, and back off if we've used too much */

src/include/executor/hashjoin.h

+1-1
Original file line number | Diff line number | Diff line change
@@ -131,7 +131,7 @@ typedef struct HashJoinTableData
131131
int nbuckets_original; /* # buckets when starting the first
132132
* hash */
133133
int nbuckets_optimal; /* optimal # buckets (per batch) */
134-
int log2_nbuckets_optimal; /* same as log2_nbuckets optimal */
134+
int log2_nbuckets_optimal; /* log2(nbuckets_optimal) */
135135

136136
/* buckets[i] is head of list of tuples in i'th in-memory bucket */
137137
struct HashJoinTupleData **buckets;

0 commit comments

Comments (0)