tglsfdc · tglsfdc · commit 1f559b7d3aa4 · 2007-06-01T15:33:19.000Z

Fix several hash functions that were taking chintzy shortcuts instead of
delivering a well-randomized hash value.  I got religion on this after
observing that performance of multi-batch hash join degrades terribly if the
higher-order bits of hash values aren't random, as indeed was true for say
hashes of small integer values.  It's now expected and documented that hash
functions should use hash_any or some comparable method to ensure that all
bits of their output are about equally random.

initdb forced because this change invalidates existing hash indexes.  For the
same reason, this isn't back-patchable; the hash join performance problem
will get a band-aid fix in the back branches.
diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c
@@ -1,19 +1,26 @@
 /*-------------------------------------------------------------------------
  *
  * hashfunc.c
- *	  Comparison functions for hash access method.
+ *	  Support functions for hash access method.
  *
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashfunc.c,v 1.51 2007/04/02 03:49:37 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashfunc.c,v 1.52 2007/06/01 15:33:18 tgl Exp $
  *
  * NOTES
  *	  These functions are stored in pg_amproc.	For each operator class
- *	  defined on hash tables, they compute the hash value of the argument.
+ *	  defined for hash indexes, they compute the hash value of the argument.
  *
+ *	  Additional hash functions appear in /utils/adt/ files for various
+ *	  specialized datatypes.
+ *
+ *	  It is expected that every bit of a hash function's 32-bit result is
+ *	  as random as every other; failure to ensure this is likely to lead
+ *	  to poor performance of hash joins, for example.  In most cases a hash
+ *	  function should use hash_any() or its variant hash_uint32().
  *-------------------------------------------------------------------------
  */
 
@@ -26,19 +33,19 @@
 Datum
 hashchar(PG_FUNCTION_ARGS)
 {
-	PG_RETURN_UINT32(~((uint32) PG_GETARG_CHAR(0)));
+	return hash_uint32((int32) PG_GETARG_CHAR(0));
 }
 
 Datum
 hashint2(PG_FUNCTION_ARGS)
 {
-	PG_RETURN_UINT32(~((uint32) PG_GETARG_INT16(0)));
+	return hash_uint32((int32) PG_GETARG_INT16(0));
 }
 
 Datum
 hashint4(PG_FUNCTION_ARGS)
 {
-	PG_RETURN_UINT32(~PG_GETARG_UINT32(0));
+	return hash_uint32(PG_GETARG_INT32(0));
 }
 
 Datum
@@ -59,23 +66,23 @@ hashint8(PG_FUNCTION_ARGS)
 
 	lohalf ^= (val >= 0) ? hihalf : ~hihalf;
 
-	PG_RETURN_UINT32(~lohalf);
+	return hash_uint32(lohalf);
 #else
 	/* here if we can't count on "x >> 32" to work sanely */
-	PG_RETURN_UINT32(~((uint32) PG_GETARG_INT64(0)));
+	return hash_uint32((int32) PG_GETARG_INT64(0));
 #endif
 }
 
 Datum
 hashoid(PG_FUNCTION_ARGS)
 {
-	PG_RETURN_UINT32(~((uint32) PG_GETARG_OID(0)));
+	return hash_uint32((uint32) PG_GETARG_OID(0));
 }
 
 Datum
 hashenum(PG_FUNCTION_ARGS)
 {
-    PG_RETURN_UINT32(~((uint32) PG_GETARG_OID(0)));
+	return hash_uint32((uint32) PG_GETARG_OID(0));
 }
 
 Datum
@@ -283,6 +290,31 @@ hash_any(register const unsigned char *k, register int keylen)
 			/* case 0: nothing left to add */
 	}
 	mix(a, b, c);
+
+	/* report the result */
+	return UInt32GetDatum(c);
+}
+
+/*
+ * hash_uint32() -- hash a 32-bit value
+ *
+ * This has the same result (at least on little-endian machines) as
+ *		hash_any(&k, sizeof(uint32))
+ * but is faster and doesn't force the caller to store k into memory.
+ */
+Datum
+hash_uint32(uint32 k)
+{
+	register uint32 a,
+				b,
+				c;
+
+	a = 0x9e3779b9 + k;
+	b = 0x9e3779b9;
+	c = 3923095 + (uint32) sizeof(uint32);
+
+	mix(a, b, c);
+
 	/* report the result */
 	return UInt32GetDatum(c);
 }
diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c
@@ -14,13 +14,14 @@
  * Copyright (c) 2003-2007, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/nodes/bitmapset.c,v 1.12 2007/01/05 22:19:29 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/nodes/bitmapset.c,v 1.13 2007/06/01 15:33:18 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 
 #include "nodes/bitmapset.h"
+#include "access/hash.h"
 
 
 #define WORDNUM(x)	((x) / BITS_PER_BITMAPWORD)
@@ -769,36 +770,23 @@ bms_first_member(Bitmapset *a)
  *
  * Note: we must ensure that any two bitmapsets that are bms_equal() will
  * hash to the same value; in practice this means that trailing all-zero
- * words cannot affect the result.	The circular-shift-and-XOR hash method
- * used here has this property, so long as we work from back to front.
- *
- * Note: you might wonder why we bother with the circular shift; at first
- * glance a straight longitudinal XOR seems as good and much simpler.  The
- * reason is empirical: this gives a better distribution of hash values on
- * the bitmapsets actually generated by the planner.  A common way to have
- * multiword bitmapsets is "a JOIN b JOIN c JOIN d ...", which gives rise
- * to rangetables in which base tables and JOIN nodes alternate; so
- * bitmapsets of base table RT indexes tend to use only odd-numbered or only
- * even-numbered bits.	A straight longitudinal XOR would preserve this
- * property, leading to a much smaller set of possible outputs than if
- * we include a shift.
+ * words must not affect the result.  Hence we strip those before applying
+ * hash_any().
  */
 uint32
 bms_hash_value(const Bitmapset *a)
 {
-	bitmapword	result = 0;
-	int			wordnum;
+	int			lastword;
 
-	if (a == NULL || a->nwords <= 0)
+	if (a == NULL)
 		return 0;				/* All empty sets hash to 0 */
-	for (wordnum = a->nwords; --wordnum > 0;)
+	for (lastword = a->nwords; --lastword >= 0;)
 	{
-		result ^= a->words[wordnum];
-		if (result & ((bitmapword) 1 << (BITS_PER_BITMAPWORD - 1)))
-			result = (result << 1) | 1;
-		else
-			result = (result << 1);
+		if (a->words[lastword] != 0)
+			break;
 	}
-	result ^= a->words[0];
-	return (uint32) result;
+	if (lastword < 0)
+		return 0;				/* All empty sets hash to 0 */
+	return DatumGetUInt32(hash_any((const unsigned char *) a->words,
+								   (lastword + 1) * sizeof(bitmapword)));
 }
diff --git a/src/backend/utils/hash/hashfn.c b/src/backend/utils/hash/hashfn.c
@@ -9,7 +9,13 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/hash/hashfn.c,v 1.30 2007/01/05 22:19:43 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/hash/hashfn.c,v 1.31 2007/06/01 15:33:18 tgl Exp $
+ *
+ * NOTES
+ *	  It is expected that every bit of a hash function's 32-bit result is
+ *	  as random as every other; failure to ensure this is likely to lead
+ *	  to poor performance of hash tables.  In most cases a hash
+ *	  function should use hash_any() or its variant hash_uint32().
  *
  *-------------------------------------------------------------------------
  */
@@ -58,8 +64,7 @@ uint32
 oid_hash(const void *key, Size keysize)
 {
 	Assert(keysize == sizeof(Oid));
-	/* We don't actually bother to do anything to the OID value ... */
-	return (uint32) *((const Oid *) key);
+	return DatumGetUInt32(hash_uint32((uint32) *((const Oid *) key)));
 }
 
 /*
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.81 2007/05/30 20:12:02 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.82 2007/06/01 15:33:18 tgl Exp $
  *
  * NOTES
  *		modeled after Margo Seltzer's hash implementation for unix.
@@ -265,6 +265,7 @@ extern Datum hashname(PG_FUNCTION_ARGS);
 extern Datum hashtext(PG_FUNCTION_ARGS);
 extern Datum hashvarlena(PG_FUNCTION_ARGS);
 extern Datum hash_any(register const unsigned char *k, register int keylen);
+extern Datum hash_uint32(uint32 k);
 
 /* private routines */
 
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
@@ -37,7 +37,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.407 2007/05/21 17:10:29 petere Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.408 2007/06/01 15:33:19 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -53,6 +53,6 @@
  */
 
 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	200705211
+#define CATALOG_VERSION_NO	200706011
 
 #endif

Original file line number	Diff line number	Diff line change
`@@ -9,7 +9,13 @@`
`9`	`9`	`*`
`10`	`10`	`*`
`11`	`11`	`* IDENTIFICATION`
`12`		`- * $PostgreSQL: pgsql/src/backend/utils/hash/hashfn.c,v 1.30 2007/01/05 22:19:43 momjian Exp $`
	`12`	`+ * $PostgreSQL: pgsql/src/backend/utils/hash/hashfn.c,v 1.31 2007/06/01 15:33:18 tgl Exp $`
	`13`	`+ *`
	`14`	`+ * NOTES`
	`15`	`+ * It is expected that every bit of a hash function's 32-bit result is`
	`16`	`+ * as random as every other; failure to ensure this is likely to lead`
	`17`	`+ * to poor performance of hash tables. In most cases a hash`
	`18`	`+ * function should use hash_any() or its variant hash_uint32().`
`13`	`19`	`*`
`14`	`20`	`*-------------------------------------------------------------------------`
`15`	`21`	`*/`
`@@ -58,8 +64,7 @@ uint32`
`58`	`64`	`oid_hash(const void *key, Size keysize)`
`59`	`65`	`{`
`60`	`66`	`Assert(keysize == sizeof(Oid));`
`61`		`- /* We don't actually bother to do anything to the OID value ... */`
`62`		`- return (uint32) *((const Oid *) key);`
	`67`	`+ return DatumGetUInt32(hash_uint32((uint32) *((const Oid *) key)));`
`63`	`68`	`}`
`64`	`69`
`65`	`70`	`/*`
Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@`
`7`	`7`	`* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group`
`8`	`8`	`* Portions Copyright (c) 1994, Regents of the University of California`
`9`	`9`	`*`
`10`		`- * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.81 2007/05/30 20:12:02 tgl Exp $`
	`10`	`+ * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.82 2007/06/01 15:33:18 tgl Exp $`
`11`	`11`	`*`
`12`	`12`	`* NOTES`
`13`	`13`	`* modeled after Margo Seltzer's hash implementation for unix.`
`@@ -265,6 +265,7 @@ extern Datum hashname(PG_FUNCTION_ARGS);`
`265`	`265`	`extern Datum hashtext(PG_FUNCTION_ARGS);`
`266`	`266`	`extern Datum hashvarlena(PG_FUNCTION_ARGS);`
`267`	`267`	`extern Datum hash_any(register const unsigned char *k, register int keylen);`
	`268`	`+extern Datum hash_uint32(uint32 k);`
`268`	`269`
`269`	`270`	`/* private routines */`
`270`	`271`
Original file line number	Diff line number	Diff line change
`@@ -37,7 +37,7 @@`
`37`	`37`	`* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group`
`38`	`38`	`* Portions Copyright (c) 1994, Regents of the University of California`
`39`	`39`	`*`
`40`		`- * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.407 2007/05/21 17:10:29 petere Exp $`
	`40`	`+ * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.408 2007/06/01 15:33:19 tgl Exp $`
`41`	`41`	`*`
`42`	`42`	`*-------------------------------------------------------------------------`
`43`	`43`	`*/`
`@@ -53,6 +53,6 @@`
`53`	`53`	`*/`
`54`	`54`
`55`	`55`	`/* yyyymmddN */`
`56`		`-#define CATALOG_VERSION_NO 200705211`
	`56`	`+#define CATALOG_VERSION_NO 200706011`
`57`	`57`
`58`	`58`	`#endif`