Skip to content

Commit 1f559b7

Browse files
committed
Fix several hash functions that were taking chintzy shortcuts instead of
delivering a well-randomized hash value. I got religion on this after observing that performance of multi-batch hash join degrades terribly if the higher-order bits of hash values aren't random, as indeed was true for say hashes of small integer values. It's now expected and documented that hash functions should use hash_any or some comparable method to ensure that all bits of their output are about equally random. initdb forced because this change invalidates existing hash indexes. For the same reason, this isn't back-patchable; the hash join performance problem will get a band-aid fix in the back branches.
1 parent 397d00a commit 1f559b7

File tree

5 files changed

+67
-41
lines changed

5 files changed

+67
-41
lines changed

src/backend/access/hash/hashfunc.c

Lines changed: 42 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,26 @@
11
/*-------------------------------------------------------------------------
22
*
33
* hashfunc.c
4-
* Comparison functions for hash access method.
4+
* Support functions for hash access method.
55
*
66
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
77
* Portions Copyright (c) 1994, Regents of the University of California
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/access/hash/hashfunc.c,v 1.51 2007/04/02 03:49:37 tgl Exp $
11+
* $PostgreSQL: pgsql/src/backend/access/hash/hashfunc.c,v 1.52 2007/06/01 15:33:18 tgl Exp $
1212
*
1313
* NOTES
1414
* These functions are stored in pg_amproc. For each operator class
15-
* defined on hash tables, they compute the hash value of the argument.
15+
* defined for hash indexes, they compute the hash value of the argument.
1616
*
17+
* Additional hash functions appear in /utils/adt/ files for various
18+
* specialized datatypes.
19+
*
20+
* It is expected that every bit of a hash function's 32-bit result is
21+
* as random as every other; failure to ensure this is likely to lead
22+
* to poor performance of hash joins, for example. In most cases a hash
23+
* function should use hash_any() or its variant hash_uint32().
1724
*-------------------------------------------------------------------------
1825
*/
1926

@@ -26,19 +33,19 @@
2633
Datum
2734
hashchar(PG_FUNCTION_ARGS)
2835
{
29-
PG_RETURN_UINT32(~((uint32) PG_GETARG_CHAR(0)));
36+
return hash_uint32((int32) PG_GETARG_CHAR(0));
3037
}
3138

3239
Datum
3340
hashint2(PG_FUNCTION_ARGS)
3441
{
35-
PG_RETURN_UINT32(~((uint32) PG_GETARG_INT16(0)));
42+
return hash_uint32((int32) PG_GETARG_INT16(0));
3643
}
3744

3845
Datum
3946
hashint4(PG_FUNCTION_ARGS)
4047
{
41-
PG_RETURN_UINT32(~PG_GETARG_UINT32(0));
48+
return hash_uint32(PG_GETARG_INT32(0));
4249
}
4350

4451
Datum
@@ -59,23 +66,23 @@ hashint8(PG_FUNCTION_ARGS)
5966

6067
lohalf ^= (val >= 0) ? hihalf : ~hihalf;
6168

62-
PG_RETURN_UINT32(~lohalf);
69+
return hash_uint32(lohalf);
6370
#else
6471
/* here if we can't count on "x >> 32" to work sanely */
65-
PG_RETURN_UINT32(~((uint32) PG_GETARG_INT64(0)));
72+
return hash_uint32((int32) PG_GETARG_INT64(0));
6673
#endif
6774
}
6875

6976
Datum
7077
hashoid(PG_FUNCTION_ARGS)
7178
{
72-
PG_RETURN_UINT32(~((uint32) PG_GETARG_OID(0)));
79+
return hash_uint32((uint32) PG_GETARG_OID(0));
7380
}
7481

7582
Datum
7683
hashenum(PG_FUNCTION_ARGS)
7784
{
78-
PG_RETURN_UINT32(~((uint32) PG_GETARG_OID(0)));
85+
return hash_uint32((uint32) PG_GETARG_OID(0));
7986
}
8087

8188
Datum
@@ -283,6 +290,31 @@ hash_any(register const unsigned char *k, register int keylen)
283290
/* case 0: nothing left to add */
284291
}
285292
mix(a, b, c);
293+
294+
/* report the result */
295+
return UInt32GetDatum(c);
296+
}
297+
298+
/*
299+
* hash_uint32() -- hash a 32-bit value
300+
*
301+
* This has the same result (at least on little-endian machines) as
302+
* hash_any(&k, sizeof(uint32))
303+
* but is faster and doesn't force the caller to store k into memory.
304+
*/
305+
Datum
306+
hash_uint32(uint32 k)
307+
{
308+
register uint32 a,
309+
b,
310+
c;
311+
312+
a = 0x9e3779b9 + k;
313+
b = 0x9e3779b9;
314+
c = 3923095 + (uint32) sizeof(uint32);
315+
316+
mix(a, b, c);
317+
286318
/* report the result */
287319
return UInt32GetDatum(c);
288320
}

src/backend/nodes/bitmapset.c

Lines changed: 13 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,14 @@
1414
* Copyright (c) 2003-2007, PostgreSQL Global Development Group
1515
*
1616
* IDENTIFICATION
17-
* $PostgreSQL: pgsql/src/backend/nodes/bitmapset.c,v 1.12 2007/01/05 22:19:29 momjian Exp $
17+
* $PostgreSQL: pgsql/src/backend/nodes/bitmapset.c,v 1.13 2007/06/01 15:33:18 tgl Exp $
1818
*
1919
*-------------------------------------------------------------------------
2020
*/
2121
#include "postgres.h"
2222

2323
#include "nodes/bitmapset.h"
24+
#include "access/hash.h"
2425

2526

2627
#define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
@@ -769,36 +770,23 @@ bms_first_member(Bitmapset *a)
769770
*
770771
* Note: we must ensure that any two bitmapsets that are bms_equal() will
771772
* hash to the same value; in practice this means that trailing all-zero
772-
* words cannot affect the result. The circular-shift-and-XOR hash method
773-
* used here has this property, so long as we work from back to front.
774-
*
775-
* Note: you might wonder why we bother with the circular shift; at first
776-
* glance a straight longitudinal XOR seems as good and much simpler. The
777-
* reason is empirical: this gives a better distribution of hash values on
778-
* the bitmapsets actually generated by the planner. A common way to have
779-
* multiword bitmapsets is "a JOIN b JOIN c JOIN d ...", which gives rise
780-
* to rangetables in which base tables and JOIN nodes alternate; so
781-
* bitmapsets of base table RT indexes tend to use only odd-numbered or only
782-
* even-numbered bits. A straight longitudinal XOR would preserve this
783-
* property, leading to a much smaller set of possible outputs than if
784-
* we include a shift.
773+
* words must not affect the result. Hence we strip those before applying
774+
* hash_any().
785775
*/
786776
uint32
787777
bms_hash_value(const Bitmapset *a)
788778
{
789-
bitmapword result = 0;
790-
int wordnum;
779+
int lastword;
791780

792-
if (a == NULL || a->nwords <= 0)
781+
if (a == NULL)
793782
return 0; /* All empty sets hash to 0 */
794-
for (wordnum = a->nwords; --wordnum > 0;)
783+
for (lastword = a->nwords; --lastword >= 0;)
795784
{
796-
result ^= a->words[wordnum];
797-
if (result & ((bitmapword) 1 << (BITS_PER_BITMAPWORD - 1)))
798-
result = (result << 1) | 1;
799-
else
800-
result = (result << 1);
785+
if (a->words[lastword] != 0)
786+
break;
801787
}
802-
result ^= a->words[0];
803-
return (uint32) result;
788+
if (lastword < 0)
789+
return 0; /* All empty sets hash to 0 */
790+
return DatumGetUInt32(hash_any((const unsigned char *) a->words,
791+
(lastword + 1) * sizeof(bitmapword)));
804792
}

src/backend/utils/hash/hashfn.c

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,13 @@
99
*
1010
*
1111
* IDENTIFICATION
12-
* $PostgreSQL: pgsql/src/backend/utils/hash/hashfn.c,v 1.30 2007/01/05 22:19:43 momjian Exp $
12+
* $PostgreSQL: pgsql/src/backend/utils/hash/hashfn.c,v 1.31 2007/06/01 15:33:18 tgl Exp $
13+
*
14+
* NOTES
15+
* It is expected that every bit of a hash function's 32-bit result is
16+
* as random as every other; failure to ensure this is likely to lead
17+
* to poor performance of hash tables. In most cases a hash
18+
* function should use hash_any() or its variant hash_uint32().
1319
*
1420
*-------------------------------------------------------------------------
1521
*/
@@ -58,8 +64,7 @@ uint32
5864
oid_hash(const void *key, Size keysize)
5965
{
6066
Assert(keysize == sizeof(Oid));
61-
/* We don't actually bother to do anything to the OID value ... */
62-
return (uint32) *((const Oid *) key);
67+
return DatumGetUInt32(hash_uint32((uint32) *((const Oid *) key)));
6368
}
6469

6570
/*

src/include/access/hash.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/include/access/hash.h,v 1.81 2007/05/30 20:12:02 tgl Exp $
10+
* $PostgreSQL: pgsql/src/include/access/hash.h,v 1.82 2007/06/01 15:33:18 tgl Exp $
1111
*
1212
* NOTES
1313
* modeled after Margo Seltzer's hash implementation for unix.
@@ -265,6 +265,7 @@ extern Datum hashname(PG_FUNCTION_ARGS);
265265
extern Datum hashtext(PG_FUNCTION_ARGS);
266266
extern Datum hashvarlena(PG_FUNCTION_ARGS);
267267
extern Datum hash_any(register const unsigned char *k, register int keylen);
268+
extern Datum hash_uint32(uint32 k);
268269

269270
/* private routines */
270271

src/include/catalog/catversion.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
3838
* Portions Copyright (c) 1994, Regents of the University of California
3939
*
40-
* $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.407 2007/05/21 17:10:29 petere Exp $
40+
* $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.408 2007/06/01 15:33:19 tgl Exp $
4141
*
4242
*-------------------------------------------------------------------------
4343
*/
@@ -53,6 +53,6 @@
5353
*/
5454

5555
/* yyyymmddN */
56-
#define CATALOG_VERSION_NO 200705211
56+
#define CATALOG_VERSION_NO 200706011
5757

5858
#endif

0 commit comments

Comments
 (0)