From b9cde1067f95ec1114faf8f375a88c6565356f86 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 20 Apr 2021 20:12:23 -0700 Subject: [PATCH 1/6] bpo-43475: Fix worst case collision behavior for NaN instances --- Doc/library/sys.rst | 2 +- Include/pyhash.h | 3 +-- .../2021-04-20-20-10-46.bpo-43475.oV8Mbs.rst | 3 +++ Objects/complexobject.c | 4 ++-- Objects/floatobject.c | 2 +- Python/pyhash.c | 12 ++++++++---- Python/sysmodule.c | 2 +- 7 files changed, 17 insertions(+), 11 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2021-04-20-20-10-46.bpo-43475.oV8Mbs.rst diff --git a/Doc/library/sys.rst b/Doc/library/sys.rst index 721edd1495aa5b..2260b9ca72f0ba 100644 --- a/Doc/library/sys.rst +++ b/Doc/library/sys.rst @@ -855,7 +855,7 @@ always available. +---------------------+--------------------------------------------------+ | :const:`inf` | hash value returned for a positive infinity | +---------------------+--------------------------------------------------+ - | :const:`nan` | hash value returned for a nan | + | :const:`nan` | (this attribute is no longer used) | +---------------------+--------------------------------------------------+ | :const:`imag` | multiplier used for the imaginary part of a | | | complex number | diff --git a/Include/pyhash.h b/Include/pyhash.h index 4437b870332bde..728ef932a1accb 100644 --- a/Include/pyhash.h +++ b/Include/pyhash.h @@ -7,7 +7,7 @@ extern "C" { /* Helpers for hash functions */ #ifndef Py_LIMITED_API -PyAPI_FUNC(Py_hash_t) _Py_HashDouble(double); +PyAPI_FUNC(Py_hash_t) _Py_HashDouble(PyObject *, double); PyAPI_FUNC(Py_hash_t) _Py_HashPointer(const void*); // Similar to _Py_HashPointer(), but don't replace -1 with -2 PyAPI_FUNC(Py_hash_t) _Py_HashPointerRaw(const void*); @@ -29,7 +29,6 @@ PyAPI_FUNC(Py_hash_t) _Py_HashBytes(const void*, Py_ssize_t); #define _PyHASH_MODULUS (((size_t)1 << _PyHASH_BITS) - 1) #define _PyHASH_INF 314159 -#define _PyHASH_NAN 0 #define _PyHASH_IMAG _PyHASH_MULTIPLIER diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-04-20-20-10-46.bpo-43475.oV8Mbs.rst b/Misc/NEWS.d/next/Core and Builtins/2021-04-20-20-10-46.bpo-43475.oV8Mbs.rst new file mode 100644 index 00000000000000..73ed0222ba3b0c --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2021-04-20-20-10-46.bpo-43475.oV8Mbs.rst @@ -0,0 +1,3 @@ +Hashes of NaN values now depend on object identity. Formerly, they always +hashed to 0 even though NaN values are not equal to one another. Having the +same hash for unequal values caused pile-ups in hash tables. diff --git a/Objects/complexobject.c b/Objects/complexobject.c index a65ebdfa6cdf93..8240478da8dafd 100644 --- a/Objects/complexobject.c +++ b/Objects/complexobject.c @@ -412,10 +412,10 @@ static Py_hash_t complex_hash(PyComplexObject *v) { Py_uhash_t hashreal, hashimag, combined; - hashreal = (Py_uhash_t)_Py_HashDouble(v->cval.real); + hashreal = (Py_uhash_t)_Py_HashDouble(v, v->cval.real); if (hashreal == (Py_uhash_t)-1) return -1; - hashimag = (Py_uhash_t)_Py_HashDouble(v->cval.imag); + hashimag = (Py_uhash_t)_Py_HashDouble(v, v->cval.imag); if (hashimag == (Py_uhash_t)-1) return -1; /* Note: if the imaginary part is 0, hashimag is 0 now, diff --git a/Objects/floatobject.c b/Objects/floatobject.c index b3c41b1ca051d1..50e4431ede2c61 100644 --- a/Objects/floatobject.c +++ b/Objects/floatobject.c @@ -556,7 +556,7 @@ float_richcompare(PyObject *v, PyObject *w, int op) static Py_hash_t float_hash(PyFloatObject *v) { - return _Py_HashDouble(v->ob_fval); + return _Py_HashDouble(v, v->ob_fval); } static PyObject * diff --git a/Python/pyhash.c b/Python/pyhash.c index 3b6c34eefd515a..61f13e312494ed 100644 --- a/Python/pyhash.c +++ b/Python/pyhash.c @@ -56,8 +56,12 @@ static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0}; If the result of the reduction is infinity (this is impossible for integers, floats and Decimals) then use the predefined hash value _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead. - _PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the - hashes of float and Decimal infinities and nans. + _PyHASH_INF and -_PyHASH_INF are also used for the + hashes of float and Decimal infinities. + + NaNs hash to their object id. Having distinct hash values prevents + catastrophic piles-up for unique NaNs which used to always have + the same hash value but would compare unequal. A selling point for the above strategy is that it makes it possible to compute hashes of decimal and binary floating-point numbers @@ -83,7 +87,7 @@ static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0}; */ Py_hash_t -_Py_HashDouble(double v) +_Py_HashDouble(PyObject *inst, double v) { int e, sign; double m; @@ -93,7 +97,7 @@ _Py_HashDouble(double v) if (Py_IS_INFINITY(v)) return v > 0 ? _PyHASH_INF : -_PyHASH_INF; else - return _PyHASH_NAN; + return (Py_hash_t) inst; } m = frexp(v, &e); diff --git a/Python/sysmodule.c b/Python/sysmodule.c index a36d90f9de1682..0674c5d0935936 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -1405,7 +1405,7 @@ get_hash_info(PyThreadState *tstate) PyStructSequence_SET_ITEM(hash_info, field++, PyLong_FromLong(_PyHASH_INF)); PyStructSequence_SET_ITEM(hash_info, field++, - PyLong_FromLong(_PyHASH_NAN)); + PyLong_FromLong(0)); // This is now longer used PyStructSequence_SET_ITEM(hash_info, field++, PyLong_FromLong(_PyHASH_IMAG)); PyStructSequence_SET_ITEM(hash_info, field++, From 20b73c374d7cac8311edb130d662b61a7a3b6512 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 20 Apr 2021 20:23:37 -0700 Subject: [PATCH 2/6] Fix decimal and docs as well --- Doc/library/stdtypes.rst | 9 ++++----- Lib/_pydecimal.py | 2 +- Modules/_decimal/_decimal.c | 3 +-- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst index 68b60508b73d2f..257d4b5bc0b7b5 100644 --- a/Doc/library/stdtypes.rst +++ b/Doc/library/stdtypes.rst @@ -692,10 +692,9 @@ Here are the rules in detail: as ``-hash(-x)``. If the resulting hash is ``-1``, replace it with ``-2``. -- The particular values ``sys.hash_info.inf``, ``-sys.hash_info.inf`` - and ``sys.hash_info.nan`` are used as hash values for positive - infinity, negative infinity, or nans (respectively). (All hashable - nans have the same hash value.) +- The particular values ``sys.hash_info.inf`` and ``-sys.hash_info.inf`` + are used as hash values for positive + infinity or negative infinity (respectively). - For a :class:`complex` number ``z``, the hash values of the real and imaginary parts are combined by computing ``hash(z.real) + @@ -740,7 +739,7 @@ number, :class:`float`, or :class:`complex`:: """Compute the hash of a float x.""" if math.isnan(x): - return sys.hash_info.nan + return id(x) elif math.isinf(x): return sys.hash_info.inf if x > 0 else -sys.hash_info.inf else: diff --git a/Lib/_pydecimal.py b/Lib/_pydecimal.py index ab989e5206a9e9..78c02c968dab9b 100644 --- a/Lib/_pydecimal.py +++ b/Lib/_pydecimal.py @@ -951,7 +951,7 @@ def __hash__(self): if self.is_snan(): raise TypeError('Cannot hash a signaling NaN value.') elif self.is_nan(): - return _PyHASH_NAN + return id(self) else: if self._sign: return -_PyHASH_INF diff --git a/Modules/_decimal/_decimal.c b/Modules/_decimal/_decimal.c index 9a4329f494f31b..5c8f7802d65ae4 100644 --- a/Modules/_decimal/_decimal.c +++ b/Modules/_decimal/_decimal.c @@ -4536,7 +4536,6 @@ _dec_hash(PyDecObject *v) #error "No valid combination of CONFIG_64, CONFIG_32 and _PyHASH_BITS" #endif const Py_hash_t py_hash_inf = 314159; - const Py_hash_t py_hash_nan = 0; mpd_uint_t ten_data[1] = {10}; mpd_t ten = {MPD_POS|MPD_STATIC|MPD_CONST_DATA, 0, 2, 1, 1, ten_data}; @@ -4555,7 +4554,7 @@ _dec_hash(PyDecObject *v) return -1; } else if (mpd_isnan(MPD(v))) { - return py_hash_nan; + return (Py_hash_t) v; } else { return py_hash_inf * mpd_arith_sign(MPD(v)); From 056a4f72859d46f6588d83210f7f9d50f93c56a3 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 20 Apr 2021 20:46:48 -0700 Subject: [PATCH 3/6] Add casts --- Objects/complexobject.c | 4 ++-- Objects/floatobject.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Objects/complexobject.c b/Objects/complexobject.c index 8240478da8dafd..91e06a8c2c2ab7 100644 --- a/Objects/complexobject.c +++ b/Objects/complexobject.c @@ -412,10 +412,10 @@ static Py_hash_t complex_hash(PyComplexObject *v) { Py_uhash_t hashreal, hashimag, combined; - hashreal = (Py_uhash_t)_Py_HashDouble(v, v->cval.real); + hashreal = (Py_uhash_t)_Py_HashDouble((PyObject *) v, v->cval.real); if (hashreal == (Py_uhash_t)-1) return -1; - hashimag = (Py_uhash_t)_Py_HashDouble(v, v->cval.imag); + hashimag = (Py_uhash_t)_Py_HashDouble((PyObject *)v, v->cval.imag); if (hashimag == (Py_uhash_t)-1) return -1; /* Note: if the imaginary part is 0, hashimag is 0 now, diff --git a/Objects/floatobject.c b/Objects/floatobject.c index 50e4431ede2c61..7e78132c01ca27 100644 --- a/Objects/floatobject.c +++ b/Objects/floatobject.c @@ -556,7 +556,7 @@ float_richcompare(PyObject *v, PyObject *w, int op) static Py_hash_t float_hash(PyFloatObject *v) { - return _Py_HashDouble(v, v->ob_fval); + return _Py_HashDouble((PyObject *)v, v->ob_fval); } static PyObject * From dc73c3c38504b972c46b5af238196e514aff9b95 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 21 Apr 2021 07:50:41 -0700 Subject: [PATCH 4/6] Use existing pointer hash logic --- Doc/library/stdtypes.rst | 2 +- Lib/_pydecimal.py | 2 +- Modules/_decimal/_decimal.c | 4 +--- Python/pyhash.c | 4 +++- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst index 257d4b5bc0b7b5..b83d0d87f587fc 100644 --- a/Doc/library/stdtypes.rst +++ b/Doc/library/stdtypes.rst @@ -739,7 +739,7 @@ number, :class:`float`, or :class:`complex`:: """Compute the hash of a float x.""" if math.isnan(x): - return id(x) + return super().__hash__() elif math.isinf(x): return sys.hash_info.inf if x > 0 else -sys.hash_info.inf else: diff --git a/Lib/_pydecimal.py b/Lib/_pydecimal.py index 78c02c968dab9b..ff23322ed5603e 100644 --- a/Lib/_pydecimal.py +++ b/Lib/_pydecimal.py @@ -951,7 +951,7 @@ def __hash__(self): if self.is_snan(): raise TypeError('Cannot hash a signaling NaN value.') elif self.is_nan(): - return id(self) + return super().__hash__() else: if self._sign: return -_PyHASH_INF diff --git a/Modules/_decimal/_decimal.c b/Modules/_decimal/_decimal.c index 5c8f7802d65ae4..9b89fa40c926b1 100644 --- a/Modules/_decimal/_decimal.c +++ b/Modules/_decimal/_decimal.c @@ -4554,7 +4554,7 @@ _dec_hash(PyDecObject *v) return -1; } else if (mpd_isnan(MPD(v))) { - return (Py_hash_t) v; + return _Py_HashPointer(v); } else { return py_hash_inf * mpd_arith_sign(MPD(v)); @@ -5938,5 +5938,3 @@ PyInit__decimal(void) return NULL; /* GCOV_NOT_REACHED */ } - - diff --git a/Python/pyhash.c b/Python/pyhash.c index 61f13e312494ed..69971f855df286 100644 --- a/Python/pyhash.c +++ b/Python/pyhash.c @@ -86,6 +86,8 @@ static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0}; */ +Py_hash_t _Py_HashPointer(const void *); + Py_hash_t _Py_HashDouble(PyObject *inst, double v) { @@ -97,7 +99,7 @@ _Py_HashDouble(PyObject *inst, double v) if (Py_IS_INFINITY(v)) return v > 0 ? _PyHASH_INF : -_PyHASH_INF; else - return (Py_hash_t) inst; + return _Py_HashPointer(inst); } m = frexp(v, &e); From 9f3f9e9373b222a330b91a9156116891f4d0bede Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 21 Apr 2021 07:57:21 -0700 Subject: [PATCH 5/6] Improve comment wording --- Python/pyhash.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Python/pyhash.c b/Python/pyhash.c index 69971f855df286..f0c82356f1e26c 100644 --- a/Python/pyhash.c +++ b/Python/pyhash.c @@ -59,9 +59,9 @@ static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0}; _PyHASH_INF and -_PyHASH_INF are also used for the hashes of float and Decimal infinities. - NaNs hash to their object id. Having distinct hash values prevents - catastrophic piles-up for unique NaNs which used to always have - the same hash value but would compare unequal. + NaNs hash with a pointer hash. Having distinct hash values prevents + catastrophic pileups from distinct NaN instances which used to always + have the same hash value but would compare unequal. A selling point for the above strategy is that it makes it possible to compute hashes of decimal and binary floating-point numbers From 6ac6975be0327d63ff2a2075a8f1a2d48d4e7200 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 21 Apr 2021 15:39:48 -0700 Subject: [PATCH 6/6] Fix typo in comment --- Python/sysmodule.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/sysmodule.c b/Python/sysmodule.c index 0674c5d0935936..911c2d967b010a 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -1405,7 +1405,7 @@ get_hash_info(PyThreadState *tstate) PyStructSequence_SET_ITEM(hash_info, field++, PyLong_FromLong(_PyHASH_INF)); PyStructSequence_SET_ITEM(hash_info, field++, - PyLong_FromLong(0)); // This is now longer used + PyLong_FromLong(0)); // This is no longer used PyStructSequence_SET_ITEM(hash_info, field++, PyLong_FromLong(_PyHASH_IMAG)); PyStructSequence_SET_ITEM(hash_info, field++,