bpo-43475: Fix worst case collision behavior for NaN instances (pythonGH-25493)

rhettinger · web-flow · commit a07da09ad5bd · 2021-04-22T08:34:57.000-07:00
diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst
@@ -692,10 +692,9 @@ Here are the rules in detail:
   as ``-hash(-x)``.  If the resulting hash is ``-1``, replace it with
   ``-2``.
 
-- The particular values ``sys.hash_info.inf``, ``-sys.hash_info.inf``
-  and ``sys.hash_info.nan`` are used as hash values for positive
-  infinity, negative infinity, or nans (respectively).  (All hashable
-  nans have the same hash value.)
+- The particular values ``sys.hash_info.inf`` and ``-sys.hash_info.inf``
+  are used as hash values for positive
+  infinity or negative infinity (respectively).
 
 - For a :class:`complex` number ``z``, the hash values of the real
   and imaginary parts are combined by computing ``hash(z.real) +
@@ -740,7 +739,7 @@ number, :class:`float`, or :class:`complex`::
        """Compute the hash of a float x."""
 
        if math.isnan(x):
-           return sys.hash_info.nan
+           return super().__hash__()
        elif math.isinf(x):
            return sys.hash_info.inf if x > 0 else -sys.hash_info.inf
        else:
diff --git a/Doc/library/sys.rst b/Doc/library/sys.rst
@@ -855,7 +855,7 @@ always available.
    +---------------------+--------------------------------------------------+
    | :const:`inf`        | hash value returned for a positive infinity      |
    +---------------------+--------------------------------------------------+
-   | :const:`nan`        | hash value returned for a nan                    |
+   | :const:`nan`        | (this attribute is no longer used)               |
    +---------------------+--------------------------------------------------+
    | :const:`imag`       | multiplier used for the imaginary part of a      |
    |                     | complex number                                   |
diff --git a/Include/pyhash.h b/Include/pyhash.h
@@ -7,7 +7,7 @@ extern "C" {
 
 /* Helpers for hash functions */
 #ifndef Py_LIMITED_API
-PyAPI_FUNC(Py_hash_t) _Py_HashDouble(double);
+PyAPI_FUNC(Py_hash_t) _Py_HashDouble(PyObject *, double);
 PyAPI_FUNC(Py_hash_t) _Py_HashPointer(const void*);
 // Similar to _Py_HashPointer(), but don't replace -1 with -2
 PyAPI_FUNC(Py_hash_t) _Py_HashPointerRaw(const void*);
@@ -29,7 +29,6 @@ PyAPI_FUNC(Py_hash_t) _Py_HashBytes(const void*, Py_ssize_t);
 
 #define _PyHASH_MODULUS (((size_t)1 << _PyHASH_BITS) - 1)
 #define _PyHASH_INF 314159
-#define _PyHASH_NAN 0
 #define _PyHASH_IMAG _PyHASH_MULTIPLIER
 
 
diff --git a/Lib/_pydecimal.py b/Lib/_pydecimal.py
@@ -951,7 +951,7 @@ def __hash__(self):
             if self.is_snan():
                 raise TypeError('Cannot hash a signaling NaN value.')
             elif self.is_nan():
-                return _PyHASH_NAN
+                return super().__hash__()
             else:
                 if self._sign:
                     return -_PyHASH_INF
diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-04-20-20-10-46.bpo-43475.oV8Mbs.rst b/Misc/NEWS.d/next/Core and Builtins/2021-04-20-20-10-46.bpo-43475.oV8Mbs.rst
@@ -0,0 +1,3 @@
+Hashes of NaN values now depend on object identity.  Formerly, they always
+hashed to 0 even though NaN values are not equal to one another.  Having the
+same hash for unequal values caused pile-ups in hash tables.
diff --git a/Modules/_decimal/_decimal.c b/Modules/_decimal/_decimal.c
@@ -4536,7 +4536,6 @@ _dec_hash(PyDecObject *v)
     #error "No valid combination of CONFIG_64, CONFIG_32 and _PyHASH_BITS"
 #endif
     const Py_hash_t py_hash_inf = 314159;
-    const Py_hash_t py_hash_nan = 0;
     mpd_uint_t ten_data[1] = {10};
     mpd_t ten = {MPD_POS|MPD_STATIC|MPD_CONST_DATA,
                  0, 2, 1, 1, ten_data};
@@ -4555,7 +4554,7 @@ _dec_hash(PyDecObject *v)
             return -1;
         }
         else if (mpd_isnan(MPD(v))) {
-            return py_hash_nan;
+            return _Py_HashPointer(v);
         }
         else {
             return py_hash_inf * mpd_arith_sign(MPD(v));
@@ -5939,5 +5938,3 @@ PyInit__decimal(void)
 
     return NULL; /* GCOV_NOT_REACHED */
 }
-
-
diff --git a/Objects/complexobject.c b/Objects/complexobject.c
@@ -412,10 +412,10 @@ static Py_hash_t
 complex_hash(PyComplexObject *v)
 {
     Py_uhash_t hashreal, hashimag, combined;
-    hashreal = (Py_uhash_t)_Py_HashDouble(v->cval.real);
+    hashreal = (Py_uhash_t)_Py_HashDouble((PyObject *) v, v->cval.real);
     if (hashreal == (Py_uhash_t)-1)
         return -1;
-    hashimag = (Py_uhash_t)_Py_HashDouble(v->cval.imag);
+    hashimag = (Py_uhash_t)_Py_HashDouble((PyObject *)v, v->cval.imag);
     if (hashimag == (Py_uhash_t)-1)
         return -1;
     /* Note:  if the imaginary part is 0, hashimag is 0 now,
diff --git a/Objects/floatobject.c b/Objects/floatobject.c
@@ -556,7 +556,7 @@ float_richcompare(PyObject *v, PyObject *w, int op)
 static Py_hash_t
 float_hash(PyFloatObject *v)
 {
-    return _Py_HashDouble(v->ob_fval);
+    return _Py_HashDouble((PyObject *)v, v->ob_fval);
 }
 
 static PyObject *
diff --git a/Python/pyhash.c b/Python/pyhash.c
@@ -56,8 +56,12 @@ static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
    If the result of the reduction is infinity (this is impossible for
    integers, floats and Decimals) then use the predefined hash value
    _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
-   _PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the
-   hashes of float and Decimal infinities and nans.
+   _PyHASH_INF and -_PyHASH_INF are also used for the
+   hashes of float and Decimal infinities.
+
+   NaNs hash with a pointer hash.  Having distinct hash values prevents
+   catastrophic pileups from distinct NaN instances which used to always
+   have the same hash value but would compare unequal.
 
    A selling point for the above strategy is that it makes it possible
    to compute hashes of decimal and binary floating-point numbers
@@ -82,8 +86,10 @@ static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
 
    */
 
+Py_hash_t _Py_HashPointer(const void *);
+
 Py_hash_t
-_Py_HashDouble(double v)
+_Py_HashDouble(PyObject *inst, double v)
 {
     int e, sign;
     double m;
@@ -93,7 +99,7 @@ _Py_HashDouble(double v)
         if (Py_IS_INFINITY(v))
             return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
         else
-            return _PyHASH_NAN;
+            return _Py_HashPointer(inst);
     }
 
     m = frexp(v, &e);
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
@@ -1405,7 +1405,7 @@ get_hash_info(PyThreadState *tstate)
     PyStructSequence_SET_ITEM(hash_info, field++,
                               PyLong_FromLong(_PyHASH_INF));
     PyStructSequence_SET_ITEM(hash_info, field++,
-                              PyLong_FromLong(_PyHASH_NAN));
+                              PyLong_FromLong(0));  // This is no longer used
     PyStructSequence_SET_ITEM(hash_info, field++,
                               PyLong_FromLong(_PyHASH_IMAG));
     PyStructSequence_SET_ITEM(hash_info, field++,

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+Hashes of NaN values now depend on object identity. Formerly, they always`
	`2`	`+hashed to 0 even though NaN values are not equal to one another. Having the`
	`3`	`+same hash for unequal values caused pile-ups in hash tables.`
Original file line number	Diff line number	Diff line change
`@@ -556,7 +556,7 @@ float_richcompare(PyObject v, PyObject w, int op)`
`556`	`556`	`static Py_hash_t`
`557`	`557`	`float_hash(PyFloatObject *v)`
`558`	`558`	`{`
`559`		`- return _Py_HashDouble(v->ob_fval);`
	`559`	`+ return _Py_HashDouble((PyObject *)v, v->ob_fval);`
`560`	`560`	`}`
`561`	`561`
`562`	`562`	`static PyObject *`