From b9cde1067f95ec1114faf8f375a88c6565356f86 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Tue, 20 Apr 2021 20:12:23 -0700
Subject: [PATCH 1/6] bpo-43475:  Fix worst case collision behavior for NaN
 instances

---
 Doc/library/sys.rst                                  |  2 +-
 Include/pyhash.h                                     |  3 +--
 .../2021-04-20-20-10-46.bpo-43475.oV8Mbs.rst         |  3 +++
 Objects/complexobject.c                              |  4 ++--
 Objects/floatobject.c                                |  2 +-
 Python/pyhash.c                                      | 12 ++++++++----
 Python/sysmodule.c                                   |  2 +-
 7 files changed, 17 insertions(+), 11 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Core and Builtins/2021-04-20-20-10-46.bpo-43475.oV8Mbs.rst

diff --git a/Doc/library/sys.rst b/Doc/library/sys.rst
index 721edd1495aa5b..2260b9ca72f0ba 100644
--- a/Doc/library/sys.rst
+++ b/Doc/library/sys.rst
@@ -855,7 +855,7 @@ always available.
    +---------------------+--------------------------------------------------+
    | :const:`inf`        | hash value returned for a positive infinity      |
    +---------------------+--------------------------------------------------+
-   | :const:`nan`        | hash value returned for a nan                    |
+   | :const:`nan`        | (this attribute is no longer used)               |
    +---------------------+--------------------------------------------------+
    | :const:`imag`       | multiplier used for the imaginary part of a      |
    |                     | complex number                                   |
diff --git a/Include/pyhash.h b/Include/pyhash.h
index 4437b870332bde..728ef932a1accb 100644
--- a/Include/pyhash.h
+++ b/Include/pyhash.h
@@ -7,7 +7,7 @@ extern "C" {
 
 /* Helpers for hash functions */
 #ifndef Py_LIMITED_API
-PyAPI_FUNC(Py_hash_t) _Py_HashDouble(double);
+PyAPI_FUNC(Py_hash_t) _Py_HashDouble(PyObject *, double);
 PyAPI_FUNC(Py_hash_t) _Py_HashPointer(const void*);
 // Similar to _Py_HashPointer(), but don't replace -1 with -2
 PyAPI_FUNC(Py_hash_t) _Py_HashPointerRaw(const void*);
@@ -29,7 +29,6 @@ PyAPI_FUNC(Py_hash_t) _Py_HashBytes(const void*, Py_ssize_t);
 
 #define _PyHASH_MODULUS (((size_t)1 << _PyHASH_BITS) - 1)
 #define _PyHASH_INF 314159
-#define _PyHASH_NAN 0
 #define _PyHASH_IMAG _PyHASH_MULTIPLIER
 
 
diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-04-20-20-10-46.bpo-43475.oV8Mbs.rst b/Misc/NEWS.d/next/Core and Builtins/2021-04-20-20-10-46.bpo-43475.oV8Mbs.rst
new file mode 100644
index 00000000000000..73ed0222ba3b0c
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2021-04-20-20-10-46.bpo-43475.oV8Mbs.rst	
@@ -0,0 +1,3 @@
+Hashes of NaN values now depend on object identity.  Formerly, they always
+hashed to 0 even though NaN values are not equal to one another.  Having the
+same hash for unequal values caused pile-ups in hash tables.
diff --git a/Objects/complexobject.c b/Objects/complexobject.c
index a65ebdfa6cdf93..8240478da8dafd 100644
--- a/Objects/complexobject.c
+++ b/Objects/complexobject.c
@@ -412,10 +412,10 @@ static Py_hash_t
 complex_hash(PyComplexObject *v)
 {
     Py_uhash_t hashreal, hashimag, combined;
-    hashreal = (Py_uhash_t)_Py_HashDouble(v->cval.real);
+    hashreal = (Py_uhash_t)_Py_HashDouble(v, v->cval.real);
     if (hashreal == (Py_uhash_t)-1)
         return -1;
-    hashimag = (Py_uhash_t)_Py_HashDouble(v->cval.imag);
+    hashimag = (Py_uhash_t)_Py_HashDouble(v, v->cval.imag);
     if (hashimag == (Py_uhash_t)-1)
         return -1;
     /* Note:  if the imaginary part is 0, hashimag is 0 now,
diff --git a/Objects/floatobject.c b/Objects/floatobject.c
index b3c41b1ca051d1..50e4431ede2c61 100644
--- a/Objects/floatobject.c
+++ b/Objects/floatobject.c
@@ -556,7 +556,7 @@ float_richcompare(PyObject *v, PyObject *w, int op)
 static Py_hash_t
 float_hash(PyFloatObject *v)
 {
-    return _Py_HashDouble(v->ob_fval);
+    return _Py_HashDouble(v, v->ob_fval);
 }
 
 static PyObject *
diff --git a/Python/pyhash.c b/Python/pyhash.c
index 3b6c34eefd515a..61f13e312494ed 100644
--- a/Python/pyhash.c
+++ b/Python/pyhash.c
@@ -56,8 +56,12 @@ static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
    If the result of the reduction is infinity (this is impossible for
    integers, floats and Decimals) then use the predefined hash value
    _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
-   _PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the
-   hashes of float and Decimal infinities and nans.
+   _PyHASH_INF and -_PyHASH_INF are also used for the
+   hashes of float and Decimal infinities.
+
+   NaNs hash to their object id.  Having distinct hash values prevents
+   catastrophic piles-up for unique NaNs which used to always have
+   the same hash value but would compare unequal.
 
    A selling point for the above strategy is that it makes it possible
    to compute hashes of decimal and binary floating-point numbers
@@ -83,7 +87,7 @@ static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
    */
 
 Py_hash_t
-_Py_HashDouble(double v)
+_Py_HashDouble(PyObject *inst, double v)
 {
     int e, sign;
     double m;
@@ -93,7 +97,7 @@ _Py_HashDouble(double v)
         if (Py_IS_INFINITY(v))
             return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
         else
-            return _PyHASH_NAN;
+            return (Py_hash_t) inst;
     }
 
     m = frexp(v, &e);
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index a36d90f9de1682..0674c5d0935936 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -1405,7 +1405,7 @@ get_hash_info(PyThreadState *tstate)
     PyStructSequence_SET_ITEM(hash_info, field++,
                               PyLong_FromLong(_PyHASH_INF));
     PyStructSequence_SET_ITEM(hash_info, field++,
-                              PyLong_FromLong(_PyHASH_NAN));
+                              PyLong_FromLong(0));  // This is now longer used
     PyStructSequence_SET_ITEM(hash_info, field++,
                               PyLong_FromLong(_PyHASH_IMAG));
     PyStructSequence_SET_ITEM(hash_info, field++,

From 20b73c374d7cac8311edb130d662b61a7a3b6512 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Tue, 20 Apr 2021 20:23:37 -0700
Subject: [PATCH 2/6] Fix decimal and docs as well

---
 Doc/library/stdtypes.rst    | 9 ++++-----
 Lib/_pydecimal.py           | 2 +-
 Modules/_decimal/_decimal.c | 3 +--
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst
index 68b60508b73d2f..257d4b5bc0b7b5 100644
--- a/Doc/library/stdtypes.rst
+++ b/Doc/library/stdtypes.rst
@@ -692,10 +692,9 @@ Here are the rules in detail:
   as ``-hash(-x)``.  If the resulting hash is ``-1``, replace it with
   ``-2``.
 
-- The particular values ``sys.hash_info.inf``, ``-sys.hash_info.inf``
-  and ``sys.hash_info.nan`` are used as hash values for positive
-  infinity, negative infinity, or nans (respectively).  (All hashable
-  nans have the same hash value.)
+- The particular values ``sys.hash_info.inf`` and ``-sys.hash_info.inf``
+  are used as hash values for positive
+  infinity or negative infinity (respectively).
 
 - For a :class:`complex` number ``z``, the hash values of the real
   and imaginary parts are combined by computing ``hash(z.real) +
@@ -740,7 +739,7 @@ number, :class:`float`, or :class:`complex`::
        """Compute the hash of a float x."""
 
        if math.isnan(x):
-           return sys.hash_info.nan
+           return id(x)
        elif math.isinf(x):
            return sys.hash_info.inf if x > 0 else -sys.hash_info.inf
        else:
diff --git a/Lib/_pydecimal.py b/Lib/_pydecimal.py
index ab989e5206a9e9..78c02c968dab9b 100644
--- a/Lib/_pydecimal.py
+++ b/Lib/_pydecimal.py
@@ -951,7 +951,7 @@ def __hash__(self):
             if self.is_snan():
                 raise TypeError('Cannot hash a signaling NaN value.')
             elif self.is_nan():
-                return _PyHASH_NAN
+                return id(self)
             else:
                 if self._sign:
                     return -_PyHASH_INF
diff --git a/Modules/_decimal/_decimal.c b/Modules/_decimal/_decimal.c
index 9a4329f494f31b..5c8f7802d65ae4 100644
--- a/Modules/_decimal/_decimal.c
+++ b/Modules/_decimal/_decimal.c
@@ -4536,7 +4536,6 @@ _dec_hash(PyDecObject *v)
     #error "No valid combination of CONFIG_64, CONFIG_32 and _PyHASH_BITS"
 #endif
     const Py_hash_t py_hash_inf = 314159;
-    const Py_hash_t py_hash_nan = 0;
     mpd_uint_t ten_data[1] = {10};
     mpd_t ten = {MPD_POS|MPD_STATIC|MPD_CONST_DATA,
                  0, 2, 1, 1, ten_data};
@@ -4555,7 +4554,7 @@ _dec_hash(PyDecObject *v)
             return -1;
         }
         else if (mpd_isnan(MPD(v))) {
-            return py_hash_nan;
+            return (Py_hash_t) v;
         }
         else {
             return py_hash_inf * mpd_arith_sign(MPD(v));

From 056a4f72859d46f6588d83210f7f9d50f93c56a3 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Tue, 20 Apr 2021 20:46:48 -0700
Subject: [PATCH 3/6] Add casts

---
 Objects/complexobject.c | 4 ++--
 Objects/floatobject.c   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Objects/complexobject.c b/Objects/complexobject.c
index 8240478da8dafd..91e06a8c2c2ab7 100644
--- a/Objects/complexobject.c
+++ b/Objects/complexobject.c
@@ -412,10 +412,10 @@ static Py_hash_t
 complex_hash(PyComplexObject *v)
 {
     Py_uhash_t hashreal, hashimag, combined;
-    hashreal = (Py_uhash_t)_Py_HashDouble(v, v->cval.real);
+    hashreal = (Py_uhash_t)_Py_HashDouble((PyObject *) v, v->cval.real);
     if (hashreal == (Py_uhash_t)-1)
         return -1;
-    hashimag = (Py_uhash_t)_Py_HashDouble(v, v->cval.imag);
+    hashimag = (Py_uhash_t)_Py_HashDouble((PyObject *)v, v->cval.imag);
     if (hashimag == (Py_uhash_t)-1)
         return -1;
     /* Note:  if the imaginary part is 0, hashimag is 0 now,
diff --git a/Objects/floatobject.c b/Objects/floatobject.c
index 50e4431ede2c61..7e78132c01ca27 100644
--- a/Objects/floatobject.c
+++ b/Objects/floatobject.c
@@ -556,7 +556,7 @@ float_richcompare(PyObject *v, PyObject *w, int op)
 static Py_hash_t
 float_hash(PyFloatObject *v)
 {
-    return _Py_HashDouble(v, v->ob_fval);
+    return _Py_HashDouble((PyObject *)v, v->ob_fval);
 }
 
 static PyObject *

From dc73c3c38504b972c46b5af238196e514aff9b95 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Wed, 21 Apr 2021 07:50:41 -0700
Subject: [PATCH 4/6] Use existing pointer hash logic

---
 Doc/library/stdtypes.rst    | 2 +-
 Lib/_pydecimal.py           | 2 +-
 Modules/_decimal/_decimal.c | 4 +---
 Python/pyhash.c             | 4 +++-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst
index 257d4b5bc0b7b5..b83d0d87f587fc 100644
--- a/Doc/library/stdtypes.rst
+++ b/Doc/library/stdtypes.rst
@@ -739,7 +739,7 @@ number, :class:`float`, or :class:`complex`::
        """Compute the hash of a float x."""
 
        if math.isnan(x):
-           return id(x)
+           return super().__hash__()
        elif math.isinf(x):
            return sys.hash_info.inf if x > 0 else -sys.hash_info.inf
        else:
diff --git a/Lib/_pydecimal.py b/Lib/_pydecimal.py
index 78c02c968dab9b..ff23322ed5603e 100644
--- a/Lib/_pydecimal.py
+++ b/Lib/_pydecimal.py
@@ -951,7 +951,7 @@ def __hash__(self):
             if self.is_snan():
                 raise TypeError('Cannot hash a signaling NaN value.')
             elif self.is_nan():
-                return id(self)
+                return super().__hash__()
             else:
                 if self._sign:
                     return -_PyHASH_INF
diff --git a/Modules/_decimal/_decimal.c b/Modules/_decimal/_decimal.c
index 5c8f7802d65ae4..9b89fa40c926b1 100644
--- a/Modules/_decimal/_decimal.c
+++ b/Modules/_decimal/_decimal.c
@@ -4554,7 +4554,7 @@ _dec_hash(PyDecObject *v)
             return -1;
         }
         else if (mpd_isnan(MPD(v))) {
-            return (Py_hash_t) v;
+            return _Py_HashPointer(v);
         }
         else {
             return py_hash_inf * mpd_arith_sign(MPD(v));
@@ -5938,5 +5938,3 @@ PyInit__decimal(void)
 
     return NULL; /* GCOV_NOT_REACHED */
 }
-
-
diff --git a/Python/pyhash.c b/Python/pyhash.c
index 61f13e312494ed..69971f855df286 100644
--- a/Python/pyhash.c
+++ b/Python/pyhash.c
@@ -86,6 +86,8 @@ static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
 
    */
 
+Py_hash_t _Py_HashPointer(const void *);
+
 Py_hash_t
 _Py_HashDouble(PyObject *inst, double v)
 {
@@ -97,7 +99,7 @@ _Py_HashDouble(PyObject *inst, double v)
         if (Py_IS_INFINITY(v))
             return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
         else
-            return (Py_hash_t) inst;
+            return _Py_HashPointer(inst);
     }
 
     m = frexp(v, &e);

From 9f3f9e9373b222a330b91a9156116891f4d0bede Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Wed, 21 Apr 2021 07:57:21 -0700
Subject: [PATCH 5/6] Improve comment wording

---
 Python/pyhash.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Python/pyhash.c b/Python/pyhash.c
index 69971f855df286..f0c82356f1e26c 100644
--- a/Python/pyhash.c
+++ b/Python/pyhash.c
@@ -59,9 +59,9 @@ static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
    _PyHASH_INF and -_PyHASH_INF are also used for the
    hashes of float and Decimal infinities.
 
-   NaNs hash to their object id.  Having distinct hash values prevents
-   catastrophic piles-up for unique NaNs which used to always have
-   the same hash value but would compare unequal.
+   NaNs hash with a pointer hash.  Having distinct hash values prevents
+   catastrophic pileups from distinct NaN instances which used to always
+   have the same hash value but would compare unequal.
 
    A selling point for the above strategy is that it makes it possible
    to compute hashes of decimal and binary floating-point numbers

From 6ac6975be0327d63ff2a2075a8f1a2d48d4e7200 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Wed, 21 Apr 2021 15:39:48 -0700
Subject: [PATCH 6/6] Fix typo in comment

---
 Python/sysmodule.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index 0674c5d0935936..911c2d967b010a 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -1405,7 +1405,7 @@ get_hash_info(PyThreadState *tstate)
     PyStructSequence_SET_ITEM(hash_info, field++,
                               PyLong_FromLong(_PyHASH_INF));
     PyStructSequence_SET_ITEM(hash_info, field++,
-                              PyLong_FromLong(0));  // This is now longer used
+                              PyLong_FromLong(0));  // This is no longer used
     PyStructSequence_SET_ITEM(hash_info, field++,
                               PyLong_FromLong(_PyHASH_IMAG));
     PyStructSequence_SET_ITEM(hash_info, field++,