
Use CPython hash algorithm for frozenset #5164


Merged (2 commits) on Feb 10, 2024
2 changes: 0 additions & 2 deletions Lib/test/test_collections.py
@@ -1838,8 +1838,6 @@ def __repr__(self):
self.assertTrue(f1 != l1)
self.assertTrue(f1 != l2)

# TODO: RUSTPYTHON
@unittest.expectedFailure
def test_Set_hash_matches_frozenset(self):
sets = [
{}, {1}, {None}, {-1}, {0.0}, {"abc"}, {1, 2, 3},
12 changes: 9 additions & 3 deletions Lib/test/test_set.py
@@ -707,8 +707,6 @@ def test_hash_caching(self):
f = self.thetype('abcdcda')
self.assertEqual(hash(f), hash(f))

# TODO: RUSTPYTHON
@unittest.expectedFailure
def test_hash_effectiveness(self):
n = 13
hashvalues = set()
@@ -730,7 +728,15 @@ def powerset(s):
for i in range(len(s)+1):
yield from map(frozenset, itertools.combinations(s, i))

for n in range(18):
# TODO: RUSTPYTHON
# The original test has:
# for n in range(18):
# Due to general performance overhead, hashing a frozenset takes
# about 50 times longer than in CPython. This test amplifies that
Member commented:

Does it take this long even with a release build?

Contributor Author replied:

Yep, as I understand it, cargo bench always uses a release build.

Here are some benchmarks of the original hash function (the new algorithm slows things down by a further 10-20%):

CPython

microbenchmarks/cpython/frozenset.py
                        time:   [464.42 ns 465.94 ns 467.92 ns]
                        thrpt:  [213.71 Melem/s 214.62 Melem/s 215.32 Melem/s]
microbenchmarks/cpython/frozenset.py #2
                        time:   [468.92 ns 470.99 ns 473.70 ns]
                        thrpt:  [633.31 Melem/s 636.96 Melem/s 639.76 Melem/s]
microbenchmarks/cpython/frozenset.py #3
                        time:   [1.5223 µs 1.5513 µs 1.5795 µs]
                        thrpt:  [316.57 Melem/s 322.30 Melem/s 328.45 Melem/s]
microbenchmarks/cpython/frozenset.py #4
                        time:   [1.7608 µs 1.7734 µs 1.7854 µs]
                        thrpt:  [392.06 Melem/s 394.73 Melem/s 397.56 Melem/s]
microbenchmarks/cpython/frozenset.py #5
                        time:   [2.0057 µs 2.0986 µs 2.1882 µs]
                        thrpt:  [411.30 Melem/s 428.87 Melem/s 448.72 Melem/s]

RustPython

microbenchmarks/rustpython/frozenset.py
                        time:   [4.6804 µs 4.6883 µs 4.6969 µs]
                        thrpt:  [21.290 Melem/s 21.330 Melem/s 21.366 Melem/s]
microbenchmarks/rustpython/frozenset.py #2
                        time:   [13.022 µs 13.064 µs 13.106 µs]
                        thrpt:  [22.891 Melem/s 22.963 Melem/s 23.039 Melem/s]
microbenchmarks/rustpython/frozenset.py #3
                        time:   [22.971 µs 23.087 µs 23.184 µs]
                        thrpt:  [21.567 Melem/s 21.657 Melem/s 21.767 Melem/s]
microbenchmarks/rustpython/frozenset.py #4
                        time:   [31.847 µs 31.905 µs 31.968 µs]
                        thrpt:  [21.897 Melem/s 21.940 Melem/s 21.980 Melem/s]
microbenchmarks/rustpython/frozenset.py #5
                        time:   [42.031 µs 42.118 µs 42.197 µs]
                        thrpt:  [21.329 Melem/s 21.368 Melem/s 21.413 Melem/s]

(this is why I fixed the benchmarks first lol)

Contributor Author added:

It is weird that this test just doesn't finish though. Even two orders of magnitude slower should still be within the realm of possibility. This test does some truly bizarre things with power sets that I assume magnify the performance disparity exponentially.

# exponentially, so the best we can do here reasonably is 13.
# Even if the internal hash function did nothing, it would still be
# about 40 times slower than CPython.
for n in range(13):
t = 2 ** n
mask = t - 1
for nums in (range, zf_range):
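For scale on the range(18) -> range(13) change above (a rough estimate, not a figure quoted in the PR): powerset(nums(n)) yields 2**n frozensets, so with the original bound of 18 the largest iteration alone hashes 2**17 (about 131,000) frozensets per nums source, while capping the loop at 13 brings that down to 2**12 (about 4,000), roughly a 32x reduction.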
5 changes: 5 additions & 0 deletions benches/microbenchmarks/frozenset.py
@@ -0,0 +1,5 @@
fs = frozenset(range(0, ITERATIONS))

# ---

hash(fs)
14 changes: 0 additions & 14 deletions common/src/hash.rs
@@ -130,20 +130,6 @@ pub fn hash_float(value: f64) -> Option<PyHash> {
Some(fix_sentinel(x as PyHash * value.signum() as PyHash))
}

pub fn hash_iter_unordered<'a, T: 'a, I, F, E>(iter: I, hashf: F) -> Result<PyHash, E>
where
I: IntoIterator<Item = &'a T>,
F: Fn(&'a T) -> Result<PyHash, E>,
{
let mut hash: PyHash = 0;
for element in iter {
let item_hash = hashf(element)?;
// xor is commutative and hash should be independent of order
hash ^= item_hash;
}
Ok(fix_sentinel(mod_int(hash)))
}

pub fn hash_bigint(value: &BigInt) -> PyHash {
let ret = match value.to_i64() {
Some(i) => mod_int(i),
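For context on why the xor-only helper above was removed: combining element hashes with bare xor collapses many distinct sets of closely spaced hashes onto the same value. A minimal standalone sketch (not the VM code; it assumes small-integer elements whose hashes equal their values, as in CPython):

fn xor_only(element_hashes: &[u64]) -> u64 {
    // The removed scheme: xor the element hashes together and nothing else.
    element_hashes.iter().copied().fold(0, |acc, h| acc ^ h)
}

fn main() {
    // Small ints hash to themselves, so {1, 2} collides with {3} (1 ^ 2 == 3),
    // and {1, 2, 3} collides with the empty set (1 ^ 2 ^ 3 == 0).
    assert_eq!(xor_only(&[1, 2]), xor_only(&[3]));
    assert_eq!(xor_only(&[1, 2, 3]), xor_only(&[]));
    println!("xor-only hashes collide on closely spaced element hashes");
}

The replacement below keeps the xor (so the result stays order-independent) but shuffles each element hash and folds in the set size first.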
23 changes: 22 additions & 1 deletion vm/src/builtins/set.rs
@@ -434,7 +434,28 @@ impl PySetInner {
}

fn hash(&self, vm: &VirtualMachine) -> PyResult<PyHash> {
crate::utils::hash_iter_unordered(self.elements().iter(), vm)
// Work to increase the bit dispersion for closely spaced hash values.
// This is important because some use cases have many combinations of a
// small number of elements with nearby hashes so that many distinct
// combinations collapse to only a handful of distinct hash values.
fn _shuffle_bits(h: u64) -> u64 {
((h ^ 89869747) ^ (h.wrapping_shl(16))).wrapping_mul(3644798167)
}
// Factor in the number of active entries
let mut hash: u64 = (self.elements().len() as u64 + 1).wrapping_mul(1927868237);
// Xor-in shuffled bits from every entry's hash field because xor is
// commutative and a frozenset hash should be independent of order.
for element in self.elements().iter() {
hash ^= _shuffle_bits(element.hash(vm)? as u64);
}
// Disperse patterns arising in nested frozensets
hash ^= (hash >> 11) ^ (hash >> 25);
hash = hash.wrapping_mul(69069).wrapping_add(907133923);
// -1 is reserved as an error code
if hash == u64::MAX {
hash = 590923713;
}
Ok(hash as PyHash)
}

// Run operation, on failure, if item is a set/set subclass, convert it
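A standalone sketch of the same shuffle-and-disperse recipe as the new PySetInner::hash above, assuming plain u64 element hashes instead of PyHash values obtained through the VM; it shows that inputs which collided under bare xor now spread out:

fn shuffle_bits(h: u64) -> u64 {
    // Same constants and steps as the diff above.
    ((h ^ 89869747) ^ h.wrapping_shl(16)).wrapping_mul(3644798167)
}

fn set_hash(element_hashes: &[u64]) -> u64 {
    // Factor in the number of entries, then xor in shuffled per-element bits.
    let mut hash = (element_hashes.len() as u64 + 1).wrapping_mul(1927868237);
    for &h in element_hashes {
        hash ^= shuffle_bits(h);
    }
    // Disperse patterns arising in nested frozensets.
    hash ^= (hash >> 11) ^ (hash >> 25);
    hash = hash.wrapping_mul(69069).wrapping_add(907133923);
    if hash == u64::MAX {
        hash = 590923713; // -1 is reserved as an error code
    }
    hash
}

fn main() {
    // {1, 2} and {3} collided under the old xor-only scheme; here they differ.
    assert_ne!(set_hash(&[1, 2]), set_hash(&[3]));
    println!("{:x} {:x}", set_hash(&[1, 2]), set_hash(&[3]));
}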
7 changes: 6 additions & 1 deletion vm/src/types/slot.rs
@@ -15,6 +15,7 @@ use crate::{
AsObject, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, VirtualMachine,
};
use crossbeam_utils::atomic::AtomicCell;
use malachite_bigint::BigInt;
use num_traits::{Signed, ToPrimitive};
use std::{borrow::Borrow, cmp::Ordering, ops::Deref};

@@ -254,7 +255,11 @@ fn hash_wrapper(zelf: &PyObject, vm: &VirtualMachine) -> PyResult<PyHash> {
let py_int = hash_obj
.payload_if_subclass::<PyInt>(vm)
.ok_or_else(|| vm.new_type_error("__hash__ method should return an integer".to_owned()))?;
Ok(rustpython_common::hash::hash_bigint(py_int.as_bigint()))
let big_int = py_int.as_bigint();
let hash: PyHash = big_int
.to_i64()
.unwrap_or_else(|| (big_int % BigInt::from(u64::MAX)).to_i64().unwrap());
Ok(hash)
}

/// Marks a type as unhashable. Similar to PyObject_HashNotImplemented in CPython
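A rough standalone illustration of the fallback above: a __hash__ return value too large for an i64 is folded back into range with a modulo by u64::MAX. It uses i128 in place of a real bigint purely so the example is self-contained, and narrows the remainder with a wrapping cast rather than the VM's to_i64 conversion:

fn reduce_hash(big: i128) -> i64 {
    // Fast path: the value already fits in an i64, mirroring to_i64().
    if let Ok(h) = i64::try_from(big) {
        return h;
    }
    // Fallback: fold the value back into range with a modulo by u64::MAX
    // (the remainder is narrowed with a plain cast here, unlike the VM code).
    (big % u64::MAX as i128) as i64
}

fn main() {
    assert_eq!(reduce_hash(42), 42);                        // small values pass through
    assert_eq!(reduce_hash(3 * (u64::MAX as i128) + 7), 7); // oversized values are folded
}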
7 changes: 0 additions & 7 deletions vm/src/utils.rs
@@ -11,13 +11,6 @@ pub fn hash_iter<'a, I: IntoIterator<Item = &'a PyObjectRef>>(
vm.state.hash_secret.hash_iter(iter, |obj| obj.hash(vm))
}

pub fn hash_iter_unordered<'a, I: IntoIterator<Item = &'a PyObjectRef>>(
iter: I,
vm: &VirtualMachine,
) -> PyResult<rustpython_common::hash::PyHash> {
rustpython_common::hash::hash_iter_unordered(iter, |obj| obj.hash(vm))
}

impl ToPyObject for std::convert::Infallible {
fn to_pyobject(self, _vm: &VirtualMachine) -> PyObjectRef {
match self {}