From 4715734a37f564b95823ec1e021b540ca429870c Mon Sep 17 00:00:00 2001
From: Dino Viehland <dinoviehland@fb.com>
Date: Mon, 22 Jan 2024 10:37:38 -0800
Subject: [PATCH 1/6] Make PyDictKeysObject thread-safe

---
 Include/internal/pycore_dict.h |   6 +
 Objects/dictobject.c           | 318 +++++++++++++++++++++++----------
 2 files changed, 228 insertions(+), 96 deletions(-)

diff --git a/Include/internal/pycore_dict.h b/Include/internal/pycore_dict.h
index e5ef9a8607a83b..bf7a2abcad3814 100644
--- a/Include/internal/pycore_dict.h
+++ b/Include/internal/pycore_dict.h
@@ -136,6 +136,11 @@ struct _dictkeysobject {
     /* Kind of keys */
     uint8_t dk_kind;
 
+#ifdef Py_GIL_DISABLED
+    /* Lock used to protect shared keys */
+    PyMutex dk_mutex;
+#endif
+
     /* Version number -- Reset to 0 by any modification to keys */
     uint32_t dk_version;
 
@@ -145,6 +150,7 @@ struct _dictkeysobject {
     /* Number of used entries in dk_entries. */
     Py_ssize_t dk_nentries;
 
+
     /* Actual hash table of dk_size entries. It holds indices in dk_entries,
        or DKIX_EMPTY(-1) or DKIX_DUMMY(-2).
 
diff --git a/Objects/dictobject.c b/Objects/dictobject.c
index 25ab21881f8f74..21937562f42987 100644
--- a/Objects/dictobject.c
+++ b/Objects/dictobject.c
@@ -151,9 +151,27 @@ ASSERT_DICT_LOCKED(PyObject *op)
 }
 #define ASSERT_DICT_LOCKED(op) ASSERT_DICT_LOCKED(_Py_CAST(PyObject*, op))
 
-#else
+#define LOCK_KEYS(keys) PyMutex_Lock(&keys->dk_mutex)
+#define UNLOCK_KEYS(keys) PyMutex_Unlock(&keys->dk_mutex)
+
+#define ASSERT_KEYS_LOCKED(keys) assert(PyMutex_IsLocked(&keys->dk_mutex))
+#define LOAD_SHARED_KEY(key) _Py_atomic_load_ptr_acquire(&key)
+#define STORE_SHARED_KEY(key, value) _Py_atomic_store_ptr_release(&key, value)
+// Inc refs the keys object, giving the previous value
+#define INCREF_KEYS(dk)  _Py_atomic_add_ssize(&dk->dk_refcnt, 1)
+// Dec refs the keys object, giving the previous value
+#define DECREF_KEYS(dk)  _Py_atomic_add_ssize(&dk->dk_refcnt, -1)
+
+#else /* Py_GIL_DISABLED */
 
 #define ASSERT_DICT_LOCKED(op)
+#define LOCK_KEYS(keys)
+#define UNLOCK_KEYS(keys)
+#define ASSERT_KEYS_LOCKED(keys)
+#define LOAD_SHARED_KEY(key) key
+#define STORE_SHARED_KEY(key, value) key = value
+#define INCREF_KEYS(dk)  dk->dk_refcnt++
+#define DECREF_KEYS(dk)  dk->dk_refcnt--
 
 #endif
 
@@ -348,7 +366,7 @@ dictkeys_incref(PyDictKeysObject *dk)
 #ifdef Py_REF_DEBUG
     _Py_IncRefTotal(_PyInterpreterState_GET());
 #endif
-    dk->dk_refcnt++;
+    INCREF_KEYS(dk);
 }
 
 static inline void
@@ -361,7 +379,7 @@ dictkeys_decref(PyInterpreterState *interp, PyDictKeysObject *dk)
 #ifdef Py_REF_DEBUG
     _Py_DecRefTotal(_PyInterpreterState_GET());
 #endif
-    if (--dk->dk_refcnt == 0) {
+    if (DECREF_KEYS(dk) == 1) {
         if (DK_IS_UNICODE(dk)) {
             PyDictUnicodeEntry *entries = DK_UNICODE_ENTRIES(dk);
             Py_ssize_t i, n;
@@ -512,6 +530,9 @@ static PyDictKeysObject empty_keys_struct = {
         0, /* dk_log2_size */
         0, /* dk_log2_index_bytes */
         DICT_KEYS_UNICODE, /* dk_kind */
+#ifdef Py_GIL_DISABLED
+        {0}, /* dk_lock */
+#endif
         1, /* dk_version */
         0, /* dk_usable (immutable) */
         0, /* dk_nentries */
@@ -697,6 +718,9 @@ new_keys_object(PyInterpreterState *interp, uint8_t log2_size, bool unicode)
     dk->dk_log2_size = log2_size;
     dk->dk_log2_index_bytes = log2_bytes;
     dk->dk_kind = unicode ? DICT_KEYS_UNICODE : DICT_KEYS_GENERAL;
+#ifdef Py_GIL_DISABLED
+    dk->dk_mutex = (PyMutex){0};
+#endif
     dk->dk_nentries = 0;
     dk->dk_usable = usable;
     dk->dk_version = 0;
@@ -785,6 +809,7 @@ new_dict(PyInterpreterState *interp,
 static inline size_t
 shared_keys_usable_size(PyDictKeysObject *keys)
 {
+    ASSERT_KEYS_LOCKED(keys);
     return (size_t)keys->dk_nentries + (size_t)keys->dk_usable;
 }
 
@@ -792,17 +817,21 @@ shared_keys_usable_size(PyDictKeysObject *keys)
 static PyObject *
 new_dict_with_shared_keys(PyInterpreterState *interp, PyDictKeysObject *keys)
 {
+    LOCK_KEYS(keys);
     size_t size = shared_keys_usable_size(keys);
     PyDictValues *values = new_values(size);
     if (values == NULL) {
         dictkeys_decref(interp, keys);
+        UNLOCK_KEYS(keys);
         return PyErr_NoMemory();
     }
     ((char *)values)[-2] = 0;
     for (size_t i = 0; i < size; i++) {
         values->values[i] = NULL;
     }
-    return new_dict(interp, keys, values, 0, 1);
+    PyObject *res = new_dict(interp, keys, values, 0, 1);
+    UNLOCK_KEYS(keys);
+    return res;
 }
 
 
@@ -1074,7 +1103,7 @@ _Py_dict_lookup(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject **valu
     PyDictKeysObject *dk;
     DictKeysKind kind;
     Py_ssize_t ix;
-
+    // TODO: Thread safety
 start:
     dk = mp->ma_keys;
     kind = dk->dk_kind;
@@ -1190,8 +1219,7 @@ _PyDict_MaybeUntrack(PyObject *op)
 
 /* Internal function to find slot for an item from its hash
    when it is known that the key is not present in the dict.
-
-   The dict must be combined. */
+ */
 static Py_ssize_t
 find_empty_slot(PyDictKeysObject *keys, Py_hash_t hash)
 {
@@ -1218,6 +1246,8 @@ static Py_ssize_t
 insert_into_dictkeys(PyDictKeysObject *keys, PyObject *name)
 {
     assert(PyUnicode_CheckExact(name));
+    ASSERT_KEYS_LOCKED(keys);
+
     Py_hash_t hash = unicode_get_hash(name);
     if (hash == -1) {
         hash = PyUnicode_Type.tp_hash(name);
@@ -1246,6 +1276,101 @@ insert_into_dictkeys(PyDictKeysObject *keys, PyObject *name)
     return ix;
 }
 
+
+static inline int
+insert_unsplit_dict(PyInterpreterState *interp, PyDictObject *mp,
+                    Py_hash_t hash, PyObject *key, PyObject *value, int unicode)
+{
+    if (mp->ma_keys->dk_usable <= 0) {
+        /* Need to resize. */
+        if (insertion_resize(interp, mp, 1) < 0) {
+            return -1;
+        }
+    }
+
+    Py_ssize_t hashpos = find_empty_slot(mp->ma_keys, hash);
+    dictkeys_set_index(mp->ma_keys, hashpos, mp->ma_keys->dk_nentries);
+
+    if (unicode) {
+        PyDictUnicodeEntry *ep;
+        ep = &DK_UNICODE_ENTRIES(mp->ma_keys)[mp->ma_keys->dk_nentries];
+        ep->me_key = key;
+        ep->me_value = value;
+    }
+    else {
+        PyDictKeyEntry *ep;
+        ep = &DK_ENTRIES(mp->ma_keys)[mp->ma_keys->dk_nentries];
+        ep->me_key = key;
+        ep->me_hash = hash;
+        ep->me_value = value;
+    }
+    mp->ma_keys->dk_usable--;
+    mp->ma_keys->dk_nentries++;
+    assert(mp->ma_keys->dk_usable >= 0);
+    return 0;
+}
+
+static int
+insert_split_dict(PyInterpreterState *interp, PyDictObject *mp,
+                  Py_hash_t hash, PyObject *key, PyObject *value)
+{
+    PyDictKeysObject *keys = mp->ma_keys;
+    LOCK_KEYS(keys);
+    if (keys->dk_usable <= 0) {
+        /* Need to resize. */
+        if (insertion_resize(interp, mp, 1) < 0) {
+            UNLOCK_KEYS(keys);
+            return -1;
+        }
+        assert(!_PyDict_HasSplitTable(mp));
+        assert(DK_IS_UNICODE(keys));
+        UNLOCK_KEYS(keys);
+        return insert_unsplit_dict(interp, mp, hash, key, value, 1);
+    }
+
+    Py_ssize_t hashpos = find_empty_slot(keys, hash);
+    dictkeys_set_index(keys, hashpos, keys->dk_nentries);
+
+    PyDictUnicodeEntry *ep;
+    ep = &DK_UNICODE_ENTRIES(keys)[keys->dk_nentries];
+    STORE_SHARED_KEY(ep->me_key, key);
+
+    Py_ssize_t index = keys->dk_nentries;
+    _PyDictValues_AddToInsertionOrder(mp->ma_values, index);
+    assert (mp->ma_values->values[index] == NULL);
+    mp->ma_values->values[index] = value;
+
+    keys->dk_usable--;
+    keys->dk_nentries++;
+    assert(keys->dk_usable >= 0);
+    UNLOCK_KEYS(keys);
+    return 0;
+}
+
+static int
+convert_to_nonunicode_keys(PyInterpreterState *interp, PyDictObject *mp)
+{
+    PyDictKeysObject *keys = mp->ma_keys;
+    if (_PyDict_HasSplitTable(mp)) {
+        LOCK_KEYS(keys);
+        dictkeys_incref(keys);
+    } else {
+        keys = NULL;
+    }
+
+    int res = insertion_resize(interp, mp, 0);
+
+    if (keys != NULL) {
+        dictkeys_decref(interp, keys);
+        UNLOCK_KEYS(keys);
+    }
+    if (res < 0) {
+        return res;
+    }
+    assert(mp->ma_keys->dk_kind == DICT_KEYS_GENERAL);
+    return 0;
+}
+
 /*
 Internal routine to insert a new item into the table.
 Used both by the internal resize routine and by the public insert routine.
@@ -1261,9 +1386,9 @@ insertdict(PyInterpreterState *interp, PyDictObject *mp,
     ASSERT_DICT_LOCKED(mp);
 
     if (DK_IS_UNICODE(mp->ma_keys) && !PyUnicode_CheckExact(key)) {
-        if (insertion_resize(interp, mp, 0) < 0)
+        if (convert_to_nonunicode_keys(interp, mp) < 0) {
             goto Fail;
-        assert(mp->ma_keys->dk_kind == DICT_KEYS_GENERAL);
+        }
     }
 
     Py_ssize_t ix = _Py_dict_lookup(mp, key, hash, &old_value);
@@ -1278,41 +1403,19 @@ insertdict(PyInterpreterState *interp, PyDictObject *mp,
         /* Insert into new slot. */
         mp->ma_keys->dk_version = 0;
         assert(old_value == NULL);
-        if (mp->ma_keys->dk_usable <= 0) {
-            /* Need to resize. */
-            if (insertion_resize(interp, mp, 1) < 0)
-                goto Fail;
-        }
 
-        Py_ssize_t hashpos = find_empty_slot(mp->ma_keys, hash);
-        dictkeys_set_index(mp->ma_keys, hashpos, mp->ma_keys->dk_nentries);
-
-        if (DK_IS_UNICODE(mp->ma_keys)) {
-            PyDictUnicodeEntry *ep;
-            ep = &DK_UNICODE_ENTRIES(mp->ma_keys)[mp->ma_keys->dk_nentries];
-            ep->me_key = key;
-            if (mp->ma_values) {
-                Py_ssize_t index = mp->ma_keys->dk_nentries;
-                _PyDictValues_AddToInsertionOrder(mp->ma_values, index);
-                assert (mp->ma_values->values[index] == NULL);
-                mp->ma_values->values[index] = value;
-            }
-            else {
-                ep->me_value = value;
+        int unicode = DK_IS_UNICODE(mp->ma_keys);
+        if (!unicode || !_PyDict_HasSplitTable(mp)) {
+            if (insert_unsplit_dict(interp, mp, hash, key, value, unicode) < 0) {
+                goto Fail;
             }
+        } else {
+            if (insert_split_dict(interp, mp, hash, key, value) < 0)
+                goto Fail;
         }
-        else {
-            PyDictKeyEntry *ep;
-            ep = &DK_ENTRIES(mp->ma_keys)[mp->ma_keys->dk_nentries];
-            ep->me_key = key;
-            ep->me_hash = hash;
-            ep->me_value = value;
-        }
+
         mp->ma_used++;
         mp->ma_version_tag = new_version;
-        mp->ma_keys->dk_usable--;
-        mp->ma_keys->dk_nentries++;
-        assert(mp->ma_keys->dk_usable >= 0);
         ASSERT_CONSISTENT(mp);
         return 0;
     }
@@ -1482,7 +1585,8 @@ dictresize(PyInterpreterState *interp, PyDictObject *mp,
     Py_ssize_t numentries = mp->ma_used;
 
     if (oldvalues != NULL) {
-         PyDictUnicodeEntry *oldentries = DK_UNICODE_ENTRIES(oldkeys);
+        ASSERT_KEYS_LOCKED(oldkeys);
+        PyDictUnicodeEntry *oldentries = DK_UNICODE_ENTRIES(oldkeys);
         /* Convert split table into new combined table.
          * We must incref keys; we can transfer values.
          */
@@ -2025,7 +2129,7 @@ delitem_common(PyDictObject *mp, Py_hash_t hash, Py_ssize_t ix,
 
     mp->ma_used--;
     mp->ma_version_tag = new_version;
-    if (mp->ma_values) {
+    if (_PyDict_HasSplitTable(mp)) {
         assert(old_value == mp->ma_values->values[ix]);
         mp->ma_values->values[ix] = NULL;
         assert(ix < SHARED_KEYS_MAX_SIZE);
@@ -2244,14 +2348,13 @@ _PyDict_Next(PyObject *op, Py_ssize_t *ppos, PyObject **pkey,
 
     mp = (PyDictObject *)op;
     i = *ppos;
-    if (mp->ma_values) {
+    if (_PyDict_HasSplitTable(mp)) {
         assert(mp->ma_used <= SHARED_KEYS_MAX_SIZE);
         if (i < 0 || i >= mp->ma_used)
             return 0;
         int index = get_index_from_order(mp, i);
         value = mp->ma_values->values[index];
-
-        key = DK_UNICODE_ENTRIES(mp->ma_keys)[index].me_key;
+        key = LOAD_SHARED_KEY(DK_UNICODE_ENTRIES(mp->ma_keys)[index].me_key);
         hash = unicode_get_hash(key);
         assert(value != NULL);
     }
@@ -3143,7 +3246,7 @@ dict_dict_merge(PyInterpreterState *interp, PyDictObject *mp, PyDictObject *othe
 
             dictkeys_decref(interp, mp->ma_keys);
             mp->ma_keys = keys;
-            if (mp->ma_values != NULL) {
+            if (_PyDict_HasSplitTable(mp)) {
                 free_values(mp->ma_values);
                 mp->ma_values = NULL;
             }
@@ -3368,7 +3471,9 @@ copy_lock_held(PyObject *o)
 
     if (_PyDict_HasSplitTable(mp)) {
         PyDictObject *split_copy;
+        LOCK_KEYS(mp->ma_keys);
         size_t size = shared_keys_usable_size(mp->ma_keys);
+        UNLOCK_KEYS(mp->ma_keys);
         PyDictValues *newvalues = new_values(size);
         if (newvalues == NULL)
             return PyErr_NoMemory();
@@ -3484,17 +3589,31 @@ dict_equal_lock_held(PyDictObject *a, PyDictObject *b)
         /* can't be equal if # of entries differ */
         return 0;
     /* Same # of entries -- check all of 'em.  Exit early on any diff. */
-    for (i = 0; i < a->ma_keys->dk_nentries; i++) {
+    Py_ssize_t n_entries;
+#ifdef Py_GIL_DISABLED
+    if (_PyDict_HasSplitTable(a)) {
+        // New entries can be appended, but existing ones won't change, and
+        // our keys won't change because the dict is locked.  So capture
+        // the current number of keys at entry.
+        LOCK_KEYS(a->ma_keys);
+        n_entries = a->ma_keys->dk_nentries;
+        UNLOCK_KEYS(a->ma_keys);
+    } else
+#endif
+    {
+        n_entries = a->ma_keys->dk_nentries;
+    }
+    for (i = 0; i < n_entries; i++) {
         PyObject *key, *aval;
         Py_hash_t hash;
         if (DK_IS_UNICODE(a->ma_keys)) {
             PyDictUnicodeEntry *ep = &DK_UNICODE_ENTRIES(a->ma_keys)[i];
-            key = ep->me_key;
+            key = LOAD_SHARED_KEY(ep->me_key);
             if (key == NULL) {
                 continue;
             }
             hash = unicode_get_hash(key);
-            if (a->ma_values)
+            if (_PyDict_HasSplitTable(a))
                 aval = a->ma_values->values[i];
             else
                 aval = ep->me_value;
@@ -3676,7 +3795,7 @@ dict_setdefault_ref_lock_held(PyObject *d, PyObject *key, PyObject *default_valu
     }
 
     if (!PyUnicode_CheckExact(key) && DK_IS_UNICODE(mp->ma_keys)) {
-        if (insertion_resize(interp, mp, 0) < 0) {
+        if (convert_to_nonunicode_keys(interp, mp) < 0) {
             if (result) {
                 *result = NULL;
             }
@@ -3697,42 +3816,31 @@ dict_setdefault_ref_lock_held(PyObject *d, PyObject *key, PyObject *default_valu
                 interp, PyDict_EVENT_ADDED, mp, key, default_value);
         mp->ma_keys->dk_version = 0;
         value = default_value;
-        if (mp->ma_keys->dk_usable <= 0) {
-            if (insertion_resize(interp, mp, 1) < 0) {
+
+        int unicode = DK_IS_UNICODE(mp->ma_keys);
+        if (!unicode || !_PyDict_HasSplitTable(mp)) {
+            if (insert_unsplit_dict(interp, mp, hash, Py_NewRef(key), Py_NewRef(value), unicode) < 0) {
+                Py_DECREF(key);
+                Py_DECREF(value);
                 if (result) {
                     *result = NULL;
                 }
                 return -1;
             }
-        }
-        Py_ssize_t hashpos = find_empty_slot(mp->ma_keys, hash);
-        dictkeys_set_index(mp->ma_keys, hashpos, mp->ma_keys->dk_nentries);
-        if (DK_IS_UNICODE(mp->ma_keys)) {
-            assert(PyUnicode_CheckExact(key));
-            PyDictUnicodeEntry *ep = &DK_UNICODE_ENTRIES(mp->ma_keys)[mp->ma_keys->dk_nentries];
-            ep->me_key = Py_NewRef(key);
-            if (_PyDict_HasSplitTable(mp)) {
-                Py_ssize_t index = (int)mp->ma_keys->dk_nentries;
-                assert(index < SHARED_KEYS_MAX_SIZE);
-                assert(mp->ma_values->values[index] == NULL);
-                mp->ma_values->values[index] = Py_NewRef(value);
-                _PyDictValues_AddToInsertionOrder(mp->ma_values, index);
-            }
-            else {
-                ep->me_value = Py_NewRef(value);
+        } else {
+            if (insert_split_dict(interp, mp, hash, Py_NewRef(key), Py_NewRef(value)) < 0) {
+                Py_DECREF(key);
+                Py_DECREF(value);
+                if (result) {
+                    *result = NULL;
+                }
+                return -1;
             }
         }
-        else {
-            PyDictKeyEntry *ep = &DK_ENTRIES(mp->ma_keys)[mp->ma_keys->dk_nentries];
-            ep->me_key = Py_NewRef(key);
-            ep->me_hash = hash;
-            ep->me_value = Py_NewRef(value);
-        }
+
         MAINTAIN_TRACKING(mp, key, value);
         mp->ma_used++;
         mp->ma_version_tag = new_version;
-        mp->ma_keys->dk_usable--;
-        mp->ma_keys->dk_nentries++;
         assert(mp->ma_keys->dk_usable >= 0);
         ASSERT_CONSISTENT(mp);
         if (result) {
@@ -3881,8 +3989,16 @@ dict_popitem_impl(PyDictObject *self)
         return NULL;
     }
     /* Convert split table to combined table */
-    if (self->ma_keys->dk_kind == DICT_KEYS_SPLIT) {
-        if (dictresize(interp, self, DK_LOG_SIZE(self->ma_keys), 1)) {
+    if (_PyDict_HasSplitTable(self)) {
+        PyDictKeysObject *keys = self->ma_keys;
+        dictkeys_incref(keys);
+        LOCK_KEYS(keys);
+
+        int status = dictresize(interp, self, DK_LOG_SIZE(self->ma_keys), 1);
+        dictkeys_decref(interp, keys);
+        UNLOCK_KEYS(keys);
+
+        if (status < 0) {
             Py_DECREF(res);
             return NULL;
         }
@@ -3949,7 +4065,7 @@ dict_traverse(PyObject *op, visitproc visit, void *arg)
     Py_ssize_t i, n = keys->dk_nentries;
 
     if (DK_IS_UNICODE(keys)) {
-        if (mp->ma_values != NULL) {
+        if (_PyDict_HasSplitTable(mp)) {
             for (i = 0; i < n; i++) {
                 Py_VISIT(mp->ma_values->values[i]);
             }
@@ -3986,8 +4102,10 @@ static Py_ssize_t
 sizeof_lock_held(PyDictObject *mp)
 {
     size_t res = _PyObject_SIZE(Py_TYPE(mp));
-    if (mp->ma_values) {
+    if (_PyDict_HasSplitTable(mp)) {
+        LOCK_KEYS(mp->ma_keys);
         res += shared_keys_usable_size(mp->ma_keys) * sizeof(PyObject*);
+        UNLOCK_KEYS(mp->ma_keys);
     }
     /* If the dictionary is split, the keys portion is accounted-for
        in the type object. */
@@ -4431,7 +4549,7 @@ dictiter_new(PyDictObject *dict, PyTypeObject *itertype)
     if (itertype == &PyDictRevIterKey_Type ||
          itertype == &PyDictRevIterItem_Type ||
          itertype == &PyDictRevIterValue_Type) {
-        if (dict->ma_values) {
+        if (_PyDict_HasSplitTable(dict)) {
             di->di_pos = dict->ma_used - 1;
         }
         else {
@@ -4523,11 +4641,11 @@ dictiter_iternextkey_lock_held(PyDictObject *d, PyObject *self)
     i = di->di_pos;
     k = d->ma_keys;
     assert(i >= 0);
-    if (d->ma_values) {
+    if (_PyDict_HasSplitTable(d)) {
         if (i >= d->ma_used)
             goto fail;
         int index = get_index_from_order(d, i);
-        key = DK_UNICODE_ENTRIES(k)[index].me_key;
+        key = LOAD_SHARED_KEY(DK_UNICODE_ENTRIES(k)[index].me_key);
         assert(d->ma_values->values[index] != NULL);
     }
     else {
@@ -4638,7 +4756,7 @@ dictiter_iternextvalue_lock_held(PyDictObject *d, PyObject *self)
 
     i = di->di_pos;
     assert(i >= 0);
-    if (d->ma_values) {
+    if (_PyDict_HasSplitTable(d)) {
         if (i >= d->ma_used)
             goto fail;
         int index = get_index_from_order(d, i);
@@ -4752,11 +4870,11 @@ dictiter_iternextitem_lock_held(PyDictObject *d, PyObject *self)
 
     i = di->di_pos;
     assert(i >= 0);
-    if (d->ma_values) {
+    if (_PyDict_HasSplitTable(d)) {
         if (i >= d->ma_used)
             goto fail;
         int index = get_index_from_order(d, i);
-        key = DK_UNICODE_ENTRIES(d->ma_keys)[index].me_key;
+        key = LOAD_SHARED_KEY(DK_UNICODE_ENTRIES(d->ma_keys)[index].me_key);
         value = d->ma_values->values[index];
         assert(value != NULL);
     }
@@ -4897,9 +5015,9 @@ dictreviter_iter_PyDict_Next(PyDictObject *d, PyObject *self)
     if (i < 0) {
         goto fail;
     }
-    if (d->ma_values) {
+    if (_PyDict_HasSplitTable(d)) {
         int index = get_index_from_order(d, i);
-        key = DK_UNICODE_ENTRIES(k)[index].me_key;
+        key = LOAD_SHARED_KEY(DK_UNICODE_ENTRIES(k)[index].me_key);
         value = d->ma_values->values[index];
         assert (value != NULL);
     }
@@ -5915,7 +6033,9 @@ _PyObject_InitInlineValues(PyObject *obj, PyTypeObject *tp)
     if (keys->dk_usable > 1) {
         keys->dk_usable--;
     }
+    LOCK_KEYS(keys);
     size_t size = shared_keys_usable_size(keys);
+    UNLOCK_KEYS(keys);
     PyDictValues *values = new_values(size);
     if (values == NULL) {
         PyErr_NoMemory();
@@ -5965,7 +6085,9 @@ make_dict_from_instance_attributes(PyInterpreterState *interp,
     dictkeys_incref(keys);
     Py_ssize_t used = 0;
     Py_ssize_t track = 0;
+    LOCK_KEYS(keys);
     size_t size = shared_keys_usable_size(keys);
+    UNLOCK_KEYS(keys);
     for (size_t i = 0; i < size; i++) {
         PyObject *val = values->values[i];
         if (val != NULL) {
@@ -6045,22 +6167,26 @@ _PyObject_StoreInstanceAttribute(PyObject *obj, PyDictValues *values,
     assert(Py_TYPE(obj)->tp_flags & Py_TPFLAGS_MANAGED_DICT);
     Py_ssize_t ix = DKIX_EMPTY;
     if (PyUnicode_CheckExact(name)) {
+        LOCK_KEYS(keys);
         ix = insert_into_dictkeys(keys, name);
-    }
-    if (ix == DKIX_EMPTY) {
 #ifdef Py_STATS
-        if (PyUnicode_CheckExact(name)) {
-            if (shared_keys_usable_size(keys) == SHARED_KEYS_MAX_SIZE) {
-                OBJECT_STAT_INC(dict_materialized_too_big);
+        if (ix == DKIX_EMPTY) {
+            if (PyUnicode_CheckExact(name)) {
+                if (shared_keys_usable_size(keys) == SHARED_KEYS_MAX_SIZE) {
+                    OBJECT_STAT_INC(dict_materialized_too_big);
+                }
+                else {
+                    OBJECT_STAT_INC(dict_materialized_new_key);
+                }
             }
             else {
-                OBJECT_STAT_INC(dict_materialized_new_key);
+                OBJECT_STAT_INC(dict_materialized_str_subclass);
             }
         }
-        else {
-            OBJECT_STAT_INC(dict_materialized_str_subclass);
-        }
 #endif
+        UNLOCK_KEYS(keys);
+    }
+    if (ix == DKIX_EMPTY) {
         PyObject *dict = make_dict_from_instance_attributes(
                 interp, keys, values);
         if (dict == NULL) {

From 18d29166d564365166b6a0db81f5d0622bae2b3d Mon Sep 17 00:00:00 2001
From: Dino Viehland <dinoviehland@fb.com>
Date: Wed, 31 Jan 2024 09:03:43 -0800
Subject: [PATCH 2/6] Fix naming of unsplit -> combined

---
 Objects/dictobject.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Objects/dictobject.c b/Objects/dictobject.c
index 21937562f42987..cc28275a47225f 100644
--- a/Objects/dictobject.c
+++ b/Objects/dictobject.c
@@ -1278,7 +1278,7 @@ insert_into_dictkeys(PyDictKeysObject *keys, PyObject *name)
 
 
 static inline int
-insert_unsplit_dict(PyInterpreterState *interp, PyDictObject *mp,
+insert_combined_dict(PyInterpreterState *interp, PyDictObject *mp,
                     Py_hash_t hash, PyObject *key, PyObject *value, int unicode)
 {
     if (mp->ma_keys->dk_usable <= 0) {
@@ -1325,7 +1325,7 @@ insert_split_dict(PyInterpreterState *interp, PyDictObject *mp,
         assert(!_PyDict_HasSplitTable(mp));
         assert(DK_IS_UNICODE(keys));
         UNLOCK_KEYS(keys);
-        return insert_unsplit_dict(interp, mp, hash, key, value, 1);
+        return insert_combined_dict(interp, mp, hash, key, value, 1);
     }
 
     Py_ssize_t hashpos = find_empty_slot(keys, hash);
@@ -1406,7 +1406,7 @@ insertdict(PyInterpreterState *interp, PyDictObject *mp,
 
         int unicode = DK_IS_UNICODE(mp->ma_keys);
         if (!unicode || !_PyDict_HasSplitTable(mp)) {
-            if (insert_unsplit_dict(interp, mp, hash, key, value, unicode) < 0) {
+            if (insert_combined_dict(interp, mp, hash, key, value, unicode) < 0) {
                 goto Fail;
             }
         } else {
@@ -3819,7 +3819,7 @@ dict_setdefault_ref_lock_held(PyObject *d, PyObject *key, PyObject *default_valu
 
         int unicode = DK_IS_UNICODE(mp->ma_keys);
         if (!unicode || !_PyDict_HasSplitTable(mp)) {
-            if (insert_unsplit_dict(interp, mp, hash, Py_NewRef(key), Py_NewRef(value), unicode) < 0) {
+            if (insert_combined_dict(interp, mp, hash, Py_NewRef(key), Py_NewRef(value), unicode) < 0) {
                 Py_DECREF(key);
                 Py_DECREF(value);
                 if (result) {

From 61caf84549c9f9eb53301faa89383c9fff947a03 Mon Sep 17 00:00:00 2001
From: Dino Viehland <dinoviehland@fb.com>
Date: Fri, 2 Feb 2024 11:58:49 -0800
Subject: [PATCH 3/6] Lock less and deal with atomic updates to shared size

---
 Include/cpython/pyatomic.h     |  3 ++
 Include/cpython/pyatomic_gcc.h |  4 +++
 Include/cpython/pyatomic_msc.h | 12 +++++++
 Include/cpython/pyatomic_std.h |  8 ++++-
 Objects/dictobject.c           | 60 +++++++++++++++++++++++-----------
 5 files changed, 67 insertions(+), 20 deletions(-)

diff --git a/Include/cpython/pyatomic.h b/Include/cpython/pyatomic.h
index 737eed8b12dd81..d40fdb8a906736 100644
--- a/Include/cpython/pyatomic.h
+++ b/Include/cpython/pyatomic.h
@@ -484,6 +484,9 @@ _Py_atomic_load_uint64_acquire(const uint64_t *obj);
 static inline uint32_t
 _Py_atomic_load_uint32_acquire(const uint32_t *obj);
 
+static inline Py_ssize_t
+_Py_atomic_load_ssize_acquire(const Py_ssize_t *obj);
+
 
 // --- _Py_atomic_fence ------------------------------------------------------
 
diff --git a/Include/cpython/pyatomic_gcc.h b/Include/cpython/pyatomic_gcc.h
index de23edfc6877d2..56e57d83a3d2cb 100644
--- a/Include/cpython/pyatomic_gcc.h
+++ b/Include/cpython/pyatomic_gcc.h
@@ -516,6 +516,10 @@ static inline uint32_t
 _Py_atomic_load_uint32_acquire(const uint32_t *obj)
 { return __atomic_load_n(obj, __ATOMIC_ACQUIRE); }
 
+static inline Py_ssize_t
+_Py_atomic_load_ssize_acquire(const Py_ssize_t *obj)
+{ return __atomic_load_n(obj, __ATOMIC_ACQUIRE); }
+
 // --- _Py_atomic_fence ------------------------------------------------------
 
 static inline void
diff --git a/Include/cpython/pyatomic_msc.h b/Include/cpython/pyatomic_msc.h
index 9809d9806d7b57..32c5d4548bb905 100644
--- a/Include/cpython/pyatomic_msc.h
+++ b/Include/cpython/pyatomic_msc.h
@@ -990,6 +990,18 @@ _Py_atomic_load_uint32_acquire(const uint32_t *obj)
 #endif
 }
 
+static inline Py_ssize_t
+_Py_atomic_load_ssize_acquire(const Py_ssize_t *obj)
+{
+#if defined(_M_X64) || defined(_M_IX86)
+    return *(Py_ssize_t volatile *)obj;
+#elif defined(_M_ARM64)
+    return (Py_ssize_t)__ldar64((unsigned __int64 volatile *)obj);
+#else
+#  error "no implementation of _Py_atomic_load_ssize_acquire"
+#endif
+}
+
 // --- _Py_atomic_fence ------------------------------------------------------
 
  static inline void
diff --git a/Include/cpython/pyatomic_std.h b/Include/cpython/pyatomic_std.h
index f5bd73a8a49e31..93acefe696d4b7 100644
--- a/Include/cpython/pyatomic_std.h
+++ b/Include/cpython/pyatomic_std.h
@@ -908,7 +908,13 @@ _Py_atomic_load_uint32_acquire(const uint32_t *obj)
 {
     _Py_USING_STD;
     return atomic_load_explicit((const _Atomic(uint32_t)*)obj,
-                                memory_order_acquire);
+}
+
+static inline Py_ssize_t
+_Py_atomic_load_ssize_acquire(const Py_ssize_t *obj)
+{
+    _Py_USING_STD;
+    return atomic_load_explicit((const _Atomic(Py_ssize_t)*)obj,
 }
 
 
diff --git a/Objects/dictobject.c b/Objects/dictobject.c
index cc28275a47225f..0e6191ba6dd5e1 100644
--- a/Objects/dictobject.c
+++ b/Objects/dictobject.c
@@ -151,7 +151,7 @@ ASSERT_DICT_LOCKED(PyObject *op)
 }
 #define ASSERT_DICT_LOCKED(op) ASSERT_DICT_LOCKED(_Py_CAST(PyObject*, op))
 
-#define LOCK_KEYS(keys) PyMutex_Lock(&keys->dk_mutex)
+#define LOCK_KEYS(keys) PyMutex_LockFlags(&keys->dk_mutex, _Py_LOCK_DONT_DETACH)
 #define UNLOCK_KEYS(keys) PyMutex_Unlock(&keys->dk_mutex)
 
 #define ASSERT_KEYS_LOCKED(keys) assert(PyMutex_IsLocked(&keys->dk_mutex))
@@ -161,6 +161,15 @@ ASSERT_DICT_LOCKED(PyObject *op)
 #define INCREF_KEYS(dk)  _Py_atomic_add_ssize(&dk->dk_refcnt, 1)
 // Dec refs the keys object, giving the previous value
 #define DECREF_KEYS(dk)  _Py_atomic_add_ssize(&dk->dk_refcnt, -1)
+static inline void split_keys_entry_added(PyDictKeysObject *keys)
+{
+    ASSERT_KEYS_LOCKED(keys);
+
+    // We increase before we decrease so we never get too small of a value
+    // when we're racing with reads
+    _Py_atomic_store_ssize(&keys->dk_nentries, keys->dk_nentries + 1);
+    _Py_atomic_store_ssize(&keys->dk_usable, keys->dk_usable - 1);
+}
 
 #else /* Py_GIL_DISABLED */
 
@@ -172,6 +181,11 @@ ASSERT_DICT_LOCKED(PyObject *op)
 #define STORE_SHARED_KEY(key, value) key = value
 #define INCREF_KEYS(dk)  dk->dk_refcnt++
 #define DECREF_KEYS(dk)  dk->dk_refcnt--
+static inline void split_keys_entry_added(PyDictKeysObject *keys)
+{
+    keys->dk_usable--;
+    keys->dk_nentries++;
+}
 
 #endif
 
@@ -809,15 +823,25 @@ new_dict(PyInterpreterState *interp,
 static inline size_t
 shared_keys_usable_size(PyDictKeysObject *keys)
 {
-    ASSERT_KEYS_LOCKED(keys);
+#ifdef Py_GIL_DISABLED
+    // dk_usable will decrease for each instance that is created and each
+    // value that is added.  dk_entries will increase for each value that
+    // is added.  We want to always return the right value or larger.
+    // We therefore increase dk_entries first and we decrease dk_usable
+    // second, and conversely here we read dk_usable first and dk_entries
+    // second (to avoid the case where we read entries before the increment
+    // and read usable after the decrement)
+    return (size_t)(_Py_atomic_load_ssize_acquire(&keys->dk_usable) +
+                    _Py_atomic_load_ssize_acquire(&keys->dk_nentries));
+#else
     return (size_t)keys->dk_nentries + (size_t)keys->dk_usable;
+#endif
 }
 
 /* Consumes a reference to the keys object */
 static PyObject *
 new_dict_with_shared_keys(PyInterpreterState *interp, PyDictKeysObject *keys)
 {
-    LOCK_KEYS(keys);
     size_t size = shared_keys_usable_size(keys);
     PyDictValues *values = new_values(size);
     if (values == NULL) {
@@ -830,7 +854,6 @@ new_dict_with_shared_keys(PyInterpreterState *interp, PyDictKeysObject *keys)
         values->values[i] = NULL;
     }
     PyObject *res = new_dict(interp, keys, values, 0, 1);
-    UNLOCK_KEYS(keys);
     return res;
 }
 
@@ -1269,8 +1292,7 @@ insert_into_dictkeys(PyDictKeysObject *keys, PyObject *name)
         dictkeys_set_index(keys, hashpos, ix);
         assert(ep->me_key == NULL);
         ep->me_key = Py_NewRef(name);
-        keys->dk_usable--;
-        keys->dk_nentries++;
+        split_keys_entry_added(keys);
     }
     assert (ix < SHARED_KEYS_MAX_SIZE);
     return ix;
@@ -1279,7 +1301,7 @@ insert_into_dictkeys(PyDictKeysObject *keys, PyObject *name)
 
 static inline int
 insert_combined_dict(PyInterpreterState *interp, PyDictObject *mp,
-                    Py_hash_t hash, PyObject *key, PyObject *value, int unicode)
+                     Py_hash_t hash, PyObject *key, PyObject *value, int unicode)
 {
     if (mp->ma_keys->dk_usable <= 0) {
         /* Need to resize. */
@@ -1340,8 +1362,7 @@ insert_split_dict(PyInterpreterState *interp, PyDictObject *mp,
     assert (mp->ma_values->values[index] == NULL);
     mp->ma_values->values[index] = value;
 
-    keys->dk_usable--;
-    keys->dk_nentries++;
+    split_keys_entry_added(keys);
     assert(keys->dk_usable >= 0);
     UNLOCK_KEYS(keys);
     return 0;
@@ -3471,9 +3492,7 @@ copy_lock_held(PyObject *o)
 
     if (_PyDict_HasSplitTable(mp)) {
         PyDictObject *split_copy;
-        LOCK_KEYS(mp->ma_keys);
         size_t size = shared_keys_usable_size(mp->ma_keys);
-        UNLOCK_KEYS(mp->ma_keys);
         PyDictValues *newvalues = new_values(size);
         if (newvalues == NULL)
             return PyErr_NoMemory();
@@ -3608,7 +3627,7 @@ dict_equal_lock_held(PyDictObject *a, PyDictObject *b)
         Py_hash_t hash;
         if (DK_IS_UNICODE(a->ma_keys)) {
             PyDictUnicodeEntry *ep = &DK_UNICODE_ENTRIES(a->ma_keys)[i];
-            key = LOAD_SHARED_KEY(ep->me_key);
+            key = ep->me_key;
             if (key == NULL) {
                 continue;
             }
@@ -3995,8 +4014,8 @@ dict_popitem_impl(PyDictObject *self)
         LOCK_KEYS(keys);
 
         int status = dictresize(interp, self, DK_LOG_SIZE(self->ma_keys), 1);
-        dictkeys_decref(interp, keys);
         UNLOCK_KEYS(keys);
+        dictkeys_decref(interp, keys);
 
         if (status < 0) {
             Py_DECREF(res);
@@ -4103,9 +4122,7 @@ sizeof_lock_held(PyDictObject *mp)
 {
     size_t res = _PyObject_SIZE(Py_TYPE(mp));
     if (_PyDict_HasSplitTable(mp)) {
-        LOCK_KEYS(mp->ma_keys);
         res += shared_keys_usable_size(mp->ma_keys) * sizeof(PyObject*);
-        UNLOCK_KEYS(mp->ma_keys);
     }
     /* If the dictionary is split, the keys portion is accounted-for
        in the type object. */
@@ -6030,12 +6047,19 @@ _PyObject_InitInlineValues(PyObject *obj, PyTypeObject *tp)
     assert(tp->tp_flags & Py_TPFLAGS_MANAGED_DICT);
     PyDictKeysObject *keys = CACHED_KEYS(tp);
     assert(keys != NULL);
+#ifdef Py_GIL_DISABLED
+    Py_ssize_t usable = keys->dk_usable;
+    while (usable > 1) {
+        if (_Py_atomic_compare_exchange_ssize(&keys->dk_usable, &usable, usable - 1)) {
+            break;
+        }
+    }
+#else
     if (keys->dk_usable > 1) {
         keys->dk_usable--;
     }
-    LOCK_KEYS(keys);
+#endif
     size_t size = shared_keys_usable_size(keys);
-    UNLOCK_KEYS(keys);
     PyDictValues *values = new_values(size);
     if (values == NULL) {
         PyErr_NoMemory();
@@ -6085,9 +6109,7 @@ make_dict_from_instance_attributes(PyInterpreterState *interp,
     dictkeys_incref(keys);
     Py_ssize_t used = 0;
     Py_ssize_t track = 0;
-    LOCK_KEYS(keys);
     size_t size = shared_keys_usable_size(keys);
-    UNLOCK_KEYS(keys);
     for (size_t i = 0; i < size; i++) {
         PyObject *val = values->values[i];
         if (val != NULL) {

From 268c80631dcfc7357b4886aa15d9b3b7057d8d85 Mon Sep 17 00:00:00 2001
From: Dino Viehland <dinoviehland@fb.com>
Date: Wed, 7 Feb 2024 16:30:18 -0800
Subject: [PATCH 4/6] Address review feedback

---
 Include/cpython/pyatomic.h     |  3 ++
 Include/cpython/pyatomic_gcc.h |  4 ++
 Include/cpython/pyatomic_msc.h | 12 ++++++
 Include/cpython/pyatomic_std.h |  8 ++++
 Objects/dictobject.c           | 72 +++++++++++++++-------------------
 5 files changed, 58 insertions(+), 41 deletions(-)

diff --git a/Include/cpython/pyatomic.h b/Include/cpython/pyatomic.h
index d40fdb8a906736..c3e132d3877ca5 100644
--- a/Include/cpython/pyatomic.h
+++ b/Include/cpython/pyatomic.h
@@ -469,6 +469,9 @@ _Py_atomic_load_ptr_acquire(const void *obj);
 static inline void
 _Py_atomic_store_ptr_release(void *obj, void *value);
 
+static inline void
+_Py_atomic_store_ssize_release(Py_ssize_t *obj, Py_ssize_t value);
+
 static inline void
 _Py_atomic_store_int_release(int *obj, int value);
 
diff --git a/Include/cpython/pyatomic_gcc.h b/Include/cpython/pyatomic_gcc.h
index 56e57d83a3d2cb..0b40f81bd8736d 100644
--- a/Include/cpython/pyatomic_gcc.h
+++ b/Include/cpython/pyatomic_gcc.h
@@ -500,6 +500,10 @@ static inline void
 _Py_atomic_store_int_release(int *obj, int value)
 { __atomic_store_n(obj, value, __ATOMIC_RELEASE); }
 
+static inline void
+_Py_atomic_store_ssize_release(Py_ssize_t *obj, Py_ssize_t value)
+{ __atomic_store_n(obj, value, __ATOMIC_RELEASE); }
+
 static inline int
 _Py_atomic_load_int_acquire(const int *obj)
 { return __atomic_load_n(obj, __ATOMIC_ACQUIRE); }
diff --git a/Include/cpython/pyatomic_msc.h b/Include/cpython/pyatomic_msc.h
index 32c5d4548bb905..6fbfedf5f19ee5 100644
--- a/Include/cpython/pyatomic_msc.h
+++ b/Include/cpython/pyatomic_msc.h
@@ -939,6 +939,18 @@ _Py_atomic_store_int_release(int *obj, int value)
 #endif
 }
 
+static inline void
+_Py_atomic_store_ssize_release(Py_ssize_t *obj, Py_ssize_t value)
+{
+#if defined(_M_X64) || defined(_M_IX86)
+    *(Py_ssize_t volatile *)obj = value;
+#elif defined(_M_ARM64)
+    __stlr64((unsigned __int64 volatile *)obj, (uintptr_t)value);
+#else
+#  error "no implementation of _Py_atomic_store_int_release"
+#endif
+}
+
 static inline int
 _Py_atomic_load_int_acquire(const int *obj)
 {
diff --git a/Include/cpython/pyatomic_std.h b/Include/cpython/pyatomic_std.h
index 93acefe696d4b7..f3970a45df24f9 100644
--- a/Include/cpython/pyatomic_std.h
+++ b/Include/cpython/pyatomic_std.h
@@ -879,6 +879,14 @@ _Py_atomic_store_int_release(int *obj, int value)
                           memory_order_release);
 }
 
+static inline void
+_Py_atomic_store_ssize_release(Py_ssize_t *obj, Py_ssize_t value)
+{
+    _Py_USING_STD;
+    atomic_store_explicit((_Atomic(Py_ssize_t)*)obj, value,
+                          memory_order_release);
+}
+
 static inline int
 _Py_atomic_load_int_acquire(const int *obj)
 {
diff --git a/Objects/dictobject.c b/Objects/dictobject.c
index 0e6191ba6dd5e1..4cf706aab2ad75 100644
--- a/Objects/dictobject.c
+++ b/Objects/dictobject.c
@@ -161,14 +161,16 @@ ASSERT_DICT_LOCKED(PyObject *op)
 #define INCREF_KEYS(dk)  _Py_atomic_add_ssize(&dk->dk_refcnt, 1)
 // Dec refs the keys object, giving the previous value
 #define DECREF_KEYS(dk)  _Py_atomic_add_ssize(&dk->dk_refcnt, -1)
+#define LOAD_KEYS_NENTIRES(keys) _Py_atomic_load_ssize_relaxed(&keys->dk_nentries)
+
 static inline void split_keys_entry_added(PyDictKeysObject *keys)
 {
     ASSERT_KEYS_LOCKED(keys);
 
     // We increase before we decrease so we never get too small of a value
     // when we're racing with reads
-    _Py_atomic_store_ssize(&keys->dk_nentries, keys->dk_nentries + 1);
-    _Py_atomic_store_ssize(&keys->dk_usable, keys->dk_usable - 1);
+    _Py_atomic_store_ssize_relaxed(&keys->dk_nentries, keys->dk_nentries + 1);
+    _Py_atomic_store_ssize_release(&keys->dk_usable, keys->dk_usable - 1);
 }
 
 #else /* Py_GIL_DISABLED */
@@ -181,6 +183,8 @@ static inline void split_keys_entry_added(PyDictKeysObject *keys)
 #define STORE_SHARED_KEY(key, value) key = value
 #define INCREF_KEYS(dk)  dk->dk_refcnt++
 #define DECREF_KEYS(dk)  dk->dk_refcnt--
+#define LOAD_KEYS_NENTIRES(keys) keys->dk_nentries
+
 static inline void split_keys_entry_added(PyDictKeysObject *keys)
 {
     keys->dk_usable--;
@@ -545,7 +549,7 @@ static PyDictKeysObject empty_keys_struct = {
         0, /* dk_log2_index_bytes */
         DICT_KEYS_UNICODE, /* dk_kind */
 #ifdef Py_GIL_DISABLED
-        {0}, /* dk_lock */
+        {0}, /* dk_mutex */
 #endif
         1, /* dk_version */
         0, /* dk_usable (immutable) */
@@ -825,9 +829,9 @@ shared_keys_usable_size(PyDictKeysObject *keys)
 {
 #ifdef Py_GIL_DISABLED
     // dk_usable will decrease for each instance that is created and each
-    // value that is added.  dk_entries will increase for each value that
+    // value that is added.  dk_nentries will increase for each value that
     // is added.  We want to always return the right value or larger.
-    // We therefore increase dk_entries first and we decrease dk_usable
+    // We therefore increase dk_nentries first and we decrease dk_usable
     // second, and conversely here we read dk_usable first and dk_entries
     // second (to avoid the case where we read entries before the increment
     // and read usable after the decrement)
@@ -846,15 +850,13 @@ new_dict_with_shared_keys(PyInterpreterState *interp, PyDictKeysObject *keys)
     PyDictValues *values = new_values(size);
     if (values == NULL) {
         dictkeys_decref(interp, keys);
-        UNLOCK_KEYS(keys);
         return PyErr_NoMemory();
     }
     ((char *)values)[-2] = 0;
     for (size_t i = 0; i < size; i++) {
         values->values[i] = NULL;
     }
-    PyObject *res = new_dict(interp, keys, values, 0, 1);
-    return res;
+    return new_dict(interp, keys, values, 0, 1);
 }
 
 
@@ -1266,7 +1268,7 @@ insertion_resize(PyInterpreterState *interp, PyDictObject *mp, int unicode)
 }
 
 static Py_ssize_t
-insert_into_dictkeys(PyDictKeysObject *keys, PyObject *name)
+insert_into_splitdictkeys(PyDictKeysObject *keys, PyObject *name)
 {
     assert(PyUnicode_CheckExact(name));
     ASSERT_KEYS_LOCKED(keys);
@@ -1301,7 +1303,7 @@ insert_into_dictkeys(PyDictKeysObject *keys, PyObject *name)
 
 static inline int
 insert_combined_dict(PyInterpreterState *interp, PyDictObject *mp,
-                     Py_hash_t hash, PyObject *key, PyObject *value, int unicode)
+                     Py_hash_t hash, PyObject *key, PyObject *value)
 {
     if (mp->ma_keys->dk_usable <= 0) {
         /* Need to resize. */
@@ -1313,7 +1315,7 @@ insert_combined_dict(PyInterpreterState *interp, PyDictObject *mp,
     Py_ssize_t hashpos = find_empty_slot(mp->ma_keys, hash);
     dictkeys_set_index(mp->ma_keys, hashpos, mp->ma_keys->dk_nentries);
 
-    if (unicode) {
+    if (DK_IS_UNICODE(mp->ma_keys)) {
         PyDictUnicodeEntry *ep;
         ep = &DK_UNICODE_ENTRIES(mp->ma_keys)[mp->ma_keys->dk_nentries];
         ep->me_key = key;
@@ -1340,14 +1342,16 @@ insert_split_dict(PyInterpreterState *interp, PyDictObject *mp,
     LOCK_KEYS(keys);
     if (keys->dk_usable <= 0) {
         /* Need to resize. */
-        if (insertion_resize(interp, mp, 1) < 0) {
-            UNLOCK_KEYS(keys);
+        dictkeys_incref(keys);
+        int ins = insertion_resize(interp, mp, 1);
+        dictkeys_decref(interp, keys);
+        UNLOCK_KEYS(keys);
+        if (ins < 0) {
             return -1;
         }
         assert(!_PyDict_HasSplitTable(mp));
         assert(DK_IS_UNICODE(keys));
-        UNLOCK_KEYS(keys);
-        return insert_combined_dict(interp, mp, hash, key, value, 1);
+        return insert_combined_dict(interp, mp, hash, key, value);
     }
 
     Py_ssize_t hashpos = find_empty_slot(keys, hash);
@@ -1425,9 +1429,8 @@ insertdict(PyInterpreterState *interp, PyDictObject *mp,
         mp->ma_keys->dk_version = 0;
         assert(old_value == NULL);
 
-        int unicode = DK_IS_UNICODE(mp->ma_keys);
-        if (!unicode || !_PyDict_HasSplitTable(mp)) {
-            if (insert_combined_dict(interp, mp, hash, key, value, unicode) < 0) {
+        if (!_PyDict_HasSplitTable(mp)) {
+            if (insert_combined_dict(interp, mp, hash, key, value) < 0) {
                 goto Fail;
             }
         } else {
@@ -3608,21 +3611,7 @@ dict_equal_lock_held(PyDictObject *a, PyDictObject *b)
         /* can't be equal if # of entries differ */
         return 0;
     /* Same # of entries -- check all of 'em.  Exit early on any diff. */
-    Py_ssize_t n_entries;
-#ifdef Py_GIL_DISABLED
-    if (_PyDict_HasSplitTable(a)) {
-        // New entries can be appended, but existing ones won't change, and
-        // our keys won't change because the dict is locked.  So capture
-        // the current number of keys at entry.
-        LOCK_KEYS(a->ma_keys);
-        n_entries = a->ma_keys->dk_nentries;
-        UNLOCK_KEYS(a->ma_keys);
-    } else
-#endif
-    {
-        n_entries = a->ma_keys->dk_nentries;
-    }
-    for (i = 0; i < n_entries; i++) {
+    for (i = 0; i < LOAD_KEYS_NENTIRES(a->ma_keys); i++) {
         PyObject *key, *aval;
         Py_hash_t hash;
         if (DK_IS_UNICODE(a->ma_keys)) {
@@ -3836,9 +3825,8 @@ dict_setdefault_ref_lock_held(PyObject *d, PyObject *key, PyObject *default_valu
         mp->ma_keys->dk_version = 0;
         value = default_value;
 
-        int unicode = DK_IS_UNICODE(mp->ma_keys);
-        if (!unicode || !_PyDict_HasSplitTable(mp)) {
-            if (insert_combined_dict(interp, mp, hash, Py_NewRef(key), Py_NewRef(value), unicode) < 0) {
+        if (!_PyDict_HasSplitTable(mp)) {
+            if (insert_combined_dict(interp, mp, hash, Py_NewRef(key), Py_NewRef(value)) < 0) {
                 Py_DECREF(key);
                 Py_DECREF(value);
                 if (result) {
@@ -6048,11 +6036,13 @@ _PyObject_InitInlineValues(PyObject *obj, PyTypeObject *tp)
     PyDictKeysObject *keys = CACHED_KEYS(tp);
     assert(keys != NULL);
 #ifdef Py_GIL_DISABLED
-    Py_ssize_t usable = keys->dk_usable;
-    while (usable > 1) {
-        if (_Py_atomic_compare_exchange_ssize(&keys->dk_usable, &usable, usable - 1)) {
-            break;
+    Py_ssize_t usable = _Py_atomic_load_ssize_relaxed(&keys->dk_usable);
+    if (usable > 1) {
+        LOCK_KEYS(keys);
+        if (keys->dk_usable > 1) {
+            _Py_atomic_store_ssize(&keys->dk_usable, keys->dk_usable - 1);
         }
+        UNLOCK_KEYS(keys);
     }
 #else
     if (keys->dk_usable > 1) {
@@ -6190,7 +6180,7 @@ _PyObject_StoreInstanceAttribute(PyObject *obj, PyDictValues *values,
     Py_ssize_t ix = DKIX_EMPTY;
     if (PyUnicode_CheckExact(name)) {
         LOCK_KEYS(keys);
-        ix = insert_into_dictkeys(keys, name);
+        ix = insert_into_splitdictkeys(keys, name);
 #ifdef Py_STATS
         if (ix == DKIX_EMPTY) {
             if (PyUnicode_CheckExact(name)) {

From 7407ce9820a900550bf12655825dc30d2784c823 Mon Sep 17 00:00:00 2001
From: Dino Viehland <dinoviehland@fb.com>
Date: Thu, 15 Feb 2024 09:57:54 -0800
Subject: [PATCH 5/6] Lock in dictresize, not in callers

---
 Include/cpython/pyatomic_msc.h |  4 ++--
 Objects/dictobject.c           | 12 +++---------
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/Include/cpython/pyatomic_msc.h b/Include/cpython/pyatomic_msc.h
index 6fbfedf5f19ee5..3205e253b28546 100644
--- a/Include/cpython/pyatomic_msc.h
+++ b/Include/cpython/pyatomic_msc.h
@@ -945,9 +945,9 @@ _Py_atomic_store_ssize_release(Py_ssize_t *obj, Py_ssize_t value)
 #if defined(_M_X64) || defined(_M_IX86)
     *(Py_ssize_t volatile *)obj = value;
 #elif defined(_M_ARM64)
-    __stlr64((unsigned __int64 volatile *)obj, (uintptr_t)value);
+    __stlr64((unsigned __int64 volatile *)obj, (unsigned __int64)value);
 #else
-#  error "no implementation of _Py_atomic_store_int_release"
+#  error "no implementation of _Py_atomic_store_ssize_release"
 #endif
 }
 
diff --git a/Objects/dictobject.c b/Objects/dictobject.c
index 4cf706aab2ad75..e1e634db5bb609 100644
--- a/Objects/dictobject.c
+++ b/Objects/dictobject.c
@@ -1350,7 +1350,6 @@ insert_split_dict(PyInterpreterState *interp, PyDictObject *mp,
             return -1;
         }
         assert(!_PyDict_HasSplitTable(mp));
-        assert(DK_IS_UNICODE(keys));
         return insert_combined_dict(interp, mp, hash, key, value);
     }
 
@@ -1609,7 +1608,7 @@ dictresize(PyInterpreterState *interp, PyDictObject *mp,
     Py_ssize_t numentries = mp->ma_used;
 
     if (oldvalues != NULL) {
-        ASSERT_KEYS_LOCKED(oldkeys);
+        LOCK_KEYS(oldkeys);
         PyDictUnicodeEntry *oldentries = DK_UNICODE_ENTRIES(oldkeys);
         /* Convert split table into new combined table.
          * We must incref keys; we can transfer values.
@@ -1640,6 +1639,7 @@ dictresize(PyInterpreterState *interp, PyDictObject *mp,
             }
             build_indices_unicode(mp->ma_keys, newentries, numentries);
         }
+        UNLOCK_KEYS(oldkeys);
         dictkeys_decref(interp, oldkeys);
         mp->ma_values = NULL;
         free_values(oldvalues);
@@ -3998,14 +3998,8 @@ dict_popitem_impl(PyDictObject *self)
     /* Convert split table to combined table */
     if (_PyDict_HasSplitTable(self)) {
         PyDictKeysObject *keys = self->ma_keys;
-        dictkeys_incref(keys);
-        LOCK_KEYS(keys);
-
-        int status = dictresize(interp, self, DK_LOG_SIZE(self->ma_keys), 1);
-        UNLOCK_KEYS(keys);
-        dictkeys_decref(interp, keys);
 
-        if (status < 0) {
+        if (dictresize(interp, self, DK_LOG_SIZE(self->ma_keys), 1) < 0) {
             Py_DECREF(res);
             return NULL;
         }

From a9d3666dbe9983fd0e938f3f082849fecb444ed9 Mon Sep 17 00:00:00 2001
From: Dino Viehland <dinoviehland@fb.com>
Date: Tue, 20 Feb 2024 14:20:17 -0800
Subject: [PATCH 6/6] Fixup recursive locking

---
 Objects/dictobject.c | 42 ++++++++----------------------------------
 1 file changed, 8 insertions(+), 34 deletions(-)

diff --git a/Objects/dictobject.c b/Objects/dictobject.c
index e1e634db5bb609..a07267169fdc20 100644
--- a/Objects/dictobject.c
+++ b/Objects/dictobject.c
@@ -1342,10 +1342,8 @@ insert_split_dict(PyInterpreterState *interp, PyDictObject *mp,
     LOCK_KEYS(keys);
     if (keys->dk_usable <= 0) {
         /* Need to resize. */
-        dictkeys_incref(keys);
-        int ins = insertion_resize(interp, mp, 1);
-        dictkeys_decref(interp, keys);
         UNLOCK_KEYS(keys);
+        int ins = insertion_resize(interp, mp, 1);
         if (ins < 0) {
             return -1;
         }
@@ -1371,30 +1369,6 @@ insert_split_dict(PyInterpreterState *interp, PyDictObject *mp,
     return 0;
 }
 
-static int
-convert_to_nonunicode_keys(PyInterpreterState *interp, PyDictObject *mp)
-{
-    PyDictKeysObject *keys = mp->ma_keys;
-    if (_PyDict_HasSplitTable(mp)) {
-        LOCK_KEYS(keys);
-        dictkeys_incref(keys);
-    } else {
-        keys = NULL;
-    }
-
-    int res = insertion_resize(interp, mp, 0);
-
-    if (keys != NULL) {
-        dictkeys_decref(interp, keys);
-        UNLOCK_KEYS(keys);
-    }
-    if (res < 0) {
-        return res;
-    }
-    assert(mp->ma_keys->dk_kind == DICT_KEYS_GENERAL);
-    return 0;
-}
-
 /*
 Internal routine to insert a new item into the table.
 Used both by the internal resize routine and by the public insert routine.
@@ -1410,9 +1384,9 @@ insertdict(PyInterpreterState *interp, PyDictObject *mp,
     ASSERT_DICT_LOCKED(mp);
 
     if (DK_IS_UNICODE(mp->ma_keys) && !PyUnicode_CheckExact(key)) {
-        if (convert_to_nonunicode_keys(interp, mp) < 0) {
+        if (insertion_resize(interp, mp, 0) < 0)
             goto Fail;
-        }
+        assert(mp->ma_keys->dk_kind == DICT_KEYS_GENERAL);
     }
 
     Py_ssize_t ix = _Py_dict_lookup(mp, key, hash, &old_value);
@@ -1432,7 +1406,8 @@ insertdict(PyInterpreterState *interp, PyDictObject *mp,
             if (insert_combined_dict(interp, mp, hash, key, value) < 0) {
                 goto Fail;
             }
-        } else {
+        }
+        else {
             if (insert_split_dict(interp, mp, hash, key, value) < 0)
                 goto Fail;
         }
@@ -3803,7 +3778,7 @@ dict_setdefault_ref_lock_held(PyObject *d, PyObject *key, PyObject *default_valu
     }
 
     if (!PyUnicode_CheckExact(key) && DK_IS_UNICODE(mp->ma_keys)) {
-        if (convert_to_nonunicode_keys(interp, mp) < 0) {
+        if (insertion_resize(interp, mp, 0) < 0) {
             if (result) {
                 *result = NULL;
             }
@@ -3834,7 +3809,8 @@ dict_setdefault_ref_lock_held(PyObject *d, PyObject *key, PyObject *default_valu
                 }
                 return -1;
             }
-        } else {
+        }
+        else {
             if (insert_split_dict(interp, mp, hash, Py_NewRef(key), Py_NewRef(value)) < 0) {
                 Py_DECREF(key);
                 Py_DECREF(value);
@@ -3997,8 +3973,6 @@ dict_popitem_impl(PyDictObject *self)
     }
     /* Convert split table to combined table */
     if (_PyDict_HasSplitTable(self)) {
-        PyDictKeysObject *keys = self->ma_keys;
-
         if (dictresize(interp, self, DK_LOG_SIZE(self->ma_keys), 1) < 0) {
             Py_DECREF(res);
             return NULL;