From 217d8446043ca86953b1c8dbcc8db82b928e1756 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Thu, 29 May 2025 16:42:01 +0200 Subject: [PATCH 1/6] Add PyUnstable_Unicode_GET_CACHED_HASH --- Doc/c-api/unicode.rst | 16 ++++++++++++++++ Doc/whatsnew/3.15.rst | 5 +++++ Include/cpython/unicodeobject.h | 6 ++++++ Lib/test/test_capi/test_unicode.py | 14 ++++++++++++++ Modules/_testcapi/unicode.c | 7 +++++++ 5 files changed, 48 insertions(+) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 684432da81c61f..badf260771d3c1 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -191,6 +191,22 @@ access to internal read-only data of Unicode objects: .. versionadded:: 3.2 +.. c:function:: Py_hash_t PyUnstable_Unicode_GET_CACHED_HASH(PyObject *str) + + If the hash of *str*, as returned by :c:func:`PyObject_Hash`, has been + cached and is immediately available, return it. + Otherwise, return ``-1`` *without* setting an exception. + + If *str* is not a string (that is, if :c:expr:`PyUnicode_Check(obj)` + is false), the behavior is undefined. + + This function never fails with an exception. + + Note that there are no guarantees on when a object's hash is cached, + and the (non-)existence of a cached hash does not imply that the string has + any other properties. + + Unicode Character Properties """""""""""""""""""""""""""" diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index ced9c63071a53c..0279a620a9db91 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -226,6 +226,11 @@ New features functions as replacements for :c:func:`PySys_GetObject`. (Contributed by Serhiy Storchaka in :gh:`108512`.) +* Add :c:func:`PyUnstable_Unicode_GET_CACHED_HASH` to get the cached hash of + a string. See the documentation for caveats. 
+ (Contributed by Petr Viktorin in :gh:`131510`.) + + Porting to Python 3.15 ---------------------- diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 136f5d5c5f8425..c65a2c23cc8c67 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -300,6 +300,12 @@ static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op) { } #define PyUnicode_GET_LENGTH(op) PyUnicode_GET_LENGTH(_PyObject_CAST(op)) +/* Returns the cached hash, or -1 if not cached yet. */ +static inline Py_hash_t +PyUnstable_Unicode_GET_CACHED_HASH(PyObject *op) { + return _PyASCIIObject_CAST(op)->hash; +} + /* Write into the canonical representation, this function does not do any sanity checks and is intended for usage in loops. The caller should cache the kind and data pointers obtained from other function calls. diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 3408c10f426058..99d3c2dccaaa50 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1739,6 +1739,20 @@ def test_pep393_utf8_caching_bug(self): # Check that the second call returns the same result self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1)) + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_GET_CACHED_HASH(self): + from _testcapi import unicode_GET_CACHED_HASH + content_bytes = b'some new string' + # avoid parser interning & constant folding + obj = str(content_bytes, 'ascii') + # impl detail: fresh strings do not have cached hash + self.assertEqual(unicode_GET_CACHED_HASH(obj), -1) + # impl detail: adding string to a dict caches its hash + {obj: obj} + # impl detail: ASCII string hashes are equal to bytes ones + self.assertEqual(unicode_GET_CACHED_HASH(obj), hash(content_bytes)) + class PyUnicodeWriterTest(unittest.TestCase): def create_writer(self, size): diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 
b8ecf53f4f8b9c..6894d53b33bb46 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -220,6 +220,12 @@ unicode_copycharacters(PyObject *self, PyObject *args) return Py_BuildValue("(Nn)", to_copy, copied); } +static PyObject* +unicode_GET_CACHED_HASH(PyObject *self, PyObject *arg) +{ + return PyLong_FromLong((long)PyUnstable_Unicode_GET_CACHED_HASH(arg)); +} + // --- PyUnicodeWriter type ------------------------------------------------- @@ -548,6 +554,7 @@ static PyMethodDef TestMethods[] = { {"unicode_asucs4copy", unicode_asucs4copy, METH_VARARGS}, {"unicode_asutf8", unicode_asutf8, METH_VARARGS}, {"unicode_copycharacters", unicode_copycharacters, METH_VARARGS}, + {"unicode_GET_CACHED_HASH", unicode_GET_CACHED_HASH, METH_O}, {NULL}, }; From 4c85e4247563b554371673a68269f5f0639c76ec Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Thu, 29 May 2025 16:56:25 +0200 Subject: [PATCH 2/6] blurb --- .../next/C_API/2025-05-29-16-56-23.gh-issue-134891.7eKO8U.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/C_API/2025-05-29-16-56-23.gh-issue-134891.7eKO8U.rst diff --git a/Misc/NEWS.d/next/C_API/2025-05-29-16-56-23.gh-issue-134891.7eKO8U.rst b/Misc/NEWS.d/next/C_API/2025-05-29-16-56-23.gh-issue-134891.7eKO8U.rst new file mode 100644 index 00000000000000..db30d5e9a94584 --- /dev/null +++ b/Misc/NEWS.d/next/C_API/2025-05-29-16-56-23.gh-issue-134891.7eKO8U.rst @@ -0,0 +1,2 @@ +Add :c:func:`PyUnstable_Unicode_GET_CACHED_HASH` to get the cached hash of a +string. 
From a32900c05294e14f29a44214c67b3d2c93e11a45 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Fri, 30 May 2025 13:00:41 +0200 Subject: [PATCH 3/6] Use this as the internal PyUnicode_HASH --- Include/cpython/unicodeobject.h | 5 +++++ Modules/_testcapi/unicode.c | 2 +- Objects/unicodeobject.c | 6 +----- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index dbef787b13db74..7c1aac9696dec9 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -303,7 +303,12 @@ static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op) { /* Returns the cached hash, or -1 if not cached yet. */ static inline Py_hash_t PyUnstable_Unicode_GET_CACHED_HASH(PyObject *op) { + assert(PyUnicode_Check(op)); +#ifdef Py_GIL_DISABLED + return _Py_atomic_load_ssize_relaxed(&_PyASCIIObject_CAST(op)->hash); +#else return _PyASCIIObject_CAST(op)->hash; +#endif } /* Write into the canonical representation, this function does not do any sanity diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index ba65f228000107..109bad72b6a8b2 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -223,7 +223,7 @@ unicode_copycharacters(PyObject *self, PyObject *args) static PyObject* unicode_GET_CACHED_HASH(PyObject *self, PyObject *arg) { - return PyLong_FromLong((long)PyUnstable_Unicode_GET_CACHED_HASH(arg)); + return PyLong_FromSSize_t(PyUnstable_Unicode_GET_CACHED_HASH(arg)); } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 5611f839627a2e..5c2308a012142a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -167,11 +167,7 @@ static inline void PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length) #define _PyUnicode_HASH(op) \ (_PyASCIIObject_CAST(op)->hash) -static inline Py_hash_t PyUnicode_HASH(PyObject *op) -{ - assert(_PyUnicode_CHECK(op)); - return FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyASCIIObject_CAST(op)->hash); -} 
+#define PyUnicode_HASH PyUnstable_Unicode_GET_CACHED_HASH static inline void PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash) { From d7e32d77017c505ede5a66c76c69261d0af5f433 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Fri, 30 May 2025 13:01:34 +0200 Subject: [PATCH 4/6] Apply suggestions from code review Co-authored-by: Colin Marquardt --- Doc/c-api/unicode.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 91a2b4ed815fef..dda14d83c89b31 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -202,7 +202,7 @@ access to internal read-only data of Unicode objects: This function never fails with an exception. - Note that there are no guarantees on when a object's hash is cached, + Note that there are no guarantees on when an object's hash is cached, and the (non-)existence of a cached hash does not imply that the string has any other properties. From 6b51715eaec4c99ae90ebfb706a269c1542761db Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Fri, 30 May 2025 13:03:00 +0200 Subject: [PATCH 5/6] Avoid unknown identifier in docs --- Doc/c-api/unicode.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index dda14d83c89b31..efb3b95d0a4a07 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -197,7 +197,7 @@ access to internal read-only data of Unicode objects: cached and is immediately available, return it. Otherwise, return ``-1`` *without* setting an exception. - If *str* is not a string (that is, if :c:expr:`PyUnicode_Check(obj)` + If *str* is not a string (that is, if ``PyUnicode_Check(str)`` is false), the behavior is undefined. This function never fails with an exception. 
From 9002e068c27393b6e99fd882be1119e9b8bcdb1f Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Fri, 30 May 2025 13:23:13 +0200 Subject: [PATCH 6/6] Fix typo --- Modules/_testcapi/unicode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 109bad72b6a8b2..203282dd53dd0a 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -223,7 +223,7 @@ unicode_copycharacters(PyObject *self, PyObject *args) static PyObject* unicode_GET_CACHED_HASH(PyObject *self, PyObject *arg) { - return PyLong_FromSSize_t(PyUnstable_Unicode_GET_CACHED_HASH(arg)); + return PyLong_FromSsize_t(PyUnstable_Unicode_GET_CACHED_HASH(arg)); }