Skip to content

[3.12] gh-106931: Intern Statically Allocated Strings Globally (gh-107272) #107358

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52,970 changes: 26,500 additions & 26,470 deletions Doc/data/python3.12.abi

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion Include/cpython/unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,9 +140,11 @@ typedef struct {
and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
set, use the PyASCIIObject structure. */
unsigned int ascii:1;
/* The object is statically allocated. */
unsigned int statically_allocated:1;
/* Padding to ensure that PyUnicode_DATA() is always aligned to
4 bytes (see issue #19537 on m68k). */
unsigned int :25;
unsigned int :24;
} state;
} PyASCIIObject;

Expand Down
6 changes: 6 additions & 0 deletions Include/internal/pycore_global_objects.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ extern "C" {
# error "this header requires Py_BUILD_CORE define"
#endif

#include "pycore_hashtable.h" // _Py_hashtable_t
#include "pycore_gc.h" // PyGC_Head
#include "pycore_global_strings.h" // struct _Py_global_strings
#include "pycore_hamt.h" // PyHamtNode_Bitmap
Expand All @@ -28,6 +29,11 @@ extern "C" {
#define _Py_SINGLETON(NAME) \
_Py_GLOBAL_OBJECT(singletons.NAME)

struct _Py_cached_objects {
// XXX We could statically allocate the hashtable.
_Py_hashtable_t *interned_strings;
};

struct _Py_static_objects {
struct {
/* Small integers are preallocated in this array so that they
Expand Down
1 change: 1 addition & 0 deletions Include/internal/pycore_hashtable.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ PyAPI_FUNC(int) _Py_hashtable_foreach(
void *user_data);

PyAPI_FUNC(size_t) _Py_hashtable_size(const _Py_hashtable_t *ht);
PyAPI_FUNC(size_t) _Py_hashtable_len(const _Py_hashtable_t *ht);

/* Add a new entry to the hash. The key must not be present in the hash table.
Return 0 on success, -1 on memory error. */
Expand Down
1 change: 1 addition & 0 deletions Include/internal/pycore_runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ typedef struct pyruntimestate {
struct _types_runtime_state types;

/* All the objects that are shared by the runtime's interpreters. */
struct _Py_cached_objects cached_objects;
struct _Py_static_objects static_objects;

/* The following fields are here to avoid allocation during init.
Expand Down
1 change: 1 addition & 0 deletions Include/internal/pycore_runtime_init.h
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ extern PyTypeObject _PyExc_MemoryError;
.kind = 1, \
.compact = 1, \
.ascii = (ASCII), \
.statically_allocated = 1, \
}, \
}
#define _PyASCIIObject_INIT(LITERAL) \
Expand Down
41 changes: 41 additions & 0 deletions Lib/test/test_sys.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,21 @@
from test.support.script_helper import assert_python_ok, assert_python_failure
from test.support import threading_helper
from test.support import import_helper
try:
from test.support import interpreters
except ImportError:
interpreters = None
import textwrap
import unittest
import warnings


def requires_subinterpreters(meth):
    """Decorator to skip a test if subinterpreters are not supported."""
    # `interpreters` is None when `test.support.interpreters` failed to import.
    skip_decorator = unittest.skipIf(interpreters is None,
                                     'subinterpreters required')
    return skip_decorator(meth)


# count the number of test runs, used to create unique
# strings to intern in test_intern()
INTERN_NUMRUNS = 0
Expand Down Expand Up @@ -699,6 +709,37 @@ def __hash__(self):

self.assertRaises(TypeError, sys.intern, S("abc"))

@requires_subinterpreters
def test_subinterp_intern_dynamically_allocated(self):
    """A dynamically created interned string is per-interpreter.

    Interning the same text in a subinterpreter must produce a
    distinct object from the one interned in the main interpreter.
    """
    global INTERN_NUMRUNS
    INTERN_NUMRUNS += 1
    # Build a string that has never been interned in this process.
    s = "never interned before" + str(INTERN_NUMRUNS)
    t = sys.intern(s)
    self.assertIs(t, s)

    script = textwrap.dedent(f'''
        import sys
        t = sys.intern({s!r})
        assert id(t) != {id(s)}, (id(t), {id(s)})
        assert id(t) != {id(t)}, (id(t), {id(t)})
        ''')
    interp = interpreters.create()
    interp.run(script)

@requires_subinterpreters
def test_subinterp_intern_statically_allocated(self):
    """A statically allocated string is interned globally.

    Statically allocated strings live in a runtime-wide table, so a
    subinterpreter must observe the exact same object (same id) as
    the main interpreter.
    """
    # See Tools/build/generate_global_objects.py for the list
    # of strings that are always statically allocated.
    s = '__init__'
    t = sys.intern(s)

    interp = interpreters.create()
    interp.run(textwrap.dedent(f'''
        import sys
        t = sys.intern({s!r})
        assert id(t) == {id(t)}, (id(t), {id(t)})
        '''))

def test_sys_flags(self):
self.assertTrue(sys.flags)
attrs = ("debug",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Statically allocated string objects are now interned globally instead of
per-interpreter. This fixes a situation where such a string would only be
interned in a single interpreter. Normal string objects are unaffected.
72 changes: 69 additions & 3 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -235,15 +235,54 @@ static inline PyObject *get_interned_dict(PyInterpreterState *interp)
return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
}

#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings

Py_ssize_t
_PyUnicode_InternedSize(void)
{
return PyObject_Length(get_interned_dict(_PyInterpreterState_GET()));
PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
}

static Py_hash_t unicode_hash(PyObject *);
static int unicode_compare_eq(PyObject *, PyObject *);

static Py_uhash_t
hashtable_unicode_hash(const void *key)
{
return unicode_hash((PyObject *)key);
}

static int
hashtable_unicode_compare(const void *key1, const void *key2)
{
PyObject *obj1 = (PyObject *)key1;
PyObject *obj2 = (PyObject *)key2;
if (obj1 != NULL && obj2 != NULL) {
return unicode_compare_eq(obj1, obj2);
}
else {
return obj1 == obj2;
}
}

static int
init_interned_dict(PyInterpreterState *interp)
{
if (_Py_IsMainInterpreter(interp)) {
assert(INTERNED_STRINGS == NULL);
_Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
INTERNED_STRINGS = _Py_hashtable_new_full(
hashtable_unicode_hash,
hashtable_unicode_compare,
NULL,
NULL,
&hashtable_alloc
);
if (INTERNED_STRINGS == NULL) {
return -1;
}
}
assert(get_interned_dict(interp) == NULL);
PyObject *interned = interned = PyDict_New();
if (interned == NULL) {
Expand All @@ -262,6 +301,10 @@ clear_interned_dict(PyInterpreterState *interp)
Py_DECREF(interned);
_Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
}
if (_Py_IsMainInterpreter(interp) && INTERNED_STRINGS != NULL) {
_Py_hashtable_destroy(INTERNED_STRINGS);
INTERNED_STRINGS = NULL;
}
}

#define _Py_RETURN_UNICODE_EMPTY() \
Expand Down Expand Up @@ -1222,6 +1265,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
_PyUnicode_STATE(unicode).kind = kind;
_PyUnicode_STATE(unicode).compact = 1;
_PyUnicode_STATE(unicode).ascii = is_ascii;
_PyUnicode_STATE(unicode).statically_allocated = 0;
if (is_ascii) {
((char*)data)[size] = 0;
}
Expand Down Expand Up @@ -1552,7 +1596,9 @@ unicode_dealloc(PyObject *unicode)
* we accidentally decref an immortal string out of existence. Since
* the string is an immortal object, just re-set the reference count.
*/
if (PyUnicode_CHECK_INTERNED(unicode)) {
if (PyUnicode_CHECK_INTERNED(unicode)
|| _PyUnicode_STATE(unicode).statically_allocated)
{
_Py_SetImmortal(unicode);
return;
}
Expand Down Expand Up @@ -14502,6 +14548,7 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
_PyUnicode_STATE(self).kind = kind;
_PyUnicode_STATE(self).compact = 0;
_PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
_PyUnicode_STATE(self).statically_allocated = 0;
_PyUnicode_UTF8_LENGTH(self) = 0;
_PyUnicode_UTF8(self) = NULL;
_PyUnicode_DATA_ANY(self) = NULL;
Expand Down Expand Up @@ -14725,6 +14772,23 @@ _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
return;
}

/* Look in the global cache first. */
PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
if (r != NULL && r != s) {
Py_SETREF(*p, Py_NewRef(r));
return;
}

/* Handle statically allocated strings. */
if (_PyUnicode_STATE(s).statically_allocated) {
assert(_Py_IsImmortal(s));
if (_Py_hashtable_set(INTERNED_STRINGS, s, s) == 0) {
_PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
}
return;
}

/* Look in the per-interpreter cache. */
PyObject *interned = get_interned_dict(interp);
assert(interned != NULL);

Expand All @@ -14740,9 +14804,11 @@ _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
}

if (_Py_IsImmortal(s)) {
// XXX Restrict this to the main interpreter?
_PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
return;
return;
}

#ifdef Py_REF_DEBUG
/* The reference count value excluding the 2 references from the
interned dictionary should be excluded from the RefTotal. The
Expand Down
7 changes: 7 additions & 0 deletions Python/hashtable.c
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,13 @@ _Py_hashtable_size(const _Py_hashtable_t *ht)
}


/* Return the number of entries currently stored in the hash table. */
size_t
_Py_hashtable_len(const _Py_hashtable_t *ht)
{
    return ht->nentries;
}


_Py_hashtable_entry_t *
_Py_hashtable_get_entry_generic(_Py_hashtable_t *ht, const void *key)
{
Expand Down
2 changes: 2 additions & 0 deletions Tools/build/deepfreeze.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ def generate_unicode(self, name: str, s: str) -> str:
self.write(".kind = 1,")
self.write(".compact = 1,")
self.write(".ascii = 1,")
self.write(".statically_allocated = 1,")
self.write(f"._data = {make_string_literal(s.encode('ascii'))},")
return f"& {name}._ascii.ob_base"
else:
Expand All @@ -220,6 +221,7 @@ def generate_unicode(self, name: str, s: str) -> str:
self.write(f".kind = {kind},")
self.write(".compact = 1,")
self.write(".ascii = 0,")
self.write(".statically_allocated = 1,")
utf8 = s.encode('utf-8')
self.write(f'.utf8 = {make_string_literal(utf8)},')
self.write(f'.utf8_length = {len(utf8)},')
Expand Down