diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h index 0e3d46852f2e6d..5a54608ae3db81 100644 --- a/Include/internal/pycore_interp.h +++ b/Include/internal/pycore_interp.h @@ -18,6 +18,7 @@ extern "C" { #include "pycore_dict_state.h" // struct _Py_dict_state #include "pycore_exceptions.h" // struct _Py_exc_state #include "pycore_floatobject.h" // struct _Py_float_state +#include "pycore_pymem.h" // free lists #include "pycore_function.h" // FUNC_MAX_WATCHERS #include "pycore_genobject.h" // struct _Py_async_gen_state #include "pycore_gc.h" // struct _gc_runtime_state @@ -50,6 +51,9 @@ struct _Py_long_state { /* interpreter state */ +#define SMALL_OBJECT_FREELIST_SIZE 1024 +#define INTERP_NUM_FREELISTS 30 + /* PyInterpreterState holds the global state for one of the runtime's interpreters. Typically the initial (main) interpreter is the only one. @@ -178,6 +182,7 @@ struct _is { struct _Py_context_state context; struct _Py_exc_state exc_state; + _PyFreeList freelists[INTERP_NUM_FREELISTS]; struct ast_state ast; struct types_state types; struct callable_cache callable_cache; @@ -230,6 +235,36 @@ PyAPI_FUNC(int) _PyInterpreterState_IDInitref(PyInterpreterState *); PyAPI_FUNC(int) _PyInterpreterState_IDIncref(PyInterpreterState *); PyAPI_FUNC(void) _PyInterpreterState_IDDecref(PyInterpreterState *); +#if SIZEOF_VOID_P == 4 +#define LOG_BASE_2_OF_FREELIST_QUANTUM 3 +#elif SIZEOF_VOID_P == 8 +#define LOG_BASE_2_OF_FREELIST_QUANTUM 4 +#else +#error "void pointer size not in (32, 64)" +#endif + +#define FREELIST_QUANTUM (2*SIZEOF_VOID_P) +#define SIZE_TO_FREELIST_SIZE_CLASS(size) (((size) + FREELIST_QUANTUM - 1) >> \ + LOG_BASE_2_OF_FREELIST_QUANTUM) +#define FREELIST_INDEX_TO_ALLOCATED_SIZE(idx) ((idx) * FREELIST_QUANTUM) + +static inline PyObject* +_PyInterpreterState_FreelistAlloc(PyInterpreterState *interp, Py_ssize_t size) { + Py_ssize_t index = SIZE_TO_FREELIST_SIZE_CLASS(size); + assert(index >= 0 && index < INTERP_NUM_FREELISTS); + 
return _PyFreeList_Alloc(&interp->freelists[index]); +} + +static inline void +_PyInterpreterState_FreelistFree(PyInterpreterState * interp, PyObject *op, Py_ssize_t size) { + /* todo: assert the size is correct? */ + Py_ssize_t index = SIZE_TO_FREELIST_SIZE_CLASS(size); + assert(index >= 0 && index < INTERP_NUM_FREELISTS); + _PyFreeList_Free(&interp->freelists[index], op); +} + + + #ifdef __cplusplus } #endif diff --git a/Include/internal/pycore_long.h b/Include/internal/pycore_long.h index 8c1d017bb95e4e..9a93184cc0c28a 100644 --- a/Include/internal/pycore_long.h +++ b/Include/internal/pycore_long.h @@ -129,6 +129,8 @@ _PyLong_IsPositiveSingleDigit(PyObject* sub) { return ((size_t)signed_size) <= 1; } +void _PyLong_Free(PyLongObject *op); + #ifdef __cplusplus } #endif diff --git a/Include/internal/pycore_pymem.h b/Include/internal/pycore_pymem.h index 4cc953d8d779c9..86e0ea1c5e1a02 100644 --- a/Include/internal/pycore_pymem.h +++ b/Include/internal/pycore_pymem.h @@ -91,6 +91,80 @@ PyAPI_FUNC(int) _PyMem_GetAllocatorName( PyAPI_FUNC(int) _PyMem_SetupAllocators(PyMemAllocatorName allocator); +/* Free lists. + * + * Free lists have a pointer to their first entry and + * the amount of space available allowing fast checks + * for emptiness and fullness. + * When empty they are half filled and when full they are + * completely emptied. This helps the underlying allocator + * avoid fragmentation and helps performance. 
+ */ + +typedef struct _freelist { + void *ptr; + uint32_t space; + uint16_t size; + uint16_t capacity; +#ifdef Py_STATS + int size_class; +#endif +} _PyFreeList; + +extern void *_PyFreeList_HalfFillAndAllocate(_PyFreeList *list); +extern void _PyFreeList_FreeToFull(_PyFreeList *list, void *ptr); +extern void _PyFreeList_Clear(_PyFreeList *list); +extern void _PyFreeList_Disable(_PyFreeList *list); + +static inline void * +_PyFreeList_Alloc(_PyFreeList *list) { +#ifdef Py_STATS + if (_py_stats) _py_stats->freelist_stats[list->size_class].allocations++; +#endif + if (list->ptr != NULL) { + void *result = list->ptr; + list->ptr = *((void **)result); + list->space++; + return result; + } +#ifdef Py_STATS + if (_py_stats) _py_stats->freelist_stats[list->size_class].empty++; +#endif + return _PyFreeList_HalfFillAndAllocate(list); +} + +static inline void +_PyFreeList_Free(_PyFreeList *list, void *ptr) { +#ifdef Py_STATS + if (_py_stats) _py_stats->freelist_stats[list->size_class].frees++; +#endif + if (list->space) { + *((void **)ptr) = list->ptr; + list->ptr = ptr; + list->space--; + return; + } +#ifdef Py_STATS + if (_py_stats) _py_stats->freelist_stats[list->size_class].full++; +#endif + _PyFreeList_FreeToFull(list, ptr); +} + +static inline void +_PyFreeList_Init(_PyFreeList *list, int size_class, int size, int capacity) +{ + list->ptr = NULL; + list->size = size; +#ifdef Py_STATS + list->size_class = size_class; +#endif +#if WITH_FREELISTS + list->space = list->capacity = capacity; +#else + _PyFreeList_Disable(list); +#endif +} + #ifdef __cplusplus } #endif diff --git a/Include/pystats.h b/Include/pystats.h index 25ed4bddc7240c..1d14960d31cbdc 100644 --- a/Include/pystats.h +++ b/Include/pystats.h @@ -48,6 +48,13 @@ typedef struct _call_stats { uint64_t eval_calls[EVAL_CALL_KINDS]; } CallStats; +typedef struct _generic_freelist_stats { + uint64_t allocations; + uint64_t frees; + uint64_t empty; + uint64_t full; +} GenericFreelistStats; + typedef struct 
_object_stats { uint64_t increfs; uint64_t decrefs; @@ -78,6 +85,7 @@ typedef struct _stats { OpcodeStats opcode_stats[256]; CallStats call_stats; ObjectStats object_stats; + GenericFreelistStats freelist_stats[30]; // INTERP_NUM_FREELISTS } PyStats; diff --git a/Objects/longobject.c b/Objects/longobject.c index 8293f133bed213..248ca83f2d7c96 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -6,6 +6,7 @@ #include "pycore_bitutils.h" // _Py_popcount32() #include "pycore_initconfig.h" // _PyStatus_OK() #include "pycore_long.h" // _Py_SmallInts +#include "pycore_pymem.h" // Free lists #include "pycore_object.h" // _PyObject_InitVar() #include "pycore_pystate.h" // _Py_IsMainInterpreter() #include "pycore_runtime.h" // _PY_NSMALLPOSINTS @@ -46,7 +47,7 @@ static inline void _Py_DECREF_INT(PyLongObject *op) { assert(PyLong_CheckExact(op)); - _Py_DECREF_SPECIALIZED((PyObject *)op, (destructor)PyObject_Free); + _Py_DECREF_SPECIALIZED((PyObject *)op, (destructor)_PyLong_Free); } static inline int @@ -152,16 +153,20 @@ _PyLong_New(Py_ssize_t size) "too many digits in integer"); return NULL; } - /* Fast operations for single digit integers (including zero) - * assume that there is always at least one digit present. */ - Py_ssize_t ndigits = size ? size : 1; - /* Number of bytes needed is: offsetof(PyLongObject, ob_digit) + - sizeof(digit)*size. Previous incarnations of this code used - sizeof(PyVarObject) instead of the offsetof, but this risks being - incorrect in the presence of padding between the PyVarObject header - and the digits. */ - result = PyObject_Malloc(offsetof(PyLongObject, long_value.ob_digit) + - ndigits*sizeof(digit)); + assert(size >= 0); + if (size <= 1) { + PyInterpreterState *interp = _PyInterpreterState_GET(); + result = (PyLongObject *)_PyInterpreterState_FreelistAlloc(interp, sizeof(PyLongObject)); + } + else { + /* Number of bytes needed is: offsetof(PyLongObject, ob_digit) + + sizeof(digit)*size. 
Previous incarnations of this code used + sizeof(PyVarObject) instead of the offsetof, but this risks being + incorrect in the presence of padding between the PyVarObject header + and the digits. */ + result = PyObject_Malloc(offsetof(PyLongObject, long_value.ob_digit) + + size*sizeof(digit)); + } if (!result) { PyErr_NoMemory(); return NULL; @@ -201,11 +206,11 @@ _PyLong_FromMedium(sdigit x) { assert(!IS_SMALL_INT(x)); assert(is_medium_int(x)); - /* We could use a freelist here */ - PyLongObject *v = PyObject_Malloc(sizeof(PyLongObject)); + PyInterpreterState *interp = _PyInterpreterState_GET(); + PyLongObject *v = (PyLongObject *)_PyInterpreterState_FreelistAlloc( + interp, sizeof(PyLongObject)); if (v == NULL) { - PyErr_NoMemory(); - return NULL; + return PyErr_NoMemory(); } Py_ssize_t sign = x < 0 ? -1: 1; digit abs_x = x < 0 ? -x : x; @@ -267,6 +272,19 @@ _PyLong_FromSTwoDigits(stwodigits x) return _PyLong_FromLarge(x); } +void +_PyLong_Free(PyLongObject *op) +{ + if (PyLong_CheckExact(op) && IS_MEDIUM_VALUE(op)) { + PyInterpreterState *interp = _PyInterpreterState_GET(); + _PyInterpreterState_FreelistFree(interp, (PyObject*)op, sizeof(PyLongObject)); + } + else + { + Py_TYPE(op)->tp_free((PyObject *)op); + } +} + int _PyLong_AssignValue(PyObject **target, Py_ssize_t value) { @@ -6286,7 +6304,7 @@ PyTypeObject PyLong_Type = { "int", /* tp_name */ offsetof(PyLongObject, long_value.ob_digit), /* tp_basicsize */ sizeof(digit), /* tp_itemsize */ - 0, /* tp_dealloc */ + (destructor)_PyLong_Free, /* tp_dealloc */ 0, /* tp_vectorcall_offset */ 0, /* tp_getattr */ 0, /* tp_setattr */ diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c index 276c5a276c06e6..73cbccc980b9a9 100644 --- a/Objects/obmalloc.c +++ b/Objects/obmalloc.c @@ -717,6 +717,64 @@ PyObject_Free(void *ptr) # define LIKELY(value) (value) #endif + +void * +_PyFreeList_HalfFillAndAllocate(_PyFreeList *list) +{ + assert(list->ptr == NULL); + if (list->capacity < 4) { + return PyObject_Malloc(list->size); 
+ } + uint32_t i = 0; + for (; i < list->space>>1; i++) { + void* ptr = PyObject_Malloc(list->size); + if (ptr == NULL) { + break; + } + *((void**)ptr) = list->ptr; + list->ptr = ptr; + } + if (i == 0) { + return NULL; + } + void *result = list->ptr; + list->ptr = *((void **)result); + list->space -= (i-1); + return result; +} + +void +_PyFreeList_Clear(_PyFreeList *list) +{ + int space = 0; + void *head = list->ptr; + while (head) { + void *next = *((void**)head); + PyObject_Free(head); + head = next; + space++; + } + list->ptr = NULL; + list->space += space; +} + +void +_PyFreeList_Disable(_PyFreeList *list) +{ + list->space = list->capacity = 0; +} + +void +_PyFreeList_FreeToFull(_PyFreeList *list, void *ptr) +{ + assert(list->space == 0); + PyObject_Free(ptr); + if (list->ptr == NULL) { + return; + } + _PyFreeList_Clear(list); +} + #ifdef WITH_PYMALLOC #ifdef WITH_VALGRIND diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 1169d8d172dd57..da45d0511cbdf8 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -183,8 +183,8 @@ dummy_func( DEOPT_IF(!PyLong_CheckExact(right), BINARY_OP); STAT_INC(BINARY_OP, hit); prod = _PyLong_Multiply((PyLongObject *)left, (PyLongObject *)right); - _Py_DECREF_SPECIALIZED(right, (destructor)PyObject_Free); - _Py_DECREF_SPECIALIZED(left, (destructor)PyObject_Free); + _Py_DECREF_SPECIALIZED(right, (destructor)_PyLong_Free); + _Py_DECREF_SPECIALIZED(left, (destructor)_PyLong_Free); ERROR_IF(prod == NULL, error); } @@ -207,8 +207,8 @@ dummy_func( DEOPT_IF(!PyLong_CheckExact(right), BINARY_OP); STAT_INC(BINARY_OP, hit); sub = _PyLong_Subtract((PyLongObject *)left, (PyLongObject *)right); - _Py_DECREF_SPECIALIZED(right, (destructor)PyObject_Free); - _Py_DECREF_SPECIALIZED(left, (destructor)PyObject_Free); + _Py_DECREF_SPECIALIZED(right, (destructor)_PyLong_Free); + _Py_DECREF_SPECIALIZED(left, (destructor)_PyLong_Free); ERROR_IF(sub == NULL, error); } @@ -290,8 +290,8 @@ dummy_func( DEOPT_IF(Py_TYPE(right) != Py_TYPE(left), 
BINARY_OP); STAT_INC(BINARY_OP, hit); sum = _PyLong_Add((PyLongObject *)left, (PyLongObject *)right); - _Py_DECREF_SPECIALIZED(right, (destructor)PyObject_Free); - _Py_DECREF_SPECIALIZED(left, (destructor)PyObject_Free); + _Py_DECREF_SPECIALIZED(right, (destructor)_PyLong_Free); + _Py_DECREF_SPECIALIZED(left, (destructor)_PyLong_Free); ERROR_IF(sum == NULL, error); } @@ -364,7 +364,7 @@ dummy_func( res = PyList_GET_ITEM(list, index); assert(res != NULL); Py_INCREF(res); - _Py_DECREF_SPECIALIZED(sub, (destructor)PyObject_Free); + _Py_DECREF_SPECIALIZED(sub, (destructor)_PyLong_Free); Py_DECREF(list); } @@ -382,7 +382,7 @@ dummy_func( res = PyTuple_GET_ITEM(tuple, index); assert(res != NULL); Py_INCREF(res); - _Py_DECREF_SPECIALIZED(sub, (destructor)PyObject_Free); + _Py_DECREF_SPECIALIZED(sub, (destructor)_PyLong_Free); Py_DECREF(tuple); } @@ -478,7 +478,7 @@ dummy_func( PyList_SET_ITEM(list, index, value); assert(old_value != NULL); Py_DECREF(old_value); - _Py_DECREF_SPECIALIZED(sub, (destructor)PyObject_Free); + _Py_DECREF_SPECIALIZED(sub, (destructor)_PyLong_Free); Py_DECREF(list); } @@ -1775,8 +1775,8 @@ dummy_func( Py_ssize_t iright = Py_SIZE(right) * ((PyLongObject *)right)->long_value.ob_digit[0]; // 2 if <, 4 if >, 8 if ==; this matches the low 4 bits of the oparg int sign_ish = COMPARISON_BIT(ileft, iright); - _Py_DECREF_SPECIALIZED(left, (destructor)PyObject_Free); - _Py_DECREF_SPECIALIZED(right, (destructor)PyObject_Free); + _Py_DECREF_SPECIALIZED(left, (destructor)_PyLong_Free); + _Py_DECREF_SPECIALIZED(right, (destructor)_PyLong_Free); if (sign_ish & oparg) { int offset = _Py_OPARG(next_instr[1]); JUMPBY(offset); diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 09eb6893ebf6b4..9ff7525a680370 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -248,8 +248,8 @@ DEOPT_IF(!PyLong_CheckExact(right), BINARY_OP); STAT_INC(BINARY_OP, hit); prod = _PyLong_Multiply((PyLongObject *)left, (PyLongObject *)right); - 
_Py_DECREF_SPECIALIZED(right, (destructor)PyObject_Free); - _Py_DECREF_SPECIALIZED(left, (destructor)PyObject_Free); + _Py_DECREF_SPECIALIZED(right, (destructor)_PyLong_Free); + _Py_DECREF_SPECIALIZED(left, (destructor)_PyLong_Free); if (prod == NULL) goto pop_2_error; STACK_SHRINK(1); POKE(1, prod); @@ -286,8 +286,8 @@ DEOPT_IF(!PyLong_CheckExact(right), BINARY_OP); STAT_INC(BINARY_OP, hit); sub = _PyLong_Subtract((PyLongObject *)left, (PyLongObject *)right); - _Py_DECREF_SPECIALIZED(right, (destructor)PyObject_Free); - _Py_DECREF_SPECIALIZED(left, (destructor)PyObject_Free); + _Py_DECREF_SPECIALIZED(right, (destructor)_PyLong_Free); + _Py_DECREF_SPECIALIZED(left, (destructor)_PyLong_Free); if (sub == NULL) goto pop_2_error; STACK_SHRINK(1); POKE(1, sub); @@ -395,8 +395,8 @@ DEOPT_IF(Py_TYPE(right) != Py_TYPE(left), BINARY_OP); STAT_INC(BINARY_OP, hit); sum = _PyLong_Add((PyLongObject *)left, (PyLongObject *)right); - _Py_DECREF_SPECIALIZED(right, (destructor)PyObject_Free); - _Py_DECREF_SPECIALIZED(left, (destructor)PyObject_Free); + _Py_DECREF_SPECIALIZED(right, (destructor)_PyLong_Free); + _Py_DECREF_SPECIALIZED(left, (destructor)_PyLong_Free); if (sum == NULL) goto pop_2_error; STACK_SHRINK(1); POKE(1, sum); @@ -491,7 +491,7 @@ res = PyList_GET_ITEM(list, index); assert(res != NULL); Py_INCREF(res); - _Py_DECREF_SPECIALIZED(sub, (destructor)PyObject_Free); + _Py_DECREF_SPECIALIZED(sub, (destructor)_PyLong_Free); Py_DECREF(list); STACK_SHRINK(1); POKE(1, res); @@ -516,7 +516,7 @@ res = PyTuple_GET_ITEM(tuple, index); assert(res != NULL); Py_INCREF(res); - _Py_DECREF_SPECIALIZED(sub, (destructor)PyObject_Free); + _Py_DECREF_SPECIALIZED(sub, (destructor)_PyLong_Free); Py_DECREF(tuple); STACK_SHRINK(1); POKE(1, res); @@ -644,7 +644,7 @@ PyList_SET_ITEM(list, index, value); assert(old_value != NULL); Py_DECREF(old_value); - _Py_DECREF_SPECIALIZED(sub, (destructor)PyObject_Free); + _Py_DECREF_SPECIALIZED(sub, (destructor)_PyLong_Free); Py_DECREF(list); 
STACK_SHRINK(3); JUMPBY(1); @@ -2236,8 +2236,8 @@ Py_ssize_t iright = Py_SIZE(right) * ((PyLongObject *)right)->long_value.ob_digit[0]; // 2 if <, 4 if >, 8 if ==; this matches the low 4 bits of the oparg int sign_ish = COMPARISON_BIT(ileft, iright); - _Py_DECREF_SPECIALIZED(left, (destructor)PyObject_Free); - _Py_DECREF_SPECIALIZED(right, (destructor)PyObject_Free); + _Py_DECREF_SPECIALIZED(left, (destructor)_PyLong_Free); + _Py_DECREF_SPECIALIZED(right, (destructor)_PyLong_Free); if (sign_ish & oparg) { int offset = _Py_OPARG(next_instr[1]); JUMPBY(offset); diff --git a/Python/pystate.c b/Python/pystate.c index 1261092d1435fa..d0a52beb35eb68 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -717,6 +717,12 @@ PyInterpreterState_New(void) init_interpreter(interp, runtime, id, old_head, pending_lock); + for (int i=0; i < INTERP_NUM_FREELISTS; i++) { + _PyFreeList_Init(&interp->freelists[i], i, + FREELIST_INDEX_TO_ALLOCATED_SIZE(i), + SMALL_OBJECT_FREELIST_SIZE); + } + HEAD_UNLOCK(runtime); return interp; @@ -759,6 +765,11 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate) } HEAD_UNLOCK(runtime); + for (int i=0; i < INTERP_NUM_FREELISTS; i++) { + _PyFreeList_Clear(&interp->freelists[i]); + _PyFreeList_Disable(&interp->freelists[i]); + } + /* It is possible that any of the objects below have a finalizer that runs Python code or otherwise relies on a thread state or even the interpreter state. 
For now we trust that isn't diff --git a/Python/specialize.c b/Python/specialize.c index 908ad6dceb57f3..8ec4978b3a886f 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -195,11 +195,26 @@ print_object_stats(FILE *out, ObjectStats *stats) fprintf(out, "Object method cache dunder misses: %" PRIu64 "\n", stats->type_cache_dunder_misses); } +static void +print_freelist_stats(FILE *out, GenericFreelistStats freelist_stats[INTERP_NUM_FREELISTS]) +{ + for (int i=0; i < INTERP_NUM_FREELISTS; i++) { + GenericFreelistStats *stats = &freelist_stats[i]; + if (stats->allocations > 0) { + fprintf(out, "Allocations from freelist[%d]: %" PRIu64 "\n", i, stats->allocations); + fprintf(out, "Frees into freelist[%d]: %" PRIu64 "\n", i, stats->frees); + fprintf(out, "Freelist[%d] empty: %" PRIu64 "\n", i, stats->empty); + fprintf(out, "Freelist[%d] full: %" PRIu64 "\n", i, stats->full); + } + } +} + static void print_stats(FILE *out, PyStats *stats) { print_spec_stats(out, stats->opcode_stats); print_call_stats(out, &stats->call_stats); print_object_stats(out, &stats->object_stats); + print_freelist_stats(out, stats->freelist_stats); } void