From 472af089670d05e9b46c966f3daf621e20ed48a1 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 2 Mar 2024 11:00:55 -0800 Subject: [PATCH 01/12] Print the final optimized uop --- Python/optimizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/optimizer.c b/Python/optimizer.c index acd6d52c4a885f..ac0c199af29239 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -947,7 +947,7 @@ make_executor_from_uops(_PyUOpInstruction *buffer, const _PyBloomFilter *depende } if (lltrace >= 2) { printf("Optimized executor (length %d):\n", length); - for (int i = 0; i < length; i++) { + for (int i = 0; i < length+1; i++) { printf("%4d OPTIMIZED: ", i); _PyUOpPrint(&executor->trace[i]); printf("\n"); From aeafc38e8447e0d0b693337cf38c7f743b3a422e Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 2 Mar 2024 12:07:21 -0800 Subject: [PATCH 02/12] First prototype, works for cast(int, x) --- Include/internal/pycore_uop_ids.h | 247 +++++++++++++------------ Include/internal/pycore_uop_metadata.h | 2 + Python/bytecodes.c | 5 + Python/executor_cases.c.h | 19 ++ Python/optimizer_analysis.c | 96 ++++++++++ Python/optimizer_cases.c.h | 9 + 6 files changed, 255 insertions(+), 123 deletions(-) diff --git a/Include/internal/pycore_uop_ids.h b/Include/internal/pycore_uop_ids.h index 8f71eab44d914d..0a5897c90e7830 100644 --- a/Include/internal/pycore_uop_ids.h +++ b/Include/internal/pycore_uop_ids.h @@ -11,18 +11,19 @@ extern "C" { #define _EXIT_TRACE 300 #define _SET_IP 301 +#define _ADJUST_STUFF 302 #define _BEFORE_ASYNC_WITH BEFORE_ASYNC_WITH #define _BEFORE_WITH BEFORE_WITH -#define _BINARY_OP 302 -#define _BINARY_OP_ADD_FLOAT 303 -#define _BINARY_OP_ADD_INT 304 -#define _BINARY_OP_ADD_UNICODE 305 -#define _BINARY_OP_MULTIPLY_FLOAT 306 -#define _BINARY_OP_MULTIPLY_INT 307 -#define _BINARY_OP_SUBTRACT_FLOAT 308 -#define _BINARY_OP_SUBTRACT_INT 309 +#define _BINARY_OP 303 +#define _BINARY_OP_ADD_FLOAT 304 +#define _BINARY_OP_ADD_INT 305 +#define _BINARY_OP_ADD_UNICODE 306 +#define _BINARY_OP_MULTIPLY_FLOAT 307 +#define _BINARY_OP_MULTIPLY_INT 308 +#define _BINARY_OP_SUBTRACT_FLOAT 309 +#define _BINARY_OP_SUBTRACT_INT 310 #define _BINARY_SLICE BINARY_SLICE -#define _BINARY_SUBSCR 310 +#define _BINARY_SUBSCR 311 #define _BINARY_SUBSCR_DICT BINARY_SUBSCR_DICT #define _BINARY_SUBSCR_GETITEM BINARY_SUBSCR_GETITEM #define _BINARY_SUBSCR_LIST_INT BINARY_SUBSCR_LIST_INT @@ -35,7 +36,7 @@ extern "C" { #define _BUILD_SLICE BUILD_SLICE #define _BUILD_STRING BUILD_STRING #define _BUILD_TUPLE BUILD_TUPLE -#define _CALL 311 +#define _CALL 312 #define _CALL_ALLOC_AND_ENTER_INIT CALL_ALLOC_AND_ENTER_INIT #define _CALL_BUILTIN_CLASS CALL_BUILTIN_CLASS #define _CALL_BUILTIN_FAST CALL_BUILTIN_FAST @@ -55,26 +56,26 @@ extern "C" { #define _CALL_STR_1 CALL_STR_1 #define _CALL_TUPLE_1 CALL_TUPLE_1 #define _CALL_TYPE_1 CALL_TYPE_1 -#define _CHECK_ATTR_CLASS 312 -#define _CHECK_ATTR_METHOD_LAZY_DICT 313 -#define _CHECK_ATTR_MODULE 314 -#define _CHECK_ATTR_WITH_HINT 315 -#define _CHECK_BUILTINS 316 -#define _CHECK_CALL_BOUND_METHOD_EXACT_ARGS 317 +#define _CHECK_ATTR_CLASS 313 +#define _CHECK_ATTR_METHOD_LAZY_DICT 314 +#define _CHECK_ATTR_MODULE 315 +#define _CHECK_ATTR_WITH_HINT 316 +#define _CHECK_BUILTINS 317 +#define _CHECK_CALL_BOUND_METHOD_EXACT_ARGS 318 #define _CHECK_EG_MATCH CHECK_EG_MATCH #define _CHECK_EXC_MATCH CHECK_EXC_MATCH -#define _CHECK_FUNCTION_EXACT_ARGS 318 -#define _CHECK_GLOBALS 319 -#define _CHECK_MANAGED_OBJECT_HAS_VALUES 320 -#define _CHECK_PEP_523 321 -#define _CHECK_STACK_SPACE 322 -#define _CHECK_VALIDITY 323 -#define _CHECK_VALIDITY_AND_SET_IP 324 -#define _COLD_EXIT 325 -#define _COMPARE_OP 326 -#define _COMPARE_OP_FLOAT 327 -#define _COMPARE_OP_INT 328 -#define _COMPARE_OP_STR 329 +#define _CHECK_FUNCTION_EXACT_ARGS 319 +#define _CHECK_GLOBALS 320 +#define _CHECK_MANAGED_OBJECT_HAS_VALUES 321 +#define _CHECK_PEP_523 322 +#define _CHECK_STACK_SPACE 323 +#define _CHECK_VALIDITY 324 +#define _CHECK_VALIDITY_AND_SET_IP 325 +#define _COLD_EXIT 326 +#define _COMPARE_OP 327 +#define _COMPARE_OP_FLOAT 328 +#define _COMPARE_OP_INT 329 +#define _COMPARE_OP_STR 330 #define _CONTAINS_OP CONTAINS_OP #define _CONVERT_VALUE CONVERT_VALUE #define _COPY COPY @@ -89,41 +90,41 @@ extern "C" { #define _DICT_UPDATE DICT_UPDATE #define _END_SEND END_SEND #define _EXIT_INIT_CHECK EXIT_INIT_CHECK -#define _FATAL_ERROR 330 +#define _FATAL_ERROR 331 #define _FORMAT_SIMPLE FORMAT_SIMPLE #define _FORMAT_WITH_SPEC FORMAT_WITH_SPEC -#define _FOR_ITER 331 +#define _FOR_ITER 332 #define _FOR_ITER_GEN FOR_ITER_GEN -#define _FOR_ITER_TIER_TWO 332 +#define _FOR_ITER_TIER_TWO 333 #define _GET_AITER GET_AITER #define _GET_ANEXT GET_ANEXT #define _GET_AWAITABLE GET_AWAITABLE #define _GET_ITER GET_ITER #define _GET_LEN GET_LEN #define _GET_YIELD_FROM_ITER GET_YIELD_FROM_ITER -#define _GUARD_BOTH_FLOAT 333 -#define _GUARD_BOTH_INT 334 -#define _GUARD_BOTH_UNICODE 335 -#define _GUARD_BUILTINS_VERSION 336 -#define _GUARD_DORV_VALUES 337 -#define _GUARD_DORV_VALUES_INST_ATTR_FROM_DICT 338 -#define _GUARD_GLOBALS_VERSION 339 -#define _GUARD_IS_FALSE_POP 340 -#define _GUARD_IS_NONE_POP 341 -#define _GUARD_IS_NOT_NONE_POP 342 -#define _GUARD_IS_TRUE_POP 343 -#define _GUARD_KEYS_VERSION 344 -#define _GUARD_NOT_EXHAUSTED_LIST 345 -#define _GUARD_NOT_EXHAUSTED_RANGE 346 -#define _GUARD_NOT_EXHAUSTED_TUPLE 347 -#define _GUARD_TYPE_VERSION 348 -#define _INIT_CALL_BOUND_METHOD_EXACT_ARGS 349 -#define _INIT_CALL_PY_EXACT_ARGS 350 -#define _INIT_CALL_PY_EXACT_ARGS_0 351 -#define _INIT_CALL_PY_EXACT_ARGS_1 352 -#define _INIT_CALL_PY_EXACT_ARGS_2 353 -#define _INIT_CALL_PY_EXACT_ARGS_3 354 -#define _INIT_CALL_PY_EXACT_ARGS_4 355 +#define _GUARD_BOTH_FLOAT 334 +#define _GUARD_BOTH_INT 335 +#define _GUARD_BOTH_UNICODE 336 +#define _GUARD_BUILTINS_VERSION 337 +#define _GUARD_DORV_VALUES 338 +#define _GUARD_DORV_VALUES_INST_ATTR_FROM_DICT 339 +#define _GUARD_GLOBALS_VERSION 340 +#define _GUARD_IS_FALSE_POP 341 +#define _GUARD_IS_NONE_POP 342 +#define _GUARD_IS_NOT_NONE_POP 343 +#define _GUARD_IS_TRUE_POP 344 +#define _GUARD_KEYS_VERSION 345 +#define _GUARD_NOT_EXHAUSTED_LIST 346 +#define _GUARD_NOT_EXHAUSTED_RANGE 347 +#define _GUARD_NOT_EXHAUSTED_TUPLE 348 +#define _GUARD_TYPE_VERSION 349 +#define _INIT_CALL_BOUND_METHOD_EXACT_ARGS 350 +#define _INIT_CALL_PY_EXACT_ARGS 351 +#define _INIT_CALL_PY_EXACT_ARGS_0 352 +#define _INIT_CALL_PY_EXACT_ARGS_1 353 +#define _INIT_CALL_PY_EXACT_ARGS_2 354 +#define _INIT_CALL_PY_EXACT_ARGS_3 355 +#define _INIT_CALL_PY_EXACT_ARGS_4 356 #define _INSTRUMENTED_CALL INSTRUMENTED_CALL #define _INSTRUMENTED_CALL_FUNCTION_EX INSTRUMENTED_CALL_FUNCTION_EX #define _INSTRUMENTED_CALL_KW INSTRUMENTED_CALL_KW @@ -140,65 +141,65 @@ extern "C" { #define _INSTRUMENTED_RETURN_CONST INSTRUMENTED_RETURN_CONST #define _INSTRUMENTED_RETURN_VALUE INSTRUMENTED_RETURN_VALUE #define _INSTRUMENTED_YIELD_VALUE INSTRUMENTED_YIELD_VALUE -#define _INTERNAL_INCREMENT_OPT_COUNTER 356 -#define _IS_NONE 357 +#define _INTERNAL_INCREMENT_OPT_COUNTER 357 +#define _IS_NONE 358 #define _IS_OP IS_OP -#define _ITER_CHECK_LIST 358 -#define _ITER_CHECK_RANGE 359 -#define _ITER_CHECK_TUPLE 360 -#define _ITER_JUMP_LIST 361 -#define _ITER_JUMP_RANGE 362 -#define _ITER_JUMP_TUPLE 363 -#define _ITER_NEXT_LIST 364 -#define _ITER_NEXT_RANGE 365 -#define _ITER_NEXT_TUPLE 366 -#define _JUMP_TO_TOP 367 +#define _ITER_CHECK_LIST 359 +#define _ITER_CHECK_RANGE 360 +#define _ITER_CHECK_TUPLE 361 +#define _ITER_JUMP_LIST 362 +#define _ITER_JUMP_RANGE 363 +#define _ITER_JUMP_TUPLE 364 +#define _ITER_NEXT_LIST 365 +#define _ITER_NEXT_RANGE 366 +#define _ITER_NEXT_TUPLE 367 +#define _JUMP_TO_TOP 368 #define _LIST_APPEND LIST_APPEND #define _LIST_EXTEND LIST_EXTEND #define _LOAD_ASSERTION_ERROR LOAD_ASSERTION_ERROR -#define _LOAD_ATTR 368 -#define _LOAD_ATTR_CLASS 369 -#define _LOAD_ATTR_CLASS_0 370 -#define _LOAD_ATTR_CLASS_1 371 +#define _LOAD_ATTR 369 +#define _LOAD_ATTR_CLASS 370 +#define _LOAD_ATTR_CLASS_0 371 +#define _LOAD_ATTR_CLASS_1 372 #define _LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN -#define _LOAD_ATTR_INSTANCE_VALUE 372 -#define _LOAD_ATTR_INSTANCE_VALUE_0 373 -#define _LOAD_ATTR_INSTANCE_VALUE_1 374 -#define _LOAD_ATTR_METHOD_LAZY_DICT 375 -#define _LOAD_ATTR_METHOD_NO_DICT 376 -#define _LOAD_ATTR_METHOD_WITH_VALUES 377 -#define _LOAD_ATTR_MODULE 378 -#define _LOAD_ATTR_NONDESCRIPTOR_NO_DICT 379 -#define _LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES 380 +#define _LOAD_ATTR_INSTANCE_VALUE 373 +#define _LOAD_ATTR_INSTANCE_VALUE_0 374 +#define _LOAD_ATTR_INSTANCE_VALUE_1 375 +#define _LOAD_ATTR_METHOD_LAZY_DICT 376 +#define _LOAD_ATTR_METHOD_NO_DICT 377 +#define _LOAD_ATTR_METHOD_WITH_VALUES 378 +#define _LOAD_ATTR_MODULE 379 +#define _LOAD_ATTR_NONDESCRIPTOR_NO_DICT 380 +#define _LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES 381 #define _LOAD_ATTR_PROPERTY LOAD_ATTR_PROPERTY -#define _LOAD_ATTR_SLOT 381 -#define _LOAD_ATTR_SLOT_0 382 -#define _LOAD_ATTR_SLOT_1 383 -#define _LOAD_ATTR_WITH_HINT 384 +#define _LOAD_ATTR_SLOT 382 +#define _LOAD_ATTR_SLOT_0 383 +#define _LOAD_ATTR_SLOT_1 384 +#define _LOAD_ATTR_WITH_HINT 385 #define _LOAD_BUILD_CLASS LOAD_BUILD_CLASS #define _LOAD_CONST LOAD_CONST -#define _LOAD_CONST_INLINE 385 -#define _LOAD_CONST_INLINE_BORROW 386 -#define _LOAD_CONST_INLINE_BORROW_WITH_NULL 387 -#define _LOAD_CONST_INLINE_WITH_NULL 388 +#define _LOAD_CONST_INLINE 386 +#define _LOAD_CONST_INLINE_BORROW 387 +#define _LOAD_CONST_INLINE_BORROW_WITH_NULL 388 +#define _LOAD_CONST_INLINE_WITH_NULL 389 #define _LOAD_DEREF LOAD_DEREF -#define _LOAD_FAST 389 -#define _LOAD_FAST_0 390 -#define _LOAD_FAST_1 391 -#define _LOAD_FAST_2 392 -#define _LOAD_FAST_3 393 -#define _LOAD_FAST_4 394 -#define _LOAD_FAST_5 395 -#define _LOAD_FAST_6 396 -#define _LOAD_FAST_7 397 +#define _LOAD_FAST 390 +#define _LOAD_FAST_0 391 +#define _LOAD_FAST_1 392 +#define _LOAD_FAST_2 393 +#define _LOAD_FAST_3 394 +#define _LOAD_FAST_4 395 +#define _LOAD_FAST_5 396 +#define _LOAD_FAST_6 397 +#define _LOAD_FAST_7 398 #define _LOAD_FAST_AND_CLEAR LOAD_FAST_AND_CLEAR #define _LOAD_FAST_CHECK LOAD_FAST_CHECK #define _LOAD_FAST_LOAD_FAST LOAD_FAST_LOAD_FAST #define _LOAD_FROM_DICT_OR_DEREF LOAD_FROM_DICT_OR_DEREF #define _LOAD_FROM_DICT_OR_GLOBALS LOAD_FROM_DICT_OR_GLOBALS -#define _LOAD_GLOBAL 398 -#define _LOAD_GLOBAL_BUILTINS 399 -#define _LOAD_GLOBAL_MODULE 400 +#define _LOAD_GLOBAL 399 +#define _LOAD_GLOBAL_BUILTINS 400 +#define _LOAD_GLOBAL_MODULE 401 #define _LOAD_LOCALS LOAD_LOCALS #define _LOAD_NAME LOAD_NAME #define _LOAD_SUPER_ATTR_ATTR LOAD_SUPER_ATTR_ATTR @@ -212,47 +213,47 @@ extern "C" { #define _MATCH_SEQUENCE MATCH_SEQUENCE #define _NOP NOP #define _POP_EXCEPT POP_EXCEPT -#define _POP_FRAME 401 -#define _POP_JUMP_IF_FALSE 402 -#define _POP_JUMP_IF_TRUE 403 +#define _POP_FRAME 402 +#define _POP_JUMP_IF_FALSE 403 +#define _POP_JUMP_IF_TRUE 404 #define _POP_TOP POP_TOP -#define _POP_TOP_LOAD_CONST_INLINE_BORROW 404 +#define _POP_TOP_LOAD_CONST_INLINE_BORROW 405 #define _PUSH_EXC_INFO PUSH_EXC_INFO -#define _PUSH_FRAME 405 +#define _PUSH_FRAME 406 #define _PUSH_NULL PUSH_NULL #define _RESUME_CHECK RESUME_CHECK -#define _SAVE_RETURN_OFFSET 406 -#define _SEND 407 +#define _SAVE_RETURN_OFFSET 407 +#define _SEND 408 #define _SEND_GEN SEND_GEN #define _SETUP_ANNOTATIONS SETUP_ANNOTATIONS #define _SET_ADD SET_ADD #define _SET_FUNCTION_ATTRIBUTE SET_FUNCTION_ATTRIBUTE #define _SET_UPDATE SET_UPDATE -#define _START_EXECUTOR 408 -#define _STORE_ATTR 409 -#define _STORE_ATTR_INSTANCE_VALUE 410 -#define _STORE_ATTR_SLOT 411 +#define _START_EXECUTOR 409 +#define _STORE_ATTR 410 +#define _STORE_ATTR_INSTANCE_VALUE 411 +#define _STORE_ATTR_SLOT 412 #define _STORE_ATTR_WITH_HINT STORE_ATTR_WITH_HINT #define _STORE_DEREF STORE_DEREF -#define _STORE_FAST 412 -#define _STORE_FAST_0 413 -#define _STORE_FAST_1 414 -#define _STORE_FAST_2 415 -#define _STORE_FAST_3 416 -#define _STORE_FAST_4 417 -#define _STORE_FAST_5 418 -#define _STORE_FAST_6 419 -#define _STORE_FAST_7 420 +#define _STORE_FAST 413 +#define _STORE_FAST_0 414 +#define _STORE_FAST_1 415 +#define _STORE_FAST_2 416 +#define _STORE_FAST_3 417 +#define _STORE_FAST_4 418 +#define _STORE_FAST_5 419 +#define _STORE_FAST_6 420 +#define _STORE_FAST_7 421 #define _STORE_FAST_LOAD_FAST STORE_FAST_LOAD_FAST #define _STORE_FAST_STORE_FAST STORE_FAST_STORE_FAST #define _STORE_GLOBAL STORE_GLOBAL #define _STORE_NAME STORE_NAME #define _STORE_SLICE STORE_SLICE -#define _STORE_SUBSCR 421 +#define _STORE_SUBSCR 422 #define _STORE_SUBSCR_DICT STORE_SUBSCR_DICT #define _STORE_SUBSCR_LIST_INT STORE_SUBSCR_LIST_INT #define _SWAP SWAP -#define _TO_BOOL 422 +#define _TO_BOOL 423 #define _TO_BOOL_ALWAYS_TRUE TO_BOOL_ALWAYS_TRUE #define _TO_BOOL_BOOL TO_BOOL_BOOL #define _TO_BOOL_INT TO_BOOL_INT @@ -263,12 +264,12 @@ extern "C" { #define _UNARY_NEGATIVE UNARY_NEGATIVE #define _UNARY_NOT UNARY_NOT #define _UNPACK_EX UNPACK_EX -#define _UNPACK_SEQUENCE 423 +#define _UNPACK_SEQUENCE 424 #define _UNPACK_SEQUENCE_LIST UNPACK_SEQUENCE_LIST #define _UNPACK_SEQUENCE_TUPLE UNPACK_SEQUENCE_TUPLE #define _UNPACK_SEQUENCE_TWO_TUPLE UNPACK_SEQUENCE_TWO_TUPLE #define _WITH_EXCEPT_START WITH_EXCEPT_START -#define MAX_UOP_ID 423 +#define MAX_UOP_ID 424 #ifdef __cplusplus } diff --git a/Include/internal/pycore_uop_metadata.h b/Include/internal/pycore_uop_metadata.h index 7f921a6cd3f4c8..19a91d72e154cd 100644 --- a/Include/internal/pycore_uop_metadata.h +++ b/Include/internal/pycore_uop_metadata.h @@ -242,6 +242,7 @@ const uint16_t _PyUop_Flags[MAX_UOP_ID+1] = { [_START_EXECUTOR] = 0, [_FATAL_ERROR] = HAS_ESCAPES_FLAG, [_CHECK_VALIDITY_AND_SET_IP] = HAS_DEOPT_FLAG, + [_ADJUST_STUFF] = HAS_ARG_FLAG, }; const uint8_t _PyUop_Replication[MAX_UOP_ID+1] = { @@ -251,6 +252,7 @@ const uint8_t _PyUop_Replication[MAX_UOP_ID+1] = { }; const char *const _PyOpcode_uop_name[MAX_UOP_ID+1] = { + [_ADJUST_STUFF] = "_ADJUST_STUFF", [_BEFORE_ASYNC_WITH] = "_BEFORE_ASYNC_WITH", [_BEFORE_WITH] = "_BEFORE_WITH", [_BINARY_OP] = "_BINARY_OP", diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 396a8f09f3feca..d734946b0c0dab 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -4125,6 +4125,11 @@ dummy_func( frame->instr_ptr = (_Py_CODEUNIT *)instr_ptr; } + tier2 op(_ADJUST_STUFF, (callable, unused, args[oparg], retval -- retval)) { + Py_INCREF(retval); + DECREF_INPUTS(); + } + // END BYTECODES // } diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 56ee93862743d5..2419d5c1633a55 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -3873,4 +3873,23 @@ break; } + case _ADJUST_STUFF: { + PyObject *retval; + PyObject **args; + PyObject *callable; + oparg = CURRENT_OPARG(); + retval = stack_pointer[-1]; + args = &stack_pointer[-1 - oparg]; + callable = stack_pointer[-3 - oparg]; + Py_INCREF(retval); + Py_DECREF(callable); + for (int _i = oparg; --_i >= 0;) { + Py_DECREF(args[_i]); + } + Py_DECREF(retval); + stack_pointer[-3 - oparg] = retval; + stack_pointer += -2 - oparg; + break; + } + #undef TIER_TWO diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index a326e2249bb4de..6e071e3ab52990 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -375,6 +375,96 @@ optimize_uops( } +#ifdef Py_DEBUG +extern void _PyUOpPrint(const _PyUOpInstruction *uop); +#endif + +static int +inline_calls(_PyUOpInstruction *buffer, int buffer_size, int curr_stacklen) +{ + int last_push = -1; + int last_pop = -1; + for (int pc = 0; pc < buffer_size; pc++) { + _PyUOpInstruction *inst = &buffer[pc]; +#ifdef Py_DEBUG + if (get_lltrace() >= 2) { // TODO: >= 3 or even >= 4 + printf("%4d INL: ", pc); + _PyUOpPrint(inst); + printf("\n"); + } +#endif + if (inst->opcode == _PUSH_FRAME) { + last_push = pc; + last_pop = -1; + } + else if (inst->opcode == _POP_FRAME) { + last_pop = pc; + } + if (last_pop >= 0 && last_push >= 0) { + DPRINTF( + 2, + "An opportunity for call inlining presents itself at [%d, %d]\n", + last_push, + last_pop); + for (int i = last_push + 1; i < last_pop; i++) { + switch (buffer[i].opcode) { + case _NOP: + case _CHECK_VALIDITY: + case _RESUME_CHECK: + case _LOAD_FAST: + case _SET_IP: + case _CHECK_VALIDITY_AND_SET_IP: + // TODO: More systematic approach to which uops are safe here + break; + default: + DPRINTF(2, "Not inlining call: %s\n", _PyUOpName(buffer[i].opcode)); + goto out; + } + } + DPRINTF(2, "Inlining call!!!\n"); + /* A little before the _PUSH_FRAME, the stack layout is: + + | callable | NULL | arg1 | arg2 | ... | argN | + + The arg count is the _PUSH_FRAME oparg. + The locals of the frame would start at arg1, + so we translate _LOAD_FAST i into _COPY nargs-i. + */ + for (int i = last_push + 1; i < last_pop; i++) { + switch (buffer[i].opcode) { + case _LOAD_FAST: + buffer[i].opcode = _COPY; + buffer[i].oparg = buffer[last_push].oparg - buffer[i].oparg; + break; + case _RESUME_CHECK: + buffer[i].opcode = _NOP; + break; + } + } + + assert(buffer[last_pop].opcode == _POP_FRAME); + buffer[last_pop].opcode = _ADJUST_STUFF; + buffer[last_pop].oparg = buffer[last_push].oparg; + + buffer[last_push].opcode = NOP; + + assert(buffer[last_push - 1].opcode == _SAVE_RETURN_OFFSET); + buffer[last_push - 1].opcode = NOP; + + assert(buffer[last_push - 2].opcode == _INIT_CALL_PY_EXACT_ARGS); + buffer[last_push - 2].opcode = NOP; + + out: + last_pop = last_push = -1; + } + if (inst->opcode == _JUMP_TO_TOP || inst->opcode == _EXIT_TRACE) { + break; + } + } + return 1; +} + + static void remove_unneeded_uops(_PyUOpInstruction *buffer, int buffer_size) { @@ -529,6 +619,12 @@ _Py_uop_analyze_and_optimize( } assert(err == 1); + err = inline_calls(buffer, buffer_size, curr_stacklen); + if (err == 0) { + goto not_ready; + } + assert(err == 1); + remove_unneeded_uops(buffer, buffer_size); OPT_STAT_INC(optimizer_successes); diff --git a/Python/optimizer_cases.c.h b/Python/optimizer_cases.c.h index 6d3488f2118589..3ac9ff1781ae83 100644 --- a/Python/optimizer_cases.c.h +++ b/Python/optimizer_cases.c.h @@ -1910,3 +1910,12 @@ break; } + case _ADJUST_STUFF: { + _Py_UopsSymbol *retval; + retval = sym_new_unknown(ctx); + if (retval == NULL) goto out_of_space; + stack_pointer[-3 - oparg] = retval; + stack_pointer += -2 - oparg; + break; + } + From 9bf95e4230cf219f0d69d6bf245b8d2698e47cac Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 2 Mar 2024 13:10:28 -0800 Subject: [PATCH 03/12] More cleanup of debug output, and length adjustment --- Python/optimizer.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Python/optimizer.c b/Python/optimizer.c index ac0c199af29239..4dbfd918a51f62 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -899,7 +899,8 @@ make_executor_from_uops(_PyUOpInstruction *buffer, const _PyBloomFilter *depende uint32_t used[(UOP_MAX_TRACE_LENGTH + 31)/32] = { 0 }; int exit_count; int length = compute_used(buffer, used, &exit_count); - _PyExecutorObject *executor = allocate_executor(exit_count, length+1); + length += 1; // For _START_EXECUTOR + _PyExecutorObject *executor = allocate_executor(exit_count, length); if (executor == NULL) { return NULL; } @@ -909,7 +910,7 @@ make_executor_from_uops(_PyUOpInstruction *buffer, const _PyBloomFilter *depende executor->exits[i].temperature = 0; } int next_exit = exit_count-1; - _PyUOpInstruction *dest = (_PyUOpInstruction *)&executor->trace[length]; + _PyUOpInstruction *dest = (_PyUOpInstruction *)&executor->trace[length-1]; /* Scan backwards, so that we see the destinations of jumps before the jumps themselves. */ for (int i = UOP_MAX_TRACE_LENGTH-1; i >= 0; i--) { if (!BIT_IS_SET(used, i)) { @@ -947,7 +948,7 @@ make_executor_from_uops(_PyUOpInstruction *buffer, const _PyBloomFilter *depende } if (lltrace >= 2) { printf("Optimized executor (length %d):\n", length); - for (int i = 0; i < length+1; i++) { + for (int i = 0; i < length; i++) { printf("%4d OPTIMIZED: ", i); _PyUOpPrint(&executor->trace[i]); printf("\n"); @@ -957,7 +958,7 @@ make_executor_from_uops(_PyUOpInstruction *buffer, const _PyBloomFilter *depende #ifdef _Py_JIT executor->jit_code = NULL; executor->jit_size = 0; - if (_PyJIT_Compile(executor, executor->trace, length+1)) { + if (_PyJIT_Compile(executor, executor->trace, length)) { Py_DECREF(executor); return NULL; } From 8939a9ae6c0bcc538cef42f26493133bead2de23 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 2 Mar 2024 13:20:22 -0800 Subject: [PATCH 04/12] Tweak debug output --- Python/optimizer_analysis.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index 6e071e3ab52990..d347f095b69852 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -387,7 +387,7 @@ inline_calls(_PyUOpInstruction *buffer, int buffer_size, int curr_stacklen) for (int pc = 0; pc < buffer_size; pc++) { _PyUOpInstruction *inst = &buffer[pc]; #ifdef Py_DEBUG - if (get_lltrace() >= 2) { // TODO: >= 3 or even >= 4 + if (get_lltrace() >= 3) { printf("%4d INL: ", pc); _PyUOpPrint(inst); printf("\n"); @@ -401,11 +401,7 @@ inline_calls(_PyUOpInstruction *buffer, int buffer_size, int curr_stacklen) last_pop = pc; } if (last_pop >= 0 && last_push >= 0) { - DPRINTF( - 2, - "An opportunity for call inlining presents itself at [%d, %d]\n", - last_push, - last_pop); + DPRINTF(3, "Maybe inline call at [%d-%d]\n", last_push, last_pop); for (int i = last_push + 1; i < last_pop; i++) { switch (buffer[i].opcode) { case _NOP: @@ -414,14 +410,15 @@ inline_calls(_PyUOpInstruction *buffer, int buffer_size, int curr_stacklen) case _LOAD_FAST: case _SET_IP: case _CHECK_VALIDITY_AND_SET_IP: - // TODO: More systematic approach to which uops are safe here + // TODO: More systematic approach to which uops are safe break; default: - DPRINTF(2, "Not inlining call: %s\n", _PyUOpName(buffer[i].opcode)); + DPRINTF(3, "Not inlining call at [%d-%d]: %s\n", + last_push, last_pop, _PyUOpName(buffer[i].opcode)); goto out; } } - DPRINTF(2, "Inlining call!!!\n"); + DPRINTF(2, "Inlining call at [%d-%d]\n", last_push, last_pop); /* A little before the _PUSH_FRAME, the stack layout is: | callable | NULL | arg1 | arg2 | ... | argN | From b0097a0279c2329a01c940a81ca2ad2e1db37c9c Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 2 Mar 2024 13:31:34 -0800 Subject: [PATCH 05/12] Fix decrefs in _ADJUST_STUFF --- Python/bytecodes.c | 9 ++++++--- Python/executor_cases.c.h | 9 +++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index d734946b0c0dab..df3a93ed4864ea 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -4125,9 +4125,12 @@ dummy_func( frame->instr_ptr = (_Py_CODEUNIT *)instr_ptr; } - tier2 op(_ADJUST_STUFF, (callable, unused, args[oparg], retval -- retval)) { - Py_INCREF(retval); - DECREF_INPUTS(); + tier2 op(_ADJUST_STUFF, (callable, self_or_null, args[oparg], retval -- retval)) { + Py_DECREF(callable); + Py_XDECREF(self_or_null); + for (int i = oparg; --i >= 0;) { + Py_DECREF(args[i]); + } } // END BYTECODES // diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 2419d5c1633a55..f879a82d290e11 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -3876,17 +3876,18 @@ case _ADJUST_STUFF: { PyObject *retval; PyObject **args; + PyObject *self_or_null; PyObject *callable; oparg = CURRENT_OPARG(); retval = stack_pointer[-1]; args = &stack_pointer[-1 - oparg]; + self_or_null = stack_pointer[-2 - oparg]; callable = stack_pointer[-3 - oparg]; - Py_INCREF(retval); Py_DECREF(callable); - for (int _i = oparg; --_i >= 0;) { - Py_DECREF(args[_i]); + Py_XDECREF(self_or_null); + for (int i = oparg; --i >= 0;) { + Py_DECREF(args[i]); } - Py_DECREF(retval); stack_pointer[-3 - oparg] = retval; stack_pointer += -2 - oparg; break; From 7579cf3d5122e0197b456611027a17376024663f Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 2 Mar 2024 13:46:11 -0800 Subject: [PATCH 06/12] X{INC,DEC}REF don't escape --- Include/internal/pycore_opcode_metadata.h | 2 +- Include/internal/pycore_uop_metadata.h | 2 +- Tools/cases_generator/analyzer.py | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index ab34366ab1066c..f7636cfc366e0d 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -1123,7 +1123,7 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[268] = { [SET_UPDATE] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, [STORE_ATTR] = { true, INSTR_FMT_IBC000, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, [STORE_ATTR_INSTANCE_VALUE] = { true, INSTR_FMT_IXC000, HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_ESCAPES_FLAG }, - [STORE_ATTR_SLOT] = { true, INSTR_FMT_IXC000, HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_ESCAPES_FLAG }, + [STORE_ATTR_SLOT] = { true, INSTR_FMT_IXC000, HAS_DEOPT_FLAG | HAS_EXIT_FLAG }, [STORE_ATTR_WITH_HINT] = { true, INSTR_FMT_IBC000, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG }, [STORE_DEREF] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_FREE_FLAG | HAS_ESCAPES_FLAG }, [STORE_FAST] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_LOCAL_FLAG }, diff --git a/Include/internal/pycore_uop_metadata.h b/Include/internal/pycore_uop_metadata.h index 19a91d72e154cd..f18f36f56026ed 100644 --- a/Include/internal/pycore_uop_metadata.h +++ b/Include/internal/pycore_uop_metadata.h @@ -147,7 +147,7 @@ const uint16_t _PyUop_Flags[MAX_UOP_ID+1] = { [_LOAD_ATTR_CLASS] = HAS_ARG_FLAG | HAS_OPARG_AND_1_FLAG, [_GUARD_DORV_VALUES] = HAS_DEOPT_FLAG | HAS_PASSTHROUGH_FLAG, [_STORE_ATTR_INSTANCE_VALUE] = HAS_ESCAPES_FLAG, - [_STORE_ATTR_SLOT] = HAS_ESCAPES_FLAG, + [_STORE_ATTR_SLOT] = 0, [_COMPARE_OP] = HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, [_COMPARE_OP_FLOAT] = HAS_ARG_FLAG | HAS_ESCAPES_FLAG, [_COMPARE_OP_INT] = HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG, diff --git a/Tools/cases_generator/analyzer.py b/Tools/cases_generator/analyzer.py index b0a15e6d87c2c6..2fef36129a8132 100644 --- a/Tools/cases_generator/analyzer.py +++ b/Tools/cases_generator/analyzer.py @@ -332,11 +332,13 @@ def is_infallible(op: parser.InstDef) -> bool: NON_ESCAPING_FUNCTIONS = ( "Py_INCREF", + "Py_XINCREF", "_PyDictOrValues_IsValues", "_PyObject_DictOrValuesPointer", "_PyDictOrValues_GetValues", "_PyObject_MakeInstanceAttributesFromDict", "Py_DECREF", + "Py_XDECREF", "_Py_DECREF_SPECIALIZED", "DECREF_INPUTS_AND_REUSE_FLOAT", "PyUnicode_Append", From a370161a3684ad54287bdddfd2b4f4990fff1fee Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 2 Mar 2024 13:48:01 -0800 Subject: [PATCH 07/12] Refactor call inline logic and support method calls --- Python/optimizer_analysis.c | 137 ++++++++++++++++++++++-------------- 1 file changed, 83 insertions(+), 54 deletions(-) diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index d347f095b69852..93a5a6b2cb72c5 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -383,7 +383,6 @@ static int inline_calls(_PyUOpInstruction *buffer, int buffer_size, int curr_stacklen) { int last_push = -1; - int last_pop = -1; for (int pc = 0; pc < buffer_size; pc++) { _PyUOpInstruction *inst = &buffer[pc]; #ifdef Py_DEBUG @@ -395,68 +394,98 @@ inline_calls(_PyUOpInstruction *buffer, int buffer_size, int curr_stacklen) #endif if (inst->opcode == _PUSH_FRAME) { last_push = pc; - last_pop = -1; + continue; } - else if (inst->opcode == _POP_FRAME) { - last_pop = pc; + if (inst->opcode != _POP_FRAME) { + if (inst->opcode == _JUMP_TO_TOP || inst->opcode == _EXIT_TRACE) { + break; + } + continue; } - if (last_pop >= 0 && last_push >= 0) { - DPRINTF(3, "Maybe inline call at [%d-%d]\n", last_push, last_pop); - for (int i = last_push + 1; i < last_pop; i++) { - switch (buffer[i].opcode) { - case _NOP: - case _CHECK_VALIDITY: - case _RESUME_CHECK: - case _LOAD_FAST: - case _SET_IP: - case _CHECK_VALIDITY_AND_SET_IP: - // TODO: More systematic approach to which uops are safe - break; - default: - DPRINTF(3, "Not inlining call at [%d-%d]: %s\n", - last_push, last_pop, _PyUOpName(buffer[i].opcode)); - goto out; - } + assert(inst->opcode == _POP_FRAME); + if (last_push < 0) { + continue; + } + DPRINTF(3, "Maybe inline call at [%d-%d]\n", last_push, pc); + for (int i = last_push + 1; i < pc; i++) { + switch (buffer[i].opcode) { + case _NOP: + case _CHECK_VALIDITY: + case _RESUME_CHECK: + case _LOAD_FAST: + case _SET_IP: + case _CHECK_VALIDITY_AND_SET_IP: + // TODO: More systematic approach to which uops are safe + break; + default: + DPRINTF(3, "Not inlining call at [%d-%d]: %s\n", + last_push, pc, _PyUOpName(buffer[i].opcode)); + goto out; } - DPRINTF(2, "Inlining call at [%d-%d]\n", last_push, last_pop); - /* A little before the _PUSH_FRAME, the stack layout is: - - | callable | NULL | arg1 | arg2 | ... | argN | - - The arg count is the _PUSH_FRAME oparg. - The locals of the frame would start at arg1, - so we translate _LOAD_FAST i into _COPY nargs-i. - */ - for (int i = last_push + 1; i < last_pop; i++) { - switch (buffer[i].opcode) { - case _LOAD_FAST: - buffer[i].opcode = _COPY; - buffer[i].oparg = buffer[last_push].oparg - buffer[i].oparg; - break; - case _RESUME_CHECK: - buffer[i].opcode = _NOP; - break; - } + } + int nargs = buffer[last_push].oparg; + if (buffer[last_push - 1].opcode != _SAVE_RETURN_OFFSET) { + DPRINTF(3, "Not inlining call at [%d-%d]: No _SAVE_RETURN_OFFSET\n", + last_push, pc); + goto out; + } + if (buffer[last_push - 2].opcode != _INIT_CALL_PY_EXACT_ARGS) { + DPRINTF(3, "Not inlining call at [%d-%d]: No _INIT_CALL_PY_EXACT_ARGS\n", + last_push, pc); + goto out; + } + if (buffer[last_push - 3].opcode != _CHECK_STACK_SPACE) { + DPRINTF(3, "Not inlining call at [%d-%d]: No _CHECK_STACK_SPACE\n", + last_push, pc); + goto out; + } + if (buffer[last_push - 4].opcode != _CHECK_FUNCTION_EXACT_ARGS) { + DPRINTF(3, "Not inlining call at [%d-%d]: No _CHECK_FUNCTION_EXACT_ARGS\n", + last_push, pc); + goto out; + } + if (buffer[last_push - 5].opcode == _INIT_CALL_BOUND_METHOD_EXACT_ARGS) { + DPRINTF(2, "Inlining method call at [%d-%d]\n", last_push, pc); + nargs += 1; + } + else { + DPRINTF(2, "Inlining function call at [%d-%d]\n", last_push, pc); + } + /* A little before the _PUSH_FRAME, the stack layout is: + + | callable | self_or_null | arg1 | arg2 | ... | argN | + + The arg count is the _PUSH_FRAME oparg. + If self_or_null is NULL, the locals of the frame start at arg1, + so we translate _LOAD_FAST i into _COPY nargs-i; + if it's not NULL, add one to nargs; + */ + for (int i = last_push + 1; i < pc; i++) { + switch (buffer[i].opcode) { + case _LOAD_FAST: + buffer[i].opcode = _COPY; + buffer[i].oparg = nargs - buffer[i].oparg; + break; + case _RESUME_CHECK: + buffer[i].opcode = _NOP; + break; } + } - assert(buffer[last_pop].opcode == _POP_FRAME); - buffer[last_pop].opcode = _ADJUST_STUFF; - buffer[last_pop].oparg = buffer[last_push].oparg; + assert(buffer[pc].opcode == _POP_FRAME); + buffer[pc].opcode = _ADJUST_STUFF; + buffer[pc].oparg = buffer[last_push].oparg; - buffer[last_push].opcode = NOP; + buffer[last_push].opcode = NOP; - assert(buffer[last_push - 1].opcode == _SAVE_RETURN_OFFSET); - buffer[last_push - 1].opcode = NOP; + assert(buffer[last_push - 1].opcode == _SAVE_RETURN_OFFSET); + buffer[last_push - 1].opcode = NOP; - assert(buffer[last_push - 2].opcode == _INIT_CALL_PY_EXACT_ARGS); - buffer[last_push - 2].opcode = NOP; + assert(buffer[last_push - 2].opcode == _INIT_CALL_PY_EXACT_ARGS); + buffer[last_push - 2].opcode = NOP; - out: - last_pop = last_push = -1; - } - if (inst->opcode == _JUMP_TO_TOP || inst->opcode == _EXIT_TRACE) { - break; - } + out: + last_push = -1; } return 1; } From 087f8d42b11c4f96a9f3d5999c94e247e289c714 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 2 Mar 2024 14:06:21 -0800 Subject: [PATCH 08/12] Add tests --- Lib/test/test_capi/test_opt.py | 36 ++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index a0a19225b79433..7e62738c20a803 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -953,6 +953,42 @@ def testfunc(n): _, ex = self._run_with_optimizer(testfunc, 16) self.assertIsNone(ex) + def test_function_call_inline(self): + def cast(typ, val): + return val + def testfunc(n): + x = 0 + for i in range(n): + x = cast(int, i) + 1 + return x + x, ex = self._run_with_optimizer(testfunc, 20) + self.assertEqual(x, 20) + self.assertIsNotNone(ex) + uops = get_opnames(ex) + print() + print(list(iter_opnames(ex))) + self.assertNotIn("_PUSH_FRAME", uops) + self.assertNotIn("_POP_FRALE", uops) + + def test_method_call_inline(self): + class Caster: + def cast(self, typ, val): + return val + def testfunc(n): + cast = Caster().cast + x = 0 + for i in range(n): + x = cast(int, i) + 1 + return x + x, ex = self._run_with_optimizer(testfunc, 20) + self.assertEqual(x, 20) + self.assertIsNotNone(ex) + uops = get_opnames(ex) + print() + print(list(iter_opnames(ex))) + self.assertNotIn("_PUSH_FRAME", uops) + self.assertNotIn("_POP_FRALE", uops) + if __name__ == "__main__": unittest.main() From 215b7c2096a15bb204eb6dcfcf800da42f70c287 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 2 Mar 2024 16:35:23 -0800 Subject: [PATCH 09/12] Add _GUARD_NOT_METHOD uop --- Include/internal/pycore_uop_ids.h | 155 +++++++++++++------------ Include/internal/pycore_uop_metadata.h | 2 + Python/bytecodes.c | 5 + Python/executor_cases.c.h | 8 ++ Python/optimizer_cases.c.h | 4 + 5 files changed, 97 insertions(+), 77 deletions(-) diff --git a/Include/internal/pycore_uop_ids.h b/Include/internal/pycore_uop_ids.h index 0a5897c90e7830..a4c4f127b31f05 100644 --- a/Include/internal/pycore_uop_ids.h +++ b/Include/internal/pycore_uop_ids.h @@ -117,14 +117,15 @@ extern "C" { #define _GUARD_NOT_EXHAUSTED_LIST 346 #define _GUARD_NOT_EXHAUSTED_RANGE 347 #define _GUARD_NOT_EXHAUSTED_TUPLE 348 -#define _GUARD_TYPE_VERSION 349 -#define _INIT_CALL_BOUND_METHOD_EXACT_ARGS 350 -#define _INIT_CALL_PY_EXACT_ARGS 351 -#define _INIT_CALL_PY_EXACT_ARGS_0 352 -#define _INIT_CALL_PY_EXACT_ARGS_1 353 -#define _INIT_CALL_PY_EXACT_ARGS_2 354 -#define _INIT_CALL_PY_EXACT_ARGS_3 355 -#define _INIT_CALL_PY_EXACT_ARGS_4 356 +#define _GUARD_NOT_METHOD 349 +#define _GUARD_TYPE_VERSION 350 +#define _INIT_CALL_BOUND_METHOD_EXACT_ARGS 351 +#define _INIT_CALL_PY_EXACT_ARGS 352 +#define _INIT_CALL_PY_EXACT_ARGS_0 353 +#define _INIT_CALL_PY_EXACT_ARGS_1 354 +#define _INIT_CALL_PY_EXACT_ARGS_2 355 +#define _INIT_CALL_PY_EXACT_ARGS_3 356 +#define _INIT_CALL_PY_EXACT_ARGS_4 357 #define _INSTRUMENTED_CALL INSTRUMENTED_CALL #define _INSTRUMENTED_CALL_FUNCTION_EX INSTRUMENTED_CALL_FUNCTION_EX #define _INSTRUMENTED_CALL_KW INSTRUMENTED_CALL_KW @@ -141,65 +142,65 @@ extern "C" { #define _INSTRUMENTED_RETURN_CONST INSTRUMENTED_RETURN_CONST #define _INSTRUMENTED_RETURN_VALUE INSTRUMENTED_RETURN_VALUE #define _INSTRUMENTED_YIELD_VALUE INSTRUMENTED_YIELD_VALUE -#define _INTERNAL_INCREMENT_OPT_COUNTER 357 -#define _IS_NONE 358 +#define _INTERNAL_INCREMENT_OPT_COUNTER 358 +#define _IS_NONE 359 #define _IS_OP IS_OP -#define _ITER_CHECK_LIST 359 -#define _ITER_CHECK_RANGE 360 -#define _ITER_CHECK_TUPLE 361 -#define _ITER_JUMP_LIST 362 -#define _ITER_JUMP_RANGE 363 -#define _ITER_JUMP_TUPLE 364 -#define _ITER_NEXT_LIST 365 -#define _ITER_NEXT_RANGE 366 -#define _ITER_NEXT_TUPLE 367 -#define _JUMP_TO_TOP 368 +#define _ITER_CHECK_LIST 360 +#define _ITER_CHECK_RANGE 361 +#define _ITER_CHECK_TUPLE 362 +#define _ITER_JUMP_LIST 363 +#define _ITER_JUMP_RANGE 364 +#define _ITER_JUMP_TUPLE 365 +#define _ITER_NEXT_LIST 366 +#define _ITER_NEXT_RANGE 367 +#define _ITER_NEXT_TUPLE 368 +#define _JUMP_TO_TOP 369 #define _LIST_APPEND LIST_APPEND #define _LIST_EXTEND LIST_EXTEND #define _LOAD_ASSERTION_ERROR LOAD_ASSERTION_ERROR -#define _LOAD_ATTR 369 -#define _LOAD_ATTR_CLASS 370 -#define _LOAD_ATTR_CLASS_0 371 -#define _LOAD_ATTR_CLASS_1 372 +#define _LOAD_ATTR 370 +#define _LOAD_ATTR_CLASS 371 +#define _LOAD_ATTR_CLASS_0 372 +#define _LOAD_ATTR_CLASS_1 373 #define _LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN -#define _LOAD_ATTR_INSTANCE_VALUE 373 -#define _LOAD_ATTR_INSTANCE_VALUE_0 374 -#define _LOAD_ATTR_INSTANCE_VALUE_1 375 -#define _LOAD_ATTR_METHOD_LAZY_DICT 376 -#define _LOAD_ATTR_METHOD_NO_DICT 377 -#define _LOAD_ATTR_METHOD_WITH_VALUES 378 -#define _LOAD_ATTR_MODULE 379 -#define _LOAD_ATTR_NONDESCRIPTOR_NO_DICT 380 -#define _LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES 381 +#define _LOAD_ATTR_INSTANCE_VALUE 374 +#define _LOAD_ATTR_INSTANCE_VALUE_0 375 +#define _LOAD_ATTR_INSTANCE_VALUE_1 376 +#define _LOAD_ATTR_METHOD_LAZY_DICT 377 +#define _LOAD_ATTR_METHOD_NO_DICT 378 +#define _LOAD_ATTR_METHOD_WITH_VALUES 379 +#define _LOAD_ATTR_MODULE 380 +#define _LOAD_ATTR_NONDESCRIPTOR_NO_DICT 381 +#define _LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES 382 #define _LOAD_ATTR_PROPERTY LOAD_ATTR_PROPERTY -#define _LOAD_ATTR_SLOT 382 -#define _LOAD_ATTR_SLOT_0 383 -#define _LOAD_ATTR_SLOT_1 384 -#define _LOAD_ATTR_WITH_HINT 385 +#define _LOAD_ATTR_SLOT 383 +#define _LOAD_ATTR_SLOT_0 384 +#define _LOAD_ATTR_SLOT_1 385 +#define _LOAD_ATTR_WITH_HINT 386 #define _LOAD_BUILD_CLASS LOAD_BUILD_CLASS #define _LOAD_CONST LOAD_CONST -#define _LOAD_CONST_INLINE 386 -#define _LOAD_CONST_INLINE_BORROW 387 -#define _LOAD_CONST_INLINE_BORROW_WITH_NULL 388 -#define _LOAD_CONST_INLINE_WITH_NULL 389 +#define _LOAD_CONST_INLINE 387 +#define _LOAD_CONST_INLINE_BORROW 388 +#define _LOAD_CONST_INLINE_BORROW_WITH_NULL 389 +#define _LOAD_CONST_INLINE_WITH_NULL 390 #define _LOAD_DEREF LOAD_DEREF -#define _LOAD_FAST 390 -#define _LOAD_FAST_0 391 -#define _LOAD_FAST_1 392 -#define _LOAD_FAST_2 393 -#define _LOAD_FAST_3 394 -#define _LOAD_FAST_4 395 -#define _LOAD_FAST_5 396 -#define _LOAD_FAST_6 397 -#define _LOAD_FAST_7 398 +#define _LOAD_FAST 391 +#define _LOAD_FAST_0 392 +#define _LOAD_FAST_1 393 +#define _LOAD_FAST_2 394 +#define _LOAD_FAST_3 395 +#define _LOAD_FAST_4 396 +#define _LOAD_FAST_5 397 +#define _LOAD_FAST_6 398 +#define _LOAD_FAST_7 399 #define _LOAD_FAST_AND_CLEAR LOAD_FAST_AND_CLEAR #define _LOAD_FAST_CHECK LOAD_FAST_CHECK #define _LOAD_FAST_LOAD_FAST LOAD_FAST_LOAD_FAST #define _LOAD_FROM_DICT_OR_DEREF LOAD_FROM_DICT_OR_DEREF #define _LOAD_FROM_DICT_OR_GLOBALS LOAD_FROM_DICT_OR_GLOBALS -#define _LOAD_GLOBAL 399 -#define _LOAD_GLOBAL_BUILTINS 400 -#define _LOAD_GLOBAL_MODULE 401 +#define _LOAD_GLOBAL 400 +#define _LOAD_GLOBAL_BUILTINS 401 +#define _LOAD_GLOBAL_MODULE 402 #define _LOAD_LOCALS LOAD_LOCALS #define _LOAD_NAME LOAD_NAME #define _LOAD_SUPER_ATTR_ATTR LOAD_SUPER_ATTR_ATTR @@ -213,47 +214,47 @@ extern "C" { #define _MATCH_SEQUENCE MATCH_SEQUENCE #define _NOP NOP #define _POP_EXCEPT POP_EXCEPT -#define _POP_FRAME 402 -#define _POP_JUMP_IF_FALSE 403 -#define _POP_JUMP_IF_TRUE 404 +#define _POP_FRAME 403 +#define _POP_JUMP_IF_FALSE 404 +#define _POP_JUMP_IF_TRUE 405 #define _POP_TOP POP_TOP -#define _POP_TOP_LOAD_CONST_INLINE_BORROW 405 +#define _POP_TOP_LOAD_CONST_INLINE_BORROW 406 #define _PUSH_EXC_INFO PUSH_EXC_INFO -#define _PUSH_FRAME 406 +#define _PUSH_FRAME 407 #define _PUSH_NULL PUSH_NULL #define _RESUME_CHECK RESUME_CHECK -#define _SAVE_RETURN_OFFSET 407 -#define _SEND 408 +#define _SAVE_RETURN_OFFSET 408 +#define _SEND 409 #define _SEND_GEN SEND_GEN #define _SETUP_ANNOTATIONS SETUP_ANNOTATIONS #define _SET_ADD SET_ADD #define _SET_FUNCTION_ATTRIBUTE SET_FUNCTION_ATTRIBUTE #define _SET_UPDATE SET_UPDATE -#define _START_EXECUTOR 409 -#define _STORE_ATTR 410 -#define _STORE_ATTR_INSTANCE_VALUE 411 -#define _STORE_ATTR_SLOT 412 +#define _START_EXECUTOR 410 +#define _STORE_ATTR 411 +#define _STORE_ATTR_INSTANCE_VALUE 412 +#define _STORE_ATTR_SLOT 413 #define _STORE_ATTR_WITH_HINT STORE_ATTR_WITH_HINT #define _STORE_DEREF STORE_DEREF -#define _STORE_FAST 413 -#define _STORE_FAST_0 414 -#define _STORE_FAST_1 415 -#define _STORE_FAST_2 416 -#define _STORE_FAST_3 417 -#define _STORE_FAST_4 418 -#define _STORE_FAST_5 419 -#define _STORE_FAST_6 420 -#define _STORE_FAST_7 421 +#define _STORE_FAST 414 +#define _STORE_FAST_0 415 +#define _STORE_FAST_1 416 +#define _STORE_FAST_2 417 +#define _STORE_FAST_3 418 +#define _STORE_FAST_4 419 +#define _STORE_FAST_5 420 +#define _STORE_FAST_6 421 +#define _STORE_FAST_7 422 #define _STORE_FAST_LOAD_FAST STORE_FAST_LOAD_FAST #define _STORE_FAST_STORE_FAST STORE_FAST_STORE_FAST #define _STORE_GLOBAL STORE_GLOBAL #define _STORE_NAME STORE_NAME #define _STORE_SLICE STORE_SLICE -#define _STORE_SUBSCR 422 +#define _STORE_SUBSCR 423 #define _STORE_SUBSCR_DICT STORE_SUBSCR_DICT #define _STORE_SUBSCR_LIST_INT STORE_SUBSCR_LIST_INT #define _SWAP SWAP -#define _TO_BOOL 423 +#define _TO_BOOL 424 #define _TO_BOOL_ALWAYS_TRUE TO_BOOL_ALWAYS_TRUE #define _TO_BOOL_BOOL TO_BOOL_BOOL #define _TO_BOOL_INT TO_BOOL_INT @@ -264,12 +265,12 @@ extern "C" { #define _UNARY_NEGATIVE UNARY_NEGATIVE #define _UNARY_NOT UNARY_NOT #define _UNPACK_EX UNPACK_EX -#define _UNPACK_SEQUENCE 424 +#define _UNPACK_SEQUENCE 425 #define _UNPACK_SEQUENCE_LIST UNPACK_SEQUENCE_LIST #define _UNPACK_SEQUENCE_TUPLE UNPACK_SEQUENCE_TUPLE #define _UNPACK_SEQUENCE_TWO_TUPLE UNPACK_SEQUENCE_TWO_TUPLE #define _WITH_EXCEPT_START WITH_EXCEPT_START -#define MAX_UOP_ID 424 +#define MAX_UOP_ID 425 #ifdef __cplusplus } diff --git a/Include/internal/pycore_uop_metadata.h b/Include/internal/pycore_uop_metadata.h index f18f36f56026ed..c89b3b69fa95de 100644 --- a/Include/internal/pycore_uop_metadata.h +++ b/Include/internal/pycore_uop_metadata.h @@ -197,6 +197,7 @@ const uint16_t _PyUop_Flags[MAX_UOP_ID+1] = { [_INIT_CALL_PY_EXACT_ARGS_3] = HAS_ESCAPES_FLAG | HAS_PURE_FLAG, [_INIT_CALL_PY_EXACT_ARGS_4] = HAS_ESCAPES_FLAG | HAS_PURE_FLAG, [_INIT_CALL_PY_EXACT_ARGS] = HAS_ARG_FLAG | HAS_ESCAPES_FLAG | HAS_PURE_FLAG, + [_GUARD_NOT_METHOD] = HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_PASSTHROUGH_FLAG, [_PUSH_FRAME] = HAS_ESCAPES_FLAG, [_CALL_TYPE_1] = HAS_ARG_FLAG | HAS_DEOPT_FLAG, [_CALL_STR_1] = HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, @@ -351,6 +352,7 @@ const char *const _PyOpcode_uop_name[MAX_UOP_ID+1] = { [_GUARD_NOT_EXHAUSTED_LIST] = "_GUARD_NOT_EXHAUSTED_LIST", [_GUARD_NOT_EXHAUSTED_RANGE] = "_GUARD_NOT_EXHAUSTED_RANGE", [_GUARD_NOT_EXHAUSTED_TUPLE] = "_GUARD_NOT_EXHAUSTED_TUPLE", + [_GUARD_NOT_METHOD] = "_GUARD_NOT_METHOD", [_GUARD_TYPE_VERSION] = "_GUARD_TYPE_VERSION", [_INIT_CALL_BOUND_METHOD_EXACT_ARGS] = "_INIT_CALL_BOUND_METHOD_EXACT_ARGS", [_INIT_CALL_PY_EXACT_ARGS] = "_INIT_CALL_PY_EXACT_ARGS", diff --git a/Python/bytecodes.c b/Python/bytecodes.c index df3a93ed4864ea..afe2c7e0a4320a 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -3120,6 +3120,11 @@ dummy_func( } } + // Hack for function call inlining + tier2 op(_GUARD_NOT_METHOD, (self_or_null, unused[oparg] -- self_or_null, unused[oparg])) { + DEOPT_IF(self_or_null != NULL); + } + // The 'unused' output effect represents the return value // (which will be pushed when the frame returns). // It is needed so CALL_PY_EXACT_ARGS matches its family. diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index f879a82d290e11..00600d5cf79b6e 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -2997,6 +2997,14 @@ break; } + case _GUARD_NOT_METHOD: { + PyObject *self_or_null; + oparg = CURRENT_OPARG(); + self_or_null = stack_pointer[-1 - oparg]; + if (self_or_null != NULL) goto deoptimize; + break; + } + case _PUSH_FRAME: { _PyInterpreterFrame *new_frame; new_frame = (_PyInterpreterFrame *)stack_pointer[-1]; diff --git a/Python/optimizer_cases.c.h b/Python/optimizer_cases.c.h index 3ac9ff1781ae83..a9bfd135b0f575 100644 --- a/Python/optimizer_cases.c.h +++ b/Python/optimizer_cases.c.h @@ -1551,6 +1551,10 @@ break; } + case _GUARD_NOT_METHOD: { + break; + } + case _PUSH_FRAME: { _Py_UOpsAbstractFrame *new_frame; new_frame = (_Py_UOpsAbstractFrame *)stack_pointer[-1]; From 0136b3a9f56cd240304f78ed7ed0187a67040308 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 2 Mar 2024 16:35:59 -0800 Subject: [PATCH 10/12] Deopt if inlined function call finds self != NULL --- Python/optimizer_analysis.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index 93a5a6b2cb72c5..0838f1671cc837 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -424,6 +424,7 @@ inline_calls(_PyUOpInstruction *buffer, int buffer_size, int curr_stacklen) } } int nargs = buffer[last_push].oparg; + bool self_present = false; if (buffer[last_push - 1].opcode != _SAVE_RETURN_OFFSET) { DPRINTF(3, "Not inlining call at [%d-%d]: No _SAVE_RETURN_OFFSET\n", last_push, pc); @@ -446,7 +447,8 @@ inline_calls(_PyUOpInstruction *buffer, int buffer_size, int curr_stacklen) } if (buffer[last_push - 5].opcode == _INIT_CALL_BOUND_METHOD_EXACT_ARGS) { DPRINTF(2, "Inlining method call at [%d-%d]\n", last_push, pc); - nargs += 1; + nargs++; + self_present = true; } else { DPRINTF(2, "Inlining function call at [%d-%d]\n", last_push, pc); @@ -464,6 +466,7 @@ inline_calls(_PyUOpInstruction *buffer, int buffer_size, int curr_stacklen) switch (buffer[i].opcode) { case _LOAD_FAST: buffer[i].opcode = _COPY; + assert(nargs - buffer[i].oparg > 0); buffer[i].oparg = nargs - buffer[i].oparg; break; case _RESUME_CHECK: @@ -482,7 +485,7 @@ inline_calls(_PyUOpInstruction *buffer, int buffer_size, int curr_stacklen) buffer[last_push - 1].opcode = NOP; assert(buffer[last_push - 2].opcode == _INIT_CALL_PY_EXACT_ARGS); - buffer[last_push - 2].opcode = NOP; + buffer[last_push - 2].opcode = self_present ? _NOP : _GUARD_NOT_METHOD; out: last_push = -1; From b2ef2958ac9e625f5f14474ffccb73dee0a9f5a1 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 2 Mar 2024 16:57:52 -0800 Subject: [PATCH 11/12] Comment out debug print() calls in test --- Lib/test/test_capi/test_opt.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index 7e62738c20a803..d270f444e66e32 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -965,8 +965,8 @@ def testfunc(n): self.assertEqual(x, 20) self.assertIsNotNone(ex) uops = get_opnames(ex) - print() - print(list(iter_opnames(ex))) + # print() + # print(list(iter_opnames(ex))) self.assertNotIn("_PUSH_FRAME", uops) self.assertNotIn("_POP_FRALE", uops) @@ -984,8 +984,8 @@ def testfunc(n): self.assertEqual(x, 20) self.assertIsNotNone(ex) uops = get_opnames(ex) - print() - print(list(iter_opnames(ex))) + # print() + # print(list(iter_opnames(ex))) self.assertNotIn("_PUSH_FRAME", uops) self.assertNotIn("_POP_FRALE", uops) From 6c9ed1ff5996882895af9444a4a0b44663cd3dec Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 2 Mar 2024 17:25:45 -0800 Subject: [PATCH 12/12] Remove failing assert --- Python/optimizer_analysis.c | 1 - 1 file changed, 1 deletion(-) diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index 0838f1671cc837..9fd9eb27a4ddd5 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -466,7 +466,6 @@ inline_calls(_PyUOpInstruction *buffer, int buffer_size, int curr_stacklen) switch (buffer[i].opcode) { case _LOAD_FAST: buffer[i].opcode = _COPY; - assert(nargs - buffer[i].oparg > 0); buffer[i].oparg = nargs - buffer[i].oparg; break; case _RESUME_CHECK: