From 660cc785e976d001533a8bd991f60745698314f8 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Sun, 16 Jan 2022 15:40:19 -0800 Subject: [PATCH 1/4] bpo-46406: Faster single digit int division. This expresses the algorithm in a more basic manner resulting in better instruction generation by today's compilers. See https://mail.python.org/archives/list/python-dev@python.org/thread/ZICIMX5VFCX4IOFH5NUPVHCUJCQ4Q7QM/#NEUNFZU3TQU4CPTYZNF3WCN7DOJBBTK5 --- .../2022-01-16-15-40-11.bpo-46406.g0mke-.rst | 2 + Objects/longobject.c | 37 +++++++++++++------ 2 files changed, 28 insertions(+), 11 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-01-16-15-40-11.bpo-46406.g0mke-.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-01-16-15-40-11.bpo-46406.g0mke-.rst b/Misc/NEWS.d/next/Core and Builtins/2022-01-16-15-40-11.bpo-46406.g0mke-.rst new file mode 100644 index 00000000000000..351710c000097f --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2022-01-16-15-40-11.bpo-46406.g0mke-.rst @@ -0,0 +1,2 @@ +The int division code has been optimized, using modern compilers it can be +20% faster on large numbers in some scenarios. diff --git a/Objects/longobject.c b/Objects/longobject.c index 1b2d1266c6bc5f..b042a6f4d8c0fd 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -1616,25 +1616,40 @@ v_rshift(digit *z, digit *a, Py_ssize_t m, int d) in pout, and returning the remainder. pin and pout point at the LSD. It's OK for pin == pout on entry, which saves oodles of mallocs/frees in _PyLong_Format, but that should be done with great care since ints are - immutable. */ + immutable. + This version of the code can be >20% faster than the pre-2022 version + on todays compilers. It evolved from Mark Dickenson observing that on + x86_64 a 128:64 divide instruction was always being generated by the + compiler despite us working with 30-bit digit values. 
See the thread + for full context: + + https://mail.python.org/archives/list/python-dev@python.org/thread/ZICIMX5VFCX4IOFH5NUPVHCUJCQ4Q7QM/#NEUNFZU3TQU4CPTYZNF3WCN7DOJBBTK5 + + If you ever want to change this code, pay attention to performance using + different compilers, optimization levels, and cpu architectures. Beware of + PGO/FDO builds doing value specialization such as a fast path for //10. :) + + Verify that 17 isn't specialized and this works as a useful test: + python -m timeit -n 1500000 -s 'x = 10**1000; r=x//10; assert r == 10**999, r' 'x//17' +*/ static digit inplace_divrem1(digit *pout, digit *pin, Py_ssize_t size, digit n) { - twodigits rem = 0; + digit remainder = 0; - assert(n > 0 && n <= PyLong_MASK); - pin += size; - pout += size; while (--size >= 0) { - digit hi; - rem = (rem << PyLong_SHIFT) | *--pin; - *--pout = hi = (digit)(rem / n); - rem -= (twodigits)hi * n; - } - return (digit)rem; + twodigits dividend; + dividend = ((twodigits)remainder << PyLong_SHIFT) | pin[size]; + digit quotient; + quotient = dividend / n; + remainder = dividend % n; + pout[size] = quotient; + } + return remainder; } + /* Divide an integer by a digit, returning both the quotient (as function result) and the remainder (through *prem). The sign of a is ignored; n should not be zero. */ From 91b105070660ccb62a74e278e5f65baf6b3c4fc9 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Sun, 16 Jan 2022 18:52:02 -0800 Subject: [PATCH 2/4] re-add the assert, reword news. 
--- .../2022-01-16-15-40-11.bpo-46406.g0mke-.rst | 5 +++-- Objects/longobject.c | 15 ++++++++------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-01-16-15-40-11.bpo-46406.g0mke-.rst b/Misc/NEWS.d/next/Core and Builtins/2022-01-16-15-40-11.bpo-46406.g0mke-.rst index 351710c000097f..20d1e08bfd48b3 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2022-01-16-15-40-11.bpo-46406.g0mke-.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2022-01-16-15-40-11.bpo-46406.g0mke-.rst @@ -1,2 +1,3 @@ -The int division code has been optimized, using modern compilers it can be -20% faster on large numbers in some scenarios. +The integer division ``//`` implementation has been optimized to better let the +compiler understand its constraints. It can be 20% faster on the amd64 platform +when dividing an int by a value smaller than ``2**30``. diff --git a/Objects/longobject.c b/Objects/longobject.c index b042a6f4d8c0fd..63511db733d26b 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -1618,11 +1618,11 @@ v_rshift(digit *z, digit *a, Py_ssize_t m, int d) _PyLong_Format, but that should be done with great care since ints are immutable. - This version of the code can be >20% faster than the pre-2022 version - on todays compilers. It evolved from Mark Dickenson observing that on - x86_64 a 128:64 divide instruction was always being generated by the - compiler despite us working with 30-bit digit values. See the thread - for full context: + This version of the code can be 20% faster than the pre-2022 version + on todays compilers on architectures like amd64. It evolved from Mark + Dickenson observing that a 128:64 divide instruction was always being + generated by the compiler despite us working with 30-bit digit values. 
+ See the thread for full context: https://mail.python.org/archives/list/python-dev@python.org/thread/ZICIMX5VFCX4IOFH5NUPVHCUJCQ4Q7QM/#NEUNFZU3TQU4CPTYZNF3WCN7DOJBBTK5 @@ -1630,14 +1630,15 @@ v_rshift(digit *z, digit *a, Py_ssize_t m, int d) different compilers, optimization levels, and cpu architectures. Beware of PGO/FDO builds doing value specialization such as a fast path for //10. :) - Verify that 17 isn't specialized and this works as a useful test: - python -m timeit -n 1500000 -s 'x = 10**1000; r=x//10; assert r == 10**999, r' 'x//17' + Verify that 17 isn't specialized and this works as a quick test: + python -m timeit -s 'x = 10**1000; r=x//10; assert r == 10**999, r' 'x//17' */ static digit inplace_divrem1(digit *pout, digit *pin, Py_ssize_t size, digit n) { digit remainder = 0; + assert(n > 0 && n <= PyLong_MASK); while (--size >= 0) { twodigits dividend; dividend = ((twodigits)remainder << PyLong_SHIFT) | pin[size]; From d09f0c73fb863dd2dfc9d815568fac54a2afa57a Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Sat, 22 Jan 2022 10:37:02 -0800 Subject: [PATCH 3/4] Spell Mark's name properly. Co-authored-by: Mark Dickinson --- Objects/longobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/longobject.c b/Objects/longobject.c index 63511db733d26b..b6389c29b7d7f6 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -1620,7 +1620,7 @@ v_rshift(digit *z, digit *a, Py_ssize_t m, int d) This version of the code can be 20% faster than the pre-2022 version on todays compilers on architectures like amd64. It evolved from Mark - Dickenson observing that a 128:64 divide instruction was always being + Dickinson observing that a 128:64 divide instruction was always being generated by the compiler despite us working with 30-bit digit values. See the thread for full context: From 9902dc29a4ae17be3f6cd8bb2f0f3d48b7b83da2 Mon Sep 17 00:00:00 2001 From: "Gregory P. 
Smith" Date: Sat, 22 Jan 2022 10:39:35 -0800 Subject: [PATCH 4/4] Add a cast to silence a compiler warning. Co-authored-by: Mark Dickinson --- Objects/longobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/longobject.c b/Objects/longobject.c index b6389c29b7d7f6..1d2347a4968af7 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -1643,7 +1643,7 @@ inplace_divrem1(digit *pout, digit *pin, Py_ssize_t size, digit n) twodigits dividend; dividend = ((twodigits)remainder << PyLong_SHIFT) | pin[size]; digit quotient; - quotient = dividend / n; + quotient = (digit)(dividend / n); remainder = dividend % n; pout[size] = quotient; }