From a4eea39d42f9f7feec51df8b3d66ccd44c33ffa5 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Mon, 24 Jan 2022 12:52:58 -0600 Subject: [PATCH 1/2] bpo-46504: faster code for trial quotient in x_divrem() This brings x_divrem() back into synch with x_divrem1(), which was changed in bpo-46406 to generate faster code to find machine-word division quotients and remainders. Modern processors compute both with a single machine instruction, but convincing C to exploit that requires writing _less_ "clever" C code. --- Objects/longobject.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Objects/longobject.c b/Objects/longobject.c index ee20e2638bcad1..806c27d1d21bb2 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -2767,8 +2767,15 @@ x_divrem(PyLongObject *v1, PyLongObject *w1, PyLongObject **prem) vtop = vk[size_w]; assert(vtop <= wm1); vv = ((twodigits)vtop << PyLong_SHIFT) | vk[size_w-1]; + /* The code used to compute the remainder via + * r = (digit)(vv - (twodigits)wm1 * q); + * and compilers generally generated code to do the * and -. + * But modern p;ocessors generally compute q and r with a single + * instruction, and modern optimizing compilers exploit that if we + * _don't_ try to optimize it. + */ q = (digit)(vv / wm1); - r = (digit)(vv - (twodigits)wm1 * q); /* r = vv % wm1 */ + r = (digit)(vv % wm1); while ((twodigits)wm2 * q > (((twodigits)r << PyLong_SHIFT) | vk[size_w-2])) { --q; From 6e5e4d224720ae669b273a68d1a1b2b32928a995 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Mon, 24 Jan 2022 12:58:28 -0600 Subject: [PATCH 2/2] Fix typo in comment. 
--- Objects/longobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/longobject.c b/Objects/longobject.c index 806c27d1d21bb2..5f0cc579c2cca5 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -2770,7 +2770,7 @@ x_divrem(PyLongObject *v1, PyLongObject *w1, PyLongObject **prem) /* The code used to compute the remainder via * r = (digit)(vv - (twodigits)wm1 * q); * and compilers generally generated code to do the * and -. - * But modern p;ocessors generally compute q and r with a single + * But modern processors generally compute q and r with a single * instruction, and modern optimizing compilers exploit that if we * _don't_ try to optimize it. */