From d0091d7397c3ee9316d32fec5902d1d957aa68e2 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 17 Mar 2023 14:20:29 -0500 Subject: [PATCH 1/4] Add comments --- Modules/mathmodule.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index 48cd9a642de150..4953aad6c5e508 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2530,16 +2530,16 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) x = vec[i]; assert(Py_IS_FINITE(x) && fabs(x) <= max); - x *= scale; + x *= scale; // lossless scaling assert(fabs(x) < 1.0); - pr = dl_mul(x, x); + pr = dl_mul(x, x); // lossless squaring assert(pr.hi <= 1.0); - sm = dl_fast_sum(csum, pr.hi); + sm = dl_fast_sum(csum, pr.hi); // lossless addition csum = sm.hi; - frac1 += pr.lo; - frac2 += sm.lo; + frac1 += pr.lo; // lossy addition + frac2 += sm.lo; // lossy addition } h = sqrt(csum - 1.0 + (frac1 + frac2)); pr = dl_mul(-h, h); @@ -2548,7 +2548,8 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) frac1 += pr.lo; frac2 += sm.lo; x = csum - 1.0 + (frac1 + frac2); - return (h + x / (2.0 * h)) / scale; + h += x / (2.0 * h); // diffential correction + return h / scale; } #define NUM_STACK_ELEMS 16 From 6bcce8c85ccc74379027874c8d4afa946200f4e6 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sat, 18 Mar 2023 09:00:46 -0500 Subject: [PATCH 2/4] Inline the subnormal comment --- Modules/mathmodule.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index 4953aad6c5e508..4f38180a4cb768 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2514,12 +2514,9 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) } frexp(max, &max_e); if (max_e < -1023) { - /* When max_e < -1023, ldexp(1.0, -max_e) would overflow. - So we first perform lossless scaling from subnormals back to normals, - then recurse back to vector_norm(), and then finally undo the scaling. - */ + /* When max_e < -1023, ldexp(1.0, -max_e) would overflow. */ for (i=0 ; i < n ; i++) { - vec[i] /= DBL_MIN; + vec[i] /= DBL_MIN; // convert subnormals to normals } return DBL_MIN * vector_norm(n, vec, max / DBL_MIN, found_nan); } @@ -2529,13 +2526,10 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) for (i=0 ; i < n ; i++) { x = vec[i]; assert(Py_IS_FINITE(x) && fabs(x) <= max); - x *= scale; // lossless scaling assert(fabs(x) < 1.0); - pr = dl_mul(x, x); // lossless squaring assert(pr.hi <= 1.0); - sm = dl_fast_sum(csum, pr.hi); // lossless addition csum = sm.hi; frac1 += pr.lo; // lossy addition From 56c8c2f3389bd892702ed4334fe5a85685e15fef Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sat, 18 Mar 2023 09:06:21 -0500 Subject: [PATCH 3/4] Fix spelling --- Modules/mathmodule.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index 4f38180a4cb768..c9bc689ef131d2 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2542,7 +2542,7 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) frac1 += pr.lo; frac2 += sm.lo; x = csum - 1.0 + (frac1 + frac2); - h += x / (2.0 * h); // diffential correction + h += x / (2.0 * h); // differential correction return h / scale; } From 96c5035f89fd00dc5723278fcf68fac57394f3d1 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sat, 18 Mar 2023 11:40:54 -0500 Subject: [PATCH 4/4] Update timing information to reflect the new fma() code --- Modules/mathmodule.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index c9bc689ef131d2..c9a2be66863993 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2447,9 +2447,8 @@ Since lo**2 is less than 1/2 ulp(csum), we have csum+lo*lo == csum. To minimize loss of information during the accumulation of fractional values, each term has a separate accumulator. This also breaks up sequential dependencies in the inner loop so the CPU can maximize -floating point throughput. [4] On a 2.6 GHz Haswell, adding one -dimension has an incremental cost of only 5ns -- for example when -moving from hypot(x,y) to hypot(x,y,z). +floating point throughput. [4] On an Apple M1 Max, hypot(*vec) +takes only 3.33 µsec when len(vec) == 1000. The square root differential correction is needed because a correctly rounded square root of a correctly rounded sum of @@ -2473,7 +2472,7 @@ step is exact. The Neumaier summation computes as if in doubled precision (106 bits) and has the advantage that its input squares are non-negative so that the condition number of the sum is one. The square root with a differential correction is likewise computed -as if in double precision. +as if in doubled precision. For n <= 1000, prior to the final addition that rounds the overall result, the internal accuracy of "h" together with its correction of