@@ -2455,6 +2455,9 @@ Given that csum >= 1.0, we have:
2455
2455
lo**2 <= 2**-54 < 2**-53 == 1/2*ulp(1.0) <= ulp(csum)/2
2456
2456
Since lo**2 is less than 1/2 ulp(csum), we have csum+lo*lo == csum.
2457
2457
2458
+ To minimize loss of information during the accumulation of fractional
2459
+ values, the lo**2 term has a separate accumulator.
2460
+
2458
2461
The square root differential correction is needed because a
2459
2462
correctly rounded square root of a correctly rounded sum of
2460
2463
squares can still be off by as much as one ulp.
@@ -2475,15 +2478,16 @@ Borges' ALGORITHM 4 and 5.
2475
2478
2476
2479
1. Veltkamp-Dekker splitting: http://csclub.uwaterloo.ca/~pbarfuss/dekker1971.pdf
2477
2480
2. Compensated summation: http://www.ti3.tu-harburg.de/paper/rump/Ru08b.pdf
2478
- 3. Square root diffential correction: https://arxiv.org/pdf/1904.09481.pdf
2481
+ 3. Square root differential correction: https://arxiv.org/pdf/1904.09481.pdf
2482
+ 4. https://www.wolframalpha.com/input/?i=Maclaurin+series+sqrt%28h**2+%2B+x%29+at+x%3D0
2479
2483
2480
2484
*/
2481
2485
2482
2486
static inline double
2483
2487
vector_norm (Py_ssize_t n , double * vec , double max , int found_nan )
2484
2488
{
2485
2489
const double T27 = 134217729.0 ; /* ldexp(1.0, 27)+1.0) */
2486
- double x , csum = 1.0 , oldcsum , frac = 0.0 , scale ;
2490
+ double x , csum = 1.0 , oldcsum , frac = 0.0 , frac_lo = 0.0 , scale ;
2487
2491
double t , hi , lo , h ;
2488
2492
int max_e ;
2489
2493
Py_ssize_t i ;
@@ -2528,8 +2532,9 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
2528
2532
frac += (oldcsum - csum ) + x ;
2529
2533
2530
2534
assert (csum + lo * lo == csum );
2531
- frac += lo * lo ;
2535
+ frac_lo += lo * lo ;
2532
2536
}
2537
+ frac += frac_lo ;
2533
2538
h = sqrt (csum - 1.0 + frac );
2534
2539
2535
2540
x = h ;
0 commit comments