@@ -2832,7 +2832,7 @@ long_add_would_overflow(long a, long b)
2832
2832
}
2833
2833
2834
2834
/*
2835
- Double length extended precision floating point arithmetic
2835
+ Double and triple length extended precision floating point arithmetic
2836
2836
based on ideas from three sources:
2837
2837
2838
2838
Improved Kahan–Babuška algorithm by Arnold Neumaier
@@ -2845,22 +2845,22 @@ based on ideas from three sources:
2845
2845
Ultimately Fast Accurate Summation by Siegfried M. Rump
2846
2846
https://www.tuhh.de/ti3/paper/rump/Ru08b.pdf
2847
2847
2848
- The double length routines allow for quite a bit of instruction
2849
- level parallelism. On a 3.22 Ghz Apple M1 Max, the incremental
2850
- cost of increasing the input vector size by one is 6.0 nsec .
2848
+ Double length functions:
2849
+ * dl_split() exact split of a C double into two half precision components.
2850
+ * dl_mul() exact multiplication of two C doubles .
2851
2851
2852
- dl_zero() returns an extended precision zero
2853
- dl_split() exactly splits a double into two half precision components.
2854
- dl_add() performs compensated summation to keep a running total.
2855
- dl_mul() implements lossless multiplication of doubles.
2856
- dl_fma() implements an extended precision fused-multiply-add.
2857
- dl_to_d() converts from extended precision to double precision.
2852
+ Triple length functions and constant:
2853
+ * tl_zero is a triple length zero for starting or resetting an accumulation.
2854
+ * tl_add() compensated addition of a C double to a triple length number.
2855
+ * tl_fma() performs a triple length fused-multiply-add.
2856
+ * tl_to_d() converts from triple length number back to a C double.
2858
2857
2859
2858
*/
2860
2859
2861
2860
typedef struct { double hi ; double lo ; } DoubleLength ;
2861
+ typedef struct { double hi ; double lo ; double tiny ; } TripleLength ;
2862
2862
2863
- static const DoubleLength dl_zero = {0.0 , 0.0 };
2863
+ static const TripleLength tl_zero = {0.0 , 0.0 , 0.0 };
2864
2864
2865
2865
static inline DoubleLength
2866
2866
twosum (double a , double b )
@@ -2874,11 +2874,20 @@ twosum(double a, double b)
2874
2874
return (DoubleLength ) {s , t };
2875
2875
}
2876
2876
2877
- static inline DoubleLength
2878
- dl_add ( DoubleLength total , double x )
2877
+ static inline TripleLength
2878
+ tl_add ( TripleLength total , double x )
2879
2879
{
2880
- DoubleLength s = twosum (total .hi , x );
2881
- return (DoubleLength ) {s .hi , total .lo + s .lo };
2880
+ /* Input: x total.hi total.lo total.tiny
2881
+ |--- twosum ---|
2882
+ s.hi s.lo
2883
+ |--- twosum ---|
2884
+ t.hi t.lo
2885
+ |--- single sum ---|
2886
+ Output: s.hi t.hi tiny
2887
+ */
2888
+ DoubleLength s = twosum (x , total .hi );
2889
+ DoubleLength t = twosum (s .lo , total .lo );
2890
+ return (TripleLength ) {s .hi , t .hi , t .lo + total .tiny };
2882
2891
}
2883
2892
2884
2893
static inline DoubleLength
@@ -2902,18 +2911,18 @@ dl_mul(double x, double y)
2902
2911
return (DoubleLength ) {z , zz };
2903
2912
}
2904
2913
2905
- static inline DoubleLength
2906
- dl_fma ( DoubleLength total , double p , double q )
2914
+ static inline TripleLength
2915
+ tl_fma ( TripleLength total , double p , double q )
2907
2916
{
2908
2917
DoubleLength product = dl_mul (p , q );
2909
- total = dl_add (total , product .hi );
2910
- return dl_add (total , product .lo );
2918
+ total = tl_add (total , product .hi );
2919
+ return tl_add (total , product .lo );
2911
2920
}
2912
2921
2913
2922
static inline double
2914
- dl_to_d ( DoubleLength total )
2923
+ tl_to_d ( TripleLength total )
2915
2924
{
2916
- return total .hi + total .lo ;
2925
+ return total .tiny + total .lo + total . hi ;
2917
2926
}
2918
2927
2919
2928
/*[clinic input]
@@ -2944,7 +2953,7 @@ math_sumprod_impl(PyObject *module, PyObject *p, PyObject *q)
2944
2953
bool int_path_enabled = true, int_total_in_use = false;
2945
2954
bool flt_path_enabled = true, flt_total_in_use = false;
2946
2955
long int_total = 0 ;
2947
- DoubleLength flt_total = dl_zero ;
2956
+ TripleLength flt_total = tl_zero ;
2948
2957
2949
2958
p_it = PyObject_GetIter (p );
2950
2959
if (p_it == NULL ) {
@@ -3079,7 +3088,7 @@ math_sumprod_impl(PyObject *module, PyObject *p, PyObject *q)
3079
3088
} else {
3080
3089
goto finalize_flt_path ;
3081
3090
}
3082
- DoubleLength new_flt_total = dl_fma (flt_total , flt_p , flt_q );
3091
+ TripleLength new_flt_total = tl_fma (flt_total , flt_p , flt_q );
3083
3092
if (isfinite (new_flt_total .hi )) {
3084
3093
flt_total = new_flt_total ;
3085
3094
flt_total_in_use = true;
@@ -3093,7 +3102,7 @@ math_sumprod_impl(PyObject *module, PyObject *p, PyObject *q)
3093
3102
// We're finished, overflowed, have a non-float, or got a non-finite value
3094
3103
flt_path_enabled = false;
3095
3104
if (flt_total_in_use ) {
3096
- term_i = PyFloat_FromDouble (dl_to_d (flt_total ));
3105
+ term_i = PyFloat_FromDouble (tl_to_d (flt_total ));
3097
3106
if (term_i == NULL ) {
3098
3107
goto err_exit ;
3099
3108
}
@@ -3104,7 +3113,7 @@ math_sumprod_impl(PyObject *module, PyObject *p, PyObject *q)
3104
3113
Py_SETREF (total , new_total );
3105
3114
new_total = NULL ;
3106
3115
Py_CLEAR (term_i );
3107
- flt_total = dl_zero ;
3116
+ flt_total = tl_zero ;
3108
3117
flt_total_in_use = false;
3109
3118
}
3110
3119
}
0 commit comments