From ce58750b850f85346c3462b0d6489e43db510a7c Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 25 Aug 2021 20:16:11 -0500 Subject: [PATCH 1/8] bpo-39218: Simplify and improve accuracy of variance calculations --- Lib/statistics.py | 13 +++---------- Lib/test/test_statistics.py | 3 +++ 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 1314095332a159..532a00d2761efc 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -727,16 +727,9 @@ def _ss(data, c=None): calculated from ``c`` as given. Use the second case with care, as it can lead to garbage results. """ - if c is not None: - T, total, count = _sum((x-c)**2 for x in data) - return (T, total) - c = mean(data) - T, total, count = _sum((x-c)**2 for x in data) - # The following sum should mathematically equal zero, but due to rounding - # error may not. - U, total2, count2 = _sum((x - c) for x in data) - assert T == U and count == count2 - total -= total2 ** 2 / len(data) + if c is None: + c = mean(data) + T, total, count = _sum((y := x - c) * y for x in data) assert not total < 0, 'negative sum of square deviations: %f' % total return (T, total) diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index dc066fa921d797..64ebd0e8cb9f84 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -1210,6 +1210,9 @@ def __pow__(self, other): def __add__(self, other): return type(self)(super().__add__(other)) __radd__ = __add__ + def __mul__(self, other): + return type(self)(super().__mul__(other)) + __rmul__ = __mul__ return (float, Decimal, Fraction, MyFloat) def test_types_conserved(self): From 742cfb5ddbf46a7a86d079fe65477f58306eafb7 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 25 Aug 2021 20:18:39 -0500 Subject: [PATCH 2/8] Add blurb --- .../next/Library/2021-08-25-20-18-31.bpo-39218.BlO6jW.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2021-08-25-20-18-31.bpo-39218.BlO6jW.rst diff --git a/Misc/NEWS.d/next/Library/2021-08-25-20-18-31.bpo-39218.BlO6jW.rst b/Misc/NEWS.d/next/Library/2021-08-25-20-18-31.bpo-39218.BlO6jW.rst new file mode 100644 index 00000000000000..e3a2166349fc96 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2021-08-25-20-18-31.bpo-39218.BlO6jW.rst @@ -0,0 +1,2 @@ +Improve accuracy of variance calculations by using x*x instead of x**2. +Simplify code by removing a rounding correction that did not make sense. From f5abe4aad1757b63ac81f144704f14439630e189 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 25 Aug 2021 20:27:56 -0500 Subject: [PATCH 3/8] Also update correlation and linear_regression --- Lib/statistics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 532a00d2761efc..d4e41fd5628dba 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -917,8 +917,8 @@ def correlation(x, y, /): xbar = fsum(x) / n ybar = fsum(y) / n sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y)) - sxx = fsum((xi - xbar) ** 2.0 for xi in x) - syy = fsum((yi - ybar) ** 2.0 for yi in y) + sxx = fsum((d := xi - xbar) * d for xi in x) + syy = fsum((d := yi - ybar) * d for yi in y) try: return sxy / sqrt(sxx * syy) except ZeroDivisionError: @@ -961,7 +961,7 @@ def linear_regression(x, y, /): xbar = fsum(x) / n ybar = fsum(y) / n sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y)) - sxx = fsum((xi - xbar) ** 2.0 for xi in x) + sxx = fsum((d := xi - xbar) * d for xi in x) try: slope = sxy / sxx # equivalent to: covariance(x, y) / variance(x) except ZeroDivisionError: From cf77ef06e44b712f29951ae62e0fcea01e9a5547 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 25 Aug 2021 20:29:25 -0500 Subject: [PATCH 4/8] Use "d" for difference instead of "y" --- Lib/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index d4e41fd5628dba..3fb6a432cb7718 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -729,7 +729,7 @@ def _ss(data, c=None): """ if c is None: c = mean(data) - T, total, count = _sum((y := x - c) * y for x in data) + T, total, count = _sum((d := x - c) * d for x in data) assert not total < 0, 'negative sum of square deviations: %f' % total return (T, total) From 891334a6acbcc0ff7d469cb55ecead2d959b0448 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Thu, 26 Aug 2021 10:49:13 -0500 Subject: [PATCH 5/8] Restore the correction for inaccuracy in the mean --- Lib/statistics.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 3fb6a432cb7718..7edfeffbc5f8e7 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -727,9 +727,20 @@ def _ss(data, c=None): calculated from ``c`` as given. Use the second case with care, as it can lead to garbage results. """ - if c is None: - c = mean(data) + if c is not None: + T, total, count = _sum((d := x - c) * d for x in data) + return (T, total) + # Compute the mean accurate to within 1/2 ulp + c = mean(data) + # Initial computation for the sum of square deviations T, total, count = _sum((d := x - c) * d for x in data) + # Correct any remaining inaccuracy in the mean c. + # The following sum should mathematically equal zero, + # but due to the final rounding of the mean, it may not. + U, error, count2 = _sum((x - c) for x in data) + assert T == U and count == count2 + correction = error * error / len(data) + total -= correction assert not total < 0, 'negative sum of square deviations: %f' % total return (T, total) From 00887862451fcc4c082a0b9f89abedeafb79aa05 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Thu, 26 Aug 2021 10:51:24 -0500 Subject: [PATCH 6/8] Per Steven's comment, the type assertion has outlived its usefulness. --- Lib/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 7edfeffbc5f8e7..9aaeae0fb56875 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -738,7 +738,7 @@ def _ss(data, c=None): # The following sum should mathematically equal zero, # but due to the final rounding of the mean, it may not. U, error, count2 = _sum((x - c) for x in data) - assert T == U and count == count2 + assert count == count2 correction = error * error / len(data) total -= correction assert not total < 0, 'negative sum of square deviations: %f' % total From 37739b75aa26a6f2c709c3d41de3ee7eaf4aeb63 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Thu, 26 Aug 2021 11:37:56 -0500 Subject: [PATCH 7/8] Update Misc/NEWS message. --- .../NEWS.d/next/Library/2021-08-25-20-18-31.bpo-39218.BlO6jW.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2021-08-25-20-18-31.bpo-39218.BlO6jW.rst b/Misc/NEWS.d/next/Library/2021-08-25-20-18-31.bpo-39218.BlO6jW.rst index e3a2166349fc96..7bcf9a222e33c3 100644 --- a/Misc/NEWS.d/next/Library/2021-08-25-20-18-31.bpo-39218.BlO6jW.rst +++ b/Misc/NEWS.d/next/Library/2021-08-25-20-18-31.bpo-39218.BlO6jW.rst @@ -1,2 +1 @@ Improve accuracy of variance calculations by using x*x instead of x**2. -Simplify code by removing a rounding correction that did not make sense. From 9fc93dbeb89c18e28f32b2e0e45a8a128b87f5f7 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Thu, 26 Aug 2021 15:22:19 -0500 Subject: [PATCH 8/8] Convert other instances of x**2.0 to x*x --- Lib/statistics.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 9aaeae0fb56875..a14c48e89814bd 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1098,10 +1098,11 @@ def samples(self, n, *, seed=None): def pdf(self, x): "Probability density function. P(x <= X < x+dx) / dx" - variance = self._sigma ** 2.0 + variance = self._sigma * self._sigma if not variance: raise StatisticsError('pdf() not defined when sigma is zero') - return exp((x - self._mu)**2.0 / (-2.0*variance)) / sqrt(tau*variance) + diff = x - self._mu + return exp(diff * diff / (-2.0 * variance)) / sqrt(tau * variance) def cdf(self, x): "Cumulative distribution function. P(X <= x)" @@ -1165,7 +1166,7 @@ def overlap(self, other): if not dv: return 1.0 - erf(dm / (2.0 * X._sigma * sqrt(2.0))) a = X._mu * Y_var - Y._mu * X_var - b = X._sigma * Y._sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var)) + b = X._sigma * Y._sigma * sqrt(dm * dm + dv * log(Y_var / X_var)) x1 = (a + b) / dv x2 = (a - b) / dv return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2))) @@ -1208,7 +1209,7 @@ def stdev(self): @property def variance(self): "Square of the standard deviation." - return self._sigma ** 2.0 + return self._sigma * self._sigma def __add__(x1, x2): """Add a constant or another NormalDist instance.