From a11ac25e464f3bd98f693c32bb1e2cb67e2c4cc3 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 14 May 2021 11:40:20 -0700 Subject: [PATCH 1/4] Improve speed and accuracy for correlation() --- Lib/statistics.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index db8c581068b7dd..84b748fc28f9f2 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -884,8 +884,8 @@ def covariance(x, y, /): raise StatisticsError('covariance requires at least two data points') xbar = fmean(x) ybar = fmean(y) - total = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y)) - return total / (n - 1) + sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y)) + return sxy / (n - 1) def correlation(x, y, /): @@ -910,11 +910,13 @@ def correlation(x, y, /): raise StatisticsError('correlation requires that both inputs have same number of data points') if n < 2: raise StatisticsError('correlation requires at least two data points') - cov = covariance(x, y) - stdx = stdev(x) - stdy = stdev(y) + xbar = fmean(x) + ybar = fmean(y) + sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y)) + s2x = fsum((xi - xbar) ** 2.0 for xi in x) + s2y = fsum((yi - ybar) ** 2.0 for yi in y) try: - return cov / (stdx * stdy) + return sxy / sqrt(s2x * s2y) except ZeroDivisionError: raise StatisticsError('at least one of the inputs is constant') From 9d05de138ff7c25e079dffc9453b7e6b023bbdd6 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 14 May 2021 12:28:28 -0700 Subject: [PATCH 2/4] Consistently use fsum() --- Lib/statistics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 84b748fc28f9f2..d815f4b9bf56f5 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -882,8 +882,8 @@ def covariance(x, y, /): raise StatisticsError('covariance requires that both inputs have same number of data points') if n < 2: raise StatisticsError('covariance requires at least two data points') - xbar = fmean(x) - ybar = fmean(y) + xbar = fsum(x) / n + ybar = fsum(y) / n sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y)) return sxy / (n - 1) @@ -910,8 +910,8 @@ def correlation(x, y, /): raise StatisticsError('correlation requires that both inputs have same number of data points') if n < 2: raise StatisticsError('correlation requires at least two data points') - xbar = fmean(x) - ybar = fmean(y) + xbar = fsum(x) / n + ybar = fsum(y) / n sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y)) s2x = fsum((xi - xbar) ** 2.0 for xi in x) s2y = fsum((yi - ybar) ** 2.0 for yi in y) From cf3986ba3d889139b378494dffa1fab7fc668a7a Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sat, 15 May 2021 09:58:26 -0700 Subject: [PATCH 3/4] Restore sort order to __all__ --- Lib/statistics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index d815f4b9bf56f5..b2fe37a8308449 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -107,9 +107,12 @@ __all__ = [ 'NormalDist', 'StatisticsError', + 'correlation', + 'covariance', 'fmean', 'geometric_mean', 'harmonic_mean', + 'linear_regression', 'mean', 'median', 'median_grouped', @@ -122,9 +125,6 @@ 'quantiles', 'stdev', 'variance', - 'correlation', - 'covariance', - 'linear_regression', ] import math From c31b93a04397ea6d901117a7361469d74c838ca1 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sat, 15 May 2021 10:31:11 -0700 Subject: [PATCH 4/4] Add comment --- Lib/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index b2fe37a8308449..507a5b2d79dce2 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -960,7 +960,7 @@ def linear_regression(regressor, dependent_variable, /): sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y)) s2x = fsum((xi - xbar) ** 2.0 for xi in x) try: - slope = sxy / s2x + slope = sxy / s2x # equivalent to: covariance(x, y) / variance(x) except ZeroDivisionError: raise StatisticsError('regressor is constant') intercept = ybar - slope * xbar