From 0b547235a0ee617f83e13d616f8d6562520a499f Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 3 Jan 2022 14:16:22 -0600 Subject: [PATCH 01/13] Add doctest and improve readability for move_to_end() example. --- Doc/library/collections.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Doc/library/collections.rst b/Doc/library/collections.rst index 8bf3cb6cb12da1..b8a717d883c093 100644 --- a/Doc/library/collections.rst +++ b/Doc/library/collections.rst @@ -1120,14 +1120,16 @@ Some differences from :class:`dict` still remain: Move an existing *key* to either end of an ordered dictionary. The item is moved to the right end if *last* is true (the default) or to the beginning if *last* is false. Raises :exc:`KeyError` if the *key* does - not exist:: + not exist: + + .. doctest:: >>> d = OrderedDict.fromkeys('abcde') >>> d.move_to_end('b') - >>> ''.join(d.keys()) + >>> ''.join(d) 'acdeb' >>> d.move_to_end('b', last=False) - >>> ''.join(d.keys()) + >>> ''.join(d) 'bacde' .. versionadded:: 3.2 From 6ce943f62ee628311859968324351dac29c51aff Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 4 Jan 2022 10:07:39 -0600 Subject: [PATCH 02/13] Single pass sum of squares --- Lib/statistics.py | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index c104571d39053d..41eb72519d8bdf 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -776,14 +776,6 @@ def quantiles(data, *, n=4, method='exclusive'): # See http://mathworld.wolfram.com/Variance.html # http://mathworld.wolfram.com/SampleVariance.html -# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance -# -# Under no circumstances use the so-called "computational formula for -# variance", as that is only suitable for hand calculations with a small -# amount of low-precision data. It has terrible numeric properties. -# -# See a comparison of three computational methods here: -# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/ def _ss(data, c=None): """Return sum of square deviations of sequence data. @@ -796,20 +788,30 @@ def _ss(data, c=None): if c is not None: T, total, count = _sum((d := x - c) * d for x in data) return (T, total) - T, total, count = _sum(data) - mean_n, mean_d = (total / count).as_integer_ratio() - partials = Counter() - for n, d in map(_exact_ratio, data): - diff_n = n * mean_d - d * mean_n - diff_d = d * mean_d - partials[diff_d * diff_d] += diff_n * diff_n - if None in partials: + count = 0 + sx_partials = {} + sx_partials_get = sx_partials.get + sxx_partials = {} + sxx_partials_get = sxx_partials.get + T = int + for typ, values in groupby(data, type): + T = _coerce(T, typ) # or raise TypeError + for n, d in map(_exact_ratio, values): + count += 1 + sx_partials[d] = sx_partials_get(d, 0) + n + dd = d * d + sxx_partials[dd] = sxx_partials_get(dd, 0) + n*n + if None in sx_partials: # The sum will be a NAN or INF. We can ignore all the finite # partials, and just look at this special one. - total = partials[None] + total = sx_partials[None] assert not _isfinite(total) else: - total = sum(Fraction(n, d) for d, n in partials.items()) + sx = sum(Fraction(n, d) for d, n in sx_partials.items()) + sxx = sum(Fraction(n, d) for d, n in sxx_partials.items()) + # This formula is has poor numeric properties for floats, + # but with fractions it is exact. + total = (count * sxx - sx * sx) / count return (T, total) From c8e2de7d1e69332bb024a015fac86688adc2e977 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 4 Jan 2022 10:15:00 -0600 Subject: [PATCH 03/13] Use len() to get the count --- Lib/statistics.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 41eb72519d8bdf..33cabd20d9ee20 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -788,7 +788,6 @@ def _ss(data, c=None): if c is not None: T, total, count = _sum((d := x - c) * d for x in data) return (T, total) - count = 0 sx_partials = {} sx_partials_get = sx_partials.get sxx_partials = {} @@ -797,7 +796,6 @@ def _ss(data, c=None): for typ, values in groupby(data, type): T = _coerce(T, typ) # or raise TypeError for n, d in map(_exact_ratio, values): - count += 1 sx_partials[d] = sx_partials_get(d, 0) + n dd = d * d sxx_partials[dd] = sxx_partials_get(dd, 0) + n*n @@ -807,6 +805,7 @@ def _ss(data, c=None): total = sx_partials[None] assert not _isfinite(total) else: + count = len(data) sx = sum(Fraction(n, d) for d, n in sx_partials.items()) sxx = sum(Fraction(n, d) for d, n in sxx_partials.items()) # This formula is has poor numeric properties for floats, From 45d83da8eb47cdbfab49d5a57c9cc4635378b01f Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 4 Jan 2022 10:32:09 -0600 Subject: [PATCH 04/13] Avoid converting iterators to lists --- Lib/statistics.py | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 33cabd20d9ee20..d1ea3542f42ef1 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -787,7 +787,8 @@ def _ss(data, c=None): """ if c is not None: T, total, count = _sum((d := x - c) * d for x in data) - return (T, total) + return (T, total, count) + count = 0 sx_partials = {} sx_partials_get = sx_partials.get sxx_partials = {} @@ -796,22 +797,24 @@ def _ss(data, c=None): for typ, values in groupby(data, type): T = _coerce(T, typ) # or raise TypeError for n, d in map(_exact_ratio, values): + count += 1 sx_partials[d] = sx_partials_get(d, 0) + n dd = d * d sxx_partials[dd] = sxx_partials_get(dd, 0) + n*n - if None in sx_partials: + if not count: + total = Fraction(0) + elif None in sx_partials: # The sum will be a NAN or INF. We can ignore all the finite # partials, and just look at this special one. total = sx_partials[None] assert not _isfinite(total) else: - count = len(data) sx = sum(Fraction(n, d) for d, n in sx_partials.items()) sxx = sum(Fraction(n, d) for d, n in sxx_partials.items()) # This formula is has poor numeric properties for floats, # but with fractions it is exact. total = (count * sxx - sx * sx) / count - return (T, total) + return (T, total, count) def variance(data, xbar=None): @@ -852,12 +855,9 @@ def variance(data, xbar=None): Fraction(67, 108) """ - if iter(data) is data: - data = list(data) - n = len(data) + T, ss, n = _ss(data, xbar) if n < 2: raise StatisticsError('variance requires at least two data points') - T, ss = _ss(data, xbar) return _convert(ss / (n - 1), T) @@ -896,12 +896,9 @@ def pvariance(data, mu=None): Fraction(13, 72) """ - if iter(data) is data: - data = list(data) - n = len(data) + T, ss, n = _ss(data, mu) if n < 1: raise StatisticsError('pvariance requires at least one data point') - T, ss = _ss(data, mu) return _convert(ss / n, T) @@ -914,12 +911,9 @@ def stdev(data, xbar=None): 1.0810874155219827 """ - if iter(data) is data: - data = list(data) - n = len(data) + T, ss, n = _ss(data, xbar) if n < 2: raise StatisticsError('stdev requires at least two data points') - T, ss = _ss(data, xbar) mss = ss / (n - 1) if issubclass(T, Decimal): return _decimal_sqrt_of_frac(mss.numerator, mss.denominator) @@ -935,12 +929,9 @@ def pstdev(data, mu=None): 0.986893273527251 """ - if iter(data) is data: - data = list(data) - n = len(data) + T, ss, n = _ss(data, mu) if n < 1: raise StatisticsError('pstdev requires at least one data point') - T, ss = _ss(data, mu) mss = ss / n if issubclass(T, Decimal): return _decimal_sqrt_of_frac(mss.numerator, mss.denominator) From b1a89be49481361dc982ed4fb10408cc164c4e59 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 4 Jan 2022 10:42:45 -0600 Subject: [PATCH 05/13] Neaten-up --- Lib/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index d1ea3542f42ef1..c0331f7af66401 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -800,7 +800,7 @@ def _ss(data, c=None): count += 1 sx_partials[d] = sx_partials_get(d, 0) + n dd = d * d - sxx_partials[dd] = sxx_partials_get(dd, 0) + n*n + sxx_partials[dd] = sxx_partials_get(dd, 0) + n * n if not count: total = Fraction(0) elif None in sx_partials: From b4d2797476c3a164ead5643b8483299f36138f66 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 4 Jan 2022 11:04:41 -0600 Subject: [PATCH 06/13] Add blurb --- .../next/Library/2022-01-04-11-04-20.bpo-46257._o2ADe.rst | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2022-01-04-11-04-20.bpo-46257._o2ADe.rst diff --git a/Misc/NEWS.d/next/Library/2022-01-04-11-04-20.bpo-46257._o2ADe.rst b/Misc/NEWS.d/next/Library/2022-01-04-11-04-20.bpo-46257._o2ADe.rst new file mode 100644 index 00000000000000..6ccf96ed706043 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-01-04-11-04-20.bpo-46257._o2ADe.rst @@ -0,0 +1,4 @@ +Optimized the variance and stdev functions in the statistics module. If the +input is an iterator, it is consumed in a single pass rather than eating +memory by conversion to a list. The single pass algorithm is about twice as +fast as the previous two pass code. From 712f648b86d0d9f67d41e95af4405ee878d14a5b Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 4 Jan 2022 09:10:57 -0800 Subject: [PATCH 07/13] Avoid touching collections.rst --- Doc/library/collections.rst | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Doc/library/collections.rst b/Doc/library/collections.rst index b8a717d883c093..8bf3cb6cb12da1 100644 --- a/Doc/library/collections.rst +++ b/Doc/library/collections.rst @@ -1120,16 +1120,14 @@ Some differences from :class:`dict` still remain: Move an existing *key* to either end of an ordered dictionary. The item is moved to the right end if *last* is true (the default) or to the beginning if *last* is false. Raises :exc:`KeyError` if the *key* does - not exist: - - .. doctest:: + not exist:: >>> d = OrderedDict.fromkeys('abcde') >>> d.move_to_end('b') - >>> ''.join(d) + >>> ''.join(d.keys()) 'acdeb' >>> d.move_to_end('b', last=False) - >>> ''.join(d) + >>> ''.join(d.keys()) 'bacde' .. versionadded:: 3.2 From dc98276ec2c62cecef08ac660b5c470e30cd85f7 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 4 Jan 2022 11:20:24 -0600 Subject: [PATCH 08/13] Accumulate unsquared denominators --- Lib/statistics.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index c0331f7af66401..01836524e0fb64 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -799,8 +799,7 @@ def _ss(data, c=None): for n, d in map(_exact_ratio, values): count += 1 sx_partials[d] = sx_partials_get(d, 0) + n - dd = d * d - sxx_partials[dd] = sxx_partials_get(dd, 0) + n * n + sxx_partials[d] = sxx_partials_get(d, 0) + n * n if not count: total = Fraction(0) elif None in sx_partials: @@ -810,7 +809,7 @@ def _ss(data, c=None): assert not _isfinite(total) else: sx = sum(Fraction(n, d) for d, n in sx_partials.items()) - sxx = sum(Fraction(n, d) for d, n in sxx_partials.items()) + sxx = sum(Fraction(n, d*d) for d, n in sxx_partials.items()) # This formula is has poor numeric properties for floats, # but with fractions it is exact. total = (count * sxx - sx * sx) / count From 6b2e8ca128bc590c380d16660a7d23ce6dc6210f Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 4 Jan 2022 09:25:51 -0800 Subject: [PATCH 09/13] Update Lib/statistics.py Co-authored-by: Tim Peters --- Lib/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 01836524e0fb64..f02409e0806312 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -810,7 +810,7 @@ def _ss(data, c=None): else: sx = sum(Fraction(n, d) for d, n in sx_partials.items()) sxx = sum(Fraction(n, d*d) for d, n in sxx_partials.items()) - # This formula is has poor numeric properties for floats, + # This formula has poor numeric properties for floats, # but with fractions it is exact. total = (count * sxx - sx * sx) / count return (T, total, count) From ae382ffa0d59ba408173fa89e59ced2a721e05f2 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 4 Jan 2022 12:54:37 -0600 Subject: [PATCH 10/13] Make mean() single pass over iterators --- Lib/statistics.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 01836524e0fb64..4a0eb5219b378a 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -399,13 +399,9 @@ def mean(data): If ``data`` is empty, StatisticsError will be raised. """ - if iter(data) is data: - data = list(data) - n = len(data) + T, total, n = _sum(data) if n < 1: raise StatisticsError('mean requires at least one data point') - T, total, count = _sum(data) - assert count == n return _convert(total / n, T) From bbe65586b39c4b695dd0454ebf2d1097d7320305 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 4 Jan 2022 13:08:17 -0600 Subject: [PATCH 11/13] Update blurb to cover mean(). --- .../next/Library/2022-01-04-11-04-20.bpo-46257._o2ADe.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2022-01-04-11-04-20.bpo-46257._o2ADe.rst b/Misc/NEWS.d/next/Library/2022-01-04-11-04-20.bpo-46257._o2ADe.rst index 6ccf96ed706043..72ae56ec412a6d 100644 --- a/Misc/NEWS.d/next/Library/2022-01-04-11-04-20.bpo-46257._o2ADe.rst +++ b/Misc/NEWS.d/next/Library/2022-01-04-11-04-20.bpo-46257._o2ADe.rst @@ -1,4 +1,4 @@ -Optimized the variance and stdev functions in the statistics module. If the -input is an iterator, it is consumed in a single pass rather than eating -memory by conversion to a list. The single pass algorithm is about twice as -fast as the previous two pass code. +Optimized the mean, variance, and stdev functions in the statistics module. +If the input is an iterator, it is consumed in a single pass rather than +eating memory by conversion to a list. The single pass algorithm is about +twice as fast as the previous two pass code. From 326bce8fe3e8d70308a68d08593f10c84f4abf9c Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 4 Jan 2022 13:14:15 -0600 Subject: [PATCH 12/13] Move _ss() into the private utilities section. --- Lib/statistics.py | 77 ++++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 4c82b37bf58208..9aee21517223ae 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -202,6 +202,45 @@ def _sum(data): return (T, total, count) +def _ss(data, c=None): + """Return sum of square deviations of sequence data. + + If ``c`` is None, the mean is calculated in one pass, and the deviations + from the mean are calculated in a second pass. Otherwise, deviations are + calculated from ``c`` as given. Use the second case with care, as it can + lead to garbage results. + """ + if c is not None: + T, total, count = _sum((d := x - c) * d for x in data) + return (T, total, count) + count = 0 + sx_partials = {} + sx_partials_get = sx_partials.get + sxx_partials = {} + sxx_partials_get = sxx_partials.get + T = int + for typ, values in groupby(data, type): + T = _coerce(T, typ) # or raise TypeError + for n, d in map(_exact_ratio, values): + count += 1 + sx_partials[d] = sx_partials_get(d, 0) + n + sxx_partials[d] = sxx_partials_get(d, 0) + n * n + if not count: + total = Fraction(0) + elif None in sx_partials: + # The sum will be a NAN or INF. We can ignore all the finite + # partials, and just look at this special one. + total = sx_partials[None] + assert not _isfinite(total) + else: + sx = sum(Fraction(n, d) for d, n in sx_partials.items()) + sxx = sum(Fraction(n, d*d) for d, n in sxx_partials.items()) + # This formula has poor numeric properties for floats, + # but with fractions it is exact. + total = (count * sxx - sx * sx) / count + return (T, total, count) + + def _isfinite(x): try: return x.is_finite() # Likely a Decimal. @@ -773,44 +812,6 @@ def quantiles(data, *, n=4, method='exclusive'): # See http://mathworld.wolfram.com/Variance.html # http://mathworld.wolfram.com/SampleVariance.html -def _ss(data, c=None): - """Return sum of square deviations of sequence data. - - If ``c`` is None, the mean is calculated in one pass, and the deviations - from the mean are calculated in a second pass. Otherwise, deviations are - calculated from ``c`` as given. Use the second case with care, as it can - lead to garbage results. - """ - if c is not None: - T, total, count = _sum((d := x - c) * d for x in data) - return (T, total, count) - count = 0 - sx_partials = {} - sx_partials_get = sx_partials.get - sxx_partials = {} - sxx_partials_get = sxx_partials.get - T = int - for typ, values in groupby(data, type): - T = _coerce(T, typ) # or raise TypeError - for n, d in map(_exact_ratio, values): - count += 1 - sx_partials[d] = sx_partials_get(d, 0) + n - sxx_partials[d] = sxx_partials_get(d, 0) + n * n - if not count: - total = Fraction(0) - elif None in sx_partials: - # The sum will be a NAN or INF. We can ignore all the finite - # partials, and just look at this special one. - total = sx_partials[None] - assert not _isfinite(total) - else: - sx = sum(Fraction(n, d) for d, n in sx_partials.items()) - sxx = sum(Fraction(n, d*d) for d, n in sxx_partials.items()) - # This formula has poor numeric properties for floats, - # but with fractions it is exact. - total = (count * sxx - sx * sx) / count - return (T, total, count) - def variance(data, xbar=None): """Return the sample variance of data. From 208abcd8f1726646f8d86306616b0db802d8064c Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 4 Jan 2022 21:41:08 -0600 Subject: [PATCH 13/13] Use defaultdict() instead of boundmethod --- Lib/statistics.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 9aee21517223ae..eef2453bc7394b 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -138,7 +138,7 @@ from bisect import bisect_left, bisect_right from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum from operator import mul -from collections import Counter, namedtuple +from collections import Counter, namedtuple, defaultdict _SQRT2 = sqrt(2.0) @@ -214,17 +214,15 @@ def _ss(data, c=None): T, total, count = _sum((d := x - c) * d for x in data) return (T, total, count) count = 0 - sx_partials = {} - sx_partials_get = sx_partials.get - sxx_partials = {} - sxx_partials_get = sxx_partials.get + sx_partials = defaultdict(int) + sxx_partials = defaultdict(int) T = int for typ, values in groupby(data, type): T = _coerce(T, typ) # or raise TypeError for n, d in map(_exact_ratio, values): count += 1 - sx_partials[d] = sx_partials_get(d, 0) + n - sxx_partials[d] = sxx_partials_get(d, 0) + n * n + sx_partials[d] += n + sxx_partials[d] += n * n if not count: total = Fraction(0) elif None in sx_partials: