From 2c48b79d07ba0363a2cd02597f87b11bbbbdebcf Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 22 Aug 2023 10:52:46 -0500 Subject: [PATCH 1/5] Faster samples() --- Doc/library/statistics.rst | 5 +++++ Lib/statistics.py | 12 ++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 483ebea67f0c6d..29847379b33106 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -828,6 +828,11 @@ of applications in statistics. number generator. This is useful for creating reproducible results, even in a multi-threading context. + .. versionchanged:: 3.13 + + Switched to a faster algorithm. To reproduce samples from previous + versions, use :func:`random.seed` and :func`random.gauss`. + .. method:: NormalDist.pdf(x) Using a `probability density function (pdf) diff --git a/Lib/statistics.py b/Lib/statistics.py index a8036e9928c464..8caa3ec77a1ec7 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -134,7 +134,7 @@ from fractions import Fraction from decimal import Decimal -from itertools import count, groupby, repeat +from itertools import count, groupby, repeat, starmap from bisect import bisect_left, bisect_right from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum, sumprod from math import isfinite, isinf @@ -1135,7 +1135,7 @@ def linear_regression(x, y, /, *, proportional=False): >>> noise = NormalDist().samples(5, seed=42) >>> y = [3 * x[i] + 2 + noise[i] for i in range(5)] >>> linear_regression(x, y) #doctest: +ELLIPSIS - LinearRegression(slope=3.09078914170..., intercept=1.75684970486...) + LinearRegression(slope=3.17495..., intercept=1.00925...) If *proportional* is true, the independent variable *x* and the dependent variable *y* are assumed to be directly proportional. 
@@ -1148,7 +1148,7 @@ def linear_regression(x, y, /, *, proportional=False): >>> y = [3 * x[i] + noise[i] for i in range(5)] >>> linear_regression(x, y, proportional=True) #doctest: +ELLIPSIS - LinearRegression(slope=3.02447542484..., intercept=0.0) + LinearRegression(slope=2.90475..., intercept=0.0) """ n = len(x) @@ -1279,9 +1279,9 @@ def from_samples(cls, data): def samples(self, n, *, seed=None): "Generate *n* samples for a given mean and standard deviation." - gauss = random.gauss if seed is None else random.Random(seed).gauss - mu, sigma = self._mu, self._sigma - return [gauss(mu, sigma) for _ in repeat(None, n)] + rnd = random.random if seed is None else random.Random(seed).random + inv_cdf = self.inv_cdf + return [inv_cdf(rnd()) for _ in repeat(None, n)] def pdf(self, x): "Probability density function. P(x <= X < x+dx) / dx" From 7154b7afc45e5f14f5170856ac2d3021e63ab2a2 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 22 Aug 2023 10:56:46 -0500 Subject: [PATCH 2/5] Even faster samples() --- Lib/statistics.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 8caa3ec77a1ec7..b199dc8ef35e1e 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1280,8 +1280,10 @@ def from_samples(cls, data): def samples(self, n, *, seed=None): "Generate *n* samples for a given mean and standard deviation." rnd = random.random if seed is None else random.Random(seed).random - inv_cdf = self.inv_cdf - return [inv_cdf(rnd()) for _ in repeat(None, n)] + inv_cdf = _normal_dist_inv_cdf + mu = self._mu + sigma = self._sigma + return [inv_cdf(rnd(), mu, sigma) for _ in repeat(None, n)] def pdf(self, x): "Probability density function. 
P(x <= X < x+dx) / dx" From 0ca8252d7dcf821d49ddfbd67d1de9a1de2fd944 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 22 Aug 2023 12:06:00 -0500 Subject: [PATCH 3/5] Add blurb --- .../next/Library/2023-08-22-12-05-47.gh-issue-108322.kf3NJX.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2023-08-22-12-05-47.gh-issue-108322.kf3NJX.rst diff --git a/Misc/NEWS.d/next/Library/2023-08-22-12-05-47.gh-issue-108322.kf3NJX.rst b/Misc/NEWS.d/next/Library/2023-08-22-12-05-47.gh-issue-108322.kf3NJX.rst new file mode 100644 index 00000000000000..5416c01a43f113 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-08-22-12-05-47.gh-issue-108322.kf3NJX.rst @@ -0,0 +1,2 @@ +Speed up NormalDist.samples() by using the inverse CDF method instead of +calling random.gauss(). From 086d2adc8e8eda55da6832ead6e93da4ec3f2448 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 22 Aug 2023 12:12:00 -0500 Subject: [PATCH 4/5] Remove unused import --- Lib/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index b199dc8ef35e1e..96c803483057e7 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -134,7 +134,7 @@ from fractions import Fraction from decimal import Decimal -from itertools import count, groupby, repeat, starmap +from itertools import count, groupby, repeat from bisect import bisect_left, bisect_right from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum, sumprod from math import isfinite, isinf From ca71c8fbeaf42cf67665a68478d8b34b8bbee088 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 22 Aug 2023 12:18:39 -0500 Subject: [PATCH 5/5] Fix markup --- Doc/library/statistics.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 29847379b33106..368b2a17cef997 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -831,7 +831,7 @@ of applications in
statistics. .. versionchanged:: 3.13 Switched to a faster algorithm. To reproduce samples from previous - versions, use :func:`random.seed` and :func`random.gauss`. + versions, use :func:`random.seed` and :func:`random.gauss`. .. method:: NormalDist.pdf(x)