From 2a5841d98e36089c8f132555eba2527c49209570 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Thu, 22 Feb 2024 23:42:41 -0600 Subject: [PATCH 01/19] Add docs and code for kde() --- Doc/library/statistics.rst | 53 ++++++++++++- Lib/statistics.py | 155 ++++++++++++++++++++++++++++++++++++- 2 files changed, 204 insertions(+), 4 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 0417b3f38a9807..9b0bdb253403d4 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -82,7 +82,6 @@ or sample. :func:`median_grouped` Median, or 50th percentile, of grouped data. :func:`mode` Single mode (most common value) of discrete or nominal data. :func:`multimode` List of modes (most common values) of discrete or nominal data. -:func:`quantiles` Divide data into intervals with equal probability. ======================= =============================================================== Measures of spread @@ -91,12 +90,14 @@ Measures of spread These functions calculate a measure of how much the population or sample tends to deviate from the typical or average values. -======================= ============================================= +======================= ===================================================== +:func:`kde` Estimate the probability density distributionof data. :func:`pstdev` Population standard deviation of data. :func:`pvariance` Population variance of data. +:func:`quantiles` Divide data into intervals with equal probability. :func:`stdev` Sample standard deviation of data. :func:`variance` Sample variance of data. -======================= ============================================= +======================= ===================================================== Statistics for relations between two inputs ------------------------------------------- @@ -501,6 +502,52 @@ However, for reading convenience, most of the examples show sorted sequences. of the population variance. +.. function:: kde(data, h, kernel='normal') + + `Kernel Density Estimation (KDE) + https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf`_ + creates an estimated continuous probability density function from *data* + containing a fixed number of discrete samples. + + The basic idea is to smooth the data using `a kernel function such as a + normal distribution, triangular distribution, or uniform distribution + `_. + + The degree of smoothing is controlled by the scaling parameter *h* + which is called the bandwidth. Smaller values emphasize local + features while larger values give smoother results. + + The *kernel* determines the relative weights of the sample data + points. Generally, the choice of kernel shape does not matter + as much as the more influential bandwidth smoothing parameter. + + Kernels that give some weight to every sample point include: + normal or gauss, logistic, and sigmoid. + + Kernels that only give weight to sample points within the bandwidth + include rectangular or uniform, triangular, parabolic or epanechnikov, + quartic or biweight, triweight, and cosine. + + `Wikipedia has an example + `_ + where we can use :func:`kde` to generate and plot a probability + density function estimated from a small sample: + + .. doctest:: + + >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2] + >>> f_hat = kde(sample, h=1.5) + >>> xarr = [i/100 for i in range(-750, 1100)] + >>> yarr = [f_hat(x) for x in xarr] + + The points in ``xarr`` and ``yarr`` can be used to make a PDF plot: + + .. image:: kde_example.png + :alt: Scatter plot of the estimated probability density function. + + .. versionadded:: 3.13 + + .. function:: stdev(data, xbar=None) Return the sample standard deviation (the square root of the sample diff --git a/Lib/statistics.py b/Lib/statistics.py index 83aaedb04515e0..73b0ac95860d75 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -137,7 +137,7 @@ from itertools import count, groupby, repeat from bisect import bisect_left, bisect_right from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum, sumprod -from math import isfinite, isinf +from math import isfinite, isinf, pi, cos, cosh from functools import reduce from operator import itemgetter from collections import Counter, namedtuple, defaultdict @@ -802,6 +802,159 @@ def multimode(data): return [value for value, count in counts.items() if count == maxcount] +def kde(data, h, kernel='normal'): + """Kernel Density Estimation: Create a continuous probability + density function from discrete samples. + + The basic idea is to smooth the data using a kernel function + to help draw inferences about a population from a sample. + + The degree of smoothing is controlled by the scaling parameter h + which is called the bandwidth. Smaller values emphasize local + features while larger values give smoother results. + + Kernels that give some weight to every sample point: + + normal or gauss + logistic + sigmoid + + Kernels that only give weight to sample points within + the bandwidth: + + rectangular or uniform + triangular + parabolic or epanechnikov + quartic or biweight + triweight + cosine + + Example + ------- + + Given a sample of six data points, construct a continuous + function that estimates the underlying probability density: + + >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2] + >>> f_hat = kde(sample, h=1.5) + + Compute the area under the curve: + + >>> sum(f_hat(x) for x in range(-20, 20)) + 1.0 + + Plot the estimated probability density function at + evenly spaced points from -6 to 10: + + >>> for x in range(-6, 11): + ... density = f_hat(x) + ... plot = ' ' * int(density * 400) + 'x' + ... print(f'{x:2}: {density:.3f} {plot}') + ... + -6: 0.002 x + -5: 0.009 x + -4: 0.031 x + -3: 0.070 x + -2: 0.111 x + -1: 0.125 x + 0: 0.110 x + 1: 0.086 x + 2: 0.068 x + 3: 0.059 x + 4: 0.066 x + 5: 0.082 x + 6: 0.082 x + 7: 0.058 x + 8: 0.028 x + 9: 0.009 x + 10: 0.002 x + + References + ---------- + + Kernel density estimation and its application: + https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf + + Kernel functions in common use: + https://en.wikipedia.org/wiki/Kernel_(statistics)#kernel_functions_in_common_use + + Interactive graphical demonstration and exploration: + https://demonstrations.wolfram.com/KernelDensityEstimation/ + + """ + + match kernel: + + case 'normal' | 'gauss': + c = 1 / sqrt(2 * pi) + K = lambda t: c * exp(-1/2 * t * t) + support = None + + case 'logistic': + # 1.0 / (exp(t) + 2.0 + exp(-t)) + K = lambda t: 1/2 / (1.0 + cosh(t)) + support = None + + case 'sigmoid': + # (2/pi) / (exp(t) + exp(-t)) + c = 1 / pi + K = lambda t: c / cosh(t) + support = None + + case 'rectangular' | 'uniform': + K = lambda t: 1/2 + support = 1.0 + + case 'triangular': + K = lambda t: 1.0 - abs(t) + support = 1.0 + + case 'parabolic' | 'epanechnikov': + K = lambda t: 3/4 * (1.0 - t * t) + support = 1.0 + + case 'quartic' | 'biweight': + K = lambda t: 15/16 * (1.0 - t * t) ** 2 + support = 1.0 + + case 'triweight': + K = lambda t: 35/32 * (1.0 - t * t) ** 3 + support = 1.0 + + case 'cosine': + c1 = pi / 4 + c2 = pi / 2 + K = lambda t: c1 * cos(c2 * t) + support = 1.0 + + case _: + raise ValueError(f'Unknown kernel name: {kernel!r}') + + n = len(data) + if not n: + raise StatisticsError('Empty data sequence') + + if support is None: + + def pdf(x): + return sum(K((x - x_i) / h) for x_i in data) / (n * h) + + else: + + sample = sorted(data) + bandwidth = h * support + + def pdf(x): + i = bisect_left(sample, x - bandwidth) + j = bisect_right(sample, x + bandwidth) + supported = sample[i : j] + return sum(K((x - x_i) / h) for x_i in supported) / (n * h) + + pdf.__doc__ = f'PDF estimate with the {kernel!r} kernel and bandwidth {h}.' + + return pdf + + # Notes on methods for computing quantiles # ---------------------------------------- # From 940795f4979f45d978e79966003b329157f39b9b Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 23 Feb 2024 11:06:21 -0600 Subject: [PATCH 02/19] Alphabetize the function order --- Doc/library/statistics.rst | 103 +++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 50 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 9b0bdb253403d4..778c70ed254d6d 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -76,12 +76,14 @@ or sample. :func:`fmean` Fast, floating point arithmetic mean, with optional weighting. :func:`geometric_mean` Geometric mean of data. :func:`harmonic_mean` Harmonic mean of data. +:func:`kde` Estimate the probability density distribution of the data. :func:`median` Median (middle value) of data. :func:`median_low` Low median of data. :func:`median_high` High median of data. :func:`median_grouped` Median, or 50th percentile, of grouped data. :func:`mode` Single mode (most common value) of discrete or nominal data. :func:`multimode` List of modes (most common values) of discrete or nominal data. +:func:`quantiles` Divide data into intervals with equal probability. ======================= =============================================================== Measures of spread @@ -90,14 +92,12 @@ Measures of spread These functions calculate a measure of how much the population or sample tends to deviate from the typical or average values. -======================= ===================================================== -:func:`kde` Estimate the probability density distributionof data. +======================= ============================================= :func:`pstdev` Population standard deviation of data. :func:`pvariance` Population variance of data. -:func:`quantiles` Divide data into intervals with equal probability. :func:`stdev` Sample standard deviation of data. :func:`variance` Sample variance of data. -======================= ===================================================== +======================= ============================================= Statistics for relations between two inputs ------------------------------------------- @@ -260,6 +260,55 @@ However, for reading convenience, most of the examples show sorted sequences. .. versionchanged:: 3.10 Added support for *weights*. + +.. function:: kde(data, h, kernel='normal') + + `Kernel Density Estimation (KDE) + https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf`_ + creates an estimated continuous probability density function from *data* + containing a fixed number of discrete samples. + + The basic idea is to smooth the data using `a kernel function such as a + normal distribution, triangular distribution, or uniform distribution + `_. + + The degree of smoothing is controlled by the scaling parameter *h* + which is called the bandwidth. Smaller values emphasize local + features while larger values give smoother results. + + The *kernel* determines the relative weights of the sample data + points. Generally, the choice of kernel shape does not matter + as much as the more influential bandwidth smoothing parameter. + + Kernels that give some weight to every sample point include: + normal or gauss, logistic, and sigmoid. + + Kernels that only give weight to sample points within the bandwidth + include rectangular or uniform, triangular, parabolic or epanechnikov, + quartic or biweight, triweight, and cosine. + + A :exc:`StatisticsError` will be raised if the *data* sequence is empty. + + `Wikipedia has an example + `_ + where we can use :func:`kde` to generate and plot a probability + density function estimated from a small sample: + + .. doctest:: + + >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2] + >>> f_hat = kde(sample, h=1.5) + >>> xarr = [i/100 for i in range(-750, 1100)] + >>> yarr = [f_hat(x) for x in xarr] + + The points in ``xarr`` and ``yarr`` can be used to make a PDF plot: + + .. image:: kde_example.png + :alt: Scatter plot of the estimated probability density function. + + .. versionadded:: 3.13 + + .. function:: median(data) Return the median (middle value) of numeric data, using the common "mean of @@ -502,52 +551,6 @@ However, for reading convenience, most of the examples show sorted sequences. of the population variance. -.. function:: kde(data, h, kernel='normal') - - `Kernel Density Estimation (KDE) - https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf`_ - creates an estimated continuous probability density function from *data* - containing a fixed number of discrete samples. - - The basic idea is to smooth the data using `a kernel function such as a - normal distribution, triangular distribution, or uniform distribution - `_. - - The degree of smoothing is controlled by the scaling parameter *h* - which is called the bandwidth. Smaller values emphasize local - features while larger values give smoother results. - - The *kernel* determines the relative weights of the sample data - points. Generally, the choice of kernel shape does not matter - as much as the more influential bandwidth smoothing parameter. - - Kernels that give some weight to every sample point include: - normal or gauss, logistic, and sigmoid. - - Kernels that only give weight to sample points within the bandwidth - include rectangular or uniform, triangular, parabolic or epanechnikov, - quartic or biweight, triweight, and cosine. - - `Wikipedia has an example - `_ - where we can use :func:`kde` to generate and plot a probability - density function estimated from a small sample: - - .. doctest:: - - >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2] - >>> f_hat = kde(sample, h=1.5) - >>> xarr = [i/100 for i in range(-750, 1100)] - >>> yarr = [f_hat(x) for x in xarr] - - The points in ``xarr`` and ``yarr`` can be used to make a PDF plot: - - .. image:: kde_example.png - :alt: Scatter plot of the estimated probability density function. - - .. versionadded:: 3.13 - - .. function:: stdev(data, xbar=None) Return the sample standard deviation (the square root of the sample From 482c9c11d79e291248a147937d60d5b838a40d9d Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 23 Feb 2024 11:09:29 -0600 Subject: [PATCH 03/19] Add blurb --- .../next/Library/2024-02-23-11-08-31.gh-issue-115532.zVd3gK.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2024-02-23-11-08-31.gh-issue-115532.zVd3gK.rst diff --git a/Misc/NEWS.d/next/Library/2024-02-23-11-08-31.gh-issue-115532.zVd3gK.rst b/Misc/NEWS.d/next/Library/2024-02-23-11-08-31.gh-issue-115532.zVd3gK.rst new file mode 100644 index 00000000000000..fb36c0b2a4fabd --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-02-23-11-08-31.gh-issue-115532.zVd3gK.rst @@ -0,0 +1 @@ +Add kernel density estimation to the statistics module. From e9386edcf8d7631428a55327671cb045dc4a5386 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 23 Feb 2024 11:30:39 -0600 Subject: [PATCH 04/19] Add PDF area test --- Lib/test/test_statistics.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index bf2c254c9ee7d9..1d4a94045671f7 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2353,6 +2353,30 @@ def test_mixed_int_and_float(self): self.assertAlmostEqual(actual_mean, expected_mean, places=5) +class TestKDE(unittest.TestCase): + + def test_kde_area(self): + # The approximate integral of a PDF should be close to 1.0 + + sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2] + + def integrate(func, low, high, steps=10_000): + "Numeric approximation of a definite function integral." + width = (high - low) / steps + xarr = (low + width/2 + width * i for i in range(steps)) + return sum(map(func, xarr)) * width + + kernels = ['normal', 'gauss', 'logistic', 'sigmoid', 'rectangular', + 'uniform', 'triangular', 'parabolic', 'epanechnikov', + 'quartic', 'biweight', 'triweight', 'cosine'] + + for kernel in kernels: + with self.subTest(kernel=kernel): + f_hat = statistics.kde(sample, h=1.5, kernel=kernel) + area = integrate(f_hat, -20, 20) + self.assertAlmostEqual(area, 1.0, places=4) + + class TestQuantiles(unittest.TestCase): def test_specific_cases(self): From c3ddb1e08bf7876577003cc8df7f5cb23246ae27 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 23 Feb 2024 11:48:00 -0600 Subject: [PATCH 05/19] Add tests --- Lib/statistics.py | 11 +++++++---- Lib/test/test_statistics.py | 33 ++++++++++++++++++++++++++------- 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 73b0ac95860d75..6dc26fbe5114cf 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -883,6 +883,13 @@ def kde(data, h, kernel='normal'): """ + n = len(data) + if not n: + raise StatisticsError('Empty data sequence') + + if h <= 0.0: + raise StatisticsError(f'Bandwidth h must be positive, not {h=}') + match kernel: case 'normal' | 'gauss': @@ -930,10 +937,6 @@ def kde(data, h, kernel='normal'): case _: raise ValueError(f'Unknown kernel name: {kernel!r}') - n = len(data) - if not n: - raise StatisticsError('Empty data sequence') - if support is None: def pdf(x): diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 1d4a94045671f7..e0e7ca5cf1c806 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2355,27 +2355,46 @@ def test_mixed_int_and_float(self): class TestKDE(unittest.TestCase): - def test_kde_area(self): - # The approximate integral of a PDF should be close to 1.0 + def test_kde(self): + kde = statistics.kde + StatisticsError = statistics.StatisticsError + + kernels = ['normal', 'gauss', 'logistic', 'sigmoid', 'rectangular', + 'uniform', 'triangular', 'parabolic', 'epanechnikov', + 'quartic', 'biweight', 'triweight', 'cosine'] sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2] + # The approximate integral of a PDF should be close to 1.0 + def integrate(func, low, high, steps=10_000): "Numeric approximation of a definite function integral." width = (high - low) / steps xarr = (low + width/2 + width * i for i in range(steps)) return sum(map(func, xarr)) * width - kernels = ['normal', 'gauss', 'logistic', 'sigmoid', 'rectangular', - 'uniform', 'triangular', 'parabolic', 'epanechnikov', - 'quartic', 'biweight', 'triweight', 'cosine'] - for kernel in kernels: with self.subTest(kernel=kernel): - f_hat = statistics.kde(sample, h=1.5, kernel=kernel) + f_hat = kde(sample, h=1.5, kernel=kernel) area = integrate(f_hat, -20, 20) self.assertAlmostEqual(area, 1.0, places=4) + # Check error cases + + with self.assertRaises(StatisticsError): + kde([], h=1.0) # Empty dataset + f_hat = kde(['abc', 'def'], 1.5) # Non-numeric data + with self.assertRaises(TypeError): + f_hat(0) + with self.assertRaises(StatisticsError): + kde(sample, h=0.0) # Zero bandwidth + with self.assertRaises(StatisticsError): + kde(sample, h=0.0) # Negative bandwidth + with self.assertRaises(TypeError): + kde(sample, h='str') # Wrong bandwidth type + with self.assertRaises(ValueError): + kde(sample, h=1.0, kernel='bogus') # Invalid kernel + class TestQuantiles(unittest.TestCase): From a2771cbc22e108a9763cd01ed72ca31b06026335 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 23 Feb 2024 11:58:20 -0600 Subject: [PATCH 06/19] Early test for non-numeric data. Tests for name and docstring --- Lib/statistics.py | 3 +++ Lib/test/test_statistics.py | 11 +++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 6dc26fbe5114cf..8ee756ead2a9fb 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -887,6 +887,9 @@ def kde(data, h, kernel='normal'): if not n: raise StatisticsError('Empty data sequence') + if not isinstance(data[0], (int, float)): + raise TypeError('Data sequence must contain ints or floats') + if h <= 0.0: raise StatisticsError(f'Bandwidth h must be positive, not {h=}') diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index e0e7ca5cf1c806..827bbcce26fdb6 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2383,9 +2383,8 @@ def integrate(func, low, high, steps=10_000): with self.assertRaises(StatisticsError): kde([], h=1.0) # Empty dataset - f_hat = kde(['abc', 'def'], 1.5) # Non-numeric data with self.assertRaises(TypeError): - f_hat(0) + kde(['abc', 'def'], 1.5) # Non-numeric data with self.assertRaises(StatisticsError): kde(sample, h=0.0) # Zero bandwidth with self.assertRaises(StatisticsError): @@ -2395,6 +2394,14 @@ def integrate(func, low, high, steps=10_000): with self.assertRaises(ValueError): kde(sample, h=1.0, kernel='bogus') # Invalid kernel + # Test name and docstring + h = 1.5 + kernel = 'cosine' + f_hat = kde(sample, h, kernel) + self.assertEqual(f_hat.__name__, 'pdf') + self.assertIn(kernel, f_hat.__doc__) + self.assertIn(str(h), f_hat.__doc__) + class TestQuantiles(unittest.TestCase): From e29d64f8c41a953330d757eab2dd8b8b4b814f74 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 23 Feb 2024 12:01:20 -0600 Subject: [PATCH 07/19] Use StatisticsError for invalid kernels --- Lib/statistics.py | 2 +- Lib/test/test_statistics.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 8ee756ead2a9fb..c4a29e9c2978b7 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -938,7 +938,7 @@ def kde(data, h, kernel='normal'): support = 1.0 case _: - raise ValueError(f'Unknown kernel name: {kernel!r}') + raise StatisticsError(f'Unknown kernel name: {kernel!r}') if support is None: diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 827bbcce26fdb6..b92bbfc746b0c0 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2391,10 +2391,11 @@ def integrate(func, low, high, steps=10_000): kde(sample, h=0.0) # Negative bandwidth with self.assertRaises(TypeError): kde(sample, h='str') # Wrong bandwidth type - with self.assertRaises(ValueError): + with self.assertRaises(StatisticsError): kde(sample, h=1.0, kernel='bogus') # Invalid kernel - # Test name and docstring + # Test name and docstring of the generated function + h = 1.5 kernel = 'cosine' f_hat = kde(sample, h, kernel) From 5d8bab08b77a3707656890598d6917ab19500694 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 23 Feb 2024 12:02:00 -0600 Subject: [PATCH 08/19] . --- Lib/test/test_statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index b92bbfc746b0c0..014582428730b5 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2391,7 +2391,7 @@ def integrate(func, low, high, steps=10_000): kde(sample, h=0.0) # Negative bandwidth with self.assertRaises(TypeError): kde(sample, h='str') # Wrong bandwidth type - with self.assertRaises(StatisticsError): + with self.assertRaises(Statisticsrror): kde(sample, h=1.0, kernel='bogus') # Invalid kernel # Test name and docstring of the generated function From 9e6aaa772d6f10e1a4994aa55b8458034d31c2b9 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 23 Feb 2024 12:14:45 -0600 Subject: [PATCH 09/19] Add kde() to __all__ --- Lib/statistics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/statistics.py b/Lib/statistics.py index c4a29e9c2978b7..bc879cfc0fece1 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -112,6 +112,7 @@ 'fmean', 'geometric_mean', 'harmonic_mean', + 'kde', 'linear_regression', 'mean', 'median', From c8b19d8965d44299dd04faeb6ae1d27e51d2f04a Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 23 Feb 2024 12:16:28 -0600 Subject: [PATCH 10/19] Add test for non-sequence input --- Lib/test/test_statistics.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 014582428730b5..f3e5ce3f11866e 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2385,13 +2385,15 @@ def integrate(func, low, high, steps=10_000): kde([], h=1.0) # Empty dataset with self.assertRaises(TypeError): kde(['abc', 'def'], 1.5) # Non-numeric data + with self.assertRaises(TypeError): + kde(iter(sample), 1.5) # Data is not a sequence with self.assertRaises(StatisticsError): kde(sample, h=0.0) # Zero bandwidth with self.assertRaises(StatisticsError): kde(sample, h=0.0) # Negative bandwidth with self.assertRaises(TypeError): kde(sample, h='str') # Wrong bandwidth type - with self.assertRaises(Statisticsrror): + with self.assertRaises(StatisticsError): kde(sample, h=1.0, kernel='bogus') # Invalid kernel # Test name and docstring of the generated function From d5714523a5ee17b15b3f6486645af27c4e0ce9b6 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 23 Feb 2024 12:27:04 -0600 Subject: [PATCH 11/19] Fix markup for external link --- Doc/library/statistics.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 778c70ed254d6d..f69cc1313082a0 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -264,7 +264,7 @@ However, for reading convenience, most of the examples show sorted sequences. .. function:: kde(data, h, kernel='normal') `Kernel Density Estimation (KDE) - https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf`_ + `_ creates an estimated continuous probability density function from *data* containing a fixed number of discrete samples. From ddc32b89a6257a1a9d2052f2f1d542481c662366 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 23 Feb 2024 15:11:19 -0600 Subject: [PATCH 12/19] Remove outdated KDE recipe --- Doc/library/statistics.rst | 40 -------------------------------------- 1 file changed, 40 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index f69cc1313082a0..d702c453b08f97 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -1145,46 +1145,6 @@ The final prediction goes to the largest posterior. This is known as the 'female' -Kernel density estimation -************************* - -It is possible to estimate a continuous probability density function -from a fixed number of discrete samples. - -The basic idea is to smooth the data using `a kernel function such as a -normal distribution, triangular distribution, or uniform distribution -`_. -The degree of smoothing is controlled by a scaling parameter, ``h``, -which is called the *bandwidth*. - -.. testcode:: - - def kde_normal(sample, h): - "Create a continuous probability density function from a sample." - # Smooth the sample with a normal distribution kernel scaled by h. - kernel_h = NormalDist(0.0, h).pdf - n = len(sample) - def pdf(x): - return sum(kernel_h(x - x_i) for x_i in sample) / n - return pdf - -`Wikipedia has an example -`_ -where we can use the ``kde_normal()`` recipe to generate and plot -a probability density function estimated from a small sample: - -.. doctest:: - - >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2] - >>> f_hat = kde_normal(sample, h=1.5) - >>> xarr = [i/100 for i in range(-750, 1100)] - >>> yarr = [f_hat(x) for x in xarr] - -The points in ``xarr`` and ``yarr`` can be used to make a PDF plot: - -.. image:: kde_example.png - :alt: Scatter plot of the estimated probability density function. - .. # This modelines must appear within the last ten lines of the file. kate: indent-width 3; remove-trailing-space on; replace-tabs on; encoding utf-8; From 5ac105514b216517b557bc11b7c91b85b91df26e Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sat, 24 Feb 2024 09:12:24 -0600 Subject: [PATCH 13/19] Improve variable names in integration using the midpoint rule. --- Lib/test/test_statistics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index f3e5ce3f11866e..b3a1fc04acdc46 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2369,9 +2369,9 @@ def test_kde(self): def integrate(func, low, high, steps=10_000): "Numeric approximation of a definite function integral." - width = (high - low) / steps - xarr = (low + width/2 + width * i for i in range(steps)) - return sum(map(func, xarr)) * width + dx = (high - low) / steps + midpoints = (low + (i + 1/2) * dx for i in range(steps)) + return sum(map(func, midpoints)) * dx for kernel in kernels: with self.subTest(kernel=kernel): From 24e38e2020c35a4da6c6462bcfbff2fe25c1e724 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sat, 24 Feb 2024 09:18:50 -0600 Subject: [PATCH 14/19] Improve appearance of generated docstring --- Lib/statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index bc879cfc0fece1..fa3609565cddec 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -892,7 +892,7 @@ def kde(data, h, kernel='normal'): raise TypeError('Data sequence must contain ints or floats') if h <= 0.0: - raise StatisticsError(f'Bandwidth h must be positive, not {h=}') + raise StatisticsError(f'Bandwidth h must be positive, not {h=!r}') match kernel: @@ -957,7 +957,7 @@ def pdf(x): supported = sample[i : j] return sum(K((x - x_i) / h) for x_i in supported) / (n * h) - pdf.__doc__ = f'PDF estimate with the {kernel!r} kernel and bandwidth {h}.' + pdf.__doc__ = f'PDF estimate with {kernel=!r} and {h=!r}' return pdf From e729127cbce5f7d4bca13f67a51f9a1983585bff Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sat, 24 Feb 2024 09:32:39 -0600 Subject: [PATCH 15/19] Put the kernel names in italics --- Doc/library/statistics.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index d702c453b08f97..43eddedbcdcfb3 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -280,12 +280,12 @@ However, for reading convenience, most of the examples show sorted sequences. points. Generally, the choice of kernel shape does not matter as much as the more influential bandwidth smoothing parameter. - Kernels that give some weight to every sample point include: - normal or gauss, logistic, and sigmoid. + Kernels that give some weight to every sample point include + *normal* or *gauss*, *logistic*, and *sigmoid*. Kernels that only give weight to sample points within the bandwidth - include rectangular or uniform, triangular, parabolic or epanechnikov, - quartic or biweight, triweight, and cosine. + include *rectangular* or *uniform*, *triangular*, *parabolic* or + *epanechnikov*, *quartic* or *biweight*, *triweight*, and *cosine*. A :exc:`StatisticsError` will be raised if the *data* sequence is empty. From e6d6e0fec23888d23aed5dd25f85d97e6694424d Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sat, 24 Feb 2024 09:39:49 -0600 Subject: [PATCH 16/19] Add a whatsnew entry --- Doc/whatsnew/3.13.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 7c6a2af28758be..a9e90b9782f600 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -460,6 +460,14 @@ sqlite3 for filtering database objects to dump. (Contributed by Mariusz Felisiak in :gh:`91602`.) +statistics +---------- + +* Add :func:`statistics.kde` for kernel density estimation. + This makes it possible to estimate a continuous probability density function + from a fixed number of discrete samples. + (Contributed by Raymond Hettinger in :gh:`115863`.) + subprocess ---------- From b3fd269c8accce9e8d5c5925d743dddaf0497036 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 25 Feb 2024 07:46:20 -0600 Subject: [PATCH 17/19] Add test for support interval boundary conditions --- Lib/test/test_statistics.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index b3a1fc04acdc46..1cf41638a7f01a 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2405,6 +2405,13 @@ def integrate(func, low, high, steps=10_000): self.assertIn(kernel, f_hat.__doc__) self.assertIn(str(h), f_hat.__doc__) + # Test closed interval for the support boundaries. + # In particular, 'uniform' should non-zero at the boundaries. + + f_hat = kde([0], 1.0, 'uniform') + self.assertEqual(f_hat(-1.0), 1/2) + self.assertEqual(f_hat(1.0), 1/2) + class TestQuantiles(unittest.TestCase): From 9dec3087eb271a54fd8eb8190b66ea467d028039 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 25 Feb 2024 08:04:24 -0600 Subject: [PATCH 18/19] Sync the docstring with the main docs. --- Doc/library/statistics.rst | 11 +++++------ Lib/statistics.py | 6 ++++++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 43eddedbcdcfb3..fb388a900bdbff 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -264,13 +264,12 @@ However, for reading convenience, most of the examples show sorted sequences. .. function:: kde(data, h, kernel='normal') `Kernel Density Estimation (KDE) - `_ - creates an estimated continuous probability density function from *data* - containing a fixed number of discrete samples. + `_: + Create a continuous probability density function from discrete samples. - The basic idea is to smooth the data using `a kernel function such as a - normal distribution, triangular distribution, or uniform distribution - `_. + The basic idea is to smooth the data using `a kernel function + Date: Sun, 25 Feb 2024 09:39:29 -0600 Subject: [PATCH 19/19] Fix missing angle bracket markup --- Doc/library/statistics.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index fb388a900bdbff..1785c6bcc212b7 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -268,7 +268,7 @@ However, for reading convenience, most of the examples show sorted sequences. Create a continuous probability density function from discrete samples. The basic idea is to smooth the data using `a kernel function - `_. to help draw inferences about a population from a sample. The degree of smoothing is controlled by the scaling parameter *h*