From 2a5841d98e36089c8f132555eba2527c49209570 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Thu, 22 Feb 2024 23:42:41 -0600
Subject: [PATCH 01/19] Add docs and code for kde()

---
 Doc/library/statistics.rst |  53 ++++++++++++-
 Lib/statistics.py          | 155 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 204 insertions(+), 4 deletions(-)

diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst
index 0417b3f38a9807..9b0bdb253403d4 100644
--- a/Doc/library/statistics.rst
+++ b/Doc/library/statistics.rst
@@ -82,7 +82,6 @@ or sample.
 :func:`median_grouped`   Median, or 50th percentile, of grouped data.
 :func:`mode`             Single mode (most common value) of discrete or nominal data.
 :func:`multimode`        List of modes (most common values) of discrete or nominal data.
-:func:`quantiles`        Divide data into intervals with equal probability.
 =======================  ===============================================================
 
 Measures of spread
@@ -91,12 +90,14 @@ Measures of spread
 These functions calculate a measure of how much the population or sample
 tends to deviate from the typical or average values.
 
-=======================  =============================================
+=======================  =====================================================
+:func:`kde`              Estimate the probability density distributionof data.
 :func:`pstdev`           Population standard deviation of data.
 :func:`pvariance`        Population variance of data.
+:func:`quantiles`        Divide data into intervals with equal probability.
 :func:`stdev`            Sample standard deviation of data.
 :func:`variance`         Sample variance of data.
-=======================  =============================================
+=======================  =====================================================
 
 Statistics for relations between two inputs
 -------------------------------------------
@@ -501,6 +502,52 @@ However, for reading convenience, most of the examples show sorted sequences.
       of the population variance.
 
 
+.. function:: kde(data, h, kernel='normal')
+
+   `Kernel Density Estimation (KDE)
+   https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf`_
+   creates an estimated continuous probability density function from *data*
+   containing a fixed number of discrete samples.
+
+   The basic idea is to smooth the data using `a kernel function such as a
+   normal distribution, triangular distribution, or uniform distribution
+   <https://en.wikipedia.org/wiki/Kernel_(statistics)#Kernel_functions_in_common_use>`_.
+
+   The degree of smoothing is controlled by the scaling parameter *h*
+   which is called the bandwidth.  Smaller values emphasize local
+   features while larger values give smoother results.
+
+   The *kernel* determines the relative weights of the sample data
+   points.  Generally, the choice of kernel shape does not matter
+   as much as the more influential bandwidth smoothing parameter.
+
+   Kernels that give some weight to every sample point include:
+   normal or gauss, logistic, and sigmoid.
+
+   Kernels that only give weight to sample points within the bandwidth
+   include rectangular or uniform, triangular, parabolic or epanechnikov,
+   quartic or biweight, triweight, and cosine.
+
+   `Wikipedia has an example
+   <https://en.wikipedia.org/wiki/Kernel_density_estimation#Example>`_
+   where we can use :func:`kde` to generate and plot a probability
+   density function estimated from a small sample:
+
+   .. doctest::
+
+      >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
+      >>> f_hat = kde(sample, h=1.5)
+      >>> xarr = [i/100 for i in range(-750, 1100)]
+      >>> yarr = [f_hat(x) for x in xarr]
+
+   The points in ``xarr`` and ``yarr`` can be used to make a PDF plot:
+
+   .. image:: kde_example.png
+      :alt: Scatter plot of the estimated probability density function.
+
+   .. versionadded:: 3.13
+
+
 .. function:: stdev(data, xbar=None)
 
    Return the sample standard deviation (the square root of the sample
diff --git a/Lib/statistics.py b/Lib/statistics.py
index 83aaedb04515e0..73b0ac95860d75 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -137,7 +137,7 @@
 from itertools import count, groupby, repeat
 from bisect import bisect_left, bisect_right
 from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum, sumprod
-from math import isfinite, isinf
+from math import isfinite, isinf, pi, cos, cosh
 from functools import reduce
 from operator import itemgetter
 from collections import Counter, namedtuple, defaultdict
@@ -802,6 +802,159 @@ def multimode(data):
     return [value for value, count in counts.items() if count == maxcount]
 
 
+def kde(data, h, kernel='normal'):
+    """Kernel Density Estimation:  Create a continuous probability
+    density function from discrete samples.
+
+    The basic idea is to smooth the data using a kernel function
+    to help draw inferences about a population from a sample.
+
+    The degree of smoothing is controlled by the scaling parameter h
+    which is called the bandwidth.  Smaller values emphasize local
+    features while larger values give smoother results.
+
+    Kernels that give some weight to every sample point:
+
+       normal or gauss
+       logistic
+       sigmoid
+
+    Kernels that only give weight to sample points within
+    the bandwidth:
+
+       rectangular or uniform
+       triangular
+       parabolic or epanechnikov
+       quartic or biweight
+       triweight
+       cosine
+
+    Example
+    -------
+
+    Given a sample of six data points, construct a continuous
+    function that estimates the underlying probability density:
+
+        >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
+        >>> f_hat = kde(sample, h=1.5)
+
+    Compute the area under the curve:
+
+        >>> sum(f_hat(x) for x in range(-20, 20))
+        1.0
+
+    Plot the estimated probability density function at
+    evenly spaced points from -6 to 10:
+
+        >>> for x in range(-6, 11):
+        ...     density = f_hat(x)
+        ...     plot = ' ' * int(density * 400) + 'x'
+        ...     print(f'{x:2}: {density:.3f} {plot}')
+        ...
+        -6: 0.002 x
+        -5: 0.009    x
+        -4: 0.031             x
+        -3: 0.070                             x
+        -2: 0.111                                             x
+        -1: 0.125                                                   x
+         0: 0.110                                            x
+         1: 0.086                                   x
+         2: 0.068                            x
+         3: 0.059                        x
+         4: 0.066                           x
+         5: 0.082                                 x
+         6: 0.082                                 x
+         7: 0.058                        x
+         8: 0.028            x
+         9: 0.009    x
+        10: 0.002 x
+
+    References
+    ----------
+
+    Kernel density estimation and its application:
+    https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf
+
+    Kernel functions in common use:
+    https://en.wikipedia.org/wiki/Kernel_(statistics)#kernel_functions_in_common_use
+
+    Interactive graphical demonstration and exploration:
+    https://demonstrations.wolfram.com/KernelDensityEstimation/
+
+    """
+
+    match kernel:
+
+        case 'normal' | 'gauss':
+            c = 1 / sqrt(2 * pi)
+            K = lambda t: c * exp(-1/2 * t * t)
+            support = None
+
+        case 'logistic':
+            # 1.0 / (exp(t) + 2.0 + exp(-t))
+            K = lambda t: 1/2 / (1.0 + cosh(t))
+            support = None
+
+        case 'sigmoid':
+            # (2/pi) / (exp(t) + exp(-t))
+            c = 1 / pi
+            K = lambda t: c / cosh(t)
+            support = None
+
+        case 'rectangular' | 'uniform':
+            K = lambda t: 1/2
+            support = 1.0
+
+        case 'triangular':
+            K = lambda t: 1.0 - abs(t)
+            support = 1.0
+
+        case 'parabolic' | 'epanechnikov':
+            K = lambda t: 3/4 * (1.0 - t * t)
+            support = 1.0
+
+        case 'quartic' | 'biweight':
+            K = lambda t: 15/16 * (1.0 - t * t) ** 2
+            support = 1.0
+
+        case 'triweight':
+            K = lambda t: 35/32 * (1.0 - t * t) ** 3
+            support = 1.0
+
+        case 'cosine':
+            c1 = pi / 4
+            c2 = pi / 2
+            K = lambda t: c1 * cos(c2 * t)
+            support = 1.0
+
+        case _:
+            raise ValueError(f'Unknown kernel name: {kernel!r}')
+
+    n = len(data)
+    if not n:
+        raise StatisticsError('Empty data sequence')
+
+    if support is None:
+
+        def pdf(x):
+            return sum(K((x - x_i) / h) for x_i in data) / (n * h)
+
+    else:
+
+        sample = sorted(data)
+        bandwidth = h * support
+
+        def pdf(x):
+            i = bisect_left(sample, x - bandwidth)
+            j = bisect_right(sample, x + bandwidth)
+            supported = sample[i : j]
+            return sum(K((x - x_i) / h) for x_i in supported) / (n * h)
+
+    pdf.__doc__ = f'PDF estimate with the {kernel!r} kernel and bandwidth {h}.'
+
+    return pdf
+
+
 # Notes on methods for computing quantiles
 # ----------------------------------------
 #

From 940795f4979f45d978e79966003b329157f39b9b Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Fri, 23 Feb 2024 11:06:21 -0600
Subject: [PATCH 02/19] Alphabetize the function order

---
 Doc/library/statistics.rst | 103 +++++++++++++++++++------------------
 1 file changed, 53 insertions(+), 50 deletions(-)

diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst
index 9b0bdb253403d4..778c70ed254d6d 100644
--- a/Doc/library/statistics.rst
+++ b/Doc/library/statistics.rst
@@ -76,12 +76,14 @@ or sample.
 :func:`fmean`            Fast, floating point arithmetic mean, with optional weighting.
 :func:`geometric_mean`   Geometric mean of data.
 :func:`harmonic_mean`    Harmonic mean of data.
+:func:`kde`              Estimate the probability density distribution of the data.
 :func:`median`           Median (middle value) of data.
 :func:`median_low`       Low median of data.
 :func:`median_high`      High median of data.
 :func:`median_grouped`   Median, or 50th percentile, of grouped data.
 :func:`mode`             Single mode (most common value) of discrete or nominal data.
 :func:`multimode`        List of modes (most common values) of discrete or nominal data.
+:func:`quantiles`        Divide data into intervals with equal probability.
 =======================  ===============================================================
 
 Measures of spread
@@ -90,14 +92,12 @@ Measures of spread
 These functions calculate a measure of how much the population or sample
 tends to deviate from the typical or average values.
 
-=======================  =====================================================
-:func:`kde`              Estimate the probability density distributionof data.
+=======================  =============================================
 :func:`pstdev`           Population standard deviation of data.
 :func:`pvariance`        Population variance of data.
-:func:`quantiles`        Divide data into intervals with equal probability.
 :func:`stdev`            Sample standard deviation of data.
 :func:`variance`         Sample variance of data.
-=======================  =====================================================
+=======================  =============================================
 
 Statistics for relations between two inputs
 -------------------------------------------
@@ -260,6 +260,55 @@ However, for reading convenience, most of the examples show sorted sequences.
    .. versionchanged:: 3.10
       Added support for *weights*.
 
+
+.. function:: kde(data, h, kernel='normal')
+
+   `Kernel Density Estimation (KDE)
+   https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf`_
+   creates an estimated continuous probability density function from *data*
+   containing a fixed number of discrete samples.
+
+   The basic idea is to smooth the data using `a kernel function such as a
+   normal distribution, triangular distribution, or uniform distribution
+   <https://en.wikipedia.org/wiki/Kernel_(statistics)#Kernel_functions_in_common_use>`_.
+
+   The degree of smoothing is controlled by the scaling parameter *h*
+   which is called the bandwidth.  Smaller values emphasize local
+   features while larger values give smoother results.
+
+   The *kernel* determines the relative weights of the sample data
+   points.  Generally, the choice of kernel shape does not matter
+   as much as the more influential bandwidth smoothing parameter.
+
+   Kernels that give some weight to every sample point include:
+   normal or gauss, logistic, and sigmoid.
+
+   Kernels that only give weight to sample points within the bandwidth
+   include rectangular or uniform, triangular, parabolic or epanechnikov,
+   quartic or biweight, triweight, and cosine.
+
+   A :exc:`StatisticsError` will be raised if the *data* sequence is empty.
+
+   `Wikipedia has an example
+   <https://en.wikipedia.org/wiki/Kernel_density_estimation#Example>`_
+   where we can use :func:`kde` to generate and plot a probability
+   density function estimated from a small sample:
+
+   .. doctest::
+
+      >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
+      >>> f_hat = kde(sample, h=1.5)
+      >>> xarr = [i/100 for i in range(-750, 1100)]
+      >>> yarr = [f_hat(x) for x in xarr]
+
+   The points in ``xarr`` and ``yarr`` can be used to make a PDF plot:
+
+   .. image:: kde_example.png
+      :alt: Scatter plot of the estimated probability density function.
+
+   .. versionadded:: 3.13
+
+
 .. function:: median(data)
 
    Return the median (middle value) of numeric data, using the common "mean of
@@ -502,52 +551,6 @@ However, for reading convenience, most of the examples show sorted sequences.
       of the population variance.
 
 
-.. function:: kde(data, h, kernel='normal')
-
-   `Kernel Density Estimation (KDE)
-   https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf`_
-   creates an estimated continuous probability density function from *data*
-   containing a fixed number of discrete samples.
-
-   The basic idea is to smooth the data using `a kernel function such as a
-   normal distribution, triangular distribution, or uniform distribution
-   <https://en.wikipedia.org/wiki/Kernel_(statistics)#Kernel_functions_in_common_use>`_.
-
-   The degree of smoothing is controlled by the scaling parameter *h*
-   which is called the bandwidth.  Smaller values emphasize local
-   features while larger values give smoother results.
-
-   The *kernel* determines the relative weights of the sample data
-   points.  Generally, the choice of kernel shape does not matter
-   as much as the more influential bandwidth smoothing parameter.
-
-   Kernels that give some weight to every sample point include:
-   normal or gauss, logistic, and sigmoid.
-
-   Kernels that only give weight to sample points within the bandwidth
-   include rectangular or uniform, triangular, parabolic or epanechnikov,
-   quartic or biweight, triweight, and cosine.
-
-   `Wikipedia has an example
-   <https://en.wikipedia.org/wiki/Kernel_density_estimation#Example>`_
-   where we can use :func:`kde` to generate and plot a probability
-   density function estimated from a small sample:
-
-   .. doctest::
-
-      >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
-      >>> f_hat = kde(sample, h=1.5)
-      >>> xarr = [i/100 for i in range(-750, 1100)]
-      >>> yarr = [f_hat(x) for x in xarr]
-
-   The points in ``xarr`` and ``yarr`` can be used to make a PDF plot:
-
-   .. image:: kde_example.png
-      :alt: Scatter plot of the estimated probability density function.
-
-   .. versionadded:: 3.13
-
-
 .. function:: stdev(data, xbar=None)
 
    Return the sample standard deviation (the square root of the sample

From 482c9c11d79e291248a147937d60d5b838a40d9d Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Fri, 23 Feb 2024 11:09:29 -0600
Subject: [PATCH 03/19] Add blurb

---
 .../next/Library/2024-02-23-11-08-31.gh-issue-115532.zVd3gK.rst  | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 Misc/NEWS.d/next/Library/2024-02-23-11-08-31.gh-issue-115532.zVd3gK.rst

diff --git a/Misc/NEWS.d/next/Library/2024-02-23-11-08-31.gh-issue-115532.zVd3gK.rst b/Misc/NEWS.d/next/Library/2024-02-23-11-08-31.gh-issue-115532.zVd3gK.rst
new file mode 100644
index 00000000000000..fb36c0b2a4fabd
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-02-23-11-08-31.gh-issue-115532.zVd3gK.rst
@@ -0,0 +1 @@
+Add kernel density estimation to the statistics module.

From e9386edcf8d7631428a55327671cb045dc4a5386 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Fri, 23 Feb 2024 11:30:39 -0600
Subject: [PATCH 04/19] Add PDF area test

---
 Lib/test/test_statistics.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py
index bf2c254c9ee7d9..1d4a94045671f7 100644
--- a/Lib/test/test_statistics.py
+++ b/Lib/test/test_statistics.py
@@ -2353,6 +2353,30 @@ def test_mixed_int_and_float(self):
                 self.assertAlmostEqual(actual_mean, expected_mean, places=5)
 
 
+class TestKDE(unittest.TestCase):
+
+    def test_kde_area(self):
+        # The approximate integral of a PDF should be close to 1.0
+
+        sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
+
+        def integrate(func, low, high, steps=10_000):
+            "Numeric approximation of a definite function integral."
+            width = (high - low) / steps
+            xarr = (low + width/2 + width * i for i in range(steps))
+            return sum(map(func, xarr)) * width
+
+        kernels = ['normal', 'gauss', 'logistic', 'sigmoid', 'rectangular',
+                   'uniform', 'triangular', 'parabolic', 'epanechnikov',
+                   'quartic', 'biweight', 'triweight', 'cosine']
+
+        for kernel in kernels:
+            with self.subTest(kernel=kernel):
+                f_hat = statistics.kde(sample, h=1.5, kernel=kernel)
+                area = integrate(f_hat, -20, 20)
+                self.assertAlmostEqual(area, 1.0, places=4)
+
+
 class TestQuantiles(unittest.TestCase):
 
     def test_specific_cases(self):

From c3ddb1e08bf7876577003cc8df7f5cb23246ae27 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Fri, 23 Feb 2024 11:48:00 -0600
Subject: [PATCH 05/19] Add tests

---
 Lib/statistics.py           | 11 +++++++----
 Lib/test/test_statistics.py | 33 ++++++++++++++++++++++++++-------
 2 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/Lib/statistics.py b/Lib/statistics.py
index 73b0ac95860d75..6dc26fbe5114cf 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -883,6 +883,13 @@ def kde(data, h, kernel='normal'):
 
     """
 
+    n = len(data)
+    if not n:
+        raise StatisticsError('Empty data sequence')
+
+    if h <= 0.0:
+        raise StatisticsError(f'Bandwidth h must be positive, not {h=}')
+
     match kernel:
 
         case 'normal' | 'gauss':
@@ -930,10 +937,6 @@ def kde(data, h, kernel='normal'):
         case _:
             raise ValueError(f'Unknown kernel name: {kernel!r}')
 
-    n = len(data)
-    if not n:
-        raise StatisticsError('Empty data sequence')
-
     if support is None:
 
         def pdf(x):
diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py
index 1d4a94045671f7..e0e7ca5cf1c806 100644
--- a/Lib/test/test_statistics.py
+++ b/Lib/test/test_statistics.py
@@ -2355,27 +2355,46 @@ def test_mixed_int_and_float(self):
 
 class TestKDE(unittest.TestCase):
 
-    def test_kde_area(self):
-        # The approximate integral of a PDF should be close to 1.0
+    def test_kde(self):
+        kde = statistics.kde
+        StatisticsError = statistics.StatisticsError
+
+        kernels = ['normal', 'gauss', 'logistic', 'sigmoid', 'rectangular',
+                   'uniform', 'triangular', 'parabolic', 'epanechnikov',
+                   'quartic', 'biweight', 'triweight', 'cosine']
 
         sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
 
+        # The approximate integral of a PDF should be close to 1.0
+
         def integrate(func, low, high, steps=10_000):
             "Numeric approximation of a definite function integral."
             width = (high - low) / steps
             xarr = (low + width/2 + width * i for i in range(steps))
             return sum(map(func, xarr)) * width
 
-        kernels = ['normal', 'gauss', 'logistic', 'sigmoid', 'rectangular',
-                   'uniform', 'triangular', 'parabolic', 'epanechnikov',
-                   'quartic', 'biweight', 'triweight', 'cosine']
-
         for kernel in kernels:
             with self.subTest(kernel=kernel):
-                f_hat = statistics.kde(sample, h=1.5, kernel=kernel)
+                f_hat = kde(sample, h=1.5, kernel=kernel)
                 area = integrate(f_hat, -20, 20)
                 self.assertAlmostEqual(area, 1.0, places=4)
 
+        # Check error cases
+
+        with self.assertRaises(StatisticsError):
+            kde([], h=1.0)                              # Empty dataset
+        f_hat = kde(['abc', 'def'], 1.5)                # Non-numeric data
+        with self.assertRaises(TypeError):
+            f_hat(0)
+        with self.assertRaises(StatisticsError):
+            kde(sample, h=0.0)                          # Zero bandwidth
+        with self.assertRaises(StatisticsError):
+            kde(sample, h=0.0)                          # Negative bandwidth
+        with self.assertRaises(TypeError):
+            kde(sample, h='str')                        # Wrong bandwidth type
+        with self.assertRaises(ValueError):
+            kde(sample, h=1.0, kernel='bogus')          # Invalid kernel
+
 
 class TestQuantiles(unittest.TestCase):
 

From a2771cbc22e108a9763cd01ed72ca31b06026335 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Fri, 23 Feb 2024 11:58:20 -0600
Subject: [PATCH 06/19] Early test for non-numeric data. Tests for name and
 docstring

---
 Lib/statistics.py           |  3 +++
 Lib/test/test_statistics.py | 11 +++++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/Lib/statistics.py b/Lib/statistics.py
index 6dc26fbe5114cf..8ee756ead2a9fb 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -887,6 +887,9 @@ def kde(data, h, kernel='normal'):
     if not n:
         raise StatisticsError('Empty data sequence')
 
+    if not isinstance(data[0], (int, float)):
+        raise TypeError('Data sequence must contain ints or floats')
+
     if h <= 0.0:
         raise StatisticsError(f'Bandwidth h must be positive, not {h=}')
 
diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py
index e0e7ca5cf1c806..827bbcce26fdb6 100644
--- a/Lib/test/test_statistics.py
+++ b/Lib/test/test_statistics.py
@@ -2383,9 +2383,8 @@ def integrate(func, low, high, steps=10_000):
 
         with self.assertRaises(StatisticsError):
             kde([], h=1.0)                              # Empty dataset
-        f_hat = kde(['abc', 'def'], 1.5)                # Non-numeric data
         with self.assertRaises(TypeError):
-            f_hat(0)
+            kde(['abc', 'def'], 1.5)                    # Non-numeric data
         with self.assertRaises(StatisticsError):
             kde(sample, h=0.0)                          # Zero bandwidth
         with self.assertRaises(StatisticsError):
@@ -2395,6 +2394,14 @@ def integrate(func, low, high, steps=10_000):
         with self.assertRaises(ValueError):
             kde(sample, h=1.0, kernel='bogus')          # Invalid kernel
 
+        # Test name and docstring
+        h = 1.5
+        kernel = 'cosine'
+        f_hat = kde(sample, h, kernel)
+        self.assertEqual(f_hat.__name__, 'pdf')
+        self.assertIn(kernel, f_hat.__doc__)
+        self.assertIn(str(h), f_hat.__doc__)
+
 
 class TestQuantiles(unittest.TestCase):
 

From e29d64f8c41a953330d757eab2dd8b8b4b814f74 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Fri, 23 Feb 2024 12:01:20 -0600
Subject: [PATCH 07/19] Use StatisticsError for invalid kernels

---
 Lib/statistics.py           | 2 +-
 Lib/test/test_statistics.py | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/Lib/statistics.py b/Lib/statistics.py
index 8ee756ead2a9fb..c4a29e9c2978b7 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -938,7 +938,7 @@ def kde(data, h, kernel='normal'):
             support = 1.0
 
         case _:
-            raise ValueError(f'Unknown kernel name: {kernel!r}')
+            raise StatisticsError(f'Unknown kernel name: {kernel!r}')
 
     if support is None:
 
diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py
index 827bbcce26fdb6..b92bbfc746b0c0 100644
--- a/Lib/test/test_statistics.py
+++ b/Lib/test/test_statistics.py
@@ -2391,10 +2391,11 @@ def integrate(func, low, high, steps=10_000):
             kde(sample, h=0.0)                          # Negative bandwidth
         with self.assertRaises(TypeError):
             kde(sample, h='str')                        # Wrong bandwidth type
-        with self.assertRaises(ValueError):
+        with self.assertRaises(StatisticsError):
             kde(sample, h=1.0, kernel='bogus')          # Invalid kernel
 
-        # Test name and docstring
+        # Test name and docstring of the generated function
+
         h = 1.5
         kernel = 'cosine'
         f_hat = kde(sample, h, kernel)

From 5d8bab08b77a3707656890598d6917ab19500694 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Fri, 23 Feb 2024 12:02:00 -0600
Subject: [PATCH 08/19] .

---
 Lib/test/test_statistics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py
index b92bbfc746b0c0..014582428730b5 100644
--- a/Lib/test/test_statistics.py
+++ b/Lib/test/test_statistics.py
@@ -2391,7 +2391,7 @@ def integrate(func, low, high, steps=10_000):
             kde(sample, h=0.0)                          # Negative bandwidth
         with self.assertRaises(TypeError):
             kde(sample, h='str')                        # Wrong bandwidth type
-        with self.assertRaises(StatisticsError):
+        with self.assertRaises(Statisticsrror):
             kde(sample, h=1.0, kernel='bogus')          # Invalid kernel
 
         # Test name and docstring of the generated function

From 9e6aaa772d6f10e1a4994aa55b8458034d31c2b9 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Fri, 23 Feb 2024 12:14:45 -0600
Subject: [PATCH 09/19] Add kde() to __all__

---
 Lib/statistics.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Lib/statistics.py b/Lib/statistics.py
index c4a29e9c2978b7..bc879cfc0fece1 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -112,6 +112,7 @@
     'fmean',
     'geometric_mean',
     'harmonic_mean',
+    'kde',
     'linear_regression',
     'mean',
     'median',

From c8b19d8965d44299dd04faeb6ae1d27e51d2f04a Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Fri, 23 Feb 2024 12:16:28 -0600
Subject: [PATCH 10/19] Add test for non-sequence input

---
 Lib/test/test_statistics.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py
index 014582428730b5..f3e5ce3f11866e 100644
--- a/Lib/test/test_statistics.py
+++ b/Lib/test/test_statistics.py
@@ -2385,13 +2385,15 @@ def integrate(func, low, high, steps=10_000):
             kde([], h=1.0)                              # Empty dataset
         with self.assertRaises(TypeError):
             kde(['abc', 'def'], 1.5)                    # Non-numeric data
+        with self.assertRaises(TypeError):
+            kde(iter(sample), 1.5)                      # Data is not a sequence
         with self.assertRaises(StatisticsError):
             kde(sample, h=0.0)                          # Zero bandwidth
         with self.assertRaises(StatisticsError):
             kde(sample, h=0.0)                          # Negative bandwidth
         with self.assertRaises(TypeError):
             kde(sample, h='str')                        # Wrong bandwidth type
-        with self.assertRaises(Statisticsrror):
+        with self.assertRaises(StatisticsError):
             kde(sample, h=1.0, kernel='bogus')          # Invalid kernel
 
         # Test name and docstring of the generated function

From d5714523a5ee17b15b3f6486645af27c4e0ce9b6 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Fri, 23 Feb 2024 12:27:04 -0600
Subject: [PATCH 11/19] Fix markup for external link

---
 Doc/library/statistics.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst
index 778c70ed254d6d..f69cc1313082a0 100644
--- a/Doc/library/statistics.rst
+++ b/Doc/library/statistics.rst
@@ -264,7 +264,7 @@ However, for reading convenience, most of the examples show sorted sequences.
 .. function:: kde(data, h, kernel='normal')
 
    `Kernel Density Estimation (KDE)
-   https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf`_
+   <https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf>`_
    creates an estimated continuous probability density function from *data*
    containing a fixed number of discrete samples.
 

From ddc32b89a6257a1a9d2052f2f1d542481c662366 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Fri, 23 Feb 2024 15:11:19 -0600
Subject: [PATCH 12/19] Remove outdated KDE recipe

---
 Doc/library/statistics.rst | 40 --------------------------------------
 1 file changed, 40 deletions(-)

diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst
index f69cc1313082a0..d702c453b08f97 100644
--- a/Doc/library/statistics.rst
+++ b/Doc/library/statistics.rst
@@ -1145,46 +1145,6 @@ The final prediction goes to the largest posterior. This is known as the
   'female'
 
 
-Kernel density estimation
-*************************
-
-It is possible to estimate a continuous probability density function
-from a fixed number of discrete samples.
-
-The basic idea is to smooth the data using `a kernel function such as a
-normal distribution, triangular distribution, or uniform distribution
-<https://en.wikipedia.org/wiki/Kernel_(statistics)#Kernel_functions_in_common_use>`_.
-The degree of smoothing is controlled by a scaling parameter, ``h``,
-which is called the *bandwidth*.
-
-.. testcode::
-
-   def kde_normal(sample, h):
-       "Create a continuous probability density function from a sample."
-       # Smooth the sample with a normal distribution kernel scaled by h.
-       kernel_h = NormalDist(0.0, h).pdf
-       n = len(sample)
-       def pdf(x):
-           return sum(kernel_h(x - x_i) for x_i in sample) / n
-       return pdf
-
-`Wikipedia has an example
-<https://en.wikipedia.org/wiki/Kernel_density_estimation#Example>`_
-where we can use the ``kde_normal()`` recipe to generate and plot
-a probability density function estimated from a small sample:
-
-.. doctest::
-
-   >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
-   >>> f_hat = kde_normal(sample, h=1.5)
-   >>> xarr = [i/100 for i in range(-750, 1100)]
-   >>> yarr = [f_hat(x) for x in xarr]
-
-The points in ``xarr`` and ``yarr`` can be used to make a PDF plot:
-
-.. image:: kde_example.png
-   :alt: Scatter plot of the estimated probability density function.
-
 ..
    # This modelines must appear within the last ten lines of the file.
    kate: indent-width 3; remove-trailing-space on; replace-tabs on; encoding utf-8;

From 5ac105514b216517b557bc11b7c91b85b91df26e Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Sat, 24 Feb 2024 09:12:24 -0600
Subject: [PATCH 13/19] Improve variable names in integration using the
 midpoint rule.

---
 Lib/test/test_statistics.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py
index f3e5ce3f11866e..b3a1fc04acdc46 100644
--- a/Lib/test/test_statistics.py
+++ b/Lib/test/test_statistics.py
@@ -2369,9 +2369,9 @@ def test_kde(self):
 
         def integrate(func, low, high, steps=10_000):
             "Numeric approximation of a definite function integral."
-            width = (high - low) / steps
-            xarr = (low + width/2 + width * i for i in range(steps))
-            return sum(map(func, xarr)) * width
+            dx = (high - low) / steps
+            midpoints = (low + (i + 1/2) * dx for i in range(steps))
+            return sum(map(func, midpoints)) * dx
 
         for kernel in kernels:
             with self.subTest(kernel=kernel):

From 24e38e2020c35a4da6c6462bcfbff2fe25c1e724 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Sat, 24 Feb 2024 09:18:50 -0600
Subject: [PATCH 14/19] Improve appearance of generated docstring

---
 Lib/statistics.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Lib/statistics.py b/Lib/statistics.py
index bc879cfc0fece1..fa3609565cddec 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -892,7 +892,7 @@ def kde(data, h, kernel='normal'):
         raise TypeError('Data sequence must contain ints or floats')
 
     if h <= 0.0:
-        raise StatisticsError(f'Bandwidth h must be positive, not {h=}')
+        raise StatisticsError(f'Bandwidth h must be positive, not {h=!r}')
 
     match kernel:
 
@@ -957,7 +957,7 @@ def pdf(x):
             supported = sample[i : j]
             return sum(K((x - x_i) / h) for x_i in supported) / (n * h)
 
-    pdf.__doc__ = f'PDF estimate with the {kernel!r} kernel and bandwidth {h}.'
+    pdf.__doc__ = f'PDF estimate with {kernel=!r} and {h=!r}'
 
     return pdf
 

From e729127cbce5f7d4bca13f67a51f9a1983585bff Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Sat, 24 Feb 2024 09:32:39 -0600
Subject: [PATCH 15/19] Put the kernel names in italics

---
 Doc/library/statistics.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst
index d702c453b08f97..43eddedbcdcfb3 100644
--- a/Doc/library/statistics.rst
+++ b/Doc/library/statistics.rst
@@ -280,12 +280,12 @@ However, for reading convenience, most of the examples show sorted sequences.
    points.  Generally, the choice of kernel shape does not matter
    as much as the more influential bandwidth smoothing parameter.
 
-   Kernels that give some weight to every sample point include:
-   normal or gauss, logistic, and sigmoid.
+   Kernels that give some weight to every sample point include
+   *normal* or *gauss*, *logistic*, and *sigmoid*.
 
    Kernels that only give weight to sample points within the bandwidth
-   include rectangular or uniform, triangular, parabolic or epanechnikov,
-   quartic or biweight, triweight, and cosine.
+   include *rectangular* or *uniform*, *triangular*, *parabolic* or
+   *epanechnikov*, *quartic* or *biweight*, *triweight*, and *cosine*.
 
    A :exc:`StatisticsError` will be raised if the *data* sequence is empty.
 

From e6d6e0fec23888d23aed5dd25f85d97e6694424d Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Sat, 24 Feb 2024 09:39:49 -0600
Subject: [PATCH 16/19] Add a whatsnew entry

---
 Doc/whatsnew/3.13.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst
index 7c6a2af28758be..a9e90b9782f600 100644
--- a/Doc/whatsnew/3.13.rst
+++ b/Doc/whatsnew/3.13.rst
@@ -460,6 +460,14 @@ sqlite3
   for filtering database objects to dump.
   (Contributed by Mariusz Felisiak in :gh:`91602`.)
 
+statistics
+----------
+
+* Add :func:`statistics.kde` for kernel density estimation.
+  This makes it possible to estimate a continuous probability density function
+  from a fixed number of discrete samples.
+  (Contributed by Raymond Hettinger in :gh:`115863`.)
+
 subprocess
 ----------
 

From b3fd269c8accce9e8d5c5925d743dddaf0497036 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Sun, 25 Feb 2024 07:46:20 -0600
Subject: [PATCH 17/19] Add test for support interval boundary conditions

---
 Lib/test/test_statistics.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py
index b3a1fc04acdc46..1cf41638a7f01a 100644
--- a/Lib/test/test_statistics.py
+++ b/Lib/test/test_statistics.py
@@ -2405,6 +2405,13 @@ def integrate(func, low, high, steps=10_000):
         self.assertIn(kernel, f_hat.__doc__)
         self.assertIn(str(h), f_hat.__doc__)
 
+        # Test closed interval for the support boundaries.
+        # In particular, 'uniform' should non-zero at the boundaries.
+
+        f_hat = kde([0], 1.0, 'uniform')
+        self.assertEqual(f_hat(-1.0), 1/2)
+        self.assertEqual(f_hat(1.0), 1/2)
+
 
 class TestQuantiles(unittest.TestCase):
 

From 9dec3087eb271a54fd8eb8190b66ea467d028039 Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Sun, 25 Feb 2024 08:04:24 -0600
Subject: [PATCH 18/19] Sync the docstring with the main docs.

---
 Doc/library/statistics.rst | 11 +++++------
 Lib/statistics.py          |  6 ++++++
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst
index 43eddedbcdcfb3..fb388a900bdbff 100644
--- a/Doc/library/statistics.rst
+++ b/Doc/library/statistics.rst
@@ -264,13 +264,12 @@ However, for reading convenience, most of the examples show sorted sequences.
 .. function:: kde(data, h, kernel='normal')
 
    `Kernel Density Estimation (KDE)
-   <https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf>`_
-   creates an estimated continuous probability density function from *data*
-   containing a fixed number of discrete samples.
+   <https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf>`_:
+   Create a continuous probability density function from discrete samples.
 
-   The basic idea is to smooth the data using `a kernel function such as a
-   normal distribution, triangular distribution, or uniform distribution
-   <https://en.wikipedia.org/wiki/Kernel_(statistics)#Kernel_functions_in_common_use>`_.
+   The basic idea is to smooth the data using `a kernel function
+   <https://en.wikipedia.org/wiki/Kernel_(statistics)`_.
+   to help draw inferences about a population from a sample.
 
    The degree of smoothing is controlled by the scaling parameter *h*
    which is called the bandwidth.  Smaller values emphasize local
diff --git a/Lib/statistics.py b/Lib/statistics.py
index fa3609565cddec..7924123c05b8c3 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -814,6 +814,10 @@ def kde(data, h, kernel='normal'):
     which is called the bandwidth.  Smaller values emphasize local
     features while larger values give smoother results.
 
+    The kernel determines the relative weights of the sample data
+    points.  Generally, the choice of kernel shape does not matter
+    as much as the more influential bandwidth smoothing parameter.
+
     Kernels that give some weight to every sample point:
 
        normal or gauss
@@ -830,6 +834,8 @@ def kde(data, h, kernel='normal'):
        triweight
        cosine
 
+    A StatisticsError will be raised if the data sequence is empty.
+
     Example
     -------
 

From 771a232a81f1aaeeb56b52b515b6185be097edbb Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <raymond.hettinger@gmail.com>
Date: Sun, 25 Feb 2024 09:39:29 -0600
Subject: [PATCH 19/19] Fix missing angle bracket markup

---
 Doc/library/statistics.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst
index fb388a900bdbff..1785c6bcc212b7 100644
--- a/Doc/library/statistics.rst
+++ b/Doc/library/statistics.rst
@@ -268,7 +268,7 @@ However, for reading convenience, most of the examples show sorted sequences.
    Create a continuous probability density function from discrete samples.
 
    The basic idea is to smooth the data using `a kernel function
-   <https://en.wikipedia.org/wiki/Kernel_(statistics)`_.
+   <https://en.wikipedia.org/wiki/Kernel_(statistics)>`_.
    to help draw inferences about a population from a sample.
 
    The degree of smoothing is controlled by the scaling parameter *h*