Add cumulative option for the new statistics.kde() function. (#117033)

2024-03-24 11:35:58 +02:00 · 2024-03-24 11:35:58 +02:00 · a1e948edba
commit a1e948edba
parent d610d821fd
3 changed files with 75 additions and 21 deletions
--- a/Doc/library/statistics.rst
+++ b/Doc/library/statistics.rst
@ -261,11 +261,12 @@ However, for reading convenience, most of the examples show sorted sequences.
      Added support for *weights*.
-.. function:: kde(data, h, kernel='normal')
+.. function:: kde(data, h, kernel='normal', *, cumulative=False)
   `Kernel Density Estimation (KDE)
   <https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf>`_:
-   Create a continuous probability density function from discrete samples.
+   Create a continuous probability density function or cumulative
   distribution function from discrete samples.
   The basic idea is to smooth the data using `a kernel function
   <https://en.wikipedia.org/wiki/Kernel_(statistics)>`_.
@ -280,11 +281,13 @@ However, for reading convenience, most of the examples show sorted sequences.
   as much as the more influential bandwidth smoothing parameter.
   Kernels that give some weight to every sample point include
-   *normal* or *gauss*, *logistic*, and *sigmoid*.
+   *normal* (*gauss*), *logistic*, and *sigmoid*.
   Kernels that only give weight to sample points within the bandwidth
-   include *rectangular* or *uniform*, *triangular*, *parabolic* or
+   include *rectangular* (*uniform*), *triangular*, *parabolic*
-   *epanechnikov*, *quartic* or *biweight*, *triweight*, and *cosine*.
+   (*epanechnikov*), *quartic* (*biweight*), *triweight*, and *cosine*.
   If *cumulative* is true, will return a cumulative distribution function.
   A :exc:`StatisticsError` will be raised if the *data* sequence is empty.
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@ -138,7 +138,7 @@ from decimal import Decimal
 from itertools import count, groupby, repeat
 from bisect import bisect_left, bisect_right
 from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum, sumprod
-from math import isfinite, isinf, pi, cos, cosh
+from math import isfinite, isinf, pi, cos, sin, cosh, atan
 from functools import reduce
 from operator import itemgetter
 from collections import Counter, namedtuple, defaultdict
@ -803,9 +803,9 @@ def multimode(data):
    return [value for value, count in counts.items() if count == maxcount]
-def kde(data, h, kernel='normal'):
+def kde(data, h, kernel='normal', *, cumulative=False):
-    """Kernel Density Estimation:  Create a continuous probability
+    """Kernel Density Estimation:  Create a continuous probability density
-    density function from discrete samples.
+    function or cumulative distribution function from discrete samples.
    The basic idea is to smooth the data using a kernel function
    to help draw inferences about a population from a sample.
@ -820,20 +820,22 @@ def kde(data, h, kernel='normal'):
    Kernels that give some weight to every sample point:
-       normal or gauss
+       normal (gauss)
       logistic
       sigmoid
    Kernels that only give weight to sample points within
    the bandwidth:
-       rectangular or uniform
+       rectangular (uniform)
       triangular
-       parabolic or epanechnikov
+       parabolic (epanechnikov)
-       quartic or biweight
+       quartic (biweight)
       triweight
       cosine
    If *cumulative* is true, will return a cumulative distribution function.
    A StatisticsError will be raised if the data sequence is empty.
    Example
@ -847,7 +849,8 @@ def kde(data, h, kernel='normal'):
    Compute the area under the curve:
-        >>> sum(f_hat(x) for x in range(-20, 20))
+        >>> area = sum(f_hat(x) for x in range(-20, 20))
        >>> round(area, 4)
        1.0
    Plot the estimated probability density function at
@ -876,6 +879,13 @@ def kde(data, h, kernel='normal'):
         9: 0.009    x
        10: 0.002 x
    Estimate P(4.5 < X <= 7.5), the probability that a new sample value
    will be between 4.5 and 7.5:
        >>> cdf = kde(sample, h=1.5, cumulative=True)
        >>> round(cdf(7.5) - cdf(4.5), 2)
        0.22
    References
    ----------
@ -888,6 +898,9 @@ def kde(data, h, kernel='normal'):
    Interactive graphical demonstration and exploration:
    https://demonstrations.wolfram.com/KernelDensityEstimation/
    Kernel estimation of cumulative distribution function of a random variable with bounded support
    https://www.econstor.eu/bitstream/10419/207829/1/10.21307_stattrans-2016-037.pdf
    """
    n = len(data)
@ -903,45 +916,56 @@ def kde(data, h, kernel='normal'):
    match kernel:
        case 'normal' | 'gauss':
-            c = 1 / sqrt(2 * pi)
+            sqrt2pi = sqrt(2 * pi)
-            K = lambda t: c * exp(-1/2 * t * t)
+            sqrt2 = sqrt(2)
            K = lambda t: exp(-1/2 * t * t) / sqrt2pi
            I = lambda t: 1/2 * (1.0 + erf(t / sqrt2))
            support = None
        case 'logistic':
            # 1.0 / (exp(t) + 2.0 + exp(-t))
            K = lambda t: 1/2 / (1.0 + cosh(t))
            I = lambda t: 1.0 - 1.0 / (exp(t) + 1.0)
            support = None
        case 'sigmoid':
            # (2/pi) / (exp(t) + exp(-t))
-            c = 1 / pi
+            c1 = 1 / pi
-            K = lambda t: c / cosh(t)
+            c2 = 2 / pi
            K = lambda t: c1 / cosh(t)
            I = lambda t: c2 * atan(exp(t))
            support = None
        case 'rectangular' | 'uniform':
            K = lambda t: 1/2
            I = lambda t: 1/2 * t + 1/2
            support = 1.0
        case 'triangular':
            K = lambda t: 1.0 - abs(t)
            I = lambda t: t*t * (1/2 if t < 0.0 else -1/2) + t + 1/2
            support = 1.0
        case 'parabolic' | 'epanechnikov':
            K = lambda t: 3/4 * (1.0 - t * t)
            I = lambda t: -1/4 * t**3 + 3/4 * t + 1/2
            support = 1.0
        case 'quartic' | 'biweight':
            K = lambda t: 15/16 * (1.0 - t * t) ** 2
            I = lambda t: 3/16 * t**5 - 5/8 * t**3 + 15/16 * t + 1/2
            support = 1.0
        case 'triweight':
            K = lambda t: 35/32 * (1.0 - t * t) ** 3
            I = lambda t: 35/32 * (-1/7*t**7 + 3/5*t**5 - t**3 + t) + 1/2
            support = 1.0
        case 'cosine':
            c1 = pi / 4
            c2 = pi / 2
            K = lambda t: c1 * cos(c2 * t)
            I = lambda t: 1/2 * sin(c2 * t) + 1/2
            support = 1.0
        case _:
@ -952,6 +976,9 @@ def kde(data, h, kernel='normal'):
        def pdf(x):
            return sum(K((x - x_i) / h) for x_i in data) / (n * h)
        def cdf(x):
            return sum(I((x - x_i) / h) for x_i in data) / n
    else:
        sample = sorted(data)
@ -963,8 +990,18 @@ def kde(data, h, kernel='normal'):
            supported = sample[i : j]
            return sum(K((x - x_i) / h) for x_i in supported) / (n * h)
-    pdf.__doc__ = f'PDF estimate with {h=!r} and {kernel=!r}'
+        def cdf(x):
            i = bisect_left(sample, x - bandwidth)
            j = bisect_right(sample, x + bandwidth)
            supported = sample[i : j]
            return sum((I((x - x_i) / h) for x_i in supported), i) / n
    if cumulative:
        cdf.__doc__ = f'CDF estimate with {h=!r} and {kernel=!r}'
        return cdf
    else:
        pdf.__doc__ = f'PDF estimate with {h=!r} and {kernel=!r}'
        return pdf
--- a/Lib/test/test_statistics.py
+++ b/Lib/test/test_statistics.py
@ -2379,6 +2379,18 @@ class TestKDE(unittest.TestCase):
                area = integrate(f_hat, -20, 20)
                self.assertAlmostEqual(area, 1.0, places=4)
        # Check CDF against an integral of the PDF
        data = [3, 5, 10, 12]
        h = 2.3
        x = 10.5
        for kernel in kernels:
            with self.subTest(kernel=kernel):
                cdf = kde(data, h, kernel, cumulative=True)
                f_hat = kde(data, h, kernel)
                area = integrate(f_hat, -20, x, 100_000)
                self.assertAlmostEqual(cdf(x), area, places=4)
        # Check error cases
        with self.assertRaises(StatisticsError):
@ -2395,6 +2407,8 @@ class TestKDE(unittest.TestCase):
            kde(sample, h='str')                        # Wrong bandwidth type
        with self.assertRaises(StatisticsError):
            kde(sample, h=1.0, kernel='bogus')          # Invalid kernel
        with self.assertRaises(TypeError):
            kde(sample, 1.0, 'gauss', True)             # Positional cumulative argument
        # Test name and docstring of the generated function
@ -2403,7 +2417,7 @@ class TestKDE(unittest.TestCase):
        f_hat = kde(sample, h, kernel)
        self.assertEqual(f_hat.__name__, 'pdf')
        self.assertIn(kernel, f_hat.__doc__)
-        self.assertIn(str(h), f_hat.__doc__)
+        self.assertIn(repr(h), f_hat.__doc__)
        # Test closed interval for the support boundaries.
        # In particular, 'uniform' should non-zero at the boundaries.