Add cumulative option for the new statistics.kde() function. (#117033)

This commit is contained in:
Raymond Hettinger 2024-03-24 11:35:58 +02:00 committed by GitHub
parent d610d821fd
commit a1e948edba
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 75 additions and 21 deletions

View File

@ -261,11 +261,12 @@ However, for reading convenience, most of the examples show sorted sequences.
Added support for *weights*.
.. function:: kde(data, h, kernel='normal')
.. function:: kde(data, h, kernel='normal', *, cumulative=False)
`Kernel Density Estimation (KDE)
<https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf>`_:
Create a continuous probability density function from discrete samples.
Create a continuous probability density function or cumulative
distribution function from discrete samples.
The basic idea is to smooth the data using `a kernel function
<https://en.wikipedia.org/wiki/Kernel_(statistics)>`_.
@ -280,11 +281,13 @@ However, for reading convenience, most of the examples show sorted sequences.
as much as the more influential bandwidth smoothing parameter.
Kernels that give some weight to every sample point include
*normal* or *gauss*, *logistic*, and *sigmoid*.
*normal* (*gauss*), *logistic*, and *sigmoid*.
Kernels that only give weight to sample points within the bandwidth
include *rectangular* or *uniform*, *triangular*, *parabolic* or
*epanechnikov*, *quartic* or *biweight*, *triweight*, and *cosine*.
include *rectangular* (*uniform*), *triangular*, *parabolic*
(*epanechnikov*), *quartic* (*biweight*), *triweight*, and *cosine*.
If *cumulative* is true, will return a cumulative distribution function.
A :exc:`StatisticsError` will be raised if the *data* sequence is empty.

View File

@ -138,7 +138,7 @@ from decimal import Decimal
from itertools import count, groupby, repeat
from bisect import bisect_left, bisect_right
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum, sumprod
from math import isfinite, isinf, pi, cos, cosh
from math import isfinite, isinf, pi, cos, sin, cosh, atan
from functools import reduce
from operator import itemgetter
from collections import Counter, namedtuple, defaultdict
@ -803,9 +803,9 @@ def multimode(data):
return [value for value, count in counts.items() if count == maxcount]
def kde(data, h, kernel='normal'):
"""Kernel Density Estimation: Create a continuous probability
density function from discrete samples.
def kde(data, h, kernel='normal', *, cumulative=False):
"""Kernel Density Estimation: Create a continuous probability density
function or cumulative distribution function from discrete samples.
The basic idea is to smooth the data using a kernel function
to help draw inferences about a population from a sample.
@ -820,20 +820,22 @@ def kde(data, h, kernel='normal'):
Kernels that give some weight to every sample point:
normal or gauss
normal (gauss)
logistic
sigmoid
Kernels that only give weight to sample points within
the bandwidth:
rectangular or uniform
rectangular (uniform)
triangular
parabolic or epanechnikov
quartic or biweight
parabolic (epanechnikov)
quartic (biweight)
triweight
cosine
If *cumulative* is true, will return a cumulative distribution function.
A StatisticsError will be raised if the data sequence is empty.
Example
@ -847,7 +849,8 @@ def kde(data, h, kernel='normal'):
Compute the area under the curve:
>>> sum(f_hat(x) for x in range(-20, 20))
>>> area = sum(f_hat(x) for x in range(-20, 20))
>>> round(area, 4)
1.0
Plot the estimated probability density function at
@ -876,6 +879,13 @@ def kde(data, h, kernel='normal'):
9: 0.009 x
10: 0.002 x
Estimate P(4.5 < X <= 7.5), the probability that a new sample value
will be between 4.5 and 7.5:
>>> cdf = kde(sample, h=1.5, cumulative=True)
>>> round(cdf(7.5) - cdf(4.5), 2)
0.22
References
----------
@ -888,6 +898,9 @@ def kde(data, h, kernel='normal'):
Interactive graphical demonstration and exploration:
https://demonstrations.wolfram.com/KernelDensityEstimation/
Kernel estimation of cumulative distribution function of a random variable with bounded support
https://www.econstor.eu/bitstream/10419/207829/1/10.21307_stattrans-2016-037.pdf
"""
n = len(data)
@ -903,45 +916,56 @@ def kde(data, h, kernel='normal'):
match kernel:
case 'normal' | 'gauss':
c = 1 / sqrt(2 * pi)
K = lambda t: c * exp(-1/2 * t * t)
sqrt2pi = sqrt(2 * pi)
sqrt2 = sqrt(2)
K = lambda t: exp(-1/2 * t * t) / sqrt2pi
I = lambda t: 1/2 * (1.0 + erf(t / sqrt2))
support = None
case 'logistic':
# 1.0 / (exp(t) + 2.0 + exp(-t))
K = lambda t: 1/2 / (1.0 + cosh(t))
I = lambda t: 1.0 - 1.0 / (exp(t) + 1.0)
support = None
case 'sigmoid':
# (2/pi) / (exp(t) + exp(-t))
c = 1 / pi
K = lambda t: c / cosh(t)
c1 = 1 / pi
c2 = 2 / pi
K = lambda t: c1 / cosh(t)
I = lambda t: c2 * atan(exp(t))
support = None
case 'rectangular' | 'uniform':
K = lambda t: 1/2
I = lambda t: 1/2 * t + 1/2
support = 1.0
case 'triangular':
K = lambda t: 1.0 - abs(t)
I = lambda t: t*t * (1/2 if t < 0.0 else -1/2) + t + 1/2
support = 1.0
case 'parabolic' | 'epanechnikov':
K = lambda t: 3/4 * (1.0 - t * t)
I = lambda t: -1/4 * t**3 + 3/4 * t + 1/2
support = 1.0
case 'quartic' | 'biweight':
K = lambda t: 15/16 * (1.0 - t * t) ** 2
I = lambda t: 3/16 * t**5 - 5/8 * t**3 + 15/16 * t + 1/2
support = 1.0
case 'triweight':
K = lambda t: 35/32 * (1.0 - t * t) ** 3
I = lambda t: 35/32 * (-1/7*t**7 + 3/5*t**5 - t**3 + t) + 1/2
support = 1.0
case 'cosine':
c1 = pi / 4
c2 = pi / 2
K = lambda t: c1 * cos(c2 * t)
I = lambda t: 1/2 * sin(c2 * t) + 1/2
support = 1.0
case _:
@ -952,6 +976,9 @@ def kde(data, h, kernel='normal'):
def pdf(x):
return sum(K((x - x_i) / h) for x_i in data) / (n * h)
def cdf(x):
return sum(I((x - x_i) / h) for x_i in data) / n
else:
sample = sorted(data)
@ -963,9 +990,19 @@ def kde(data, h, kernel='normal'):
supported = sample[i : j]
return sum(K((x - x_i) / h) for x_i in supported) / (n * h)
pdf.__doc__ = f'PDF estimate with {h=!r} and {kernel=!r}'
def cdf(x):
i = bisect_left(sample, x - bandwidth)
j = bisect_right(sample, x + bandwidth)
supported = sample[i : j]
return sum((I((x - x_i) / h) for x_i in supported), i) / n
return pdf
if cumulative:
cdf.__doc__ = f'CDF estimate with {h=!r} and {kernel=!r}'
return cdf
else:
pdf.__doc__ = f'PDF estimate with {h=!r} and {kernel=!r}'
return pdf
# Notes on methods for computing quantiles

View File

@ -2379,6 +2379,18 @@ class TestKDE(unittest.TestCase):
area = integrate(f_hat, -20, 20)
self.assertAlmostEqual(area, 1.0, places=4)
# Check CDF against an integral of the PDF
data = [3, 5, 10, 12]
h = 2.3
x = 10.5
for kernel in kernels:
with self.subTest(kernel=kernel):
cdf = kde(data, h, kernel, cumulative=True)
f_hat = kde(data, h, kernel)
area = integrate(f_hat, -20, x, 100_000)
self.assertAlmostEqual(cdf(x), area, places=4)
# Check error cases
with self.assertRaises(StatisticsError):
@ -2395,6 +2407,8 @@ class TestKDE(unittest.TestCase):
kde(sample, h='str') # Wrong bandwidth type
with self.assertRaises(StatisticsError):
kde(sample, h=1.0, kernel='bogus') # Invalid kernel
with self.assertRaises(TypeError):
kde(sample, 1.0, 'gauss', True) # Positional cumulative argument
# Test name and docstring of the generated function
@ -2403,7 +2417,7 @@ class TestKDE(unittest.TestCase):
f_hat = kde(sample, h, kernel)
self.assertEqual(f_hat.__name__, 'pdf')
self.assertIn(kernel, f_hat.__doc__)
self.assertIn(str(h), f_hat.__doc__)
self.assertIn(repr(h), f_hat.__doc__)
# Test closed interval for the support boundaries.
# In particular, 'uniform' should non-zero at the boundaries.