Add cumulative option for the new statistics.kde() function. (#117033)

This commit is contained in:
Raymond Hettinger 2024-03-24 11:35:58 +02:00 committed by GitHub
parent d610d821fd
commit a1e948edba
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 75 additions and 21 deletions

View File

@ -261,11 +261,12 @@ However, for reading convenience, most of the examples show sorted sequences.
Added support for *weights*. Added support for *weights*.
.. function:: kde(data, h, kernel='normal') .. function:: kde(data, h, kernel='normal', *, cumulative=False)
`Kernel Density Estimation (KDE) `Kernel Density Estimation (KDE)
<https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf>`_: <https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf>`_:
Create a continuous probability density function from discrete samples. Create a continuous probability density function or cumulative
distribution function from discrete samples.
The basic idea is to smooth the data using `a kernel function The basic idea is to smooth the data using `a kernel function
<https://en.wikipedia.org/wiki/Kernel_(statistics)>`_. <https://en.wikipedia.org/wiki/Kernel_(statistics)>`_.
@ -280,11 +281,13 @@ However, for reading convenience, most of the examples show sorted sequences.
as much as the more influential bandwidth smoothing parameter. as much as the more influential bandwidth smoothing parameter.
Kernels that give some weight to every sample point include Kernels that give some weight to every sample point include
*normal* or *gauss*, *logistic*, and *sigmoid*. *normal* (*gauss*), *logistic*, and *sigmoid*.
Kernels that only give weight to sample points within the bandwidth Kernels that only give weight to sample points within the bandwidth
include *rectangular* or *uniform*, *triangular*, *parabolic* or include *rectangular* (*uniform*), *triangular*, *parabolic*
*epanechnikov*, *quartic* or *biweight*, *triweight*, and *cosine*. (*epanechnikov*), *quartic* (*biweight*), *triweight*, and *cosine*.
If *cumulative* is true, will return a cumulative distribution function.
A :exc:`StatisticsError` will be raised if the *data* sequence is empty. A :exc:`StatisticsError` will be raised if the *data* sequence is empty.

View File

@ -138,7 +138,7 @@ from decimal import Decimal
from itertools import count, groupby, repeat from itertools import count, groupby, repeat
from bisect import bisect_left, bisect_right from bisect import bisect_left, bisect_right
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum, sumprod from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum, sumprod
from math import isfinite, isinf, pi, cos, cosh from math import isfinite, isinf, pi, cos, sin, cosh, atan
from functools import reduce from functools import reduce
from operator import itemgetter from operator import itemgetter
from collections import Counter, namedtuple, defaultdict from collections import Counter, namedtuple, defaultdict
@ -803,9 +803,9 @@ def multimode(data):
return [value for value, count in counts.items() if count == maxcount] return [value for value, count in counts.items() if count == maxcount]
def kde(data, h, kernel='normal'): def kde(data, h, kernel='normal', *, cumulative=False):
"""Kernel Density Estimation: Create a continuous probability """Kernel Density Estimation: Create a continuous probability density
density function from discrete samples. function or cumulative distribution function from discrete samples.
The basic idea is to smooth the data using a kernel function The basic idea is to smooth the data using a kernel function
to help draw inferences about a population from a sample. to help draw inferences about a population from a sample.
@ -820,20 +820,22 @@ def kde(data, h, kernel='normal'):
Kernels that give some weight to every sample point: Kernels that give some weight to every sample point:
normal or gauss normal (gauss)
logistic logistic
sigmoid sigmoid
Kernels that only give weight to sample points within Kernels that only give weight to sample points within
the bandwidth: the bandwidth:
rectangular or uniform rectangular (uniform)
triangular triangular
parabolic or epanechnikov parabolic (epanechnikov)
quartic or biweight quartic (biweight)
triweight triweight
cosine cosine
If *cumulative* is true, will return a cumulative distribution function.
A StatisticsError will be raised if the data sequence is empty. A StatisticsError will be raised if the data sequence is empty.
Example Example
@ -847,7 +849,8 @@ def kde(data, h, kernel='normal'):
Compute the area under the curve: Compute the area under the curve:
>>> sum(f_hat(x) for x in range(-20, 20)) >>> area = sum(f_hat(x) for x in range(-20, 20))
>>> round(area, 4)
1.0 1.0
Plot the estimated probability density function at Plot the estimated probability density function at
@ -876,6 +879,13 @@ def kde(data, h, kernel='normal'):
9: 0.009 x 9: 0.009 x
10: 0.002 x 10: 0.002 x
Estimate P(4.5 < X <= 7.5), the probability that a new sample value
will be between 4.5 and 7.5:
>>> cdf = kde(sample, h=1.5, cumulative=True)
>>> round(cdf(7.5) - cdf(4.5), 2)
0.22
References References
---------- ----------
@ -888,6 +898,9 @@ def kde(data, h, kernel='normal'):
Interactive graphical demonstration and exploration: Interactive graphical demonstration and exploration:
https://demonstrations.wolfram.com/KernelDensityEstimation/ https://demonstrations.wolfram.com/KernelDensityEstimation/
Kernel estimation of cumulative distribution function of a random variable with bounded support
https://www.econstor.eu/bitstream/10419/207829/1/10.21307_stattrans-2016-037.pdf
""" """
n = len(data) n = len(data)
@ -903,45 +916,56 @@ def kde(data, h, kernel='normal'):
match kernel: match kernel:
case 'normal' | 'gauss': case 'normal' | 'gauss':
c = 1 / sqrt(2 * pi) sqrt2pi = sqrt(2 * pi)
K = lambda t: c * exp(-1/2 * t * t) sqrt2 = sqrt(2)
K = lambda t: exp(-1/2 * t * t) / sqrt2pi
I = lambda t: 1/2 * (1.0 + erf(t / sqrt2))
support = None support = None
case 'logistic': case 'logistic':
# 1.0 / (exp(t) + 2.0 + exp(-t)) # 1.0 / (exp(t) + 2.0 + exp(-t))
K = lambda t: 1/2 / (1.0 + cosh(t)) K = lambda t: 1/2 / (1.0 + cosh(t))
I = lambda t: 1.0 - 1.0 / (exp(t) + 1.0)
support = None support = None
case 'sigmoid': case 'sigmoid':
# (2/pi) / (exp(t) + exp(-t)) # (2/pi) / (exp(t) + exp(-t))
c = 1 / pi c1 = 1 / pi
K = lambda t: c / cosh(t) c2 = 2 / pi
K = lambda t: c1 / cosh(t)
I = lambda t: c2 * atan(exp(t))
support = None support = None
case 'rectangular' | 'uniform': case 'rectangular' | 'uniform':
K = lambda t: 1/2 K = lambda t: 1/2
I = lambda t: 1/2 * t + 1/2
support = 1.0 support = 1.0
case 'triangular': case 'triangular':
K = lambda t: 1.0 - abs(t) K = lambda t: 1.0 - abs(t)
I = lambda t: t*t * (1/2 if t < 0.0 else -1/2) + t + 1/2
support = 1.0 support = 1.0
case 'parabolic' | 'epanechnikov': case 'parabolic' | 'epanechnikov':
K = lambda t: 3/4 * (1.0 - t * t) K = lambda t: 3/4 * (1.0 - t * t)
I = lambda t: -1/4 * t**3 + 3/4 * t + 1/2
support = 1.0 support = 1.0
case 'quartic' | 'biweight': case 'quartic' | 'biweight':
K = lambda t: 15/16 * (1.0 - t * t) ** 2 K = lambda t: 15/16 * (1.0 - t * t) ** 2
I = lambda t: 3/16 * t**5 - 5/8 * t**3 + 15/16 * t + 1/2
support = 1.0 support = 1.0
case 'triweight': case 'triweight':
K = lambda t: 35/32 * (1.0 - t * t) ** 3 K = lambda t: 35/32 * (1.0 - t * t) ** 3
I = lambda t: 35/32 * (-1/7*t**7 + 3/5*t**5 - t**3 + t) + 1/2
support = 1.0 support = 1.0
case 'cosine': case 'cosine':
c1 = pi / 4 c1 = pi / 4
c2 = pi / 2 c2 = pi / 2
K = lambda t: c1 * cos(c2 * t) K = lambda t: c1 * cos(c2 * t)
I = lambda t: 1/2 * sin(c2 * t) + 1/2
support = 1.0 support = 1.0
case _: case _:
@ -952,6 +976,9 @@ def kde(data, h, kernel='normal'):
def pdf(x): def pdf(x):
return sum(K((x - x_i) / h) for x_i in data) / (n * h) return sum(K((x - x_i) / h) for x_i in data) / (n * h)
def cdf(x):
return sum(I((x - x_i) / h) for x_i in data) / n
else: else:
sample = sorted(data) sample = sorted(data)
@ -963,8 +990,18 @@ def kde(data, h, kernel='normal'):
supported = sample[i : j] supported = sample[i : j]
return sum(K((x - x_i) / h) for x_i in supported) / (n * h) return sum(K((x - x_i) / h) for x_i in supported) / (n * h)
pdf.__doc__ = f'PDF estimate with {h=!r} and {kernel=!r}' def cdf(x):
i = bisect_left(sample, x - bandwidth)
j = bisect_right(sample, x + bandwidth)
supported = sample[i : j]
return sum((I((x - x_i) / h) for x_i in supported), i) / n
if cumulative:
cdf.__doc__ = f'CDF estimate with {h=!r} and {kernel=!r}'
return cdf
else:
pdf.__doc__ = f'PDF estimate with {h=!r} and {kernel=!r}'
return pdf return pdf

View File

@ -2379,6 +2379,18 @@ class TestKDE(unittest.TestCase):
area = integrate(f_hat, -20, 20) area = integrate(f_hat, -20, 20)
self.assertAlmostEqual(area, 1.0, places=4) self.assertAlmostEqual(area, 1.0, places=4)
# Check CDF against an integral of the PDF
data = [3, 5, 10, 12]
h = 2.3
x = 10.5
for kernel in kernels:
with self.subTest(kernel=kernel):
cdf = kde(data, h, kernel, cumulative=True)
f_hat = kde(data, h, kernel)
area = integrate(f_hat, -20, x, 100_000)
self.assertAlmostEqual(cdf(x), area, places=4)
# Check error cases # Check error cases
with self.assertRaises(StatisticsError): with self.assertRaises(StatisticsError):
@ -2395,6 +2407,8 @@ class TestKDE(unittest.TestCase):
kde(sample, h='str') # Wrong bandwidth type kde(sample, h='str') # Wrong bandwidth type
with self.assertRaises(StatisticsError): with self.assertRaises(StatisticsError):
kde(sample, h=1.0, kernel='bogus') # Invalid kernel kde(sample, h=1.0, kernel='bogus') # Invalid kernel
with self.assertRaises(TypeError):
kde(sample, 1.0, 'gauss', True) # Positional cumulative argument
# Test name and docstring of the generated function # Test name and docstring of the generated function
@ -2403,7 +2417,7 @@ class TestKDE(unittest.TestCase):
f_hat = kde(sample, h, kernel) f_hat = kde(sample, h, kernel)
self.assertEqual(f_hat.__name__, 'pdf') self.assertEqual(f_hat.__name__, 'pdf')
self.assertIn(kernel, f_hat.__doc__) self.assertIn(kernel, f_hat.__doc__)
self.assertIn(str(h), f_hat.__doc__) self.assertIn(repr(h), f_hat.__doc__)
# Test closed interval for the support boundaries. # Test closed interval for the support boundaries.
# In particular, 'uniform' should non-zero at the boundaries. # In particular, 'uniform' should non-zero at the boundaries.