"""
Kernel Density Estimation
-------------------------
"""

import itertools
from numbers import Integral, Real

import numpy as np
from scipy.special import gammainc

from ..base import BaseEstimator, _fit_context
from ..neighbors._base import VALID_METRICS
from ..utils import check_random_state
from ..utils._param_validation import Interval, StrOptions
from ..utils.extmath import row_norms
from ..utils.validation import _check_sample_weight, check_is_fitted, validate_data
from ._ball_tree import BallTree
from ._kd_tree import KDTree

VALID_KERNELS = [
    "gaussian",
    "tophat",
    "epanechnikov",
    "exponential",
    "linear",
    "cosine",
]

TREE_DICT = {"ball_tree": BallTree, "kd_tree": KDTree}


class KernelDensity(BaseEstimator):
    """Kernel Density Estimation.

    Read more in the :ref:`User Guide <kernel_density>`.

    Parameters
    ----------
    bandwidth : float or {"scott", "silverman"}, default=1.0
        The bandwidth of the kernel. If a float, it is used directly as the
        kernel bandwidth. If a string, the bandwidth is estimated with the
        named rule of thumb ('scott' or 'silverman').

    algorithm : {'kd_tree', 'ball_tree', 'auto'}, default='auto'
        The tree algorithm to use.

    kernel : {'gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', \
            'cosine'}, default='gaussian'
        The kernel to use.

    metric : str, default='euclidean'
        Metric to use for distance computation. See the
        documentation of `scipy.spatial.distance
        <https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and
        the metrics listed in
        :func:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
        values.

        Not all metrics are valid with all algorithms: refer to the
        documentation of :class:`BallTree` and :class:`KDTree`. Note that the
        normalization of the density output is correct only for the Euclidean
        distance metric.

    atol : float, default=0
        The desired absolute tolerance of the result. A larger tolerance will
        generally lead to faster execution.

    rtol : float, default=0
        The desired relative tolerance of the result. A larger tolerance will
        generally lead to faster execution.

    breadth_first : bool, default=True
        If true (default), use a breadth-first approach to the problem.
        Otherwise use a depth-first approach.

    leaf_size : int, default=40
        Specify the leaf size of the underlying tree. See :class:`BallTree`
        or :class:`KDTree` for details.

    metric_params : dict, default=None
        Additional parameters to be passed to the tree for use with the
        metric. For more information, see the documentation of
        :class:`BallTree` or :class:`KDTree`.

    Attributes
    ----------
    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    tree_ : ``BinaryTree`` instance
        The tree algorithm for fast generalized N-point problems.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

    bandwidth_ : float
        Value of the bandwidth, given directly by the bandwidth parameter or
        estimated using the 'scott' or 'silverman' method.

        .. versionadded:: 1.0

    See Also
    --------
    sklearn.neighbors.KDTree : K-dimensional tree for fast generalized N-point
        problems.
    sklearn.neighbors.BallTree : Ball tree for fast generalized N-point
        problems.

    Examples
    --------
    Compute a gaussian kernel density estimate with a fixed bandwidth.

    >>> from sklearn.neighbors import KernelDensity
    >>> import numpy as np
    >>> rng = np.random.RandomState(42)
    >>> X = rng.random_sample((100, 3))
    >>> kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(X)
    >>> log_density = kde.score_samples(X[:3])
    >>> log_density
    array([-1.52955942, -1.51462041, -1.60244657])
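
    New samples can be drawn from the fitted model; the values are random,
    so only the (deterministic) output shape is shown here:

    >>> kde.sample(2, random_state=0).shape
    (2, 3)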
    """

    _parameter_constraints: dict = {
        "bandwidth": [
            Interval(Real, 0, None, closed="neither"),
            StrOptions({"scott", "silverman"}),
        ],
        "algorithm": [StrOptions(set(TREE_DICT.keys()) | {"auto"})],
        "kernel": [StrOptions(set(VALID_KERNELS))],
        "metric": [
            StrOptions(
                set(itertools.chain(*[VALID_METRICS[alg] for alg in TREE_DICT.keys()]))
            )
        ],
        "atol": [Interval(Real, 0, None, closed="left")],
        "rtol": [Interval(Real, 0, None, closed="left")],
        "breadth_first": ["boolean"],
        "leaf_size": [Interval(Integral, 1, None, closed="left")],
        "metric_params": [None, dict],
    }

    def __init__(
        self,
        *,
        bandwidth=1.0,
        algorithm="auto",
        kernel="gaussian",
        metric="euclidean",
        atol=0,
        rtol=0,
        breadth_first=True,
        leaf_size=40,
        metric_params=None,
    ):
        self.algorithm = algorithm
        self.bandwidth = bandwidth
        self.kernel = kernel
        self.metric = metric
        self.atol = atol
        self.rtol = rtol
        self.breadth_first = breadth_first
        self.leaf_size = leaf_size
        self.metric_params = metric_params

    def _choose_algorithm(self, algorithm, metric):
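        # Given the algorithm string + metric string, choose the optimal
        # algorithm to compute the result.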
        if algorithm == "auto":
            # use the KD-tree if the metric supports it
            if metric in KDTree.valid_metrics:
                return "kd_tree"
            elif metric in BallTree.valid_metrics:
                return "ball_tree"
        else:
            if metric not in TREE_DICT[algorithm].valid_metrics:
                raise ValueError(
                    "invalid metric for {0}: '{1}'".format(TREE_DICT[algorithm], metric)
                )
            return algorithm

    @_fit_context(prefer_skip_nested_validation=False)
    def fit(self, X, y=None, sample_weight=None):
        """Fit the Kernel Density model on the data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`~sklearn.pipeline.Pipeline`.

        sample_weight : array-like of shape (n_samples,), default=None
            List of sample weights attached to the data X.

            .. versionadded:: 0.20

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        algorithm = self._choose_algorithm(self.algorithm, self.metric)
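
        # Rule-of-thumb bandwidth estimates when `bandwidth` is a string; both
        # scale as n ** (-1 / (d + 4)) for n samples in d dimensions:
        #   'scott':     h = n ** (-1 / (d + 4))
        #   'silverman': h = (n * (d + 2) / 4) ** (-1 / (d + 4))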
        if isinstance(self.bandwidth, str):
            if self.bandwidth == "scott":
                self.bandwidth_ = X.shape[0] ** (-1 / (X.shape[1] + 4))
            elif self.bandwidth == "silverman":
                self.bandwidth_ = (X.shape[0] * (X.shape[1] + 2) / 4) ** (
                    -1 / (X.shape[1] + 4)
                )
        else:
            self.bandwidth_ = self.bandwidth

        X = validate_data(self, X, order="C", dtype=np.float64)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(
                sample_weight, X, dtype=np.float64, ensure_non_negative=True
            )

        kwargs = self.metric_params
        if kwargs is None:
            kwargs = {}
        self.tree_ = TREE_DICT[algorithm](
            X,
            metric=self.metric,
            leaf_size=self.leaf_size,
            sample_weight=sample_weight,
            **kwargs,
        )
        return self

    def score_samples(self, X):
        """Compute the log-likelihood of each sample under the model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            An array of points to query. Last dimension should match dimension
            of training data (n_features).

        Returns
        -------
        density : ndarray of shape (n_samples,)
            Log-likelihood of each sample in `X`. These are normalized to be
            probability densities, so values will be low for high-dimensional
            data.
        """
        check_is_fitted(self)
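        # The tree computes the kernel sum over all training points, which is
        # normalized by N only afterwards (log_density -= np.log(N) below), so
        # the requested absolute tolerance is scaled by N to apply to the
        # unnormalized sum.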
        X = validate_data(self, X, order="C", dtype=np.float64, reset=False)
        if self.tree_.sample_weight is None:
            N = self.tree_.data.shape[0]
        else:
            N = self.tree_.sum_weight
        atol_N = self.atol * N
        log_density = self.tree_.kernel_density(
            X,
            h=self.bandwidth_,
            kernel=self.kernel,
            atol=atol_N,
            rtol=self.rtol,
            breadth_first=self.breadth_first,
            return_log=True,
        )
        log_density -= np.log(N)
        return log_density

    def score(self, X, y=None):
        """Compute the total log-likelihood under the model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`~sklearn.pipeline.Pipeline`.

        Returns
        -------
        logprob : float
            Total log-likelihood of the data in X. This is normalized to be a
            probability density, so the value will be low for high-dimensional
            data.
        """
        return np.sum(self.score_samples(X))

    def sample(self, n_samples=1, random_state=None):
        """Generate random samples from the model.

        Currently, this is implemented only for gaussian and tophat kernels.

        Parameters
        ----------
        n_samples : int, default=1
            Number of samples to generate.

        random_state : int, RandomState instance or None, default=None
            Determines random number generation used to generate
            random samples. Pass an int for reproducible results
            across multiple function calls.
            See :term:`Glossary <random_state>`.

        Returns
        -------
        X : array-like of shape (n_samples, n_features)
            List of samples.
        """
        check_is_fitted(self)

        if self.kernel not in ["gaussian", "tophat"]:
            raise NotImplementedError(
                "sampling is implemented only for the 'gaussian' and 'tophat'"
                " kernels"
            )

        data = np.asarray(self.tree_.data)

        rng = check_random_state(random_state)
        u = rng.uniform(0, 1, size=n_samples)
        if self.tree_.sample_weight is None:
            # uniform sampling of training points
            i = (u * data.shape[0]).astype(np.int64)
        else:
            # weighted sampling via inverse-CDF lookup on cumulative weights
            cumsum_weight = np.cumsum(np.asarray(self.tree_.sample_weight))
            sum_weight = cumsum_weight[-1]
            i = np.searchsorted(cumsum_weight, u * sum_weight)
        if self.kernel == "gaussian":
            return np.atleast_2d(rng.normal(data[i], self.bandwidth_))

        elif self.kernel == "tophat":
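            # Draw from a d-dimensional standard normal, then rescale each
            # point radially using the regularized incomplete gamma function
            # so the results are uniform inside a d-dimensional ball of
            # radius bandwidth_ (the tophat kernel's support).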
            dim = data.shape[1]
            X = rng.normal(size=(n_samples, dim))
            s_sq = row_norms(X, squared=True)
            correction = (
                gammainc(0.5 * dim, 0.5 * s_sq) ** (1.0 / dim)
                * self.bandwidth_
                / np.sqrt(s_sq)
            )
            return data[i] + X * correction[:, np.newaxis]