spam-classifier
/
venv
/lib
/python3.11
/site-packages
/sklearn
/covariance
/_empirical_covariance.py
""" | |
Maximum likelihood covariance estimator. | |
""" | |
# Authors: The scikit-learn developers | |
# SPDX-License-Identifier: BSD-3-Clause | |
# avoid division truncation | |
import warnings | |
import numpy as np | |
from scipy import linalg | |
from sklearn.utils import metadata_routing | |
from .. import config_context | |
from ..base import BaseEstimator, _fit_context | |
from ..metrics.pairwise import pairwise_distances | |
from ..utils import check_array | |
from ..utils._param_validation import validate_params | |
from ..utils.extmath import fast_logdet | |
from ..utils.validation import validate_data | |
def log_likelihood(emp_cov, precision): | |
"""Compute the sample mean of the log_likelihood under a covariance model. | |
Computes the empirical expected log-likelihood, allowing for universal | |
comparison (beyond this software package), and accounts for normalization | |
terms and scaling. | |
Parameters | |
---------- | |
emp_cov : ndarray of shape (n_features, n_features) | |
Maximum Likelihood Estimator of covariance. | |
precision : ndarray of shape (n_features, n_features) | |
The precision matrix of the covariance model to be tested. | |
Returns | |
------- | |
log_likelihood_ : float | |
Sample mean of the log-likelihood. | |
""" | |
p = precision.shape[0] | |
log_likelihood_ = -np.sum(emp_cov * precision) + fast_logdet(precision) | |
log_likelihood_ -= p * np.log(2 * np.pi) | |
log_likelihood_ /= 2.0 | |
return log_likelihood_ | |
def empirical_covariance(X, *, assume_centered=False): | |
"""Compute the Maximum likelihood covariance estimator. | |
Parameters | |
---------- | |
X : ndarray of shape (n_samples, n_features) | |
Data from which to compute the covariance estimate. | |
assume_centered : bool, default=False | |
If `True`, data will not be centered before computation. | |
Useful when working with data whose mean is almost, but not exactly | |
zero. | |
If `False`, data will be centered before computation. | |
Returns | |
------- | |
covariance : ndarray of shape (n_features, n_features) | |
Empirical covariance (Maximum Likelihood Estimator). | |
Examples | |
-------- | |
>>> from sklearn.covariance import empirical_covariance | |
>>> X = [[1,1,1],[1,1,1],[1,1,1], | |
... [0,0,0],[0,0,0],[0,0,0]] | |
>>> empirical_covariance(X) | |
array([[0.25, 0.25, 0.25], | |
[0.25, 0.25, 0.25], | |
[0.25, 0.25, 0.25]]) | |
""" | |
X = check_array(X, ensure_2d=False, ensure_all_finite=False) | |
if X.ndim == 1: | |
X = np.reshape(X, (1, -1)) | |
if X.shape[0] == 1: | |
warnings.warn( | |
"Only one sample available. You may want to reshape your data array" | |
) | |
if assume_centered: | |
covariance = np.dot(X.T, X) / X.shape[0] | |
else: | |
covariance = np.cov(X.T, bias=1) | |
if covariance.ndim == 0: | |
covariance = np.array([[covariance]]) | |
return covariance | |
class EmpiricalCovariance(BaseEstimator): | |
"""Maximum likelihood covariance estimator. | |
Read more in the :ref:`User Guide <covariance>`. | |
Parameters | |
---------- | |
store_precision : bool, default=True | |
Specifies if the estimated precision is stored. | |
assume_centered : bool, default=False | |
If True, data are not centered before computation. | |
Useful when working with data whose mean is almost, but not exactly | |
zero. | |
If False (default), data are centered before computation. | |
Attributes | |
---------- | |
location_ : ndarray of shape (n_features,) | |
Estimated location, i.e. the estimated mean. | |
covariance_ : ndarray of shape (n_features, n_features) | |
Estimated covariance matrix | |
precision_ : ndarray of shape (n_features, n_features) | |
Estimated pseudo-inverse matrix. | |
(stored only if store_precision is True) | |
n_features_in_ : int | |
Number of features seen during :term:`fit`. | |
.. versionadded:: 0.24 | |
feature_names_in_ : ndarray of shape (`n_features_in_`,) | |
Names of features seen during :term:`fit`. Defined only when `X` | |
has feature names that are all strings. | |
.. versionadded:: 1.0 | |
See Also | |
-------- | |
EllipticEnvelope : An object for detecting outliers in | |
a Gaussian distributed dataset. | |
GraphicalLasso : Sparse inverse covariance estimation | |
with an l1-penalized estimator. | |
LedoitWolf : LedoitWolf Estimator. | |
MinCovDet : Minimum Covariance Determinant | |
(robust estimator of covariance). | |
OAS : Oracle Approximating Shrinkage Estimator. | |
ShrunkCovariance : Covariance estimator with shrinkage. | |
Examples | |
-------- | |
>>> import numpy as np | |
>>> from sklearn.covariance import EmpiricalCovariance | |
>>> from sklearn.datasets import make_gaussian_quantiles | |
>>> real_cov = np.array([[.8, .3], | |
... [.3, .4]]) | |
>>> rng = np.random.RandomState(0) | |
>>> X = rng.multivariate_normal(mean=[0, 0], | |
... cov=real_cov, | |
... size=500) | |
>>> cov = EmpiricalCovariance().fit(X) | |
>>> cov.covariance_ | |
array([[0.7569..., 0.2818...], | |
[0.2818..., 0.3928...]]) | |
>>> cov.location_ | |
array([0.0622..., 0.0193...]) | |
""" | |
# X_test should have been called X | |
__metadata_request__score = {"X_test": metadata_routing.UNUSED} | |
_parameter_constraints: dict = { | |
"store_precision": ["boolean"], | |
"assume_centered": ["boolean"], | |
} | |
def __init__(self, *, store_precision=True, assume_centered=False): | |
self.store_precision = store_precision | |
self.assume_centered = assume_centered | |
def _set_covariance(self, covariance): | |
"""Saves the covariance and precision estimates | |
Storage is done accordingly to `self.store_precision`. | |
Precision stored only if invertible. | |
Parameters | |
---------- | |
covariance : array-like of shape (n_features, n_features) | |
Estimated covariance matrix to be stored, and from which precision | |
is computed. | |
""" | |
covariance = check_array(covariance) | |
# set covariance | |
self.covariance_ = covariance | |
# set precision | |
if self.store_precision: | |
self.precision_ = linalg.pinvh(covariance, check_finite=False) | |
else: | |
self.precision_ = None | |
def get_precision(self): | |
"""Getter for the precision matrix. | |
Returns | |
------- | |
precision_ : array-like of shape (n_features, n_features) | |
The precision matrix associated to the current covariance object. | |
""" | |
if self.store_precision: | |
precision = self.precision_ | |
else: | |
precision = linalg.pinvh(self.covariance_, check_finite=False) | |
return precision | |
def fit(self, X, y=None): | |
"""Fit the maximum likelihood covariance estimator to X. | |
Parameters | |
---------- | |
X : array-like of shape (n_samples, n_features) | |
Training data, where `n_samples` is the number of samples and | |
`n_features` is the number of features. | |
y : Ignored | |
Not used, present for API consistency by convention. | |
Returns | |
------- | |
self : object | |
Returns the instance itself. | |
""" | |
X = validate_data(self, X) | |
if self.assume_centered: | |
self.location_ = np.zeros(X.shape[1]) | |
else: | |
self.location_ = X.mean(0) | |
covariance = empirical_covariance(X, assume_centered=self.assume_centered) | |
self._set_covariance(covariance) | |
return self | |
def score(self, X_test, y=None): | |
"""Compute the log-likelihood of `X_test` under the estimated Gaussian model. | |
The Gaussian model is defined by its mean and covariance matrix which are | |
represented respectively by `self.location_` and `self.covariance_`. | |
Parameters | |
---------- | |
X_test : array-like of shape (n_samples, n_features) | |
Test data of which we compute the likelihood, where `n_samples` is | |
the number of samples and `n_features` is the number of features. | |
`X_test` is assumed to be drawn from the same distribution than | |
the data used in fit (including centering). | |
y : Ignored | |
Not used, present for API consistency by convention. | |
Returns | |
------- | |
res : float | |
The log-likelihood of `X_test` with `self.location_` and `self.covariance_` | |
as estimators of the Gaussian model mean and covariance matrix respectively. | |
""" | |
X_test = validate_data(self, X_test, reset=False) | |
# compute empirical covariance of the test set | |
test_cov = empirical_covariance(X_test - self.location_, assume_centered=True) | |
# compute log likelihood | |
res = log_likelihood(test_cov, self.get_precision()) | |
return res | |
def error_norm(self, comp_cov, norm="frobenius", scaling=True, squared=True): | |
"""Compute the Mean Squared Error between two covariance estimators. | |
Parameters | |
---------- | |
comp_cov : array-like of shape (n_features, n_features) | |
The covariance to compare with. | |
norm : {"frobenius", "spectral"}, default="frobenius" | |
The type of norm used to compute the error. Available error types: | |
- 'frobenius' (default): sqrt(tr(A^t.A)) | |
- 'spectral': sqrt(max(eigenvalues(A^t.A)) | |
where A is the error ``(comp_cov - self.covariance_)``. | |
scaling : bool, default=True | |
If True (default), the squared error norm is divided by n_features. | |
If False, the squared error norm is not rescaled. | |
squared : bool, default=True | |
Whether to compute the squared error norm or the error norm. | |
If True (default), the squared error norm is returned. | |
If False, the error norm is returned. | |
Returns | |
------- | |
result : float | |
The Mean Squared Error (in the sense of the Frobenius norm) between | |
`self` and `comp_cov` covariance estimators. | |
""" | |
# compute the error | |
error = comp_cov - self.covariance_ | |
# compute the error norm | |
if norm == "frobenius": | |
squared_norm = np.sum(error**2) | |
elif norm == "spectral": | |
squared_norm = np.amax(linalg.svdvals(np.dot(error.T, error))) | |
else: | |
raise NotImplementedError( | |
"Only spectral and frobenius norms are implemented" | |
) | |
# optionally scale the error norm | |
if scaling: | |
squared_norm = squared_norm / error.shape[0] | |
# finally get either the squared norm or the norm | |
if squared: | |
result = squared_norm | |
else: | |
result = np.sqrt(squared_norm) | |
return result | |
def mahalanobis(self, X): | |
"""Compute the squared Mahalanobis distances of given observations. | |
Parameters | |
---------- | |
X : array-like of shape (n_samples, n_features) | |
The observations, the Mahalanobis distances of the which we | |
compute. Observations are assumed to be drawn from the same | |
distribution than the data used in fit. | |
Returns | |
------- | |
dist : ndarray of shape (n_samples,) | |
Squared Mahalanobis distances of the observations. | |
""" | |
X = validate_data(self, X, reset=False) | |
precision = self.get_precision() | |
with config_context(assume_finite=True): | |
# compute mahalanobis distances | |
dist = pairwise_distances( | |
X, self.location_[np.newaxis, :], metric="mahalanobis", VI=precision | |
) | |
return np.reshape(dist, (len(X),)) ** 2 | |