|
|
|
|
|
|
|
import warnings |
|
from numbers import Real |
|
|
|
import numpy as np |
|
|
|
from ..base import OutlierMixin, _fit_context |
|
from ..utils import check_array |
|
from ..utils._param_validation import Interval, StrOptions |
|
from ..utils.metaestimators import available_if |
|
from ..utils.validation import check_is_fitted |
|
from ._base import KNeighborsMixin, NeighborsBase |
|
|
|
# Public API of this module.
__all__ = ["LocalOutlierFactor"]
|
|
|
|
|
class LocalOutlierFactor(KNeighborsMixin, OutlierMixin, NeighborsBase):
    """Unsupervised Outlier Detection using the Local Outlier Factor (LOF).

    The anomaly score of each sample is called the Local Outlier Factor.
    It measures the local deviation of the density of a given sample with respect
    to its neighbors.
    It is local in that the anomaly score depends on how isolated the object
    is with respect to the surrounding neighborhood.
    More precisely, locality is given by k-nearest neighbors, whose distance
    is used to estimate the local density.
    By comparing the local density of a sample to the local densities of its
    neighbors, one can identify samples that have a substantially lower density
    than their neighbors. These are considered outliers.

    .. versionadded:: 0.19

    Parameters
    ----------
    n_neighbors : int, default=20
        Number of neighbors to use by default for :meth:`kneighbors` queries.
        If n_neighbors is larger than the number of samples provided,
        all samples will be used.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use :class:`BallTree`
        - 'kd_tree' will use :class:`KDTree`
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.

    leaf_size : int, default=30
        Leaf size passed to :class:`BallTree` or :class:`KDTree`. This can
        affect the speed of the construction and query, as well as the memory
        required to store the tree. The optimal value depends on the
        nature of the problem.

    metric : str or callable, default='minkowski'
        Metric to use for distance computation. Default is "minkowski", which
        results in the standard Euclidean distance when p = 2. See the
        documentation of `scipy.spatial.distance
        <https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and
        the metrics listed in
        :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
        values.

        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square during fit. X may be a :term:`sparse graph`, in which
        case only "nonzero" elements may be considered neighbors.

        If metric is a callable function, it takes two arrays representing 1D
        vectors as inputs and must return one value indicating the distance
        between those vectors. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string.

    p : float, default=2
        Parameter for the Minkowski metric from
        :func:`sklearn.metrics.pairwise_distances`. When p = 1, this
        is equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

    contamination : 'auto' or float, default='auto'
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. When fitting this is used to define the
        threshold on the scores of the samples.

        - if 'auto', the threshold is determined as in the
          original paper,
        - if a float, the contamination should be in the range (0, 0.5].

        .. versionchanged:: 0.22
           The default value of ``contamination`` changed from 0.1
           to ``'auto'``.

    novelty : bool, default=False
        By default, LocalOutlierFactor is only meant to be used for outlier
        detection (novelty=False). Set novelty to True if you want to use
        LocalOutlierFactor for novelty detection. In this case be aware that
        you should only use predict, decision_function and score_samples
        on new unseen data and not on the training set; and note that the
        results obtained this way may differ from the standard LOF results.

        .. versionadded:: 0.20

    n_jobs : int, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    negative_outlier_factor_ : ndarray of shape (n_samples,)
        The opposite LOF of the training samples. The higher, the more normal.
        Inliers tend to have a LOF score close to 1
        (``negative_outlier_factor_`` close to -1), while outliers tend to have
        a larger LOF score.

        The local outlier factor (LOF) of a sample captures its
        supposed 'degree of abnormality'.
        It is the average of the ratio of the local reachability density of
        a sample and those of its k-nearest neighbors.

    n_neighbors_ : int
        The actual number of neighbors used for :meth:`kneighbors` queries.

    offset_ : float
        Offset used to obtain binary labels from the raw scores.
        Observations having a negative_outlier_factor smaller than `offset_`
        are detected as abnormal.
        The offset is set to -1.5 (inliers score around -1), except when a
        contamination parameter different than "auto" is provided. In that
        case, the offset is defined in such a way we obtain the expected
        number of outliers in training.

        .. versionadded:: 0.20

    effective_metric_ : str
        The effective metric used for the distance computation.

    effective_metric_params_ : dict
        The effective additional keyword arguments for the metric function.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_samples_fit_ : int
        It is the number of samples in the fitted data.

    See Also
    --------
    sklearn.svm.OneClassSVM: Unsupervised Outlier Detection using
        Support Vector Machine.

    References
    ----------
    .. [1] Breunig, M. M., Kriegel, H. P., Ng, R. T., & Sander, J. (2000, May).
       LOF: identifying density-based local outliers. In ACM sigmod record.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.neighbors import LocalOutlierFactor
    >>> X = [[-1.1], [0.2], [101.1], [0.3]]
    >>> clf = LocalOutlierFactor(n_neighbors=2)
    >>> clf.fit_predict(X)
    array([ 1,  1, -1,  1])
    >>> clf.negative_outlier_factor_
    array([ -0.9821...,  -1.0370..., -73.3697...,  -0.9821...])
    """

    # Parameter validation: inherit the neighbors constraints, add LOF-specific
    # ones, and drop "radius" since LOF is purely k-nearest-neighbors based.
    _parameter_constraints: dict = {
        **NeighborsBase._parameter_constraints,
        "contamination": [
            StrOptions({"auto"}),
            Interval(Real, 0, 0.5, closed="right"),
        ],
        "novelty": ["boolean"],
    }
    _parameter_constraints.pop("radius")

    def __init__(
        self,
        n_neighbors=20,
        *,
        algorithm="auto",
        leaf_size=30,
        metric="minkowski",
        p=2,
        metric_params=None,
        contamination="auto",
        novelty=False,
        n_jobs=None,
    ):
        super().__init__(
            n_neighbors=n_neighbors,
            algorithm=algorithm,
            leaf_size=leaf_size,
            metric=metric,
            p=p,
            metric_params=metric_params,
            n_jobs=n_jobs,
        )
        self.contamination = contamination
        self.novelty = novelty

    def _check_novelty_fit_predict(self):
        """`available_if` guard: `fit_predict` exists only when novelty=False."""
        if self.novelty:
            msg = (
                "fit_predict is not available when novelty=True. Use "
                "novelty=False if you want to predict on the training set."
            )
            raise AttributeError(msg)
        return True

    @available_if(_check_novelty_fit_predict)
    def fit_predict(self, X, y=None):
        """Fit the model to the training set X and return the labels.

        **Not available for novelty detection (when novelty is set to True).**
        Label is 1 for an inlier and -1 for an outlier according to the LOF
        score and the contamination parameter.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The query sample or samples to compute the Local Outlier Factor
            w.r.t. the training samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        is_inlier : ndarray of shape (n_samples,)
            Returns -1 for anomalies/outliers and 1 for inliers.
        """

        # Calling _predict() with no argument scores the training data using
        # the precomputed negative_outlier_factor_ (samples are not their own
        # neighbors), which differs from predict(X) on the same data.
        return self.fit(X)._predict()

    @_fit_context(prefer_skip_nested_validation=False)
    def fit(self, X, y=None):
        """Fit the local outlier factor detector from the training dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
                (n_samples, n_samples) if metric='precomputed'
            Training data.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : LocalOutlierFactor
            The fitted local outlier factor detector.
        """
        self._fit(X)

        n_samples = self.n_samples_fit_
        if self.n_neighbors > n_samples:
            warnings.warn(
                "n_neighbors (%s) is greater than the "
                "total number of samples (%s). n_neighbors "
                "will be set to (n_samples - 1) for estimation."
                % (self.n_neighbors, n_samples)
            )
        # Clamp to [1, n_samples - 1]: a training sample cannot be its own
        # neighbor when scoring the training set.
        self.n_neighbors_ = max(1, min(self.n_neighbors, n_samples - 1))

        self._distances_fit_X_, _neighbors_indices_fit_X_ = self.kneighbors(
            n_neighbors=self.n_neighbors_
        )

        # Keep distances in the same (lower) precision as float32 training
        # data instead of the float64 returned by kneighbors.
        if self._fit_X.dtype == np.float32:
            self._distances_fit_X_ = self._distances_fit_X_.astype(
                self._fit_X.dtype,
                copy=False,
            )

        self._lrd = self._local_reachability_density(
            self._distances_fit_X_, _neighbors_indices_fit_X_
        )

        # LOF of each sample = mean ratio of its neighbors' local reachability
        # density to its own; negated so that higher means more normal.
        lrd_ratios_array = (
            self._lrd[_neighbors_indices_fit_X_] / self._lrd[:, np.newaxis]
        )

        self.negative_outlier_factor_ = -np.mean(lrd_ratios_array, axis=1)

        if self.contamination == "auto":
            # Inliers score around -1 (see `offset_` docstring).
            self.offset_ = -1.5
        else:
            # Choose the threshold so that the expected proportion
            # `contamination` of the training samples falls below it.
            self.offset_ = np.percentile(
                self.negative_outlier_factor_, 100.0 * self.contamination
            )

        # Extremely negative scores signal that duplicated points inflated the
        # local reachability density; warn (message below) rather than fail.
        if np.min(self.negative_outlier_factor_) < -1e7 and not self.novelty:
            warnings.warn(
                "Duplicate values are leading to incorrect results. "
                "Increase the number of neighbors for more accurate results."
            )

        return self

    def _check_novelty_predict(self):
        """`available_if` guard: `predict` exists only when novelty=True."""
        if not self.novelty:
            msg = (
                "predict is not available when novelty=False, use "
                "fit_predict if you want to predict on training data. Use "
                "novelty=True if you want to use LOF for novelty detection "
                "and predict on new unseen data."
            )
            raise AttributeError(msg)
        return True

    @available_if(_check_novelty_predict)
    def predict(self, X=None):
        """Predict the labels (1 inlier, -1 outlier) of X according to LOF.

        **Only available for novelty detection (when novelty is set to True).**
        This method allows to generalize prediction to *new observations* (not
        in the training set). Note that the result of ``clf.fit(X)`` then
        ``clf.predict(X)`` with ``novelty=True`` may differ from the result
        obtained by ``clf.fit_predict(X)`` with ``novelty=False``.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The query sample or samples to compute the Local Outlier Factor
            w.r.t. the training samples.

        Returns
        -------
        is_inlier : ndarray of shape (n_samples,)
            Returns -1 for anomalies/outliers and +1 for inliers.
        """
        return self._predict(X)

    def _predict(self, X=None):
        """Predict the labels (1 inlier, -1 outlier) of X according to LOF.

        If X is None, returns the same as fit_predict(X_train).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features), default=None
            The query sample or samples to compute the Local Outlier Factor
            w.r.t. the training samples. If None, makes prediction on the
            training data without considering them as their own neighbors.

        Returns
        -------
        is_inlier : ndarray of shape (n_samples,)
            Returns -1 for anomalies/outliers and +1 for inliers.
        """
        check_is_fitted(self)

        if X is not None:
            # New data: threshold the shifted scores at 0 (the shift already
            # incorporates offset_, see decision_function).
            shifted_opposite_lof_scores = self.decision_function(X)
            is_inlier = np.ones(shifted_opposite_lof_scores.shape[0], dtype=int)
            is_inlier[shifted_opposite_lof_scores < 0] = -1
        else:
            # Training data: reuse the scores computed in fit and threshold
            # them directly at offset_.
            is_inlier = np.ones(self.n_samples_fit_, dtype=int)
            is_inlier[self.negative_outlier_factor_ < self.offset_] = -1

        return is_inlier

    def _check_novelty_decision_function(self):
        """`available_if` guard: `decision_function` exists only when novelty=True."""
        if not self.novelty:
            msg = (
                "decision_function is not available when novelty=False. "
                "Use novelty=True if you want to use LOF for novelty "
                "detection and compute decision_function for new unseen "
                "data. Note that the opposite LOF of the training samples "
                "is always available by considering the "
                "negative_outlier_factor_ attribute."
            )
            raise AttributeError(msg)
        return True

    @available_if(_check_novelty_decision_function)
    def decision_function(self, X):
        """Shifted opposite of the Local Outlier Factor of X.

        Bigger is better, i.e. large values correspond to inliers.

        **Only available for novelty detection (when novelty is set to True).**
        The shift offset allows a zero threshold for being an outlier.
        The argument X is supposed to contain *new data*: if X contains a
        point from training, it considers the latter in its own neighborhood.
        Also, the samples in X are not considered in the neighborhood of any
        point.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The query sample or samples to compute the Local Outlier Factor
            w.r.t. the training samples.

        Returns
        -------
        shifted_opposite_lof_scores : ndarray of shape (n_samples,)
            The shifted opposite of the Local Outlier Factor of each input
            sample. The lower, the more abnormal. Negative scores represent
            outliers, positive scores represent inliers.
        """
        return self.score_samples(X) - self.offset_

    def _check_novelty_score_samples(self):
        """`available_if` guard: `score_samples` exists only when novelty=True."""
        if not self.novelty:
            msg = (
                "score_samples is not available when novelty=False. The "
                "scores of the training samples are always available "
                "through the negative_outlier_factor_ attribute. Use "
                "novelty=True if you want to use LOF for novelty detection "
                "and compute score_samples for new unseen data."
            )
            raise AttributeError(msg)
        return True

    @available_if(_check_novelty_score_samples)
    def score_samples(self, X):
        """Opposite of the Local Outlier Factor of X.

        It is the opposite as bigger is better, i.e. large values correspond
        to inliers.

        **Only available for novelty detection (when novelty is set to True).**
        The argument X is supposed to contain *new data*: if X contains a
        point from training, it considers the latter in its own neighborhood.
        Also, the samples in X are not considered in the neighborhood of any
        point. Because of this, the scores obtained via ``score_samples`` may
        differ from the standard LOF scores.
        The standard LOF scores for the training data is available via the
        ``negative_outlier_factor_`` attribute.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The query sample or samples to compute the Local Outlier Factor
            w.r.t. the training samples.

        Returns
        -------
        opposite_lof_scores : ndarray of shape (n_samples,)
            The opposite of the Local Outlier Factor of each input sample.
            The lower, the more abnormal.
        """
        check_is_fitted(self)
        X = check_array(X, accept_sparse="csr")

        distances_X, neighbors_indices_X = self.kneighbors(
            X, n_neighbors=self.n_neighbors_
        )

        # Keep distances in the query data's float32 precision, mirroring fit.
        if X.dtype == np.float32:
            distances_X = distances_X.astype(X.dtype, copy=False)

        X_lrd = self._local_reachability_density(
            distances_X,
            neighbors_indices_X,
        )

        # Same LOF formula as in fit, but the neighbors (and their cached
        # LRDs in self._lrd) always come from the training set.
        lrd_ratios_array = self._lrd[neighbors_indices_X] / X_lrd[:, np.newaxis]

        return -np.mean(lrd_ratios_array, axis=1)

    def _local_reachability_density(self, distances_X, neighbors_indices):
        """The local reachability density (LRD)

        The LRD of a sample is the inverse of the average reachability
        distance of its k-nearest neighbors.

        Parameters
        ----------
        distances_X : ndarray of shape (n_queries, self.n_neighbors)
            Distances to the neighbors (in the training samples `self._fit_X`)
            of each query point to compute the LRD.

        neighbors_indices : ndarray of shape (n_queries, self.n_neighbors)
            Neighbors indices (of each query point) among training samples
            self._fit_X.

        Returns
        -------
        local_reachability_density : ndarray of shape (n_queries,)
            The local reachability density of each sample.
        """
        # k-distance of each neighbor: its distance to its own k-th nearest
        # training neighbor, looked up from the distances cached in fit.
        dist_k = self._distances_fit_X_[neighbors_indices, self.n_neighbors_ - 1]
        # reach-dist(a, b) = max(dist(a, b), k-distance(b)), per the LOF paper.
        reach_dist_array = np.maximum(distances_X, dist_k)

        # 1e-10 avoids division by zero when all reachability distances are 0
        # (e.g. more duplicated points than n_neighbors_).
        return 1.0 / (np.mean(reach_dist_array, axis=1) + 1e-10)
|
|