|
"""Nearest Neighbor Classification""" |
|
|
|
|
|
|
|
|
|
import warnings |
|
from numbers import Integral |
|
|
|
import numpy as np |
|
|
|
from sklearn.neighbors._base import _check_precomputed |
|
|
|
from ..base import ClassifierMixin, _fit_context |
|
from ..metrics._pairwise_distances_reduction import ( |
|
ArgKminClassMode, |
|
RadiusNeighborsClassMode, |
|
) |
|
from ..utils._param_validation import StrOptions |
|
from ..utils.arrayfuncs import _all_with_any_reduction_axis_1 |
|
from ..utils.extmath import weighted_mode |
|
from ..utils.fixes import _mode |
|
from ..utils.validation import ( |
|
_is_arraylike, |
|
_num_samples, |
|
check_is_fitted, |
|
validate_data, |
|
) |
|
from ._base import KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin, _get_weights |
|
|
|
|
|
def _adjusted_metric(metric, metric_kwargs, p=None): |
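    """Specialize a ``(metric, p)`` pair into the concrete metric setup.

    A minimal illustration of the mapping implemented below:

    >>> _adjusted_metric("minkowski", None, p=2)
    ('euclidean', {'p': 2})
    >>> _adjusted_metric("minkowski", None, p=1)
    ('minkowski', {'p': 1})
    """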
|
    metric_kwargs = metric_kwargs or {}
    if metric == "minkowski":
        metric_kwargs["p"] = p
        if p == 2:
            metric = "euclidean"
    return metric, metric_kwargs
|
|
|
|
|
class KNeighborsClassifier(KNeighborsMixin, ClassifierMixin, NeighborsBase): |
|
"""Classifier implementing the k-nearest neighbors vote. |
|
|
|
Read more in the :ref:`User Guide <classification>`. |
|
|
|
Parameters |
|
---------- |
|
n_neighbors : int, default=5 |
|
Number of neighbors to use by default for :meth:`kneighbors` queries. |
|
|
|
weights : {'uniform', 'distance'}, callable or None, default='uniform' |
|
Weight function used in prediction. Possible values: |
|
|
|
- 'uniform' : uniform weights. All points in each neighborhood |
|
are weighted equally. |
|
- 'distance' : weight points by the inverse of their distance. |
|
          In this case, closer neighbors of a query point will have a
|
greater influence than neighbors which are further away. |
|
- [callable] : a user-defined function which accepts an |
|
array of distances, and returns an array of the same shape |
|
containing the weights. |
|
|
|
Refer to the example entitled |
|
:ref:`sphx_glr_auto_examples_neighbors_plot_classification.py` |
|
showing the impact of the `weights` parameter on the decision |
|
boundary. |
|
|
|
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' |
|
Algorithm used to compute the nearest neighbors: |
|
|
|
- 'ball_tree' will use :class:`BallTree` |
|
- 'kd_tree' will use :class:`KDTree` |
|
- 'brute' will use a brute-force search. |
|
- 'auto' will attempt to decide the most appropriate algorithm |
|
based on the values passed to :meth:`fit` method. |
|
|
|
Note: fitting on sparse input will override the setting of |
|
this parameter, using brute force. |
|
|
|
leaf_size : int, default=30 |
|
Leaf size passed to BallTree or KDTree. This can affect the |
|
speed of the construction and query, as well as the memory |
|
required to store the tree. The optimal value depends on the |
|
nature of the problem. |
|
|
|
p : float, default=2 |
|
Power parameter for the Minkowski metric. When p = 1, this is equivalent |
|
to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. |
|
For arbitrary p, minkowski_distance (l_p) is used. This parameter is expected |
|
to be positive. |
|
|
|
metric : str or callable, default='minkowski' |
|
Metric to use for distance computation. Default is "minkowski", which |
|
results in the standard Euclidean distance when p = 2. See the |
|
documentation of `scipy.spatial.distance |
|
<https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and |
|
the metrics listed in |
|
:class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric |
|
values. |
|
|
|
If metric is "precomputed", X is assumed to be a distance matrix and |
|
must be square during fit. X may be a :term:`sparse graph`, in which |
|
case only "nonzero" elements may be considered neighbors. |
|
|
|
If metric is a callable function, it takes two arrays representing 1D |
|
vectors as inputs and must return one value indicating the distance |
|
between those vectors. This works for Scipy's metrics, but is less |
|
efficient than passing the metric name as a string. |
|
|
|
metric_params : dict, default=None |
|
Additional keyword arguments for the metric function. |
|
|
|
n_jobs : int, default=None |
|
The number of parallel jobs to run for neighbors search. |
|
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. |
|
``-1`` means using all processors. See :term:`Glossary <n_jobs>` |
|
for more details. |
|
Doesn't affect :meth:`fit` method. |
|
|
|
Attributes |
|
---------- |
|
classes_ : array of shape (n_classes,) |
|
        Class labels known to the classifier.
|
|
|
    effective_metric_ : str or callable
        The distance metric used. It will be the same as the `metric`
        parameter or a synonym of it, e.g. 'euclidean' if the `metric`
        parameter is set to 'minkowski' and the `p` parameter to 2.
|
|
|
    effective_metric_params_ : dict
        Additional keyword arguments for the metric function. For most
        metrics it will be the same as the `metric_params` parameter, but it
        may also contain the `p` parameter value if the `effective_metric_`
        attribute is set to 'minkowski'.
|
|
|
n_features_in_ : int |
|
Number of features seen during :term:`fit`. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
feature_names_in_ : ndarray of shape (`n_features_in_`,) |
|
Names of features seen during :term:`fit`. Defined only when `X` |
|
has feature names that are all strings. |
|
|
|
.. versionadded:: 1.0 |
|
|
|
n_samples_fit_ : int |
|
Number of samples in the fitted data. |
|
|
|
    outputs_2d_ : bool
        False when `y`'s shape is (n_samples,) or (n_samples, 1) during fit,
        otherwise True.
|
|
|
See Also |
|
-------- |
|
RadiusNeighborsClassifier: Classifier based on neighbors within a fixed radius. |
|
KNeighborsRegressor: Regression based on k-nearest neighbors. |
|
RadiusNeighborsRegressor: Regression based on neighbors within a fixed radius. |
|
NearestNeighbors: Unsupervised learner for implementing neighbor searches. |
|
|
|
Notes |
|
----- |
|
See :ref:`Nearest Neighbors <neighbors>` in the online documentation |
|
for a discussion of the choice of ``algorithm`` and ``leaf_size``. |
|
|
|
.. warning:: |
|
|
|
       Regarding the Nearest Neighbors algorithms, if two neighbors `k` and
       `k+1` have identical distances but different labels, the result will
       depend on the ordering of the training data.
|
|
|
https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm |
|
|
|
Examples |
|
-------- |
|
>>> X = [[0], [1], [2], [3]] |
|
>>> y = [0, 0, 1, 1] |
|
>>> from sklearn.neighbors import KNeighborsClassifier |
|
>>> neigh = KNeighborsClassifier(n_neighbors=3) |
|
>>> neigh.fit(X, y) |
|
KNeighborsClassifier(...) |
|
>>> print(neigh.predict([[1.1]])) |
|
[0] |
|
>>> print(neigh.predict_proba([[0.9]])) |
|
[[0.666... 0.333...]] |
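
    A user-defined callable may also be passed as ``weights``; the
    ``inverse_square`` function below is purely illustrative and not part
    of the API:

    >>> def inverse_square(dist):
    ...     return 1.0 / (dist ** 2 + 1e-12)
    >>> clf = KNeighborsClassifier(n_neighbors=3, weights=inverse_square)
    >>> print(clf.fit(X, y).predict([[1.1]]))
    [0]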
|
""" |
|
|
|
    # Reuse the common neighbors constraints, drop `radius` (irrelevant for
    # k-NN queries), and add the `weights` option.
    _parameter_constraints: dict = {**NeighborsBase._parameter_constraints}
    _parameter_constraints.pop("radius")
    _parameter_constraints.update(
        {"weights": [StrOptions({"uniform", "distance"}), callable, None]}
    )
|
|
|
def __init__( |
|
self, |
|
n_neighbors=5, |
|
*, |
|
weights="uniform", |
|
algorithm="auto", |
|
leaf_size=30, |
|
p=2, |
|
metric="minkowski", |
|
metric_params=None, |
|
n_jobs=None, |
|
): |
|
super().__init__( |
|
n_neighbors=n_neighbors, |
|
algorithm=algorithm, |
|
leaf_size=leaf_size, |
|
metric=metric, |
|
p=p, |
|
metric_params=metric_params, |
|
n_jobs=n_jobs, |
|
) |
|
self.weights = weights |
|
|
|
    @_fit_context(prefer_skip_nested_validation=False)
|
def fit(self, X, y): |
|
"""Fit the k-nearest neighbors classifier from the training dataset. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ |
|
(n_samples, n_samples) if metric='precomputed' |
|
Training data. |
|
|
|
y : {array-like, sparse matrix} of shape (n_samples,) or \ |
|
(n_samples, n_outputs) |
|
Target values. |
|
|
|
Returns |
|
------- |
|
self : KNeighborsClassifier |
|
The fitted k-nearest neighbors classifier. |
|
""" |
|
return self._fit(X, y) |
|
|
|
def predict(self, X): |
|
"""Predict the class labels for the provided data. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_queries, n_features), \ |
|
or (n_queries, n_indexed) if metric == 'precomputed', or None |
|
Test samples. If `None`, predictions for all indexed points are |
|
returned; in this case, points are not considered their own |
|
neighbors. |
|
|
|
Returns |
|
------- |
|
y : ndarray of shape (n_queries,) or (n_queries, n_outputs) |
|
Class labels for each data sample. |
|
""" |
|
check_is_fitted(self, "_fit_method") |
|
if self.weights == "uniform": |
|
if self._fit_method == "brute" and ArgKminClassMode.is_usable_for( |
|
X, self._fit_X, self.metric |
|
): |
|
probabilities = self.predict_proba(X) |
|
if self.outputs_2d_: |
|
return np.stack( |
|
[ |
|
self.classes_[idx][np.argmax(probas, axis=1)] |
|
for idx, probas in enumerate(probabilities) |
|
], |
|
axis=1, |
|
) |
|
                return self.classes_[np.argmax(probabilities, axis=1)]

            # Uniform weighting does not need the distances, so skip
            # computing them.
            neigh_ind = self.kneighbors(X, return_distance=False)
|
neigh_dist = None |
|
else: |
|
neigh_dist, neigh_ind = self.kneighbors(X) |
|
|
|
classes_ = self.classes_ |
|
_y = self._y |
|
if not self.outputs_2d_: |
|
_y = self._y.reshape((-1, 1)) |
|
classes_ = [self.classes_] |
|
|
|
n_outputs = len(classes_) |
|
n_queries = _num_samples(self._fit_X if X is None else X) |
|
weights = _get_weights(neigh_dist, self.weights) |
|
if weights is not None and _all_with_any_reduction_axis_1(weights, value=0): |
|
            raise ValueError(
                "All neighbors of some sample have zero weight. "
                "Please modify 'weights' to avoid this case if you are "
                "using a user-defined function."
            )
|
|
|
y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype) |
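        # For each output, take the (weighted) mode of the neighbors' labels
        # and map it back through `classes_k` to the original class values.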
|
for k, classes_k in enumerate(classes_): |
|
if weights is None: |
|
mode, _ = _mode(_y[neigh_ind, k], axis=1) |
|
else: |
|
mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1) |
|
|
|
mode = np.asarray(mode.ravel(), dtype=np.intp) |
|
y_pred[:, k] = classes_k.take(mode) |
|
|
|
if not self.outputs_2d_: |
|
y_pred = y_pred.ravel() |
|
|
|
return y_pred |
|
|
|
def predict_proba(self, X): |
|
"""Return probability estimates for the test data X. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_queries, n_features), \ |
|
or (n_queries, n_indexed) if metric == 'precomputed', or None |
|
Test samples. If `None`, predictions for all indexed points are |
|
returned; in this case, points are not considered their own |
|
neighbors. |
|
|
|
Returns |
|
------- |
|
p : ndarray of shape (n_queries, n_classes), or a list of n_outputs \ |
|
of such arrays if n_outputs > 1. |
|
The class probabilities of the input samples. Classes are ordered |
|
by lexicographic order. |
|
""" |
|
check_is_fitted(self, "_fit_method") |
|
if self.weights == "uniform": |
|
|
|
|
|
metric, metric_kwargs = _adjusted_metric( |
|
metric=self.metric, metric_kwargs=self.metric_params, p=self.p |
|
) |
|
            if (
                self._fit_method == "brute"
                and ArgKminClassMode.is_usable_for(X, self._fit_X, metric)
                # The fast path is limited to single-output targets.
                and not self.outputs_2d_
            ):
|
if self.metric == "precomputed": |
|
X = _check_precomputed(X) |
|
else: |
|
X = validate_data( |
|
self, X, accept_sparse="csr", reset=False, order="C" |
|
) |
|
|
|
                probabilities = ArgKminClassMode.compute(
                    X,
                    self._fit_X,
                    k=self.n_neighbors,
                    weights=self.weights,
                    Y_labels=self._y,
                    unique_Y_labels=self.classes_,
                    metric=metric,
                    metric_kwargs=metric_kwargs,
                    # Parallelize the reduction over the query points.
                    strategy="parallel_on_X",
                )
|
                return probabilities

            # Uniform weighting does not need the distances, so skip
            # computing them.
            neigh_ind = self.kneighbors(X, return_distance=False)
|
neigh_dist = None |
|
else: |
|
neigh_dist, neigh_ind = self.kneighbors(X) |
|
|
|
classes_ = self.classes_ |
|
_y = self._y |
|
if not self.outputs_2d_: |
|
_y = self._y.reshape((-1, 1)) |
|
classes_ = [self.classes_] |
|
|
|
n_queries = _num_samples(self._fit_X if X is None else X) |
|
|
|
weights = _get_weights(neigh_dist, self.weights) |
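        # Uniform weighting corresponds to a weight of one per neighbor.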
|
if weights is None: |
|
weights = np.ones_like(neigh_ind) |
|
elif _all_with_any_reduction_axis_1(weights, value=0): |
|
            raise ValueError(
                "All neighbors of some sample have zero weight. "
                "Please modify 'weights' to avoid this case if you are "
                "using a user-defined function."
            )
|
|
|
all_rows = np.arange(n_queries) |
|
probabilities = [] |
|
        for k, classes_k in enumerate(classes_):
            pred_labels = _y[:, k][neigh_ind]
            proba_k = np.zeros((n_queries, classes_k.size))

            # Accumulate each neighbor's weight into the column of its class,
            # one neighbor rank at a time.
            for i, idx in enumerate(pred_labels.T):
                proba_k[all_rows, idx] += weights[:, i]
|
|
|
|
|
            # Normalize the accumulated weights into probabilities.
            normalizer = proba_k.sum(axis=1)[:, np.newaxis]
            proba_k /= normalizer
|
|
|
probabilities.append(proba_k) |
|
|
|
if not self.outputs_2d_: |
|
probabilities = probabilities[0] |
|
|
|
return probabilities |
|
|
|
|
|
|
|
def score(self, X, y, sample_weight=None): |
|
""" |
|
Return the mean accuracy on the given test data and labels. |
|
|
|
        In multi-label classification, this is the subset accuracy,
        which is a harsh metric since it requires, for each sample,
        that every label set be correctly predicted.
|
|
|
Parameters |
|
---------- |
|
X : array-like of shape (n_samples, n_features), or None |
|
Test samples. If `None`, predictions for all indexed points are |
|
used; in this case, points are not considered their own |
|
neighbors. This means that `knn.fit(X, y).score(None, y)` |
|
implicitly performs a leave-one-out cross-validation procedure |
|
and is equivalent to `cross_val_score(knn, X, y, cv=LeaveOneOut())` |
|
but typically much faster. |
|
|
|
y : array-like of shape (n_samples,) or (n_samples, n_outputs) |
|
True labels for `X`. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
Returns |
|
------- |
|
score : float |
|
Mean accuracy of ``self.predict(X)`` w.r.t. `y`. |
|
""" |
|
return super().score(X, y, sample_weight) |
|
|
|
def __sklearn_tags__(self): |
|
tags = super().__sklearn_tags__() |
|
tags.classifier_tags.multi_label = True |
|
tags.input_tags.pairwise = self.metric == "precomputed" |
|
return tags |
|
|
|
|
|
class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, NeighborsBase): |
|
"""Classifier implementing a vote among neighbors within a given radius. |
|
|
|
Read more in the :ref:`User Guide <classification>`. |
|
|
|
Parameters |
|
---------- |
|
radius : float, default=1.0 |
|
Range of parameter space to use by default for :meth:`radius_neighbors` |
|
queries. |
|
|
|
weights : {'uniform', 'distance'}, callable or None, default='uniform' |
|
Weight function used in prediction. Possible values: |
|
|
|
- 'uniform' : uniform weights. All points in each neighborhood |
|
are weighted equally. |
|
- 'distance' : weight points by the inverse of their distance. |
|
          In this case, closer neighbors of a query point will have a
|
greater influence than neighbors which are further away. |
|
- [callable] : a user-defined function which accepts an |
|
array of distances, and returns an array of the same shape |
|
containing the weights. |
|
|
|
Uniform weights are used by default. |
|
|
|
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' |
|
Algorithm used to compute the nearest neighbors: |
|
|
|
- 'ball_tree' will use :class:`BallTree` |
|
- 'kd_tree' will use :class:`KDTree` |
|
- 'brute' will use a brute-force search. |
|
- 'auto' will attempt to decide the most appropriate algorithm |
|
based on the values passed to :meth:`fit` method. |
|
|
|
Note: fitting on sparse input will override the setting of |
|
this parameter, using brute force. |
|
|
|
leaf_size : int, default=30 |
|
Leaf size passed to BallTree or KDTree. This can affect the |
|
speed of the construction and query, as well as the memory |
|
required to store the tree. The optimal value depends on the |
|
nature of the problem. |
|
|
|
p : float, default=2 |
|
Power parameter for the Minkowski metric. When p = 1, this is |
|
equivalent to using manhattan_distance (l1), and euclidean_distance |
|
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. |
|
This parameter is expected to be positive. |
|
|
|
metric : str or callable, default='minkowski' |
|
Metric to use for distance computation. Default is "minkowski", which |
|
results in the standard Euclidean distance when p = 2. See the |
|
documentation of `scipy.spatial.distance |
|
<https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and |
|
the metrics listed in |
|
:class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric |
|
values. |
|
|
|
If metric is "precomputed", X is assumed to be a distance matrix and |
|
must be square during fit. X may be a :term:`sparse graph`, in which |
|
case only "nonzero" elements may be considered neighbors. |
|
|
|
If metric is a callable function, it takes two arrays representing 1D |
|
vectors as inputs and must return one value indicating the distance |
|
between those vectors. This works for Scipy's metrics, but is less |
|
efficient than passing the metric name as a string. |
|
|
|
    outlier_label : {manual label, 'most_frequent'}, default=None
        Label for outlier samples (samples with no neighbors in the given
        radius).

        - manual label: str or int label (should be the same type as y)
          or list of manual labels if multi-output is used.
        - 'most_frequent' : assign the most frequent label of y to outliers.
        - None : when any outlier is detected, a ValueError will be raised.

        The outlier label should be selected from among the unique labels of
        `y`. If it is set to a value not among them, a warning is raised and
        all class probabilities of outliers are set to 0.
|
|
|
metric_params : dict, default=None |
|
Additional keyword arguments for the metric function. |
|
|
|
n_jobs : int, default=None |
|
The number of parallel jobs to run for neighbors search. |
|
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. |
|
``-1`` means using all processors. See :term:`Glossary <n_jobs>` |
|
for more details. |
|
|
|
Attributes |
|
---------- |
|
classes_ : ndarray of shape (n_classes,) |
|
Class labels known to the classifier. |
|
|
|
    effective_metric_ : str or callable
        The distance metric used. It will be the same as the `metric`
        parameter or a synonym of it, e.g. 'euclidean' if the `metric`
        parameter is set to 'minkowski' and the `p` parameter to 2.
|
|
|
    effective_metric_params_ : dict
        Additional keyword arguments for the metric function. For most
        metrics it will be the same as the `metric_params` parameter, but it
        may also contain the `p` parameter value if the `effective_metric_`
        attribute is set to 'minkowski'.
|
|
|
n_features_in_ : int |
|
Number of features seen during :term:`fit`. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
feature_names_in_ : ndarray of shape (`n_features_in_`,) |
|
Names of features seen during :term:`fit`. Defined only when `X` |
|
has feature names that are all strings. |
|
|
|
.. versionadded:: 1.0 |
|
|
|
n_samples_fit_ : int |
|
Number of samples in the fitted data. |
|
|
|
    outlier_label_ : int or array-like of shape (n_class,)
        Label given to outlier samples (samples with no neighbors within the
        given radius).
|
|
|
    outputs_2d_ : bool
        False when `y`'s shape is (n_samples,) or (n_samples, 1) during fit,
        otherwise True.
|
|
|
See Also |
|
-------- |
|
KNeighborsClassifier : Classifier implementing the k-nearest neighbors |
|
vote. |
|
RadiusNeighborsRegressor : Regression based on neighbors within a |
|
fixed radius. |
|
KNeighborsRegressor : Regression based on k-nearest neighbors. |
|
NearestNeighbors : Unsupervised learner for implementing neighbor |
|
searches. |
|
|
|
Notes |
|
----- |
|
See :ref:`Nearest Neighbors <neighbors>` in the online documentation |
|
for a discussion of the choice of ``algorithm`` and ``leaf_size``. |
|
|
|
https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm |
|
|
|
Examples |
|
-------- |
|
>>> X = [[0], [1], [2], [3]] |
|
>>> y = [0, 0, 1, 1] |
|
>>> from sklearn.neighbors import RadiusNeighborsClassifier |
|
>>> neigh = RadiusNeighborsClassifier(radius=1.0) |
|
>>> neigh.fit(X, y) |
|
RadiusNeighborsClassifier(...) |
|
>>> print(neigh.predict([[1.5]])) |
|
[0] |
|
>>> print(neigh.predict_proba([[1.0]])) |
|
[[0.66666667 0.33333333]] |
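
    Queries with no neighbors within the radius are assigned
    ``outlier_label``; a minimal illustration (the label is chosen among the
    training labels here to avoid a warning):

    >>> clf = RadiusNeighborsClassifier(radius=0.5, outlier_label=1)
    >>> print(clf.fit(X, y).predict([[10.0]]))
    [1]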
|
""" |
|
|
|
    _parameter_constraints: dict = {
        **NeighborsBase._parameter_constraints,
        "weights": [StrOptions({"uniform", "distance"}), callable, None],
        "outlier_label": [Integral, str, "array-like", None],
    }
    # `n_neighbors` is irrelevant for radius-based queries.
    _parameter_constraints.pop("n_neighbors")
|
|
|
def __init__( |
|
self, |
|
radius=1.0, |
|
*, |
|
weights="uniform", |
|
algorithm="auto", |
|
leaf_size=30, |
|
p=2, |
|
metric="minkowski", |
|
outlier_label=None, |
|
metric_params=None, |
|
n_jobs=None, |
|
): |
|
super().__init__( |
|
radius=radius, |
|
algorithm=algorithm, |
|
leaf_size=leaf_size, |
|
metric=metric, |
|
p=p, |
|
metric_params=metric_params, |
|
n_jobs=n_jobs, |
|
) |
|
self.weights = weights |
|
self.outlier_label = outlier_label |
|
|
|
    @_fit_context(prefer_skip_nested_validation=False)
|
def fit(self, X, y): |
|
"""Fit the radius neighbors classifier from the training dataset. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ |
|
(n_samples, n_samples) if metric='precomputed' |
|
Training data. |
|
|
|
y : {array-like, sparse matrix} of shape (n_samples,) or \ |
|
(n_samples, n_outputs) |
|
Target values. |
|
|
|
Returns |
|
------- |
|
self : RadiusNeighborsClassifier |
|
The fitted radius neighbors classifier. |
|
""" |
|
self._fit(X, y) |
|
|
|
classes_ = self.classes_ |
|
_y = self._y |
|
if not self.outputs_2d_: |
|
_y = self._y.reshape((-1, 1)) |
|
classes_ = [self.classes_] |
|
|
|
if self.outlier_label is None: |
|
outlier_label_ = None |
|
|
|
elif self.outlier_label == "most_frequent": |
|
outlier_label_ = [] |
|
|
|
|
|
for k, classes_k in enumerate(classes_): |
|
label_count = np.bincount(_y[:, k]) |
|
outlier_label_.append(classes_k[label_count.argmax()]) |
|
|
|
else: |
|
if _is_arraylike(self.outlier_label) and not isinstance( |
|
self.outlier_label, str |
|
): |
|
if len(self.outlier_label) != len(classes_): |
|
raise ValueError( |
|
"The length of outlier_label: {} is " |
|
"inconsistent with the output " |
|
"length: {}".format(self.outlier_label, len(classes_)) |
|
) |
|
outlier_label_ = self.outlier_label |
|
else: |
|
outlier_label_ = [self.outlier_label] * len(classes_) |
|
|
|
            for classes, label in zip(classes_, outlier_label_):
                if _is_arraylike(label) and not isinstance(label, str):
                    # The outlier label for each output must be a scalar.
                    raise TypeError(
                        "The outlier_label of classes {} is "
                        "supposed to be a scalar, got "
                        "{}.".format(classes, label)
                    )
                if np.append(classes, label).dtype != classes.dtype:
                    # The outlier label dtype must be consistent with y.
                    raise TypeError(
                        "The dtype of outlier_label {} is "
                        "inconsistent with classes {} in "
                        "y.".format(label, classes)
                    )
|
|
|
self.outlier_label_ = outlier_label_ |
|
|
|
return self |
|
|
|
def predict(self, X): |
|
"""Predict the class labels for the provided data. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_queries, n_features), \ |
|
or (n_queries, n_indexed) if metric == 'precomputed', or None |
|
Test samples. If `None`, predictions for all indexed points are |
|
returned; in this case, points are not considered their own |
|
neighbors. |
|
|
|
Returns |
|
------- |
|
y : ndarray of shape (n_queries,) or (n_queries, n_outputs) |
|
Class labels for each data sample. |
|
""" |
|
|
|
probs = self.predict_proba(X) |
|
classes_ = self.classes_ |
|
|
|
if not self.outputs_2d_: |
|
probs = [probs] |
|
classes_ = [self.classes_] |
|
|
|
n_outputs = len(classes_) |
|
n_queries = probs[0].shape[0] |
|
y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype) |
|
|
|
        for k, prob in enumerate(probs):
            # The class with the highest probability is the prediction for
            # each query.
            max_prob_index = prob.argmax(axis=1)
            y_pred[:, k] = classes_[k].take(max_prob_index)
|
|
|
outlier_zero_probs = (prob == 0).all(axis=1) |
|
if outlier_zero_probs.any(): |
|
zero_prob_index = np.flatnonzero(outlier_zero_probs) |
|
y_pred[zero_prob_index, k] = self.outlier_label_[k] |
|
|
|
if not self.outputs_2d_: |
|
y_pred = y_pred.ravel() |
|
|
|
return y_pred |
|
|
|
def predict_proba(self, X): |
|
"""Return probability estimates for the test data X. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_queries, n_features), \ |
|
or (n_queries, n_indexed) if metric == 'precomputed', or None |
|
Test samples. If `None`, predictions for all indexed points are |
|
returned; in this case, points are not considered their own |
|
neighbors. |
|
|
|
Returns |
|
------- |
|
p : ndarray of shape (n_queries, n_classes), or a list of \ |
|
n_outputs of such arrays if n_outputs > 1. |
|
The class probabilities of the input samples. Classes are ordered |
|
by lexicographic order. |
|
""" |
|
check_is_fitted(self, "_fit_method") |
|
n_queries = _num_samples(self._fit_X if X is None else X) |
|
|
|
metric, metric_kwargs = _adjusted_metric( |
|
metric=self.metric, metric_kwargs=self.metric_params, p=self.p |
|
) |
|
|
|
if ( |
|
self.weights == "uniform" |
|
and self._fit_method == "brute" |
|
and not self.outputs_2d_ |
|
and RadiusNeighborsClassMode.is_usable_for(X, self._fit_X, metric) |
|
): |
|
            probabilities = RadiusNeighborsClassMode.compute(
                X=X,
                Y=self._fit_X,
                radius=self.radius,
                weights=self.weights,
                Y_labels=self._y,
                unique_Y_labels=self.classes_,
                outlier_label=self.outlier_label,
                metric=metric,
                metric_kwargs=metric_kwargs,
                # Parallelize the reduction over the query points.
                strategy="parallel_on_X",
            )
            return probabilities
|
|
|
neigh_dist, neigh_ind = self.radius_neighbors(X) |
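        # Queries without any neighbor within the radius are flagged as
        # outliers.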
|
outlier_mask = np.zeros(n_queries, dtype=bool) |
|
outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind] |
|
outliers = np.flatnonzero(outlier_mask) |
|
inliers = np.flatnonzero(~outlier_mask) |
|
|
|
classes_ = self.classes_ |
|
_y = self._y |
|
if not self.outputs_2d_: |
|
_y = self._y.reshape((-1, 1)) |
|
classes_ = [self.classes_] |
|
|
|
if self.outlier_label_ is None and outliers.size > 0: |
|
            raise ValueError(
                "No neighbors found for test samples %r; you can try using a "
                "larger radius, giving a label for outliers, or removing "
                "them from your dataset." % outliers
            )
|
|
|
weights = _get_weights(neigh_dist, self.weights) |
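        # Restrict the per-query weight arrays to inlier queries (those with
        # at least one neighbor).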
|
if weights is not None: |
|
weights = weights[inliers] |
|
|
|
probabilities = [] |
|
|
|
for k, classes_k in enumerate(classes_): |
|
pred_labels = np.zeros(len(neigh_ind), dtype=object) |
|
pred_labels[:] = [_y[ind, k] for ind in neigh_ind] |
|
|
|
proba_k = np.zeros((n_queries, classes_k.size)) |
|
            proba_inl = np.zeros((len(inliers), classes_k.size))

            # Queries may have different numbers of neighbors within the
            # radius, so accumulate class votes one query at a time.
            if weights is None:
|
for i, idx in enumerate(pred_labels[inliers]): |
|
proba_inl[i, :] = np.bincount(idx, minlength=classes_k.size) |
|
else: |
|
for i, idx in enumerate(pred_labels[inliers]): |
|
proba_inl[i, :] = np.bincount( |
|
idx, weights[i], minlength=classes_k.size |
|
) |
|
proba_k[inliers, :] = proba_inl |
|
|
|
if outliers.size > 0: |
|
_outlier_label = self.outlier_label_[k] |
|
label_index = np.flatnonzero(classes_k == _outlier_label) |
|
if label_index.size == 1: |
|
proba_k[outliers, label_index[0]] = 1.0 |
|
else: |
|
                    warnings.warn(
                        "Outlier label {} is not in training "
                        "classes. All class probabilities of "
                        "outliers will be set to "
                        "0.".format(self.outlier_label_[k])
                    )
|
|
|
|
|
            # Normalize accumulated votes into probabilities; rows of
            # all-zero votes (unlabeled outliers) are left at 0.
            normalizer = proba_k.sum(axis=1)[:, np.newaxis]
            normalizer[normalizer == 0.0] = 1.0
            proba_k /= normalizer
|
|
|
probabilities.append(proba_k) |
|
|
|
if not self.outputs_2d_: |
|
probabilities = probabilities[0] |
|
|
|
return probabilities |
|
|
|
|
|
|
|
def score(self, X, y, sample_weight=None): |
|
""" |
|
Return the mean accuracy on the given test data and labels. |
|
|
|
        In multi-label classification, this is the subset accuracy,
        which is a harsh metric since it requires, for each sample,
        that every label set be correctly predicted.
|
|
|
Parameters |
|
---------- |
|
X : array-like of shape (n_samples, n_features), or None |
|
Test samples. If `None`, predictions for all indexed points are |
|
used; in this case, points are not considered their own |
|
neighbors. This means that `knn.fit(X, y).score(None, y)` |
|
implicitly performs a leave-one-out cross-validation procedure |
|
and is equivalent to `cross_val_score(knn, X, y, cv=LeaveOneOut())` |
|
but typically much faster. |
|
|
|
y : array-like of shape (n_samples,) or (n_samples, n_outputs) |
|
True labels for `X`. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
Returns |
|
------- |
|
score : float |
|
Mean accuracy of ``self.predict(X)`` w.r.t. `y`. |
|
""" |
|
return super().score(X, y, sample_weight) |
|
|
|
def __sklearn_tags__(self): |
|
tags = super().__sklearn_tags__() |
|
tags.classifier_tags.multi_label = True |
|
return tags |
|
|