|
"""Nearest Neighbor Regression.""" |
|
|
|
|
|
|
|
|
|
import warnings |
|
|
|
import numpy as np |
|
|
|
from ..base import RegressorMixin, _fit_context |
|
from ..metrics import DistanceMetric |
|
from ..utils._param_validation import StrOptions |
|
from ._base import KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin, _get_weights |
|
|
|
|
|
class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase): |
|
"""Regression based on k-nearest neighbors. |
|
|
|
The target is predicted by local interpolation of the targets |
|
associated of the nearest neighbors in the training set. |
|
|
|
Read more in the :ref:`User Guide <regression>`. |
|
|
|
.. versionadded:: 0.9 |
|
|
|
Parameters |
|
---------- |
|
n_neighbors : int, default=5 |
|
Number of neighbors to use by default for :meth:`kneighbors` queries. |
|
|
|
weights : {'uniform', 'distance'}, callable or None, default='uniform' |
|
Weight function used in prediction. Possible values: |
|
|
|
- 'uniform' : uniform weights. All points in each neighborhood |
|
are weighted equally. |
|
- 'distance' : weight points by the inverse of their distance. |
|
in this case, closer neighbors of a query point will have a |
|
greater influence than neighbors which are further away. |
|
- [callable] : a user-defined function which accepts an |
|
array of distances, and returns an array of the same shape |
|
containing the weights. |
|
|
|
Uniform weights are used by default. |
|
|
|
See the following example for a demonstration of the impact of |
|
different weighting schemes on predictions: |
|
:ref:`sphx_glr_auto_examples_neighbors_plot_regression.py`. |
|
|
|
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' |
|
Algorithm used to compute the nearest neighbors: |
|
|
|
- 'ball_tree' will use :class:`BallTree` |
|
- 'kd_tree' will use :class:`KDTree` |
|
- 'brute' will use a brute-force search. |
|
- 'auto' will attempt to decide the most appropriate algorithm |
|
based on the values passed to :meth:`fit` method. |
|
|
|
Note: fitting on sparse input will override the setting of |
|
this parameter, using brute force. |
|
|
|
leaf_size : int, default=30 |
|
Leaf size passed to BallTree or KDTree. This can affect the |
|
speed of the construction and query, as well as the memory |
|
required to store the tree. The optimal value depends on the |
|
nature of the problem. |
|
|
|
p : float, default=2 |
|
Power parameter for the Minkowski metric. When p = 1, this is |
|
equivalent to using manhattan_distance (l1), and euclidean_distance |
|
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. |
|
|
|
metric : str, DistanceMetric object or callable, default='minkowski' |
|
Metric to use for distance computation. Default is "minkowski", which |
|
results in the standard Euclidean distance when p = 2. See the |
|
documentation of `scipy.spatial.distance |
|
<https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and |
|
the metrics listed in |
|
:class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric |
|
values. |
|
|
|
If metric is "precomputed", X is assumed to be a distance matrix and |
|
must be square during fit. X may be a :term:`sparse graph`, in which |
|
case only "nonzero" elements may be considered neighbors. |
|
|
|
If metric is a callable function, it takes two arrays representing 1D |
|
vectors as inputs and must return one value indicating the distance |
|
between those vectors. This works for Scipy's metrics, but is less |
|
efficient than passing the metric name as a string. |
|
|
|
If metric is a DistanceMetric object, it will be passed directly to |
|
the underlying computation routines. |
|
|
|
metric_params : dict, default=None |
|
Additional keyword arguments for the metric function. |
|
|
|
n_jobs : int, default=None |
|
The number of parallel jobs to run for neighbors search. |
|
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. |
|
``-1`` means using all processors. See :term:`Glossary <n_jobs>` |
|
for more details. |
|
Doesn't affect :meth:`fit` method. |
|
|
|
Attributes |
|
---------- |
|
effective_metric_ : str or callable |
|
The distance metric to use. It will be same as the `metric` parameter |
|
or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to |
|
'minkowski' and `p` parameter set to 2. |
|
|
|
effective_metric_params_ : dict |
|
Additional keyword arguments for the metric function. For most metrics |
|
will be same with `metric_params` parameter, but may also contain the |
|
`p` parameter value if the `effective_metric_` attribute is set to |
|
'minkowski'. |
|
|
|
n_features_in_ : int |
|
Number of features seen during :term:`fit`. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
feature_names_in_ : ndarray of shape (`n_features_in_`,) |
|
Names of features seen during :term:`fit`. Defined only when `X` |
|
has feature names that are all strings. |
|
|
|
.. versionadded:: 1.0 |
|
|
|
n_samples_fit_ : int |
|
Number of samples in the fitted data. |
|
|
|
See Also |
|
-------- |
|
NearestNeighbors : Unsupervised learner for implementing neighbor searches. |
|
RadiusNeighborsRegressor : Regression based on neighbors within a fixed radius. |
|
KNeighborsClassifier : Classifier implementing the k-nearest neighbors vote. |
|
RadiusNeighborsClassifier : Classifier implementing |
|
a vote among neighbors within a given radius. |
|
|
|
Notes |
|
----- |
|
See :ref:`Nearest Neighbors <neighbors>` in the online documentation |
|
for a discussion of the choice of ``algorithm`` and ``leaf_size``. |
|
|
|
.. warning:: |
|
|
|
Regarding the Nearest Neighbors algorithms, if it is found that two |
|
neighbors, neighbor `k+1` and `k`, have identical distances but |
|
different labels, the results will depend on the ordering of the |
|
training data. |
|
|
|
https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm |
|
|
|
Examples |
|
-------- |
|
>>> X = [[0], [1], [2], [3]] |
|
>>> y = [0, 0, 1, 1] |
|
>>> from sklearn.neighbors import KNeighborsRegressor |
|
>>> neigh = KNeighborsRegressor(n_neighbors=2) |
|
>>> neigh.fit(X, y) |
|
KNeighborsRegressor(...) |
|
>>> print(neigh.predict([[1.5]])) |
|
[0.5] |
|
""" |
|
|
|
_parameter_constraints: dict = { |
|
**NeighborsBase._parameter_constraints, |
|
"weights": [StrOptions({"uniform", "distance"}), callable, None], |
|
} |
|
_parameter_constraints["metric"].append(DistanceMetric) |
|
_parameter_constraints.pop("radius") |
|
|
|
def __init__( |
|
self, |
|
n_neighbors=5, |
|
*, |
|
weights="uniform", |
|
algorithm="auto", |
|
leaf_size=30, |
|
p=2, |
|
metric="minkowski", |
|
metric_params=None, |
|
n_jobs=None, |
|
): |
|
super().__init__( |
|
n_neighbors=n_neighbors, |
|
algorithm=algorithm, |
|
leaf_size=leaf_size, |
|
metric=metric, |
|
p=p, |
|
metric_params=metric_params, |
|
n_jobs=n_jobs, |
|
) |
|
self.weights = weights |
|
|
|
def __sklearn_tags__(self): |
|
tags = super().__sklearn_tags__() |
|
|
|
tags.input_tags.pairwise = self.metric == "precomputed" |
|
return tags |
|
|
|
@_fit_context( |
|
|
|
prefer_skip_nested_validation=False |
|
) |
|
def fit(self, X, y): |
|
"""Fit the k-nearest neighbors regressor from the training dataset. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ |
|
(n_samples, n_samples) if metric='precomputed' |
|
Training data. |
|
|
|
y : {array-like, sparse matrix} of shape (n_samples,) or \ |
|
(n_samples, n_outputs) |
|
Target values. |
|
|
|
Returns |
|
------- |
|
self : KNeighborsRegressor |
|
The fitted k-nearest neighbors regressor. |
|
""" |
|
return self._fit(X, y) |
|
|
|
def predict(self, X): |
|
"""Predict the target for the provided data. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_queries, n_features), \ |
|
or (n_queries, n_indexed) if metric == 'precomputed', or None |
|
Test samples. If `None`, predictions for all indexed points are |
|
returned; in this case, points are not considered their own |
|
neighbors. |
|
|
|
Returns |
|
------- |
|
y : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=int |
|
Target values. |
|
""" |
|
if self.weights == "uniform": |
|
|
|
|
|
neigh_ind = self.kneighbors(X, return_distance=False) |
|
neigh_dist = None |
|
else: |
|
neigh_dist, neigh_ind = self.kneighbors(X) |
|
|
|
weights = _get_weights(neigh_dist, self.weights) |
|
|
|
_y = self._y |
|
if _y.ndim == 1: |
|
_y = _y.reshape((-1, 1)) |
|
|
|
if weights is None: |
|
y_pred = np.mean(_y[neigh_ind], axis=1) |
|
else: |
|
y_pred = np.empty((neigh_dist.shape[0], _y.shape[1]), dtype=np.float64) |
|
denom = np.sum(weights, axis=1) |
|
|
|
for j in range(_y.shape[1]): |
|
num = np.sum(_y[neigh_ind, j] * weights, axis=1) |
|
y_pred[:, j] = num / denom |
|
|
|
if self._y.ndim == 1: |
|
y_pred = y_pred.ravel() |
|
|
|
return y_pred |
|
|
|
|
|
class RadiusNeighborsRegressor(RadiusNeighborsMixin, RegressorMixin, NeighborsBase): |
|
"""Regression based on neighbors within a fixed radius. |
|
|
|
The target is predicted by local interpolation of the targets |
|
associated of the nearest neighbors in the training set. |
|
|
|
Read more in the :ref:`User Guide <regression>`. |
|
|
|
.. versionadded:: 0.9 |
|
|
|
Parameters |
|
---------- |
|
radius : float, default=1.0 |
|
Range of parameter space to use by default for :meth:`radius_neighbors` |
|
queries. |
|
|
|
weights : {'uniform', 'distance'}, callable or None, default='uniform' |
|
Weight function used in prediction. Possible values: |
|
|
|
- 'uniform' : uniform weights. All points in each neighborhood |
|
are weighted equally. |
|
- 'distance' : weight points by the inverse of their distance. |
|
in this case, closer neighbors of a query point will have a |
|
greater influence than neighbors which are further away. |
|
- [callable] : a user-defined function which accepts an |
|
array of distances, and returns an array of the same shape |
|
containing the weights. |
|
|
|
Uniform weights are used by default. |
|
|
|
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' |
|
Algorithm used to compute the nearest neighbors: |
|
|
|
- 'ball_tree' will use :class:`BallTree` |
|
- 'kd_tree' will use :class:`KDTree` |
|
- 'brute' will use a brute-force search. |
|
- 'auto' will attempt to decide the most appropriate algorithm |
|
based on the values passed to :meth:`fit` method. |
|
|
|
Note: fitting on sparse input will override the setting of |
|
this parameter, using brute force. |
|
|
|
leaf_size : int, default=30 |
|
Leaf size passed to BallTree or KDTree. This can affect the |
|
speed of the construction and query, as well as the memory |
|
required to store the tree. The optimal value depends on the |
|
nature of the problem. |
|
|
|
p : float, default=2 |
|
Power parameter for the Minkowski metric. When p = 1, this is |
|
equivalent to using manhattan_distance (l1), and euclidean_distance |
|
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. |
|
|
|
metric : str or callable, default='minkowski' |
|
Metric to use for distance computation. Default is "minkowski", which |
|
results in the standard Euclidean distance when p = 2. See the |
|
documentation of `scipy.spatial.distance |
|
<https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and |
|
the metrics listed in |
|
:class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric |
|
values. |
|
|
|
If metric is "precomputed", X is assumed to be a distance matrix and |
|
must be square during fit. X may be a :term:`sparse graph`, in which |
|
case only "nonzero" elements may be considered neighbors. |
|
|
|
If metric is a callable function, it takes two arrays representing 1D |
|
vectors as inputs and must return one value indicating the distance |
|
between those vectors. This works for Scipy's metrics, but is less |
|
efficient than passing the metric name as a string. |
|
|
|
metric_params : dict, default=None |
|
Additional keyword arguments for the metric function. |
|
|
|
n_jobs : int, default=None |
|
The number of parallel jobs to run for neighbors search. |
|
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. |
|
``-1`` means using all processors. See :term:`Glossary <n_jobs>` |
|
for more details. |
|
|
|
Attributes |
|
---------- |
|
effective_metric_ : str or callable |
|
The distance metric to use. It will be same as the `metric` parameter |
|
or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to |
|
'minkowski' and `p` parameter set to 2. |
|
|
|
effective_metric_params_ : dict |
|
Additional keyword arguments for the metric function. For most metrics |
|
will be same with `metric_params` parameter, but may also contain the |
|
`p` parameter value if the `effective_metric_` attribute is set to |
|
'minkowski'. |
|
|
|
n_features_in_ : int |
|
Number of features seen during :term:`fit`. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
feature_names_in_ : ndarray of shape (`n_features_in_`,) |
|
Names of features seen during :term:`fit`. Defined only when `X` |
|
has feature names that are all strings. |
|
|
|
.. versionadded:: 1.0 |
|
|
|
n_samples_fit_ : int |
|
Number of samples in the fitted data. |
|
|
|
See Also |
|
-------- |
|
NearestNeighbors : Unsupervised learner for implementing neighbor searches. |
|
KNeighborsRegressor : Regression based on k-nearest neighbors. |
|
KNeighborsClassifier : Classifier based on the k-nearest neighbors. |
|
RadiusNeighborsClassifier : Classifier based on neighbors within a given radius. |
|
|
|
Notes |
|
----- |
|
See :ref:`Nearest Neighbors <neighbors>` in the online documentation |
|
for a discussion of the choice of ``algorithm`` and ``leaf_size``. |
|
|
|
https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm |
|
|
|
Examples |
|
-------- |
|
>>> X = [[0], [1], [2], [3]] |
|
>>> y = [0, 0, 1, 1] |
|
>>> from sklearn.neighbors import RadiusNeighborsRegressor |
|
>>> neigh = RadiusNeighborsRegressor(radius=1.0) |
|
>>> neigh.fit(X, y) |
|
RadiusNeighborsRegressor(...) |
|
>>> print(neigh.predict([[1.5]])) |
|
[0.5] |
|
""" |
|
|
|
_parameter_constraints: dict = { |
|
**NeighborsBase._parameter_constraints, |
|
"weights": [StrOptions({"uniform", "distance"}), callable, None], |
|
} |
|
_parameter_constraints.pop("n_neighbors") |
|
|
|
def __init__( |
|
self, |
|
radius=1.0, |
|
*, |
|
weights="uniform", |
|
algorithm="auto", |
|
leaf_size=30, |
|
p=2, |
|
metric="minkowski", |
|
metric_params=None, |
|
n_jobs=None, |
|
): |
|
super().__init__( |
|
radius=radius, |
|
algorithm=algorithm, |
|
leaf_size=leaf_size, |
|
p=p, |
|
metric=metric, |
|
metric_params=metric_params, |
|
n_jobs=n_jobs, |
|
) |
|
self.weights = weights |
|
|
|
@_fit_context( |
|
|
|
prefer_skip_nested_validation=False |
|
) |
|
def fit(self, X, y): |
|
"""Fit the radius neighbors regressor from the training dataset. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ |
|
(n_samples, n_samples) if metric='precomputed' |
|
Training data. |
|
|
|
y : {array-like, sparse matrix} of shape (n_samples,) or \ |
|
(n_samples, n_outputs) |
|
Target values. |
|
|
|
Returns |
|
------- |
|
self : RadiusNeighborsRegressor |
|
The fitted radius neighbors regressor. |
|
""" |
|
return self._fit(X, y) |
|
|
|
def predict(self, X): |
|
"""Predict the target for the provided data. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_queries, n_features), \ |
|
or (n_queries, n_indexed) if metric == 'precomputed', or None |
|
Test samples. If `None`, predictions for all indexed points are |
|
returned; in this case, points are not considered their own |
|
neighbors. |
|
|
|
Returns |
|
------- |
|
y : ndarray of shape (n_queries,) or (n_queries, n_outputs), \ |
|
dtype=double |
|
Target values. |
|
""" |
|
neigh_dist, neigh_ind = self.radius_neighbors(X) |
|
|
|
weights = _get_weights(neigh_dist, self.weights) |
|
|
|
_y = self._y |
|
if _y.ndim == 1: |
|
_y = _y.reshape((-1, 1)) |
|
|
|
empty_obs = np.full_like(_y[0], np.nan) |
|
|
|
if weights is None: |
|
y_pred = np.array( |
|
[ |
|
np.mean(_y[ind, :], axis=0) if len(ind) else empty_obs |
|
for (i, ind) in enumerate(neigh_ind) |
|
] |
|
) |
|
|
|
else: |
|
y_pred = np.array( |
|
[ |
|
( |
|
np.average(_y[ind, :], axis=0, weights=weights[i]) |
|
if len(ind) |
|
else empty_obs |
|
) |
|
for (i, ind) in enumerate(neigh_ind) |
|
] |
|
) |
|
|
|
if np.any(np.isnan(y_pred)): |
|
empty_warning_msg = ( |
|
"One or more samples have no neighbors " |
|
"within specified radius; predicting NaN." |
|
) |
|
warnings.warn(empty_warning_msg) |
|
|
|
if self._y.ndim == 1: |
|
y_pred = y_pred.ravel() |
|
|
|
return y_pred |
|
|