File size: 14,905 Bytes
7885a28 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 |
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from numbers import Integral
import numpy as np
from ..base import _fit_context
from ..metrics import pairwise_distances_chunked
from ..metrics.pairwise import _NAN_METRICS
from ..neighbors._base import _get_weights
from ..utils._mask import _get_mask
from ..utils._missing import is_scalar_nan
from ..utils._param_validation import Hidden, Interval, StrOptions
from ..utils.validation import (
FLOAT_DTYPES,
_check_feature_names_in,
check_is_fitted,
validate_data,
)
from ._base import _BaseImputer
class KNNImputer(_BaseImputer):
"""Imputation for completing missing values using k-Nearest Neighbors.
Each sample's missing values are imputed using the mean value from
`n_neighbors` nearest neighbors found in the training set. Two samples are
close if the features that neither is missing are close.
Read more in the :ref:`User Guide <knnimpute>`.
.. versionadded:: 0.22
Parameters
----------
missing_values : int, float, str, np.nan or None, default=np.nan
The placeholder for the missing values. All occurrences of
`missing_values` will be imputed. For pandas' dataframes with
nullable integer dtypes with missing values, `missing_values`
should be set to np.nan, since `pd.NA` will be converted to np.nan.
n_neighbors : int, default=5
Number of neighboring samples to use for imputation.
weights : {'uniform', 'distance'} or callable, default='uniform'
Weight function used in prediction. Possible values:
- 'uniform' : uniform weights. All points in each neighborhood are
weighted equally.
- 'distance' : weight points by the inverse of their distance.
in this case, closer neighbors of a query point will have a
greater influence than neighbors which are further away.
- callable : a user-defined function which accepts an
array of distances, and returns an array of the same shape
containing the weights.
metric : {'nan_euclidean'} or callable, default='nan_euclidean'
Distance metric for searching neighbors. Possible values:
- 'nan_euclidean'
- callable : a user-defined function which conforms to the definition
of ``func_metric(x, y, *, missing_values=np.nan)``. `x` and `y`
corresponds to a row (i.e. 1-D arrays) of `X` and `Y`, respectively.
The callable should returns a scalar distance value.
copy : bool, default=True
If True, a copy of X will be created. If False, imputation will
be done in-place whenever possible.
add_indicator : bool, default=False
If True, a :class:`MissingIndicator` transform will stack onto the
output of the imputer's transform. This allows a predictive estimator
to account for missingness despite imputation. If a feature has no
missing values at fit/train time, the feature won't appear on the
missing indicator even if there are missing values at transform/test
time.
keep_empty_features : bool, default=False
If True, features that consist exclusively of missing values when
`fit` is called are returned in results when `transform` is called.
The imputed value is always `0`.
.. versionadded:: 1.2
Attributes
----------
indicator_ : :class:`~sklearn.impute.MissingIndicator`
Indicator used to add binary indicators for missing values.
``None`` if add_indicator is False.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
SimpleImputer : Univariate imputer for completing missing values
with simple strategies.
IterativeImputer : Multivariate imputer that estimates values to impute for
each feature with missing values from all the others.
References
----------
* `Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor
Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing
value estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17
no. 6, 2001 Pages 520-525.
<https://academic.oup.com/bioinformatics/article/17/6/520/272365>`_
Examples
--------
>>> import numpy as np
>>> from sklearn.impute import KNNImputer
>>> X = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]
>>> imputer = KNNImputer(n_neighbors=2)
>>> imputer.fit_transform(X)
array([[1. , 2. , 4. ],
[3. , 4. , 3. ],
[5.5, 6. , 5. ],
[8. , 8. , 7. ]])
For a more detailed example see
:ref:`sphx_glr_auto_examples_impute_plot_missing_values.py`.
"""
_parameter_constraints: dict = {
**_BaseImputer._parameter_constraints,
"n_neighbors": [Interval(Integral, 1, None, closed="left")],
"weights": [StrOptions({"uniform", "distance"}), callable, Hidden(None)],
"metric": [StrOptions(set(_NAN_METRICS)), callable],
"copy": ["boolean"],
}
def __init__(
self,
*,
missing_values=np.nan,
n_neighbors=5,
weights="uniform",
metric="nan_euclidean",
copy=True,
add_indicator=False,
keep_empty_features=False,
):
super().__init__(
missing_values=missing_values,
add_indicator=add_indicator,
keep_empty_features=keep_empty_features,
)
self.n_neighbors = n_neighbors
self.weights = weights
self.metric = metric
self.copy = copy
def _calc_impute(self, dist_pot_donors, n_neighbors, fit_X_col, mask_fit_X_col):
"""Helper function to impute a single column.
Parameters
----------
dist_pot_donors : ndarray of shape (n_receivers, n_potential_donors)
Distance matrix between the receivers and potential donors from
training set. There must be at least one non-nan distance between
a receiver and a potential donor.
n_neighbors : int
Number of neighbors to consider.
fit_X_col : ndarray of shape (n_potential_donors,)
Column of potential donors from training set.
mask_fit_X_col : ndarray of shape (n_potential_donors,)
Missing mask for fit_X_col.
Returns
-------
imputed_values: ndarray of shape (n_receivers,)
Imputed values for receiver.
"""
# Get donors
donors_idx = np.argpartition(dist_pot_donors, n_neighbors - 1, axis=1)[
:, :n_neighbors
]
# Get weight matrix from distance matrix
donors_dist = dist_pot_donors[
np.arange(donors_idx.shape[0])[:, None], donors_idx
]
weight_matrix = _get_weights(donors_dist, self.weights)
# fill nans with zeros
if weight_matrix is not None:
weight_matrix[np.isnan(weight_matrix)] = 0.0
else:
weight_matrix = np.ones_like(donors_dist)
weight_matrix[np.isnan(donors_dist)] = 0.0
# Retrieve donor values and calculate kNN average
donors = fit_X_col.take(donors_idx)
donors_mask = mask_fit_X_col.take(donors_idx)
donors = np.ma.array(donors, mask=donors_mask)
return np.ma.average(donors, axis=1, weights=weight_matrix).data
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y=None):
"""Fit the imputer on X.
Parameters
----------
X : array-like shape of (n_samples, n_features)
Input data, where `n_samples` is the number of samples and
`n_features` is the number of features.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
self : object
The fitted `KNNImputer` class instance.
"""
# Check data integrity and calling arguments
if not is_scalar_nan(self.missing_values):
ensure_all_finite = True
else:
ensure_all_finite = "allow-nan"
X = validate_data(
self,
X,
accept_sparse=False,
dtype=FLOAT_DTYPES,
ensure_all_finite=ensure_all_finite,
copy=self.copy,
)
self._fit_X = X
self._mask_fit_X = _get_mask(self._fit_X, self.missing_values)
self._valid_mask = ~np.all(self._mask_fit_X, axis=0)
super()._fit_indicator(self._mask_fit_X)
return self
def transform(self, X):
"""Impute all missing values in X.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The input data to complete.
Returns
-------
X : array-like of shape (n_samples, n_output_features)
The imputed dataset. `n_output_features` is the number of features
that is not always missing during `fit`.
"""
check_is_fitted(self)
if not is_scalar_nan(self.missing_values):
ensure_all_finite = True
else:
ensure_all_finite = "allow-nan"
X = validate_data(
self,
X,
accept_sparse=False,
dtype=FLOAT_DTYPES,
force_writeable=True,
ensure_all_finite=ensure_all_finite,
copy=self.copy,
reset=False,
)
mask = _get_mask(X, self.missing_values)
mask_fit_X = self._mask_fit_X
valid_mask = self._valid_mask
X_indicator = super()._transform_indicator(mask)
# Removes columns where the training data is all nan
if not np.any(mask[:, valid_mask]):
# No missing values in X
if self.keep_empty_features:
Xc = X
Xc[:, ~valid_mask] = 0
else:
Xc = X[:, valid_mask]
# Even if there are no missing values in X, we still concatenate Xc
# with the missing value indicator matrix, X_indicator.
# This is to ensure that the output maintains consistency in terms
# of columns, regardless of whether missing values exist in X or not.
return super()._concatenate_indicator(Xc, X_indicator)
row_missing_idx = np.flatnonzero(mask[:, valid_mask].any(axis=1))
non_missing_fix_X = np.logical_not(mask_fit_X)
# Maps from indices from X to indices in dist matrix
dist_idx_map = np.zeros(X.shape[0], dtype=int)
dist_idx_map[row_missing_idx] = np.arange(row_missing_idx.shape[0])
def process_chunk(dist_chunk, start):
row_missing_chunk = row_missing_idx[start : start + len(dist_chunk)]
# Find and impute missing by column
for col in range(X.shape[1]):
if not valid_mask[col]:
# column was all missing during training
continue
col_mask = mask[row_missing_chunk, col]
if not np.any(col_mask):
# column has no missing values
continue
(potential_donors_idx,) = np.nonzero(non_missing_fix_X[:, col])
# receivers_idx are indices in X
receivers_idx = row_missing_chunk[np.flatnonzero(col_mask)]
# distances for samples that needed imputation for column
dist_subset = dist_chunk[dist_idx_map[receivers_idx] - start][
:, potential_donors_idx
]
# receivers with all nan distances impute with mean
all_nan_dist_mask = np.isnan(dist_subset).all(axis=1)
all_nan_receivers_idx = receivers_idx[all_nan_dist_mask]
if all_nan_receivers_idx.size:
col_mean = np.ma.array(
self._fit_X[:, col], mask=mask_fit_X[:, col]
).mean()
X[all_nan_receivers_idx, col] = col_mean
if len(all_nan_receivers_idx) == len(receivers_idx):
# all receivers imputed with mean
continue
# receivers with at least one defined distance
receivers_idx = receivers_idx[~all_nan_dist_mask]
dist_subset = dist_chunk[dist_idx_map[receivers_idx] - start][
:, potential_donors_idx
]
n_neighbors = min(self.n_neighbors, len(potential_donors_idx))
value = self._calc_impute(
dist_subset,
n_neighbors,
self._fit_X[potential_donors_idx, col],
mask_fit_X[potential_donors_idx, col],
)
X[receivers_idx, col] = value
# process in fixed-memory chunks
gen = pairwise_distances_chunked(
X[row_missing_idx, :],
self._fit_X,
metric=self.metric,
missing_values=self.missing_values,
ensure_all_finite=ensure_all_finite,
reduce_func=process_chunk,
)
for chunk in gen:
# process_chunk modifies X in place. No return value.
pass
if self.keep_empty_features:
Xc = X
Xc[:, ~valid_mask] = 0
else:
Xc = X[:, valid_mask]
return super()._concatenate_indicator(Xc, X_indicator)
def get_feature_names_out(self, input_features=None):
"""Get output feature names for transformation.
Parameters
----------
input_features : array-like of str or None, default=None
Input features.
- If `input_features` is `None`, then `feature_names_in_` is
used as feature names in. If `feature_names_in_` is not defined,
then the following input feature names are generated:
`["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
- If `input_features` is an array-like, then `input_features` must
match `feature_names_in_` if `feature_names_in_` is defined.
Returns
-------
feature_names_out : ndarray of str objects
Transformed feature names.
"""
check_is_fitted(self, "n_features_in_")
input_features = _check_feature_names_in(self, input_features)
names = input_features[self._valid_mask]
return self._concatenate_indicator_feature_names_out(names, input_features)
|