|
|
|
|
|
|
|
import warnings |
|
from collections import namedtuple |
|
from numbers import Integral, Real |
|
from time import time |
|
|
|
import numpy as np |
|
from scipy import stats |
|
|
|
from ..base import _fit_context, clone |
|
from ..exceptions import ConvergenceWarning |
|
from ..preprocessing import normalize |
|
from ..utils import _safe_indexing, check_array, check_random_state |
|
from ..utils._indexing import _safe_assign |
|
from ..utils._mask import _get_mask |
|
from ..utils._missing import is_scalar_nan |
|
from ..utils._param_validation import HasMethods, Interval, StrOptions |
|
from ..utils.metadata_routing import ( |
|
MetadataRouter, |
|
MethodMapping, |
|
_raise_for_params, |
|
process_routing, |
|
) |
|
from ..utils.validation import ( |
|
FLOAT_DTYPES, |
|
_check_feature_names_in, |
|
_num_samples, |
|
check_is_fitted, |
|
validate_data, |
|
) |
|
from ._base import SimpleImputer, _BaseImputer, _check_inputs_dtype |
|
|
|
_ImputerTriplet = namedtuple( |
|
"_ImputerTriplet", ["feat_idx", "neighbor_feat_idx", "estimator"] |
|
) |
|
|
|
|
|
def _assign_where(X1, X2, cond): |
|
"""Assign X2 to X1 where cond is True. |
|
|
|
Parameters |
|
---------- |
|
X1 : ndarray or dataframe of shape (n_samples, n_features) |
|
Data. |
|
|
|
X2 : ndarray of shape (n_samples, n_features) |
|
Data to be assigned. |
|
|
|
cond : ndarray of shape (n_samples, n_features) |
|
Boolean mask to assign data. |
|
""" |
|
if hasattr(X1, "mask"): |
|
X1.mask(cond=cond, other=X2, inplace=True) |
|
else: |
|
X1[cond] = X2[cond] |
|
|
|
|
|
class IterativeImputer(_BaseImputer): |
|
"""Multivariate imputer that estimates each feature from all the others. |
|
|
|
A strategy for imputing missing values by modeling each feature with |
|
missing values as a function of other features in a round-robin fashion. |
|
|
|
Read more in the :ref:`User Guide <iterative_imputer>`. |
|
|
|
.. versionadded:: 0.21 |
|
|
|
.. note:: |
|
|
|
This estimator is still **experimental** for now: the predictions |
|
and the API might change without any deprecation cycle. To use it, |
|
you need to explicitly import `enable_iterative_imputer`:: |
|
|
|
>>> # explicitly require this experimental feature |
|
>>> from sklearn.experimental import enable_iterative_imputer # noqa |
|
>>> # now you can import normally from sklearn.impute |
|
>>> from sklearn.impute import IterativeImputer |
|
|
|
Parameters |
|
---------- |
|
estimator : estimator object, default=BayesianRidge() |
|
The estimator to use at each step of the round-robin imputation. |
|
If `sample_posterior=True`, the estimator must support |
|
`return_std` in its `predict` method. |
|
|
|
missing_values : int or np.nan, default=np.nan |
|
The placeholder for the missing values. All occurrences of |
|
`missing_values` will be imputed. For pandas' dataframes with |
|
nullable integer dtypes with missing values, `missing_values` |
|
should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`. |
|
|
|
sample_posterior : bool, default=False |
|
Whether to sample from the (Gaussian) predictive posterior of the |
|
fitted estimator for each imputation. Estimator must support |
|
`return_std` in its `predict` method if set to `True`. Set to |
|
`True` if using `IterativeImputer` for multiple imputations. |
|
|
|
max_iter : int, default=10 |
|
Maximum number of imputation rounds to perform before returning the |
|
imputations computed during the final round. A round is a single |
|
imputation of each feature with missing values. The stopping criterion |
|
is met once `max(abs(X_t - X_{t-1}))/max(abs(X[known_vals])) < tol`, |
|
where `X_t` is `X` at iteration `t`. Note that early stopping is only |
|
applied if `sample_posterior=False`. |
|
|
|
tol : float, default=1e-3 |
|
Tolerance of the stopping condition. |
|
|
|
n_nearest_features : int, default=None |
|
Number of other features to use to estimate the missing values of |
|
each feature column. Nearness between features is measured using |
|
the absolute correlation coefficient between each feature pair (after |
|
initial imputation). To ensure coverage of features throughout the |
|
imputation process, the neighbor features are not necessarily nearest, |
|
but are drawn with probability proportional to correlation for each |
|
imputed target feature. Can provide significant speed-up when the |
|
number of features is huge. If `None`, all features will be used. |
|
|
|
initial_strategy : {'mean', 'median', 'most_frequent', 'constant'}, \ |
|
default='mean' |
|
Which strategy to use to initialize the missing values. Same as the |
|
`strategy` parameter in :class:`~sklearn.impute.SimpleImputer`. |
|
|
|
fill_value : str or numerical value, default=None |
|
        When `initial_strategy="constant"`, `fill_value` is used to replace all
|
occurrences of missing_values. For string or object data types, |
|
`fill_value` must be a string. |
|
If `None`, `fill_value` will be 0 when imputing numerical |
|
data and "missing_value" for strings or object data types. |
|
|
|
.. versionadded:: 1.3 |
|
|
|
imputation_order : {'ascending', 'descending', 'roman', 'arabic', \ |
|
'random'}, default='ascending' |
|
The order in which the features will be imputed. Possible values: |
|
|
|
- `'ascending'`: From features with fewest missing values to most. |
|
- `'descending'`: From features with most missing values to fewest. |
|
- `'roman'`: Left to right. |
|
- `'arabic'`: Right to left. |
|
- `'random'`: A random order for each round. |
|
|
|
skip_complete : bool, default=False |
|
If `True` then features with missing values during :meth:`transform` |
|
which did not have any missing values during :meth:`fit` will be |
|
imputed with the initial imputation method only. Set to `True` if you |
|
have many features with no missing values at both :meth:`fit` and |
|
:meth:`transform` time to save compute. |
|
|
|
min_value : float or array-like of shape (n_features,), default=-np.inf |
|
Minimum possible imputed value. Broadcast to shape `(n_features,)` if |
|
scalar. If array-like, expects shape `(n_features,)`, one min value for |
|
each feature. The default is `-np.inf`. |
|
|
|
.. versionchanged:: 0.23 |
|
Added support for array-like. |
|
|
|
max_value : float or array-like of shape (n_features,), default=np.inf |
|
Maximum possible imputed value. Broadcast to shape `(n_features,)` if |
|
scalar. If array-like, expects shape `(n_features,)`, one max value for |
|
each feature. The default is `np.inf`. |
|
|
|
.. versionchanged:: 0.23 |
|
Added support for array-like. |
|
|
|
verbose : int, default=0 |
|
Verbosity flag, controls the debug messages that are issued |
|
as functions are evaluated. The higher, the more verbose. Can be 0, 1, |
|
or 2. |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
The seed of the pseudo random number generator to use. Randomizes |
|
selection of estimator features if `n_nearest_features` is not `None`, |
|
the `imputation_order` if `random`, and the sampling from posterior if |
|
`sample_posterior=True`. Use an integer for determinism. |
|
See :term:`the Glossary <random_state>`. |
|
|
|
add_indicator : bool, default=False |
|
If `True`, a :class:`MissingIndicator` transform will stack onto output |
|
of the imputer's transform. This allows a predictive estimator |
|
to account for missingness despite imputation. If a feature has no |
|
missing values at fit/train time, the feature won't appear on |
|
the missing indicator even if there are missing values at |
|
transform/test time. |
|
|
|
keep_empty_features : bool, default=False |
|
If True, features that consist exclusively of missing values when |
|
`fit` is called are returned in results when `transform` is called. |
|
The imputed value is always `0` except when |
|
`initial_strategy="constant"` in which case `fill_value` will be |
|
used instead. |
|
|
|
.. versionadded:: 1.2 |
|
|
|
Attributes |
|
---------- |
|
initial_imputer_ : object of type :class:`~sklearn.impute.SimpleImputer` |
|
Imputer used to initialize the missing values. |
|
|
|
imputation_sequence_ : list of tuples |
|
Each tuple has `(feat_idx, neighbor_feat_idx, estimator)`, where |
|
`feat_idx` is the current feature to be imputed, |
|
`neighbor_feat_idx` is the array of other features used to impute the |
|
current feature, and `estimator` is the trained estimator used for |
|
the imputation. Length is `self.n_features_with_missing_ * |
|
self.n_iter_`. |
|
|
|
n_iter_ : int |
|
Number of iteration rounds that occurred. Will be less than |
|
        `self.max_iter` if the early stopping criterion was reached.
|
|
|
n_features_in_ : int |
|
Number of features seen during :term:`fit`. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
feature_names_in_ : ndarray of shape (`n_features_in_`,) |
|
Names of features seen during :term:`fit`. Defined only when `X` |
|
has feature names that are all strings. |
|
|
|
.. versionadded:: 1.0 |
|
|
|
n_features_with_missing_ : int |
|
Number of features with missing values. |
|
|
|
indicator_ : :class:`~sklearn.impute.MissingIndicator` |
|
Indicator used to add binary indicators for missing values. |
|
`None` if `add_indicator=False`. |
|
|
|
random_state_ : RandomState instance |
|
RandomState instance that is generated either from a seed, the random |
|
number generator or by `np.random`. |
|
|
|
See Also |
|
-------- |
|
SimpleImputer : Univariate imputer for completing missing values |
|
with simple strategies. |
|
KNNImputer : Multivariate imputer that estimates missing features using |
|
nearest samples. |
|
|
|
Notes |
|
----- |
|
To support imputation in inductive mode we store each feature's estimator |
|
during the :meth:`fit` phase, and predict without refitting (in order) |
|
during the :meth:`transform` phase. |
|
|
|
    Features which contain all missing values at :meth:`fit` are discarded
    upon :meth:`transform` when `keep_empty_features=False` (the default).
|
|
|
Using defaults, the imputer scales in :math:`\\mathcal{O}(knp^3\\min(n,p))` |
|
where :math:`k` = `max_iter`, :math:`n` the number of samples and |
|
:math:`p` the number of features. It thus becomes prohibitively costly when |
|
the number of features increases. Setting |
|
`n_nearest_features << n_features`, `skip_complete=True` or increasing `tol` |
|
can help to reduce its computational cost. |
|
|
|
Depending on the nature of missing values, simple imputers can be |
|
preferable in a prediction context. |
|
|
|
References |
|
---------- |
|
.. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: |
|
Multivariate Imputation by Chained Equations in R". Journal of |
|
Statistical Software 45: 1-67. |
|
<https://www.jstatsoft.org/article/view/v045i03>`_ |
|
|
|
.. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in |
|
Multivariate Data Suitable for use with an Electronic Computer". |
|
Journal of the Royal Statistical Society 22(2): 302-306. |
|
<https://www.jstor.org/stable/2984099>`_ |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.experimental import enable_iterative_imputer |
|
>>> from sklearn.impute import IterativeImputer |
|
>>> imp_mean = IterativeImputer(random_state=0) |
|
>>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]) |
|
IterativeImputer(random_state=0) |
|
>>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]] |
|
>>> imp_mean.transform(X) |
|
array([[ 6.9584..., 2. , 3. ], |
|
[ 4. , 2.6000..., 6. ], |
|
[10. , 4.9999..., 9. ]]) |
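
    When `sample_posterior=True`, each imputed value is drawn from the
    estimator's (Gaussian) predictive posterior, which is the mode intended
    for multiple imputation. A minimal sketch on the same toy data (the
    sampled values will vary with `random_state`):

    >>> imp_mi = IterativeImputer(sample_posterior=True, random_state=0)
    >>> _ = imp_mi.fit_transform([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])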
|
|
|
For a more detailed example see |
|
:ref:`sphx_glr_auto_examples_impute_plot_missing_values.py` or |
|
:ref:`sphx_glr_auto_examples_impute_plot_iterative_imputer_variants_comparison.py`. |
|
""" |
|
|
|
_parameter_constraints: dict = { |
|
**_BaseImputer._parameter_constraints, |
|
"estimator": [None, HasMethods(["fit", "predict"])], |
|
"sample_posterior": ["boolean"], |
|
"max_iter": [Interval(Integral, 0, None, closed="left")], |
|
"tol": [Interval(Real, 0, None, closed="left")], |
|
"n_nearest_features": [None, Interval(Integral, 1, None, closed="left")], |
|
"initial_strategy": [ |
|
StrOptions({"mean", "median", "most_frequent", "constant"}) |
|
], |
|
"fill_value": "no_validation", |
|
"imputation_order": [ |
|
StrOptions({"ascending", "descending", "roman", "arabic", "random"}) |
|
], |
|
"skip_complete": ["boolean"], |
|
"min_value": [None, Interval(Real, None, None, closed="both"), "array-like"], |
|
"max_value": [None, Interval(Real, None, None, closed="both"), "array-like"], |
|
"verbose": ["verbose"], |
|
"random_state": ["random_state"], |
|
} |
|
|
|
def __init__( |
|
self, |
|
estimator=None, |
|
*, |
|
missing_values=np.nan, |
|
sample_posterior=False, |
|
max_iter=10, |
|
tol=1e-3, |
|
n_nearest_features=None, |
|
initial_strategy="mean", |
|
fill_value=None, |
|
imputation_order="ascending", |
|
skip_complete=False, |
|
min_value=-np.inf, |
|
max_value=np.inf, |
|
verbose=0, |
|
random_state=None, |
|
add_indicator=False, |
|
keep_empty_features=False, |
|
): |
|
super().__init__( |
|
missing_values=missing_values, |
|
add_indicator=add_indicator, |
|
keep_empty_features=keep_empty_features, |
|
) |
|
|
|
self.estimator = estimator |
|
self.sample_posterior = sample_posterior |
|
self.max_iter = max_iter |
|
self.tol = tol |
|
self.n_nearest_features = n_nearest_features |
|
self.initial_strategy = initial_strategy |
|
self.fill_value = fill_value |
|
self.imputation_order = imputation_order |
|
self.skip_complete = skip_complete |
|
self.min_value = min_value |
|
self.max_value = max_value |
|
self.verbose = verbose |
|
self.random_state = random_state |
|
|
|
def _impute_one_feature( |
|
self, |
|
X_filled, |
|
mask_missing_values, |
|
feat_idx, |
|
neighbor_feat_idx, |
|
estimator=None, |
|
fit_mode=True, |
|
params=None, |
|
): |
|
"""Impute a single feature from the others provided. |
|
|
|
This function predicts the missing values of one of the features using |
|
the current estimates of all the other features. The `estimator` must |
|
support `return_std=True` in its `predict` method for this function |
|
to work. |
|
|
|
Parameters |
|
---------- |
|
X_filled : ndarray |
|
Input data with the most recent imputations. |
|
|
|
mask_missing_values : ndarray |
|
Input data's missing indicator matrix. |
|
|
|
feat_idx : int |
|
Index of the feature currently being imputed. |
|
|
|
neighbor_feat_idx : ndarray |
|
Indices of the features to be used in imputing `feat_idx`. |
|
|
|
estimator : object |
|
The estimator to use at this step of the round-robin imputation. |
|
If `sample_posterior=True`, the estimator must support |
|
`return_std` in its `predict` method. |
|
If None, it will be cloned from self._estimator. |
|
|
|
fit_mode : boolean, default=True |
|
Whether to fit and predict with the estimator or just predict. |
|
|
|
params : dict |
|
Additional params routed to the individual estimator. |
|
|
|
Returns |
|
------- |
|
X_filled : ndarray |
|
Input data with `X_filled[missing_row_mask, feat_idx]` updated. |
|
|
|
estimator : estimator with sklearn API |
|
The fitted estimator used to impute |
|
`X_filled[missing_row_mask, feat_idx]`. |
|
""" |
|
if estimator is None and fit_mode is False: |
|
raise ValueError( |
|
"If fit_mode is False, then an already-fitted " |
|
"estimator should be passed in." |
|
) |
|
|
|
if estimator is None: |
|
estimator = clone(self._estimator) |
|
|
|
missing_row_mask = mask_missing_values[:, feat_idx] |
|
if fit_mode: |
|
X_train = _safe_indexing( |
|
_safe_indexing(X_filled, neighbor_feat_idx, axis=1), |
|
~missing_row_mask, |
|
axis=0, |
|
) |
|
y_train = _safe_indexing( |
|
_safe_indexing(X_filled, feat_idx, axis=1), |
|
~missing_row_mask, |
|
axis=0, |
|
) |
|
estimator.fit(X_train, y_train, **params) |
|
|
|
|
|
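        # if this feature has no missing values, there is nothing to predict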
if np.sum(missing_row_mask) == 0: |
|
return X_filled, estimator |
|
|
|
|
|
X_test = _safe_indexing( |
|
_safe_indexing(X_filled, neighbor_feat_idx, axis=1), |
|
missing_row_mask, |
|
axis=0, |
|
) |
|
if self.sample_posterior: |
|
mus, sigmas = estimator.predict(X_test, return_std=True) |
|
imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) |
|
|
|
|
|
|
|
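            # two kinds of degenerate cases are handled first: (1) non-positive
            # sigmas and (2) mus outside the [min_value, max_value] range, for
            # which sampling from the truncated normal is not well defined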
positive_sigmas = sigmas > 0 |
|
imputed_values[~positive_sigmas] = mus[~positive_sigmas] |
|
mus_too_low = mus < self._min_value[feat_idx] |
|
imputed_values[mus_too_low] = self._min_value[feat_idx] |
|
mus_too_high = mus > self._max_value[feat_idx] |
|
imputed_values[mus_too_high] = self._max_value[feat_idx] |
|
|
|
inrange_mask = positive_sigmas & ~mus_too_low & ~mus_too_high |
|
mus = mus[inrange_mask] |
|
sigmas = sigmas[inrange_mask] |
|
a = (self._min_value[feat_idx] - mus) / sigmas |
|
b = (self._max_value[feat_idx] - mus) / sigmas |
|
|
|
truncated_normal = stats.truncnorm(a=a, b=b, loc=mus, scale=sigmas) |
|
imputed_values[inrange_mask] = truncated_normal.rvs( |
|
random_state=self.random_state_ |
|
) |
|
else: |
|
imputed_values = estimator.predict(X_test) |
|
imputed_values = np.clip( |
|
imputed_values, self._min_value[feat_idx], self._max_value[feat_idx] |
|
) |
|
|
|
|
|
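        # write the freshly imputed values back into the feature column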
_safe_assign( |
|
X_filled, |
|
imputed_values, |
|
row_indexer=missing_row_mask, |
|
column_indexer=feat_idx, |
|
) |
|
return X_filled, estimator |
|
|
|
def _get_neighbor_feat_idx(self, n_features, feat_idx, abs_corr_mat): |
|
"""Get a list of other features to predict `feat_idx`. |
|
|
|
        If `self.n_nearest_features` is strictly less than the total
|
number of features, then use a probability proportional to the absolute |
|
correlation between `feat_idx` and each other feature to randomly |
|
choose a subsample of the other features (without replacement). |
|
|
|
Parameters |
|
---------- |
|
n_features : int |
|
Number of features in `X`. |
|
|
|
feat_idx : int |
|
Index of the feature currently being imputed. |
|
|
|
abs_corr_mat : ndarray, shape (n_features, n_features) |
|
Absolute correlation matrix of `X`. The diagonal has been zeroed |
|
out and each feature has been normalized to sum to 1. Can be None. |
|
|
|
Returns |
|
------- |
|
neighbor_feat_idx : array-like |
|
The features to use to impute `feat_idx`. |
|
""" |
|
        if (
            self.n_nearest_features is not None
            and self.n_nearest_features < n_features
        ):
|
p = abs_corr_mat[:, feat_idx] |
|
neighbor_feat_idx = self.random_state_.choice( |
|
np.arange(n_features), self.n_nearest_features, replace=False, p=p |
|
) |
|
else: |
|
inds_left = np.arange(feat_idx) |
|
inds_right = np.arange(feat_idx + 1, n_features) |
|
neighbor_feat_idx = np.concatenate((inds_left, inds_right)) |
|
return neighbor_feat_idx |
|
|
|
def _get_ordered_idx(self, mask_missing_values): |
|
"""Decide in what order we will update the features. |
|
|
|
As a homage to the MICE R package, we will have 4 main options of |
|
how to order the updates, and use a random order if anything else |
|
is specified. |
|
|
|
Also, this function skips features which have no missing values. |
|
|
|
Parameters |
|
---------- |
|
mask_missing_values : array-like, shape (n_samples, n_features) |
|
Input data's missing indicator matrix, where `n_samples` is the |
|
number of samples and `n_features` is the number of features. |
|
|
|
Returns |
|
------- |
|
ordered_idx : ndarray, shape (n_features,) |
|
The order in which to impute the features. |
|
""" |
|
frac_of_missing_values = mask_missing_values.mean(axis=0) |
|
if self.skip_complete: |
|
missing_values_idx = np.flatnonzero(frac_of_missing_values) |
|
else: |
|
missing_values_idx = np.arange(np.shape(frac_of_missing_values)[0]) |
|
if self.imputation_order == "roman": |
|
ordered_idx = missing_values_idx |
|
elif self.imputation_order == "arabic": |
|
ordered_idx = missing_values_idx[::-1] |
|
elif self.imputation_order == "ascending": |
|
n = len(frac_of_missing_values) - len(missing_values_idx) |
|
ordered_idx = np.argsort(frac_of_missing_values, kind="mergesort")[n:] |
|
elif self.imputation_order == "descending": |
|
n = len(frac_of_missing_values) - len(missing_values_idx) |
|
ordered_idx = np.argsort(frac_of_missing_values, kind="mergesort")[n:][::-1] |
|
elif self.imputation_order == "random": |
|
ordered_idx = missing_values_idx |
|
self.random_state_.shuffle(ordered_idx) |
|
return ordered_idx |
|
|
|
def _get_abs_corr_mat(self, X_filled, tolerance=1e-6): |
|
"""Get absolute correlation matrix between features. |
|
|
|
Parameters |
|
---------- |
|
X_filled : ndarray, shape (n_samples, n_features) |
|
Input data with the most recent imputations. |
|
|
|
tolerance : float, default=1e-6 |
|
`abs_corr_mat` can have nans, which will be replaced |
|
with `tolerance`. |
|
|
|
Returns |
|
------- |
|
abs_corr_mat : ndarray, shape (n_features, n_features) |
|
Absolute correlation matrix of `X` at the beginning of the |
|
current round. The diagonal has been zeroed out and each feature's |
|
absolute correlations with all others have been normalized to sum |
|
to 1. |
|
""" |
|
n_features = X_filled.shape[1] |
|
if self.n_nearest_features is None or self.n_nearest_features >= n_features: |
|
return None |
|
with np.errstate(invalid="ignore"): |
|
|
|
|
|
|
|
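            # a feature with a single unique value has zero std. dev., and
            # np.corrcoef would warn about the resulting division by zero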
abs_corr_mat = np.abs(np.corrcoef(X_filled.T)) |
|
|
|
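        # np.corrcoef is undefined for zero-variance features; replace the
        # resulting nans with the small tolerance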
abs_corr_mat[np.isnan(abs_corr_mat)] = tolerance |
|
|
|
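        # ensure some exploration, i.e. a non-zero probability of sampling
        # features that are not the nearest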
np.clip(abs_corr_mat, tolerance, None, out=abs_corr_mat) |
|
|
|
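        # features are not their own neighbors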
np.fill_diagonal(abs_corr_mat, 0) |
|
|
|
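        # each column must sum to 1 to be used as probabilities in
        # np.random.choice sampling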
abs_corr_mat = normalize(abs_corr_mat, norm="l1", axis=0, copy=False) |
|
return abs_corr_mat |
|
|
|
def _initial_imputation(self, X, in_fit=False): |
|
"""Perform initial imputation for input `X`. |
|
|
|
Parameters |
|
---------- |
|
X : ndarray of shape (n_samples, n_features) |
|
Input data, where `n_samples` is the number of samples and |
|
`n_features` is the number of features. |
|
|
|
in_fit : bool, default=False |
|
Whether function is called in :meth:`fit`. |
|
|
|
Returns |
|
------- |
|
Xt : ndarray of shape (n_samples, n_features) |
|
Input data, where `n_samples` is the number of samples and |
|
`n_features` is the number of features. |
|
|
|
X_filled : ndarray of shape (n_samples, n_features) |
|
Input data with the most recent imputations. |
|
|
|
mask_missing_values : ndarray of shape (n_samples, n_features) |
|
Input data's missing indicator matrix, where `n_samples` is the |
|
number of samples and `n_features` is the number of features, |
|
masked by non-missing features. |
|
|
|
X_missing_mask : ndarray, shape (n_samples, n_features) |
|
Input data's mask matrix indicating missing datapoints, where |
|
`n_samples` is the number of samples and `n_features` is the |
|
number of features. |
|
""" |
|
if is_scalar_nan(self.missing_values): |
|
ensure_all_finite = "allow-nan" |
|
else: |
|
ensure_all_finite = True |
|
|
|
X = validate_data( |
|
self, |
|
X, |
|
dtype=FLOAT_DTYPES, |
|
order="F", |
|
reset=in_fit, |
|
ensure_all_finite=ensure_all_finite, |
|
) |
|
_check_inputs_dtype(X, self.missing_values) |
|
|
|
X_missing_mask = _get_mask(X, self.missing_values) |
|
mask_missing_values = X_missing_mask.copy() |
|
|
|
|
|
|
|
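        # SimpleImputer currently raises a FutureWarning when
        # strategy="constant" is combined with keep_empty_features=False;
        # remember whether that warning has to be silenced below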
catch_warning = ( |
|
self.initial_strategy == "constant" and not self.keep_empty_features |
|
) |
|
|
|
if self.initial_imputer_ is None: |
|
self.initial_imputer_ = SimpleImputer( |
|
missing_values=self.missing_values, |
|
strategy=self.initial_strategy, |
|
fill_value=self.fill_value, |
|
keep_empty_features=self.keep_empty_features, |
|
).set_output(transform="default") |
|
|
|
|
|
|
|
if catch_warning: |
|
with warnings.catch_warnings(): |
|
warnings.simplefilter("ignore", FutureWarning) |
|
X_filled = self.initial_imputer_.fit_transform(X) |
|
else: |
|
X_filled = self.initial_imputer_.fit_transform(X) |
|
else: |
|
|
|
|
|
if catch_warning: |
|
with warnings.catch_warnings(): |
|
warnings.simplefilter("ignore", FutureWarning) |
|
X_filled = self.initial_imputer_.transform(X) |
|
else: |
|
X_filled = self.initial_imputer_.transform(X) |
|
|
|
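        # record which features were entirely missing during fit so the same
        # columns can be dropped (or kept) consistently at transform time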
if in_fit: |
|
self._is_empty_feature = np.all(mask_missing_values, axis=0) |
|
|
|
if not self.keep_empty_features: |
|
|
|
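            # drop the features that were entirely missing during fit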
Xt = X[:, ~self._is_empty_feature] |
|
mask_missing_values = mask_missing_values[:, ~self._is_empty_feature] |
|
|
|
if self.initial_imputer_.get_params()["strategy"] == "constant": |
|
|
|
|
|
|
|
|
|
|
|
|
|
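                # the constant strategy keeps empty features even when
                # keep_empty_features=False, so drop those columns here for
                # consistency with the other strategies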
X_filled = X_filled[:, ~self._is_empty_feature] |
|
|
|
else: |
|
|
|
|
|
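            # mark empty features as not missing and keep their initial imputation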
mask_missing_values[:, self._is_empty_feature] = False |
|
Xt = X |
|
Xt[:, self._is_empty_feature] = X_filled[:, self._is_empty_feature] |
|
|
|
return Xt, X_filled, mask_missing_values, X_missing_mask |
|
|
|
@staticmethod |
|
def _validate_limit( |
|
limit, limit_type, n_features, is_empty_feature, keep_empty_feature |
|
): |
|
"""Validate the limits (min/max) of the feature values. |
|
|
|
Converts scalar min/max limits to vectors of shape `(n_features,)`. |
|
|
|
Parameters |
|
---------- |
|
limit: scalar or array-like |
|
            The user-specified limit (i.e., min_value or max_value).
|
limit_type: {'max', 'min'} |
|
Type of limit to validate. |
|
n_features: int |
|
Number of features in the dataset. |
|
is_empty_feature: ndarray, shape (n_features, ) |
|
            Mask array indicating the empty features the imputer has seen
            during fit.
|
keep_empty_feature: bool |
|
If False, remove empty-feature indices from the limit. |
|
|
|
Returns |
|
------- |
|
        limit: ndarray, shape (n_features,)
|
Array of limits, one for each feature. |
|
""" |
|
n_features_in = _num_samples(is_empty_feature) |
|
if ( |
|
limit is not None |
|
and not np.isscalar(limit) |
|
and _num_samples(limit) != n_features_in |
|
): |
|
raise ValueError( |
|
f"'{limit_type}_value' should be of shape ({n_features_in},) when an" |
|
f" array-like is provided. Got {len(limit)}, instead." |
|
) |
|
|
|
limit_bound = np.inf if limit_type == "max" else -np.inf |
|
limit = limit_bound if limit is None else limit |
|
if np.isscalar(limit): |
|
limit = np.full(n_features, limit) |
|
limit = check_array(limit, ensure_all_finite=False, copy=False, ensure_2d=False) |
|
|
|
|
|
if not keep_empty_feature and len(limit) == len(is_empty_feature): |
|
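            # drop the limits of the features that were empty during fit so the
            # limit array lines up with the columns kept after initial imputation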
limit = limit[~is_empty_feature] |
|
|
|
return limit |
|
|
|
@_fit_context( |
|
|
|
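        # the wrapped sub-estimator (self._estimator) has not been validated
        # yet, so nested validation should not be skipped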
prefer_skip_nested_validation=False |
|
) |
|
def fit_transform(self, X, y=None, **params): |
|
"""Fit the imputer on `X` and return the transformed `X`. |
|
|
|
Parameters |
|
---------- |
|
X : array-like, shape (n_samples, n_features) |
|
Input data, where `n_samples` is the number of samples and |
|
`n_features` is the number of features. |
|
|
|
y : Ignored |
|
Not used, present for API consistency by convention. |
|
|
|
**params : dict |
|
Parameters routed to the `fit` method of the sub-estimator via the |
|
metadata routing API. |
|
|
|
.. versionadded:: 1.5 |
|
                Only available if metadata routing is enabled via
                `sklearn.set_config(enable_metadata_routing=True)`. See
|
:ref:`Metadata Routing User Guide <metadata_routing>` for more |
|
details. |
|
|
|
Returns |
|
------- |
|
Xt : array-like, shape (n_samples, n_features) |
|
The imputed input data. |
|
""" |
|
_raise_for_params(params, self, "fit") |
|
|
|
routed_params = process_routing( |
|
self, |
|
"fit", |
|
**params, |
|
) |
|
|
|
self.random_state_ = getattr( |
|
self, "random_state_", check_random_state(self.random_state) |
|
) |
|
|
|
if self.estimator is None: |
|
from ..linear_model import BayesianRidge |
|
|
|
self._estimator = BayesianRidge() |
|
else: |
|
self._estimator = clone(self.estimator) |
|
|
|
self.imputation_sequence_ = [] |
|
|
|
self.initial_imputer_ = None |
|
|
|
X, Xt, mask_missing_values, complete_mask = self._initial_imputation( |
|
X, in_fit=True |
|
) |
|
|
|
super()._fit_indicator(complete_mask) |
|
X_indicator = super()._transform_indicator(complete_mask) |
|
|
|
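        # nothing to iterate on: either no rounds were requested or every
        # retained value is missing, so only the initial imputation applies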
if self.max_iter == 0 or np.all(mask_missing_values): |
|
self.n_iter_ = 0 |
|
return super()._concatenate_indicator(Xt, X_indicator) |
|
|
|
|
|
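        # edge case: with a single feature there are no other features to
        # regress on, so no round-robin imputation can be performed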
if Xt.shape[1] == 1: |
|
self.n_iter_ = 0 |
|
return super()._concatenate_indicator(Xt, X_indicator) |
|
|
|
self._min_value = self._validate_limit( |
|
self.min_value, |
|
"min", |
|
X.shape[1], |
|
self._is_empty_feature, |
|
self.keep_empty_features, |
|
) |
|
self._max_value = self._validate_limit( |
|
self.max_value, |
|
"max", |
|
X.shape[1], |
|
self._is_empty_feature, |
|
self.keep_empty_features, |
|
) |
|
|
|
if not np.all(np.greater(self._max_value, self._min_value)): |
|
raise ValueError("One (or more) features have min_value >= max_value.") |
|
|
|
|
|
|
|
|
|
|
|
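        # order in which the features will be imputed in each round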
ordered_idx = self._get_ordered_idx(mask_missing_values) |
|
self.n_features_with_missing_ = len(ordered_idx) |
|
|
|
abs_corr_mat = self._get_abs_corr_mat(Xt) |
|
|
|
n_samples, n_features = Xt.shape |
|
if self.verbose > 0: |
|
print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,)) |
|
start_t = time() |
|
if not self.sample_posterior: |
|
Xt_previous = Xt.copy() |
|
normalized_tol = self.tol * np.max(np.abs(X[~mask_missing_values])) |
|
for self.n_iter_ in range(1, self.max_iter + 1): |
|
if self.imputation_order == "random": |
|
ordered_idx = self._get_ordered_idx(mask_missing_values) |
|
|
|
for feat_idx in ordered_idx: |
|
neighbor_feat_idx = self._get_neighbor_feat_idx( |
|
n_features, feat_idx, abs_corr_mat |
|
) |
|
Xt, estimator = self._impute_one_feature( |
|
Xt, |
|
mask_missing_values, |
|
feat_idx, |
|
neighbor_feat_idx, |
|
estimator=None, |
|
fit_mode=True, |
|
params=routed_params.estimator.fit, |
|
) |
|
estimator_triplet = _ImputerTriplet( |
|
feat_idx, neighbor_feat_idx, estimator |
|
) |
|
self.imputation_sequence_.append(estimator_triplet) |
|
|
|
if self.verbose > 1: |
|
print( |
|
"[IterativeImputer] Ending imputation round " |
|
"%d/%d, elapsed time %0.2f" |
|
% (self.n_iter_, self.max_iter, time() - start_t) |
|
) |
|
|
|
if not self.sample_posterior: |
|
inf_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf, axis=None) |
|
if self.verbose > 0: |
|
print( |
|
"[IterativeImputer] Change: {}, scaled tolerance: {} ".format( |
|
inf_norm, normalized_tol |
|
) |
|
) |
|
if inf_norm < normalized_tol: |
|
if self.verbose > 0: |
|
print("[IterativeImputer] Early stopping criterion reached.") |
|
break |
|
Xt_previous = Xt.copy() |
|
else: |
|
if not self.sample_posterior: |
|
warnings.warn( |
|
"[IterativeImputer] Early stopping criterion not reached.", |
|
ConvergenceWarning, |
|
) |
|
_assign_where(Xt, X, cond=~mask_missing_values) |
|
|
|
return super()._concatenate_indicator(Xt, X_indicator) |
|
|
|
def transform(self, X): |
|
"""Impute all missing values in `X`. |
|
|
|
        Note that this is stochastic: if `random_state` is not fixed,
        repeated calls or permuted input will produce different results.
|
|
|
Parameters |
|
---------- |
|
X : array-like of shape (n_samples, n_features) |
|
The input data to complete. |
|
|
|
Returns |
|
------- |
|
Xt : array-like, shape (n_samples, n_features) |
|
The imputed input data. |
|
""" |
|
check_is_fitted(self) |
|
|
|
X, Xt, mask_missing_values, complete_mask = self._initial_imputation( |
|
X, in_fit=False |
|
) |
|
|
|
X_indicator = super()._transform_indicator(complete_mask) |
|
|
|
if self.n_iter_ == 0 or np.all(mask_missing_values): |
|
return super()._concatenate_indicator(Xt, X_indicator) |
|
|
|
imputations_per_round = len(self.imputation_sequence_) // self.n_iter_ |
|
i_rnd = 0 |
|
if self.verbose > 0: |
|
print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,)) |
|
start_t = time() |
|
for it, estimator_triplet in enumerate(self.imputation_sequence_): |
|
Xt, _ = self._impute_one_feature( |
|
Xt, |
|
mask_missing_values, |
|
estimator_triplet.feat_idx, |
|
estimator_triplet.neighbor_feat_idx, |
|
estimator=estimator_triplet.estimator, |
|
fit_mode=False, |
|
) |
|
if not (it + 1) % imputations_per_round: |
|
if self.verbose > 1: |
|
print( |
|
"[IterativeImputer] Ending imputation round " |
|
"%d/%d, elapsed time %0.2f" |
|
% (i_rnd + 1, self.n_iter_, time() - start_t) |
|
) |
|
i_rnd += 1 |
|
|
|
_assign_where(Xt, X, cond=~mask_missing_values) |
|
|
|
return super()._concatenate_indicator(Xt, X_indicator) |
|
|
|
def fit(self, X, y=None, **fit_params): |
|
"""Fit the imputer on `X` and return self. |
|
|
|
Parameters |
|
---------- |
|
X : array-like, shape (n_samples, n_features) |
|
Input data, where `n_samples` is the number of samples and |
|
`n_features` is the number of features. |
|
|
|
y : Ignored |
|
Not used, present for API consistency by convention. |
|
|
|
**fit_params : dict |
|
Parameters routed to the `fit` method of the sub-estimator via the |
|
metadata routing API. |
|
|
|
.. versionadded:: 1.5 |
|
                Only available if metadata routing is enabled via
                `sklearn.set_config(enable_metadata_routing=True)`. See
|
:ref:`Metadata Routing User Guide <metadata_routing>` for more |
|
details. |
|
|
|
Returns |
|
------- |
|
self : object |
|
Fitted estimator. |
|
""" |
|
self.fit_transform(X, **fit_params) |
|
return self |
|
|
|
def get_feature_names_out(self, input_features=None): |
|
"""Get output feature names for transformation. |
|
|
|
Parameters |
|
---------- |
|
input_features : array-like of str or None, default=None |
|
Input features. |
|
|
|
- If `input_features` is `None`, then `feature_names_in_` is |
|
used as feature names in. If `feature_names_in_` is not defined, |
|
then the following input feature names are generated: |
|
`["x0", "x1", ..., "x(n_features_in_ - 1)"]`. |
|
- If `input_features` is an array-like, then `input_features` must |
|
match `feature_names_in_` if `feature_names_in_` is defined. |
|
|
|
Returns |
|
------- |
|
feature_names_out : ndarray of str objects |
|
Transformed feature names. |
|
""" |
|
check_is_fitted(self, "n_features_in_") |
|
input_features = _check_feature_names_in(self, input_features) |
|
names = self.initial_imputer_.get_feature_names_out(input_features) |
|
return self._concatenate_indicator_feature_names_out(names, input_features) |
|
|
|
def get_metadata_routing(self): |
|
"""Get metadata routing of this object. |
|
|
|
Please check :ref:`User Guide <metadata_routing>` on how the routing |
|
mechanism works. |
|
|
|
.. versionadded:: 1.5 |
|
|
|
Returns |
|
------- |
|
routing : MetadataRouter |
|
A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating |
|
routing information. |
|
""" |
|
router = MetadataRouter(owner=self.__class__.__name__).add( |
|
estimator=self.estimator, |
|
method_mapping=MethodMapping().add(callee="fit", caller="fit"), |
|
) |
|
return router |
|
|