File size: 9,421 Bytes
7885a28 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 |
"""Generic feature selection mixin"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import warnings
from abc import ABCMeta, abstractmethod
from operator import attrgetter
import numpy as np
from scipy.sparse import csc_matrix, issparse
from ..base import TransformerMixin
from ..utils import _safe_indexing, check_array, safe_sqr
from ..utils._set_output import _get_output_config
from ..utils._tags import get_tags
from ..utils.validation import (
_check_feature_names_in,
_is_pandas_df,
check_is_fitted,
validate_data,
)
class SelectorMixin(TransformerMixin, metaclass=ABCMeta):
"""
Transformer mixin that performs feature selection given a support mask
This mixin provides a feature selector implementation with `transform` and
`inverse_transform` functionality given an implementation of
`_get_support_mask`.
Examples
--------
>>> import numpy as np
>>> from sklearn.datasets import load_iris
>>> from sklearn.base import BaseEstimator
>>> from sklearn.feature_selection import SelectorMixin
>>> class FeatureSelector(SelectorMixin, BaseEstimator):
... def fit(self, X, y=None):
... self.n_features_in_ = X.shape[1]
... return self
... def _get_support_mask(self):
... mask = np.zeros(self.n_features_in_, dtype=bool)
... mask[:2] = True # select the first two features
... return mask
>>> X, y = load_iris(return_X_y=True)
>>> FeatureSelector().fit_transform(X, y).shape
(150, 2)
"""
def get_support(self, indices=False):
"""
Get a mask, or integer index, of the features selected.
Parameters
----------
indices : bool, default=False
If True, the return value will be an array of integers, rather
than a boolean mask.
Returns
-------
support : array
An index that selects the retained features from a feature vector.
If `indices` is False, this is a boolean array of shape
[# input features], in which an element is True iff its
corresponding feature is selected for retention. If `indices` is
True, this is an integer array of shape [# output features] whose
values are indices into the input feature vector.
"""
mask = self._get_support_mask()
return mask if not indices else np.where(mask)[0]
@abstractmethod
def _get_support_mask(self):
"""
Get the boolean mask indicating which features are selected
Returns
-------
support : boolean array of shape [# input features]
An element is True iff its corresponding feature is selected for
retention.
"""
def transform(self, X):
"""Reduce X to the selected features.
Parameters
----------
X : array of shape [n_samples, n_features]
The input samples.
Returns
-------
X_r : array of shape [n_samples, n_selected_features]
The input samples with only the selected features.
"""
# Preserve X when X is a dataframe and the output is configured to
# be pandas.
output_config_dense = _get_output_config("transform", estimator=self)["dense"]
preserve_X = output_config_dense != "default" and _is_pandas_df(X)
# note: we use get_tags instead of __sklearn_tags__ because this is a
# public Mixin.
X = validate_data(
self,
X,
dtype=None,
accept_sparse="csr",
ensure_all_finite=not get_tags(self).input_tags.allow_nan,
skip_check_array=preserve_X,
reset=False,
)
return self._transform(X)
def _transform(self, X):
"""Reduce X to the selected features."""
mask = self.get_support()
if not mask.any():
warnings.warn(
(
"No features were selected: either the data is"
" too noisy or the selection test too strict."
),
UserWarning,
)
if hasattr(X, "iloc"):
return X.iloc[:, :0]
return np.empty(0, dtype=X.dtype).reshape((X.shape[0], 0))
return _safe_indexing(X, mask, axis=1)
def inverse_transform(self, X):
"""Reverse the transformation operation.
Parameters
----------
X : array of shape [n_samples, n_selected_features]
The input samples.
Returns
-------
X_r : array of shape [n_samples, n_original_features]
`X` with columns of zeros inserted where features would have
been removed by :meth:`transform`.
"""
if issparse(X):
X = X.tocsc()
# insert additional entries in indptr:
# e.g. if transform changed indptr from [0 2 6 7] to [0 2 3]
# col_nonzeros here will be [2 0 1] so indptr becomes [0 2 2 3]
it = self.inverse_transform(np.diff(X.indptr).reshape(1, -1))
col_nonzeros = it.ravel()
indptr = np.concatenate([[0], np.cumsum(col_nonzeros)])
Xt = csc_matrix(
(X.data, X.indices, indptr),
shape=(X.shape[0], len(indptr) - 1),
dtype=X.dtype,
)
return Xt
support = self.get_support()
X = check_array(X, dtype=None)
if support.sum() != X.shape[1]:
raise ValueError("X has a different shape than during fitting.")
if X.ndim == 1:
X = X[None, :]
Xt = np.zeros((X.shape[0], support.size), dtype=X.dtype)
Xt[:, support] = X
return Xt
def get_feature_names_out(self, input_features=None):
"""Mask feature names according to selected features.
Parameters
----------
input_features : array-like of str or None, default=None
Input features.
- If `input_features` is `None`, then `feature_names_in_` is
used as feature names in. If `feature_names_in_` is not defined,
then the following input feature names are generated:
`["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
- If `input_features` is an array-like, then `input_features` must
match `feature_names_in_` if `feature_names_in_` is defined.
Returns
-------
feature_names_out : ndarray of str objects
Transformed feature names.
"""
check_is_fitted(self)
input_features = _check_feature_names_in(self, input_features)
return input_features[self.get_support()]
def _get_feature_importances(estimator, getter, transform_func=None, norm_order=1):
"""
Retrieve and aggregate (ndim > 1) the feature importances
from an estimator. Also optionally applies transformation.
Parameters
----------
estimator : estimator
A scikit-learn estimator from which we want to get the feature
importances.
getter : "auto", str or callable
An attribute or a callable to get the feature importance. If `"auto"`,
`estimator` is expected to expose `coef_` or `feature_importances`.
transform_func : {"norm", "square"}, default=None
The transform to apply to the feature importances. By default (`None`)
no transformation is applied.
norm_order : int, default=1
The norm order to apply when `transform_func="norm"`. Only applied
when `importances.ndim > 1`.
Returns
-------
importances : ndarray of shape (n_features,)
The features importances, optionally transformed.
"""
if isinstance(getter, str):
if getter == "auto":
if hasattr(estimator, "coef_"):
getter = attrgetter("coef_")
elif hasattr(estimator, "feature_importances_"):
getter = attrgetter("feature_importances_")
else:
raise ValueError(
"when `importance_getter=='auto'`, the underlying "
f"estimator {estimator.__class__.__name__} should have "
"`coef_` or `feature_importances_` attribute. Either "
"pass a fitted estimator to feature selector or call fit "
"before calling transform."
)
else:
getter = attrgetter(getter)
elif not callable(getter):
raise ValueError("`importance_getter` has to be a string or `callable`")
importances = getter(estimator)
if transform_func is None:
return importances
elif transform_func == "norm":
if importances.ndim == 1:
importances = np.abs(importances)
else:
importances = np.linalg.norm(importances, axis=0, ord=norm_order)
elif transform_func == "square":
if importances.ndim == 1:
importances = safe_sqr(importances)
else:
importances = safe_sqr(importances).sum(axis=0)
else:
raise ValueError(
"Valid values for `transform_func` are "
+ "None, 'norm' and 'square'. Those two "
+ "transformation are only supported now"
)
return importances
|