|
"""Generic feature selection mixin""" |
|
|
|
|
|
|
|
|
|
import warnings |
|
from abc import ABCMeta, abstractmethod |
|
from operator import attrgetter |
|
|
|
import numpy as np |
|
from scipy.sparse import csc_matrix, issparse |
|
|
|
from ..base import TransformerMixin |
|
from ..utils import _safe_indexing, check_array, safe_sqr |
|
from ..utils._set_output import _get_output_config |
|
from ..utils._tags import get_tags |
|
from ..utils.validation import ( |
|
_check_feature_names_in, |
|
_is_pandas_df, |
|
check_is_fitted, |
|
validate_data, |
|
) |
|
|
|
|
|
class SelectorMixin(TransformerMixin, metaclass=ABCMeta):
    """
    Transformer mixin implementing feature selection from a support mask.

    Subclasses only have to implement `_get_support_mask`; this mixin then
    supplies `get_support`, `transform`, `inverse_transform` and
    `get_feature_names_out` on top of it.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.base import BaseEstimator
    >>> from sklearn.feature_selection import SelectorMixin
    >>> class FeatureSelector(SelectorMixin, BaseEstimator):
    ...     def fit(self, X, y=None):
    ...         self.n_features_in_ = X.shape[1]
    ...         return self
    ...     def _get_support_mask(self):
    ...         mask = np.zeros(self.n_features_in_, dtype=bool)
    ...         mask[:2] = True  # select the first two features
    ...         return mask
    >>> X, y = load_iris(return_X_y=True)
    >>> FeatureSelector().fit_transform(X, y).shape
    (150, 2)
    """

    def get_support(self, indices=False):
        """
        Return the selected features as a boolean mask or as integer indices.

        Parameters
        ----------
        indices : bool, default=False
            When truthy, integer positions of the selected features are
            returned instead of the boolean mask.

        Returns
        -------
        support : array
            If `indices` is False, the boolean mask produced by
            `_get_support_mask` (one entry per input feature, True where the
            feature is retained). If `indices` is True, an integer array
            holding the positions of the True entries of that mask.
        """
        support_mask = self._get_support_mask()
        if indices:
            return np.where(support_mask)[0]
        return support_mask

    @abstractmethod
    def _get_support_mask(self):
        """
        Return the boolean mask of selected features.

        Must be implemented by subclasses.

        Returns
        -------
        support : boolean array of shape [# input features]
            True at every position whose feature is retained.
        """

    def transform(self, X):
        """Keep only the selected features of X.

        Parameters
        ----------
        X : array of shape [n_samples, n_features]
            The input samples.

        Returns
        -------
        X_r : array of shape [n_samples, n_selected_features]
            The input samples restricted to the selected features.
        """
        # When a non-default dense output container is configured and X is a
        # pandas DataFrame, array validation is skipped so that the frame is
        # handed to `_transform` as-is.
        dense_output = _get_output_config("transform", estimator=self)["dense"]
        keep_dataframe = dense_output != "default" and _is_pandas_df(X)

        # Non-finite values are only tolerated when the estimator's input
        # tags declare `allow_nan`.
        validated = validate_data(
            self,
            X,
            dtype=None,
            accept_sparse="csr",
            ensure_all_finite=not get_tags(self).input_tags.allow_nan,
            skip_check_array=keep_dataframe,
            reset=False,
        )
        return self._transform(validated)

    def _transform(self, X):
        """Select the supported columns of an already validated X.

        Emits a `UserWarning` and returns a container with zero columns when
        the support mask selects nothing.
        """
        selected = self.get_support()
        if selected.any():
            return _safe_indexing(X, selected, axis=1)

        warnings.warn(
            (
                "No features were selected: either the data is"
                " too noisy or the selection test too strict."
            ),
            UserWarning,
        )
        # Objects exposing `.iloc` (e.g. DataFrames) keep their type; anything
        # else is answered with an empty (n_samples, 0) ndarray.
        if hasattr(X, "iloc"):
            return X.iloc[:, :0]
        no_columns = np.empty(0, dtype=X.dtype)
        return no_columns.reshape((X.shape[0], 0))

    def inverse_transform(self, X):
        """Map reduced data back to the original feature space.

        Parameters
        ----------
        X : array of shape [n_samples, n_selected_features]
            The input samples.

        Returns
        -------
        X_r : array of shape [n_samples, n_original_features]
            `X` with all-zero columns inserted at the positions of the
            features that :meth:`transform` would have removed.

        Raises
        ------
        ValueError
            If the number of columns of `X` differs from the number of
            selected features.
        """
        if issparse(X):
            X_csc = X.tocsc()
            # Rebuild the column pointer for the full width: the number of
            # stored entries per selected column is itself inverse-transformed
            # (as a single dense row), which puts a count of 0 at every removed
            # feature; the cumulative sum of those counts is the new indptr.
            nnz_per_selected = np.diff(X_csc.indptr).reshape(1, -1)
            nnz_per_feature = self.inverse_transform(nnz_per_selected).ravel()
            full_indptr = np.concatenate([[0], np.cumsum(nnz_per_feature)])
            return csc_matrix(
                (X_csc.data, X_csc.indices, full_indptr),
                shape=(X_csc.shape[0], len(full_indptr) - 1),
                dtype=X_csc.dtype,
            )

        selected = self.get_support()
        X = check_array(X, dtype=None)
        if selected.sum() != X.shape[1]:
            raise ValueError("X has a different shape than during fitting.")

        # NOTE(review): `check_array` presumably already rejects 1-D input, so
        # this promotion looks purely defensive - confirm before removing.
        if X.ndim == 1:
            X = X[None, :]

        restored = np.zeros((X.shape[0], selected.size), dtype=X.dtype)
        restored[:, selected] = X
        return restored

    def get_feature_names_out(self, input_features=None):
        """Return the names of the selected features.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Input feature names filtered by the support mask.
        """
        check_is_fitted(self)
        names_in = _check_feature_names_in(self, input_features)
        return names_in[self.get_support()]
|
|
|
|
|
def _get_feature_importances(estimator, getter, transform_func=None, norm_order=1):
    """
    Fetch the feature importances of an estimator and optionally reduce them.

    The importances are read from `estimator` through `getter`. When a
    transform is requested, importances with more than one dimension are
    aggregated along axis 0.

    Parameters
    ----------
    estimator : estimator
        A scikit-learn estimator from which we want to get the feature
        importances.

    getter : "auto", str or callable
        How to obtain the importances. With `"auto"`, `estimator.coef_` is
        used when present, otherwise `estimator.feature_importances_`. Any
        other string is used as an attribute name looked up with
        `operator.attrgetter`. A callable is called with `estimator`.

    transform_func : {"norm", "square"}, default=None
        The transform to apply to the feature importances. By default (`None`)
        the value obtained from `getter` is returned untouched.

    norm_order : int, default=1
        The norm order to apply when `transform_func="norm"`. Only applied
        when `importances.ndim > 1`.

    Returns
    -------
    importances : ndarray of shape (n_features,)
        The features importances, optionally transformed.

    Raises
    ------
    ValueError
        If `getter` is `"auto"` and the estimator exposes neither `coef_` nor
        `feature_importances_`, if `getter` is neither a string nor a
        callable, or if `transform_func` is not one of the supported values.
    """
    # Resolve `getter` into a callable taking the estimator.
    if isinstance(getter, str):
        if getter != "auto":
            extract = attrgetter(getter)
        elif hasattr(estimator, "coef_"):
            # `coef_` takes precedence over `feature_importances_`.
            extract = attrgetter("coef_")
        elif hasattr(estimator, "feature_importances_"):
            extract = attrgetter("feature_importances_")
        else:
            raise ValueError(
                "when `importance_getter=='auto'`, the underlying "
                f"estimator {estimator.__class__.__name__} should have "
                "`coef_` or `feature_importances_` attribute. Either "
                "pass a fitted estimator to feature selector or call fit "
                "before calling transform."
            )
    elif callable(getter):
        extract = getter
    else:
        raise ValueError("`importance_getter` has to be a string or `callable`")

    raw = extract(estimator)

    if transform_func is None:
        return raw

    if transform_func == "norm":
        # 1-D: plain absolute value; otherwise a norm over axis 0.
        if raw.ndim == 1:
            return np.abs(raw)
        return np.linalg.norm(raw, axis=0, ord=norm_order)

    if transform_func == "square":
        # 1-D: element-wise square; otherwise squares summed over axis 0.
        if raw.ndim == 1:
            return safe_sqr(raw)
        return safe_sqr(raw).sum(axis=0)

    raise ValueError(
        "Valid values for `transform_func` are "
        "None, 'norm' and 'square'. Those two "
        "transformation are only supported now"
    )
|
|