|
""" |
|
Sequential feature selection |
|
""" |
|
|
|
|
|
|
|
|
|
from numbers import Integral, Real |
|
|
|
import numpy as np |
|
|
|
from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone, is_classifier |
|
from ..metrics import check_scoring, get_scorer_names |
|
from ..model_selection import check_cv, cross_val_score |
|
from ..utils._metadata_requests import ( |
|
MetadataRouter, |
|
MethodMapping, |
|
_raise_for_params, |
|
_routing_enabled, |
|
process_routing, |
|
) |
|
from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions |
|
from ..utils._tags import get_tags |
|
from ..utils.validation import check_is_fitted, validate_data |
|
from ._base import SelectorMixin |
|
|
|
|
|
class SequentialFeatureSelector(SelectorMixin, MetaEstimatorMixin, BaseEstimator):
    """Transformer that performs Sequential Feature Selection.

    This Sequential Feature Selector adds (forward selection) or
    removes (backward selection) features to form a feature subset in a
    greedy fashion. At each stage, this estimator chooses the best feature to
    add or remove based on the cross-validation score of an estimator. In
    the case of unsupervised learning, this Sequential Feature Selector
    looks only at the features (X), not the desired outputs (y).

    Read more in the :ref:`User Guide <sequential_feature_selection>`.

    .. versionadded:: 0.24

    Parameters
    ----------
    estimator : estimator instance
        An unfitted estimator.

    n_features_to_select : "auto", int or float, default="auto"
        If `"auto"`, the behaviour depends on the `tol` parameter:

        - if `tol` is not `None`, then features are selected while the score
          change does not exceed `tol`.
        - otherwise, half of the features are selected.

        If integer, the parameter is the absolute number of features to select.
        If float between 0 and 1, it is the fraction of features to select.

        .. versionadded:: 1.1
           The option `"auto"` was added in version 1.1.

        .. versionchanged:: 1.3
           The default changed from `"warn"` to `"auto"` in 1.3.

    tol : float, default=None
        If the score is not incremented by at least `tol` between two
        consecutive feature additions or removals, stop adding or removing.

        `tol` can be negative when removing features using `direction="backward"`.
        `tol` is required to be strictly positive when doing forward selection.
        It can be useful to reduce the number of features at the cost of a small
        decrease in the score.

        `tol` is enabled only when `n_features_to_select` is `"auto"`.

        .. versionadded:: 1.1

    direction : {'forward', 'backward'}, default='forward'
        Whether to perform forward selection or backward selection.

    scoring : str or callable, default=None
        A single str (see :ref:`scoring_parameter`) or a callable
        (see :ref:`scoring_callable`) to evaluate the predictions on the test set.

        NOTE that when using a custom scorer, it should return a single
        value.

        If None, the estimator's score method is used.

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 5-fold cross validation,
        - integer, to specify the number of folds in a `(Stratified)KFold`,
        - :term:`CV splitter`,
        - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass,
        :class:`~sklearn.model_selection.StratifiedKFold` is used. In all other
        cases, :class:`~sklearn.model_selection.KFold` is used. These splitters
        are instantiated with `shuffle=False` so the splits will be the same
        across calls.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    n_jobs : int, default=None
        Number of jobs to run in parallel. When evaluating a new feature to
        add or remove, the cross-validation procedure is parallel over the
        folds.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    n_features_in_ : int
        Number of features seen during :term:`fit`. Only defined if the
        underlying estimator exposes such an attribute when fit.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_features_to_select_ : int
        The number of features that were selected.

    support_ : ndarray of shape (n_features,), dtype=bool
        The mask of selected features.

    See Also
    --------
    GenericUnivariateSelect : Univariate feature selector with configurable
        strategy.
    RFE : Recursive feature elimination based on importance weights.
    RFECV : Recursive feature elimination based on importance weights, with
        automatic selection of the number of features.
    SelectFromModel : Feature selection based on thresholds of importance
        weights.

    Examples
    --------
    >>> from sklearn.feature_selection import SequentialFeatureSelector
    >>> from sklearn.neighbors import KNeighborsClassifier
    >>> from sklearn.datasets import load_iris
    >>> X, y = load_iris(return_X_y=True)
    >>> knn = KNeighborsClassifier(n_neighbors=3)
    >>> sfs = SequentialFeatureSelector(knn, n_features_to_select=3)
    >>> sfs.fit(X, y)
    SequentialFeatureSelector(estimator=KNeighborsClassifier(n_neighbors=3),
                              n_features_to_select=3)
    >>> sfs.get_support()
    array([ True, False, True, True])
    >>> sfs.transform(X).shape
    (150, 3)
    """

    # Declarative constraints consumed by the `_fit_context` machinery: each
    # constructor parameter is validated against these options before `fit`
    # executes its body.
    _parameter_constraints: dict = {
        "estimator": [HasMethods(["fit"])],
        "n_features_to_select": [
            StrOptions({"auto"}),
            # A float in (0, 1] means "fraction of features"; an int >= 1 means
            # an absolute count (the upper bound is checked in `fit` against
            # the actual number of features).
            Interval(RealNotInt, 0, 1, closed="right"),
            Interval(Integral, 0, None, closed="neither"),
        ],
        "tol": [None, Interval(Real, None, None, closed="neither")],
        "direction": [StrOptions({"forward", "backward"})],
        "scoring": [None, StrOptions(set(get_scorer_names())), callable],
        "cv": ["cv_object"],
        "n_jobs": [None, Integral],
    }

    def __init__(
        self,
        estimator,
        *,
        n_features_to_select="auto",
        tol=None,
        direction="forward",
        scoring=None,
        cv=5,
        n_jobs=None,
    ):
        # Per scikit-learn convention, __init__ only stores the parameters
        # unchanged; all validation happens in `fit`.
        self.estimator = estimator
        self.n_features_to_select = n_features_to_select
        self.tol = tol
        self.direction = direction
        self.scoring = scoring
        self.cv = cv
        self.n_jobs = n_jobs

    @_fit_context(
        # The wrapped `estimator` is not validated yet, so nested validation
        # cannot be skipped.
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y=None, **params):
        """Learn the features to select from X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training vectors, where `n_samples` is the number of samples and
            `n_features` is the number of predictors.

        y : array-like of shape (n_samples,), default=None
            Target values. This parameter may be ignored for
            unsupervised learning.

        **params : dict, default=None
            Parameters to be passed to the underlying `estimator`, `cv`
            and `scorer` objects.

            .. versionadded:: 1.6

            Only available if `enable_metadata_routing=True`,
            which can be set by using
            ``sklearn.set_config(enable_metadata_routing=True)``.
            See :ref:`Metadata Routing User Guide <metadata_routing>` for
            more details.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        _raise_for_params(params, self, "fit")
        tags = self.__sklearn_tags__()
        # NaN checking is delegated to the wrapped estimator when it declares
        # `allow_nan`; at least 2 features are needed for selection to make sense.
        X = validate_data(
            self,
            X,
            accept_sparse="csc",
            ensure_min_features=2,
            ensure_all_finite=not tags.input_tags.allow_nan,
        )
        n_features = X.shape[1]

        if self.n_features_to_select == "auto":
            if self.tol is not None:
                # Upper bound only: with tol-based auto selection the loop may
                # stop early, and `n_features_to_select_` is overwritten with
                # `support_.sum()` at the end of `fit`.
                self.n_features_to_select_ = n_features - 1
            else:
                self.n_features_to_select_ = n_features // 2
        elif isinstance(self.n_features_to_select, Integral):
            if self.n_features_to_select >= n_features:
                raise ValueError("n_features_to_select must be < n_features.")
            self.n_features_to_select_ = self.n_features_to_select
        elif isinstance(self.n_features_to_select, Real):
            # Float in (0, 1]: interpreted as a fraction of the features.
            self.n_features_to_select_ = int(n_features * self.n_features_to_select)

        if self.tol is not None and self.tol < 0 and self.direction == "forward":
            raise ValueError(
                "tol must be strictly positive when doing forward selection"
            )

        cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))

        cloned_estimator = clone(self.estimator)

        # `current_mask` marks the features handled so far:
        # - features already *selected* when doing forward selection;
        # - features already *removed* when doing backward selection (the mask
        #   is inverted after the loop to obtain the selected set).
        current_mask = np.zeros(shape=n_features, dtype=bool)
        n_iterations = (
            self.n_features_to_select_
            if self.n_features_to_select == "auto" or self.direction == "forward"
            else n_features - self.n_features_to_select_
        )

        old_score = -np.inf
        is_auto_select = self.tol is not None and self.n_features_to_select == "auto"

        # Only *validate* the routing here (raises on bad metadata); the routed
        # params are not consumed at this level because `cross_val_score`
        # performs the actual routing internally when it receives `params`.
        if _routing_enabled():
            process_routing(self, "fit", **params)
        for _ in range(n_iterations):
            new_feature_idx, new_score = self._get_best_new_feature_score(
                cloned_estimator, X, y, cv, current_mask, **params
            )
            # Early stopping: in auto mode, stop once the score improvement
            # between consecutive steps drops below `tol`.
            if is_auto_select and ((new_score - old_score) < self.tol):
                break

            old_score = new_score
            current_mask[new_feature_idx] = True

        if self.direction == "backward":
            # The loop accumulated *removed* features; flip to get kept ones.
            current_mask = ~current_mask

        self.support_ = current_mask
        self.n_features_to_select_ = self.support_.sum()

        return self

    def _get_best_new_feature_score(self, estimator, X, y, cv, current_mask, **params):
        """Return the best feature to handle next and its CV score.

        Every feature not yet in `current_mask` is tentatively added to the
        mask (for backward selection the candidate mask is inverted so that
        the candidate feature is *removed* from the kept set), the estimator
        is cross-validated on the resulting feature subset, and the candidate
        with the highest mean score wins.

        Returns
        -------
        new_feature_idx : int
            Index of the best candidate feature.
        new_score : float
            Mean cross-validation score obtained with that candidate.
        """
        candidate_feature_indices = np.flatnonzero(~current_mask)
        scores = {}
        for feature_idx in candidate_feature_indices:
            candidate_mask = current_mask.copy()
            candidate_mask[feature_idx] = True
            if self.direction == "backward":
                candidate_mask = ~candidate_mask
            X_new = X[:, candidate_mask]
            scores[feature_idx] = cross_val_score(
                estimator,
                X_new,
                y,
                cv=cv,
                scoring=self.scoring,
                n_jobs=self.n_jobs,
                # Metadata (e.g. sample_weight, groups) is routed further down
                # by `cross_val_score` itself.
                params=params,
            ).mean()
        new_feature_idx = max(scores, key=lambda feature_idx: scores[feature_idx])
        return new_feature_idx, scores[new_feature_idx]

    def _get_support_mask(self):
        # SelectorMixin hook: boolean mask of the selected features.
        check_is_fitted(self)
        return self.support_

    def __sklearn_tags__(self):
        # Inherit NaN- and sparse-input capabilities from the wrapped estimator.
        tags = super().__sklearn_tags__()
        tags.input_tags.allow_nan = get_tags(self.estimator).input_tags.allow_nan
        tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse
        return tags

    def get_metadata_routing(self):
        """Get metadata routing of this object.

        Please check :ref:`User Guide <metadata_routing>` on how the routing
        mechanism works.

        .. versionadded:: 1.6

        Returns
        -------
        routing : MetadataRouter
            A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating
            routing information.
        """
        # `fit` metadata fans out to three consumers: the estimator's `fit`,
        # the CV splitter's `split`, and the scorer's `score`.
        router = MetadataRouter(owner=self.__class__.__name__)
        router.add(
            estimator=self.estimator,
            method_mapping=MethodMapping().add(caller="fit", callee="fit"),
        )
        router.add(
            splitter=check_cv(self.cv, classifier=is_classifier(self.estimator)),
            method_mapping=MethodMapping().add(caller="fit", callee="split"),
        )
        router.add(
            scorer=check_scoring(self.estimator, scoring=self.scoring),
            method_mapping=MethodMapping().add(caller="fit", callee="score"),
        )
        return router
|
|