"""
Sequential feature selection
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from numbers import Integral, Real
import numpy as np
from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone, is_classifier
from ..metrics import check_scoring, get_scorer_names
from ..model_selection import check_cv, cross_val_score
from ..utils._metadata_requests import (
MetadataRouter,
MethodMapping,
_raise_for_params,
_routing_enabled,
process_routing,
)
from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions
from ..utils._tags import get_tags
from ..utils.validation import check_is_fitted, validate_data
from ._base import SelectorMixin


class SequentialFeatureSelector(SelectorMixin, MetaEstimatorMixin, BaseEstimator):
    """Transformer that performs Sequential Feature Selection.

    This Sequential Feature Selector adds (forward selection) or
    removes (backward selection) features to form a feature subset in a
    greedy fashion. At each stage, this estimator chooses the best feature to
    add or remove based on the cross-validation score of an estimator. In
    the case of unsupervised learning, this Sequential Feature Selector
    looks only at the features (X), not the desired outputs (y).

    Read more in the :ref:`User Guide <sequential_feature_selection>`.

    .. versionadded:: 0.24

    Parameters
    ----------
    estimator : estimator instance
        An unfitted estimator.

    n_features_to_select : "auto", int or float, default="auto"
        If `"auto"`, the behaviour depends on the `tol` parameter:

        - if `tol` is not `None`, features are selected as long as the score
          improves by at least `tol` between consecutive iterations.
        - otherwise, half of the features are selected.

        If integer, the parameter is the absolute number of features to select.
        If float between 0 and 1, it is the fraction of features to select.

        .. versionadded:: 1.1
           The option `"auto"` was added in version 1.1.

        .. versionchanged:: 1.3
           The default changed from `"warn"` to `"auto"` in 1.3.

    tol : float, default=None
        If the score is not increased by at least `tol` between two
        consecutive feature additions or removals, stop adding or removing.

        `tol` can be negative when removing features using `direction="backward"`,
        which can be useful to reduce the number of features at the cost of a
        small decrease in the score. `tol` is required to be strictly positive
        when doing forward selection.

        `tol` is enabled only when `n_features_to_select` is `"auto"`.

        .. versionadded:: 1.1

    direction : {'forward', 'backward'}, default='forward'
        Whether to perform forward selection or backward selection.

    scoring : str or callable, default=None
        A single str (see :ref:`scoring_parameter`) or a callable
        (see :ref:`scoring_callable`) to evaluate the predictions on the test set.

        Note that a custom scorer must return a single value.

        If None, the estimator's score method is used.

    cv : int, cross-validation generator or an iterable, default=5
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 5-fold cross validation,
        - integer, to specify the number of folds in a `(Stratified)KFold`,
        - :term:`CV splitter`,
        - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass,
        :class:`~sklearn.model_selection.StratifiedKFold` is used. In all other
        cases, :class:`~sklearn.model_selection.KFold` is used. These splitters
        are instantiated with `shuffle=False` so the splits will be the same
        across calls.

        Refer to the :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    n_jobs : int, default=None
        Number of jobs to run in parallel. When evaluating a new feature to
        add or remove, the cross-validation procedure is parallel over the
        folds.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    n_features_in_ : int
        Number of features seen during :term:`fit`. Only defined if the
        underlying estimator exposes such an attribute when fit.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_features_to_select_ : int
        The number of features that were selected.

    support_ : ndarray of shape (n_features,), dtype=bool
        The mask of selected features.

    See Also
    --------
    GenericUnivariateSelect : Univariate feature selector with configurable
        strategy.
    RFE : Recursive feature elimination based on importance weights.
    RFECV : Recursive feature elimination based on importance weights, with
        automatic selection of the number of features.
    SelectFromModel : Feature selection based on thresholds of importance
        weights.

    Examples
    --------
>>> from sklearn.feature_selection import SequentialFeatureSelector
>>> from sklearn.neighbors import KNeighborsClassifier
>>> from sklearn.datasets import load_iris
>>> X, y = load_iris(return_X_y=True)
>>> knn = KNeighborsClassifier(n_neighbors=3)
>>> sfs = SequentialFeatureSelector(knn, n_features_to_select=3)
>>> sfs.fit(X, y)
SequentialFeatureSelector(estimator=KNeighborsClassifier(n_neighbors=3),
n_features_to_select=3)
>>> sfs.get_support()
array([ True, False, True, True])
>>> sfs.transform(X).shape
(150, 3)
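
    For illustration, backward selection with `n_features_to_select="auto"`
    keeps removing features until the cross-validation score stops improving
    by at least `tol`; the resulting number of features depends on the data,
    so the output below is indicative only:

    >>> sfs_backward = SequentialFeatureSelector(
    ...     knn, n_features_to_select="auto", tol=0.01, direction="backward"
    ... )
    >>> sfs_backward.fit(X, y).transform(X).shape  # doctest: +SKIP
    (150, 3)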
"""

    _parameter_constraints: dict = {
"estimator": [HasMethods(["fit"])],
"n_features_to_select": [
StrOptions({"auto"}),
Interval(RealNotInt, 0, 1, closed="right"),
Interval(Integral, 0, None, closed="neither"),
],
"tol": [None, Interval(Real, None, None, closed="neither")],
"direction": [StrOptions({"forward", "backward"})],
"scoring": [None, StrOptions(set(get_scorer_names())), callable],
"cv": ["cv_object"],
"n_jobs": [None, Integral],
}

    def __init__(
self,
estimator,
*,
n_features_to_select="auto",
tol=None,
direction="forward",
scoring=None,
cv=5,
n_jobs=None,
):
self.estimator = estimator
self.n_features_to_select = n_features_to_select
self.tol = tol
self.direction = direction
self.scoring = scoring
self.cv = cv
self.n_jobs = n_jobs

    @_fit_context(
# SequentialFeatureSelector.estimator is not validated yet
prefer_skip_nested_validation=False
)
def fit(self, X, y=None, **params):
"""Learn the features to select from X.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training vectors, where `n_samples` is the number of samples and
`n_features` is the number of predictors.
y : array-like of shape (n_samples,), default=None
Target values. This parameter may be ignored for
unsupervised learning.
**params : dict, default=None
Parameters to be passed to the underlying `estimator`, `cv`
and `scorer` objects.
.. versionadded:: 1.6
Only available if `enable_metadata_routing=True`,
which can be set by using
``sklearn.set_config(enable_metadata_routing=True)``.
See :ref:`Metadata Routing User Guide <metadata_routing>` for
more details.
Returns
-------
self : object
Returns the instance itself.
"""
_raise_for_params(params, self, "fit")
tags = self.__sklearn_tags__()
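        # Only force finiteness checks when the wrapped estimator cannot handle
        # NaN itself; otherwise NaNs are passed through to the estimator.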
X = validate_data(
self,
X,
accept_sparse="csc",
ensure_min_features=2,
ensure_all_finite=not tags.input_tags.allow_nan,
)
n_features = X.shape[1]
if self.n_features_to_select == "auto":
if self.tol is not None:
# With auto feature selection, `n_features_to_select_` will be updated
# to `support_.sum()` after features are selected.
self.n_features_to_select_ = n_features - 1
else:
self.n_features_to_select_ = n_features // 2
elif isinstance(self.n_features_to_select, Integral):
if self.n_features_to_select >= n_features:
raise ValueError("n_features_to_select must be < n_features.")
self.n_features_to_select_ = self.n_features_to_select
elif isinstance(self.n_features_to_select, Real):
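            # A float in (0, 1] is interpreted as a fraction of the total number
            # of features, truncated towards zero by `int()`.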
self.n_features_to_select_ = int(n_features * self.n_features_to_select)
if self.tol is not None and self.tol < 0 and self.direction == "forward":
raise ValueError(
"tol must be strictly positive when doing forward selection"
)
cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))
cloned_estimator = clone(self.estimator)
# the current mask corresponds to the set of features:
# - that we have already *selected* if we do forward selection
# - that we have already *excluded* if we do backward selection
current_mask = np.zeros(shape=n_features, dtype=bool)
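        # Number of greedy iterations: forward selection (and "auto" mode) takes
        # one step per feature to select; backward selection with an explicit
        # target takes one step per feature to remove, i.e.
        # `n_features - n_features_to_select_` steps.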
n_iterations = (
self.n_features_to_select_
if self.n_features_to_select == "auto" or self.direction == "forward"
else n_features - self.n_features_to_select_
)
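        # Start below any attainable score so the first iteration never
        # triggers the early-stopping check.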
old_score = -np.inf
is_auto_select = self.tol is not None and self.n_features_to_select == "auto"
# We only need to verify the routing here and not use the routed params
# because internally the actual routing will also take place inside the
# `cross_val_score` function.
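        # For example, with `sklearn.set_config(enable_metadata_routing=True)` and
        # an estimator that requests weights via
        # `estimator.set_fit_request(sample_weight=True)`, calling
        # `SequentialFeatureSelector(estimator).fit(X, y, sample_weight=w)` forwards
        # the weights through `cross_val_score` to each internal fit.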
if _routing_enabled():
process_routing(self, "fit", **params)
for _ in range(n_iterations):
new_feature_idx, new_score = self._get_best_new_feature_score(
cloned_estimator, X, y, cv, current_mask, **params
)
if is_auto_select and ((new_score - old_score) < self.tol):
break
old_score = new_score
current_mask[new_feature_idx] = True
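        # In backward selection `current_mask` marks the *removed* features, so
        # invert it to obtain the mask of selected (kept) features.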
if self.direction == "backward":
current_mask = ~current_mask
self.support_ = current_mask
self.n_features_to_select_ = self.support_.sum()
return self

    def _get_best_new_feature_score(self, estimator, X, y, cv, current_mask, **params):
        # Return the best feature to add to `current_mask` together with its
        # score, i.e. the best feature to add (resp. remove) when doing
        # forward (resp. backward) selection. When `n_features_to_select` is
        # "auto", the caller only keeps the feature if the score improves on
        # the previous iteration's score by at least `tol`.
candidate_feature_indices = np.flatnonzero(~current_mask)
scores = {}
for feature_idx in candidate_feature_indices:
candidate_mask = current_mask.copy()
candidate_mask[feature_idx] = True
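            # For backward selection the mask marks removals, so invert it to get
            # the columns that would remain if this feature were removed.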
if self.direction == "backward":
candidate_mask = ~candidate_mask
X_new = X[:, candidate_mask]
scores[feature_idx] = cross_val_score(
estimator,
X_new,
y,
cv=cv,
scoring=self.scoring,
n_jobs=self.n_jobs,
params=params,
).mean()
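        # Greedily pick the candidate feature whose subset achieved the highest
        # mean cross-validation score.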
new_feature_idx = max(scores, key=lambda feature_idx: scores[feature_idx])
return new_feature_idx, scores[new_feature_idx]

    def _get_support_mask(self):
check_is_fitted(self)
return self.support_

    def __sklearn_tags__(self):
tags = super().__sklearn_tags__()
tags.input_tags.allow_nan = get_tags(self.estimator).input_tags.allow_nan
tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse
return tags

    def get_metadata_routing(self):
        """Get metadata routing of this object.

        Please check :ref:`User Guide <metadata_routing>` on how the routing
        mechanism works.

        .. versionadded:: 1.6

        Returns
        -------
        routing : MetadataRouter
            A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating
            routing information.
        """
router = MetadataRouter(owner=self.__class__.__name__)
router.add(
estimator=self.estimator,
method_mapping=MethodMapping().add(caller="fit", callee="fit"),
)
router.add(
splitter=check_cv(self.cv, classifier=is_classifier(self.estimator)),
method_mapping=MethodMapping().add(caller="fit", callee="split"),
)
router.add(
scorer=check_scoring(self.estimator, scoring=self.scoring),
method_mapping=MethodMapping().add(caller="fit", callee="score"),
)
return router