import numpy as np
import pytest

from sklearn.base import ClassifierMixin, clone, is_classifier
from sklearn.datasets import (
    load_diabetes,
    load_iris,
    make_classification,
    make_regression,
)
from sklearn.ensemble import (
    RandomForestClassifier,
    RandomForestRegressor,
    StackingClassifier,
    StackingRegressor,
    VotingClassifier,
    VotingRegressor,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR

X, y = load_iris(return_X_y=True)
X_r, y_r = load_diabetes(return_X_y=True)


@pytest.mark.parametrize(
    "X, y, estimator",
    [
        (
            *make_classification(n_samples=10),
            StackingClassifier(
                estimators=[
                    ("lr", LogisticRegression()),
                    ("svm", LinearSVC()),
                    ("rf", RandomForestClassifier(n_estimators=5, max_depth=3)),
                ],
                cv=2,
            ),
        ),
        (
            *make_classification(n_samples=10),
            VotingClassifier(
                estimators=[
                    ("lr", LogisticRegression()),
                    ("svm", LinearSVC()),
                    ("rf", RandomForestClassifier(n_estimators=5, max_depth=3)),
                ]
            ),
        ),
        (
            *make_regression(n_samples=10),
            StackingRegressor(
                estimators=[
                    ("lr", LinearRegression()),
                    ("svm", LinearSVR()),
                    ("rf", RandomForestRegressor(n_estimators=5, max_depth=3)),
                ],
                cv=2,
            ),
        ),
        (
            *make_regression(n_samples=10),
            VotingRegressor(
                estimators=[
                    ("lr", LinearRegression()),
                    ("svm", LinearSVR()),
                    ("rf", RandomForestRegressor(n_estimators=5, max_depth=3)),
                ]
            ),
        ),
    ],
    ids=[
        "stacking-classifier",
        "voting-classifier",
        "stacking-regressor",
        "voting-regressor",
    ],
)
def test_ensemble_heterogeneous_estimators_behavior(X, y, estimator):
    # check that the behavior of `estimators`, `estimators_`,
    # `named_estimators`, and `named_estimators_` is consistent across all
    # heterogeneous ensemble classes and when using `set_params()`

    # before fit: `named_estimators` mirrors the constructor argument
    assert "svm" in estimator.named_estimators
    assert estimator.named_estimators.svm is estimator.estimators[1][1]
    assert estimator.named_estimators.svm is estimator.named_estimators["svm"]

    # after fit: `named_estimators_` exposes the fitted sub-estimators
    estimator.fit(X, y)
    assert len(estimator.named_estimators) == 3
    assert len(estimator.named_estimators_) == 3
    assert sorted(list(estimator.named_estimators_.keys())) == sorted(
        ["lr", "svm", "rf"]
    )

    # check that set_params() replaces the sub-estimator without adding a
    # new attribute and leaves the other sub-estimators untouched
    estimator_new_params = clone(estimator)
    svm_estimator = SVC() if is_classifier(estimator) else SVR()
    estimator_new_params.set_params(svm=svm_estimator).fit(X, y)
    assert not hasattr(estimator_new_params, "svm")
    assert (
        estimator_new_params.named_estimators.lr.get_params()
        == estimator.named_estimators.lr.get_params()
    )
    assert (
        estimator_new_params.named_estimators.rf.get_params()
        == estimator.named_estimators.rf.get_params()
    )

    # check the behavior when setting and dropping an estimator
    estimator_dropped = clone(estimator)
    estimator_dropped.set_params(svm="drop")
    estimator_dropped.fit(X, y)
    assert len(estimator_dropped.named_estimators) == 3
    assert estimator_dropped.named_estimators.svm == "drop"
    assert len(estimator_dropped.named_estimators_) == 3
    assert sorted(list(estimator_dropped.named_estimators_.keys())) == sorted(
        ["lr", "svm", "rf"]
    )
    for sub_est in estimator_dropped.named_estimators_.values():
        # the dropped estimator's type must not appear among the fitted
        # sub-estimators (iterate the values of the Bunch, not its keys)
        assert not isinstance(sub_est, type(estimator.named_estimators.svm))

    # check that nested parameters can be set on the sub-estimators and are
    # reflected by `get_params()`
    estimator.set_params(svm__C=10.0)
    estimator.set_params(rf__max_depth=5)
    assert (
        estimator.get_params()["svm__C"]
        == estimator.get_params()["svm"].get_params()["C"]
    )
    assert (
        estimator.get_params()["rf__max_depth"]
        == estimator.get_params()["rf"].get_params()["max_depth"]
    )

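
# Illustrative sketch, not part of the upstream suite: `named_estimators`
# is a Bunch (a dict with attribute access), so the two access styles used
# above are interchangeable. The toy data and the test name are ours.
def test_named_estimators_bunch_access_sketch():
    X_toy, y_toy = make_classification(n_samples=10, random_state=0)
    vote = VotingClassifier(estimators=[("lr", LogisticRegression())])
    # attribute access and key access resolve to the same estimator object
    assert vote.named_estimators["lr"] is vote.named_estimators.lr
    vote.fit(X_toy, y_toy)
    # after fit, the fitted clone is stored separately in `named_estimators_`
    assert hasattr(vote.named_estimators_["lr"], "coef_")

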
@pytest.mark.parametrize(
    "Ensemble",
    [StackingClassifier, VotingClassifier, StackingRegressor, VotingRegressor],
)
def test_ensemble_heterogeneous_estimators_type(Ensemble):
    # check that the ensemble raises during validation when the underlying
    # estimators are not of the expected type (classifier vs. regressor)
    if issubclass(Ensemble, ClassifierMixin):
        X, y = make_classification(n_samples=10)
        estimators = [("lr", LinearRegression())]
        ensemble_type = "classifier"
    else:
        X, y = make_regression(n_samples=10)
        estimators = [("lr", LogisticRegression())]
        ensemble_type = "regressor"
    ensemble = Ensemble(estimators=estimators)

    err_msg = "should be a {}".format(ensemble_type)
    with pytest.raises(ValueError, match=err_msg):
        ensemble.fit(X, y)

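
# Sketch (ours, not upstream): the classifier/regressor distinction checked
# above is also exposed through the public `is_classifier` helper.
def test_is_classifier_helper_sketch():
    assert is_classifier(LogisticRegression())
    assert not is_classifier(LinearRegression())
    # ensemble classes inherit the type of their mixin, not their members
    assert is_classifier(VotingClassifier(estimators=[]))

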
@pytest.mark.parametrize(
    "X, y, Ensemble",
    [
        (*make_classification(n_samples=10), StackingClassifier),
        (*make_classification(n_samples=10), VotingClassifier),
        (*make_regression(n_samples=10), StackingRegressor),
        (*make_regression(n_samples=10), VotingRegressor),
    ],
)
def test_ensemble_heterogeneous_estimators_name_validation(X, y, Ensemble):
    # raise an error when the name contains a double underscore, which is
    # reserved for nested parameter routing
    if issubclass(Ensemble, ClassifierMixin):
        estimators = [("lr__", LogisticRegression())]
    else:
        estimators = [("lr__", LinearRegression())]
    ensemble = Ensemble(estimators=estimators)

    err_msg = r"Estimator names must not contain __: got \['lr__'\]"
    with pytest.raises(ValueError, match=err_msg):
        ensemble.fit(X, y)

    # raise an error when the names are not unique
    if issubclass(Ensemble, ClassifierMixin):
        estimators = [("lr", LogisticRegression()), ("lr", LogisticRegression())]
    else:
        estimators = [("lr", LinearRegression()), ("lr", LinearRegression())]
    ensemble = Ensemble(estimators=estimators)

    err_msg = r"Names provided are not unique: \['lr', 'lr'\]"
    with pytest.raises(ValueError, match=err_msg):
        ensemble.fit(X, y)

    # raise an error when the name conflicts with a constructor argument
    if issubclass(Ensemble, ClassifierMixin):
        estimators = [("estimators", LogisticRegression())]
    else:
        estimators = [("estimators", LinearRegression())]
    ensemble = Ensemble(estimators=estimators)

    err_msg = "Estimator names conflict with constructor arguments"
    with pytest.raises(ValueError, match=err_msg):
        ensemble.fit(X, y)

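
# Sketch (ours): `__` is reserved in names because `set_params` uses it to
# route parameters to named sub-estimators, e.g. `lr__C` reaches the
# estimator registered under the name `lr`.
def test_dunder_param_routing_sketch():
    vote = VotingClassifier(estimators=[("lr", LogisticRegression())])
    vote.set_params(lr__C=0.5)
    # the routed value is visible both as a flat key and on the sub-estimator
    assert vote.get_params()["lr"].get_params()["C"] == 0.5

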
@pytest.mark.parametrize(
    "X, y, estimator",
    [
        (
            *make_classification(n_samples=10),
            StackingClassifier(estimators=[("lr", LogisticRegression())]),
        ),
        (
            *make_classification(n_samples=10),
            VotingClassifier(estimators=[("lr", LogisticRegression())]),
        ),
        (
            *make_regression(n_samples=10),
            StackingRegressor(estimators=[("lr", LinearRegression())]),
        ),
        (
            *make_regression(n_samples=10),
            VotingRegressor(estimators=[("lr", LinearRegression())]),
        ),
    ],
    ids=[
        "stacking-classifier",
        "voting-classifier",
        "stacking-regressor",
        "voting-regressor",
    ],
)
def test_ensemble_heterogeneous_estimators_all_dropped(X, y, estimator):
    # check that a consistent error is raised when all the estimators are
    # dropped
    estimator.set_params(lr="drop")
    with pytest.raises(ValueError, match="All estimators are dropped."):
        estimator.fit(X, y)

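
# Sketch (ours): dropping is per-estimator; as long as one estimator
# remains, fitting succeeds and the dropped slot keeps the "drop" marker
# (behavior of recent scikit-learn versions, as asserted above).
def test_partial_drop_sketch():
    X_toy, y_toy = make_classification(n_samples=10, random_state=0)
    vote = VotingClassifier(
        estimators=[("lr", LogisticRegression()), ("svm", LinearSVC())]
    )
    vote.set_params(svm="drop").fit(X_toy, y_toy)
    assert vote.named_estimators_["svm"] == "drop"

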
@pytest.mark.parametrize(
    "Ensemble, Estimator, X, y",
    [
        (StackingClassifier, LogisticRegression, X, y),
        (StackingRegressor, LinearRegression, X_r, y_r),
        (VotingClassifier, LogisticRegression, X, y),
        (VotingRegressor, LinearRegression, X_r, y_r),
    ],
)
def test_heterogeneous_ensemble_support_missing_values(Ensemble, Estimator, X, y):
    # check that voting and stacking ensembles delegate missing-values
    # validation to the underlying estimators: a pipeline whose
    # SimpleImputer step makes NaN input acceptable
    X = X.copy()
    rng = np.random.RandomState(42)  # seed the mask for reproducibility
    mask = rng.choice([1, 0], X.shape, p=[0.1, 0.9]).astype(bool)
    X[mask] = np.nan
    pipe = make_pipeline(SimpleImputer(), Estimator())
    ensemble = Ensemble(estimators=[("pipe1", pipe), ("pipe2", pipe)])
    ensemble.fit(X, y).score(X, y)
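
# Sketch (ours): without the SimpleImputer step the delegated validation
# fails, since the bare sub-estimator rejects NaN input during fitting.
def test_missing_values_without_imputer_sketch():
    X_nan, y_clf = X.copy(), y
    X_nan[0, 0] = np.nan
    ensemble = VotingClassifier(estimators=[("lr", LogisticRegression())])
    with pytest.raises(ValueError):
        ensemble.fit(X_nan, y_clf)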