import numpy as np
import pytest

from sklearn.datasets import (
    load_iris,
    make_classification,
    make_multilabel_classification,
    make_regression,
)
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import (
    LinearRegression,
    LogisticRegression,
)
from sklearn.multioutput import ClassifierChain
from sklearn.preprocessing import scale
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils._mocking import _MockEstimatorOnOffPrediction
from sklearn.utils._response import _get_response_values, _get_response_values_binary
from sklearn.utils._testing import assert_allclose, assert_array_equal

X, y = load_iris(return_X_y=True)
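# scale the data to avoid ConvergenceWarning with LogisticRegression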
X = scale(X, copy=False)
X_binary, y_binary = X[:100], y[:100]


@pytest.mark.parametrize(
    "response_method", ["decision_function", "predict_proba", "predict_log_proba"]
)
def test_get_response_values_regressor_error(response_method):
    """Check the error message with a regressor and an unsupported response
    method."""
    my_estimator = _MockEstimatorOnOffPrediction(response_methods=[response_method])
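    # placeholder input: the error is raised before X is ever inspected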
    X = "mocking_data", "mocking_target"
    err_msg = f"{my_estimator.__class__.__name__} should either be a classifier"
    with pytest.raises(ValueError, match=err_msg):
        _get_response_values(my_estimator, X, response_method=response_method)


@pytest.mark.parametrize("return_response_method_used", [True, False])
def test_get_response_values_regressor(return_response_method_used):
    """Check the behaviour of `_get_response_values` with a regressor."""
    X, y = make_regression(n_samples=10, random_state=0)
    regressor = LinearRegression().fit(X, y)
    results = _get_response_values(
        regressor,
        X,
        response_method="predict",
        return_response_method_used=return_response_method_used,
    )
    assert_array_equal(results[0], regressor.predict(X))
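    # a regressor has no notion of `pos_label`, hence None is returned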
    assert results[1] is None
    if return_response_method_used:
        assert results[2] == "predict"


@pytest.mark.parametrize(
    "response_method",
    ["predict", "decision_function", ["decision_function", "predict"]],
)
@pytest.mark.parametrize("return_response_method_used", [True, False])
def test_get_response_values_outlier_detection(
    response_method, return_response_method_used
):
    """Check the behaviour of `_get_response_values` with an outlier detector."""
    X, y = make_classification(n_samples=50, random_state=0)
    outlier_detector = IsolationForest(random_state=0).fit(X, y)
    results = _get_response_values(
        outlier_detector,
        X,
        response_method=response_method,
        return_response_method_used=return_response_method_used,
    )
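    # with a list of methods, the first one available on the estimator is used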
    chosen_response_method = (
        response_method[0] if isinstance(response_method, list) else response_method
    )
    prediction_method = getattr(outlier_detector, chosen_response_method)
    assert_array_equal(results[0], prediction_method(X))
    assert results[1] is None
    if return_response_method_used:
        assert results[2] == chosen_response_method


@pytest.mark.parametrize(
    "response_method",
    ["predict_proba", "decision_function", "predict", "predict_log_proba"],
)
def test_get_response_values_classifier_unknown_pos_label(response_method):
    """Check that `_get_response_values` raises the proper error message with
    a classifier and an unknown `pos_label`."""
    X, y = make_classification(n_samples=10, n_classes=2, random_state=0)
    classifier = LogisticRegression().fit(X, y)

    err_msg = r"pos_label=whatever is not a valid label: It should be one of \[0 1\]"
    with pytest.raises(ValueError, match=err_msg):
        _get_response_values(
            classifier,
            X,
            response_method=response_method,
            pos_label="whatever",
        )


@pytest.mark.parametrize("response_method", ["predict_proba", "predict_log_proba"])
def test_get_response_values_classifier_inconsistent_y_pred_for_binary_proba(
    response_method,
):
    """Check that `_get_response_values` raises an error when `y_pred` has a
    single class with `predict_proba`."""
    X, y_two_class = make_classification(n_samples=10, n_classes=2, random_state=0)
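    # fitting on a single class makes `predict_proba` return a single column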
    y_single_class = np.zeros_like(y_two_class)
    classifier = DecisionTreeClassifier().fit(X, y_single_class)

    err_msg = (
        r"Got predict_proba of shape \(10, 1\), but need classifier with "
        r"two classes"
    )
    with pytest.raises(ValueError, match=err_msg):
        _get_response_values(classifier, X, response_method=response_method)


@pytest.mark.parametrize("return_response_method_used", [True, False])
def test_get_response_values_binary_classifier_decision_function(
    return_response_method_used,
):
    """Check the behaviour of `_get_response_values` with `decision_function`
    and a binary classifier."""
    X, y = make_classification(
        n_samples=10,
        n_classes=2,
        weights=[0.3, 0.7],
        random_state=0,
    )
    classifier = LogisticRegression().fit(X, y)
    response_method = "decision_function"
    results = _get_response_values(
        classifier,
        X,
        response_method=response_method,
        pos_label=None,
        return_response_method_used=return_response_method_used,
    )
    assert_allclose(results[0], classifier.decision_function(X))
    assert results[1] == 1
    if return_response_method_used:
        assert results[2] == "decision_function"
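
    # forcing `pos_label` to `classes_[0]` flips the sign of the decision values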
    results = _get_response_values(
        classifier,
        X,
        response_method=response_method,
        pos_label=classifier.classes_[0],
        return_response_method_used=return_response_method_used,
    )
    assert_allclose(results[0], classifier.decision_function(X) * -1)
    assert results[1] == 0
    if return_response_method_used:
        assert results[2] == "decision_function"


@pytest.mark.parametrize("return_response_method_used", [True, False])
@pytest.mark.parametrize("response_method", ["predict_proba", "predict_log_proba"])
def test_get_response_values_binary_classifier_predict_proba(
    return_response_method_used, response_method
):
    """Check the behaviour of `_get_response_values` with `predict_proba` and
    a binary classifier."""
    X, y = make_classification(
        n_samples=10,
        n_classes=2,
        weights=[0.3, 0.7],
        random_state=0,
    )
    classifier = LogisticRegression().fit(X, y)
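
    # by default, the column of the positive class (`classes_[1]`) is returned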
    results = _get_response_values(
        classifier,
        X,
        response_method=response_method,
        pos_label=None,
        return_response_method_used=return_response_method_used,
    )
    assert_allclose(results[0], getattr(classifier, response_method)(X)[:, 1])
    assert results[1] == 1
    if return_response_method_used:
        assert len(results) == 3
        assert results[2] == response_method
    else:
        assert len(results) == 2
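
    # forcing `pos_label` to `classes_[0]` selects the first column instead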
    y_pred, pos_label, *_ = _get_response_values(
        classifier,
        X,
        response_method=response_method,
        pos_label=classifier.classes_[0],
        return_response_method_used=return_response_method_used,
    )
    assert_allclose(y_pred, getattr(classifier, response_method)(X)[:, 0])
    assert pos_label == 0


@pytest.mark.parametrize(
    "estimator, X, y, err_msg, params",
    [
        (
            DecisionTreeRegressor(),
            X_binary,
            y_binary,
            "Expected 'estimator' to be a binary classifier",
            {"response_method": "auto"},
        ),
        (
            DecisionTreeClassifier(),
            X_binary,
            y_binary,
            r"pos_label=unknown is not a valid label: It should be one of \[0 1\]",
            {"response_method": "auto", "pos_label": "unknown"},
        ),
        (
            DecisionTreeClassifier(),
            X,
            y,
            "be a binary classifier. Got 3 classes instead.",
            {"response_method": "predict_proba"},
        ),
    ],
)
def test_get_response_error(estimator, X, y, err_msg, params):
    """Check that we raise the proper error messages in
    `_get_response_values_binary`."""

    estimator.fit(X, y)
    with pytest.raises(ValueError, match=err_msg):
        _get_response_values_binary(estimator, X, **params)


@pytest.mark.parametrize("return_response_method_used", [True, False])
def test_get_response_predict_proba(return_response_method_used):
    """Check the behaviour of `_get_response_values_binary` using `predict_proba`."""
    classifier = DecisionTreeClassifier().fit(X_binary, y_binary)
    results = _get_response_values_binary(
        classifier,
        X_binary,
        response_method="predict_proba",
        return_response_method_used=return_response_method_used,
    )
    assert_allclose(results[0], classifier.predict_proba(X_binary)[:, 1])
    assert results[1] == 1
    if return_response_method_used:
        assert results[2] == "predict_proba"
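
    # `pos_label=0` selects the probability column of the negative class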
    results = _get_response_values_binary(
        classifier,
        X_binary,
        response_method="predict_proba",
        pos_label=0,
        return_response_method_used=return_response_method_used,
    )
    assert_allclose(results[0], classifier.predict_proba(X_binary)[:, 0])
    assert results[1] == 0
    if return_response_method_used:
        assert results[2] == "predict_proba"


@pytest.mark.parametrize("return_response_method_used", [True, False])
def test_get_response_decision_function(return_response_method_used):
    """Check the behaviour of `_get_response_values_binary` using
    `decision_function`."""
    classifier = LogisticRegression().fit(X_binary, y_binary)
    results = _get_response_values_binary(
        classifier,
        X_binary,
        response_method="decision_function",
        return_response_method_used=return_response_method_used,
    )
    assert_allclose(results[0], classifier.decision_function(X_binary))
    assert results[1] == 1
    if return_response_method_used:
        assert results[2] == "decision_function"
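
    # `pos_label=0` flips the sign of the decision values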
    results = _get_response_values_binary(
        classifier,
        X_binary,
        response_method="decision_function",
        pos_label=0,
        return_response_method_used=return_response_method_used,
    )
    assert_allclose(results[0], classifier.decision_function(X_binary) * -1)
    assert results[1] == 0
    if return_response_method_used:
        assert results[2] == "decision_function"


@pytest.mark.parametrize(
    "estimator, response_method",
    [
        (DecisionTreeClassifier(max_depth=2, random_state=0), "predict_proba"),
        (DecisionTreeClassifier(max_depth=2, random_state=0), "predict_log_proba"),
        (LogisticRegression(), "decision_function"),
    ],
)
def test_get_response_values_multiclass(estimator, response_method):
    """Check that we can call `_get_response_values` with a multiclass estimator.

    It should return the predictions untouched.
    """
    estimator.fit(X, y)
    predictions, pos_label = _get_response_values(
        estimator, X, response_method=response_method
    )

    assert pos_label is None
    assert predictions.shape == (X.shape[0], len(estimator.classes_))
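    # probabilities lie in [0, 1]; log-probabilities are non-positive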
    if response_method == "predict_proba":
        assert np.logical_and(predictions >= 0, predictions <= 1).all()
    elif response_method == "predict_log_proba":
        assert (predictions <= 0.0).all()


def test_get_response_values_with_response_list():
    """Check the behaviour of passing a list of responses to `_get_response_values`."""
    classifier = LogisticRegression().fit(X_binary, y_binary)
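
    # `predict_proba` comes first in the list and is available, so it is used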
    y_pred, pos_label, response_method = _get_response_values(
        classifier,
        X_binary,
        response_method=["predict_proba", "decision_function"],
        return_response_method_used=True,
    )
    assert_allclose(y_pred, classifier.predict_proba(X_binary)[:, 1])
    assert pos_label == 1
    assert response_method == "predict_proba"
    y_pred, pos_label, response_method = _get_response_values(
        classifier,
        X_binary,
        response_method=["decision_function", "predict_proba"],
        return_response_method_used=True,
    )
    assert_allclose(y_pred, classifier.decision_function(X_binary))
    assert pos_label == 1
    assert response_method == "decision_function"


@pytest.mark.parametrize(
    "response_method", ["predict_proba", "decision_function", "predict"]
)
def test_get_response_values_multilabel_indicator(response_method):
    """Check the behaviour of `_get_response_values` with multilabel-indicator
    targets."""
    X, Y = make_multilabel_classification(random_state=0)
    estimator = ClassifierChain(LogisticRegression()).fit(X, Y)

    y_pred, pos_label = _get_response_values(
        estimator, X, response_method=response_method
    )
    assert pos_label is None
    assert y_pred.shape == Y.shape

    if response_method == "predict_proba":
        assert np.logical_and(y_pred >= 0, y_pred <= 1).all()
    elif response_method == "decision_function":
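        # values returned by `decision_function` are not bounded in [0, 1]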
        assert (y_pred < 0).sum() > 0
        assert (y_pred > 1).sum() > 0
    else:
        assert np.logical_or(y_pred == 0, y_pred == 1).all()