|
import pickle |
|
|
|
import numpy as np |
|
import pytest |
|
from numpy.testing import assert_array_equal |
|
|
|
from sklearn.utils._encode import _check_unknown, _encode, _get_counts, _unique |
|
|
|
|
|
@pytest.mark.parametrize(
    "values, expected",
    [
        pytest.param(
            np.array([2, 1, 3, 1, 3], dtype="int64"),
            np.array([1, 2, 3], dtype="int64"),
            id="int64",
        ),
        pytest.param(
            np.array([2, 1, np.nan, 1, np.nan], dtype="float32"),
            np.array([1, 2, np.nan], dtype="float32"),
            id="float32-nan",
        ),
        pytest.param(
            np.array(["b", "a", "c", "a", "c"], dtype=object),
            np.array(["a", "b", "c"], dtype=object),
            id="object",
        ),
        pytest.param(
            np.array(["b", "a", None, "a", None], dtype=object),
            np.array(["a", "b", None], dtype=object),
            id="object-None",
        ),
        pytest.param(
            np.array(["b", "a", "c", "a", "c"]),
            np.array(["a", "b", "c"]),
            id="str",
        ),
    ],
)
def test_encode_util(values, expected):
    """Check `_unique` and `_encode` across dtypes and return options.

    Every parametrized input follows the same layout
    ``[second, first, third, first, third]``, so the expected integer codes
    and per-unique counts are identical for all cases.
    """
    expected_codes = np.array([1, 0, 2, 0, 2])
    expected_counts = np.array([2, 1, 2])

    # Uniques only.
    uniques = _unique(values)
    assert_array_equal(uniques, expected)

    # Uniques together with the inverse mapping.
    found, codes = _unique(values, return_inverse=True)
    assert_array_equal(found, expected)
    assert_array_equal(codes, expected_codes)

    # Encoding against the previously computed uniques gives the same codes.
    codes = _encode(values, uniques=uniques)
    assert_array_equal(codes, expected_codes)

    # Uniques together with their counts.
    found, tallies = _unique(values, return_counts=True)
    assert_array_equal(found, expected)
    assert_array_equal(tallies, expected_counts)

    # All three outputs requested at once.
    found, codes, tallies = _unique(values, return_inverse=True, return_counts=True)
    assert_array_equal(found, expected)
    assert_array_equal(codes, expected_codes)
    assert_array_equal(tallies, expected_counts)
|
|
|
|
|
def test_encode_with_check_unknown():
    """Check how the `check_unknown` flag of `_encode` affects error raising."""
    unseen_msg = "y contains previously unseen labels"

    known_ints = np.array([1, 2, 3])
    int_values = np.array([1, 2, 3, 4])

    # With check_unknown=True the unseen value 4 must raise.
    with pytest.raises(ValueError, match=unseen_msg):
        _encode(int_values, uniques=known_ints, check_unknown=True)

    # With check_unknown=False the same numeric call must not raise.
    _encode(int_values, uniques=known_ints, check_unknown=False)

    # For object dtype the error is expected even with check_unknown=False.
    known_labels = np.array(["a", "b", "c"], dtype=object)
    label_values = np.array(["a", "b", "c", "d"], dtype=object)
    with pytest.raises(ValueError, match=unseen_msg):
        _encode(label_values, uniques=known_labels, check_unknown=False)
|
|
|
|
|
def _assert_check_unknown(values, uniques, expected_diff, expected_mask):
    """Assert `_check_unknown` results both with and without the mask.

    Parameters
    ----------
    values : array-like
        Values passed to `_check_unknown` as the data to inspect.
    uniques : array-like
        Known values passed to `_check_unknown`.
    expected_diff : array-like
        Expected unknown values, compared for both call variants.
    expected_mask : array-like
        Expected mask returned when ``return_mask=True``.
    """
    # Default call: only the diff is returned.
    diff_only = _check_unknown(values, uniques)
    assert_array_equal(diff_only, expected_diff)

    # With return_mask=True, a (diff, mask) pair is returned.
    diff_with_mask, mask = _check_unknown(values, uniques, return_mask=True)
    assert_array_equal(diff_with_mask, expected_diff)
    assert_array_equal(mask, expected_mask)
|
|
|
|
|
@pytest.mark.parametrize(
    "values, uniques, expected_diff, expected_mask",
    [
        # Integers: 4 is absent from the known values.
        (np.array([1, 2, 3, 4]), np.array([1, 2, 3]), [4], [True, True, True, False]),
        # Integers with the known values given out of order.
        (np.array([2, 1, 4, 5]), np.array([2, 5, 1]), [4], [True, True, False, True]),
        # nan appears only in the values, so it is reported as unknown.
        (np.array([2, 1, np.nan]), np.array([2, 5, 1]), [np.nan], [True, True, False]),
        # nan appears on both sides, so only 4 is unknown.
        (
            np.array([2, 1, 4, np.nan]),
            np.array([2, 5, 1, np.nan]),
            [4],
            [True, True, False, True],
        ),
        # Both 4 and nan are missing from the known values.
        (
            np.array([2, 1, 4, np.nan]),
            np.array([2, 5, 1]),
            [4, np.nan],
            [True, True, False, False],
        ),
        # nan appears only in the known values; it is not part of the diff.
        (
            np.array([2, 1, 4, 5]),
            np.array([2, 5, 1, np.nan]),
            [4],
            [True, True, False, True],
        ),
        # Object dtype strings, unknown value last.
        (
            np.array(["a", "b", "c", "d"], dtype=object),
            np.array(["a", "b", "c"], dtype=object),
            np.array(["d"], dtype=object),
            [True, True, True, False],
        ),
        # Object dtype strings, unknown value first and known values unordered.
        (
            np.array(["d", "c", "a", "b"], dtype=object),
            np.array(["a", "c", "b"], dtype=object),
            np.array(["d"], dtype=object),
            [False, True, True, True],
        ),
        # Native string dtype, unknown value last.
        (
            np.array(["a", "b", "c", "d"]),
            np.array(["a", "b", "c"]),
            np.array(["d"]),
            [True, True, True, False],
        ),
        # Native string dtype, unknown value first and known values unordered.
        (
            np.array(["d", "c", "a", "b"]),
            np.array(["a", "c", "b"]),
            np.array(["d"]),
            [False, True, True, True],
        ),
    ],
)
def test_check_unknown(values, uniques, expected_diff, expected_mask):
    """Check `_check_unknown` diff and mask for numeric and string inputs."""
    _assert_check_unknown(
        values=values,
        uniques=uniques,
        expected_diff=expected_diff,
        expected_mask=expected_mask,
    )
|
|
|
|
|
@pytest.mark.parametrize("missing_value", [None, np.nan, float("nan")])
@pytest.mark.parametrize("pickle_uniques", [True, False])
def test_check_unknown_missing_values(missing_value, pickle_uniques):
    """Check `_check_unknown` on object arrays holding a missing value.

    The known values are optionally round-tripped through pickle first.
    NOTE(review): presumably this guards against relying on the object
    identity of the nan instance -- confirm against `_check_unknown`.
    """

    def as_known(labels):
        # Build the object array of known values, pickling it when requested.
        known = np.array(labels, dtype=object)
        if pickle_uniques:
            known = pickle.loads(pickle.dumps(known))
        return known

    # Missing value is known: only "d" is reported.
    observed = np.array(["d", "c", "a", "b", missing_value], dtype=object)
    known = as_known(["c", "a", "b", missing_value])
    _assert_check_unknown(observed, known, ["d"], [False, True, True, True, True])

    # Missing value is not known: it is reported after "d".
    observed = np.array(["d", "c", "a", "b", missing_value], dtype=object)
    known = as_known(["c", "a", "b"])
    _assert_check_unknown(
        observed, known, ["d", missing_value], [False, True, True, True, False]
    )

    # The missing value is the only unknown entry.
    observed = np.array(["a", missing_value], dtype=object)
    known = as_known(["a", "b", "z"])
    _assert_check_unknown(observed, known, [missing_value], [True, False])
|
|
|
|
|
@pytest.mark.parametrize("missing_value", [np.nan, None, float("nan")])
@pytest.mark.parametrize("pickle_uniques", [True, False])
def test_unique_util_missing_values_objects(missing_value, pickle_uniques):
    """Check `_unique` and `_encode` on object arrays with one missing value.

    The expected uniques place the missing value last; the uniques are
    optionally pickled and unpickled before being used for encoding.
    """
    values = np.array(["a", "c", "c", missing_value, "b"], dtype=object)
    expected_uniques = np.array(["a", "b", "c", missing_value], dtype=object)

    found = _unique(values)

    if missing_value is not None:
        # For nan, compare the leading entries and check the last via isnan.
        assert_array_equal(found[:-1], expected_uniques[:-1])
        assert np.isnan(found[-1])
    else:
        assert_array_equal(found, expected_uniques)

    if pickle_uniques:
        found = pickle.loads(pickle.dumps(found))

    codes = _encode(values, uniques=found)
    assert_array_equal(codes, np.array([0, 2, 2, 3, 1]))
|
|
|
|
|
def test_unique_util_missing_values_numeric():
    """Check `_unique` and `_encode` on a float array containing nan.

    The expected uniques hold a single nan in the last position, and both nan
    entries of the input map to that last index.
    """
    data = np.array([3, 1, np.nan, 5, 3, np.nan], dtype=float)
    expected_uniques = np.array([1, 3, 5, np.nan], dtype=float)
    expected_inverse = np.array([1, 0, 3, 2, 1, 3])

    # Uniques only.
    found = _unique(data)
    assert_array_equal(found, expected_uniques)

    # Uniques together with the inverse mapping.
    found, inverse = _unique(data, return_inverse=True)
    assert_array_equal(found, expected_uniques)
    assert_array_equal(inverse, expected_inverse)

    # Encoding against those uniques reproduces the inverse mapping.
    codes = _encode(data, uniques=found)
    assert_array_equal(codes, expected_inverse)
|
|
|
|
|
def test_unique_util_with_all_missing_values():
    """Check `_unique` on an object array mixing None and nan values.

    The expected uniques end with None followed by a single nan, even though
    the input holds two different nan objects.
    """
    data = np.array([np.nan, "a", "c", "c", None, float("nan"), None], dtype=object)

    found = _unique(data)
    # Everything before the trailing nan can be compared directly.
    assert_array_equal(found[:-1], ["a", "c", None])
    # The trailing nan is checked with isnan rather than by equality.
    assert np.isnan(found[-1])

    # Both nan entries map to index 3 and both None entries to index 2.
    expected_inverse = [3, 0, 1, 1, 2, 3, 2]
    _, codes = _unique(data, return_inverse=True)
    assert_array_equal(codes, expected_inverse)
|
|
|
|
|
def test_check_unknown_with_both_missing_values():
    """Check `_check_unknown` when values hold both None and nan.

    Neither missing value is among the known values, so the diff is expected
    to list None first and nan second.
    """
    data = np.array([np.nan, "a", "c", "c", None, np.nan, None], dtype=object)
    known_labels = ["a", "c"]

    # Diff only.
    unknown = _check_unknown(
        data, known_values=np.array(known_labels, dtype=object)
    )
    assert unknown[0] is None
    assert np.isnan(unknown[1])

    # Diff together with the mask; a fresh known-values array is built again.
    unknown, mask = _check_unknown(
        data,
        known_values=np.array(known_labels, dtype=object),
        return_mask=True,
    )
    assert unknown[0] is None
    assert np.isnan(unknown[1])
    assert_array_equal(mask, [False, True, True, True, False, False, False])
|
|
|
|
|
@pytest.mark.parametrize(
    "values, uniques, expected_counts",
    [
        # Integers, every unique is present.
        (np.array([1] * 10 + [2] * 4 + [3] * 15), np.array([1, 2, 3]), [10, 4, 15]),
        # Integers, the unique 5 never occurs and is counted as 0.
        (
            np.array([1] * 10 + [2] * 4 + [3] * 15),
            np.array([1, 2, 3, 5]),
            [10, 4, 15, 0],
        ),
        # Floats with nan listed as a unique.
        (
            np.array([np.nan] * 10 + [2] * 4 + [3] * 15),
            np.array([2, 3, np.nan]),
            [4, 15, 10],
        ),
        # Object strings, uniques in sorted order.
        (
            np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
            ["a", "b", "c"],
            [16, 4, 20],
        ),
        # Object strings, uniques in reverse order; counts follow that order.
        (
            np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
            ["c", "b", "a"],
            [20, 4, 16],
        ),
        # Object array with nan listed among the uniques.
        (
            np.array([np.nan] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
            ["c", np.nan, "a"],
            [20, 4, 16],
        ),
        # Object strings, the unique "e" never occurs and is counted as 0.
        (
            np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
            ["a", "b", "c", "e"],
            [16, 4, 20, 0],
        ),
    ],
)
def test_get_counts(values, uniques, expected_counts):
    """Check that `_get_counts` returns one count per entry of `uniques`."""
    observed = _get_counts(values, uniques)
    assert_array_equal(observed, expected_counts)
|
|