Sam Chaudry

Upload folder using huggingface_hub

7885a28 verified about 1 month ago

79.5 kB

	import re
	import warnings

	import numpy as np
	import pytest
	from scipy import sparse

	from sklearn.exceptions import NotFittedError
	from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
	from sklearn.utils._missing import is_scalar_nan
	from sklearn.utils._testing import (
	_convert_container,
	assert_allclose,
	assert_array_equal,
	)
	from sklearn.utils.fixes import CSR_CONTAINERS


	def test_one_hot_encoder_sparse_dense():
	# check that sparse and dense will give the same results

	X = np.array([[3, 2, 1], [0, 1, 1]])
	enc_sparse = OneHotEncoder()
	enc_dense = OneHotEncoder(sparse_output=False)

	X_trans_sparse = enc_sparse.fit_transform(X)
	X_trans_dense = enc_dense.fit_transform(X)

	assert X_trans_sparse.shape == (2, 5)
	assert X_trans_dense.shape == (2, 5)

	assert sparse.issparse(X_trans_sparse)
	assert not sparse.issparse(X_trans_dense)

	# check outcome
	assert_array_equal(
	X_trans_sparse.toarray(), [[0.0, 1.0, 0.0, 1.0, 1.0], [1.0, 0.0, 1.0, 0.0, 1.0]]
	)
	assert_array_equal(X_trans_sparse.toarray(), X_trans_dense)


	@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"])
	def test_one_hot_encoder_handle_unknown(handle_unknown):
	X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
	X2 = np.array([[4, 1, 1]])

	# Test that one hot encoder raises error for unknown features
	# present during transform.
	oh = OneHotEncoder(handle_unknown="error")
	oh.fit(X)
	with pytest.raises(ValueError, match="Found unknown categories"):
	oh.transform(X2)

	# Test the ignore option, ignores unknown features (giving all 0's)
	oh = OneHotEncoder(handle_unknown=handle_unknown)
	oh.fit(X)
	X2_passed = X2.copy()
	assert_array_equal(
	oh.transform(X2_passed).toarray(),
	np.array([[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]]),
	)
	# ensure transformed data was not modified in place
	assert_allclose(X2, X2_passed)


	@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"])
	def test_one_hot_encoder_handle_unknown_strings(handle_unknown):
	X = np.array(["11111111", "22", "333", "4444"]).reshape((-1, 1))
	X2 = np.array(["55555", "22"]).reshape((-1, 1))
	# Non Regression test for the issue #12470
	# Test the ignore option, when categories are numpy string dtype
	# particularly when the known category strings are larger
	# than the unknown category strings
	oh = OneHotEncoder(handle_unknown=handle_unknown)
	oh.fit(X)
	X2_passed = X2.copy()
	assert_array_equal(
	oh.transform(X2_passed).toarray(),
	np.array([[0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]]),
	)
	# ensure transformed data was not modified in place
	assert_array_equal(X2, X2_passed)


	@pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64])
	@pytest.mark.parametrize("input_dtype", [np.int32, np.float32, np.float64])
	def test_one_hot_encoder_dtype(input_dtype, output_dtype):
	X = np.asarray([[0, 1]], dtype=input_dtype).T
	X_expected = np.asarray([[1, 0], [0, 1]], dtype=output_dtype)

	oh = OneHotEncoder(categories="auto", dtype=output_dtype)
	assert_array_equal(oh.fit_transform(X).toarray(), X_expected)
	assert_array_equal(oh.fit(X).transform(X).toarray(), X_expected)

	oh = OneHotEncoder(categories="auto", dtype=output_dtype, sparse_output=False)
	assert_array_equal(oh.fit_transform(X), X_expected)
	assert_array_equal(oh.fit(X).transform(X), X_expected)


	@pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64])
	def test_one_hot_encoder_dtype_pandas(output_dtype):
	pd = pytest.importorskip("pandas")

	X_df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]})
	X_expected = np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=output_dtype)

	oh = OneHotEncoder(dtype=output_dtype)
	assert_array_equal(oh.fit_transform(X_df).toarray(), X_expected)
	assert_array_equal(oh.fit(X_df).transform(X_df).toarray(), X_expected)

	oh = OneHotEncoder(dtype=output_dtype, sparse_output=False)
	assert_array_equal(oh.fit_transform(X_df), X_expected)
	assert_array_equal(oh.fit(X_df).transform(X_df), X_expected)


	def test_one_hot_encoder_feature_names():
	enc = OneHotEncoder()
	X = [
	["Male", 1, "girl", 2, 3],
	["Female", 41, "girl", 1, 10],
	["Male", 51, "boy", 12, 3],
	["Male", 91, "girl", 21, 30],
	]

	enc.fit(X)
	feature_names = enc.get_feature_names_out()

	assert_array_equal(
	[
	"x0_Female",
	"x0_Male",
	"x1_1",
	"x1_41",
	"x1_51",
	"x1_91",
	"x2_boy",
	"x2_girl",
	"x3_1",
	"x3_2",
	"x3_12",
	"x3_21",
	"x4_3",
	"x4_10",
	"x4_30",
	],
	feature_names,
	)

	feature_names2 = enc.get_feature_names_out(["one", "two", "three", "four", "five"])

	assert_array_equal(
	[
	"one_Female",
	"one_Male",
	"two_1",
	"two_41",
	"two_51",
	"two_91",
	"three_boy",
	"three_girl",
	"four_1",
	"four_2",
	"four_12",
	"four_21",
	"five_3",
	"five_10",
	"five_30",
	],
	feature_names2,
	)

	with pytest.raises(ValueError, match="input_features should have length"):
	enc.get_feature_names_out(["one", "two"])


	def test_one_hot_encoder_feature_names_unicode():
	enc = OneHotEncoder()
	X = np.array([["c❤t1", "dat2"]], dtype=object).T
	enc.fit(X)
	feature_names = enc.get_feature_names_out()
	assert_array_equal(["x0_c❤t1", "x0_dat2"], feature_names)
	feature_names = enc.get_feature_names_out(input_features=["n👍me"])
	assert_array_equal(["n👍me_c❤t1", "n👍me_dat2"], feature_names)


	def test_one_hot_encoder_custom_feature_name_combiner():
	"""Check the behaviour of `feature_name_combiner` as a callable."""

	def name_combiner(feature, category):
	return feature + "_" + repr(category)

	enc = OneHotEncoder(feature_name_combiner=name_combiner)
	X = np.array([["None", None]], dtype=object).T
	enc.fit(X)
	feature_names = enc.get_feature_names_out()
	assert_array_equal(["x0_'None'", "x0_None"], feature_names)
	feature_names = enc.get_feature_names_out(input_features=["a"])
	assert_array_equal(["a_'None'", "a_None"], feature_names)

	def wrong_combiner(feature, category):
	# we should be returning a Python string
	return 0

	enc = OneHotEncoder(feature_name_combiner=wrong_combiner).fit(X)
	err_msg = (
	"When `feature_name_combiner` is a callable, it should return a Python string."
	)
	with pytest.raises(TypeError, match=err_msg):
	enc.get_feature_names_out()


	def test_one_hot_encoder_set_params():
	X = np.array([[1, 2]]).T
	oh = OneHotEncoder()
	# set params on not yet fitted object
	oh.set_params(categories=[[0, 1, 2, 3]])
	assert oh.get_params()["categories"] == [[0, 1, 2, 3]]
	assert oh.fit_transform(X).toarray().shape == (2, 4)
	# set params on already fitted object
	oh.set_params(categories=[[0, 1, 2, 3, 4]])
	assert oh.fit_transform(X).toarray().shape == (2, 5)


	def check_categorical_onehot(X):
	enc = OneHotEncoder(categories="auto")
	Xtr1 = enc.fit_transform(X)

	enc = OneHotEncoder(categories="auto", sparse_output=False)
	Xtr2 = enc.fit_transform(X)

	assert_allclose(Xtr1.toarray(), Xtr2)

	assert sparse.issparse(Xtr1) and Xtr1.format == "csr"
	return Xtr1.toarray()


	@pytest.mark.parametrize(
	"X",
	[
	[["def", 1, 55], ["abc", 2, 55]],
	np.array([[10, 1, 55], [5, 2, 55]]),
	np.array([["b", "A", "cat"], ["a", "B", "cat"]], dtype=object),
	np.array([["b", 1, "cat"], ["a", np.nan, "cat"]], dtype=object),
	np.array([["b", 1, "cat"], ["a", float("nan"), "cat"]], dtype=object),
	np.array([[None, 1, "cat"], ["a", 2, "cat"]], dtype=object),
	np.array([[None, 1, None], ["a", np.nan, None]], dtype=object),
	np.array([[None, 1, None], ["a", float("nan"), None]], dtype=object),
	],
	ids=[
	"mixed",
	"numeric",
	"object",
	"mixed-nan",
	"mixed-float-nan",
	"mixed-None",
	"mixed-None-nan",
	"mixed-None-float-nan",
	],
	)
	def test_one_hot_encoder(X):
	Xtr = check_categorical_onehot(np.array(X)[:, [0]])
	assert_allclose(Xtr, [[0, 1], [1, 0]])

	Xtr = check_categorical_onehot(np.array(X)[:, [0, 1]])
	assert_allclose(Xtr, [[0, 1, 1, 0], [1, 0, 0, 1]])

	Xtr = OneHotEncoder(categories="auto").fit_transform(X)
	assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]])


	@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"])
	@pytest.mark.parametrize("sparse_", [False, True])
	@pytest.mark.parametrize("drop", [None, "first"])
	def test_one_hot_encoder_inverse(handle_unknown, sparse_, drop):
	X = [["abc", 2, 55], ["def", 1, 55], ["abc", 3, 55]]
	enc = OneHotEncoder(sparse_output=sparse_, drop=drop)
	X_tr = enc.fit_transform(X)
	exp = np.array(X, dtype=object)
	assert_array_equal(enc.inverse_transform(X_tr), exp)

	X = [[2, 55], [1, 55], [3, 55]]
	enc = OneHotEncoder(sparse_output=sparse_, categories="auto", drop=drop)
	X_tr = enc.fit_transform(X)
	exp = np.array(X)
	assert_array_equal(enc.inverse_transform(X_tr), exp)

	if drop is None:
	# with unknown categories
	# drop is incompatible with handle_unknown=ignore
	X = [["abc", 2, 55], ["def", 1, 55], ["abc", 3, 55]]
	enc = OneHotEncoder(
	sparse_output=sparse_,
	handle_unknown=handle_unknown,
	categories=[["abc", "def"], [1, 2], [54, 55, 56]],
	)
	X_tr = enc.fit_transform(X)
	exp = np.array(X, dtype=object)
	exp[2, 1] = None
	assert_array_equal(enc.inverse_transform(X_tr), exp)

	# with an otherwise numerical output, still object if unknown
	X = [[2, 55], [1, 55], [3, 55]]
	enc = OneHotEncoder(
	sparse_output=sparse_,
	categories=[[1, 2], [54, 56]],
	handle_unknown=handle_unknown,
	)
	X_tr = enc.fit_transform(X)
	exp = np.array(X, dtype=object)
	exp[2, 0] = None
	exp[:, 1] = None
	assert_array_equal(enc.inverse_transform(X_tr), exp)

	# incorrect shape raises
	X_tr = np.array([[0, 1, 1], [1, 0, 1]])
	msg = re.escape("Shape of the passed X data is not correct")
	with pytest.raises(ValueError, match=msg):
	enc.inverse_transform(X_tr)


	@pytest.mark.parametrize("sparse_", [False, True])
	@pytest.mark.parametrize(
	"X, X_trans",
	[
	([[2, 55], [1, 55], [2, 55]], [[0, 1, 1], [0, 0, 0], [0, 1, 1]]),
	(
	[["one", "a"], ["two", "a"], ["three", "b"], ["two", "a"]],
	[[0, 0, 0, 0, 0], [0, 0, 0, 0, 1], [0, 1, 0, 0, 0]],
	),
	],
	)
	def test_one_hot_encoder_inverse_transform_raise_error_with_unknown(
	X, X_trans, sparse_
	):
	"""Check that `inverse_transform` raise an error with unknown samples, no
	dropped feature, and `handle_unknow="error`.
	Non-regression test for:
	https://github.com/scikit-learn/scikit-learn/issues/14934
	"""
	enc = OneHotEncoder(sparse_output=sparse_).fit(X)
	msg = (
	r"Samples \[(\d )*\d\] can not be inverted when drop=None and "
	r"handle_unknown='error' because they contain all zeros"
	)

	if sparse_:
	# emulate sparse data transform by a one-hot encoder sparse.
	X_trans = _convert_container(X_trans, "sparse")
	with pytest.raises(ValueError, match=msg):
	enc.inverse_transform(X_trans)


	def test_one_hot_encoder_inverse_if_binary():
	X = np.array([["Male", 1], ["Female", 3], ["Female", 2]], dtype=object)
	ohe = OneHotEncoder(drop="if_binary", sparse_output=False)
	X_tr = ohe.fit_transform(X)
	assert_array_equal(ohe.inverse_transform(X_tr), X)


	@pytest.mark.parametrize("drop", ["if_binary", "first", None])
	@pytest.mark.parametrize("reset_drop", ["if_binary", "first", None])
	def test_one_hot_encoder_drop_reset(drop, reset_drop):
	# check that resetting drop option without refitting does not throw an error
	X = np.array([["Male", 1], ["Female", 3], ["Female", 2]], dtype=object)
	ohe = OneHotEncoder(drop=drop, sparse_output=False)
	ohe.fit(X)
	X_tr = ohe.transform(X)
	feature_names = ohe.get_feature_names_out()
	ohe.set_params(drop=reset_drop)
	assert_array_equal(ohe.inverse_transform(X_tr), X)
	assert_allclose(ohe.transform(X), X_tr)
	assert_array_equal(ohe.get_feature_names_out(), feature_names)


	@pytest.mark.parametrize("method", ["fit", "fit_transform"])
	@pytest.mark.parametrize("X", [[1, 2], np.array([3.0, 4.0])])
	def test_X_is_not_1D(X, method):
	oh = OneHotEncoder()

	msg = "Expected 2D array, got 1D array instead"
	with pytest.raises(ValueError, match=msg):
	getattr(oh, method)(X)


	@pytest.mark.parametrize("method", ["fit", "fit_transform"])
	def test_X_is_not_1D_pandas(method):
	pd = pytest.importorskip("pandas")
	X = pd.Series([6, 3, 4, 6])
	oh = OneHotEncoder()

	msg = f"Expected a 2-dimensional container but got {type(X)} instead."
	with pytest.raises(ValueError, match=msg):
	getattr(oh, method)(X)


	@pytest.mark.parametrize(
	"X, cat_exp, cat_dtype",
	[
	([["abc", 55], ["def", 55]], [["abc", "def"], [55]], np.object_),
	(np.array([[1, 2], [3, 2]]), [[1, 3], [2]], np.integer),
	(
	np.array([["A", "cat"], ["B", "cat"]], dtype=object),
	[["A", "B"], ["cat"]],
	np.object_,
	),
	(np.array([["A", "cat"], ["B", "cat"]]), [["A", "B"], ["cat"]], np.str_),
	(np.array([[1, 2], [np.nan, 2]]), [[1, np.nan], [2]], np.float64),
	(
	np.array([["A", np.nan], [None, np.nan]], dtype=object),
	[["A", None], [np.nan]],
	np.object_,
	),
	(
	np.array([["A", float("nan")], [None, float("nan")]], dtype=object),
	[["A", None], [float("nan")]],
	np.object_,
	),
	],
	ids=[
	"mixed",
	"numeric",
	"object",
	"string",
	"missing-float",
	"missing-np.nan-object",
	"missing-float-nan-object",
	],
	)
	def test_one_hot_encoder_categories(X, cat_exp, cat_dtype):
	# order of categories should not depend on order of samples
	for Xi in [X, X[::-1]]:
	enc = OneHotEncoder(categories="auto")
	enc.fit(Xi)
	# assert enc.categories == 'auto'
	assert isinstance(enc.categories_, list)
	for res, exp in zip(enc.categories_, cat_exp):
	res_list = res.tolist()
	if is_scalar_nan(exp[-1]):
	assert is_scalar_nan(res_list[-1])
	assert res_list[:-1] == exp[:-1]
	else:
	assert res.tolist() == exp
	assert np.issubdtype(res.dtype, cat_dtype)


	@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"])
	@pytest.mark.parametrize(
	"X, X2, cats, cat_dtype",
	[
	(
	np.array([["a", "b"]], dtype=object).T,
	np.array([["a", "d"]], dtype=object).T,
	[["a", "b", "c"]],
	np.object_,
	),
	(
	np.array([[1, 2]], dtype="int64").T,
	np.array([[1, 4]], dtype="int64").T,
	[[1, 2, 3]],
	np.int64,
	),
	(
	np.array([["a", "b"]], dtype=object).T,
	np.array([["a", "d"]], dtype=object).T,
	[np.array(["a", "b", "c"])],
	np.object_,
	),
	(
	np.array([[None, "a"]], dtype=object).T,
	np.array([[None, "b"]], dtype=object).T,
	[[None, "a", "z"]],
	object,
	),
	(
	np.array([["a", "b"]], dtype=object).T,
	np.array([["a", np.nan]], dtype=object).T,
	[["a", "b", "z"]],
	object,
	),
	(
	np.array([["a", None]], dtype=object).T,
	np.array([["a", np.nan]], dtype=object).T,
	[["a", None, "z"]],
	object,
	),
	],
	ids=[
	"object",
	"numeric",
	"object-string",
	"object-string-none",
	"object-string-nan",
	"object-None-and-nan",
	],
	)
	def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype, handle_unknown):
	enc = OneHotEncoder(categories=cats)
	exp = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])
	assert_array_equal(enc.fit_transform(X).toarray(), exp)
	assert list(enc.categories[0]) == list(cats[0])
	assert enc.categories_[0].tolist() == list(cats[0])
	# manually specified categories should have same dtype as
	# the data when coerced from lists
	assert enc.categories_[0].dtype == cat_dtype

	# when specifying categories manually, unknown categories should already
	# raise when fitting
	enc = OneHotEncoder(categories=cats)
	with pytest.raises(ValueError, match="Found unknown categories"):
	enc.fit(X2)
	enc = OneHotEncoder(categories=cats, handle_unknown=handle_unknown)
	exp = np.array([[1.0, 0.0, 0.0], [0.0, 0.0, 0.0]])
	assert_array_equal(enc.fit(X2).transform(X2).toarray(), exp)


	def test_one_hot_encoder_unsorted_categories():
	X = np.array([["a", "b"]], dtype=object).T

	enc = OneHotEncoder(categories=[["b", "a", "c"]])
	exp = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0]])
	assert_array_equal(enc.fit(X).transform(X).toarray(), exp)
	assert_array_equal(enc.fit_transform(X).toarray(), exp)
	assert enc.categories_[0].tolist() == ["b", "a", "c"]
	assert np.issubdtype(enc.categories_[0].dtype, np.object_)

	# unsorted passed categories still raise for numerical values
	X = np.array([[1, 2]]).T
	enc = OneHotEncoder(categories=[[2, 1, 3]])
	msg = "Unsorted categories are not supported"
	with pytest.raises(ValueError, match=msg):
	enc.fit_transform(X)


	@pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder])
	def test_encoder_nan_ending_specified_categories(Encoder):
	"""Test encoder for specified categories that nan is at the end.

	Non-regression test for:
	https://github.com/scikit-learn/scikit-learn/issues/27088
	"""
	cats = [np.array([0, np.nan, 1])]
	enc = Encoder(categories=cats)
	X = np.array([[0, 1]], dtype=object).T
	with pytest.raises(ValueError, match="Nan should be the last element"):
	enc.fit(X)


	def test_one_hot_encoder_specified_categories_mixed_columns():
	# multiple columns
	X = np.array([["a", "b"], [0, 2]], dtype=object).T
	enc = OneHotEncoder(categories=[["a", "b", "c"], [0, 1, 2]])
	exp = np.array([[1.0, 0.0, 0.0, 1.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0, 1.0]])
	assert_array_equal(enc.fit_transform(X).toarray(), exp)
	assert enc.categories_[0].tolist() == ["a", "b", "c"]
	assert np.issubdtype(enc.categories_[0].dtype, np.object_)
	assert enc.categories_[1].tolist() == [0, 1, 2]
	# integer categories but from object dtype data
	assert np.issubdtype(enc.categories_[1].dtype, np.object_)


	def test_one_hot_encoder_pandas():
	pd = pytest.importorskip("pandas")

	X_df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]})

	Xtr = check_categorical_onehot(X_df)
	assert_allclose(Xtr, [[1, 0, 1, 0], [0, 1, 0, 1]])


	@pytest.mark.parametrize(
	"drop, expected_names",
	[
	("first", ["x0_c", "x2_b"]),
	("if_binary", ["x0_c", "x1_2", "x2_b"]),
	(["c", 2, "b"], ["x0_b", "x2_a"]),
	],
	ids=["first", "binary", "manual"],
	)
	def test_one_hot_encoder_feature_names_drop(drop, expected_names):
	X = [["c", 2, "a"], ["b", 2, "b"]]

	ohe = OneHotEncoder(drop=drop)
	ohe.fit(X)
	feature_names = ohe.get_feature_names_out()
	assert_array_equal(expected_names, feature_names)


	def test_one_hot_encoder_drop_equals_if_binary():
	# Canonical case
	X = [[10, "yes"], [20, "no"], [30, "yes"]]
	expected = np.array(
	[[1.0, 0.0, 0.0, 1.0], [0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 1.0]]
	)
	expected_drop_idx = np.array([None, 0])

	ohe = OneHotEncoder(drop="if_binary", sparse_output=False)
	result = ohe.fit_transform(X)
	assert_array_equal(ohe.drop_idx_, expected_drop_idx)
	assert_allclose(result, expected)

	# with only one cat, the behaviour is equivalent to drop=None
	X = [["true", "a"], ["false", "a"], ["false", "a"]]
	expected = np.array([[1.0, 1.0], [0.0, 1.0], [0.0, 1.0]])
	expected_drop_idx = np.array([0, None])

	ohe = OneHotEncoder(drop="if_binary", sparse_output=False)
	result = ohe.fit_transform(X)
	assert_array_equal(ohe.drop_idx_, expected_drop_idx)
	assert_allclose(result, expected)


	@pytest.mark.parametrize(
	"X",
	[
	[["abc", 2, 55], ["def", 1, 55]],
	np.array([[10, 2, 55], [20, 1, 55]]),
	np.array([["a", "B", "cat"], ["b", "A", "cat"]], dtype=object),
	],
	ids=["mixed", "numeric", "object"],
	)
	def test_ordinal_encoder(X):
	enc = OrdinalEncoder()
	exp = np.array([[0, 1, 0], [1, 0, 0]], dtype="int64")
	assert_array_equal(enc.fit_transform(X), exp.astype("float64"))
	enc = OrdinalEncoder(dtype="int64")
	assert_array_equal(enc.fit_transform(X), exp)


	@pytest.mark.parametrize(
	"X, X2, cats, cat_dtype",
	[
	(
	np.array([["a", "b"]], dtype=object).T,
	np.array([["a", "d"]], dtype=object).T,
	[["a", "b", "c"]],
	np.object_,
	),
	(
	np.array([[1, 2]], dtype="int64").T,
	np.array([[1, 4]], dtype="int64").T,
	[[1, 2, 3]],
	np.int64,
	),
	(
	np.array([["a", "b"]], dtype=object).T,
	np.array([["a", "d"]], dtype=object).T,
	[np.array(["a", "b", "c"])],
	np.object_,
	),
	],
	ids=["object", "numeric", "object-string-cat"],
	)
	def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype):
	enc = OrdinalEncoder(categories=cats)
	exp = np.array([[0.0], [1.0]])
	assert_array_equal(enc.fit_transform(X), exp)
	assert list(enc.categories[0]) == list(cats[0])
	assert enc.categories_[0].tolist() == list(cats[0])
	# manually specified categories should have same dtype as
	# the data when coerced from lists
	assert enc.categories_[0].dtype == cat_dtype

	# when specifying categories manually, unknown categories should already
	# raise when fitting
	enc = OrdinalEncoder(categories=cats)
	with pytest.raises(ValueError, match="Found unknown categories"):
	enc.fit(X2)


	def test_ordinal_encoder_inverse():
	X = [["abc", 2, 55], ["def", 1, 55]]
	enc = OrdinalEncoder()
	X_tr = enc.fit_transform(X)
	exp = np.array(X, dtype=object)
	assert_array_equal(enc.inverse_transform(X_tr), exp)

	# incorrect shape raises
	X_tr = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
	msg = re.escape("Shape of the passed X data is not correct")
	with pytest.raises(ValueError, match=msg):
	enc.inverse_transform(X_tr)


	def test_ordinal_encoder_handle_unknowns_string():
	enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-2)
	X_fit = np.array([["a", "x"], ["b", "y"], ["c", "z"]], dtype=object)
	X_trans = np.array([["c", "xy"], ["bla", "y"], ["a", "x"]], dtype=object)
	enc.fit(X_fit)

	X_trans_enc = enc.transform(X_trans)
	exp = np.array([[2, -2], [-2, 1], [0, 0]], dtype="int64")
	assert_array_equal(X_trans_enc, exp)

	X_trans_inv = enc.inverse_transform(X_trans_enc)
	inv_exp = np.array([["c", None], [None, "y"], ["a", "x"]], dtype=object)
	assert_array_equal(X_trans_inv, inv_exp)


	@pytest.mark.parametrize("dtype", [float, int])
	def test_ordinal_encoder_handle_unknowns_numeric(dtype):
	enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-999)
	X_fit = np.array([[1, 7], [2, 8], [3, 9]], dtype=dtype)
	X_trans = np.array([[3, 12], [23, 8], [1, 7]], dtype=dtype)
	enc.fit(X_fit)

	X_trans_enc = enc.transform(X_trans)
	exp = np.array([[2, -999], [-999, 1], [0, 0]], dtype="int64")
	assert_array_equal(X_trans_enc, exp)

	X_trans_inv = enc.inverse_transform(X_trans_enc)
	inv_exp = np.array([[3, None], [None, 8], [1, 7]], dtype=object)
	assert_array_equal(X_trans_inv, inv_exp)


	def test_ordinal_encoder_handle_unknowns_nan():
	# Make sure unknown_value=np.nan properly works

	enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)

	X_fit = np.array([[1], [2], [3]])
	enc.fit(X_fit)
	X_trans = enc.transform([[1], [2], [4]])
	assert_array_equal(X_trans, [[0], [1], [np.nan]])


	def test_ordinal_encoder_handle_unknowns_nan_non_float_dtype():
	# Make sure an error is raised when unknown_value=np.nan and the dtype
	# isn't a float dtype
	enc = OrdinalEncoder(
	handle_unknown="use_encoded_value", unknown_value=np.nan, dtype=int
	)

	X_fit = np.array([[1], [2], [3]])
	with pytest.raises(ValueError, match="dtype parameter should be a float dtype"):
	enc.fit(X_fit)


	def test_ordinal_encoder_raise_categories_shape():
	X = np.array([["Low", "Medium", "High", "Medium", "Low"]], dtype=object).T
	cats = ["Low", "Medium", "High"]
	enc = OrdinalEncoder(categories=cats)
	msg = "Shape mismatch: if categories is an array,"

	with pytest.raises(ValueError, match=msg):
	enc.fit(X)


	def test_encoder_dtypes():
	# check that dtypes are preserved when determining categories
	enc = OneHotEncoder(categories="auto")
	exp = np.array([[1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0]], dtype="float64")

	for X in [
	np.array([[1, 2], [3, 4]], dtype="int64"),
	np.array([[1, 2], [3, 4]], dtype="float64"),
	np.array([["a", "b"], ["c", "d"]]), # str dtype
	np.array([[b"a", b"b"], [b"c", b"d"]]), # bytes dtype
	np.array([[1, "a"], [3, "b"]], dtype="object"),
	]:
	enc.fit(X)
	assert all([enc.categories_[i].dtype == X.dtype for i in range(2)])
	assert_array_equal(enc.transform(X).toarray(), exp)

	X = [[1, 2], [3, 4]]
	enc.fit(X)
	assert all([np.issubdtype(enc.categories_[i].dtype, np.integer) for i in range(2)])
	assert_array_equal(enc.transform(X).toarray(), exp)

	X = [[1, "a"], [3, "b"]]
	enc.fit(X)
	assert all([enc.categories_[i].dtype == "object" for i in range(2)])
	assert_array_equal(enc.transform(X).toarray(), exp)


	def test_encoder_dtypes_pandas():
	# check dtype (similar to test_categorical_encoder_dtypes for dataframes)
	pd = pytest.importorskip("pandas")

	enc = OneHotEncoder(categories="auto")
	exp = np.array(
	[[1.0, 0.0, 1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0, 0.0, 1.0]],
	dtype="float64",
	)

	X = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}, dtype="int64")
	enc.fit(X)
	assert all([enc.categories_[i].dtype == "int64" for i in range(2)])
	assert_array_equal(enc.transform(X).toarray(), exp)

	X = pd.DataFrame({"A": [1, 2], "B": ["a", "b"], "C": [3.0, 4.0]})
	X_type = [X["A"].dtype, X["B"].dtype, X["C"].dtype]
	enc.fit(X)
	assert all([enc.categories_[i].dtype == X_type[i] for i in range(3)])
	assert_array_equal(enc.transform(X).toarray(), exp)


	def test_one_hot_encoder_warning():
	enc = OneHotEncoder()
	X = [["Male", 1], ["Female", 3]]
	with warnings.catch_warnings():
	warnings.simplefilter("error")
	enc.fit_transform(X)


	@pytest.mark.parametrize("drop", ["if_binary", "first"])
	def test_ohe_handle_unknown_warn(drop):
	"""Check handle_unknown='warn' works correctly."""

	X = [["a", 0], ["b", 2], ["b", 1]]

	ohe = OneHotEncoder(
	drop=drop,
	sparse_output=False,
	handle_unknown="warn",
	categories=[["b", "a"], [1, 2]],
	)
	ohe.fit(X)

	X_test = [["c", 1]]
	X_expected = np.array([[0, 0]])

	warn_msg = (
	r"Found unknown categories in columns \[0\] during transform. "
	r"These unknown categories will be encoded as all zeros"
	)
	with pytest.warns(UserWarning, match=warn_msg):
	X_trans = ohe.transform(X_test)
	assert_allclose(X_trans, X_expected)


	@pytest.mark.parametrize("missing_value", [np.nan, None, float("nan")])
	def test_one_hot_encoder_drop_manual(missing_value):
	cats_to_drop = ["def", 12, 3, 56, missing_value]
	enc = OneHotEncoder(drop=cats_to_drop)
	X = [
	["abc", 12, 2, 55, "a"],
	["def", 12, 1, 55, "a"],
	["def", 12, 3, 56, missing_value],
	]
	trans = enc.fit_transform(X).toarray()
	exp = [[1, 0, 1, 1, 1], [0, 1, 0, 1, 1], [0, 0, 0, 0, 0]]
	assert_array_equal(trans, exp)
	assert enc.drop is cats_to_drop

	dropped_cats = [
	cat[feature] for cat, feature in zip(enc.categories_, enc.drop_idx_)
	]
	X_inv_trans = enc.inverse_transform(trans)
	X_array = np.array(X, dtype=object)

	# last value is np.nan
	if is_scalar_nan(cats_to_drop[-1]):
	assert_array_equal(dropped_cats[:-1], cats_to_drop[:-1])
	assert is_scalar_nan(dropped_cats[-1])
	assert is_scalar_nan(cats_to_drop[-1])
	# do not include the last column which includes missing values
	assert_array_equal(X_array[:, :-1], X_inv_trans[:, :-1])

	# check last column is the missing value
	assert_array_equal(X_array[-1, :-1], X_inv_trans[-1, :-1])
	assert is_scalar_nan(X_array[-1, -1])
	assert is_scalar_nan(X_inv_trans[-1, -1])
	else:
	assert_array_equal(dropped_cats, cats_to_drop)
	assert_array_equal(X_array, X_inv_trans)


	@pytest.mark.parametrize("drop", [["abc", 3], ["abc", 3, 41, "a"]])
	def test_invalid_drop_length(drop):
	enc = OneHotEncoder(drop=drop)
	err_msg = "`drop` should have length equal to the number"
	with pytest.raises(ValueError, match=err_msg):
	enc.fit([["abc", 2, 55], ["def", 1, 55], ["def", 3, 59]])


	@pytest.mark.parametrize("density", [True, False], ids=["sparse", "dense"])
	@pytest.mark.parametrize("drop", ["first", ["a", 2, "b"]], ids=["first", "manual"])
	def test_categories(density, drop):
	ohe_base = OneHotEncoder(sparse_output=density)
	ohe_test = OneHotEncoder(sparse_output=density, drop=drop)
	X = [["c", 1, "a"], ["a", 2, "b"]]
	ohe_base.fit(X)
	ohe_test.fit(X)
	assert_array_equal(ohe_base.categories_, ohe_test.categories_)
	if drop == "first":
	assert_array_equal(ohe_test.drop_idx_, 0)
	else:
	for drop_cat, drop_idx, cat_list in zip(
	drop, ohe_test.drop_idx_, ohe_test.categories_
	):
	assert cat_list[int(drop_idx)] == drop_cat
	assert isinstance(ohe_test.drop_idx_, np.ndarray)
	assert ohe_test.drop_idx_.dtype == object


	@pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder])
	def test_encoders_has_categorical_tags(Encoder):
	assert Encoder().__sklearn_tags__().input_tags.categorical


	@pytest.mark.parametrize(
	"kwargs",
	[
	{"max_categories": 2},
	{"min_frequency": 11},
	{"min_frequency": 0.29},
	{"max_categories": 2, "min_frequency": 6},
	{"max_categories": 4, "min_frequency": 12},
	],
	)
	@pytest.mark.parametrize("categories", ["auto", [["a", "b", "c", "d"]]])
	def test_ohe_infrequent_two_levels(kwargs, categories):
	"""Test that different parameters for combine 'a', 'c', and 'd' into
	the infrequent category works as expected."""

	X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
	ohe = OneHotEncoder(
	categories=categories,
	handle_unknown="infrequent_if_exist",
	sparse_output=False,
	**kwargs,
	).fit(X_train)
	assert_array_equal(ohe.infrequent_categories_, [["a", "c", "d"]])

	X_test = [["b"], ["a"], ["c"], ["d"], ["e"]]
	expected = np.array([[1, 0], [0, 1], [0, 1], [0, 1], [0, 1]])

	X_trans = ohe.transform(X_test)
	assert_allclose(expected, X_trans)

	expected_inv = [[col] for col in ["b"] + ["infrequent_sklearn"] * 4]
	X_inv = ohe.inverse_transform(X_trans)
	assert_array_equal(expected_inv, X_inv)

	feature_names = ohe.get_feature_names_out()
	assert_array_equal(["x0_b", "x0_infrequent_sklearn"], feature_names)


	@pytest.mark.parametrize("drop", ["if_binary", "first", ["b"]])
	def test_ohe_infrequent_two_levels_drop_frequent(drop):
	"""Test two levels and dropping the frequent category."""

	X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
	ohe = OneHotEncoder(
	handle_unknown="infrequent_if_exist",
	sparse_output=False,
	max_categories=2,
	drop=drop,
	).fit(X_train)
	assert ohe.categories_[0][ohe.drop_idx_[0]] == "b"

	X_test = np.array([["b"], ["c"]])
	X_trans = ohe.transform(X_test)
	assert_allclose([[0], [1]], X_trans)

	feature_names = ohe.get_feature_names_out()
	assert_array_equal(["x0_infrequent_sklearn"], feature_names)

	X_inverse = ohe.inverse_transform(X_trans)
	assert_array_equal([["b"], ["infrequent_sklearn"]], X_inverse)


	@pytest.mark.parametrize("drop", [["a"], ["d"]])
	def test_ohe_infrequent_two_levels_drop_infrequent_errors(drop):
	"""Test two levels and dropping any infrequent category removes the
	whole infrequent category."""

	X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
	ohe = OneHotEncoder(
	handle_unknown="infrequent_if_exist",
	sparse_output=False,
	max_categories=2,
	drop=drop,
	)

	msg = f"Unable to drop category {drop[0]!r} from feature 0 because it is infrequent"
	with pytest.raises(ValueError, match=msg):
	ohe.fit(X_train)


	@pytest.mark.parametrize(
	"kwargs",
	[
	{"max_categories": 3},
	{"min_frequency": 6},
	{"min_frequency": 9},
	{"min_frequency": 0.24},
	{"min_frequency": 0.16},
	{"max_categories": 3, "min_frequency": 8},
	{"max_categories": 4, "min_frequency": 6},
	],
	)
	def test_ohe_infrequent_three_levels(kwargs):
	"""Test that different parameters for combing 'a', and 'd' into
	the infrequent category works as expected."""

	X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
	ohe = OneHotEncoder(
	handle_unknown="infrequent_if_exist", sparse_output=False, **kwargs
	).fit(X_train)
	assert_array_equal(ohe.infrequent_categories_, [["a", "d"]])

	X_test = [["b"], ["a"], ["c"], ["d"], ["e"]]
	expected = np.array([[1, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 1], [0, 0, 1]])

	X_trans = ohe.transform(X_test)
	assert_allclose(expected, X_trans)

	expected_inv = [
	["b"],
	["infrequent_sklearn"],
	["c"],
	["infrequent_sklearn"],
	["infrequent_sklearn"],
	]
	X_inv = ohe.inverse_transform(X_trans)
	assert_array_equal(expected_inv, X_inv)

	feature_names = ohe.get_feature_names_out()
	assert_array_equal(["x0_b", "x0_c", "x0_infrequent_sklearn"], feature_names)


	@pytest.mark.parametrize("drop", ["first", ["b"]])
	def test_ohe_infrequent_three_levels_drop_frequent(drop):
	"""Test three levels and dropping the frequent category."""

	X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
	ohe = OneHotEncoder(
	handle_unknown="infrequent_if_exist",
	sparse_output=False,
	max_categories=3,
	drop=drop,
	).fit(X_train)

	X_test = np.array([["b"], ["c"], ["d"]])
	assert_allclose([[0, 0], [1, 0], [0, 1]], ohe.transform(X_test))

	# Check handle_unknown="ignore"
	ohe.set_params(handle_unknown="ignore").fit(X_train)
	msg = "Found unknown categories"
	with pytest.warns(UserWarning, match=msg):
	X_trans = ohe.transform([["b"], ["e"]])

	assert_allclose([[0, 0], [0, 0]], X_trans)


	@pytest.mark.parametrize("drop", [["a"], ["d"]])
	def test_ohe_infrequent_three_levels_drop_infrequent_errors(drop):
	"""Test three levels and dropping the infrequent category."""
	X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
	ohe = OneHotEncoder(
	handle_unknown="infrequent_if_exist",
	sparse_output=False,
	max_categories=3,
	drop=drop,
	)

	msg = f"Unable to drop category {drop[0]!r} from feature 0 because it is infrequent"
	with pytest.raises(ValueError, match=msg):
	ohe.fit(X_train)


	def test_ohe_infrequent_handle_unknown_error():
	"""Test that different parameters for combining 'a', and 'd' into
	the infrequent category works as expected."""

	X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
	ohe = OneHotEncoder(
	handle_unknown="error", sparse_output=False, max_categories=3
	).fit(X_train)
	assert_array_equal(ohe.infrequent_categories_, [["a", "d"]])

	# all categories are known
	X_test = [["b"], ["a"], ["c"], ["d"]]
	expected = np.array([[1, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 1]])

	X_trans = ohe.transform(X_test)
	assert_allclose(expected, X_trans)

	# 'bad' is not known and will error
	X_test = [["bad"]]
	msg = r"Found unknown categories \['bad'\] in column 0"
	with pytest.raises(ValueError, match=msg):
	ohe.transform(X_test)


	@pytest.mark.parametrize(
	"kwargs", [{"max_categories": 3, "min_frequency": 1}, {"min_frequency": 4}]
	)
	def test_ohe_infrequent_two_levels_user_cats_one_frequent(kwargs):
	"""'a' is the only frequent category, all other categories are infrequent."""

	X_train = np.array([["a"] * 5 + ["e"] * 30], dtype=object).T
	ohe = OneHotEncoder(
	categories=[["c", "d", "a", "b"]],
	sparse_output=False,
	handle_unknown="infrequent_if_exist",
	**kwargs,
	).fit(X_train)

	X_test = [["a"], ["b"], ["c"], ["d"], ["e"]]
	expected = np.array([[1, 0], [0, 1], [0, 1], [0, 1], [0, 1]])

	X_trans = ohe.transform(X_test)
	assert_allclose(expected, X_trans)

	# 'a' is dropped
	drops = ["first", "if_binary", ["a"]]
	X_test = [["a"], ["c"]]
	for drop in drops:
	ohe.set_params(drop=drop).fit(X_train)
	assert_allclose([[0], [1]], ohe.transform(X_test))


	def test_ohe_infrequent_two_levels_user_cats():
	"""Test that the order of the categories provided by a user is respected."""
	X_train = np.array(
	[["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object
	).T
	ohe = OneHotEncoder(
	categories=[["c", "d", "a", "b"]],
	sparse_output=False,
	handle_unknown="infrequent_if_exist",
	max_categories=2,
	).fit(X_train)

	assert_array_equal(ohe.infrequent_categories_, [["c", "d", "a"]])

	X_test = [["b"], ["a"], ["c"], ["d"], ["e"]]
	expected = np.array([[1, 0], [0, 1], [0, 1], [0, 1], [0, 1]])

	X_trans = ohe.transform(X_test)
	assert_allclose(expected, X_trans)

	# 'infrequent' is used to denote the infrequent categories for
	# `inverse_transform`
	expected_inv = [[col] for col in ["b"] + ["infrequent_sklearn"] * 4]
	X_inv = ohe.inverse_transform(X_trans)
	assert_array_equal(expected_inv, X_inv)


	def test_ohe_infrequent_three_levels_user_cats():
	"""Test that the order of the categories provided by a user is respected.
	In this case 'c' is encoded as the first category and 'b' is encoded
	as the second one."""

	X_train = np.array(
	[["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object
	).T
	ohe = OneHotEncoder(
	categories=[["c", "d", "b", "a"]],
	sparse_output=False,
	handle_unknown="infrequent_if_exist",
	max_categories=3,
	).fit(X_train)

	assert_array_equal(ohe.infrequent_categories_, [["d", "a"]])

	X_test = [["b"], ["a"], ["c"], ["d"], ["e"]]
	expected = np.array([[0, 1, 0], [0, 0, 1], [1, 0, 0], [0, 0, 1], [0, 0, 1]])

	X_trans = ohe.transform(X_test)
	assert_allclose(expected, X_trans)

	# 'infrequent' is used to denote the infrequent categories for
	# `inverse_transform`
	expected_inv = [
	["b"],
	["infrequent_sklearn"],
	["c"],
	["infrequent_sklearn"],
	["infrequent_sklearn"],
	]
	X_inv = ohe.inverse_transform(X_trans)
	assert_array_equal(expected_inv, X_inv)


	def test_ohe_infrequent_mixed():
	"""Test infrequent categories where feature 0 has infrequent categories,
	and feature 1 does not."""

	# X[:, 0] 1 and 2 are infrequent
	# X[:, 1] nothing is infrequent
	X = np.c_[[0, 1, 3, 3, 3, 3, 2, 0, 3], [0, 0, 0, 0, 1, 1, 1, 1, 1]]

	ohe = OneHotEncoder(max_categories=3, drop="if_binary", sparse_output=False)
	ohe.fit(X)

	X_test = [[3, 0], [1, 1]]
	X_trans = ohe.transform(X_test)

	# feature 1 is binary so it drops a category 0
	assert_allclose(X_trans, [[0, 1, 0, 0], [0, 0, 1, 1]])


	def test_ohe_infrequent_multiple_categories():
	"""Test infrequent categories with feature matrix with 3 features."""

	X = np.c_[
	[0, 1, 3, 3, 3, 3, 2, 0, 3],
	[0, 0, 5, 1, 1, 10, 5, 5, 0],
	[1, 0, 1, 0, 1, 0, 1, 0, 1],
	]

	ohe = OneHotEncoder(
	categories="auto", max_categories=3, handle_unknown="infrequent_if_exist"
	)
	# X[:, 0] 1 and 2 are infrequent
	# X[:, 1] 1 and 10 are infrequent
	# X[:, 2] nothing is infrequent

	X_trans = ohe.fit_transform(X).toarray()
	assert_array_equal(ohe.infrequent_categories_[0], [1, 2])
	assert_array_equal(ohe.infrequent_categories_[1], [1, 10])
	assert_array_equal(ohe.infrequent_categories_[2], None)

	# 'infrequent' is used to denote the infrequent categories
	# For the first column, 1 and 2 have the same frequency. In this case,
	# 1 will be chosen to be the feature name because is smaller lexiconically
	feature_names = ohe.get_feature_names_out()
	assert_array_equal(
	[
	"x0_0",
	"x0_3",
	"x0_infrequent_sklearn",
	"x1_0",
	"x1_5",
	"x1_infrequent_sklearn",
	"x2_0",
	"x2_1",
	],
	feature_names,
	)

	expected = [
	[1, 0, 0, 1, 0, 0, 0, 1],
	[0, 0, 1, 1, 0, 0, 1, 0],
	[0, 1, 0, 0, 1, 0, 0, 1],
	[0, 1, 0, 0, 0, 1, 1, 0],
	[0, 1, 0, 0, 0, 1, 0, 1],
	[0, 1, 0, 0, 0, 1, 1, 0],
	[0, 0, 1, 0, 1, 0, 0, 1],
	[1, 0, 0, 0, 1, 0, 1, 0],
	[0, 1, 0, 1, 0, 0, 0, 1],
	]

	assert_allclose(expected, X_trans)

	X_test = [[3, 1, 2], [4, 0, 3]]

	X_test_trans = ohe.transform(X_test)

	# X[:, 2] does not have an infrequent category, thus it is encoded as all
	# zeros
	expected = [[0, 1, 0, 0, 0, 1, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0]]
	assert_allclose(expected, X_test_trans.toarray())

	X_inv = ohe.inverse_transform(X_test_trans)
	expected_inv = np.array(
	[[3, "infrequent_sklearn", None], ["infrequent_sklearn", 0, None]], dtype=object
	)
	assert_array_equal(expected_inv, X_inv)

	# error for unknown categories
	ohe = OneHotEncoder(
	categories="auto", max_categories=3, handle_unknown="error"
	).fit(X)
	with pytest.raises(ValueError, match="Found unknown categories"):
	ohe.transform(X_test)

	# only infrequent or known categories
	X_test = [[1, 1, 1], [3, 10, 0]]
	X_test_trans = ohe.transform(X_test)

	expected = [[0, 0, 1, 0, 0, 1, 0, 1], [0, 1, 0, 0, 0, 1, 1, 0]]
	assert_allclose(expected, X_test_trans.toarray())

	X_inv = ohe.inverse_transform(X_test_trans)

	expected_inv = np.array(
	[["infrequent_sklearn", "infrequent_sklearn", 1], [3, "infrequent_sklearn", 0]],
	dtype=object,
	)
	assert_array_equal(expected_inv, X_inv)


	def test_ohe_infrequent_multiple_categories_dtypes():
	"""Test infrequent categories with a pandas dataframe with multiple dtypes."""

	pd = pytest.importorskip("pandas")
	X = pd.DataFrame(
	{
	"str": ["a", "f", "c", "f", "f", "a", "c", "b", "b"],
	"int": [5, 3, 0, 10, 10, 12, 0, 3, 5],
	},
	columns=["str", "int"],
	)

	ohe = OneHotEncoder(
	categories="auto", max_categories=3, handle_unknown="infrequent_if_exist"
	)
	# X[:, 0] 'a', 'b', 'c' have the same frequency. 'a' and 'b' will be
	# considered infrequent because they are greater

	# X[:, 1] 0, 3, 5, 10 has frequency 2 and 12 has frequency 1.
	# 0, 3, 12 will be considered infrequent

	X_trans = ohe.fit_transform(X).toarray()
	assert_array_equal(ohe.infrequent_categories_[0], ["a", "b"])
	assert_array_equal(ohe.infrequent_categories_[1], [0, 3, 12])

	expected = [
	[0, 0, 1, 1, 0, 0],
	[0, 1, 0, 0, 0, 1],
	[1, 0, 0, 0, 0, 1],
	[0, 1, 0, 0, 1, 0],
	[0, 1, 0, 0, 1, 0],
	[0, 0, 1, 0, 0, 1],
	[1, 0, 0, 0, 0, 1],
	[0, 0, 1, 0, 0, 1],
	[0, 0, 1, 1, 0, 0],
	]

	assert_allclose(expected, X_trans)

	X_test = pd.DataFrame({"str": ["b", "f"], "int": [14, 12]}, columns=["str", "int"])

	expected = [[0, 0, 1, 0, 0, 1], [0, 1, 0, 0, 0, 1]]
	X_test_trans = ohe.transform(X_test)
	assert_allclose(expected, X_test_trans.toarray())

	X_inv = ohe.inverse_transform(X_test_trans)
	expected_inv = np.array(
	[["infrequent_sklearn", "infrequent_sklearn"], ["f", "infrequent_sklearn"]],
	dtype=object,
	)
	assert_array_equal(expected_inv, X_inv)

	# only infrequent or known categories
	X_test = pd.DataFrame({"str": ["c", "b"], "int": [12, 5]}, columns=["str", "int"])
	X_test_trans = ohe.transform(X_test).toarray()
	expected = [[1, 0, 0, 0, 0, 1], [0, 0, 1, 1, 0, 0]]
	assert_allclose(expected, X_test_trans)

	X_inv = ohe.inverse_transform(X_test_trans)
	expected_inv = np.array(
	[["c", "infrequent_sklearn"], ["infrequent_sklearn", 5]], dtype=object
	)
	assert_array_equal(expected_inv, X_inv)


	@pytest.mark.parametrize("kwargs", [{"min_frequency": 21, "max_categories": 1}])
	def test_ohe_infrequent_one_level_errors(kwargs):
	"""All user provided categories are infrequent."""
	X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 2]).T

	ohe = OneHotEncoder(
	handle_unknown="infrequent_if_exist", sparse_output=False, **kwargs
	)
	ohe.fit(X_train)

	X_trans = ohe.transform([["a"]])
	assert_allclose(X_trans, [[1]])


	@pytest.mark.parametrize("kwargs", [{"min_frequency": 2, "max_categories": 3}])
	def test_ohe_infrequent_user_cats_unknown_training_errors(kwargs):
	"""All user provided categories are infrequent."""

	X_train = np.array([["e"] * 3], dtype=object).T
	ohe = OneHotEncoder(
	categories=[["c", "d", "a", "b"]],
	sparse_output=False,
	handle_unknown="infrequent_if_exist",
	**kwargs,
	).fit(X_train)

	X_trans = ohe.transform([["a"], ["e"]])
	assert_allclose(X_trans, [[1], [1]])


	# deliberately omit 'OS' as an invalid combo
	@pytest.mark.parametrize(
	"input_dtype, category_dtype", ["OO", "OU", "UO", "UU", "SO", "SU", "SS"]
	)
	@pytest.mark.parametrize("array_type", ["list", "array", "dataframe"])
	def test_encoders_string_categories(input_dtype, category_dtype, array_type):
	"""Check that encoding work with object, unicode, and byte string dtypes.
	Non-regression test for:
	https://github.com/scikit-learn/scikit-learn/issues/15616
	https://github.com/scikit-learn/scikit-learn/issues/15726
	https://github.com/scikit-learn/scikit-learn/issues/19677
	"""

	X = np.array([["b"], ["a"]], dtype=input_dtype)
	categories = [np.array(["b", "a"], dtype=category_dtype)]
	ohe = OneHotEncoder(categories=categories, sparse_output=False).fit(X)

	X_test = _convert_container(
	[["a"], ["a"], ["b"], ["a"]], array_type, dtype=input_dtype
	)
	X_trans = ohe.transform(X_test)

	expected = np.array([[0, 1], [0, 1], [1, 0], [0, 1]])
	assert_allclose(X_trans, expected)

	oe = OrdinalEncoder(categories=categories).fit(X)
	X_trans = oe.transform(X_test)

	expected = np.array([[1], [1], [0], [1]])
	assert_array_equal(X_trans, expected)


	def test_mixed_string_bytes_categoricals():
	"""Check that this mixture of predefined categories and X raises an error.

	Categories defined as bytes can not easily be compared to data that is
	a string.
	"""
	# data as unicode
	X = np.array([["b"], ["a"]], dtype="U")
	# predefined categories as bytes
	categories = [np.array(["b", "a"], dtype="S")]
	ohe = OneHotEncoder(categories=categories, sparse_output=False)

	msg = re.escape(
	"In column 0, the predefined categories have type 'bytes' which is incompatible"
	" with values of type 'str_'."
	)

	with pytest.raises(ValueError, match=msg):
	ohe.fit(X)


	@pytest.mark.parametrize("missing_value", [np.nan, None])
	def test_ohe_missing_values_get_feature_names(missing_value):
	# encoder with missing values with object dtypes
	X = np.array([["a", "b", missing_value, "a", missing_value]], dtype=object).T
	ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore").fit(X)
	names = ohe.get_feature_names_out()
	assert_array_equal(names, ["x0_a", "x0_b", f"x0_{missing_value}"])


	def test_ohe_missing_value_support_pandas():
	# check support for pandas with mixed dtypes and missing values
	pd = pytest.importorskip("pandas")
	df = pd.DataFrame(
	{
	"col1": ["dog", "cat", None, "cat"],
	"col2": np.array([3, 0, 4, np.nan], dtype=float),
	},
	columns=["col1", "col2"],
	)
	expected_df_trans = np.array(
	[
	[0, 1, 0, 0, 1, 0, 0],
	[1, 0, 0, 1, 0, 0, 0],
	[0, 0, 1, 0, 0, 1, 0],
	[1, 0, 0, 0, 0, 0, 1],
	]
	)

	Xtr = check_categorical_onehot(df)
	assert_allclose(Xtr, expected_df_trans)


	@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"])
	@pytest.mark.parametrize("pd_nan_type", ["pd.NA", "np.nan"])
	def test_ohe_missing_value_support_pandas_categorical(pd_nan_type, handle_unknown):
	# checks pandas dataframe with categorical features
	pd = pytest.importorskip("pandas")

	pd_missing_value = pd.NA if pd_nan_type == "pd.NA" else np.nan

	df = pd.DataFrame(
	{
	"col1": pd.Series(["c", "a", pd_missing_value, "b", "a"], dtype="category"),
	}
	)
	expected_df_trans = np.array(
	[
	[0, 0, 1, 0],
	[1, 0, 0, 0],
	[0, 0, 0, 1],
	[0, 1, 0, 0],
	[1, 0, 0, 0],
	]
	)

	ohe = OneHotEncoder(sparse_output=False, handle_unknown=handle_unknown)
	df_trans = ohe.fit_transform(df)
	assert_allclose(expected_df_trans, df_trans)

	assert len(ohe.categories_) == 1
	assert_array_equal(ohe.categories_[0][:-1], ["a", "b", "c"])
	assert np.isnan(ohe.categories_[0][-1])


	@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"])
	def test_ohe_drop_first_handle_unknown_ignore_warns(handle_unknown):
	"""Check drop='first' and handle_unknown='ignore'/'infrequent_if_exist'
	during transform."""
	X = [["a", 0], ["b", 2], ["b", 1]]

	ohe = OneHotEncoder(
	drop="first", sparse_output=False, handle_unknown=handle_unknown
	)
	X_trans = ohe.fit_transform(X)

	X_expected = np.array(
	[
	[0, 0, 0],
	[1, 0, 1],
	[1, 1, 0],
	]
	)
	assert_allclose(X_trans, X_expected)

	# Both categories are unknown
	X_test = [["c", 3]]
	X_expected = np.array([[0, 0, 0]])

	warn_msg = (
	r"Found unknown categories in columns \[0, 1\] during "
	"transform. These unknown categories will be encoded as all "
	"zeros"
	)
	with pytest.warns(UserWarning, match=warn_msg):
	X_trans = ohe.transform(X_test)
	assert_allclose(X_trans, X_expected)

	# inverse_transform maps to None
	X_inv = ohe.inverse_transform(X_expected)
	assert_array_equal(X_inv, np.array([["a", 0]], dtype=object))


	@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"])
	def test_ohe_drop_if_binary_handle_unknown_ignore_warns(handle_unknown):
	"""Check drop='if_binary' and handle_unknown='ignore' during transform."""
	X = [["a", 0], ["b", 2], ["b", 1]]

	ohe = OneHotEncoder(
	drop="if_binary", sparse_output=False, handle_unknown=handle_unknown
	)
	X_trans = ohe.fit_transform(X)

	X_expected = np.array(
	[
	[0, 1, 0, 0],
	[1, 0, 0, 1],
	[1, 0, 1, 0],
	]
	)
	assert_allclose(X_trans, X_expected)

	# Both categories are unknown
	X_test = [["c", 3]]
	X_expected = np.array([[0, 0, 0, 0]])

	warn_msg = (
	r"Found unknown categories in columns \[0, 1\] during "
	"transform. These unknown categories will be encoded as all "
	"zeros"
	)
	with pytest.warns(UserWarning, match=warn_msg):
	X_trans = ohe.transform(X_test)
	assert_allclose(X_trans, X_expected)

	# inverse_transform maps to None
	X_inv = ohe.inverse_transform(X_expected)
	assert_array_equal(X_inv, np.array([["a", None]], dtype=object))


	@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"])
	def test_ohe_drop_first_explicit_categories(handle_unknown):
	"""Check drop='first' and handle_unknown='ignore'/'infrequent_if_exist'
	during fit with categories passed in."""

	X = [["a", 0], ["b", 2], ["b", 1]]

	ohe = OneHotEncoder(
	drop="first",
	sparse_output=False,
	handle_unknown=handle_unknown,
	categories=[["b", "a"], [1, 2]],
	)
	ohe.fit(X)

	X_test = [["c", 1]]
	X_expected = np.array([[0, 0]])

	warn_msg = (
	r"Found unknown categories in columns \[0\] during transform. "
	r"These unknown categories will be encoded as all zeros"
	)
	with pytest.warns(UserWarning, match=warn_msg):
	X_trans = ohe.transform(X_test)
	assert_allclose(X_trans, X_expected)


	def test_ohe_more_informative_error_message():
	"""Raise informative error message when pandas output and sparse_output=True."""
	pd = pytest.importorskip("pandas")
	df = pd.DataFrame({"a": [1, 2, 3], "b": ["z", "b", "b"]}, columns=["a", "b"])

	ohe = OneHotEncoder(sparse_output=True)
	ohe.set_output(transform="pandas")

	msg = (
	"Pandas output does not support sparse data. Set "
	"sparse_output=False to output pandas dataframes or disable Pandas output"
	)
	with pytest.raises(ValueError, match=msg):
	ohe.fit_transform(df)

	ohe.fit(df)
	with pytest.raises(ValueError, match=msg):
	ohe.transform(df)


	def test_ordinal_encoder_passthrough_missing_values_float_errors_dtype():
	"""Test ordinal encoder with nan passthrough fails when dtype=np.int32."""

	X = np.array([[np.nan, 3.0, 1.0, 3.0]]).T
	oe = OrdinalEncoder(dtype=np.int32)

	msg = (
	r"There are missing values in features \[0\]. For OrdinalEncoder "
	f"to encode missing values with dtype: {np.int32}"
	)
	with pytest.raises(ValueError, match=msg):
	oe.fit(X)


	@pytest.mark.parametrize("encoded_missing_value", [np.nan, -2])
	def test_ordinal_encoder_passthrough_missing_values_float(encoded_missing_value):
	"""Test ordinal encoder with nan on float dtypes."""

	X = np.array([[np.nan, 3.0, 1.0, 3.0]], dtype=np.float64).T
	oe = OrdinalEncoder(encoded_missing_value=encoded_missing_value).fit(X)

	assert len(oe.categories_) == 1

	assert_allclose(oe.categories_[0], [1.0, 3.0, np.nan])

	X_trans = oe.transform(X)
	assert_allclose(X_trans, [[encoded_missing_value], [1.0], [0.0], [1.0]])

	X_inverse = oe.inverse_transform(X_trans)
	assert_allclose(X_inverse, X)


	@pytest.mark.parametrize("pd_nan_type", ["pd.NA", "np.nan"])
	@pytest.mark.parametrize("encoded_missing_value", [np.nan, -2])
	def test_ordinal_encoder_missing_value_support_pandas_categorical(
	pd_nan_type, encoded_missing_value
	):
	"""Check ordinal encoder is compatible with pandas."""
	# checks pandas dataframe with categorical features
	pd = pytest.importorskip("pandas")

	pd_missing_value = pd.NA if pd_nan_type == "pd.NA" else np.nan

	df = pd.DataFrame(
	{
	"col1": pd.Series(["c", "a", pd_missing_value, "b", "a"], dtype="category"),
	}
	)

	oe = OrdinalEncoder(encoded_missing_value=encoded_missing_value).fit(df)
	assert len(oe.categories_) == 1
	assert_array_equal(oe.categories_[0][:3], ["a", "b", "c"])
	assert np.isnan(oe.categories_[0][-1])

	df_trans = oe.transform(df)

	assert_allclose(df_trans, [[2.0], [0.0], [encoded_missing_value], [1.0], [0.0]])

	X_inverse = oe.inverse_transform(df_trans)
	assert X_inverse.shape == (5, 1)
	assert_array_equal(X_inverse[:2, 0], ["c", "a"])
	assert_array_equal(X_inverse[3:, 0], ["b", "a"])
	assert np.isnan(X_inverse[2, 0])


	@pytest.mark.parametrize(
	"X, X2, cats, cat_dtype",
	[
	(
	(
	np.array([["a", np.nan]], dtype=object).T,
	np.array([["a", "b"]], dtype=object).T,
	[np.array(["a", "d", np.nan], dtype=object)],
	np.object_,
	)
	),
	(
	(
	np.array([["a", np.nan]], dtype=object).T,
	np.array([["a", "b"]], dtype=object).T,
	[np.array(["a", "d", np.nan], dtype=object)],
	np.object_,
	)
	),
	(
	(
	np.array([[2.0, np.nan]], dtype=np.float64).T,
	np.array([[3.0]], dtype=np.float64).T,
	[np.array([2.0, 4.0, np.nan])],
	np.float64,
	)
	),
	],
	ids=[
	"object-None-missing-value",
	"object-nan-missing_value",
	"numeric-missing-value",
	],
	)
	def test_ordinal_encoder_specified_categories_missing_passthrough(
	X, X2, cats, cat_dtype
	):
	"""Test ordinal encoder for specified categories."""
	oe = OrdinalEncoder(categories=cats)
	exp = np.array([[0.0], [np.nan]])
	assert_array_equal(oe.fit_transform(X), exp)
	# manually specified categories should have same dtype as
	# the data when coerced from lists
	assert oe.categories_[0].dtype == cat_dtype

	# when specifying categories manually, unknown categories should already
	# raise when fitting
	oe = OrdinalEncoder(categories=cats)
	with pytest.raises(ValueError, match="Found unknown categories"):
	oe.fit(X2)


	@pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder])
	def test_encoder_duplicate_specified_categories(Encoder):
	"""Test encoder for specified categories have duplicate values.

	Non-regression test for:
	https://github.com/scikit-learn/scikit-learn/issues/27088
	"""
	cats = [np.array(["a", "b", "a"], dtype=object)]
	enc = Encoder(categories=cats)
	X = np.array([["a", "b"]], dtype=object).T
	with pytest.raises(
	ValueError, match="the predefined categories contain duplicate elements."
	):
	enc.fit(X)


	@pytest.mark.parametrize(
	"X, expected_X_trans, X_test",
	[
	(
	np.array([[1.0, np.nan, 3.0]]).T,
	np.array([[0.0, np.nan, 1.0]]).T,
	np.array([[4.0]]),
	),
	(
	np.array([[1.0, 4.0, 3.0]]).T,
	np.array([[0.0, 2.0, 1.0]]).T,
	np.array([[np.nan]]),
	),
	(
	np.array([["c", np.nan, "b"]], dtype=object).T,
	np.array([[1.0, np.nan, 0.0]]).T,
	np.array([["d"]], dtype=object),
	),
	(
	np.array([["c", "a", "b"]], dtype=object).T,
	np.array([[2.0, 0.0, 1.0]]).T,
	np.array([[np.nan]], dtype=object),
	),
	],
	)
	def test_ordinal_encoder_handle_missing_and_unknown(X, expected_X_trans, X_test):
	"""Test the interaction between missing values and handle_unknown"""

	oe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

	X_trans = oe.fit_transform(X)
	assert_allclose(X_trans, expected_X_trans)

	assert_allclose(oe.transform(X_test), [[-1.0]])


	@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
	def test_ordinal_encoder_sparse(csr_container):
	"""Check that we raise proper error with sparse input in OrdinalEncoder.
	Non-regression test for:
	https://github.com/scikit-learn/scikit-learn/issues/19878
	"""
	X = np.array([[3, 2, 1], [0, 1, 1]])
	X_sparse = csr_container(X)

	encoder = OrdinalEncoder()

	err_msg = "Sparse data was passed, but dense data is required"
	with pytest.raises(TypeError, match=err_msg):
	encoder.fit(X_sparse)
	with pytest.raises(TypeError, match=err_msg):
	encoder.fit_transform(X_sparse)

	X_trans = encoder.fit_transform(X)
	X_trans_sparse = csr_container(X_trans)
	with pytest.raises(TypeError, match=err_msg):
	encoder.inverse_transform(X_trans_sparse)


	def test_ordinal_encoder_fit_with_unseen_category():
	"""Check OrdinalEncoder.fit works with unseen category when
	`handle_unknown="use_encoded_value"`.
	Non-regression test for:
	https://github.com/scikit-learn/scikit-learn/issues/19872
	"""
	X = np.array([0, 0, 1, 0, 2, 5])[:, np.newaxis]
	oe = OrdinalEncoder(
	categories=[[-1, 0, 1]], handle_unknown="use_encoded_value", unknown_value=-999
	)
	oe.fit(X)

	oe = OrdinalEncoder(categories=[[-1, 0, 1]], handle_unknown="error")
	with pytest.raises(ValueError, match="Found unknown categories"):
	oe.fit(X)


	@pytest.mark.parametrize(
	"X_train",
	[
	[["AA", "B"]],
	np.array([["AA", "B"]], dtype="O"),
	np.array([["AA", "B"]], dtype="U"),
	],
	)
	@pytest.mark.parametrize(
	"X_test",
	[
	[["A", "B"]],
	np.array([["A", "B"]], dtype="O"),
	np.array([["A", "B"]], dtype="U"),
	],
	)
	def test_ordinal_encoder_handle_unknown_string_dtypes(X_train, X_test):
	"""Checks that `OrdinalEncoder` transforms string dtypes.
	Non-regression test for:
	https://github.com/scikit-learn/scikit-learn/issues/19872
	"""
	enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-9)
	enc.fit(X_train)

	X_trans = enc.transform(X_test)
	assert_allclose(X_trans, [[-9, 0]])


	def test_ordinal_encoder_python_integer():
	"""Check that `OrdinalEncoder` accepts Python integers that are potentially
	larger than 64 bits.
	Non-regression test for:
	https://github.com/scikit-learn/scikit-learn/issues/20721
	"""
	X = np.array(
	[
	44253463435747313673,
	9867966753463435747313673,
	44253462342215747313673,
	442534634357764313673,
	]
	).reshape(-1, 1)
	encoder = OrdinalEncoder().fit(X)
	assert_array_equal(encoder.categories_, np.sort(X, axis=0).T)
	X_trans = encoder.transform(X)
	assert_array_equal(X_trans, [[0], [3], [2], [1]])


	def test_ordinal_encoder_features_names_out_pandas():
	"""Check feature names out is same as the input."""
	pd = pytest.importorskip("pandas")

	names = ["b", "c", "a"]
	X = pd.DataFrame([[1, 2, 3]], columns=names)
	enc = OrdinalEncoder().fit(X)

	feature_names_out = enc.get_feature_names_out()
	assert_array_equal(names, feature_names_out)


	def test_ordinal_encoder_unknown_missing_interaction():
	"""Check interactions between encode_unknown and missing value encoding."""

	X = np.array([["a"], ["b"], [np.nan]], dtype=object)

	oe = OrdinalEncoder(
	handle_unknown="use_encoded_value",
	unknown_value=np.nan,
	encoded_missing_value=-3,
	).fit(X)

	X_trans = oe.transform(X)
	assert_allclose(X_trans, [[0], [1], [-3]])

	# "c" is unknown and is mapped to np.nan
	# "None" is a missing value and is set to -3
	X_test = np.array([["c"], [np.nan]], dtype=object)
	X_test_trans = oe.transform(X_test)
	assert_allclose(X_test_trans, [[np.nan], [-3]])

	# Non-regression test for #24082
	X_roundtrip = oe.inverse_transform(X_test_trans)

	# np.nan is unknown so it maps to None
	assert X_roundtrip[0][0] is None

	# -3 is the encoded missing value so it maps back to nan
	assert np.isnan(X_roundtrip[1][0])


	@pytest.mark.parametrize("with_pandas", [True, False])
	def test_ordinal_encoder_encoded_missing_value_error(with_pandas):
	"""Check OrdinalEncoder errors when encoded_missing_value is used by
	an known category."""
	X = np.array([["a", "dog"], ["b", "cat"], ["c", np.nan]], dtype=object)

	# The 0-th feature has no missing values so it is not included in the list of
	# features
	error_msg = (
	r"encoded_missing_value \(1\) is already used to encode a known category "
	r"in features: "
	)

	if with_pandas:
	pd = pytest.importorskip("pandas")
	X = pd.DataFrame(X, columns=["letter", "pet"])
	error_msg = error_msg + r"\['pet'\]"
	else:
	error_msg = error_msg + r"\[1\]"

	oe = OrdinalEncoder(encoded_missing_value=1)

	with pytest.raises(ValueError, match=error_msg):
	oe.fit(X)


	@pytest.mark.parametrize(
	"X_train, X_test_trans_expected, X_roundtrip_expected",
	[
	(
	# missing value is not in training set
	# inverse transform will considering encoded nan as unknown
	np.array([["a"], ["1"]], dtype=object),
	[[0], [np.nan], [np.nan]],
	np.asarray([["1"], [None], [None]], dtype=object),
	),
	(
	# missing value in training set,
	# inverse transform will considering encoded nan as missing
	np.array([[np.nan], ["1"], ["a"]], dtype=object),
	[[0], [np.nan], [np.nan]],
	np.asarray([["1"], [np.nan], [np.nan]], dtype=object),
	),
	],
	)
	def test_ordinal_encoder_unknown_missing_interaction_both_nan(
	X_train, X_test_trans_expected, X_roundtrip_expected
	):
	"""Check transform when unknown_value and encoded_missing_value is nan.

	Non-regression test for #24082.
	"""
	oe = OrdinalEncoder(
	handle_unknown="use_encoded_value",
	unknown_value=np.nan,
	encoded_missing_value=np.nan,
	).fit(X_train)

	X_test = np.array([["1"], [np.nan], ["b"]])
	X_test_trans = oe.transform(X_test)

	# both nan and unknown are encoded as nan
	assert_allclose(X_test_trans, X_test_trans_expected)
	X_roundtrip = oe.inverse_transform(X_test_trans)

	n_samples = X_roundtrip_expected.shape[0]
	for i in range(n_samples):
	expected_val = X_roundtrip_expected[i, 0]
	val = X_roundtrip[i, 0]

	if expected_val is None:
	assert val is None
	elif is_scalar_nan(expected_val):
	assert np.isnan(val)
	else:
	assert val == expected_val


	def test_one_hot_encoder_set_output():
	"""Check OneHotEncoder works with set_output."""
	pd = pytest.importorskip("pandas")

	X_df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]})
	ohe = OneHotEncoder()

	ohe.set_output(transform="pandas")

	match = "Pandas output does not support sparse data. Set sparse_output=False"
	with pytest.raises(ValueError, match=match):
	ohe.fit_transform(X_df)

	ohe_default = OneHotEncoder(sparse_output=False).set_output(transform="default")
	ohe_pandas = OneHotEncoder(sparse_output=False).set_output(transform="pandas")

	X_default = ohe_default.fit_transform(X_df)
	X_pandas = ohe_pandas.fit_transform(X_df)

	assert_allclose(X_pandas.to_numpy(), X_default)
	assert_array_equal(ohe_pandas.get_feature_names_out(), X_pandas.columns)


	def test_ordinal_set_output():
	"""Check OrdinalEncoder works with set_output."""
	pd = pytest.importorskip("pandas")

	X_df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]})

	ord_default = OrdinalEncoder().set_output(transform="default")
	ord_pandas = OrdinalEncoder().set_output(transform="pandas")

	X_default = ord_default.fit_transform(X_df)
	X_pandas = ord_pandas.fit_transform(X_df)

	assert_allclose(X_pandas.to_numpy(), X_default)
	assert_array_equal(ord_pandas.get_feature_names_out(), X_pandas.columns)


	def test_predefined_categories_dtype():
	"""Check that the categories_ dtype is `object` for string categories

	Regression test for gh-25171.
	"""
	categories = [["as", "mmas", "eas", "ras", "acs"], ["1", "2"]]

	enc = OneHotEncoder(categories=categories)

	enc.fit([["as", "1"]])

	assert len(categories) == len(enc.categories_)
	for n, cat in enumerate(enc.categories_):
	assert cat.dtype == object
	assert_array_equal(categories[n], cat)


	def test_ordinal_encoder_missing_unknown_encoding_max():
	"""Check missing value or unknown encoding can equal the cardinality."""
	X = np.array([["dog"], ["cat"], [np.nan]], dtype=object)
	X_trans = OrdinalEncoder(encoded_missing_value=2).fit_transform(X)
	assert_allclose(X_trans, [[1], [0], [2]])

	enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=2).fit(X)
	X_test = np.array([["snake"]])
	X_trans = enc.transform(X_test)
	assert_allclose(X_trans, [[2]])


	def test_drop_idx_infrequent_categories():
	"""Check drop_idx is defined correctly with infrequent categories.

	Non-regression test for gh-25550.
	"""
	X = np.array(
	[["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object
	).T
	ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="first").fit(X)
	assert_array_equal(
	ohe.get_feature_names_out(), ["x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"]
	)
	assert ohe.categories_[0][ohe.drop_idx_[0]] == "b"

	X = np.array([["a"] * 2 + ["b"] * 2 + ["c"] * 10], dtype=object).T
	ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="if_binary").fit(X)
	assert_array_equal(ohe.get_feature_names_out(), ["x0_infrequent_sklearn"])
	assert ohe.categories_[0][ohe.drop_idx_[0]] == "c"

	X = np.array(
	[["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object
	).T
	ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop=["d"]).fit(X)
	assert_array_equal(
	ohe.get_feature_names_out(), ["x0_b", "x0_c", "x0_e", "x0_infrequent_sklearn"]
	)
	assert ohe.categories_[0][ohe.drop_idx_[0]] == "d"

	ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop=None).fit(X)
	assert_array_equal(
	ohe.get_feature_names_out(),
	["x0_b", "x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"],
	)
	assert ohe.drop_idx_ is None


	@pytest.mark.parametrize(
	"kwargs",
	[
	{"max_categories": 3},
	{"min_frequency": 6},
	{"min_frequency": 9},
	{"min_frequency": 0.24},
	{"min_frequency": 0.16},
	{"max_categories": 3, "min_frequency": 8},
	{"max_categories": 4, "min_frequency": 6},
	],
	)
	def test_ordinal_encoder_infrequent_three_levels(kwargs):
	"""Test parameters for grouping 'a', and 'd' into the infrequent category."""

	X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
	ordinal = OrdinalEncoder(
	handle_unknown="use_encoded_value", unknown_value=-1, **kwargs
	).fit(X_train)
	assert_array_equal(ordinal.categories_, [["a", "b", "c", "d"]])
	assert_array_equal(ordinal.infrequent_categories_, [["a", "d"]])

	X_test = [["a"], ["b"], ["c"], ["d"], ["z"]]
	expected_trans = [[2], [0], [1], [2], [-1]]

	X_trans = ordinal.transform(X_test)
	assert_allclose(X_trans, expected_trans)

	X_inverse = ordinal.inverse_transform(X_trans)
	expected_inverse = [
	["infrequent_sklearn"],
	["b"],
	["c"],
	["infrequent_sklearn"],
	[None],
	]
	assert_array_equal(X_inverse, expected_inverse)


	def test_ordinal_encoder_infrequent_three_levels_user_cats():
	"""Test that the order of the categories provided by a user is respected.

	In this case 'c' is encoded as the first category and 'b' is encoded
	as the second one.
	"""

	X_train = np.array(
	[["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object
	).T
	ordinal = OrdinalEncoder(
	categories=[["c", "d", "b", "a"]],
	max_categories=3,
	handle_unknown="use_encoded_value",
	unknown_value=-1,
	).fit(X_train)
	assert_array_equal(ordinal.categories_, [["c", "d", "b", "a"]])
	assert_array_equal(ordinal.infrequent_categories_, [["d", "a"]])

	X_test = [["a"], ["b"], ["c"], ["d"], ["z"]]
	expected_trans = [[2], [1], [0], [2], [-1]]

	X_trans = ordinal.transform(X_test)
	assert_allclose(X_trans, expected_trans)

	X_inverse = ordinal.inverse_transform(X_trans)
	expected_inverse = [
	["infrequent_sklearn"],
	["b"],
	["c"],
	["infrequent_sklearn"],
	[None],
	]
	assert_array_equal(X_inverse, expected_inverse)


	def test_ordinal_encoder_infrequent_mixed():
	"""Test when feature 0 has infrequent categories and feature 1 does not."""

	X = np.column_stack(([0, 1, 3, 3, 3, 3, 2, 0, 3], [0, 0, 0, 0, 1, 1, 1, 1, 1]))

	ordinal = OrdinalEncoder(max_categories=3).fit(X)

	assert_array_equal(ordinal.infrequent_categories_[0], [1, 2])
	assert ordinal.infrequent_categories_[1] is None

	X_test = [[3, 0], [1, 1]]
	expected_trans = [[1, 0], [2, 1]]

	X_trans = ordinal.transform(X_test)
	assert_allclose(X_trans, expected_trans)

	X_inverse = ordinal.inverse_transform(X_trans)
	expected_inverse = np.array([[3, 0], ["infrequent_sklearn", 1]], dtype=object)
	assert_array_equal(X_inverse, expected_inverse)


	def test_ordinal_encoder_infrequent_multiple_categories_dtypes():
	"""Test infrequent categories with a pandas DataFrame with multiple dtypes."""

	pd = pytest.importorskip("pandas")
	categorical_dtype = pd.CategoricalDtype(["bird", "cat", "dog", "snake"])
	X = pd.DataFrame(
	{
	"str": ["a", "f", "c", "f", "f", "a", "c", "b", "b"],
	"int": [5, 3, 0, 10, 10, 12, 0, 3, 5],
	"categorical": pd.Series(
	["dog"] * 4 + ["cat"] * 3 + ["snake"] + ["bird"],
	dtype=categorical_dtype,
	),
	},
	columns=["str", "int", "categorical"],
	)

	ordinal = OrdinalEncoder(max_categories=3).fit(X)
	# X[:, 0] 'a', 'b', 'c' have the same frequency. 'a' and 'b' will be
	# considered infrequent because they appear first when sorted

	# X[:, 1] 0, 3, 5, 10 has frequency 2 and 12 has frequency 1.
	# 0, 3, 12 will be considered infrequent because they appear first when
	# sorted.

	# X[:, 2] "snake" and "bird" or infrequent

	assert_array_equal(ordinal.infrequent_categories_[0], ["a", "b"])
	assert_array_equal(ordinal.infrequent_categories_[1], [0, 3, 12])
	assert_array_equal(ordinal.infrequent_categories_[2], ["bird", "snake"])

	X_test = pd.DataFrame(
	{
	"str": ["a", "b", "f", "c"],
	"int": [12, 0, 10, 5],
	"categorical": pd.Series(
	["cat"] + ["snake"] + ["bird"] + ["dog"],
	dtype=categorical_dtype,
	),
	},
	columns=["str", "int", "categorical"],
	)
	expected_trans = [[2, 2, 0], [2, 2, 2], [1, 1, 2], [0, 0, 1]]

	X_trans = ordinal.transform(X_test)
	assert_allclose(X_trans, expected_trans)


	def test_ordinal_encoder_infrequent_custom_mapping():
	"""Check behavior of unknown_value and encoded_missing_value with infrequent."""
	X_train = np.array(
	[["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3 + [np.nan]], dtype=object
	).T

	ordinal = OrdinalEncoder(
	handle_unknown="use_encoded_value",
	unknown_value=2,
	max_categories=2,
	encoded_missing_value=3,
	).fit(X_train)
	assert_array_equal(ordinal.infrequent_categories_, [["a", "c", "d"]])

	X_test = np.array([["a"], ["b"], ["c"], ["d"], ["e"], [np.nan]], dtype=object)
	expected_trans = [[1], [0], [1], [1], [2], [3]]

	X_trans = ordinal.transform(X_test)
	assert_allclose(X_trans, expected_trans)


	@pytest.mark.parametrize(
	"kwargs",
	[
	{"max_categories": 6},
	{"min_frequency": 2},
	],
	)
	def test_ordinal_encoder_all_frequent(kwargs):
	"""All categories are considered frequent have same encoding as default encoder."""
	X_train = np.array(
	[["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object
	).T

	adjusted_encoder = OrdinalEncoder(
	**kwargs, handle_unknown="use_encoded_value", unknown_value=-1
	).fit(X_train)
	default_encoder = OrdinalEncoder(
	handle_unknown="use_encoded_value", unknown_value=-1
	).fit(X_train)

	X_test = [["a"], ["b"], ["c"], ["d"], ["e"]]

	assert_allclose(
	adjusted_encoder.transform(X_test), default_encoder.transform(X_test)
	)


	@pytest.mark.parametrize(
	"kwargs",
	[
	{"max_categories": 1},
	{"min_frequency": 100},
	],
	)
	def test_ordinal_encoder_all_infrequent(kwargs):
	"""When all categories are infrequent, they are all encoded as zero."""
	X_train = np.array(
	[["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object
	).T
	encoder = OrdinalEncoder(
	**kwargs, handle_unknown="use_encoded_value", unknown_value=-1
	).fit(X_train)

	X_test = [["a"], ["b"], ["c"], ["d"], ["e"]]
	assert_allclose(encoder.transform(X_test), [[0], [0], [0], [0], [-1]])


	def test_ordinal_encoder_missing_appears_frequent():
	"""Check behavior when missing value appears frequently."""
	X = np.array(
	[[np.nan] * 20 + ["dog"] * 10 + ["cat"] * 5 + ["snake"] + ["deer"]],
	dtype=object,
	).T
	ordinal = OrdinalEncoder(max_categories=3).fit(X)

	X_test = np.array([["snake", "cat", "dog", np.nan]], dtype=object).T
	X_trans = ordinal.transform(X_test)
	assert_allclose(X_trans, [[2], [0], [1], [np.nan]])


	def test_ordinal_encoder_missing_appears_infrequent():
	"""Check behavior when missing value appears infrequently."""

	# feature 0 has infrequent categories
	# feature 1 has no infrequent categories
	X = np.array(
	[
	[np.nan] + ["dog"] * 10 + ["cat"] * 5 + ["snake"] + ["deer"],
	["red"] * 9 + ["green"] * 9,
	],
	dtype=object,
	).T
	ordinal = OrdinalEncoder(min_frequency=4).fit(X)

	X_test = np.array(
	[
	["snake", "red"],
	["deer", "green"],
	[np.nan, "green"],
	["dog", "green"],
	["cat", "red"],
	],
	dtype=object,
	)
	X_trans = ordinal.transform(X_test)
	assert_allclose(X_trans, [[2, 1], [2, 0], [np.nan, 0], [1, 0], [0, 1]])


	@pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder])
	def test_encoder_not_fitted(Encoder):
	"""Check that we raise a `NotFittedError` by calling transform before fit with
	the encoders.

	One could expect that the passing the `categories` argument to the encoder
	would make it stateless. However, `fit` is making a couple of check, such as the
	position of `np.nan`.
	"""
	X = np.array([["A"], ["B"], ["C"]], dtype=object)
	encoder = Encoder(categories=[["A", "B", "C"]])
	with pytest.raises(NotFittedError):
	encoder.transform(X)