Sam Chaudry

Upload folder using huggingface_hub

7885a28 verified about 1 month ago

31.3 kB

	# Authors: The scikit-learn developers
	# SPDX-License-Identifier: BSD-3-Clause

	import array
	import itertools
	import warnings
	from collections import defaultdict
	from numbers import Integral

	import numpy as np
	import scipy.sparse as sp

	from ..base import BaseEstimator, TransformerMixin, _fit_context
	from ..utils import column_or_1d
	from ..utils._array_api import _setdiff1d, device, get_namespace
	from ..utils._encode import _encode, _unique
	from ..utils._param_validation import Interval, validate_params
	from ..utils.multiclass import type_of_target, unique_labels
	from ..utils.sparsefuncs import min_max_axis
	from ..utils.validation import _num_samples, check_array, check_is_fitted

	# Names exported by ``from <this module> import *``: the three label
	# transformers and the functional form of label binarization.
	__all__ = [
	"label_binarize",
	"LabelBinarizer",
	"LabelEncoder",
	"MultiLabelBinarizer",
	]


	class LabelEncoder(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None):
	    """Encode target labels as integers between 0 and n_classes-1.

	    Use this transformer on target values, i.e. `y`; it is not meant for the
	    input `X`.

	    Read more in the :ref:`User Guide <preprocessing_targets>`.

	    .. versionadded:: 0.12

	    Attributes
	    ----------
	    classes_ : ndarray of shape (n_classes,)
	        The label associated with each class.

	    See Also
	    --------
	    OrdinalEncoder : Encode categorical features using an ordinal encoding
	        scheme.
	    OneHotEncoder : Encode categorical features as a one-hot numeric array.

	    Examples
	    --------
	    `LabelEncoder` can be used to normalize labels.

	    >>> from sklearn.preprocessing import LabelEncoder
	    >>> le = LabelEncoder()
	    >>> le.fit([1, 2, 2, 6])
	    LabelEncoder()
	    >>> le.classes_
	    array([1, 2, 6])
	    >>> le.transform([1, 1, 2, 6])
	    array([0, 0, 1, 2]...)
	    >>> le.inverse_transform([0, 0, 1, 2])
	    array([1, 1, 2, 6])

	    It can also be used to transform non-numerical labels (as long as they are
	    hashable and comparable) to numerical labels.

	    >>> le = LabelEncoder()
	    >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
	    LabelEncoder()
	    >>> list(le.classes_)
	    [np.str_('amsterdam'), np.str_('paris'), np.str_('tokyo')]
	    >>> le.transform(["tokyo", "tokyo", "paris"])
	    array([2, 2, 1]...)
	    >>> list(le.inverse_transform([2, 2, 1]))
	    [np.str_('tokyo'), np.str_('tokyo'), np.str_('paris')]
	    """

	    def fit(self, y):
	        """Learn the set of distinct labels found in `y`.

	        Parameters
	        ----------
	        y : array-like of shape (n_samples,)
	            Target values.

	        Returns
	        -------
	        self : returns an instance of self.
	            Fitted label encoder.
	        """
	        labels = column_or_1d(y, warn=True)
	        self.classes_ = _unique(labels)
	        return self

	    def fit_transform(self, y):
	        """Learn the distinct labels of `y` and return `y` encoded.

	        Parameters
	        ----------
	        y : array-like of shape (n_samples,)
	            Target values.

	        Returns
	        -------
	        y : array-like of shape (n_samples,)
	            Encoded labels.
	        """
	        labels = column_or_1d(y, warn=True)
	        # A single pass yields both the unique labels and the code of each
	        # sample.
	        uniques, codes = _unique(labels, return_inverse=True)
	        self.classes_ = uniques
	        return codes

	    def transform(self, y):
	        """Encode labels using the classes learned during fit.

	        Parameters
	        ----------
	        y : array-like of shape (n_samples,)
	            Target values.

	        Returns
	        -------
	        y : array-like of shape (n_samples,)
	            Labels as normalized encodings.
	        """
	        check_is_fitted(self)
	        xp, _ = get_namespace(y)
	        labels = column_or_1d(y, dtype=self.classes_.dtype, warn=True)

	        if _num_samples(labels) != 0:
	            return _encode(labels, uniques=self.classes_)

	        # transform of empty array is empty array
	        return xp.asarray([])

	    def inverse_transform(self, y):
	        """Map integer codes back to the original labels.

	        Parameters
	        ----------
	        y : array-like of shape (n_samples,)
	            Target values.

	        Returns
	        -------
	        y : ndarray of shape (n_samples,)
	            Original encoding.

	        Raises
	        ------
	        ValueError
	            If `y` holds a code outside ``0 .. n_classes - 1``.
	        """
	        check_is_fitted(self)
	        xp, _ = get_namespace(y)
	        codes = column_or_1d(y, warn=True)

	        # inverse transform of empty array is empty array
	        if _num_samples(codes) == 0:
	            return xp.asarray([])

	        # Every code must be one of the positions of `classes_`.
	        valid_codes = xp.arange(self.classes_.shape[0], device=device(codes))
	        unseen = _setdiff1d(ar1=codes, ar2=valid_codes, xp=xp)
	        if unseen.shape[0]:
	            raise ValueError("y contains previously unseen labels: %s" % str(unseen))

	        codes = xp.asarray(codes)
	        return xp.take(self.classes_, codes, axis=0)

	    def __sklearn_tags__(self):
	        tags = super().__sklearn_tags__()
	        tags.array_api_support = True
	        # This estimator consumes 1d label vectors rather than 2d feature data.
	        tags.input_tags.two_d_array = False
	        tags.target_tags.one_d_labels = True
	        return tags


	class LabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None):
	    """Binarize labels in a one-vs-all fashion.

	    scikit-learn ships several regression and binary classification
	    algorithms. The one-vs-all scheme is a simple way to extend them to
	    multi-class classification.

	    At learning time, one regressor or binary classifier is learned per
	    class, which requires converting multi-class labels to binary labels
	    (belong or does not belong to the class). `LabelBinarizer` performs that
	    conversion with the transform method.

	    At prediction time, the class whose model gave the greatest confidence is
	    assigned. `LabelBinarizer` performs that step with the
	    :meth:`inverse_transform` method.

	    Read more in the :ref:`User Guide <preprocessing_targets>`.

	    Parameters
	    ----------
	    neg_label : int, default=0
	        Value with which negative labels must be encoded.

	    pos_label : int, default=1
	        Value with which positive labels must be encoded.

	    sparse_output : bool, default=False
	        True if the returned array from transform is desired to be in sparse
	        CSR format.

	    Attributes
	    ----------
	    classes_ : ndarray of shape (n_classes,)
	        Holds the label for each class.

	    y_type_ : str
	        Represents the type of the target data as evaluated by
	        :func:`~sklearn.utils.multiclass.type_of_target`. Possible type are
	        'continuous', 'continuous-multioutput', 'binary', 'multiclass',
	        'multiclass-multioutput', 'multilabel-indicator', and 'unknown'.

	    sparse_input_ : bool
	        `True` if the input data to transform is given as a sparse matrix,
	        `False` otherwise.

	    See Also
	    --------
	    label_binarize : Function to perform the transform operation of
	        LabelBinarizer with fixed classes.
	    OneHotEncoder : Encode categorical features using a one-hot aka one-of-K
	        scheme.

	    Examples
	    --------
	    >>> from sklearn.preprocessing import LabelBinarizer
	    >>> lb = LabelBinarizer()
	    >>> lb.fit([1, 2, 6, 4, 2])
	    LabelBinarizer()
	    >>> lb.classes_
	    array([1, 2, 4, 6])
	    >>> lb.transform([1, 6])
	    array([[1, 0, 0, 0],
	           [0, 0, 0, 1]])

	    Binary targets transform to a column vector

	    >>> lb = LabelBinarizer()
	    >>> lb.fit_transform(['yes', 'no', 'no', 'yes'])
	    array([[1],
	           [0],
	           [0],
	           [1]])

	    Passing a 2D matrix for multilabel classification

	    >>> import numpy as np
	    >>> lb.fit(np.array([[0, 1, 1], [1, 0, 0]]))
	    LabelBinarizer()
	    >>> lb.classes_
	    array([0, 1, 2])
	    >>> lb.transform([0, 1, 2, 1])
	    array([[1, 0, 0],
	           [0, 1, 0],
	           [0, 0, 1],
	           [0, 1, 0]])
	    """

	    _parameter_constraints: dict = {
	        "neg_label": [Integral],
	        "pos_label": [Integral],
	        "sparse_output": ["boolean"],
	    }

	    def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False):
	        self.neg_label = neg_label
	        self.pos_label = pos_label
	        self.sparse_output = sparse_output

	    @_fit_context(prefer_skip_nested_validation=True)
	    def fit(self, y):
	        """Fit label binarizer.

	        Parameters
	        ----------
	        y : ndarray of shape (n_samples,) or (n_samples, n_classes)
	            Target values. The 2-d matrix should only contain 0 and 1,
	            represents multilabel classification.

	        Returns
	        -------
	        self : object
	            Returns the instance itself.

	        Raises
	        ------
	        ValueError
	            If `neg_label` is not strictly less than `pos_label`, if sparse
	            output is requested with a zero `pos_label` or a non-zero
	            `neg_label`, if `y` is a multioutput target, or if `y` has no
	            samples.
	        """
	        if not self.neg_label < self.pos_label:
	            raise ValueError(
	                f"neg_label={self.neg_label} must be strictly less than "
	                f"pos_label={self.pos_label}."
	            )

	        # Sparse output is only accepted when the negative label is 0 and the
	        # positive label is not.
	        labels_fit_sparse = self.pos_label != 0 and self.neg_label == 0
	        if self.sparse_output and not labels_fit_sparse:
	            raise ValueError(
	                "Sparse binarization is only supported with non "
	                "zero pos_label and zero neg_label, got "
	                f"pos_label={self.pos_label} and neg_label={self.neg_label}"
	            )

	        self.y_type_ = type_of_target(y, input_name="y")

	        if "multioutput" in self.y_type_:
	            raise ValueError(
	                "Multioutput target data is not supported with label binarization"
	            )
	        if _num_samples(y) == 0:
	            raise ValueError("y has 0 samples: %r" % y)

	        self.sparse_input_ = sp.issparse(y)
	        self.classes_ = unique_labels(y)
	        return self

	    def fit_transform(self, y):
	        """Fit label binarizer/transform multi-class labels to binary labels.

	        The output of transform is sometimes referred to as
	        the 1-of-K coding scheme.

	        Parameters
	        ----------
	        y : {ndarray, sparse matrix} of shape (n_samples,) or \
	                (n_samples, n_classes)
	            Target values. The 2-d matrix should only contain 0 and 1,
	            represents multilabel classification. Sparse matrix can be
	            CSR, CSC, COO, DOK, or LIL.

	        Returns
	        -------
	        Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
	            Shape will be (n_samples, 1) for binary problems. Sparse matrix
	            will be of CSR format.
	        """
	        fitted = self.fit(y)
	        return fitted.transform(y)

	    def transform(self, y):
	        """Transform multi-class labels to binary labels.

	        The output of transform is sometimes referred to by some authors as
	        the 1-of-K coding scheme.

	        Parameters
	        ----------
	        y : {array, sparse matrix} of shape (n_samples,) or \
	                (n_samples, n_classes)
	            Target values. The 2-d matrix should only contain 0 and 1,
	            represents multilabel classification. Sparse matrix can be
	            CSR, CSC, COO, DOK, or LIL.

	        Returns
	        -------
	        Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
	            Shape will be (n_samples, 1) for binary problems. Sparse matrix
	            will be of CSR format.

	        Raises
	        ------
	        ValueError
	            If `y` is multilabel but the binarizer was fitted on a
	            non-multilabel target.
	        """
	        check_is_fitted(self)

	        input_is_multilabel = type_of_target(y).startswith("multilabel")
	        if input_is_multilabel:
	            if not self.y_type_.startswith("multilabel"):
	                raise ValueError("The object was not fitted with multilabel input.")

	        # The actual encoding is delegated to the functional API, using the
	        # classes and label values stored on this estimator.
	        return label_binarize(
	            y,
	            classes=self.classes_,
	            pos_label=self.pos_label,
	            neg_label=self.neg_label,
	            sparse_output=self.sparse_output,
	        )

	    def inverse_transform(self, Y, threshold=None):
	        """Transform binary labels back to multi-class labels.

	        Parameters
	        ----------
	        Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
	            Target values. All sparse matrices are converted to CSR before
	            inverse transformation.

	        threshold : float, default=None
	            Threshold used in the binary and multi-label cases.

	            Use 0 when ``Y`` contains the output of :term:`decision_function`
	            (classifier).
	            Use 0.5 when ``Y`` contains the output of :term:`predict_proba`.

	            If None, the threshold is assumed to be half way between
	            neg_label and pos_label.

	        Returns
	        -------
	        y : {ndarray, sparse matrix} of shape (n_samples,)
	            Target values. Sparse matrix will be of CSR format.

	        Notes
	        -----
	        In the case when the binary labels are fractional
	        (probabilistic), :meth:`inverse_transform` chooses the class with the
	        greatest value. Typically, this allows to use the output of a
	        linear model's :term:`decision_function` method directly as the input
	        of :meth:`inverse_transform`.
	        """
	        check_is_fitted(self)

	        if threshold is None:
	            threshold = (self.pos_label + self.neg_label) / 2.0

	        # Multiclass targets are decoded by arg-max; every other target type
	        # is decoded by thresholding.
	        if self.y_type_ == "multiclass":
	            decoded = _inverse_binarize_multiclass(Y, self.classes_)
	        else:
	            decoded = _inverse_binarize_thresholding(
	                Y, self.y_type_, self.classes_, threshold
	            )

	        # The output sparsity follows the sparsity of the data seen in fit.
	        if self.sparse_input_:
	            return sp.csr_matrix(decoded)
	        if sp.issparse(decoded):
	            return decoded.toarray()
	        return decoded

	    def __sklearn_tags__(self):
	        tags = super().__sklearn_tags__()
	        tags.input_tags.two_d_array = False
	        tags.target_tags.one_d_labels = True
	        return tags


	@validate_params(
	    {
	        "y": ["array-like", "sparse matrix"],
	        "classes": ["array-like"],
	        "neg_label": [Interval(Integral, None, None, closed="neither")],
	        "pos_label": [Interval(Integral, None, None, closed="neither")],
	        "sparse_output": ["boolean"],
	    },
	    prefer_skip_nested_validation=True,
	)
	def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False):
	    """Binarize labels in a one-vs-all fashion.

	    Several regression and binary classification algorithms are
	    available in scikit-learn. A simple way to extend these algorithms
	    to the multi-class classification case is to use the so-called
	    one-vs-all scheme.

	    This function makes it possible to compute this transformation for a
	    fixed set of class labels known ahead of time.

	    Parameters
	    ----------
	    y : array-like or sparse matrix
	        Sequence of integer labels or multilabel data to encode.

	    classes : array-like of shape (n_classes,)
	        Uniquely holds the label for each class.

	    neg_label : int, default=0
	        Value with which negative labels must be encoded.

	    pos_label : int, default=1
	        Value with which positive labels must be encoded.

	    sparse_output : bool, default=False
	        Set to true if output binary array is desired in CSR sparse format.

	    Returns
	    -------
	    Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
	        Shape will be (n_samples, 1) for binary problems. Sparse matrix will
	        be of CSR format.

	    Raises
	    ------
	    ValueError
	        If `y` is an empty list, if `neg_label` is not strictly less than
	        `pos_label`, if sparse output is requested with a zero `pos_label`
	        or a non-zero `neg_label`, if the target type is multioutput,
	        unknown or otherwise unsupported, or if a multilabel indicator `y`
	        does not have one column per class.

	    See Also
	    --------
	    LabelBinarizer : Class used to wrap the functionality of label_binarize and
	        allow for fitting to classes independently of the transform operation.

	    Examples
	    --------
	    >>> from sklearn.preprocessing import label_binarize
	    >>> label_binarize([1, 6], classes=[1, 2, 4, 6])
	    array([[1, 0, 0, 0],
	           [0, 0, 0, 1]])

	    The class ordering is preserved:

	    >>> label_binarize([1, 6], classes=[1, 6, 4, 2])
	    array([[1, 0, 0, 0],
	           [0, 1, 0, 0]])

	    Binary targets transform to a column vector

	    >>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes'])
	    array([[1],
	           [0],
	           [0],
	           [1]])
	    """
	    if not isinstance(y, list):
	        # XXX Workaround that will be removed when list of list format is
	        # dropped
	        y = check_array(
	            y, input_name="y", accept_sparse="csr", ensure_2d=False, dtype=None
	        )
	    else:
	        if _num_samples(y) == 0:
	            raise ValueError("y has 0 samples: %r" % y)
	    if neg_label >= pos_label:
	        raise ValueError(
	            f"neg_label={neg_label} must be strictly less than "
	            f"pos_label={pos_label}."
	        )

	    if sparse_output and (pos_label == 0 or neg_label != 0):
	        raise ValueError(
	            "Sparse binarization is only supported with non "
	            "zero pos_label and zero neg_label, got "
	            f"pos_label={pos_label} and neg_label={neg_label}"
	        )

	    # To account for pos_label == 0 in the dense case: a zero positive value
	    # cannot be stored in the sparse intermediate, so a temporary non-zero
	    # stand-in is used and swapped back to 0 once the matrix is dense.
	    pos_switch = pos_label == 0
	    if pos_switch:
	        pos_label = -neg_label

	    y_type = type_of_target(y)
	    if "multioutput" in y_type:
	        raise ValueError(
	            "Multioutput target data is not supported with label binarization"
	        )
	    if y_type == "unknown":
	        raise ValueError("The type of target data is not known")

	    # Sparse matrices do not support len(), hence the explicit branch.
	    n_samples = y.shape[0] if sp.issparse(y) else len(y)
	    n_classes = len(classes)
	    classes = np.asarray(classes)

	    if y_type == "binary":
	        if n_classes == 1:
	            # Only one known class: every sample is encoded as negative.
	            if sparse_output:
	                return sp.csr_matrix((n_samples, 1), dtype=int)
	            # Use the sparse-safe sample count computed above rather than
	            # len(y), which fails on sparse input.
	            Y = np.zeros((n_samples, 1), dtype=int)
	            Y += neg_label
	            return Y
	        elif n_classes >= 3:
	            # Two distinct values in y but three or more known classes:
	            # encode with one column per class.
	            y_type = "multiclass"

	    sorted_class = np.sort(classes)
	    if y_type == "multilabel-indicator":
	        y_n_classes = y.shape[1] if hasattr(y, "shape") else len(y[0])
	        if classes.size != y_n_classes:
	            raise ValueError(
	                f"classes {classes} mismatch with the labels "
	                f"{unique_labels(y)} found in the data"
	            )

	    if y_type in ("binary", "multiclass"):
	        y = column_or_1d(y)

	        # pick out the known labels from y
	        y_in_classes = np.isin(y, classes)
	        y_seen = y[y_in_classes]
	        indices = np.searchsorted(sorted_class, y_seen)
	        # Each sample contributes one stored entry if its label is known and
	        # none otherwise, so the cumulative count is the CSR row pointer.
	        indptr = np.hstack((0, np.cumsum(y_in_classes)))

	        data = np.empty_like(indices)
	        data.fill(pos_label)
	        Y = sp.csr_matrix((data, indices, indptr), shape=(n_samples, n_classes))
	    elif y_type == "multilabel-indicator":
	        Y = sp.csr_matrix(y)
	        if pos_label != 1:
	            data = np.empty_like(Y.data)
	            data.fill(pos_label)
	            Y.data = data
	    else:
	        raise ValueError(
	            f"{y_type} target data is not supported with label binarization"
	        )

	    if not sparse_output:
	        Y = Y.toarray()
	        Y = Y.astype(int, copy=False)

	        if neg_label != 0:
	            Y[Y == 0] = neg_label

	        if pos_switch:
	            Y[Y == pos_label] = 0
	    else:
	        Y.data = Y.data.astype(int, copy=False)

	    # preserve label ordering: columns were built in sorted-class order, so
	    # they are permuted back to the order given in `classes`.
	    if np.any(classes != sorted_class):
	        indices = np.searchsorted(sorted_class, classes)
	        Y = Y[:, indices]

	    if y_type == "binary":
	        # Binary problems keep only the last column.
	        if sparse_output:
	            # Column selection by index list keeps the result 2-d and sparse
	            # without relying on the `getcol` method.
	            Y = Y[:, [-1]]
	        else:
	            Y = Y[:, -1].reshape((-1, 1))

	    return Y


	def _inverse_binarize_multiclass(y, classes):
	"""Inverse label binarization transformation for multiclass.

	Multiclass uses the maximal score instead of a threshold.
	"""
	classes = np.asarray(classes)

	if sp.issparse(y):
	# Find the argmax for each row in y where y is a CSR matrix

	y = y.tocsr()
	n_samples, n_outputs = y.shape
	outputs = np.arange(n_outputs)
	row_max = min_max_axis(y, 1)[1]
	row_nnz = np.diff(y.indptr)

	y_data_repeated_max = np.repeat(row_max, row_nnz)
	# picks out all indices obtaining the maximum per row
	y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data)

	# For corner case where last row has a max of 0
	if row_max[-1] == 0:
	y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)])

	# Gets the index of the first argmax in each row from y_i_all_argmax
	index_first_argmax = np.searchsorted(y_i_all_argmax, y.indptr[:-1])
	# first argmax of each row
	y_ind_ext = np.append(y.indices, [0])
	y_i_argmax = y_ind_ext[y_i_all_argmax[index_first_argmax]]
	# Handle rows of all 0
	y_i_argmax[np.where(row_nnz == 0)[0]] = 0

	# Handles rows with max of 0 that contain negative numbers
	samples = np.arange(n_samples)[(row_nnz > 0) & (row_max.ravel() == 0)]
	for i in samples:
	ind = y.indices[y.indptr[i] : y.indptr[i + 1]]
	y_i_argmax[i] = classes[np.setdiff1d(outputs, ind)][0]

	return classes[y_i_argmax]
	else:
	return classes.take(y.argmax(axis=1), mode="clip")


	def _inverse_binarize_thresholding(y, output_type, classes, threshold):
	"""Inverse label binarization transformation using thresholding."""

	if output_type == "binary" and y.ndim == 2 and y.shape[1] > 2:
	raise ValueError("output_type='binary', but y.shape = {0}".format(y.shape))

	if output_type != "binary" and y.shape[1] != len(classes):
	raise ValueError(
	"The number of class is not equal to the number of dimension of y."
	)

	classes = np.asarray(classes)

	# Perform thresholding
	if sp.issparse(y):
	if threshold > 0:
	if y.format not in ("csr", "csc"):
	y = y.tocsr()
	y.data = np.array(y.data > threshold, dtype=int)
	y.eliminate_zeros()
	else:
	y = np.array(y.toarray() > threshold, dtype=int)
	else:
	y = np.array(y > threshold, dtype=int)

	# Inverse transform data
	if output_type == "binary":
	if sp.issparse(y):
	y = y.toarray()
	if y.ndim == 2 and y.shape[1] == 2:
	return classes[y[:, 1]]
	else:
	if len(classes) == 1:
	return np.repeat(classes[0], len(y))
	else:
	return classes[y.ravel()]

	elif output_type == "multilabel-indicator":
	return y

	else:
	raise ValueError("{0} format is not supported".format(output_type))


	class MultiLabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None):
	    """Transform between iterable of iterables and a multilabel format.

	    Although a list of sets or tuples is a very intuitive format for multilabel
	    data, it is unwieldy to process. This transformer converts between this
	    intuitive format and the supported multilabel format: a (samples x classes)
	    binary matrix indicating the presence of a class label.

	    Parameters
	    ----------
	    classes : array-like of shape (n_classes,), default=None
	        Indicates an ordering for the class labels.
	        All entries should be unique (cannot contain duplicate classes).

	    sparse_output : bool, default=False
	        Set to True if output binary array is desired in CSR sparse format.

	    Attributes
	    ----------
	    classes_ : ndarray of shape (n_classes,)
	        A copy of the `classes` parameter when provided.
	        Otherwise it corresponds to the sorted set of classes found
	        when fitting.

	    See Also
	    --------
	    OneHotEncoder : Encode categorical features using a one-hot aka one-of-K
	        scheme.

	    Examples
	    --------
	    >>> from sklearn.preprocessing import MultiLabelBinarizer
	    >>> mlb = MultiLabelBinarizer()
	    >>> mlb.fit_transform([(1, 2), (3,)])
	    array([[1, 1, 0],
	           [0, 0, 1]])
	    >>> mlb.classes_
	    array([1, 2, 3])

	    >>> mlb.fit_transform([{'sci-fi', 'thriller'}, {'comedy'}])
	    array([[0, 1, 1],
	           [1, 0, 0]])
	    >>> list(mlb.classes_)
	    ['comedy', 'sci-fi', 'thriller']

	    A common mistake is to pass in a list, which leads to the following issue:

	    >>> mlb = MultiLabelBinarizer()
	    >>> mlb.fit(['sci-fi', 'thriller', 'comedy'])
	    MultiLabelBinarizer()
	    >>> mlb.classes_
	    array(['-', 'c', 'd', 'e', 'f', 'h', 'i', 'l', 'm', 'o', 'r', 's', 't',
	           'y'], dtype=object)

	    To correct this, the list of labels should be passed in as:

	    >>> mlb = MultiLabelBinarizer()
	    >>> mlb.fit([['sci-fi', 'thriller', 'comedy']])
	    MultiLabelBinarizer()
	    >>> mlb.classes_
	    array(['comedy', 'sci-fi', 'thriller'], dtype=object)
	    """

	    _parameter_constraints: dict = {
	        "classes": ["array-like", None],
	        "sparse_output": ["boolean"],
	    }

	    def __init__(self, *, classes=None, sparse_output=False):
	        self.classes = classes
	        self.sparse_output = sparse_output

	    @_fit_context(prefer_skip_nested_validation=True)
	    def fit(self, y):
	        """Fit the label sets binarizer, storing :term:`classes_`.

	        Parameters
	        ----------
	        y : iterable of iterables
	            A set of labels (any orderable and hashable object) for each
	            sample. If the `classes` parameter is set, `y` will not be
	            iterated.

	        Returns
	        -------
	        self : object
	            Fitted estimator.

	        Raises
	        ------
	        ValueError
	            If the `classes` parameter contains duplicates.
	        """
	        # Drop any label-to-column mapping cached by a previous fit.
	        self._cached_dict = None

	        if self.classes is None:
	            classes = sorted(set(itertools.chain.from_iterable(y)))
	        elif len(set(self.classes)) < len(self.classes):
	            raise ValueError(
	                "The classes argument contains duplicate "
	                "classes. Remove these duplicates before passing "
	                "them to MultiLabelBinarizer."
	            )
	        else:
	            classes = self.classes
	        # An object array is used unless every class is an int; filling it
	        # by slice assignment stores each class as a single element.
	        dtype = int if all(isinstance(c, int) for c in classes) else object
	        self.classes_ = np.empty(len(classes), dtype=dtype)
	        self.classes_[:] = classes
	        return self

	    @_fit_context(prefer_skip_nested_validation=True)
	    def fit_transform(self, y):
	        """Fit the label sets binarizer and transform the given label sets.

	        Parameters
	        ----------
	        y : iterable of iterables
	            A set of labels (any orderable and hashable object) for each
	            sample.

	        Returns
	        -------
	        y_indicator : {ndarray, sparse matrix} of shape (n_samples, n_classes)
	            A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]`
	            is in `y[i]`, and 0 otherwise. Sparse matrix will be of CSR
	            format.
	        """
	        if self.classes is not None:
	            return self.fit(y).transform(y)

	        self._cached_dict = None

	        # Automatically increment on new class: a missing label is assigned
	        # the current size of the mapping as its column index.
	        class_mapping = defaultdict(int)
	        class_mapping.default_factory = class_mapping.__len__
	        yt = self._transform(y, class_mapping)

	        # sort classes and reorder columns
	        tmp = sorted(class_mapping, key=class_mapping.get)

	        # (make safe for tuples)
	        dtype = int if all(isinstance(c, int) for c in tmp) else object
	        class_mapping = np.empty(len(tmp), dtype=dtype)
	        class_mapping[:] = tmp
	        self.classes_, inverse = np.unique(class_mapping, return_inverse=True)
	        # ensure yt.indices keeps its current dtype
	        yt.indices = np.asarray(inverse[yt.indices], dtype=yt.indices.dtype)

	        if not self.sparse_output:
	            yt = yt.toarray()

	        return yt

	    def transform(self, y):
	        """Transform the given label sets.

	        Parameters
	        ----------
	        y : iterable of iterables
	            A set of labels (any orderable and hashable object) for each
	            sample.

	        Returns
	        -------
	        y_indicator : array or CSR matrix, shape (n_samples, n_classes)
	            A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` is in
	            `y[i]`, and 0 otherwise.
	        """
	        check_is_fitted(self)

	        class_to_index = self._build_cache()
	        yt = self._transform(y, class_to_index)

	        if not self.sparse_output:
	            yt = yt.toarray()

	        return yt

	    def _build_cache(self):
	        """Return the label-to-column mapping, building it on first use."""
	        if self._cached_dict is None:
	            self._cached_dict = dict(zip(self.classes_, range(len(self.classes_))))

	        return self._cached_dict

	    def _transform(self, y, class_mapping):
	        """Transforms the label sets with a given mapping.

	        Labels missing from `class_mapping` are skipped and reported in a
	        single warning.

	        Parameters
	        ----------
	        y : iterable of iterables
	            A set of labels (any orderable and hashable object) for each
	            sample.

	        class_mapping : Mapping
	            Maps from label to column index in label indicator matrix.

	        Returns
	        -------
	        y_indicator : sparse matrix of shape (n_samples, n_classes)
	            Label indicator matrix. Will be of CSR format.
	        """
	        indices = array.array("i")
	        indptr = array.array("i", [0])
	        unknown = set()
	        for labels in y:
	            # A set removes labels repeated within one sample.
	            index = set()
	            for label in labels:
	                try:
	                    index.add(class_mapping[label])
	                except KeyError:
	                    unknown.add(label)
	            indices.extend(index)
	            indptr.append(len(indices))
	        if unknown:
	            warnings.warn(
	                "unknown class(es) {0} will be ignored".format(sorted(unknown, key=str))
	            )
	        data = np.ones(len(indices), dtype=int)

	        return sp.csr_matrix(
	            (data, indices, indptr), shape=(len(indptr) - 1, len(class_mapping))
	        )

	    def inverse_transform(self, yt):
	        """Transform the given indicator matrix into label sets.

	        Parameters
	        ----------
	        yt : {ndarray, sparse matrix} of shape (n_samples, n_classes)
	            A matrix containing only 1s ands 0s.

	        Returns
	        -------
	        y : list of tuples
	            The set of labels for each sample such that `y[i]` consists of
	            `classes_[j]` for each `yt[i, j] == 1`.

	        Raises
	        ------
	        ValueError
	            If `yt` does not have one column per fitted class, or if it
	            contains values other than 0 and 1.
	        """
	        check_is_fitted(self)

	        if yt.shape[1] != len(self.classes_):
	            raise ValueError(
	                "Expected indicator for {0} classes, but got {1}".format(
	                    len(self.classes_), yt.shape[1]
	                )
	            )

	        if sp.issparse(yt):
	            yt = yt.tocsr()
	            if len(yt.data) != 0 and len(np.setdiff1d(yt.data, [0, 1])) > 0:
	                raise ValueError("Expected only 0s and 1s in label indicator.")
	            if np.any(yt.data == 0):
	                # Labels are read from the stored column indices, so an
	                # explicitly stored zero would be reported as a present
	                # label. Drop such entries on a copy so the caller's matrix
	                # is left untouched.
	                yt = yt.copy()
	                yt.eliminate_zeros()
	            return [
	                tuple(self.classes_.take(yt.indices[start:end]))
	                for start, end in zip(yt.indptr[:-1], yt.indptr[1:])
	            ]
	        else:
	            unexpected = np.setdiff1d(yt, [0, 1])
	            if len(unexpected) > 0:
	                raise ValueError(
	                    "Expected only 0s and 1s in label indicator. Also got {0}".format(
	                        unexpected
	                    )
	                )
	            return [tuple(self.classes_.compress(indicators)) for indicators in yt]

	    def __sklearn_tags__(self):
	        tags = super().__sklearn_tags__()
	        tags.input_tags.two_d_array = False
	        tags.target_tags.two_d_labels = True
	        return tags