|
"""K-means clustering.""" |
|
|
|
|
|
|
|
|
|
import warnings |
|
from abc import ABC, abstractmethod |
|
from numbers import Integral, Real |
|
|
|
import numpy as np |
|
import scipy.sparse as sp |
|
|
|
from ..base import ( |
|
BaseEstimator, |
|
ClassNamePrefixFeaturesOutMixin, |
|
ClusterMixin, |
|
TransformerMixin, |
|
_fit_context, |
|
) |
|
from ..exceptions import ConvergenceWarning |
|
from ..metrics.pairwise import _euclidean_distances, euclidean_distances |
|
from ..utils import check_array, check_random_state |
|
from ..utils._openmp_helpers import _openmp_effective_n_threads |
|
from ..utils._param_validation import Interval, StrOptions, validate_params |
|
from ..utils.extmath import row_norms, stable_cumsum |
|
from ..utils.parallel import ( |
|
_get_threadpool_controller, |
|
_threadpool_controller_decorator, |
|
) |
|
from ..utils.sparsefuncs import mean_variance_axis |
|
from ..utils.sparsefuncs_fast import assign_rows_csr |
|
from ..utils.validation import ( |
|
_check_sample_weight, |
|
_is_arraylike_not_scalar, |
|
check_is_fitted, |
|
validate_data, |
|
) |
|
from ._k_means_common import ( |
|
CHUNK_SIZE, |
|
_inertia_dense, |
|
_inertia_sparse, |
|
_is_same_clustering, |
|
) |
|
from ._k_means_elkan import ( |
|
elkan_iter_chunked_dense, |
|
elkan_iter_chunked_sparse, |
|
init_bounds_dense, |
|
init_bounds_sparse, |
|
) |
|
from ._k_means_lloyd import lloyd_iter_chunked_dense, lloyd_iter_chunked_sparse |
|
from ._k_means_minibatch import _minibatch_update_dense, _minibatch_update_sparse |
|
|
|
|
|
|
|
|
|
|
|
@validate_params( |
|
{ |
|
"X": ["array-like", "sparse matrix"], |
|
"n_clusters": [Interval(Integral, 1, None, closed="left")], |
|
"sample_weight": ["array-like", None], |
|
"x_squared_norms": ["array-like", None], |
|
"random_state": ["random_state"], |
|
"n_local_trials": [Interval(Integral, 1, None, closed="left"), None], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def kmeans_plusplus( |
|
X, |
|
n_clusters, |
|
*, |
|
sample_weight=None, |
|
x_squared_norms=None, |
|
random_state=None, |
|
n_local_trials=None, |
|
): |
|
"""Init n_clusters seeds according to k-means++. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_samples, n_features) |
|
The data to pick seeds from. |
|
|
|
n_clusters : int |
|
The number of centroids to initialize. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
        The weights for each observation in `X`. If `None`, all observations
        are assigned equal weight.
|
|
|
.. versionadded:: 1.3 |
|
|
|
x_squared_norms : array-like of shape (n_samples,), default=None |
|
Squared Euclidean norm of each data point. |
|
|
|
random_state : int or RandomState instance, default=None |
|
Determines random number generation for centroid initialization. Pass |
|
an int for reproducible output across multiple function calls. |
|
See :term:`Glossary <random_state>`. |
|
|
|
n_local_trials : int, default=None |
|
The number of seeding trials for each center (except the first), |
|
of which the one reducing inertia the most is greedily chosen. |
|
Set to None to make the number of trials depend logarithmically |
|
on the number of seeds (2+log(k)) which is the recommended setting. |
|
Setting to 1 disables the greedy cluster selection and recovers the |
|
vanilla k-means++ algorithm which was empirically shown to work less |
|
well than its greedy variant. |
|
|
|
Returns |
|
------- |
|
centers : ndarray of shape (n_clusters, n_features) |
|
The initial centers for k-means. |
|
|
|
indices : ndarray of shape (n_clusters,) |
|
The index location of the chosen centers in the data array X. For a |
|
given index and center, X[index] = center. |
|
|
|
Notes |
|
----- |
|
    Selects initial cluster centers for k-means clustering in a smart way
    to speed up convergence. See: Arthur, D. and Vassilvitskii, S.
    "k-means++: the advantages of careful seeding". ACM-SIAM symposium
    on Discrete algorithms. 2007.
|
|
|
Examples |
|
-------- |
|
|
|
>>> from sklearn.cluster import kmeans_plusplus |
|
>>> import numpy as np |
|
>>> X = np.array([[1, 2], [1, 4], [1, 0], |
|
... [10, 2], [10, 4], [10, 0]]) |
|
>>> centers, indices = kmeans_plusplus(X, n_clusters=2, random_state=0) |
|
>>> centers |
|
array([[10, 2], |
|
[ 1, 0]]) |
|
>>> indices |
|
array([3, 2]) |
|
""" |
|
|
|
check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) |
|
sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) |
|
|
|
if X.shape[0] < n_clusters: |
|
raise ValueError( |
|
f"n_samples={X.shape[0]} should be >= n_clusters={n_clusters}." |
|
) |
|
|
|
|
|
if x_squared_norms is None: |
|
x_squared_norms = row_norms(X, squared=True) |
|
else: |
|
x_squared_norms = check_array(x_squared_norms, dtype=X.dtype, ensure_2d=False) |
|
|
|
if x_squared_norms.shape[0] != X.shape[0]: |
|
raise ValueError( |
|
                f"The length of x_squared_norms {x_squared_norms.shape[0]} should "
                f"be equal to the number of samples {X.shape[0]}."
|
) |
|
|
|
random_state = check_random_state(random_state) |
|
|
|
|
|
centers, indices = _kmeans_plusplus( |
|
X, n_clusters, x_squared_norms, sample_weight, random_state, n_local_trials |
|
) |
|
|
|
return centers, indices |
|
|
|
|
|
def _kmeans_plusplus( |
|
X, n_clusters, x_squared_norms, sample_weight, random_state, n_local_trials=None |
|
): |
|
"""Computational component for initialization of n_clusters by |
|
k-means++. Prior validation of data is assumed. |
|
|
|
Parameters |
|
---------- |
|
X : {ndarray, sparse matrix} of shape (n_samples, n_features) |
|
The data to pick seeds for. |
|
|
|
n_clusters : int |
|
The number of seeds to choose. |
|
|
|
    x_squared_norms : ndarray of shape (n_samples,)
        Squared Euclidean norm of each data point.

    sample_weight : ndarray of shape (n_samples,)
        The weights for each observation in `X`.
|
|
|
random_state : RandomState instance |
|
The generator used to initialize the centers. |
|
See :term:`Glossary <random_state>`. |
|
|
|
n_local_trials : int, default=None |
|
The number of seeding trials for each center (except the first), |
|
of which the one reducing inertia the most is greedily chosen. |
|
Set to None to make the number of trials depend logarithmically |
|
on the number of seeds (2+log(k)); this is the default. |
|
|
|
Returns |
|
------- |
|
centers : ndarray of shape (n_clusters, n_features) |
|
The initial centers for k-means. |
|
|
|
indices : ndarray of shape (n_clusters,) |
|
The index location of the chosen centers in the data array X. For a |
|
given index and center, X[index] = center. |
|
""" |
|
n_samples, n_features = X.shape |
|
|
|
centers = np.empty((n_clusters, n_features), dtype=X.dtype) |
|
|
|
|
|
if n_local_trials is None: |
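        # Heuristic of the greedy k-means++ variant: the number of local
        # seeding trials grows logarithmically with the number of clusters.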
|
|
|
|
|
|
|
n_local_trials = 2 + int(np.log(n_clusters)) |
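
    # Pick the first center at random, with probability proportional to the
    # sample weights, and record its index.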
|
|
|
|
|
center_id = random_state.choice(n_samples, p=sample_weight / sample_weight.sum()) |
|
indices = np.full(n_clusters, -1, dtype=int) |
|
if sp.issparse(X): |
|
centers[0] = X[[center_id]].toarray() |
|
else: |
|
centers[0] = X[center_id] |
|
indices[0] = center_id |
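
    # Initialize the array of closest squared distances and the current
    # potential (weighted sum of squared distances to the closest center).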
|
|
|
|
|
closest_dist_sq = _euclidean_distances( |
|
centers[0, np.newaxis], X, Y_norm_squared=x_squared_norms, squared=True |
|
) |
|
current_pot = closest_dist_sq @ sample_weight |
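
    # Pick the remaining n_clusters - 1 centers.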
|
|
|
|
|
for c in range(1, n_clusters): |
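        # Sample candidate centers with probability proportional to the
        # weighted squared distance to the closest existing center.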
|
|
|
|
|
rand_vals = random_state.uniform(size=n_local_trials) * current_pot |
|
candidate_ids = np.searchsorted( |
|
stable_cumsum(sample_weight * closest_dist_sq), rand_vals |
|
) |
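
        # Numerical imprecision in the cumulative sum can yield an index equal
        # to the array size; clip the candidates to the valid range.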
|
|
|
np.clip(candidate_ids, None, closest_dist_sq.size - 1, out=candidate_ids) |
|
|
|
|
|
distance_to_candidates = _euclidean_distances( |
|
X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True |
|
) |
|
|
|
|
|
np.minimum(closest_dist_sq, distance_to_candidates, out=distance_to_candidates) |
|
candidates_pot = distance_to_candidates @ sample_weight.reshape(-1, 1) |
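
        # Choose the candidate that yields the lowest potential.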
|
|
|
|
|
best_candidate = np.argmin(candidates_pot) |
|
current_pot = candidates_pot[best_candidate] |
|
closest_dist_sq = distance_to_candidates[best_candidate] |
|
best_candidate = candidate_ids[best_candidate] |
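
        # Permanently add the best candidate found in the local trials.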
|
|
|
|
|
if sp.issparse(X): |
|
centers[c] = X[[best_candidate]].toarray() |
|
else: |
|
centers[c] = X[best_candidate] |
|
indices[c] = best_candidate |
|
|
|
return centers, indices |
|
|
|
|
|
|
|
|
|
|
|
|
|
def _tolerance(X, tol): |
|
"""Return a tolerance which is dependent on the dataset.""" |
|
if tol == 0: |
|
return 0 |
|
if sp.issparse(X): |
|
variances = mean_variance_axis(X, axis=0)[1] |
|
else: |
|
variances = np.var(X, axis=0) |
|
return np.mean(variances) * tol |
|
|
|
|
|
@validate_params( |
|
{ |
|
"X": ["array-like", "sparse matrix"], |
|
"sample_weight": ["array-like", None], |
|
"return_n_iter": [bool], |
|
}, |
|
prefer_skip_nested_validation=False, |
|
) |
|
def k_means( |
|
X, |
|
n_clusters, |
|
*, |
|
sample_weight=None, |
|
init="k-means++", |
|
n_init="auto", |
|
max_iter=300, |
|
verbose=False, |
|
tol=1e-4, |
|
random_state=None, |
|
copy_x=True, |
|
algorithm="lloyd", |
|
return_n_iter=False, |
|
): |
|
"""Perform K-means clustering algorithm. |
|
|
|
Read more in the :ref:`User Guide <k_means>`. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_samples, n_features) |
|
The observations to cluster. It must be noted that the data |
|
will be converted to C ordering, which will cause a memory copy |
|
if the given data is not C-contiguous. |
|
|
|
n_clusters : int |
|
The number of clusters to form as well as the number of |
|
centroids to generate. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
The weights for each observation in `X`. If `None`, all observations |
|
are assigned equal weight. `sample_weight` is not used during |
|
initialization if `init` is a callable or a user provided array. |
|
|
|
init : {'k-means++', 'random'}, callable or array-like of shape \ |
|
(n_clusters, n_features), default='k-means++' |
|
Method for initialization: |
|
|
|
    - `'k-means++'` : selects initial cluster centers for k-means
      clustering in a smart way to speed up convergence. See the Notes
      section of :func:`kmeans_plusplus` for more details.
|
- `'random'`: choose `n_clusters` observations (rows) at random from data |
|
for the initial centroids. |
|
- If an array is passed, it should be of shape `(n_clusters, n_features)` |
|
and gives the initial centers. |
|
- If a callable is passed, it should take arguments `X`, `n_clusters` and a |
|
random state and return an initialization. |
|
|
|
n_init : 'auto' or int, default="auto" |
|
        Number of times the k-means algorithm will be run with different
|
centroid seeds. The final results will be the best output of |
|
n_init consecutive runs in terms of inertia. |
|
|
|
When `n_init='auto'`, the number of runs depends on the value of init: |
|
10 if using `init='random'` or `init` is a callable; |
|
1 if using `init='k-means++'` or `init` is an array-like. |
|
|
|
.. versionadded:: 1.2 |
|
Added 'auto' option for `n_init`. |
|
|
|
.. versionchanged:: 1.4 |
|
Default value for `n_init` changed to `'auto'`. |
|
|
|
max_iter : int, default=300 |
|
Maximum number of iterations of the k-means algorithm to run. |
|
|
|
verbose : bool, default=False |
|
Verbosity mode. |
|
|
|
tol : float, default=1e-4 |
|
Relative tolerance with regards to Frobenius norm of the difference |
|
in the cluster centers of two consecutive iterations to declare |
|
convergence. |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
Determines random number generation for centroid initialization. Use |
|
an int to make the randomness deterministic. |
|
See :term:`Glossary <random_state>`. |
|
|
|
copy_x : bool, default=True |
|
When pre-computing distances it is more numerically accurate to center |
|
the data first. If `copy_x` is True (default), then the original data is |
|
not modified. If False, the original data is modified, and put back |
|
before the function returns, but small numerical differences may be |
|
introduced by subtracting and then adding the data mean. Note that if |
|
the original data is not C-contiguous, a copy will be made even if |
|
`copy_x` is False. If the original data is sparse, but not in CSR format, |
|
a copy will be made even if `copy_x` is False. |
|
|
|
algorithm : {"lloyd", "elkan"}, default="lloyd" |
|
K-means algorithm to use. The classical EM-style algorithm is `"lloyd"`. |
|
The `"elkan"` variation can be more efficient on some datasets with |
|
well-defined clusters, by using the triangle inequality. However it's |
|
more memory intensive due to the allocation of an extra array of shape |
|
`(n_samples, n_clusters)`. |
|
|
|
.. versionchanged:: 0.18 |
|
Added Elkan algorithm |
|
|
|
.. versionchanged:: 1.1 |
|
Renamed "full" to "lloyd", and deprecated "auto" and "full". |
|
Changed "auto" to use "lloyd" instead of "elkan". |
|
|
|
return_n_iter : bool, default=False |
|
Whether or not to return the number of iterations. |
|
|
|
Returns |
|
------- |
|
centroid : ndarray of shape (n_clusters, n_features) |
|
Centroids found at the last iteration of k-means. |
|
|
|
label : ndarray of shape (n_samples,) |
|
The `label[i]` is the code or index of the centroid the |
|
i'th observation is closest to. |
|
|
|
inertia : float |
|
The final value of the inertia criterion (sum of squared distances to |
|
the closest centroid for all observations in the training set). |
|
|
|
best_n_iter : int |
|
Number of iterations corresponding to the best results. |
|
Returned only if `return_n_iter` is set to True. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.cluster import k_means |
|
>>> X = np.array([[1, 2], [1, 4], [1, 0], |
|
... [10, 2], [10, 4], [10, 0]]) |
|
>>> centroid, label, inertia = k_means( |
|
... X, n_clusters=2, n_init="auto", random_state=0 |
|
... ) |
|
>>> centroid |
|
array([[10., 2.], |
|
[ 1., 2.]]) |
|
>>> label |
|
array([1, 1, 1, 0, 0, 0], dtype=int32) |
|
>>> inertia |
|
16.0 |
|
""" |
|
est = KMeans( |
|
n_clusters=n_clusters, |
|
init=init, |
|
n_init=n_init, |
|
max_iter=max_iter, |
|
verbose=verbose, |
|
tol=tol, |
|
random_state=random_state, |
|
copy_x=copy_x, |
|
algorithm=algorithm, |
|
).fit(X, sample_weight=sample_weight) |
|
if return_n_iter: |
|
return est.cluster_centers_, est.labels_, est.inertia_, est.n_iter_ |
|
else: |
|
return est.cluster_centers_, est.labels_, est.inertia_ |
|
|
|
|
|
def _kmeans_single_elkan( |
|
X, |
|
sample_weight, |
|
centers_init, |
|
max_iter=300, |
|
verbose=False, |
|
tol=1e-4, |
|
n_threads=1, |
|
): |
|
"""A single run of k-means elkan, assumes preparation completed prior. |
|
|
|
Parameters |
|
---------- |
|
X : {ndarray, sparse matrix} of shape (n_samples, n_features) |
|
The observations to cluster. If sparse matrix, must be in CSR format. |
|
|
|
sample_weight : array-like of shape (n_samples,) |
|
The weights for each observation in X. |
|
|
|
centers_init : ndarray of shape (n_clusters, n_features) |
|
The initial centers. |
|
|
|
max_iter : int, default=300 |
|
Maximum number of iterations of the k-means algorithm to run. |
|
|
|
verbose : bool, default=False |
|
Verbosity mode. |
|
|
|
tol : float, default=1e-4 |
|
Relative tolerance with regards to Frobenius norm of the difference |
|
in the cluster centers of two consecutive iterations to declare |
|
convergence. |
|
It's not advised to set `tol=0` since convergence might never be |
|
declared due to rounding errors. Use a very small number instead. |
|
|
|
n_threads : int, default=1 |
|
The number of OpenMP threads to use for the computation. Parallelism is |
|
sample-wise on the main cython loop which assigns each sample to its |
|
closest center. |
|
|
|
Returns |
|
------- |
|
    labels : ndarray of shape (n_samples,)
        labels[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia : float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).

    centers : ndarray of shape (n_clusters, n_features)
        Centroids found at the last iteration of k-means.

    n_iter : int
        Number of iterations run.
|
""" |
|
n_samples = X.shape[0] |
|
n_clusters = centers_init.shape[0] |
|
|
|
|
|
centers = centers_init |
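    # Buffers reused across iterations to avoid repeated allocations.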
|
centers_new = np.zeros_like(centers) |
|
weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype) |
|
labels = np.full(n_samples, -1, dtype=np.int32) |
|
labels_old = labels.copy() |
|
center_half_distances = euclidean_distances(centers) / 2 |
|
distance_next_center = np.partition( |
|
np.asarray(center_half_distances), kth=1, axis=0 |
|
)[1] |
|
upper_bounds = np.zeros(n_samples, dtype=X.dtype) |
|
lower_bounds = np.zeros((n_samples, n_clusters), dtype=X.dtype) |
|
center_shift = np.zeros(n_clusters, dtype=X.dtype) |
|
|
|
if sp.issparse(X): |
|
init_bounds = init_bounds_sparse |
|
elkan_iter = elkan_iter_chunked_sparse |
|
_inertia = _inertia_sparse |
|
else: |
|
init_bounds = init_bounds_dense |
|
elkan_iter = elkan_iter_chunked_dense |
|
_inertia = _inertia_dense |
|
|
|
init_bounds( |
|
X, |
|
centers, |
|
center_half_distances, |
|
labels, |
|
upper_bounds, |
|
lower_bounds, |
|
n_threads=n_threads, |
|
) |
|
|
|
strict_convergence = False |
|
|
|
for i in range(max_iter): |
|
elkan_iter( |
|
X, |
|
sample_weight, |
|
centers, |
|
centers_new, |
|
weight_in_clusters, |
|
center_half_distances, |
|
distance_next_center, |
|
upper_bounds, |
|
lower_bounds, |
|
labels, |
|
center_shift, |
|
n_threads, |
|
) |
|
|
|
|
|
|
|
center_half_distances = euclidean_distances(centers_new) / 2 |
|
distance_next_center = np.partition( |
|
np.asarray(center_half_distances), kth=1, axis=0 |
|
)[1] |
|
|
|
if verbose: |
|
inertia = _inertia(X, sample_weight, centers, labels, n_threads) |
|
print(f"Iteration {i}, inertia {inertia}") |
|
|
|
centers, centers_new = centers_new, centers |
|
|
|
if np.array_equal(labels, labels_old): |
|
|
|
if verbose: |
|
print(f"Converged at iteration {i}: strict convergence.") |
|
strict_convergence = True |
|
break |
|
else: |
|
|
|
center_shift_tot = (center_shift**2).sum() |
|
if center_shift_tot <= tol: |
|
if verbose: |
|
print( |
|
f"Converged at iteration {i}: center shift " |
|
f"{center_shift_tot} within tolerance {tol}." |
|
) |
|
break |
|
|
|
labels_old[:] = labels |
|
|
|
if not strict_convergence: |
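        # Rerun the assignment step (without updating the centers) so that the
        # returned labels are consistent with the final centers.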
|
|
|
elkan_iter( |
|
X, |
|
sample_weight, |
|
centers, |
|
centers, |
|
weight_in_clusters, |
|
center_half_distances, |
|
distance_next_center, |
|
upper_bounds, |
|
lower_bounds, |
|
labels, |
|
center_shift, |
|
n_threads, |
|
update_centers=False, |
|
) |
|
|
|
inertia = _inertia(X, sample_weight, centers, labels, n_threads) |
|
|
|
return labels, inertia, centers, i + 1 |
|
|
|
|
|
|
|
|
|
@_threadpool_controller_decorator(limits=1, user_api="blas") |
|
def _kmeans_single_lloyd( |
|
X, |
|
sample_weight, |
|
centers_init, |
|
max_iter=300, |
|
verbose=False, |
|
tol=1e-4, |
|
n_threads=1, |
|
): |
|
"""A single run of k-means lloyd, assumes preparation completed prior. |
|
|
|
Parameters |
|
---------- |
|
X : {ndarray, sparse matrix} of shape (n_samples, n_features) |
|
The observations to cluster. If sparse matrix, must be in CSR format. |
|
|
|
sample_weight : ndarray of shape (n_samples,) |
|
The weights for each observation in X. |
|
|
|
centers_init : ndarray of shape (n_clusters, n_features) |
|
The initial centers. |
|
|
|
max_iter : int, default=300 |
|
Maximum number of iterations of the k-means algorithm to run. |
|
|
|
verbose : bool, default=False |
|
Verbosity mode |
|
|
|
tol : float, default=1e-4 |
|
Relative tolerance with regards to Frobenius norm of the difference |
|
in the cluster centers of two consecutive iterations to declare |
|
convergence. |
|
It's not advised to set `tol=0` since convergence might never be |
|
declared due to rounding errors. Use a very small number instead. |
|
|
|
n_threads : int, default=1 |
|
The number of OpenMP threads to use for the computation. Parallelism is |
|
sample-wise on the main cython loop which assigns each sample to its |
|
closest center. |
|
|
|
Returns |
|
------- |
|
    labels : ndarray of shape (n_samples,)
        labels[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia : float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).

    centers : ndarray of shape (n_clusters, n_features)
        Centroids found at the last iteration of k-means.

    n_iter : int
        Number of iterations run.
|
""" |
|
n_clusters = centers_init.shape[0] |
|
|
|
|
|
centers = centers_init |
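    # Buffers reused across iterations to avoid repeated allocations.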
|
centers_new = np.zeros_like(centers) |
|
labels = np.full(X.shape[0], -1, dtype=np.int32) |
|
labels_old = labels.copy() |
|
weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype) |
|
center_shift = np.zeros(n_clusters, dtype=X.dtype) |
|
|
|
if sp.issparse(X): |
|
lloyd_iter = lloyd_iter_chunked_sparse |
|
_inertia = _inertia_sparse |
|
else: |
|
lloyd_iter = lloyd_iter_chunked_dense |
|
_inertia = _inertia_dense |
|
|
|
strict_convergence = False |
|
|
|
for i in range(max_iter): |
|
lloyd_iter( |
|
X, |
|
sample_weight, |
|
centers, |
|
centers_new, |
|
weight_in_clusters, |
|
labels, |
|
center_shift, |
|
n_threads, |
|
) |
|
|
|
if verbose: |
|
inertia = _inertia(X, sample_weight, centers, labels, n_threads) |
|
print(f"Iteration {i}, inertia {inertia}.") |
|
|
|
centers, centers_new = centers_new, centers |
|
|
|
if np.array_equal(labels, labels_old): |
|
|
|
if verbose: |
|
print(f"Converged at iteration {i}: strict convergence.") |
|
strict_convergence = True |
|
break |
|
else: |
|
|
|
center_shift_tot = (center_shift**2).sum() |
|
if center_shift_tot <= tol: |
|
if verbose: |
|
print( |
|
f"Converged at iteration {i}: center shift " |
|
f"{center_shift_tot} within tolerance {tol}." |
|
) |
|
break |
|
|
|
labels_old[:] = labels |
|
|
|
if not strict_convergence: |
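        # Rerun the assignment step (without updating the centers) so that the
        # returned labels are consistent with the final centers.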
|
|
|
lloyd_iter( |
|
X, |
|
sample_weight, |
|
centers, |
|
centers, |
|
weight_in_clusters, |
|
labels, |
|
center_shift, |
|
n_threads, |
|
update_centers=False, |
|
) |
|
|
|
inertia = _inertia(X, sample_weight, centers, labels, n_threads) |
|
|
|
return labels, inertia, centers, i + 1 |
|
|
|
|
|
def _labels_inertia(X, sample_weight, centers, n_threads=1, return_inertia=True): |
|
"""E step of the K-means EM algorithm. |
|
|
|
Compute the labels and the inertia of the given samples and centers. |
|
|
|
Parameters |
|
---------- |
|
X : {ndarray, sparse matrix} of shape (n_samples, n_features) |
|
The input samples to assign to the labels. If sparse matrix, must |
|
be in CSR format. |
|
|
|
sample_weight : ndarray of shape (n_samples,) |
|
The weights for each observation in X. |
|
|
|
|
|
|
centers : ndarray of shape (n_clusters, n_features) |
|
The cluster centers. |
|
|
|
n_threads : int, default=1 |
|
The number of OpenMP threads to use for the computation. Parallelism is |
|
sample-wise on the main cython loop which assigns each sample to its |
|
closest center. |
|
|
|
return_inertia : bool, default=True |
|
Whether to compute and return the inertia. |
|
|
|
Returns |
|
------- |
|
labels : ndarray of shape (n_samples,) |
|
The resulting assignment. |
|
|
|
inertia : float |
|
Sum of squared distances of samples to their closest cluster center. |
|
Inertia is only returned if return_inertia is True. |
|
""" |
|
n_samples = X.shape[0] |
|
n_clusters = centers.shape[0] |
|
|
|
labels = np.full(n_samples, -1, dtype=np.int32) |
|
center_shift = np.zeros(n_clusters, dtype=centers.dtype) |
|
|
|
if sp.issparse(X): |
|
_labels = lloyd_iter_chunked_sparse |
|
_inertia = _inertia_sparse |
|
else: |
|
_labels = lloyd_iter_chunked_dense |
|
_inertia = _inertia_dense |
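
    # Run a single Lloyd assignment pass with `update_centers=False` so that
    # only the labels are computed.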
|
|
|
_labels( |
|
X, |
|
sample_weight, |
|
centers, |
|
centers_new=None, |
|
weight_in_clusters=None, |
|
labels=labels, |
|
center_shift=center_shift, |
|
n_threads=n_threads, |
|
update_centers=False, |
|
) |
|
|
|
if return_inertia: |
|
inertia = _inertia(X, sample_weight, centers, labels, n_threads) |
|
return labels, inertia |
|
|
|
return labels |
|
|
|
|
|
|
|
_labels_inertia_threadpool_limit = _threadpool_controller_decorator( |
|
limits=1, user_api="blas" |
|
)(_labels_inertia) |
|
|
|
|
|
class _BaseKMeans( |
|
ClassNamePrefixFeaturesOutMixin, TransformerMixin, ClusterMixin, BaseEstimator, ABC |
|
): |
|
"""Base class for KMeans and MiniBatchKMeans""" |
|
|
|
_parameter_constraints: dict = { |
|
"n_clusters": [Interval(Integral, 1, None, closed="left")], |
|
"init": [StrOptions({"k-means++", "random"}), callable, "array-like"], |
|
"n_init": [ |
|
StrOptions({"auto"}), |
|
Interval(Integral, 1, None, closed="left"), |
|
], |
|
"max_iter": [Interval(Integral, 1, None, closed="left")], |
|
"tol": [Interval(Real, 0, None, closed="left")], |
|
"verbose": ["verbose"], |
|
"random_state": ["random_state"], |
|
} |
|
|
|
def __init__( |
|
self, |
|
n_clusters, |
|
*, |
|
init, |
|
n_init, |
|
max_iter, |
|
tol, |
|
verbose, |
|
random_state, |
|
): |
|
self.n_clusters = n_clusters |
|
self.init = init |
|
self.max_iter = max_iter |
|
self.tol = tol |
|
self.n_init = n_init |
|
self.verbose = verbose |
|
self.random_state = random_state |
|
|
|
def _check_params_vs_input(self, X, default_n_init=None): |
|
|
|
if X.shape[0] < self.n_clusters: |
|
raise ValueError( |
|
f"n_samples={X.shape[0]} should be >= n_clusters={self.n_clusters}." |
|
) |
|
|
|
|
|
self._tol = _tolerance(X, self.tol) |
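
        # Resolve n_init='auto' depending on the initialization strategy.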
|
|
|
|
|
if self.n_init == "auto": |
|
if isinstance(self.init, str) and self.init == "k-means++": |
|
self._n_init = 1 |
|
elif isinstance(self.init, str) and self.init == "random": |
|
self._n_init = default_n_init |
|
elif callable(self.init): |
|
self._n_init = default_n_init |
|
else: |
|
self._n_init = 1 |
|
else: |
|
self._n_init = self.n_init |
|
|
|
if _is_arraylike_not_scalar(self.init) and self._n_init != 1: |
|
warnings.warn( |
|
( |
|
"Explicit initial center position passed: performing only" |
|
f" one init in {self.__class__.__name__} instead of " |
|
f"n_init={self._n_init}." |
|
), |
|
RuntimeWarning, |
|
stacklevel=2, |
|
) |
|
self._n_init = 1 |
|
|
|
@abstractmethod |
|
def _warn_mkl_vcomp(self, n_active_threads): |
|
"""Issue an estimator specific warning when vcomp and mkl are both present |
|
|
|
This method is called by `_check_mkl_vcomp`. |
|
""" |
|
|
|
def _check_mkl_vcomp(self, X, n_samples): |
|
"""Check when vcomp and mkl are both present""" |
|
|
|
|
|
|
|
|
|
if sp.issparse(X): |
|
return |
|
|
|
n_active_threads = int(np.ceil(n_samples / CHUNK_SIZE)) |
|
if n_active_threads < self._n_threads: |
|
modules = _get_threadpool_controller().info() |
|
has_vcomp = "vcomp" in [module["prefix"] for module in modules] |
|
has_mkl = ("mkl", "intel") in [ |
|
(module["internal_api"], module.get("threading_layer", None)) |
|
for module in modules |
|
] |
|
if has_vcomp and has_mkl: |
|
self._warn_mkl_vcomp(n_active_threads) |
|
|
|
def _validate_center_shape(self, X, centers): |
|
"""Check if centers is compatible with X and n_clusters.""" |
|
if centers.shape[0] != self.n_clusters: |
|
raise ValueError( |
|
f"The shape of the initial centers {centers.shape} does not " |
|
f"match the number of clusters {self.n_clusters}." |
|
) |
|
if centers.shape[1] != X.shape[1]: |
|
raise ValueError( |
|
f"The shape of the initial centers {centers.shape} does not " |
|
f"match the number of features of the data {X.shape[1]}." |
|
) |
|
|
|
def _check_test_data(self, X): |
|
X = validate_data( |
|
self, |
|
X, |
|
accept_sparse="csr", |
|
reset=False, |
|
dtype=[np.float64, np.float32], |
|
order="C", |
|
accept_large_sparse=False, |
|
) |
|
return X |
|
|
|
def _init_centroids( |
|
self, |
|
X, |
|
x_squared_norms, |
|
init, |
|
random_state, |
|
sample_weight, |
|
init_size=None, |
|
n_centroids=None, |
|
): |
|
"""Compute the initial centroids. |
|
|
|
Parameters |
|
---------- |
|
X : {ndarray, sparse matrix} of shape (n_samples, n_features) |
|
The input samples. |
|
|
|
x_squared_norms : ndarray of shape (n_samples,) |
|
            Squared Euclidean norm of each data point. Pass it if you have it
            at hand already to avoid it being recomputed here.
|
|
|
init : {'k-means++', 'random'}, callable or ndarray of shape \ |
|
(n_clusters, n_features) |
|
Method for initialization. |
|
|
|
random_state : RandomState instance |
|
Determines random number generation for centroid initialization. |
|
See :term:`Glossary <random_state>`. |
|
|
|
sample_weight : ndarray of shape (n_samples,) |
|
The weights for each observation in X. `sample_weight` is not used |
|
during initialization if `init` is a callable or a user provided |
|
array. |
|
|
|
init_size : int, default=None |
|
Number of samples to randomly sample for speeding up the |
|
initialization (sometimes at the expense of accuracy). |
|
|
|
n_centroids : int, default=None |
|
Number of centroids to initialize. |
|
If left to 'None' the number of centroids will be equal to |
|
number of clusters to form (self.n_clusters). |
|
|
|
Returns |
|
------- |
|
centers : ndarray of shape (n_clusters, n_features) |
|
Initial centroids of clusters. |
|
""" |
|
n_samples = X.shape[0] |
|
n_clusters = self.n_clusters if n_centroids is None else n_centroids |
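
        # Optionally subsample the data (and associated arrays) to speed up
        # the initialization.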
|
|
|
if init_size is not None and init_size < n_samples: |
|
init_indices = random_state.randint(0, n_samples, init_size) |
|
X = X[init_indices] |
|
x_squared_norms = x_squared_norms[init_indices] |
|
n_samples = X.shape[0] |
|
sample_weight = sample_weight[init_indices] |
|
|
|
if isinstance(init, str) and init == "k-means++": |
|
centers, _ = _kmeans_plusplus( |
|
X, |
|
n_clusters, |
|
random_state=random_state, |
|
x_squared_norms=x_squared_norms, |
|
sample_weight=sample_weight, |
|
) |
|
elif isinstance(init, str) and init == "random": |
|
seeds = random_state.choice( |
|
n_samples, |
|
size=n_clusters, |
|
replace=False, |
|
p=sample_weight / sample_weight.sum(), |
|
) |
|
centers = X[seeds] |
|
elif _is_arraylike_not_scalar(self.init): |
|
centers = init |
|
elif callable(init): |
|
centers = init(X, n_clusters, random_state=random_state) |
|
centers = check_array(centers, dtype=X.dtype, copy=False, order="C") |
|
self._validate_center_shape(X, centers) |
|
|
|
if sp.issparse(centers): |
|
centers = centers.toarray() |
|
|
|
return centers |
|
|
|
def fit_predict(self, X, y=None, sample_weight=None): |
|
"""Compute cluster centers and predict cluster index for each sample. |
|
|
|
Convenience method; equivalent to calling fit(X) followed by |
|
predict(X). |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_samples, n_features) |
|
New data to transform. |
|
|
|
y : Ignored |
|
Not used, present here for API consistency by convention. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
The weights for each observation in X. If None, all observations |
|
are assigned equal weight. |
|
|
|
Returns |
|
------- |
|
labels : ndarray of shape (n_samples,) |
|
Index of the cluster each sample belongs to. |
|
""" |
|
return self.fit(X, sample_weight=sample_weight).labels_ |
|
|
|
def predict(self, X): |
|
"""Predict the closest cluster each sample in X belongs to. |
|
|
|
In the vector quantization literature, `cluster_centers_` is called |
|
the code book and each value returned by `predict` is the index of |
|
the closest code in the code book. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_samples, n_features) |
|
New data to predict. |
|
|
|
Returns |
|
------- |
|
labels : ndarray of shape (n_samples,) |
|
Index of the cluster each sample belongs to. |
|
""" |
|
check_is_fitted(self) |
|
|
|
X = self._check_test_data(X) |
|
|
|
|
|
sample_weight = np.ones(X.shape[0], dtype=X.dtype) |
|
|
|
labels = _labels_inertia_threadpool_limit( |
|
X, |
|
sample_weight, |
|
self.cluster_centers_, |
|
n_threads=self._n_threads, |
|
return_inertia=False, |
|
) |
|
|
|
return labels |
|
|
|
def fit_transform(self, X, y=None, sample_weight=None): |
|
"""Compute clustering and transform X to cluster-distance space. |
|
|
|
Equivalent to fit(X).transform(X), but more efficiently implemented. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_samples, n_features) |
|
New data to transform. |
|
|
|
y : Ignored |
|
Not used, present here for API consistency by convention. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
The weights for each observation in X. If None, all observations |
|
are assigned equal weight. |
|
|
|
Returns |
|
------- |
|
X_new : ndarray of shape (n_samples, n_clusters) |
|
X transformed in the new space. |
|
""" |
|
return self.fit(X, sample_weight=sample_weight)._transform(X) |
|
|
|
def transform(self, X): |
|
"""Transform X to a cluster-distance space. |
|
|
|
In the new space, each dimension is the distance to the cluster |
|
centers. Note that even if X is sparse, the array returned by |
|
`transform` will typically be dense. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_samples, n_features) |
|
New data to transform. |
|
|
|
Returns |
|
------- |
|
X_new : ndarray of shape (n_samples, n_clusters) |
|
X transformed in the new space. |
|
""" |
|
check_is_fitted(self) |
|
|
|
X = self._check_test_data(X) |
|
return self._transform(X) |
|
|
|
def _transform(self, X): |
|
"""Guts of transform method; no input validation.""" |
|
return euclidean_distances(X, self.cluster_centers_) |
|
|
|
def score(self, X, y=None, sample_weight=None): |
|
"""Opposite of the value of X on the K-means objective. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_samples, n_features) |
|
New data. |
|
|
|
y : Ignored |
|
Not used, present here for API consistency by convention. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
The weights for each observation in X. If None, all observations |
|
are assigned equal weight. |
|
|
|
Returns |
|
------- |
|
score : float |
|
Opposite of the value of X on the K-means objective. |
|
""" |
|
check_is_fitted(self) |
|
|
|
X = self._check_test_data(X) |
|
sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) |
|
|
|
_, scores = _labels_inertia_threadpool_limit( |
|
X, sample_weight, self.cluster_centers_, self._n_threads |
|
) |
|
return -scores |
|
|
|
def __sklearn_tags__(self): |
|
tags = super().__sklearn_tags__() |
|
tags.input_tags.sparse = True |
|
return tags |
|
|
|
|
|
class KMeans(_BaseKMeans): |
|
"""K-Means clustering. |
|
|
|
Read more in the :ref:`User Guide <k_means>`. |
|
|
|
Parameters |
|
---------- |
|
|
|
n_clusters : int, default=8 |
|
The number of clusters to form as well as the number of |
|
centroids to generate. |
|
|
|
For an example of how to choose an optimal value for `n_clusters` refer to |
|
:ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`. |
|
|
|
init : {'k-means++', 'random'}, callable or array-like of shape \ |
|
(n_clusters, n_features), default='k-means++' |
|
Method for initialization: |
|
|
|
* 'k-means++' : selects initial cluster centroids using sampling \ |
|
based on an empirical probability distribution of the points' \ |
|
contribution to the overall inertia. This technique speeds up \ |
|
convergence. The algorithm implemented is "greedy k-means++". It \ |
|
differs from the vanilla k-means++ by making several trials at \ |
|
each sampling step and choosing the best centroid among them. |
|
|
|
* 'random': choose `n_clusters` observations (rows) at random from \ |
|
data for the initial centroids. |
|
|
|
* If an array is passed, it should be of shape (n_clusters, n_features)\ |
|
and gives the initial centers. |
|
|
|
* If a callable is passed, it should take arguments X, n_clusters and a\ |
|
random state and return an initialization. |
|
|
|
For an example of how to use the different `init` strategies, see |
|
:ref:`sphx_glr_auto_examples_cluster_plot_kmeans_digits.py`. |
|
|
|
For an evaluation of the impact of initialization, see the example |
|
:ref:`sphx_glr_auto_examples_cluster_plot_kmeans_stability_low_dim_dense.py`. |
|
|
|
n_init : 'auto' or int, default='auto' |
|
Number of times the k-means algorithm is run with different centroid |
|
        seeds. The final result is the best output of `n_init` consecutive runs
|
in terms of inertia. Several runs are recommended for sparse |
|
high-dimensional problems (see :ref:`kmeans_sparse_high_dim`). |
|
|
|
When `n_init='auto'`, the number of runs depends on the value of init: |
|
10 if using `init='random'` or `init` is a callable; |
|
1 if using `init='k-means++'` or `init` is an array-like. |
|
|
|
.. versionadded:: 1.2 |
|
Added 'auto' option for `n_init`. |
|
|
|
.. versionchanged:: 1.4 |
|
Default value for `n_init` changed to `'auto'`. |
|
|
|
max_iter : int, default=300 |
|
Maximum number of iterations of the k-means algorithm for a |
|
single run. |
|
|
|
tol : float, default=1e-4 |
|
Relative tolerance with regards to Frobenius norm of the difference |
|
in the cluster centers of two consecutive iterations to declare |
|
convergence. |
|
|
|
verbose : int, default=0 |
|
Verbosity mode. |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
Determines random number generation for centroid initialization. Use |
|
an int to make the randomness deterministic. |
|
See :term:`Glossary <random_state>`. |
|
|
|
copy_x : bool, default=True |
|
When pre-computing distances it is more numerically accurate to center |
|
the data first. If copy_x is True (default), then the original data is |
|
not modified. If False, the original data is modified, and put back |
|
before the function returns, but small numerical differences may be |
|
introduced by subtracting and then adding the data mean. Note that if |
|
the original data is not C-contiguous, a copy will be made even if |
|
copy_x is False. If the original data is sparse, but not in CSR format, |
|
a copy will be made even if copy_x is False. |
|
|
|
algorithm : {"lloyd", "elkan"}, default="lloyd" |
|
K-means algorithm to use. The classical EM-style algorithm is `"lloyd"`. |
|
The `"elkan"` variation can be more efficient on some datasets with |
|
well-defined clusters, by using the triangle inequality. However it's |
|
more memory intensive due to the allocation of an extra array of shape |
|
`(n_samples, n_clusters)`. |
|
|
|
.. versionchanged:: 0.18 |
|
Added Elkan algorithm |
|
|
|
.. versionchanged:: 1.1 |
|
Renamed "full" to "lloyd", and deprecated "auto" and "full". |
|
Changed "auto" to use "lloyd" instead of "elkan". |
|
|
|
Attributes |
|
---------- |
|
cluster_centers_ : ndarray of shape (n_clusters, n_features) |
|
Coordinates of cluster centers. If the algorithm stops before fully |
|
converging (see ``tol`` and ``max_iter``), these will not be |
|
consistent with ``labels_``. |
|
|
|
labels_ : ndarray of shape (n_samples,) |
|
Labels of each point |
|
|
|
inertia_ : float |
|
Sum of squared distances of samples to their closest cluster center, |
|
weighted by the sample weights if provided. |
|
|
|
n_iter_ : int |
|
Number of iterations run. |
|
|
|
n_features_in_ : int |
|
Number of features seen during :term:`fit`. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
feature_names_in_ : ndarray of shape (`n_features_in_`,) |
|
Names of features seen during :term:`fit`. Defined only when `X` |
|
has feature names that are all strings. |
|
|
|
.. versionadded:: 1.0 |
|
|
|
See Also |
|
-------- |
|
MiniBatchKMeans : Alternative online implementation that does incremental |
|
updates of the centers positions using mini-batches. |
|
For large scale learning (say n_samples > 10k) MiniBatchKMeans is |
|
probably much faster than the default batch implementation. |
|
|
|
Notes |
|
----- |
|
The k-means problem is solved using either Lloyd's or Elkan's algorithm. |
|
|
|
    The average complexity is given by O(k n T), where k is the number of
    clusters, n is the number of samples and T is the number of iterations.
|
|
|
The worst case complexity is given by O(n^(k+2/p)) with |
|
n = n_samples, p = n_features. |
|
Refer to :doi:`"How slow is the k-means method?" D. Arthur and S. Vassilvitskii - |
|
SoCG2006.<10.1145/1137856.1137880>` for more details. |
|
|
|
In practice, the k-means algorithm is very fast (one of the fastest |
|
    clustering algorithms available), but it falls into local minima. That's why
|
it can be useful to restart it several times. |
|
|
|
If the algorithm stops before fully converging (because of ``tol`` or |
|
``max_iter``), ``labels_`` and ``cluster_centers_`` will not be consistent, |
|
i.e. the ``cluster_centers_`` will not be the means of the points in each |
|
cluster. Also, the estimator will reassign ``labels_`` after the last |
|
iteration to make ``labels_`` consistent with ``predict`` on the training |
|
set. |
|
|
|
Examples |
|
-------- |
|
|
|
>>> from sklearn.cluster import KMeans |
|
>>> import numpy as np |
|
>>> X = np.array([[1, 2], [1, 4], [1, 0], |
|
... [10, 2], [10, 4], [10, 0]]) |
|
>>> kmeans = KMeans(n_clusters=2, random_state=0, n_init="auto").fit(X) |
|
>>> kmeans.labels_ |
|
array([1, 1, 1, 0, 0, 0], dtype=int32) |
|
>>> kmeans.predict([[0, 0], [12, 3]]) |
|
array([1, 0], dtype=int32) |
|
>>> kmeans.cluster_centers_ |
|
array([[10., 2.], |
|
[ 1., 2.]]) |
|
|
|
For examples of common problems with K-Means and how to address them see |
|
:ref:`sphx_glr_auto_examples_cluster_plot_kmeans_assumptions.py`. |
|
|
|
For a demonstration of how K-Means can be used to cluster text documents see |
|
:ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`. |
|
|
|
For a comparison between K-Means and MiniBatchKMeans refer to example |
|
:ref:`sphx_glr_auto_examples_cluster_plot_mini_batch_kmeans.py`. |
|
|
|
For a comparison between K-Means and BisectingKMeans refer to example |
|
:ref:`sphx_glr_auto_examples_cluster_plot_bisect_kmeans.py`. |
|
""" |
|
|
|
_parameter_constraints: dict = { |
|
**_BaseKMeans._parameter_constraints, |
|
"copy_x": ["boolean"], |
|
"algorithm": [StrOptions({"lloyd", "elkan"})], |
|
} |
|
|
|
def __init__( |
|
self, |
|
n_clusters=8, |
|
*, |
|
init="k-means++", |
|
n_init="auto", |
|
max_iter=300, |
|
tol=1e-4, |
|
verbose=0, |
|
random_state=None, |
|
copy_x=True, |
|
algorithm="lloyd", |
|
): |
|
super().__init__( |
|
n_clusters=n_clusters, |
|
init=init, |
|
n_init=n_init, |
|
max_iter=max_iter, |
|
tol=tol, |
|
verbose=verbose, |
|
random_state=random_state, |
|
) |
|
|
|
self.copy_x = copy_x |
|
self.algorithm = algorithm |
|
|
|
def _check_params_vs_input(self, X): |
|
super()._check_params_vs_input(X, default_n_init=10) |
|
|
|
self._algorithm = self.algorithm |
|
if self._algorithm == "elkan" and self.n_clusters == 1: |
|
warnings.warn( |
|
( |
|
"algorithm='elkan' doesn't make sense for a single " |
|
"cluster. Using 'lloyd' instead." |
|
), |
|
RuntimeWarning, |
|
) |
|
self._algorithm = "lloyd" |
|
|
|
def _warn_mkl_vcomp(self, n_active_threads): |
|
"""Warn when vcomp and mkl are both present""" |
|
warnings.warn( |
|
"KMeans is known to have a memory leak on Windows " |
|
"with MKL, when there are less chunks than available " |
|
"threads. You can avoid it by setting the environment" |
|
f" variable OMP_NUM_THREADS={n_active_threads}." |
|
) |
|
|
|
@_fit_context(prefer_skip_nested_validation=True) |
|
def fit(self, X, y=None, sample_weight=None): |
|
"""Compute k-means clustering. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_samples, n_features) |
|
Training instances to cluster. It must be noted that the data |
|
will be converted to C ordering, which will cause a memory |
|
copy if the given data is not C-contiguous. |
|
If a sparse matrix is passed, a copy will be made if it's not in |
|
CSR format. |
|
|
|
y : Ignored |
|
Not used, present here for API consistency by convention. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
The weights for each observation in X. If None, all observations |
|
are assigned equal weight. `sample_weight` is not used during |
|
initialization if `init` is a callable or a user provided array. |
|
|
|
.. versionadded:: 0.20 |
|
|
|
Returns |
|
------- |
|
self : object |
|
Fitted estimator. |
|
""" |
|
X = validate_data( |
|
self, |
|
X, |
|
accept_sparse="csr", |
|
dtype=[np.float64, np.float32], |
|
order="C", |
|
copy=self.copy_x, |
|
accept_large_sparse=False, |
|
) |
|
|
|
self._check_params_vs_input(X) |
|
|
|
random_state = check_random_state(self.random_state) |
|
sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) |
|
self._n_threads = _openmp_effective_n_threads() |
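
        # Validate the user-provided array of initial centers, if any.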
|
|
|
|
|
init = self.init |
|
init_is_array_like = _is_arraylike_not_scalar(init) |
|
if init_is_array_like: |
|
init = check_array(init, dtype=X.dtype, copy=True, order="C") |
|
self._validate_center_shape(X, init) |
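
        # Subtract the mean of X for more accurate distance computations on
        # dense data; the mean is added back to the centers before returning.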
|
|
|
|
|
if not sp.issparse(X): |
|
X_mean = X.mean(axis=0) |
|
|
|
X -= X_mean |
|
|
|
if init_is_array_like: |
|
init -= X_mean |
|
|
|
|
|
x_squared_norms = row_norms(X, squared=True) |
|
|
|
if self._algorithm == "elkan": |
|
kmeans_single = _kmeans_single_elkan |
|
else: |
|
kmeans_single = _kmeans_single_lloyd |
|
self._check_mkl_vcomp(X, X.shape[0]) |
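
        # Run k-means `self._n_init` times with different centroid seeds and
        # keep the run with the lowest inertia.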
|
|
|
best_inertia, best_labels = None, None |
|
|
|
for i in range(self._n_init): |
|
|
|
centers_init = self._init_centroids( |
|
X, |
|
x_squared_norms=x_squared_norms, |
|
init=init, |
|
random_state=random_state, |
|
sample_weight=sample_weight, |
|
) |
|
if self.verbose: |
|
print("Initialization complete") |
|
|
|
|
|
labels, inertia, centers, n_iter_ = kmeans_single( |
|
X, |
|
sample_weight, |
|
centers_init, |
|
max_iter=self.max_iter, |
|
verbose=self.verbose, |
|
tol=self._tol, |
|
n_threads=self._n_threads, |
|
) |
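
            # Keep this run only if it improves the inertia and yields a
            # clustering different from the best one so far; due to rounding
            # errors the inertia of the same clustering (up to a permutation
            # of the labels) can appear slightly smaller.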
|
|
|
|
|
|
|
|
|
|
|
|
|
if best_inertia is None or ( |
|
inertia < best_inertia |
|
and not _is_same_clustering(labels, best_labels, self.n_clusters) |
|
): |
|
best_labels = labels |
|
best_centers = centers |
|
best_inertia = inertia |
|
best_n_iter = n_iter_ |
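
        # Undo the centering performed above: restore X if it was modified in
        # place and shift the centers back to the original coordinates.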
|
|
|
if not sp.issparse(X): |
|
if not self.copy_x: |
|
X += X_mean |
|
best_centers += X_mean |
|
|
|
distinct_clusters = len(set(best_labels)) |
|
if distinct_clusters < self.n_clusters: |
|
warnings.warn( |
|
"Number of distinct clusters ({}) found smaller than " |
|
"n_clusters ({}). Possibly due to duplicate points " |
|
"in X.".format(distinct_clusters, self.n_clusters), |
|
ConvergenceWarning, |
|
stacklevel=2, |
|
) |
|
|
|
self.cluster_centers_ = best_centers |
|
self._n_features_out = self.cluster_centers_.shape[0] |
|
self.labels_ = best_labels |
|
self.inertia_ = best_inertia |
|
self.n_iter_ = best_n_iter |
|
return self |
|
|
|
|
|
def _mini_batch_step( |
|
X, |
|
sample_weight, |
|
centers, |
|
centers_new, |
|
weight_sums, |
|
random_state, |
|
random_reassign=False, |
|
reassignment_ratio=0.01, |
|
verbose=False, |
|
n_threads=1, |
|
): |
|
"""Incremental update of the centers for the Minibatch K-Means algorithm. |
|
|
|
Parameters |
|
---------- |
|
|
|
X : {ndarray, sparse matrix} of shape (n_samples, n_features) |
|
The original data array. If sparse, must be in CSR format. |
|
|
|
|
|
|
sample_weight : ndarray of shape (n_samples,) |
|
The weights for each observation in `X`. |
|
|
|
centers : ndarray of shape (n_clusters, n_features) |
|
The cluster centers before the current iteration |
|
|
|
centers_new : ndarray of shape (n_clusters, n_features) |
|
The cluster centers after the current iteration. Modified in-place. |
|
|
|
weight_sums : ndarray of shape (n_clusters,) |
|
The vector in which we keep track of the numbers of points in a |
|
cluster. This array is modified in place. |
|
|
|
random_state : RandomState instance |
|
Determines random number generation for low count centers reassignment. |
|
See :term:`Glossary <random_state>`. |
|
|
|
random_reassign : boolean, default=False |
|
If True, centers with very low counts are randomly reassigned |
|
to observations. |
|
|
|
reassignment_ratio : float, default=0.01 |
|
Control the fraction of the maximum number of counts for a |
|
center to be reassigned. A higher value means that low count |
|
centers are more likely to be reassigned, which means that the |
|
model will take longer to converge, but should converge in a |
|
better clustering. |
|
|
|
verbose : bool, default=False |
|
Controls the verbosity. |
|
|
|
n_threads : int, default=1 |
|
The number of OpenMP threads to use for the computation. |
|
|
|
Returns |
|
------- |
|
inertia : float |
|
Sum of squared distances of samples to their closest cluster center. |
|
The inertia is computed after finding the labels and before updating |
|
the centers. |
|
""" |
|
|
|
|
|
|
|
labels, inertia = _labels_inertia(X, sample_weight, centers, n_threads=n_threads) |
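
    # Update `centers_new` and `weight_sums` in place using the samples of
    # this mini-batch.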
|
|
|
|
|
if sp.issparse(X): |
|
_minibatch_update_sparse( |
|
X, sample_weight, centers, centers_new, weight_sums, labels, n_threads |
|
) |
|
else: |
|
_minibatch_update_dense( |
|
X, |
|
sample_weight, |
|
centers, |
|
centers_new, |
|
weight_sums, |
|
labels, |
|
n_threads, |
|
) |
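
    # Optionally reassign centers with very low weight to randomly chosen
    # observations.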
|
|
|
|
|
if random_reassign and reassignment_ratio > 0: |
|
to_reassign = weight_sums < reassignment_ratio * weight_sums.max() |
|
|
|
|
|
if to_reassign.sum() > 0.5 * X.shape[0]: |
|
indices_dont_reassign = np.argsort(weight_sums)[int(0.5 * X.shape[0]) :] |
|
to_reassign[indices_dont_reassign] = False |
|
n_reassigns = to_reassign.sum() |
|
|
|
if n_reassigns: |
|
|
|
new_centers = random_state.choice( |
|
X.shape[0], replace=False, size=n_reassigns |
|
) |
|
if verbose: |
|
print(f"[MiniBatchKMeans] Reassigning {n_reassigns} cluster centers.") |
|
|
|
if sp.issparse(X): |
|
assign_rows_csr( |
|
X, |
|
new_centers.astype(np.intp, copy=False), |
|
np.where(to_reassign)[0].astype(np.intp, copy=False), |
|
centers_new, |
|
) |
|
else: |
|
centers_new[to_reassign] = X[new_centers] |
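
            # Reset the weights of the reassigned centers to the smallest
            # weight among the remaining centers so they are not immediately
            # reassigned again.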
|
|
|
|
|
|
|
|
|
weight_sums[to_reassign] = np.min(weight_sums[~to_reassign]) |
|
|
|
return inertia |
|
|
|
|
|
class MiniBatchKMeans(_BaseKMeans): |
|
""" |
|
Mini-Batch K-Means clustering. |
|
|
|
Read more in the :ref:`User Guide <mini_batch_kmeans>`. |
|
|
|
Parameters |
|
---------- |
|
|
|
n_clusters : int, default=8 |
|
The number of clusters to form as well as the number of |
|
centroids to generate. |
|
|
|
init : {'k-means++', 'random'}, callable or array-like of shape \ |
|
(n_clusters, n_features), default='k-means++' |
|
Method for initialization: |
|
|
|
'k-means++' : selects initial cluster centroids using sampling based on |
|
an empirical probability distribution of the points' contribution to the |
|
overall inertia. This technique speeds up convergence. The algorithm |
|
implemented is "greedy k-means++". It differs from the vanilla k-means++ |
|
by making several trials at each sampling step and choosing the best centroid |
|
among them. |
|
|
|
'random': choose `n_clusters` observations (rows) at random from data |
|
for the initial centroids. |
|
|
|
If an array is passed, it should be of shape (n_clusters, n_features) |
|
and gives the initial centers. |
|
|
|
If a callable is passed, it should take arguments X, n_clusters and a |
|
random state and return an initialization. |
|
|
|
For an evaluation of the impact of initialization, see the example |
|
:ref:`sphx_glr_auto_examples_cluster_plot_kmeans_stability_low_dim_dense.py`. |
|
|
|
max_iter : int, default=100 |
|
Maximum number of iterations over the complete dataset before |
|
stopping independently of any early stopping criterion heuristics. |
|
|
|
batch_size : int, default=1024 |
|
Size of the mini batches. |
|
For faster computations, you can set the ``batch_size`` greater than |
|
256 * number of cores to enable parallelism on all cores. |
|
|
|
.. versionchanged:: 1.0 |
|
`batch_size` default changed from 100 to 1024. |
|
|
|
verbose : int, default=0 |
|
Verbosity mode. |
|
|
|
compute_labels : bool, default=True |
|
Compute label assignment and inertia for the complete dataset |
|
once the minibatch optimization has converged in fit. |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
Determines random number generation for centroid initialization and |
|
random reassignment. Use an int to make the randomness deterministic. |
|
See :term:`Glossary <random_state>`. |
|
|
|
tol : float, default=0.0 |
|
Control early stopping based on the relative center changes as |
|
        measured by a smoothed, variance-normalized estimate of the mean
        center squared position changes. This early stopping heuristic is
        closer to the one used for the batch variant of the algorithm
|
but induces a slight computational and memory overhead over the |
|
inertia heuristic. |
|
|
|
To disable convergence detection based on normalized center |
|
change, set tol to 0.0 (default). |
|
|
|
max_no_improvement : int, default=10 |
|
Control early stopping based on the consecutive number of mini |
|
batches that does not yield an improvement on the smoothed inertia. |
|
|
|
To disable convergence detection based on inertia, set |
|
max_no_improvement to None. |
|
|
|
init_size : int, default=None |
|
Number of samples to randomly sample for speeding up the |
|
initialization (sometimes at the expense of accuracy): the |
|
        algorithm is initialized by running a batch KMeans on a
|
random subset of the data. This needs to be larger than n_clusters. |
|
|
|
        If `None`, the heuristic is `init_size = 3 * batch_size` if
        `3 * batch_size >= n_clusters`, else `init_size = 3 * n_clusters`.
|
|
|
n_init : 'auto' or int, default="auto" |
|
Number of random initializations that are tried. |
|
In contrast to KMeans, the algorithm is only run once, using the best of |
|
the `n_init` initializations as measured by inertia. Several runs are |
|
recommended for sparse high-dimensional problems (see |
|
:ref:`kmeans_sparse_high_dim`). |
|
|
|
When `n_init='auto'`, the number of runs depends on the value of init: |
|
3 if using `init='random'` or `init` is a callable; |
|
1 if using `init='k-means++'` or `init` is an array-like. |
|
|
|
.. versionadded:: 1.2 |
|
Added 'auto' option for `n_init`. |
|
|
|
.. versionchanged:: 1.4 |
|
            Default value for `n_init` changed to `'auto'`.
|
|
|
reassignment_ratio : float, default=0.01 |
|
Control the fraction of the maximum number of counts for a center to |
|
be reassigned. A higher value means that low count centers are more |
|
easily reassigned, which means that the model will take longer to |
|
converge, but should converge in a better clustering. However, too high |
|
a value may cause convergence issues, especially with a small batch |
|
size. |
|
|
|
Attributes |
|
---------- |
|
|
|
cluster_centers_ : ndarray of shape (n_clusters, n_features) |
|
Coordinates of cluster centers. |
|
|
|
labels_ : ndarray of shape (n_samples,) |
|
Labels of each point (if compute_labels is set to True). |
|
|
|
inertia_ : float |
|
The value of the inertia criterion associated with the chosen |
|
partition if compute_labels is set to True. If compute_labels is set to |
|
False, it's an approximation of the inertia based on an exponentially |
|
weighted average of the batch inertiae. |
|
The inertia is defined as the sum of square distances of samples to |
|
their cluster center, weighted by the sample weights if provided. |
|
|
|
n_iter_ : int |
|
Number of iterations over the full dataset. |
|
|
|
n_steps_ : int |
|
Number of minibatches processed. |
|
|
|
.. versionadded:: 1.0 |
|
|
|
n_features_in_ : int |
|
Number of features seen during :term:`fit`. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
feature_names_in_ : ndarray of shape (`n_features_in_`,) |
|
Names of features seen during :term:`fit`. Defined only when `X` |
|
has feature names that are all strings. |
|
|
|
.. versionadded:: 1.0 |
|
|
|
See Also |
|
-------- |
|
KMeans : The classic implementation of the clustering method based on the |
|
Lloyd's algorithm. It consumes the whole set of input data at each |
|
iteration. |
|
|
|
Notes |
|
----- |
|
See https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf |
|
|
|
When there are too few points in the dataset, some centers may be |
|
    duplicated, which means that the number of requested clusters and the
    number of returned clusters will not
|
always match. One solution is to set `reassignment_ratio=0`, which |
|
prevents reassignments of clusters that are too small. |
|
|
|
See :ref:`sphx_glr_auto_examples_cluster_plot_birch_vs_minibatchkmeans.py` for a |
|
comparison with :class:`~sklearn.cluster.BIRCH`. |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.cluster import MiniBatchKMeans |
|
>>> import numpy as np |
|
>>> X = np.array([[1, 2], [1, 4], [1, 0], |
|
... [4, 2], [4, 0], [4, 4], |
|
... [4, 5], [0, 1], [2, 2], |
|
... [3, 2], [5, 5], [1, -1]]) |
|
>>> # manually fit on batches |
|
>>> kmeans = MiniBatchKMeans(n_clusters=2, |
|
... random_state=0, |
|
... batch_size=6, |
|
... n_init="auto") |
|
>>> kmeans = kmeans.partial_fit(X[0:6,:]) |
|
>>> kmeans = kmeans.partial_fit(X[6:12,:]) |
|
>>> kmeans.cluster_centers_ |
|
array([[3.375, 3. ], |
|
[0.75 , 0.5 ]]) |
|
>>> kmeans.predict([[0, 0], [4, 4]]) |
|
array([1, 0], dtype=int32) |
|
>>> # fit on the whole data |
|
>>> kmeans = MiniBatchKMeans(n_clusters=2, |
|
... random_state=0, |
|
... batch_size=6, |
|
... max_iter=10, |
|
... n_init="auto").fit(X) |
|
>>> kmeans.cluster_centers_ |
|
array([[3.55102041, 2.48979592], |
|
[1.06896552, 1. ]]) |
|
>>> kmeans.predict([[0, 0], [4, 4]]) |
|
array([1, 0], dtype=int32) |
|
""" |
|
|
|
_parameter_constraints: dict = { |
|
**_BaseKMeans._parameter_constraints, |
|
"batch_size": [Interval(Integral, 1, None, closed="left")], |
|
"compute_labels": ["boolean"], |
|
"max_no_improvement": [Interval(Integral, 0, None, closed="left"), None], |
|
"init_size": [Interval(Integral, 1, None, closed="left"), None], |
|
"reassignment_ratio": [Interval(Real, 0, None, closed="left")], |
|
} |
|
|
|
def __init__( |
|
self, |
|
n_clusters=8, |
|
*, |
|
init="k-means++", |
|
max_iter=100, |
|
batch_size=1024, |
|
verbose=0, |
|
compute_labels=True, |
|
random_state=None, |
|
tol=0.0, |
|
max_no_improvement=10, |
|
init_size=None, |
|
n_init="auto", |
|
reassignment_ratio=0.01, |
|
): |
|
super().__init__( |
|
n_clusters=n_clusters, |
|
init=init, |
|
max_iter=max_iter, |
|
verbose=verbose, |
|
random_state=random_state, |
|
tol=tol, |
|
n_init=n_init, |
|
) |
|
|
|
self.max_no_improvement = max_no_improvement |
|
self.batch_size = batch_size |
|
self.compute_labels = compute_labels |
|
self.init_size = init_size |
|
self.reassignment_ratio = reassignment_ratio |
|
|
|
def _check_params_vs_input(self, X): |
|
super()._check_params_vs_input(X, default_n_init=3) |
|
|
|
self._batch_size = min(self.batch_size, X.shape[0]) |
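# Resolve the effective init_size below: default to 3 * batch_size,
# bump to 3 * n_clusters if that is still too small, then cap at n_samples.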
|
|
|
|
|
self._init_size = self.init_size |
|
if self._init_size is None: |
|
self._init_size = 3 * self._batch_size |
|
if self._init_size < self.n_clusters: |
|
self._init_size = 3 * self.n_clusters |
|
elif self._init_size < self.n_clusters: |
|
warnings.warn( |
|
( |
|
f"init_size={self._init_size} should be larger than " |
|
f"n_clusters={self.n_clusters}. Setting it to " |
|
"min(3*n_clusters, n_samples)" |
|
), |
|
RuntimeWarning, |
|
stacklevel=2, |
|
) |
|
self._init_size = 3 * self.n_clusters |
|
self._init_size = min(self._init_size, X.shape[0]) |
|
|
|
|
|
if self.reassignment_ratio < 0: |
|
raise ValueError( |
|
"reassignment_ratio should be >= 0, got " |
|
f"{self.reassignment_ratio} instead." |
|
) |
|
|
|
def _warn_mkl_vcomp(self, n_active_threads): |
|
"""Warn when vcomp and mkl are both present""" |
|
warnings.warn( |
|
"MiniBatchKMeans is known to have a memory leak on " |
|
"Windows with MKL, when there are less chunks than " |
|
"available threads. You can prevent it by setting " |
|
f"batch_size >= {self._n_threads * CHUNK_SIZE} or by " |
|
"setting the environment variable " |
|
f"OMP_NUM_THREADS={n_active_threads}" |
|
) |
|
|
|
def _mini_batch_convergence( |
|
self, step, n_steps, n_samples, centers_squared_diff, batch_inertia |
|
): |
|
"""Helper function to encapsulate the early stopping logic""" |
|
|
|
|
|
batch_inertia /= self._batch_size |
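# Count steps from 1 for user-friendly verbose output.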
|
|
|
|
|
step = step + 1 |
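# Ignore the first step: its inertia comes from the initialization,
# not from a minibatch update.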
|
|
|
|
|
if step == 1: |
|
if self.verbose: |
|
print( |
|
f"Minibatch step {step}/{n_steps}: mean batch " |
|
f"inertia: {batch_inertia}" |
|
) |
|
return False |
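# Track an exponentially weighted average (EWA) of the per-batch inertia
# to smooth out minibatch-local stochastic variability.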
|
|
|
|
|
|
|
|
|
if self._ewa_inertia is None: |
|
self._ewa_inertia = batch_inertia |
|
else: |
|
alpha = self._batch_size * 2.0 / (n_samples + 1) |
|
alpha = min(alpha, 1) |
|
self._ewa_inertia = self._ewa_inertia * (1 - alpha) + batch_inertia * alpha |
|
|
|
|
|
if self.verbose: |
|
print( |
|
f"Minibatch step {step}/{n_steps}: mean batch inertia: " |
|
f"{batch_inertia}, ewa inertia: {self._ewa_inertia}" |
|
) |
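# Early stopping based on the absolute tolerance on the change of the centers.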
|
|
|
|
|
|
|
if self._tol > 0.0 and centers_squared_diff <= self._tol: |
|
if self.verbose: |
|
print(f"Converged (small centers change) at step {step}/{n_steps}") |
|
return True |
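# Early stopping heuristic: no improvement of the smoothed inertia for
# max_no_improvement consecutive steps.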
|
|
|
|
|
|
|
if self._ewa_inertia_min is None or self._ewa_inertia < self._ewa_inertia_min: |
|
self._no_improvement = 0 |
|
self._ewa_inertia_min = self._ewa_inertia |
|
else: |
|
self._no_improvement += 1 |
|
|
|
if ( |
|
self.max_no_improvement is not None |
|
and self._no_improvement >= self.max_no_improvement |
|
): |
|
if self.verbose: |
|
print( |
|
"Converged (lack of improvement in inertia) at step " |
|
f"{step}/{n_steps}" |
|
) |
|
return True |
|
|
|
return False |
|
|
|
def _random_reassign(self): |
|
"""Check if a random reassignment needs to be done. |
|
|
|
Do random reassignments each time 10 * n_clusters samples have been |
|
processed. |
|
|
|
If there are empty clusters we always want to reassign. |
|
""" |
|
self._n_since_last_reassign += self._batch_size |
|
if (self._counts == 0).any() or self._n_since_last_reassign >= ( |
|
10 * self.n_clusters |
|
): |
|
self._n_since_last_reassign = 0 |
|
return True |
|
return False |
|
|
|
@_fit_context(prefer_skip_nested_validation=True) |
|
def fit(self, X, y=None, sample_weight=None): |
|
"""Compute the centroids on X by chunking it into mini-batches. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_samples, n_features) |
|
Training instances to cluster. It must be noted that the data |
|
will be converted to C ordering, which will cause a memory copy |
|
if the given data is not C-contiguous. |
|
If a sparse matrix is passed, a copy will be made if it's not in |
|
CSR format. |
|
|
|
y : Ignored |
|
Not used, present here for API consistency by convention. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
The weights for each observation in X. If None, all observations |
|
are assigned equal weight. `sample_weight` is not used during |
|
initialization if `init` is a callable or a user provided array. |
|
|
|
.. versionadded:: 0.20 |
|
|
|
Returns |
|
------- |
|
self : object |
|
Fitted estimator. |
|
""" |
|
X = validate_data( |
|
self, |
|
X, |
|
accept_sparse="csr", |
|
dtype=[np.float64, np.float32], |
|
order="C", |
|
accept_large_sparse=False, |
|
) |
|
|
|
self._check_params_vs_input(X) |
|
random_state = check_random_state(self.random_state) |
|
sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) |
|
self._n_threads = _openmp_effective_n_threads() |
|
n_samples, n_features = X.shape |
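# Validate a user-provided init array (copied so it is not modified in place).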
|
|
|
|
|
init = self.init |
|
if _is_arraylike_not_scalar(init): |
|
init = check_array(init, dtype=X.dtype, copy=True, order="C") |
|
self._validate_center_shape(X, init) |
|
|
|
self._check_mkl_vcomp(X, self._batch_size) |
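# Precompute the squared norms of the data points.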
|
|
|
|
|
x_squared_norms = row_norms(X, squared=True) |
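# Draw a validation subset used to score each initialization.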
|
|
|
|
|
validation_indices = random_state.randint(0, n_samples, self._init_size) |
|
X_valid = X[validation_indices] |
|
sample_weight_valid = sample_weight[validation_indices] |
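# Run n_init initializations and keep the one with the lowest inertia
# on the validation subset.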
|
|
|
|
|
best_inertia = None |
|
for init_idx in range(self._n_init): |
|
if self.verbose: |
|
print(f"Init {init_idx + 1}/{self._n_init} with method {init}") |
|
|
|
|
|
|
|
cluster_centers = self._init_centroids( |
|
X, |
|
x_squared_norms=x_squared_norms, |
|
init=init, |
|
random_state=random_state, |
|
init_size=self._init_size, |
|
sample_weight=sample_weight, |
|
) |
|
|
|
|
|
_, inertia = _labels_inertia_threadpool_limit( |
|
X_valid, |
|
sample_weight_valid, |
|
cluster_centers, |
|
n_threads=self._n_threads, |
|
) |
|
|
|
if self.verbose: |
|
print(f"Inertia for init {init_idx + 1}/{self._n_init}: {inertia}") |
|
if best_inertia is None or inertia < best_inertia: |
|
init_centers = cluster_centers |
|
best_inertia = inertia |
|
|
|
centers = init_centers |
|
centers_new = np.empty_like(centers) |
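# Per-cluster weight sums, updated online by the minibatch steps.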
|
|
|
|
|
self._counts = np.zeros(self.n_clusters, dtype=X.dtype) |
|
|
|
|
|
self._ewa_inertia = None |
|
self._ewa_inertia_min = None |
|
self._no_improvement = 0 |
|
|
|
|
|
self._n_since_last_reassign = 0 |
|
|
|
n_steps = (self.max_iter * n_samples) // self._batch_size |
|
|
|
with _get_threadpool_controller().limit(limits=1, user_api="blas"): |
|
|
|
for i in range(n_steps): |
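# Sample a minibatch uniformly at random from the full dataset, then
# perform a single update step on it.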
|
|
|
minibatch_indices = random_state.randint(0, n_samples, self._batch_size) |
|
|
|
|
|
batch_inertia = _mini_batch_step( |
|
X=X[minibatch_indices], |
|
sample_weight=sample_weight[minibatch_indices], |
|
centers=centers, |
|
centers_new=centers_new, |
|
weight_sums=self._counts, |
|
random_state=random_state, |
|
random_reassign=self._random_reassign(), |
|
reassignment_ratio=self.reassignment_ratio, |
|
verbose=self.verbose, |
|
n_threads=self._n_threads, |
|
) |
|
|
|
if self._tol > 0.0: |
|
centers_squared_diff = np.sum((centers_new - centers) ** 2) |
|
else: |
|
centers_squared_diff = 0 |
|
|
|
centers, centers_new = centers_new, centers |
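# Check convergence and stop early if one of the criteria is met.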
|
|
|
|
|
if self._mini_batch_convergence( |
|
i, n_steps, n_samples, centers_squared_diff, batch_inertia |
|
): |
|
break |
|
|
|
self.cluster_centers_ = centers |
|
self._n_features_out = self.cluster_centers_.shape[0] |
|
|
|
self.n_steps_ = i + 1 |
|
self.n_iter_ = int(np.ceil(((i + 1) * self._batch_size) / n_samples)) |
|
|
|
if self.compute_labels: |
|
self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( |
|
X, |
|
sample_weight, |
|
self.cluster_centers_, |
|
n_threads=self._n_threads, |
|
) |
|
else: |
|
self.inertia_ = self._ewa_inertia * n_samples |
|
|
|
return self |
|
|
|
@_fit_context(prefer_skip_nested_validation=True) |
|
def partial_fit(self, X, y=None, sample_weight=None): |
|
"""Update k means estimate on a single mini-batch X. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_samples, n_features) |
|
Training instances to cluster. It must be noted that the data |
|
will be converted to C ordering, which will cause a memory copy |
|
if the given data is not C-contiguous. |
|
If a sparse matrix is passed, a copy will be made if it's not in |
|
CSR format. |
|
|
|
y : Ignored |
|
Not used, present here for API consistency by convention. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
The weights for each observation in X. If None, all observations |
|
are assigned equal weight. `sample_weight` is not used during |
|
initialization if `init` is a callable or a user provided array. |
|
|
|
Returns |
|
------- |
|
self : object |
|
Return updated estimator. |
|
""" |
|
has_centers = hasattr(self, "cluster_centers_") |
|
|
|
X = validate_data( |
|
self, |
|
X, |
|
accept_sparse="csr", |
|
dtype=[np.float64, np.float32], |
|
order="C", |
|
accept_large_sparse=False, |
|
reset=not has_centers, |
|
) |
|
|
|
self._random_state = getattr( |
|
self, "_random_state", check_random_state(self.random_state) |
|
) |
|
sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) |
|
self.n_steps_ = getattr(self, "n_steps_", 0) |
|
|
|
|
|
x_squared_norms = row_norms(X, squared=True) |
|
|
|
if not has_centers: |
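# This instance has not been fitted yet (neither fit nor partial_fit
# has been called).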
|
|
|
self._check_params_vs_input(X) |
|
self._n_threads = _openmp_effective_n_threads() |
|
|
|
|
|
init = self.init |
|
if _is_arraylike_not_scalar(init): |
|
init = check_array(init, dtype=X.dtype, copy=True, order="C") |
|
self._validate_center_shape(X, init) |
|
|
|
self._check_mkl_vcomp(X, X.shape[0]) |
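# Initialize the cluster centers on this first minibatch.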
|
|
|
|
|
self.cluster_centers_ = self._init_centroids( |
|
X, |
|
x_squared_norms=x_squared_norms, |
|
init=init, |
|
random_state=self._random_state, |
|
init_size=self._init_size, |
|
sample_weight=sample_weight, |
|
) |
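# Initialize the per-cluster weight sums and the counter of samples
# seen since the last reassignment.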
|
|
|
|
|
self._counts = np.zeros(self.n_clusters, dtype=X.dtype) |
|
|
|
|
|
self._n_since_last_reassign = 0 |
|
|
|
with _get_threadpool_controller().limit(limits=1, user_api="blas"): |
|
_mini_batch_step( |
|
X, |
|
sample_weight=sample_weight, |
|
centers=self.cluster_centers_, |
|
centers_new=self.cluster_centers_, |
|
weight_sums=self._counts, |
|
random_state=self._random_state, |
|
random_reassign=self._random_reassign(), |
|
reassignment_ratio=self.reassignment_ratio, |
|
verbose=self.verbose, |
|
n_threads=self._n_threads, |
|
) |
|
|
|
if self.compute_labels: |
|
self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( |
|
X, |
|
sample_weight, |
|
self.cluster_centers_, |
|
n_threads=self._n_threads, |
|
) |
|
|
|
self.n_steps_ += 1 |
|
self._n_features_out = self.cluster_centers_.shape[0] |
|
|
|
return self |
|
|