|
""" |
|
Ridge regression |
|
""" |
|
|
|
|
|
|
|
|
|
|
|
import numbers |
|
import warnings |
|
from abc import ABCMeta, abstractmethod |
|
from functools import partial |
|
from numbers import Integral, Real |
|
|
|
import numpy as np |
|
from scipy import linalg, optimize, sparse |
|
from scipy.sparse import linalg as sp_linalg |
|
|
|
from ..base import (
    BaseEstimator,
    MultiOutputMixin,
    RegressorMixin,
    _fit_context,
    is_classifier,
)
|
from ..exceptions import ConvergenceWarning |
|
from ..metrics import check_scoring, get_scorer_names |
|
from ..model_selection import GridSearchCV |
|
from ..preprocessing import LabelBinarizer |
|
from ..utils import ( |
|
Bunch, |
|
check_array, |
|
check_consistent_length, |
|
check_scalar, |
|
column_or_1d, |
|
compute_sample_weight, |
|
deprecated, |
|
) |
|
from ..utils._array_api import ( |
|
_is_numpy_namespace, |
|
_ravel, |
|
device, |
|
get_namespace, |
|
get_namespace_and_device, |
|
) |
|
from ..utils._param_validation import Hidden, Interval, StrOptions, validate_params |
|
from ..utils.extmath import row_norms, safe_sparse_dot |
|
from ..utils.fixes import _sparse_linalg_cg |
|
from ..utils.metadata_routing import ( |
|
MetadataRouter, |
|
MethodMapping, |
|
_raise_for_params, |
|
_routing_enabled, |
|
process_routing, |
|
) |
|
from ..utils.sparsefuncs import mean_variance_axis |
|
from ..utils.validation import _check_sample_weight, check_is_fitted, validate_data |
|
from ._base import LinearClassifierMixin, LinearModel, _preprocess_data, _rescale_data |
|
from ._sag import sag_solver |
|
|
|
|
|
def _get_rescaled_operator(X, X_offset, sample_weight_sqrt): |
|
"""Create LinearOperator for matrix products with implicit centering. |
|
|
|
Matrix product `LinearOperator @ coef` returns `(X - X_offset) @ coef`. |
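
    A minimal sketch of this contract on a small dense array (the operator is
    normally used with sparse ``X``, where explicit centering would densify it):

    >>> import numpy as np
    >>> X = np.arange(6.0).reshape(3, 2)
    >>> X_offset = X.mean(axis=0)
    >>> op = _get_rescaled_operator(X, X_offset, np.ones(3))
    >>> coef = np.array([1.0, -1.0])
    >>> bool(np.allclose(op @ coef, (X - X_offset) @ coef))
    True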
|
""" |
|
|
|
def matvec(b): |
|
return X.dot(b) - sample_weight_sqrt * b.dot(X_offset) |
|
|
|
def rmatvec(b): |
|
return X.T.dot(b) - X_offset * b.dot(sample_weight_sqrt) |
|
|
|
X1 = sparse.linalg.LinearOperator(shape=X.shape, matvec=matvec, rmatvec=rmatvec) |
|
return X1 |
|
|
|
|
|
def _solve_sparse_cg( |
|
X, |
|
y, |
|
alpha, |
|
max_iter=None, |
|
tol=1e-4, |
|
verbose=0, |
|
X_offset=None, |
|
X_scale=None, |
|
sample_weight_sqrt=None, |
|
): |
|
if sample_weight_sqrt is None: |
|
sample_weight_sqrt = np.ones(X.shape[0], dtype=X.dtype) |
|
|
|
n_samples, n_features = X.shape |
|
|
|
if X_offset is None or X_scale is None: |
|
X1 = sp_linalg.aslinearoperator(X) |
|
else: |
|
X_offset_scale = X_offset / X_scale |
|
X1 = _get_rescaled_operator(X, X_offset_scale, sample_weight_sqrt) |
|
|
|
coefs = np.empty((y.shape[1], n_features), dtype=X.dtype) |
|
|
|
if n_features > n_samples: |
|
|
|
def create_mv(curr_alpha): |
|
def _mv(x): |
|
return X1.matvec(X1.rmatvec(x)) + curr_alpha * x |
|
|
|
return _mv |
|
|
|
else: |
|
|
|
def create_mv(curr_alpha): |
|
def _mv(x): |
|
return X1.rmatvec(X1.matvec(x)) + curr_alpha * x |
|
|
|
return _mv |
|
|
|
for i in range(y.shape[1]): |
|
y_column = y[:, i] |
|
|
|
mv = create_mv(alpha[i]) |
|
if n_features > n_samples: |
|
|
|
|
|
C = sp_linalg.LinearOperator( |
|
(n_samples, n_samples), matvec=mv, dtype=X.dtype |
|
) |
|
coef, info = _sparse_linalg_cg(C, y_column, rtol=tol) |
|
coefs[i] = X1.rmatvec(coef) |
|
else: |
|
|
|
|
|
y_column = X1.rmatvec(y_column) |
|
C = sp_linalg.LinearOperator( |
|
(n_features, n_features), matvec=mv, dtype=X.dtype |
|
) |
|
coefs[i], info = _sparse_linalg_cg(C, y_column, maxiter=max_iter, rtol=tol) |
|
|
|
if info < 0: |
|
raise ValueError("Failed with error code %d" % info) |
|
|
|
if max_iter is None and info > 0 and verbose: |
|
warnings.warn( |
|
"sparse_cg did not converge after %d iterations." % info, |
|
ConvergenceWarning, |
|
) |
|
|
|
return coefs |
|
|
|
|
|
def _solve_lsqr( |
|
X, |
|
y, |
|
*, |
|
alpha, |
|
fit_intercept=True, |
|
max_iter=None, |
|
tol=1e-4, |
|
X_offset=None, |
|
X_scale=None, |
|
sample_weight_sqrt=None, |
|
): |
|
"""Solve Ridge regression via LSQR. |
|
|
|
We expect that y is always mean centered. |
|
If X is dense, we expect it to be mean centered such that we can solve |
|
||y - Xw||_2^2 + alpha * ||w||_2^2 |
|
|
|
If X is sparse, we expect X_offset to be given such that we can solve |
|
||y - (X - X_offset)w||_2^2 + alpha * ||w||_2^2 |
|
|
|
With sample weights S=diag(sample_weight), this becomes |
|
||sqrt(S) (y - (X - X_offset) w)||_2^2 + alpha * ||w||_2^2 |
|
and we expect y and X to already be rescaled, i.e. sqrt(S) @ y, sqrt(S) @ X. In |
|
this case, X_offset is the sample_weight weighted mean of X before scaling by |
|
sqrt(S). The objective then reads |
|
        ||y - (X - sqrt(S) X_offset) w||_2^2 + alpha * ||w||_2^2
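
    A minimal sketch on a tiny dense, pre-centered problem; the closed-form
    normal-equations solution serves only as a reference:

    >>> import numpy as np
    >>> rng = np.random.RandomState(0)
    >>> X = rng.randn(20, 3); X -= X.mean(axis=0)
    >>> y = rng.randn(20, 1); y -= y.mean(axis=0)
    >>> coefs, n_iter = _solve_lsqr(
    ...     X, y, alpha=np.array([1.0]), fit_intercept=False)
    >>> ref = np.linalg.solve(X.T @ X + np.eye(3), X.T @ y).ravel()
    >>> bool(np.allclose(coefs[0], ref, atol=1e-3))
    True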
|
""" |
|
if sample_weight_sqrt is None: |
|
sample_weight_sqrt = np.ones(X.shape[0], dtype=X.dtype) |
|
|
|
if sparse.issparse(X) and fit_intercept: |
|
X_offset_scale = X_offset / X_scale |
|
X1 = _get_rescaled_operator(X, X_offset_scale, sample_weight_sqrt) |
|
else: |
|
|
|
X1 = X |
|
|
|
n_samples, n_features = X.shape |
|
coefs = np.empty((y.shape[1], n_features), dtype=X.dtype) |
|
n_iter = np.empty(y.shape[1], dtype=np.int32) |
|
|
|
|
|
sqrt_alpha = np.sqrt(alpha) |
|
|
|
for i in range(y.shape[1]): |
|
y_column = y[:, i] |
|
info = sp_linalg.lsqr( |
|
X1, y_column, damp=sqrt_alpha[i], atol=tol, btol=tol, iter_lim=max_iter |
|
) |
|
coefs[i] = info[0] |
|
n_iter[i] = info[2] |
|
|
|
return coefs, n_iter |
|
|
|
|
|
def _solve_cholesky(X, y, alpha): |
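    """Solve ridge regression through the normal equations.

    Directly solves (X^T X + alpha * I) w = X^T y with scipy.linalg.solve,
    assuming a positive definite system; one penalty per target is supported.
    """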
|
|
|
n_features = X.shape[1] |
|
n_targets = y.shape[1] |
|
|
|
A = safe_sparse_dot(X.T, X, dense_output=True) |
|
Xy = safe_sparse_dot(X.T, y, dense_output=True) |
|
|
|
one_alpha = np.array_equal(alpha, len(alpha) * [alpha[0]]) |
|
|
|
if one_alpha: |
|
A.flat[:: n_features + 1] += alpha[0] |
|
return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T |
|
else: |
|
coefs = np.empty([n_targets, n_features], dtype=X.dtype) |
|
for coef, target, current_alpha in zip(coefs, Xy.T, alpha): |
|
A.flat[:: n_features + 1] += current_alpha |
|
coef[:] = linalg.solve(A, target, assume_a="pos", overwrite_a=False).ravel() |
|
A.flat[:: n_features + 1] -= current_alpha |
|
return coefs |
|
|
|
|
|
def _solve_cholesky_kernel(K, y, alpha, sample_weight=None, copy=False): |
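    """Solve kernel ridge regression, i.e. (K + alpha * I) dual_coef = y.

    Supports one penalty per target and optional sample weights, which are
    applied symmetrically to K and y.
    """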
|
|
|
n_samples = K.shape[0] |
|
n_targets = y.shape[1] |
|
|
|
if copy: |
|
K = K.copy() |
|
|
|
alpha = np.atleast_1d(alpha) |
|
one_alpha = (alpha == alpha[0]).all() |
|
has_sw = isinstance(sample_weight, np.ndarray) or sample_weight not in [1.0, None] |
|
|
|
if has_sw: |
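        # Unlike other solvers, we need to support sample_weight directly
        # because K might be a pre-computed kernel.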
|
|
|
|
|
sw = np.sqrt(np.atleast_1d(sample_weight)) |
|
y = y * sw[:, np.newaxis] |
|
K *= np.outer(sw, sw) |
|
|
|
if one_alpha: |
|
|
|
K.flat[:: n_samples + 1] += alpha[0] |
|
|
|
try: |
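            # Use overwrite_a=False so that K can be reused by the
            # least-squares fall-back below if a LinAlgError is raised.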
|
|
|
|
|
|
|
dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False) |
|
except np.linalg.LinAlgError: |
|
warnings.warn( |
|
"Singular matrix in solving dual problem. Using " |
|
"least-squares solution instead." |
|
) |
|
dual_coef = linalg.lstsq(K, y)[0] |
|
|
|
|
|
|
|
K.flat[:: n_samples + 1] -= alpha[0] |
|
|
|
if has_sw: |
|
dual_coef *= sw[:, np.newaxis] |
|
|
|
return dual_coef |
|
else: |
|
|
|
dual_coefs = np.empty([n_targets, n_samples], K.dtype) |
|
|
|
for dual_coef, target, current_alpha in zip(dual_coefs, y.T, alpha): |
|
K.flat[:: n_samples + 1] += current_alpha |
|
|
|
dual_coef[:] = linalg.solve( |
|
K, target, assume_a="pos", overwrite_a=False |
|
).ravel() |
|
|
|
K.flat[:: n_samples + 1] -= current_alpha |
|
|
|
if has_sw: |
|
dual_coefs *= sw[np.newaxis, :] |
|
|
|
return dual_coefs.T |
|
|
|
|
|
def _solve_svd(X, y, alpha, xp=None): |
|
xp, _ = get_namespace(X, xp=xp) |
|
U, s, Vt = xp.linalg.svd(X, full_matrices=False) |
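
    # Singular values below 1e-15 are treated as the numerical null space of
    # X; the corresponding directions receive zero coefficients.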
|
idx = s > 1e-15 |
|
s_nnz = s[idx][:, None] |
|
UTy = U.T @ y |
|
d = xp.zeros((s.shape[0], alpha.shape[0]), dtype=X.dtype, device=device(X)) |
|
d[idx] = s_nnz / (s_nnz**2 + alpha) |
|
d_UT_y = d * UTy |
|
return (Vt.T @ d_UT_y).T |
|
|
|
|
|
def _solve_lbfgs( |
|
X, |
|
y, |
|
alpha, |
|
positive=True, |
|
max_iter=None, |
|
tol=1e-4, |
|
X_offset=None, |
|
X_scale=None, |
|
sample_weight_sqrt=None, |
|
): |
|
"""Solve ridge regression with LBFGS. |
|
|
|
    Its main purpose is fitting with the coefficients forced to be positive.
|
For unconstrained ridge regression, there are faster dedicated solver methods. |
|
Note that with positive bounds on the coefficients, LBFGS seems faster |
|
than scipy.optimize.lsq_linear. |
|
""" |
|
n_samples, n_features = X.shape |
|
|
|
options = {} |
|
if max_iter is not None: |
|
options["maxiter"] = max_iter |
|
config = { |
|
"method": "L-BFGS-B", |
|
"tol": tol, |
|
"jac": True, |
|
"options": options, |
|
} |
|
if positive: |
|
config["bounds"] = [(0, np.inf)] * n_features |
|
|
|
if X_offset is not None and X_scale is not None: |
|
X_offset_scale = X_offset / X_scale |
|
else: |
|
X_offset_scale = None |
|
|
|
if sample_weight_sqrt is None: |
|
sample_weight_sqrt = np.ones(X.shape[0], dtype=X.dtype) |
|
|
|
coefs = np.empty((y.shape[1], n_features), dtype=X.dtype) |
|
|
|
for i in range(y.shape[1]): |
|
x0 = np.zeros((n_features,)) |
|
y_column = y[:, i] |
|
|
|
def func(w): |
|
residual = X.dot(w) - y_column |
|
if X_offset_scale is not None: |
|
residual -= sample_weight_sqrt * w.dot(X_offset_scale) |
|
f = 0.5 * residual.dot(residual) + 0.5 * alpha[i] * w.dot(w) |
|
grad = X.T @ residual + alpha[i] * w |
|
if X_offset_scale is not None: |
|
grad -= X_offset_scale * residual.dot(sample_weight_sqrt) |
|
|
|
return f, grad |
|
|
|
result = optimize.minimize(func, x0, **config) |
|
if not result["success"]: |
|
warnings.warn( |
|
( |
|
"The lbfgs solver did not converge. Try increasing max_iter " |
|
f"or tol. Currently: max_iter={max_iter} and tol={tol}" |
|
), |
|
ConvergenceWarning, |
|
) |
|
coefs[i] = result["x"] |
|
|
|
return coefs |
|
|
|
|
|
def _get_valid_accept_sparse(is_X_sparse, solver): |
|
if is_X_sparse and solver in ["auto", "sag", "saga"]: |
|
return "csr" |
|
else: |
|
return ["csr", "csc", "coo"] |
|
|
|
|
|
@validate_params( |
|
{ |
|
"X": ["array-like", "sparse matrix", sp_linalg.LinearOperator], |
|
"y": ["array-like"], |
|
"alpha": [Interval(Real, 0, None, closed="left"), "array-like"], |
|
"sample_weight": [ |
|
Interval(Real, None, None, closed="neither"), |
|
"array-like", |
|
None, |
|
], |
|
"solver": [ |
|
StrOptions( |
|
{"auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga", "lbfgs"} |
|
) |
|
], |
|
"max_iter": [Interval(Integral, 0, None, closed="left"), None], |
|
"tol": [Interval(Real, 0, None, closed="left")], |
|
"verbose": ["verbose"], |
|
"positive": ["boolean"], |
|
"random_state": ["random_state"], |
|
"return_n_iter": ["boolean"], |
|
"return_intercept": ["boolean"], |
|
"check_input": ["boolean"], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def ridge_regression( |
|
X, |
|
y, |
|
alpha, |
|
*, |
|
sample_weight=None, |
|
solver="auto", |
|
max_iter=None, |
|
tol=1e-4, |
|
verbose=0, |
|
positive=False, |
|
random_state=None, |
|
return_n_iter=False, |
|
return_intercept=False, |
|
check_input=True, |
|
): |
|
"""Solve the ridge equation by the method of normal equations. |
|
|
|
Read more in the :ref:`User Guide <ridge_regression>`. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix, LinearOperator} of shape \ |
|
(n_samples, n_features) |
|
Training data. |
|
|
|
y : array-like of shape (n_samples,) or (n_samples, n_targets) |
|
Target values. |
|
|
|
alpha : float or array-like of shape (n_targets,) |
|
Constant that multiplies the L2 term, controlling regularization |
|
strength. `alpha` must be a non-negative float i.e. in `[0, inf)`. |
|
|
|
When `alpha = 0`, the objective is equivalent to ordinary least |
|
squares, solved by the :class:`LinearRegression` object. For numerical |
|
reasons, using `alpha = 0` with the `Ridge` object is not advised. |
|
Instead, you should use the :class:`LinearRegression` object. |
|
|
|
If an array is passed, penalties are assumed to be specific to the |
|
targets. Hence they must correspond in number. |
|
|
|
sample_weight : float or array-like of shape (n_samples,), default=None |
|
Individual weights for each sample. If given a float, every sample |
|
will have the same weight. If sample_weight is not None and |
|
solver='auto', the solver will be set to 'cholesky'. |
|
|
|
.. versionadded:: 0.17 |
|
|
|
solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', \ |
|
'sag', 'saga', 'lbfgs'}, default='auto' |
|
Solver to use in the computational routines: |
|
|
|
- 'auto' chooses the solver automatically based on the type of data. |
|
|
|
- 'svd' uses a Singular Value Decomposition of X to compute the Ridge |
|
coefficients. It is the most stable solver, in particular more stable |
|
for singular matrices than 'cholesky' at the cost of being slower. |
|
|
|
- 'cholesky' uses the standard scipy.linalg.solve function to |
|
obtain a closed-form solution via a Cholesky decomposition of |
|
dot(X.T, X) |
|
|
|
- 'sparse_cg' uses the conjugate gradient solver as found in |
|
scipy.sparse.linalg.cg. As an iterative algorithm, this solver is |
|
more appropriate than 'cholesky' for large-scale data |
|
(possibility to set `tol` and `max_iter`). |
|
|
|
- 'lsqr' uses the dedicated regularized least-squares routine |
|
scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative |
|
procedure. |
|
|
|
- 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses |
|
its improved, unbiased version named SAGA. Both methods also use an |
|
iterative procedure, and are often faster than other solvers when |
|
both n_samples and n_features are large. Note that 'sag' and |
|
'saga' fast convergence is only guaranteed on features with |
|
approximately the same scale. You can preprocess the data with a |
|
scaler from sklearn.preprocessing. |
|
|
|
- 'lbfgs' uses L-BFGS-B algorithm implemented in |
|
`scipy.optimize.minimize`. It can be used only when `positive` |
|
is True. |
|
|
|
All solvers except 'svd' support both dense and sparse data. However, only |
|
'lsqr', 'sag', 'sparse_cg', and 'lbfgs' support sparse input when |
|
`fit_intercept` is True. |
|
|
|
.. versionadded:: 0.17 |
|
Stochastic Average Gradient descent solver. |
|
.. versionadded:: 0.19 |
|
SAGA solver. |
|
|
|
max_iter : int, default=None |
|
Maximum number of iterations for conjugate gradient solver. |
|
For the 'sparse_cg' and 'lsqr' solvers, the default value is determined |
|
        by scipy.sparse.linalg. For the 'sag' and 'saga' solvers, the default value is
|
1000. For 'lbfgs' solver, the default value is 15000. |
|
|
|
tol : float, default=1e-4 |
|
Precision of the solution. Note that `tol` has no effect for solvers 'svd' and |
|
'cholesky'. |
|
|
|
.. versionchanged:: 1.2 |
|
Default value changed from 1e-3 to 1e-4 for consistency with other linear |
|
models. |
|
|
|
verbose : int, default=0 |
|
Verbosity level. Setting verbose > 0 will display additional |
|
information depending on the solver used. |
|
|
|
positive : bool, default=False |
|
When set to ``True``, forces the coefficients to be positive. |
|
Only 'lbfgs' solver is supported in this case. |
|
|
|
random_state : int, RandomState instance, default=None |
|
Used when ``solver`` == 'sag' or 'saga' to shuffle the data. |
|
See :term:`Glossary <random_state>` for details. |
|
|
|
return_n_iter : bool, default=False |
|
If True, the method also returns `n_iter`, the actual number of |
|
        iterations performed by the solver.
|
|
|
.. versionadded:: 0.17 |
|
|
|
return_intercept : bool, default=False |
|
If True and if X is sparse, the method also returns the intercept, |
|
and the solver is automatically changed to 'sag'. This is only a |
|
temporary fix for fitting the intercept with sparse data. For dense |
|
data, use sklearn.linear_model._preprocess_data before your regression. |
|
|
|
.. versionadded:: 0.17 |
|
|
|
check_input : bool, default=True |
|
If False, the input arrays X and y will not be checked. |
|
|
|
.. versionadded:: 0.21 |
|
|
|
Returns |
|
------- |
|
coef : ndarray of shape (n_features,) or (n_targets, n_features) |
|
Weight vector(s). |
|
|
|
n_iter : int, optional |
|
        The actual number of iterations performed by the solver.
|
Only returned if `return_n_iter` is True. |
|
|
|
intercept : float or ndarray of shape (n_targets,) |
|
The intercept of the model. Only returned if `return_intercept` |
|
is True and if X is a scipy sparse array. |
|
|
|
Notes |
|
----- |
|
This function won't compute the intercept. |
|
|
|
Regularization improves the conditioning of the problem and |
|
reduces the variance of the estimates. Larger values specify stronger |
|
regularization. Alpha corresponds to ``1 / (2C)`` in other linear |
|
models such as :class:`~sklearn.linear_model.LogisticRegression` or |
|
:class:`~sklearn.svm.LinearSVC`. If an array is passed, penalties are |
|
assumed to be specific to the targets. Hence they must correspond in |
|
number. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.datasets import make_regression |
|
>>> from sklearn.linear_model import ridge_regression |
|
>>> rng = np.random.RandomState(0) |
|
>>> X = rng.randn(100, 4) |
|
>>> y = 2.0 * X[:, 0] - 1.0 * X[:, 1] + 0.1 * rng.standard_normal(100) |
|
>>> coef, intercept = ridge_regression(X, y, alpha=1.0, return_intercept=True) |
|
>>> list(coef) |
|
[np.float64(1.9...), np.float64(-1.0...), np.float64(-0.0...), np.float64(-0.0...)] |
|
>>> intercept |
|
np.float64(-0.0...) |
|
""" |
|
return _ridge_regression( |
|
X, |
|
y, |
|
alpha, |
|
sample_weight=sample_weight, |
|
solver=solver, |
|
max_iter=max_iter, |
|
tol=tol, |
|
verbose=verbose, |
|
positive=positive, |
|
random_state=random_state, |
|
return_n_iter=return_n_iter, |
|
return_intercept=return_intercept, |
|
X_scale=None, |
|
X_offset=None, |
|
check_input=check_input, |
|
) |
|
|
|
|
|
def _ridge_regression( |
|
X, |
|
y, |
|
alpha, |
|
sample_weight=None, |
|
solver="auto", |
|
max_iter=None, |
|
tol=1e-4, |
|
verbose=0, |
|
positive=False, |
|
random_state=None, |
|
return_n_iter=False, |
|
return_intercept=False, |
|
return_solver=False, |
|
X_scale=None, |
|
X_offset=None, |
|
check_input=True, |
|
fit_intercept=False, |
|
): |
|
xp, is_array_api_compliant, device_ = get_namespace_and_device( |
|
X, y, sample_weight, X_scale, X_offset |
|
) |
|
is_numpy_namespace = _is_numpy_namespace(xp) |
|
X_is_sparse = sparse.issparse(X) |
|
|
|
has_sw = sample_weight is not None |
|
|
|
solver = resolve_solver(solver, positive, return_intercept, X_is_sparse, xp) |
|
|
|
if is_numpy_namespace and not X_is_sparse: |
|
X = np.asarray(X) |
|
|
|
if not is_numpy_namespace and solver != "svd": |
|
raise ValueError( |
|
f"Array API dispatch to namespace {xp.__name__} only supports " |
|
f"solver 'svd'. Got '{solver}'." |
|
) |
|
|
|
if positive and solver != "lbfgs": |
|
raise ValueError( |
|
"When positive=True, only 'lbfgs' solver can be used. " |
|
f"Please change solver {solver} to 'lbfgs' " |
|
"or set positive=False." |
|
) |
|
|
|
if solver == "lbfgs" and not positive: |
|
raise ValueError( |
|
"'lbfgs' solver can be used only when positive=True. " |
|
"Please use another solver." |
|
) |
|
|
|
if return_intercept and solver != "sag": |
|
raise ValueError( |
|
"In Ridge, only 'sag' solver can directly fit the " |
|
"intercept. Please change solver to 'sag' or set " |
|
"return_intercept=False." |
|
) |
|
|
|
if check_input: |
|
_dtype = [xp.float64, xp.float32] |
|
_accept_sparse = _get_valid_accept_sparse(X_is_sparse, solver) |
|
X = check_array(X, accept_sparse=_accept_sparse, dtype=_dtype, order="C") |
|
y = check_array(y, dtype=X.dtype, ensure_2d=False, order=None) |
|
check_consistent_length(X, y) |
|
|
|
n_samples, n_features = X.shape |
|
|
|
if y.ndim > 2: |
|
raise ValueError("Target y has the wrong shape %s" % str(y.shape)) |
|
|
|
if y.ndim == 1: |
|
y = xp.reshape(y, (-1, 1)) |
|
|
|
n_samples_, n_targets = y.shape |
|
|
|
if n_samples != n_samples_: |
|
raise ValueError( |
|
"Number of samples in X and y does not correspond: %d != %d" |
|
% (n_samples, n_samples_) |
|
) |
|
|
|
if has_sw: |
|
sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) |
|
|
|
if solver not in ["sag", "saga"]: |
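            # SAG supports sample_weight directly; for the other solvers,
            # sample weights are applied by rescaling X and y.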
|
|
|
|
|
X, y, sample_weight_sqrt = _rescale_data(X, y, sample_weight) |
|
|
|
|
|
|
|
if alpha is not None and not isinstance(alpha, type(xp.asarray([0.0]))): |
|
alpha = check_scalar( |
|
alpha, |
|
"alpha", |
|
target_type=numbers.Real, |
|
min_val=0.0, |
|
include_boundaries="left", |
|
) |
|
|
|
|
|
alpha = _ravel(xp.asarray(alpha, device=device_, dtype=X.dtype), xp=xp) |
|
if alpha.shape[0] not in [1, n_targets]: |
|
raise ValueError( |
|
"Number of targets and number of penalties do not correspond: %d != %d" |
|
% (alpha.shape[0], n_targets) |
|
) |
|
|
|
if alpha.shape[0] == 1 and n_targets > 1: |
|
alpha = xp.full( |
|
shape=(n_targets,), fill_value=alpha[0], dtype=alpha.dtype, device=device_ |
|
) |
|
|
|
n_iter = None |
|
if solver == "sparse_cg": |
|
coef = _solve_sparse_cg( |
|
X, |
|
y, |
|
alpha, |
|
max_iter=max_iter, |
|
tol=tol, |
|
verbose=verbose, |
|
X_offset=X_offset, |
|
X_scale=X_scale, |
|
sample_weight_sqrt=sample_weight_sqrt if has_sw else None, |
|
) |
|
|
|
elif solver == "lsqr": |
|
coef, n_iter = _solve_lsqr( |
|
X, |
|
y, |
|
alpha=alpha, |
|
fit_intercept=fit_intercept, |
|
max_iter=max_iter, |
|
tol=tol, |
|
X_offset=X_offset, |
|
X_scale=X_scale, |
|
sample_weight_sqrt=sample_weight_sqrt if has_sw else None, |
|
) |
|
|
|
elif solver == "cholesky": |
|
if n_features > n_samples: |
|
K = safe_sparse_dot(X, X.T, dense_output=True) |
|
try: |
|
dual_coef = _solve_cholesky_kernel(K, y, alpha) |
|
|
|
coef = safe_sparse_dot(X.T, dual_coef, dense_output=True).T |
|
except linalg.LinAlgError: |
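                # use SVD solver if matrix is singular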
|
|
|
solver = "svd" |
|
else: |
|
try: |
|
coef = _solve_cholesky(X, y, alpha) |
|
except linalg.LinAlgError: |
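                # use SVD solver if matrix is singular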
|
|
|
solver = "svd" |
|
|
|
elif solver in ["sag", "saga"]: |
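        # precompute max_squared_sum for all targets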
|
|
|
max_squared_sum = row_norms(X, squared=True).max() |
|
|
|
coef = np.empty((y.shape[1], n_features), dtype=X.dtype) |
|
n_iter = np.empty(y.shape[1], dtype=np.int32) |
|
intercept = np.zeros((y.shape[1],), dtype=X.dtype) |
|
for i, (alpha_i, target) in enumerate(zip(alpha, y.T)): |
|
init = { |
|
"coef": np.zeros((n_features + int(return_intercept), 1), dtype=X.dtype) |
|
} |
|
coef_, n_iter_, _ = sag_solver( |
|
X, |
|
target.ravel(), |
|
sample_weight, |
|
"squared", |
|
alpha_i, |
|
0, |
|
max_iter, |
|
tol, |
|
verbose, |
|
random_state, |
|
False, |
|
max_squared_sum, |
|
init, |
|
is_saga=solver == "saga", |
|
) |
|
if return_intercept: |
|
coef[i] = coef_[:-1] |
|
intercept[i] = coef_[-1] |
|
else: |
|
coef[i] = coef_ |
|
n_iter[i] = n_iter_ |
|
|
|
if intercept.shape[0] == 1: |
|
intercept = intercept[0] |
|
|
|
elif solver == "lbfgs": |
|
coef = _solve_lbfgs( |
|
X, |
|
y, |
|
alpha, |
|
positive=positive, |
|
tol=tol, |
|
max_iter=max_iter, |
|
X_offset=X_offset, |
|
X_scale=X_scale, |
|
sample_weight_sqrt=sample_weight_sqrt if has_sw else None, |
|
) |
|
|
|
if solver == "svd": |
|
if X_is_sparse: |
|
raise TypeError("SVD solver does not support sparse inputs currently") |
|
coef = _solve_svd(X, y, alpha, xp) |
|
|
|
if n_targets == 1: |
|
coef = _ravel(coef) |
|
|
|
coef = xp.asarray(coef) |
|
|
|
if return_n_iter and return_intercept: |
|
res = coef, n_iter, intercept |
|
elif return_intercept: |
|
res = coef, intercept |
|
elif return_n_iter: |
|
res = coef, n_iter |
|
else: |
|
res = coef |
|
|
|
return (*res, solver) if return_solver else res |
|
|
|
|
|
def resolve_solver(solver, positive, return_intercept, is_sparse, xp): |
|
if solver != "auto": |
|
return solver |
|
|
|
is_numpy_namespace = _is_numpy_namespace(xp) |
|
|
|
auto_solver_np = resolve_solver_for_numpy(positive, return_intercept, is_sparse) |
|
if is_numpy_namespace: |
|
return auto_solver_np |
|
|
|
if positive: |
|
raise ValueError( |
|
"The solvers that support positive fitting do not support " |
|
f"Array API dispatch to namespace {xp.__name__}. Please " |
|
"either disable Array API dispatch, or use a numpy-like " |
|
"namespace, or set `positive=False`." |
|
) |
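

    # Beyond this point the namespace is not NumPy; of the available solvers,
    # only 'svd' supports Array API dispatch.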
|
|
|
|
|
solver = "svd" |
|
if solver != auto_solver_np: |
|
warnings.warn( |
|
f"Using Array API dispatch to namespace {xp.__name__} with " |
|
f"`solver='auto'` will result in using the solver '{solver}'. " |
|
"The results may differ from those when using a Numpy array, " |
|
f"because in that case the preferred solver would be {auto_solver_np}. " |
|
f"Set `solver='{solver}'` to suppress this warning." |
|
) |
|
|
|
return solver |
|
|
|
|
|
def resolve_solver_for_numpy(positive, return_intercept, is_sparse): |
|
if positive: |
|
return "lbfgs" |
|
|
|
if return_intercept: |
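        # among the NumPy solvers, only 'sag' can fit the intercept directly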
|
|
|
return "sag" |
|
|
|
if not is_sparse: |
|
return "cholesky" |
|
|
|
return "sparse_cg" |
|
|
|
|
|
class _BaseRidge(LinearModel, metaclass=ABCMeta): |
|
_parameter_constraints: dict = { |
|
"alpha": [Interval(Real, 0, None, closed="left"), np.ndarray], |
|
"fit_intercept": ["boolean"], |
|
"copy_X": ["boolean"], |
|
"max_iter": [Interval(Integral, 1, None, closed="left"), None], |
|
"tol": [Interval(Real, 0, None, closed="left")], |
|
"solver": [ |
|
StrOptions( |
|
{"auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga", "lbfgs"} |
|
) |
|
], |
|
"positive": ["boolean"], |
|
"random_state": ["random_state"], |
|
} |
|
|
|
@abstractmethod |
|
def __init__( |
|
self, |
|
alpha=1.0, |
|
*, |
|
fit_intercept=True, |
|
copy_X=True, |
|
max_iter=None, |
|
tol=1e-4, |
|
solver="auto", |
|
positive=False, |
|
random_state=None, |
|
): |
|
self.alpha = alpha |
|
self.fit_intercept = fit_intercept |
|
self.copy_X = copy_X |
|
self.max_iter = max_iter |
|
self.tol = tol |
|
self.solver = solver |
|
self.positive = positive |
|
self.random_state = random_state |
|
|
|
def fit(self, X, y, sample_weight=None): |
|
xp, is_array_api_compliant = get_namespace(X, y, sample_weight) |
|
|
|
if self.solver == "lbfgs" and not self.positive: |
|
raise ValueError( |
|
"'lbfgs' solver can be used only when positive=True. " |
|
"Please use another solver." |
|
) |
|
|
|
if self.positive: |
|
if self.solver not in ["auto", "lbfgs"]: |
|
raise ValueError( |
|
f"solver='{self.solver}' does not support positive fitting. Please" |
|
" set the solver to 'auto' or 'lbfgs', or set `positive=False`" |
|
) |
|
else: |
|
solver = self.solver |
|
elif sparse.issparse(X) and self.fit_intercept: |
|
if self.solver not in ["auto", "lbfgs", "lsqr", "sag", "sparse_cg"]: |
|
                raise ValueError(
                    "solver='{}' does not support fitting the intercept "
                    "on sparse data. Please set the solver to one of "
                    "'auto', 'lsqr', 'sparse_cg', 'sag', 'lbfgs', "
                    "or set `fit_intercept=False`".format(self.solver)
|
) |
|
if self.solver in ["lsqr", "lbfgs"]: |
|
solver = self.solver |
|
elif self.solver == "sag" and self.max_iter is None and self.tol > 1e-4: |
|
warnings.warn( |
|
'"sag" solver requires many iterations to fit ' |
|
"an intercept with sparse inputs. Either set the " |
|
'solver to "auto" or "sparse_cg", or set a low ' |
|
'"tol" and a high "max_iter" (especially if inputs are ' |
|
"not standardized)." |
|
) |
|
solver = "sag" |
|
else: |
|
solver = "sparse_cg" |
|
else: |
|
solver = self.solver |
|
|
|
if sample_weight is not None: |
|
sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) |
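

        # When X is sparse, _preprocess_data leaves X uncentered and only
        # removes the offset from y; the solvers account for X_offset
        # implicitly.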
|
|
|
|
|
X, y, X_offset, y_offset, X_scale = _preprocess_data( |
|
X, |
|
y, |
|
fit_intercept=self.fit_intercept, |
|
copy=self.copy_X, |
|
sample_weight=sample_weight, |
|
) |
|
|
|
if solver == "sag" and sparse.issparse(X) and self.fit_intercept: |
|
self.coef_, self.n_iter_, self.intercept_, self.solver_ = _ridge_regression( |
|
X, |
|
y, |
|
alpha=self.alpha, |
|
sample_weight=sample_weight, |
|
max_iter=self.max_iter, |
|
tol=self.tol, |
|
solver="sag", |
|
positive=self.positive, |
|
random_state=self.random_state, |
|
return_n_iter=True, |
|
return_intercept=True, |
|
return_solver=True, |
|
check_input=False, |
|
) |
|
|
|
self.intercept_ += y_offset |
|
|
|
else: |
|
if sparse.issparse(X) and self.fit_intercept: |
|
|
|
params = {"X_offset": X_offset, "X_scale": X_scale} |
|
else: |
|
|
|
params = {} |
|
|
|
self.coef_, self.n_iter_, self.solver_ = _ridge_regression( |
|
X, |
|
y, |
|
alpha=self.alpha, |
|
sample_weight=sample_weight, |
|
max_iter=self.max_iter, |
|
tol=self.tol, |
|
solver=solver, |
|
positive=self.positive, |
|
random_state=self.random_state, |
|
return_n_iter=True, |
|
return_intercept=False, |
|
return_solver=True, |
|
check_input=False, |
|
fit_intercept=self.fit_intercept, |
|
**params, |
|
) |
|
self._set_intercept(X_offset, y_offset, X_scale) |
|
|
|
return self |
|
|
|
|
|
class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): |
|
"""Linear least squares with l2 regularization. |
|
|
|
Minimizes the objective function:: |
|
|
|
||y - Xw||^2_2 + alpha * ||w||^2_2 |
|
|
|
This model solves a regression model where the loss function is |
|
the linear least squares function and regularization is given by |
|
the l2-norm. Also known as Ridge Regression or Tikhonov regularization. |
|
This estimator has built-in support for multi-variate regression |
|
(i.e., when y is a 2d-array of shape (n_samples, n_targets)). |
|
|
|
Read more in the :ref:`User Guide <ridge_regression>`. |
|
|
|
Parameters |
|
---------- |
|
alpha : {float, ndarray of shape (n_targets,)}, default=1.0 |
|
Constant that multiplies the L2 term, controlling regularization |
|
strength. `alpha` must be a non-negative float i.e. in `[0, inf)`. |
|
|
|
When `alpha = 0`, the objective is equivalent to ordinary least |
|
squares, solved by the :class:`LinearRegression` object. For numerical |
|
reasons, using `alpha = 0` with the `Ridge` object is not advised. |
|
Instead, you should use the :class:`LinearRegression` object. |
|
|
|
If an array is passed, penalties are assumed to be specific to the |
|
targets. Hence they must correspond in number. |
|
|
|
fit_intercept : bool, default=True |
|
Whether to fit the intercept for this model. If set |
|
to false, no intercept will be used in calculations |
|
(i.e. ``X`` and ``y`` are expected to be centered). |
|
|
|
copy_X : bool, default=True |
|
If True, X will be copied; else, it may be overwritten. |
|
|
|
max_iter : int, default=None |
|
Maximum number of iterations for conjugate gradient solver. |
|
For 'sparse_cg' and 'lsqr' solvers, the default value is determined |
|
by scipy.sparse.linalg. For 'sag' solver, the default value is 1000. |
|
For 'lbfgs' solver, the default value is 15000. |
|
|
|
tol : float, default=1e-4 |
|
The precision of the solution (`coef_`) is determined by `tol` which |
|
specifies a different convergence criterion for each solver: |
|
|
|
- 'svd': `tol` has no impact. |
|
|
|
- 'cholesky': `tol` has no impact. |
|
|
|
- 'sparse_cg': norm of residuals smaller than `tol`. |
|
|
|
- 'lsqr': `tol` is set as atol and btol of scipy.sparse.linalg.lsqr, |
|
which control the norm of the residual vector in terms of the norms of |
|
matrix and coefficients. |
|
|
|
- 'sag' and 'saga': relative change of coef smaller than `tol`. |
|
|
|
        - 'lbfgs': maximum of the absolute (projected) gradient, i.e.
          max|residuals|, smaller than `tol`.
|
|
|
.. versionchanged:: 1.2 |
|
Default value changed from 1e-3 to 1e-4 for consistency with other linear |
|
models. |
|
|
|
solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', \ |
|
'sag', 'saga', 'lbfgs'}, default='auto' |
|
Solver to use in the computational routines: |
|
|
|
- 'auto' chooses the solver automatically based on the type of data. |
|
|
|
- 'svd' uses a Singular Value Decomposition of X to compute the Ridge |
|
coefficients. It is the most stable solver, in particular more stable |
|
for singular matrices than 'cholesky' at the cost of being slower. |
|
|
|
- 'cholesky' uses the standard scipy.linalg.solve function to |
|
obtain a closed-form solution. |
|
|
|
- 'sparse_cg' uses the conjugate gradient solver as found in |
|
scipy.sparse.linalg.cg. As an iterative algorithm, this solver is |
|
more appropriate than 'cholesky' for large-scale data |
|
(possibility to set `tol` and `max_iter`). |
|
|
|
- 'lsqr' uses the dedicated regularized least-squares routine |
|
scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative |
|
procedure. |
|
|
|
- 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses |
|
its improved, unbiased version named SAGA. Both methods also use an |
|
iterative procedure, and are often faster than other solvers when |
|
both n_samples and n_features are large. Note that 'sag' and |
|
'saga' fast convergence is only guaranteed on features with |
|
approximately the same scale. You can preprocess the data with a |
|
scaler from sklearn.preprocessing. |
|
|
|
- 'lbfgs' uses L-BFGS-B algorithm implemented in |
|
`scipy.optimize.minimize`. It can be used only when `positive` |
|
is True. |
|
|
|
All solvers except 'svd' support both dense and sparse data. However, only |
|
'lsqr', 'sag', 'sparse_cg', and 'lbfgs' support sparse input when |
|
`fit_intercept` is True. |
|
|
|
.. versionadded:: 0.17 |
|
Stochastic Average Gradient descent solver. |
|
.. versionadded:: 0.19 |
|
SAGA solver. |
|
|
|
positive : bool, default=False |
|
When set to ``True``, forces the coefficients to be positive. |
|
Only 'lbfgs' solver is supported in this case. |
|
|
|
random_state : int, RandomState instance, default=None |
|
Used when ``solver`` == 'sag' or 'saga' to shuffle the data. |
|
See :term:`Glossary <random_state>` for details. |
|
|
|
.. versionadded:: 0.17 |
|
`random_state` to support Stochastic Average Gradient. |
|
|
|
Attributes |
|
---------- |
|
coef_ : ndarray of shape (n_features,) or (n_targets, n_features) |
|
Weight vector(s). |
|
|
|
intercept_ : float or ndarray of shape (n_targets,) |
|
Independent term in decision function. Set to 0.0 if |
|
``fit_intercept = False``. |
|
|
|
n_iter_ : None or ndarray of shape (n_targets,) |
|
Actual number of iterations for each target. Available only for |
|
sag and lsqr solvers. Other solvers will return None. |
|
|
|
.. versionadded:: 0.17 |
|
|
|
n_features_in_ : int |
|
Number of features seen during :term:`fit`. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
feature_names_in_ : ndarray of shape (`n_features_in_`,) |
|
Names of features seen during :term:`fit`. Defined only when `X` |
|
has feature names that are all strings. |
|
|
|
.. versionadded:: 1.0 |
|
|
|
solver_ : str |
|
The solver that was used at fit time by the computational |
|
routines. |
|
|
|
.. versionadded:: 1.5 |
|
|
|
See Also |
|
-------- |
|
RidgeClassifier : Ridge classifier. |
|
RidgeCV : Ridge regression with built-in cross validation. |
|
:class:`~sklearn.kernel_ridge.KernelRidge` : Kernel ridge regression |
|
combines ridge regression with the kernel trick. |
|
|
|
Notes |
|
----- |
|
Regularization improves the conditioning of the problem and |
|
reduces the variance of the estimates. Larger values specify stronger |
|
regularization. Alpha corresponds to ``1 / (2C)`` in other linear |
|
models such as :class:`~sklearn.linear_model.LogisticRegression` or |
|
:class:`~sklearn.svm.LinearSVC`. |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.linear_model import Ridge |
|
>>> import numpy as np |
|
>>> n_samples, n_features = 10, 5 |
|
>>> rng = np.random.RandomState(0) |
|
>>> y = rng.randn(n_samples) |
|
>>> X = rng.randn(n_samples, n_features) |
|
>>> clf = Ridge(alpha=1.0) |
|
>>> clf.fit(X, y) |
|
Ridge() |
|
""" |
|
|
|
def __init__( |
|
self, |
|
alpha=1.0, |
|
*, |
|
fit_intercept=True, |
|
copy_X=True, |
|
max_iter=None, |
|
tol=1e-4, |
|
solver="auto", |
|
positive=False, |
|
random_state=None, |
|
): |
|
super().__init__( |
|
alpha=alpha, |
|
fit_intercept=fit_intercept, |
|
copy_X=copy_X, |
|
max_iter=max_iter, |
|
tol=tol, |
|
solver=solver, |
|
positive=positive, |
|
random_state=random_state, |
|
) |
|
|
|
@_fit_context(prefer_skip_nested_validation=True) |
|
def fit(self, X, y, sample_weight=None): |
|
"""Fit Ridge regression model. |
|
|
|
Parameters |
|
---------- |
|
X : {ndarray, sparse matrix} of shape (n_samples, n_features) |
|
Training data. |
|
|
|
y : ndarray of shape (n_samples,) or (n_samples, n_targets) |
|
Target values. |
|
|
|
sample_weight : float or ndarray of shape (n_samples,), default=None |
|
Individual weights for each sample. If given a float, every sample |
|
will have the same weight. |
|
|
|
Returns |
|
------- |
|
self : object |
|
Fitted estimator. |
|
""" |
|
_accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) |
|
xp, _ = get_namespace(X, y, sample_weight) |
|
X, y = validate_data( |
|
self, |
|
X, |
|
y, |
|
accept_sparse=_accept_sparse, |
|
dtype=[xp.float64, xp.float32], |
|
force_writeable=True, |
|
multi_output=True, |
|
y_numeric=True, |
|
) |
|
return super().fit(X, y, sample_weight=sample_weight) |
|
|
|
def __sklearn_tags__(self): |
|
tags = super().__sklearn_tags__() |
|
tags.array_api_support = True |
|
tags.input_tags.sparse = (self.solver != "svd") and ( |
|
self.solver != "cholesky" or not self.fit_intercept |
|
) |
|
return tags |
|
|
|
|
|
class _RidgeClassifierMixin(LinearClassifierMixin): |
|
def _prepare_data(self, X, y, sample_weight, solver): |
|
"""Validate `X` and `y` and binarize `y`. |
|
|
|
Parameters |
|
---------- |
|
X : {ndarray, sparse matrix} of shape (n_samples, n_features) |
|
Training data. |
|
|
|
y : ndarray of shape (n_samples,) |
|
Target values. |
|
|
|
sample_weight : float or ndarray of shape (n_samples,), default=None |
|
Individual weights for each sample. If given a float, every sample |
|
will have the same weight. |
|
|
|
solver : str |
|
The solver used in `Ridge` to know which sparse format to support. |
|
|
|
Returns |
|
------- |
|
X : {ndarray, sparse matrix} of shape (n_samples, n_features) |
|
Validated training data. |
|
|
|
y : ndarray of shape (n_samples,) |
|
Validated target values. |
|
|
|
sample_weight : ndarray of shape (n_samples,) |
|
Validated sample weights. |
|
|
|
Y : ndarray of shape (n_samples, n_classes) |
|
The binarized version of `y`. |
|
""" |
|
accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), solver) |
|
X, y = validate_data( |
|
self, |
|
X, |
|
y, |
|
accept_sparse=accept_sparse, |
|
multi_output=True, |
|
y_numeric=False, |
|
force_writeable=True, |
|
) |
|
|
|
self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) |
|
Y = self._label_binarizer.fit_transform(y) |
|
if not self._label_binarizer.y_type_.startswith("multilabel"): |
|
y = column_or_1d(y, warn=True) |
|
|
|
sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) |
|
if self.class_weight: |
|
sample_weight = sample_weight * compute_sample_weight(self.class_weight, y) |
|
return X, y, sample_weight, Y |
|
|
|
def predict(self, X): |
|
"""Predict class labels for samples in `X`. |
|
|
|
Parameters |
|
---------- |
|
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
The data matrix for which we want to predict the targets. |
|
|
|
Returns |
|
------- |
|
y_pred : ndarray of shape (n_samples,) or (n_samples, n_outputs) |
|
Vector or matrix containing the predictions. In binary and |
|
multiclass problems, this is a vector containing `n_samples`. In |
|
a multilabel problem, it returns a matrix of shape |
|
`(n_samples, n_outputs)`. |
|
""" |
|
check_is_fitted(self, attributes=["_label_binarizer"]) |
|
if self._label_binarizer.y_type_.startswith("multilabel"): |
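            # Threshold such that the negative label is -1 and the positive
            # label is 1, matching the inverse transform of the label
            # binarizer below.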
|
|
|
|
|
|
|
scores = 2 * (self.decision_function(X) > 0) - 1 |
|
return self._label_binarizer.inverse_transform(scores) |
|
return super().predict(X) |
|
|
|
@property |
|
    def classes_(self):

        """Class labels."""
|
return self._label_binarizer.classes_ |
|
|
|
def __sklearn_tags__(self): |
|
tags = super().__sklearn_tags__() |
|
tags.classifier_tags.multi_label = True |
|
return tags |
|
|
|
|
|
class RidgeClassifier(_RidgeClassifierMixin, _BaseRidge): |
|
"""Classifier using Ridge regression. |
|
|
|
This classifier first converts the target values into ``{-1, 1}`` and |
|
then treats the problem as a regression task (multi-output regression in |
|
the multiclass case). |
|
|
|
Read more in the :ref:`User Guide <ridge_regression>`. |
|
|
|
Parameters |
|
---------- |
|
alpha : float, default=1.0 |
|
Regularization strength; must be a positive float. Regularization |
|
improves the conditioning of the problem and reduces the variance of |
|
the estimates. Larger values specify stronger regularization. |
|
Alpha corresponds to ``1 / (2C)`` in other linear models such as |
|
:class:`~sklearn.linear_model.LogisticRegression` or |
|
:class:`~sklearn.svm.LinearSVC`. |
|
|
|
fit_intercept : bool, default=True |
|
Whether to calculate the intercept for this model. If set to false, no |
|
intercept will be used in calculations (e.g. data is expected to be |
|
already centered). |
|
|
|
copy_X : bool, default=True |
|
If True, X will be copied; else, it may be overwritten. |
|
|
|
max_iter : int, default=None |
|
Maximum number of iterations for conjugate gradient solver. |
|
The default value is determined by scipy.sparse.linalg. |
|
|
|
tol : float, default=1e-4 |
|
The precision of the solution (`coef_`) is determined by `tol` which |
|
specifies a different convergence criterion for each solver: |
|
|
|
- 'svd': `tol` has no impact. |
|
|
|
- 'cholesky': `tol` has no impact. |
|
|
|
- 'sparse_cg': norm of residuals smaller than `tol`. |
|
|
|
- 'lsqr': `tol` is set as atol and btol of scipy.sparse.linalg.lsqr, |
|
which control the norm of the residual vector in terms of the norms of |
|
matrix and coefficients. |
|
|
|
- 'sag' and 'saga': relative change of coef smaller than `tol`. |
|
|
|
        - 'lbfgs': maximum of the absolute (projected) gradient, i.e.
          max|residuals|, smaller than `tol`.
|
|
|
.. versionchanged:: 1.2 |
|
Default value changed from 1e-3 to 1e-4 for consistency with other linear |
|
models. |
|
|
|
class_weight : dict or 'balanced', default=None |
|
Weights associated with classes in the form ``{class_label: weight}``. |
|
If not given, all classes are supposed to have weight one. |
|
|
|
The "balanced" mode uses the values of y to automatically adjust |
|
weights inversely proportional to class frequencies in the input data |
|
as ``n_samples / (n_classes * np.bincount(y))``. |
|
|
|
solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', \ |
|
'sag', 'saga', 'lbfgs'}, default='auto' |
|
Solver to use in the computational routines: |
|
|
|
- 'auto' chooses the solver automatically based on the type of data. |
|
|
|
- 'svd' uses a Singular Value Decomposition of X to compute the Ridge |
|
coefficients. It is the most stable solver, in particular more stable |
|
for singular matrices than 'cholesky' at the cost of being slower. |
|
|
|
- 'cholesky' uses the standard scipy.linalg.solve function to |
|
obtain a closed-form solution. |
|
|
|
- 'sparse_cg' uses the conjugate gradient solver as found in |
|
scipy.sparse.linalg.cg. As an iterative algorithm, this solver is |
|
more appropriate than 'cholesky' for large-scale data |
|
(possibility to set `tol` and `max_iter`). |
|
|
|
- 'lsqr' uses the dedicated regularized least-squares routine |
|
scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative |
|
procedure. |
|
|
|
- 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses |
|
its unbiased and more flexible version named SAGA. Both methods |
|
use an iterative procedure, and are often faster than other solvers |
|
when both n_samples and n_features are large. Note that 'sag' and |
|
'saga' fast convergence is only guaranteed on features with |
|
approximately the same scale. You can preprocess the data with a |
|
scaler from sklearn.preprocessing. |
|
|
|
.. versionadded:: 0.17 |
|
Stochastic Average Gradient descent solver. |
|
.. versionadded:: 0.19 |
|
SAGA solver. |
|
|
|
- 'lbfgs' uses L-BFGS-B algorithm implemented in |
|
`scipy.optimize.minimize`. It can be used only when `positive` |
|
is True. |
|
|
|
positive : bool, default=False |
|
When set to ``True``, forces the coefficients to be positive. |
|
Only 'lbfgs' solver is supported in this case. |
|
|
|
random_state : int, RandomState instance, default=None |
|
Used when ``solver`` == 'sag' or 'saga' to shuffle the data. |
|
See :term:`Glossary <random_state>` for details. |
|
|
|
Attributes |
|
---------- |
|
coef_ : ndarray of shape (1, n_features) or (n_classes, n_features) |
|
Coefficient of the features in the decision function. |
|
|
|
``coef_`` is of shape (1, n_features) when the given problem is binary. |
|
|
|
intercept_ : float or ndarray of shape (n_targets,) |
|
Independent term in decision function. Set to 0.0 if |
|
``fit_intercept = False``. |
|
|
|
n_iter_ : None or ndarray of shape (n_targets,) |
|
Actual number of iterations for each target. Available only for |
|
sag and lsqr solvers. Other solvers will return None. |
|
|
|
classes_ : ndarray of shape (n_classes,) |
|
        The class labels.
|
|
|
n_features_in_ : int |
|
Number of features seen during :term:`fit`. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
feature_names_in_ : ndarray of shape (`n_features_in_`,) |
|
Names of features seen during :term:`fit`. Defined only when `X` |
|
has feature names that are all strings. |
|
|
|
.. versionadded:: 1.0 |
|
|
|
solver_ : str |
|
The solver that was used at fit time by the computational |
|
routines. |
|
|
|
.. versionadded:: 1.5 |
|
|
|
See Also |
|
-------- |
|
Ridge : Ridge regression. |
|
RidgeClassifierCV : Ridge classifier with built-in cross validation. |
|
|
|
Notes |
|
----- |
|
For multi-class classification, n_class classifiers are trained in |
|
a one-versus-all approach. Concretely, this is implemented by taking |
|
advantage of the multi-variate response support in Ridge. |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.datasets import load_breast_cancer |
|
>>> from sklearn.linear_model import RidgeClassifier |
|
>>> X, y = load_breast_cancer(return_X_y=True) |
|
>>> clf = RidgeClassifier().fit(X, y) |
|
>>> clf.score(X, y) |
|
0.9595... |
|
""" |
|
|
|
_parameter_constraints: dict = { |
|
**_BaseRidge._parameter_constraints, |
|
"class_weight": [dict, StrOptions({"balanced"}), None], |
|
} |
|
|
|
def __init__( |
|
self, |
|
alpha=1.0, |
|
*, |
|
fit_intercept=True, |
|
copy_X=True, |
|
max_iter=None, |
|
tol=1e-4, |
|
class_weight=None, |
|
solver="auto", |
|
positive=False, |
|
random_state=None, |
|
): |
|
super().__init__( |
|
alpha=alpha, |
|
fit_intercept=fit_intercept, |
|
copy_X=copy_X, |
|
max_iter=max_iter, |
|
tol=tol, |
|
solver=solver, |
|
positive=positive, |
|
random_state=random_state, |
|
) |
|
self.class_weight = class_weight |
|
|
|
@_fit_context(prefer_skip_nested_validation=True) |
|
def fit(self, X, y, sample_weight=None): |
|
"""Fit Ridge classifier model. |
|
|
|
Parameters |
|
---------- |
|
X : {ndarray, sparse matrix} of shape (n_samples, n_features) |
|
Training data. |
|
|
|
y : ndarray of shape (n_samples,) |
|
Target values. |
|
|
|
sample_weight : float or ndarray of shape (n_samples,), default=None |
|
Individual weights for each sample. If given a float, every sample |
|
will have the same weight. |
|
|
|
.. versionadded:: 0.17 |
|
*sample_weight* support to RidgeClassifier. |
|
|
|
Returns |
|
------- |
|
self : object |
|
Instance of the estimator. |
|
""" |
|
X, y, sample_weight, Y = self._prepare_data(X, y, sample_weight, self.solver) |
|
|
|
super().fit(X, Y, sample_weight=sample_weight) |
|
return self |
|
|
|
def __sklearn_tags__(self): |
|
tags = super().__sklearn_tags__() |
|
tags.input_tags.sparse = (self.solver != "svd") and ( |
|
self.solver != "cholesky" or not self.fit_intercept |
|
) |
|
return tags |
|
|
|
|
|
def _check_gcv_mode(X, gcv_mode): |
|
if gcv_mode in ["eigen", "svd"]: |
|
return gcv_mode |
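

    # if X has more rows than columns, decompose X^T.X ('svd' mode);
    # otherwise decompose X.X^T ('eigen' mode)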
|
|
|
|
|
if X.shape[0] > X.shape[1]: |
|
return "svd" |
|
return "eigen" |
|
|
|
|
|
def _find_smallest_angle(query, vectors): |
|
"""Find the column of vectors that is most aligned with the query. |
|
|
|
Both query and the columns of vectors must have their l2 norm equal to 1. |
|
|
|
Parameters |
|
---------- |
|
query : ndarray of shape (n_samples,) |
|
Normalized query vector. |
|
|
|
vectors : ndarray of shape (n_samples, n_features) |
|
Vectors to which we compare query, as columns. Must be normalized. |
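
    A tiny sketch (the columns of ``vectors`` must be unit-normalized):

    >>> import numpy as np
    >>> int(_find_smallest_angle(np.array([0.0, 1.0, 0.0]), np.eye(3)))
    1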
|
""" |
|
abs_cosine = np.abs(query.dot(vectors)) |
|
index = np.argmax(abs_cosine) |
|
return index |
|
|
|
|
|
class _X_CenterStackOp(sparse.linalg.LinearOperator): |
|
"""Behaves as centered and scaled X with an added intercept column. |
|
|
|
This operator behaves as |
|
np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]]) |
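
    A quick check of that contract (an illustrative sketch with dense inputs):

    >>> import numpy as np
    >>> X = np.arange(6.0).reshape(3, 2)
    >>> X_mean, sqrt_sw = X.mean(axis=0), np.ones(3)
    >>> op = _X_CenterStackOp(X, X_mean, sqrt_sw)
    >>> explicit = np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]])
    >>> v = np.array([1.0, -1.0, 0.5])
    >>> bool(np.allclose(op.matvec(v), explicit @ v))
    True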
|
""" |
|
|
|
def __init__(self, X, X_mean, sqrt_sw): |
|
n_samples, n_features = X.shape |
|
super().__init__(X.dtype, (n_samples, n_features + 1)) |
|
self.X = X |
|
self.X_mean = X_mean |
|
self.sqrt_sw = sqrt_sw |
|
|
|
def _matvec(self, v): |
|
v = v.ravel() |
|
return ( |
|
safe_sparse_dot(self.X, v[:-1], dense_output=True) |
|
- self.sqrt_sw * self.X_mean.dot(v[:-1]) |
|
+ v[-1] * self.sqrt_sw |
|
) |
|
|
|
def _matmat(self, v): |
|
return ( |
|
safe_sparse_dot(self.X, v[:-1], dense_output=True) |
|
- self.sqrt_sw[:, None] * self.X_mean.dot(v[:-1]) |
|
+ v[-1] * self.sqrt_sw[:, None] |
|
) |
|
|
|
def _transpose(self): |
|
return _XT_CenterStackOp(self.X, self.X_mean, self.sqrt_sw) |
|
|
|
|
|
class _XT_CenterStackOp(sparse.linalg.LinearOperator): |
|
"""Behaves as transposed centered and scaled X with an intercept column. |
|
|
|
This operator behaves as |
|
np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]]).T |
|
""" |
|
|
|
def __init__(self, X, X_mean, sqrt_sw): |
|
n_samples, n_features = X.shape |
|
super().__init__(X.dtype, (n_features + 1, n_samples)) |
|
self.X = X |
|
self.X_mean = X_mean |
|
self.sqrt_sw = sqrt_sw |
|
|
|
def _matvec(self, v): |
|
v = v.ravel() |
|
n_features = self.shape[0] |
|
res = np.empty(n_features, dtype=self.X.dtype) |
|
res[:-1] = safe_sparse_dot(self.X.T, v, dense_output=True) - ( |
|
self.X_mean * self.sqrt_sw.dot(v) |
|
) |
|
res[-1] = np.dot(v, self.sqrt_sw) |
|
return res |
|
|
|
def _matmat(self, v): |
|
n_features = self.shape[0] |
|
res = np.empty((n_features, v.shape[1]), dtype=self.X.dtype) |
|
res[:-1] = safe_sparse_dot(self.X.T, v, dense_output=True) - self.X_mean[ |
|
:, None |
|
] * self.sqrt_sw.dot(v) |
|
res[-1] = np.dot(self.sqrt_sw, v) |
|
return res |
|
|
|
|
|
class _IdentityRegressor(RegressorMixin, BaseEstimator): |
|
"""Fake regressor which will directly output the prediction.""" |
|
|
|
def decision_function(self, y_predict): |
|
return y_predict |
|
|
|
def predict(self, y_predict): |
|
return y_predict |
|
|
|
|
|
class _IdentityClassifier(LinearClassifierMixin, BaseEstimator): |
|
"""Fake classifier which will directly output the prediction. |
|
|
|
We inherit from LinearClassifierMixin to get the proper shape for the |
|
output `y`. |
|
""" |
|
|
|
def __init__(self, classes): |
|
self.classes_ = classes |
|
|
|
def decision_function(self, y_predict): |
|
return y_predict |
|
|
|
|
|
class _RidgeGCV(LinearModel): |
|
"""Ridge regression with built-in Leave-one-out Cross-Validation. |
|
|
|
This class is not intended to be used directly. Use RidgeCV instead. |
|
|
|
`_RidgeGCV` uses a Generalized Cross-Validation for model selection. It's an |
|
efficient approximation of leave-one-out cross-validation (LOO-CV), where instead of |
|
computing multiple models by excluding one data point at a time, it uses an |
|
algebraic shortcut to approximate the LOO-CV error, making it faster and |
|
computationally more efficient. |
|
|
|
    In contrast, a naive grid-search approach with leave-one-out
    cross-validation requires fitting `n_samples` models to compute the
    prediction error for each sample, and then repeating this process for each
    alpha in the grid.
|
|
|
Here, the prediction error for each sample is computed by solving a **single** |
|
linear system (in other words a single model) via a matrix factorization (i.e. |
|
eigendecomposition or SVD) solving the problem stated in the Notes section. Finally, |
|
we need to repeat this process for each alpha in the grid. The detailed complexity |
|
is further discussed in Sect. 4 in [1]. |
|
|
|
This algebraic approach is only applicable for regularized least squares |
|
problems. It could potentially be extended to kernel ridge regression. |
|
|
|
See the Notes section and references for more details regarding the formulation |
|
and the linear system that is solved. |
|
|
|
Notes |
|
----- |
|
|
|
We want to solve (K + alpha*Id)c = y, |
|
where K = X X^T is the kernel matrix. |
|
|
|
Let G = (K + alpha*Id). |
|
|
|
Dual solution: c = G^-1y |
|
Primal solution: w = X^T c |
|
|
|
Compute eigendecomposition K = Q V Q^T. |
|
Then G^-1 = Q (V + alpha*Id)^-1 Q^T, |
|
where (V + alpha*Id) is diagonal. |
|
    It is thus inexpensive to invert for many alphas.
|
|
|
Let loov be the vector of prediction values for each example |
|
when the model was fitted with all examples but this example. |
|
|
|
loov = (KG^-1Y - diag(KG^-1)Y) / diag(I-KG^-1) |
|
|
|
Let looe be the vector of prediction errors for each example |
|
when the model was fitted with all examples but this example. |
|
|
|
looe = y - loov = c / diag(G^-1) |
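
    A small numerical check of this identity (an illustrative sketch without
    an intercept; G is formed explicitly here, which this class avoids):

    >>> import numpy as np
    >>> rng = np.random.RandomState(0)
    >>> X, y, alpha = rng.randn(5, 3), rng.randn(5), 1.0
    >>> G_inv = np.linalg.inv(X @ X.T + alpha * np.eye(5))
    >>> c = G_inv @ y
    >>> looe = c / np.diag(G_inv)
    >>> loo = [y[i] - X[i] @ np.linalg.solve(
    ...     np.delete(X, i, 0).T @ np.delete(X, i, 0) + alpha * np.eye(3),
    ...     np.delete(X, i, 0).T @ np.delete(y, i)) for i in range(5)]
    >>> bool(np.allclose(looe, loo))
    True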
|
|
|
The best score (negative mean squared error or user-provided scoring) is |
|
stored in the `best_score_` attribute, and the selected hyperparameter in |
|
`alpha_`. |
|
|
|
References |
|
---------- |
|
[1] http://cbcl.mit.edu/publications/ps/MIT-CSAIL-TR-2007-025.pdf |
|
[2] https://www.mit.edu/~9.520/spring07/Classes/rlsslides.pdf |
|
""" |
|
|
|
def __init__( |
|
self, |
|
alphas=(0.1, 1.0, 10.0), |
|
*, |
|
fit_intercept=True, |
|
scoring=None, |
|
copy_X=True, |
|
gcv_mode=None, |
|
store_cv_results=False, |
|
is_clf=False, |
|
alpha_per_target=False, |
|
): |
|
self.alphas = alphas |
|
self.fit_intercept = fit_intercept |
|
self.scoring = scoring |
|
self.copy_X = copy_X |
|
self.gcv_mode = gcv_mode |
|
self.store_cv_results = store_cv_results |
|
self.is_clf = is_clf |
|
self.alpha_per_target = alpha_per_target |
|
|
|
@staticmethod |
|
def _decomp_diag(v_prime, Q): |
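        # compute the diagonal of Q @ diag(v_prime) @ Q.T without forming it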
|
|
|
return (v_prime * Q**2).sum(axis=-1) |
|
|
|
@staticmethod |
|
def _diag_dot(D, B): |
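        # compute diag(D) @ B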
|
|
|
if len(B.shape) > 1: |
|
|
|
D = D[(slice(None),) + (np.newaxis,) * (len(B.shape) - 1)] |
|
return D * B |
|
|
|
def _compute_gram(self, X, sqrt_sw): |
|
"""Computes the Gram matrix XX^T with possible centering. |
|
|
|
Parameters |
|
---------- |
|
X : {ndarray, sparse matrix} of shape (n_samples, n_features) |
|
The preprocessed design matrix. |
|
|
|
sqrt_sw : ndarray of shape (n_samples,) |
|
square roots of sample weights |
|
|
|
Returns |
|
------- |
|
gram : ndarray of shape (n_samples, n_samples) |
|
The Gram matrix. |
|
        X_mean : ndarray of shape (n_features,)
|
The weighted mean of ``X`` for each feature. |
|
|
|
Notes |
|
----- |
|
When X is dense the centering has been done in preprocessing |
|
so the mean is 0 and we just compute XX^T. |
|
|
|
When X is sparse it has not been centered in preprocessing, but it has |
|
been scaled by sqrt(sample weights). |
|
|
|
When self.fit_intercept is False no centering is done. |
|
|
|
The centered X is never actually computed because centering would break |
|
the sparsity of X. |
|
""" |
|
center = self.fit_intercept and sparse.issparse(X) |
|
if not center: |
|
|
|
|
|
X_mean = np.zeros(X.shape[1], dtype=X.dtype) |
|
return safe_sparse_dot(X, X.T, dense_output=True), X_mean |
|
|
|
n_samples = X.shape[0] |
|
sample_weight_matrix = sparse.dia_matrix( |
|
(sqrt_sw, 0), shape=(n_samples, n_samples) |
|
) |
|
X_weighted = sample_weight_matrix.dot(X) |
|
X_mean, _ = mean_variance_axis(X_weighted, axis=0) |
|
X_mean *= n_samples / sqrt_sw.dot(sqrt_sw) |
|
X_mX = sqrt_sw[:, None] * safe_sparse_dot(X_mean, X.T, dense_output=True) |
|
X_mX_m = np.outer(sqrt_sw, sqrt_sw) * np.dot(X_mean, X_mean) |
|
return ( |
|
safe_sparse_dot(X, X.T, dense_output=True) + X_mX_m - X_mX - X_mX.T, |
|
X_mean, |
|
) |
|
|
|
def _compute_covariance(self, X, sqrt_sw): |
|
"""Computes covariance matrix X^TX with possible centering. |
|
|
|
Parameters |
|
---------- |
|
X : sparse matrix of shape (n_samples, n_features) |
|
The preprocessed design matrix. |
|
|
|
sqrt_sw : ndarray of shape (n_samples,) |
|
square roots of sample weights |
|
|
|
Returns |
|
------- |
|
covariance : ndarray of shape (n_features, n_features) |
|
The covariance matrix. |
|
        X_mean : ndarray of shape (n_features,)
|
The weighted mean of ``X`` for each feature. |
|
|
|
Notes |
|
----- |
|
Since X is sparse it has not been centered in preprocessing, but it has |
|
been scaled by sqrt(sample weights). |
|
|
|
When self.fit_intercept is False no centering is done. |
|
|
|
The centered X is never actually computed because centering would break |
|
the sparsity of X. |
|
""" |
|
if not self.fit_intercept: |
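            # without an intercept there is no centering: report zero means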
|
|
|
|
|
X_mean = np.zeros(X.shape[1], dtype=X.dtype) |
|
return safe_sparse_dot(X.T, X, dense_output=True), X_mean |
|
|
|
n_samples = X.shape[0] |
|
sample_weight_matrix = sparse.dia_matrix( |
|
(sqrt_sw, 0), shape=(n_samples, n_samples) |
|
) |
|
X_weighted = sample_weight_matrix.dot(X) |
|
X_mean, _ = mean_variance_axis(X_weighted, axis=0) |
|
X_mean = X_mean * n_samples / sqrt_sw.dot(sqrt_sw) |
|
weight_sum = sqrt_sw.dot(sqrt_sw) |
|
return ( |
|
safe_sparse_dot(X.T, X, dense_output=True) |
|
- weight_sum * np.outer(X_mean, X_mean), |
|
X_mean, |
|
) |
|
|
|
def _sparse_multidot_diag(self, X, A, X_mean, sqrt_sw): |
|
"""Compute the diagonal of (X - X_mean).dot(A).dot((X - X_mean).T) |
|
        without explicitly centering X or computing X.dot(A)
|
when X is sparse. |
|
|
|
Parameters |
|
---------- |
|
X : sparse matrix of shape (n_samples, n_features) |
|
|
|
A : ndarray of shape (n_features, n_features) |
|
|
|
X_mean : ndarray of shape (n_features,) |
|
|
|
        sqrt_sw : ndarray of shape (n_samples,)

            square roots of sample weights
|
|
|
Returns |
|
------- |
|
        diag : ndarray of shape (n_samples,)
|
The computed diagonal. |
|
""" |
|
intercept_col = scale = sqrt_sw |
|
batch_size = X.shape[1] |
|
diag = np.empty(X.shape[0], dtype=X.dtype) |
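        # densify X in row batches so the temporary blocks stay small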
|
for start in range(0, X.shape[0], batch_size): |
|
batch = slice(start, min(X.shape[0], start + batch_size), 1) |
|
X_batch = np.empty( |
|
(X[batch].shape[0], X.shape[1] + self.fit_intercept), dtype=X.dtype |
|
) |
|
if self.fit_intercept: |
|
X_batch[:, :-1] = X[batch].toarray() - X_mean * scale[batch][:, None] |
|
X_batch[:, -1] = intercept_col[batch] |
|
else: |
|
X_batch = X[batch].toarray() |
|
diag[batch] = (X_batch.dot(A) * X_batch).sum(axis=1) |
|
return diag |
|
|
|
def _eigen_decompose_gram(self, X, y, sqrt_sw): |
|
"""Eigendecomposition of X.X^T, used when n_samples <= n_features.""" |
|
|
|
K, X_mean = self._compute_gram(X, sqrt_sw) |
|
if self.fit_intercept: |
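            # emulate the centered intercept column: appending sqrt_sw to X
            # adds outer(sqrt_sw, sqrt_sw) to the Gram matrix; that
            # dimension is then handled separately in _solve_eigen_gram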
|
|
|
|
|
|
|
|
|
K += np.outer(sqrt_sw, sqrt_sw) |
|
eigvals, Q = linalg.eigh(K) |
|
QT_y = np.dot(Q.T, y) |
|
return X_mean, eigvals, Q, QT_y |
|
|
|
def _solve_eigen_gram(self, alpha, y, sqrt_sw, X_mean, eigvals, Q, QT_y): |
|
"""Compute dual coefficients and diagonal of G^-1. |
|
|
|
Used when we have a decomposition of X.X^T (n_samples <= n_features). |
|
""" |
|
w = 1.0 / (eigvals + alpha) |
|
if self.fit_intercept: |
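            # the vector containing the square roots of the sample weights
            # (all ones when samples are unweighted) is the eigenvector of
            # X.X^T which corresponds to the intercept; we cancel the
            # regularization on this dimension (its eigenvalue is the total
            # sample weight)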
|
|
|
|
|
|
|
|
|
|
|
normalized_sw = sqrt_sw / np.linalg.norm(sqrt_sw) |
|
intercept_dim = _find_smallest_angle(normalized_sw, Q) |
|
w[intercept_dim] = 0 |
|
|
|
c = np.dot(Q, self._diag_dot(w, QT_y)) |
|
G_inverse_diag = self._decomp_diag(w, Q) |
|
|
|
if len(y.shape) != 1: |
|
G_inverse_diag = G_inverse_diag[:, np.newaxis] |
|
return G_inverse_diag, c |
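
    # Minimal sketch (comments only, not executed) of the identity the two
    # helpers above rely on: with G = X @ X.T + alpha * np.eye(n), the dual
    # solution is c = inv(G) @ y and the leave-one-out residual of sample i
    # is c[i] / inv(G)[i, i].  Given eigvals, Q = linalg.eigh(K) and 1-D y:
    #
    #     w = 1.0 / (eigvals + alpha)
    #     c = Q @ (w * (Q.T @ y))                 # inv(G) @ y
    #     G_inverse_diag = (w * Q**2).sum(-1)     # np.diag(inv(G))
    #
    # (_diag_dot handles the broadcasting needed for multi-target y.)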
|
|
|
def _eigen_decompose_covariance(self, X, y, sqrt_sw): |
|
"""Eigendecomposition of X^T.X, used when n_samples > n_features |
|
and X is sparse. |
|
""" |
|
n_samples, n_features = X.shape |
|
cov = np.empty((n_features + 1, n_features + 1), dtype=X.dtype) |
|
cov[:-1, :-1], X_mean = self._compute_covariance(X, sqrt_sw) |
|
if not self.fit_intercept: |
|
cov = cov[:-1, :-1] |
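        # when fitting an intercept, the extra last row/column represents
        # the centered intercept column (sqrt_sw): it is orthogonal to the
        # centered features, and its only nonzero entry is the total sample
        # weight sqrt_sw.dot(sqrt_sw)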
|
|
|
|
|
|
|
|
|
|
|
else: |
|
cov[-1] = 0 |
|
cov[:, -1] = 0 |
|
cov[-1, -1] = sqrt_sw.dot(sqrt_sw) |
|
nullspace_dim = max(0, n_features - n_samples) |
|
eigvals, V = linalg.eigh(cov) |
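        # drop the eigenpairs spanning the null space of X^T.X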
|
|
|
eigvals = eigvals[nullspace_dim:] |
|
V = V[:, nullspace_dim:] |
|
return X_mean, eigvals, V, X |
|
|
|
def _solve_eigen_covariance_no_intercept( |
|
self, alpha, y, sqrt_sw, X_mean, eigvals, V, X |
|
): |
|
"""Compute dual coefficients and diagonal of G^-1. |
|
|
|
Used when we have a decomposition of X^T.X |
|
(n_samples > n_features and X is sparse), and not fitting an intercept. |
|
""" |
|
w = 1 / (eigvals + alpha) |
|
A = (V * w).dot(V.T) |
|
AXy = A.dot(safe_sparse_dot(X.T, y, dense_output=True)) |
|
y_hat = safe_sparse_dot(X, AXy, dense_output=True) |
|
hat_diag = self._sparse_multidot_diag(X, A, X_mean, sqrt_sw) |
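        # by the Woodbury identity, inv(X.X^T + alpha*Id) =
        # (Id - X.A.X^T) / alpha with A = inv(X^T.X + alpha*Id), which gives
        # the diagonal of G^-1 and the dual coefficients below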
|
if len(y.shape) != 1: |
|
|
|
hat_diag = hat_diag[:, np.newaxis] |
|
return (1 - hat_diag) / alpha, (y - y_hat) / alpha |
|
|
|
def _solve_eigen_covariance_intercept( |
|
self, alpha, y, sqrt_sw, X_mean, eigvals, V, X |
|
): |
|
"""Compute dual coefficients and diagonal of G^-1. |
|
|
|
Used when we have a decomposition of X^T.X |
|
(n_samples > n_features and X is sparse), |
|
and we are fitting an intercept. |
|
""" |
|
|
|
|
|
|
|
|
|
|
|
intercept_sv = np.zeros(V.shape[0]) |
|
intercept_sv[-1] = 1 |
|
intercept_dim = _find_smallest_angle(intercept_sv, V) |
|
w = 1 / (eigvals + alpha) |
|
w[intercept_dim] = 1 / eigvals[intercept_dim] |
|
A = (V * w).dot(V.T) |
|
|
|
X_op = _X_CenterStackOp(X, X_mean, sqrt_sw) |
|
AXy = A.dot(X_op.T.dot(y)) |
|
y_hat = X_op.dot(AXy) |
|
hat_diag = self._sparse_multidot_diag(X, A, X_mean, sqrt_sw) |
|
|
|
if len(y.shape) != 1: |
|
|
|
hat_diag = hat_diag[:, np.newaxis] |
|
return (1 - hat_diag) / alpha, (y - y_hat) / alpha |
|
|
|
def _solve_eigen_covariance(self, alpha, y, sqrt_sw, X_mean, eigvals, V, X): |
|
"""Compute dual coefficients and diagonal of G^-1. |
|
|
|
Used when we have a decomposition of X^T.X |
|
(n_samples > n_features and X is sparse). |
|
""" |
|
if self.fit_intercept: |
|
return self._solve_eigen_covariance_intercept( |
|
alpha, y, sqrt_sw, X_mean, eigvals, V, X |
|
) |
|
return self._solve_eigen_covariance_no_intercept( |
|
alpha, y, sqrt_sw, X_mean, eigvals, V, X |
|
) |
|
|
|
def _svd_decompose_design_matrix(self, X, y, sqrt_sw): |
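        # X is dense and has already been centered during preprocessing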
|
|
|
X_mean = np.zeros(X.shape[1], dtype=X.dtype) |
|
if self.fit_intercept: |
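            # to emulate fit_intercept, append a column holding the square
            # roots of the sample weights; after centering it is orthogonal
            # to the other columns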
|
|
|
|
|
|
|
intercept_column = sqrt_sw[:, None] |
|
X = np.hstack((X, intercept_column)) |
|
        U, singvals, _ = linalg.svd(X, full_matrices=False)
|
singvals_sq = singvals**2 |
|
UT_y = np.dot(U.T, y) |
|
return X_mean, singvals_sq, U, UT_y |
|
|
|
def _solve_svd_design_matrix(self, alpha, y, sqrt_sw, X_mean, singvals_sq, U, UT_y): |
|
"""Compute dual coefficients and diagonal of G^-1. |
|
|
|
Used when we have an SVD decomposition of X |
|
(n_samples > n_features and X is dense). |
|
""" |
|
w = ((singvals_sq + alpha) ** -1) - (alpha**-1) |
|
if self.fit_intercept: |
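            # locate the dimension corresponding to the intercept column
            # and cancel the regularization on it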
|
|
|
normalized_sw = sqrt_sw / np.linalg.norm(sqrt_sw) |
|
intercept_dim = _find_smallest_angle(normalized_sw, U) |
|
|
|
w[intercept_dim] = -(alpha**-1) |
|
c = np.dot(U, self._diag_dot(w, UT_y)) + (alpha**-1) * y |
|
G_inverse_diag = self._decomp_diag(w, U) + (alpha**-1) |
|
if len(y.shape) != 1: |
|
|
|
G_inverse_diag = G_inverse_diag[:, np.newaxis] |
|
return G_inverse_diag, c |
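
    # Minimal sketch (comments only, not executed): with a thin SVD
    # X = U @ np.diag(s) @ Vt and G = X @ X.T + alpha * np.eye(n),
    #
    #     inv(G) = U @ np.diag(1 / (s**2 + alpha) - 1 / alpha) @ U.T
    #              + np.eye(n) / alpha
    #
    # which is why w = 1/(s**2 + alpha) - 1/alpha above is combined with an
    # extra 1/alpha identity term for both c and diag(inv(G)).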
|
|
|
def fit(self, X, y, sample_weight=None, score_params=None): |
|
"""Fit Ridge regression model with gcv. |
|
|
|
Parameters |
|
---------- |
|
X : {ndarray, sparse matrix} of shape (n_samples, n_features) |
|
Training data. Will be cast to float64 if necessary. |
|
|
|
y : ndarray of shape (n_samples,) or (n_samples, n_targets) |
|
Target values. Will be cast to float64 if necessary. |
|
|
|
sample_weight : float or ndarray of shape (n_samples,), default=None |
|
Individual weights for each sample. If given a float, every sample |
|
will have the same weight. Note that the scale of `sample_weight` |
|
has an impact on the loss; i.e. multiplying all weights by `k` |
|
is equivalent to setting `alpha / k`. |
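            For instance, fitting with ``sample_weight=2 * w`` and
            ``alpha=1.0`` selects the same model as fitting with
            ``sample_weight=w`` and ``alpha=0.5``.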
|
|
|
score_params : dict, default=None |
|
Parameters to be passed to the underlying scorer. |
|
|
|
.. versionadded:: 1.5 |
|
See :ref:`Metadata Routing User Guide <metadata_routing>` for |
|
more details. |
|
|
|
Returns |
|
------- |
|
self : object |
|
""" |
|
X, y = validate_data( |
|
self, |
|
X, |
|
y, |
|
accept_sparse=["csr", "csc", "coo"], |
|
dtype=[np.float64], |
|
multi_output=True, |
|
y_numeric=True, |
|
) |
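
        # alpha_per_target cannot be used in classifier mode. All subclasses
        # of _RidgeGCV that are classifiers keep alpha_per_target at its
        # default value (False), so this assertion should never fail.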
|
|
|
|
|
|
|
|
|
assert not (self.is_clf and self.alpha_per_target) |
|
|
|
if sample_weight is not None: |
|
sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) |
|
|
|
self.alphas = np.asarray(self.alphas) |
|
|
|
unscaled_y = y |
|
X, y, X_offset, y_offset, X_scale = _preprocess_data( |
|
X, |
|
y, |
|
fit_intercept=self.fit_intercept, |
|
copy=self.copy_X, |
|
sample_weight=sample_weight, |
|
) |
|
|
|
gcv_mode = _check_gcv_mode(X, self.gcv_mode) |
|
|
|
if gcv_mode == "eigen": |
|
decompose = self._eigen_decompose_gram |
|
solve = self._solve_eigen_gram |
|
elif gcv_mode == "svd": |
|
if sparse.issparse(X): |
|
decompose = self._eigen_decompose_covariance |
|
solve = self._solve_eigen_covariance |
|
else: |
|
decompose = self._svd_decompose_design_matrix |
|
solve = self._solve_svd_design_matrix |
|
|
|
n_samples = X.shape[0] |
|
|
|
if sample_weight is not None: |
|
X, y, sqrt_sw = _rescale_data(X, y, sample_weight) |
|
else: |
|
sqrt_sw = np.ones(n_samples, dtype=X.dtype) |
|
|
|
X_mean, *decomposition = decompose(X, y, sqrt_sw) |
|
|
|
n_y = 1 if len(y.shape) == 1 else y.shape[1] |
|
n_alphas = 1 if np.ndim(self.alphas) == 0 else len(self.alphas) |
|
|
|
if self.store_cv_results: |
|
self.cv_results_ = np.empty((n_samples * n_y, n_alphas), dtype=X.dtype) |
|
|
|
best_coef, best_score, best_alpha = None, None, None |
|
|
|
for i, alpha in enumerate(np.atleast_1d(self.alphas)): |
|
G_inverse_diag, c = solve(float(alpha), y, sqrt_sw, X_mean, *decomposition) |
|
if self.scoring is None: |
|
squared_errors = (c / G_inverse_diag) ** 2 |
|
alpha_score = self._score_without_scorer(squared_errors=squared_errors) |
|
if self.store_cv_results: |
|
self.cv_results_[:, i] = squared_errors.ravel() |
|
else: |
|
predictions = y - (c / G_inverse_diag) |
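                # undo the sqrt(sample_weight) scaling and the target
                # centering so that scoring happens on the original y scale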
|
|
|
if sample_weight is not None: |
|
if predictions.ndim > 1: |
|
predictions /= sqrt_sw[:, None] |
|
else: |
|
predictions /= sqrt_sw |
|
predictions += y_offset |
|
|
|
if self.store_cv_results: |
|
self.cv_results_[:, i] = predictions.ravel() |
|
|
|
score_params = score_params or {} |
|
alpha_score = self._score( |
|
predictions=predictions, |
|
y=unscaled_y, |
|
n_y=n_y, |
|
scorer=self.scoring, |
|
score_params=score_params, |
|
) |
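
            # keep the best (dual coef, score, alpha) triple seen so far;
            # with alpha_per_target, each target is tracked separately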
|
|
|
|
|
if best_score is None: |
|
|
|
if self.alpha_per_target and n_y > 1: |
|
best_coef = c |
|
best_score = np.atleast_1d(alpha_score) |
|
best_alpha = np.full(n_y, alpha) |
|
else: |
|
best_coef = c |
|
best_score = alpha_score |
|
best_alpha = alpha |
|
else: |
|
|
|
if self.alpha_per_target and n_y > 1: |
|
to_update = alpha_score > best_score |
|
best_coef[:, to_update] = c[:, to_update] |
|
best_score[to_update] = alpha_score[to_update] |
|
best_alpha[to_update] = alpha |
|
elif alpha_score > best_score: |
|
best_coef, best_score, best_alpha = c, alpha_score, alpha |
|
|
|
self.alpha_ = best_alpha |
|
self.best_score_ = best_score |
|
self.dual_coef_ = best_coef |
|
self.coef_ = safe_sparse_dot(self.dual_coef_.T, X) |
|
if y.ndim == 1 or y.shape[1] == 1: |
|
self.coef_ = self.coef_.ravel() |
|
|
|
if sparse.issparse(X): |
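            # X was kept uncentered (sparse case), so the intercept offset
            # is entirely given by the weighted mean computed above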
|
X_offset = X_mean * X_scale |
|
else: |
|
X_offset += X_mean * X_scale |
|
self._set_intercept(X_offset, y_offset, X_scale) |
|
|
|
if self.store_cv_results: |
|
if len(y.shape) == 1: |
|
cv_results_shape = n_samples, n_alphas |
|
else: |
|
cv_results_shape = n_samples, n_y, n_alphas |
|
self.cv_results_ = self.cv_results_.reshape(cv_results_shape) |
|
|
|
return self |
|
|
|
def _score_without_scorer(self, squared_errors): |
|
"""Performs scoring using squared errors when the scorer is None.""" |
|
if self.alpha_per_target: |
|
_score = -squared_errors.mean(axis=0) |
|
else: |
|
_score = -squared_errors.mean() |
|
|
|
return _score |
|
|
|
def _score(self, *, predictions, y, n_y, scorer, score_params): |
|
"""Performs scoring with the specified scorer using the |
|
predictions and the true y values. |
|
""" |
|
if self.is_clf: |
|
identity_estimator = _IdentityClassifier(classes=np.arange(n_y)) |
|
_score = scorer( |
|
identity_estimator, |
|
predictions, |
|
y.argmax(axis=1), |
|
**score_params, |
|
) |
|
else: |
|
identity_estimator = _IdentityRegressor() |
|
if self.alpha_per_target: |
|
_score = np.array( |
|
[ |
|
scorer( |
|
identity_estimator, |
|
predictions[:, j], |
|
y[:, j], |
|
**score_params, |
|
) |
|
for j in range(n_y) |
|
] |
|
) |
|
else: |
|
_score = scorer(identity_estimator, predictions, y, **score_params) |
|
|
|
return _score |
|
|
|
def __sklearn_tags__(self): |
|
tags = super().__sklearn_tags__() |
|
|
|
tags.target_tags.required = True |
|
return tags |
|
|
|
|
|
class _BaseRidgeCV(LinearModel): |
|
_parameter_constraints: dict = { |
|
"alphas": ["array-like", Interval(Real, 0, None, closed="neither")], |
|
"fit_intercept": ["boolean"], |
|
"scoring": [StrOptions(set(get_scorer_names())), callable, None], |
|
"cv": ["cv_object"], |
|
"gcv_mode": [StrOptions({"auto", "svd", "eigen"}), None], |
|
"store_cv_results": ["boolean", Hidden(None)], |
|
"alpha_per_target": ["boolean"], |
|
"store_cv_values": ["boolean", Hidden(StrOptions({"deprecated"}))], |
|
} |
|
|
|
def __init__( |
|
self, |
|
alphas=(0.1, 1.0, 10.0), |
|
*, |
|
fit_intercept=True, |
|
scoring=None, |
|
cv=None, |
|
gcv_mode=None, |
|
store_cv_results=None, |
|
alpha_per_target=False, |
|
store_cv_values="deprecated", |
|
): |
|
self.alphas = alphas |
|
self.fit_intercept = fit_intercept |
|
self.scoring = scoring |
|
self.cv = cv |
|
self.gcv_mode = gcv_mode |
|
self.store_cv_results = store_cv_results |
|
self.alpha_per_target = alpha_per_target |
|
self.store_cv_values = store_cv_values |
|
|
|
def fit(self, X, y, sample_weight=None, **params): |
|
"""Fit Ridge regression model with cv. |
|
|
|
Parameters |
|
---------- |
|
X : ndarray of shape (n_samples, n_features) |
|
Training data. If using GCV, will be cast to float64 |
|
if necessary. |
|
|
|
y : ndarray of shape (n_samples,) or (n_samples, n_targets) |
|
Target values. Will be cast to X's dtype if necessary. |
|
|
|
sample_weight : float or ndarray of shape (n_samples,), default=None |
|
Individual weights for each sample. If given a float, every sample |
|
will have the same weight. |
|
|
|
**params : dict, default=None |
|
Extra parameters for the underlying scorer. |
|
|
|
.. versionadded:: 1.5 |
|
Only available if `enable_metadata_routing=True`, |
|
which can be set by using |
|
``sklearn.set_config(enable_metadata_routing=True)``. |
|
See :ref:`Metadata Routing User Guide <metadata_routing>` for |
|
more details. |
|
|
|
Returns |
|
------- |
|
self : object |
|
Fitted estimator. |
|
|
|
Notes |
|
----- |
|
When sample_weight is provided, the selected hyperparameter may depend |
|
on whether we use leave-one-out cross-validation (cv=None or cv='auto') |
|
or another form of cross-validation, because only leave-one-out |
|
cross-validation takes the sample weights into account when computing |
|
the validation score. |
|
""" |
|
_raise_for_params(params, self, "fit") |
|
cv = self.cv |
|
scorer = self._get_scorer() |
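
        # TODO(1.7): remove the deprecation handling for `store_cv_values`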
|
|
|
|
|
|
|
if self.store_cv_values != "deprecated": |
|
if self.store_cv_results is not None: |
|
raise ValueError( |
|
"Both 'store_cv_values' and 'store_cv_results' were set. " |
|
"'store_cv_values' is deprecated in version 1.5 and will be " |
|
"removed in 1.7. To avoid this error, only set 'store_cv_results'." |
|
) |
|
warnings.warn( |
|
( |
|
"'store_cv_values' is deprecated in version 1.5 and will be " |
|
"removed in 1.7. Use 'store_cv_results' instead." |
|
), |
|
FutureWarning, |
|
) |
|
self._store_cv_results = self.store_cv_values |
|
elif self.store_cv_results is None: |
|
self._store_cv_results = False |
|
else: |
|
self._store_cv_results = self.store_cv_results |
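
        # `_RidgeGCV` (used when cv is None) does not work with alpha == 0,
        # hence the stricter "neither" boundary below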
|
|
|
|
|
if cv is None: |
|
check_scalar_alpha = partial( |
|
check_scalar, |
|
target_type=numbers.Real, |
|
min_val=0.0, |
|
include_boundaries="neither", |
|
) |
|
else: |
|
check_scalar_alpha = partial( |
|
check_scalar, |
|
target_type=numbers.Real, |
|
min_val=0.0, |
|
include_boundaries="left", |
|
) |
|
|
|
if isinstance(self.alphas, (np.ndarray, list, tuple)): |
|
n_alphas = 1 if np.ndim(self.alphas) == 0 else len(self.alphas) |
|
if n_alphas != 1: |
|
for index, alpha in enumerate(self.alphas): |
|
alpha = check_scalar_alpha(alpha, f"alphas[{index}]") |
|
else: |
|
self.alphas[0] = check_scalar_alpha(self.alphas[0], "alphas") |
|
alphas = np.asarray(self.alphas) |
|
|
|
if sample_weight is not None: |
|
params["sample_weight"] = sample_weight |
|
|
|
if cv is None: |
|
if _routing_enabled(): |
|
routed_params = process_routing( |
|
self, |
|
"fit", |
|
**params, |
|
) |
|
else: |
|
routed_params = Bunch(scorer=Bunch(score={})) |
|
if sample_weight is not None: |
|
routed_params.scorer.score["sample_weight"] = sample_weight |
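
            # without user-specified scoring, let _RidgeGCV fall back to its
            # built-in negative mean squared error on the held-out values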
|
|
|
|
|
if self.scoring is None: |
|
scorer = None |
|
|
|
estimator = _RidgeGCV( |
|
alphas, |
|
fit_intercept=self.fit_intercept, |
|
scoring=scorer, |
|
gcv_mode=self.gcv_mode, |
|
store_cv_results=self._store_cv_results, |
|
is_clf=is_classifier(self), |
|
alpha_per_target=self.alpha_per_target, |
|
) |
|
estimator.fit( |
|
X, |
|
y, |
|
sample_weight=sample_weight, |
|
score_params=routed_params.scorer.score, |
|
) |
|
self.alpha_ = estimator.alpha_ |
|
self.best_score_ = estimator.best_score_ |
|
if self._store_cv_results: |
|
self.cv_results_ = estimator.cv_results_ |
|
else: |
|
if self._store_cv_results: |
|
raise ValueError("cv!=None and store_cv_results=True are incompatible") |
|
if self.alpha_per_target: |
|
raise ValueError("cv!=None and alpha_per_target=True are incompatible") |
|
|
|
parameters = {"alpha": alphas} |
|
solver = "sparse_cg" if sparse.issparse(X) else "auto" |
|
model = RidgeClassifier if is_classifier(self) else Ridge |
|
estimator = model( |
|
fit_intercept=self.fit_intercept, |
|
solver=solver, |
|
) |
|
if _routing_enabled(): |
|
estimator.set_fit_request(sample_weight=True) |
|
|
|
grid_search = GridSearchCV( |
|
estimator, |
|
parameters, |
|
cv=cv, |
|
scoring=scorer, |
|
) |
|
|
|
grid_search.fit(X, y, **params) |
|
estimator = grid_search.best_estimator_ |
|
self.alpha_ = grid_search.best_estimator_.alpha |
|
self.best_score_ = grid_search.best_score_ |
|
|
|
self.coef_ = estimator.coef_ |
|
self.intercept_ = estimator.intercept_ |
|
self.n_features_in_ = estimator.n_features_in_ |
|
if hasattr(estimator, "feature_names_in_"): |
|
self.feature_names_in_ = estimator.feature_names_in_ |
|
|
|
return self |
|
|
|
def get_metadata_routing(self): |
|
"""Get metadata routing of this object. |
|
|
|
Please check :ref:`User Guide <metadata_routing>` on how the routing |
|
mechanism works. |
|
|
|
.. versionadded:: 1.5 |
|
|
|
Returns |
|
------- |
|
routing : MetadataRouter |
|
A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating |
|
routing information. |
|
""" |
|
router = ( |
|
MetadataRouter(owner=self.__class__.__name__) |
|
.add_self_request(self) |
|
.add( |
|
scorer=self.scoring, |
|
method_mapping=MethodMapping().add(caller="fit", callee="score"), |
|
) |
|
.add( |
|
splitter=self.cv, |
|
method_mapping=MethodMapping().add(caller="fit", callee="split"), |
|
) |
|
) |
|
return router |
|
|
|
def _get_scorer(self): |
|
scorer = check_scoring(estimator=self, scoring=self.scoring, allow_none=True) |
|
if _routing_enabled() and self.scoring is None: |
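            # always request sample_weight for the default scorer so that
            # weighted leave-one-out scoring works under metadata routing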
|
|
|
|
|
|
|
|
|
scorer.set_score_request(sample_weight=True) |
|
return scorer |
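
    # TODO(1.7): remove `cv_values_` together with `store_cv_values`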
|
|
|
|
|
|
|
@deprecated( |
|
"Attribute `cv_values_` is deprecated in version 1.5 and will be removed " |
|
"in 1.7. Use `cv_results_` instead." |
|
) |
|
@property |
|
def cv_values_(self): |
|
return self.cv_results_ |
|
|
|
def __sklearn_tags__(self): |
|
tags = super().__sklearn_tags__() |
|
tags.input_tags.sparse = True |
|
return tags |
|
|
|
|
|
class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): |
|
"""Ridge regression with built-in cross-validation. |
|
|
|
See glossary entry for :term:`cross-validation estimator`. |
|
|
|
By default, it performs efficient Leave-One-Out Cross-Validation. |
|
|
|
Read more in the :ref:`User Guide <ridge_regression>`. |
|
|
|
Parameters |
|
---------- |
|
alphas : array-like of shape (n_alphas,), default=(0.1, 1.0, 10.0) |
|
Array of alpha values to try. |
|
Regularization strength; must be a positive float. Regularization |
|
improves the conditioning of the problem and reduces the variance of |
|
the estimates. Larger values specify stronger regularization. |
|
Alpha corresponds to ``1 / (2C)`` in other linear models such as |
|
:class:`~sklearn.linear_model.LogisticRegression` or |
|
:class:`~sklearn.svm.LinearSVC`. |
|
If using Leave-One-Out cross-validation, alphas must be strictly positive. |
|
|
|
fit_intercept : bool, default=True |
|
Whether to calculate the intercept for this model. If set |
|
to false, no intercept will be used in calculations |
|
(i.e. data is expected to be centered). |
|
|
|
scoring : str, callable, default=None |
|
A string (see :ref:`scoring_parameter`) or a scorer callable object / |
|
        function with signature ``scorer(estimator, X, y)``. If None, the

        negative mean squared error is used if cv is 'auto' or None (i.e.

        when using leave-one-out cross-validation), and r2 score otherwise.
|
|
|
cv : int, cross-validation generator or an iterable, default=None |
|
Determines the cross-validation splitting strategy. |
|
Possible inputs for cv are: |
|
|
|
- None, to use the efficient Leave-One-Out cross-validation |
|
- integer, to specify the number of folds. |
|
- :term:`CV splitter`, |
|
- An iterable yielding (train, test) splits as arrays of indices. |
|
|
|
For integer/None inputs, if ``y`` is binary or multiclass, |
|
:class:`~sklearn.model_selection.StratifiedKFold` is used, else, |
|
:class:`~sklearn.model_selection.KFold` is used. |
|
|
|
        Refer to the :ref:`User Guide <cross_validation>` for the various
|
cross-validation strategies that can be used here. |
|
|
|
gcv_mode : {'auto', 'svd', 'eigen'}, default='auto' |
|
Flag indicating which strategy to use when performing |
|
Leave-One-Out Cross-Validation. Options are:: |
|
|
|
'auto' : use 'svd' if n_samples > n_features, otherwise use 'eigen' |
|
'svd' : force use of singular value decomposition of X when X is |
|
dense, eigenvalue decomposition of X^T.X when X is sparse. |
|
'eigen' : force computation via eigendecomposition of X.X^T |
|
|
|
The 'auto' mode is the default and is intended to pick the cheaper |
|
option of the two depending on the shape of the training data. |
|
|
|
store_cv_results : bool, default=False |
|
Flag indicating if the cross-validation values corresponding to |
|
each alpha should be stored in the ``cv_results_`` attribute (see |
|
below). This flag is only compatible with ``cv=None`` (i.e. using |
|
Leave-One-Out Cross-Validation). |
|
|
|
.. versionchanged:: 1.5 |
|
Parameter name changed from `store_cv_values` to `store_cv_results`. |
|
|
|
alpha_per_target : bool, default=False |
|
Flag indicating whether to optimize the alpha value (picked from the |
|
`alphas` parameter list) for each target separately (for multi-output |
|
settings: multiple prediction targets). When set to `True`, after |
|
fitting, the `alpha_` attribute will contain a value for each target. |
|
When set to `False`, a single alpha is used for all targets. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
store_cv_values : bool |
|
Flag indicating if the cross-validation values corresponding to |
|
each alpha should be stored in the ``cv_values_`` attribute (see |
|
below). This flag is only compatible with ``cv=None`` (i.e. using |
|
Leave-One-Out Cross-Validation). |
|
|
|
.. deprecated:: 1.5 |
|
`store_cv_values` is deprecated in version 1.5 in favor of |
|
`store_cv_results` and will be removed in version 1.7. |
|
|
|
Attributes |
|
---------- |
|
cv_results_ : ndarray of shape (n_samples, n_alphas) or \ |
|
shape (n_samples, n_targets, n_alphas), optional |
|
Cross-validation values for each alpha (only available if |
|
``store_cv_results=True`` and ``cv=None``). After ``fit()`` has been |
|
called, this attribute will contain the mean squared errors if |
|
        `scoring is None`, otherwise it will contain standardized per-point

        prediction values.
|
|
|
.. versionchanged:: 1.5 |
|
`cv_values_` changed to `cv_results_`. |
|
|
|
    coef_ : ndarray of shape (n_features,) or (n_targets, n_features)
|
Weight vector(s). |
|
|
|
intercept_ : float or ndarray of shape (n_targets,) |
|
Independent term in decision function. Set to 0.0 if |
|
``fit_intercept = False``. |
|
|
|
alpha_ : float or ndarray of shape (n_targets,) |
|
Estimated regularization parameter, or, if ``alpha_per_target=True``, |
|
the estimated regularization parameter for each target. |
|
|
|
best_score_ : float or ndarray of shape (n_targets,) |
|
Score of base estimator with best alpha, or, if |
|
``alpha_per_target=True``, a score for each target. |
|
|
|
.. versionadded:: 0.23 |
|
|
|
n_features_in_ : int |
|
Number of features seen during :term:`fit`. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
feature_names_in_ : ndarray of shape (`n_features_in_`,) |
|
Names of features seen during :term:`fit`. Defined only when `X` |
|
has feature names that are all strings. |
|
|
|
.. versionadded:: 1.0 |
|
|
|
See Also |
|
-------- |
|
Ridge : Ridge regression. |
|
RidgeClassifier : Classifier based on ridge regression on {-1, 1} labels. |
|
RidgeClassifierCV : Ridge classifier with built-in cross validation. |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.datasets import load_diabetes |
|
>>> from sklearn.linear_model import RidgeCV |
|
>>> X, y = load_diabetes(return_X_y=True) |
|
>>> clf = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X, y) |
|
>>> clf.score(X, y) |
|
0.5166... |
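
    When ``cv=None`` (leave-one-out) and ``store_cv_results=True``, the
    per-sample values for each alpha are stored; for this single-target
    dataset their shape is ``(n_samples, n_alphas)``:

    >>> clf = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1],
    ...               store_cv_results=True).fit(X, y)
    >>> clf.cv_results_.shape
    (442, 4)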
|
""" |
|
|
|
@_fit_context(prefer_skip_nested_validation=True) |
|
def fit(self, X, y, sample_weight=None, **params): |
|
"""Fit Ridge regression model with cv. |
|
|
|
Parameters |
|
---------- |
|
X : ndarray of shape (n_samples, n_features) |
|
Training data. If using GCV, will be cast to float64 |
|
if necessary. |
|
|
|
y : ndarray of shape (n_samples,) or (n_samples, n_targets) |
|
Target values. Will be cast to X's dtype if necessary. |
|
|
|
sample_weight : float or ndarray of shape (n_samples,), default=None |
|
Individual weights for each sample. If given a float, every sample |
|
will have the same weight. |
|
|
|
**params : dict, default=None |
|
Parameters to be passed to the underlying scorer. |
|
|
|
.. versionadded:: 1.5 |
|
Only available if `enable_metadata_routing=True`, |
|
which can be set by using |
|
``sklearn.set_config(enable_metadata_routing=True)``. |
|
See :ref:`Metadata Routing User Guide <metadata_routing>` for |
|
more details. |
|
|
|
Returns |
|
------- |
|
self : object |
|
Fitted estimator. |
|
|
|
Notes |
|
----- |
|
When sample_weight is provided, the selected hyperparameter may depend |
|
on whether we use leave-one-out cross-validation (cv=None or cv='auto') |
|
or another form of cross-validation, because only leave-one-out |
|
cross-validation takes the sample weights into account when computing |
|
the validation score. |
|
""" |
|
super().fit(X, y, sample_weight=sample_weight, **params) |
|
return self |
|
|
|
|
|
class RidgeClassifierCV(_RidgeClassifierMixin, _BaseRidgeCV): |
|
"""Ridge classifier with built-in cross-validation. |
|
|
|
See glossary entry for :term:`cross-validation estimator`. |
|
|
|
By default, it performs Leave-One-Out Cross-Validation. Currently, |
|
only the n_features > n_samples case is handled efficiently. |
|
|
|
Read more in the :ref:`User Guide <ridge_regression>`. |
|
|
|
Parameters |
|
---------- |
|
alphas : array-like of shape (n_alphas,), default=(0.1, 1.0, 10.0) |
|
Array of alpha values to try. |
|
Regularization strength; must be a positive float. Regularization |
|
improves the conditioning of the problem and reduces the variance of |
|
the estimates. Larger values specify stronger regularization. |
|
Alpha corresponds to ``1 / (2C)`` in other linear models such as |
|
:class:`~sklearn.linear_model.LogisticRegression` or |
|
:class:`~sklearn.svm.LinearSVC`. |
|
If using Leave-One-Out cross-validation, alphas must be strictly positive. |
|
|
|
fit_intercept : bool, default=True |
|
Whether to calculate the intercept for this model. If set |
|
to false, no intercept will be used in calculations |
|
(i.e. data is expected to be centered). |
|
|
|
scoring : str, callable, default=None |
|
A string (see :ref:`scoring_parameter`) or a scorer callable object / |
|
function with signature ``scorer(estimator, X, y)``. |
|
|
|
cv : int, cross-validation generator or an iterable, default=None |
|
Determines the cross-validation splitting strategy. |
|
Possible inputs for cv are: |
|
|
|
- None, to use the efficient Leave-One-Out cross-validation |
|
- integer, to specify the number of folds. |
|
- :term:`CV splitter`, |
|
- An iterable yielding (train, test) splits as arrays of indices. |
|
|
|
        Refer to the :ref:`User Guide <cross_validation>` for the various
|
cross-validation strategies that can be used here. |
|
|
|
class_weight : dict or 'balanced', default=None |
|
Weights associated with classes in the form ``{class_label: weight}``. |
|
If not given, all classes are supposed to have weight one. |
|
|
|
The "balanced" mode uses the values of y to automatically adjust |
|
weights inversely proportional to class frequencies in the input data |
|
as ``n_samples / (n_classes * np.bincount(y))``. |
|
|
|
store_cv_results : bool, default=False |
|
Flag indicating if the cross-validation results corresponding to |
|
each alpha should be stored in the ``cv_results_`` attribute (see |
|
below). This flag is only compatible with ``cv=None`` (i.e. using |
|
Leave-One-Out Cross-Validation). |
|
|
|
.. versionchanged:: 1.5 |
|
Parameter name changed from `store_cv_values` to `store_cv_results`. |
|
|
|
store_cv_values : bool |
|
Flag indicating if the cross-validation values corresponding to |
|
each alpha should be stored in the ``cv_values_`` attribute (see |
|
below). This flag is only compatible with ``cv=None`` (i.e. using |
|
Leave-One-Out Cross-Validation). |
|
|
|
.. deprecated:: 1.5 |
|
`store_cv_values` is deprecated in version 1.5 in favor of |
|
`store_cv_results` and will be removed in version 1.7. |
|
|
|
Attributes |
|
---------- |
|
cv_results_ : ndarray of shape (n_samples, n_targets, n_alphas), optional |
|
Cross-validation results for each alpha (only if ``store_cv_results=True`` and |
|
``cv=None``). After ``fit()`` has been called, this attribute will |
|
        contain the mean squared errors if `scoring is None`, otherwise it

        will contain standardized per-point prediction values.
|
|
|
.. versionchanged:: 1.5 |
|
`cv_values_` changed to `cv_results_`. |
|
|
|
coef_ : ndarray of shape (1, n_features) or (n_targets, n_features) |
|
Coefficient of the features in the decision function. |
|
|
|
``coef_`` is of shape (1, n_features) when the given problem is binary. |
|
|
|
intercept_ : float or ndarray of shape (n_targets,) |
|
Independent term in decision function. Set to 0.0 if |
|
``fit_intercept = False``. |
|
|
|
alpha_ : float |
|
Estimated regularization parameter. |
|
|
|
best_score_ : float |
|
Score of base estimator with best alpha. |
|
|
|
.. versionadded:: 0.23 |
|
|
|
classes_ : ndarray of shape (n_classes,) |
|
        The class labels.
|
|
|
n_features_in_ : int |
|
Number of features seen during :term:`fit`. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
feature_names_in_ : ndarray of shape (`n_features_in_`,) |
|
Names of features seen during :term:`fit`. Defined only when `X` |
|
has feature names that are all strings. |
|
|
|
.. versionadded:: 1.0 |
|
|
|
See Also |
|
-------- |
|
Ridge : Ridge regression. |
|
RidgeClassifier : Ridge classifier. |
|
RidgeCV : Ridge regression with built-in cross validation. |
|
|
|
Notes |
|
----- |
|
    For multi-class classification, n_classes classifiers are trained in
|
a one-versus-all approach. Concretely, this is implemented by taking |
|
advantage of the multi-variate response support in Ridge. |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.datasets import load_breast_cancer |
|
>>> from sklearn.linear_model import RidgeClassifierCV |
|
>>> X, y = load_breast_cancer(return_X_y=True) |
|
>>> clf = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X, y) |
|
>>> clf.score(X, y) |
|
0.9630... |
|
""" |
|
|
|
_parameter_constraints: dict = { |
|
**_BaseRidgeCV._parameter_constraints, |
|
"class_weight": [dict, StrOptions({"balanced"}), None], |
|
} |
|
for param in ("gcv_mode", "alpha_per_target"): |
|
_parameter_constraints.pop(param) |
|
|
|
def __init__( |
|
self, |
|
alphas=(0.1, 1.0, 10.0), |
|
*, |
|
fit_intercept=True, |
|
scoring=None, |
|
cv=None, |
|
class_weight=None, |
|
store_cv_results=None, |
|
store_cv_values="deprecated", |
|
): |
|
super().__init__( |
|
alphas=alphas, |
|
fit_intercept=fit_intercept, |
|
scoring=scoring, |
|
cv=cv, |
|
store_cv_results=store_cv_results, |
|
store_cv_values=store_cv_values, |
|
) |
|
self.class_weight = class_weight |
|
|
|
@_fit_context(prefer_skip_nested_validation=True) |
|
def fit(self, X, y, sample_weight=None, **params): |
|
"""Fit Ridge classifier with cv. |
|
|
|
Parameters |
|
---------- |
|
X : ndarray of shape (n_samples, n_features) |
|
Training vectors, where `n_samples` is the number of samples |
|
and `n_features` is the number of features. When using GCV, |
|
will be cast to float64 if necessary. |
|
|
|
y : ndarray of shape (n_samples,) |
|
Target values. Will be cast to X's dtype if necessary. |
|
|
|
sample_weight : float or ndarray of shape (n_samples,), default=None |
|
Individual weights for each sample. If given a float, every sample |
|
will have the same weight. |
|
|
|
**params : dict, default=None |
|
Parameters to be passed to the underlying scorer. |
|
|
|
.. versionadded:: 1.5 |
|
Only available if `enable_metadata_routing=True`, |
|
which can be set by using |
|
``sklearn.set_config(enable_metadata_routing=True)``. |
|
See :ref:`Metadata Routing User Guide <metadata_routing>` for |
|
more details. |
|
|
|
Returns |
|
------- |
|
self : object |
|
Fitted estimator. |
|
""" |
|
|
|
|
|
|
|
X, y, sample_weight, Y = self._prepare_data(X, y, sample_weight, solver="eigen") |
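
        # cv=None uses the _RidgeGCV path, which does not binarize targets,
        # so pass the binarized Y; with an explicit cv, GridSearchCV fits
        # RidgeClassifier estimators that binarize y themselves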
|
|
|
|
|
|
|
|
|
|
|
|
|
target = Y if self.cv is None else y |
|
super().fit(X, target, sample_weight=sample_weight, **params) |
|
return self |
|
|