Sam Chaudry

Upload folder using huggingface_hub

7885a28 verified about 1 month ago

14.7 kB

	# Authors: The scikit-learn developers
	# SPDX-License-Identifier: BSD-3-Clause

	import numbers

	import numpy as np

	from ...utils import _safe_indexing, check_random_state
	from ...utils._optional_dependencies import check_matplotlib_support
	from ...utils._plotting import _validate_style_kwargs


	class PredictionErrorDisplay:
	    """Visualization of the prediction error of a regression model.

	    This tool can display "residuals vs predicted" or "actual vs predicted"
	    using scatter plots to qualitatively assess the behavior of a regressor,
	    preferably on held-out data points.

	    See the details in the docstrings of
	    :func:`~sklearn.metrics.PredictionErrorDisplay.from_estimator` or
	    :func:`~sklearn.metrics.PredictionErrorDisplay.from_predictions` to
	    create a visualizer. All parameters are stored as attributes.

	    For general information regarding `scikit-learn` visualization tools, read
	    more in the :ref:`Visualization Guide <visualizations>`.
	    For details regarding interpreting these plots, refer to the
	    :ref:`Model Evaluation Guide <visualization_regression_evaluation>`.

	    .. versionadded:: 1.2

	    Parameters
	    ----------
	    y_true : ndarray of shape (n_samples,)
	        True values.

	    y_pred : ndarray of shape (n_samples,)
	        Prediction values.

	    Attributes
	    ----------
	    line_ : matplotlib Artist
	        Optimal line representing `y_true == y_pred`. Therefore, it is a
	        diagonal line for `kind="predictions"` and a horizontal line for
	        `kind="residuals"`.

	    errors_lines_ : matplotlib Artist or None
	        Residual lines. If `with_errors=False`, then it is set to `None`.

	    scatter_ : matplotlib Artist
	        Scatter data points.

	    ax_ : matplotlib Axes
	        Axes with the different matplotlib axis.

	    figure_ : matplotlib Figure
	        Figure containing the scatter and lines.

	    See Also
	    --------
	    PredictionErrorDisplay.from_estimator : Prediction error visualization
	        given an estimator and some data.
	    PredictionErrorDisplay.from_predictions : Prediction error visualization
	        given the true and predicted targets.

	    Examples
	    --------
	    >>> import matplotlib.pyplot as plt
	    >>> from sklearn.datasets import load_diabetes
	    >>> from sklearn.linear_model import Ridge
	    >>> from sklearn.metrics import PredictionErrorDisplay
	    >>> X, y = load_diabetes(return_X_y=True)
	    >>> ridge = Ridge().fit(X, y)
	    >>> y_pred = ridge.predict(X)
	    >>> display = PredictionErrorDisplay(y_true=y, y_pred=y_pred)
	    >>> display.plot()
	    <...>
	    >>> plt.show()
	    """

	    def __init__(self, *, y_true, y_pred):
	        # The inputs are stored untouched; `plot` reads them back later.
	        self.y_true = y_true
	        self.y_pred = y_pred

	    def plot(
	        self,
	        ax=None,
	        *,
	        kind="residual_vs_predicted",
	        scatter_kwargs=None,
	        line_kwargs=None,
	    ):
	        """Plot visualization.

	        Styling is customized through `scatter_kwargs` and `line_kwargs`,
	        which are forwarded to matplotlib's ``scatter`` and ``plot`` calls.

	        Parameters
	        ----------
	        ax : matplotlib axes, default=None
	            Axes object to plot on. If `None`, a new figure and axes is
	            created.

	        kind : {"actual_vs_predicted", "residual_vs_predicted"}, \
	                default="residual_vs_predicted"
	            The type of plot to draw:

	            - "actual_vs_predicted" draws the observed values (y-axis) vs.
	              the predicted values (x-axis).
	            - "residual_vs_predicted" draws the residuals, i.e. difference
	              between observed and predicted values, (y-axis) vs. the predicted
	              values (x-axis).

	        scatter_kwargs : dict, default=None
	            Dictionary with keywords passed to the `matplotlib.pyplot.scatter`
	            call.

	        line_kwargs : dict, default=None
	            Dictionary with keywords passed to the `matplotlib.pyplot.plot`
	            call to draw the optimal line.

	        Returns
	        -------
	        display : :class:`~sklearn.metrics.PredictionErrorDisplay`
	            Object that stores computed values.

	        Raises
	        ------
	        ValueError
	            If `kind` is not one of the supported plot types.
	        """
	        check_matplotlib_support(f"{self.__class__.__name__}.plot")

	        expected_kind = ("actual_vs_predicted", "residual_vs_predicted")
	        if kind not in expected_kind:
	            raise ValueError(
	                f"`kind` must be one of {', '.join(expected_kind)}. "
	                f"Got {kind!r} instead."
	            )

	        # Imported lazily so that the module can be imported without matplotlib.
	        import matplotlib.pyplot as plt

	        if scatter_kwargs is None:
	            scatter_kwargs = {}
	        if line_kwargs is None:
	            line_kwargs = {}

	        default_scatter_kwargs = {"color": "tab:blue", "alpha": 0.8}
	        default_line_kwargs = {"color": "black", "alpha": 0.7, "linestyle": "--"}

	        scatter_kwargs = _validate_style_kwargs(default_scatter_kwargs, scatter_kwargs)
	        line_kwargs = _validate_style_kwargs(default_line_kwargs, line_kwargs)

	        # Merge defaults with the user-supplied styles; later entries win, so
	        # user values override defaults. This must be a dict merge (``**``):
	        # a bare ``{a, b}`` is a set literal and fails because dicts are
	        # unhashable.
	        scatter_kwargs = {**default_scatter_kwargs, **scatter_kwargs}
	        line_kwargs = {**default_line_kwargs, **line_kwargs}

	        if ax is None:
	            _, ax = plt.subplots()

	        if kind == "actual_vs_predicted":
	            # The reference line is the diagonal spanning the joint range of
	            # observed and predicted values.
	            max_value = max(np.max(self.y_true), np.max(self.y_pred))
	            min_value = min(np.min(self.y_true), np.min(self.y_pred))
	            self.line_ = ax.plot(
	                [min_value, max_value], [min_value, max_value], **line_kwargs
	            )[0]

	            x_data, y_data = self.y_pred, self.y_true
	            xlabel, ylabel = "Predicted values", "Actual values"

	            self.scatter_ = ax.scatter(x_data, y_data, **scatter_kwargs)

	            # force to have a squared axis
	            ax.set_aspect("equal", adjustable="datalim")
	            ax.set_xticks(np.linspace(min_value, max_value, num=5))
	            ax.set_yticks(np.linspace(min_value, max_value, num=5))
	        else:  # kind == "residual_vs_predicted"
	            # The reference line is the horizontal zero-residual line across
	            # the range of predictions.
	            self.line_ = ax.plot(
	                [np.min(self.y_pred), np.max(self.y_pred)],
	                [0, 0],
	                **line_kwargs,
	            )[0]
	            self.scatter_ = ax.scatter(
	                self.y_pred, self.y_true - self.y_pred, **scatter_kwargs
	            )
	            xlabel, ylabel = "Predicted values", "Residuals (actual - predicted)"

	        ax.set(xlabel=xlabel, ylabel=ylabel)

	        self.ax_ = ax
	        self.figure_ = ax.figure

	        return self

	    @classmethod
	    def from_estimator(
	        cls,
	        estimator,
	        X,
	        y,
	        *,
	        kind="residual_vs_predicted",
	        subsample=1_000,
	        random_state=None,
	        ax=None,
	        scatter_kwargs=None,
	        line_kwargs=None,
	    ):
	        """Plot the prediction error given a regressor and some data.

	        For general information regarding `scikit-learn` visualization tools,
	        read more in the :ref:`Visualization Guide <visualizations>`.
	        For details regarding interpreting these plots, refer to the
	        :ref:`Model Evaluation Guide <visualization_regression_evaluation>`.

	        .. versionadded:: 1.2

	        Parameters
	        ----------
	        estimator : estimator instance
	            Fitted regressor or a fitted :class:`~sklearn.pipeline.Pipeline`
	            in which the last estimator is a regressor.

	        X : {array-like, sparse matrix} of shape (n_samples, n_features)
	            Input values.

	        y : array-like of shape (n_samples,)
	            Target values.

	        kind : {"actual_vs_predicted", "residual_vs_predicted"}, \
	                default="residual_vs_predicted"
	            The type of plot to draw:

	            - "actual_vs_predicted" draws the observed values (y-axis) vs.
	              the predicted values (x-axis).
	            - "residual_vs_predicted" draws the residuals, i.e. difference
	              between observed and predicted values, (y-axis) vs. the predicted
	              values (x-axis).

	        subsample : float, int or None, default=1_000
	            Sampling the samples to be shown on the scatter plot. If `float`,
	            it should be between 0 and 1 and represents the proportion of the
	            original dataset. If `int`, it represents the number of samples
	            displayed on the scatter plot. If `None`, no subsampling will be
	            applied. By default, 1000 samples or less will be displayed.

	        random_state : int or RandomState, default=None
	            Controls the randomness when `subsample` is not `None`.
	            See :term:`Glossary <random_state>` for details.

	        ax : matplotlib axes, default=None
	            Axes object to plot on. If `None`, a new figure and axes is
	            created.

	        scatter_kwargs : dict, default=None
	            Dictionary with keywords passed to the `matplotlib.pyplot.scatter`
	            call.

	        line_kwargs : dict, default=None
	            Dictionary with keywords passed to the `matplotlib.pyplot.plot`
	            call to draw the optimal line.

	        Returns
	        -------
	        display : :class:`~sklearn.metrics.PredictionErrorDisplay`
	            Object that stores the computed values.

	        See Also
	        --------
	        PredictionErrorDisplay : Prediction error visualization for regression.
	        PredictionErrorDisplay.from_predictions : Prediction error visualization
	            given the true and predicted targets.

	        Examples
	        --------
	        >>> import matplotlib.pyplot as plt
	        >>> from sklearn.datasets import load_diabetes
	        >>> from sklearn.linear_model import Ridge
	        >>> from sklearn.metrics import PredictionErrorDisplay
	        >>> X, y = load_diabetes(return_X_y=True)
	        >>> ridge = Ridge().fit(X, y)
	        >>> disp = PredictionErrorDisplay.from_estimator(ridge, X, y)
	        >>> plt.show()
	        """
	        check_matplotlib_support(f"{cls.__name__}.from_estimator")

	        y_pred = estimator.predict(X)

	        # Subsampling and plotting are delegated to `from_predictions`.
	        return cls.from_predictions(
	            y_true=y,
	            y_pred=y_pred,
	            kind=kind,
	            subsample=subsample,
	            random_state=random_state,
	            ax=ax,
	            scatter_kwargs=scatter_kwargs,
	            line_kwargs=line_kwargs,
	        )

	    @classmethod
	    def from_predictions(
	        cls,
	        y_true,
	        y_pred,
	        *,
	        kind="residual_vs_predicted",
	        subsample=1_000,
	        random_state=None,
	        ax=None,
	        scatter_kwargs=None,
	        line_kwargs=None,
	    ):
	        """Plot the prediction error given the true and predicted targets.

	        For general information regarding `scikit-learn` visualization tools,
	        read more in the :ref:`Visualization Guide <visualizations>`.
	        For details regarding interpreting these plots, refer to the
	        :ref:`Model Evaluation Guide <visualization_regression_evaluation>`.

	        .. versionadded:: 1.2

	        Parameters
	        ----------
	        y_true : array-like of shape (n_samples,)
	            True target values.

	        y_pred : array-like of shape (n_samples,)
	            Predicted target values.

	        kind : {"actual_vs_predicted", "residual_vs_predicted"}, \
	                default="residual_vs_predicted"
	            The type of plot to draw:

	            - "actual_vs_predicted" draws the observed values (y-axis) vs.
	              the predicted values (x-axis).
	            - "residual_vs_predicted" draws the residuals, i.e. difference
	              between observed and predicted values, (y-axis) vs. the predicted
	              values (x-axis).

	        subsample : float, int or None, default=1_000
	            Sampling the samples to be shown on the scatter plot. If `float`,
	            it should be between 0 and 1 and represents the proportion of the
	            original dataset. If `int`, it represents the number of samples
	            displayed on the scatter plot. If `None`, no subsampling will be
	            applied. By default, 1000 samples or less will be displayed.

	        random_state : int or RandomState, default=None
	            Controls the randomness when `subsample` is not `None`.
	            See :term:`Glossary <random_state>` for details.

	        ax : matplotlib axes, default=None
	            Axes object to plot on. If `None`, a new figure and axes is
	            created.

	        scatter_kwargs : dict, default=None
	            Dictionary with keywords passed to the `matplotlib.pyplot.scatter`
	            call.

	        line_kwargs : dict, default=None
	            Dictionary with keywords passed to the `matplotlib.pyplot.plot`
	            call to draw the optimal line.

	        Returns
	        -------
	        display : :class:`~sklearn.metrics.PredictionErrorDisplay`
	            Object that stores the computed values.

	        Raises
	        ------
	        ValueError
	            If `subsample` is a non-positive integer, or a float outside the
	            open interval (0, 1).

	        See Also
	        --------
	        PredictionErrorDisplay : Prediction error visualization for regression.
	        PredictionErrorDisplay.from_estimator : Prediction error visualization
	            given an estimator and some data.

	        Examples
	        --------
	        >>> import matplotlib.pyplot as plt
	        >>> from sklearn.datasets import load_diabetes
	        >>> from sklearn.linear_model import Ridge
	        >>> from sklearn.metrics import PredictionErrorDisplay
	        >>> X, y = load_diabetes(return_X_y=True)
	        >>> ridge = Ridge().fit(X, y)
	        >>> y_pred = ridge.predict(X)
	        >>> disp = PredictionErrorDisplay.from_predictions(y_true=y, y_pred=y_pred)
	        >>> plt.show()
	        """
	        check_matplotlib_support(f"{cls.__name__}.from_predictions")

	        random_state = check_random_state(random_state)

	        n_samples = len(y_true)
	        # Integers are checked first: an integral count is used as-is, while
	        # any other real number is treated as a fraction of the dataset.
	        if isinstance(subsample, numbers.Integral):
	            if subsample <= 0:
	                raise ValueError(
	                    f"When an integer, subsample={subsample} should be positive."
	                )
	        elif isinstance(subsample, numbers.Real):
	            if subsample <= 0 or subsample >= 1:
	                raise ValueError(
	                    f"When a floating-point, subsample={subsample} should"
	                    " be in the (0, 1) range."
	                )
	            # Convert the proportion into an absolute number of samples.
	            subsample = int(n_samples * subsample)

	        # Only subsample when fewer points are requested than are available.
	        if subsample is not None and subsample < n_samples:
	            # NOTE: `choice` is called with its default `replace` setting, so
	            # indices are drawn with replacement and may repeat.
	            indices = random_state.choice(np.arange(n_samples), size=subsample)
	            y_true = _safe_indexing(y_true, indices, axis=0)
	            y_pred = _safe_indexing(y_pred, indices, axis=0)

	        viz = cls(
	            y_true=y_true,
	            y_pred=y_pred,
	        )

	        return viz.plot(
	            ax=ax,
	            kind=kind,
	            scatter_kwargs=scatter_kwargs,
	            line_kwargs=line_kwargs,
	        )