|
""" |
|
Various bayesian regression |
|
""" |
|
|
|
|
|
|
|
|
|
from math import log |
|
from numbers import Integral, Real |
|
|
|
import numpy as np |
|
from scipy import linalg |
|
from scipy.linalg import pinvh |
|
|
|
from ..base import RegressorMixin, _fit_context |
|
from ..utils import _safe_indexing |
|
from ..utils._param_validation import Interval |
|
from ..utils.extmath import fast_logdet |
|
from ..utils.validation import _check_sample_weight, validate_data |
|
from ._base import LinearModel, _preprocess_data, _rescale_data |
|
|
|
|
|
|
|
|
|
|
|
class BayesianRidge(RegressorMixin, LinearModel): |
|
"""Bayesian ridge regression. |
|
|
|
Fit a Bayesian ridge model. See the Notes section for details on this |
|
implementation and the optimization of the regularization parameters |
|
lambda (precision of the weights) and alpha (precision of the noise). |
|
|
|
Read more in the :ref:`User Guide <bayesian_regression>`. |
|
For an intuitive visualization of how the sinusoid is approximated by |
|
a polynomial using different pairs of initial values, see |
|
:ref:`sphx_glr_auto_examples_linear_model_plot_bayesian_ridge_curvefit.py`. |
|
|
|
Parameters |
|
---------- |
|
max_iter : int, default=300 |
|
Maximum number of iterations over the complete dataset before |
|
stopping independently of any early stopping criterion. |
|
|
|
.. versionchanged:: 1.3 |
|
|
|
tol : float, default=1e-3 |
|
Stop the algorithm if w has converged. |
|
|
|
alpha_1 : float, default=1e-6 |
|
Hyper-parameter : shape parameter for the Gamma distribution prior |
|
over the alpha parameter. |
|
|
|
alpha_2 : float, default=1e-6 |
|
Hyper-parameter : inverse scale parameter (rate parameter) for the |
|
Gamma distribution prior over the alpha parameter. |
|
|
|
lambda_1 : float, default=1e-6 |
|
Hyper-parameter : shape parameter for the Gamma distribution prior |
|
over the lambda parameter. |
|
|
|
lambda_2 : float, default=1e-6 |
|
Hyper-parameter : inverse scale parameter (rate parameter) for the |
|
Gamma distribution prior over the lambda parameter. |
|
|
|
alpha_init : float, default=None |
|
Initial value for alpha (precision of the noise). |
|
If not set, alpha_init is 1/Var(y). |
|
|
|
.. versionadded:: 0.22 |
|
|
|
lambda_init : float, default=None |
|
Initial value for lambda (precision of the weights). |
|
If not set, lambda_init is 1. |
|
|
|
.. versionadded:: 0.22 |
|
|
|
compute_score : bool, default=False |
|
If True, compute the log marginal likelihood at each iteration of the |
|
optimization. |
|
|
|
fit_intercept : bool, default=True |
|
Whether to calculate the intercept for this model. |
|
The intercept is not treated as a probabilistic parameter |
|
and thus has no associated variance. If set |
|
to False, no intercept will be used in calculations |
|
(i.e. data is expected to be centered). |
|
|
|
copy_X : bool, default=True |
|
If True, X will be copied; else, it may be overwritten. |
|
|
|
verbose : bool, default=False |
|
Verbose mode when fitting the model. |
|
|
|
Attributes |
|
---------- |
|
coef_ : array-like of shape (n_features,) |
|
Coefficients of the regression model (mean of distribution) |
|
|
|
intercept_ : float |
|
Independent term in decision function. Set to 0.0 if |
|
`fit_intercept = False`. |
|
|
|
alpha_ : float |
|
Estimated precision of the noise. |
|
|
|
lambda_ : float |
|
Estimated precision of the weights. |
|
|
|
sigma_ : array-like of shape (n_features, n_features) |
|
Estimated variance-covariance matrix of the weights |
|
|
|
scores_ : array-like of shape (n_iter_+1,) |
|
        If `compute_score` is True, value of the log marginal likelihood (to be
|
maximized) at each iteration of the optimization. The array starts |
|
with the value of the log marginal likelihood obtained for the initial |
|
values of alpha and lambda and ends with the value obtained for the |
|
estimated alpha and lambda. |
|
|
|
n_iter_ : int |
|
The actual number of iterations to reach the stopping criterion. |
|
|
|
X_offset_ : ndarray of shape (n_features,) |
|
If `fit_intercept=True`, offset subtracted for centering data to a |
|
zero mean. Set to np.zeros(n_features) otherwise. |
|
|
|
X_scale_ : ndarray of shape (n_features,) |
|
Set to np.ones(n_features). |
|
|
|
n_features_in_ : int |
|
Number of features seen during :term:`fit`. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
feature_names_in_ : ndarray of shape (`n_features_in_`,) |
|
Names of features seen during :term:`fit`. Defined only when `X` |
|
has feature names that are all strings. |
|
|
|
.. versionadded:: 1.0 |
|
|
|
See Also |
|
-------- |
|
ARDRegression : Bayesian ARD regression. |
|
|
|
Notes |
|
----- |
|
There exist several strategies to perform Bayesian ridge regression. This |
|
implementation is based on the algorithm described in Appendix A of |
|
(Tipping, 2001) where updates of the regularization parameters are done as |
|
suggested in (MacKay, 1992). Note that according to A New |
|
View of Automatic Relevance Determination (Wipf and Nagarajan, 2008) these |
|
update rules do not guarantee that the marginal likelihood is increasing |
|
between two consecutive iterations of the optimization. |
|
|
|
References |
|
---------- |
|
D. J. C. MacKay, Bayesian Interpolation, Computation and Neural Systems, |
|
Vol. 4, No. 3, 1992. |
|
|
|
M. E. Tipping, Sparse Bayesian Learning and the Relevance Vector Machine, |
|
Journal of Machine Learning Research, Vol. 1, 2001. |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn import linear_model |
|
>>> clf = linear_model.BayesianRidge() |
|
>>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2]) |
|
BayesianRidge() |
|
>>> clf.predict([[1, 1]]) |
|
array([1.]) |
|
""" |
|
|
|
_parameter_constraints: dict = { |
|
"max_iter": [Interval(Integral, 1, None, closed="left")], |
|
"tol": [Interval(Real, 0, None, closed="neither")], |
|
"alpha_1": [Interval(Real, 0, None, closed="left")], |
|
"alpha_2": [Interval(Real, 0, None, closed="left")], |
|
"lambda_1": [Interval(Real, 0, None, closed="left")], |
|
"lambda_2": [Interval(Real, 0, None, closed="left")], |
|
"alpha_init": [None, Interval(Real, 0, None, closed="left")], |
|
"lambda_init": [None, Interval(Real, 0, None, closed="left")], |
|
"compute_score": ["boolean"], |
|
"fit_intercept": ["boolean"], |
|
"copy_X": ["boolean"], |
|
"verbose": ["verbose"], |
|
} |
|
|
|
def __init__( |
|
self, |
|
*, |
|
max_iter=300, |
|
tol=1.0e-3, |
|
alpha_1=1.0e-6, |
|
alpha_2=1.0e-6, |
|
lambda_1=1.0e-6, |
|
lambda_2=1.0e-6, |
|
alpha_init=None, |
|
lambda_init=None, |
|
compute_score=False, |
|
fit_intercept=True, |
|
copy_X=True, |
|
verbose=False, |
|
): |
|
self.max_iter = max_iter |
|
self.tol = tol |
|
self.alpha_1 = alpha_1 |
|
self.alpha_2 = alpha_2 |
|
self.lambda_1 = lambda_1 |
|
self.lambda_2 = lambda_2 |
|
self.alpha_init = alpha_init |
|
self.lambda_init = lambda_init |
|
self.compute_score = compute_score |
|
self.fit_intercept = fit_intercept |
|
self.copy_X = copy_X |
|
self.verbose = verbose |
|
|
|
@_fit_context(prefer_skip_nested_validation=True) |
|
def fit(self, X, y, sample_weight=None): |
|
"""Fit the model. |
|
|
|
Parameters |
|
---------- |
|
X : ndarray of shape (n_samples, n_features) |
|
Training data. |
|
y : ndarray of shape (n_samples,) |
|
Target values. Will be cast to X's dtype if necessary. |
|
|
|
sample_weight : ndarray of shape (n_samples,), default=None |
|
Individual weights for each sample. |
|
|
|
.. versionadded:: 0.20 |
|
parameter *sample_weight* support to BayesianRidge. |
|
|
|
Returns |
|
------- |
|
self : object |
|
Returns the instance itself. |
|
""" |
|
X, y = validate_data( |
|
self, |
|
X, |
|
y, |
|
dtype=[np.float64, np.float32], |
|
force_writeable=True, |
|
y_numeric=True, |
|
) |
|
dtype = X.dtype |
|
|
|
if sample_weight is not None: |
|
sample_weight = _check_sample_weight(sample_weight, X, dtype=dtype) |
|
|
|
X, y, X_offset_, y_offset_, X_scale_ = _preprocess_data( |
|
X, |
|
y, |
|
fit_intercept=self.fit_intercept, |
|
copy=self.copy_X, |
|
sample_weight=sample_weight, |
|
) |
|
|
|
if sample_weight is not None: |
|
|
|
X, y, _ = _rescale_data(X, y, sample_weight) |
|
|
|
self.X_offset_ = X_offset_ |
|
self.X_scale_ = X_scale_ |
|
n_samples, n_features = X.shape |
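        # Initialization of the values of the parameters alpha (noise
        # precision) and lambda (weights precision).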
|
|
|
|
|
eps = np.finfo(np.float64).eps |
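        # `eps` is added to the denominator below to avoid division by zero
        # when np.var(y) is zero.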
|
|
|
|
|
alpha_ = self.alpha_init |
|
lambda_ = self.lambda_init |
|
if alpha_ is None: |
|
alpha_ = 1.0 / (np.var(y) + eps) |
|
if lambda_ is None: |
|
lambda_ = 1.0 |
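        # Store the initial values with the data dtype to avoid unintended
        # promotion to float64 in the following computations.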
|
|
|
|
|
alpha_ = np.asarray(alpha_, dtype=dtype) |
|
lambda_ = np.asarray(lambda_, dtype=dtype) |
|
|
|
verbose = self.verbose |
|
lambda_1 = self.lambda_1 |
|
lambda_2 = self.lambda_2 |
|
alpha_1 = self.alpha_1 |
|
alpha_2 = self.alpha_2 |
|
|
|
self.scores_ = list() |
|
coef_old_ = None |
|
|
|
XT_y = np.dot(X.T, y) |
|
U, S, Vh = linalg.svd(X, full_matrices=False) |
|
eigen_vals_ = S**2 |
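        # Convergence loop of the Bayesian ridge regression.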
|
|
|
|
|
for iter_ in range(self.max_iter): |
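            # Update the posterior mean of the weights for the current alpha_
            # and lambda_ and compute the corresponding sum of squared errors.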
|
|
|
|
|
coef_, rmse_ = self._update_coef_( |
|
X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_ |
|
) |
|
if self.compute_score: |
|
|
|
s = self._log_marginal_likelihood( |
|
n_samples, n_features, eigen_vals_, alpha_, lambda_, coef_, rmse_ |
|
) |
|
self.scores_.append(s) |
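            # Update alpha and lambda following the rules suggested in
            # (MacKay, 1992).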
|
|
|
|
|
gamma_ = np.sum((alpha_ * eigen_vals_) / (lambda_ + alpha_ * eigen_vals_)) |
|
lambda_ = (gamma_ + 2 * lambda_1) / (np.sum(coef_**2) + 2 * lambda_2) |
|
alpha_ = (n_samples - gamma_ + 2 * alpha_1) / (rmse_ + 2 * alpha_2) |
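            # Check for convergence of the posterior mean.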
|
|
|
|
|
if iter_ != 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol: |
|
if verbose: |
|
print("Convergence after ", str(iter_), " iterations") |
|
break |
|
coef_old_ = np.copy(coef_) |
|
|
|
self.n_iter_ = iter_ + 1 |
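        # Store the final hyperparameters and recompute the corresponding
        # posterior mean, log marginal likelihood and posterior covariance.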
|
|
|
|
|
|
|
self.alpha_ = alpha_ |
|
self.lambda_ = lambda_ |
|
self.coef_, rmse_ = self._update_coef_( |
|
X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_ |
|
) |
|
if self.compute_score: |
|
|
|
s = self._log_marginal_likelihood( |
|
n_samples, n_features, eigen_vals_, alpha_, lambda_, coef_, rmse_ |
|
) |
|
self.scores_.append(s) |
|
self.scores_ = np.array(self.scores_) |
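        # The posterior covariance of the weights is (1 / alpha_) times
        # scaled_sigma_ = (lambda_ / alpha_ * np.eye(n_features) + X.T @ X)^-1.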
|
|
|
|
|
scaled_sigma_ = np.dot( |
|
Vh.T, Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis] |
|
) |
|
self.sigma_ = (1.0 / alpha_) * scaled_sigma_ |
|
|
|
self._set_intercept(X_offset_, y_offset_, X_scale_) |
|
|
|
return self |
|
|
|
def predict(self, X, return_std=False): |
|
"""Predict using the linear model. |
|
|
|
In addition to the mean of the predictive distribution, also its |
|
standard deviation can be returned. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_samples, n_features) |
|
Samples. |
|
|
|
return_std : bool, default=False |
|
Whether to return the standard deviation of posterior prediction. |
|
|
|
Returns |
|
------- |
|
y_mean : array-like of shape (n_samples,) |
|
Mean of predictive distribution of query points. |
|
|
|
y_std : array-like of shape (n_samples,) |
|
Standard deviation of predictive distribution of query points. |
|
""" |
|
y_mean = self._decision_function(X) |
|
if not return_std: |
|
return y_mean |
|
else: |
|
sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1) |
|
y_std = np.sqrt(sigmas_squared_data + (1.0 / self.alpha_)) |
|
return y_mean, y_std |
|
|
|
def _update_coef_( |
|
self, X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_ |
|
): |
|
"""Update posterior mean and compute corresponding rmse. |
|
|
|
Posterior mean is given by coef_ = scaled_sigma_ * X.T * y where |
|
scaled_sigma_ = (lambda_/alpha_ * np.eye(n_features) |
|
+ np.dot(X.T, X))^-1 |
|
""" |
|
|
|
if n_samples > n_features: |
|
coef_ = np.linalg.multi_dot( |
|
[Vh.T, Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis], XT_y] |
|
) |
|
else: |
|
coef_ = np.linalg.multi_dot( |
|
[X.T, U / (eigen_vals_ + lambda_ / alpha_)[None, :], U.T, y] |
|
) |
|
|
|
rmse_ = np.sum((y - np.dot(X, coef_)) ** 2) |
|
|
|
return coef_, rmse_ |
|
|
|
def _log_marginal_likelihood( |
|
self, n_samples, n_features, eigen_vals, alpha_, lambda_, coef, rmse |
|
): |
|
"""Log marginal likelihood.""" |
|
alpha_1 = self.alpha_1 |
|
alpha_2 = self.alpha_2 |
|
lambda_1 = self.lambda_1 |
|
lambda_2 = self.lambda_2 |
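        # Compute the log of the determinant of the posterior covariance,
        # sigma = (lambda_ * np.eye(n_features) + alpha_ * np.dot(X.T, X))^-1,
        # from the eigenvalues of X.T @ X.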
|
|
|
|
|
|
|
|
|
if n_samples > n_features: |
|
logdet_sigma = -np.sum(np.log(lambda_ + alpha_ * eigen_vals)) |
|
else: |
|
logdet_sigma = np.full(n_features, lambda_, dtype=np.array(lambda_).dtype) |
|
logdet_sigma[:n_samples] += alpha_ * eigen_vals |
|
logdet_sigma = -np.sum(np.log(logdet_sigma)) |
|
|
|
score = lambda_1 * log(lambda_) - lambda_2 * lambda_ |
|
score += alpha_1 * log(alpha_) - alpha_2 * alpha_ |
|
score += 0.5 * ( |
|
n_features * log(lambda_) |
|
+ n_samples * log(alpha_) |
|
- alpha_ * rmse |
|
- lambda_ * np.sum(coef**2) |
|
+ logdet_sigma |
|
- n_samples * log(2 * np.pi) |
|
) |
|
|
|
return score |
|
|
|
|
|
|
|
|
|
|
|
|
|
class ARDRegression(RegressorMixin, LinearModel): |
|
"""Bayesian ARD regression. |
|
|
|
    Fit the weights of a regression model, using an ARD prior. The weights of
    the regression model are assumed to follow Gaussian distributions.
    Also estimate the parameters lambda (precisions of the distributions of the
    weights) and alpha (precision of the distribution of the noise).
    The estimation is done by an iterative procedure (Evidence Maximization).
|
|
|
Read more in the :ref:`User Guide <bayesian_regression>`. |
|
|
|
Parameters |
|
---------- |
|
max_iter : int, default=300 |
|
Maximum number of iterations. |
|
|
|
.. versionchanged:: 1.3 |
|
|
|
tol : float, default=1e-3 |
|
Stop the algorithm if w has converged. |
|
|
|
alpha_1 : float, default=1e-6 |
|
Hyper-parameter : shape parameter for the Gamma distribution prior |
|
over the alpha parameter. |
|
|
|
alpha_2 : float, default=1e-6 |
|
Hyper-parameter : inverse scale parameter (rate parameter) for the |
|
Gamma distribution prior over the alpha parameter. |
|
|
|
lambda_1 : float, default=1e-6 |
|
Hyper-parameter : shape parameter for the Gamma distribution prior |
|
over the lambda parameter. |
|
|
|
lambda_2 : float, default=1e-6 |
|
Hyper-parameter : inverse scale parameter (rate parameter) for the |
|
Gamma distribution prior over the lambda parameter. |
|
|
|
compute_score : bool, default=False |
|
        If True, compute the objective function at each iteration of the
        optimization.
|
|
|
    threshold_lambda : float, default=1e4
|
Threshold for removing (pruning) weights with high precision from |
|
the computation. |
|
|
|
fit_intercept : bool, default=True |
|
Whether to calculate the intercept for this model. If set |
|
to false, no intercept will be used in calculations |
|
(i.e. data is expected to be centered). |
|
|
|
copy_X : bool, default=True |
|
If True, X will be copied; else, it may be overwritten. |
|
|
|
verbose : bool, default=False |
|
Verbose mode when fitting the model. |
|
|
|
Attributes |
|
---------- |
|
coef_ : array-like of shape (n_features,) |
|
Coefficients of the regression model (mean of distribution) |
|
|
|
    alpha_ : float
        Estimated precision of the noise.

    lambda_ : array-like of shape (n_features,)
        Estimated precisions of the weights.

    sigma_ : array-like of shape (n_features, n_features)
        Estimated variance-covariance matrix of the weights.
|
|
|
    scores_ : list of float
        If computed (i.e. `compute_score=True`), value of the objective
        function (to be maximized) at each iteration of the optimization.
|
|
|
n_iter_ : int |
|
The actual number of iterations to reach the stopping criterion. |
|
|
|
.. versionadded:: 1.3 |
|
|
|
intercept_ : float |
|
Independent term in decision function. Set to 0.0 if |
|
``fit_intercept = False``. |
|
|
|
    X_offset_ : ndarray of shape (n_features,)
        If `fit_intercept=True`, offset subtracted for centering data to a
        zero mean. Set to np.zeros(n_features) otherwise.

    X_scale_ : ndarray of shape (n_features,)
        Set to np.ones(n_features).
|
|
|
n_features_in_ : int |
|
Number of features seen during :term:`fit`. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
feature_names_in_ : ndarray of shape (`n_features_in_`,) |
|
Names of features seen during :term:`fit`. Defined only when `X` |
|
has feature names that are all strings. |
|
|
|
.. versionadded:: 1.0 |
|
|
|
See Also |
|
-------- |
|
BayesianRidge : Bayesian ridge regression. |
|
|
|
Notes |
|
----- |
|
For an example, see :ref:`examples/linear_model/plot_ard.py |
|
<sphx_glr_auto_examples_linear_model_plot_ard.py>`. |
|
|
|
References |
|
---------- |
|
D. J. C. MacKay, Bayesian nonlinear modeling for the prediction |
|
competition, ASHRAE Transactions, 1994. |
|
|
|
R. Salakhutdinov, Lecture notes on Statistical Machine Learning, |
|
http://www.utstat.toronto.edu/~rsalakhu/sta4273/notes/Lecture2.pdf#page=15 |
|
Their beta is our ``self.alpha_`` |
|
Their alpha is our ``self.lambda_`` |
|
    ARD is a little different from the slides: only dimensions/features for
|
which ``self.lambda_ < self.threshold_lambda`` are kept and the rest are |
|
discarded. |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn import linear_model |
|
>>> clf = linear_model.ARDRegression() |
|
>>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2]) |
|
ARDRegression() |
|
>>> clf.predict([[1, 1]]) |
|
array([1.]) |
|
""" |
|
|
|
_parameter_constraints: dict = { |
|
"max_iter": [Interval(Integral, 1, None, closed="left")], |
|
"tol": [Interval(Real, 0, None, closed="left")], |
|
"alpha_1": [Interval(Real, 0, None, closed="left")], |
|
"alpha_2": [Interval(Real, 0, None, closed="left")], |
|
"lambda_1": [Interval(Real, 0, None, closed="left")], |
|
"lambda_2": [Interval(Real, 0, None, closed="left")], |
|
"compute_score": ["boolean"], |
|
"threshold_lambda": [Interval(Real, 0, None, closed="left")], |
|
"fit_intercept": ["boolean"], |
|
"copy_X": ["boolean"], |
|
"verbose": ["verbose"], |
|
} |
|
|
|
def __init__( |
|
self, |
|
*, |
|
max_iter=300, |
|
tol=1.0e-3, |
|
alpha_1=1.0e-6, |
|
alpha_2=1.0e-6, |
|
lambda_1=1.0e-6, |
|
lambda_2=1.0e-6, |
|
compute_score=False, |
|
threshold_lambda=1.0e4, |
|
fit_intercept=True, |
|
copy_X=True, |
|
verbose=False, |
|
): |
|
self.max_iter = max_iter |
|
self.tol = tol |
|
self.fit_intercept = fit_intercept |
|
self.alpha_1 = alpha_1 |
|
self.alpha_2 = alpha_2 |
|
self.lambda_1 = lambda_1 |
|
self.lambda_2 = lambda_2 |
|
self.compute_score = compute_score |
|
self.threshold_lambda = threshold_lambda |
|
self.copy_X = copy_X |
|
self.verbose = verbose |
|
|
|
@_fit_context(prefer_skip_nested_validation=True) |
|
def fit(self, X, y): |
|
"""Fit the model according to the given training data and parameters. |
|
|
|
        Iterative procedure to maximize the evidence.
|
|
|
Parameters |
|
---------- |
|
X : array-like of shape (n_samples, n_features) |
|
Training vector, where `n_samples` is the number of samples and |
|
`n_features` is the number of features. |
|
y : array-like of shape (n_samples,) |
|
            Target values. Will be cast to X's dtype if necessary.
|
|
|
Returns |
|
------- |
|
self : object |
|
Fitted estimator. |
|
""" |
|
X, y = validate_data( |
|
self, |
|
X, |
|
y, |
|
dtype=[np.float64, np.float32], |
|
force_writeable=True, |
|
y_numeric=True, |
|
ensure_min_samples=2, |
|
) |
|
dtype = X.dtype |
|
|
|
n_samples, n_features = X.shape |
|
coef_ = np.zeros(n_features, dtype=dtype) |
|
|
|
X, y, X_offset_, y_offset_, X_scale_ = _preprocess_data( |
|
X, y, fit_intercept=self.fit_intercept, copy=self.copy_X |
|
) |
|
|
|
self.X_offset_ = X_offset_ |
|
self.X_scale_ = X_scale_ |
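        # keep_lambda marks the coefficients that are still active, i.e. not
        # yet pruned by threshold_lambda.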
|
|
|
|
|
keep_lambda = np.ones(n_features, dtype=bool) |
|
|
|
lambda_1 = self.lambda_1 |
|
lambda_2 = self.lambda_2 |
|
alpha_1 = self.alpha_1 |
|
alpha_2 = self.alpha_2 |
|
verbose = self.verbose |
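        # Initialization of the values of the parameters. `eps` is added to
        # the denominator below to avoid division by zero when np.var(y) is
        # zero.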
|
|
|
|
|
eps = np.finfo(np.float64).eps |
|
|
|
|
|
|
|
alpha_ = np.asarray(1.0 / (np.var(y) + eps), dtype=dtype) |
|
lambda_ = np.ones(n_features, dtype=dtype) |
|
|
|
self.scores_ = list() |
|
coef_old_ = None |
|
|
|
def update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_): |
|
coef_[keep_lambda] = alpha_ * np.linalg.multi_dot( |
|
[sigma_, X[:, keep_lambda].T, y] |
|
) |
|
return coef_ |
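        # When n_samples < n_features it is cheaper to invert an
        # (n_samples, n_samples) matrix via the Woodbury identity than the
        # (n_features, n_features) posterior precision matrix.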
|
|
|
update_sigma = ( |
|
self._update_sigma |
|
if n_samples >= n_features |
|
else self._update_sigma_woodbury |
|
) |
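        # Iterative procedure of ARDRegression.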
|
|
|
for iter_ in range(self.max_iter): |
|
sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda) |
|
coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_) |
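            # Update alpha and lambda.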
|
|
|
|
|
rmse_ = np.sum((y - np.dot(X, coef_)) ** 2) |
|
gamma_ = 1.0 - lambda_[keep_lambda] * np.diag(sigma_) |
|
lambda_[keep_lambda] = (gamma_ + 2.0 * lambda_1) / ( |
|
(coef_[keep_lambda]) ** 2 + 2.0 * lambda_2 |
|
) |
|
alpha_ = (n_samples - gamma_.sum() + 2.0 * alpha_1) / ( |
|
rmse_ + 2.0 * alpha_2 |
|
) |
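            # Prune (set to zero) the weights with a precision above the
            # threshold.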
|
|
|
|
|
keep_lambda = lambda_ < self.threshold_lambda |
|
coef_[~keep_lambda] = 0 |
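            # Compute the objective function.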
|
|
|
|
|
if self.compute_score: |
|
s = (lambda_1 * np.log(lambda_) - lambda_2 * lambda_).sum() |
|
s += alpha_1 * log(alpha_) - alpha_2 * alpha_ |
|
s += 0.5 * ( |
|
fast_logdet(sigma_) |
|
+ n_samples * log(alpha_) |
|
+ np.sum(np.log(lambda_)) |
|
) |
|
s -= 0.5 * (alpha_ * rmse_ + (lambda_ * coef_**2).sum()) |
|
self.scores_.append(s) |
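            # Check for convergence of the posterior mean.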
|
|
|
|
|
if iter_ > 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol: |
|
if verbose: |
|
print("Converged after %s iterations" % iter_) |
|
break |
|
coef_old_ = np.copy(coef_) |
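            # Stop if all coefficients have been pruned.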
|
|
|
if not keep_lambda.any(): |
|
break |
|
|
|
self.n_iter_ = iter_ + 1 |
|
|
|
if keep_lambda.any(): |
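            # Update sigma and the posterior mean using the hyperparameters
            # obtained in the last iteration.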
|
|
|
sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda) |
|
coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_) |
|
else: |
|
sigma_ = np.array([]).reshape(0, 0) |
|
|
|
self.coef_ = coef_ |
|
self.alpha_ = alpha_ |
|
self.sigma_ = sigma_ |
|
self.lambda_ = lambda_ |
|
self._set_intercept(X_offset_, y_offset_, X_scale_) |
|
return self |
|
|
|
def _update_sigma_woodbury(self, X, alpha_, lambda_, keep_lambda): |
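        # Invert an (n_samples, n_samples) matrix via the Woodbury identity;
        # this is cheaper than inverting the (n_features, n_features)
        # posterior precision when n_samples < n_features.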
|
|
|
|
|
|
|
|
|
|
|
n_samples = X.shape[0] |
|
X_keep = X[:, keep_lambda] |
|
inv_lambda = 1 / lambda_[keep_lambda].reshape(1, -1) |
|
sigma_ = pinvh( |
|
np.eye(n_samples, dtype=X.dtype) / alpha_ |
|
+ np.dot(X_keep * inv_lambda, X_keep.T) |
|
) |
|
sigma_ = np.dot(sigma_, X_keep * inv_lambda) |
|
sigma_ = -np.dot(inv_lambda.reshape(-1, 1) * X_keep.T, sigma_) |
|
sigma_[np.diag_indices(sigma_.shape[1])] += 1.0 / lambda_[keep_lambda] |
|
return sigma_ |
|
|
|
def _update_sigma(self, X, alpha_, lambda_, keep_lambda): |
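        # Directly invert the posterior precision matrix restricted to the
        # kept features; used when n_samples >= n_features.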
|
|
|
|
|
|
|
X_keep = X[:, keep_lambda] |
|
gram = np.dot(X_keep.T, X_keep) |
|
eye = np.eye(gram.shape[0], dtype=X.dtype) |
|
sigma_inv = lambda_[keep_lambda] * eye + alpha_ * gram |
|
sigma_ = pinvh(sigma_inv) |
|
return sigma_ |
|
|
|
def predict(self, X, return_std=False): |
|
"""Predict using the linear model. |
|
|
|
In addition to the mean of the predictive distribution, also its |
|
standard deviation can be returned. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_samples, n_features) |
|
Samples. |
|
|
|
return_std : bool, default=False |
|
Whether to return the standard deviation of posterior prediction. |
|
|
|
Returns |
|
------- |
|
y_mean : array-like of shape (n_samples,) |
|
Mean of predictive distribution of query points. |
|
|
|
y_std : array-like of shape (n_samples,) |
|
Standard deviation of predictive distribution of query points. |
|
""" |
|
y_mean = self._decision_function(X) |
|
        if not return_std:
|
return y_mean |
|
else: |
|
col_index = self.lambda_ < self.threshold_lambda |
|
X = _safe_indexing(X, indices=col_index, axis=1) |
|
sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1) |
|
y_std = np.sqrt(sigmas_squared_data + (1.0 / self.alpha_)) |
|
return y_mean, y_std |
|
|