|
"""Truncated SVD for sparse matrices, aka latent semantic analysis (LSA).""" |
|
|
|
|
|
|
|
|
|
from numbers import Integral, Real |
|
|
|
import numpy as np |
|
import scipy.sparse as sp |
|
from scipy.sparse.linalg import svds |
|
|
|
from ..base import ( |
|
BaseEstimator, |
|
ClassNamePrefixFeaturesOutMixin, |
|
TransformerMixin, |
|
_fit_context, |
|
) |
|
from ..utils import check_array, check_random_state |
|
from ..utils._arpack import _init_arpack_v0 |
|
from ..utils._param_validation import Interval, StrOptions |
|
from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip |
|
from ..utils.sparsefuncs import mean_variance_axis |
|
from ..utils.validation import check_is_fitted, validate_data |
|
|
|
__all__ = ["TruncatedSVD"] |
|
|
|
|
|
class TruncatedSVD(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): |
|
"""Dimensionality reduction using truncated SVD (aka LSA). |
|
|
|
This transformer performs linear dimensionality reduction by means of |
|
truncated singular value decomposition (SVD). Contrary to PCA, this |
|
estimator does not center the data before computing the singular value |
|
decomposition. This means it can work with sparse matrices |
|
efficiently. |
|
|
|
In particular, truncated SVD works on term count/tf-idf matrices as |
|
returned by the vectorizers in :mod:`sklearn.feature_extraction.text`. In |
|
that context, it is known as latent semantic analysis (LSA). |
|
|
|
This estimator supports two algorithms: a fast randomized SVD solver, and |
|
a "naive" algorithm that uses ARPACK as an eigensolver on `X * X.T` or |
|
`X.T * X`, whichever is more efficient. |
|
|
|
Read more in the :ref:`User Guide <LSA>`. |
|
|
|
Parameters |
|
---------- |
|
n_components : int, default=2 |
|
Desired dimensionality of output data. |
|
If algorithm='arpack', must be strictly less than the number of features. |
|
If algorithm='randomized', must be less than or equal to the number of features. |
|
The default value is useful for visualisation. For LSA, a value of |
|
100 is recommended. |
|
|
|
algorithm : {'arpack', 'randomized'}, default='randomized' |
|
SVD solver to use. Either "arpack" for the ARPACK wrapper in SciPy |
|
(scipy.sparse.linalg.svds), or "randomized" for the randomized |
|
algorithm due to Halko (2009). |
|
|
|
n_iter : int, default=5 |
|
Number of iterations for randomized SVD solver. Not used by ARPACK. The |
|
default is larger than the default in |
|
:func:`~sklearn.utils.extmath.randomized_svd` to handle sparse |
|
matrices that may have large slowly decaying spectrum. |
|
|
|
n_oversamples : int, default=10 |
|
Number of oversamples for randomized SVD solver. Not used by ARPACK. |
|
See :func:`~sklearn.utils.extmath.randomized_svd` for a complete |
|
description. |
|
|
|
.. versionadded:: 1.1 |
|
|
|
power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto' |
|
Power iteration normalizer for randomized SVD solver. |
|
Not used by ARPACK. See :func:`~sklearn.utils.extmath.randomized_svd` |
|
for more details. |
|
|
|
.. versionadded:: 1.1 |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
Used during randomized svd. Pass an int for reproducible results across |
|
multiple function calls. |
|
See :term:`Glossary <random_state>`. |
|
|
|
tol : float, default=0.0 |
|
Tolerance for ARPACK. 0 means machine precision. Ignored by randomized |
|
SVD solver. |
|
|
|
Attributes |
|
---------- |
|
components_ : ndarray of shape (n_components, n_features) |
|
The right singular vectors of the input data. |
|
|
|
explained_variance_ : ndarray of shape (n_components,) |
|
The variance of the training samples transformed by a projection to |
|
each component. |
|
|
|
explained_variance_ratio_ : ndarray of shape (n_components,) |
|
Percentage of variance explained by each of the selected components. |
|
|
|
singular_values_ : ndarray of shape (n_components,) |
|
The singular values corresponding to each of the selected components. |
|
The singular values are equal to the 2-norms of the ``n_components`` |
|
variables in the lower-dimensional space. |
|
|
|
n_features_in_ : int |
|
Number of features seen during :term:`fit`. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
feature_names_in_ : ndarray of shape (`n_features_in_`,) |
|
Names of features seen during :term:`fit`. Defined only when `X` |
|
has feature names that are all strings. |
|
|
|
.. versionadded:: 1.0 |
|
|
|
See Also |
|
-------- |
|
DictionaryLearning : Find a dictionary that sparsely encodes data. |
|
FactorAnalysis : A simple linear generative model with |
|
Gaussian latent variables. |
|
IncrementalPCA : Incremental principal components analysis. |
|
KernelPCA : Kernel Principal component analysis. |
|
NMF : Non-Negative Matrix Factorization. |
|
PCA : Principal component analysis. |
|
|
|
Notes |
|
----- |
|
SVD suffers from a problem called "sign indeterminacy", which means the |
|
sign of the ``components_`` and the output from transform depend on the |
|
algorithm and random state. To work around this, fit instances of this |
|
class to data once, then keep the instance around to do transformations. |
|
|
|
References |
|
---------- |
|
:arxiv:`Halko, et al. (2009). "Finding structure with randomness: |
|
Stochastic algorithms for constructing approximate matrix decompositions" |
|
<0909.4061>` |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.decomposition import TruncatedSVD |
|
>>> from scipy.sparse import csr_matrix |
|
>>> import numpy as np |
|
>>> np.random.seed(0) |
|
>>> X_dense = np.random.rand(100, 100) |
|
>>> X_dense[:, 2 * np.arange(50)] = 0 |
|
>>> X = csr_matrix(X_dense) |
|
>>> svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42) |
|
>>> svd.fit(X) |
|
TruncatedSVD(n_components=5, n_iter=7, random_state=42) |
|
>>> print(svd.explained_variance_ratio_) |
|
[0.0157... 0.0512... 0.0499... 0.0479... 0.0453...] |
|
>>> print(svd.explained_variance_ratio_.sum()) |
|
0.2102... |
|
>>> print(svd.singular_values_) |
|
[35.2410... 4.5981... 4.5420... 4.4486... 4.3288...] |
|
""" |
|
|
|
_parameter_constraints: dict = { |
|
"n_components": [Interval(Integral, 1, None, closed="left")], |
|
"algorithm": [StrOptions({"arpack", "randomized"})], |
|
"n_iter": [Interval(Integral, 0, None, closed="left")], |
|
"n_oversamples": [Interval(Integral, 1, None, closed="left")], |
|
"power_iteration_normalizer": [StrOptions({"auto", "OR", "LU", "none"})], |
|
"random_state": ["random_state"], |
|
"tol": [Interval(Real, 0, None, closed="left")], |
|
} |
|
|
|
def __init__( |
|
self, |
|
n_components=2, |
|
*, |
|
algorithm="randomized", |
|
n_iter=5, |
|
n_oversamples=10, |
|
power_iteration_normalizer="auto", |
|
random_state=None, |
|
tol=0.0, |
|
): |
|
self.algorithm = algorithm |
|
self.n_components = n_components |
|
self.n_iter = n_iter |
|
self.n_oversamples = n_oversamples |
|
self.power_iteration_normalizer = power_iteration_normalizer |
|
self.random_state = random_state |
|
self.tol = tol |
|
|
|
def fit(self, X, y=None): |
|
"""Fit model on training data X. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_samples, n_features) |
|
Training data. |
|
|
|
y : Ignored |
|
Not used, present here for API consistency by convention. |
|
|
|
Returns |
|
------- |
|
self : object |
|
Returns the transformer object. |
|
""" |
|
self.fit_transform(X) |
|
return self |
|
|
|
@_fit_context(prefer_skip_nested_validation=True) |
|
def fit_transform(self, X, y=None): |
|
"""Fit model to X and perform dimensionality reduction on X. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_samples, n_features) |
|
Training data. |
|
|
|
y : Ignored |
|
Not used, present here for API consistency by convention. |
|
|
|
Returns |
|
------- |
|
X_new : ndarray of shape (n_samples, n_components) |
|
Reduced version of X. This will always be a dense array. |
|
""" |
|
X = validate_data(self, X, accept_sparse=["csr", "csc"], ensure_min_features=2) |
|
random_state = check_random_state(self.random_state) |
|
|
|
if self.algorithm == "arpack": |
|
v0 = _init_arpack_v0(min(X.shape), random_state) |
|
U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol, v0=v0) |
|
|
|
|
|
Sigma = Sigma[::-1] |
|
|
|
U, VT = svd_flip(U[:, ::-1], VT[::-1], u_based_decision=False) |
|
|
|
elif self.algorithm == "randomized": |
|
if self.n_components > X.shape[1]: |
|
raise ValueError( |
|
f"n_components({self.n_components}) must be <=" |
|
f" n_features({X.shape[1]})." |
|
) |
|
U, Sigma, VT = randomized_svd( |
|
X, |
|
self.n_components, |
|
n_iter=self.n_iter, |
|
n_oversamples=self.n_oversamples, |
|
power_iteration_normalizer=self.power_iteration_normalizer, |
|
random_state=random_state, |
|
flip_sign=False, |
|
) |
|
U, VT = svd_flip(U, VT, u_based_decision=False) |
|
|
|
self.components_ = VT |
|
|
|
|
|
|
|
if self.algorithm == "randomized" or ( |
|
self.algorithm == "arpack" and self.tol > 0 |
|
): |
|
X_transformed = safe_sparse_dot(X, self.components_.T) |
|
else: |
|
X_transformed = U * Sigma |
|
|
|
|
|
self.explained_variance_ = exp_var = np.var(X_transformed, axis=0) |
|
if sp.issparse(X): |
|
_, full_var = mean_variance_axis(X, axis=0) |
|
full_var = full_var.sum() |
|
else: |
|
full_var = np.var(X, axis=0).sum() |
|
self.explained_variance_ratio_ = exp_var / full_var |
|
self.singular_values_ = Sigma |
|
|
|
return X_transformed |
|
|
|
def transform(self, X): |
|
"""Perform dimensionality reduction on X. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_samples, n_features) |
|
New data. |
|
|
|
Returns |
|
------- |
|
X_new : ndarray of shape (n_samples, n_components) |
|
Reduced version of X. This will always be a dense array. |
|
""" |
|
check_is_fitted(self) |
|
X = validate_data(self, X, accept_sparse=["csr", "csc"], reset=False) |
|
return safe_sparse_dot(X, self.components_.T) |
|
|
|
def inverse_transform(self, X): |
|
"""Transform X back to its original space. |
|
|
|
Returns an array X_original whose transform would be X. |
|
|
|
Parameters |
|
---------- |
|
X : array-like of shape (n_samples, n_components) |
|
New data. |
|
|
|
Returns |
|
------- |
|
X_original : ndarray of shape (n_samples, n_features) |
|
Note that this is always a dense array. |
|
""" |
|
X = check_array(X) |
|
return np.dot(X, self.components_) |
|
|
|
def __sklearn_tags__(self): |
|
tags = super().__sklearn_tags__() |
|
tags.input_tags.sparse = True |
|
tags.transformer_tags.preserves_dtype = ["float64", "float32"] |
|
return tags |
|
|
|
@property |
|
def _n_features_out(self): |
|
"""Number of transformed output features.""" |
|
return self.components_.shape[0] |
|
|