From df6232d0c4cae1c4b0a90a4b183bd18bcf487f7a Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Tue, 27 Oct 2020 13:10:18 +1100
Subject: [PATCH 01/46] Initial commit for PCA on sparse data

Currently, `.fit` and `.fit_transform` work. Need to fix `.transform`.
---
 sklearn/decomposition/_pca.py | 40 +++++++++++++++++++++++++++--------
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
index 124f876fc4653..0434f34c2b81e 100644
--- a/sklearn/decomposition/_pca.py
+++ b/sklearn/decomposition/_pca.py
@@ -16,11 +16,12 @@
 import numpy as np
 from scipy import linalg
 from scipy.special import gammaln
-from scipy.sparse import issparse
-from scipy.sparse.linalg import svds
+from scipy.sparse import issparse, spmatrix
+from scipy.sparse.linalg import svds, LinearOperator
 
 from ._base import _BasePCA
 from ..utils import check_random_state
+from ..utils.sparsefuncs import mean_variance_axis
 from ..utils.extmath import fast_logdet, randomized_svd, svd_flip
 from ..utils.extmath import stable_cumsum
 from ..utils.validation import check_is_fitted
@@ -108,6 +109,20 @@ def _infer_dimension(spectrum, n_samples):
     return ll.argmax()
 
 
+def _implicitly_center(X: spmatrix, mu: np.ndarray) -> LinearOperator:
+    mu = mu[None, :]
+    XH = X.T.conj(copy=False)
+    _ones = np.ones(X.shape[0])[None, :].dot
+    return LinearOperator(
+        matvec=lambda x: X.dot(x) - mu.dot(x),
+        dtype=X.dtype,
+        matmat=lambda x: X.dot(x) - mu.dot(x),
+        shape=X.shape,
+        rmatvec=lambda x: XH.dot(x) - mu.T.dot(_ones(x)),
+        rmatmat=lambda x: XH.dot(x) - mu.T.dot(_ones(x)),
+    )
+
+
 class PCA(_BasePCA):
     """Principal component analysis (PCA).
 
@@ -389,11 +404,14 @@ def _fit(self, X):
 
         # Raise an error for sparse input.
         # This is more informative than the generic one raised by check_array.
-        if issparse(X):
-            raise TypeError('PCA does not support sparse input. See '
-                            'TruncatedSVD for a possible alternative.')
+        if issparse(X) and self.svd_solver != 'arpack':
+            raise TypeError(
+                'PCA only support sparse inputs with the "arpack" solver. See '
+                'TruncatedSVD for a possible alternative.'
+            )
 
         X = self._validate_data(X, dtype=[np.float64, np.float32],
+                                accept_sparse=("csr", "csc"),
                                 ensure_2d=True, copy=self.copy)
 
         # Handle n_components==None
@@ -523,8 +541,14 @@ def _fit_truncated(self, X, n_components, svd_solver):
         random_state = check_random_state(self.random_state)
 
         # Center data
-        self.mean_ = np.mean(X, axis=0)
-        X -= self.mean_
+        if issparse(X):
+            self.mean_, total_var = mean_variance_axis(X, axis=0)
+            total_var *= n_samples / (n_samples - 1)  # ddof=1
+            X = _implicitly_center(X, self.mean_)
+        else:
+            self.mean_ = np.mean(X, axis=0)
+            total_var = np.var(X, ddof=1, axis=0)
+            X -= self.mean_
 
         if svd_solver == 'arpack':
             # random init solution, as ARPACK does it internally
@@ -549,11 +573,9 @@ def _fit_truncated(self, X, n_components, svd_solver):
 
         # Get variance explained by singular values
         self.explained_variance_ = (S ** 2) / (n_samples - 1)
-        total_var = np.var(X, ddof=1, axis=0)
         self.explained_variance_ratio_ = \
             self.explained_variance_ / total_var.sum()
         self.singular_values_ = S.copy()  # Store the singular values.
-
         if self.n_components_ < min(n_features, n_samples):
             self.noise_variance_ = (total_var.sum() -
                                     self.explained_variance_.sum())

From 14a89545a92e8e2acafc2ee60bba2b4977241527 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Tue, 27 Oct 2020 13:24:24 +1100
Subject: [PATCH 02/46] Allow pca.transform(spmatrix)

---
 sklearn/decomposition/_base.py | 29 ++++++++++++++++++++++++++---
 sklearn/decomposition/_pca.py  | 20 +++-----------------
 2 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/sklearn/decomposition/_base.py b/sklearn/decomposition/_base.py
index 4f17d3df7ba80..3fcb892a0bb7d 100644
--- a/sklearn/decomposition/_base.py
+++ b/sklearn/decomposition/_base.py
@@ -10,12 +10,27 @@
 
 import numpy as np
 from scipy import linalg
+from scipy.sparse import spmatrix, issparse
+from scipy.sparse.linalg import LinearOperator
 
 from ..base import BaseEstimator, TransformerMixin
 from ..utils.validation import check_is_fitted
 from abc import ABCMeta, abstractmethod
 
 
+def _implicitly_center(X: spmatrix, mu: np.ndarray) -> LinearOperator:
+    mu = mu[None, :]
+    XH = X.T.conj(copy=False)
+    _ones = np.ones(X.shape[0])[None, :].dot
+    return LinearOperator(
+        matvec=lambda x: X.dot(x) - mu.dot(x),
+        dtype=X.dtype,
+        matmat=lambda x: X.dot(x) - mu.dot(x),
+        shape=X.shape,
+        rmatvec=lambda x: XH.dot(x) - mu.T.dot(_ones(x)),
+        rmatmat=lambda x: XH.dot(x) - mu.T.dot(_ones(x)),
+    )
+
 class _BasePCA(TransformerMixin, BaseEstimator, metaclass=ABCMeta):
     """Base class for PCA methods.
 
@@ -123,10 +138,18 @@ def transform(self, X):
         """
         check_is_fitted(self)
 
-        X = self._validate_data(X, dtype=[np.float64, np.float32], reset=False)
+        X = self._validate_data(
+            X,
+            accept_sparse=("csr", "csc"),
+            dtype=[np.float64, np.float32],
+            reset=False
+        )
         if self.mean_ is not None:
-            X = X - self.mean_
-        X_transformed = np.dot(X, self.components_.T)
+            if issparse(X):
+                X = _implicitly_center(X, self.mean_)
+            else:
+                X = X - self.mean_
+        X_transformed = X @ self.components_.T
         if self.whiten:
             X_transformed /= np.sqrt(self.explained_variance_)
         return X_transformed
diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
index 0434f34c2b81e..007e91f32e195 100644
--- a/sklearn/decomposition/_pca.py
+++ b/sklearn/decomposition/_pca.py
@@ -16,10 +16,10 @@
 import numpy as np
 from scipy import linalg
 from scipy.special import gammaln
-from scipy.sparse import issparse, spmatrix
-from scipy.sparse.linalg import svds, LinearOperator
+from scipy.sparse import issparse
+from scipy.sparse.linalg import svds
 
-from ._base import _BasePCA
+from ._base import _BasePCA, _implicitly_center
 from ..utils import check_random_state
 from ..utils.sparsefuncs import mean_variance_axis
 from ..utils.extmath import fast_logdet, randomized_svd, svd_flip
@@ -109,20 +109,6 @@ def _infer_dimension(spectrum, n_samples):
     return ll.argmax()
 
 
-def _implicitly_center(X: spmatrix, mu: np.ndarray) -> LinearOperator:
-    mu = mu[None, :]
-    XH = X.T.conj(copy=False)
-    _ones = np.ones(X.shape[0])[None, :].dot
-    return LinearOperator(
-        matvec=lambda x: X.dot(x) - mu.dot(x),
-        dtype=X.dtype,
-        matmat=lambda x: X.dot(x) - mu.dot(x),
-        shape=X.shape,
-        rmatvec=lambda x: XH.dot(x) - mu.T.dot(_ones(x)),
-        rmatmat=lambda x: XH.dot(x) - mu.T.dot(_ones(x)),
-    )
-
-
 class PCA(_BasePCA):
     """Principal component analysis (PCA).
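The core trick in the two patches above is wrapping the sparse matrix in a
scipy.sparse.linalg.LinearOperator that behaves like the centered matrix X - mu
without ever materializing it. A minimal, self-contained sketch of that idea,
using only public NumPy/SciPy APIs (the name `implicit_center` and the matrix
sizes here are illustrative stand-ins for the private `_implicitly_center`
helper introduced above, not the sklearn code itself):

    import numpy as np
    from scipy import sparse
    from scipy.sparse.linalg import LinearOperator

    def implicit_center(X, mu):
        """Operator acting like the dense, column-centered X - mu."""
        mu = np.asarray(mu).ravel()      # (n_features,)
        mu_row = mu[None, :]             # (1, n_features)
        return LinearOperator(
            shape=X.shape,
            dtype=X.dtype,
            # (X - 1 mu) v = X v - (mu . v) 1
            matvec=lambda v: X @ v - mu_row @ v,
            matmat=lambda V: X @ V - mu_row @ V,
            # (X - 1 mu)^T y = X^T y - mu (1^T y)
            rmatvec=lambda y: X.T @ y - mu * y.sum(),
            rmatmat=lambda Y: X.T @ Y - mu[:, None] * Y.sum(axis=0, keepdims=True),
        )

    rng = np.random.default_rng(0)
    X = sparse.random(50, 20, density=0.1, format="csr", random_state=rng)
    mu = np.asarray(X.mean(axis=0)).ravel()

    op = implicit_center(X, mu)
    dense_centered = X.toarray() - mu

    v = rng.standard_normal(20)
    assert np.allclose(op @ v, dense_centered @ v)
    y = rng.standard_normal(50)
    assert np.allclose(op.rmatvec(y), dense_centered.T @ y)

Because centering only shifts each column, every product with the operator
costs one sparse product plus a rank-one correction, so the sparsity of X is
fully preserved.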
From ed8733424efd0485bf00873e8e83817f3c6a769c Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Fri, 26 Mar 2021 15:55:40 +1100
Subject: [PATCH 03/46] cleaner implicit centering

---
 sklearn/decomposition/_base.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/sklearn/decomposition/_base.py b/sklearn/decomposition/_base.py
index 3fcb892a0bb7d..c45c1d4893044 100644
--- a/sklearn/decomposition/_base.py
+++ b/sklearn/decomposition/_base.py
@@ -18,17 +18,16 @@
 from abc import ABCMeta, abstractmethod
 
 
-def _implicitly_center(X: spmatrix, mu: np.ndarray) -> LinearOperator:
+def _implicitly_center(X, mu):
     mu = mu[None, :]
-    XH = X.T.conj(copy=False)
-    _ones = np.ones(X.shape[0])[None, :].dot
+    XT = X.T.conj(copy=False)
     return LinearOperator(
-        matvec=lambda x: X.dot(x) - mu.dot(x),
+        matvec = lambda x: X @ x - mu @ x,
+        matmat = lambda x: X @ x - mu @ x,
+        rmatvec = lambda x: XT @ x - (mu * x.sum()),
+        rmatmat = lambda x: XT @ x - (mu * x.sum()),
         dtype=X.dtype,
-        matmat=lambda x: X.dot(x) - mu.dot(x),
-        shape=X.shape,
-        rmatvec=lambda x: XH.dot(x) - mu.T.dot(_ones(x)),
-        rmatmat=lambda x: XH.dot(x) - mu.T.dot(_ones(x)),
+        shape=X.shape
     )
 
 class _BasePCA(TransformerMixin, BaseEstimator, metaclass=ABCMeta):

From f5158a1e945dd3dbfe7af1ecae09d70a912a2aac Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Wed, 24 May 2023 11:43:16 -0700
Subject: [PATCH 04/46] Integrate tests from Andrey

Co-authored-by: Andrey Portnoy
---
 sklearn/decomposition/_pca.py           | 20 ++++---
 sklearn/decomposition/tests/test_pca.py | 72 ++++++++++++++++++++-----
 2 files changed, 72 insertions(+), 20 deletions(-)

diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
index 0031fa9a5a3e0..0ba5ebaacc3b8 100644
--- a/sklearn/decomposition/_pca.py
+++ b/sklearn/decomposition/_pca.py
@@ -643,14 +643,18 @@ def _fit_truncated(self, X, n_components, svd_solver):
 
         # Get variance explained by singular values
         self.explained_variance_ = (S**2) / (n_samples - 1)
 
-        # Workaround in-place variance calculation since at the time numpy
-        # did not have a way to calculate variance in-place.
-        N = X.shape[0] - 1
-        np.square(X, out=X)
-        np.sum(X, axis=0, out=X[0])
-        total_var = (X[0] / N).sum()
-
-        self.explained_variance_ratio_ = self.explained_variance_ / total_var
+        # Workaround in-place variance calculation for dense arrays since at
+        # the time numpy did not have a way to calculate variance in-place.
+
+        # if total_var is None:
+        #     N = X.shape[0] - 1
+        #     np.square(X, out=X)
+        #     np.sum(X, axis=0, out=X[0])
+        #     total_var = (X[0] / N).sum()
+
+        self.explained_variance_ratio_ = (
+            self.explained_variance_ / total_var[:n_components]
+        )
         self.singular_values_ = S.copy()  # Store the singular values.
 
         if self.n_components_ < min(n_features, n_samples):
             self.noise_variance_ = total_var - self.explained_variance_.sum()
diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index 5bf893f92fd16..aa46924748c58 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -1,6 +1,7 @@
 import numpy as np
 import scipy as sp
 from numpy.testing import assert_array_equal
+from scipy.sparse.linalg import LinearOperator
 
 import pytest
 import warnings
@@ -14,7 +15,10 @@
 from sklearn.decomposition._pca import _infer_dimension
 
 iris = datasets.load_iris()
-PCA_SOLVERS = ["full", "arpack", "randomized", "auto"]
+PCA_SOLVERS = ["full", "arpack", "randomized", "auto", "lobpcg"]
+
+SPARSE_M, SPARSE_N = 1000, 300  # arbitrary
+SPARSE_MAX_COMPONENTS = min(SPARSE_M, SPARSE_N)
 
 
 @pytest.mark.parametrize("svd_solver", PCA_SOLVERS)
@@ -39,6 +43,61 @@ def test_pca(svd_solver, n_components):
     assert_allclose(np.dot(cov, precision), np.eye(X.shape[1]), atol=1e-12)
 
 
+def linear_operator_from_matrix(A):
+    return LinearOperator(
+        shape=A.shape,
+        dtype=A.dtype,
+        matvec=lambda x: A @ x,
+        rmatvec=lambda x: A.T @ x,
+        matmat=lambda X: A @ X,
+        rmatmat=lambda X: A.T @ X,
+    )
+
+
+def test_linear_operator_matmul():
+    A = np.array([[1, 2, 3], [4, 5, 6]])
+    B = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
+    B = linear_operator_from_matrix(B)
+    expected_error = "Input operand 1 does not have enough dimensions"
+    with pytest.raises(ValueError, match=expected_error):
+        A @ B
+
+
+def test_linear_operator_reversed_matmul():
+    A = np.array([[1, 2, 3], [4, 5, 6]])
+    B = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
+    B = linear_operator_from_matrix(B)
+    result = (B.T @ A.T).T
+    assert np.allclose(result, [[38, 44, 50, 56], [83, 98, 113, 128]])
+
+
+@pytest.mark.parametrize("density", [0.01, 0.05, 0.10, 0.30])
+@pytest.mark.parametrize("n_components", [1, 2, 3, 10, SPARSE_MAX_COMPONENTS])
+@pytest.mark.parametrize("format", ["csr", "csc"])
+@pytest.mark.parametrize("svd_solver", ["arpack"])
+@pytest.mark.parametrize(
+    "rtol", [1e-07]
+)  # , 1e-06, 1e-05, 1e-04, 1e-03, 1e-02, 1e-01, 1e-00])
+def test_pca_sparse(
+    global_random_seed, rtol, svd_solver, format, n_components, density
+):
+    if svd_solver in ["lobpcg", "arpack"] and n_components == SPARSE_MAX_COMPONENTS:
+        pytest.skip("lobpcg and arpack don't support full solves")
+    random_state = np.random.RandomState(global_random_seed)
+    X = sp.sparse.random(
+        SPARSE_M, SPARSE_N, format=format, random_state=random_state, density=density
+    )
+    pca = PCA(n_components=n_components, svd_solver=svd_solver)
+    pca.fit(X)
+
+    Xd = np.asarray(X.todense())
+    pcad = PCA(n_components=n_components, svd_solver=svd_solver)
+    pcad.fit(Xd)
+
+    assert_allclose(pca.components_, pcad.components_, rtol=rtol)
+    assert_allclose(pca.singular_values_, pcad.singular_values_, rtol=rtol)
+
+
 def test_no_empty_slice_warning():
     # test if we avoid numpy warnings for computing over empty arrays
     n_components = 10
@@ -492,17 +551,6 @@ def test_pca_svd_solver_auto(data, n_components, expected_solver):
     assert_allclose(pca_auto.components_, pca_test.components_)
 
 
-@pytest.mark.parametrize("svd_solver", PCA_SOLVERS)
-def test_pca_sparse_input(svd_solver):
-    X = np.random.RandomState(0).rand(5, 4)
-    X = sp.sparse.csr_matrix(X)
-    assert sp.sparse.issparse(X)
-
-    pca = PCA(n_components=3, svd_solver=svd_solver)
-    with pytest.raises(TypeError):
-        pca.fit(X)
-
-
 @pytest.mark.parametrize("svd_solver", PCA_SOLVERS)
 def test_pca_deterministic_output(svd_solver):
     rng = np.random.RandomState(0)

From 88492fba52de04a324d3ea0083f9332564c4fabf Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Wed, 24 May 2023 15:31:06 -0700
Subject: [PATCH 05/46] Clean up total var, pass tests

---
 sklearn/decomposition/_pca.py           | 20 ++++++++------------
 sklearn/decomposition/tests/test_pca.py |  2 +-
 2 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
index 0ba5ebaacc3b8..f21cff3654bc5 100644
--- a/sklearn/decomposition/_pca.py
+++ b/sklearn/decomposition/_pca.py
@@ -606,13 +606,13 @@ def _fit_truncated(self, X, n_components, svd_solver):
         random_state = check_random_state(self.random_state)
 
         # Center data
+        total_var = None
         if issparse(X):
-            self.mean_, total_var = mean_variance_axis(X, axis=0)
-            total_var *= n_samples / (n_samples - 1)  # ddof=1
+            self.mean_, var = mean_variance_axis(X, axis=0)
+            total_var = var.sum() * n_samples / (n_samples - 1)  # ddof=1
             X = _implicitly_center(X, self.mean_)
         else:
             self.mean_ = np.mean(X, axis=0)
             X -= self.mean_
 
         if svd_solver == "arpack":
@@ -645,16 +645,12 @@ def _fit_truncated(self, X, n_components, svd_solver):
 
         # Workaround in-place variance calculation for dense arrays since at
         # the time numpy did not have a way to calculate variance in-place.
+        if total_var is None:
+            np.square(X, out=X)
+            np.sum(X, axis=0, out=X[0])
+            total_var = (X[0] / (n_samples - 1)).sum()
 
-        # if total_var is None:
-        #     N = X.shape[0] - 1
-        #     np.square(X, out=X)
-        #     np.sum(X, axis=0, out=X[0])
-        #     total_var = (X[0] / N).sum()
-
-        self.explained_variance_ratio_ = (
-            self.explained_variance_ / total_var[:n_components]
-        )
+        self.explained_variance_ratio_ = self.explained_variance_ / total_var
         self.singular_values_ = S.copy()  # Store the singular values.
         if self.n_components_ < min(n_features, n_samples):
             self.noise_variance_ = total_var - self.explained_variance_.sum()
diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index aa46924748c58..a5d1b8669c68f 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -15,7 +15,7 @@
 from sklearn.decomposition._pca import _infer_dimension
 
 iris = datasets.load_iris()
-PCA_SOLVERS = ["full", "arpack", "randomized", "auto", "lobpcg"]
+PCA_SOLVERS = ["full", "arpack", "randomized", "auto"]
 
 SPARSE_M, SPARSE_N = 1000, 300  # arbitrary
 SPARSE_MAX_COMPONENTS = min(SPARSE_M, SPARSE_N)

From b3b1f5c863a40e942676bf413b1a3095382e28e9 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Fri, 18 Aug 2023 16:23:38 +0200
Subject: [PATCH 06/46] remove merge artifact

---
 sklearn/decomposition/_pca.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
index 823e9345b6755..4d8b7980b26b4 100644
--- a/sklearn/decomposition/_pca.py
+++ b/sklearn/decomposition/_pca.py
@@ -673,7 +673,6 @@ def _fit_truncated(self, X, n_components, svd_solver):
 
         self.explained_variance_ratio_ = self.explained_variance_ / total_var
         self.singular_values_ = xp.asarray(S, copy=True)  # Store the singular values.
-        # >>>>>>> main
         if self.n_components_ < min(n_features, n_samples):
             self.noise_variance_ = total_var - xp.sum(self.explained_variance_)
             self.noise_variance_ /= min(n_features, n_samples) - n_components
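Patch 05 above settles the total-variance bookkeeping: for sparse input the
per-feature variances come from `mean_variance_axis`, which returns population
(ddof=0) statistics, so they are rescaled by n / (n - 1) to match the ddof=1
convention of the dense path. A quick standalone check of that correction (a
sketch; the sizes and density are arbitrary):

    import numpy as np
    from scipy import sparse
    from sklearn.utils.sparsefuncs import mean_variance_axis

    rng = np.random.default_rng(0)
    X = sparse.random(100, 30, density=0.2, format="csr", random_state=rng)

    n_samples = X.shape[0]
    # mean_variance_axis returns ddof=0 (population) variances ...
    mean, var = mean_variance_axis(X, axis=0)
    # ... so rescale to match np.var(..., ddof=1) on the dense data.
    total_var = var.sum() * n_samples / (n_samples - 1)

    assert np.allclose(total_var, np.var(X.toarray(), axis=0, ddof=1).sum())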
From 74d7afeab2586098fa42ec725ba6678c0652d9bc Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Fri, 18 Aug 2023 16:27:33 +0200
Subject: [PATCH 07/46] Document test variables

---
 sklearn/decomposition/tests/test_pca.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index 205911f22e35b..a4bb688e2b25d 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -26,6 +26,9 @@
 iris = datasets.load_iris()
 PCA_SOLVERS = ["full", "arpack", "randomized", "auto"]
 
+# `SPARSE_M` and `SPARSE_N` could be larger, but be aware:
+# * SciPy's generation of random sparse matrix can be costly
+# * A (SPARSE_M, SPARSE_N) dense array is allocated to compare against
 SPARSE_M, SPARSE_N = 1000, 300  # arbitrary
 SPARSE_MAX_COMPONENTS = min(SPARSE_M, SPARSE_N)

From 602e3bfb75305f1f66d3f29a8f88620ba0fbe792 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Fri, 18 Aug 2023 16:36:23 +0200
Subject: [PATCH 08/46] Address comments on scaling and tolerance for the test

---
 sklearn/decomposition/tests/test_pca.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index a4bb688e2b25d..ebe758d91726d 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -87,18 +87,19 @@
 @pytest.mark.parametrize("n_components", [1, 2, 3, 10, SPARSE_MAX_COMPONENTS])
 @pytest.mark.parametrize("format", ["csr", "csc"])
 @pytest.mark.parametrize("svd_solver", ["arpack"])
-@pytest.mark.parametrize(
-    "rtol", [1e-07]
-)  # , 1e-06, 1e-05, 1e-04, 1e-03, 1e-02, 1e-01, 1e-00])
+@pytest.mark.parametrize("scale", [1, 10, 100])
 def test_pca_sparse(
-    global_random_seed, rtol, svd_solver, format, n_components, density
+    global_random_seed, svd_solver, format, n_components, density, scale
 ):
+    RTOL = 1e-08
+
     if svd_solver in ["lobpcg", "arpack"] and n_components == SPARSE_MAX_COMPONENTS:
         pytest.skip("lobpcg and arpack don't support full solves")
     random_state = np.random.RandomState(global_random_seed)
     X = sp.sparse.random(
         SPARSE_M, SPARSE_N, format=format, random_state=random_state, density=density
     )
+    np.multiply(X.data, scale, out=X.data)
     pca = PCA(n_components=n_components, svd_solver=svd_solver)
     pca.fit(X)
 
@@ -106,8 +107,8 @@ def test_pca_sparse(
     pcad = PCA(n_components=n_components, svd_solver=svd_solver)
     pcad.fit(Xd)
 
-    assert_allclose(pca.components_, pcad.components_, rtol=rtol)
-    assert_allclose(pca.singular_values_, pcad.singular_values_, rtol=rtol)
+    assert_allclose(pca.components_, pcad.components_, rtol=RTOL)
+    assert_allclose(pca.singular_values_, pcad.singular_values_, rtol=RTOL)

From 0b0d4cd4aef1b9970ede7c361603f66691dc4597 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Fri, 18 Aug 2023 16:47:47 +0200
Subject: [PATCH 09/46] Document _implicitly_center

---
 sklearn/decomposition/_base.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/sklearn/decomposition/_base.py b/sklearn/decomposition/_base.py
index ee91d5a5cf584..dbbc9a669fd7b 100644
--- a/sklearn/decomposition/_base.py
+++ b/sklearn/decomposition/_base.py
@@ -11,7 +11,7 @@
 from abc import ABCMeta, abstractmethod
 
 import numpy as np
-from scipy import linalg
+from scipy import linalg, sparse
 from scipy.sparse import issparse
 from scipy.sparse.linalg import LinearOperator
 
@@ -20,7 +20,14 @@
 from ..utils.validation import check_is_fitted
 
 
-def _implicitly_center(X, mu):
+def _implicitly_center(
+    X: "sparse.spmatrix | sparse.sparray", mu: np.ndarray
+) -> LinearOperator:
+    """Create an implicitly centered linear operator.
+
+    Allows us to perform a PCA on sparse data without ever densifying the whole data
+    matrix.
+    """
     mu = mu[None, :]
     XT = X.T.conj(copy=False)
     return LinearOperator(

From f51f6c5e8ef9a3745d9cd2018a8f341ac4ae3050 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Mon, 21 Aug 2023 21:35:46 +0200
Subject: [PATCH 10/46] Add tests for erroring on unsupported solver

---
 sklearn/decomposition/tests/test_pca.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index ebe758d91726d..473b25c9b3398 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -111,6 +111,19 @@ def test_pca_sparse(
     assert_allclose(pca.singular_values_, pcad.singular_values_, rtol=RTOL)
 
 
+@pytest.mark.parametrize("svd_solver", ["randomized", "logpcg", "full", "auto"])
+def test_sparse_pca_solver_error(global_random_seed, svd_solver):
+    random_state = np.random.RandomState(global_random_seed)
+    X = sp.sparse.random(
+        SPARSE_M,
+        SPARSE_N,
+        random_state=random_state,
+    )
+    pca = PCA(n_components=30, svd_solver=svd_solver)
+    with pytest.raises(TypeError):
+        pca.fit(X)
+
+
 def test_no_empty_slice_warning():
     # test if we avoid numpy warnings for computing over empty arrays
     n_components = 10

From f73119512a40aef8ecb6e352ffc20a1b08860356 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Mon, 21 Aug 2023 23:09:01 +0200
Subject: [PATCH 11/46] Fix rmatmat

---
 sklearn/decomposition/_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/decomposition/_base.py b/sklearn/decomposition/_base.py
index dbbc9a669fd7b..55390cfa6d1fb 100644
--- a/sklearn/decomposition/_base.py
+++ b/sklearn/decomposition/_base.py
@@ -34,7 +34,7 @@ def _implicitly_center(
         matvec=lambda x: X @ x - mu @ x,
         matmat=lambda x: X @ x - mu @ x,
         rmatvec=lambda x: XT @ x - (mu * x.sum()),
-        rmatmat=lambda x: XT @ x - (mu * x.sum()),
+        rmatmat=lambda x: XT @ x - mu.T @ x.sum(axis=0)[None, :],
        dtype=X.dtype,
         shape=X.shape,
     )
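Patch 11 fixes a real bug: for a matrix right-hand side Y, the adjoint of the
centering operator must subtract one sum per column of Y, since
(X - 1 mu)^T Y = X^T Y - mu^T (1^T Y), whereas the old rmatmat reused the
rmatvec expression and collapsed Y into a single global sum. A small dense
check of the identity (a sketch with illustrative names, not the sklearn code):

    import numpy as np

    rng = np.random.default_rng(0)
    X = rng.standard_normal((40, 8))
    mu = X.mean(axis=0)
    Y = rng.standard_normal((40, 3))

    reference = (X - mu).T @ Y  # explicit centering

    # Pre-patch-11 expression: one global sum over Y, only valid for 1-D input.
    buggy = X.T @ Y - mu[:, None] * Y.sum()
    # Patched expression: one sum per column of Y.
    fixed = X.T @ Y - mu[:, None] @ Y.sum(axis=0)[None, :]

    assert np.allclose(fixed, reference)
    assert not np.allclose(buggy, reference)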
From 80d95419339a04f2b0c9d1432f2d4b452cb50e4e Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Tue, 22 Aug 2023 10:46:21 +0200
Subject: [PATCH 12/46] Make tolerance 1e-7

---
 sklearn/decomposition/tests/test_pca.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index 473b25c9b3398..a57d05b8296a0 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -91,7 +91,7 @@ def test_pca_sparse(
 def test_pca_sparse(
     global_random_seed, svd_solver, format, n_components, density, scale
 ):
-    RTOL = 1e-08
+    RTOL = 1e-07  # 1e-08 can result in occasional failures
 
     if svd_solver in ["lobpcg", "arpack"] and n_components == SPARSE_MAX_COMPONENTS:
         pytest.skip("lobpcg and arpack don't support full solves")

From 1b69b489ac9440b82d63e63c5180ccd1fa446aeb Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Fri, 25 Aug 2023 20:42:57 +0200
Subject: [PATCH 13/46] release note

---
 doc/whats_new/v1.4.rst | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst
index 73b87d260acba..a47ae88d324c9 100644
--- a/doc/whats_new/v1.4.rst
+++ b/doc/whats_new/v1.4.rst
@@ -103,6 +103,11 @@ Changelog
   :pr:`26315` by :user:`Mateusz Sokół ` and :user:`Olivier Grisel `.
 
+- |Enhancement| :class:`decomposition.PCA` now supports computing directly on scipy sparse
+  matrices when using the `arpack` solver. When used on :func:`datasets.fetch_20newsgroups_vectorized`
+  this can lead to speed ups of 100x (single threaded) and 70x less memory.
+  :pr:`18689` by :user:`Isaac Virshup ` and :user:`Andrey Portnoy `.
+
 :mod:`sklearn.ensemble`
 .......................

From 9143651215bc540b7ae865351aed47412c3bb3c4 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Fri, 25 Aug 2023 21:48:47 +0200
Subject: [PATCH 14/46] Test implicit center

---
 sklearn/decomposition/tests/test_pca.py | 58 +++++++++++++++----------
 1 file changed, 36 insertions(+), 22 deletions(-)

diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index a57d05b8296a0..3c344956736cc 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -5,7 +5,6 @@
 import pytest
 import scipy as sp
 from numpy.testing import assert_array_equal
-from scipy.sparse.linalg import LinearOperator
 
 from sklearn import config_context, datasets
 from sklearn.base import clone
@@ -55,32 +54,47 @@ def test_pca(svd_solver, n_components):
     assert_allclose(np.dot(cov, precision), np.eye(X.shape[1]), atol=1e-12)
 
 
-def linear_operator_from_matrix(A):
-    return LinearOperator(
-        shape=A.shape,
-        dtype=A.dtype,
-        matvec=lambda x: A @ x,
-        rmatvec=lambda x: A.T @ x,
-        matmat=lambda X: A @ X,
-        rmatmat=lambda X: A.T @ X,
+@pytest.fixture(scope="module", params=[sp.sparse.csr_matrix, sp.sparse.csc_matrix])
+def centered_matrices(request) -> tuple[sp.sparse.linalg.LinearOperator, np.ndarray]:
+    matrix_class = request.param
+    from sklearn.decomposition._base import _implicitly_center
+
+    random_state = np.random.default_rng(42)
+
+    X_sparse = matrix_class(
+        sp.sparse.random(500, 100, density=0.1, format="csr", random_state=random_state)
     )
+    X_dense = X_sparse.toarray()
+    mu = np.asarray(X_sparse.mean(axis=0)).ravel()
+
+    X_sparse_centered = _implicitly_center(X_sparse, mu)
+    X_dense_centered = X_dense - mu
+
+    return X_sparse_centered, X_dense_centered
+
+
+def test_implicit_center_matmat(centered_matrices):
+    X_sparse_centered, X_dense_centered = centered_matrices
+    Y = np.random.randn(X_dense_centered.shape[1], 50)
+    assert_allclose(X_dense_centered @ Y, X_sparse_centered @ Y)
 
 
-def test_linear_operator_matmul():
-    A = np.array([[1, 2, 3], [4, 5, 6]])
-    B = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
-    B = linear_operator_from_matrix(B)
-    expected_error = "Input operand 1 does not have enough dimensions"
-    with pytest.raises(ValueError, match=expected_error):
-        A @ B
+def test_implicit_center_matvec(centered_matrices):
+    X_sparse_centered, X_dense_centered = centered_matrices
+    y = np.random.randn(X_dense_centered.shape[1])
+    assert_allclose(X_dense_centered @ y, X_sparse_centered @ y)
 
 
-def test_linear_operator_reversed_matmul():
-    A = np.array([[1, 2, 3], [4, 5, 6]])
-    B = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
-    B = linear_operator_from_matrix(B)
-    result = (B.T @ A.T).T
-    assert np.allclose(result, [[38, 44, 50, 56], [83, 98, 113, 128]])
+def test_implicit_center_rmatmat(centered_matrices):
+    X_sparse_centered, X_dense_centered = centered_matrices
+    Y = np.random.randn(X_dense_centered.shape[0], 50)
+    assert_allclose(X_dense_centered.T @ Y, X_sparse_centered.rmatmat(Y))
+
+
+def test_implit_center_rmatvec(centered_matrices):
+    X_sparse_centered, X_dense_centered = centered_matrices
+    y = np.random.randn(X_dense_centered.shape[0])
+    assert_allclose(X_dense_centered.T @ y, X_sparse_centered.rmatvec(y))
 
 
 @pytest.mark.parametrize("density", [0.01, 0.05, 0.10, 0.30])

From bcfe31681f82a1a6e84341e0ba18c9c0d57c3ac7 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Fri, 25 Aug 2023 22:03:55 +0200
Subject: [PATCH 15/46] Array tests

---
 sklearn/decomposition/tests/test_pca.py | 31 +++++++++++++++++--------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index 13f26913605c5..109c7aa6cca5c 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -100,19 +100,26 @@ def test_implit_center_rmatvec(centered_matrices):
 
 @pytest.mark.parametrize("density", [0.01, 0.05, 0.10, 0.30])
 @pytest.mark.parametrize("n_components", [1, 2, 3, 10, SPARSE_MAX_COMPONENTS])
-@pytest.mark.parametrize("format", ["csr", "csc"])
+@pytest.mark.parametrize("matrix_class", CSR_CONTAINERS + CSC_CONTAINERS)
 @pytest.mark.parametrize("svd_solver", ["arpack"])
 @pytest.mark.parametrize("scale", [1, 10, 100])
 def test_pca_sparse(
-    global_random_seed, svd_solver, format, n_components, density, scale
+    global_random_seed, svd_solver, matrix_class, n_components, density, scale
 ):
     RTOL = 1e-07  # 1e-08 can result in occasional failures
 
     if svd_solver in ["lobpcg", "arpack"] and n_components == SPARSE_MAX_COMPONENTS:
         pytest.skip("lobpcg and arpack don't support full solves")
-    random_state = np.random.RandomState(global_random_seed)
-    X = sp.sparse.random(
-        SPARSE_M, SPARSE_N, format=format, random_state=random_state, density=density
+
+    random_state = np.random.default_rng(global_random_seed)
+    X = matrix_class(
+        sp.sparse.random(
+            SPARSE_M,
+            SPARSE_N,
+            format=matrix_class.format,
+            random_state=random_state,
+            density=density,
+        )
     )
     np.multiply(X.data, scale, out=X.data)
     pca = PCA(n_components=n_components, svd_solver=svd_solver)
@@ -127,12 +134,16 @@ def test_pca_sparse(
 
 
 @pytest.mark.parametrize("svd_solver", ["randomized", "logpcg", "full", "auto"])
-def test_sparse_pca_solver_error(global_random_seed, svd_solver):
+@pytest.mark.parametrize("matrix_class", CSR_CONTAINERS + CSC_CONTAINERS)
+def test_sparse_pca_solver_error(global_random_seed, svd_solver, matrix_class):
     random_state = np.random.RandomState(global_random_seed)
-    X = sp.sparse.random(
-        SPARSE_M,
-        SPARSE_N,
-        random_state=random_state,
+    X = matrix_class(
+        sp.sparse.random(
+            SPARSE_M,
+            SPARSE_N,
+            format=matrix_class.format,
+            random_state=random_state,
+        )
     )
     pca = PCA(n_components=30, svd_solver=svd_solver)
     with pytest.raises(TypeError):
From 5c9627bd75f7a807c5101345a75ccc2f737cef01 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Fri, 25 Aug 2023 22:11:00 +0200
Subject: [PATCH 16/46] Rename _implicit_column_offset

---
 sklearn/decomposition/_base.py          | 4 ++--
 sklearn/decomposition/_pca.py           | 4 ++--
 sklearn/decomposition/tests/test_pca.py | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/sklearn/decomposition/_base.py b/sklearn/decomposition/_base.py
index d25cab1e0431e..abd35b2708acd 100644
--- a/sklearn/decomposition/_base.py
+++ b/sklearn/decomposition/_base.py
@@ -20,7 +20,7 @@
 from ..utils.validation import check_is_fitted
 
 
-def _implicitly_center(
+def _implicit_column_offset(
     X: "sparse.spmatrix | sparse.sparray", mu: np.ndarray
 ) -> LinearOperator:
     """Create an implicitly centered linear operator.
@@ -167,7 +167,7 @@ def transform(self, X):
         )
         if self.mean_ is not None:
             if issparse(X):
-                X = _implicitly_center(X, self.mean_)
+                X = _implicit_column_offset(X, self.mean_)
             else:
                 X = X - self.mean_
         X_transformed = X @ self.components_.T
diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
index 4d8b7980b26b4..34210a8bc2403 100644
--- a/sklearn/decomposition/_pca.py
+++ b/sklearn/decomposition/_pca.py
@@ -28,7 +28,7 @@
 from ..utils.extmath import fast_logdet, randomized_svd, stable_cumsum, svd_flip
 from ..utils.sparsefuncs import mean_variance_axis
 from ..utils.validation import check_is_fitted
-from ._base import _BasePCA, _implicitly_center
+from ._base import _BasePCA, _implicit_column_offset
 
 
 def _assess_dimension(spectrum, rank, n_samples):
@@ -630,7 +630,7 @@ def _fit_truncated(self, X, n_components, svd_solver):
         if issparse(X):
             self.mean_, var = mean_variance_axis(X, axis=0)
             total_var = var.sum() * n_samples / (n_samples - 1)  # ddof=1
-            X = _implicitly_center(X, self.mean_)
+            X = _implicit_column_offset(X, self.mean_)
         else:
             self.mean_ = xp.mean(X, axis=0)
             X -= self.mean_
diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index 109c7aa6cca5c..0c5bede4c3513 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -58,7 +58,7 @@ def test_pca(svd_solver, n_components):
 @pytest.fixture(scope="module", params=CSR_CONTAINERS + CSC_CONTAINERS)
 def centered_matrices(request) -> tuple[sp.sparse.linalg.LinearOperator, np.ndarray]:
     matrix_class = request.param
-    from sklearn.decomposition._base import _implicitly_center
+    from sklearn.decomposition._base import _implicit_column_offset
 
     random_state = np.random.default_rng(42)
 
@@ -68,7 +68,7 @@ def centered_matrices(request) -> tuple[sp.sparse.linalg.LinearOperator, np.ndar
     X_dense = X_sparse.toarray()
     mu = np.asarray(X_sparse.mean(axis=0)).ravel()
 
-    X_sparse_centered = _implicitly_center(X_sparse, mu)
+    X_sparse_centered = _implicit_column_offset(X_sparse, mu)
     X_dense_centered = X_dense - mu
 
     return X_sparse_centered, X_dense_centered

From 858f437a0bb20aeacec8cbb5b72d4a6ac67e1e87 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Fri, 25 Aug 2023 23:08:41 +0200
Subject: [PATCH 17/46] scipy 1.11 compat, https://github.com/scipy/scipy/issues/19134

---
 sklearn/decomposition/tests/test_pca.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index 0c5bede4c3513..f79b5f9d6026e 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import re
 import warnings
 
@@ -116,12 +118,14 @@ def test_pca_sparse(
         sp.sparse.random(
             SPARSE_M,
            SPARSE_N,
-            format=matrix_class.format,
             random_state=random_state,
             density=density,
         )
     )
-    np.multiply(X.data, scale, out=X.data)
+    # Scale the data + vary the column means
+    scale_vector = random_state.random(X.shape[1]) * scale
+    X = X.multiply(scale_vector)
+
     pca = PCA(n_components=n_components, svd_solver=svd_solver)
     pca.fit(X)
 
@@ -141,7 +145,6 @@ def test_sparse_pca_solver_error(global_random_seed, svd_solver, matrix_class):
         sp.sparse.random(
             SPARSE_M,
             SPARSE_N,
-            format=matrix_class.format,
             random_state=random_state,
         )
     )

From 0db8d543fb369c13a174616424c342955acc5d45 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Sat, 26 Aug 2023 00:30:15 +0200
Subject: [PATCH 18/46] Finishing touches

* Move _implicit_column_offset to sparsefuncs
* Fix up some docs
* Cut down on test time by removing intermediate parameterizations
---
 sklearn/decomposition/_base.py          | 24 ++-----------
 sklearn/decomposition/_pca.py           |  4 +--
 sklearn/decomposition/tests/test_pca.py | 47 ++-----------------------
 sklearn/utils/sparsefuncs.py            | 21 +++++++++++
 sklearn/utils/tests/test_sparsefuncs.py | 46 ++++++++++++++++++++++++
 5 files changed, 73 insertions(+), 69 deletions(-)

diff --git a/sklearn/decomposition/_base.py b/sklearn/decomposition/_base.py
index abd35b2708acd..0c4982b99a42f 100644
--- a/sklearn/decomposition/_base.py
+++ b/sklearn/decomposition/_base.py
@@ -11,35 +11,15 @@
 from abc import ABCMeta, abstractmethod
 
 import numpy as np
-from scipy import linalg, sparse
+from scipy import linalg
 from scipy.sparse import issparse
-from scipy.sparse.linalg import LinearOperator
 
 from ..base import BaseEstimator, ClassNamePrefixFeaturesOutMixin, TransformerMixin
 from ..utils._array_api import _add_to_diagonal, device, get_namespace
+from ..utils.sparsefuncs import _implicit_column_offset
 from ..utils.validation import check_is_fitted
 
 
-def _implicit_column_offset(
-    X: "sparse.spmatrix | sparse.sparray", mu: np.ndarray
-) -> LinearOperator:
-    """Create an implicitly centered linear operator.
-
-    Allows us to perform a PCA on sparse data without ever densifying the whole data
-    matrix.
-    """
-    mu = mu[None, :]
-    XT = X.T.conj(copy=False)
-    return LinearOperator(
-        matvec=lambda x: X @ x - mu @ x,
-        matmat=lambda x: X @ x - mu @ x,
-        rmatvec=lambda x: XT @ x - (mu * x.sum()),
-        rmatmat=lambda x: XT @ x - mu.T @ x.sum(axis=0)[None, :],
-        dtype=X.dtype,
-        shape=X.shape,
-    )
-
-
 class _BasePCA(
     ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator, metaclass=ABCMeta
 ):
diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
index 34210a8bc2403..92e9932202b52 100644
--- a/sklearn/decomposition/_pca.py
+++ b/sklearn/decomposition/_pca.py
@@ -26,9 +26,9 @@
 from ..utils._param_validation import Interval, RealNotInt, StrOptions
 from ..utils.deprecation import deprecated
 from ..utils.extmath import fast_logdet, randomized_svd, stable_cumsum, svd_flip
-from ..utils.sparsefuncs import mean_variance_axis
+from ..utils.sparsefuncs import _implicit_column_offset, mean_variance_axis
 from ..utils.validation import check_is_fitted
-from ._base import _BasePCA, _implicit_column_offset
+from ._base import _BasePCA
 
 
 def _assess_dimension(spectrum, rank, n_samples):
diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index f79b5f9d6026e..13fb55349044c 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -57,51 +57,8 @@ def test_pca(svd_solver, n_components):
     assert_allclose(np.dot(cov, precision), np.eye(X.shape[1]), atol=1e-12)
 
 
-@pytest.fixture(scope="module", params=CSR_CONTAINERS + CSC_CONTAINERS)
-def centered_matrices(request) -> tuple[sp.sparse.linalg.LinearOperator, np.ndarray]:
-    matrix_class = request.param
-    from sklearn.decomposition._base import _implicit_column_offset
-
-    random_state = np.random.default_rng(42)
-
-    X_sparse = matrix_class(
-        sp.sparse.random(500, 100, density=0.1, format="csr", random_state=random_state)
-    )
-    X_dense = X_sparse.toarray()
-    mu = np.asarray(X_sparse.mean(axis=0)).ravel()
-
-    X_sparse_centered = _implicit_column_offset(X_sparse, mu)
-    X_dense_centered = X_dense - mu
-
-    return X_sparse_centered, X_dense_centered
-
-
-def test_implicit_center_matmat(centered_matrices):
-    X_sparse_centered, X_dense_centered = centered_matrices
-    Y = np.random.randn(X_dense_centered.shape[1], 50)
-    assert_allclose(X_dense_centered @ Y, X_sparse_centered @ Y)
-
-
-def test_implicit_center_matvec(centered_matrices):
-    X_sparse_centered, X_dense_centered = centered_matrices
-    y = np.random.randn(X_dense_centered.shape[1])
-    assert_allclose(X_dense_centered @ y, X_sparse_centered @ y)
-
-
-def test_implicit_center_rmatmat(centered_matrices):
-    X_sparse_centered, X_dense_centered = centered_matrices
-    Y = np.random.randn(X_dense_centered.shape[0], 50)
-    assert_allclose(X_dense_centered.T @ Y, X_sparse_centered.rmatmat(Y))
-
-
-def test_implit_center_rmatvec(centered_matrices):
-    X_sparse_centered, X_dense_centered = centered_matrices
-    y = np.random.randn(X_dense_centered.shape[0])
-    assert_allclose(X_dense_centered.T @ y, X_sparse_centered.rmatvec(y))
-
-
-@pytest.mark.parametrize("density", [0.01, 0.05, 0.10, 0.30])
-@pytest.mark.parametrize("n_components", [1, 2, 3, 10, SPARSE_MAX_COMPONENTS])
+@pytest.mark.parametrize("density", [0.01, 0.1, 0.30])
+@pytest.mark.parametrize("n_components", [1, 2, 10, SPARSE_MAX_COMPONENTS])
 @pytest.mark.parametrize("matrix_class", CSR_CONTAINERS + CSC_CONTAINERS)
 @pytest.mark.parametrize("svd_solver", ["arpack"])
 @pytest.mark.parametrize("scale", [1, 10, 100])
diff --git a/sklearn/utils/sparsefuncs.py b/sklearn/utils/sparsefuncs.py
index b2d0e725b8453..639222aba4e85 100644
--- a/sklearn/utils/sparsefuncs.py
+++ b/sklearn/utils/sparsefuncs.py
@@ -5,6 +5,7 @@
 # License: BSD 3 clause
 import numpy as np
 import scipy.sparse as sp
+from scipy.sparse.linalg import LinearOperator
 
 from ..utils.validation import _check_sample_weight
 from .sparsefuncs_fast import (
@@ -628,3 +629,23 @@ def csc_median_axis_0(X):
             median[f_ind] = _get_median(data, nz)
 
     return median
+
+
+def _implicit_column_offset(
+    X: "sp.spmatrix | sp.sparray", offset: np.ndarray
+) -> LinearOperator:
+    """Create an implicitly offset linear operator.
+
+    Allows us to perform a PCA on sparse data without ever densifying the whole data
+    matrix.
+    """
+    offset = offset[None, :]
+    XT = X.T.conj(copy=False)
+    return LinearOperator(
+        matvec=lambda x: X @ x - offset @ x,
+        matmat=lambda x: X @ x - offset @ x,
+        rmatvec=lambda x: XT @ x - (offset * x.sum()),
+        rmatmat=lambda x: XT @ x - offset.T @ x.sum(axis=0)[None, :],
+        dtype=X.dtype,
+        shape=X.shape,
+    )
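With the operator now living in `sklearn.utils.sparsefuncs`, it is worth seeing
how ARPACK actually consumes it: `scipy.sparse.linalg.svds` only ever calls the
operator's matvec/rmatvec callbacks, so the centered matrix is never formed. A
hedged sketch using only public SciPy APIs (the construction mirrors
`_implicit_column_offset`; sizes, k, and the start vector are illustrative):

    import numpy as np
    from scipy import sparse
    from scipy.sparse.linalg import LinearOperator, svds

    rng = np.random.default_rng(0)
    X = sparse.random(300, 100, density=0.05, format="csr", random_state=rng)
    mu = np.asarray(X.mean(axis=0)).ravel()

    # Same algebra as _implicit_column_offset above, inlined for the sketch.
    centered = LinearOperator(
        shape=X.shape,
        dtype=X.dtype,
        matvec=lambda v: X @ v - mu @ v,
        rmatvec=lambda y: X.T @ y - mu * y.sum(),
        matmat=lambda V: X @ V - mu[None, :] @ V,
        rmatmat=lambda Y: X.T @ Y - mu[:, None] @ Y.sum(axis=0)[None, :],
    )

    # Fixed ARPACK start vector for reproducibility, like sklearn's arpack path.
    v0 = rng.uniform(-1, 1, min(X.shape))
    U, S, Vt = svds(centered, k=10, v0=v0)

    # Top singular values agree with an explicit dense centering.
    S_dense = np.linalg.svd(X.toarray() - mu, compute_uv=False)[:10]
    assert np.allclose(np.sort(S)[::-1], S_dense)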
+ """ + offset = offset[None, :] + XT = X.T.conj(copy=False) + return LinearOperator( + matvec=lambda x: X @ x - offset @ x, + matmat=lambda x: X @ x - offset @ x, + rmatvec=lambda x: XT @ x - (offset * x.sum()), + rmatmat=lambda x: XT @ x - offset.T @ x.sum(axis=0)[None, :], + dtype=X.dtype, + shape=X.shape, + ) diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py index f3bcaf56bb561..bbfd0743497fb 100644 --- a/sklearn/utils/tests/test_sparsefuncs.py +++ b/sklearn/utils/tests/test_sparsefuncs.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import numpy as np import pytest import scipy.sparse as sp @@ -7,6 +9,7 @@ from sklearn.datasets import make_classification from sklearn.utils._testing import assert_allclose +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS from sklearn.utils.sparsefuncs import ( count_nonzero, csc_median_axis_0, @@ -914,3 +917,46 @@ def test_csr_row_norms(dtype): assert norms.dtype == dtype rtol = 1e-6 if dtype == np.float32 else 1e-7 assert_allclose(norms, scipy_norms, rtol=rtol) + + +@pytest.fixture(scope="module", params=CSR_CONTAINERS + CSC_CONTAINERS) +def centered_matrices(request) -> tuple[sp.linalg.LinearOperator, np.ndarray]: + matrix_class = request.param + from sklearn.utils.sparsefuncs import _implicit_column_offset + + random_state = np.random.default_rng(42) + + X_sparse = matrix_class( + sp.random(500, 100, density=0.1, format="csr", random_state=random_state) + ) + X_dense = X_sparse.toarray() + mu = np.asarray(X_sparse.mean(axis=0)).ravel() + + X_sparse_centered = _implicit_column_offset(X_sparse, mu) + X_dense_centered = X_dense - mu + + return X_sparse_centered, X_dense_centered + + +def test_implicit_center_matmat(centered_matrices): + X_sparse_centered, X_dense_centered = centered_matrices + Y = np.random.randn(X_dense_centered.shape[1], 50) + assert_allclose(X_dense_centered @ Y, X_sparse_centered @ Y) + + +def test_implicit_center_matvec(centered_matrices): + X_sparse_centered, X_dense_centered = centered_matrices + y = np.random.randn(X_dense_centered.shape[1]) + assert_allclose(X_dense_centered @ y, X_sparse_centered @ y) + + +def test_implicit_center_rmatmat(centered_matrices): + X_sparse_centered, X_dense_centered = centered_matrices + Y = np.random.randn(X_dense_centered.shape[0], 50) + assert_allclose(X_dense_centered.T @ Y, X_sparse_centered.rmatmat(Y)) + + +def test_implit_center_rmatvec(centered_matrices): + X_sparse_centered, X_dense_centered = centered_matrices + y = np.random.randn(X_dense_centered.shape[0]) + assert_allclose(X_dense_centered.T @ y, X_sparse_centered.rmatvec(y)) From a664999b3bac5d78b3f9eabf42a1f0b6766bfec9 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Sat, 26 Aug 2023 01:26:52 +0200 Subject: [PATCH 19/46] Update release note --- doc/whats_new/v1.4.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index add49a97151d6..ad77d83e0799f 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -129,9 +129,11 @@ Changelog subclasses. :pr:`27100` by :user:`Isaac Virshup `. -- |Enhancement| :class:`decomposition.PCA` now supports computing directly on scipy sparse - matrices when using the `arpack` solver. When used on :func:`datasets.fetch_20newsgroups_vectorized` - this can lead to speed ups of 100x (single threaded) and 70x less memory. 
+- |Feature| :class:`decomposition.PCA` now supports :class:`scipy.sparse.sparray` + and :class:`scipy.sparse.spmatrix` inputs when using the `arpack` solver. + When used on sparse data like :func:`datasets.fetch_20newsgroups_vectorized` this + can lead to speed ups of 100x (single threaded) and 70x lower memory usage. + Based on :user:`Alexander Tarashansky `'s implementation in `scanpy `. :pr:`18689` by :user:`Isaac Virshup ` and :user:`Andrey Portnoy `. :mod:`sklearn.ensemble` From 532904199824a5eb43155f2d0ab59779a35ca52c Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Sun, 1 Oct 2023 16:26:48 +0200 Subject: [PATCH 20/46] use toarray --- sklearn/decomposition/tests/test_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index c939dca130733..7da8335a1d674 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -86,7 +86,7 @@ def test_pca_sparse( pca = PCA(n_components=n_components, svd_solver=svd_solver) pca.fit(X) - Xd = np.asarray(X.todense()) + Xd = X.toarray() pcad = PCA(n_components=n_components, svd_solver=svd_solver) pcad.fit(Xd) From 67a40ddd986505700011d42462922ef2b0e21aca Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Sun, 1 Oct 2023 16:30:00 +0200 Subject: [PATCH 21/46] Expanded equality testing (apply comment from Julien) --- sklearn/decomposition/tests/test_pca.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 7da8335a1d674..fd4253723bd18 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -90,8 +90,16 @@ def test_pca_sparse( pcad = PCA(n_components=n_components, svd_solver=svd_solver) pcad.fit(Xd) + # Fitted attributes equality assert_allclose(pca.components_, pcad.components_, rtol=RTOL) + assert_allclose(pca.explained_variance_, pcad.explained_variance_, rtol=RTOL) assert_allclose(pca.singular_values_, pcad.singular_values_, rtol=RTOL) + assert_allclose(pca.mean_, pcad.mean_, rtol=RTOL) + assert_allclose(pca.n_components_, pcad.n_components_, rtol=RTOL) + assert_allclose(pca.n_features_, pcad.n_features_, rtol=RTOL) + assert_allclose(pca.n_samples_, pcad.n_samples_, rtol=RTOL) + assert_allclose(pca.noise_variance_, pcad.noise_variance_, rtol=RTOL) + assert_allclose(pca.n_features_in_, pcad.n_features_in_, rtol=RTOL) @pytest.mark.parametrize("svd_solver", ["randomized", "logpcg", "full", "auto"]) From 236478e728d55ed4ddac000f9c2af20d303cfb2f Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Sun, 1 Oct 2023 17:23:27 +0200 Subject: [PATCH 22/46] Make sure fit, fit_transform, transform are equivalent for pca on sparse data --- sklearn/decomposition/tests/test_pca.py | 70 +++++++++++++++++++++---- 1 file changed, 61 insertions(+), 9 deletions(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index fd4253723bd18..230e53df0a79b 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -35,6 +35,18 @@ SPARSE_MAX_COMPONENTS = min(SPARSE_M, SPARSE_N) +def _check_fitted_pca_close(pca1: PCA, pca2: PCA, rtol: float): + assert_allclose(pca1.components_, pca2.components_, rtol=rtol) + assert_allclose(pca1.explained_variance_, pca2.explained_variance_, rtol=rtol) + assert_allclose(pca1.singular_values_, pca2.singular_values_, rtol=rtol) + assert_allclose(pca1.mean_, pca2.mean_, rtol=rtol) + 
assert_allclose(pca1.n_components_, pca2.n_components_, rtol=rtol) + assert_allclose(pca1.n_features_, pca2.n_features_, rtol=rtol) + assert_allclose(pca1.n_samples_, pca2.n_samples_, rtol=rtol) + assert_allclose(pca1.noise_variance_, pca2.noise_variance_, rtol=rtol) + assert_allclose(pca1.n_features_in_, pca2.n_features_in_, rtol=rtol) + + @pytest.mark.parametrize("svd_solver", PCA_SOLVERS) @pytest.mark.parametrize("n_components", range(1, iris.data.shape[1])) def test_pca(svd_solver, n_components): @@ -91,15 +103,55 @@ def test_pca_sparse( pcad.fit(Xd) # Fitted attributes equality - assert_allclose(pca.components_, pcad.components_, rtol=RTOL) - assert_allclose(pca.explained_variance_, pcad.explained_variance_, rtol=RTOL) - assert_allclose(pca.singular_values_, pcad.singular_values_, rtol=RTOL) - assert_allclose(pca.mean_, pcad.mean_, rtol=RTOL) - assert_allclose(pca.n_components_, pcad.n_components_, rtol=RTOL) - assert_allclose(pca.n_features_, pcad.n_features_, rtol=RTOL) - assert_allclose(pca.n_samples_, pcad.n_samples_, rtol=RTOL) - assert_allclose(pca.noise_variance_, pcad.noise_variance_, rtol=RTOL) - assert_allclose(pca.n_features_in_, pcad.n_features_in_, rtol=RTOL) + _check_fitted_pca_close(pca, pcad, rtol=RTOL) + + # Test transform + X2 = matrix_class( + sp.sparse.random( + SPARSE_M, + SPARSE_N, + random_state=random_state, + density=density, + ) + ) + X2d = X2.toarray() + + assert_allclose(pca.transform(X2), pca.transform(X2d), rtol=RTOL) + assert_allclose(pca.transform(X2), pcad.transform(X2d), rtol=RTOL) + + +@pytest.mark.parametrize("matrix_class", CSR_CONTAINERS + CSC_CONTAINERS) +def test_pca_sparse_fit_transform(global_random_seed, matrix_class): + random_state = np.random.default_rng(global_random_seed) + X = matrix_class( + sp.sparse.random( + SPARSE_M, + SPARSE_N, + random_state=random_state, + density=0.01, + ) + ) + X2 = matrix_class( + sp.sparse.random( + SPARSE_M, + SPARSE_N, + random_state=random_state, + density=0.01, + ) + ) + + pca_fit = PCA(n_components=10, svd_solver="arpack", random_state=global_random_seed) + pca_fit_transform = PCA( + n_components=10, svd_solver="arpack", random_state=global_random_seed + ) + + pca_fit.fit(X) + transformed_X = pca_fit_transform.fit_transform(X) + + _check_fitted_pca_close(pca_fit, pca_fit_transform, rtol=1e-10) + assert_allclose(transformed_X, pca_fit_transform.transform(X), rtol=1e-10) + assert_allclose(transformed_X, pca_fit.transform(X), rtol=1e-10) + assert_allclose(pca_fit.transform(X2), pca_fit_transform.transform(X2), rtol=1e-10) @pytest.mark.parametrize("svd_solver", ["randomized", "logpcg", "full", "auto"]) From 5a15364e29821717eae736295d00ec4e2ab8e127 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Sun, 1 Oct 2023 17:31:23 +0200 Subject: [PATCH 23/46] More informative error message for supported solvers for sparse PCA --- sklearn/decomposition/_pca.py | 5 +++-- sklearn/decomposition/tests/test_pca.py | 8 ++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 92e9932202b52..a5a2553c820c7 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -480,8 +480,9 @@ def _fit(self, X): # Raise an error for sparse input and unsupported svd_solver if issparse(X) and self.svd_solver != "arpack": raise TypeError( - 'PCA only support sparse inputs with the "arpack" solver. See ' - "TruncatedSVD for a possible alternative." 
+ 'PCA only support sparse inputs with the "arpack" solver, while ' + f'"{self.svd_solver}" was passed. See TruncatedSVD for a possible' + " alternative." ) # Raise an error for non-Numpy input and arpack solver. if self.svd_solver == "arpack" and is_array_api_compliant: diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 230e53df0a79b..4978bad210001 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -154,7 +154,7 @@ def test_pca_sparse_fit_transform(global_random_seed, matrix_class): assert_allclose(pca_fit.transform(X2), pca_fit_transform.transform(X2), rtol=1e-10) -@pytest.mark.parametrize("svd_solver", ["randomized", "logpcg", "full", "auto"]) +@pytest.mark.parametrize("svd_solver", ["randomized", "full", "auto"]) @pytest.mark.parametrize("matrix_class", CSR_CONTAINERS + CSC_CONTAINERS) def test_sparse_pca_solver_error(global_random_seed, svd_solver, matrix_class): random_state = np.random.RandomState(global_random_seed) @@ -166,7 +166,11 @@ def test_sparse_pca_solver_error(global_random_seed, svd_solver, matrix_class): ) ) pca = PCA(n_components=30, svd_solver=svd_solver) - with pytest.raises(TypeError): + error_msg_pattern = ( + f'PCA only support sparse inputs with the "arpack" solver, while "{svd_solver}"' + " was passed" + ) + with pytest.raises(TypeError, match=error_msg_pattern): pca.fit(X) From af5649258d8180367dc138ae1bea205769d296f5 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Sun, 1 Oct 2023 17:34:23 +0200 Subject: [PATCH 24/46] Remove skipped test case --- sklearn/decomposition/tests/test_pca.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 4978bad210001..aa4e91a08d076 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -70,7 +70,7 @@ def test_pca(svd_solver, n_components): @pytest.mark.parametrize("density", [0.01, 0.1, 0.30]) -@pytest.mark.parametrize("n_components", [1, 2, 10, SPARSE_MAX_COMPONENTS]) +@pytest.mark.parametrize("n_components", [1, 2, 10]) @pytest.mark.parametrize("matrix_class", CSR_CONTAINERS + CSC_CONTAINERS) @pytest.mark.parametrize("svd_solver", ["arpack"]) @pytest.mark.parametrize("scale", [1, 10, 100]) @@ -79,9 +79,6 @@ def test_pca_sparse( ): RTOL = 1e-07 # 1e0-8 can result in occasional failures - if svd_solver in ["lobpcg", "arpack"] and n_components == SPARSE_MAX_COMPONENTS: - pytest.skip("lobpcg and arpack don't support full solves") - random_state = np.random.default_rng(global_random_seed) X = matrix_class( sp.sparse.random( From ce5e8d4353ebfb7253a03e082ef7c99f041ea826 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Sun, 1 Oct 2023 17:46:14 +0200 Subject: [PATCH 25/46] Update usage of random data in implicit centering tests --- sklearn/utils/tests/test_sparsefuncs.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py index c94df49aeda29..ca11e9f9a67d6 100644 --- a/sklearn/utils/tests/test_sparsefuncs.py +++ b/sklearn/utils/tests/test_sparsefuncs.py @@ -966,25 +966,29 @@ def centered_matrices(request) -> tuple[sp.linalg.LinearOperator, np.ndarray]: return X_sparse_centered, X_dense_centered -def test_implicit_center_matmat(centered_matrices): +def test_implicit_center_matmat(global_random_seed, centered_matrices): X_sparse_centered, X_dense_centered = 
centered_matrices - Y = np.random.randn(X_dense_centered.shape[1], 50) + rng = np.random.default_rng(global_random_seed) + Y = rng.standard_normal((X_dense_centered.shape[1], 50)) assert_allclose(X_dense_centered @ Y, X_sparse_centered @ Y) -def test_implicit_center_matvec(centered_matrices): +def test_implicit_center_matvec(global_random_seed, centered_matrices): X_sparse_centered, X_dense_centered = centered_matrices - y = np.random.randn(X_dense_centered.shape[1]) + rng = np.random.default_rng(global_random_seed) + y = rng.standard_normal(X_dense_centered.shape[1]) assert_allclose(X_dense_centered @ y, X_sparse_centered @ y) -def test_implicit_center_rmatmat(centered_matrices): +def test_implicit_center_rmatmat(global_random_seed, centered_matrices): X_sparse_centered, X_dense_centered = centered_matrices - Y = np.random.randn(X_dense_centered.shape[0], 50) + rng = np.random.default_rng(global_random_seed) + Y = rng.standard_normal((X_dense_centered.shape[0], 50)) assert_allclose(X_dense_centered.T @ Y, X_sparse_centered.rmatmat(Y)) -def test_implit_center_rmatvec(centered_matrices): +def test_implit_center_rmatvec(global_random_seed, centered_matrices): X_sparse_centered, X_dense_centered = centered_matrices - y = np.random.randn(X_dense_centered.shape[0]) + rng = np.random.default_rng(global_random_seed) + y = rng.standard_normal(X_dense_centered.shape[0]) assert_allclose(X_dense_centered.T @ y, X_sparse_centered.rmatvec(y)) From 0ce6d502b34d9015a8b5a1c938cd7c9b9dabe487 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Sun, 1 Oct 2023 17:48:31 +0200 Subject: [PATCH 26/46] Test both matrix multiplication API and LinearOperator API in implicit centering tests --- sklearn/utils/tests/test_sparsefuncs.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py index ca11e9f9a67d6..a6b33a759336c 100644 --- a/sklearn/utils/tests/test_sparsefuncs.py +++ b/sklearn/utils/tests/test_sparsefuncs.py @@ -970,6 +970,7 @@ def test_implicit_center_matmat(global_random_seed, centered_matrices): X_sparse_centered, X_dense_centered = centered_matrices rng = np.random.default_rng(global_random_seed) Y = rng.standard_normal((X_dense_centered.shape[1], 50)) + assert_allclose(X_dense_centered @ Y, X_sparse_centered.matmat(Y)) assert_allclose(X_dense_centered @ Y, X_sparse_centered @ Y) @@ -977,6 +978,7 @@ def test_implicit_center_matvec(global_random_seed, centered_matrices): X_sparse_centered, X_dense_centered = centered_matrices rng = np.random.default_rng(global_random_seed) y = rng.standard_normal(X_dense_centered.shape[1]) + assert_allclose(X_dense_centered @ y, X_sparse_centered.matvec(y)) assert_allclose(X_dense_centered @ y, X_sparse_centered @ y) @@ -985,6 +987,7 @@ def test_implicit_center_rmatmat(global_random_seed, centered_matrices): rng = np.random.default_rng(global_random_seed) Y = rng.standard_normal((X_dense_centered.shape[0], 50)) assert_allclose(X_dense_centered.T @ Y, X_sparse_centered.rmatmat(Y)) + assert_allclose(X_dense_centered.T @ Y, X_sparse_centered.T @ Y) def test_implit_center_rmatvec(global_random_seed, centered_matrices): @@ -992,3 +995,4 @@ def test_implit_center_rmatvec(global_random_seed, centered_matrices): rng = np.random.default_rng(global_random_seed) y = rng.standard_normal(X_dense_centered.shape[0]) assert_allclose(X_dense_centered.T @ y, X_sparse_centered.rmatvec(y)) + assert_allclose(X_dense_centered.T @ y, X_sparse_centered.T @ y) From a2014406ce34cda2a504b6fe5a103f740f96f97d Mon Sep 
From: Isaac Virshup
Date: Sun, 1 Oct 2023 17:53:47 +0200
Subject: [PATCH 27/46] Update implicit_offset docs

Co-authored-by: Julien Jerphanion
---
 sklearn/utils/sparsefuncs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/utils/sparsefuncs.py b/sklearn/utils/sparsefuncs.py
index 639222aba4e85..dbbf3d0787bfd 100644
--- a/sklearn/utils/sparsefuncs.py
+++ b/sklearn/utils/sparsefuncs.py
@@ -636,7 +636,7 @@ def _implicit_column_offset(
 ) -> LinearOperator:
     """Create an implicitly offset linear operator.

-    Allows us to perform a PCA on sparse data without ever densifying the whole data
+    This is used by PCA on sparse data to avoid densifying the whole data
    matrix.
     """
     offset = offset[None, :]

From 05df49c0d33c14a2f09ec30519d3a9e9842b5320 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Sun, 1 Oct 2023 18:02:41 +0200
Subject: [PATCH 28/46] Remove type annotations for _implicit_column_offset,
 move to docs

---
 sklearn/utils/sparsefuncs.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/sklearn/utils/sparsefuncs.py b/sklearn/utils/sparsefuncs.py
index dbbf3d0787bfd..be71a30fb9db9 100644
--- a/sklearn/utils/sparsefuncs.py
+++ b/sklearn/utils/sparsefuncs.py
@@ -631,13 +631,20 @@ def csc_median_axis_0(X):
     return median


-def _implicit_column_offset(
-    X: "sp.spmatrix | sp.sparray", offset: np.ndarray
-) -> LinearOperator:
+def _implicit_column_offset(X, offset):
     """Create an implicitly offset linear operator.

     This is used by PCA on sparse data to avoid densifying the whole data
     matrix.
+
+    Parameters
+    ----------
+    X : sparse matrix of shape (n_samples, n_features)
+    offset : ndarray of shape (n_features,)
+
+    Returns
+    -------
+    centered : LinearOperator
     """
     offset = offset[None, :]
     XT = X.T.conj(copy=False)

From bfa4faa4d641efa2bccded0b79e982ca43bf438d Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Sun, 1 Oct 2023 18:05:15 +0200
Subject: [PATCH 29/46] Remove .conj

Co-authored-by: Olivier Grisel
---
 sklearn/utils/sparsefuncs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/utils/sparsefuncs.py b/sklearn/utils/sparsefuncs.py
index be71a30fb9db9..8fbe4539d2859 100644
--- a/sklearn/utils/sparsefuncs.py
+++ b/sklearn/utils/sparsefuncs.py
@@ -647,7 +647,7 @@ def _implicit_column_offset(X, offset):
     centered : LinearOperator
     """
     offset = offset[None, :]
-    XT = X.T.conj(copy=False)
+    XT = X.T
     return LinearOperator(
         matvec=lambda x: X @ x - offset @ x,
         matmat=lambda x: X @ x - offset @ x,

From 9cbd76bd5917b144407c78a33cb250e0fa6f4625 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Sun, 1 Oct 2023 18:15:11 +0200
Subject: [PATCH 30/46] matrix_class -> sparse_container

---
 sklearn/decomposition/tests/test_pca.py | 22 +++++++++++-----------
 sklearn/utils/tests/test_sparsefuncs.py |  4 ++--
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index aa4e91a08d076..02c56a9464a61 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -71,16 +71,16 @@ def test_pca(svd_solver, n_components):

 @pytest.mark.parametrize("density", [0.01, 0.1, 0.30])
 @pytest.mark.parametrize("n_components", [1, 2, 10])
-@pytest.mark.parametrize("matrix_class", CSR_CONTAINERS + CSC_CONTAINERS)
+@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
 @pytest.mark.parametrize("svd_solver", ["arpack"])
 @pytest.mark.parametrize("scale", [1, 10, 100])
 def test_pca_sparse(
-    global_random_seed, svd_solver, matrix_class, n_components, density, scale
+    global_random_seed, svd_solver, sparse_container, n_components, density, scale
 ):
     RTOL = 1e-07  # 1e-08 can result in occasional failures

     random_state = np.random.default_rng(global_random_seed)
-    X = matrix_class(
+    X = sparse_container(
         sp.sparse.random(
             SPARSE_M,
             SPARSE_N,
@@ -103,7 +103,7 @@ def test_pca_sparse(
     _check_fitted_pca_close(pca, pcad, rtol=RTOL)

     # Test transform
-    X2 = matrix_class(
+    X2 = sparse_container(
         sp.sparse.random(
             SPARSE_M,
             SPARSE_N,
@@ -117,10 +117,10 @@ def test_pca_sparse(
     assert_allclose(pca.transform(X2), pcad.transform(X2d), rtol=RTOL)


-@pytest.mark.parametrize("matrix_class", CSR_CONTAINERS + CSC_CONTAINERS)
-def test_pca_sparse_fit_transform(global_random_seed, matrix_class):
+@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
+def test_pca_sparse_fit_transform(global_random_seed, sparse_container):
     random_state = np.random.default_rng(global_random_seed)
-    X = matrix_class(
+    X = sparse_container(
         sp.sparse.random(
             SPARSE_M,
             SPARSE_N,
@@ -128,7 +128,7 @@ def test_pca_sparse_fit_transform(global_random_seed, sparse_container):
             density=0.01,
         )
     )
-    X2 = matrix_class(
+    X2 = sparse_container(
         sp.sparse.random(
             SPARSE_M,
             SPARSE_N,
@@ -152,10 +152,10 @@ def test_pca_sparse_fit_transform(global_random_seed, sparse_container):

 @pytest.mark.parametrize("svd_solver", ["randomized", "full", "auto"])
-@pytest.mark.parametrize("matrix_class", CSR_CONTAINERS + CSC_CONTAINERS)
-def test_sparse_pca_solver_error(global_random_seed, svd_solver, matrix_class):
+@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
+def test_sparse_pca_solver_error(global_random_seed, svd_solver, sparse_container):
     random_state = np.random.RandomState(global_random_seed)
-    X = matrix_class(
+    X = sparse_container(
         sp.sparse.random(
             SPARSE_M,
             SPARSE_N,
diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py
index a6b33a759336c..716be638d6b28 100644
--- a/sklearn/utils/tests/test_sparsefuncs.py
+++ b/sklearn/utils/tests/test_sparsefuncs.py
@@ -949,12 +949,12 @@ def test_csr_row_norms(dtype):

 @pytest.fixture(scope="module", params=CSR_CONTAINERS + CSC_CONTAINERS)
 def centered_matrices(request) -> tuple[sp.linalg.LinearOperator, np.ndarray]:
-    matrix_class = request.param
+    sparse_container = request.param
     from sklearn.utils.sparsefuncs import _implicit_column_offset

     random_state = np.random.default_rng(42)
-    X_sparse = matrix_class(
+    X_sparse = sparse_container(
         sp.random(500, 100, density=0.1, format="csr", random_state=random_state)
     )
     X_dense = X_sparse.toarray()

From 7c8bece4014d8619656f0908f2689136b15ab005 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Sun, 1 Oct 2023 18:24:57 +0200
Subject: [PATCH 31/46] Update PCA doc strings to indicate sparse arrays now
 accepted

---
 sklearn/decomposition/_pca.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
index a5a2553c820c7..0a067fd003994 100644
--- a/sklearn/decomposition/_pca.py
+++ b/sklearn/decomposition/_pca.py
@@ -423,7 +423,7 @@ def fit(self, X, y=None):

         Parameters
         ----------
-        X : array-like of shape (n_samples, n_features)
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
             Training data, where `n_samples` is the number of samples
             and `n_features` is the number of features.
@@ -444,7 +444,7 @@ def fit_transform(self, X, y=None):

         Parameters
         ----------
-        X : array-like of shape (n_samples, n_features)
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
             Training data, where `n_samples` is the number of samples
             and `n_features` is the number of features.

From 0ed4d69ff46f76ef21df04f3439a79dd8c0935df Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Sun, 1 Oct 2023 19:00:07 +0200
Subject: [PATCH 32/46] Update docs + examples to not recommend against PCA
 for sparse data

---
 doc/modules/decomposition.rst               | 6 +-----
 doc/modules/manifold.rst                    | 2 +-
 examples/compose/plot_column_transformer.py | 4 ++--
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst
index b852a6133d542..c1e317d2ff7d3 100644
--- a/doc/modules/decomposition.rst
+++ b/doc/modules/decomposition.rst
@@ -74,7 +74,7 @@ out-of-core Principal Component Analysis either by:
  * Using its ``partial_fit`` method on chunks of data fetched sequentially
    from the local hard drive or a network database.

- * Calling its fit method on a sparse matrix or a memory mapped file using
+ * Calling its fit method on a memory mapped file using
    ``numpy.memmap``.

 :class:`IncrementalPCA` only stores estimates of component and noise variances,
@@ -420,10 +420,6 @@ in that the matrix :math:`X` does not need to be centered.
 When the columnwise (per-feature) means of :math:`X`
 are subtracted from the feature values,
 truncated SVD on the resulting matrix is equivalent to PCA.
-In practical terms, this means
-that the :class:`TruncatedSVD` transformer accepts ``scipy.sparse``
-matrices without the need to densify them,
-as densifying may fill up memory even for medium-sized document collections.

 While the :class:`TruncatedSVD` transformer works with any feature matrix,
diff --git a/doc/modules/manifold.rst b/doc/modules/manifold.rst
index 1656c09f1371d..7cc6776e37daa 100644
--- a/doc/modules/manifold.rst
+++ b/doc/modules/manifold.rst
@@ -644,7 +644,7 @@ Barnes-Hut method improves on the exact method where t-SNE complexity is
   or less. The 2D case is typical when building visualizations.
 * Barnes-Hut only works with dense input data. Sparse data matrices can only be
   embedded with the exact method or can be approximated by a dense low rank
-  projection for instance using :class:`~sklearn.decomposition.TruncatedSVD`
+  projection for instance using :class:`~sklearn.decomposition.PCA`
 * Barnes-Hut is an approximation of the exact method.
  The approximation is parameterized with the angle parameter, therefore the
   angle parameter is unused when method="exact"
diff --git a/examples/compose/plot_column_transformer.py b/examples/compose/plot_column_transformer.py
index 669e817cbf81d..1a988305ef9ab 100644
--- a/examples/compose/plot_column_transformer.py
+++ b/examples/compose/plot_column_transformer.py
@@ -26,7 +26,7 @@
 from sklearn.compose import ColumnTransformer
 from sklearn.datasets import fetch_20newsgroups
-from sklearn.decomposition import TruncatedSVD
+from sklearn.decomposition import PCA
 from sklearn.feature_extraction import DictVectorizer
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics import classification_report
@@ -141,7 +141,7 @@ def text_stats(posts):
             Pipeline(
                 [
                     ("tfidf", TfidfVectorizer()),
-                    ("best", TruncatedSVD(n_components=50)),
+                    ("best", PCA(n_components=50, solver="arpack")),
                 ]
             ),
             1,

From 7fe38e8bb318aea7cdcdcb82c3ad097d1ca3d010 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Sun, 1 Oct 2023 19:02:00 +0200
Subject: [PATCH 33/46] Add slightly higher tolerance for transforming

---
 sklearn/decomposition/tests/test_pca.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index 02c56a9464a61..b268955ef7143 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -78,6 +78,7 @@ def test_pca_sparse(
 ):
     RTOL = 1e-07  # 1e-08 can result in occasional failures
+    TRANSFORM_RTOL = 1e-06  # Slightly higher tolerance for transform

     random_state = np.random.default_rng(global_random_seed)
     X = sparse_container(
@@ -113,8 +114,8 @@ def test_pca_sparse(
     X2d = X2.toarray()

-    assert_allclose(pca.transform(X2), pca.transform(X2d), rtol=RTOL)
-    assert_allclose(pca.transform(X2), pcad.transform(X2d), rtol=RTOL)
+    assert_allclose(pca.transform(X2), pca.transform(X2d), rtol=TRANSFORM_RTOL)
+    assert_allclose(pca.transform(X2), pcad.transform(X2d), rtol=TRANSFORM_RTOL)

From 3ca40e2b0c750ca12cb4b50668e6c3a3465db7d8 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Sun, 8 Oct 2023 13:29:55 +0000
Subject: [PATCH 34/46] Remove deprecated attribute access

---
 sklearn/decomposition/tests/test_pca.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index b268955ef7143..27cbf8011f0ff 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -41,7 +41,6 @@ def _check_fitted_pca_close(pca1: PCA, pca2: PCA, rtol: float):
     assert_allclose(pca1.singular_values_, pca2.singular_values_, rtol=rtol)
     assert_allclose(pca1.mean_, pca2.mean_, rtol=rtol)
     assert_allclose(pca1.n_components_, pca2.n_components_, rtol=rtol)
-    assert_allclose(pca1.n_features_, pca2.n_features_, rtol=rtol)
     assert_allclose(pca1.n_samples_, pca2.n_samples_, rtol=rtol)
     assert_allclose(pca1.noise_variance_, pca2.noise_variance_, rtol=rtol)
     assert_allclose(pca1.n_features_in_, pca2.n_features_in_, rtol=rtol)

From f10cc334b28dd85308fdf4c0bf7f398b88e7e281 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Sun, 8 Oct 2023 13:34:54 +0000
Subject: [PATCH 35/46] Fix example usage

---
 examples/compose/plot_column_transformer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/compose/plot_column_transformer.py b/examples/compose/plot_column_transformer.py
index 1a988305ef9ab..207f7450a2705 100644
--- a/examples/compose/plot_column_transformer.py
+++ b/examples/compose/plot_column_transformer.py
@@ -141,7 +141,7 @@ def text_stats(posts):
             Pipeline(
                 [
                     ("tfidf", TfidfVectorizer()),
-                    ("best", PCA(n_components=50, solver="arpack")),
+                    ("best", PCA(n_components=50, svd_solver="arpack")),
                 ]
             ),
             1,

From 0d91e1c8e99128cf0ab14412dcb94fbc9a11bc26 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Sun, 8 Oct 2023 13:40:35 +0000
Subject: [PATCH 36/46] move import

---
 sklearn/utils/tests/test_sparsefuncs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py
index 4355c1d2dae8b..3a706db851aae 100644
--- a/sklearn/utils/tests/test_sparsefuncs.py
+++ b/sklearn/utils/tests/test_sparsefuncs.py
@@ -14,6 +14,7 @@
 from sklearn.utils.sparsefuncs import (
     count_nonzero,
     csc_median_axis_0,
+    _implicit_column_offset,
     incr_mean_variance_axis,
     inplace_column_scale,
     inplace_row_scale,
@@ -963,7 +964,6 @@ def test_csr_row_norms(dtype):
 @pytest.fixture(scope="module", params=CSR_CONTAINERS + CSC_CONTAINERS)
 def centered_matrices(request) -> tuple[sp.linalg.LinearOperator, np.ndarray]:
     sparse_container = request.param
-    from sklearn.utils.sparsefuncs import _implicit_column_offset

     random_state = np.random.default_rng(42)

From 4bcac1fd8427ec8d2371bbd26710b7b0253700c5 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Sun, 8 Oct 2023 13:41:28 +0000
Subject: [PATCH 37/46] remove annotations import

---
 sklearn/utils/tests/test_sparsefuncs.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py
index 3a706db851aae..570deaf06d637 100644
--- a/sklearn/utils/tests/test_sparsefuncs.py
+++ b/sklearn/utils/tests/test_sparsefuncs.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 import numpy as np
 import pytest
 import scipy.sparse as sp

From e6daeec66abdb3444c6725f7cc623b851c4c7a14 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Sun, 8 Oct 2023 13:47:34 +0000
Subject: [PATCH 38/46] Address suggestions to test_pca

---
 sklearn/decomposition/tests/test_pca.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index 27cbf8011f0ff..02225eebd6004 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 import re
 import warnings

@@ -76,8 +74,8 @@ def test_pca_sparse(
     global_random_seed, svd_solver, sparse_container, n_components, density, scale
 ):
-    RTOL = 1e-07  # 1e-08 can result in occasional failures
-    TRANSFORM_RTOL = 1e-06  # Slightly higher tolerance for transform
+    rtol = 1e-07  # 1e-08 can result in occasional failures
+    transform_rtol = 1e-06  # Slightly higher tolerance for transform

     random_state = np.random.default_rng(global_random_seed)
     X = sparse_container(
@@ -100,7 +98,7 @@ def test_pca_sparse(
     pcad.fit(Xd)

     # Fitted attributes equality
-    _check_fitted_pca_close(pca, pcad, rtol=RTOL)
+    _check_fitted_pca_close(pca, pcad, rtol=rtol)

     # Test transform
     X2 = sparse_container(
@@ -113,8 +111,8 @@ def test_pca_sparse(
     )
     X2d = X2.toarray()

-    assert_allclose(pca.transform(X2), pca.transform(X2d), rtol=TRANSFORM_RTOL)
-    assert_allclose(pca.transform(X2), pcad.transform(X2d), rtol=TRANSFORM_RTOL)
+    assert_allclose(pca.transform(X2), pca.transform(X2d), rtol=transform_rtol)
+    assert_allclose(pca.transform(X2), pcad.transform(X2d), rtol=transform_rtol)

From bf8143018917166a2d747e5b902da69c478f1200 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Sun, 8 Oct 2023 13:56:02 +0000
Subject: [PATCH 39/46] made change to variance calculation

---
 sklearn/decomposition/_pca.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
index 0a067fd003994..db4431db9d14f 100644
--- a/sklearn/decomposition/_pca.py
+++ b/sklearn/decomposition/_pca.py
@@ -669,7 +669,7 @@ def _fit_truncated(self, X, n_components, svd_solver):
         if total_var is None:
             N = X.shape[0] - 1
             X **= 2
-            total_var = xp.sum(xp.sum(X, axis=0) / N)
+            total_var = xp.sum(X) / N

         self.explained_variance_ratio_ = self.explained_variance_ / total_var
         self.singular_values_ = xp.asarray(S, copy=True)  # Store the singular values.

From e920da0356f4050455ad602089825c8e614773b8 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Sun, 8 Oct 2023 14:10:13 +0000
Subject: [PATCH 40/46] fix linting on test_sparsefuncs.py

---
 sklearn/utils/tests/test_sparsefuncs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py
index 570deaf06d637..a5aaeadc711d5 100644
--- a/sklearn/utils/tests/test_sparsefuncs.py
+++ b/sklearn/utils/tests/test_sparsefuncs.py
@@ -10,9 +10,9 @@
 from sklearn.utils._testing import assert_allclose
 from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS, LIL_CONTAINERS
 from sklearn.utils.sparsefuncs import (
+    _implicit_column_offset,
     count_nonzero,
     csc_median_axis_0,
-    _implicit_column_offset,
     incr_mean_variance_axis,
     inplace_column_scale,
     inplace_row_scale,

From c7eda6a71fc3022aca441476b7e3042567054df4 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Sun, 8 Oct 2023 14:12:20 +0000
Subject: [PATCH 41/46] speed ups -> speed-ups

---
 doc/whats_new/v1.4.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst
index c5821edf22c3e..567940901a68b 100644
--- a/doc/whats_new/v1.4.rst
+++ b/doc/whats_new/v1.4.rst
@@ -242,7 +242,7 @@ Changelog
 - |Feature| :class:`decomposition.PCA` now supports :class:`scipy.sparse.sparray`
   and :class:`scipy.sparse.spmatrix` inputs when using the `arpack` solver.
   When used on sparse data like :func:`datasets.fetch_20newsgroups_vectorized` this
-  can lead to speed ups of 100x (single threaded) and 70x lower memory usage.
+  can lead to speed-ups of 100x (single threaded) and 70x lower memory usage.
   Based on :user:`Alexander Tarashansky `'s implementation in `scanpy `.
   :pr:`18689` by :user:`Isaac Virshup ` and :user:`Andrey Portnoy `.
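For context on the changelog entry above, the end-to-end usage it describes looks like the following sketch (the shapes, density, and n_components here are illustrative, not taken from the patch series):

import numpy as np
import scipy.sparse as sp
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
# A low-density CSR matrix; densifying it would blow up memory for no benefit.
X = sp.random(100_000, 1_000, density=0.01, format="csr", random_state=rng)

# With this series applied, the "arpack" solver accepts sparse input directly;
# centering happens implicitly through a LinearOperator, so X is never densified.
pca = PCA(n_components=10, svd_solver="arpack")
X_reduced = pca.fit_transform(X)
print(X_reduced.shape)  # (100000, 10)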
From f9c532912fe5e7152b314802ea2dd852a08f13d3 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Sun, 8 Oct 2023 14:15:56 +0000
Subject: [PATCH 42/46] BasePCA.transform docs updated to include support for
 sparse input

---
 sklearn/decomposition/_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/decomposition/_base.py b/sklearn/decomposition/_base.py
index 0c4982b99a42f..9fa720751774f 100644
--- a/sklearn/decomposition/_base.py
+++ b/sklearn/decomposition/_base.py
@@ -128,7 +128,7 @@ def transform(self, X):

         Parameters
         ----------
-        X : array-like of shape (n_samples, n_features)
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
             New data, where `n_samples` is the number of samples
             and `n_features` is the number of features.

From e4168790deac544bedb71cdace77c042fe93e873 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Sun, 8 Oct 2023 15:07:32 +0000
Subject: [PATCH 43/46] Remove type hint

---
 sklearn/utils/tests/test_sparsefuncs.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py
index a5aaeadc711d5..e9d4772820761 100644
--- a/sklearn/utils/tests/test_sparsefuncs.py
+++ b/sklearn/utils/tests/test_sparsefuncs.py
@@ -960,7 +960,8 @@
 @pytest.fixture(scope="module", params=CSR_CONTAINERS + CSC_CONTAINERS)
-def centered_matrices(request) -> tuple[sp.linalg.LinearOperator, np.ndarray]:
+def centered_matrices(request):
+    """Return an equivalent (sp.linalg.LinearOperator, np.ndarray) pair."""
     sparse_container = request.param

     random_state = np.random.default_rng(42)

From a5197981899048d4d4005a33c6d3fc8200b80b26 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Sun, 8 Oct 2023 15:08:29 +0000
Subject: [PATCH 44/46] Add TODO to variance calculation for PCA

---
 sklearn/decomposition/_pca.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
index db4431db9d14f..046d121ac1934 100644
--- a/sklearn/decomposition/_pca.py
+++ b/sklearn/decomposition/_pca.py
@@ -666,6 +666,11 @@ def _fit_truncated(self, X, n_components, svd_solver):

         # Workaround in-place variance calculation since at the time numpy
         # did not have a way to calculate variance in-place.
+        #
+        # TODO: update this code to either:
+        # * Use the array-api variance calculation, unless memory usage suffers
+        # * Update sklearn.utils.extmath._incremental_mean_and_var to support array-api
+        # See: https://github.com/scikit-learn/scikit-learn/pull/18689#discussion_r1335540991
         if total_var is None:
             N = X.shape[0] - 1
             X **= 2

From a647d8d265efe6c7650ec2d99a66b66d5fe7e140 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Sun, 8 Oct 2023 17:06:44 +0000
Subject: [PATCH 45/46] Dial in tolerances

---
 sklearn/decomposition/tests/test_pca.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index 02225eebd6004..9f8b1a16e926a 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -74,8 +74,9 @@ def test_pca(svd_solver, n_components):
 def test_pca_sparse(
     global_random_seed, svd_solver, sparse_container, n_components, density, scale
 ):
-    rtol = 1e-07  # 1e-08 can result in occasional failures
-    transform_rtol = 1e-06  # Slightly higher tolerance for transform
+    # Make sure any tolerance changes pass with SKLEARN_TESTS_GLOBAL_RANDOM_SEED="all"
+    rtol = 5e-07
+    transform_rtol = 3e-05

     random_state = np.random.default_rng(global_random_seed)
     X = sparse_container(
@@ -90,11 +91,19 @@ def test_pca_sparse(
     scale_vector = random_state.random(X.shape[1]) * scale
     X = X.multiply(scale_vector)

-    pca = PCA(n_components=n_components, svd_solver=svd_solver)
+    pca = PCA(
+        n_components=n_components,
+        svd_solver=svd_solver,
+        random_state=global_random_seed,
+    )
     pca.fit(X)

     Xd = X.toarray()
-    pcad = PCA(n_components=n_components, svd_solver=svd_solver)
+    pcad = PCA(
+        n_components=n_components,
+        svd_solver=svd_solver,
+        random_state=global_random_seed,
+    )
     pcad.fit(Xd)

     # Fitted attributes equality
@@ -144,9 +153,9 @@ def test_pca_sparse_fit_transform(global_random_seed, sparse_container):
     transformed_X = pca_fit_transform.fit_transform(X)

     _check_fitted_pca_close(pca_fit, pca_fit_transform, rtol=1e-10)
-    assert_allclose(transformed_X, pca_fit_transform.transform(X), rtol=1e-10)
-    assert_allclose(transformed_X, pca_fit.transform(X), rtol=1e-10)
-    assert_allclose(pca_fit.transform(X2), pca_fit_transform.transform(X2), rtol=1e-10)
+    assert_allclose(transformed_X, pca_fit_transform.transform(X), rtol=2e-9)
+    assert_allclose(transformed_X, pca_fit.transform(X), rtol=2e-9)
+    assert_allclose(pca_fit.transform(X2), pca_fit_transform.transform(X2), rtol=2e-9)

From 1ccb9a8e9a0d88b53cdaecc6a44b13c808142398 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Thu, 2 Nov 2023 14:24:50 +0000
Subject: [PATCH 46/46] Remove last type annotation

---
 sklearn/decomposition/tests/test_pca.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index 9f8b1a16e926a..891419739c470 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -33,7 +33,7 @@
 SPARSE_MAX_COMPONENTS = min(SPARSE_M, SPARSE_N)


-def _check_fitted_pca_close(pca1: PCA, pca2: PCA, rtol: float):
+def _check_fitted_pca_close(pca1, pca2, rtol):
     assert_allclose(pca1.components_, pca2.components_, rtol=rtol)
     assert_allclose(pca1.explained_variance_, pca2.explained_variance_, rtol=rtol)
     assert_allclose(pca1.singular_values_, pca2.singular_values_, rtol=rtol)
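To close the series out: the implicit centering trick everything above builds on can be reproduced standalone. This is a minimal sketch mirroring the `_implicit_column_offset` helper from patches 28-29 and the equivalences its tests check; variable names and the example matrix are illustrative, not part of the patches:

import numpy as np
import scipy.sparse as sp
from scipy.sparse.linalg import LinearOperator

rng = np.random.default_rng(42)
X = sp.random(500, 100, density=0.1, format="csr", random_state=rng)
offset = np.asarray(X.mean(axis=0))  # row vector of column means, shape (1, n_features)
XT = X.T

# Acts like the centered matrix X - ones @ offset without ever forming it densely:
# the column means are subtracted on the fly inside each product.
centered = LinearOperator(
    shape=X.shape,
    dtype=X.dtype,
    matvec=lambda x: X @ x - offset @ x,
    matmat=lambda x: X @ x - offset @ x,
    rmatvec=lambda x: XT @ x - offset.T @ (np.ones(X.shape[0])[None, :] @ x),
    rmatmat=lambda x: XT @ x - offset.T @ (np.ones(X.shape[0])[None, :] @ x),
)

# The operator agrees with explicit dense centering, in both directions.
X_dense_centered = X.toarray() - offset
v = rng.standard_normal(X.shape[1])
w = rng.standard_normal(X.shape[0])
np.testing.assert_allclose(centered @ v, X_dense_centered @ v)
np.testing.assert_allclose(centered.rmatvec(w), X_dense_centered.T @ w)

Passing an operator like this to scipy.sparse.linalg.svds is what lets the "arpack" solver compute PCA on sparse input with only matrix-vector products, which is where the memory and speed wins reported in the changelog come from.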