diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
index b0d36364ec333..bd6a9445a9dce 100644
--- a/doc/whats_new/v1.1.rst
+++ b/doc/whats_new/v1.1.rst
@@ -289,6 +289,10 @@ Changelog
   :pr:`22300` by :user:`Meekail Zain <Micky774>` and :pr:`15948` by
   :user:`sysuresh`.
 
+- |Fix| Greatly reduced peak memory usage in :class:`decomposition.PCA` when
+  calling `fit` or `fit_transform`.
+  :pr:`22553` by :user:`Meekail Zain <Micky774>`.
+
 :mod:`sklearn.discriminant_analysis`
 ....................................
 
diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
index 53afa48bc9343..01cdf5f8ad279 100644
--- a/sklearn/decomposition/_pca.py
+++ b/sklearn/decomposition/_pca.py
@@ -619,12 +619,19 @@ def _fit_truncated(self, X, n_components, svd_solver):
 
         # Get variance explained by singular values
         self.explained_variance_ = (S**2) / (n_samples - 1)
-        total_var = np.var(X, ddof=1, axis=0)
-        self.explained_variance_ratio_ = self.explained_variance_ / total_var.sum()
+
+        # Workaround in-place variance calculation since at the time numpy
+        # did not have a way to calculate variance in-place.
+        N = X.shape[0] - 1
+        np.square(X, out=X)
+        np.sum(X, axis=0, out=X[0])
+        total_var = (X[0] / N).sum()
+
+        self.explained_variance_ratio_ = self.explained_variance_ / total_var
         self.singular_values_ = S.copy()  # Store the singular values.
 
         if self.n_components_ < min(n_features, n_samples):
-            self.noise_variance_ = total_var.sum() - self.explained_variance_.sum()
+            self.noise_variance_ = total_var - self.explained_variance_.sum()
             self.noise_variance_ /= min(n_features, n_samples) - n_components
         else:
             self.noise_variance_ = 0.0
diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index b8c95deaee5a5..44b97a5fc1ce7 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -95,7 +95,7 @@ def test_whitening(solver, copy):
     X_ = X.copy()
     pca = PCA(
         n_components=n_components, whiten=False, copy=copy, svd_solver=solver
-    ).fit(X_)
+    ).fit(X_.copy())
    X_unwhitened = pca.transform(X_)
 
     assert X_unwhitened.shape == (n_samples, n_components)
@@ -720,3 +720,14 @@ def test_feature_names_out():
     names = pca.get_feature_names_out()
 
     assert_array_equal([f"pca{i}" for i in range(2)], names)
+
+
+@pytest.mark.parametrize("copy", [True, False])
+def test_variance_correctness(copy):
+    """Check the accuracy of PCA's internal variance calculation"""
+    rng = np.random.RandomState(0)
+    X = rng.randn(1000, 200)
+    pca = PCA().fit(X)
+    pca_var = pca.explained_variance_ / pca.explained_variance_ratio_
+    true_var = np.var(X, ddof=1, axis=0).sum()
+    np.testing.assert_allclose(pca_var, true_var)
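
Note on the `_fit_truncated` change above: a minimal standalone sketch of the in-place variance trick, assuming `X` is already centered (as it is in `_fit_truncated` by the time this code runs); the names `n` and `X_check` are illustrative only and not part of the patch:

    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.randn(1000, 200)
    X -= X.mean(axis=0)   # centered, as in _fit_truncated before the SVD
    X_check = X.copy()    # kept here only to verify the result below

    # Square X in place, then reduce the rows into the first row, so no
    # second (n_samples, n_features) temporary is allocated the way it
    # would be by np.var.
    n = X.shape[0] - 1
    np.square(X, out=X)
    np.sum(X, axis=0, out=X[0])
    total_var = (X[0] / n).sum()

    np.testing.assert_allclose(
        total_var, np.var(X_check, ddof=1, axis=0).sum()
    )

Since this consumes `X` (it is squared in place and its first row overwritten), fitting with `copy=False` now destroys the caller's array; that is why `test_whitening` above fits on `X_.copy()`, so the later `pca.transform(X_)` still sees the original data.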