Skip to content

FIX Reduces memory usage of PCA.transform #22553

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Mar 2, 2022
4 changes: 4 additions & 0 deletions doc/whats_new/v1.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,10 @@ Changelog
:pr:`22300` by :user:`Meekail Zain <micky774>` and :pr:`15948` by
:user:`sysuresh`.

- |Fix| Greatly reduced peak memory usage in :class:`decomposition.PCA` when
calling `fit` or `fit_transform`.
:pr:`22553` by :user:`Meekail Zain <micky774>`.

:mod:`sklearn.discriminant_analysis`
....................................

Expand Down
13 changes: 10 additions & 3 deletions sklearn/decomposition/_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -619,12 +619,19 @@ def _fit_truncated(self, X, n_components, svd_solver):

# Get variance explained by singular values
self.explained_variance_ = (S**2) / (n_samples - 1)
total_var = np.var(X, ddof=1, axis=0)
self.explained_variance_ratio_ = self.explained_variance_ / total_var.sum()

# Work around the lack of an in-place variance calculation: numpy does
# not provide a way to compute the variance without allocating a copy.
N = X.shape[0] - 1
np.square(X, out=X)
np.sum(X, axis=0, out=X[0])
total_var = (X[0] / N).sum()

self.explained_variance_ratio_ = self.explained_variance_ / total_var
self.singular_values_ = S.copy() # Store the singular values.

if self.n_components_ < min(n_features, n_samples):
self.noise_variance_ = total_var.sum() - self.explained_variance_.sum()
self.noise_variance_ = total_var - self.explained_variance_.sum()
self.noise_variance_ /= min(n_features, n_samples) - n_components
else:
self.noise_variance_ = 0.0
Expand Down
13 changes: 12 additions & 1 deletion sklearn/decomposition/tests/test_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def test_whitening(solver, copy):
X_ = X.copy()
pca = PCA(
n_components=n_components, whiten=False, copy=copy, svd_solver=solver
).fit(X_)
).fit(X_.copy())
X_unwhitened = pca.transform(X_)
assert X_unwhitened.shape == (n_samples, n_components)

Expand Down Expand Up @@ -720,3 +720,14 @@ def test_feature_names_out():

names = pca.get_feature_names_out()
assert_array_equal([f"pca{i}" for i in range(2)], names)


@pytest.mark.parametrize("copy", [True, False])
def test_variance_correctness(copy):
    """Check the accuracy of PCA's internal variance calculation.

    The fit path computes the total variance in place (squaring X to save
    memory), so `explained_variance_ / explained_variance_ratio_` must still
    equal the true sample variance of the data, for both `copy=True` and
    `copy=False`.
    """
    rng = np.random.RandomState(0)
    X = rng.randn(1000, 200)
    # Compute the reference variance BEFORE fitting: with copy=False the
    # estimator is allowed to overwrite X in place during fit.
    true_var = np.var(X, ddof=1, axis=0).sum()
    # Actually exercise the parametrized copy flag (previously unused).
    pca = PCA(copy=copy).fit(X)
    # Recover the total variance implied by the fitted attributes.
    pca_var = pca.explained_variance_ / pca.explained_variance_ratio_
    np.testing.assert_allclose(pca_var, true_var)