Skip to content

TST use global_random_seed in sklearn/decomposition/tests/test_sparse_pca.py #31213

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Apr 17, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 63 additions & 75 deletions sklearn/decomposition/tests/test_sparse_pca.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import sys

import numpy as np
import pytest
from numpy.testing import assert_array_equal

from sklearn.datasets import make_low_rank_matrix
from sklearn.decomposition import PCA, MiniBatchSparsePCA, SparsePCA
from sklearn.utils import check_random_state
from sklearn.utils._testing import (
Expand Down Expand Up @@ -57,48 +57,58 @@ def test_correct_shapes():
assert U.shape == (12, 13)


def test_fit_transform():
def test_fit_transform(global_random_seed):
alpha = 1
rng = np.random.RandomState(0)
rng = np.random.RandomState(global_random_seed)
Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng) # wide array
spca_lars = SparsePCA(n_components=3, method="lars", alpha=alpha, random_state=0)
spca_lars = SparsePCA(
n_components=3, method="lars", alpha=alpha, random_state=global_random_seed
)
spca_lars.fit(Y)

# Test that CD gives similar results
spca_lasso = SparsePCA(n_components=3, method="cd", random_state=0, alpha=alpha)
spca_lasso = SparsePCA(
n_components=3, method="cd", random_state=global_random_seed, alpha=alpha
)
spca_lasso.fit(Y)
assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)


@if_safe_multiprocessing_with_blas
def test_fit_transform_parallel():
def test_fit_transform_parallel(global_random_seed):
alpha = 1
rng = np.random.RandomState(0)
rng = np.random.RandomState(global_random_seed)
Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng) # wide array
spca_lars = SparsePCA(n_components=3, method="lars", alpha=alpha, random_state=0)
spca_lars = SparsePCA(
n_components=3, method="lars", alpha=alpha, random_state=global_random_seed
)
spca_lars.fit(Y)
U1 = spca_lars.transform(Y)
# Test multiple CPUs
spca = SparsePCA(
n_components=3, n_jobs=2, method="lars", alpha=alpha, random_state=0
n_components=3,
n_jobs=2,
method="lars",
alpha=alpha,
random_state=global_random_seed,
).fit(Y)
U2 = spca.transform(Y)
assert not np.all(spca_lars.components_ == 0)
assert_array_almost_equal(U1, U2)


def test_transform_nan():
def test_transform_nan(global_random_seed):
    # Test that SparsePCA won't return NaN when a feature is 0 in all
    # samples.
rng = np.random.RandomState(0)
rng = np.random.RandomState(global_random_seed)
Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng) # wide array
Y[:, 0] = 0
estimator = SparsePCA(n_components=8)
estimator = SparsePCA(n_components=8, random_state=global_random_seed)
assert not np.any(np.isnan(estimator.fit_transform(Y)))


def test_fit_transform_tall():
rng = np.random.RandomState(0)
def test_fit_transform_tall(global_random_seed):
rng = np.random.RandomState(global_random_seed)
Y, _, _ = generate_toy_data(3, 65, (8, 8), random_state=rng) # tall array
spca_lars = SparsePCA(n_components=3, method="lars", random_state=rng)
U1 = spca_lars.fit_transform(Y)
Expand All @@ -107,8 +117,8 @@ def test_fit_transform_tall():
assert_array_almost_equal(U1, U2)


def test_initialization():
rng = np.random.RandomState(0)
def test_initialization(global_random_seed):
rng = np.random.RandomState(global_random_seed)
U_init = rng.randn(5, 3)
V_init = rng.randn(3, 4)
model = SparsePCA(
Expand All @@ -135,65 +145,32 @@ def test_mini_batch_correct_shapes():
assert U.shape == (12, 13)


# XXX: test always skipped
@pytest.mark.skipif(True, reason="skipping mini_batch_fit_transform.")
def test_mini_batch_fit_transform():
alpha = 1
rng = np.random.RandomState(0)
Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng) # wide array
spca_lars = MiniBatchSparsePCA(n_components=3, random_state=0, alpha=alpha).fit(Y)
U1 = spca_lars.transform(Y)
# Test multiple CPUs
if sys.platform == "win32": # fake parallelism for win32
import joblib

_mp = joblib.parallel.multiprocessing
joblib.parallel.multiprocessing = None
try:
spca = MiniBatchSparsePCA(
n_components=3, n_jobs=2, alpha=alpha, random_state=0
)
U2 = spca.fit(Y).transform(Y)
finally:
joblib.parallel.multiprocessing = _mp
else: # we can efficiently use parallelism
spca = MiniBatchSparsePCA(n_components=3, n_jobs=2, alpha=alpha, random_state=0)
U2 = spca.fit(Y).transform(Y)
assert not np.all(spca_lars.components_ == 0)
assert_array_almost_equal(U1, U2)
# Test that CD gives similar results
spca_lasso = MiniBatchSparsePCA(
n_components=3, method="cd", alpha=alpha, random_state=0
).fit(Y)
assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)


def test_scaling_fit_transform():
def test_scaling_fit_transform(global_random_seed):
alpha = 1
rng = np.random.RandomState(0)
rng = np.random.RandomState(global_random_seed)
Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng)
spca_lars = SparsePCA(n_components=3, method="lars", alpha=alpha, random_state=rng)
results_train = spca_lars.fit_transform(Y)
results_test = spca_lars.transform(Y[:10])
assert_allclose(results_train[0], results_test[0])


def test_pca_vs_spca():
rng = np.random.RandomState(0)
def test_pca_vs_spca(global_random_seed):
rng = np.random.RandomState(global_random_seed)
Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng)
Z, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)
spca = SparsePCA(alpha=0, ridge_alpha=0, n_components=2)
pca = PCA(n_components=2)
spca = SparsePCA(alpha=0, ridge_alpha=0, n_components=2, random_state=rng)
pca = PCA(n_components=2, random_state=rng)
pca.fit(Y)
spca.fit(Y)
results_test_pca = pca.transform(Z)
results_test_spca = spca.transform(Z)
assert_allclose(
np.abs(spca.components_.dot(pca.components_.T)), np.eye(2), atol=1e-5
np.abs(spca.components_.dot(pca.components_.T)), np.eye(2), atol=1e-4
)
results_test_pca *= np.sign(results_test_pca[0, :])
results_test_spca *= np.sign(results_test_spca[0, :])
assert_allclose(results_test_pca, results_test_spca)
assert_allclose(results_test_pca, results_test_spca, atol=1e-4)


@pytest.mark.parametrize("SPCA", [SparsePCA, MiniBatchSparsePCA])
Expand Down Expand Up @@ -236,26 +213,31 @@ def test_sparse_pca_dtype_match(SPCA, method, data_type, expected_type):

@pytest.mark.parametrize("SPCA", (SparsePCA, MiniBatchSparsePCA))
@pytest.mark.parametrize("method", ("lars", "cd"))
def test_sparse_pca_numerical_consistency(SPCA, method):
def test_sparse_pca_numerical_consistency(SPCA, method, global_random_seed):
    # Verify numerical consistency between np.float32 and np.float64
rtol = 1e-3
alpha = 2
n_samples, n_features, n_components = 12, 10, 3
rng = np.random.RandomState(0)
input_array = rng.randn(n_samples, n_features)
n_samples, n_features, n_components = 20, 20, 5
input_array = make_low_rank_matrix(
n_samples=n_samples,
n_features=n_features,
effective_rank=n_components,
random_state=global_random_seed,
)

model_32 = SPCA(
n_components=n_components, alpha=alpha, method=method, random_state=0
n_components=n_components,
method=method,
random_state=global_random_seed,
)
transformed_32 = model_32.fit_transform(input_array.astype(np.float32))

model_64 = SPCA(
n_components=n_components, alpha=alpha, method=method, random_state=0
n_components=n_components,
method=method,
random_state=global_random_seed,
)
transformed_64 = model_64.fit_transform(input_array.astype(np.float64))

assert_allclose(transformed_64, transformed_32, rtol=rtol)
assert_allclose(model_64.components_, model_32.components_, rtol=rtol)
assert_allclose(transformed_64, transformed_32)
assert_allclose(model_64.components_, model_32.components_)


@pytest.mark.parametrize("SPCA", [SparsePCA, MiniBatchSparsePCA])
Expand Down Expand Up @@ -324,17 +306,20 @@ def test_equivalence_components_pca_spca(global_random_seed):
assert_allclose(pca.components_, spca.components_)


def test_sparse_pca_inverse_transform():
def test_sparse_pca_inverse_transform(global_random_seed):
"""Check that `inverse_transform` in `SparsePCA` and `PCA` are similar."""
rng = np.random.RandomState(0)
rng = np.random.RandomState(global_random_seed)
n_samples, n_features = 10, 5
X = rng.randn(n_samples, n_features)

n_components = 2
spca = SparsePCA(
n_components=n_components, alpha=1e-12, ridge_alpha=1e-12, random_state=0
n_components=n_components,
alpha=1e-12,
ridge_alpha=1e-12,
random_state=global_random_seed,
)
pca = PCA(n_components=n_components, random_state=0)
pca = PCA(n_components=n_components, random_state=global_random_seed)
X_trans_spca = spca.fit_transform(X)
X_trans_pca = pca.fit_transform(X)
assert_allclose(
Expand All @@ -343,17 +328,20 @@ def test_sparse_pca_inverse_transform():


@pytest.mark.parametrize("SPCA", [SparsePCA, MiniBatchSparsePCA])
def test_transform_inverse_transform_round_trip(SPCA):
def test_transform_inverse_transform_round_trip(SPCA, global_random_seed):
"""Check the `transform` and `inverse_transform` round trip with no loss of
information.
"""
rng = np.random.RandomState(0)
rng = np.random.RandomState(global_random_seed)
n_samples, n_features = 10, 5
X = rng.randn(n_samples, n_features)

n_components = n_features
spca = SPCA(
n_components=n_components, alpha=1e-12, ridge_alpha=1e-12, random_state=0
n_components=n_components,
alpha=1e-12,
ridge_alpha=1e-12,
random_state=global_random_seed,
)
X_trans_spca = spca.fit_transform(X)
assert_allclose(spca.inverse_transform(X_trans_spca), X)