Skip to content

TST use global_random_seed in sklearn/cluster/tests/test_spectral.py #24802

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 41 additions & 24 deletions sklearn/cluster/tests/test_spectral.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
@pytest.mark.parametrize("eigen_solver", ("arpack", "lobpcg"))
@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr"))
def test_spectral_clustering(eigen_solver, assign_labels, csr_container):
def test_spectral_clustering(
eigen_solver, assign_labels, csr_container, global_random_seed
):
S = np.array(
[
[1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
Expand All @@ -54,7 +56,7 @@ def test_spectral_clustering(eigen_solver, assign_labels, csr_container):

for mat in (S, csr_container(S)):
model = SpectralClustering(
random_state=0,
random_state=global_random_seed,
n_clusters=2,
affinity="precomputed",
eigen_solver=eigen_solver,
Expand All @@ -74,9 +76,12 @@ def test_spectral_clustering(eigen_solver, assign_labels, csr_container):

@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr"))
def test_spectral_clustering_sparse(assign_labels, coo_container):
def test_spectral_clustering_sparse(assign_labels, coo_container, global_random_seed):
X, y = make_blobs(
n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
n_samples=20,
random_state=global_random_seed,
centers=[[1, 1], [-1, -1]],
cluster_std=0.01,
)

S = rbf_kernel(X, gamma=1)
Expand All @@ -85,7 +90,7 @@ def test_spectral_clustering_sparse(assign_labels, coo_container):

labels = (
SpectralClustering(
random_state=0,
random_state=global_random_seed,
n_clusters=2,
affinity="precomputed",
assign_labels=assign_labels,
Expand All @@ -96,10 +101,13 @@ def test_spectral_clustering_sparse(assign_labels, coo_container):
assert adjusted_rand_score(y, labels) == 1


def test_precomputed_nearest_neighbors_filtering():
def test_precomputed_nearest_neighbors_filtering(global_random_seed):
# Test precomputed graph filtering when containing too many neighbors
X, y = make_blobs(
n_samples=200, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
n_samples=250,
random_state=global_random_seed,
centers=[[1, 1], [-1, -1]],
cluster_std=0.01,
)

n_neighbors = 2
Expand All @@ -109,7 +117,7 @@ def test_precomputed_nearest_neighbors_filtering():
graph = nn.kneighbors_graph(X, mode="connectivity")
labels = (
SpectralClustering(
random_state=0,
random_state=global_random_seed,
n_clusters=2,
affinity="precomputed_nearest_neighbors",
n_neighbors=n_neighbors,
Expand All @@ -122,7 +130,7 @@ def test_precomputed_nearest_neighbors_filtering():
assert_array_equal(results[0], results[1])


def test_affinities():
def test_affinities(global_random_seed):
# Note: in the following, random_state has been selected to have
# a dataset that yields a stable eigen decomposition both when built
# on OSX and Linux
Expand All @@ -135,7 +143,7 @@ def test_affinities():
sp.fit(X)
assert adjusted_rand_score(y, sp.labels_) == 1

sp = SpectralClustering(n_clusters=2, gamma=2, random_state=0)
sp = SpectralClustering(n_clusters=2, gamma=2, random_state=global_random_seed)
labels = sp.fit(X).labels_
assert adjusted_rand_score(y, labels) == 1

Expand Down Expand Up @@ -164,12 +172,12 @@ def histogram(x, y, **kwargs):
assert (X.shape[0],) == labels.shape


def test_cluster_qr():
def test_cluster_qr(global_random_seed):
# cluster_qr by itself should not be used for clustering generic data
# other than the rows of the eigenvectors within spectral clustering,
# but cluster_qr must still preserve the labels for different dtypes
# of the generic fixed input even if the labels may be meaningless.
random_state = np.random.RandomState(seed=8)
random_state = np.random.RandomState(seed=global_random_seed)
n_samples, n_components = 10, 5
data = random_state.randn(n_samples, n_components)
labels_float64 = cluster_qr(data.astype(np.float64))
Expand All @@ -182,9 +190,9 @@ def test_cluster_qr():
assert np.array_equal(labels_float64, labels_float32)


def test_cluster_qr_permutation_invariance():
def test_cluster_qr_permutation_invariance(global_random_seed):
# cluster_qr must be invariant to sample permutation.
random_state = np.random.RandomState(seed=8)
random_state = np.random.RandomState(seed=global_random_seed)
n_samples, n_components = 100, 5
data = random_state.randn(n_samples, n_components)
perm = random_state.permutation(n_samples)
Expand All @@ -196,9 +204,9 @@ def test_cluster_qr_permutation_invariance():

@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
@pytest.mark.parametrize("n_samples", [50, 100, 150, 500])
def test_discretize(n_samples, coo_container):
def test_discretize(n_samples, coo_container, global_random_seed):
# Test discretize using a noisy assignment matrix
random_state = np.random.RandomState(seed=8)
random_state = np.random.RandomState(seed=global_random_seed)
for n_class in range(2, 10):
# random class labels
y_true = random_state.randint(0, n_class + 1, n_samples)
Expand All @@ -215,7 +223,7 @@ def test_discretize(n_samples, coo_container):
assert adjusted_rand_score(y_true, y_pred) > 0.8


def test_spectral_clustering_with_arpack_amg_solvers():
def test_spectral_clustering_with_arpack_amg_solvers(global_random_seed):
# Test that spectral_clustering gives the same result for the arpack and amg solvers
# Based on toy example from plot_segmentation_toy.py

Expand All @@ -236,40 +244,49 @@ def test_spectral_clustering_with_arpack_amg_solvers():
graph.data = np.exp(-graph.data / graph.data.std())

labels_arpack = spectral_clustering(
graph, n_clusters=2, eigen_solver="arpack", random_state=0
graph, n_clusters=2, eigen_solver="arpack", random_state=global_random_seed
)

assert len(np.unique(labels_arpack)) == 2

if amg_loaded:
labels_amg = spectral_clustering(
graph, n_clusters=2, eigen_solver="amg", random_state=0
graph, n_clusters=2, eigen_solver="amg", random_state=global_random_seed
)
assert adjusted_rand_score(labels_arpack, labels_amg) == 1
else:
with pytest.raises(ValueError):
spectral_clustering(graph, n_clusters=2, eigen_solver="amg", random_state=0)


def test_n_components():
def test_n_components(global_random_seed):
# Test that after adding n_components, result is different and
# n_components = n_clusters by default
X, y = make_blobs(
n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
n_samples=20,
random_state=global_random_seed,
centers=[[1, 1], [-1, -1]],
cluster_std=0.01,
)
sp = SpectralClustering(n_clusters=2, random_state=0)
sp = SpectralClustering(n_clusters=2, random_state=global_random_seed)
labels = sp.fit(X).labels_
# set n_components = n_clusters and test if the result is the same
labels_same_ncomp = (
SpectralClustering(n_clusters=2, n_components=2, random_state=0).fit(X).labels_
SpectralClustering(
n_clusters=2, n_components=2, random_state=global_random_seed
)
.fit(X)
.labels_
)
# test that n_components=n_clusters by default
assert_array_equal(labels, labels_same_ncomp)

# test that n_components affects the result
# n_clusters=8 by default, and set n_components=2
labels_diff_ncomp = (
SpectralClustering(n_components=2, random_state=0).fit(X).labels_
SpectralClustering(n_components=2, random_state=global_random_seed)
.fit(X)
.labels_
)
assert not np.array_equal(labels, labels_diff_ncomp)

Expand Down