
TST Incorporate global_random_seed in test_optics.py #30844
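This PR switches several OPTICS tests from a hardcoded RandomState(0) to the shared global_random_seed pytest fixture, so their assertions can be exercised against many seeds and seed-sensitive checks are surfaced. As rough orientation for readers, the sketch below shows one way such a fixture can be parametrized from an environment variable; it is an illustrative assumption, not scikit-learn's actual conftest.py, and the helper name _parse_seed_spec is made up.

# Illustrative sketch of a conftest-style seed fixture (assumed shape, not
# scikit-learn's real implementation). Tests that request `global_random_seed`
# run once per selected seed.
import os

import pytest


def _parse_seed_spec(spec):
    """Expand a spec like "42", "40-42" or "all" into a list of seeds."""
    if spec == "all":
        return list(range(100))
    if "-" in spec:
        start, stop = spec.split("-")
        return list(range(int(start), int(stop) + 1))
    return [int(spec)]


_SEEDS = _parse_seed_spec(os.environ.get("SKLEARN_TESTS_GLOBAL_RANDOM_SEED", "42"))


@pytest.fixture(params=_SEEDS)
def global_random_seed(request):
    # Yield one integer seed per parametrized run.
    return request.param

Tests whose expected labels are hardcoded, such as test_extract_xi below, deliberately keep a fixed seed; the added comments in the diff state this explicitly.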


Merged
32 changes: 21 additions & 11 deletions sklearn/cluster/tests/test_optics.py
@@ -84,6 +84,8 @@ def test_the_extract_xi_labels(ordering, clusters, expected):
 def test_extract_xi(global_dtype):
     # small and easy test (no clusters around other clusters)
     # but with a clear noise data.
+    # global_random_seed is not used here since the expected labels
+    # are hardcoded for these specific data.
     rng = np.random.RandomState(0)
     n_points_per_cluster = 5

@@ -138,8 +140,8 @@ def test_extract_xi(global_dtype):
     assert_array_equal(clust.labels_, expected_labels)


-def test_cluster_hierarchy_(global_dtype):
-    rng = np.random.RandomState(0)
+def test_cluster_hierarchy(global_dtype, global_random_seed):
+    rng = np.random.RandomState(global_random_seed)
     n_points_per_cluster = 100
     C1 = [0, 0] + 2 * rng.randn(n_points_per_cluster, 2).astype(
         global_dtype, copy=False
@@ -148,12 +150,16 @@ def test_cluster_hierarchy_(global_dtype):
         global_dtype, copy=False
     )
     X = np.vstack((C1, C2))
-    X = shuffle(X, random_state=0)
+    X = shuffle(X, random_state=rng)

-    clusters = OPTICS(min_samples=20, xi=0.1).fit(X).cluster_hierarchy_
+    clusters = OPTICS(min_samples=20, xi=0.2).fit(X).cluster_hierarchy_
     assert clusters.shape == (2, 2)
-    diff = np.sum(clusters - np.array([[0, 99], [0, 199]]))
-    assert diff / len(X) < 0.05
+
+    # The first cluster should contain all points from C1 but due to how the data is
+    # generated, some points from C2 may end up in it.
+    assert 100 <= np.diff(clusters[0]) + 1 <= 115
+    # The second cluster should contain all points from C1 and C2.
+    assert np.diff(clusters[-1]) + 1 == 200


 @pytest.mark.parametrize(
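For context on the new assertions above: OPTICS.cluster_hierarchy_ stores one [start, end] pair of inclusive positions in the cluster ordering per cluster, so np.diff(row) + 1 is the number of samples that cluster spans. A tiny standalone illustration of the arithmetic (the [[0, 105], [0, 199]] values are made up, not taken from the test):

import numpy as np

# Hypothetical cluster_hierarchy_ output for 200 points (100 from C1, 100 from C2).
clusters = np.array([[0, 105], [0, 199]])

first_size = np.diff(clusters[0]) + 1   # 106: all of C1 plus a few strays from C2
root_size = np.diff(clusters[-1]) + 1   # 200: the root cluster covering C1 and C2

assert 100 <= first_size <= 115
assert root_size == 200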
@@ -785,10 +791,10 @@ def test_compare_to_ELKI():
     assert_allclose(clust1.core_distances_[index], clust2.core_distances_[index])


-def test_extract_dbscan(global_dtype):
+def test_extract_dbscan(global_dtype, global_random_seed):
     # testing an easy dbscan case. Not including clusters with different
     # densities.
-    rng = np.random.RandomState(0)
+    rng = np.random.RandomState(global_random_seed)
     n_points_per_cluster = 20
     C1 = [-5, -2] + 0.2 * rng.randn(n_points_per_cluster, 2)
     C2 = [4, -1] + 0.2 * rng.randn(n_points_per_cluster, 2)
@@ -797,7 +803,9 @@ def test_extract_dbscan(global_dtype):
     X = np.vstack((C1, C2, C3, C4)).astype(global_dtype, copy=False)

     clust = OPTICS(cluster_method="dbscan", eps=0.5).fit(X)
-    assert_array_equal(np.sort(np.unique(clust.labels_)), [0, 1, 2, 3])
+    assert_array_equal(
+        np.sort(np.unique(clust.labels_[clust.labels_ != -1])), [0, 1, 2, 3]
+    )


 @pytest.mark.parametrize("csr_container", [None] + CSR_CONTAINERS)
@@ -817,12 +825,14 @@ def test_precomputed_dists(global_dtype, csr_container):


 @pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
-def test_optics_input_not_modified_precomputed_sparse_nodiag(csr_container):
+def test_optics_input_not_modified_precomputed_sparse_nodiag(
+    csr_container, global_random_seed
+):
     """Check that we don't modify in-place the pre-computed sparse matrix.
     Non-regression test for:
     https://github.com/scikit-learn/scikit-learn/issues/27508
     """
-    X = np.random.RandomState(0).rand(6, 6)
+    X = np.random.RandomState(global_random_seed).rand(6, 6)
     # Add zeros on the diagonal that will be implicit when creating
     # the sparse matrix. If `X` is modified in-place, the zeros from
     # the diagonal will be made explicit.
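With these changes, the affected tests can be run against several seeds by setting the environment variable that scikit-learn's test configuration reads (SKLEARN_TESTS_GLOBAL_RANDOM_SEED; double-check the exact spelling against your checkout's conftest). A small, assumed helper script, not part of this PR:

import os
import subprocess

# Run the OPTICS test module once per seed in 0..4; adjust the range and the
# test path as needed. Each run parametrizes global_random_seed differently.
for seed in range(5):
    env = dict(os.environ, SKLEARN_TESTS_GLOBAL_RANDOM_SEED=str(seed))
    subprocess.run(
        ["pytest", "-q", "sklearn/cluster/tests/test_optics.py"],
        env=env,
        check=True,
    )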