Skip to content

TST Extend tests for scipy.sparse.*array in sklearn/cluster/tests/test_optics.py #27104

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
13 commits merged on Oct 30, 2023
Merged
7 changes: 4 additions & 3 deletions doc/whats_new/v1.4.rst
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,8 @@ and classes are impacted:

**Functions:**

- :func:`cluster.compute_optics_graph` in :pr:`27250` by
:user:`Yao Xiao <Charlie-XIAO>`;
- :func:`cluster.compute_optics_graph` in :pr:`27104` by
:user:`Maren Westermann <marenwestermann>` and in :pr:`27250` by :user:`Yao Xiao <Charlie-XIAO>`;
- :func:`cluster.kmeans_plusplus` in :pr:`27179` by :user:`Nurseit Kamchyev <Bncer>`;
- :func:`decomposition.non_negative_factorization` in :pr:`27100` by
:user:`Isaac Virshup <ivirshup>`;
Expand All @@ -114,7 +114,8 @@ and classes are impacted:
- :class:`cluster.HDBSCAN` in :pr:`27250` by :user:`Yao Xiao <Charlie-XIAO>`;
- :class:`cluster.KMeans` in :pr:`27179` by :user:`Nurseit Kamchyev <Bncer>`;
- :class:`cluster.MiniBatchKMeans` in :pr:`27179` by :user:`Nurseit Kamchyev <Bncer>`;
- :class:`cluster.OPTICS` in :pr:`27250` by :user:`Yao Xiao <Charlie-XIAO>`;
- :class:`cluster.OPTICS` in :pr:`27104` by
:user:`Maren Westermann <marenwestermann>` and in :pr:`27250` by :user:`Yao Xiao <Charlie-XIAO>`;
- :class:`decomposition.NMF` in :pr:`27100` by :user:`Isaac Virshup <ivirshup>`;
- :class:`decomposition.MiniBatchNMF` in :pr:`27100` by
:user:`Isaac Virshup <ivirshup>`;
Expand Down
8 changes: 4 additions & 4 deletions sklearn/cluster/_optics.py
Original file line number Diff line number Diff line change
Expand Up @@ -665,10 +665,10 @@ def _set_reach_dist(

# Only compute distances to unprocessed neighbors:
if metric == "precomputed":
dists = X[point_index, unproc]
if issparse(dists):
dists.sort_indices()
dists = dists.data
dists = X[[point_index], unproc]
if isinstance(dists, np.matrix):
dists = np.asarray(dists)
dists = dists.ravel()
else:
_params = dict() if metric_params is None else metric_params.copy()
if metric == "minkowski" and "p" not in _params:
Expand Down
30 changes: 16 additions & 14 deletions sklearn/cluster/tests/test_optics.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

import numpy as np
import pytest
from scipy import sparse

from sklearn.cluster import DBSCAN, OPTICS
from sklearn.cluster._optics import _extend_region, _extract_xi_labels
Expand All @@ -16,6 +15,7 @@
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.utils import shuffle
from sklearn.utils._testing import assert_allclose, assert_array_equal
from sklearn.utils.fixes import CSR_CONTAINERS

rng = np.random.RandomState(0)
n_points_per_cluster = 10
Expand Down Expand Up @@ -157,18 +157,18 @@ def test_cluster_hierarchy_(global_dtype):


@pytest.mark.parametrize(
"metric, is_sparse",
[["minkowski", False], ["euclidean", True]],
"csr_container, metric",
[(None, "minkowski")] + [(container, "euclidean") for container in CSR_CONTAINERS],
)
def test_correct_number_of_clusters(metric, is_sparse):
def test_correct_number_of_clusters(metric, csr_container):
# in 'auto' mode

n_clusters = 3
X = generate_clustered_data(n_clusters=n_clusters)
# Parameters chosen specifically for this task.
# Compute OPTICS
clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=0.1, metric=metric)
clust.fit(sparse.csr_matrix(X) if is_sparse else X)
clust.fit(csr_container(X) if csr_container is not None else X)
# number of clusters, ignoring noise if present
n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_)
assert n_clusters_1 == n_clusters
Expand Down Expand Up @@ -289,17 +289,18 @@ def test_close_extract():
@pytest.mark.parametrize("eps", [0.1, 0.3, 0.5])
@pytest.mark.parametrize("min_samples", [3, 10, 20])
@pytest.mark.parametrize(
"metric, is_sparse",
[["minkowski", False], ["euclidean", False], ["euclidean", True]],
"csr_container, metric",
[(None, "minkowski"), (None, "euclidean")]
+ [(container, "euclidean") for container in CSR_CONTAINERS],
)
def test_dbscan_optics_parity(eps, min_samples, metric, is_sparse, global_dtype):
def test_dbscan_optics_parity(eps, min_samples, metric, global_dtype, csr_container):
# Test that OPTICS clustering labels are <= 5% difference of DBSCAN

centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(
n_samples=150, centers=centers, cluster_std=0.4, random_state=0
)
X = sparse.csr_matrix(X) if is_sparse else X
X = csr_container(X) if csr_container is not None else X

X = X.astype(global_dtype, copy=False)

Expand Down Expand Up @@ -360,14 +361,15 @@ def test_min_cluster_size(min_cluster_size, global_dtype):
assert_array_equal(clust.labels_, clust_frac.labels_)


def test_min_cluster_size_invalid2():
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_min_cluster_size_invalid2(csr_container):
clust = OPTICS(min_cluster_size=len(X) + 1)
with pytest.raises(ValueError, match="must be no greater than the "):
clust.fit(X)

clust = OPTICS(min_cluster_size=len(X) + 1, metric="euclidean")
with pytest.raises(ValueError, match="must be no greater than the "):
clust.fit(sparse.csr_matrix(X))
clust.fit(csr_container(X))


def test_processing_order():
Expand Down Expand Up @@ -798,11 +800,11 @@ def test_extract_dbscan(global_dtype):
assert_array_equal(np.sort(np.unique(clust.labels_)), [0, 1, 2, 3])


@pytest.mark.parametrize("is_sparse", [False, True])
def test_precomputed_dists(is_sparse, global_dtype):
@pytest.mark.parametrize("csr_container", [None] + CSR_CONTAINERS)
def test_precomputed_dists(global_dtype, csr_container):
redX = X[::2].astype(global_dtype, copy=False)
dists = pairwise_distances(redX, metric="euclidean")
dists = sparse.csr_matrix(dists) if is_sparse else dists
dists = csr_container(dists) if csr_container is not None else dists
with warnings.catch_warnings():
warnings.simplefilter("ignore", EfficiencyWarning)
clust1 = OPTICS(min_samples=10, algorithm="brute", metric="precomputed").fit(
Expand Down