Skip to content

ENH Add sparse input support to OPTICS #22965

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 41 commits into from
May 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
5f7e6de
Try csr support.
huntzhan Aug 23, 2019
18425aa
Change the default metric of OPTICS to euclidean.
huntzhan Aug 23, 2019
f712b46
Retain the default metric minkowski.
huntzhan Aug 26, 2019
876bb29
Undo tests.
huntzhan Aug 26, 2019
71b5c1e
Fix flake8.
huntzhan Aug 26, 2019
6f498a9
Add sparse tests.
huntzhan Aug 26, 2019
7cddb87
Merged huntzhan
yskybloue Aug 17, 2021
0f832d4
Change assert_raise_message to pytest.raises
yskybloue Aug 17, 2021
b237b05
Parametrized tests
yskybloue Aug 18, 2021
a003010
Fix flake8 test_optics.py
yskybloue Aug 18, 2021
2188e5b
Fix flake8 _optics.py
yskybloue Aug 18, 2021
f8a43e4
Add sparse precomputed test case
yskybloue Aug 21, 2021
52fdf93
Black test_optics.py
yskybloue Aug 21, 2021
9dceabd
Merge remote-tracking branch 'upstream/main' into add-optics-sparse
yskybloue Aug 21, 2021
8992c87
Black _optics.py
yskybloue Aug 21, 2021
91fd5ed
Add sparse matrix support to _optics.py
yskybloue Aug 21, 2021
80c2a73
Merge remote-tracking branch 'origin/add-optics-sparse' into add-opti…
yskybloue Aug 21, 2021
53474d2
Added changelog entry
yskybloue Aug 21, 2021
611cf77
Merge branch 'main' into add-optics-sparse
yskybloue Sep 6, 2021
2d4a4d3
Merge branch 'main' into sparse_optics
Micky774 Mar 27, 2022
f15c019
Removed extra changelog entry
Micky774 Mar 27, 2022
011ab6d
Updated changelog entry
Micky774 Mar 27, 2022
367b143
Update changelog
Micky774 Apr 1, 2022
d3ffd52
Streamlined tests
Micky774 Apr 1, 2022
0123693
Added disclaimer on sparse matrix support for metrics
Micky774 Apr 1, 2022
ca0529e
Removed sparse parametrization for algorithm edge-case test
Micky774 Apr 1, 2022
7460308
Merge branch 'main' into sparse_optics
Micky774 Apr 1, 2022
7980f41
Remove unused metric argument
Micky774 Apr 1, 2022
c9e699e
Cleanup to minimize diff
Micky774 Apr 1, 2022
b8b4c11
Merge branch 'main' into sparse_optics
Micky774 Apr 1, 2022
8e4848d
Merge branch 'main' into sparse_optics
Micky774 Apr 2, 2022
bfe75d2
Merge branch 'main' into sparse_optics
Micky774 Apr 7, 2022
cf206bf
Update sklearn/cluster/_optics.py
Micky774 Apr 16, 2022
7c6a6c7
Merge branch 'main' into sparse_optics
Micky774 Apr 16, 2022
4a5ca67
Added explicit dense cast before evaluating maximal distances.
Micky774 Apr 16, 2022
f39ab75
Merge branch 'main' into sparse_optics
Micky774 Apr 22, 2022
54fb89f
Merge branch 'main' into sparse_optics
Micky774 Apr 29, 2022
4d03c54
Surpressed unnecessary warnings
Micky774 Apr 29, 2022
2ae23a9
Merge branch 'main' into sparse_optics
Micky774 Apr 30, 2022
27f3edd
Update sklearn/cluster/_optics.py
Micky774 May 1, 2022
c22d638
fix position in changelog
jeremiedbb May 2, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions doc/whats_new/v1.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,14 @@ Changelog
:pr:`123456` by :user:`Joe Bloggs <joeongithub>`.
where 123456 is the *pull request* number, not the issue number.

:mod:`sklearn.cluster`
......................

- |Enhancement| The `predict` and `fit_predict` methods of :class:`cluster.OPTICS` now
accept sparse data type for input data. :pr:`14736` by :user:`Hunt Zhan <huntzhan>`,
:pr:`20802` by :user:`Brandon Pokorny <Clickedbigfoot>`,
and :pr:`22965` by :user:`Meekail Zain <micky774>`.

Code and Documentation Contributors
-----------------------------------

Expand Down
22 changes: 16 additions & 6 deletions sklearn/cluster/_optics.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from ..neighbors import NearestNeighbors
from ..base import BaseEstimator, ClusterMixin
from ..metrics import pairwise_distances
from scipy.sparse import issparse, SparseEfficiencyWarning


class OPTICS(ClusterMixin, BaseEstimator):
Expand Down Expand Up @@ -81,6 +82,7 @@ class OPTICS(ClusterMixin, BaseEstimator):
'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
'yule']

Sparse matrices are only supported by scikit-learn metrics.
See the documentation for scipy.spatial.distance for details on these
metrics.

Expand Down Expand Up @@ -263,10 +265,11 @@ def fit(self, X, y=None):

Parameters
----------
X : ndarray of shape (n_samples, n_features), or \
X : {ndarray, sparse matrix} of shape (n_samples, n_features), or \
(n_samples, n_samples) if metric=’precomputed’
A feature array, or array of distances between samples if
metric='precomputed'.
metric='precomputed'. If a sparse matrix is provided, it will be
converted into CSR format.

y : Ignored
Not used, present for API consistency by convention.
Expand All @@ -285,7 +288,13 @@ def fit(self, X, y=None):
)
warnings.warn(msg, DataConversionWarning)

X = self._validate_data(X, dtype=dtype)
X = self._validate_data(X, dtype=dtype, accept_sparse="csr")
if self.metric == "precomputed" and issparse(X):
with warnings.catch_warnings():
warnings.simplefilter("ignore", SparseEfficiencyWarning)
# Set each diagonal to an explicit value so each point is its
# own neighbor
X.setdiag(X.diagonal())
memory = check_memory(self.memory)

if self.cluster_method not in ["dbscan", "xi"]:
Expand Down Expand Up @@ -603,15 +612,16 @@ def _set_reach_dist(
# Only compute distances to unprocessed neighbors:
if metric == "precomputed":
dists = X[point_index, unproc]
if issparse(dists):
dists.sort_indices()
dists = dists.data
Comment on lines +616 to +617
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jeremiedbb This is a change from your previous approval. Are you okay with this?

else:
_params = dict() if metric_params is None else metric_params.copy()
if metric == "minkowski" and "p" not in _params:
# the same logic as neighbors, p is ignored if explicitly set
# in the dict params
_params["p"] = p
dists = pairwise_distances(
P, np.take(X, unproc, axis=0), metric=metric, n_jobs=None, **_params
).ravel()
dists = pairwise_distances(P, X[unproc], metric, n_jobs=None, **_params).ravel()

rdists = np.maximum(dists, core_distances_[point_index])
np.around(rdists, decimals=np.finfo(rdists.dtype).precision, out=rdists)
Expand Down
46 changes: 37 additions & 9 deletions sklearn/cluster/tests/test_optics.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# License: BSD 3 clause
import numpy as np
import pytest
from scipy import sparse
import warnings

from sklearn.datasets import make_blobs
Expand All @@ -15,7 +16,7 @@
from sklearn.utils import shuffle
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_allclose

from sklearn.exceptions import EfficiencyWarning
from sklearn.cluster.tests.common import generate_clustered_data


Expand Down Expand Up @@ -158,15 +159,19 @@ def test_cluster_hierarchy_(global_dtype):
assert diff / len(X) < 0.05


def test_correct_number_of_clusters():
@pytest.mark.parametrize(
"metric, is_sparse",
[["minkowski", False], ["euclidean", True]],
)
def test_correct_number_of_clusters(metric, is_sparse):
# in 'auto' mode

n_clusters = 3
X = generate_clustered_data(n_clusters=n_clusters)
# Parameters chosen specifically for this task.
# Compute OPTICS
clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=0.1)
clust.fit(X)
clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=0.1, metric=metric)
clust.fit(sparse.csr_matrix(X) if is_sparse else X)
# number of clusters, ignoring noise if present
n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_)
assert n_clusters_1 == n_clusters
Expand Down Expand Up @@ -286,18 +291,25 @@ def test_close_extract():

@pytest.mark.parametrize("eps", [0.1, 0.3, 0.5])
@pytest.mark.parametrize("min_samples", [3, 10, 20])
def test_dbscan_optics_parity(eps, min_samples, global_dtype):
@pytest.mark.parametrize(
"metric, is_sparse",
[["minkowski", False], ["euclidean", False], ["euclidean", True]],
)
def test_dbscan_optics_parity(eps, min_samples, metric, is_sparse, global_dtype):
# Test that OPTICS clustering labels are <= 5% difference of DBSCAN

centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(
n_samples=750, centers=centers, cluster_std=0.4, random_state=0
)
X = sparse.csr_matrix(X) if is_sparse else X

X = X.astype(global_dtype, copy=False)

# calculate optics with dbscan extract at 0.3 epsilon
op = OPTICS(min_samples=min_samples, cluster_method="dbscan", eps=eps).fit(X)
op = OPTICS(
min_samples=min_samples, cluster_method="dbscan", eps=eps, metric=metric
).fit(X)

# calculate dbscan labels
db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
Expand Down Expand Up @@ -344,7 +356,8 @@ def test_min_cluster_size(min_cluster_size, global_dtype):
assert min(cluster_sizes) >= min_cluster_size
# check behaviour is the same when min_cluster_size is a fraction
clust_frac = OPTICS(
min_samples=9, min_cluster_size=min_cluster_size / redX.shape[0]
min_samples=9,
min_cluster_size=min_cluster_size / redX.shape[0],
)
clust_frac.fit(redX)
assert_array_equal(clust.labels_, clust_frac.labels_)
Expand All @@ -356,17 +369,26 @@ def test_min_cluster_size_invalid(min_cluster_size):
with pytest.raises(ValueError, match="must be a positive integer or a "):
clust.fit(X)

clust = OPTICS(min_cluster_size=min_cluster_size, metric="euclidean")
with pytest.raises(ValueError, match="must be a positive integer or a "):
clust.fit(sparse.csr_matrix(X))


def test_min_cluster_size_invalid2():
clust = OPTICS(min_cluster_size=len(X) + 1)
with pytest.raises(ValueError, match="must be no greater than the "):
clust.fit(X)

clust = OPTICS(min_cluster_size=len(X) + 1, metric="euclidean")
with pytest.raises(ValueError, match="must be no greater than the "):
clust.fit(sparse.csr_matrix(X))


def test_processing_order():
# Ensure that we consider all unprocessed points,
# not only direct neighbors. when picking the next point.
Y = [[0], [10], [-10], [25]]

clust = OPTICS(min_samples=3, max_eps=15).fit(Y)
assert_array_equal(clust.reachability_, [np.inf, 10, 10, 15])
assert_array_equal(clust.core_distances_, [10, 15, np.inf, np.inf])
Expand Down Expand Up @@ -796,10 +818,16 @@ def test_extract_dbscan(global_dtype):
assert_array_equal(np.sort(np.unique(clust.labels_)), [0, 1, 2, 3])


def test_precomputed_dists(global_dtype):
@pytest.mark.parametrize("is_sparse", [False, True])
def test_precomputed_dists(is_sparse, global_dtype):
redX = X[::2].astype(global_dtype, copy=False)
dists = pairwise_distances(redX, metric="euclidean")
clust1 = OPTICS(min_samples=10, algorithm="brute", metric="precomputed").fit(dists)
dists = sparse.csr_matrix(dists) if is_sparse else dists
with warnings.catch_warnings():
warnings.simplefilter("ignore", EfficiencyWarning)
clust1 = OPTICS(min_samples=10, algorithm="brute", metric="precomputed").fit(
dists
)
clust2 = OPTICS(min_samples=10, algorithm="brute", metric="euclidean").fit(redX)

assert_allclose(clust1.reachability_, clust2.reachability_)
Expand Down