From 5f7e6de69e425cfdfeeb82cf692b81fe22589dd5 Mon Sep 17 00:00:00 2001
From: huntzhan <huntzhan.dev@gmail.com>
Date: Fri, 23 Aug 2019 11:53:31 +0800
Subject: [PATCH 01/15] Try csr support.

---
 sklearn/cluster/optics_.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py
index ecf5fa6a2bcc0..732c7cfb352fc 100755
--- a/sklearn/cluster/optics_.py
+++ b/sklearn/cluster/optics_.py
@@ -233,7 +233,7 @@ def fit(self, X, y=None):
         self : instance of OPTICS
             The instance.
         """
-        X = check_array(X, dtype=np.float)
+        X = check_array(X, accept_sparse='csr')
 
         if self.cluster_method not in ['dbscan', 'xi']:
             raise ValueError("cluster_method should be one of"

From 18425aaffc785024d8baaa2bb31e4fd52427eed8 Mon Sep 17 00:00:00 2001
From: huntzhan <huntzhan.dev@gmail.com>
Date: Fri, 23 Aug 2019 16:15:39 +0800
Subject: [PATCH 02/15] Change the default metric of OPTICS to euclidean.

---
 sklearn/cluster/optics_.py           | 4 ++--
 sklearn/cluster/tests/test_optics.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py
index 732c7cfb352fc..26734da87d778 100755
--- a/sklearn/cluster/optics_.py
+++ b/sklearn/cluster/optics_.py
@@ -194,7 +194,7 @@ class OPTICS(BaseEstimator, ClusterMixin):
        the Conference "Lernen, Wissen, Daten, Analysen" (LWDA) (2018): 318-329.
     """
 
-    def __init__(self, min_samples=5, max_eps=np.inf, metric='minkowski', p=2,
+    def __init__(self, min_samples=5, max_eps=np.inf, metric='euclidean', p=2,
                  metric_params=None, cluster_method='xi', eps=None, xi=0.05,
                  predecessor_correction=True, min_cluster_size=None,
                  algorithm='auto', leaf_size=30, n_jobs=None):
@@ -517,7 +517,7 @@ def _set_reach_dist(core_distances_, reachability_, predecessor_,
             # the same logic as neighbors, p is ignored if explicitly set
             # in the dict params
             _params['p'] = p
-        dists = pairwise_distances(P, np.take(X, unproc, axis=0),
+        dists = pairwise_distances(P, X[unproc],
                                    metric, n_jobs=None,
                                    **_params).ravel()
 
diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py
index 5ae8b3f898fcf..b7e46fe18e921 100644
--- a/sklearn/cluster/tests/test_optics.py
+++ b/sklearn/cluster/tests/test_optics.py
@@ -352,7 +352,7 @@ def test_compare_to_ELKI():
     # Tests against known extraction array
     # Does NOT work with metric='euclidean', because sklearn euclidean has
     # worse numeric precision. 'minkowski' is slower but more accurate.
-    clust1 = OPTICS(min_samples=5).fit(X)
+    clust1 = OPTICS(metric='minkowski', min_samples=5).fit(X)
 
     assert_array_equal(clust1.ordering_, np.array(o1))
     assert_array_equal(clust1.predecessor_[clust1.ordering_], np.array(p1))
@@ -386,7 +386,7 @@ def test_compare_to_ELKI():
           11, 19, 15, 10, 47, -1, 20, 22, 25, 25, 25, 25, 22, 22, 23, -1, 30,
           30, 34, 34, 34, 32, 32, 37, 38, -1, -1, -1, -1, -1, -1, -1, -1, -1,
           -1, -1, -1, -1, -1, -1, -1, -1, -1]
-    clust2 = OPTICS(min_samples=5, max_eps=0.5).fit(X)
+    clust2 = OPTICS(metric='minkowski', min_samples=5, max_eps=0.5).fit(X)
 
     assert_array_equal(clust2.ordering_, np.array(o2))
     assert_array_equal(clust2.predecessor_[clust2.ordering_], np.array(p2))

From f712b46baa1ad3ca25d52f85f01162e4e613f1a7 Mon Sep 17 00:00:00 2001
From: huntzhan <huntzhan.dev@gmail.com>
Date: Mon, 26 Aug 2019 11:14:49 +0800
Subject: [PATCH 03/15] Retain the default metric minkowski.

---
 sklearn/cluster/optics_.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py
index 26734da87d778..d3faed82d8650 100755
--- a/sklearn/cluster/optics_.py
+++ b/sklearn/cluster/optics_.py
@@ -19,6 +19,7 @@
 from ..neighbors import NearestNeighbors
 from ..base import BaseEstimator, ClusterMixin
 from ..metrics import pairwise_distances
+from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS
 
 
 class OPTICS(BaseEstimator, ClusterMixin):
@@ -194,7 +195,7 @@ class OPTICS(BaseEstimator, ClusterMixin):
        the Conference "Lernen, Wissen, Daten, Analysen" (LWDA) (2018): 318-329.
     """
 
-    def __init__(self, min_samples=5, max_eps=np.inf, metric='euclidean', p=2,
+    def __init__(self, min_samples=5, max_eps=np.inf, metric='minkowski', p=2,
                  metric_params=None, cluster_method='xi', eps=None, xi=0.05,
                  predecessor_correction=True, min_cluster_size=None,
                  algorithm='auto', leaf_size=30, n_jobs=None):
@@ -222,7 +223,8 @@ def fit(self, X, y=None):
         Parameters
         ----------
         X : array, shape (n_samples, n_features), or (n_samples, n_samples)  \
-if metric=’precomputed’.
+if metric=’precomputed’, or sparse matrix  \
+            if metric in ['cityblock', 'cosine', 'euclidean', 'haversine', 'l2', 'l1', 'manhattan'].
             A feature array, or array of distances between samples if
             metric='precomputed'.
 
@@ -233,7 +235,10 @@ def fit(self, X, y=None):
         self : instance of OPTICS
             The instance.
         """
-        X = check_array(X, accept_sparse='csr')
+        if self.metric in PAIRWISE_DISTANCE_FUNCTIONS:
+            X = check_array(X, accept_sparse='csr')
+        else:
+            X = check_array(X)
 
         if self.cluster_method not in ['dbscan', 'xi']:
             raise ValueError("cluster_method should be one of"

From 876bb2935599e37d5c827a83fbcee12829101e6d Mon Sep 17 00:00:00 2001
From: huntzhan <huntzhan.dev@gmail.com>
Date: Mon, 26 Aug 2019 11:17:14 +0800
Subject: [PATCH 04/15] Undo tests.

---
 sklearn/cluster/tests/test_optics.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py
index b7e46fe18e921..5ae8b3f898fcf 100644
--- a/sklearn/cluster/tests/test_optics.py
+++ b/sklearn/cluster/tests/test_optics.py
@@ -352,7 +352,7 @@ def test_compare_to_ELKI():
     # Tests against known extraction array
     # Does NOT work with metric='euclidean', because sklearn euclidean has
     # worse numeric precision. 'minkowski' is slower but more accurate.
-    clust1 = OPTICS(metric='minkowski', min_samples=5).fit(X)
+    clust1 = OPTICS(min_samples=5).fit(X)
 
     assert_array_equal(clust1.ordering_, np.array(o1))
     assert_array_equal(clust1.predecessor_[clust1.ordering_], np.array(p1))
@@ -386,7 +386,7 @@ def test_compare_to_ELKI():
           11, 19, 15, 10, 47, -1, 20, 22, 25, 25, 25, 25, 22, 22, 23, -1, 30,
           30, 34, 34, 34, 32, 32, 37, 38, -1, -1, -1, -1, -1, -1, -1, -1, -1,
           -1, -1, -1, -1, -1, -1, -1, -1, -1]
-    clust2 = OPTICS(metric='minkowski', min_samples=5, max_eps=0.5).fit(X)
+    clust2 = OPTICS(min_samples=5, max_eps=0.5).fit(X)
 
     assert_array_equal(clust2.ordering_, np.array(o2))
     assert_array_equal(clust2.predecessor_[clust2.ordering_], np.array(p2))

From 71b5c1e88da2424b2289779c29c20c78040ee861 Mon Sep 17 00:00:00 2001
From: huntzhan <huntzhan.dev@gmail.com>
Date: Mon, 26 Aug 2019 11:24:18 +0800
Subject: [PATCH 05/15] Fix flake8.

---
 sklearn/cluster/optics_.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py
index d3faed82d8650..3864750901509 100755
--- a/sklearn/cluster/optics_.py
+++ b/sklearn/cluster/optics_.py
@@ -223,8 +223,9 @@ def fit(self, X, y=None):
         Parameters
         ----------
         X : array, shape (n_samples, n_features), or (n_samples, n_samples)  \
-if metric=’precomputed’, or sparse matrix  \
-            if metric in ['cityblock', 'cosine', 'euclidean', 'haversine', 'l2', 'l1', 'manhattan'].
+if metric=’precomputed’, or sparse matrix (n_samples, n_features) if metric
+            in ['cityblock', 'cosine', 'euclidean', 'haversine', 'l2', 'l1',
+            'manhattan'].
             A feature array, or array of distances between samples if
             metric='precomputed'.
 

From 6f498a9206f8e0b0a7422bb7eeb571918d4f7dd3 Mon Sep 17 00:00:00 2001
From: huntzhan <huntzhan.dev@gmail.com>
Date: Mon, 26 Aug 2019 12:40:06 +0800
Subject: [PATCH 06/15] Add sparse tests.

---
 sklearn/cluster/optics_.py           |   4 +-
 sklearn/cluster/tests/test_optics.py | 197 ++++++++++++++++++++-------
 2 files changed, 151 insertions(+), 50 deletions(-)

diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py
index 3864750901509..be417b98cb7ab 100755
--- a/sklearn/cluster/optics_.py
+++ b/sklearn/cluster/optics_.py
@@ -236,7 +236,9 @@ def fit(self, X, y=None):
         self : instance of OPTICS
             The instance.
         """
-        if self.metric in PAIRWISE_DISTANCE_FUNCTIONS:
+        # TODO: Support the sparse input for metric = 'precomputed'.
+        if self.metric != 'precomputed' \
+                and self.metric in PAIRWISE_DISTANCE_FUNCTIONS:
             X = check_array(X, accept_sparse='csr')
         else:
             X = check_array(X)
diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py
index 5ae8b3f898fcf..f251978a2eab4 100644
--- a/sklearn/cluster/tests/test_optics.py
+++ b/sklearn/cluster/tests/test_optics.py
@@ -5,6 +5,8 @@
 import numpy as np
 import pytest
 
+from scipy import sparse
+
 from sklearn.datasets.samples_generator import make_blobs
 from sklearn.cluster.optics_ import (OPTICS,
                                      _extend_region,
@@ -101,12 +103,22 @@ def test_extract_xi():
                    xi=0.4).fit(X)
     assert_array_equal(clust.labels_, expected_labels)
 
+    clust = OPTICS(min_samples=3, min_cluster_size=2,
+                   max_eps=20, cluster_method='xi',
+                   xi=0.4, metric='euclidean').fit(sparse.lil_matrix(X))
+    assert_array_equal(clust.labels_, expected_labels)
+
     # check float min_samples and min_cluster_size
     clust = OPTICS(min_samples=0.1, min_cluster_size=0.08,
                    max_eps=20, cluster_method='xi',
                    xi=0.4).fit(X)
     assert_array_equal(clust.labels_, expected_labels)
 
+    clust = OPTICS(min_samples=0.1, min_cluster_size=0.08,
+                   max_eps=20, cluster_method='xi',
+                   xi=0.4, metric='euclidean').fit(sparse.lil_matrix(X))
+    assert_array_equal(clust.labels_, expected_labels)
+
     X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]] * 2), C6))
     expected_labels = np.r_[[1] * 5, [3] * 5, [2] * 5, [0] * 5, [2] * 5,
                             -1, -1, [4] * 5]
@@ -118,6 +130,11 @@ def test_extract_xi():
     # this may fail if the predecessor correction is not at work!
     assert_array_equal(clust.labels_, expected_labels)
 
+    clust = OPTICS(min_samples=3, min_cluster_size=3,
+                   max_eps=20, cluster_method='xi',
+                   xi=0.3, metric='euclidean').fit(sparse.lil_matrix(X))
+    assert_array_equal(clust.labels_, expected_labels)
+
     C1 = [[0, 0], [0, 0.1], [0, -.1], [0.1, 0]]
     C2 = [[10, 10], [10, 9], [10, 11], [9, 10]]
     C3 = [[100, 100], [100, 90], [100, 110], [90, 100]]
@@ -130,6 +147,11 @@ def test_extract_xi():
                    xi=0.04).fit(X)
     assert_array_equal(clust.labels_, expected_labels)
 
+    clust = OPTICS(min_samples=2, min_cluster_size=2,
+                   max_eps=np.inf, cluster_method='xi',
+                   xi=0.04, metric='euclidean').fit(sparse.lil_matrix(X))
+    assert_array_equal(clust.labels_, expected_labels)
+
 
 def test_cluster_hierarchy_():
     rng = np.random.RandomState(0)
@@ -144,33 +166,42 @@ def test_cluster_hierarchy_():
     diff = np.sum(clusters - np.array([[0, 99], [0, 199]]))
     assert diff / len(X) < 0.05
 
+    clust = OPTICS(min_samples=20, xi=.1,
+                   metric='euclidean').fit(sparse.lil_matrix(X))
+    clusters = clust.cluster_hierarchy_
+    assert clusters.shape == (2, 2)
+    diff = np.sum(clusters - np.array([[0, 99], [0, 199]]))
+    assert diff / len(X) < 0.05
+
 
 def test_correct_number_of_clusters():
     # in 'auto' mode
 
     n_clusters = 3
     X = generate_clustered_data(n_clusters=n_clusters)
-    # Parameters chosen specifically for this task.
-    # Compute OPTICS
-    clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=.1)
-    clust.fit(X)
-    # number of clusters, ignoring noise if present
-    n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_)
-    assert n_clusters_1 == n_clusters
 
-    # check attribute types and sizes
-    assert clust.labels_.shape == (len(X),)
-    assert clust.labels_.dtype.kind == 'i'
+    for metric in ['minkowski', 'euclidean']:
+        # Parameters chosen specifically for this task.
+        # Compute OPTICS
+        clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=.1, metric=metric)
+        clust.fit(X if metric == 'minkowski' else sparse.lil_matrix(X))
+        # number of clusters, ignoring noise if present
+        n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_)
+        assert n_clusters_1 == n_clusters
 
-    assert clust.reachability_.shape == (len(X),)
-    assert clust.reachability_.dtype.kind == 'f'
+        # check attribute types and sizes
+        assert clust.labels_.shape == (len(X),)
+        assert clust.labels_.dtype.kind == 'i'
 
-    assert clust.core_distances_.shape == (len(X),)
-    assert clust.core_distances_.dtype.kind == 'f'
+        assert clust.reachability_.shape == (len(X),)
+        assert clust.reachability_.dtype.kind == 'f'
 
-    assert clust.ordering_.shape == (len(X),)
-    assert clust.ordering_.dtype.kind == 'i'
-    assert set(clust.ordering_) == set(range(len(X)))
+        assert clust.core_distances_.shape == (len(X),)
+        assert clust.core_distances_.dtype.kind == 'f'
+
+        assert clust.ordering_.shape == (len(X),)
+        assert clust.ordering_.dtype.kind == 'i'
+        assert set(clust.ordering_) == set(range(len(X)))
 
 
 def test_minimum_number_of_sample_check():
@@ -184,6 +215,14 @@ def test_minimum_number_of_sample_check():
     # Run the fit
     assert_raise_message(ValueError, msg, clust.fit, X)
 
+    # Compute OPTICS
+    X = sparse.lil_matrix([[1, 1]])
+    clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1,
+                   metric='euclidean')
+
+    # Run the fit
+    assert_raise_message(ValueError, msg, clust.fit, X)
+
 
 def test_bad_extract():
     # Test an extraction of eps too close to original eps
@@ -198,6 +237,13 @@ def test_bad_extract():
                    eps=0.3, min_samples=10)
     assert_raise_message(ValueError, msg, clust.fit, X)
 
+    # Compute OPTICS
+    clust = OPTICS(max_eps=5.0 * 0.03,
+                   cluster_method='dbscan',
+                   eps=0.3, min_samples=10,
+                   metric='euclidean')
+    assert_raise_message(ValueError, msg, clust.fit, sparse.lil_matrix(X))
+
 
 def test_bad_reachability():
     msg = "All reachability values are inf. Set a larger max_eps."
@@ -209,6 +255,11 @@ def test_bad_reachability():
         clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015)
         clust.fit(X)
 
+    with pytest.warns(UserWarning, match=msg):
+        clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015,
+                       metric='euclidean')
+        clust.fit(sparse.lil_matrix(X))
+
 
 def test_close_extract():
     # Test extract where extraction eps is close to scaled max_eps
@@ -223,32 +274,43 @@ def test_close_extract():
     # Cluster ordering starts at 0; max cluster label = 2 is 3 clusters
     assert max(clust.labels_) == 2
 
+    # Compute OPTICS
+    clust = OPTICS(max_eps=1.0, cluster_method='dbscan',
+                   eps=0.3, min_samples=10,
+                   metric='euclidean').fit(sparse.lil_matrix(X))
+    # Cluster ordering starts at 0; max cluster label = 2 is 3 clusters
+    assert max(clust.labels_) == 2
+
 
 @pytest.mark.parametrize('eps', [0.1, .3, .5])
 @pytest.mark.parametrize('min_samples', [3, 10, 20])
 def test_dbscan_optics_parity(eps, min_samples):
     # Test that OPTICS clustering labels are <= 5% difference of DBSCAN
 
-    centers = [[1, 1], [-1, -1], [1, -1]]
-    X, labels_true = make_blobs(n_samples=750, centers=centers,
-                                cluster_std=0.4, random_state=0)
+    for metric in ['minkowski', 'euclidean']:
 
-    # calculate optics with dbscan extract at 0.3 epsilon
-    op = OPTICS(min_samples=min_samples, cluster_method='dbscan',
-                eps=eps).fit(X)
+        centers = [[1, 1], [-1, -1], [1, -1]]
+        _X, labels_true = make_blobs(n_samples=750, centers=centers,
+                                     cluster_std=0.4, random_state=0)
+        X = _X if metric == 'minkowski' else sparse.lil_matrix(_X)
 
-    # calculate dbscan labels
-    db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
+        # calculate optics with dbscan extract at 0.3 epsilon
+        op = OPTICS(min_samples=min_samples, cluster_method='dbscan',
+                    eps=eps,
+                    metric=metric).fit(X)
 
-    contingency = contingency_matrix(db.labels_, op.labels_)
-    agree = min(np.sum(np.max(contingency, axis=0)),
-                np.sum(np.max(contingency, axis=1)))
-    disagree = X.shape[0] - agree
+        # calculate dbscan labels
+        db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
 
-    percent_mismatch = np.round((disagree - 1) / X.shape[0], 2)
+        contingency = contingency_matrix(db.labels_, op.labels_)
+        agree = min(np.sum(np.max(contingency, axis=0)),
+                    np.sum(np.max(contingency, axis=1)))
+        disagree = X.shape[0] - agree
 
-    # verify label mismatch is <= 5% labels
-    assert percent_mismatch <= 0.05
+        percent_mismatch = np.round((disagree - 1) / X.shape[0], 2)
+
+        # verify label mismatch is <= 5% labels
+        assert percent_mismatch <= 0.05
 
 
 def test_min_samples_edge_case():
@@ -263,12 +325,24 @@ def test_min_samples_edge_case():
                    xi=0.04).fit(X)
     assert_array_equal(clust.labels_, expected_labels)
 
+    clust = OPTICS(min_samples=3,
+                   max_eps=7, cluster_method='xi',
+                   xi=0.04,
+                   metric='euclidean').fit(sparse.lil_matrix(X))
+    assert_array_equal(clust.labels_, expected_labels)
+
     expected_labels = np.r_[[0] * 3, [1] * 3, [-1] * 3]
     clust = OPTICS(min_samples=3,
                    max_eps=3, cluster_method='xi',
                    xi=0.04).fit(X)
     assert_array_equal(clust.labels_, expected_labels)
 
+    clust = OPTICS(min_samples=3,
+                   max_eps=3, cluster_method='xi',
+                   xi=0.04,
+                   metric='euclidean').fit(sparse.lil_matrix(X))
+    assert_array_equal(clust.labels_, expected_labels)
+
     expected_labels = np.r_[[-1] * 9]
     with pytest.warns(UserWarning, match="All reachability values"):
         clust = OPTICS(min_samples=4,
@@ -276,20 +350,33 @@ def test_min_samples_edge_case():
                        xi=0.04).fit(X)
         assert_array_equal(clust.labels_, expected_labels)
 
+    with pytest.warns(UserWarning, match="All reachability values"):
+        clust = OPTICS(min_samples=4,
+                       max_eps=3, cluster_method='xi',
+                       xi=0.04,
+                       metric='euclidean').fit(sparse.lil_matrix(X))
+        assert_array_equal(clust.labels_, expected_labels)
+
 
 # try arbitrary minimum sizes
 @pytest.mark.parametrize('min_cluster_size', range(2, X.shape[0] // 10, 23))
 def test_min_cluster_size(min_cluster_size):
-    redX = X[::2]  # reduce for speed
-    clust = OPTICS(min_samples=9, min_cluster_size=min_cluster_size).fit(redX)
-    cluster_sizes = np.bincount(clust.labels_[clust.labels_ != -1])
-    if cluster_sizes.size:
-        assert min(cluster_sizes) >= min_cluster_size
-    # check behaviour is the same when min_cluster_size is a fraction
-    clust_frac = OPTICS(min_samples=9,
-                        min_cluster_size=min_cluster_size / redX.shape[0])
-    clust_frac.fit(redX)
-    assert_array_equal(clust.labels_, clust_frac.labels_)
+    _redX = X[::2]  # reduce for speed
+
+    for metric in ['minkowski', 'euclidean']:
+        redX = _redX if metric == 'minkowski' else sparse.lil_matrix(_redX)
+
+        clust = OPTICS(min_samples=9, min_cluster_size=min_cluster_size,
+                       metric=metric).fit(redX)
+        cluster_sizes = np.bincount(clust.labels_[clust.labels_ != -1])
+        if cluster_sizes.size:
+            assert min(cluster_sizes) >= min_cluster_size
+        # check behaviour is the same when min_cluster_size is a fraction
+        clust_frac = OPTICS(min_samples=9,
+                            min_cluster_size=min_cluster_size / redX.shape[0],
+                            metric=metric)
+        clust_frac.fit(redX)
+        assert_array_equal(clust.labels_, clust_frac.labels_)
 
 
 @pytest.mark.parametrize('min_cluster_size', [0, -1, 1.1, 2.2])
@@ -298,21 +385,33 @@ def test_min_cluster_size_invalid(min_cluster_size):
     with pytest.raises(ValueError, match="must be a positive integer or a "):
         clust.fit(X)
 
+    clust = OPTICS(min_cluster_size=min_cluster_size, metric='euclidean')
+    with pytest.raises(ValueError, match="must be a positive integer or a "):
+        clust.fit(sparse.lil_matrix(X))
+
 
 def test_min_cluster_size_invalid2():
     clust = OPTICS(min_cluster_size=len(X) + 1)
     with pytest.raises(ValueError, match="must be no greater than the "):
         clust.fit(X)
 
+    clust = OPTICS(min_cluster_size=len(X) + 1, metric='euclidean')
+    with pytest.raises(ValueError, match="must be no greater than the "):
+        clust.fit(sparse.lil_matrix(X))
+
 
 def test_processing_order():
-    # Ensure that we consider all unprocessed points,
-    # not only direct neighbors. when picking the next point.
-    Y = [[0], [10], [-10], [25]]
-    clust = OPTICS(min_samples=3, max_eps=15).fit(Y)
-    assert_array_equal(clust.reachability_, [np.inf, 10, 10, 15])
-    assert_array_equal(clust.core_distances_, [10, 15, np.inf, np.inf])
-    assert_array_equal(clust.ordering_, [0, 1, 2, 3])
+    for metric in ['minkowski', 'euclidean']:
+
+        # Ensure that we consider all unprocessed points,
+        # not only direct neighbors. when picking the next point.
+        _Y = [[0], [10], [-10], [25]]
+        Y = _Y if metric == 'minkowski' else sparse.lil_matrix(_Y)
+
+        clust = OPTICS(min_samples=3, max_eps=15, metric=metric).fit(Y)
+        assert_array_equal(clust.reachability_, [np.inf, 10, 10, 15])
+        assert_array_equal(clust.core_distances_, [10, 15, np.inf, np.inf])
+        assert_array_equal(clust.ordering_, [0, 1, 2, 3])
 
 
 def test_compare_to_ELKI():

From 0f832d4ffdf3a161f118ea865d6bb6dc8bba3eaa Mon Sep 17 00:00:00 2001
From: Clickedbigfoot <clickedbigfoot@gmail.com>
Date: Tue, 17 Aug 2021 17:45:02 -0500
Subject: [PATCH 07/15] Change assert_raise_message to pytest.raises

---
 sklearn/cluster/tests/test_optics.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py
index 775872e3f0c9a..0775455e9180b 100644
--- a/sklearn/cluster/tests/test_optics.py
+++ b/sklearn/cluster/tests/test_optics.py
@@ -225,7 +225,8 @@ def test_minimum_number_of_sample_check():
                    metric='euclidean')
 
     # Run the fit
-    assert_raise_message(ValueError, msg, clust.fit, X)
+    with pytest.raises(ValueError, match=msg):
+        clust.fit(X)
 
 
 def test_bad_extract():
@@ -246,7 +247,8 @@ def test_bad_extract():
                    cluster_method='dbscan',
                    eps=0.3, min_samples=10,
                    metric='euclidean')
-    assert_raise_message(ValueError, msg, clust.fit, sparse.lil_matrix(X))
+    with pytest.raises(ValueError, match=msg):
+        clust.fit(X)
 
 
 def test_bad_reachability():

From b237b05e091630a2a6b1249cbba52cd4c1f6b78d Mon Sep 17 00:00:00 2001
From: Clickedbigfoot <clickedbigfoot@gmail.com>
Date: Wed, 18 Aug 2021 17:21:16 -0500
Subject: [PATCH 08/15] Parametrized tests

---
 sklearn/cluster/tests/test_optics.py | 308 ++++++++++++---------------
 1 file changed, 140 insertions(+), 168 deletions(-)

diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py
index 0775455e9180b..532038b4dc7b0 100644
--- a/sklearn/cluster/tests/test_optics.py
+++ b/sklearn/cluster/tests/test_optics.py
@@ -82,8 +82,8 @@ def test_the_extract_xi_labels(ordering, clusters, expected):
 
     assert_array_equal(labels, expected)
 
-
-def test_extract_xi():
+@pytest.mark.parametrize("metric", ['minkowski', 'euclidean'])
+def test_extract_xi(metric):
     # small and easy test (no clusters around other clusters)
     # but with a clear noise data.
     rng = np.random.RandomState(0)
@@ -99,26 +99,17 @@ def test_extract_xi():
     X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]]), C6))
     expected_labels = np.r_[[2] * 5, [0] * 5, [1] * 5, [3] * 5, [1] * 5, -1, [4] * 5]
     X, expected_labels = shuffle(X, expected_labels, random_state=rng)
+    X = sparse.lil_matrix(X) if metric == 'euclidean' else X
 
     clust = OPTICS(
-        min_samples=3, min_cluster_size=2, max_eps=20, cluster_method="xi", xi=0.4
-    ).fit(X)
-    assert_array_equal(clust.labels_, expected_labels)
-
-    clust = OPTICS(min_samples=3, min_cluster_size=2,
-                   max_eps=20, cluster_method='xi',
-                   xi=0.4, metric='euclidean').fit(sparse.lil_matrix(X))
+        min_samples=3, min_cluster_size=2, max_eps=20, cluster_method="xi", xi=0.4,
+    metric=metric).fit(X)
     assert_array_equal(clust.labels_, expected_labels)
 
     # check float min_samples and min_cluster_size
     clust = OPTICS(
-        min_samples=0.1, min_cluster_size=0.08, max_eps=20, cluster_method="xi", xi=0.4
-    ).fit(X)
-    assert_array_equal(clust.labels_, expected_labels)
-
-    clust = OPTICS(min_samples=0.1, min_cluster_size=0.08,
-                   max_eps=20, cluster_method='xi',
-                   xi=0.4, metric='euclidean').fit(sparse.lil_matrix(X))
+        min_samples=0.1, min_cluster_size=0.08, max_eps=20, cluster_method="xi", xi=0.4,
+    metric=metric).fit(X)
     assert_array_equal(clust.labels_, expected_labels)
 
     X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]] * 2), C6))
@@ -126,147 +117,126 @@ def test_extract_xi():
         [1] * 5, [3] * 5, [2] * 5, [0] * 5, [2] * 5, -1, -1, [4] * 5
     ]
     X, expected_labels = shuffle(X, expected_labels, random_state=rng)
+    X = sparse.lil_matrix(X) if metric == 'euclidean' else X
 
     clust = OPTICS(
-        min_samples=3, min_cluster_size=3, max_eps=20, cluster_method="xi", xi=0.3
-    ).fit(X)
+        min_samples=3, min_cluster_size=3, max_eps=20, cluster_method="xi", xi=0.3,
+    metric=metric).fit(X)
     # this may fail if the predecessor correction is not at work!
     assert_array_equal(clust.labels_, expected_labels)
 
-    clust = OPTICS(min_samples=3, min_cluster_size=3,
-                   max_eps=20, cluster_method='xi',
-                   xi=0.3, metric='euclidean').fit(sparse.lil_matrix(X))
-    assert_array_equal(clust.labels_, expected_labels)
-
     C1 = [[0, 0], [0, 0.1], [0, -0.1], [0.1, 0]]
     C2 = [[10, 10], [10, 9], [10, 11], [9, 10]]
     C3 = [[100, 100], [100, 90], [100, 110], [90, 100]]
     X = np.vstack((C1, C2, C3))
     expected_labels = np.r_[[0] * 4, [1] * 4, [2] * 4]
     X, expected_labels = shuffle(X, expected_labels, random_state=rng)
+    X = sparse.lil_matrix(X) if metric == 'euclidean' else X
 
     clust = OPTICS(
-        min_samples=2, min_cluster_size=2, max_eps=np.inf, cluster_method="xi", xi=0.04
-    ).fit(X)
-    assert_array_equal(clust.labels_, expected_labels)
-
-    clust = OPTICS(min_samples=2, min_cluster_size=2,
-                   max_eps=np.inf, cluster_method='xi',
-                   xi=0.04, metric='euclidean').fit(sparse.lil_matrix(X))
+        min_samples=2, min_cluster_size=2, max_eps=np.inf, cluster_method="xi", xi=0.04,
+    metric=metric).fit(X)
     assert_array_equal(clust.labels_, expected_labels)
 
 
-def test_cluster_hierarchy_():
+@pytest.mark.parametrize("metric", ['minkowski', 'euclidean'])
+def test_cluster_hierarchy_(metric):
     rng = np.random.RandomState(0)
     n_points_per_cluster = 100
     C1 = [0, 0] + 2 * rng.randn(n_points_per_cluster, 2)
     C2 = [0, 0] + 50 * rng.randn(n_points_per_cluster, 2)
     X = np.vstack((C1, C2))
     X = shuffle(X, random_state=0)
+    X = sparse.lil_matrix(X) if metric == 'euclidean' else X
 
-    clusters = OPTICS(min_samples=20, xi=0.1).fit(X).cluster_hierarchy_
-    assert clusters.shape == (2, 2)
-    diff = np.sum(clusters - np.array([[0, 99], [0, 199]]))
-    assert diff / len(X) < 0.05
-
-    clust = OPTICS(min_samples=20, xi=.1,
-                   metric='euclidean').fit(sparse.lil_matrix(X))
-    clusters = clust.cluster_hierarchy_
+    clusters = OPTICS(min_samples=20, xi=0.1, metric=metric).fit(X).cluster_hierarchy_
     assert clusters.shape == (2, 2)
     diff = np.sum(clusters - np.array([[0, 99], [0, 199]]))
-    assert diff / len(X) < 0.05
+    X_len = X.getnnz(axis=0)[0] if metric == 'euclidean' else len(X)
+    assert diff / X_len < 0.05
 
 
-def test_correct_number_of_clusters():
+@pytest.mark.parametrize(
+    "metric, is_sparse",
+    [
+        ['minkowski', False],
+        ['euclidean', False],
+        ['euclidean', True]
+    ]
+)
+def test_correct_number_of_clusters(metric, is_sparse):
     # in 'auto' mode
 
     n_clusters = 3
     X = generate_clustered_data(n_clusters=n_clusters)
 
-    for metric in ['minkowski', 'euclidean']:
-        # Parameters chosen specifically for this task.
-        # Compute OPTICS
-        clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=0.1, metric=metric)
-        clust.fit(X if metric == 'minkowski' else sparse.lil_matrix(X))
-        # number of clusters, ignoring noise if present
-        n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_)
-        assert n_clusters_1 == n_clusters
+    # Parameters chosen specifically for this task.
+    # Compute OPTICS
+    clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=0.1, metric=metric)
+    clust.fit(sparse.lil_matrix(X) if is_sparse else X)
+    # number of clusters, ignoring noise if present
+    n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_)
+    assert n_clusters_1 == n_clusters
 
-        # check attribute types and sizes
-        assert clust.labels_.shape == (len(X),)
-        assert clust.labels_.dtype.kind == 'i'
+    # check attribute types and sizes
+    assert clust.labels_.shape == (len(X),)
+    assert clust.labels_.dtype.kind == 'i'
 
-        assert clust.reachability_.shape == (len(X),)
-        assert clust.reachability_.dtype.kind == 'f'
+    assert clust.reachability_.shape == (len(X),)
+    assert clust.reachability_.dtype.kind == 'f'
 
-        assert clust.core_distances_.shape == (len(X),)
-        assert clust.core_distances_.dtype.kind == 'f'
+    assert clust.core_distances_.shape == (len(X),)
+    assert clust.core_distances_.dtype.kind == 'f'
 
-        assert clust.ordering_.shape == (len(X),)
-        assert clust.ordering_.dtype.kind == 'i'
-        assert set(clust.ordering_) == set(range(len(X)))
+    assert clust.ordering_.shape == (len(X),)
+    assert clust.ordering_.dtype.kind == 'i'
+    assert set(clust.ordering_) == set(range(len(X)))
 
 
-def test_minimum_number_of_sample_check():
+@pytest.mark.parametrize("metric", ['minkowski', 'euclidean'])
+def test_minimum_number_of_sample_check(metric):
     # test that we check a minimum number of samples
     msg = "min_samples must be no greater than"
 
     # Compute OPTICS
     X = [[1, 1]]
-    clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1)
-
-    # Run the fit
-    with pytest.raises(ValueError, match=msg):
-        clust.fit(X)
-
-    # Compute OPTICS
-    X = sparse.lil_matrix([[1, 1]])
-    clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1,
-                   metric='euclidean')
+    X = sparse.lil_matrix(X) if metric == 'euclidean' else X
+    clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1, metric=metric)
 
     # Run the fit
     with pytest.raises(ValueError, match=msg):
         clust.fit(X)
 
 
-def test_bad_extract():
+@pytest.mark.parametrize("metric", ['minkowski', 'euclidean'])
+def test_bad_extract(metric):
     # Test an extraction of eps too close to original eps
     msg = "Specify an epsilon smaller than 0.15. Got 0.3."
     centers = [[1, 1], [-1, -1], [1, -1]]
     X, labels_true = make_blobs(
         n_samples=750, centers=centers, cluster_std=0.4, random_state=0
     )
+    X = sparse.lil_matrix(X) if metric == 'euclidean' else X
 
     # Compute OPTICS
-    clust = OPTICS(max_eps=5.0 * 0.03, cluster_method="dbscan", eps=0.3, min_samples=10)
-    with pytest.raises(ValueError, match=msg):
-        clust.fit(X)
-
-    # Compute OPTICS
-    clust = OPTICS(max_eps=5.0 * 0.03,
-                   cluster_method='dbscan',
-                   eps=0.3, min_samples=10,
-                   metric='euclidean')
+    clust = OPTICS(max_eps=5.0 * 0.03, cluster_method="dbscan", eps=0.3, min_samples=10, metric=metric)
     with pytest.raises(ValueError, match=msg):
         clust.fit(X)
 
 
-def test_bad_reachability():
+@pytest.mark.parametrize("metric", ['minkowski', 'euclidean'])
+def test_bad_reachability(metric):
     msg = "All reachability values are inf. Set a larger max_eps."
     centers = [[1, 1], [-1, -1], [1, -1]]
     X, labels_true = make_blobs(
         n_samples=750, centers=centers, cluster_std=0.4, random_state=0
     )
+    X = sparse.lil_matrix(X) if metric == 'euclidean' else X
 
     with pytest.warns(UserWarning, match=msg):
-        clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015)
+        clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015, metric=metric)
         clust.fit(X)
 
-    with pytest.warns(UserWarning, match=msg):
-        clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015,
-                       metric='euclidean')
-        clust.fit(sparse.lil_matrix(X))
-
 
 def test_nowarn_if_metric_bool_data_bool():
     # make sure no warning is raised if metric and data are both boolean
@@ -310,117 +280,114 @@ def test_nowarn_if_metric_no_bool():
         OPTICS(metric=pairwise_metric).fit(X_num)
         assert len(warn_record) == 0
 
-
-def test_close_extract():
+@pytest.mark.parametrize("metric", ['minkowski', 'euclidean'])
+def test_close_extract(metric):
     # Test extract where extraction eps is close to scaled max_eps
 
     centers = [[1, 1], [-1, -1], [1, -1]]
     X, labels_true = make_blobs(
         n_samples=750, centers=centers, cluster_std=0.4, random_state=0
     )
+    X = sparse.lil_matrix(X) if metric == 'euclidean' else X
 
     # Compute OPTICS
-    clust = OPTICS(max_eps=1.0, cluster_method="dbscan", eps=0.3, min_samples=10).fit(X)
-    # Cluster ordering starts at 0; max cluster label = 2 is 3 clusters
-    assert max(clust.labels_) == 2
-
-    # Compute OPTICS
-    clust = OPTICS(max_eps=1.0, cluster_method='dbscan',
-                   eps=0.3, min_samples=10,
-                   metric='euclidean').fit(sparse.lil_matrix(X))
+    clust = OPTICS(max_eps=1.0, cluster_method="dbscan", eps=0.3, min_samples=10, metric=metric).fit(X)
     # Cluster ordering starts at 0; max cluster label = 2 is 3 clusters
     assert max(clust.labels_) == 2
 
 
 @pytest.mark.parametrize("eps", [0.1, 0.3, 0.5])
 @pytest.mark.parametrize("min_samples", [3, 10, 20])
-def test_dbscan_optics_parity(eps, min_samples):
+@pytest.mark.parametrize(
+    "metric, is_sparse",
+    [
+        ['minkowski', False],
+        ['euclidean', False],
+        ['euclidean', True]
+    ]
+)
+def test_dbscan_optics_parity(eps, min_samples, metric, is_sparse):
     # Test that OPTICS clustering labels are <= 5% difference of DBSCAN @TODO modified
 
-    for metric in ['minkowski', 'euclidean']:
-
-        centers = [[1, 1], [-1, -1], [1, -1]]
-        _X, labels_true = make_blobs(n_samples=750, centers=centers,
-                                     cluster_std=0.4, random_state=0)
-        X = _X if metric == 'minkowski' else sparse.lil_matrix(_X)
+    centers = [[1, 1], [-1, -1], [1, -1]]
+    X, labels_true = make_blobs(n_samples=750, centers=centers,
+                                 cluster_std=0.4, random_state=0)
+    X = sparse.lil_matrix(X) if is_sparse else X
 
-        # calculate optics with dbscan extract at 0.3 epsilon
-        op = OPTICS(min_samples=min_samples, cluster_method='dbscan',
-                    eps=eps,
-                    metric=metric).fit(X)
+    # calculate optics with dbscan extract at 0.3 epsilon
+    op = OPTICS(min_samples=min_samples, cluster_method='dbscan',
+                eps=eps,
+                metric=metric).fit(X)
 
-        # calculate dbscan labels
-        db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
+    # calculate dbscan labels
+    db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
 
-        contingency = contingency_matrix(db.labels_, op.labels_)
-        agree = min(np.sum(np.max(contingency, axis=0)),
-                    np.sum(np.max(contingency, axis=1)))
-        disagree = X.shape[0] - agree
+    contingency = contingency_matrix(db.labels_, op.labels_)
+    agree = min(np.sum(np.max(contingency, axis=0)),
+                np.sum(np.max(contingency, axis=1)))
+    disagree = X.shape[0] - agree
 
-        percent_mismatch = np.round((disagree - 1) / X.shape[0], 2)
+    percent_mismatch = np.round((disagree - 1) / X.shape[0], 2)
 
-        # verify label mismatch is <= 5% labels
-        assert percent_mismatch <= 0.05
+    # verify label mismatch is <= 5% labels
+    assert percent_mismatch <= 0.05
 
 
-def test_min_samples_edge_case(): #@TODO modified for sparse
+@pytest.mark.parametrize(
+    "metric, is_sparse",
+    [
+        ['minkowski', False],
+        ['euclidean', False],
+        ['euclidean', True]
+    ]
+)
+def test_min_samples_edge_case(metric, is_sparse):
     C1 = [[0, 0], [0, 0.1], [0, -0.1]]
     C2 = [[10, 10], [10, 9], [10, 11]]
     C3 = [[100, 100], [100, 96], [100, 106]]
     X = np.vstack((C1, C2, C3))
+    X = sparse.lil_matrix(X) if is_sparse else X
 
     expected_labels = np.r_[[0] * 3, [1] * 3, [2] * 3]
-    clust = OPTICS(min_samples=3, max_eps=7, cluster_method="xi", xi=0.04).fit(X)
-    assert_array_equal(clust.labels_, expected_labels)
-
-    clust = OPTICS(min_samples=3,
-                   max_eps=7, cluster_method='xi',
-                   xi=0.04,
-                   metric='euclidean').fit(sparse.lil_matrix(X))
+    clust = OPTICS(min_samples=3, max_eps=7, cluster_method="xi", xi=0.04, metric=metric).fit(X)
     assert_array_equal(clust.labels_, expected_labels)
 
     expected_labels = np.r_[[0] * 3, [1] * 3, [-1] * 3]
-    clust = OPTICS(min_samples=3, max_eps=3, cluster_method="xi", xi=0.04).fit(X)
-    assert_array_equal(clust.labels_, expected_labels)
-
-    clust = OPTICS(min_samples=3,
-                   max_eps=3, cluster_method='xi',
-                   xi=0.04,
-                   metric='euclidean').fit(sparse.lil_matrix(X))
+    clust = OPTICS(min_samples=3, max_eps=3, cluster_method="xi", xi=0.04, metric=metric).fit(X)
     assert_array_equal(clust.labels_, expected_labels)
 
     expected_labels = np.r_[[-1] * 9]
     with pytest.warns(UserWarning, match="All reachability values"):
-        clust = OPTICS(min_samples=4, max_eps=3, cluster_method="xi", xi=0.04).fit(X)
-        assert_array_equal(clust.labels_, expected_labels)
-
-    with pytest.warns(UserWarning, match="All reachability values"):
-        clust = OPTICS(min_samples=4,
-                       max_eps=3, cluster_method='xi',
-                       xi=0.04,
-                       metric='euclidean').fit(sparse.lil_matrix(X))
+        clust = OPTICS(min_samples=4, max_eps=3, cluster_method="xi", xi=0.04, metric=metric).fit(X)
         assert_array_equal(clust.labels_, expected_labels)
 
 
 # try arbitrary minimum sizes
 @pytest.mark.parametrize("min_cluster_size", range(2, X.shape[0] // 10, 23))
-def test_min_cluster_size(min_cluster_size):
-    _redX = X[::2]  # reduce for speed @TODO modified for sparse
+@pytest.mark.parametrize(
+    "metric, is_sparse",
+    [
+        ['minkowski', False],
+        ['euclidean', False],
+        ['euclidean', True]
+    ]
+)
+def test_min_cluster_size(min_cluster_size, metric, is_sparse):
+    redX = X[::2]  # reduce for speed
 
-    for metric in ['minkowski', 'euclidean']:
-        redX = _redX if metric == 'minkowski' else sparse.lil_matrix(_redX)
+    redX = sparse.lil_matrix(redX) if is_sparse else redX
 
-        clust = OPTICS(min_samples=9, min_cluster_size=min_cluster_size,
-                       metric=metric).fit(redX)
-        cluster_sizes = np.bincount(clust.labels_[clust.labels_ != -1])
-        if cluster_sizes.size:
-            assert min(cluster_sizes) >= min_cluster_size
-        # check behaviour is the same when min_cluster_size is a fraction
-        clust_frac = OPTICS(min_samples=9,
-                            min_cluster_size=min_cluster_size / redX.shape[0],
-                            metric=metric)
-        clust_frac.fit(redX)
-        assert_array_equal(clust.labels_, clust_frac.labels_)
+    clust = OPTICS(min_samples=9, min_cluster_size=min_cluster_size,
+                   metric=metric).fit(redX)
+    cluster_sizes = np.bincount(clust.labels_[clust.labels_ != -1])
+    if cluster_sizes.size:
+        assert min(cluster_sizes) >= min_cluster_size
+    # check behaviour is the same when min_cluster_size is a fraction
+    clust_frac = OPTICS(min_samples=9,
+                        min_cluster_size=min_cluster_size / redX.shape[0],
+                        metric=metric)
+    clust_frac.fit(redX)
+    assert_array_equal(clust.labels_, clust_frac.labels_)
 
 
 @pytest.mark.parametrize("min_cluster_size", [0, -1, 1.1, 2.2])
@@ -443,19 +410,24 @@ def test_min_cluster_size_invalid2():
     with pytest.raises(ValueError, match="must be no greater than the "):
         clust.fit(sparse.lil_matrix(X))
 
-
-def test_processing_order():
-    for metric in ['minkowski', 'euclidean']:
-
-        # Ensure that we consider all unprocessed points,
-        # not only direct neighbors. when picking the next point.
-        _Y = [[0], [10], [-10], [25]]
-        Y = _Y if metric == 'minkowski' else sparse.lil_matrix(_Y)
-
-        clust = OPTICS(min_samples=3, max_eps=15, metric=metric).fit(Y)
-        assert_array_equal(clust.reachability_, [np.inf, 10, 10, 15])
-        assert_array_equal(clust.core_distances_, [10, 15, np.inf, np.inf])
-        assert_array_equal(clust.ordering_, [0, 1, 2, 3])
+@pytest.mark.parametrize(
+    "metric, is_sparse",
+    [
+        ['minkowski', False],
+        ['euclidean', False],
+        ['euclidean', True]
+    ]
+)
+def test_processing_order(metric, is_sparse):
+    # Ensure that we consider all unprocessed points,
+    # not only direct neighbors, when picking the next point.
+    Y = [[0], [10], [-10], [25]]
+    Y = sparse.lil_matrix(Y) if is_sparse else Y
+
+    clust = OPTICS(min_samples=3, max_eps=15, metric=metric).fit(Y)
+    assert_array_equal(clust.reachability_, [np.inf, 10, 10, 15])
+    assert_array_equal(clust.core_distances_, [10, 15, np.inf, np.inf])
+    assert_array_equal(clust.ordering_, [0, 1, 2, 3])
 
 
 def test_compare_to_ELKI():

From a00301027716583f5293d7954281d62d4c645d2f Mon Sep 17 00:00:00 2001
From: Clickedbigfoot <clickedbigfoot@gmail.com>
Date: Wed, 18 Aug 2021 17:26:38 -0500
Subject: [PATCH 09/15] Fix flake8 test_optics.py

---
 sklearn/cluster/tests/test_optics.py | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py
index 532038b4dc7b0..72f59f29f70b9 100644
--- a/sklearn/cluster/tests/test_optics.py
+++ b/sklearn/cluster/tests/test_optics.py
@@ -82,6 +82,7 @@ def test_the_extract_xi_labels(ordering, clusters, expected):
 
     assert_array_equal(labels, expected)
 
+
 @pytest.mark.parametrize("metric", ['minkowski', 'euclidean'])
 def test_extract_xi(metric):
     # small and easy test (no clusters around other clusters)
@@ -103,13 +104,13 @@ def test_extract_xi(metric):
 
     clust = OPTICS(
         min_samples=3, min_cluster_size=2, max_eps=20, cluster_method="xi", xi=0.4,
-    metric=metric).fit(X)
+        metric=metric).fit(X)
     assert_array_equal(clust.labels_, expected_labels)
 
     # check float min_samples and min_cluster_size
     clust = OPTICS(
         min_samples=0.1, min_cluster_size=0.08, max_eps=20, cluster_method="xi", xi=0.4,
-    metric=metric).fit(X)
+        metric=metric).fit(X)
     assert_array_equal(clust.labels_, expected_labels)
 
     X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]] * 2), C6))
@@ -121,7 +122,7 @@ def test_extract_xi(metric):
 
     clust = OPTICS(
         min_samples=3, min_cluster_size=3, max_eps=20, cluster_method="xi", xi=0.3,
-    metric=metric).fit(X)
+        metric=metric).fit(X)
     # this may fail if the predecessor correction is not at work!
     assert_array_equal(clust.labels_, expected_labels)
 
@@ -135,7 +136,7 @@ def test_extract_xi(metric):
 
     clust = OPTICS(
         min_samples=2, min_cluster_size=2, max_eps=np.inf, cluster_method="xi", xi=0.04,
-    metric=metric).fit(X)
+        metric=metric).fit(X)
     assert_array_equal(clust.labels_, expected_labels)
 
 
@@ -219,7 +220,8 @@ def test_bad_extract(metric):
     X = sparse.lil_matrix(X) if metric == 'euclidean' else X
 
     # Compute OPTICS
-    clust = OPTICS(max_eps=5.0 * 0.03, cluster_method="dbscan", eps=0.3, min_samples=10, metric=metric)
+    clust = OPTICS(max_eps=5.0 * 0.03, cluster_method="dbscan", eps=0.3, min_samples=10,
+                   metric=metric)
     with pytest.raises(ValueError, match=msg):
         clust.fit(X)
 
@@ -280,6 +282,7 @@ def test_nowarn_if_metric_no_bool():
         OPTICS(metric=pairwise_metric).fit(X_num)
         assert len(warn_record) == 0
 
+
 @pytest.mark.parametrize("metric", ['minkowski', 'euclidean'])
 def test_close_extract(metric):
     # Test extract where extraction eps is close to scaled max_eps
@@ -291,7 +294,8 @@ def test_close_extract(metric):
     X = sparse.lil_matrix(X) if metric == 'euclidean' else X
 
     # Compute OPTICS
-    clust = OPTICS(max_eps=1.0, cluster_method="dbscan", eps=0.3, min_samples=10, metric=metric).fit(X)
+    clust = OPTICS(max_eps=1.0, cluster_method="dbscan", eps=0.3, min_samples=10,
+                   metric=metric).fit(X)
     # Cluster ordering starts at 0; max cluster label = 2 is 3 clusters
     assert max(clust.labels_) == 2
 
@@ -311,7 +315,7 @@ def test_dbscan_optics_parity(eps, min_samples, metric, is_sparse):
 
     centers = [[1, 1], [-1, -1], [1, -1]]
     X, labels_true = make_blobs(n_samples=750, centers=centers,
-                                 cluster_std=0.4, random_state=0)
+                                cluster_std=0.4, random_state=0)
     X = sparse.lil_matrix(X) if is_sparse else X
 
     # calculate optics with dbscan extract at 0.3 epsilon
@@ -349,16 +353,19 @@ def test_min_samples_edge_case(metric, is_sparse):
     X = sparse.lil_matrix(X) if is_sparse else X
 
     expected_labels = np.r_[[0] * 3, [1] * 3, [2] * 3]
-    clust = OPTICS(min_samples=3, max_eps=7, cluster_method="xi", xi=0.04, metric=metric).fit(X)
+    clust = OPTICS(min_samples=3, max_eps=7, cluster_method="xi", xi=0.04,
+                   metric=metric).fit(X)
     assert_array_equal(clust.labels_, expected_labels)
 
     expected_labels = np.r_[[0] * 3, [1] * 3, [-1] * 3]
-    clust = OPTICS(min_samples=3, max_eps=3, cluster_method="xi", xi=0.04, metric=metric).fit(X)
+    clust = OPTICS(min_samples=3, max_eps=3, cluster_method="xi", xi=0.04,
+                   metric=metric).fit(X)
     assert_array_equal(clust.labels_, expected_labels)
 
     expected_labels = np.r_[[-1] * 9]
     with pytest.warns(UserWarning, match="All reachability values"):
-        clust = OPTICS(min_samples=4, max_eps=3, cluster_method="xi", xi=0.04, metric=metric).fit(X)
+        clust = OPTICS(min_samples=4, max_eps=3, cluster_method="xi", xi=0.04,
+                       metric=metric).fit(X)
         assert_array_equal(clust.labels_, expected_labels)
 
 
@@ -410,6 +417,7 @@ def test_min_cluster_size_invalid2():
     with pytest.raises(ValueError, match="must be no greater than the "):
         clust.fit(sparse.lil_matrix(X))
 
+
 @pytest.mark.parametrize(
     "metric, is_sparse",
     [

From 2188e5b097be462414be095d7033bd8e71304d46 Mon Sep 17 00:00:00 2001
From: Clickedbigfoot <clickedbigfoot@gmail.com>
Date: Wed, 18 Aug 2021 17:31:25 -0500
Subject: [PATCH 10/15] Fix flake8 _optics.py

---
 sklearn/cluster/_optics.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py
index 3dd733081f0f4..bef57b3aa4dfe 100755
--- a/sklearn/cluster/_optics.py
+++ b/sklearn/cluster/_optics.py
@@ -21,7 +21,6 @@
 from ..neighbors import NearestNeighbors
 from ..base import BaseEstimator, ClusterMixin
 from ..metrics import pairwise_distances
-from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS
 
 
 class OPTICS(ClusterMixin, BaseEstimator):
@@ -281,7 +280,7 @@ def fit(self, X, y=None):
             )
             warnings.warn(msg, DataConversionWarning)
 
-        X = self._validate_data(X, dtype=dtype, accept_sparse='csr') #@TODO original condition was metric != 'precomputed' and metric in PAIRWISE_DISTANCE_FUNCTIONS
+        X = self._validate_data(X, dtype=dtype, accept_sparse='csr')
         memory = check_memory(self.memory)
 
         if self.cluster_method not in ["dbscan", "xi"]:
@@ -607,7 +606,7 @@ def _set_reach_dist(
             _params['p'] = p
         dists = pairwise_distances(P, X[unproc],
                                    metric, n_jobs=None,
-                                   **_params).ravel() #@TODO Check if axis matters. Original X[unproc] was np.take(X,unproc,axis=0)
+                                   **_params).ravel()
 
     rdists = np.maximum(dists, core_distances_[point_index])
     np.around(rdists, decimals=np.finfo(rdists.dtype).precision, out=rdists)

From f8a43e4f98fb5ae65fff94aa89b5c7bf1695a3b7 Mon Sep 17 00:00:00 2001
From: Clickedbigfoot <clickedbigfoot@gmail.com>
Date: Sat, 21 Aug 2021 12:29:41 -0500
Subject: [PATCH 11/15] Add sparse precomputed test case

---
 sklearn/cluster/tests/test_optics.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py
index 72f59f29f70b9..78dd0578467d6 100644
--- a/sklearn/cluster/tests/test_optics.py
+++ b/sklearn/cluster/tests/test_optics.py
@@ -861,9 +861,11 @@ def test_extract_dbscan():
     assert_array_equal(np.sort(np.unique(clust.labels_)), [0, 1, 2, 3])
 
 
-def test_precomputed_dists():
+@pytest.mark.parametrize("is_sparse", [False, True])
+def test_precomputed_dists(is_sparse):
     redX = X[::2]
     dists = pairwise_distances(redX, metric="euclidean")
+    dists = sparse.lil_matrix(dists).tocsr() if is_sparse else dists
     clust1 = OPTICS(min_samples=10, algorithm="brute", metric="precomputed").fit(dists)
     clust2 = OPTICS(min_samples=10, algorithm="brute", metric="euclidean").fit(redX)
 

From 52fdf934d5175ad6b6ce7221466006751283cad4 Mon Sep 17 00:00:00 2001
From: Clickedbigfoot <clickedbigfoot@gmail.com>
Date: Sat, 21 Aug 2021 12:54:16 -0500
Subject: [PATCH 12/15] Black test_optics.py

---
 sklearn/cluster/tests/test_optics.py | 164 ++++++++++++++-------------
 1 file changed, 88 insertions(+), 76 deletions(-)

diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py
index 78dd0578467d6..feaaa9c3b08ee 100644
--- a/sklearn/cluster/tests/test_optics.py
+++ b/sklearn/cluster/tests/test_optics.py
@@ -83,7 +83,7 @@ def test_the_extract_xi_labels(ordering, clusters, expected):
     assert_array_equal(labels, expected)
 
 
-@pytest.mark.parametrize("metric", ['minkowski', 'euclidean'])
+@pytest.mark.parametrize("metric", ["minkowski", "euclidean"])
 def test_extract_xi(metric):
     # small and easy test (no clusters around other clusters)
     # but with a clear noise data.
@@ -100,17 +100,27 @@ def test_extract_xi(metric):
     X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]]), C6))
     expected_labels = np.r_[[2] * 5, [0] * 5, [1] * 5, [3] * 5, [1] * 5, -1, [4] * 5]
     X, expected_labels = shuffle(X, expected_labels, random_state=rng)
-    X = sparse.lil_matrix(X) if metric == 'euclidean' else X
+    X = sparse.lil_matrix(X) if metric == "euclidean" else X
 
     clust = OPTICS(
-        min_samples=3, min_cluster_size=2, max_eps=20, cluster_method="xi", xi=0.4,
-        metric=metric).fit(X)
+        min_samples=3,
+        min_cluster_size=2,
+        max_eps=20,
+        cluster_method="xi",
+        xi=0.4,
+        metric=metric,
+    ).fit(X)
     assert_array_equal(clust.labels_, expected_labels)
 
     # check float min_samples and min_cluster_size
     clust = OPTICS(
-        min_samples=0.1, min_cluster_size=0.08, max_eps=20, cluster_method="xi", xi=0.4,
-        metric=metric).fit(X)
+        min_samples=0.1,
+        min_cluster_size=0.08,
+        max_eps=20,
+        cluster_method="xi",
+        xi=0.4,
+        metric=metric,
+    ).fit(X)
     assert_array_equal(clust.labels_, expected_labels)
 
     X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]] * 2), C6))
@@ -118,11 +128,16 @@ def test_extract_xi(metric):
         [1] * 5, [3] * 5, [2] * 5, [0] * 5, [2] * 5, -1, -1, [4] * 5
     ]
     X, expected_labels = shuffle(X, expected_labels, random_state=rng)
-    X = sparse.lil_matrix(X) if metric == 'euclidean' else X
+    X = sparse.lil_matrix(X) if metric == "euclidean" else X
 
     clust = OPTICS(
-        min_samples=3, min_cluster_size=3, max_eps=20, cluster_method="xi", xi=0.3,
-        metric=metric).fit(X)
+        min_samples=3,
+        min_cluster_size=3,
+        max_eps=20,
+        cluster_method="xi",
+        xi=0.3,
+        metric=metric,
+    ).fit(X)
     # this may fail if the predecessor correction is not at work!
     assert_array_equal(clust.labels_, expected_labels)
 
@@ -132,15 +147,20 @@ def test_extract_xi(metric):
     X = np.vstack((C1, C2, C3))
     expected_labels = np.r_[[0] * 4, [1] * 4, [2] * 4]
     X, expected_labels = shuffle(X, expected_labels, random_state=rng)
-    X = sparse.lil_matrix(X) if metric == 'euclidean' else X
+    X = sparse.lil_matrix(X) if metric == "euclidean" else X
 
     clust = OPTICS(
-        min_samples=2, min_cluster_size=2, max_eps=np.inf, cluster_method="xi", xi=0.04,
-        metric=metric).fit(X)
+        min_samples=2,
+        min_cluster_size=2,
+        max_eps=np.inf,
+        cluster_method="xi",
+        xi=0.04,
+        metric=metric,
+    ).fit(X)
     assert_array_equal(clust.labels_, expected_labels)
 
 
-@pytest.mark.parametrize("metric", ['minkowski', 'euclidean'])
+@pytest.mark.parametrize("metric", ["minkowski", "euclidean"])
 def test_cluster_hierarchy_(metric):
     rng = np.random.RandomState(0)
     n_points_per_cluster = 100
@@ -148,22 +168,18 @@ def test_cluster_hierarchy_(metric):
     C2 = [0, 0] + 50 * rng.randn(n_points_per_cluster, 2)
     X = np.vstack((C1, C2))
     X = shuffle(X, random_state=0)
-    X = sparse.lil_matrix(X) if metric == 'euclidean' else X
+    X = sparse.lil_matrix(X) if metric == "euclidean" else X
 
     clusters = OPTICS(min_samples=20, xi=0.1, metric=metric).fit(X).cluster_hierarchy_
     assert clusters.shape == (2, 2)
     diff = np.sum(clusters - np.array([[0, 99], [0, 199]]))
-    X_len = X.getnnz(axis=0)[0] if metric == 'euclidean' else len(X)
+    X_len = X.getnnz(axis=0)[0] if metric == "euclidean" else len(X)
     assert diff / X_len < 0.05
 
 
 @pytest.mark.parametrize(
     "metric, is_sparse",
-    [
-        ['minkowski', False],
-        ['euclidean', False],
-        ['euclidean', True]
-    ]
+    [["minkowski", False], ["euclidean", False], ["euclidean", True]],
 )
 def test_correct_number_of_clusters(metric, is_sparse):
     # in 'auto' mode
@@ -181,27 +197,27 @@ def test_correct_number_of_clusters(metric, is_sparse):
 
     # check attribute types and sizes
     assert clust.labels_.shape == (len(X),)
-    assert clust.labels_.dtype.kind == 'i'
+    assert clust.labels_.dtype.kind == "i"
 
     assert clust.reachability_.shape == (len(X),)
-    assert clust.reachability_.dtype.kind == 'f'
+    assert clust.reachability_.dtype.kind == "f"
 
     assert clust.core_distances_.shape == (len(X),)
-    assert clust.core_distances_.dtype.kind == 'f'
+    assert clust.core_distances_.dtype.kind == "f"
 
     assert clust.ordering_.shape == (len(X),)
-    assert clust.ordering_.dtype.kind == 'i'
+    assert clust.ordering_.dtype.kind == "i"
     assert set(clust.ordering_) == set(range(len(X)))
 
 
-@pytest.mark.parametrize("metric", ['minkowski', 'euclidean'])
+@pytest.mark.parametrize("metric", ["minkowski", "euclidean"])
 def test_minimum_number_of_sample_check(metric):
     # test that we check a minimum number of samples
     msg = "min_samples must be no greater than"
 
     # Compute OPTICS
     X = [[1, 1]]
-    X = sparse.lil_matrix(X) if metric == 'euclidean' else X
+    X = sparse.lil_matrix(X) if metric == "euclidean" else X
     clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1, metric=metric)
 
     # Run the fit
@@ -209,7 +225,7 @@ def test_minimum_number_of_sample_check(metric):
         clust.fit(X)
 
 
-@pytest.mark.parametrize("metric", ['minkowski', 'euclidean'])
+@pytest.mark.parametrize("metric", ["minkowski", "euclidean"])
 def test_bad_extract(metric):
     # Test an extraction of eps too close to original eps
     msg = "Specify an epsilon smaller than 0.15. Got 0.3."
@@ -217,23 +233,28 @@ def test_bad_extract(metric):
     X, labels_true = make_blobs(
         n_samples=750, centers=centers, cluster_std=0.4, random_state=0
     )
-    X = sparse.lil_matrix(X) if metric == 'euclidean' else X
+    X = sparse.lil_matrix(X) if metric == "euclidean" else X
 
     # Compute OPTICS
-    clust = OPTICS(max_eps=5.0 * 0.03, cluster_method="dbscan", eps=0.3, min_samples=10,
-                   metric=metric)
+    clust = OPTICS(
+        max_eps=5.0 * 0.03,
+        cluster_method="dbscan",
+        eps=0.3,
+        min_samples=10,
+        metric=metric,
+    )
     with pytest.raises(ValueError, match=msg):
         clust.fit(X)
 
 
-@pytest.mark.parametrize("metric", ['minkowski', 'euclidean'])
+@pytest.mark.parametrize("metric", ["minkowski", "euclidean"])
 def test_bad_reachability(metric):
     msg = "All reachability values are inf. Set a larger max_eps."
     centers = [[1, 1], [-1, -1], [1, -1]]
     X, labels_true = make_blobs(
         n_samples=750, centers=centers, cluster_std=0.4, random_state=0
     )
-    X = sparse.lil_matrix(X) if metric == 'euclidean' else X
+    X = sparse.lil_matrix(X) if metric == "euclidean" else X
 
     with pytest.warns(UserWarning, match=msg):
         clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015, metric=metric)
@@ -283,7 +304,7 @@ def test_nowarn_if_metric_no_bool():
         assert len(warn_record) == 0
 
 
-@pytest.mark.parametrize("metric", ['minkowski', 'euclidean'])
+@pytest.mark.parametrize("metric", ["minkowski", "euclidean"])
 def test_close_extract(metric):
     # Test extract where extraction eps is close to scaled max_eps
 
@@ -291,11 +312,12 @@ def test_close_extract(metric):
     X, labels_true = make_blobs(
         n_samples=750, centers=centers, cluster_std=0.4, random_state=0
     )
-    X = sparse.lil_matrix(X) if metric == 'euclidean' else X
+    X = sparse.lil_matrix(X) if metric == "euclidean" else X
 
     # Compute OPTICS
-    clust = OPTICS(max_eps=1.0, cluster_method="dbscan", eps=0.3, min_samples=10,
-                   metric=metric).fit(X)
+    clust = OPTICS(
+        max_eps=1.0, cluster_method="dbscan", eps=0.3, min_samples=10, metric=metric
+    ).fit(X)
     # Cluster ordering starts at 0; max cluster label = 2 is 3 clusters
     assert max(clust.labels_) == 2
 
@@ -304,31 +326,29 @@ def test_close_extract(metric):
 @pytest.mark.parametrize("min_samples", [3, 10, 20])
 @pytest.mark.parametrize(
     "metric, is_sparse",
-    [
-        ['minkowski', False],
-        ['euclidean', False],
-        ['euclidean', True]
-    ]
+    [["minkowski", False], ["euclidean", False], ["euclidean", True]],
 )
 def test_dbscan_optics_parity(eps, min_samples, metric, is_sparse):
     # Test that OPTICS clustering labels are <= 5% difference of DBSCAN @TODO modified
 
     centers = [[1, 1], [-1, -1], [1, -1]]
-    X, labels_true = make_blobs(n_samples=750, centers=centers,
-                                cluster_std=0.4, random_state=0)
+    X, labels_true = make_blobs(
+        n_samples=750, centers=centers, cluster_std=0.4, random_state=0
+    )
     X = sparse.lil_matrix(X) if is_sparse else X
 
     # calculate optics with dbscan extract at 0.3 epsilon
-    op = OPTICS(min_samples=min_samples, cluster_method='dbscan',
-                eps=eps,
-                metric=metric).fit(X)
+    op = OPTICS(
+        min_samples=min_samples, cluster_method="dbscan", eps=eps, metric=metric
+    ).fit(X)
 
     # calculate dbscan labels
     db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
 
     contingency = contingency_matrix(db.labels_, op.labels_)
-    agree = min(np.sum(np.max(contingency, axis=0)),
-                np.sum(np.max(contingency, axis=1)))
+    agree = min(
+        np.sum(np.max(contingency, axis=0)), np.sum(np.max(contingency, axis=1))
+    )
     disagree = X.shape[0] - agree
 
     percent_mismatch = np.round((disagree - 1) / X.shape[0], 2)
@@ -339,11 +359,7 @@ def test_dbscan_optics_parity(eps, min_samples, metric, is_sparse):
 
 @pytest.mark.parametrize(
     "metric, is_sparse",
-    [
-        ['minkowski', False],
-        ['euclidean', False],
-        ['euclidean', True]
-    ]
+    [["minkowski", False], ["euclidean", False], ["euclidean", True]],
 )
 def test_min_samples_edge_case(metric, is_sparse):
     C1 = [[0, 0], [0, 0.1], [0, -0.1]]
@@ -353,19 +369,22 @@ def test_min_samples_edge_case(metric, is_sparse):
     X = sparse.lil_matrix(X) if is_sparse else X
 
     expected_labels = np.r_[[0] * 3, [1] * 3, [2] * 3]
-    clust = OPTICS(min_samples=3, max_eps=7, cluster_method="xi", xi=0.04,
-                   metric=metric).fit(X)
+    clust = OPTICS(
+        min_samples=3, max_eps=7, cluster_method="xi", xi=0.04, metric=metric
+    ).fit(X)
     assert_array_equal(clust.labels_, expected_labels)
 
     expected_labels = np.r_[[0] * 3, [1] * 3, [-1] * 3]
-    clust = OPTICS(min_samples=3, max_eps=3, cluster_method="xi", xi=0.04,
-                   metric=metric).fit(X)
+    clust = OPTICS(
+        min_samples=3, max_eps=3, cluster_method="xi", xi=0.04, metric=metric
+    ).fit(X)
     assert_array_equal(clust.labels_, expected_labels)
 
     expected_labels = np.r_[[-1] * 9]
     with pytest.warns(UserWarning, match="All reachability values"):
-        clust = OPTICS(min_samples=4, max_eps=3, cluster_method="xi", xi=0.04,
-                       metric=metric).fit(X)
+        clust = OPTICS(
+            min_samples=4, max_eps=3, cluster_method="xi", xi=0.04, metric=metric
+        ).fit(X)
         assert_array_equal(clust.labels_, expected_labels)
 
 
@@ -373,26 +392,23 @@ def test_min_samples_edge_case(metric, is_sparse):
 @pytest.mark.parametrize("min_cluster_size", range(2, X.shape[0] // 10, 23))
 @pytest.mark.parametrize(
     "metric, is_sparse",
-    [
-        ['minkowski', False],
-        ['euclidean', False],
-        ['euclidean', True]
-    ]
+    [["minkowski", False], ["euclidean", False], ["euclidean", True]],
 )
 def test_min_cluster_size(min_cluster_size, metric, is_sparse):
     redX = X[::2]  # reduce for speed
 
     redX = sparse.lil_matrix(redX) if is_sparse else redX
 
-    clust = OPTICS(min_samples=9, min_cluster_size=min_cluster_size,
-                   metric=metric).fit(redX)
+    clust = OPTICS(min_samples=9, min_cluster_size=min_cluster_size, metric=metric).fit(
+        redX
+    )
     cluster_sizes = np.bincount(clust.labels_[clust.labels_ != -1])
     if cluster_sizes.size:
         assert min(cluster_sizes) >= min_cluster_size
     # check behaviour is the same when min_cluster_size is a fraction
-    clust_frac = OPTICS(min_samples=9,
-                        min_cluster_size=min_cluster_size / redX.shape[0],
-                        metric=metric)
+    clust_frac = OPTICS(
+        min_samples=9, min_cluster_size=min_cluster_size / redX.shape[0], metric=metric
+    )
     clust_frac.fit(redX)
     assert_array_equal(clust.labels_, clust_frac.labels_)
 
@@ -403,7 +419,7 @@ def test_min_cluster_size_invalid(min_cluster_size):
     with pytest.raises(ValueError, match="must be a positive integer or a "):
         clust.fit(X)
 
-    clust = OPTICS(min_cluster_size=min_cluster_size, metric='euclidean')
+    clust = OPTICS(min_cluster_size=min_cluster_size, metric="euclidean")
     with pytest.raises(ValueError, match="must be a positive integer or a "):
         clust.fit(sparse.lil_matrix(X))
 
@@ -413,18 +429,14 @@ def test_min_cluster_size_invalid2():
     with pytest.raises(ValueError, match="must be no greater than the "):
         clust.fit(X)
 
-    clust = OPTICS(min_cluster_size=len(X) + 1, metric='euclidean')
+    clust = OPTICS(min_cluster_size=len(X) + 1, metric="euclidean")
     with pytest.raises(ValueError, match="must be no greater than the "):
         clust.fit(sparse.lil_matrix(X))
 
 
 @pytest.mark.parametrize(
     "metric, is_sparse",
-    [
-        ['minkowski', False],
-        ['euclidean', False],
-        ['euclidean', True]
-    ]
+    [["minkowski", False], ["euclidean", False], ["euclidean", True]],
 )
 def test_processing_order(metric, is_sparse):
     # Ensure that we consider all unprocessed points,

From 8992c870808cfbe3709f02f6bc3c66db268fa570 Mon Sep 17 00:00:00 2001
From: Clickedbigfoot <clickedbigfoot@gmail.com>
Date: Sat, 21 Aug 2021 13:33:32 -0500
Subject: [PATCH 13/15] Black _optics.py and handle sparse precomputed input

---
 sklearn/cluster/_optics.py | 53 ++++++++++++++++++++++++++------------
 1 file changed, 36 insertions(+), 17 deletions(-)

diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py
index bef57b3aa4dfe..73b028b64ac14 100755
--- a/sklearn/cluster/_optics.py
+++ b/sklearn/cluster/_optics.py
@@ -14,13 +14,14 @@
 import warnings
 import numpy as np
 
-from ..exceptions import DataConversionWarning
+from ..exceptions import DataConversionWarning, EfficiencyWarning
 from ..metrics.pairwise import PAIRWISE_BOOLEAN_FUNCTIONS
 from ..utils import gen_batches, get_chunk_n_rows
 from ..utils.validation import check_memory
 from ..neighbors import NearestNeighbors
 from ..base import BaseEstimator, ClusterMixin
 from ..metrics import pairwise_distances
+from scipy.sparse import issparse, SparseEfficiencyWarning
 
 
 class OPTICS(ClusterMixin, BaseEstimator):
@@ -280,7 +281,12 @@ def fit(self, X, y=None):
             )
             warnings.warn(msg, DataConversionWarning)
 
-        X = self._validate_data(X, dtype=dtype, accept_sparse='csr')
+        X = self._validate_data(X, dtype=dtype, accept_sparse="csr")
+        if self.metric == "precomputed" and issparse(X):
+            # Set each diagonal to an explicit value so each point is its own neighbor
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", category=SparseEfficiencyWarning)
+                X.setdiag(X.diagonal())
         memory = check_memory(self.memory)
 
         if self.cluster_method not in ["dbscan", "xi"]:
@@ -518,13 +524,16 @@ def compute_optics_graph(
         n_jobs=n_jobs,
     )
 
-    nbrs.fit(X)
-    # Here we first do a kNN query for each point, this differs from
-    # the original OPTICS that only used epsilon range queries.
-    # TODO: handle working_memory somehow?
-    core_distances_ = _compute_core_distances_(
-        X=X, neighbors=nbrs, min_samples=min_samples, working_memory=None
-    )
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=EfficiencyWarning)
+        # Efficiency warning appears when using sparse precomputed matrices
+        nbrs.fit(X)
+        # Here we first do a kNN query for each point, this differs from
+        # the original OPTICS that only used epsilon range queries.
+        # TODO: handle working_memory somehow?
+        core_distances_ = _compute_core_distances_(
+            X=X, neighbors=nbrs, min_samples=min_samples, working_memory=None
+        )
     # OPTICS puts an upper limit on these, use inf for undefined.
     core_distances_[core_distances_ > max_eps] = np.inf
     np.around(
@@ -587,7 +596,10 @@ def _set_reach_dist(
     # Assume that radius_neighbors is faster without distances
     # and we don't need all distances, nevertheless, this means
     # we may be doing some work twice.
-    indices = nbrs.radius_neighbors(P, radius=max_eps, return_distance=False)[0]
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=EfficiencyWarning)
+        # Efficiency warning appears when using sparse precomputed matrices
+        indices = nbrs.radius_neighbors(P, radius=max_eps, return_distance=False)[0]
 
     # Getting indices of neighbors that have not been processed
     unproc = np.compress(~np.take(processed, indices), indices)
@@ -603,13 +615,20 @@ def _set_reach_dist(
         if metric == "minkowski" and "p" not in _params:
             # the same logic as neighbors, p is ignored if explicitly set
             # in the dict params
-            _params['p'] = p
-        dists = pairwise_distances(P, X[unproc],
-                                   metric, n_jobs=None,
-                                   **_params).ravel()
-
-    rdists = np.maximum(dists, core_distances_[point_index])
-    np.around(rdists, decimals=np.finfo(rdists.dtype).precision, out=rdists)
+            _params["p"] = p
+        dists = pairwise_distances(P, X[unproc], metric, n_jobs=None, **_params).ravel()
+
+    if issparse(dists):
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", category=SparseEfficiencyWarning)
+            rdists = dists.maximum(core_distances_[point_index])
+        np.around(
+            rdists.data, decimals=np.finfo(rdists.dtype).precision, out=rdists.data
+        )
+        rdists = np.array(rdists.todense())[0]
+    else:
+        rdists = np.maximum(dists, core_distances_[point_index])
+        np.around(rdists, decimals=np.finfo(rdists.dtype).precision, out=rdists)
     improved = np.where(rdists < np.take(reachability_, unproc))
     reachability_[unproc[improved]] = rdists[improved]
     predecessor_[unproc[improved]] = point_index

From 91fd5edbe7f4fe52485b0811b2322d76c1cb38d7 Mon Sep 17 00:00:00 2001
From: Clicked <clickedbigfoot@gmail.com>
Date: Sat, 21 Aug 2021 16:37:52 -0500
Subject: [PATCH 14/15] Add sparse matrix support to _optics.py

The original commit was pushed but, for some reason, was not reflected on GitHub.
---
 sklearn/cluster/_optics.py | 53 ++++++++++++++++++++++++++------------
 1 file changed, 36 insertions(+), 17 deletions(-)

diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py
index bef57b3aa4dfe..73b028b64ac14 100755
--- a/sklearn/cluster/_optics.py
+++ b/sklearn/cluster/_optics.py
@@ -14,13 +14,14 @@
 import warnings
 import numpy as np
 
-from ..exceptions import DataConversionWarning
+from ..exceptions import DataConversionWarning, EfficiencyWarning
 from ..metrics.pairwise import PAIRWISE_BOOLEAN_FUNCTIONS
 from ..utils import gen_batches, get_chunk_n_rows
 from ..utils.validation import check_memory
 from ..neighbors import NearestNeighbors
 from ..base import BaseEstimator, ClusterMixin
 from ..metrics import pairwise_distances
+from scipy.sparse import issparse, SparseEfficiencyWarning
 
 
 class OPTICS(ClusterMixin, BaseEstimator):
@@ -280,7 +281,12 @@ def fit(self, X, y=None):
             )
             warnings.warn(msg, DataConversionWarning)
 
-        X = self._validate_data(X, dtype=dtype, accept_sparse='csr')
+        X = self._validate_data(X, dtype=dtype, accept_sparse="csr")
+        if self.metric == "precomputed" and issparse(X):
+            # Set each diagonal to an explicit value so each point is its own neighbor
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", category=SparseEfficiencyWarning)
+                X.setdiag(X.diagonal())
         memory = check_memory(self.memory)
 
         if self.cluster_method not in ["dbscan", "xi"]:
@@ -518,13 +524,16 @@ def compute_optics_graph(
         n_jobs=n_jobs,
     )
 
-    nbrs.fit(X)
-    # Here we first do a kNN query for each point, this differs from
-    # the original OPTICS that only used epsilon range queries.
-    # TODO: handle working_memory somehow?
-    core_distances_ = _compute_core_distances_(
-        X=X, neighbors=nbrs, min_samples=min_samples, working_memory=None
-    )
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=EfficiencyWarning)
+        # Efficiency warning appears when using sparse precomputed matrices
+        nbrs.fit(X)
+        # Here we first do a kNN query for each point, this differs from
+        # the original OPTICS that only used epsilon range queries.
+        # TODO: handle working_memory somehow?
+        core_distances_ = _compute_core_distances_(
+            X=X, neighbors=nbrs, min_samples=min_samples, working_memory=None
+        )
     # OPTICS puts an upper limit on these, use inf for undefined.
     core_distances_[core_distances_ > max_eps] = np.inf
     np.around(
@@ -587,7 +596,10 @@ def _set_reach_dist(
     # Assume that radius_neighbors is faster without distances
     # and we don't need all distances, nevertheless, this means
     # we may be doing some work twice.
-    indices = nbrs.radius_neighbors(P, radius=max_eps, return_distance=False)[0]
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=EfficiencyWarning)
+        # Efficiency warning appears when using sparse precomputed matrices
+        indices = nbrs.radius_neighbors(P, radius=max_eps, return_distance=False)[0]
 
     # Getting indices of neighbors that have not been processed
     unproc = np.compress(~np.take(processed, indices), indices)
@@ -603,13 +615,20 @@ def _set_reach_dist(
         if metric == "minkowski" and "p" not in _params:
             # the same logic as neighbors, p is ignored if explicitly set
             # in the dict params
-            _params['p'] = p
-        dists = pairwise_distances(P, X[unproc],
-                                   metric, n_jobs=None,
-                                   **_params).ravel()
-
-    rdists = np.maximum(dists, core_distances_[point_index])
-    np.around(rdists, decimals=np.finfo(rdists.dtype).precision, out=rdists)
+            _params["p"] = p
+        dists = pairwise_distances(P, X[unproc], metric, n_jobs=None, **_params).ravel()
+
+    if issparse(dists):
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", category=SparseEfficiencyWarning)
+            rdists = dists.maximum(core_distances_[point_index])
+        np.around(
+            rdists.data, decimals=np.finfo(rdists.dtype).precision, out=rdists.data
+        )
+        rdists = np.array(rdists.todense())[0]
+    else:
+        rdists = np.maximum(dists, core_distances_[point_index])
+        np.around(rdists, decimals=np.finfo(rdists.dtype).precision, out=rdists)
     improved = np.where(rdists < np.take(reachability_, unproc))
     reachability_[unproc[improved]] = rdists[improved]
     predecessor_[unproc[improved]] = point_index

From 53474d2438e81bbe764af24abcefaa88d71d934a Mon Sep 17 00:00:00 2001
From: Clickedbigfoot <clickedbigfoot@gmail.com>
Date: Sat, 21 Aug 2021 17:49:19 -0500
Subject: [PATCH 15/15] Added changelog entry

---
 doc/whats_new/v1.0.rst | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 7d8175a3b5046..5dea3b5eb06e9 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -210,6 +210,11 @@ Changelog
   of connected components is greater than 1. :pr:`20597` by
   `Thomas Fan`_.
 
+- |Enhancement| The `fit` and `fit_predict` methods of
+  :class:`cluster.OPTICS` now accept sparse matrices as input
+  data.
+  :pr:`20802` by :user:`Brandon Pokorny <Clickedbigfoot>`.
+
 :mod:`sklearn.compose`
 ......................