Skip to content

[MRG] Add sparse input and sparse precomputed distances to sklearn/cluster/_optics.py #20802

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 19 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions doc/whats_new/v1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,16 @@ Changelog
- |API| :func:`cluster.spectral_clustering` raises an improved error when passed
a `np.matrix`. :pr:`20560` by `Thomas Fan`_.

- |Fix| :class:`cluster.AgglomerativeClustering` correctly connects components
when connectivity and affinity are both precomputed and the number
of connected components is greater than 1. :pr:`20597` by
`Thomas Fan`_.

- |Enhancement| The `predict` and `fit_predict` methods of
:class:`cluster.OPTICS` now accept sparse input data.
:pr:`20802` by :user:`Brandon Pokorny <Clickedbigfoot>`.

:mod:`sklearn.compose`
......................

Expand Down
56 changes: 38 additions & 18 deletions sklearn/cluster/_optics.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,14 @@
import warnings
import numpy as np

from ..exceptions import DataConversionWarning
from ..exceptions import DataConversionWarning, EfficiencyWarning
from ..metrics.pairwise import PAIRWISE_BOOLEAN_FUNCTIONS
from ..utils import gen_batches, get_chunk_n_rows
from ..utils.validation import check_memory
from ..neighbors import NearestNeighbors
from ..base import BaseEstimator, ClusterMixin
from ..metrics import pairwise_distances
from scipy.sparse import issparse, SparseEfficiencyWarning


class OPTICS(ClusterMixin, BaseEstimator):
Expand Down Expand Up @@ -263,10 +264,11 @@ def fit(self, X, y=None):

Parameters
----------
X : ndarray of shape (n_samples, n_features), or \
X : {ndarray, sparse matrix} of shape (n_samples, n_features), or \
(n_samples, n_samples) if metric='precomputed'
A feature array, or array of distances between samples if
metric='precomputed'.
metric='precomputed'. If a sparse matrix is provided, it will be
converted into a sparse ``csr_matrix``.

y : ignored
Ignored.
Expand All @@ -285,7 +287,12 @@ def fit(self, X, y=None):
)
warnings.warn(msg, DataConversionWarning)

X = self._validate_data(X, dtype=dtype)
X = self._validate_data(X, dtype=dtype, accept_sparse="csr")
if self.metric == "precomputed" and issparse(X):
# Set each diagonal to an explicit value so each point is its own neighbor
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=SparseEfficiencyWarning)
X.setdiag(X.diagonal())
memory = check_memory(self.memory)

if self.cluster_method not in ["dbscan", "xi"]:
Expand Down Expand Up @@ -523,13 +530,16 @@ def compute_optics_graph(
n_jobs=n_jobs,
)

nbrs.fit(X)
# Here we first do a kNN query for each point, this differs from
# the original OPTICS that only used epsilon range queries.
# TODO: handle working_memory somehow?
core_distances_ = _compute_core_distances_(
X=X, neighbors=nbrs, min_samples=min_samples, working_memory=None
)
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=EfficiencyWarning)
# Efficiency warning appears when using sparse precomputed matrices
nbrs.fit(X)
# Here we first do a kNN query for each point, this differs from
# the original OPTICS that only used epsilon range queries.
# TODO: handle working_memory somehow?
core_distances_ = _compute_core_distances_(
X=X, neighbors=nbrs, min_samples=min_samples, working_memory=None
)
# OPTICS puts an upper limit on these, use inf for undefined.
core_distances_[core_distances_ > max_eps] = np.inf
np.around(
Expand Down Expand Up @@ -592,7 +602,10 @@ def _set_reach_dist(
# Assume that radius_neighbors is faster without distances
# and we don't need all distances, nevertheless, this means
# we may be doing some work twice.
indices = nbrs.radius_neighbors(P, radius=max_eps, return_distance=False)[0]
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=EfficiencyWarning)
# Efficiency warning appears when using sparse precomputed matrices
indices = nbrs.radius_neighbors(P, radius=max_eps, return_distance=False)[0]

# Getting indices of neighbors that have not been processed
unproc = np.compress(~np.take(processed, indices), indices)
Expand All @@ -609,12 +622,19 @@ def _set_reach_dist(
# the same logic as neighbors, p is ignored if explicitly set
# in the dict params
_params["p"] = p
dists = pairwise_distances(
P, np.take(X, unproc, axis=0), metric=metric, n_jobs=None, **_params
).ravel()

rdists = np.maximum(dists, core_distances_[point_index])
np.around(rdists, decimals=np.finfo(rdists.dtype).precision, out=rdists)
dists = pairwise_distances(P, X[unproc], metric, n_jobs=None, **_params).ravel()

if issparse(dists):
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=SparseEfficiencyWarning)
rdists = dists.maximum(core_distances_[point_index])
np.around(
rdists.data, decimals=np.finfo(rdists.dtype).precision, out=rdists.data
)
rdists = np.array(rdists.todense())[0]
else:
rdists = np.maximum(dists, core_distances_[point_index])
np.around(rdists, decimals=np.finfo(rdists.dtype).precision, out=rdists)
improved = np.where(rdists < np.take(reachability_, unproc))
reachability_[unproc[improved]] = rdists[improved]
predecessor_[unproc[improved]] = point_index
Expand Down
Loading