Skip to content

[MRG] Make OPTICS more memory efficient when calling kneighbors #12103

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Sep 22, 2018
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 38 additions & 5 deletions sklearn/cluster/optics_.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import numpy as np

from ..utils import check_array
from ..utils import gen_batches, get_chunk_n_rows
from ..utils.validation import check_is_fitted
from ..neighbors import NearestNeighbors
from ..base import BaseEstimator, ClusterMixin
Expand Down Expand Up @@ -395,8 +396,6 @@ def fit(self, X, y=None):
# Start all points as 'unprocessed' ##
self.reachability_ = np.empty(n_samples)
self.reachability_.fill(np.inf)
self.core_distances_ = np.empty(n_samples)
self.core_distances_.fill(np.nan)
# Start all points as noise ##
self.labels_ = np.full(n_samples, -1, dtype=int)

Expand All @@ -407,9 +406,7 @@ def fit(self, X, y=None):
n_jobs=self.n_jobs)

nbrs.fit(X)
self.core_distances_[:] = nbrs.kneighbors(X,
self.min_samples)[0][:, -1]

self.core_distances_ = self._compute_core_distances_(X, nbrs)
self.ordering_ = self._calculate_optics_order(X, nbrs)

indices_, self.labels_ = _extract_optics(self.ordering_,
Expand All @@ -425,6 +422,42 @@ def fit(self, X, y=None):

# OPTICS helper functions

def _compute_core_distances_(self, X, neighbors, working_memory=None):
"""Compute the k-th nearest neighbor of each sample

Equivalent to neighbors.kneighbors(X, self.min_samples)[0][:, -1]
but with more memory efficiency.

Parameters
----------
X : array, shape (n_samples, n_features)
The data.
neighbors : NearestNeighbors instance
The fitted nearest neighbors estimator.
working_memory : int, optional
The sought maximum memory for temporary distance matrix chunks.
When None (default), the value of
``sklearn.get_config()['working_memory']`` is used.

Returns
-------
core_distances : array, shape (n_samples,)
Distance at which each sample becomes a core point.
Points which will never be core have a distance of inf.
"""
n_samples = len(X)
core_distances = np.empty(n_samples)
core_distances.fill(np.nan)

chunk_n_rows = get_chunk_n_rows(row_bytes=16 * self.min_samples,
max_n_rows=n_samples,
working_memory=working_memory)
slices = gen_batches(n_samples, chunk_n_rows)
for sl in slices:
core_distances[sl] = neighbors.kneighbors(
X[sl], self.min_samples)[0][:, -1]
return core_distances

def _calculate_optics_order(self, X, nbrs):
# Main OPTICS loop. Not parallelizable. The order that entries are
# written to the 'ordering_' list is important!
Expand Down