From f35305eda88709f706d6101d862b795f1ecb0b48 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 3 Sep 2018 15:27:02 +0200 Subject: [PATCH] change max_bound -> max_eps --- doc/modules/clustering.rst | 10 +++---- sklearn/cluster/optics_.py | 40 ++++++++++++++-------------- sklearn/cluster/tests/test_optics.py | 10 +++---- 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 968a66e67fdcf..1f8210f35ffb4 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -838,9 +838,9 @@ algorithm builds a *reachability* graph, which assigns each sample both a ``reachability_`` distance, and a spot within the cluster ``ordering_`` attribute; these two attributes are assigned when the model is fitted, and are used to determine cluster membership. If OPTICS is run with the default value -of *inf* set for ``max_bound``, then DBSCAN style cluster extraction can be +of *inf* set for ``max_eps``, then DBSCAN style cluster extraction can be performed in linear time for any given ``eps`` value using the -``extract_dbscan`` method. Setting ``max_bound`` to a lower value will result +``extract_dbscan`` method. Setting ``max_eps`` to a lower value will result in shorter run times, and can be thought of as the maximum cluster object size (in diameter) that OPTICS will be able to extract. @@ -892,10 +892,10 @@ larger parent cluster. shorter run time than OPTICS; however, for repeated runs at varying ``eps`` values, a single run of OPTICS may require less cumulative runtime than DBSCAN. It is also important to note that OPTICS output can be unstable at - ``eps`` values very close to the initial ``max_bound`` value. OPTICS seems + ``eps`` values very close to the initial ``max_eps`` value. OPTICS seems to produce near identical results to DBSCAN provided that ``eps`` passed to ``extract_dbscan`` is a half order of magnitude less than the inital - ``max_bound`` that was used to fit; using a value close to ``max_bound`` + ``max_eps`` that was used to fit; using a value close to ``max_eps`` will throw a warning, and using a value larger will result in an exception. .. topic:: Computational Complexity @@ -909,7 +909,7 @@ larger parent cluster. multithreaded, and has better algorithmic runtime complexity than OPTICS-- at the cost of worse memory scaling. For extremely large datasets that exhaust system memory using HDBSCAN, OPTICS will maintain *n* (as opposed - to *n^2* memory scaling); however, tuning of the ``max_bound`` parameter + to *n^2* memory scaling); however, tuning of the ``max_eps`` parameter will likely need to be used to give a solution in a reasonable amount of wall time. diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index e10a92a7590e6..bc0fe5bfe7ceb 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -21,7 +21,7 @@ from ._optics_inner import quick_scan -def optics(X, min_samples=5, max_bound=np.inf, metric='euclidean', +def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean', p=2, metric_params=None, maxima_ratio=.75, rejection_ratio=.7, similarity_threshold=0.4, significant_min=.003, min_cluster_size_ratio=.005, @@ -45,11 +45,11 @@ def optics(X, min_samples=5, max_bound=np.inf, metric='euclidean', The number of samples in a neighborhood for a point to be considered as a core point. - max_bound : float, optional + max_eps : float, optional The maximum distance between two samples for them to be considered as in the same neighborhood. This is also the largest object size expected within the dataset. Default value of "np.inf" will identify - clusters across all scales; reducing `max_bound` will result in + clusters across all scales; reducing `max_eps` will result in shorter run times. metric : string or callable, optional @@ -147,7 +147,7 @@ def optics(X, min_samples=5, max_bound=np.inf, metric='euclidean', Record 28, no. 2 (1999): 49-60. """ - clust = OPTICS(min_samples, max_bound, metric, p, metric_params, + clust = OPTICS(min_samples, max_eps, metric, p, metric_params, maxima_ratio, rejection_ratio, similarity_threshold, significant_min, min_cluster_size_ratio, min_maxima_ratio, @@ -172,11 +172,11 @@ class OPTICS(BaseEstimator, ClusterMixin): The number of samples in a neighborhood for a point to be considered as a core point. - max_bound : float, optional + max_eps : float, optional The maximum distance between two samples for them to be considered as in the same neighborhood. This is also the largest object size expected within the dataset. Default value of "np.inf" will identify - clusters across all scales; reducing `max_bound` will result in + clusters across all scales; reducing `max_eps` will result in shorter run times. metric : string or callable, optional @@ -284,14 +284,14 @@ class OPTICS(BaseEstimator, ClusterMixin): Record 28, no. 2 (1999): 49-60. """ - def __init__(self, min_samples=5, max_bound=np.inf, metric='euclidean', + def __init__(self, min_samples=5, max_eps=np.inf, metric='euclidean', p=2, metric_params=None, maxima_ratio=.75, rejection_ratio=.7, similarity_threshold=0.4, significant_min=.003, min_cluster_size_ratio=.005, min_maxima_ratio=0.001, algorithm='ball_tree', leaf_size=30, n_jobs=None): - self.max_bound = max_bound + self.max_eps = max_eps self.min_samples = min_samples self.maxima_ratio = maxima_ratio self.rejection_ratio = rejection_ratio @@ -310,7 +310,7 @@ def fit(self, X, y=None): """Perform OPTICS clustering Extracts an ordered list of points and reachability distances, and - performs initial clustering using `max_bound` distance specified at + performs initial clustering using `max_eps` distance specified at OPTICS object instantiation. Parameters @@ -378,7 +378,7 @@ def fit(self, X, y=None): def _expand_cluster_order(self, point, X, nbrs): # As above, not parallelizable. Parallelizing would allow items in # the 'unprocessed' list to switch to 'processed' - if self.core_distances_[point] <= self.max_bound: + if self.core_distances_[point] <= self.max_eps: while not self._processed[point]: self._processed[point] = True self.ordering_.append(point) @@ -389,7 +389,7 @@ def _expand_cluster_order(self, point, X, nbrs): def _set_reach_dist(self, point_index, X, nbrs): P = np.array(X[point_index]).reshape(1, -1) - indices = nbrs.radius_neighbors(P, radius=self.max_bound, + indices = nbrs.radius_neighbors(P, radius=self.max_eps, return_distance=False)[0] # Getting indices of neighbors that have not been processed @@ -416,17 +416,17 @@ def _set_reach_dist(self, point_index, X, nbrs): def extract_dbscan(self, eps): """Performs DBSCAN extraction for an arbitrary epsilon. - Extraction runs in linear time. Note that if the `max_bound` OPTICS + Extraction runs in linear time. Note that if the `max_eps` OPTICS parameter was set to < inf for extracting reachability and ordering arrays, DBSCAN extractions will be unstable for `eps` values close to - `max_bound`. Setting `eps` < (`max_bound` / 5.0) will guarantee + `max_eps`. Setting `eps` < (`max_eps` / 5.0) will guarantee extraction parity with DBSCAN. Parameters ---------- eps : float or int, required - DBSCAN `eps` parameter. Must be set to < `max_bound`. Equivalence - with DBSCAN algorithm is achieved if `eps` is < (`max_bound` / 5) + DBSCAN `eps` parameter. Must be set to < `max_eps`. Equivalence + with DBSCAN algorithm is achieved if `eps` is < (`max_eps` / 5) Returns ------- @@ -438,14 +438,14 @@ def extract_dbscan(self, eps): """ check_is_fitted(self, 'reachability_') - if eps > self.max_bound: + if eps > self.max_eps: raise ValueError('Specify an epsilon smaller than %s. Got %s.' - % (self.max_bound, eps)) + % (self.max_eps, eps)) - if eps * 5.0 > (self.max_bound * 1.05): + if eps * 5.0 > (self.max_eps * 1.05): warnings.warn( - "Warning, max_bound (%s) is close to eps (%s): " - "Output may be unstable." % (self.max_bound, eps), + "Warning, max_eps (%s) is close to eps (%s): " + "Output may be unstable." % (self.max_eps, eps), RuntimeWarning, stacklevel=2) # Stability warning is documented in _extract_dbscan method... diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 2116e75bf4a54..5a89cb7a0c439 100755 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -27,7 +27,7 @@ def test_correct_number_of_clusters(): X = generate_clustered_data(n_clusters=n_clusters) # Parameters chosen specifically for this task. # Compute OPTICS - clust = OPTICS(max_bound=5.0 * 6.0, min_samples=4, metric='euclidean') + clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, metric='euclidean') clust.fit(X) # number of clusters, ignoring noise if present n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_) @@ -41,7 +41,7 @@ def test_minimum_number_of_sample_check(): # Compute OPTICS X = [[1, 1]] - clust = OPTICS(max_bound=5.0 * 0.3, min_samples=10) + clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10) # Run the fit assert_raise_message(ValueError, msg, clust.fit, X) @@ -51,7 +51,7 @@ def test_empty_extract(): # Test extract where fit() has not yet been run. msg = ("This OPTICS instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method.") - clust = OPTICS(max_bound=5.0 * 0.3, min_samples=10) + clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10) assert_raise_message(ValueError, msg, clust.extract_dbscan, 0.01) @@ -63,7 +63,7 @@ def test_bad_extract(): cluster_std=0.4, random_state=0) # Compute OPTICS - clust = OPTICS(max_bound=5.0 * 0.003, min_samples=10) + clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10) clust2 = clust.fit(X) assert_raise_message(ValueError, msg, clust2.extract_dbscan, 0.3) @@ -76,7 +76,7 @@ def test_close_extract(): cluster_std=0.4, random_state=0) # Compute OPTICS - clust = OPTICS(max_bound=1.0, min_samples=10) + clust = OPTICS(max_eps=1.0, min_samples=10) clust3 = clust.fit(X) # check warning when centers are passed assert_warns(RuntimeWarning, clust3.extract_dbscan, .3)