@@ -41,25 +41,38 @@ def dbscan(
41
41
):
42
42
"""Perform DBSCAN clustering from vector array or distance matrix.
43
43
44
+ This function is a wrapper around :class:`~cluster.DBSCAN`, suitable for
45
+ quick, standalone clustering tasks. For estimator-based workflows, where
46
+ estimator attributes or pipeline integration is required, prefer
47
+ :class:`~cluster.DBSCAN`.
48
+
49
+ DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is a
50
+ density-based clustering algorithm that groups together points that are
51
+ closely packed while marking points in low-density regions as outliers.
52
+
44
53
Read more in the :ref:`User Guide <dbscan>`.
45
54
46
55
Parameters
47
56
----------
48
- X : {array-like, sparse (CSR) matrix} of shape (n_samples, n_features) or \
57
+ X : {array-like, scipy sparse matrix} of shape (n_samples, n_features) or \
49
58
(n_samples, n_samples)
50
59
A feature array, or array of distances between samples if
51
- ``metric='precomputed'``.
60
+ ``metric='precomputed'``. When using precomputed distances, X must
61
+ be a square symmetric matrix.
52
62
53
63
eps : float, default=0.5
54
64
The maximum distance between two samples for one to be considered
55
65
as in the neighborhood of the other. This is not a maximum bound
56
66
on the distances of points within a cluster. This is the most
57
67
important DBSCAN parameter to choose appropriately for your data set
58
- and distance function.
68
+ and distance function. Smaller values result in more clusters,
69
+ while larger values result in fewer, larger clusters.
59
70
60
71
min_samples : int, default=5
61
72
The number of samples (or total weight) in a neighborhood for a point
62
73
to be considered as a core point. This includes the point itself.
74
+ Higher values yield fewer, denser clusters, while lower values yield
75
+ more, sparser clusters.
63
76
64
77
metric : str or callable, default='minkowski'
65
78
The metric to use when calculating distance between instances in a
@@ -79,17 +92,23 @@ def dbscan(
79
92
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
80
93
The algorithm to be used by the NearestNeighbors module
81
94
to compute pointwise distances and find nearest neighbors.
82
- See NearestNeighbors module documentation for details.
95
+ 'auto' will attempt to decide the most appropriate algorithm
96
+ based on the values passed to the :meth:`fit` method.
97
+ See :class:`~sklearn.neighbors.NearestNeighbors` documentation for
98
+ details.
83
99
84
100
leaf_size : int, default=30
85
101
Leaf size passed to BallTree or cKDTree. This can affect the speed
86
102
of the construction and query, as well as the memory required
87
103
to store the tree. The optimal value depends
88
- on the nature of the problem.
104
+ on the nature of the problem. Generally, smaller leaf sizes
105
+ lead to faster queries but slower construction.
89
106
90
107
p : float, default=2
91
- The power of the Minkowski metric to be used to calculate distance
92
- between points.
108
+ Power parameter for the Minkowski metric. When p = 1, this is equivalent
109
+ to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2.
110
+ For arbitrary p, minkowski_distance (l_p) is used. This parameter is expected
111
+ to be positive.
93
112
94
113
sample_weight : array-like of shape (n_samples,), default=None
95
114
Weight of each sample, such that a sample with a weight of at least
@@ -101,7 +120,7 @@ def dbscan(
101
120
The number of parallel jobs to run for neighbors search. ``None`` means
102
121
1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means
103
122
using all processors. See :term:`Glossary <n_jobs>` for more details.
104
- If precomputed distance are used, parallel execution is not available
123
+ If precomputed distances are used, parallel execution is not available
105
124
and thus n_jobs will have no effect.
106
125
107
126
Returns
@@ -110,7 +129,8 @@ def dbscan(
110
129
Indices of core samples.
111
130
112
131
labels : ndarray of shape (n_samples,)
113
- Cluster labels for each point. Noisy samples are given the label -1.
132
+ Cluster labels for each point. Noisy samples are given the label -1.
133
+ Non-negative integers indicate cluster membership.
114
134
115
135
See Also
116
136
--------
@@ -183,7 +203,11 @@ class DBSCAN(ClusterMixin, BaseEstimator):
183
203
184
204
DBSCAN - Density-Based Spatial Clustering of Applications with Noise.
185
205
Finds core samples of high density and expands clusters from them.
186
- Good for data which contains clusters of similar density.
206
+ This algorithm is particularly good for data which contains clusters of
207
+ similar density and can find clusters of arbitrary shape.
208
+
209
+ Unlike K-means, DBSCAN does not require specifying the number of clusters
210
+ in advance and can identify outliers as noise points.
187
211
188
212
This implementation has a worst case memory complexity of :math:`O({n}^2)`,
189
213
which can occur when the `eps` param is large and `min_samples` is low,
@@ -199,7 +223,7 @@ class DBSCAN(ClusterMixin, BaseEstimator):
199
223
as in the neighborhood of the other. This is not a maximum bound
200
224
on the distances of points within a cluster. This is the most
201
225
important DBSCAN parameter to choose appropriately for your data set
202
- and distance function.
226
+ and distance function. Smaller values generally lead to more clusters.
203
227
204
228
min_samples : int, default=5
205
229
The number of samples (or total weight) in a neighborhood for a point to
@@ -228,7 +252,10 @@ class DBSCAN(ClusterMixin, BaseEstimator):
228
252
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
229
253
The algorithm to be used by the NearestNeighbors module
230
254
to compute pointwise distances and find nearest neighbors.
231
- See NearestNeighbors module documentation for details.
255
+ 'auto' will attempt to decide the most appropriate algorithm
256
+ based on the values passed to the :meth:`fit` method.
257
+ See :class:`~sklearn.neighbors.NearestNeighbors` documentation for
258
+ details.
232
259
233
260
leaf_size : int, default=30
234
261
Leaf size passed to BallTree or cKDTree. This can affect the speed
@@ -239,7 +266,7 @@ class DBSCAN(ClusterMixin, BaseEstimator):
239
266
p : float, default=None
240
267
The power of the Minkowski metric to be used to calculate distance
241
268
between points. If None, then ``p=2`` (equivalent to the Euclidean
242
- distance).
269
+ distance). When p=1, this is equivalent to Manhattan distance.
243
270
244
271
n_jobs : int, default=None
245
272
The number of parallel jobs to run.
@@ -255,9 +282,10 @@ class DBSCAN(ClusterMixin, BaseEstimator):
255
282
components_ : ndarray of shape (n_core_samples, n_features)
256
283
Copy of each core sample found by training.
257
284
258
- labels_ : ndarray of shape (n_samples)
285
+ labels_ : ndarray of shape (n_samples,)
259
286
Cluster labels for each point in the dataset given to fit().
260
- Noisy samples are given the label -1.
287
+ Noisy samples are given the label -1. Non-negative integers
288
+ indicate cluster membership.
261
289
262
290
n_features_in_ : int
263
291
Number of features seen during :term:`fit`.
@@ -448,6 +476,9 @@ def fit(self, X, y=None, sample_weight=None):
448
476
def fit_predict(self, X, y=None, sample_weight=None):
449
477
"""Compute clusters from a data or distance matrix and predict labels.
450
478
479
+ This method fits the model and returns the cluster labels in a single step.
480
+ It is equivalent to calling ``fit(X, sample_weight=sample_weight)`` and returning ``labels_``.
481
+
451
482
Parameters
452
483
----------
453
484
X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
@@ -469,6 +500,7 @@ def fit_predict(self, X, y=None, sample_weight=None):
469
500
-------
470
501
labels : ndarray of shape (n_samples,)
471
502
Cluster labels. Noisy samples are given the label -1.
503
+ Non-negative integers indicate cluster membership.
472
504
"""
473
505
self.fit(X, sample_weight=sample_weight)
474
506
return self.labels_
0 commit comments