scikit-learn · adrinjalali · Sep 10, 2019 · Sep 10, 2019 · Sep 10, 2019 · Sep 10, 2019
diff --git a/sklearn/__init__.py b/sklearn/__init__.py
@@ -15,10 +15,13 @@
 import sys
 import logging
 import os
+import warnings
+from .exceptions import BadDefaultWarning
 
 from ._config import get_config, set_config, config_context
 
 logger = logging.getLogger(__name__)
+warnings.filterwarnings("once", category=BadDefaultWarning)
 
 
 # PEP0440 compatible formatted version, see:

diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py
@@ -16,6 +16,7 @@
 from ..base import BaseEstimator, ClusterMixin
 from ..utils.validation import _check_sample_weight, _deprecate_positional_args
 from ..neighbors import NearestNeighbors
+from ..utils.validation import _validate_bad_defaults
 
 from ._dbscan_inner import dbscan_inner
 
@@ -42,6 +43,9 @@ def dbscan(X, eps=0.5, *, min_samples=5, metric='minkowski',
         important DBSCAN parameter to choose appropriately for your data set
         and distance function.
 
+        Note that there is no good default value for this parameter. An
+        optimal value depends on the data at hand as well as the metric used.
+
     min_samples : int, default=5
         The number of samples (or total weight) in a neighborhood for a point
         to be considered as a core point. This includes the point itself.
@@ -165,6 +169,11 @@ class DBSCAN(ClusterMixin, BaseEstimator):
         important DBSCAN parameter to choose appropriately for your data set
         and distance function.
 
+        Note that there is no good default value for this parameter. An
+        optimal value depends on the data at hand as well as the metric used.
+        If not specified, a warning is raised and the default value of 0.5 is
+        used.
+
     min_samples : int, default=5
         The number of samples (or total weight) in a neighborhood for a point
         to be considered as a core point. This includes the point itself.
@@ -271,8 +280,11 @@ class DBSCAN(ClusterMixin, BaseEstimator):
     DBSCAN revisited, revisited: why and how you should (still) use DBSCAN.
     ACM Transactions on Database Systems (TODS), 42(3), 19.
     """
+
+    _bad_defaults = {'eps': 0.5}
+
     @_deprecate_positional_args
-    def __init__(self, eps=0.5, *, min_samples=5, metric='euclidean',
+    def __init__(self, eps='warn', *, min_samples=5, metric='euclidean',
                  metric_params=None, algorithm='auto', leaf_size=30, p=None,
                  n_jobs=None):
         self.eps = eps
@@ -310,8 +322,9 @@ def fit(self, X, y=None, sample_weight=None):
 
         """
         X = self._validate_data(X, accept_sparse='csr')
+        eps = _validate_bad_defaults(self)['eps']
 
-        if not self.eps > 0.0:
+        if not eps > 0.0:
             raise ValueError("eps must be positive.")
 
         if sample_weight is not None:
@@ -328,7 +341,7 @@ def fit(self, X, y=None, sample_weight=None):
                 X.setdiag(X.diagonal())  # XXX: modifies X's internals in-place
 
         neighbors_model = NearestNeighbors(
-            radius=self.eps, algorithm=self.algorithm,
+            radius=eps, algorithm=self.algorithm,
             leaf_size=self.leaf_size, metric=self.metric,
             metric_params=self.metric_params, p=self.p, n_jobs=self.n_jobs)
         neighbors_model.fit(X)

diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py
@@ -13,7 +13,8 @@
            'NonBLASDotWarning',
            'SkipTestWarning',
            'UndefinedMetricWarning',
-           'PositiveSpectrumWarning']
+           'PositiveSpectrumWarning',
+           'BadDefaultWarning']
 
 
 class NotFittedError(ValueError, AttributeError):
@@ -147,3 +148,14 @@ class PositiveSpectrumWarning(UserWarning):
 
     .. versionadded:: 0.22
     """
+
+
+class BadDefaultWarning(UserWarning):
+    """Warning raised for unspecified parameters with no good default.
+
+    This warning is typically raised by _validate_bad_defaults when the user
+    does not specify a value for a parameter with no good default value. An
+    example is the ``eps`` in :class:`cluster.DBSCAN`.
+
+    .. versionadded:: 0.24
+    """
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
@@ -12,6 +12,7 @@
 import numpy as np
 import scipy.sparse as sp
 
+from sklearn.base import BaseEstimator
 from sklearn.utils._testing import assert_no_warnings
 from sklearn.utils._testing import ignore_warnings
 from sklearn.utils._testing import SkipTest
@@ -43,12 +44,14 @@
     _deprecate_positional_args,
     _check_sample_weight,
     _allclose_dense_sparse,
+    _validate_bad_defaults,
     FLOAT_DTYPES)
 from sklearn.utils.validation import _check_fit_params
 
 import sklearn
 
 from sklearn.exceptions import NotFittedError, PositiveSpectrumWarning
+from sklearn.exceptions import BadDefaultWarning
 
 from sklearn.utils._testing import TempMemmap
 
@@ -1107,6 +1110,54 @@ def test_allclose_dense_sparse_raise(toarray):
         _allclose_dense_sparse(x, y)
 
 
+def test_validate_bad_params():
+    msg1 = ("There is no good default value for the following parameters in "
+            "A. Please consult the documentation on how to set them for your "
+            "data."
+            "\n    'param_a' - using default value: 1"
+            "\n    'param_b' - using default value: 'kmeans'")
+    msg2 = ("There is no good default value for the following parameters in "
+            "A. Please consult the documentation on how to set them for your "
+            "data."
+            "\n    'param_b' - using default value: 'kmeans'")
+
+    class A(BaseEstimator):
+        # param_c also defaults to 'warn' but is not listed in _bad_defaults,
+        # so _validate_bad_defaults must not report it.
+        _bad_defaults = {'param_a': 1, 'param_b': 'kmeans'}
+
+        def __init__(self, param_a='warn', param_b='warn', param_c='warn',
+                     param_d=0):
+            self.param_a = param_a
+            self.param_b = param_b
+            self.param_c = param_c
+            self.param_d = param_d
+
+        def fit(self, X=None, y=None):
+            _validate_bad_defaults(self)
+            return self
+
+    with pytest.warns(BadDefaultWarning, match=msg1):
+        A().fit()
+
+    # the identical message should not be emitted a second time
+    with warnings.catch_warnings(record=True) as warns:
+        A().fit()
+    assert not warns
+
+    with pytest.warns(BadDefaultWarning, match=msg2):
+        A(param_a=1).fit()
+
+    # nor should the param_b-only message be emitted a second time
+    with warnings.catch_warnings(record=True) as warns:
+        A(param_a=1).fit()
+    assert not warns
+
+    with warnings.catch_warnings(record=True) as warns:
+        A(param_a=1, param_b='dbscan').fit()  # nothing left at 'warn'
+    assert not warns
+
+
 def test_deprecate_positional_args_warns_for_function():
 
     @_deprecate_positional_args

diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
@@ -29,6 +29,7 @@
 from ..exceptions import NonBLASDotWarning, PositiveSpectrumWarning
 from ..exceptions import NotFittedError
 from ..exceptions import DataConversionWarning
+from ..exceptions import BadDefaultWarning
 
 FLOAT_DTYPES = (np.float64, np.float32, np.float16)
 
@@ -1342,6 +1343,27 @@ def _allclose_dense_sparse(x, y, rtol=1e-7, atol=1e-9):
                      "matrix and an array")
 
 
+def _validate_bad_defaults(obj):
+    """Warn about parameters left at ``'warn'`` and resolve their values.
+
+    Return ``obj.get_params()`` (always a dict, even without
+    ``_bad_defaults``) with each flagged entry set to its fallback value.
+    """
+    defaults = getattr(obj, "_bad_defaults", {})
+    bad_params = sorted(
+        name for name in defaults if isinstance(getattr(obj, name), str)
+        and getattr(obj, name) == 'warn')  # str guard: arrays never match
+    if bad_params:
+        msg = ("There is no good default value for the following "
+               "parameters in {}. Please consult the documentation "
+               "on how to set them for your data.\n    ".format(
+                   obj.__class__.__name__))
+        msg += '\n    '.join(["'{}' - using default value: {!r}".format(
+            name, defaults[name]) for name in bad_params])
+        warnings.warn(msg, BadDefaultWarning)
+    return {**obj.get_params(), **{p: defaults[p] for p in bad_params}}
+
+
 def _check_fit_params(X, fit_params, indices=None):
     """Check and validate the parameters passed during `fit`.