From 47f78471867fc64d7306924d770f5542fbba3dfd Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 10 Sep 2019 13:38:04 +0200 Subject: [PATCH 1/6] bad default value --- sklearn/cluster/dbscan_.py | 12 +++++++- sklearn/utils/tests/test_validation.py | 40 +++++++++++++++++++++++++- sklearn/utils/validation.py | 19 ++++++++++++ 3 files changed, 69 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py index c123d22ff01f4..aa746f7c5160c 100644 --- a/sklearn/cluster/dbscan_.py +++ b/sklearn/cluster/dbscan_.py @@ -16,6 +16,7 @@ from ..base import BaseEstimator, ClusterMixin from ..utils import check_array, check_consistent_length from ..neighbors import NearestNeighbors +from ..utils.validation import _validate_bad_defaults from ._dbscan_inner import dbscan_inner @@ -41,6 +42,9 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, important DBSCAN parameter to choose appropriately for your data set and distance function. + Note that there is no good default value for this parameter. An + optimal value depends on the data at hand as well as the used metric. + min_samples : int, optional The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. @@ -208,6 +212,9 @@ class DBSCAN(ClusterMixin, BaseEstimator): important DBSCAN parameter to choose appropriately for your data set and distance function. + Note that there is no good default value for this parameter. An + optimal value depends on the data at hand as well as the used metric. + min_samples : int, optional The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. @@ -315,7 +322,9 @@ class DBSCAN(ClusterMixin, BaseEstimator): ACM Transactions on Database Systems (TODS), 42(3), 19. """ - def __init__(self, eps=0.5, min_samples=5, metric='euclidean', + _bad_defaults = {'eps': 0.5} + + def __init__(self, eps='warn', min_samples=5, metric='euclidean', metric_params=None, algorithm='auto', leaf_size=30, p=None, n_jobs=None): self.eps = eps @@ -353,6 +362,7 @@ def fit(self, X, y=None, sample_weight=None): """ X = check_array(X, accept_sparse='csr') + _validate_bad_defaults(self) clust = dbscan(X, sample_weight=sample_weight, **self.get_params()) self.core_sample_indices_, self.labels_ = clust diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 0f7ffe9a3e4f0..d36237fdf3e79 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -11,6 +11,7 @@ import numpy as np import scipy.sparse as sp +from sklearn.base import BaseEstimator from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import assert_no_warnings @@ -42,7 +43,8 @@ _num_samples, check_scalar, _check_sample_weight, - _allclose_dense_sparse) + _allclose_dense_sparse, + _validate_bad_defaults) import sklearn from sklearn.exceptions import NotFittedError @@ -938,3 +940,39 @@ def test_allclose_dense_sparse_raise(toarray): "and an array") with pytest.raises(ValueError, match=msg): _allclose_dense_sparse(x, y) + + +def test_validate_bad_params(): + msg1 = ("There is no good default value for the following parameters in " + "A. Please consult the documentation on how to set them for your " + "data." + " 'param_a' - using default value: 1" + " 'param_b' - using default value: kmeans") + msg2 = ("There is no good default value for the following parameters in " + "A. Please consult the documentation on how to set them for your " + "data." + " 'param_b' - using default value: kmeans") + + class A(BaseEstimator): + _bad_defaults = {'param_a': 1, 'param_b': 'kmeans'} + + def __init__(self, param_a='warn', param_b='warn', param_c='warn', + param_d=0): + self.param_a = param_a + self.param_b = param_b + self.param_c = param_c + self.param_d = param_d + + def fit(self, X=None, y=None): + _validate_bad_defaults(self) + return self + + with pytest.warns(UserWarning, match=msg1): + A().fit() + + with pytest.warns(UserWarning, match=msg2): + A(param_a=1).fit() + + with warnings.catch_warnings(record=True) as warns: + A(param_a=1, param_b='dbscan').fit() + assert not warns diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 465acf48e8293..66ec49376546b 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1089,3 +1089,22 @@ def _allclose_dense_sparse(x, y, rtol=1e-7, atol=1e-9): return np.allclose(x, y, rtol=rtol, atol=atol) raise ValueError("Can only compare two sparse matrices, not a sparse " "matrix and an array") + + +def _validate_bad_defaults(obj): + if not hasattr(obj, "_bad_defaults"): + return + + obj_values = {param: getattr(obj, param) for param in obj._bad_defaults} + bad_params = [param for param, value in obj_values.items() + if value == 'warn'] + if bad_params: + msg = ("There is no good default value for the following " + "parameters in {}. Please consult the documentation " + "on how to set them for your data.\n\t".format( + obj.__class__.__name__)) + msg += '\n\t'.join(["'{}' - using default value: {}".format( + param, obj._bad_defaults[param]) for param in bad_params]) + warnings.warn(msg, UserWarning) + for param in bad_params: + setattr(obj, param, obj._bad_defaults[param]) From b1ff3afe28d2a799d5de2865edcb7398ab9c6991 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 10 Sep 2019 14:42:44 +0200 Subject: [PATCH 2/6] fix warning match, fix rm drivers/mmc/host/rtsx_pci_sdmmc.ko.xz --- sklearn/utils/tests/test_validation.py | 6 +++--- sklearn/utils/validation.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index d36237fdf3e79..799ede985fca1 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -946,12 +946,12 @@ def test_validate_bad_params(): msg1 = ("There is no good default value for the following parameters in " "A. Please consult the documentation on how to set them for your " "data." - " 'param_a' - using default value: 1" - " 'param_b' - using default value: kmeans") + "\n 'param_a' - using default value: 1" + "\n 'param_b' - using default value: 'kmeans'") msg2 = ("There is no good default value for the following parameters in " "A. Please consult the documentation on how to set them for your " "data." - " 'param_b' - using default value: kmeans") + "\n 'param_b' - using default value: 'kmeans'") class A(BaseEstimator): _bad_defaults = {'param_a': 1, 'param_b': 'kmeans'} diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 66ec49376546b..874048fda0317 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1103,7 +1103,7 @@ def _validate_bad_defaults(obj): "parameters in {}. Please consult the documentation " "on how to set them for your data.\n\t".format( obj.__class__.__name__)) - msg += '\n\t'.join(["'{}' - using default value: {}".format( + msg += '\n\t'.join(["'{}' - using default value: {!r}".format( param, obj._bad_defaults[param]) for param in bad_params]) warnings.warn(msg, UserWarning) for param in bad_params: From 4b2c335903dcf2344673959c183f977cfd14e25e Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 10 Sep 2019 19:42:19 +0200 Subject: [PATCH 3/6] don't change object's params --- sklearn/cluster/dbscan_.py | 8 +++++--- sklearn/utils/validation.py | 4 +++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py index aa746f7c5160c..75a2a3829eaa8 100644 --- a/sklearn/cluster/dbscan_.py +++ b/sklearn/cluster/dbscan_.py @@ -35,7 +35,7 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, A feature array, or array of distances between samples if ``metric='precomputed'``. - eps : float, optional + eps : float, default=0.5 The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most @@ -214,6 +214,8 @@ class DBSCAN(ClusterMixin, BaseEstimator): Note that there is no good default value for this parameter. An optimal value depends on the data at hand as well as the used metric. + If not specified, a warning is raised and the default value of 0.5 is + used. min_samples : int, optional The number of samples (or total weight) in a neighborhood for a point @@ -362,9 +364,9 @@ def fit(self, X, y=None, sample_weight=None): """ X = check_array(X, accept_sparse='csr') - _validate_bad_defaults(self) + all_params = _validate_bad_defaults(self) clust = dbscan(X, sample_weight=sample_weight, - **self.get_params()) + **all_params) self.core_sample_indices_, self.labels_ = clust if len(self.core_sample_indices_): # fix for scipy sparse indexing issue diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 874048fda0317..f39f2eab6fd91 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1106,5 +1106,7 @@ def _validate_bad_defaults(obj): msg += '\n\t'.join(["'{}' - using default value: {!r}".format( param, obj._bad_defaults[param]) for param in bad_params]) warnings.warn(msg, UserWarning) + all_params = obj.get_params() for param in bad_params: - setattr(obj, param, obj._bad_defaults[param]) + all_params[param] = obj._bad_defaults[param] + return all_params From 331cc837be6317b9f35824642a39e1549d06da65 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 10 Sep 2019 19:46:10 +0200 Subject: [PATCH 4/6] address the other comments --- sklearn/utils/tests/test_validation.py | 8 +++++--- sklearn/utils/validation.py | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 799ede985fca1..69ba74d88342e 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -946,14 +946,16 @@ def test_validate_bad_params(): msg1 = ("There is no good default value for the following parameters in " "A. Please consult the documentation on how to set them for your " "data." - "\n 'param_a' - using default value: 1" - "\n 'param_b' - using default value: 'kmeans'") + "\n 'param_a' - using default value: 1" + "\n 'param_b' - using default value: 'kmeans'") msg2 = ("There is no good default value for the following parameters in " "A. Please consult the documentation on how to set them for your " "data." - "\n 'param_b' - using default value: 'kmeans'") + "\n 'param_b' - using default value: 'kmeans'") class A(BaseEstimator): + # The param_c should not warn as a result of _validate_bad_defaults + # since it's not included in _bad_defaults _bad_defaults = {'param_a': 1, 'param_b': 'kmeans'} def __init__(self, param_a='warn', param_b='warn', param_c='warn', diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index f39f2eab6fd91..5c376f3ddb291 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1101,9 +1101,9 @@ def _validate_bad_defaults(obj): if bad_params: msg = ("There is no good default value for the following " "parameters in {}. Please consult the documentation " - "on how to set them for your data.\n\t".format( + "on how to set them for your data.\n ".format( obj.__class__.__name__)) - msg += '\n\t'.join(["'{}' - using default value: {!r}".format( + msg += '\n '.join(["'{}' - using default value: {!r}".format( param, obj._bad_defaults[param]) for param in bad_params]) warnings.warn(msg, UserWarning) all_params = obj.get_params() From 67cc364baf3be50f4c8564643b5e7ffb0d5e348e Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 10 Sep 2019 20:38:46 +0200 Subject: [PATCH 5/6] sort the params --- sklearn/utils/validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 5c376f3ddb291..63b76c45cd78b 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1096,8 +1096,8 @@ def _validate_bad_defaults(obj): return obj_values = {param: getattr(obj, param) for param in obj._bad_defaults} - bad_params = [param for param, value in obj_values.items() - if value == 'warn'] + bad_params = sorted([param for param, value in obj_values.items() + if value == 'warn']) if bad_params: msg = ("There is no good default value for the following " "parameters in {}. Please consult the documentation " From 335dd3862fc8511398984b13776ac91bc240701d Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 28 May 2020 11:28:56 +0200 Subject: [PATCH 6/6] trying to warn once --- sklearn/cluster/_dbscan.py | 2 +- sklearn/utils/tests/test_validation.py | 21 ++++++++++++++++++--- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index 98afb8bc21d5e..108e54c7ffeb0 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -341,7 +341,7 @@ def fit(self, X, y=None, sample_weight=None): X.setdiag(X.diagonal()) # XXX: modifies X's internals in-place neighbors_model = NearestNeighbors( - radius=self.eps, algorithm=self.algorithm, + radius=eps, algorithm=self.algorithm, leaf_size=self.leaf_size, metric=self.metric, metric_params=self.metric_params, p=self.p, n_jobs=self.n_jobs) neighbors_model.fit(X) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index d5af535f9857a..a474f54d06a9c 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -51,6 +51,7 @@ import sklearn from sklearn.exceptions import NotFittedError, PositiveSpectrumWarning +from sklearn.exceptions import BadDefaultWarning from sklearn.utils._testing import TempMemmap @@ -1111,6 +1112,7 @@ def test_allclose_dense_sparse_raise(toarray): def test_validate_bad_params(): msg1 = ("There is no good default value for the following parameters in " + "A. Please consult the documentation on how to set them for your " "data." "\n 'param_a' - using default value: 1" "\n 'param_b' - using default value: 'kmeans'") @@ -1135,16 +1137,30 @@ def fit(self, X=None, y=None): _validate_bad_defaults(self) return self - with pytest.warns(UserWarning, match=msg1): + with pytest.warns(BadDefaultWarning, match=msg1): + A().fit() + + # should not warn the second time + with warnings.catch_warnings(record=True) as warns: A().fit() + assert not warns + + with pytest.warns(BadDefaultWarning, match=msg2): + A(param_a=1).fit() - with pytest.warns(UserWarning, match=msg2): + # should not warn the second time + with warnings.catch_warnings(record=True) as warns: A(param_a=1).fit() + assert not warns with warnings.catch_warnings(record=True) as warns: A(param_a=1, param_b='dbscan').fit() assert not warns + +def test_deprecate_positional_args_warns_for_function(): + + @_deprecate_positional_args def f1(a, b, *, c=1, d=1): pass @@ -1248,4 +1264,3 @@ def test_check_sparse_pandas_sp_format(sp_format): assert sp.issparse(result) assert result.format == sp_format assert_allclose_dense_sparse(sp_mat, result) ->>>>>>> upstream/master