From 85dc97ebfd8453f59343b97e0e475f404bbfeab0 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 18 Sep 2019 14:30:20 -0400 Subject: [PATCH 01/13] add test that zero sample weight means samples are ignored --- sklearn/utils/estimator_checks.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 438892db23865..b303bd0d77e46 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -726,8 +726,10 @@ def check_sample_weights_invariance(name, estimator_orig): estimator1 = clone(estimator_orig) estimator2 = clone(estimator_orig) + estimator3 = clone(estimator_orig) set_random_state(estimator1, random_state=0) set_random_state(estimator2, random_state=0) + set_random_state(estimator3, random_state=0) X = np.array([[1, 3], [1, 3], [1, 3], [1, 3], [2, 1], [2, 1], [2, 1], [2, 1], @@ -735,22 +737,39 @@ def check_sample_weights_invariance(name, estimator_orig): [4, 1], [4, 1], [4, 1], [4, 1]], dtype=np.dtype('float')) y = np.array([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2], dtype=np.dtype('int')) + + X2 = np.vstack([X, X]) + y2 = np.hstack([y, 3 - y]) y = _enforce_estimator_tags_y(estimator1, y) + y2 = _enforce_estimator_tags_y(estimator3, y2) + weights = np.ones(shape=len(y) * 2) + weights[len(y):] = 0 + X2, y2, weights = shuffle(X2, y2, weights, random_state=0) estimator1.fit(X, y=y, sample_weight=np.ones(shape=len(y))) estimator2.fit(X, y=y, sample_weight=None) + estimator3.fit(X2, y=y2, sample_weight=weights) for method in ["predict", "transform"]: if hasattr(estimator_orig, method): X_pred1 = getattr(estimator1, method)(X) X_pred2 = getattr(estimator2, method)(X) + X_pred3 = getattr(estimator3, method)(X) if sparse.issparse(X_pred1): X_pred1 = X_pred1.toarray() X_pred2 = X_pred2.toarray() + X_pred3 = X_pred3.toarray() assert_allclose(X_pred1, X_pred2, err_msg="For %s sample_weight=None is not" " equivalent to sample_weight=ones" % name) + assert_allclose( + X_pred1, X_pred3, + err_msg="For %s sample_weight is not" + " equivalent to removing samples" + % name) + + @ignore_warnings(category=(DeprecationWarning, FutureWarning, UserWarning)) From 95a2d5b75082c405d1afe1ffc2335837d1d66664 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 18 Sep 2019 15:23:03 -0400 Subject: [PATCH 02/13] add predict_proba and decision_function to tests --- sklearn/utils/estimator_checks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index b303bd0d77e46..d8a61e8431576 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -750,7 +750,8 @@ def check_sample_weights_invariance(name, estimator_orig): estimator2.fit(X, y=y, sample_weight=None) estimator3.fit(X2, y=y2, sample_weight=weights) - for method in ["predict", "transform"]: + for method in ["predict", "predict_proba", + "decision_function", "transform"]: if hasattr(estimator_orig, method): X_pred1 = getattr(estimator1, method)(X) X_pred2 = getattr(estimator2, method)(X) From 24acb536cee4ab05da3308248dca3d41d20225f9 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 20 Feb 2020 23:15:12 +0100 Subject: [PATCH 03/13] Mark failing estimators as xfail --- sklearn/calibration.py | 8 +++ sklearn/cluster/_kmeans.py | 16 ++++++ sklearn/ensemble/_iforest.py | 8 +++ sklearn/linear_model/_logistic.py | 8 +++ sklearn/linear_model/_ransac.py | 8 +++ sklearn/linear_model/_stochastic_gradient.py | 16 ++++++ sklearn/neighbors/_kde.py | 8 +++ sklearn/svm/_classes.py | 52 +++++++++++++++++++- 8 files changed, 123 insertions(+), 1 deletion(-) diff --git a/sklearn/calibration.py b/sklearn/calibration.py index ff9c4b3e75c44..0cecd41c70b8b 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -233,6 +233,14 @@ class that has the highest probability, and can thus be different check_is_fitted(self) return self.classes_[np.argmax(self.predict_proba(X), axis=1)] + def _more_tags(self): + return { + '_xfail_test': { + 'check_sample_weights_invariance': + 'zero sample_weight is not equivalent to removing samples', + } + } + class _CalibratedClassifier: """Probability calibration with isotonic regression or sigmoid. diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 7e4df5908137b..735e24e012559 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1203,6 +1203,14 @@ def score(self, X, y=None, sample_weight=None): return -_labels_inertia(X, sample_weight, x_squared_norms, self.cluster_centers_)[1] + def _more_tags(self): + return { + '_xfail_test': { + 'check_sample_weights_invariance': + 'zero sample_weight is not equivalent to removing samples', + } + } + def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, old_center_buffer, compute_squared_diff, @@ -1857,3 +1865,11 @@ def predict(self, X, sample_weight=None): X = self._check_test_data(X) return self._labels_inertia_minibatch(X, sample_weight)[0] + + def _more_tags(self): + return { + '_xfail_test': { + 'check_sample_weights_invariance': + 'zero sample_weight is not equivalent to removing samples', + } + } diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 6aa4dac35a156..1f7b0e1e56a8c 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -469,6 +469,14 @@ def _compute_score_samples(self, X, subsample_features): ) return scores + def _more_tags(self): + return { + '_xfail_test': { + 'check_sample_weights_invariance': + 'zero sample_weight is not equivalent to removing samples', + } + } + def _average_path_length(n_samples_leaf): """ diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 4b541884eece8..6068f253e62de 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -2083,3 +2083,11 @@ def score(self, X, y, sample_weight=None): scoring = get_scorer(scoring) return scoring(self, X, y, sample_weight=sample_weight) + + def _more_tags(self): + return { + '_xfail_test': { + 'check_sample_weights_invariance': + 'zero sample_weight is not equivalent to removing samples', + } + } diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 0363032359524..bf1b87670efd3 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -493,3 +493,11 @@ def score(self, X, y): check_is_fitted(self) return self.estimator_.score(X, y) + + def _more_tags(self): + return { + '_xfail_test': { + 'check_sample_weights_invariance': + 'zero sample_weight is not equivalent to removing samples', + } + } diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 7fc2126a131b1..9d8dc9e027766 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -1090,6 +1090,14 @@ def predict_log_proba(self): def _predict_log_proba(self, X): return np.log(self.predict_proba(X)) + def _more_tags(self): + return { + '_xfail_test': { + 'check_sample_weights_invariance': + 'zero sample_weight is not equivalent to removing samples', + } + } + class BaseSGDRegressor(RegressorMixin, BaseSGD): @@ -1569,3 +1577,11 @@ def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, warm_start=warm_start, average=average) + + def _more_tags(self): + return { + '_xfail_test': { + 'check_sample_weights_invariance': + 'zero sample_weight is not equivalent to removing samples', + } + } diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index 5b44f6f6b2b75..8d8c841933e3b 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -268,3 +268,11 @@ def sample(self, n_samples=1, random_state=None): correction = (gammainc(0.5 * dim, 0.5 * s_sq) ** (1. / dim) * self.bandwidth / np.sqrt(s_sq)) return data[i] + X * correction[:, np.newaxis] + + def _more_tags(self): + return { + '_xfail_test': { + 'check_sample_weights_invariance': + 'sample_weight must have positive values', + } + } diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 1d96900555400..63d8ad60fe2ea 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -237,6 +237,14 @@ def fit(self, X, y, sample_weight=None): return self + def _more_tags(self): + return { + '_xfail_test': { + 'check_sample_weights_invariance': + 'zero sample_weight is not equivalent to removing samples', + } + } + class LinearSVR(RegressorMixin, LinearModel): """Linear Support Vector Regression. @@ -408,6 +416,14 @@ def fit(self, X, y, sample_weight=None): return self + def _more_tags(self): + return { + '_xfail_test': { + 'check_sample_weights_invariance': + 'zero sample_weight is not equivalent to removing samples', + } + } + class SVC(BaseSVC): """C-Support Vector Classification. @@ -625,6 +641,14 @@ def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='scale', break_ties=break_ties, random_state=random_state) + def _more_tags(self): + return { + '_xfail_test': { + 'check_sample_weights_invariance': + 'zero sample_weight is not equivalent to removing samples', + } + } + class NuSVC(BaseSVC): """Nu-Support Vector Classification. @@ -833,7 +857,9 @@ def _more_tags(self): '_xfail_test': { 'check_methods_subset_invariance': 'fails for the decision_function method', - 'check_class_weight_classifiers': 'class_weight is ignored.' + 'check_class_weight_classifiers': 'class_weight is ignored.', + 'check_sample_weights_invariance': + 'zero sample_weight is not equivalent to removing samples', } } @@ -986,6 +1012,14 @@ def probA_(self): def probB_(self): return self._probB + def _more_tags(self): + return { + '_xfail_test': { + 'check_sample_weights_invariance': + 'zero sample_weight is not equivalent to removing samples', + } + } + class NuSVR(RegressorMixin, BaseLibSVM): """Nu Support Vector Regression. @@ -1111,6 +1145,14 @@ def __init__(self, nu=0.5, C=1.0, kernel='rbf', degree=3, probability=False, cache_size=cache_size, class_weight=None, verbose=verbose, max_iter=max_iter, random_state=None) + def _more_tags(self): + return { + '_xfail_test': { + 'check_sample_weights_invariance': + 'zero sample_weight is not equivalent to removing samples', + } + } + class OneClassSVM(OutlierMixin, BaseLibSVM): """Unsupervised Outlier Detection. @@ -1319,3 +1361,11 @@ def probA_(self): @property def probB_(self): return self._probB + + def _more_tags(self): + return { + '_xfail_test': { + 'check_sample_weights_invariance': + 'zero sample_weight is not equivalent to removing samples', + } + } From 662a91fb895c1508f506153068809b22ec07f8a2 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 20 Feb 2020 23:25:24 +0100 Subject: [PATCH 04/13] Split sample wight invariance tests in two --- sklearn/calibration.py | 2 +- sklearn/cluster/_kmeans.py | 4 +- sklearn/ensemble/_iforest.py | 2 +- sklearn/linear_model/_logistic.py | 2 +- sklearn/linear_model/_ransac.py | 2 +- sklearn/linear_model/_stochastic_gradient.py | 4 +- sklearn/neighbors/_kde.py | 2 +- sklearn/svm/_classes.py | 14 ++--- sklearn/utils/estimator_checks.py | 64 +++++++++++++++----- 9 files changed, 65 insertions(+), 31 deletions(-) diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 0cecd41c70b8b..480ad754d3c85 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -236,7 +236,7 @@ class that has the highest probability, and can thus be different def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance': + 'check_sample_weights_invariance_zeros': 'zero sample_weight is not equivalent to removing samples', } } diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 735e24e012559..9d91d6c05e175 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1206,7 +1206,7 @@ def score(self, X, y=None, sample_weight=None): def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance': + 'check_sample_weights_invariance_zeros': 'zero sample_weight is not equivalent to removing samples', } } @@ -1869,7 +1869,7 @@ def predict(self, X, sample_weight=None): def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance': + 'check_sample_weights_invariance_zeros': 'zero sample_weight is not equivalent to removing samples', } } diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 1f7b0e1e56a8c..8ae6c5630ff57 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -472,7 +472,7 @@ def _compute_score_samples(self, X, subsample_features): def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance': + 'check_sample_weights_invariance_zeros': 'zero sample_weight is not equivalent to removing samples', } } diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 6068f253e62de..8188acfd71367 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -2087,7 +2087,7 @@ def score(self, X, y, sample_weight=None): def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance': + 'check_sample_weights_invariance_zeros': 'zero sample_weight is not equivalent to removing samples', } } diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index bf1b87670efd3..11c8d72ea7748 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -497,7 +497,7 @@ def score(self, X, y): def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance': + 'check_sample_weights_invariance_zeros': 'zero sample_weight is not equivalent to removing samples', } } diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 9d8dc9e027766..a6d8863c3df3a 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -1093,7 +1093,7 @@ def _predict_log_proba(self, X): def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance': + 'check_sample_weights_invariance_zeros': 'zero sample_weight is not equivalent to removing samples', } } @@ -1581,7 +1581,7 @@ def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001, def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance': + 'check_sample_weights_invariance_zeros': 'zero sample_weight is not equivalent to removing samples', } } diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index 8d8c841933e3b..957fb58a568c6 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -272,7 +272,7 @@ def sample(self, n_samples=1, random_state=None): def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance': + 'check_sample_weights_invariance_zeros': 'sample_weight must have positive values', } } diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 63d8ad60fe2ea..412b95e47281b 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -240,7 +240,7 @@ def fit(self, X, y, sample_weight=None): def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance': + 'check_sample_weights_invariance_zeros': 'zero sample_weight is not equivalent to removing samples', } } @@ -419,7 +419,7 @@ def fit(self, X, y, sample_weight=None): def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance': + 'check_sample_weights_invariance_zeros': 'zero sample_weight is not equivalent to removing samples', } } @@ -644,7 +644,7 @@ def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='scale', def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance': + 'check_sample_weights_invariance_zeros': 'zero sample_weight is not equivalent to removing samples', } } @@ -858,7 +858,7 @@ def _more_tags(self): 'check_methods_subset_invariance': 'fails for the decision_function method', 'check_class_weight_classifiers': 'class_weight is ignored.', - 'check_sample_weights_invariance': + 'check_sample_weights_invariance_zeros': 'zero sample_weight is not equivalent to removing samples', } } @@ -1015,7 +1015,7 @@ def probB_(self): def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance': + 'check_sample_weights_invariance_zeros': 'zero sample_weight is not equivalent to removing samples', } } @@ -1148,7 +1148,7 @@ def __init__(self, nu=0.5, C=1.0, kernel='rbf', degree=3, def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance': + 'check_sample_weights_invariance_zeros': 'zero sample_weight is not equivalent to removing samples', } } @@ -1365,7 +1365,7 @@ def probB_(self): def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance': + 'check_sample_weights_invariance_zeros': 'zero sample_weight is not equivalent to removing samples', } } diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index df170815ea87a..71c5764442df1 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -80,7 +80,8 @@ def _yield_checks(name, estimator): yield check_sample_weights_pandas_series yield check_sample_weights_not_an_array yield check_sample_weights_list - yield check_sample_weights_invariance + yield check_sample_weights_invariance_ones + yield check_sample_weights_invariance_zeros yield check_estimators_fit_returns_self yield partial(check_estimators_fit_returns_self, readonly_memmap=True) @@ -765,7 +766,7 @@ def check_sample_weights_list(name, estimator_orig): @ignore_warnings(category=FutureWarning) -def check_sample_weights_invariance(name, estimator_orig): +def check_sample_weights_invariance_ones(name, estimator_orig): # check that the estimators yield same results for # unit weights and no weights if (has_fit_parameter(estimator_orig, "sample_weight") and @@ -775,53 +776,86 @@ def check_sample_weights_invariance(name, estimator_orig): estimator1 = clone(estimator_orig) estimator2 = clone(estimator_orig) - estimator3 = clone(estimator_orig) set_random_state(estimator1, random_state=0) set_random_state(estimator2, random_state=0) - set_random_state(estimator3, random_state=0) X = np.array([[1, 3], [1, 3], [1, 3], [1, 3], [2, 1], [2, 1], [2, 1], [2, 1], [3, 3], [3, 3], [3, 3], [3, 3], - [4, 1], [4, 1], [4, 1], [4, 1]], dtype=np.dtype('float')) + [4, 1], [4, 1], [4, 1], [4, 1]], dtype=np.float64) y = np.array([1, 1, 1, 1, 2, 2, 2, 2, - 1, 1, 1, 1, 2, 2, 2, 2], dtype=np.dtype('int')) + 1, 1, 1, 1, 2, 2, 2, 2], dtype=np.int) X2 = np.vstack([X, X]) y2 = np.hstack([y, 3 - y]) y = _enforce_estimator_tags_y(estimator1, y) - y2 = _enforce_estimator_tags_y(estimator3, y2) - weights = np.ones(shape=len(y) * 2) - weights[len(y):] = 0 - X2, y2, weights = shuffle(X2, y2, weights, random_state=0) estimator1.fit(X, y=y, sample_weight=np.ones(shape=len(y))) estimator2.fit(X, y=y, sample_weight=None) - estimator3.fit(X2, y=y2, sample_weight=weights) for method in ["predict", "predict_proba", "decision_function", "transform"]: if hasattr(estimator_orig, method): X_pred1 = getattr(estimator1, method)(X) X_pred2 = getattr(estimator2, method)(X) - X_pred3 = getattr(estimator3, method)(X) if sparse.issparse(X_pred1): X_pred1 = X_pred1.toarray() X_pred2 = X_pred2.toarray() - X_pred3 = X_pred3.toarray() assert_allclose(X_pred1, X_pred2, err_msg="For %s sample_weight=None is not" " equivalent to sample_weight=ones" % name) + +@ignore_warnings(category=FutureWarning) +def check_sample_weights_invariance_zeros(name, estimator_orig): + # check that the estimators yield same results for + # unit weights and no weights + if (has_fit_parameter(estimator_orig, "sample_weight") and + not (hasattr(estimator_orig, "_pairwise") + and estimator_orig._pairwise)): + # We skip pairwise because the data is not pairwise + + estimator1 = clone(estimator_orig) + estimator2 = clone(estimator_orig) + set_random_state(estimator1, random_state=0) + set_random_state(estimator2, random_state=0) + + X = np.array([[1, 3], [1, 3], [1, 3], [1, 3], + [2, 1], [2, 1], [2, 1], [2, 1], + [3, 3], [3, 3], [3, 3], [3, 3], + [4, 1], [4, 1], [4, 1], [4, 1]], dtype=np.dtype('float')) + y = np.array([1, 1, 1, 1, 2, 2, 2, 2, + 1, 1, 1, 1, 2, 2, 2, 2], dtype=np.dtype('int')) + + X2 = np.vstack([X, X]) + y2 = np.hstack([y, 3 - y]) + y = _enforce_estimator_tags_y(estimator1, y) + y2 = _enforce_estimator_tags_y(estimator2, y2) + weights = np.ones(shape=len(y) * 2) + weights[len(y):] = 0 + X2, y2, weights = shuffle(X2, y2, weights, random_state=0) + + estimator1.fit(X, y=y, sample_weight=np.ones(shape=len(y))) + estimator2.fit(X2, y=y2, sample_weight=weights) + + for method in ["predict", "predict_proba", + "decision_function", "transform"]: + if hasattr(estimator_orig, method): + X_pred1 = getattr(estimator1, method)(X) + X_pred2 = getattr(estimator2, method)(X) + + if sparse.issparse(X_pred1): + X_pred1 = X_pred1.toarray() + X_pred2 = X_pred2.toarray() + assert_allclose( - X_pred1, X_pred3, + X_pred1, X_pred2, err_msg="For %s sample_weight is not" " equivalent to removing samples" % name) - @ignore_warnings(category=(FutureWarning, UserWarning)) def check_dtype_object(name, estimator_orig): # check that estimators treat dtype object as numeric if possible From 54db9078cf42d83b3574fe27f8f1828f8dfa4128 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 20 Feb 2020 23:57:02 +0100 Subject: [PATCH 05/13] Parametrize check_sample_weights_invariance check with kind parameter --- sklearn/calibration.py | 2 +- sklearn/cluster/_kmeans.py | 4 +- sklearn/ensemble/_iforest.py | 2 +- sklearn/linear_model/_logistic.py | 2 +- sklearn/linear_model/_ransac.py | 2 +- sklearn/linear_model/_stochastic_gradient.py | 4 +- sklearn/neighbors/_kde.py | 2 +- sklearn/svm/_classes.py | 14 ++-- sklearn/utils/estimator_checks.py | 79 ++++++-------------- 9 files changed, 37 insertions(+), 74 deletions(-) diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 480ad754d3c85..67aedc49cc65c 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -236,7 +236,7 @@ class that has the highest probability, and can thus be different def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance_zeros': + 'check_sample_weights_invariance(kind=zeros)': 'zero sample_weight is not equivalent to removing samples', } } diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 9d91d6c05e175..c7a5cf38ad337 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1206,7 +1206,7 @@ def score(self, X, y=None, sample_weight=None): def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance_zeros': + 'check_sample_weights_invariance(kind=zeros)': 'zero sample_weight is not equivalent to removing samples', } } @@ -1869,7 +1869,7 @@ def predict(self, X, sample_weight=None): def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance_zeros': + 'check_sample_weights_invariance(kind=zeros)': 'zero sample_weight is not equivalent to removing samples', } } diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 8ae6c5630ff57..1d2580c6f03cc 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -472,7 +472,7 @@ def _compute_score_samples(self, X, subsample_features): def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance_zeros': + 'check_sample_weights_invariance(kind=zeros)': 'zero sample_weight is not equivalent to removing samples', } } diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 8188acfd71367..624c2109f7431 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -2087,7 +2087,7 @@ def score(self, X, y, sample_weight=None): def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance_zeros': + 'check_sample_weights_invariance(kind=zeros)': 'zero sample_weight is not equivalent to removing samples', } } diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 11c8d72ea7748..3b01e4f9a741f 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -497,7 +497,7 @@ def score(self, X, y): def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance_zeros': + 'check_sample_weights_invariance(kind=zeros)': 'zero sample_weight is not equivalent to removing samples', } } diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index a6d8863c3df3a..0d49015824112 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -1093,7 +1093,7 @@ def _predict_log_proba(self, X): def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance_zeros': + 'check_sample_weights_invariance(kind=zeros)': 'zero sample_weight is not equivalent to removing samples', } } @@ -1581,7 +1581,7 @@ def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001, def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance_zeros': + 'check_sample_weights_invariance(kind=zeros)': 'zero sample_weight is not equivalent to removing samples', } } diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index 957fb58a568c6..2255c18ac5355 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -272,7 +272,7 @@ def sample(self, n_samples=1, random_state=None): def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance_zeros': + 'check_sample_weights_invariance(kind=zeros)': 'sample_weight must have positive values', } } diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 412b95e47281b..c8f313be64c44 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -240,7 +240,7 @@ def fit(self, X, y, sample_weight=None): def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance_zeros': + 'check_sample_weights_invariance(kind=zeros)': 'zero sample_weight is not equivalent to removing samples', } } @@ -419,7 +419,7 @@ def fit(self, X, y, sample_weight=None): def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance_zeros': + 'check_sample_weights_invariance(kind=zeros)': 'zero sample_weight is not equivalent to removing samples', } } @@ -644,7 +644,7 @@ def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='scale', def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance_zeros': + 'check_sample_weights_invariance(kind=zeros)': 'zero sample_weight is not equivalent to removing samples', } } @@ -858,7 +858,7 @@ def _more_tags(self): 'check_methods_subset_invariance': 'fails for the decision_function method', 'check_class_weight_classifiers': 'class_weight is ignored.', - 'check_sample_weights_invariance_zeros': + 'check_sample_weights_invariance(kind=zeros)': 'zero sample_weight is not equivalent to removing samples', } } @@ -1015,7 +1015,7 @@ def probB_(self): def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance_zeros': + 'check_sample_weights_invariance(kind=zeros)': 'zero sample_weight is not equivalent to removing samples', } } @@ -1148,7 +1148,7 @@ def __init__(self, nu=0.5, C=1.0, kernel='rbf', degree=3, def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance_zeros': + 'check_sample_weights_invariance(kind=zeros)': 'zero sample_weight is not equivalent to removing samples', } } @@ -1365,7 +1365,7 @@ def probB_(self): def _more_tags(self): return { '_xfail_test': { - 'check_sample_weights_invariance_zeros': + 'check_sample_weights_invariance(kind=zeros)': 'zero sample_weight is not equivalent to removing samples', } } diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 71c5764442df1..130e5a1b003cf 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -80,8 +80,8 @@ def _yield_checks(name, estimator): yield check_sample_weights_pandas_series yield check_sample_weights_not_an_array yield check_sample_weights_list - yield check_sample_weights_invariance_ones - yield check_sample_weights_invariance_zeros + yield partial(check_sample_weights_invariance, kind='ones') + yield partial(check_sample_weights_invariance, kind='zeros') yield check_estimators_fit_returns_self yield partial(check_estimators_fit_returns_self, readonly_memmap=True) @@ -766,7 +766,7 @@ def check_sample_weights_list(name, estimator_orig): @ignore_warnings(category=FutureWarning) -def check_sample_weights_invariance_ones(name, estimator_orig): +def check_sample_weights_invariance(name, estimator_orig, kind="ones"): # check that the estimators yield same results for # unit weights and no weights if (has_fit_parameter(estimator_orig, "sample_weight") and @@ -786,74 +786,37 @@ def check_sample_weights_invariance_ones(name, estimator_orig): y = np.array([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2], dtype=np.int) - X2 = np.vstack([X, X]) - y2 = np.hstack([y, 3 - y]) - y = _enforce_estimator_tags_y(estimator1, y) - - estimator1.fit(X, y=y, sample_weight=np.ones(shape=len(y))) - estimator2.fit(X, y=y, sample_weight=None) - - for method in ["predict", "predict_proba", - "decision_function", "transform"]: - if hasattr(estimator_orig, method): - X_pred1 = getattr(estimator1, method)(X) - X_pred2 = getattr(estimator2, method)(X) - if sparse.issparse(X_pred1): - X_pred1 = X_pred1.toarray() - X_pred2 = X_pred2.toarray() - assert_allclose(X_pred1, X_pred2, - err_msg="For %s sample_weight=None is not" - " equivalent to sample_weight=ones" - % name) - -@ignore_warnings(category=FutureWarning) -def check_sample_weights_invariance_zeros(name, estimator_orig): - # check that the estimators yield same results for - # unit weights and no weights - if (has_fit_parameter(estimator_orig, "sample_weight") and - not (hasattr(estimator_orig, "_pairwise") - and estimator_orig._pairwise)): - # We skip pairwise because the data is not pairwise - - estimator1 = clone(estimator_orig) - estimator2 = clone(estimator_orig) - set_random_state(estimator1, random_state=0) - set_random_state(estimator2, random_state=0) - - X = np.array([[1, 3], [1, 3], [1, 3], [1, 3], - [2, 1], [2, 1], [2, 1], [2, 1], - [3, 3], [3, 3], [3, 3], [3, 3], - [4, 1], [4, 1], [4, 1], [4, 1]], dtype=np.dtype('float')) - y = np.array([1, 1, 1, 1, 2, 2, 2, 2, - 1, 1, 1, 1, 2, 2, 2, 2], dtype=np.dtype('int')) + if kind == 'ones': + X2 = X + y2 = y + sw2 = None + err_msg = (f"For {name} sample_weight=None is not equivalent to " + f"sample_weight=ones") + elif kind == 'zeros': + X2 = np.vstack([X, X]) + y2 = np.hstack([y, 3 - y]) + sw2 = np.ones(shape=len(y) * 2) + sw2[len(y):] = 0 + err_msg=(f"For {name} sample_weight is not equivalent " + f"to removing samples") + else: + raise ValueError - X2 = np.vstack([X, X]) - y2 = np.hstack([y, 3 - y]) y = _enforce_estimator_tags_y(estimator1, y) - y2 = _enforce_estimator_tags_y(estimator2, y2) - weights = np.ones(shape=len(y) * 2) - weights[len(y):] = 0 - X2, y2, weights = shuffle(X2, y2, weights, random_state=0) + y2 = _enforce_estimator_tags_y(estimator1, y2) estimator1.fit(X, y=y, sample_weight=np.ones(shape=len(y))) - estimator2.fit(X2, y=y2, sample_weight=weights) + estimator2.fit(X2, y=y2, sample_weight=sw2) for method in ["predict", "predict_proba", "decision_function", "transform"]: if hasattr(estimator_orig, method): X_pred1 = getattr(estimator1, method)(X) X_pred2 = getattr(estimator2, method)(X) - if sparse.issparse(X_pred1): X_pred1 = X_pred1.toarray() X_pred2 = X_pred2.toarray() - - assert_allclose( - X_pred1, X_pred2, - err_msg="For %s sample_weight is not" - " equivalent to removing samples" - % name) - + assert_allclose(X_pred1, X_pred2, err_msg=err_msg) @ignore_warnings(category=(FutureWarning, UserWarning)) From 2f541cabd973b7b238e9bf65bb0889783e70d391 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 21 Feb 2020 00:02:15 +0100 Subject: [PATCH 06/13] Shuffle data --- sklearn/utils/estimator_checks.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 130e5a1b003cf..43898ccb5877b 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -797,6 +797,8 @@ def check_sample_weights_invariance(name, estimator_orig, kind="ones"): y2 = np.hstack([y, 3 - y]) sw2 = np.ones(shape=len(y) * 2) sw2[len(y):] = 0 + X2, y2, sw2 = shuffle(X2, y2, sw2, random_state=0) + err_msg=(f"For {name} sample_weight is not equivalent " f"to removing samples") else: From ec8542b6d40f522012133bb841a00b06f79a8d0f Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 21 Feb 2020 00:08:25 +0100 Subject: [PATCH 07/13] lint --- sklearn/utils/estimator_checks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 43898ccb5877b..3bd16bcb44f7c 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -799,13 +799,13 @@ def check_sample_weights_invariance(name, estimator_orig, kind="ones"): sw2[len(y):] = 0 X2, y2, sw2 = shuffle(X2, y2, sw2, random_state=0) - err_msg=(f"For {name} sample_weight is not equivalent " - f"to removing samples") + err_msg = (f"For {name} sample_weight is not equivalent " + f"to removing samples") else: raise ValueError y = _enforce_estimator_tags_y(estimator1, y) - y2 = _enforce_estimator_tags_y(estimator1, y2) + y2 = _enforce_estimator_tags_y(estimator2, y2) estimator1.fit(X, y=y, sample_weight=np.ones(shape=len(y))) estimator2.fit(X2, y=y2, sample_weight=sw2) From 0028872136c9e22f36f0756225258d4e3a0b8498 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 21 Feb 2020 00:57:53 +0100 Subject: [PATCH 08/13] Skip tests marked as xfail in check_estimator --- sklearn/tests/test_common.py | 7 +++---- sklearn/utils/estimator_checks.py | 9 +++++++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 508356155aaf7..cffda8ac09013 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -96,10 +96,9 @@ def test_estimators(estimator, check, request): xfail_checks = _safe_tags(estimator, '_xfail_test') check_name = _set_check_estimator_ids(check) - if xfail_checks: - if check_name in xfail_checks: - msg = xfail_checks[check_name] - request.applymarker(pytest.mark.xfail(reason=msg)) + if xfail_checks and check_name in xfail_checks: + msg = xfail_checks[check_name] + request.applymarker(pytest.mark.xfail(reason=msg)) check(estimator) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 3bd16bcb44f7c..e0c97277c5349 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -418,6 +418,7 @@ def check_estimator(Estimator, generate_only=False): if isinstance(Estimator, type): # got a class checks_generator = _generate_class_checks(Estimator) + estimator = _construct_instance(Estimator) else: # got an instance estimator = Estimator @@ -427,7 +428,15 @@ def check_estimator(Estimator, generate_only=False): if generate_only: return checks_generator + xfail_checks = _safe_tags(estimator, '_xfail_test') + for estimator, check in checks_generator: + check_name = _set_check_estimator_ids(check) + if xfail_checks and check_name in xfail_checks: + # skip tests marked as a known failure and raise a warning + msg = xfail_checks[check_name] + warnings.warn(f'Skipping {check_name}: {msg}', SkipTestWarning) + continue try: check(estimator) except SkipTest as exception: From a9ebc8f5314006f5f73d5026b8bf72db9fa18326 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 5 Mar 2020 11:44:55 +0100 Subject: [PATCH 09/13] Address Joels's comment --- sklearn/linear_model/_ridge.py | 8 ++++++++ sklearn/utils/estimator_checks.py | 4 +++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 1a9cb661318e9..91748e675dceb 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -1905,3 +1905,11 @@ def fit(self, X, y, sample_weight=None): @property def classes_(self): return self._label_binarizer.classes_ + + def _more_tags(self): + return { + '_xfail_test': { + 'check_sample_weights_invariance(kind=zeros)': + 'zero sample_weight is not equivalent to removing samples', + } + } diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index d2da891244e52..8652de1a8599d 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -853,7 +853,9 @@ def check_sample_weights_invariance(name, estimator_orig, kind="ones"): err_msg = (f"For {name} sample_weight=None is not equivalent to " f"sample_weight=ones") elif kind == 'zeros': - X2 = np.vstack([X, X]) + # Construct a dataset that is very different to (X, y) if weights + # are disregarded, but identical to (X, y) given weights. + X2 = np.vstack([X, X + 1]) y2 = np.hstack([y, 3 - y]) sw2 = np.ones(shape=len(y) * 2) sw2[len(y):] = 0 From dcc52f0558e7787f69355fab914f26d3aa5dd6f2 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 5 Mar 2020 14:56:07 +0100 Subject: [PATCH 10/13] Update sklearn/utils/estimator_checks.py Co-Authored-By: Nicolas Hug --- sklearn/utils/estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 8652de1a8599d..c7e2702437c34 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -458,7 +458,7 @@ def check_estimator(Estimator, generate_only=False): for estimator, check in checks_generator: check_name = _set_check_estimator_ids(check) - if xfail_checks and check_name in xfail_checks: + if check_name in xfail_checks: # skip tests marked as a known failure and raise a warning msg = xfail_checks[check_name] warnings.warn(f'Skipping {check_name}: {msg}', SkipTestWarning) From 137f0d6088e793f52f363ce94670f15b0451f8ca Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 5 Mar 2020 14:59:39 +0100 Subject: [PATCH 11/13] Improve wording of the comment --- sklearn/utils/estimator_checks.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index c7e2702437c34..389ec1be3ed3e 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -466,8 +466,7 @@ def check_estimator(Estimator, generate_only=False): try: check(estimator) except SkipTest as exception: - # the only SkipTest thrown currently results from not - # being able to import pandas. + # raise warning for tests that are are skipped warnings.warn(str(exception), SkipTestWarning) From ed7abce96eb8962c263463ca9556430da2e6d718 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 5 Mar 2020 15:04:02 +0100 Subject: [PATCH 12/13] Address comments --- sklearn/utils/estimator_checks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 389ec1be3ed3e..fcfa9f533aa64 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -848,7 +848,7 @@ def check_sample_weights_invariance(name, estimator_orig, kind="ones"): if kind == 'ones': X2 = X y2 = y - sw2 = None + sw2 = np.ones(shape=len(y)) err_msg = (f"For {name} sample_weight=None is not equivalent to " f"sample_weight=ones") elif kind == 'zeros': @@ -868,7 +868,7 @@ def check_sample_weights_invariance(name, estimator_orig, kind="ones"): y = _enforce_estimator_tags_y(estimator1, y) y2 = _enforce_estimator_tags_y(estimator2, y2) - estimator1.fit(X, y=y, sample_weight=np.ones(shape=len(y))) + estimator1.fit(X, y=y, sample_weight=None) estimator2.fit(X2, y=y2, sample_weight=sw2) for method in ["predict", "predict_proba", From 5beadf7b182c378ad3a014dfb72bcf8a35617f0f Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 5 Mar 2020 15:27:11 +0100 Subject: [PATCH 13/13] Revert "Update sklearn/utils/estimator_checks.py" This reverts commit dcc52f0558e7787f69355fab914f26d3aa5dd6f2. --- sklearn/utils/estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index fcfa9f533aa64..083fba0e6e45d 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -458,7 +458,7 @@ def check_estimator(Estimator, generate_only=False): for estimator, check in checks_generator: check_name = _set_check_estimator_ids(check) - if check_name in xfail_checks: + if xfail_checks and check_name in xfail_checks: # skip tests marked as a known failure and raise a warning msg = xfail_checks[check_name] warnings.warn(f'Skipping {check_name}: {msg}', SkipTestWarning)