From 6ab4fe68d1fd13ead6dd416faf1d127f37859425 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 3 Jul 2019 16:26:06 +0200 Subject: [PATCH 1/3] FIX make sure sample_weight is taken into account by estimators --- sklearn/utils/estimator_checks.py | 40 +++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 249cb022f8e87..0f872729864bb 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -78,6 +78,7 @@ def _yield_checks(name, estimator): yield check_sample_weights_pandas_series yield check_sample_weights_list yield check_sample_weights_invariance + yield check_sample_weights_equivalence_sampling yield check_estimators_fit_returns_self yield partial(check_estimators_fit_returns_self, readonly_memmap=True) @@ -631,6 +632,45 @@ def check_sample_weights_invariance(name, estimator_orig): % name) +@ignore_warnings(category=(DeprecationWarning, FutureWarning)) +def check_sample_weights_equivalence_sampling(name, estimator_orig): + # check that the estimators yield same results for + # over-sample dataset by indice filtering and using sampl_weight + if (has_fit_parameter(estimator_orig, "sample_weight") and + not (hasattr(estimator_orig, "_pairwise") + and estimator_orig._pairwise)): + # We skip pairwise because the data is not pairwise + + estimator1 = clone(estimator_orig) + estimator2 = clone(estimator_orig) + set_random_state(estimator1, random_state=0) + set_random_state(estimator2, random_state=0) + + if is_classifier(estimator1): + X, y = load_iris(return_X_y=True) + else: + X, y = load_boston(return_X_y=True) + y = enforce_estimator_tags_y(estimator1, y) + + indices = np.arange(start=0, stop=y.size, step=2) + sample_weight = np.ones((y.size,)) * np.bincount(indices, + minlength=y.size) + + estimator1.fit(X, y=y, sample_weight=sample_weight) + estimator2.fit(X[indices], y[indices]) + + err_msg = ("For {} does not yield to the 
same results when given " + "sample_weight and an up-sampled dataset") + for method in ["predict", "transform"]: + if hasattr(estimator_orig, method): + X_pred1 = getattr(estimator1, method)(X) + X_pred2 = getattr(estimator2, method)(X) + if sparse.issparse(X_pred1): + X_pred1 = X_pred1.toarray() + X_pred2 = X_pred2.toarray() + assert_allclose(X_pred1, X_pred2, err_msg=err_msg) + + @ignore_warnings(category=(DeprecationWarning, FutureWarning, UserWarning)) def check_dtype_object(name, estimator_orig): # check that estimators treat dtype object as numeric if possible From a8530113b16636639eb959c6b2963a59c5a716cd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 3 Jul 2019 22:36:53 +0200 Subject: [PATCH 2/3] typo --- sklearn/utils/estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 0f872729864bb..696d25c6565ff 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -635,7 +635,7 @@ def check_sample_weights_invariance(name, estimator_orig): @ignore_warnings(category=(DeprecationWarning, FutureWarning)) def check_sample_weights_equivalence_sampling(name, estimator_orig): # check that the estimators yield same results for - # over-sample dataset by indice filtering and using sampl_weight + # over-sample dataset by indice filtering and using sample_weight if (has_fit_parameter(estimator_orig, "sample_weight") and not (hasattr(estimator_orig, "_pairwise") and estimator_orig._pairwise)): From 1271d8da05245bafaede62f9f20c4398fb57ba48 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 5 Jul 2019 15:17:57 +0200 Subject: [PATCH 3/3] improve readability --- sklearn/utils/estimator_checks.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 0f872729864bb..059b763199f4f 100644 --- 
a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -652,9 +652,10 @@ def check_sample_weights_equivalence_sampling(name, estimator_orig): X, y = load_boston(return_X_y=True) y = enforce_estimator_tags_y(estimator1, y) - indices = np.arange(start=0, stop=y.size, step=2) - sample_weight = np.ones((y.size,)) * np.bincount(indices, - minlength=y.size) + step = 2 + indices = np.arange(start=0, stop=y.size, step=step) + sample_weight = np.zeros((y.size,)) + sample_weight[::step] = 1. estimator1.fit(X, y=y, sample_weight=sample_weight) estimator2.fit(X[indices], y[indices]) @@ -665,10 +666,7 @@ def check_sample_weights_equivalence_sampling(name, estimator_orig): if hasattr(estimator_orig, method): X_pred1 = getattr(estimator1, method)(X) X_pred2 = getattr(estimator2, method)(X) - if sparse.issparse(X_pred1): - X_pred1 = X_pred1.toarray() - X_pred2 = X_pred2.toarray() - assert_allclose(X_pred1, X_pred2, err_msg=err_msg) + assert_allclose_dense_sparse(X_pred1, X_pred2, err_msg=err_msg) @ignore_warnings(category=(DeprecationWarning, FutureWarning, UserWarning))