From 840cd39ef26c9083a5abbbbfd0bbb6dafea0c0b6 Mon Sep 17 00:00:00 2001 From: Bill DeRose Date: Mon, 24 Feb 2020 15:00:54 -0800 Subject: [PATCH 01/14] pass sample_weight when predicting on stacked folds --- sklearn/ensemble/_stacking.py | 3 +++ sklearn/ensemble/tests/test_stacking.py | 14 ++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 7af2d6dc95d2b..58b6aa6c26564 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -169,6 +169,9 @@ def fit(self, X, y, sample_weight=None): predictions = Parallel(n_jobs=self.n_jobs)( delayed(cross_val_predict)(clone(est), X, y, cv=deepcopy(cv), method=meth, n_jobs=self.n_jobs, + fit_params={ + 'sample_weight': sample_weight + }, verbose=self.verbose) for est, meth in zip(all_estimators, self.stack_method_) if est != 'drop' diff --git a/sklearn/ensemble/tests/test_stacking.py b/sklearn/ensemble/tests/test_stacking.py index 1eff7ba5f7de7..0b9cee448e0ed 100644 --- a/sklearn/ensemble/tests/test_stacking.py +++ b/sklearn/ensemble/tests/test_stacking.py @@ -38,6 +38,7 @@ from sklearn.model_selection import StratifiedKFold from sklearn.model_selection import KFold +from sklearn.utils._mocking import CheckingClassifier from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_allclose_dense_sparse from sklearn.utils._testing import ignore_warnings @@ -439,6 +440,19 @@ def test_stacking_with_sample_weight(stacker, X, y): assert np.abs(y_pred_no_weight - y_pred_biased).sum() > 0 +def test_fit_stacking_with_sample_weight_passed(): + # check sample_weight is passed to all invokations of fit + stacker = StackingClassifier( + estimators=[ + ('lr', CheckingClassifier(expected_fit_params=['sample_weight'])) + ], + final_estimator=CheckingClassifier( + expected_fit_params=['sample_weight'] + ) + ) + stacker.fit(X_iris, y_iris, sample_weight=np.ones(X_iris.shape[0])) + + @pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") @pytest.mark.parametrize( "stacker, X, y", From 71997990e1b01ec23ec145ac7d0f23c776c8f27f Mon Sep 17 00:00:00 2001 From: Bill DeRose Date: Mon, 24 Feb 2020 19:21:10 -0800 Subject: [PATCH 02/14] use more descriptive spec def --- sklearn/ensemble/tests/test_stacking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/tests/test_stacking.py b/sklearn/ensemble/tests/test_stacking.py index 0b9cee448e0ed..1d4f897020363 100644 --- a/sklearn/ensemble/tests/test_stacking.py +++ b/sklearn/ensemble/tests/test_stacking.py @@ -440,7 +440,7 @@ def test_stacking_with_sample_weight(stacker, X, y): assert np.abs(y_pred_no_weight - y_pred_biased).sum() > 0 -def test_fit_stacking_with_sample_weight_passed(): +def test_stacking_classifier_sample_weight_fit_param(): # check sample_weight is passed to all invokations of fit stacker = StackingClassifier( estimators=[ From 676a720a9c6abf5958ad7f63aa8de3c979f306d7 Mon Sep 17 00:00:00 2001 From: Bill DeRose Date: Tue, 25 Feb 2020 09:44:16 -0800 Subject: [PATCH 03/14] fix specs by conditionally passing sample_weight --- sklearn/ensemble/_stacking.py | 25 ++++++------------------- sklearn/ensemble/tests/test_stacking.py | 2 +- 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 58b6aa6c26564..3b7a19184d24b 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -165,13 +165,13 @@ def fit(self, X, y, sample_weight=None): self._method_name(name, est, meth) for name, est, meth in zip(names, all_estimators, stack_method) ] - + fit_params = {"sample_weight": sample_weight} \ + if sample_weight is not None \ + else None predictions = Parallel(n_jobs=self.n_jobs)( delayed(cross_val_predict)(clone(est), X, y, cv=deepcopy(cv), method=meth, n_jobs=self.n_jobs, - fit_params={ - 'sample_weight': sample_weight - }, + fit_params=fit_params, verbose=self.verbose) for est, meth in zip(all_estimators, self.stack_method_) if est != 'drop' @@ -185,21 +185,8 @@ def fit(self, X, y, sample_weight=None): ] X_meta = self._concatenate_predictions(X, predictions) - if sample_weight is not None: - try: - self.final_estimator_.fit( - X_meta, y, sample_weight=sample_weight - ) - except TypeError as exc: - if "unexpected keyword argument 'sample_weight'" in str(exc): - raise TypeError( - "Underlying estimator {} does not support sample " - "weights." - .format(self.final_estimator_.__class__.__name__) - ) from exc - raise - else: - self.final_estimator_.fit(X_meta, y) + _parallel_fit_estimator(self.final_estimator_, X_meta, y, + sample_weight=sample_weight) return self diff --git a/sklearn/ensemble/tests/test_stacking.py b/sklearn/ensemble/tests/test_stacking.py index 1d4f897020363..f8a3f290e96b5 100644 --- a/sklearn/ensemble/tests/test_stacking.py +++ b/sklearn/ensemble/tests/test_stacking.py @@ -441,7 +441,7 @@ def test_stacking_with_sample_weight(stacker, X, y): def test_stacking_classifier_sample_weight_fit_param(): - # check sample_weight is passed to all invokations of fit + # check sample_weight is passed to all invocations of fit stacker = StackingClassifier( estimators=[ ('lr', CheckingClassifier(expected_fit_params=['sample_weight'])) From 9d7003152c43c719b6e8a0fef96a9bd40c0fb61a Mon Sep 17 00:00:00 2001 From: Bill DeRose Date: Tue, 25 Feb 2020 09:55:01 -0800 Subject: [PATCH 04/14] Add entry in release notes describing improvement --- doc/whats_new/v0.23.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 720a19cd862f6..0a10d366767b9 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -179,6 +179,12 @@ Changelog used during `fit`. :pr:`16437` by :user:`Jin-Hwan CHO `. +- |Fix| Fixed a bug in :class:`ensemble._BaseStacking` where the +`sample_weight` argument was not being passed to `cross_val_predict` +when evaluating the base estimators on cross-validation folds +to obtain the input to the meta estimator. +:pr:`16539` by :user: `Bill DeRose `. + :mod:`sklearn.feature_extraction` ................................. From 324a59443f5150a396580821492f1df63f1a91b3 Mon Sep 17 00:00:00 2001 From: Bill DeRose Date: Tue, 25 Feb 2020 10:04:01 -0800 Subject: [PATCH 05/14] add versionchanged note about sample_weight --- sklearn/ensemble/_stacking.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 3b7a19184d24b..64f2d86e2b981 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -122,6 +122,10 @@ def fit(self, X, y, sample_weight=None): Note that this is supported only if all underlying estimators support sample weights. + .. versionchanged:: 0.23 + when not None, `sample_weight` is passed to all underlying + estimators + Returns ------- self : object From c032fd3b567b025d6ecaa0b4fac65572a56ad4b2 Mon Sep 17 00:00:00 2001 From: Bill DeRose Date: Tue, 25 Feb 2020 10:06:52 -0800 Subject: [PATCH 06/14] fixup witespace in docstring --- sklearn/ensemble/_stacking.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 64f2d86e2b981..76a334e9b2e43 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -123,8 +123,8 @@ def fit(self, X, y, sample_weight=None): support sample weights. .. versionchanged:: 0.23 - when not None, `sample_weight` is passed to all underlying - estimators + when not None, `sample_weight` is passed to all underlying + estimators Returns ------- From 8996d3ed82313c71f563a2910c1151ca78a84d50 Mon Sep 17 00:00:00 2001 From: Bill DeRose Date: Tue, 25 Feb 2020 10:18:54 -0800 Subject: [PATCH 07/14] indent bullet list in release notes --- doc/whats_new/v0.23.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 0a10d366767b9..5adf53c25d4c5 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -180,10 +180,10 @@ Changelog :pr:`16437` by :user:`Jin-Hwan CHO `. - |Fix| Fixed a bug in :class:`ensemble._BaseStacking` where the -`sample_weight` argument was not being passed to `cross_val_predict` -when evaluating the base estimators on cross-validation folds -to obtain the input to the meta estimator. -:pr:`16539` by :user: `Bill DeRose `. + `sample_weight` argument was not being passed to `cross_val_predict` + when evaluating the base estimators on cross-validation folds + to obtain the input to the meta estimator. + :pr:`16539` by :user: `Bill DeRose `. :mod:`sklearn.feature_extraction` ................................. From 9f14750909e14c979f11c9fc972e79f47e598535 Mon Sep 17 00:00:00 2001 From: Bill DeRose Date: Fri, 28 Feb 2020 15:02:20 -0800 Subject: [PATCH 08/14] fix: update release notes to reference to StackingClassifier and StackingRegressor instead of _BaseStacking --- doc/whats_new/v0.23.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 1890c1a0f5f92..63497e34eedde 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -179,11 +179,12 @@ Changelog used during `fit`. :pr:`16437` by :user:`Jin-Hwan CHO `. -- |Fix| Fixed a bug in :class:`ensemble._BaseStacking` where the - `sample_weight` argument was not being passed to `cross_val_predict` - when evaluating the base estimators on cross-validation folds +- |Fix| Fixed a bug in :class:`ensemble.StackingClassifier` and + :class:`ensemble.StackingRegressor` where the `sample_weight` + argument was not being passed to `cross_val_predict` when + evaluating the base estimators on cross-validation folds to obtain the input to the meta estimator. - :pr:`16539` by :user: `Bill DeRose `. + :pr:`16539` by :user:`Bill DeRose `. :mod:`sklearn.feature_extraction` ................................. From 4a17b2ccb5eac3c1b12e93366fe3e1fd05575cc6 Mon Sep 17 00:00:00 2001 From: Bill DeRose Date: Fri, 28 Feb 2020 15:03:11 -0800 Subject: [PATCH 09/14] fix: minor formatting suggestion from review comments --- sklearn/ensemble/_stacking.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 76a334e9b2e43..04d04e2a1f98e 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -169,9 +169,9 @@ def fit(self, X, y, sample_weight=None): self._method_name(name, est, meth) for name, est, meth in zip(names, all_estimators, stack_method) ] - fit_params = {"sample_weight": sample_weight} \ - if sample_weight is not None \ - else None + fit_params = ({"sample_weight": sample_weight} + if sample_weight is not None + else None) predictions = Parallel(n_jobs=self.n_jobs)( delayed(cross_val_predict)(clone(est), X, y, cv=deepcopy(cv), method=meth, n_jobs=self.n_jobs, From 7bae037552fc8f16e780d681401ad6a1c17e0700 Mon Sep 17 00:00:00 2001 From: Bill DeRose Date: Fri, 28 Feb 2020 15:09:08 -0800 Subject: [PATCH 10/14] fix: add comment describing odd naming of _parallel_fit_estimator --- sklearn/ensemble/_base.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 41e50f7d5a7fc..fb5a881adeb9f 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -22,7 +22,12 @@ def _parallel_fit_estimator(estimator, X, y, sample_weight=None, message_clsname=None, message=None): - """Private function used to fit an estimator within a job.""" + """Private function used to fit an estimator within a job. + Naming is weird here because we only fit a single estimator. + TODO: rename _parallel_fit_estimator to _fit_single_estimator. + See: + https://github.com/scikit-learn/scikit-learn/pull/16539/files#r385937964 + """ if sample_weight is not None: try: with _print_elapsed_time(message_clsname, message): From c2407916c9712e987f89d018f8fa61d9b2381403 Mon Sep 17 00:00:00 2001 From: Bill DeRose Date: Fri, 28 Feb 2020 15:30:41 -0800 Subject: [PATCH 11/14] fix: update link to PR with direct link to issue --- sklearn/ensemble/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index fb5a881adeb9f..714d8591c6052 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -26,7 +26,7 @@ def _parallel_fit_estimator(estimator, X, y, sample_weight=None, Naming is weird here because we only fit a single estimator. TODO: rename _parallel_fit_estimator to _fit_single_estimator. See: - https://github.com/scikit-learn/scikit-learn/pull/16539/files#r385937964 + https://github.com/scikit-learn/scikit-learn/issues/16595 """ if sample_weight is not None: try: From 25cca551114c1c98bd59fa5ab871a5e059d1fb59 Mon Sep 17 00:00:00 2001 From: Bill DeRose Date: Sat, 29 Feb 2020 11:17:35 -0800 Subject: [PATCH 12/14] set n_features_in_ when CheckingClassifier is fit --- sklearn/utils/_mocking.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/utils/_mocking.py b/sklearn/utils/_mocking.py index 07c7b9a70ca05..7dc2e76391697 100644 --- a/sklearn/utils/_mocking.py +++ b/sklearn/utils/_mocking.py @@ -96,6 +96,7 @@ def fit(self, X, y, **fit_params): assert self.check_X(X) if self.check_y is not None: assert self.check_y(y) + self._check_n_features(X, reset=True) self.classes_ = np.unique(check_array(y, ensure_2d=False, allow_nd=True)) if self.expected_fit_params: From 78d75b87b4aaa544851e37e2fab0a89f6d87a981 Mon Sep 17 00:00:00 2001 From: Bill DeRose Date: Sat, 29 Feb 2020 11:27:07 -0800 Subject: [PATCH 13/14] fix formatting --- sklearn/ensemble/_stacking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 283ad0440415f..ba817613523f6 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -191,7 +191,7 @@ def fit(self, X, y, sample_weight=None): X_meta = self._concatenate_predictions(X, predictions) _fit_single_estimator(self.final_estimator_, X_meta, y, - sample_weight=sample_weight) + sample_weight=sample_weight) return self From 8bcbbe9c0f951abab4a7fbd29f303b836041cd52 Mon Sep 17 00:00:00 2001 From: Bill DeRose Date: Sat, 29 Feb 2020 11:49:11 -0800 Subject: [PATCH 14/14] use len(X) as n_features_in_ --- sklearn/utils/_mocking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/_mocking.py b/sklearn/utils/_mocking.py index 7dc2e76391697..91d2fe5b89d2e 100644 --- a/sklearn/utils/_mocking.py +++ b/sklearn/utils/_mocking.py @@ -96,7 +96,7 @@ def fit(self, X, y, **fit_params): assert self.check_X(X) if self.check_y is not None: assert self.check_y(y) - self._check_n_features(X, reset=True) + self.n_features_in_ = len(X) self.classes_ = np.unique(check_array(y, ensure_2d=False, allow_nd=True)) if self.expected_fit_params: