From 4edc11036adda6e6dfc913c2e7308d863dbd29fe Mon Sep 17 00:00:00 2001 From: hmasdev Date: Thu, 2 Nov 2023 12:45:36 +0900 Subject: [PATCH 1/6] make StackingRegressor support Multioutput --- sklearn/ensemble/_stacking.py | 7 +- sklearn/ensemble/tests/test_stacking.py | 85 +++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 8e27facda11df..431e71c262ced 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -958,7 +958,12 @@ def fit(self, X, y, sample_weight=None): Returns a fitted instance. """ _raise_for_unsupported_routing(self, "fit", sample_weight=sample_weight) - y = column_or_1d(y, warn=True) + try: + # Single Target + y = column_or_1d(y, warn=True) + except ValueError: + # Multioutput target. + y = self._validate_data(X="no_validation", y=y, multi_output=True) return super().fit(X, y, sample_weight) def transform(self, X): diff --git a/sklearn/ensemble/tests/test_stacking.py b/sklearn/ensemble/tests/test_stacking.py index d15aa32077689..5593e4481cc42 100644 --- a/sklearn/ensemble/tests/test_stacking.py +++ b/sklearn/ensemble/tests/test_stacking.py @@ -34,6 +34,7 @@ RidgeClassifier, ) from sklearn.model_selection import KFold, StratifiedKFold, train_test_split +from sklearn.multioutput import MultiOutputRegressor from sklearn.neighbors import KNeighborsClassifier from sklearn.neural_network import MLPClassifier from sklearn.preprocessing import scale @@ -859,3 +860,87 @@ def test_stacking_classifier_base_regressor(): clf.predict(X_test) clf.predict_proba(X_test) assert clf.score(X_test, y_test) > 0.8 + + +def test_stacking_regressor_multioutput(): + """Check that a stacking regressor with multioutput works""" + cv = 2 + acceptable_relative_tolerance = 1e-10 + acceptable_aboslute_tolerance = 1e-10 + + X_train = np.hstack([np.arange(5)] * cv).reshape(-1, 1) + y_train = np.hstack([2 * X_train + 1, 3 * X_train - 2]) + assert y_train.ndim > 1 + + estimator1 = LinearRegression(fit_intercept=True) + estimator2 = MultiOutputRegressor(DummyRegressor(strategy="constant", constant=0)) + final_estimator = LinearRegression(fit_intercept=False, positive=True) + + reg = StackingRegressor( + estimators=[("lr", estimator1), ("dr", estimator2)], + final_estimator=final_estimator, + cv=KFold(n_splits=cv, shuffle=False), + passthrough=False, + ) + + reg.fit(X_train, y_train) + # predict + y_pred = reg.predict(X_train) + # NOTE: In this case the estimator can predict almost exactly the target + assert_allclose( + y_pred, + y_train, + rtol=acceptable_relative_tolerance, + atol=acceptable_aboslute_tolerance, + ) + # transform + X_trans = reg.transform(X_train) + # NOTE: The result of transform is the horizontal stack of the predictions + assert_allclose( + X_trans, + np.hstack([y_train, np.zeros(y_train.shape)]), + rtol=acceptable_relative_tolerance, + atol=acceptable_aboslute_tolerance, + ) + + +def test_stacking_regressor_multioutput_with_passthrough(): + """Check that a stacking regressor with multioutput works""" + cv = 2 + acceptable_relative_tolerance = 1e-10 + acceptable_aboslute_tolerance = 1e-10 + + X_train = np.hstack([np.arange(5)] * cv).reshape(-1, 1) + y_train = np.hstack([2 * X_train + 1, 3 * X_train - 2]) + assert y_train.ndim > 1 + + estimator1 = LinearRegression(fit_intercept=True) + estimator2 = MultiOutputRegressor(DummyRegressor(strategy="constant", constant=0)) + final_estimator = LinearRegression(fit_intercept=False, positive=True) + + reg = StackingRegressor( + estimators=[("lr", estimator1), ("dr", estimator2)], + final_estimator=final_estimator, + cv=KFold(n_splits=cv, shuffle=False), + passthrough=True, + ) + + reg.fit(X_train, y_train) + # predict + y_pred = reg.predict(X_train) + # NOTE: In this case, the estimator can predict almost exactly the target + assert_allclose( + y_pred, + y_train, + rtol=acceptable_relative_tolerance, + atol=acceptable_aboslute_tolerance, + ) + # transform + X_trans = reg.transform(X_train) + # NOTE: X_trans should be the horizontal stack of the predictions and X_train + assert_allclose( + X_trans, + np.hstack([y_train, np.zeros(y_train.shape), X_train]), + rtol=acceptable_relative_tolerance, + atol=acceptable_aboslute_tolerance, + ) From dd947bdc385fec9cc1d7c54ea023e1f62570731d Mon Sep 17 00:00:00 2001 From: hmasdev Date: Thu, 2 Nov 2023 13:06:54 +0900 Subject: [PATCH 2/6] update docstring of StackingRegressor --- sklearn/ensemble/_stacking.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 431e71c262ced..5ae500d788ecc 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -944,7 +944,7 @@ def fit(self, X, y, sample_weight=None): Training vectors, where `n_samples` is the number of samples and `n_features` is the number of features. - y : array-like of shape (n_samples,) + y : array-like of shape (n_samples,) or (n_samples, n_outputs) Target values. sample_weight : array-like of shape (n_samples,), default=None @@ -977,8 +977,10 @@ def transform(self, X): Returns ------- - y_preds : ndarray of shape (n_samples, n_estimators) + y_preds : ndarray of shape + (n_samples, n_estimators) or (n_samples, n_estimators x n_outputs). Prediction outputs for each estimator. + If passthrough=True, the number of columns increases by n_features. """ return self._transform(X) @@ -991,7 +993,7 @@ def fit_transform(self, X, y, sample_weight=None): Training vectors, where `n_samples` is the number of samples and `n_features` is the number of features. - y : array-like of shape (n_samples,) + y : array-like of shape (n_samples,) or (n_samples, n_outputs) Target values. sample_weight : array-like of shape (n_samples,), default=None @@ -1001,8 +1003,10 @@ def fit_transform(self, X, y, sample_weight=None): Returns ------- - y_preds : ndarray of shape (n_samples, n_estimators) + y_preds : ndarray of shape + (n_samples, n_estimators) or (n_samples, n_estimators x n_outputs). Prediction outputs for each estimator. + If passthrough=True, the number of columns increases by n_features. """ return super().fit_transform(X, y, sample_weight=sample_weight) From f3235932d0764a8bbc1824c8b4f216d3038f3cc5 Mon Sep 17 00:00:00 2001 From: hmasdev Date: Tue, 6 Feb 2024 07:57:11 +0900 Subject: [PATCH 3/6] test: update stacking regressor test with Ridge --- sklearn/ensemble/tests/test_stacking.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/tests/test_stacking.py b/sklearn/ensemble/tests/test_stacking.py index 5593e4481cc42..0a1430947c5e5 100644 --- a/sklearn/ensemble/tests/test_stacking.py +++ b/sklearn/ensemble/tests/test_stacking.py @@ -874,7 +874,7 @@ def test_stacking_regressor_multioutput(): estimator1 = LinearRegression(fit_intercept=True) estimator2 = MultiOutputRegressor(DummyRegressor(strategy="constant", constant=0)) - final_estimator = LinearRegression(fit_intercept=False, positive=True) + final_estimator = Ridge(alpha=1e-12, fit_intercept=False, random_state=42) reg = StackingRegressor( estimators=[("lr", estimator1), ("dr", estimator2)], @@ -916,7 +916,7 @@ def test_stacking_regressor_multioutput_with_passthrough(): estimator1 = LinearRegression(fit_intercept=True) estimator2 = MultiOutputRegressor(DummyRegressor(strategy="constant", constant=0)) - final_estimator = LinearRegression(fit_intercept=False, positive=True) + final_estimator = Ridge(alpha=1e-12, fit_intercept=False, random_state=42) reg = StackingRegressor( estimators=[("lr", estimator1), ("dr", estimator2)], From da2f4b7fc80635bc6e1708898de46be927394359 Mon Sep 17 00:00:00 2001 From: hmasdev Date: Mon, 22 Apr 2024 21:59:50 +0900 Subject: [PATCH 4/6] Commit suggestion: try-except to if syntax --- sklearn/ensemble/_stacking.py | 7 ++-- sklearn/ensemble/tests/test_stacking.py | 43 +++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 5 deletions(-) diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 908e5fff61c15..0cee6d4fe54a3 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -967,12 +967,9 @@ def fit(self, X, y, sample_weight=None): Returns a fitted instance. """ _raise_for_unsupported_routing(self, "fit", sample_weight=sample_weight) - try: - # Single Target + y = self._validate_data(y=y, multi_output=True) + if y.ndim == 2 and y.shape[1] == 1: y = column_or_1d(y, warn=True) - except ValueError: - # Multioutput target. - y = self._validate_data(X="no_validation", y=y, multi_output=True) return super().fit(X, y, sample_weight) def transform(self, X): diff --git a/sklearn/ensemble/tests/test_stacking.py b/sklearn/ensemble/tests/test_stacking.py index 465e64ed2914b..69fb991e0f9e0 100644 --- a/sklearn/ensemble/tests/test_stacking.py +++ b/sklearn/ensemble/tests/test_stacking.py @@ -862,6 +862,49 @@ def test_stacking_classifier_base_regressor(): assert clf.score(X_test, y_test) > 0.8 +def test_stacking_regressor_singleoutput_but_2d(): + """Check that a stacking regressor with a single output but 2D target works""" + cv = 2 + acceptable_relative_tolerance = 1e-10 + acceptable_aboslute_tolerance = 1e-10 + + X_train = np.hstack([np.arange(5)] * cv).reshape(-1, 1) + y_train = (2 * X_train + 1).reshape(-1, 1) + + estimator1 = LinearRegression(fit_intercept=True) + estimator2 = DummyRegressor(strategy="constant", constant=0) + final_estimator = Ridge(alpha=1e-12, fit_intercept=False, random_state=42) + + reg = StackingRegressor( + estimators=[("lr", estimator1), ("dr", estimator2)], + final_estimator=final_estimator, + cv=KFold(n_splits=cv, shuffle=False), + passthrough=False, + ) + + reg.fit(X_train, y_train) + # predict + y_pred = reg.predict(X_train) + # NOTE: In this case the estimator can predict almost exactly the target + assert_allclose( + y_pred, + # NOTE: when the target is 2D but with a single output, + # the predictions are 1D because of column_or_1d + y_train.flatten(), + rtol=acceptable_relative_tolerance, + atol=acceptable_aboslute_tolerance, + ) + # transform + X_trans = reg.transform(X_train) + # NOTE: The result of transform is the horizontal stack of the predictions + assert_allclose( + X_trans, + np.hstack([y_train, np.zeros(y_train.shape)]), + rtol=acceptable_relative_tolerance, + atol=acceptable_aboslute_tolerance, + ) + + def test_stacking_regressor_multioutput(): """Check that a stacking regressor with multioutput works""" cv = 2 From 2c2a49cc58df88d451e4d30f4c82101a54cdffad Mon Sep 17 00:00:00 2001 From: hmasdev Date: Tue, 7 May 2024 23:10:15 +0900 Subject: [PATCH 5/6] Commit suggestion: update comments in test code --- sklearn/ensemble/tests/test_stacking.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/sklearn/ensemble/tests/test_stacking.py b/sklearn/ensemble/tests/test_stacking.py index 69fb991e0f9e0..9e9298c3905f5 100644 --- a/sklearn/ensemble/tests/test_stacking.py +++ b/sklearn/ensemble/tests/test_stacking.py @@ -883,18 +883,16 @@ def test_stacking_regressor_singleoutput_but_2d(): ) reg.fit(X_train, y_train) - # predict y_pred = reg.predict(X_train) # NOTE: In this case the estimator can predict almost exactly the target + # when the target is 2D but with a single output, the predictions are 1D + # because of column_or_1d assert_allclose( y_pred, - # NOTE: when the target is 2D but with a single output, - # the predictions are 1D because of column_or_1d y_train.flatten(), rtol=acceptable_relative_tolerance, atol=acceptable_aboslute_tolerance, ) - # transform X_trans = reg.transform(X_train) # NOTE: The result of transform is the horizontal stack of the predictions assert_allclose( @@ -906,7 +904,7 @@ def test_stacking_regressor_singleoutput_but_2d(): def test_stacking_regressor_multioutput(): - """Check that a stacking regressor with multioutput works""" + """Check that a stacking regressor works with multioutput""" cv = 2 acceptable_relative_tolerance = 1e-10 acceptable_aboslute_tolerance = 1e-10 @@ -927,7 +925,6 @@ def test_stacking_regressor_multioutput(): ) reg.fit(X_train, y_train) - # predict y_pred = reg.predict(X_train) # NOTE: In this case the estimator can predict almost exactly the target assert_allclose( @@ -936,7 +933,6 @@ def test_stacking_regressor_multioutput(): rtol=acceptable_relative_tolerance, atol=acceptable_aboslute_tolerance, ) - # transform X_trans = reg.transform(X_train) # NOTE: The result of transform is the horizontal stack of the predictions assert_allclose( @@ -948,7 +944,7 @@ def test_stacking_regressor_multioutput(): def test_stacking_regressor_multioutput_with_passthrough(): - """Check that a stacking regressor with multioutput works""" + """Check that a stacking regressor works with multioutput""" cv = 2 acceptable_relative_tolerance = 1e-10 acceptable_aboslute_tolerance = 1e-10 @@ -969,7 +965,6 @@ def test_stacking_regressor_multioutput_with_passthrough(): ) reg.fit(X_train, y_train) - # predict y_pred = reg.predict(X_train) # NOTE: In this case, the estimator can predict almost exactly the target assert_allclose( @@ -978,7 +973,6 @@ def test_stacking_regressor_multioutput_with_passthrough(): rtol=acceptable_relative_tolerance, atol=acceptable_aboslute_tolerance, ) - # transform X_trans = reg.transform(X_train) # NOTE: X_trans should be the horizontal stack of the predictions and X_train assert_allclose( From bcf4e0eecd1426a05c4cb688ac74059238593ad6 Mon Sep 17 00:00:00 2001 From: hmasdev Date: Wed, 8 May 2024 21:07:06 +0900 Subject: [PATCH 6/6] update docstring of _BaseStacking.fit --- sklearn/ensemble/_stacking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 0cee6d4fe54a3..91fdcdf1f068f 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -180,7 +180,7 @@ def fit(self, X, y, sample_weight=None): Training vectors, where `n_samples` is the number of samples and `n_features` is the number of features. - y : array-like of shape (n_samples,) + y : array-like of shape (n_samples,) or (n_samples, n_outputs) Target values. sample_weight : array-like of shape (n_samples,) or default=None