From a645c4fd7ad1b37534f673eecdb5d8756c3b97bd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 5 Jul 2019 16:12:11 +0200 Subject: [PATCH 01/21] FIX normalize with max of samples with non-null weights in AdaBoostRegressor --- doc/whats_new/v0.22.rst | 4 ++ .../ensemble/tests/test_weight_boosting.py | 39 ++++++++++++------- sklearn/ensemble/weight_boosting.py | 20 +++++----- 3 files changed, 40 insertions(+), 23 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index f0670e1293369..9eff6ecf86d6c 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -60,6 +60,10 @@ Changelog parameter called `warm_start` that enables warm starting. :pr:`14012` by :user:`Johann Faouzi `. +- |Fix| :class:`ensemble.AdaBoostRegressor` where the loss should be normalized + by the max of the samples with non-null weights. + :pr:`xxx` by :user:`Guillaume Lemaitre `. + :mod:`sklearn.linear_model` ........................... diff --git a/sklearn/ensemble/tests/test_weight_boosting.py b/sklearn/ensemble/tests/test_weight_boosting.py index 1cb1e9d1431cf..c7f47be2a9903 100755 --- a/sklearn/ensemble/tests/test_weight_boosting.py +++ b/sklearn/ensemble/tests/test_weight_boosting.py @@ -1,12 +1,16 @@ """Testing for the boost module (sklearn.ensemble.boost).""" import numpy as np +import pytest from sklearn.utils.testing import assert_array_equal, assert_array_less -from sklearn.utils.testing import assert_array_almost_equal +from sklearn.utils.testing import assert_array_almost_equal, assert_allclose from sklearn.utils.testing import assert_raises, assert_raises_regexp +from sklearn.utils.testing import set_random_state from sklearn.base import BaseEstimator +from sklearn.base import clone +from sklearn.dummy import DummyClassifier, DummyRegressor from sklearn.model_selection import train_test_split from sklearn.model_selection import GridSearchCV from sklearn.ensemble import AdaBoostClassifier @@ -304,16 +308,6 @@ def test_base_estimator(): clf.fit, X_fail, y_fail) -def test_sample_weight_missing(): - from sklearn.cluster import KMeans - - clf = AdaBoostClassifier(KMeans(), algorithm="SAMME") - assert_raises(ValueError, clf.fit, X, y_regr) - - clf = AdaBoostRegressor(KMeans()) - assert_raises(ValueError, clf.fit, X, y_regr) - - def test_sparse_classification(): # Check classification with sparse input. @@ -486,9 +480,6 @@ def test_multidimensional_X(): Check that the AdaBoost estimators can work with n-dimensional data matrix """ - - from sklearn.dummy import DummyClassifier, DummyRegressor - rng = np.random.RandomState(0) X = rng.randn(50, 3, 3) @@ -503,3 +494,23 @@ def test_multidimensional_X(): boost = AdaBoostRegressor(DummyRegressor()) boost.fit(X, yr) boost.predict(X) + + +class ClassifierWithoutWeight(DummyClassifier): + """Classifier not supporting `sample_weight`.""" + def fit(self, X, y): + super().fit(X, y) + return self + + +@pytest.mark.parametrize("algorithm", ['SAMME', 'SAMME.R']) +def test_adaboostclassifier_without_sample_weight(algorithm): + X, y = iris.data, iris.target + base_estimator = ClassifierWithoutWeight() + clf = AdaBoostClassifier( + base_estimator=base_estimator, algorithm=algorithm + ) + err_msg = ("{} doesn't support sample_weight" + .format(base_estimator.__class__.__name__)) + with pytest.raises(ValueError, match=err_msg): + clf.fit(X, y) diff --git a/sklearn/ensemble/weight_boosting.py b/sklearn/ensemble/weight_boosting.py index 3cb4baa0d9a0c..09fc632421546 100644 --- a/sklearn/ensemble/weight_boosting.py +++ b/sklearn/ensemble/weight_boosting.py @@ -1039,13 +1039,10 @@ def _boost(self, iboost, X, y, sample_weight, random_state): estimator = self._make_estimator(random_state=random_state) # Weighted sampling of the training set with replacement - # For NumPy >= 1.7.0 use np.random.choice - cdf = stable_cumsum(sample_weight) - cdf /= cdf[-1] - uniform_samples = random_state.random_sample(_num_samples(X)) - bootstrap_idx = cdf.searchsorted(uniform_samples, side='right') - # searchsorted returns a scalar - bootstrap_idx = np.array(bootstrap_idx, copy=False) + bootstrap_idx = random_state.choice( + np.arange(_num_samples(X)), size=_num_samples(X), replace=True, + p=sample_weight + ) # Fit on the bootstrapped sample and obtain a prediction # for all samples in the training set @@ -1055,9 +1052,14 @@ def _boost(self, iboost, X, y, sample_weight, random_state): y_predict = estimator.predict(X) error_vect = np.abs(y_predict - y) - error_max = error_vect.max() - if error_max != 0.: + sample_mask = sample_weight > 0 + if not np.count_nonzero(sample_mask): + error_max = 0 + else: + error_max = error_vect[sample_mask].max() + + if error_max != 0: error_vect /= error_max if self.loss == 'square': From cb52f6585271a8ad4a3ef66897f7f1bec555526e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 8 Jul 2019 15:50:41 +0200 Subject: [PATCH 02/21] iter --- .../ensemble/tests/test_weight_boosting.py | 39 ++++++++++++++++--- sklearn/ensemble/weight_boosting.py | 9 +++-- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/sklearn/ensemble/tests/test_weight_boosting.py b/sklearn/ensemble/tests/test_weight_boosting.py index c7f47be2a9903..7eecf03c4a884 100755 --- a/sklearn/ensemble/tests/test_weight_boosting.py +++ b/sklearn/ensemble/tests/test_weight_boosting.py @@ -3,6 +3,12 @@ import numpy as np import pytest +from scipy.sparse import csc_matrix +from scipy.sparse import csr_matrix +from scipy.sparse import coo_matrix +from scipy.sparse import dok_matrix +from scipy.sparse import lil_matrix + from sklearn.utils.testing import assert_array_equal, assert_array_less from sklearn.utils.testing import assert_array_almost_equal, assert_allclose from sklearn.utils.testing import assert_raises, assert_raises_regexp @@ -16,11 +22,7 @@ from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble import AdaBoostRegressor from sklearn.ensemble import weight_boosting -from scipy.sparse import csc_matrix -from scipy.sparse import csr_matrix -from scipy.sparse import coo_matrix -from scipy.sparse import dok_matrix -from scipy.sparse import lil_matrix +from sklearn.linear_model import LinearRegression from sklearn.svm import SVC, SVR from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.utils import shuffle @@ -514,3 +516,30 @@ def test_adaboostclassifier_without_sample_weight(algorithm): .format(base_estimator.__class__.__name__)) with pytest.raises(ValueError, match=err_msg): clf.fit(X, y) + + +def test_adaboost_regressor_sample_weight(): + # check that giving weight will have an influence on the error computed + # for a weak learner + X, y = datasets.make_regression(n_features=50, random_state=0) + + # add an arbitrary outlier to make sure + X = np.vstack([X, X.sum(axis=0)]) + y = np.hstack([y, 10]) + + regr_no_outlier = AdaBoostRegressor( + base_estimator=LinearRegression(), n_estimators=4, random_state=0 + ) + regr_with_weight = clone(regr_no_outlier) + + # fit 2 models: + # - a model without the outlier + # - a model containing the outlier but with a null sample-weight + # Therefore, the error of the first weak learner will be identical. + regr_no_outlier.fit(X[:-1], y[:-1]) + sample_weight = np.array([1.] * (y.size - 1) + [0.]) + regr_with_weight.fit(X, y, sample_weight=sample_weight) + + # check that the error is similar with 2 decimals + assert (regr_no_outlier.estimator_errors_[0] == + pytest.approx(regr_with_weight.estimator_errors_[0], abs=1e-2)) diff --git a/sklearn/ensemble/weight_boosting.py b/sklearn/ensemble/weight_boosting.py index 09fc632421546..c1864ff6e48d6 100644 --- a/sklearn/ensemble/weight_boosting.py +++ b/sklearn/ensemble/weight_boosting.py @@ -1054,7 +1054,7 @@ def _boost(self, iboost, X, y, sample_weight, random_state): error_vect = np.abs(y_predict - y) sample_mask = sample_weight > 0 - if not np.count_nonzero(sample_mask): + if np.all(~sample_mask): error_max = 0 else: error_max = error_vect[sample_mask].max() @@ -1068,7 +1068,8 @@ def _boost(self, iboost, X, y, sample_weight, random_state): error_vect = 1. - np.exp(- error_vect) # Calculate the average loss - estimator_error = (sample_weight * error_vect).sum() + estimator_error = (sample_weight[sample_mask] * + error_vect[sample_mask]).sum() if estimator_error <= 0: # Stop if fit is perfect @@ -1086,9 +1087,9 @@ def _boost(self, iboost, X, y, sample_weight, random_state): estimator_weight = self.learning_rate * np.log(1. / beta) if not iboost == self.n_estimators - 1: - sample_weight *= np.power( + sample_weight[sample_mask] *= np.power( beta, - (1. - error_vect) * self.learning_rate) + (1. - error_vect[sample_mask]) * self.learning_rate) return sample_weight, estimator_weight, estimator_error From 805a6b51c716a067515931b8d2fefa3b65b5afc1 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 8 Jul 2019 15:54:13 +0200 Subject: [PATCH 03/21] update PR number --- doc/whats_new/v0.22.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 0345b65a260ba..3a14d275ea0b6 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -67,7 +67,7 @@ Changelog - |Fix| :class:`ensemble.AdaBoostRegressor` where the loss should be normalized by the max of the samples with non-null weights. - :pr:`xxx` by :user:`Guillaume Lemaitre `. + :pr:`14294` by :user:`Guillaume Lemaitre `. :mod:`sklearn.linear_model` ........................... From b0f2966d405bf855a1094675a22c1d3ca9d0d1ef Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 8 Jul 2019 15:54:43 +0200 Subject: [PATCH 04/21] PEP8 --- sklearn/ensemble/tests/test_weight_boosting.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/ensemble/tests/test_weight_boosting.py b/sklearn/ensemble/tests/test_weight_boosting.py index 7eecf03c4a884..833ac0091837c 100755 --- a/sklearn/ensemble/tests/test_weight_boosting.py +++ b/sklearn/ensemble/tests/test_weight_boosting.py @@ -10,9 +10,8 @@ from scipy.sparse import lil_matrix from sklearn.utils.testing import assert_array_equal, assert_array_less -from sklearn.utils.testing import assert_array_almost_equal, assert_allclose +from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_raises, assert_raises_regexp -from sklearn.utils.testing import set_random_state from sklearn.base import BaseEstimator from sklearn.base import clone From 0f317d988f06881697cd1ebd7682727d30115bcc Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 8 Jul 2019 16:15:39 +0200 Subject: [PATCH 05/21] iter --- sklearn/ensemble/tests/test_weight_boosting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/tests/test_weight_boosting.py b/sklearn/ensemble/tests/test_weight_boosting.py index 833ac0091837c..46992c9d19e07 100755 --- a/sklearn/ensemble/tests/test_weight_boosting.py +++ b/sklearn/ensemble/tests/test_weight_boosting.py @@ -539,6 +539,6 @@ def test_adaboost_regressor_sample_weight(): sample_weight = np.array([1.] * (y.size - 1) + [0.]) regr_with_weight.fit(X, y, sample_weight=sample_weight) - # check that the error is similar with 2 decimals + # check that the error is similar with 1 decimal assert (regr_no_outlier.estimator_errors_[0] == - pytest.approx(regr_with_weight.estimator_errors_[0], abs=1e-2)) + pytest.approx(regr_with_weight.estimator_errors_[0], abs=1e-1)) From 6ee41fdeb10372cde6a00e7b07d1a6725e31fcb6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 10 Jul 2019 17:04:20 +0200 Subject: [PATCH 06/21] iter --- sklearn/ensemble/weight_boosting.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sklearn/ensemble/weight_boosting.py b/sklearn/ensemble/weight_boosting.py index c1864ff6e48d6..63faec2f6bf10 100644 --- a/sklearn/ensemble/weight_boosting.py +++ b/sklearn/ensemble/weight_boosting.py @@ -1052,12 +1052,8 @@ def _boost(self, iboost, X, y, sample_weight, random_state): y_predict = estimator.predict(X) error_vect = np.abs(y_predict - y) - sample_mask = sample_weight > 0 - if np.all(~sample_mask): - error_max = 0 - else: - error_max = error_vect[sample_mask].max() + error_max = error_vect[sample_mask].max() if error_max != 0: error_vect /= error_max From ea46a7cac24c608ffe9b1a5954733da7f72e1591 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 10 Sep 2019 09:36:12 +0200 Subject: [PATCH 07/21] ignore line coverage --- sklearn/ensemble/tests/test_weight_boosting.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/ensemble/tests/test_weight_boosting.py b/sklearn/ensemble/tests/test_weight_boosting.py index bf5ac61d38614..182983c592456 100755 --- a/sklearn/ensemble/tests/test_weight_boosting.py +++ b/sklearn/ensemble/tests/test_weight_boosting.py @@ -499,8 +499,7 @@ def test_multidimensional_X(): class ClassifierWithoutWeight(DummyClassifier): """Classifier not supporting `sample_weight`.""" def fit(self, X, y): - super().fit(X, y) - return self + return super().fit(X, y) # pragma: no cover @pytest.mark.parametrize("algorithm", ['SAMME', 'SAMME.R']) From 6902effced55f33e4527ab66ed880b745c8a3f84 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 10 Sep 2019 09:44:17 +0200 Subject: [PATCH 08/21] FIX use _check_sample_weight to validate sample_weight --- sklearn/ensemble/weight_boosting.py | 20 ++++++-------------- sklearn/utils/validation.py | 4 ++-- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/sklearn/ensemble/weight_boosting.py b/sklearn/ensemble/weight_boosting.py index eece8d0117c9c..10207297917a1 100644 --- a/sklearn/ensemble/weight_boosting.py +++ b/sklearn/ensemble/weight_boosting.py @@ -38,6 +38,7 @@ from ..utils.extmath import stable_cumsum from ..metrics import accuracy_score, r2_score from ..utils.validation import check_is_fitted +from ..utils.validation import _check_sample_weight from ..utils.validation import has_fit_parameter from ..utils.validation import _num_samples @@ -117,20 +118,11 @@ def fit(self, X, y, sample_weight=None): X, y = self._validate_data(X, y) - if sample_weight is None: - # Initialize weights to 1 / n_samples - sample_weight = np.empty(_num_samples(X), dtype=np.float64) - sample_weight[:] = 1. / _num_samples(X) - else: - sample_weight = check_array(sample_weight, ensure_2d=False) - # Normalize existing weights - sample_weight = sample_weight / sample_weight.sum(dtype=np.float64) - - # Check that the sample weights sum is positive - if sample_weight.sum() <= 0: - raise ValueError( - "Attempting to fit with a non-positive " - "weighted number of samples.") + sample_weight = _check_sample_weight(sample_weight, X, np.float64) + sample_weight /= sample_weight.sum() + if sample_weight.sum() <= 0: + raise ValueError("Attempting to fit with a non-positive weighted " + "number of samples.") # Check parameters self._validate_estimator() diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 465acf48e8293..d158cd4f60ad0 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1043,8 +1043,8 @@ def _check_sample_weight(sample_weight, X, dtype=None): if dtype is None: dtype = [np.float64, np.float32] sample_weight = check_array( - sample_weight, accept_sparse=False, - ensure_2d=False, dtype=dtype, order="C" + sample_weight, accept_sparse=False, ensure_2d=False, dtype=dtype, + order="C" ) if sample_weight.ndim != 1: raise ValueError("Sample weights must be 1D array or scalar") From a5dec847f385153e9d86b5d341536a5dbf8bfc69 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 12 Sep 2019 11:24:40 +0200 Subject: [PATCH 09/21] address jeremie comments --- doc/whats_new/v0.22.rst | 12 ++++-- .../ensemble/tests/test_gradient_boosting.py | 15 +------ .../ensemble/tests/test_weight_boosting.py | 43 +++++++++++-------- sklearn/ensemble/weight_boosting.py | 2 +- sklearn/utils/mocking.py | 15 +++++++ 5 files changed, 49 insertions(+), 38 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index fa5f8ffaaade4..d9c351e09c774 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -176,6 +176,14 @@ Changelog `predict_proba` give consistent results. :pr:`14114` by :user:`Guillaume Lemaitre `. +- |Fix| :class:`ensemble.AdaBoostRegressor` where the loss should be normalized + by the max of the samples with non-null weights. + :pr:`14294` by :user:`Guillaume Lemaitre `. + +- |Fix| Add test for :class:`ensemble.AdaBoostClassifier` to raise consistent + error message when base estimator does not support `sample_weight`. + :pr:`14294` by :user:`Guillaume Lemaitre `. + :mod:`sklearn.feature_extraction` ................................. @@ -244,10 +252,6 @@ Changelog `KeyError` when using `kernel="precomputed"`. :pr:`14706` by :user:`Venkatachalam N `. -- |Fix| :class:`ensemble.AdaBoostRegressor` where the loss should be normalized - by the max of the samples with non-null weights. - :pr:`14294` by :user:`Guillaume Lemaitre `. - :mod:`sklearn.linear_model` ........................... diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 17e09f7f07156..d8f58f7e1ebb8 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -24,6 +24,7 @@ from sklearn.metrics import mean_squared_error from sklearn.model_selection import train_test_split from sklearn.utils import check_random_state, tosequence +from sklearn.utils.mocking import _NoSampleWeightWrapper from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_array_equal @@ -1331,20 +1332,6 @@ def test_early_stopping_stratified(): gbc.fit(X, y) -class _NoSampleWeightWrapper(BaseEstimator): - def __init__(self, est): - self.est = est - - def fit(self, X, y): - self.est.fit(X, y) - - def predict(self, X): - return self.est.predict(X) - - def predict_proba(self, X): - return self.est.predict_proba(X) - - def _make_multiclass(): return make_classification(n_classes=3, n_clusters_per_class=1) diff --git a/sklearn/ensemble/tests/test_weight_boosting.py b/sklearn/ensemble/tests/test_weight_boosting.py index 182983c592456..87e17fe9a126c 100755 --- a/sklearn/ensemble/tests/test_weight_boosting.py +++ b/sklearn/ensemble/tests/test_weight_boosting.py @@ -25,6 +25,7 @@ from sklearn.svm import SVC, SVR from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.utils import shuffle +from sklearn.utils.mocking import _NoSampleWeightWrapper from sklearn import datasets @@ -496,16 +497,10 @@ def test_multidimensional_X(): boost.predict(X) -class ClassifierWithoutWeight(DummyClassifier): - """Classifier not supporting `sample_weight`.""" - def fit(self, X, y): - return super().fit(X, y) # pragma: no cover - - @pytest.mark.parametrize("algorithm", ['SAMME', 'SAMME.R']) def test_adaboostclassifier_without_sample_weight(algorithm): X, y = iris.data, iris.target - base_estimator = ClassifierWithoutWeight() + base_estimator = _NoSampleWeightWrapper(DummyClassifier()) clf = AdaBoostClassifier( base_estimator=base_estimator, algorithm=algorithm ) @@ -515,32 +510,42 @@ def test_adaboostclassifier_without_sample_weight(algorithm): clf.fit(X, y) -def test_adaboost_regressor_sample_weight(): +def test_adaboostregressor_sample_weight(): # check that giving weight will have an influence on the error computed # for a weak learner - X, y = datasets.make_regression(n_features=50, random_state=0) + rng = np.random.RandomState(42) + X = np.linspace(0, 100, num=10000) + y = (.8 * X + 0.2) + (rng.rand(X.shape[0]) * 0.0001) + X = X.reshape(-1, 1) - # add an arbitrary outlier to make sure - X = np.vstack([X, X.sum(axis=0)]) - y = np.hstack([y, 10]) + # add an arbitrary outlier + X[-1] *= 10 + y[-1] = 10000 + # random_state=0 ensure that the underlying boostrap will use the outlier regr_no_outlier = AdaBoostRegressor( - base_estimator=LinearRegression(), n_estimators=4, random_state=0 + base_estimator=LinearRegression(), n_estimators=1, random_state=0 ) regr_with_weight = clone(regr_no_outlier) + regr_with_outlier = clone(regr_no_outlier) - # fit 2 models: + # fit 3 models: + # - a model containing the outlier # - a model without the outlier # - a model containing the outlier but with a null sample-weight - # Therefore, the error of the first weak learner will be identical. + regr_with_outlier.fit(X, y) regr_no_outlier.fit(X[:-1], y[:-1]) - sample_weight = np.array([1.] * (y.size - 1) + [0.]) + sample_weight = np.ones_like(y) + sample_weight[-1] = 0 regr_with_weight.fit(X, y, sample_weight=sample_weight) - # check that the error is similar with 1 decimal - assert (regr_no_outlier.estimator_errors_[0] == - pytest.approx(regr_with_weight.estimator_errors_[0], abs=1e-1)) + score_with_outlier = regr_with_outlier.score(X[:-1], y[:-1]) + score_no_outlier = regr_no_outlier.score(X[:-1], y[:-1]) + score_with_weight = regr_with_weight.score(X[:-1], y[:-1]) + assert score_with_outlier < score_no_outlier + assert score_with_outlier < score_with_weight + assert score_no_outlier == pytest.approx(score_with_weight) @pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"]) def test_adaboost_consistent_predict(algorithm): diff --git a/sklearn/ensemble/weight_boosting.py b/sklearn/ensemble/weight_boosting.py index 10207297917a1..820e5e6a5b984 100644 --- a/sklearn/ensemble/weight_boosting.py +++ b/sklearn/ensemble/weight_boosting.py @@ -928,7 +928,7 @@ class AdaBoostRegressor(RegressorMixin, BaseWeightBoosting): .. [1] Y. Freund, R. Schapire, "A Decision-Theoretic Generalization of on-Line Learning and an Application to Boosting", 1995. - .. [2] H. Drucker, "Improving Regressors using Boosting Techniques", 1997. + .. [2] H. Drucker, "`Improving Regressors using Boosting Techniques", 1997. """ def __init__(self, diff --git a/sklearn/utils/mocking.py b/sklearn/utils/mocking.py index 45ac89f992a78..a46b8ddca8430 100644 --- a/sklearn/utils/mocking.py +++ b/sklearn/utils/mocking.py @@ -135,3 +135,18 @@ def score(self, X=None, Y=None): def _more_tags(self): return {'_skip_test': True, 'X_types': ['1dlabel']} + + +class _NoSampleWeightWrapper(BaseEstimator): + """Wrap estimator which will not expose `sample_weight`.""" + def __init__(self, est): + self.est = est + + def fit(self, X, y): + return self.est.fit(X, y) + + def predict(self, X): + return self.est.predict(X) + + def predict_proba(self, X): + return self.est.predict_proba(X) From 810c637bcd47a3a62eb4668d0225ba27119ff853 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 12 Sep 2019 11:52:25 +0200 Subject: [PATCH 10/21] fix inplace/mask copy operation --- sklearn/ensemble/weight_boosting.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sklearn/ensemble/weight_boosting.py b/sklearn/ensemble/weight_boosting.py index 820e5e6a5b984..ad4c2b5b4e279 100644 --- a/sklearn/ensemble/weight_boosting.py +++ b/sklearn/ensemble/weight_boosting.py @@ -1035,19 +1035,20 @@ def _boost(self, iboost, X, y, sample_weight, random_state): error_vect = np.abs(y_predict - y) sample_mask = sample_weight > 0 - error_max = error_vect[sample_mask].max() + masked_sample_weight = sample_weight[sample_mask] + masked_error_vector = error_vect[sample_mask] + error_max = masked_error_vector.max() if error_max != 0: - error_vect /= error_max + masked_error_vector /= error_max if self.loss == 'square': - error_vect **= 2 + masked_error_vector **= 2 elif self.loss == 'exponential': - error_vect = 1. - np.exp(- error_vect) + masked_error_vector = 1. - np.exp(-masked_error_vector) # Calculate the average loss - estimator_error = (sample_weight[sample_mask] * - error_vect[sample_mask]).sum() + estimator_error = (masked_sample_weight * masked_error_vector).sum() if estimator_error <= 0: # Stop if fit is perfect @@ -1066,8 +1067,8 @@ def _boost(self, iboost, X, y, sample_weight, random_state): if not iboost == self.n_estimators - 1: sample_weight[sample_mask] *= np.power( - beta, - (1. - error_vect[sample_mask]) * self.learning_rate) + beta, (1. - masked_error_vector) * self.learning_rate + ) return sample_weight, estimator_weight, estimator_error From fd50b0807514492fb5bb5986035b0047d41efa5f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 12 Sep 2019 13:30:42 +0200 Subject: [PATCH 11/21] add default value for wrapper --- sklearn/utils/mocking.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/mocking.py b/sklearn/utils/mocking.py index a46b8ddca8430..e98ac51347b00 100644 --- a/sklearn/utils/mocking.py +++ b/sklearn/utils/mocking.py @@ -1,6 +1,7 @@ import numpy as np from ..base import BaseEstimator, ClassifierMixin +from ..dummy import DummyClassifier from .validation import _num_samples, check_array @@ -138,8 +139,14 @@ def _more_tags(self): class _NoSampleWeightWrapper(BaseEstimator): - """Wrap estimator which will not expose `sample_weight`.""" - def __init__(self, est): + """Wrap estimator which will not expose `sample_weight`. + + Parameters + ---------- + est : estimator, default=None + The estimator to wrap. + """ + def __init__(self, est=None): self.est = est def fit(self, X, y): @@ -150,3 +157,6 @@ def predict(self, X): def predict_proba(self, X): return self.est.predict_proba(X) + + def _more_tags(self): + return {'_skip_test': True} From da5a2a0c861f174b485760e5978bf552371cc984 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 12 Sep 2019 15:08:18 +0200 Subject: [PATCH 12/21] PEP8 --- sklearn/utils/mocking.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/utils/mocking.py b/sklearn/utils/mocking.py index e98ac51347b00..c89beb6083bfc 100644 --- a/sklearn/utils/mocking.py +++ b/sklearn/utils/mocking.py @@ -1,7 +1,6 @@ import numpy as np from ..base import BaseEstimator, ClassifierMixin -from ..dummy import DummyClassifier from .validation import _num_samples, check_array From 1da7cc790826fb7b9b2a7cf6c2ea5d908398f67d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 12 Sep 2019 17:36:56 +0200 Subject: [PATCH 13/21] Apply suggestions from code review Co-Authored-By: jeremiedbb <34657725+jeremiedbb@users.noreply.github.com> --- doc/whats_new/v0.22.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index e819df7b18aa4..bc2e0c6bcfe55 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -204,7 +204,7 @@ Changelog :pr:`14907` by `Adrin Jalali`_. - |Fix| :class:`ensemble.AdaBoostRegressor` where the loss should be normalized - by the max of the samples with non-null weights. + by the max of the samples with non-null weights only. :pr:`14294` by :user:`Guillaume Lemaitre `. - |Fix| Add test for :class:`ensemble.AdaBoostClassifier` to raise consistent From ff2d23d81aff13f0a405143a87ca32315641b411 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 12 Sep 2019 17:38:28 +0200 Subject: [PATCH 14/21] apply jeremie comments --- doc/whats_new/v0.22.rst | 4 ---- sklearn/ensemble/weight_boosting.py | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index bc2e0c6bcfe55..cf9d90d78cb10 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -207,10 +207,6 @@ Changelog by the max of the samples with non-null weights only. :pr:`14294` by :user:`Guillaume Lemaitre `. -- |Fix| Add test for :class:`ensemble.AdaBoostClassifier` to raise consistent - error message when base estimator does not support `sample_weight`. - :pr:`14294` by :user:`Guillaume Lemaitre `. - :mod:`sklearn.feature_extraction` ................................. diff --git a/sklearn/ensemble/weight_boosting.py b/sklearn/ensemble/weight_boosting.py index ad4c2b5b4e279..b7a383d35059f 100644 --- a/sklearn/ensemble/weight_boosting.py +++ b/sklearn/ensemble/weight_boosting.py @@ -928,7 +928,7 @@ class AdaBoostRegressor(RegressorMixin, BaseWeightBoosting): .. [1] Y. Freund, R. Schapire, "A Decision-Theoretic Generalization of on-Line Learning and an Application to Boosting", 1995. - .. [2] H. Drucker, "`Improving Regressors using Boosting Techniques", 1997. + .. [2] H. Drucker, "Improving Regressors using Boosting Techniques", 1997. """ def __init__(self, From 335b62aa84baeff96298b9fc236eb8936cc5d0d5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 13 Sep 2019 10:08:02 +0200 Subject: [PATCH 15/21] increase coverage --- sklearn/ensemble/tests/test_weight_boosting.py | 18 ++++++++++++++++-- sklearn/ensemble/weight_boosting.py | 5 ++--- sklearn/utils/mocking.py | 2 +- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/sklearn/ensemble/tests/test_weight_boosting.py b/sklearn/ensemble/tests/test_weight_boosting.py index 87e17fe9a126c..b89849538ff8b 100755 --- a/sklearn/ensemble/tests/test_weight_boosting.py +++ b/sklearn/ensemble/tests/test_weight_boosting.py @@ -142,9 +142,10 @@ def test_iris(): np.abs(clf_samme.predict_proba(iris.data) - prob_samme)) -def test_boston(): +@pytest.mark.parametrize('loss', ['linear', 'square', 'exponential']) +def test_boston(loss): # Check consistency on dataset boston house prices. - reg = AdaBoostRegressor(random_state=0) + reg = AdaBoostRegressor(loss=loss, random_state=0) reg.fit(boston.data, boston.target) score = reg.score(boston.data, boston.target) assert score > 0.85 @@ -562,3 +563,16 @@ def test_adaboost_consistent_predict(algorithm): np.argmax(model.predict_proba(X_test), axis=1), model.predict(X_test) ) + +@pytest.mark.parametrize( + 'model, X, y', + [(AdaBoostClassifier(), *datasets.load_iris(return_X_y=True)), + (AdaBoostRegressor(), *datasets.load_boston(return_X_y=True))] +) +def test_adaboost_negative_weight_error(model, X, y): + sample_weight = np.ones_like(y) + sample_weight[-1] = -10 + + err_msg = "sample_weight cannot contain negative weight" + with pytest.raises(ValueError, match=err_msg): + model.fit(X, y, sample_weight=sample_weight) diff --git a/sklearn/ensemble/weight_boosting.py b/sklearn/ensemble/weight_boosting.py index b7a383d35059f..6892a39348c78 100644 --- a/sklearn/ensemble/weight_boosting.py +++ b/sklearn/ensemble/weight_boosting.py @@ -120,9 +120,8 @@ def fit(self, X, y, sample_weight=None): sample_weight = _check_sample_weight(sample_weight, X, np.float64) sample_weight /= sample_weight.sum() - if sample_weight.sum() <= 0: - raise ValueError("Attempting to fit with a non-positive weighted " - "number of samples.") + if np.any(sample_weight < 0): + raise ValueError("sample_weight cannot contain negative weights") # Check parameters self._validate_estimator() diff --git a/sklearn/utils/mocking.py b/sklearn/utils/mocking.py index c89beb6083bfc..8a00466a3224d 100644 --- a/sklearn/utils/mocking.py +++ b/sklearn/utils/mocking.py @@ -158,4 +158,4 @@ def predict_proba(self, X): return self.est.predict_proba(X) def _more_tags(self): - return {'_skip_test': True} + return {'_skip_test': True} # pragma: no cover From 68084b5f4f574ef42540c25cc4e63670ae50977f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 13 Sep 2019 10:18:20 +0200 Subject: [PATCH 16/21] PEP8 --- sklearn/ensemble/tests/test_weight_boosting.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/ensemble/tests/test_weight_boosting.py b/sklearn/ensemble/tests/test_weight_boosting.py index b89849538ff8b..96b9c97a63458 100755 --- a/sklearn/ensemble/tests/test_weight_boosting.py +++ b/sklearn/ensemble/tests/test_weight_boosting.py @@ -564,6 +564,7 @@ def test_adaboost_consistent_predict(algorithm): model.predict(X_test) ) + @pytest.mark.parametrize( 'model, X, y', [(AdaBoostClassifier(), *datasets.load_iris(return_X_y=True)), From 2f8054e0b065c2a1e4f473137817930b3ba241dc Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 3 Oct 2019 10:35:24 +0200 Subject: [PATCH 17/21] address adrin comments --- sklearn/ensemble/tests/test_weight_boosting.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/tests/test_weight_boosting.py b/sklearn/ensemble/tests/test_weight_boosting.py index 96b9c97a63458..4b61be7972949 100755 --- a/sklearn/ensemble/tests/test_weight_boosting.py +++ b/sklearn/ensemble/tests/test_weight_boosting.py @@ -515,7 +515,7 @@ def test_adaboostregressor_sample_weight(): # check that giving weight will have an influence on the error computed # for a weak learner rng = np.random.RandomState(42) - X = np.linspace(0, 100, num=10000) + X = np.linspace(0, 100, num=1000) y = (.8 * X + 0.2) + (rng.rand(X.shape[0]) * 0.0001) X = X.reshape(-1, 1) @@ -567,8 +567,8 @@ def test_adaboost_consistent_predict(algorithm): @pytest.mark.parametrize( 'model, X, y', - [(AdaBoostClassifier(), *datasets.load_iris(return_X_y=True)), - (AdaBoostRegressor(), *datasets.load_boston(return_X_y=True))] + [(AdaBoostClassifier(), iris.data, iris.target), + (AdaBoostRegressor(), boston.data, boston.target)] ) def test_adaboost_negative_weight_error(model, X, y): sample_weight = np.ones_like(y) From cbcd68152caa9d908233f1c13b48750cb6e56d52 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 3 Oct 2019 10:40:21 +0200 Subject: [PATCH 18/21] change import for mocking --- sklearn/ensemble/tests/test_weight_boosting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/tests/test_weight_boosting.py b/sklearn/ensemble/tests/test_weight_boosting.py index 4b61be7972949..440f908ba7114 100755 --- a/sklearn/ensemble/tests/test_weight_boosting.py +++ b/sklearn/ensemble/tests/test_weight_boosting.py @@ -25,7 +25,7 @@ from sklearn.svm import SVC, SVR from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.utils import shuffle -from sklearn.utils.mocking import _NoSampleWeightWrapper +from sklearn.utils._mocking import _NoSampleWeightWrapper from sklearn import datasets From 362ac2abb7fdb1873307f14f13d5053e3012736b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 3 Oct 2019 12:14:23 +0200 Subject: [PATCH 19/21] fix --- sklearn/ensemble/tests/test_gradient_boosting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index b8164dd3d4a75..8398fa8471750 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -24,7 +24,7 @@ from sklearn.metrics import mean_squared_error from sklearn.model_selection import train_test_split from sklearn.utils import check_random_state, tosequence -from sklearn.utils.mocking import _NoSampleWeightWrapper +from sklearn.utils._mocking import _NoSampleWeightWrapper from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_array_equal From 31a62766580e58f3d2a593bda8dbc662cf796033 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 22 Oct 2019 14:24:55 +0200 Subject: [PATCH 20/21] PEP8 --- sklearn/ensemble/tests/test_weight_boosting.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/ensemble/tests/test_weight_boosting.py b/sklearn/ensemble/tests/test_weight_boosting.py index 3b806f55fa868..2b82d985bc1d5 100755 --- a/sklearn/ensemble/tests/test_weight_boosting.py +++ b/sklearn/ensemble/tests/test_weight_boosting.py @@ -21,7 +21,6 @@ from sklearn.model_selection import GridSearchCV from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble import AdaBoostRegressor -from sklearn.ensemble import weight_boosting from sklearn.ensemble._weight_boosting import _samme_proba from scipy.sparse import csc_matrix from scipy.sparse import csr_matrix From d494b3ffad4ad2a5333907e0541fe0cf0b84bf2b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 23 Oct 2019 16:22:47 +0200 Subject: [PATCH 21/21] address comment adrin --- sklearn/ensemble/tests/test_gradient_boosting.py | 4 ++-- sklearn/ensemble/tests/test_weight_boosting.py | 4 ++-- sklearn/utils/_mocking.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 8398fa8471750..f19c2cc09ce5e 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -24,7 +24,7 @@ from sklearn.metrics import mean_squared_error from sklearn.model_selection import train_test_split from sklearn.utils import check_random_state, tosequence -from sklearn.utils._mocking import _NoSampleWeightWrapper +from sklearn.utils._mocking import NoSampleWeightWrapper from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_array_equal @@ -1317,7 +1317,7 @@ def test_gradient_boosting_with_init(gb, dataset_maker, init_estimator): gb(init=init_est).fit(X, y, sample_weight=sample_weight) # init does not support sample weights - init_est = _NoSampleWeightWrapper(init_estimator()) + init_est = NoSampleWeightWrapper(init_estimator()) gb(init=init_est).fit(X, y) # ok no sample weights with pytest.raises(ValueError, match="estimator.*does not support sample weights"): diff --git a/sklearn/ensemble/tests/test_weight_boosting.py b/sklearn/ensemble/tests/test_weight_boosting.py index 440f908ba7114..ada5cab0d2aeb 100755 --- a/sklearn/ensemble/tests/test_weight_boosting.py +++ b/sklearn/ensemble/tests/test_weight_boosting.py @@ -25,7 +25,7 @@ from sklearn.svm import SVC, SVR from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.utils import shuffle -from sklearn.utils._mocking import _NoSampleWeightWrapper +from sklearn.utils._mocking import NoSampleWeightWrapper from sklearn import datasets @@ -501,7 +501,7 @@ def test_multidimensional_X(): @pytest.mark.parametrize("algorithm", ['SAMME', 'SAMME.R']) def test_adaboostclassifier_without_sample_weight(algorithm): X, y = iris.data, iris.target - base_estimator = _NoSampleWeightWrapper(DummyClassifier()) + base_estimator = NoSampleWeightWrapper(DummyClassifier()) clf = AdaBoostClassifier( base_estimator=base_estimator, algorithm=algorithm ) diff --git a/sklearn/utils/_mocking.py b/sklearn/utils/_mocking.py index fb6b410720622..3edcf8da53a95 100644 --- a/sklearn/utils/_mocking.py +++ b/sklearn/utils/_mocking.py @@ -137,7 +137,7 @@ def _more_tags(self): return {'_skip_test': True, 'X_types': ['1dlabel']} -class _NoSampleWeightWrapper(BaseEstimator): +class NoSampleWeightWrapper(BaseEstimator): """Wrap estimator which will not expose `sample_weight`. Parameters