From a866d58589e57c2b94f6bbcec044af418b28444b Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 17 Dec 2021 16:22:53 -0500 Subject: [PATCH 1/6] ENH Adds class_weights to HistGradientBoostingClassifier --- doc/whats_new/v1.1.rst | 3 + .../gradient_boosting.py | 36 +++++++++++- .../tests/test_gradient_boosting.py | 55 +++++++++++++++++++ 3 files changed, 93 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 394b3bd6652e2..a5adbb2f09fb3 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -189,6 +189,9 @@ Changelog :class:`ensemble.ExtraTreesClassifier`. :pr:`20803` by :user:`Brian Sun `. +- |API| Adds `class_weight` to :class:`ensemble.HistGradientBoostingClassifier`. + :pr:`xxxxx` by `Thomas Fan`_. + :mod:`sklearn.feature_extraction.text` ...................................... diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 097ceeeadc588..03669bc743052 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -8,7 +8,7 @@ import numpy as np from timeit import default_timer as time from ...base import BaseEstimator, RegressorMixin, ClassifierMixin, is_classifier -from ...utils import check_random_state, resample +from ...utils import check_random_state, resample, compute_sample_weight from ...utils.validation import ( check_is_fitted, check_consistent_length, @@ -118,6 +118,14 @@ def _validate_parameters(self): "monotonic constraints are not supported for multiclass classification." ) + def _finalize_sample_weight(self, sample_weight, y): + """Finalize sample weight. + + Used by subclasses to adjust sample_weights. This is useful for implementing + class weights. + """ + return sample_weight + def _check_categories(self, X): """Check and validate categorical features in X @@ -240,6 +248,8 @@ def fit(self, X, y, sample_weight=None): # TODO: remove when PDP supports sample weights self._fitted_with_sw = True + sample_weight = self._finalize_sample_weight(sample_weight, y) + rng = check_random_state(self.random_state) # When warm starting, we want to re-use the same seed that was used @@ -1446,6 +1456,16 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting): is enabled. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. + class_weight : dict or 'balanced', default=None + Weights associated with classes in the form `{class_label: weight}`. + If not given, all classes are supposed to have weight one. + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as `n_samples / (n_classes * np.bincount(y))`. + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if `sample_weight is` specified. + + .. versionadded:: 1.1 Attributes ---------- @@ -1531,6 +1551,7 @@ def __init__( tol=1e-7, verbose=0, random_state=None, + class_weight=None, ): super(HistGradientBoostingClassifier, self).__init__( loss=loss, @@ -1552,6 +1573,19 @@ def __init__( verbose=verbose, random_state=random_state, ) + self.class_weight = class_weight + + def _finalize_sample_weight(self, sample_weight, y): + """Adjust sample_weights with class_weights.""" + if self.class_weight is None: + return sample_weight + + expanded_class_weight = compute_sample_weight(self.class_weight, y) + + if sample_weight is not None: + return sample_weight * expanded_class_weight + else: + return expanded_class_weight def predict(self, X): """Predict classes for X. diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 79581525b50bb..c991c9239cdd0 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -681,6 +681,61 @@ def test_sample_weight_effect(problem, duplication): assert np.allclose(est_sw._raw_predict(X_dup), est_dup._raw_predict(X_dup)) +def test_class_weights(): + """High level test to check class_weights""" + n_samples = 255 + n_features = 2 + + X, y = make_classification( + n_samples=n_samples, + n_features=n_features, + n_informative=n_features, + n_redundant=0, + n_clusters_per_class=1, + n_classes=2, + random_state=0, + ) + y_is_1 = y == 1 + + # class_weight is the same as sample weights with the corresponding class + clf = HistGradientBoostingClassifier( + min_samples_leaf=2, random_state=0, max_depth=2 + ) + sample_weight = np.ones(shape=(n_samples)) + sample_weight[y_is_1] = 3.0 + clf.fit(X, y, sample_weight=sample_weight) + + class_weight = {0: 1.0, 1: 3.0} + clf_class_weighted = clone(clf).set_params(class_weight=class_weight) + clf_class_weighted.fit(X, y) + + assert_allclose(clf.decision_function(X), clf_class_weighted.decision_function(X)) + + # Check that sample_weight and class_weight are multiplicative + clf.fit(X, y, sample_weight=sample_weight ** 2) + clf_class_weighted.fit(X, y, sample_weight=sample_weight) + assert_allclose(clf.decision_function(X), clf_class_weighted.decision_function(X)) + + # Make imbalanced dataset + X_imb = np.concatenate((X[~y_is_1], X[y_is_1][:10])) + y_imb = np.concatenate((y[~y_is_1], y[y_is_1][:10])) + + # class_weight="balanced" is the same as sample_weights to be + # inversely proportional to n_samples / (n_classes * np.bincount(y)) + clf_balanced = clone(clf).set_params(class_weight="balanced") + clf_balanced.fit(X_imb, y_imb) + + class_weight = y_imb.shape[0] / (2 * np.bincount(y_imb)) + sample_weight = class_weight[y_imb] + clf_sample_weight = clone(clf).set_params(class_weight=None) + clf_sample_weight.fit(X_imb, y_imb, sample_weight=sample_weight) + + assert_allclose( + clf_balanced.decision_function(X_imb), + clf_sample_weight.decision_function(X_imb), + ) + + @pytest.mark.parametrize("loss_name", ("squared_error", "absolute_error")) def test_sum_hessians_are_sample_weight(loss_name): # For losses with constant hessians, the sum_hessians field of the From cfef17d61fc43bb5cbf656a6f4c6a31034200d55 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 17 Dec 2021 16:25:09 -0500 Subject: [PATCH 2/6] DOC Adds whats new PR number --- doc/whats_new/v1.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index a5adbb2f09fb3..b89e2a1a23e39 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -190,7 +190,7 @@ Changelog :pr:`20803` by :user:`Brian Sun `. - |API| Adds `class_weight` to :class:`ensemble.HistGradientBoostingClassifier`. - :pr:`xxxxx` by `Thomas Fan`_. + :pr:`22014` by `Thomas Fan`_. :mod:`sklearn.feature_extraction.text` ...................................... From 550857920d12c4915925a86f758a8d1281329294 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 16 Sep 2022 11:22:27 +0200 Subject: [PATCH 3/6] update for 1.2 --- .../ensemble/_hist_gradient_boosting/gradient_boosting.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 15bbf6d916f4f..dcb40bc102b8f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -1547,9 +1547,9 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting): weights inversely proportional to class frequencies in the input data as `n_samples / (n_classes * np.bincount(y))`. Note that these weights will be multiplied with sample_weight (passed - through the fit method) if `sample_weight is` specified. + through the fit method) if `sample_weight` is specified. - .. versionadded:: 1.1 + .. versionadded:: 1.2 Attributes ---------- @@ -1631,6 +1631,7 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting): ), BaseLoss, ], + "class_weight": [dict, StrOptions({"balanced"}), None], } def __init__( From 27e9b81f6f0830a7161e4eaa2cfc8ec899457f7e Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 16 Sep 2022 11:24:56 +0200 Subject: [PATCH 4/6] cln --- doc/whats_new/v1.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 952d2867360a3..c0819b15848b2 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -659,7 +659,7 @@ Changelog equivalent to `criterion="entropy"`. :pr:`23047` by :user:`Christian Lorentzen `. -- |Enhancement| Adds :term:`get_feature_names_out` to +- |Enhancement| Adds :meth:`get_feature_names_out` to :class:`ensemble.VotingClassifier`, :class:`ensemble.VotingRegressor`, :class:`ensemble.StackingClassifier`, and :class:`ensemble.StackingRegressor`. :pr:`22695` and :pr:`22697` by `Thomas Fan`_. From 0962c8cd4404245fa894087574bea93453dc3c52 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 16 Sep 2022 11:25:59 +0200 Subject: [PATCH 5/6] oops --- doc/whats_new/v1.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index c0819b15848b2..952d2867360a3 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -659,7 +659,7 @@ Changelog equivalent to `criterion="entropy"`. :pr:`23047` by :user:`Christian Lorentzen `. -- |Enhancement| Adds :meth:`get_feature_names_out` to +- |Enhancement| Adds :term:`get_feature_names_out` to :class:`ensemble.VotingClassifier`, :class:`ensemble.VotingRegressor`, :class:`ensemble.StackingClassifier`, and :class:`ensemble.StackingRegressor`. :pr:`22695` and :pr:`22697` by `Thomas Fan`_. From bb48881d2bf06a08e98b82b526eecd30e64b4f8c Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 16 Sep 2022 11:31:18 +0200 Subject: [PATCH 6/6] API -> Feature --- doc/whats_new/v1.2.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 7f1e143148e87..9960f9b9027e5 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -207,6 +207,9 @@ Changelog :mod:`sklearn.ensemble` ....................... +- |Feature| Adds `class_weight` to :class:`ensemble.HistGradientBoostingClassifier`. + :pr:`22014` by `Thomas Fan`_. + - |Efficiency| Improve runtime performance of :class:`ensemble.IsolationForest` by avoiding data copies. :pr:`23252` by :user:`Zhehao Liu `. @@ -238,9 +241,6 @@ Changelog `base_estimator_` is deprecated in 1.2 and will be removed in 1.4. :pr:`23819` by :user:`Adrian Trujillo ` and :user:`Edoardo Abati `. -- |API| Adds `class_weight` to :class:`ensemble.HistGradientBoostingClassifier`. - :pr:`22014` by `Thomas Fan`_. - :mod:`sklearn.feature_selection` ................................