diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst
index b12a22c804687..9960f9b9027e5 100644
--- a/doc/whats_new/v1.2.rst
+++ b/doc/whats_new/v1.2.rst
@@ -207,6 +207,9 @@ Changelog
 :mod:`sklearn.ensemble`
 .......................
 
+- |Feature| Adds `class_weight` to :class:`ensemble.HistGradientBoostingClassifier`.
+  :pr:`22014` by `Thomas Fan`_.
+
 - |Efficiency| Improve runtime performance of :class:`ensemble.IsolationForest`
   by avoiding data copies. :pr:`23252` by :user:`Zhehao Liu `.
 
diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
index ea3e6a82a5edb..dcb40bc102b8f 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -17,7 +17,7 @@
     PinballLoss,
 )
 from ...base import BaseEstimator, RegressorMixin, ClassifierMixin, is_classifier
-from ...utils import check_random_state, resample
+from ...utils import check_random_state, resample, compute_sample_weight
 from ...utils.validation import (
     check_is_fitted,
     check_consistent_length,
@@ -159,6 +159,14 @@ def _validate_parameters(self):
                 "monotonic constraints are not supported for multiclass classification."
             )
 
+    def _finalize_sample_weight(self, sample_weight, y):
+        """Finalize sample weight.
+
+        Used by subclasses to adjust sample_weights. This is useful for implementing
+        class weights.
+        """
+        return sample_weight
+
     def _check_categories(self, X):
         """Check and validate categorical features in X
 
@@ -283,6 +291,8 @@ def fit(self, X, y, sample_weight=None):
         # TODO: remove when PDP supports sample weights
         self._fitted_with_sw = True
 
+        sample_weight = self._finalize_sample_weight(sample_weight, y)
+
         rng = check_random_state(self.random_state)
 
         # When warm starting, we want to re-use the same seed that was used
@@ -1530,6 +1540,16 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):
         is enabled.
         Pass an int for reproducible output across multiple function calls.
         See :term:`Glossary <random_state>`.
+    class_weight : dict or 'balanced', default=None
+        Weights associated with classes in the form `{class_label: weight}`.
+        If not given, all classes are supposed to have weight one.
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as `n_samples / (n_classes * np.bincount(y))`.
+        Note that these weights will be multiplied with sample_weight (passed
+        through the fit method) if `sample_weight` is specified.
+
+        .. versionadded:: 1.2
 
     Attributes
     ----------
@@ -1611,6 +1631,7 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):
             ),
             BaseLoss,
         ],
+        "class_weight": [dict, StrOptions({"balanced"}), None],
     }
 
     def __init__(
@@ -1634,6 +1655,7 @@ def __init__(
         tol=1e-7,
         verbose=0,
         random_state=None,
+        class_weight=None,
     ):
         super(HistGradientBoostingClassifier, self).__init__(
             loss=loss,
@@ -1655,6 +1677,19 @@ def __init__(
             verbose=verbose,
             random_state=random_state,
         )
+        self.class_weight = class_weight
+
+    def _finalize_sample_weight(self, sample_weight, y):
+        """Adjust sample_weights with class_weights."""
+        if self.class_weight is None:
+            return sample_weight
+
+        expanded_class_weight = compute_sample_weight(self.class_weight, y)
+
+        if sample_weight is not None:
+            return sample_weight * expanded_class_weight
+        else:
+            return expanded_class_weight
 
     def predict(self, X):
         """Predict classes for X.
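Below is a minimal usage sketch (not part of the patch) of the `class_weight` parameter added above. It assumes a scikit-learn build that includes this change (1.2+); the dataset and weight values are illustrative only.

    from sklearn.datasets import make_classification
    from sklearn.ensemble import HistGradientBoostingClassifier

    # Toy imbalanced binary problem.
    X, y = make_classification(n_samples=200, weights=[0.9, 0.1], random_state=0)

    # Explicit per-class weights: every sample of class 1 counts five times as
    # much, equivalent to passing sample_weight=5.0 for those samples in fit().
    clf = HistGradientBoostingClassifier(class_weight={0: 1.0, 1: 5.0}, random_state=0)
    clf.fit(X, y)

    # "balanced" derives the weights from the class frequencies in y,
    # i.e. n_samples / (n_classes * np.bincount(y)).
    clf_balanced = HistGradientBoostingClassifier(class_weight="balanced", random_state=0)
    clf_balanced.fit(X, y)
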
diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
index a260e9b37142d..5d57993c0e09d 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
@@ -1123,3 +1123,58 @@ def test_no_user_warning_with_scoring():
     with warnings.catch_warnings():
         warnings.simplefilter("error", UserWarning)
         est.fit(X_df, y)
+
+
+def test_class_weights():
+    """High level test to check class_weights."""
+    n_samples = 255
+    n_features = 2
+
+    X, y = make_classification(
+        n_samples=n_samples,
+        n_features=n_features,
+        n_informative=n_features,
+        n_redundant=0,
+        n_clusters_per_class=1,
+        n_classes=2,
+        random_state=0,
+    )
+    y_is_1 = y == 1
+
+    # class_weight is the same as sample weights with the corresponding class
+    clf = HistGradientBoostingClassifier(
+        min_samples_leaf=2, random_state=0, max_depth=2
+    )
+    sample_weight = np.ones(shape=(n_samples))
+    sample_weight[y_is_1] = 3.0
+    clf.fit(X, y, sample_weight=sample_weight)
+
+    class_weight = {0: 1.0, 1: 3.0}
+    clf_class_weighted = clone(clf).set_params(class_weight=class_weight)
+    clf_class_weighted.fit(X, y)
+
+    assert_allclose(clf.decision_function(X), clf_class_weighted.decision_function(X))
+
+    # Check that sample_weight and class_weight are multiplicative
+    clf.fit(X, y, sample_weight=sample_weight**2)
+    clf_class_weighted.fit(X, y, sample_weight=sample_weight)
+    assert_allclose(clf.decision_function(X), clf_class_weighted.decision_function(X))
+
+    # Make imbalanced dataset
+    X_imb = np.concatenate((X[~y_is_1], X[y_is_1][:10]))
+    y_imb = np.concatenate((y[~y_is_1], y[y_is_1][:10]))
+
+    # class_weight="balanced" is the same as sample_weights to be
+    # inversely proportional to n_samples / (n_classes * np.bincount(y))
+    clf_balanced = clone(clf).set_params(class_weight="balanced")
+    clf_balanced.fit(X_imb, y_imb)
+
+    class_weight = y_imb.shape[0] / (2 * np.bincount(y_imb))
+    sample_weight = class_weight[y_imb]
+    clf_sample_weight = clone(clf).set_params(class_weight=None)
+    clf_sample_weight.fit(X_imb, y_imb, sample_weight=sample_weight)
+
+    assert_allclose(
+        clf_balanced.decision_function(X_imb),
+        clf_sample_weight.decision_function(X_imb),
+    )
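For reference, a small sketch (separate from the test above) of the helper that the new `_finalize_sample_weight` relies on: `sklearn.utils.compute_sample_weight` expands a class-weight specification into per-sample weights, which the test re-derives by hand for the "balanced" case. The toy label array below is illustrative only.

    import numpy as np
    from sklearn.utils import compute_sample_weight

    y = np.array([0, 0, 0, 1])  # imbalanced toy labels

    # "balanced" yields n_samples / (n_classes * np.bincount(y)), indexed by y.
    expected = (y.shape[0] / (2 * np.bincount(y)))[y]
    np.testing.assert_allclose(compute_sample_weight("balanced", y), expected)

    # A dict maps each class label to its weight for every matching sample.
    np.testing.assert_allclose(
        compute_sample_weight({0: 1.0, 1: 3.0}, y), [1.0, 1.0, 1.0, 3.0]
    )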