ENH Adds class_weight to HistGradientBoostingClassifier #22014

3 changes: 3 additions & 0 deletions doc/whats_new/v1.2.rst
@@ -207,6 +207,9 @@ Changelog
:mod:`sklearn.ensemble`
.......................

- |Feature| Adds `class_weight` to :class:`ensemble.HistGradientBoostingClassifier`.
:pr:`22014` by `Thomas Fan`_.

- |Efficiency| Improve runtime performance of :class:`ensemble.IsolationForest`
by avoiding data copies. :pr:`23252` by :user:`Zhehao Liu <MaxwellLZH>`.

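For orientation, a minimal usage sketch of the parameter announced in the changelog entry above (assumes scikit-learn >= 1.2, where this PR lands; the toy dataset and the specific weight values are illustrative, not taken from the PR):

from sklearn.datasets import make_classification
from sklearn.ensemble import HistGradientBoostingClassifier

# Toy imbalanced binary problem: roughly 90% class 0, 10% class 1.
X, y = make_classification(n_samples=200, weights=[0.9, 0.1], random_state=0)

# Explicit per-class weights in {class_label: weight} form ...
clf = HistGradientBoostingClassifier(class_weight={0: 1.0, 1: 9.0}, random_state=0)
clf.fit(X, y)

# ... or let "balanced" derive the weights from the class frequencies in y.
clf_balanced = HistGradientBoostingClassifier(class_weight="balanced", random_state=0)
clf_balanced.fit(X, y)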
37 changes: 36 additions & 1 deletion sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -17,7 +17,7 @@
PinballLoss,
)
from ...base import BaseEstimator, RegressorMixin, ClassifierMixin, is_classifier
from ...utils import check_random_state, resample
from ...utils import check_random_state, resample, compute_sample_weight
from ...utils.validation import (
check_is_fitted,
check_consistent_length,
@@ -159,6 +159,14 @@ def _validate_parameters(self):
"monotonic constraints are not supported for multiclass classification."
)

def _finalize_sample_weight(self, sample_weight, y):
"""Finalize sample weight.

Used by subclasses to adjust sample_weights. This is useful for implementing
class weights.
"""
return sample_weight

def _check_categories(self, X):
"""Check and validate categorical features in X

@@ -283,6 +291,8 @@ def fit(self, X, y, sample_weight=None):
# TODO: remove when PDP supports sample weights
self._fitted_with_sw = True

sample_weight = self._finalize_sample_weight(sample_weight, y)

rng = check_random_state(self.random_state)

# When warm starting, we want to re-use the same seed that was used
@@ -1530,6 +1540,16 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):
is enabled.
Pass an int for reproducible output across multiple function calls.
See :term:`Glossary <random_state>`.
class_weight : dict or 'balanced', default=None
Weights associated with classes in the form `{class_label: weight}`.
If not given, all classes are supposed to have weight one.
The "balanced" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data
as `n_samples / (n_classes * np.bincount(y))`.
Note that these weights will be multiplied with sample_weight (passed
through the fit method) if `sample_weight` is specified.

.. versionadded:: 1.2

Attributes
----------
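As a quick worked check of the "balanced" formula quoted in the docstring above, the sketch below uses sklearn.utils.compute_sample_weight (the helper this PR imports); the label vector is made up for illustration:

import numpy as np
from sklearn.utils import compute_sample_weight

# Made-up labels: 4 samples of class 0, 1 sample of class 1.
y = np.array([0, 0, 0, 0, 1])

# n_samples / (n_classes * np.bincount(y)) = [5 / (2 * 4), 5 / (2 * 1)] = [0.625, 2.5]
print(compute_sample_weight("balanced", y))  # [0.625 0.625 0.625 0.625 2.5]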
@@ -1611,6 +1631,7 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):
),
BaseLoss,
],
"class_weight": [dict, StrOptions({"balanced"}), None],
}

def __init__(
@@ -1634,6 +1655,7 @@
tol=1e-7,
verbose=0,
random_state=None,
class_weight=None,
):
super(HistGradientBoostingClassifier, self).__init__(
loss=loss,
@@ -1655,6 +1677,19 @@
verbose=verbose,
random_state=random_state,
)
self.class_weight = class_weight

def _finalize_sample_weight(self, sample_weight, y):
"""Adjust sample_weights with class_weights."""
if self.class_weight is None:
return sample_weight

expanded_class_weight = compute_sample_weight(self.class_weight, y)

if sample_weight is not None:
return sample_weight * expanded_class_weight
else:
return expanded_class_weight

def predict(self, X):
"""Predict classes for X.
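To make the combination rule in the override above concrete, a small sketch (arrays are made up for illustration): the class weights are expanded to one weight per sample with compute_sample_weight and then multiplied with any user-supplied sample_weight, as the docstring note describes:

import numpy as np
from sklearn.utils import compute_sample_weight

y = np.array([0, 0, 1])
sample_weight = np.array([1.0, 2.0, 0.5])
class_weight = {0: 1.0, 1: 3.0}

# Expand per-class weights to per-sample weights, then combine multiplicatively.
expanded = compute_sample_weight(class_weight, y)  # -> [1. 1. 3.]
print(sample_weight * expanded)                    # -> [1.  2.  1.5]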
sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
@@ -1123,3 +1123,58 @@ def test_no_user_warning_with_scoring():
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
est.fit(X_df, y)


def test_class_weights():
"""High level test to check class_weights."""
n_samples = 255
n_features = 2

X, y = make_classification(
n_samples=n_samples,
n_features=n_features,
n_informative=n_features,
n_redundant=0,
n_clusters_per_class=1,
n_classes=2,
random_state=0,
)
y_is_1 = y == 1

# class_weight is equivalent to sample weights equal to each sample's class weight
clf = HistGradientBoostingClassifier(
min_samples_leaf=2, random_state=0, max_depth=2
)
sample_weight = np.ones(shape=(n_samples))
sample_weight[y_is_1] = 3.0
clf.fit(X, y, sample_weight=sample_weight)

class_weight = {0: 1.0, 1: 3.0}
clf_class_weighted = clone(clf).set_params(class_weight=class_weight)
clf_class_weighted.fit(X, y)

assert_allclose(clf.decision_function(X), clf_class_weighted.decision_function(X))

# Check that sample_weight and class_weight are multiplicative
clf.fit(X, y, sample_weight=sample_weight**2)
clf_class_weighted.fit(X, y, sample_weight=sample_weight)
assert_allclose(clf.decision_function(X), clf_class_weighted.decision_function(X))

# Make imbalanced dataset
X_imb = np.concatenate((X[~y_is_1], X[y_is_1][:10]))
y_imb = np.concatenate((y[~y_is_1], y[y_is_1][:10]))

# class_weight="balanced" is the same as sample_weights to be
# inversely proportional to n_samples / (n_classes * np.bincount(y))
clf_balanced = clone(clf).set_params(class_weight="balanced")
clf_balanced.fit(X_imb, y_imb)

class_weight = y_imb.shape[0] / (2 * np.bincount(y_imb))
sample_weight = class_weight[y_imb]
clf_sample_weight = clone(clf).set_params(class_weight=None)
clf_sample_weight.fit(X_imb, y_imb, sample_weight=sample_weight)

assert_allclose(
clf_balanced.decision_function(X_imb),
clf_sample_weight.decision_function(X_imb),
)