ENH Adds class_weight to HistGradientBoostingClassifier #22014

3 changes: 3 additions & 0 deletions doc/whats_new/v1.2.rst
@@ -207,6 +207,9 @@ Changelog
:mod:`sklearn.ensemble`
.......................

- |Feature| Adds `class_weight` to :class:`ensemble.HistGradientBoostingClassifier`.
:pr:`22014` by `Thomas Fan`_.

- |Efficiency| Improve runtime performance of :class:`ensemble.IsolationForest`
by avoiding data copies. :pr:`23252` by :user:`Zhehao Liu <MaxwellLZH>`.

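For orientation, a minimal usage sketch of the parameter announced in the changelog entry above (assumes scikit-learn >= 1.2, where this PR lands; the toy dataset and the specific weight values are illustrative, not taken from the PR):

from sklearn.datasets import make_classification
from sklearn.ensemble import HistGradientBoostingClassifier

# Toy imbalanced binary problem: roughly 90% class 0, 10% class 1.
X, y = make_classification(n_samples=200, weights=[0.9, 0.1], random_state=0)

# Explicit per-class weights in {class_label: weight} form ...
clf = HistGradientBoostingClassifier(class_weight={0: 1.0, 1: 9.0}, random_state=0)
clf.fit(X, y)

# ... or let "balanced" derive the weights from the class frequencies in y.
clf_balanced = HistGradientBoostingClassifier(class_weight="balanced", random_state=0)
clf_balanced.fit(X, y)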
37 changes: 36 additions & 1 deletion sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -17,7 +17,7 @@
PinballLoss,
)
from ...base import BaseEstimator, RegressorMixin, ClassifierMixin, is_classifier
from ...utils import check_random_state, resample
from ...utils import check_random_state, resample, compute_sample_weight
from ...utils.validation import (
check_is_fitted,
check_consistent_length,
@@ -159,6 +159,14 @@ def _validate_parameters(self):
"monotonic constraints are not supported for multiclass classification."
)

def _finalize_sample_weight(self, sample_weight, y):
"""Finalize sample weight.

Used by subclasses to adjust sample_weights. This is useful for implementing
class weights.
"""
return sample_weight

def _check_categories(self, X):
"""Check and validate categorical features in X

@@ -283,6 +291,8 @@ def fit(self, X, y, sample_weight=None):
# TODO: remove when PDP supports sample weights
self._fitted_with_sw = True

sample_weight = self._finalize_sample_weight(sample_weight, y)

rng = check_random_state(self.random_state)

# When warm starting, we want to re-use the same seed that was used
@@ -1530,6 +1540,16 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):
is enabled.
Pass an int for reproducible output across multiple function calls.
See :term:`Glossary <random_state>`.
class_weight : dict or 'balanced', default=None
Weights associated with classes in the form `{class_label: weight}`.
If not given, all classes are supposed to have weight one.
The "balanced" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data
as `n_samples / (n_classes * np.bincount(y))`.
Note that these weights will be multiplied with sample_weight (passed
through the fit method) if `sample_weight` is specified.

.. versionadded:: 1.2

Attributes
----------
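As a quick worked check of the "balanced" formula quoted in the docstring above, the sketch below uses sklearn.utils.compute_sample_weight (the helper this PR imports); the label vector is made up for illustration:

import numpy as np
from sklearn.utils import compute_sample_weight

# Made-up labels: 4 samples of class 0, 1 sample of class 1.
y = np.array([0, 0, 0, 0, 1])

# n_samples / (n_classes * np.bincount(y)) = [5 / (2 * 4), 5 / (2 * 1)] = [0.625, 2.5]
print(compute_sample_weight("balanced", y))  # [0.625 0.625 0.625 0.625 2.5]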
@@ -1611,6 +1631,7 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):
),
BaseLoss,
],
"class_weight": [dict, StrOptions({"balanced"}), None],
}

def __init__(
@@ -1634,6 +1655,7 @@
tol=1e-7,
verbose=0,
random_state=None,
class_weight=None,
):
super(HistGradientBoostingClassifier, self).__init__(
loss=loss,
@@ -1655,6 +1677,19 @@
verbose=verbose,
random_state=random_state,
)
self.class_weight = class_weight

def _finalize_sample_weight(self, sample_weight, y):
"""Adjust sample_weights with class_weights."""
if self.class_weight is None:
return sample_weight

expanded_class_weight = compute_sample_weight(self.class_weight, y)

if sample_weight is not None:
return sample_weight * expanded_class_weight
else:
return expanded_class_weight

def predict(self, X):
"""Predict classes for X.
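To make the combination rule in the override above concrete, a small sketch (arrays are made up for illustration): the class weights are expanded to one weight per sample with compute_sample_weight and then multiplied with any user-supplied sample_weight, as the docstring note describes:

import numpy as np
from sklearn.utils import compute_sample_weight

y = np.array([0, 0, 1])
sample_weight = np.array([1.0, 2.0, 0.5])
class_weight = {0: 1.0, 1: 3.0}

# Expand per-class weights to per-sample weights, then combine multiplicatively.
expanded = compute_sample_weight(class_weight, y)  # -> [1. 1. 3.]
print(sample_weight * expanded)                    # -> [1.  2.  1.5]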
sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
@@ -1123,3 +1123,58 @@ def test_no_user_warning_with_scoring():
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
est.fit(X_df, y)


def test_class_weights():
"""High level test to check class_weights."""
n_samples = 255
n_features = 2

X, y = make_classification(
n_samples=n_samples,
n_features=n_features,
n_informative=n_features,
n_redundant=0,
n_clusters_per_class=1,
n_classes=2,
random_state=0,
)
y_is_1 = y == 1

# class_weight is equivalent to sample weights equal to each sample's class weight
clf = HistGradientBoostingClassifier(
min_samples_leaf=2, random_state=0, max_depth=2
)
sample_weight = np.ones(shape=(n_samples))
sample_weight[y_is_1] = 3.0
clf.fit(X, y, sample_weight=sample_weight)

class_weight = {0: 1.0, 1: 3.0}
clf_class_weighted = clone(clf).set_params(class_weight=class_weight)
clf_class_weighted.fit(X, y)

assert_allclose(clf.decision_function(X), clf_class_weighted.decision_function(X))

# Check that sample_weight and class_weight are multiplicative
clf.fit(X, y, sample_weight=sample_weight**2)
clf_class_weighted.fit(X, y, sample_weight=sample_weight)
assert_allclose(clf.decision_function(X), clf_class_weighted.decision_function(X))

# Make imbalanced dataset
X_imb = np.concatenate((X[~y_is_1], X[y_is_1][:10]))
y_imb = np.concatenate((y[~y_is_1], y[y_is_1][:10]))

# class_weight="balanced" is the same as sample_weights to be
# inversely proportional to n_samples / (n_classes * np.bincount(y))
clf_balanced = clone(clf).set_params(class_weight="balanced")
clf_balanced.fit(X_imb, y_imb)

class_weight = y_imb.shape[0] / (2 * np.bincount(y_imb))
sample_weight = class_weight[y_imb]
clf_sample_weight = clone(clf).set_params(class_weight=None)
clf_sample_weight.fit(X_imb, y_imb, sample_weight=sample_weight)

assert_allclose(
clf_balanced.decision_function(X_imb),
clf_sample_weight.decision_function(X_imb),
)