
DEP auto, binary_crossentropy, categorical_crossentropy in HGBT #23040


Merged
merged 15 commits on Apr 7, 2022
13 changes: 5 additions & 8 deletions benchmarks/bench_hist_gradient_boosting.py
@@ -115,12 +115,7 @@ def one_run(n_samples):
loss = args.loss
if args.problem == "classification":
if loss == "default":
# loss='auto' does not work with get_equivalent_estimator()
loss = (
"binary_crossentropy"
if args.n_classes == 2
else "categorical_crossentropy"
)
loss = "log_loss"
else:
# regression
if loss == "default":
@@ -159,7 +154,7 @@ def one_run(n_samples):
xgb_score_duration = None
if args.xgboost:
print("Fitting an XGBoost model...")
xgb_est = get_equivalent_estimator(est, lib="xgboost")
xgb_est = get_equivalent_estimator(est, lib="xgboost", n_classes=args.n_classes)

tic = time()
xgb_est.fit(X_train, y_train, sample_weight=sample_weight_train)
@@ -176,7 +171,9 @@ def one_run(n_samples):
cat_score_duration = None
if args.catboost:
print("Fitting a CatBoost model...")
cat_est = get_equivalent_estimator(est, lib="catboost")
cat_est = get_equivalent_estimator(
est, lib="catboost", n_classes=args.n_classes
)

tic = time()
cat_est.fit(X_train, y_train, sample_weight=sample_weight_train)
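As the benchmark edits above suggest, once the loss is simply "log_loss" the comparison helper can no longer infer binary vs. multiclass from the loss name, so the number of classes is now passed explicitly. A minimal editor's sketch of the updated call, not part of the diff; it assumes the target library (here xgboost) is installed, and the estimator settings are illustrative:

# Editor's sketch: passing the explicit n_classes argument introduced by this PR.
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator

est = HistGradientBoostingClassifier(loss="log_loss", max_iter=50)
xgb_est = get_equivalent_estimator(est, lib="xgboost", n_classes=2)  # binary problem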
7 changes: 5 additions & 2 deletions benchmarks/bench_hist_gradient_boosting_adult.py
@@ -1,6 +1,8 @@
import argparse
from time import time

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score, roc_auc_score
@@ -48,6 +50,7 @@ def predict(est, data_test, target_test):
data = fetch_openml(data_id=179, as_frame=False) # adult dataset
X, y = data.data, data.target

n_classes = len(np.unique(y))
n_features = X.shape[1]
n_categorical_features = len(data.categories)
n_numerical_features = n_features - n_categorical_features
@@ -61,7 +64,7 @@ def predict(est, data_test, target_test):
# already clean
is_categorical = [name in data.categories for name in data.feature_names]
est = HistGradientBoostingClassifier(
loss="binary_crossentropy",
loss="log_loss",
learning_rate=lr,
max_iter=n_trees,
max_bins=max_bins,
@@ -76,7 +79,7 @@ def predict(est, data_test, target_test):
predict(est, X_test, y_test)

if args.lightgbm:
est = get_equivalent_estimator(est, lib="lightgbm")
est = get_equivalent_estimator(est, lib="lightgbm", n_classes=n_classes)
est.set_params(max_cat_to_onehot=1) # don't use OHE
categorical_features = [
f_idx for (f_idx, is_cat) in enumerate(is_categorical) if is_cat
4 changes: 2 additions & 2 deletions benchmarks/bench_hist_gradient_boosting_categorical_only.py
@@ -58,7 +58,7 @@ def predict(est, data_test):

is_categorical = [True] * n_features
est = HistGradientBoostingClassifier(
loss="binary_crossentropy",
loss="log_loss",
learning_rate=lr,
max_iter=n_trees,
max_bins=max_bins,
@@ -73,7 +73,7 @@ def predict(est, data_test):
predict(est, X)

if args.lightgbm:
est = get_equivalent_estimator(est, lib="lightgbm")
est = get_equivalent_estimator(est, lib="lightgbm", n_classes=2)
est.set_params(max_cat_to_onehot=1) # don't use OHE
categorical_features = list(range(n_features))
fit(est, X, y, "lightgbm", categorical_feature=categorical_features)
9 changes: 5 additions & 4 deletions benchmarks/bench_hist_gradient_boosting_higgsboson.py
@@ -80,6 +80,7 @@ def predict(est, data_test, target_test):
data_train, data_test, target_train, target_test = train_test_split(
data, target, test_size=0.2, random_state=0
)
n_classes = len(np.unique(target))

if subsample is not None:
data_train, target_train = data_train[:subsample], target_train[:subsample]
@@ -88,7 +89,7 @@ def predict(est, data_test, target_test):
print(f"Training set with {n_samples} records with {n_features} features.")

est = HistGradientBoostingClassifier(
loss="binary_crossentropy",
loss="log_loss",
learning_rate=lr,
max_iter=n_trees,
max_bins=max_bins,
@@ -101,16 +102,16 @@ def predict(est, data_test, target_test):
predict(est, data_test, target_test)

if args.lightgbm:
est = get_equivalent_estimator(est, lib="lightgbm")
est = get_equivalent_estimator(est, lib="lightgbm", n_classes=n_classes)
fit(est, data_train, target_train, "lightgbm")
predict(est, data_test, target_test)

if args.xgboost:
est = get_equivalent_estimator(est, lib="xgboost")
est = get_equivalent_estimator(est, lib="xgboost", n_classes=n_classes)
fit(est, data_train, target_train, "xgboost")
predict(est, data_test, target_test)

if args.catboost:
est = get_equivalent_estimator(est, lib="catboost")
est = get_equivalent_estimator(est, lib="catboost", n_classes=n_classes)
fit(est, data_train, target_train, "catboost")
predict(est, data_test, target_test)
10 changes: 5 additions & 5 deletions benchmarks/bench_hist_gradient_boosting_threading.py
@@ -118,9 +118,7 @@ def get_estimator_and_data():
if args.problem == "classification":
if loss == "default":
# loss='auto' does not work with get_equivalent_estimator()
loss = (
"binary_crossentropy" if args.n_classes == 2 else "categorical_crossentropy"
)
loss = "log_loss"
else:
# regression
if loss == "default":
@@ -191,7 +189,7 @@ def one_run(n_threads, n_samples):
xgb_score_duration = None
if args.xgboost:
print("Fitting an XGBoost model...")
xgb_est = get_equivalent_estimator(est, lib="xgboost")
xgb_est = get_equivalent_estimator(est, lib="xgboost", n_classes=args.n_classes)
xgb_est.set_params(nthread=n_threads)

tic = time()
@@ -209,7 +207,9 @@ def one_run(n_threads, n_samples):
cat_score_duration = None
if args.catboost:
print("Fitting a CatBoost model...")
cat_est = get_equivalent_estimator(est, lib="catboost")
cat_est = get_equivalent_estimator(
est, lib="catboost", n_classes=args.n_classes
)
cat_est.set_params(thread_count=n_threads)

tic = time()
9 changes: 5 additions & 4 deletions doc/modules/ensemble.rst
@@ -949,10 +949,11 @@ controls the number of iterations of the boosting process::
Available losses for regression are 'squared_error',
'absolute_error', which is less sensitive to outliers, and
'poisson', which is well suited to model counts and frequencies. For
classification, 'binary_crossentropy' is used for binary classification and
'categorical_crossentropy' is used for multiclass classification. By default
the loss is 'auto' and will select the appropriate loss depending on
:term:`y` passed to :term:`fit`.
classification, 'log_loss' is the only option. For binary classification it uses the
binary log loss, also known as binomial deviance or binary cross-entropy. For
`n_classes >= 3`, it uses the multi-class log loss function, with multinomial deviance
and categorical cross-entropy as alternative names. The appropriate loss version is
selected based on :term:`y` passed to :term:`fit`.

The size of the trees can be controlled through the ``max_leaf_nodes``,
``max_depth``, and ``min_samples_leaf`` parameters.
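
To make the renamed loss concrete, a brief editor's sketch, not part of the documentation diff; the dataset and parameter values are illustrative:

from sklearn.datasets import make_classification
from sklearn.ensemble import HistGradientBoostingClassifier

# Three-class toy problem; 'log_loss' is the new default, so naming it is optional.
X, y = make_classification(n_samples=300, n_classes=3, n_informative=6, random_state=0)
clf = HistGradientBoostingClassifier(loss="log_loss", max_iter=20).fit(X, y)
proba = clf.predict_proba(X)  # one column per class, each row sums to 1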
5 changes: 5 additions & 0 deletions doc/whats_new/v1.1.rst
@@ -98,6 +98,11 @@ Changelog
default.
:pr:`23036` by :user:`Christian Lorentzen <lorentzenchr>`.

- For :class:`ensemble.HistGradientBoostingClassifier`, the `loss` parameter names
"auto", "binary_crossentropy" and "categorical_crossentropy" are deprecated in
favor of the new name "log_loss", which is now the default.
:pr:`23040` by :user:`Christian Lorentzen <lorentzenchr>`.

- For :class:`linear_model.SGDClassifier`, the `loss` parameter name
"log" is deprecated in favor of the new name "log_loss".
:pr:`23046` by :user:`Christian Lorentzen <lorentzenchr>`.
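
Relating to the HistGradientBoostingClassifier entry above, an editor's sketch of the deprecation behaviour under scikit-learn 1.1 (illustrative data, not part of the changelog):

import warnings
from sklearn.datasets import make_classification
from sklearn.ensemble import HistGradientBoostingClassifier

X, y = make_classification(n_samples=100, random_state=0)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # The old spelling still fits, but emits a FutureWarning pointing to 'log_loss'.
    HistGradientBoostingClassifier(loss="binary_crossentropy", max_iter=5).fit(X, y)
assert any(issubclass(w.category, FutureWarning) for w in caught)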
76 changes: 50 additions & 26 deletions sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -37,7 +37,8 @@


_LOSSES = _LOSSES.copy()
# TODO: Remove least_squares and least_absolute_deviation in v1.2
# TODO(1.2): Remove "least_squares" and "least_absolute_deviation"
# TODO(1.3): Remove "binary_crossentropy" and "categorical_crossentropy"
_LOSSES.update(
{
"least_squares": HalfSquaredError,
@@ -1299,6 +1300,7 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
0.92...
"""

# TODO(1.2): remove "least_absolute_deviation"
_VALID_LOSSES = (
"squared_error",
"least_squares",
@@ -1455,13 +1457,25 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):

Parameters
----------
loss : {'auto', 'binary_crossentropy', 'categorical_crossentropy'}, \
default='auto'
The loss function to use in the boosting process. 'binary_crossentropy'
(also known as logistic loss) is used for binary classification and
generalizes to 'categorical_crossentropy' for multiclass
classification. 'auto' will automatically choose either loss depending
on the nature of the problem.
loss : {'log_loss', 'auto', 'binary_crossentropy', 'categorical_crossentropy'}, \
default='log_loss'
The loss function to use in the boosting process.

For binary classification problems, 'log_loss' is also known as logistic loss,
binomial deviance or binary crossentropy. Internally, the model fits one tree
per boosting iteration and uses the logistic sigmoid function (expit) as
inverse link function to compute the predicted positive class probability.

For multiclass classification problems, 'log_loss' is also known as multinomial
deviance or categorical crossentropy. Internally, the model fits one tree per
boosting iteration and per class and uses the softmax function as inverse link
function to compute the predicted probabilities of the classes.

.. deprecated:: 1.1
The loss arguments 'auto', 'binary_crossentropy' and
'categorical_crossentropy' were deprecated in v1.1 and will be removed in
version 1.3. Use `loss='log_loss'` which is equivalent.

learning_rate : float, default=0.1
The learning rate, also known as *shrinkage*. This is used as a
multiplicative factor for the leaves values. Use ``1`` for no
@@ -1617,11 +1631,17 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):
1.0
"""

_VALID_LOSSES = ("binary_crossentropy", "categorical_crossentropy", "auto")
# TODO(1.3): Remove "binary_crossentropy", "categorical_crossentropy", "auto"
_VALID_LOSSES = (
"log_loss",
"binary_crossentropy",
"categorical_crossentropy",
"auto",
)

def __init__(
self,
loss="auto",
loss="log_loss",
*,
learning_rate=0.1,
max_iter=100,
@@ -1798,33 +1818,37 @@ def _encode_y(self, y):
return encoded_y

def _get_loss(self, sample_weight):
if self.loss == "auto":
# TODO(1.3): Remove "auto", "binary_crossentropy", "categorical_crossentropy"
if self.loss in ("auto", "binary_crossentropy", "categorical_crossentropy"):
warnings.warn(
f"The loss '{self.loss}' was deprecated in v1.1 and will be removed in "
"version 1.3. Use 'log_loss' which is equivalent.",
FutureWarning,
)

if self.loss in ("log_loss", "auto"):
if self.n_trees_per_iteration_ == 1:
return _LOSSES["binary_crossentropy"](sample_weight=sample_weight)
return HalfBinomialLoss(sample_weight=sample_weight)
else:
return _LOSSES["categorical_crossentropy"](
sample_weight=sample_weight,
n_classes=self.n_trees_per_iteration_,
return HalfMultinomialLoss(
sample_weight=sample_weight, n_classes=self.n_trees_per_iteration_
)

if self.loss == "categorical_crossentropy":
if self.n_trees_per_iteration_ == 1:
raise ValueError(
"loss='categorical_crossentropy' is not suitable for "
"a binary classification problem. Please use "
"loss='auto' or loss='binary_crossentropy' instead."
f"loss='{self.loss}' is not suitable for a binary classification "
"problem. Please use loss='log_loss' instead."
)
else:
return _LOSSES[self.loss](
return HalfMultinomialLoss(
sample_weight=sample_weight, n_classes=self.n_trees_per_iteration_
)
else:
if self.loss == "binary_crossentropy":
if self.n_trees_per_iteration_ > 1:
raise ValueError(
"loss='binary_crossentropy' is not defined for multiclass"
" classification with n_classes="
f"{self.n_trees_per_iteration_}, use loss="
"'categorical_crossentropy' instead."
f"loss='{self.loss}' is not defined for multiclass "
f"classification with n_classes={self.n_trees_per_iteration_}, "
"use loss='log_loss' instead."
)
else:
return _LOSSES[self.loss](sample_weight=sample_weight)
return HalfBinomialLoss(sample_weight=sample_weight)
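
An editor's illustration of the inverse link functions named in the new docstring above (expit for the binary case, softmax for the multiclass case); the raw scores are made up:

import numpy as np
from scipy.special import expit, softmax

raw_binary = np.array([-1.2, 0.3, 2.0])    # one raw score per sample (one tree per iteration)
p_pos = expit(raw_binary)                  # predicted probability of the positive class

raw_multi = np.array([[0.1, 1.5, -0.3]])   # one raw score per class for a single sample
p_all = softmax(raw_multi, axis=1)         # class probabilities, each row sums to 1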
@@ -124,15 +124,17 @@ def test_same_predictions_classification(
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

est_sklearn = HistGradientBoostingClassifier(
loss="binary_crossentropy",
loss="log_loss",
max_iter=max_iter,
max_bins=max_bins,
learning_rate=1,
early_stopping=False,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=max_leaf_nodes,
)
est_lightgbm = get_equivalent_estimator(est_sklearn, lib="lightgbm")
est_lightgbm = get_equivalent_estimator(
est_sklearn, lib="lightgbm", n_classes=n_classes
)

est_lightgbm.fit(X_train, y_train)
est_sklearn.fit(X_train, y_train)
@@ -198,7 +200,7 @@ def test_same_predictions_multiclass_classification(
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

est_sklearn = HistGradientBoostingClassifier(
loss="categorical_crossentropy",
loss="log_loss",
max_iter=max_iter,
max_bins=max_bins,
learning_rate=lr,