
DEP auto, binary_crossentropy, categorical_crossentropy in HGBT #23040


Merged
merged 15 commits on Apr 7, 2022
13 changes: 5 additions & 8 deletions benchmarks/bench_hist_gradient_boosting.py
@@ -115,12 +115,7 @@ def one_run(n_samples):
loss = args.loss
if args.problem == "classification":
if loss == "default":
# loss='auto' does not work with get_equivalent_estimator()
loss = (
"binary_crossentropy"
if args.n_classes == 2
else "categorical_crossentropy"
)
loss = "log_loss"
else:
# regression
if loss == "default":
@@ -159,7 +154,7 @@ def one_run(n_samples):
xgb_score_duration = None
if args.xgboost:
print("Fitting an XGBoost model...")
xgb_est = get_equivalent_estimator(est, lib="xgboost")
xgb_est = get_equivalent_estimator(est, lib="xgboost", n_classes=args.n_classes)

tic = time()
xgb_est.fit(X_train, y_train, sample_weight=sample_weight_train)
@@ -176,7 +171,9 @@ def one_run(n_samples):
cat_score_duration = None
if args.catboost:
print("Fitting a CatBoost model...")
cat_est = get_equivalent_estimator(est, lib="catboost")
cat_est = get_equivalent_estimator(
est, lib="catboost", n_classes=args.n_classes
)

tic = time()
cat_est.fit(X_train, y_train, sample_weight=sample_weight_train)
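As the benchmark edits above suggest, once the loss is simply "log_loss" the comparison helper can no longer infer binary vs. multiclass from the loss name, so the number of classes is now passed explicitly. A minimal editor's sketch of the updated call, not part of the diff; it assumes the target library (here xgboost) is installed, and the estimator settings are illustrative:

# Editor's sketch: passing the explicit n_classes argument introduced by this PR.
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator

est = HistGradientBoostingClassifier(loss="log_loss", max_iter=50)
xgb_est = get_equivalent_estimator(est, lib="xgboost", n_classes=2)  # binary problem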
7 changes: 5 additions & 2 deletions benchmarks/bench_hist_gradient_boosting_adult.py
@@ -1,6 +1,8 @@
import argparse
from time import time

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score, roc_auc_score
@@ -48,6 +50,7 @@ def predict(est, data_test, target_test):
data = fetch_openml(data_id=179, as_frame=False) # adult dataset
X, y = data.data, data.target

n_classes = len(np.unique(y))
n_features = X.shape[1]
n_categorical_features = len(data.categories)
n_numerical_features = n_features - n_categorical_features
@@ -61,7 +64,7 @@ def predict(est, data_test, target_test):
# already clean
is_categorical = [name in data.categories for name in data.feature_names]
est = HistGradientBoostingClassifier(
loss="binary_crossentropy",
loss="log_loss",
learning_rate=lr,
max_iter=n_trees,
max_bins=max_bins,
@@ -76,7 +79,7 @@ def predict(est, data_test, target_test):
predict(est, X_test, y_test)

if args.lightgbm:
est = get_equivalent_estimator(est, lib="lightgbm")
est = get_equivalent_estimator(est, lib="lightgbm", n_classes=n_classes)
est.set_params(max_cat_to_onehot=1) # don't use OHE
categorical_features = [
f_idx for (f_idx, is_cat) in enumerate(is_categorical) if is_cat
4 changes: 2 additions & 2 deletions benchmarks/bench_hist_gradient_boosting_categorical_only.py
@@ -58,7 +58,7 @@ def predict(est, data_test):

is_categorical = [True] * n_features
est = HistGradientBoostingClassifier(
loss="binary_crossentropy",
loss="log_loss",
learning_rate=lr,
max_iter=n_trees,
max_bins=max_bins,
@@ -73,7 +73,7 @@ def predict(est, data_test):
predict(est, X)

if args.lightgbm:
est = get_equivalent_estimator(est, lib="lightgbm")
est = get_equivalent_estimator(est, lib="lightgbm", n_classes=2)
est.set_params(max_cat_to_onehot=1) # don't use OHE
categorical_features = list(range(n_features))
fit(est, X, y, "lightgbm", categorical_feature=categorical_features)
9 changes: 5 additions & 4 deletions benchmarks/bench_hist_gradient_boosting_higgsboson.py
@@ -80,6 +80,7 @@ def predict(est, data_test, target_test):
data_train, data_test, target_train, target_test = train_test_split(
data, target, test_size=0.2, random_state=0
)
n_classes = len(np.unique(target))

if subsample is not None:
data_train, target_train = data_train[:subsample], target_train[:subsample]
@@ -88,7 +89,7 @@ def predict(est, data_test, target_test):
print(f"Training set with {n_samples} records with {n_features} features.")

est = HistGradientBoostingClassifier(
loss="binary_crossentropy",
loss="log_loss",
learning_rate=lr,
max_iter=n_trees,
max_bins=max_bins,
@@ -101,16 +102,16 @@ def predict(est, data_test, target_test):
predict(est, data_test, target_test)

if args.lightgbm:
est = get_equivalent_estimator(est, lib="lightgbm")
est = get_equivalent_estimator(est, lib="lightgbm", n_classes=n_classes)
fit(est, data_train, target_train, "lightgbm")
predict(est, data_test, target_test)

if args.xgboost:
est = get_equivalent_estimator(est, lib="xgboost")
est = get_equivalent_estimator(est, lib="xgboost", n_classes=n_classes)
fit(est, data_train, target_train, "xgboost")
predict(est, data_test, target_test)

if args.catboost:
est = get_equivalent_estimator(est, lib="catboost")
est = get_equivalent_estimator(est, lib="catboost", n_classes=n_classes)
fit(est, data_train, target_train, "catboost")
predict(est, data_test, target_test)
10 changes: 5 additions & 5 deletions benchmarks/bench_hist_gradient_boosting_threading.py
@@ -118,9 +118,7 @@ def get_estimator_and_data():
if args.problem == "classification":
if loss == "default":
# loss='auto' does not work with get_equivalent_estimator()
loss = (
"binary_crossentropy" if args.n_classes == 2 else "categorical_crossentropy"
)
loss = "log_loss"
else:
# regression
if loss == "default":
@@ -191,7 +189,7 @@ def one_run(n_threads, n_samples):
xgb_score_duration = None
if args.xgboost:
print("Fitting an XGBoost model...")
xgb_est = get_equivalent_estimator(est, lib="xgboost")
xgb_est = get_equivalent_estimator(est, lib="xgboost", n_classes=args.n_classes)
xgb_est.set_params(nthread=n_threads)

tic = time()
@@ -209,7 +207,9 @@ def one_run(n_threads, n_samples):
cat_score_duration = None
if args.catboost:
print("Fitting a CatBoost model...")
cat_est = get_equivalent_estimator(est, lib="catboost")
cat_est = get_equivalent_estimator(
est, lib="catboost", n_classes=args.n_classes
)
cat_est.set_params(thread_count=n_threads)

tic = time()
9 changes: 5 additions & 4 deletions doc/modules/ensemble.rst
@@ -949,10 +949,11 @@ controls the number of iterations of the boosting process::
Available losses for regression are 'squared_error',
'absolute_error', which is less sensitive to outliers, and
'poisson', which is well suited to model counts and frequencies. For
classification, 'binary_crossentropy' is used for binary classification and
'categorical_crossentropy' is used for multiclass classification. By default
the loss is 'auto' and will select the appropriate loss depending on
:term:`y` passed to :term:`fit`.
classification, 'log_loss' is the only option. For binary classification it uses the
binary log loss, also known as binomial deviance or binary cross-entropy. For
`n_classes >= 3`, it uses the multi-class log loss function, with multinomial deviance
and categorical cross-entropy as alternative names. The appropriate loss version is
selected based on :term:`y` passed to :term:`fit`.

The size of the trees can be controlled through the ``max_leaf_nodes``,
``max_depth``, and ``min_samples_leaf`` parameters.
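
To make the renamed loss concrete, a brief editor's sketch, not part of the documentation diff; the dataset and parameter values are illustrative:

from sklearn.datasets import make_classification
from sklearn.ensemble import HistGradientBoostingClassifier

# Three-class toy problem; 'log_loss' is the new default, so naming it is optional.
X, y = make_classification(n_samples=300, n_classes=3, n_informative=6, random_state=0)
clf = HistGradientBoostingClassifier(loss="log_loss", max_iter=20).fit(X, y)
proba = clf.predict_proba(X)  # one column per class, each row sums to 1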
5 changes: 5 additions & 0 deletions doc/whats_new/v1.1.rst
@@ -98,6 +98,11 @@ Changelog
default.
:pr:`23036` by :user:`Christian Lorentzen <lorentzenchr>`.

- For :class:`ensemble.HistGradientBoostingClassifier`, the `loss` parameter names
"auto", "binary_crossentropy" and "categorical_crossentropy" are deprecated in
favor of the new name "log_loss", which is now the default.
:pr:`23040` by :user:`Christian Lorentzen <lorentzenchr>`.

- For :class:`linear_model.SGDClassifier`, the `loss` parameter name
"log" is deprecated in favor of the new name "log_loss".
:pr:`23046` by :user:`Christian Lorentzen <lorentzenchr>`.
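
Relating to the HistGradientBoostingClassifier entry above, an editor's sketch of the deprecation behaviour under scikit-learn 1.1 (illustrative data, not part of the changelog):

import warnings
from sklearn.datasets import make_classification
from sklearn.ensemble import HistGradientBoostingClassifier

X, y = make_classification(n_samples=100, random_state=0)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # The old spelling still fits, but emits a FutureWarning pointing to 'log_loss'.
    HistGradientBoostingClassifier(loss="binary_crossentropy", max_iter=5).fit(X, y)
assert any(issubclass(w.category, FutureWarning) for w in caught)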
76 changes: 50 additions & 26 deletions sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -37,7 +37,8 @@


_LOSSES = _LOSSES.copy()
# TODO: Remove least_squares and least_absolute_deviation in v1.2
# TODO(1.2): Remove "least_squares" and "least_absolute_deviation"
# TODO(1.3): Remove "binary_crossentropy" and "categorical_crossentropy"
_LOSSES.update(
{
"least_squares": HalfSquaredError,
@@ -1299,6 +1300,7 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
0.92...
"""

# TODO(1.2): remove "least_absolute_deviation"
_VALID_LOSSES = (
"squared_error",
"least_squares",
@@ -1455,13 +1457,25 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):

Parameters
----------
loss : {'auto', 'binary_crossentropy', 'categorical_crossentropy'}, \
default='auto'
The loss function to use in the boosting process. 'binary_crossentropy'
(also known as logistic loss) is used for binary classification and
generalizes to 'categorical_crossentropy' for multiclass
classification. 'auto' will automatically choose either loss depending
on the nature of the problem.
loss : {'log_loss', 'auto', 'binary_crossentropy', 'categorical_crossentropy'}, \
default='log_loss'
The loss function to use in the boosting process.

For binary classification problems, 'log_loss' is also known as logistic loss,
binomial deviance or binary crossentropy. Internally, the model fits one tree
per boosting iteration and uses the logistic sigmoid function (expit) as
inverse link function to compute the predicted positive class probability.

For multiclass classification problems, 'log_loss' is also known as multinomial
deviance or categorical crossentropy. Internally, the model fits one tree per
boosting iteration and per class and uses the softmax function as inverse link
function to compute the predicted probabilities of the classes.

.. deprecated:: 1.1
The loss arguments 'auto', 'binary_crossentropy' and
'categorical_crossentropy' were deprecated in v1.1 and will be removed in
version 1.3. Use `loss='log_loss'` which is equivalent.

learning_rate : float, default=0.1
The learning rate, also known as *shrinkage*. This is used as a
multiplicative factor for the leaves values. Use ``1`` for no
@@ -1617,11 +1631,17 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):
1.0
"""

_VALID_LOSSES = ("binary_crossentropy", "categorical_crossentropy", "auto")
# TODO(1.3): Remove "binary_crossentropy", "categorical_crossentropy", "auto"
_VALID_LOSSES = (
"log_loss",
"binary_crossentropy",
"categorical_crossentropy",
"auto",
)

def __init__(
self,
loss="auto",
loss="log_loss",
*,
learning_rate=0.1,
max_iter=100,
@@ -1798,33 +1818,37 @@ def _encode_y(self, y):
return encoded_y

def _get_loss(self, sample_weight):
if self.loss == "auto":
# TODO(1.3): Remove "auto", "binary_crossentropy", "categorical_crossentropy"
if self.loss in ("auto", "binary_crossentropy", "categorical_crossentropy"):
warnings.warn(
f"The loss '{self.loss}' was deprecated in v1.1 and will be removed in "
"version 1.3. Use 'log_loss' which is equivalent.",
FutureWarning,
)

if self.loss in ("log_loss", "auto"):
if self.n_trees_per_iteration_ == 1:
return _LOSSES["binary_crossentropy"](sample_weight=sample_weight)
return HalfBinomialLoss(sample_weight=sample_weight)
else:
return _LOSSES["categorical_crossentropy"](
sample_weight=sample_weight,
n_classes=self.n_trees_per_iteration_,
return HalfMultinomialLoss(
sample_weight=sample_weight, n_classes=self.n_trees_per_iteration_
)

if self.loss == "categorical_crossentropy":
if self.n_trees_per_iteration_ == 1:
raise ValueError(
"loss='categorical_crossentropy' is not suitable for "
"a binary classification problem. Please use "
"loss='auto' or loss='binary_crossentropy' instead."
f"loss='{self.loss}' is not suitable for a binary classification "
"problem. Please use loss='log_loss' instead."
)
else:
return _LOSSES[self.loss](
return HalfMultinomialLoss(
sample_weight=sample_weight, n_classes=self.n_trees_per_iteration_
)
else:
if self.loss == "binary_crossentropy":
if self.n_trees_per_iteration_ > 1:
raise ValueError(
"loss='binary_crossentropy' is not defined for multiclass"
" classification with n_classes="
f"{self.n_trees_per_iteration_}, use loss="
"'categorical_crossentropy' instead."
f"loss='{self.loss}' is not defined for multiclass "
f"classification with n_classes={self.n_trees_per_iteration_}, "
"use loss='log_loss' instead."
)
else:
return _LOSSES[self.loss](sample_weight=sample_weight)
return HalfBinomialLoss(sample_weight=sample_weight)
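
An editor's illustration of the inverse link functions named in the new docstring above (expit for the binary case, softmax for the multiclass case); the raw scores are made up:

import numpy as np
from scipy.special import expit, softmax

raw_binary = np.array([-1.2, 0.3, 2.0])    # one raw score per sample (one tree per iteration)
p_pos = expit(raw_binary)                  # predicted probability of the positive class

raw_multi = np.array([[0.1, 1.5, -0.3]])   # one raw score per class for a single sample
p_all = softmax(raw_multi, axis=1)         # class probabilities, each row sums to 1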
@@ -124,15 +124,17 @@ def test_same_predictions_classification(
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

est_sklearn = HistGradientBoostingClassifier(
loss="binary_crossentropy",
loss="log_loss",
max_iter=max_iter,
max_bins=max_bins,
learning_rate=1,
early_stopping=False,
min_samples_leaf=min_samples_leaf,
max_leaf_nodes=max_leaf_nodes,
)
est_lightgbm = get_equivalent_estimator(est_sklearn, lib="lightgbm")
est_lightgbm = get_equivalent_estimator(
est_sklearn, lib="lightgbm", n_classes=n_classes
)

est_lightgbm.fit(X_train, y_train)
est_sklearn.fit(X_train, y_train)
@@ -198,7 +200,7 @@ def test_same_predictions_multiclass_classification(
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

est_sklearn = HistGradientBoostingClassifier(
loss="categorical_crossentropy",
loss="log_loss",
max_iter=max_iter,
max_bins=max_bins,
learning_rate=lr,