FEA add quantile HGBT #21800


Merged · 8 commits · Feb 22, 2022
5 changes: 5 additions & 0 deletions doc/whats_new/v1.1.rst
@@ -214,6 +214,11 @@ Changelog
:mod:`sklearn.ensemble`
.......................

- |MajorFeature| Added the option `loss="quantile"` to
:class:`ensemble.HistGradientBoostingRegressor` for modelling quantiles.
The quantile level can be specified with the new parameter `quantile`.
:pr:`21800` and :pr:`20567` by :user:`Christian Lorentzen <lorentzenchr>`.
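
A minimal usage sketch of the new option (the data and settings here are illustrative, not part of this PR):

```python
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor

# Illustrative heteroscedastic data: noise level grows with the feature.
rng = np.random.RandomState(0)
X = rng.uniform(0, 10, size=(1000, 1))
y = np.sin(X[:, 0]) + rng.normal(scale=0.1 + 0.05 * X[:, 0])

# New in this PR: minimize the pinball loss at the 90% quantile level.
model = HistGradientBoostingRegressor(loss="quantile", quantile=0.9)
model.fit(X, y)

# Roughly 90% of the training targets should lie below the predictions.
print(np.mean(model.predict(X) >= y))
```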

- |Efficiency| :meth:`fit` of :class:`ensemble.BaseGradientBoosting` now
calls :func:`check_array` with parameter `force_all_finite=False` for
non-initial warm-start runs, as the data has already been checked before.
21 changes: 19 additions & 2 deletions sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -15,6 +15,7 @@
HalfMultinomialLoss,
HalfPoissonLoss,
HalfSquaredError,
PinballLoss,
)
from ...base import BaseEstimator, RegressorMixin, ClassifierMixin, is_classifier
from ...utils import check_random_state, resample
@@ -42,6 +43,7 @@
"least_squares": HalfSquaredError,
"least_absolute_deviation": AbsoluteError,
"poisson": HalfPoissonLoss,
"quantile": PinballLoss,
"binary_crossentropy": HalfBinomialLoss,
"categorical_crossentropy": HalfMultinomialLoss,
}
@@ -1115,17 +1117,21 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):

Parameters
----------
loss : {'squared_error', 'absolute_error', 'poisson'}, \
loss : {'squared_error', 'absolute_error', 'poisson', 'quantile'}, \
default='squared_error'
The loss function to use in the boosting process. Note that the
"squared error" and "poisson" losses actually implement
"half least squares loss" and "half poisson deviance" to simplify the
computation of the gradient. Furthermore, "poisson" loss internally
uses a log-link and requires ``y >= 0``.
"quantile" uses the pinball loss.

.. versionchanged:: 0.23
Added option 'poisson'.

.. versionchanged:: 1.1
Added option 'quantile'.

.. deprecated:: 1.0
The loss 'least_squares' was deprecated in v1.0 and will be removed
in version 1.2. Use `loss='squared_error'` which is equivalent.
@@ -1135,6 +1141,9 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
be removed in version 1.2. Use `loss='absolute_error'` which is
equivalent.

quantile : float, default=None
If loss is "quantile", this parameter specifies the quantile to be
estimated; it must be between 0 and 1.
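
For reference, the pinball loss behind `loss="quantile"` weights under- and over-predictions asymmetrically; a minimal NumPy sketch of the formula (the actual `PinballLoss` in `sklearn._loss` additionally provides gradients, hessians and sample weights):

```python
import numpy as np

def pinball_loss(y_true, y_pred, quantile):
    # Under-predictions (y_true > y_pred) cost `quantile` per unit of error,
    # over-predictions cost `1 - quantile`; at quantile=0.5 this is half the
    # absolute error, so the minimizer is the conditional quantile.
    diff = y_true - y_pred
    return np.mean(np.where(diff >= 0, quantile * diff, (quantile - 1.0) * diff))
```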
learning_rate : float, default=0.1
The learning rate, also known as *shrinkage*. This is used as a
multiplicative factor for the leaves values. Use ``1`` for no
@@ -1294,12 +1303,14 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
"absolute_error",
"least_absolute_deviation",
"poisson",
"quantile",
)

def __init__(
self,
loss="squared_error",
*,
quantile=None,
learning_rate=0.1,
max_iter=100,
max_leaf_nodes=31,
@@ -1338,6 +1349,7 @@ def __init__(
verbose=verbose,
random_state=random_state,
)
self.quantile = quantile

def predict(self, X):
"""Predict values for X.
@@ -1409,7 +1421,12 @@ def _get_loss(self, sample_weight):
)
return _LOSSES["absolute_error"](sample_weight=sample_weight)

return _LOSSES[self.loss](sample_weight=sample_weight)
if self.loss == "quantile":
return _LOSSES[self.loss](
sample_weight=sample_weight, quantile=self.quantile
)
else:
return _LOSSES[self.loss](sample_weight=sample_weight)
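
Since `quantile` is only consumed when `loss="quantile"`, obtaining several quantiles means fitting one estimator per level; a hedged sketch of an 80% prediction interval built that way (data and numbers are illustrative):

```python
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor

rng = np.random.RandomState(0)
X = rng.uniform(0, 10, size=(500, 1))
y = X[:, 0] + rng.normal(scale=1.0, size=500)

# One model per quantile level; together they bound an ~80% interval.
models = {
    q: HistGradientBoostingRegressor(loss="quantile", quantile=q).fit(X, y)
    for q in (0.1, 0.9)
}
low = models[0.1].predict(X)
high = models[0.9].predict(X)
print(np.mean((low <= y) & (y <= high)))  # close to 0.8 on training data
```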


class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):
sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
@@ -7,6 +7,7 @@
HalfMultinomialLoss,
HalfPoissonLoss,
HalfSquaredError,
PinballLoss,
)
from sklearn.datasets import make_classification, make_regression
from sklearn.datasets import make_low_rank_matrix
@@ -35,6 +36,7 @@
"squared_error": HalfSquaredError,
"absolute_error": AbsoluteError,
"poisson": HalfPoissonLoss,
"quantile": PinballLoss,
"binary_crossentropy": HalfBinomialLoss,
"categorical_crossentropy": HalfMultinomialLoss,
}
@@ -249,6 +251,44 @@ def test_absolute_error_sample_weight():
gbdt.fit(X, y, sample_weight=sample_weight)


@pytest.mark.parametrize("quantile", [0.2, 0.5, 0.8])
def test_asymmetric_error(quantile):
"""Test quantile regression for asymmetric distributed targets."""
n_samples = 10_000
rng = np.random.RandomState(42)
# take care that X @ coef + intercept > 0
X = np.concatenate(
(
np.abs(rng.randn(n_samples)[:, None]),
-rng.randint(2, size=(n_samples, 1)),
),
axis=1,
)
intercept = 1.23
coef = np.array([0.5, -2])
# For an exponential distribution with rate lambda, e.g. exp(-lambda * x),
# the quantile at level q is:
# quantile(q) = - log(1 - q) / lambda
# scale = 1/lambda = -quantile(q) / log(1-q)
y = rng.exponential(
scale=-(X @ coef + intercept) / np.log(1 - quantile), size=n_samples
)
model = HistGradientBoostingRegressor(
loss="quantile",
quantile=quantile,
max_iter=25,
random_state=0,
max_leaf_nodes=10,
).fit(X, y)
assert_allclose(np.mean(model.predict(X) > y), quantile, rtol=1e-2)

pinball_loss = PinballLoss(quantile=quantile)
loss_true_quantile = pinball_loss(y, X @ coef + intercept)
loss_pred_quantile = pinball_loss(y, model.predict(X))
# we are overfitting
assert loss_pred_quantile <= loss_true_quantile
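
The scale used in the test above follows from the exponential CDF `F(x) = 1 - exp(-x / scale)`, which gives `quantile(q) = -log(1 - q) * scale`; a standalone numeric check of that identity (not part of the test suite):

```python
import numpy as np

rng = np.random.default_rng(0)
q, target_quantile = 0.8, 1.5               # desired level and quantile value
scale = -target_quantile / np.log(1 - q)    # scale = 1 / lambda
samples = rng.exponential(scale=scale, size=200_000)
print(np.quantile(samples, q))              # ~1.5, matching target_quantile
```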


@pytest.mark.parametrize("y", [([1.0, -2.0, 0.0]), ([0.0, 0.0, 0.0])])
def test_poisson_y_positive(y):
# Test that ValueError is raised if either one y_i < 0 or sum(y_i) <= 0.