diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
index eef1135855560..f4e5557921bde 100644
--- a/doc/whats_new/v1.1.rst
+++ b/doc/whats_new/v1.1.rst
@@ -214,6 +214,11 @@ Changelog
 :mod:`sklearn.ensemble`
 .......................
 
+- |MajorFeature| Added the option `loss="quantile"` to
+  :class:`ensemble.HistGradientBoostingRegressor` for modelling quantiles.
+  The quantile level can be specified with the new parameter `quantile`.
+  :pr:`21800` and :pr:`20567` by :user:`Christian Lorentzen <lorentzenchr>`.
+
 - |Efficiency| :meth:`fit` of :class:`ensemble.BaseGradientBoosting` now
   calls :func:`check_array` with parameter `force_all_finite=False` for
   non initial warm-start runs as it has already been checked before.
diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
index e7388f62568c8..fb7518ec60d74 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -15,6 +15,7 @@
     HalfMultinomialLoss,
     HalfPoissonLoss,
     HalfSquaredError,
+    PinballLoss,
 )
 from ...base import BaseEstimator, RegressorMixin, ClassifierMixin, is_classifier
 from ...utils import check_random_state, resample
@@ -42,6 +43,7 @@
     "least_squares": HalfSquaredError,
     "least_absolute_deviation": AbsoluteError,
     "poisson": HalfPoissonLoss,
+    "quantile": PinballLoss,
     "binary_crossentropy": HalfBinomialLoss,
     "categorical_crossentropy": HalfMultinomialLoss,
 }
@@ -1115,17 +1117,21 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
 
     Parameters
     ----------
-    loss : {'squared_error', 'absolute_error', 'poisson'}, \
+    loss : {'squared_error', 'absolute_error', 'poisson', 'quantile'}, \
             default='squared_error'
         The loss function to use in the boosting process. Note that the
         "squared error" and "poisson" losses actually implement
         "half least squares loss" and "half poisson deviance" to simplify the
         computation of the gradient. Furthermore, "poisson" loss internally
         uses a log-link and requires ``y >= 0``.
+        "quantile" uses the pinball loss.
 
         .. versionchanged:: 0.23
            Added option 'poisson'.
 
+        .. versionchanged:: 1.1
+           Added option 'quantile'.
+
         .. deprecated:: 1.0
             The loss 'least_squares' was deprecated in v1.0 and will be removed
             in version 1.2. Use `loss='squared_error'` which is equivalent.
@@ -1135,6 +1141,9 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
             be removed in version 1.2. Use `loss='absolute_error'` which is
             equivalent.
 
+    quantile : float, default=None
+        If loss is "quantile", this parameter specifies which quantile is
+        estimated; it must be between 0 and 1.
     learning_rate : float, default=0.1
         The learning rate, also known as *shrinkage*. This is used as a
         multiplicative factor for the leaves values. Use ``1`` for no
@@ -1294,12 +1303,14 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
         "absolute_error",
         "least_absolute_deviation",
         "poisson",
+        "quantile",
     )
 
     def __init__(
         self,
         loss="squared_error",
         *,
+        quantile=None,
         learning_rate=0.1,
         max_iter=100,
         max_leaf_nodes=31,
@@ -1338,6 +1349,7 @@ def __init__(
             verbose=verbose,
             random_state=random_state,
         )
+        self.quantile = quantile
 
     def predict(self, X):
         """Predict values for X.
@@ -1409,7 +1421,12 @@ def _get_loss(self, sample_weight):
             )
             return _LOSSES["absolute_error"](sample_weight=sample_weight)
 
-        return _LOSSES[self.loss](sample_weight=sample_weight)
+        if self.loss == "quantile":
+            return _LOSSES[self.loss](
+                sample_weight=sample_weight, quantile=self.quantile
+            )
+        else:
+            return _LOSSES[self.loss](sample_weight=sample_weight)
 
 
 class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):
diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
index c3c4816044d3f..431876423655e 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
@@ -7,6 +7,7 @@
     HalfMultinomialLoss,
     HalfPoissonLoss,
     HalfSquaredError,
+    PinballLoss,
 )
 from sklearn.datasets import make_classification, make_regression
 from sklearn.datasets import make_low_rank_matrix
@@ -35,6 +36,7 @@
     "squared_error": HalfSquaredError,
     "absolute_error": AbsoluteError,
     "poisson": HalfPoissonLoss,
+    "quantile": PinballLoss,
     "binary_crossentropy": HalfBinomialLoss,
     "categorical_crossentropy": HalfMultinomialLoss,
 }
@@ -249,6 +251,44 @@ def test_absolute_error_sample_weight():
         gbdt.fit(X, y, sample_weight=sample_weight)
 
 
+@pytest.mark.parametrize("quantile", [0.2, 0.5, 0.8])
+def test_asymmetric_error(quantile):
+    """Test quantile regression for asymmetrically distributed targets."""
+    n_samples = 10_000
+    rng = np.random.RandomState(42)
+    # take care that X @ coef + intercept > 0
+    X = np.concatenate(
+        (
+            np.abs(rng.randn(n_samples)[:, None]),
+            -rng.randint(2, size=(n_samples, 1)),
+        ),
+        axis=1,
+    )
+    intercept = 1.23
+    coef = np.array([0.5, -2])
+    # For an exponential distribution with rate lambda, e.g. exp(-lambda * x),
+    # the quantile at level q is:
+    #   quantile(q) = - log(1 - q) / lambda
+    #   scale = 1/lambda = -quantile(q) / log(1-q)
+    y = rng.exponential(
+        scale=-(X @ coef + intercept) / np.log(1 - quantile), size=n_samples
+    )
+    model = HistGradientBoostingRegressor(
+        loss="quantile",
+        quantile=quantile,
+        max_iter=25,
+        random_state=0,
+        max_leaf_nodes=10,
+    ).fit(X, y)
+    assert_allclose(np.mean(model.predict(X) > y), quantile, rtol=1e-2)
+
+    pinball_loss = PinballLoss(quantile=quantile)
+    loss_true_quantile = pinball_loss(y, X @ coef + intercept)
+    loss_pred_quantile = pinball_loss(y, model.predict(X))
+    # we are overfitting
+    assert loss_pred_quantile <= loss_true_quantile
+
+
 @pytest.mark.parametrize("y", [([1.0, -2.0, 0.0]), ([0.0, 0.0, 0.0])])
 def test_poisson_y_positive(y):
     # Test that ValueError is raised if either one y_i < 0 or sum(y_i) <= 0.
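
Taken together, the hunks above expose quantile regression through the public API: pass `loss="quantile"` together with a `quantile` level between 0 and 1. A minimal usage sketch follows; only `loss` and `quantile` come from this patch, while the heteroscedastic toy data and variable names are illustrative assumptions, and running it requires a scikit-learn build that includes this change::

    import numpy as np
    from sklearn.ensemble import HistGradientBoostingRegressor

    rng = np.random.RandomState(0)

    # Heteroscedastic toy data: the noise level grows with x, so different
    # quantile levels give clearly separated prediction curves.
    X = rng.uniform(0, 10, size=(5_000, 1))
    y = np.sin(X[:, 0]) + rng.normal(scale=0.1 + 0.1 * X[:, 0], size=5_000)

    # One model per quantile level; `loss` and `quantile` are the new options.
    models = {
        q: HistGradientBoostingRegressor(
            loss="quantile", quantile=q, random_state=0
        ).fit(X, y)
        for q in (0.1, 0.5, 0.9)
    }

    # Roughly a fraction q of the targets should fall below the predicted
    # q-quantile.
    for q, model in models.items():
        coverage = np.mean(y <= model.predict(X))
        print(f"q={q:.1f}: empirical coverage {coverage:.3f}")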
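The docstring above states only that `"quantile"` uses the pinball loss. For intuition, here is a minimal NumPy sketch of the per-sample loss; the `pinball` helper is hypothetical and not scikit-learn's `PinballLoss` implementation, but it computes the same quantity: underestimation is weighted by `quantile`, overestimation by `1 - quantile`, which is why the minimizer is the q-quantile (the property `test_asymmetric_error` relies on)::

    import numpy as np

    def pinball(y_true, y_pred, quantile):
        # Per-sample pinball loss: q * (y - pred) when under-predicting,
        # (1 - q) * (pred - y) when over-predicting.
        diff = y_true - y_pred
        return np.where(diff >= 0, quantile * diff, (quantile - 1) * diff)

    # Minimizing the average pinball loss over a constant prediction recovers
    # the empirical q-quantile, e.g. for Exp(1) data and q = 0.8 the minimizer
    # is close to -log(1 - 0.8), about 1.609.
    rng = np.random.RandomState(0)
    y = rng.exponential(size=10_000)
    grid = np.linspace(0.0, 5.0, 501)
    best = grid[np.argmin([pinball(y, c, 0.8).mean() for c in grid])]
    print(best, np.quantile(y, 0.8))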