diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 576f88e66a71e..2f7e7590a1c6b 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -262,6 +262,13 @@ Changelog :mod:`sklearn.ensemble` ....................... +- |Enhancement| :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and + :class:`~sklearn.ensemble.HistGradientBoostingRegressor` take cgroups quotas + into account when deciding the number of threads used by OpenMP. This + avoids performance problems caused by over-subscription when using those + classes in a docker container for instance. :pr:`20477` + by `Thomas Fan`_. + - |Fix| Do not allow to compute out-of-bag (OOB) score in :class:`ensemble.RandomForestClassifier` and :class:`ensemble.ExtraTreesClassifier` with multiclass-multioutput target diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx index fc564c9a126ab..5f5dd68935fd4 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -22,6 +22,7 @@ np.import_array() def _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, const unsigned char missing_values_bin_idx, + int n_threads, X_BINNED_DTYPE_C [::1, :] binned): """Bin continuous and categorical values to discrete integer-coded levels. @@ -35,6 +36,8 @@ def _map_to_bins(const X_DTYPE_C [:, :] data, binning_thresholds : list of arrays For each feature, stores the increasing numeric values that are used to separate the bins. + n_threads : int + Number of OpenMP threads to use. binned : ndarray, shape (n_samples, n_features) Output array, must be fortran aligned. """ @@ -45,12 +48,14 @@ def _map_to_bins(const X_DTYPE_C [:, :] data, _map_col_to_bins(data[:, feature_idx], binning_thresholds[feature_idx], missing_values_bin_idx, + n_threads, binned[:, feature_idx]) cdef void _map_col_to_bins(const X_DTYPE_C [:] data, const X_DTYPE_C [:] binning_thresholds, const unsigned char missing_values_bin_idx, + int n_threads, X_BINNED_DTYPE_C [:] binned): """Binary search to find the bin index for each value in the data.""" cdef: @@ -59,8 +64,8 @@ cdef void _map_col_to_bins(const X_DTYPE_C [:] data, int right int middle - for i in prange(data.shape[0], schedule='static', nogil=True): - + for i in prange(data.shape[0], schedule='static', nogil=True, + num_threads=n_threads): if isnan(data[i]): binned[i] = missing_values_bin_idx else: diff --git a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx index 18f1b6a365421..f684ca57e560d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx @@ -18,7 +18,9 @@ np.import_array() def _update_raw_predictions( Y_DTYPE_C [::1] raw_predictions, # OUT - grower): + grower, + n_threads, +): """Update raw_predictions with the predictions of the newest tree. 
This is equivalent to (and much faster than): @@ -42,7 +44,7 @@ def _update_raw_predictions( values = np.array([leaf.value for leaf in leaves], dtype=Y_DTYPE) _update_raw_predictions_helper(raw_predictions, starts, stops, partition, - values) + values, n_threads) cdef inline void _update_raw_predictions_helper( @@ -50,13 +52,16 @@ cdef inline void _update_raw_predictions_helper( const unsigned int [::1] starts, const unsigned int [::1] stops, const unsigned int [::1] partition, - const Y_DTYPE_C [::1] values): + const Y_DTYPE_C [::1] values, + int n_threads, +): cdef: unsigned int position int leaf_idx int n_leaves = starts.shape[0] - for leaf_idx in prange(n_leaves, schedule='static', nogil=True): + for leaf_idx in prange(n_leaves, schedule='static', nogil=True, + num_threads=n_threads): for position in range(starts[leaf_idx], stops[leaf_idx]): raw_predictions[partition[position]] += values[leaf_idx] diff --git a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx index 4114cd24aa8df..da900e28c6457 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx @@ -21,14 +21,16 @@ np.import_array() def _update_gradients_least_squares( G_H_DTYPE_C [::1] gradients, # OUT const Y_DTYPE_C [::1] y_true, # IN - const Y_DTYPE_C [::1] raw_predictions): # IN + const Y_DTYPE_C [::1] raw_predictions, # IN + int n_threads, # IN +): cdef: int n_samples int i n_samples = raw_predictions.shape[0] - for i in prange(n_samples, schedule='static', nogil=True): + for i in prange(n_samples, schedule='static', nogil=True, num_threads=n_threads): # Note: a more correct expression is 2 * (raw_predictions - y_true) # but since we use 1 for the constant hessian value (and not 2) this # is strictly equivalent for the leaves values. @@ -40,14 +42,16 @@ def _update_gradients_hessians_least_squares( G_H_DTYPE_C [::1] hessians, # OUT const Y_DTYPE_C [::1] y_true, # IN const Y_DTYPE_C [::1] raw_predictions, # IN - const Y_DTYPE_C [::1] sample_weight): # IN + const Y_DTYPE_C [::1] sample_weight, # IN + int n_threads, # IN +): cdef: int n_samples int i n_samples = raw_predictions.shape[0] - for i in prange(n_samples, schedule='static', nogil=True): + for i in prange(n_samples, schedule='static', nogil=True, num_threads=n_threads): # Note: a more correct exp is 2 * (raw_predictions - y_true) * sample_weight # but since we use 1 for the constant hessian value (and not 2) this # is strictly equivalent for the leaves values. 
@@ -60,14 +64,15 @@ def _update_gradients_hessians_least_absolute_deviation( G_H_DTYPE_C [::1] hessians, # OUT const Y_DTYPE_C [::1] y_true, # IN const Y_DTYPE_C [::1] raw_predictions, # IN - const Y_DTYPE_C [::1] sample_weight): # IN - + const Y_DTYPE_C [::1] sample_weight, # IN + int n_threads, # IN +): cdef: int n_samples int i n_samples = raw_predictions.shape[0] - for i in prange(n_samples, schedule='static', nogil=True): + for i in prange(n_samples, schedule='static', nogil=True, num_threads=n_threads): # gradient = sign(raw_predicition - y_pred) * sample_weight gradients[i] = sample_weight[i] * (2 * (y_true[i] - raw_predictions[i] < 0) - 1) @@ -77,14 +82,15 @@ def _update_gradients_hessians_least_absolute_deviation( def _update_gradients_least_absolute_deviation( G_H_DTYPE_C [::1] gradients, # OUT const Y_DTYPE_C [::1] y_true, # IN - const Y_DTYPE_C [::1] raw_predictions): # IN - + const Y_DTYPE_C [::1] raw_predictions, # IN + int n_threads, # IN +): cdef: int n_samples int i n_samples = raw_predictions.shape[0] - for i in prange(n_samples, schedule='static', nogil=True): + for i in prange(n_samples, schedule='static', nogil=True, num_threads=n_threads): # gradient = sign(raw_predicition - y_pred) gradients[i] = 2 * (y_true[i] - raw_predictions[i] < 0) - 1 @@ -94,8 +100,9 @@ def _update_gradients_hessians_poisson( G_H_DTYPE_C [::1] hessians, # OUT const Y_DTYPE_C [::1] y_true, # IN const Y_DTYPE_C [::1] raw_predictions, # IN - const Y_DTYPE_C [::1] sample_weight): # IN - + const Y_DTYPE_C [::1] sample_weight, # IN + int n_threads, # IN +): cdef: int n_samples int i @@ -103,14 +110,14 @@ def _update_gradients_hessians_poisson( n_samples = raw_predictions.shape[0] if sample_weight is None: - for i in prange(n_samples, schedule='static', nogil=True): + for i in prange(n_samples, schedule='static', nogil=True, num_threads=n_threads): # Note: We use only half of the deviance loss. Therefore, there is # no factor of 2. y_pred = exp(raw_predictions[i]) gradients[i] = (y_pred - y_true[i]) hessians[i] = y_pred else: - for i in prange(n_samples, schedule='static', nogil=True): + for i in prange(n_samples, schedule='static', nogil=True, num_threads=n_threads): # Note: We use only half of the deviance loss. Therefore, there is # no factor of 2. y_pred = exp(raw_predictions[i]) @@ -123,7 +130,9 @@ def _update_gradients_hessians_binary_crossentropy( G_H_DTYPE_C [::1] hessians, # OUT const Y_DTYPE_C [::1] y_true, # IN const Y_DTYPE_C [::1] raw_predictions, # IN - const Y_DTYPE_C [::1] sample_weight): # IN + const Y_DTYPE_C [::1] sample_weight, # IN + int n_threads, # IN +): cdef: int n_samples Y_DTYPE_C p_i # proba that ith sample belongs to positive class @@ -131,12 +140,12 @@ def _update_gradients_hessians_binary_crossentropy( n_samples = raw_predictions.shape[0] if sample_weight is None: - for i in prange(n_samples, schedule='static', nogil=True): + for i in prange(n_samples, schedule='static', nogil=True, num_threads=n_threads): p_i = _cexpit(raw_predictions[i]) gradients[i] = p_i - y_true[i] hessians[i] = p_i * (1. - p_i) else: - for i in prange(n_samples, schedule='static', nogil=True): + for i in prange(n_samples, schedule='static', nogil=True, num_threads=n_threads): p_i = _cexpit(raw_predictions[i]) gradients[i] = (p_i - y_true[i]) * sample_weight[i] hessians[i] = p_i * (1. 
- p_i) * sample_weight[i] @@ -147,7 +156,9 @@ def _update_gradients_hessians_categorical_crossentropy( G_H_DTYPE_C [:, ::1] hessians, # OUT const Y_DTYPE_C [::1] y_true, # IN const Y_DTYPE_C [:, ::1] raw_predictions, # IN - const Y_DTYPE_C [::1] sample_weight): # IN + const Y_DTYPE_C [::1] sample_weight, # IN + int n_threads, # IN +): cdef: int prediction_dim = raw_predictions.shape[0] int n_samples = raw_predictions.shape[1] @@ -160,7 +171,7 @@ def _update_gradients_hessians_categorical_crossentropy( Y_DTYPE_C p_i_k if sample_weight is None: - for i in prange(n_samples, schedule='static', nogil=True): + for i in prange(n_samples, schedule='static', nogil=True, num_threads=n_threads): # first compute softmaxes of sample i for each class for k in range(prediction_dim): p[i, k] = raw_predictions[k, i] # prepare softmax @@ -171,7 +182,7 @@ def _update_gradients_hessians_categorical_crossentropy( gradients[k, i] = p_i_k - (y_true[i] == k) hessians[k, i] = p_i_k * (1. - p_i_k) else: - for i in prange(n_samples, schedule='static', nogil=True): + for i in prange(n_samples, schedule='static', nogil=True, num_threads=n_threads): # first compute softmaxes of sample i for each class for k in range(prediction_dim): p[i, k] = raw_predictions[k, i] # prepare softmax diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx index d1bb98d82c936..a6b2f8b90de8e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -30,12 +30,14 @@ def _predict_from_raw_data( # raw data = non-binned data const BITSET_INNER_DTYPE_C [:, ::1] raw_left_cat_bitsets, const BITSET_INNER_DTYPE_C [:, ::1] known_cat_bitsets, const unsigned int [::1] f_idx_map, + int n_threads, Y_DTYPE_C [:] out): cdef: int i - for i in prange(numeric_data.shape[0], schedule='static', nogil=True): + for i in prange(numeric_data.shape[0], schedule='static', nogil=True, + num_threads=n_threads): out[i] = _predict_one_from_raw_data( nodes, numeric_data, raw_left_cat_bitsets, known_cat_bitsets, @@ -95,12 +97,14 @@ def _predict_from_binned_data( const X_BINNED_DTYPE_C [:, :] binned_data, BITSET_INNER_DTYPE_C [:, :] binned_left_cat_bitsets, const unsigned char missing_values_bin_idx, + int n_threads, Y_DTYPE_C [:] out): cdef: int i - for i in prange(binned_data.shape[0], schedule='static', nogil=True): + for i in prange(binned_data.shape[0], schedule='static', nogil=True, + num_threads=n_threads): out[i] = _predict_one_from_binned_data(nodes, binned_data, binned_left_cat_bitsets, i, diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index f8c1d3553e2c5..c76ee270b2270 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -12,6 +12,7 @@ from ...utils import check_random_state, check_array from ...base import BaseEstimator, TransformerMixin from ...utils.validation import check_is_fitted +from ...utils._openmp_helpers import _openmp_effective_n_threads from ._binning import _map_to_bins from .common import X_DTYPE, X_BINNED_DTYPE, ALMOST_INF, X_BITSET_INNER_DTYPE from ._bitset import set_bitset_memoryview @@ -115,6 +116,11 @@ class _BinMapper(TransformerMixin, BaseEstimator): Pass an int for reproducible output across multiple function calls. See :term: `Glossary `. + n_threads : int, default=None + Number of OpenMP threads to use. 
`_openmp_effective_n_threads` is called + to determine the effective number of threads to use, which takes cgroups CPU + quotas into account. See the docstring of `_openmp_effective_n_threads` + for details. Attributes ---------- @@ -151,12 +157,14 @@ def __init__( is_categorical=None, known_categories=None, random_state=None, + n_threads=None, ): self.n_bins = n_bins self.subsample = subsample self.is_categorical = is_categorical self.known_categories = known_categories self.random_state = random_state + self.n_threads = n_threads def fit(self, X, y=None): """Fit data X by computing the binning thresholds. @@ -264,8 +272,12 @@ def transform(self, X): "This estimator was fitted with {} features but {} got passed " "to transform()".format(self.n_bins_non_missing_.shape[0], X.shape[1]) ) + + n_threads = _openmp_effective_n_threads(self.n_threads) binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order="F") - _map_to_bins(X, self.bin_thresholds_, self.missing_values_bin_idx_, binned) + _map_to_bins( + X, self.bin_thresholds_, self.missing_values_bin_idx_, n_threads, binned + ) return binned def make_known_categories_bitsets(self): diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 09dcfa779e756..1a98326a36f89 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -14,6 +14,7 @@ check_consistent_length, _check_sample_weight, ) +from ...utils._openmp_helpers import _openmp_effective_n_threads from ...utils.multiclass import check_classification_targets from ...metrics import check_scoring from ...model_selection import train_test_split @@ -263,8 +264,14 @@ def fit(self, X, y, sample_weight=None): # data. self._in_fit = True + # `_openmp_effective_n_threads` is used to take cgroups CPU quotas + # into account when determining the maximum number of threads to use. + n_threads = _openmp_effective_n_threads() + if isinstance(self.loss, str): - self._loss = self._get_loss(sample_weight=sample_weight) + self._loss = self._get_loss( + sample_weight=sample_weight, n_threads=n_threads + ) elif isinstance(self.loss, BaseLoss): self._loss = self.loss @@ -328,6 +335,7 @@ def fit(self, X, y, sample_weight=None): is_categorical=self.is_categorical_, known_categories=known_categories, random_state=self._random_seed, + n_threads=n_threads, ) X_binned_train = self._bin_data(X_train, is_training_data=True) if X_val is not None: @@ -449,9 +457,11 @@ def fit(self, X, y, sample_weight=None): self.validation_score_ = self.validation_score_.tolist() # Compute raw predictions - raw_predictions = self._raw_predict(X_binned_train) + raw_predictions = self._raw_predict(X_binned_train, n_threads=n_threads) if self.do_early_stopping_ and self._use_validation_data: - raw_predictions_val = self._raw_predict(X_binned_val) + raw_predictions_val = self._raw_predict( + X_binned_val, n_threads=n_threads + ) else: raw_predictions_val = None @@ -510,6 +520,7 @@ def fit(self, X, y, sample_weight=None): min_samples_leaf=self.min_samples_leaf, l2_regularization=self.l2_regularization, shrinkage=self.learning_rate, + n_threads=n_threads, ) grower.grow() @@ -530,7 +541,7 @@ def fit(self, X, y, sample_weight=None): # Update raw_predictions with the predictions of the newly # created tree.
tic_pred = time() - _update_raw_predictions(raw_predictions[k, :], grower) + _update_raw_predictions(raw_predictions[k, :], grower, n_threads) toc_pred = time() acc_prediction_time += toc_pred - tic_pred @@ -541,7 +552,9 @@ def fit(self, X, y, sample_weight=None): if self._use_validation_data: for k, pred in enumerate(self._predictors[-1]): raw_predictions_val[k, :] += pred.predict_binned( - X_binned_val, self._bin_mapper.missing_values_bin_idx_ + X_binned_val, + self._bin_mapper.missing_values_bin_idx_, + n_threads, ) should_early_stop = self._check_early_stopping_loss( @@ -809,13 +822,18 @@ def _print_iteration_stats(self, iteration_start_time): print(log_msg) - def _raw_predict(self, X): + def _raw_predict(self, X, n_threads=None): """Return the sum of the leaves values over all predictors. Parameters ---------- X : array-like of shape (n_samples, n_features) The input samples. + n_threads : int, default=None + Number of OpenMP threads to use. `_openmp_effective_n_threads` is called + to determine the effective number of threads to use, which takes cgroups CPU + quotas into account. See the docstring of `_openmp_effective_n_threads` + for details. Returns ------- @@ -837,10 +855,17 @@ def _raw_predict(self, X): dtype=self._baseline_prediction.dtype, ) raw_predictions += self._baseline_prediction - self._predict_iterations(X, self._predictors, raw_predictions, is_binned) + + # We intentionally decouple the number of threads used at prediction + # time from the number of threads used at fit time because the model + # can be deployed on a different machine for prediction purposes. + n_threads = _openmp_effective_n_threads(n_threads) + self._predict_iterations( + X, self._predictors, raw_predictions, is_binned, n_threads + ) return raw_predictions - def _predict_iterations(self, X, predictors, raw_predictions, is_binned): + def _predict_iterations(self, X, predictors, raw_predictions, is_binned, n_threads): """Add the predictions of the predictors to raw_predictions.""" if not is_binned: ( @@ -853,13 +878,15 @@ def _predict_iterations(self, X, predictors, raw_predictions, is_binned): if is_binned: predict = partial( predictor.predict_binned, - missing_values_bin_idx=self._bin_mapper.missing_values_bin_idx_, # noqa + missing_values_bin_idx=self._bin_mapper.missing_values_bin_idx_, + n_threads=n_threads, ) else: predict = partial( predictor.predict, known_cat_bitsets=known_cat_bitsets, f_idx_map=f_idx_map, + n_threads=n_threads, ) raw_predictions[k, :] += predict(X) @@ -894,12 +921,18 @@ def _staged_raw_predict(self, X): dtype=self._baseline_prediction.dtype, ) raw_predictions += self._baseline_prediction + + # We intentionally decouple the number of threads used at prediction + # time from the number of threads used at fit time because the model + # can be deployed on a different machine for prediction purposes.
+ n_threads = _openmp_effective_n_threads() for iteration in range(len(self._predictors)): self._predict_iterations( X, self._predictors[iteration : iteration + 1], raw_predictions, is_binned=False, + n_threads=n_threads, ) yield raw_predictions.copy() @@ -949,7 +982,7 @@ def _more_tags(self): return {"allow_nan": True} @abstractmethod - def _get_loss(self, sample_weight): + def _get_loss(self, sample_weight, n_threads): pass @abstractmethod @@ -1241,7 +1274,7 @@ def _encode_y(self, y): ) return y - def _get_loss(self, sample_weight): + def _get_loss(self, sample_weight, n_threads): # TODO: Remove in v1.2 if self.loss == "least_squares": warnings.warn( @@ -1250,7 +1283,9 @@ def _get_loss(self, sample_weight): "equivalent.", FutureWarning, ) - return _LOSSES["squared_error"](sample_weight=sample_weight) + return _LOSSES["squared_error"]( + sample_weight=sample_weight, n_threads=n_threads + ) elif self.loss == "least_absolute_deviation": warnings.warn( "The loss 'least_absolute_deviation' was deprecated in v1.0 " @@ -1258,9 +1293,11 @@ def _get_loss(self, sample_weight): "which is equivalent.", FutureWarning, ) - return _LOSSES["absolute_error"](sample_weight=sample_weight) + return _LOSSES["absolute_error"]( + sample_weight=sample_weight, n_threads=n_threads + ) - return _LOSSES[self.loss](sample_weight=sample_weight) + return _LOSSES[self.loss](sample_weight=sample_weight, n_threads=n_threads) class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting): @@ -1610,7 +1647,7 @@ def _encode_y(self, y): encoded_y = encoded_y.astype(Y_DTYPE, copy=False) return encoded_y - def _get_loss(self, sample_weight): + def _get_loss(self, sample_weight, n_threads): if self.loss == "categorical_crossentropy" and self.n_trees_per_iteration_ == 1: raise ValueError( "'categorical_crossentropy' is not suitable for " @@ -1620,8 +1657,12 @@ def _get_loss(self, sample_weight): if self.loss == "auto": if self.n_trees_per_iteration_ == 1: - return _LOSSES["binary_crossentropy"](sample_weight=sample_weight) + return _LOSSES["binary_crossentropy"]( + sample_weight=sample_weight, n_threads=n_threads + ) else: - return _LOSSES["categorical_crossentropy"](sample_weight=sample_weight) + return _LOSSES["categorical_crossentropy"]( + sample_weight=sample_weight, n_threads=n_threads + ) - return _LOSSES[self.loss](sample_weight=sample_weight) + return _LOSSES[self.loss](sample_weight=sample_weight, n_threads=n_threads) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 650c38f3ee3aa..1733b5745f8a2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -20,6 +20,8 @@ from .common import Y_DTYPE from .common import MonotonicConstraint from ._bitset import set_raw_bitset_from_binned_bitset +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads + EPS = np.finfo(Y_DTYPE).eps # to avoid zero division errors @@ -175,6 +177,11 @@ class TreeGrower: shrinkage : float, default=1. The shrinkage parameter to apply to the leaves values, also known as learning rate. + n_threads : int, default=None + Number of OpenMP threads to use. `_openmp_effective_n_threads` is called + to determine the effective number of threads to use, which takes cgroups CPU + quotas into account. See the docstring of `_openmp_effective_n_threads` + for details.
""" def __init__( @@ -194,6 +201,7 @@ def __init__( l2_regularization=0.0, min_hessian_to_split=1e-3, shrinkage=1.0, + n_threads=None, ): self._validate_parameters( @@ -205,6 +213,7 @@ def __init__( l2_regularization, min_hessian_to_split, ) + n_threads = _openmp_effective_n_threads(n_threads) if n_bins_non_missing is None: n_bins_non_missing = n_bins - 1 @@ -257,7 +266,7 @@ def __init__( hessians_are_constant = hessians.shape[0] == 1 self.histogram_builder = HistogramBuilder( - X_binned, n_bins, gradients, hessians, hessians_are_constant + X_binned, n_bins, gradients, hessians, hessians_are_constant, n_threads ) missing_values_bin_idx = n_bins - 1 self.splitter = Splitter( @@ -272,6 +281,7 @@ def __init__( min_samples_leaf, min_gain_to_split, hessians_are_constant, + n_threads, ) self.n_bins_non_missing = n_bins_non_missing self.missing_values_bin_idx = missing_values_bin_idx @@ -286,6 +296,7 @@ def __init__( self.X_binned = X_binned self.min_gain_to_split = min_gain_to_split self.shrinkage = shrinkage + self.n_threads = n_threads self.splittable_nodes = [] self.finalized_leaves = [] self.total_find_split_time = 0.0 # time spent finding the best splits @@ -366,11 +377,11 @@ def _intilialize_root(self, gradients, hessians, hessians_are_constant): """Initialize root node and finalize it if needed.""" n_samples = self.X_binned.shape[0] depth = 0 - sum_gradients = sum_parallel(gradients) + sum_gradients = sum_parallel(gradients, self.n_threads) if self.histogram_builder.hessians_are_constant: sum_hessians = hessians[0] * n_samples else: - sum_hessians = sum_parallel(hessians) + sum_hessians = sum_parallel(hessians, self.n_threads) self.root = TreeNode( depth=depth, sample_indices=self.splitter.partition, diff --git a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index 625aecc4f09f5..068935230e900 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -85,11 +85,13 @@ cdef class HistogramBuilder: G_H_DTYPE_C [::1] ordered_gradients G_H_DTYPE_C [::1] ordered_hessians unsigned char hessians_are_constant + int n_threads def __init__(self, const X_BINNED_DTYPE_C [::1, :] X_binned, unsigned int n_bins, G_H_DTYPE_C [::1] gradients, G_H_DTYPE_C [::1] hessians, - unsigned char hessians_are_constant): + unsigned char hessians_are_constant, + int n_threads): self.X_binned = X_binned self.n_features = X_binned.shape[1] @@ -102,6 +104,7 @@ cdef class HistogramBuilder: self.ordered_gradients = gradients.copy() self.ordered_hessians = hessians.copy() self.hessians_are_constant = hessians_are_constant + self.n_threads = n_threads def compute_histograms_brute( HistogramBuilder self, @@ -137,6 +140,7 @@ cdef class HistogramBuilder: shape=(self.n_features, self.n_bins), dtype=HISTOGRAM_DTYPE ) + int n_threads = self.n_threads with nogil: n_samples = sample_indices.shape[0] @@ -146,14 +150,17 @@ cdef class HistogramBuilder: # cache hit. 
if sample_indices.shape[0] != gradients.shape[0]: if hessians_are_constant: - for i in prange(n_samples, schedule='static'): + for i in prange(n_samples, schedule='static', + num_threads=n_threads): ordered_gradients[i] = gradients[sample_indices[i]] else: - for i in prange(n_samples, schedule='static'): + for i in prange(n_samples, schedule='static', + num_threads=n_threads): ordered_gradients[i] = gradients[sample_indices[i]] ordered_hessians[i] = hessians[sample_indices[i]] - for feature_idx in prange(n_features, schedule='static'): + for feature_idx in prange(n_features, schedule='static', + num_threads=n_threads): # Compute histogram of each feature self._compute_histogram_brute_single_feature( feature_idx, sample_indices, histograms) @@ -238,8 +245,10 @@ cdef class HistogramBuilder: shape=(self.n_features, self.n_bins), dtype=HISTOGRAM_DTYPE ) + int n_threads = self.n_threads - for feature_idx in prange(n_features, schedule='static', nogil=True): + for feature_idx in prange(n_features, schedule='static', nogil=True, + num_threads=n_threads): # Compute histogram of each feature _subtract_histograms(feature_idx, self.n_bins, diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index d0bf2d969cf88..c5870f97f900e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -20,14 +20,16 @@ from ._loss import _update_gradients_hessians_binary_crossentropy from ._loss import _update_gradients_hessians_categorical_crossentropy from ._loss import _update_gradients_hessians_poisson +from ...utils._openmp_helpers import _openmp_effective_n_threads from ...utils.stats import _weighted_percentile class BaseLoss(ABC): """Base class for a loss.""" - def __init__(self, hessians_are_constant): + def __init__(self, hessians_are_constant, n_threads=None): self.hessians_are_constant = hessians_are_constant + self.n_threads = _openmp_effective_n_threads(n_threads) def __call__(self, y_true, raw_predictions, sample_weight): """Return the weighted average loss""" @@ -157,11 +159,13 @@ class LeastSquares(BaseLoss): with what is done in LightGBM). """ - def __init__(self, sample_weight): + def __init__(self, sample_weight, n_threads=None): # If sample weights are provided, the hessians and gradients # are multiplied by sample_weight, which means the hessians are # equal to sample weights. - super().__init__(hessians_are_constant=sample_weight is None) + super().__init__( + hessians_are_constant=sample_weight is None, n_threads=n_threads + ) def pointwise_loss(self, y_true, raw_predictions): # shape (1, n_samples) --> (n_samples,). 
reshape(-1) is more likely to @@ -185,11 +189,18 @@ def update_gradients_and_hessians( raw_predictions = raw_predictions.reshape(-1) gradients = gradients.reshape(-1) if sample_weight is None: - _update_gradients_least_squares(gradients, y_true, raw_predictions) + _update_gradients_least_squares( + gradients, y_true, raw_predictions, self.n_threads + ) else: hessians = hessians.reshape(-1) _update_gradients_hessians_least_squares( - gradients, hessians, y_true, raw_predictions, sample_weight + gradients, + hessians, + y_true, + raw_predictions, + sample_weight, + self.n_threads, ) @@ -201,11 +212,13 @@ class LeastAbsoluteDeviation(BaseLoss): loss(x_i) = |y_true_i - raw_pred_i| """ - def __init__(self, sample_weight): + def __init__(self, sample_weight, n_threads=None): # If sample weights are provided, the hessians and gradients # are multiplied by sample_weight, which means the hessians are # equal to sample weights. - super().__init__(hessians_are_constant=sample_weight is None) + super().__init__( + hessians_are_constant=sample_weight is None, n_threads=n_threads + ) # This variable indicates whether the loss requires the leaves values to # be updated once the tree has been trained. The trees are trained to @@ -243,12 +256,20 @@ def update_gradients_and_hessians( gradients = gradients.reshape(-1) if sample_weight is None: _update_gradients_least_absolute_deviation( - gradients, y_true, raw_predictions + gradients, + y_true, + raw_predictions, + self.n_threads, ) else: hessians = hessians.reshape(-1) _update_gradients_hessians_least_absolute_deviation( - gradients, hessians, y_true, raw_predictions, sample_weight + gradients, + hessians, + y_true, + raw_predictions, + sample_weight, + self.n_threads, ) def update_leaves_values(self, grower, y_true, raw_predictions, sample_weight): @@ -285,8 +306,8 @@ class Poisson(BaseLoss): the computation of the gradients. """ - def __init__(self, sample_weight): - super().__init__(hessians_are_constant=False) + def __init__(self, sample_weight, n_threads=None): + super().__init__(hessians_are_constant=False, n_threads=n_threads) inverse_link_function = staticmethod(np.exp) @@ -318,7 +339,12 @@ def update_gradients_and_hessians( gradients = gradients.reshape(-1) hessians = hessians.reshape(-1) _update_gradients_hessians_poisson( - gradients, hessians, y_true, raw_predictions, sample_weight + gradients, + hessians, + y_true, + raw_predictions, + sample_weight, + self.n_threads, ) @@ -334,8 +360,8 @@ class BinaryCrossEntropy(BaseLoss): section 4.4.1 (about logistic regression). """ - def __init__(self, sample_weight): - super().__init__(hessians_are_constant=False) + def __init__(self, sample_weight, n_threads=None): + super().__init__(hessians_are_constant=False, n_threads=n_threads) inverse_link_function = staticmethod(expit) @@ -370,7 +396,7 @@ def update_gradients_and_hessians( gradients = gradients.reshape(-1) hessians = hessians.reshape(-1) _update_gradients_hessians_binary_crossentropy( - gradients, hessians, y_true, raw_predictions, sample_weight + gradients, hessians, y_true, raw_predictions, sample_weight, self.n_threads ) def predict_proba(self, raw_predictions): @@ -391,8 +417,8 @@ class CategoricalCrossEntropy(BaseLoss): cross-entropy to more than 2 classes. 
""" - def __init__(self, sample_weight): - super().__init__(hessians_are_constant=False) + def __init__(self, sample_weight, n_threads=None): + super().__init__(hessians_are_constant=False, n_threads=n_threads) def pointwise_loss(self, y_true, raw_predictions): one_hot_true = np.zeros_like(raw_predictions) @@ -419,7 +445,7 @@ def update_gradients_and_hessians( self, gradients, hessians, y_true, raw_predictions, sample_weight ): _update_gradients_hessians_categorical_crossentropy( - gradients, hessians, y_true, raw_predictions, sample_weight + gradients, hessians, y_true, raw_predictions, sample_weight, self.n_threads ) def predict_proba(self, raw_predictions): diff --git a/sklearn/ensemble/_hist_gradient_boosting/predictor.py b/sklearn/ensemble/_hist_gradient_boosting/predictor.py index a356325356dc2..746fa34753121 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/predictor.py @@ -42,7 +42,7 @@ def get_max_depth(self): """Return maximum depth among all leaves.""" return int(self.nodes["depth"].max()) - def predict(self, X, known_cat_bitsets, f_idx_map): + def predict(self, X, known_cat_bitsets, f_idx_map, n_threads): """Predict raw values for non-binned data. Parameters @@ -57,6 +57,9 @@ def predict(self, X, known_cat_bitsets, f_idx_map): Map from original feature index to the corresponding index in the known_cat_bitsets array. + n_threads : int + Number of OpenMP threads to use. + Returns ------- y : ndarray, shape (n_samples,) @@ -64,11 +67,17 @@ def predict(self, X, known_cat_bitsets, f_idx_map): """ out = np.empty(X.shape[0], dtype=Y_DTYPE) _predict_from_raw_data( - self.nodes, X, self.raw_left_cat_bitsets, known_cat_bitsets, f_idx_map, out + self.nodes, + X, + self.raw_left_cat_bitsets, + known_cat_bitsets, + f_idx_map, + n_threads, + out, ) return out - def predict_binned(self, X, missing_values_bin_idx): + def predict_binned(self, X, missing_values_bin_idx, n_threads): """Predict raw values for binned data. Parameters @@ -79,6 +88,8 @@ def predict_binned(self, X, missing_values_bin_idx): Index of the bin that is used for missing values. This is the index of the last bin and is always equal to max_bins (as passed to the GBDT classes), or equivalently to n_bins - 1. + n_threads : int + Number of OpenMP threads to use. 
Returns ------- @@ -87,7 +98,12 @@ def predict_binned(self, X, missing_values_bin_idx): """ out = np.empty(X.shape[0], dtype=Y_DTYPE) _predict_from_binned_data( - self.nodes, X, self.binned_left_cat_bitsets, missing_values_bin_idx, out + self.nodes, + X, + self.binned_left_cat_bitsets, + missing_values_bin_idx, + n_threads, + out, ) return out diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 39884e5e6ee63..5ddba5cd02678 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -16,8 +16,6 @@ cimport cython from cython.parallel import prange import numpy as np cimport numpy as np -IF SKLEARN_OPENMP_PARALLELISM_ENABLED: - from openmp cimport omp_get_max_threads from libc.stdlib cimport malloc, free, qsort from libc.string cimport memcpy from numpy.math cimport INFINITY @@ -177,6 +175,7 @@ cdef class Splitter: unsigned int [::1] partition unsigned int [::1] left_indices_buffer unsigned int [::1] right_indices_buffer + int n_threads def __init__(self, const X_BINNED_DTYPE_C [::1, :] X_binned, @@ -189,7 +188,8 @@ cdef class Splitter: Y_DTYPE_C min_hessian_to_split=1e-3, unsigned int min_samples_leaf=20, Y_DTYPE_C min_gain_to_split=0., - unsigned char hessians_are_constant=False): + unsigned char hessians_are_constant=False, + unsigned int n_threads=1): self.X_binned = X_binned self.n_features = X_binned.shape[1] @@ -203,6 +203,7 @@ cdef class Splitter: self.min_samples_leaf = min_samples_leaf self.min_gain_to_split = min_gain_to_split self.hessians_are_constant = hessians_are_constant + self.n_threads = n_threads # The partition array maps each sample index into the leaves of the # tree (a leaf in this context is a node that isn't splitted yet, not @@ -306,10 +307,7 @@ cdef class Splitter: # split_info.left_cat_bitset directly, so we need a tmp var BITSET_INNER_DTYPE_C [:] cat_bitset_tmp = split_info.left_cat_bitset BITSET_DTYPE_C left_cat_bitset - IF SKLEARN_OPENMP_PARALLELISM_ENABLED: - int n_threads = omp_get_max_threads() - ELSE: - int n_threads = 1 + int n_threads = self.n_threads int [:] sizes = np.full(n_threads, n_samples // n_threads, dtype=np.int32) @@ -456,13 +454,15 @@ cdef class Splitter: const unsigned char [::1] has_missing_values = self.has_missing_values const unsigned char [::1] is_categorical = self.is_categorical const signed char [::1] monotonic_cst = self.monotonic_cst + int n_threads = self.n_threads with nogil: split_infos = malloc( self.n_features * sizeof(split_info_struct)) - for feature_idx in prange(n_features, schedule='static'): + for feature_idx in prange(n_features, schedule='static', + num_threads=n_threads): split_infos[feature_idx].feature_idx = feature_idx # For each feature, find best bin to split on diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 57403c3792571..7cbc6603ee01f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -10,6 +10,9 @@ from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE from sklearn.ensemble._hist_gradient_boosting.common import ALMOST_INF +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads + +n_threads = _openmp_effective_n_threads() DATA = ( @@ -93,7 +96,7 @@ def 
test_map_to_bins(max_bins): ] binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order="F") last_bin_idx = max_bins - _map_to_bins(DATA, bin_thresholds, last_bin_idx, binned) + _map_to_bins(DATA, bin_thresholds, last_bin_idx, n_threads, binned) assert binned.shape == DATA.shape assert binned.dtype == np.uint8 assert binned.flags.f_contiguous diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 517a96a77044e..79581525b50bb 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -21,6 +21,9 @@ from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper from sklearn.utils import shuffle +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads + +n_threads = _openmp_effective_n_threads() X_classification, y_classification = make_classification(random_state=0) @@ -693,7 +696,7 @@ def test_sum_hessians_are_sample_weight(loss_name): sample_weight = rng.normal(size=n_samples) - loss = _LOSSES[loss_name](sample_weight=sample_weight) + loss = _LOSSES[loss_name](sample_weight=sample_weight, n_threads=n_threads) gradients, hessians = loss.init_gradients_and_hessians( n_samples=n_samples, prediction_dim=1, sample_weight=sample_weight ) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index fe4568339a9ac..6ff30a5888fe3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -12,6 +12,9 @@ from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE from sklearn.ensemble._hist_gradient_boosting.common import X_BITSET_INNER_DTYPE +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads + +n_threads = _openmp_effective_n_threads() def _make_training_data(n_bins=256, constant_hessian=True): @@ -195,12 +198,14 @@ def test_predictor_from_grower(): dtype=np.uint8, ) missing_values_bin_idx = n_bins - 1 - predictions = predictor.predict_binned(input_data, missing_values_bin_idx) + predictions = predictor.predict_binned( + input_data, missing_values_bin_idx, n_threads + ) expected_targets = [1, 1, 1, 1, 1, 1, -1, -1, -1] assert np.allclose(predictions, expected_targets) # Check that training set can be recovered exactly: - predictions = predictor.predict_binned(X_binned, missing_values_bin_idx) + predictions = predictor.predict_binned(X_binned, missing_values_bin_idx, n_threads) assert np.allclose(predictions, -all_gradients) @@ -381,7 +386,7 @@ def test_missing_value_predict_only(): known_cat_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE) f_idx_map = np.zeros(0, dtype=np.uint32) - y_pred = predictor.predict(all_nans, known_cat_bitsets, f_idx_map) + y_pred = predictor.predict(all_nans, known_cat_bitsets, f_idx_map, n_threads) assert np.all(y_pred == prediction_main_path) @@ -409,6 +414,7 @@ def test_split_on_nan_with_infinite_values(): n_bins_non_missing=n_bins_non_missing, has_missing_values=has_missing_values, min_samples_leaf=1, + n_threads=n_threads, ) grower.grow() @@ -424,9 +430,11 @@ def test_split_on_nan_with_infinite_values(): # Make sure in particular that the +inf sample is mapped to the left child # Note that 
lightgbm "fails" here and will assign the inf sample to the # right child, even though it's a "split on nan" situation. - predictions = predictor.predict(X, known_cat_bitsets, f_idx_map) + predictions = predictor.predict(X, known_cat_bitsets, f_idx_map, n_threads) predictions_binned = predictor.predict_binned( - X_binned, missing_values_bin_idx=bin_mapper.missing_values_bin_idx_ + X_binned, + missing_values_bin_idx=bin_mapper.missing_values_bin_idx_, + n_threads=n_threads, ) np.testing.assert_allclose(predictions, -gradients) np.testing.assert_allclose(predictions_binned, -gradients) @@ -450,6 +458,7 @@ def test_grow_tree_categories(): shrinkage=1.0, min_samples_leaf=1, is_categorical=is_categorical, + n_threads=n_threads, ) grower.grow() assert grower.n_nodes == 3 @@ -485,7 +494,9 @@ def test_grow_tree_categories(): # make sure binned missing values are mapped to the left child during # prediction prediction_binned = predictor.predict_binned( - np.asarray([[6]]).astype(X_BINNED_DTYPE), missing_values_bin_idx=6 + np.asarray([[6]]).astype(X_BINNED_DTYPE), + missing_values_bin_idx=6, + n_threads=n_threads, ) assert_allclose(prediction_binned, [-1]) # negative gradient @@ -493,7 +504,9 @@ def test_grow_tree_categories(): # prediction known_cat_bitsets = np.zeros((1, 8), dtype=np.uint32) # ignored anyway f_idx_map = np.array([0], dtype=np.uint32) - prediction = predictor.predict(np.array([[np.nan]]), known_cat_bitsets, f_idx_map) + prediction = predictor.predict( + np.array([[np.nan]]), known_cat_bitsets, f_idx_map, n_threads + ) assert_allclose(prediction, [-1]) @@ -535,14 +548,18 @@ def test_ohe_equivalence(min_samples_leaf, n_unique_categories, target): predictor = grower.make_predictor( binning_thresholds=np.zeros((1, n_unique_categories)) ) - preds = predictor.predict_binned(X_binned, missing_values_bin_idx=255) + preds = predictor.predict_binned( + X_binned, missing_values_bin_idx=255, n_threads=n_threads + ) grower_ohe = TreeGrower(X_ohe, gradients, hessians, **grower_params) grower_ohe.grow() predictor_ohe = grower_ohe.make_predictor( binning_thresholds=np.zeros((X_ohe.shape[1], n_unique_categories)) ) - preds_ohe = predictor_ohe.predict_binned(X_ohe, missing_values_bin_idx=255) + preds_ohe = predictor_ohe.predict_binned( + X_ohe, missing_values_bin_idx=255, n_threads=n_threads + ) assert predictor.get_max_depth() <= predictor_ohe.get_max_depth() if target == "binary" and n_unique_categories > 2: diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index 9081471477691..813163802f956 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -11,6 +11,9 @@ from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE from sklearn.utils._testing import skip_if_32bit +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads + +n_threads = _openmp_effective_n_threads() def get_derivatives_helper(loss): @@ -134,7 +137,7 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): else: y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE) raw_predictions = rng.normal(size=(prediction_dim, n_samples)).astype(Y_DTYPE) - loss = _LOSSES[loss](sample_weight=None) + loss = _LOSSES[loss](sample_weight=None, n_threads=n_threads) get_gradients, get_hessians = get_derivatives_helper(loss) # only take gradients and hessians of 
first tree / class. @@ -297,7 +300,7 @@ def test_sample_weight_multiplies_gradients(loss, problem, sample_weight): else: sample_weight = rng.normal(size=n_samples).astype(Y_DTYPE) - loss_ = _LOSSES[loss](sample_weight=sample_weight) + loss_ = _LOSSES[loss](sample_weight=sample_weight, n_threads=n_threads) baseline_prediction = loss_.get_baseline_prediction(y_true, None, prediction_dim) raw_predictions = np.zeros( diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py index fa9496d386fd8..4ab65c55a8620 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py @@ -12,6 +12,9 @@ from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads + +n_threads = _openmp_effective_n_threads() def is_increasing(a): @@ -295,7 +298,7 @@ def test_bounded_value_min_gain_to_split(): hessians_are_constant = False builder = HistogramBuilder( - X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant + X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads ) n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py index f0227969ae366..e3c725965f550 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py @@ -20,6 +20,9 @@ set_bitset_memoryview, set_raw_bitset_from_binned_bitset, ) +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads + +n_threads = _openmp_effective_n_threads() @pytest.mark.parametrize("n_bins", [200, 256]) @@ -54,10 +57,10 @@ def test_regression_dataset(n_bins): known_cat_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE) f_idx_map = np.zeros(0, dtype=np.uint32) - y_pred_train = predictor.predict(X_train, known_cat_bitsets, f_idx_map) + y_pred_train = predictor.predict(X_train, known_cat_bitsets, f_idx_map, n_threads) assert r2_score(y_train, y_pred_train) > 0.82 - y_pred_test = predictor.predict(X_test, known_cat_bitsets, f_idx_map) + y_pred_test = predictor.predict(X_test, known_cat_bitsets, f_idx_map, n_threads) assert r2_score(y_test, y_pred_test) > 0.67 @@ -101,7 +104,7 @@ def test_infinite_values_and_thresholds(num_threshold, expected_predictions): f_idx_map = np.zeros(0, dtype=np.uint32) predictor = TreePredictor(nodes, binned_cat_bitsets, raw_categorical_bitsets) - predictions = predictor.predict(X, known_cat_bitset, f_idx_map) + predictions = predictor.predict(X, known_cat_bitset, f_idx_map, n_threads) assert np.all(predictions == expected_predictions) @@ -151,7 +154,9 @@ def test_categorical_predictor(bins_go_left, expected_predictions): predictor = TreePredictor(nodes, binned_cat_bitsets, raw_categorical_bitsets) # Check binned data gives correct predictions - prediction_binned = predictor.predict_binned(X_binned, missing_values_bin_idx=6) + prediction_binned = predictor.predict_binned( + X_binned, missing_values_bin_idx=6, n_threads=n_threads + ) 
assert_allclose(prediction_binned, expected_predictions) # manually construct bitset @@ -161,17 +166,22 @@ def test_categorical_predictor(bins_go_left, expected_predictions): # Check with un-binned data predictions = predictor.predict( - categories.reshape(-1, 1), known_cat_bitsets, f_idx_map + categories.reshape(-1, 1), known_cat_bitsets, f_idx_map, n_threads ) assert_allclose(predictions, expected_predictions) # Check missing goes left because missing_values_bin_idx=6 X_binned_missing = np.array([[6]], dtype=X_BINNED_DTYPE).T - predictions = predictor.predict_binned(X_binned_missing, missing_values_bin_idx=6) + predictions = predictor.predict_binned( + X_binned_missing, missing_values_bin_idx=6, n_threads=n_threads + ) assert_allclose(predictions, [1]) # missing and unknown go left predictions = predictor.predict( - np.array([[np.nan, 17]], dtype=X_DTYPE).T, known_cat_bitsets, f_idx_map + np.array([[np.nan, 17]], dtype=X_DTYPE).T, + known_cat_bitsets, + f_idx_map, + n_threads, ) assert_allclose(predictions, [1, 1]) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index aa7befe90211e..0d19bdc6df72b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -12,6 +12,9 @@ ) from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder from sklearn.utils._testing import skip_if_32bit +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads + +n_threads = _openmp_effective_n_threads() @pytest.mark.parametrize("n_bins", [3, 32, 256]) @@ -40,7 +43,12 @@ def test_histogram_split(n_bins): sum_gradients = all_gradients.sum() builder = HistogramBuilder( - X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant + X_binned, + n_bins, + all_gradients, + all_hessians, + hessians_are_constant, + n_threads, ) n_bins_non_missing = np.array( [n_bins - 1] * X_binned.shape[1], dtype=np.uint32 @@ -120,7 +128,7 @@ def test_gradient_and_hessian_sanity(constant_hessian): sum_hessians = all_hessians.sum() builder = HistogramBuilder( - X_binned, n_bins, all_gradients, all_hessians, constant_hessian + X_binned, n_bins, all_gradients, all_hessians, constant_hessian, n_threads ) n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) @@ -263,7 +271,7 @@ def test_split_indices(): hessians_are_constant = True builder = HistogramBuilder( - X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant + X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads ) n_bins_non_missing = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) @@ -339,7 +347,7 @@ def test_min_gain_to_split(): hessians_are_constant = False builder = HistogramBuilder( - X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant + X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads ) n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) @@ -508,7 +516,7 @@ def test_splitting_missing_values( hessians_are_constant = True builder = HistogramBuilder( - X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant + X_binned, n_bins, all_gradients, all_hessians, 
hessians_are_constant, n_threads ) n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32) @@ -612,7 +620,7 @@ def test_splitting_categorical_cat_smooth( hessians_are_constant = True builder = HistogramBuilder( - X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant + X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads ) n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32) @@ -798,7 +806,7 @@ def test_splitting_categorical_sanity( hessians_are_constant = True builder = HistogramBuilder( - X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant + X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads ) n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32) diff --git a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx index 3b323b3e298b8..77b3101cdb656 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx @@ -143,13 +143,14 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): return CatBoostRegressor(**catboost_params) -def sum_parallel(G_H_DTYPE_C [:] array): +def sum_parallel(G_H_DTYPE_C [:] array, int n_threads): cdef: Y_DTYPE_C out = 0. int i = 0 - for i in prange(array.shape[0], schedule='static', nogil=True): + for i in prange(array.shape[0], schedule='static', nogil=True, + num_threads=n_threads): out += array[i] return out
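The pattern this diff applies throughout the module is to resolve the effective OpenMP thread count once, in Python-level code, via the private helper `sklearn.utils._openmp_helpers._openmp_effective_n_threads`, and then to pass that integer explicitly to every Cython `prange(..., num_threads=n_threads)` loop rather than letting OpenMP fall back to `omp_get_max_threads()` at each call site. Below is a minimal, hypothetical sketch of that resolution step in plain Python, assuming joblib's `cpu_count` as the cgroups-aware CPU counter; `effective_n_threads` is a stand-in name, not the actual implementation of `_openmp_effective_n_threads`, which is written in Cython and also handles builds without OpenMP support:

    # Simplified sketch only -- not the real sklearn.utils._openmp_helpers code.
    from joblib import cpu_count  # joblib/loky cpu_count takes cgroups CPU quotas into account


    def effective_n_threads(n_threads=None):
        """Hypothetical stand-in for _openmp_effective_n_threads.

        None means "use as many threads as the CPU quota allows"; a positive
        integer is clipped to that quota.
        """
        # Inside e.g. `docker run --cpus=2` this is 2, not the host's core count,
        # which is what prevents over-subscription of OpenMP threads.
        max_n_threads = cpu_count()
        if n_threads is None:
            return max_n_threads
        return max(1, min(n_threads, max_n_threads))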
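For the user-facing effect, a rough usage example; nothing in the public API changes, and no particular thread count is asserted here because the result depends on the runtime's cgroup quota and on environment variables such as `OMP_NUM_THREADS`:

    from sklearn.datasets import make_regression
    from sklearn.ensemble import HistGradientBoostingRegressor
    from sklearn.utils._openmp_helpers import _openmp_effective_n_threads

    X, y = make_regression(n_samples=1_000, n_features=10, random_state=0)

    # The same private helper the estimators now call internally; in a container
    # started with e.g. `docker run --cpus=2` this should report 2 rather than
    # the host's full core count.
    print("effective OpenMP threads:", _openmp_effective_n_threads())

    # User code is unchanged -- only the num_threads passed to the Cython kernels
    # during fit and predict differs.
    model = HistGradientBoostingRegressor(max_iter=50, random_state=0).fit(X, y)
    print(model.score(X, y))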