Commit fc11dea

ENH add feature subsampling per split for HGBT (#27139)
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
1 parent ff0274d commit fc11dea

6 files changed: +227 -34 lines

benchmarks/bench_hist_gradient_boosting_higgsboson.py

Lines changed: 3 additions & 0 deletions
@@ -25,6 +25,7 @@
 parser.add_argument("--no-predict", action="store_true", default=False)
 parser.add_argument("--cache-loc", type=str, default="/tmp")
 parser.add_argument("--no-interactions", type=bool, default=False)
+parser.add_argument("--max-features", type=float, default=1)
 args = parser.parse_args()
 
 HERE = os.path.dirname(__file__)
@@ -36,6 +37,7 @@
 subsample = args.subsample
 lr = args.learning_rate
 max_bins = args.max_bins
+max_features = args.max_features
 
 
 @m.cache
@@ -104,6 +106,7 @@ def predict(est, data_test, target_test):
     random_state=0,
     verbose=1,
     interaction_cst=interaction_cst,
+    max_features=max_features,
 )
 fit(est, data_train, target_train, "sklearn")
 predict(est, data_test, target_test)
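
With this addition, the per-split feature subsampling can be exercised directly from the command line, e.g. `python benchmarks/bench_hist_gradient_boosting_higgsboson.py --max-features 0.5`; the parsed value is forwarded to the estimator's new `max_features` parameter.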

doc/whats_new/v1.4.rst

Lines changed: 6 additions & 0 deletions
@@ -303,6 +303,12 @@ Changelog
   :pr:`13649` by :user:`Samuel Ronsin <samronsin>`,
   initiated by :user:`Patrick O'Reilly <pat-oreilly>`.
 
+- |Feature| :class:`ensemble.HistGradientBoostingClassifier` and
+  :class:`ensemble.HistGradientBoostingRegressor` got the new parameter
+  `max_features` to specify the proportion of randomly chosen features considered
+  in each split.
+  :pr:`27139` by :user:`Christian Lorentzen <lorentzenchr>`.
+
 - |Efficiency| :class:`ensemble.GradientBoostingClassifier` is faster,
   for binary and in particular for multiclass problems thanks to the private loss
   function module.
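
A minimal usage sketch of the new parameter (not part of the commit; the synthetic data below is invented for illustration). With `max_features=0.5` on 10 features, every split considers ceil(0.5 * 10) = 5 randomly drawn features:

import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier

rng = np.random.RandomState(0)
X = rng.normal(size=(1000, 10))
y = (X[:, 0] + X[:, 1] > 0).astype(int)

# Each split draws a fresh random subset of 5 of the 10 features.
clf = HistGradientBoostingClassifier(max_features=0.5, random_state=0)
clf.fit(X, y)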

sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py

Lines changed: 34 additions & 6 deletions
@@ -105,6 +105,7 @@ class BaseHistGradientBoosting(BaseEstimator, ABC):
         "max_depth": [Interval(Integral, 1, None, closed="left"), None],
         "min_samples_leaf": [Interval(Integral, 1, None, closed="left")],
         "l2_regularization": [Interval(Real, 0, None, closed="left")],
+        "max_features": [Interval(RealNotInt, 0, 1, closed="right")],
         "monotonic_cst": ["array-like", dict, None],
         "interaction_cst": [
             list,
@@ -139,6 +140,7 @@
         max_depth,
         min_samples_leaf,
         l2_regularization,
+        max_features,
         max_bins,
         categorical_features,
         monotonic_cst,
@@ -159,6 +161,7 @@
         self.max_depth = max_depth
         self.min_samples_leaf = min_samples_leaf
         self.l2_regularization = l2_regularization
+        self.max_features = max_features
         self.max_bins = max_bins
         self.monotonic_cst = monotonic_cst
         self.interaction_cst = interaction_cst
@@ -393,10 +396,12 @@ def fit(self, X, y, sample_weight=None):
         rng = check_random_state(self.random_state)
 
         # When warm starting, we want to reuse the same seed that was used
-        # the first time fit was called (e.g. for subsampling or for the
-        # train/val split).
-        if not (self.warm_start and self._is_fitted()):
+        # the first time fit was called (e.g. train/val split).
+        # For feature subsampling, we want to continue with the rng we started with.
+        if not self.warm_start or not self._is_fitted():
             self._random_seed = rng.randint(np.iinfo(np.uint32).max, dtype="u8")
+            feature_subsample_seed = rng.randint(np.iinfo(np.uint32).max, dtype="u8")
+            self._feature_subsample_rng = np.random.default_rng(feature_subsample_seed)
 
         self._validate_parameters()
         monotonic_cst = _check_monotonic_cst(self, self.monotonic_cst)
@@ -700,6 +705,8 @@ def fit(self, X, y, sample_weight=None):
                 max_depth=self.max_depth,
                 min_samples_leaf=self.min_samples_leaf,
                 l2_regularization=self.l2_regularization,
+                feature_fraction_per_split=self.max_features,
+                rng=self._feature_subsample_rng,
                 shrinkage=self.learning_rate,
                 n_threads=n_threads,
             )
@@ -1261,8 +1268,16 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
         than a few hundred samples, it is recommended to lower this value
         since only very shallow trees would be built.
     l2_regularization : float, default=0
-        The L2 regularization parameter. Use ``0`` for no regularization
-        (default).
+        The L2 regularization parameter. Use ``0`` for no regularization (default).
+    max_features : float, default=1.0
+        Proportion of randomly chosen features in each and every node split.
+        This is a form of regularization, smaller values make the trees weaker
+        learners and might prevent overfitting.
+        If interaction constraints from `interaction_cst` are present, only allowed
+        features are taken into account for the subsampling.
+
+        .. versionadded:: 1.4
+
     max_bins : int, default=255
         The maximum number of bins to use for non-missing values. Before
         training, each feature of the input array `X` is binned into
@@ -1463,6 +1478,7 @@ def __init__(
         max_depth=None,
         min_samples_leaf=20,
         l2_regularization=0.0,
+        max_features=1.0,
         max_bins=255,
         categorical_features=None,
         monotonic_cst=None,
@@ -1484,6 +1500,7 @@
             max_depth=max_depth,
             min_samples_leaf=min_samples_leaf,
             l2_regularization=l2_regularization,
+            max_features=max_features,
             max_bins=max_bins,
             monotonic_cst=monotonic_cst,
             interaction_cst=interaction_cst,
@@ -1620,7 +1637,16 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):
         than a few hundred samples, it is recommended to lower this value
         since only very shallow trees would be built.
     l2_regularization : float, default=0
-        The L2 regularization parameter. Use 0 for no regularization.
+        The L2 regularization parameter. Use ``0`` for no regularization (default).
+    max_features : float, default=1.0
+        Proportion of randomly chosen features in each and every node split.
+        This is a form of regularization, smaller values make the trees weaker
+        learners and might prevent overfitting.
+        If interaction constraints from `interaction_cst` are present, only allowed
+        features are taken into account for the subsampling.
+
+        .. versionadded:: 1.4
+
     max_bins : int, default=255
         The maximum number of bins to use for non-missing values. Before
         training, each feature of the input array `X` is binned into
@@ -1823,6 +1849,7 @@ def __init__(
         max_depth=None,
         min_samples_leaf=20,
         l2_regularization=0.0,
+        max_features=1.0,
         max_bins=255,
         categorical_features=None,
         monotonic_cst=None,
@@ -1845,6 +1872,7 @@
             max_depth=max_depth,
             min_samples_leaf=min_samples_leaf,
             l2_regularization=l2_regularization,
+            max_features=max_features,
             max_bins=max_bins,
             categorical_features=categorical_features,
             monotonic_cst=monotonic_cst,
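
As a reading aid (not part of the commit), the seeding logic in `fit` above reduces to the following sketch: both seeds are drawn on the first, non-warm-started fit, and a warm-started refit redraws neither, so the train/val split stays stable while the feature-subsampling stream simply continues.

import numpy as np

rng = np.random.RandomState(0)  # stands in for check_random_state(self.random_state)

# First fit, or any fit with warm_start=False:
random_seed = rng.randint(np.iinfo(np.uint32).max, dtype="u8")
feature_subsample_seed = rng.randint(np.iinfo(np.uint32).max, dtype="u8")
feature_subsample_rng = np.random.default_rng(feature_subsample_seed)

# Warm-started refit: the branch above is skipped entirely; `random_seed`
# is reused for the train/val split and `feature_subsample_rng` keeps
# advancing from wherever the previous round of tree growing left it.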

sklearn/ensemble/_hist_gradient_boosting/grower.py

Lines changed: 34 additions & 24 deletions
@@ -164,6 +164,10 @@ class TreeGrower:
     min_gain_to_split : float, default=0.
         The minimum gain needed to split a node. Splits with lower gain will
         be ignored.
+    min_hessian_to_split : float, default=1e-3
+        The minimum sum of hessians needed in each node. Splits that result in
+        at least one child having a sum of hessians less than
+        ``min_hessian_to_split`` are discarded.
     n_bins : int, default=256
         The total number of bins, including the bin for missing values. Used
         to define the shape of the histograms.
@@ -189,10 +193,12 @@ class TreeGrower:
         List of interaction constraints.
     l2_regularization : float, default=0.
         The L2 regularization parameter.
-    min_hessian_to_split : float, default=1e-3
-        The minimum sum of hessians needed in each node. Splits that result in
-        at least one child having a sum of hessians less than
-        ``min_hessian_to_split`` are discarded.
+    feature_fraction_per_split : float, default=1
+        Proportion of randomly chosen features in each and every node split.
+        This is a form of regularization, smaller values make the trees weaker
+        learners and might prevent overfitting.
+    rng : Generator
+        Numpy random Generator used for feature subsampling.
     shrinkage : float, default=1.
         The shrinkage parameter to apply to the leaves values, also known as
         learning rate.
@@ -234,14 +240,16 @@ def __init__(
         max_depth=None,
         min_samples_leaf=20,
         min_gain_to_split=0.0,
+        min_hessian_to_split=1e-3,
         n_bins=256,
         n_bins_non_missing=None,
         has_missing_values=False,
         is_categorical=None,
         monotonic_cst=None,
         interaction_cst=None,
         l2_regularization=0.0,
-        min_hessian_to_split=1e-3,
+        feature_fraction_per_split=1.0,
+        rng=np.random.default_rng(),
         shrinkage=1.0,
         n_threads=None,
     ):
@@ -297,33 +305,35 @@ def __init__(
         )
         missing_values_bin_idx = n_bins - 1
         self.splitter = Splitter(
-            X_binned,
-            n_bins_non_missing,
-            missing_values_bin_idx,
-            has_missing_values,
-            is_categorical,
-            monotonic_cst,
-            l2_regularization,
-            min_hessian_to_split,
-            min_samples_leaf,
-            min_gain_to_split,
-            hessians_are_constant,
-            n_threads,
+            X_binned=X_binned,
+            n_bins_non_missing=n_bins_non_missing,
+            missing_values_bin_idx=missing_values_bin_idx,
+            has_missing_values=has_missing_values,
+            is_categorical=is_categorical,
+            monotonic_cst=monotonic_cst,
+            l2_regularization=l2_regularization,
+            min_hessian_to_split=min_hessian_to_split,
+            min_samples_leaf=min_samples_leaf,
+            min_gain_to_split=min_gain_to_split,
+            hessians_are_constant=hessians_are_constant,
+            feature_fraction_per_split=feature_fraction_per_split,
+            rng=rng,
+            n_threads=n_threads,
         )
+        self.X_binned = X_binned
+        self.max_leaf_nodes = max_leaf_nodes
+        self.max_depth = max_depth
+        self.min_samples_leaf = min_samples_leaf
+        self.min_gain_to_split = min_gain_to_split
         self.n_bins_non_missing = n_bins_non_missing
         self.missing_values_bin_idx = missing_values_bin_idx
-        self.max_leaf_nodes = max_leaf_nodes
         self.has_missing_values = has_missing_values
+        self.is_categorical = is_categorical
         self.monotonic_cst = monotonic_cst
         self.interaction_cst = interaction_cst
-        self.is_categorical = is_categorical
         self.l2_regularization = l2_regularization
-        self.n_features = X_binned.shape[1]
-        self.max_depth = max_depth
-        self.min_samples_leaf = min_samples_leaf
-        self.X_binned = X_binned
-        self.min_gain_to_split = min_gain_to_split
         self.shrinkage = shrinkage
+        self.n_features = X_binned.shape[1]
         self.n_threads = n_threads
         self.splittable_nodes = []
         self.finalized_leaves = []
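
One subtlety in the signature above (an observation, not a change made by the commit): a default like `rng=np.random.default_rng()` is evaluated once, when `__init__` is defined, so all callers that omit `rng` share a single Generator; the estimator sidesteps this by always passing `self._feature_subsample_rng` explicitly. A minimal demonstration of the shared-default behavior:

import numpy as np

def draw(rng=np.random.default_rng()):
    # The default Generator is created once, at function definition time.
    return rng.integers(1000)

# Consecutive calls advance the *same* shared stream; they do not
# re-seed, so the two draws below are almost surely different.
print(draw(), draw())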

sklearn/ensemble/_hist_gradient_boosting/splitting.pyx

Lines changed: 41 additions & 4 deletions
@@ -9,8 +9,9 @@
 
 cimport cython
 from cython.parallel import prange
+cimport numpy as cnp
 import numpy as np
-from libc.math cimport INFINITY
+from libc.math cimport INFINITY, ceil
 from libc.stdlib cimport malloc, free, qsort
 from libc.string cimport memcpy
 
@@ -24,6 +25,8 @@ from ._bitset cimport init_bitset
 from ._bitset cimport set_bitset
 from ._bitset cimport in_bitset
 
+cnp.import_array()
+
 
 cdef struct split_info_struct:
     # Same as the SplitInfo class, but we need a C struct to use it in the
@@ -155,6 +158,11 @@ cdef class Splitter:
         be ignored.
     hessians_are_constant: bool, default is False
         Whether hessians are constant.
+    feature_fraction_per_split : float, default=1
+        Proportion of randomly chosen features in each and every node split.
+        This is a form of regularization, smaller values make the trees weaker
+        learners and might prevent overfitting.
+    rng : Generator
     n_threads : int, default=1
         Number of OpenMP threads to use.
     """
@@ -171,6 +179,8 @@ cdef class Splitter:
         Y_DTYPE_C min_hessian_to_split
         unsigned int min_samples_leaf
         Y_DTYPE_C min_gain_to_split
+        Y_DTYPE_C feature_fraction_per_split
+        rng
 
         unsigned int [::1] partition
         unsigned int [::1] left_indices_buffer
@@ -189,20 +199,24 @@
                  unsigned int min_samples_leaf=20,
                  Y_DTYPE_C min_gain_to_split=0.,
                  unsigned char hessians_are_constant=False,
+                 Y_DTYPE_C feature_fraction_per_split=1.0,
+                 rng=np.random.RandomState(),
                  unsigned int n_threads=1):
 
         self.X_binned = X_binned
         self.n_features = X_binned.shape[1]
         self.n_bins_non_missing = n_bins_non_missing
         self.missing_values_bin_idx = missing_values_bin_idx
         self.has_missing_values = has_missing_values
-        self.monotonic_cst = monotonic_cst
         self.is_categorical = is_categorical
+        self.monotonic_cst = monotonic_cst
         self.l2_regularization = l2_regularization
         self.min_hessian_to_split = min_hessian_to_split
         self.min_samples_leaf = min_samples_leaf
         self.min_gain_to_split = min_gain_to_split
         self.hessians_are_constant = hessians_are_constant
+        self.feature_fraction_per_split = feature_fraction_per_split
+        self.rng = rng
         self.n_threads = n_threads
 
         # The partition array maps each sample index into the leaves of the
@@ -475,20 +489,36 @@ cdef class Splitter:
             const signed char [::1] monotonic_cst = self.monotonic_cst
             int n_threads = self.n_threads
             bint has_interaction_cst = False
+            Y_DTYPE_C feature_fraction_per_split = self.feature_fraction_per_split
+            cnp.npy_bool [:] subsample_mask
+            int n_subsampled_features
 
         has_interaction_cst = allowed_features is not None
         if has_interaction_cst:
            n_allowed_features = allowed_features.shape[0]
        else:
            n_allowed_features = self.n_features
 
+        if feature_fraction_per_split < 1.0:
+            # We do all random sampling before the nogil and make sure that we sample
+            # exactly n_subsampled_features >= 1 features.
+            n_subsampled_features = max(
+                1,
+                int(ceil(feature_fraction_per_split * n_allowed_features)),
+            )
+            subsample_mask_arr = np.full(n_allowed_features, False)
+            subsample_mask_arr[:n_subsampled_features] = True
+            self.rng.shuffle(subsample_mask_arr)
+            # https://github.com/numpy/numpy/issues/18273
+            subsample_mask = subsample_mask_arr
+
         with nogil:
 
             split_infos = <split_info_struct *> malloc(
                 n_allowed_features * sizeof(split_info_struct))
 
-            # split_info_idx is index of split_infos of size n_features_allowed
-            # features_idx is the index of the feature column in X
+            # split_info_idx is index of split_infos of size n_allowed_features.
+            # features_idx is the index of the feature column in X.
             for split_info_idx in prange(n_allowed_features, schedule='static',
                                          num_threads=n_threads):
                 if has_interaction_cst:
@@ -506,6 +536,13 @@ cdef class Splitter:
                 split_infos[split_info_idx].gain = -1
                 split_infos[split_info_idx].is_categorical = is_categorical[feature_idx]
 
+                # Note that subsample_mask is indexed by split_info_idx and not by
+                # feature_idx because we only need to exclude the same features again
+                # and again. We do NOT need to access the features directly by using
+                # allowed_features.
+                if feature_fraction_per_split < 1.0 and not subsample_mask[split_info_idx]:
+                    continue
+
                 if is_categorical[feature_idx]:
                     self._find_best_bin_to_split_category(
                         feature_idx, has_missing_values[feature_idx],
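
A pure-NumPy sketch of the subsampling above (the same logic outside Cython; the concrete numbers are illustrative): exactly max(1, ceil(fraction * n_allowed_features)) entries of a boolean mask are set, and shuffling the mask selects which features survive for this split.

from math import ceil

import numpy as np

rng = np.random.default_rng(0)
n_allowed_features = 10
feature_fraction_per_split = 0.25

# Always keep at least one feature; ceil keeps small fractions usable.
n_subsampled_features = max(1, ceil(feature_fraction_per_split * n_allowed_features))
subsample_mask = np.full(n_allowed_features, False)
subsample_mask[:n_subsampled_features] = True
rng.shuffle(subsample_mask)        # a fresh random subset is drawn at every split
print(int(subsample_mask.sum()))   # -> 3 features considered for this split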
