
DEBUG hgbt variable bins #28603 #28621


Closed
27 changes: 16 additions & 11 deletions sklearn/ensemble/_hist_gradient_boosting/_binning.pyx
@@ -3,13 +3,14 @@
from cython.parallel import prange
from libc.math cimport isnan

from sklearn.utils._typedefs cimport uint16_t
from .common cimport X_DTYPE_C, X_BINNED_DTYPE_C


def _map_to_bins(const X_DTYPE_C [:, :] data,
def _map_to_bins(const X_DTYPE_C[:, :] data,
const uint16_t[::1] n_bins_non_missing,
list binning_thresholds,
const unsigned char[::1] is_categorical,
const unsigned char missing_values_bin_idx,
int n_threads,
X_BINNED_DTYPE_C [::1, :] binned):
"""Bin continuous and categorical values to discrete integer-coded levels.
@@ -21,6 +22,10 @@ def _map_to_bins(const X_DTYPE_C [:, :] data,
----------
data : ndarray, shape (n_samples, n_features)
The data to bin.
n_bins_non_missing : ndarray of shape (n_features,), dtype=np.uint16
For each feature, gives the number of bins actually used for non-missing
values. Missing values are always mapped to the last bin, i.e. bin index
``n_bins_non_missing[feature_idx]``.
binning_thresholds : list of arrays
For each feature, stores the increasing numeric values that are
used to separate the bins.
@@ -36,20 +41,20 @@ def _map_to_bins(const X_DTYPE_C [:, :] data,

for feature_idx in range(data.shape[1]):
_map_col_to_bins(
data[:, feature_idx],
binning_thresholds[feature_idx],
is_categorical[feature_idx],
missing_values_bin_idx,
n_threads,
binned[:, feature_idx]
data=data[:, feature_idx],
binning_thresholds=binning_thresholds[feature_idx],
is_categorical=is_categorical[feature_idx],
missing_values_bin_idx=n_bins_non_missing[feature_idx],
n_threads=n_threads,
binned=binned[:, feature_idx]
)


cdef void _map_col_to_bins(
const X_DTYPE_C [:] data,
const X_DTYPE_C [:] binning_thresholds,
const X_DTYPE_C[:] data,
const X_DTYPE_C[:] binning_thresholds,
const unsigned char is_categorical,
const unsigned char missing_values_bin_idx,
const uint16_t missing_values_bin_idx,
int n_threads,
X_BINNED_DTYPE_C [:] binned
):
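The hunk above replaces the single shared missing-values bin index (formerly always ``n_bins - 1``) with the per-feature value ``n_bins_non_missing[feature_idx]``. A minimal NumPy sketch of that mapping, with hypothetical names standing in for the Cython ``_map_col_to_bins`` (continuous features only):

import numpy as np

def map_col_to_bins(col, thresholds, n_bins_non_missing_j):
    # searchsorted(thresholds, v, side="right") counts how many thresholds
    # are <= v, which is the bin index for an increasing threshold array.
    binned = np.searchsorted(thresholds, col, side="right")
    # With this branch, missing values land in the first unused bin, i.e.
    # at index n_bins_non_missing[j], instead of at the global n_bins - 1.
    binned[np.isnan(col)] = n_bins_non_missing_j
    return binned

col = np.array([0.1, 5.0, np.nan, 2.5])
thresholds = np.array([1.0, 3.0])  # separates 3 non-missing bins
print(map_col_to_bins(col, thresholds, n_bins_non_missing_j=3))  # [0 2 3 1]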
13 changes: 8 additions & 5 deletions sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx
@@ -11,7 +11,7 @@ from .common cimport X_BINNED_DTYPE_C
from .common cimport BITSET_INNER_DTYPE_C
from .common cimport node_struct
from ._bitset cimport in_bitset_2d_memoryview
from sklearn.utils._typedefs cimport intp_t
from sklearn.utils._typedefs cimport intp_t, uint16_t


def _predict_from_raw_data( # raw data = non-binned data
@@ -89,7 +89,7 @@ def _predict_from_binned_data(
node_struct [:] nodes,
const X_BINNED_DTYPE_C [:, :] binned_data,
BITSET_INNER_DTYPE_C [:, :] binned_left_cat_bitsets,
const unsigned char missing_values_bin_idx,
const uint16_t [::1] n_bins_non_missing,
int n_threads,
Y_DTYPE_C [:] out):

@@ -100,29 +100,32 @@
num_threads=n_threads):
out[i] = _predict_one_from_binned_data(nodes,
binned_data,
binned_left_cat_bitsets, i,
missing_values_bin_idx)
binned_left_cat_bitsets,
i,
n_bins_non_missing)


cdef inline Y_DTYPE_C _predict_one_from_binned_data(
node_struct [:] nodes,
const X_BINNED_DTYPE_C [:, :] binned_data,
const BITSET_INNER_DTYPE_C [:, :] binned_left_cat_bitsets,
const int row,
const unsigned char missing_values_bin_idx) noexcept nogil:
const uint16_t [::1] n_bins_non_missing) noexcept nogil:
# Need to pass the whole array and the row index, else prange won't work.
# See issue Cython #2798

cdef:
node_struct node = nodes[0]
unsigned int node_idx = 0
X_BINNED_DTYPE_C data_val
uint16_t missing_values_bin_idx

while True:
if node.is_leaf:
return node.value

data_val = binned_data[row, node.feature_idx]
missing_values_bin_idx = n_bins_non_missing[node.feature_idx]

if data_val == missing_values_bin_idx:
if node.missing_go_to_left:
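For illustration, a pure-Python sketch of the traversal that ``_predict_one_from_binned_data`` now performs, using hypothetical node dicts in place of ``node_struct`` and leaving out the categorical bitset splits:

def predict_one_binned(nodes, binned_row, n_bins_non_missing):
    node = nodes[0]
    while not node["is_leaf"]:
        data_val = binned_row[node["feature_idx"]]
        # The missing-values bin index is now looked up per feature
        # instead of being a single constant for the whole tree.
        if data_val == n_bins_non_missing[node["feature_idx"]]:
            go_left = node["missing_go_to_left"]
        else:
            go_left = data_val <= node["bin_threshold"]
        node = nodes[node["left"] if go_left else node["right"]]
    return node["value"]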
16 changes: 5 additions & 11 deletions sklearn/ensemble/_hist_gradient_boosting/binning.py
@@ -136,18 +136,14 @@ class _BinMapper(TransformerMixin, BaseEstimator):
value to the raw category value. The size of the array is equal to
``min(max_bins, category_cardinality)`` where we ignore missing
values in the cardinality.
n_bins_non_missing_ : ndarray, dtype=np.uint32
n_bins_non_missing_ : ndarray of shape (n_features,), dtype=np.uint16
For each feature, gives the number of bins actually used for
non-missing values. For features with a lot of unique values, this is
equal to ``n_bins - 1``.
The index of the bin where missing values are mapped is always the last
bin, i.e. bin index ``n_bins_non_missing_`` (no unused bins).
is_categorical_ : ndarray of shape (n_features,), dtype=np.uint8
Indicator for categorical features.
missing_values_bin_idx_ : np.uint8
The index of the bin where missing values are mapped. This is a
constant across all features. This corresponds to the last bin, and
it is always equal to ``n_bins - 1``. Note that if ``n_bins_non_missing_``
is less than ``n_bins - 1`` for a given feature, then there are
empty (and unused) bins.
"""

def __init__(
@@ -223,8 +219,6 @@ def fit(self, X, y=None):
"but categories were passed."
)

self.missing_values_bin_idx_ = self.n_bins - 1

self.bin_thresholds_ = []
n_bins_non_missing = []

@@ -242,7 +236,7 @@

self.bin_thresholds_.append(thresholds)

self.n_bins_non_missing_ = np.array(n_bins_non_missing, dtype=np.uint32)
self.n_bins_non_missing_ = np.array(n_bins_non_missing, dtype=np.uint16)
return self

def transform(self, X):
@@ -277,9 +271,9 @@ def transform(self, X):
binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order="F")
_map_to_bins(
X,
self.n_bins_non_missing_,
self.bin_thresholds_,
self.is_categorical_,
self.missing_values_bin_idx_,
n_threads,
binned,
)
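To see the new invariant end to end, a short usage sketch, assuming the private ``_BinMapper`` keeps its ``fit``/``transform`` interface and assuming this branch's behavior where missing values map to ``n_bins_non_missing_[j]``:

import numpy as np
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper

X = np.array([[0.0, 10.0],
              [1.0, np.nan],
              [2.0, 30.0]])
mapper = _BinMapper().fit(X)
X_binned = mapper.transform(X)

# Feature 1 has two distinct non-missing values, so on this branch its
# missing values are mapped to bin index n_bins_non_missing_[1] == 2.
assert X_binned[1, 1] == mapper.n_bins_non_missing_[1]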
18 changes: 17 additions & 1 deletion sklearn/ensemble/_hist_gradient_boosting/common.pxd
@@ -1,5 +1,5 @@
cimport numpy as cnp
from sklearn.utils._typedefs cimport intp_t
from sklearn.utils._typedefs cimport intp_t, uint32_t

cnp.import_array()

@@ -11,6 +11,7 @@ ctypedef cnp.npy_float32 G_H_DTYPE_C
ctypedef cnp.npy_uint32 BITSET_INNER_DTYPE_C
ctypedef BITSET_INNER_DTYPE_C[8] BITSET_DTYPE_C


cdef packed struct hist_struct:
# Same as histogram dtype but we need a struct to declare views. It needs
# to be packed since by default numpy dtypes aren't aligned
@@ -42,3 +43,18 @@ cpdef enum MonotonicConstraint:
NO_CST = 0
POS = 1
NEG = -1


cdef class Histograms:
cdef:
int n_features
uint32_t [::1] bin_offsets_view
hist_struct [::1] histograms_view
cdef public:
object bin_offsets
# Only the attribute histograms will be used in the splitter.
object histograms

cdef inline uint32_t n_bins(self, int feature_idx) noexcept nogil

cdef inline hist_struct* at(self, int feature_idx, uint32_t bin_idx) noexcept nogil
85 changes: 85 additions & 0 deletions sklearn/ensemble/_hist_gradient_boosting/common.pyx
@@ -1,5 +1,8 @@
import numpy as np

cimport cython
from ...utils._typedefs cimport uint32_t

# Y_DYTPE is the dtype to which the targets y are converted to. This is also
# dtype for leaf values, gains, and sums of gradients / hessians. The gradients
# and hessians arrays are stored as floats to avoid using too much memory.
@@ -42,3 +45,85 @@ PREDICTOR_RECORD_DTYPE = np.dtype([
])

ALMOST_INF = 1e300 # see LightGBM AvoidInf()


@cython.final
cdef class Histograms:
"""An extension type (class) for histograms with variable bins.

This class allocates only the smallest possible 1d ndarray and provides
access to it like a 2d jagged/ragged array.

Attributes
----------
Public, i.e. accessible from Python:

bin_offsets : ndarray of shape (n_features + 1,), dtype=np.uint32
The bin offsets specify which partition of the histograms ndarray belongs
to which feature: feature j goes from `histograms[bin_offsets[j]]` until
`histograms[bin_offsets[j + 1] - 1]`. `bin_offsets[n_features]` gives
the total number of bins over all features.
histograms : ndarray of shape (bin_offsets[n_features],), \
dtype=HISTOGRAM_DTYPE
The 1-dimensional array of all histograms for all features.

Private, i.e. only accessible from Cython:

bin_offsets_view : memoryview of `bin_offsets`, dtype=uint32_t
histograms_view : memoryview of `histograms`, dtype=hist_struct
n_features : int
"""

def __init__(self, n_features, bin_offsets):
self.n_features = n_features
if isinstance(bin_offsets, int):
self.bin_offsets = np.zeros(shape=self.n_features + 1, dtype=np.uint32)
self.bin_offsets[1:] = np.cumsum(np.full(shape=n_features, fill_value=bin_offsets))
else:
self.bin_offsets = bin_offsets
self.bin_offsets_view = self.bin_offsets

self.histograms = np.empty(
shape=self.bin_offsets[-1],
dtype=HISTOGRAM_DTYPE,
)
self.histograms_view = self.histograms

cdef inline uint32_t n_bins(self, int feature_idx) noexcept nogil:
return self.bin_offsets_view[feature_idx + 1] - self.bin_offsets_view[feature_idx]

cdef inline hist_struct* at(self, int feature_idx, uint32_t bin_idx) noexcept nogil:
return &self.histograms_view[self.bin_offsets_view[feature_idx] + bin_idx]

# From here on: helper methods that exist only to make testing easier.
def fill_zeros(self):
cdef:
uint32_t i
Py_ssize_t n = self.histograms_view.shape[0]
for i in range(n):
self.histograms_view[i].sum_gradients = 0.
self.histograms_view[i].sum_hessians = 0.
self.histograms_view[i].count = 0
return self

def copy(self):
"""Return a copy of self."""
h = Histograms(
n_features=self.n_features,
bin_offsets=self.bin_offsets.copy(),
)
np.copyto(h.histograms, self.histograms)
return h

def feature_histo(self, feature_idx):
start = self.bin_offsets[feature_idx]
end = self.bin_offsets[feature_idx + 1]
return self.histograms[start:end]

def feature_sum(self, key):
return np.asarray(
[np.sum(self.feature_histo(f)[key]) for f in range(self.n_features)]
)

def value_at(self, feature_idx, bin_idx):
return self.histograms[self.bin_offsets[feature_idx] + bin_idx]
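The jagged layout is easiest to see from plain Python. A minimal NumPy sketch of the same bookkeeping, with hypothetical per-feature bin counts (the ``HISTOGRAM_DTYPE`` fields match the ones defined in ``common.pyx``):

import numpy as np

HISTOGRAM_DTYPE = np.dtype([
    ("sum_gradients", np.float64),
    ("sum_hessians", np.float64),
    ("count", np.uint32),
])

n_bins_per_feature = np.array([4, 2, 7], dtype=np.uint32)  # variable bins

# bin_offsets[j] is where feature j starts in the flat array and
# bin_offsets[-1] is the total number of bins over all features.
bin_offsets = np.zeros(len(n_bins_per_feature) + 1, dtype=np.uint32)
bin_offsets[1:] = np.cumsum(n_bins_per_feature)

histograms = np.zeros(bin_offsets[-1], dtype=HISTOGRAM_DTYPE)

def feature_histo(j):
    # The same slice Histograms.feature_histo returns: feature j's bins.
    return histograms[bin_offsets[j]:bin_offsets[j + 1]]

feature_histo(2)["count"][5] += 1  # touch bin 5 of feature 2
assert histograms["count"].sum() == 1
assert len(feature_histo(1)) == 2

Compared with the previous dense ndarray of shape (n_features, n_bins), this allocates only sum(n_bins_per_feature) histogram structs instead of n_features * 256.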
8 changes: 5 additions & 3 deletions sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -679,8 +679,10 @@ def fit(self, X, y, sample_weight=None):
X_binned_val = None

# Uses binned data to check for missing values
# Note that the missing bin index is always the last bin, i.e. the value of
# n_bins_non_missing.
has_missing_values = (
(X_binned_train == self._bin_mapper.missing_values_bin_idx_)
(X_binned_train == self._bin_mapper.n_bins_non_missing_)
.any(axis=0)
.astype(np.uint8)
)
@@ -940,7 +942,7 @@ def fit(self, X, y, sample_weight=None):
for k, pred in enumerate(self._predictors[-1]):
raw_predictions_val[:, k] += pred.predict_binned(
X_binned_val,
self._bin_mapper.missing_values_bin_idx_,
self._bin_mapper.n_bins_non_missing_,
n_threads,
)

@@ -1296,7 +1298,7 @@ def _predict_iterations(self, X, predictors, raw_predictions, is_binned, n_threads):
if is_binned:
predict = partial(
predictor.predict_binned,
missing_values_bin_idx=self._bin_mapper.missing_values_bin_idx_,
n_bins_non_missing=self._bin_mapper.n_bins_non_missing_,
n_threads=n_threads,
)
else:
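The ``has_missing_values`` check above relies on broadcasting: comparing the binned matrix against the per-feature row vector flags, per column, whether the missing bin was hit. A small sketch with hypothetical data:

import numpy as np

X_binned = np.array([[0, 2],
                     [3, 1],
                     [1, 0]], dtype=np.uint8)
n_bins_non_missing = np.array([3, 4], dtype=np.uint16)

# (n_samples, n_features) == (n_features,) broadcasts row-wise, so each
# column j is compared against its own missing bin n_bins_non_missing[j].
has_missing_values = (X_binned == n_bins_non_missing).any(axis=0).astype(np.uint8)
print(has_missing_values)  # [1 0]: only feature 0 hit its missing bin (3)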
21 changes: 11 additions & 10 deletions sklearn/ensemble/_hist_gradient_boosting/grower.py
@@ -180,7 +180,7 @@ class TreeGrower:
n_bins : int, default=256
The total number of bins, including the bin for missing values. Used
to define the shape of the histograms.
n_bins_non_missing : ndarray, dtype=np.uint32, default=None
n_bins_non_missing : ndarray of shape (n_features,), dtype=np.uint16, default=None
For each feature, gives the number of bins actually used for
non-missing values. For features with a lot of unique values, this
is equal to ``n_bins - 1``. If it's an int, all features are
@@ -189,7 +189,7 @@
has_missing_values : bool or ndarray, dtype=bool, default=False
Whether each feature contains missing values (in the training data).
If it's a bool, the same value is used for all features.
is_categorical : ndarray of bool of shape (n_features,), default=None
is_categorical : ndarray of shape (n_features,), dtype=bool, default=None
Indicates categorical features.
monotonic_cst : array-like of int of shape (n_features,), dtype=int, default=None
Indicates the monotonic constraint to enforce on each feature.
@@ -224,8 +224,6 @@ class TreeGrower:
root : TreeNode
finalized_leaves : list of TreeNode
splittable_nodes : list of TreeNode
missing_values_bin_idx : int
Equals n_bins - 1
n_categorical_splits : int
n_features : int
n_nodes : int
@@ -274,10 +272,10 @@ def __init__(

if isinstance(n_bins_non_missing, numbers.Integral):
n_bins_non_missing = np.array(
[n_bins_non_missing] * X_binned.shape[1], dtype=np.uint32
[n_bins_non_missing] * X_binned.shape[1], dtype=np.uint16
)
else:
n_bins_non_missing = np.asarray(n_bins_non_missing, dtype=np.uint32)
n_bins_non_missing = np.asarray(n_bins_non_missing, dtype=np.uint16)

if isinstance(has_missing_values, bool):
has_missing_values = [has_missing_values] * X_binned.shape[1]
@@ -310,13 +308,17 @@ def __init__(

hessians_are_constant = hessians.shape[0] == 1
self.histogram_builder = HistogramBuilder(
X_binned, n_bins, gradients, hessians, hessians_are_constant, n_threads
X_binned=X_binned,
n_bins=n_bins_non_missing.astype(np.uint32)
+ 1, # need total number of bins
gradients=gradients,
hessians=hessians,
hessians_are_constant=hessians_are_constant,
n_threads=n_threads,
)
missing_values_bin_idx = n_bins - 1
self.splitter = Splitter(
X_binned=X_binned,
n_bins_non_missing=n_bins_non_missing,
missing_values_bin_idx=missing_values_bin_idx,
has_missing_values=has_missing_values,
is_categorical=is_categorical,
monotonic_cst=monotonic_cst,
@@ -335,7 +337,6 @@
self.min_samples_leaf = min_samples_leaf
self.min_gain_to_split = min_gain_to_split
self.n_bins_non_missing = n_bins_non_missing
self.missing_values_bin_idx = missing_values_bin_idx
self.has_missing_values = has_missing_values
self.is_categorical = is_categorical
self.monotonic_cst = monotonic_cst
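The grower arithmetic above is the heart of the variable-bins change: each feature's histogram needs its used bins plus exactly one missing-values bin, so the histogram builder is now sized with ``n_bins_non_missing + 1`` per feature instead of a global ``n_bins``. A one-liner illustrating the sizing with hypothetical counts (the uint32 cast mirrors the diff, matching the dtype the builder expects):

import numpy as np

n_bins_non_missing = np.array([255, 2, 10], dtype=np.uint16)
n_bins_total = n_bins_non_missing.astype(np.uint32) + 1
print(n_bins_total)  # [256   3  11]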