
DEBUG hgbt variable bins #28603 #28621


Closed
27 changes: 16 additions & 11 deletions sklearn/ensemble/_hist_gradient_boosting/_binning.pyx
@@ -3,13 +3,14 @@
from cython.parallel import prange
from libc.math cimport isnan

from sklearn.utils._typedefs cimport uint16_t
from .common cimport X_DTYPE_C, X_BINNED_DTYPE_C


def _map_to_bins(const X_DTYPE_C [:, :] data,
def _map_to_bins(const X_DTYPE_C[:, :] data,
const uint16_t[::1] n_bins_non_missing,
list binning_thresholds,
const unsigned char[::1] is_categorical,
const unsigned char missing_values_bin_idx,
int n_threads,
X_BINNED_DTYPE_C [::1, :] binned):
"""Bin continuous and categorical values to discrete integer-coded levels.
@@ -21,6 +22,10 @@ def _map_to_bins(const X_DTYPE_C [:, :] data,
----------
data : ndarray, shape (n_samples, n_features)
The data to bin.
n_bins_non_missing : ndarray of shape (n_features,), dtype=np.uint16
For each feature, gives the number of bins actually used for non-missing
values. Missing values are always mapped to the last bin, i.e. bin index
``n_bins_non_missing[feature_idx]``.
binning_thresholds : list of arrays
For each feature, stores the increasing numeric values that are
used to separate the bins.
@@ -36,20 +41,20 @@ def _map_to_bins(const X_DTYPE_C [:, :] data,

for feature_idx in range(data.shape[1]):
_map_col_to_bins(
data[:, feature_idx],
binning_thresholds[feature_idx],
is_categorical[feature_idx],
missing_values_bin_idx,
n_threads,
binned[:, feature_idx]
data=data[:, feature_idx],
binning_thresholds=binning_thresholds[feature_idx],
is_categorical=is_categorical[feature_idx],
missing_values_bin_idx=n_bins_non_missing[feature_idx],
n_threads=n_threads,
binned=binned[:, feature_idx]
)


cdef void _map_col_to_bins(
const X_DTYPE_C [:] data,
const X_DTYPE_C [:] binning_thresholds,
const X_DTYPE_C[:] data,
const X_DTYPE_C[:] binning_thresholds,
const unsigned char is_categorical,
const unsigned char missing_values_bin_idx,
const uint16_t missing_values_bin_idx,
int n_threads,
X_BINNED_DTYPE_C [:] binned
):
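The hunk above replaces the single shared missing-values bin index (formerly always ``n_bins - 1``) with the per-feature value ``n_bins_non_missing[feature_idx]``. A minimal NumPy sketch of that mapping, with hypothetical names standing in for the Cython ``_map_col_to_bins`` (continuous features only):

import numpy as np

def map_col_to_bins(col, thresholds, n_bins_non_missing_j):
    # searchsorted(thresholds, v, side="right") counts how many thresholds
    # are <= v, which is the bin index for an increasing threshold array.
    binned = np.searchsorted(thresholds, col, side="right")
    # With this branch, missing values land in the first unused bin, i.e.
    # at index n_bins_non_missing[j], instead of at the global n_bins - 1.
    binned[np.isnan(col)] = n_bins_non_missing_j
    return binned

col = np.array([0.1, 5.0, np.nan, 2.5])
thresholds = np.array([1.0, 3.0])  # separates 3 non-missing bins
print(map_col_to_bins(col, thresholds, n_bins_non_missing_j=3))  # [0 2 3 1]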
13 changes: 8 additions & 5 deletions sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx
@@ -11,7 +11,7 @@ from .common cimport X_BINNED_DTYPE_C
from .common cimport BITSET_INNER_DTYPE_C
from .common cimport node_struct
from ._bitset cimport in_bitset_2d_memoryview
from sklearn.utils._typedefs cimport intp_t
from sklearn.utils._typedefs cimport intp_t, uint16_t


def _predict_from_raw_data( # raw data = non-binned data
@@ -89,7 +89,7 @@ def _predict_from_binned_data(
node_struct [:] nodes,
const X_BINNED_DTYPE_C [:, :] binned_data,
BITSET_INNER_DTYPE_C [:, :] binned_left_cat_bitsets,
const unsigned char missing_values_bin_idx,
const uint16_t [::1] n_bins_non_missing,
int n_threads,
Y_DTYPE_C [:] out):

@@ -100,29 +100,32 @@
num_threads=n_threads):
out[i] = _predict_one_from_binned_data(nodes,
binned_data,
binned_left_cat_bitsets, i,
missing_values_bin_idx)
binned_left_cat_bitsets,
i,
n_bins_non_missing)


cdef inline Y_DTYPE_C _predict_one_from_binned_data(
node_struct [:] nodes,
const X_BINNED_DTYPE_C [:, :] binned_data,
const BITSET_INNER_DTYPE_C [:, :] binned_left_cat_bitsets,
const int row,
const unsigned char missing_values_bin_idx) noexcept nogil:
const uint16_t [::1] n_bins_non_missing) noexcept nogil:
# Need to pass the whole array and the row index, else prange won't work.
# See issue Cython #2798

cdef:
node_struct node = nodes[0]
unsigned int node_idx = 0
X_BINNED_DTYPE_C data_val
uint16_t missing_values_bin_idx

while True:
if node.is_leaf:
return node.value

data_val = binned_data[row, node.feature_idx]
missing_values_bin_idx = n_bins_non_missing[node.feature_idx]

if data_val == missing_values_bin_idx:
if node.missing_go_to_left:
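For illustration, a pure-Python sketch of the traversal that ``_predict_one_from_binned_data`` now performs, using hypothetical node dicts in place of ``node_struct`` and leaving out the categorical bitset splits:

def predict_one_binned(nodes, binned_row, n_bins_non_missing):
    node = nodes[0]
    while not node["is_leaf"]:
        data_val = binned_row[node["feature_idx"]]
        # The missing-values bin index is now looked up per feature
        # instead of being a single constant for the whole tree.
        if data_val == n_bins_non_missing[node["feature_idx"]]:
            go_left = node["missing_go_to_left"]
        else:
            go_left = data_val <= node["bin_threshold"]
        node = nodes[node["left"] if go_left else node["right"]]
    return node["value"]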
16 changes: 5 additions & 11 deletions sklearn/ensemble/_hist_gradient_boosting/binning.py
@@ -136,18 +136,14 @@ class _BinMapper(TransformerMixin, BaseEstimator):
value to the raw category value. The size of the array is equal to
``min(max_bins, category_cardinality)`` where we ignore missing
values in the cardinality.
n_bins_non_missing_ : ndarray, dtype=np.uint32
n_bins_non_missing_ : ndarray of shape (n_features,), dtype=np.uint16
For each feature, gives the number of bins actually used for
non-missing values. For features with a lot of unique values, this is
equal to ``n_bins - 1``.
The index of the bin where missing values are mapped is always the last
bin, i.e. bin index ``n_bins_non_missing_`` (no unused bins).
is_categorical_ : ndarray of shape (n_features,), dtype=np.uint8
Indicator for categorical features.
missing_values_bin_idx_ : np.uint8
The index of the bin where missing values are mapped. This is a
constant across all features. This corresponds to the last bin, and
it is always equal to ``n_bins - 1``. Note that if ``n_bins_non_missing_``
is less than ``n_bins - 1`` for a given feature, then there are
empty (and unused) bins.
"""

def __init__(
@@ -223,8 +219,6 @@ def fit(self, X, y=None):
"but categories were passed."
)

self.missing_values_bin_idx_ = self.n_bins - 1

self.bin_thresholds_ = []
n_bins_non_missing = []

@@ -242,7 +236,7 @@

self.bin_thresholds_.append(thresholds)

self.n_bins_non_missing_ = np.array(n_bins_non_missing, dtype=np.uint32)
self.n_bins_non_missing_ = np.array(n_bins_non_missing, dtype=np.uint16)
return self

def transform(self, X):
@@ -277,9 +271,9 @@ def transform(self, X):
binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order="F")
_map_to_bins(
X,
self.n_bins_non_missing_,
self.bin_thresholds_,
self.is_categorical_,
self.missing_values_bin_idx_,
n_threads,
binned,
)
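To see the new invariant end to end, a short usage sketch, assuming the private ``_BinMapper`` keeps its ``fit``/``transform`` interface and assuming this branch's behavior where missing values map to ``n_bins_non_missing_[j]``:

import numpy as np
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper

X = np.array([[0.0, 10.0],
              [1.0, np.nan],
              [2.0, 30.0]])
mapper = _BinMapper().fit(X)
X_binned = mapper.transform(X)

# Feature 1 has two distinct non-missing values, so on this branch its
# missing values are mapped to bin index n_bins_non_missing_[1] == 2.
assert X_binned[1, 1] == mapper.n_bins_non_missing_[1]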
18 changes: 17 additions & 1 deletion sklearn/ensemble/_hist_gradient_boosting/common.pxd
@@ -1,5 +1,5 @@
cimport numpy as cnp
from sklearn.utils._typedefs cimport intp_t
from sklearn.utils._typedefs cimport intp_t, uint32_t

cnp.import_array()

@@ -11,6 +11,7 @@ ctypedef cnp.npy_float32 G_H_DTYPE_C
ctypedef cnp.npy_uint32 BITSET_INNER_DTYPE_C
ctypedef BITSET_INNER_DTYPE_C[8] BITSET_DTYPE_C


cdef packed struct hist_struct:
# Same as histogram dtype but we need a struct to declare views. It needs
# to be packed since by default numpy dtypes aren't aligned
@@ -42,3 +43,18 @@ cpdef enum MonotonicConstraint:
NO_CST = 0
POS = 1
NEG = -1


cdef class Histograms:
cdef:
int n_features
uint32_t [::1] bin_offsets_view
hist_struct [::1] histograms_view
cdef public:
object bin_offsets
# Only the attribute histograms will be used in the splitter.
object histograms

cdef inline uint32_t n_bins(self, int feature_idx) noexcept nogil

cdef inline hist_struct* at(self, int feature_idx, uint32_t bin_idx) noexcept nogil
85 changes: 85 additions & 0 deletions sklearn/ensemble/_hist_gradient_boosting/common.pyx
@@ -1,5 +1,8 @@
import numpy as np

cimport cython
from ...utils._typedefs cimport uint32_t

# Y_DYTPE is the dtype to which the targets y are converted to. This is also
# dtype for leaf values, gains, and sums of gradients / hessians. The gradients
# and hessians arrays are stored as floats to avoid using too much memory.
@@ -42,3 +45,85 @@ PREDICTOR_RECORD_DTYPE = np.dtype([
])

ALMOST_INF = 1e300 # see LightGBM AvoidInf()


@cython.final
cdef class Histograms:
"""An extension type (class) for histograms with variable bins.

This class allocates only the smallest possible 1d ndarray and provides
access to it like a 2d jagged/ragged array.

Attributes
----------
Public, i.e. accessible from Python:

bin_offsets : ndarray of shape (n_features + 1,), dtype=np.uint32
The bin offsets specify which partition of the histograms ndarray belongs
to which feature: feature j goes from `histograms[bin_offsets[j]]` until
`histograms[bin_offsets[j + 1] - 1]`. `bin_offsets[n_features]` gives
the total number of bins over all features.
histograms : ndarray of shape (bin_offsets[n_features],), \
dtype=HISTOGRAM_DTYPE
The 1-dimensional array of all histograms for all features.

Private, i.e. only accessible from Cython:

bin_offsets_view : memoryview of `bin_offsets`, dtype=uint32_t
histograms_view : memoryview of `histograms`, dtype=hist_struct
n_features : int
"""

def __init__(self, n_features, bin_offsets):
self.n_features = n_features
if isinstance(bin_offsets, int):
self.bin_offsets = np.zeros(shape=self.n_features + 1, dtype=np.uint32)
self.bin_offsets[1:] = np.cumsum(np.full(shape=n_features, fill_value=bin_offsets))
else:
self.bin_offsets = bin_offsets
self.bin_offsets_view = self.bin_offsets

self.histograms = np.empty(
shape=self.bin_offsets[-1],
dtype=HISTOGRAM_DTYPE,
)
self.histograms_view = self.histograms

cdef inline uint32_t n_bins(self, int feature_idx) noexcept nogil:
return self.bin_offsets_view[feature_idx + 1] - self.bin_offsets_view[feature_idx]

cdef inline hist_struct* at(self, int feature_idx, uint32_t bin_idx) noexcept nogil:
return &self.histograms_view[self.bin_offsets_view[feature_idx] + bin_idx]

# From here on: helper methods that exist only to make testing easier.
def fill_zeros(self):
cdef:
uint32_t i
Py_ssize_t n = self.histograms_view.shape[0]
for i in range(n):
self.histograms_view[i].sum_gradients = 0.
self.histograms_view[i].sum_hessians = 0.
self.histograms_view[i].count = 0
return self

def copy(self):
"""Return a copy of self."""
h = Histograms(
n_features=self.n_features,
bin_offsets=self.bin_offsets.copy(),
)
np.copyto(h.histograms, self.histograms)
return h

def feature_histo(self, feature_idx):
start = self.bin_offsets[feature_idx]
end = self.bin_offsets[feature_idx + 1]
return self.histograms[start:end]

def feature_sum(self, key):
return np.asarray(
[np.sum(self.feature_histo(f)[key]) for f in range(self.n_features)]
)

def value_at(self, feature_idx, bin_idx):
return self.histograms[self.bin_offsets[feature_idx] + bin_idx]
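The jagged layout is easiest to see from plain Python. A minimal NumPy sketch of the same bookkeeping, with hypothetical per-feature bin counts (the ``HISTOGRAM_DTYPE`` fields match the ones defined in ``common.pyx``):

import numpy as np

HISTOGRAM_DTYPE = np.dtype([
    ("sum_gradients", np.float64),
    ("sum_hessians", np.float64),
    ("count", np.uint32),
])

n_bins_per_feature = np.array([4, 2, 7], dtype=np.uint32)  # variable bins

# bin_offsets[j] is where feature j starts in the flat array and
# bin_offsets[-1] is the total number of bins over all features.
bin_offsets = np.zeros(len(n_bins_per_feature) + 1, dtype=np.uint32)
bin_offsets[1:] = np.cumsum(n_bins_per_feature)

histograms = np.zeros(bin_offsets[-1], dtype=HISTOGRAM_DTYPE)

def feature_histo(j):
    # The same slice Histograms.feature_histo returns: feature j's bins.
    return histograms[bin_offsets[j]:bin_offsets[j + 1]]

feature_histo(2)["count"][5] += 1  # touch bin 5 of feature 2
assert histograms["count"].sum() == 1
assert len(feature_histo(1)) == 2

Compared with the previous dense ndarray of shape (n_features, n_bins), this allocates only sum(n_bins_per_feature) histogram structs instead of n_features * 256.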
8 changes: 5 additions & 3 deletions sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -679,8 +679,10 @@ def fit(self, X, y, sample_weight=None):
X_binned_val = None

# Uses binned data to check for missing values
# Note that the missing bin index is always the last bin, i.e. the value of
# n_bins_non_missing.
has_missing_values = (
(X_binned_train == self._bin_mapper.missing_values_bin_idx_)
(X_binned_train == self._bin_mapper.n_bins_non_missing_)
.any(axis=0)
.astype(np.uint8)
)
@@ -940,7 +942,7 @@ def fit(self, X, y, sample_weight=None):
for k, pred in enumerate(self._predictors[-1]):
raw_predictions_val[:, k] += pred.predict_binned(
X_binned_val,
self._bin_mapper.missing_values_bin_idx_,
self._bin_mapper.n_bins_non_missing_,
n_threads,
)

@@ -1296,7 +1298,7 @@ def _predict_iterations(self, X, predictors, raw_predictions, is_binned, n_threads):
if is_binned:
predict = partial(
predictor.predict_binned,
missing_values_bin_idx=self._bin_mapper.missing_values_bin_idx_,
n_bins_non_missing=self._bin_mapper.n_bins_non_missing_,
n_threads=n_threads,
)
else:
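The ``has_missing_values`` check above relies on broadcasting: comparing the binned matrix against the per-feature row vector flags, per column, whether the missing bin was hit. A small sketch with hypothetical data:

import numpy as np

X_binned = np.array([[0, 2],
                     [3, 1],
                     [1, 0]], dtype=np.uint8)
n_bins_non_missing = np.array([3, 4], dtype=np.uint16)

# (n_samples, n_features) == (n_features,) broadcasts row-wise, so each
# column j is compared against its own missing bin n_bins_non_missing[j].
has_missing_values = (X_binned == n_bins_non_missing).any(axis=0).astype(np.uint8)
print(has_missing_values)  # [1 0]: only feature 0 hit its missing bin (3)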
21 changes: 11 additions & 10 deletions sklearn/ensemble/_hist_gradient_boosting/grower.py
@@ -180,7 +180,7 @@ class TreeGrower:
n_bins : int, default=256
The total number of bins, including the bin for missing values. Used
to define the shape of the histograms.
n_bins_non_missing : ndarray, dtype=np.uint32, default=None
n_bins_non_missing : ndarray of shape (n_features,), dtype=np.uint16, default=None
For each feature, gives the number of bins actually used for
non-missing values. For features with a lot of unique values, this
is equal to ``n_bins - 1``. If it's an int, all features are
@@ -189,7 +189,7 @@
has_missing_values : bool or ndarray, dtype=bool, default=False
Whether each feature contains missing values (in the training data).
If it's a bool, the same value is used for all features.
is_categorical : ndarray of bool of shape (n_features,), default=None
is_categorical : ndarray of shape (n_features,), dtype=bool, default=None
Indicates categorical features.
monotonic_cst : array-like of int of shape (n_features,), dtype=int, default=None
Indicates the monotonic constraint to enforce on each feature.
@@ -224,8 +224,6 @@ class TreeGrower:
root : TreeNode
finalized_leaves : list of TreeNode
splittable_nodes : list of TreeNode
missing_values_bin_idx : int
Equals n_bins - 1
n_categorical_splits : int
n_features : int
n_nodes : int
@@ -274,10 +272,10 @@ def __init__(

if isinstance(n_bins_non_missing, numbers.Integral):
n_bins_non_missing = np.array(
[n_bins_non_missing] * X_binned.shape[1], dtype=np.uint32
[n_bins_non_missing] * X_binned.shape[1], dtype=np.uint16
)
else:
n_bins_non_missing = np.asarray(n_bins_non_missing, dtype=np.uint32)
n_bins_non_missing = np.asarray(n_bins_non_missing, dtype=np.uint16)

if isinstance(has_missing_values, bool):
has_missing_values = [has_missing_values] * X_binned.shape[1]
@@ -310,13 +308,17 @@ def __init__(

hessians_are_constant = hessians.shape[0] == 1
self.histogram_builder = HistogramBuilder(
X_binned, n_bins, gradients, hessians, hessians_are_constant, n_threads
X_binned=X_binned,
n_bins=n_bins_non_missing.astype(np.uint32)
+ 1, # need total number of bins
gradients=gradients,
hessians=hessians,
hessians_are_constant=hessians_are_constant,
n_threads=n_threads,
)
missing_values_bin_idx = n_bins - 1
self.splitter = Splitter(
X_binned=X_binned,
n_bins_non_missing=n_bins_non_missing,
missing_values_bin_idx=missing_values_bin_idx,
has_missing_values=has_missing_values,
is_categorical=is_categorical,
monotonic_cst=monotonic_cst,
@@ -335,7 +337,6 @@
self.min_samples_leaf = min_samples_leaf
self.min_gain_to_split = min_gain_to_split
self.n_bins_non_missing = n_bins_non_missing
self.missing_values_bin_idx = missing_values_bin_idx
self.has_missing_values = has_missing_values
self.is_categorical = is_categorical
self.monotonic_cst = monotonic_cst
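The grower arithmetic above is the heart of the variable-bins change: each feature's histogram needs its used bins plus exactly one missing-values bin, so the histogram builder is now sized with ``n_bins_non_missing + 1`` per feature instead of a global ``n_bins``. A one-liner illustrating the sizing with hypothetical counts (the uint32 cast mirrors the diff, matching the dtype the builder expects):

import numpy as np

n_bins_non_missing = np.array([255, 2, 10], dtype=np.uint16)
n_bins_total = n_bins_non_missing.astype(np.uint32) + 1
print(n_bins_total)  # [256   3  11]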