CLN Refactors find binning threshold #18395

Merged · 2 commits · Oct 14, 2020
95 changes: 39 additions & 56 deletions sklearn/ensemble/_hist_gradient_boosting/binning.py
@@ -16,76 +16,53 @@
 from .common import X_DTYPE, X_BINNED_DTYPE, ALMOST_INF
 
 
-def _find_binning_thresholds(data, max_bins, subsample, random_state):
-    """Extract feature-wise quantiles from numerical data.
+def _find_binning_thresholds(col_data, max_bins):
+    """Extract quantiles from a continuous feature.
 
     Missing values are ignored for finding the thresholds.
 
     Parameters
     ----------
-    data : array-like of shape (n_samples, n_features)
-        The data to bin.
-
+    col_data : array-like, shape (n_samples,)
+        The numerical feature to bin.
     max_bins : int
         The maximum number of bins to use for non-missing values. If for a
         given feature the number of unique values is less than ``max_bins``,
         then those unique values will be used to compute the bin thresholds,
         instead of the quantiles.
 
-    subsample : int or None
-        If ``n_samples > subsample``, then ``subsample`` samples will be
-        randomly chosen to compute the quantiles. If ``None``, the whole data
-        is used.
-
-    random_state : int, RandomState instance or None
-        Pseudo-random number generator to control the random sub-sampling.
-        Pass an int for reproducible output across multiple
-        function calls.
-        See :term:`Glossary <random_state>`.
-
     Returns
     -------
-    binning_thresholds : list of ndarray
-        For each feature, stores the increasing numeric values that can
-        be used to separate the bins. Thus ``len(binning_thresholds) ==
-        n_features``.
+    binning_thresholds : ndarray
+        The increasing numeric values that can be used to separate the bins
+        for this feature.
     """
-    rng = check_random_state(random_state)
-    if subsample is not None and data.shape[0] > subsample:
-        subset = rng.choice(data.shape[0], subsample, replace=False)
-        data = data.take(subset, axis=0)
-
-    binning_thresholds = []
-    for f_idx in range(data.shape[1]):
-        col_data = data[:, f_idx]
-        # ignore missing values when computing bin thresholds
-        missing_mask = np.isnan(col_data)
-        if missing_mask.any():
-            col_data = col_data[~missing_mask]
-        col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE)
-        distinct_values = np.unique(col_data)
-        if len(distinct_values) <= max_bins:
-            midpoints = distinct_values[:-1] + distinct_values[1:]
-            midpoints *= .5
-        else:
-            # We sort again the data in this case. We could compute
-            # approximate midpoint percentiles using the output of
-            # np.unique(col_data, return_counts) instead but this is more
-            # work and the performance benefit will be limited because we
-            # work on a fixed-size subsample of the full data.
-            percentiles = np.linspace(0, 100, num=max_bins + 1)
-            percentiles = percentiles[1:-1]
-            midpoints = np.percentile(col_data, percentiles,
-                                      interpolation='midpoint').astype(X_DTYPE)
-            assert midpoints.shape[0] == max_bins - 1
-
-        # We avoid having +inf thresholds: +inf thresholds are only allowed in
-        # a "split on nan" situation.
-        np.clip(midpoints, a_min=None, a_max=ALMOST_INF, out=midpoints)
-
-        binning_thresholds.append(midpoints)
-
-    return binning_thresholds
+    # ignore missing values when computing bin thresholds
+    missing_mask = np.isnan(col_data)
+    if missing_mask.any():
+        col_data = col_data[~missing_mask]
+    col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE)
Review comment (member), on the line above:
I think we have a problem if a column has 100% missing values. But this problem is already present in master.

+    distinct_values = np.unique(col_data)
+    if len(distinct_values) <= max_bins:
+        midpoints = distinct_values[:-1] + distinct_values[1:]
+        midpoints *= .5
+    else:
+        # We sort again the data in this case. We could compute
+        # approximate midpoint percentiles using the output of
+        # np.unique(col_data, return_counts) instead but this is more
+        # work and the performance benefit will be limited because we
+        # work on a fixed-size subsample of the full data.
+        percentiles = np.linspace(0, 100, num=max_bins + 1)
+        percentiles = percentiles[1:-1]
+        midpoints = np.percentile(col_data, percentiles,
+                                  interpolation='midpoint').astype(X_DTYPE)
+        assert midpoints.shape[0] == max_bins - 1
+
+    # We avoid having +inf thresholds: +inf thresholds are only allowed in
+    # a "split on nan" situation.
+    np.clip(midpoints, a_min=None, a_max=ALMOST_INF, out=midpoints)
+    return midpoints
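To make the refactored per-column logic concrete, here is a self-contained pure-NumPy sketch. The unprefixed name find_binning_thresholds and the ALMOST_INF value are illustrative stand-ins, not the module's real imports. The last call exercises the all-NaN case flagged in the review comment above:

import numpy as np

ALMOST_INF = 1e300  # stand-in for the constant imported from .common

def find_binning_thresholds(col_data, max_bins):
    # drop missing values before computing thresholds
    col_data = col_data[~np.isnan(col_data)]
    distinct_values = np.unique(col_data)
    if len(distinct_values) <= max_bins:
        # few unique values: midpoints between consecutive unique values
        midpoints = (distinct_values[:-1] + distinct_values[1:]) * 0.5
    else:
        # many unique values: the max_bins - 1 interior quantiles
        percentiles = np.linspace(0, 100, num=max_bins + 1)[1:-1]
        midpoints = np.percentile(col_data, percentiles,
                                  interpolation='midpoint')  # 'method=' in newer NumPy
    # cap thresholds so that +inf stays reserved for "split on nan"
    return np.clip(midpoints, a_min=None, a_max=ALMOST_INF)

print(find_binning_thresholds(np.array([0., 1., 2., 3., np.nan]), max_bins=8))
# [0.5 1.5 2.5] -- unique-value midpoints, NaN ignored
print(find_binning_thresholds(np.array([np.nan, np.nan]), max_bins=8))
# [] -- an all-NaN column silently yields zero thresholds (the case flagged above)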


 class _BinMapper(TransformerMixin, BaseEstimator):
@@ -170,9 +147,15 @@ def fit(self, X, y=None):
 
         X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)
         max_bins = self.n_bins - 1
-        self.bin_thresholds_ = _find_binning_thresholds(
-            X, max_bins, subsample=self.subsample,
-            random_state=self.random_state)
+
+        rng = check_random_state(self.random_state)
+        if self.subsample is not None and X.shape[0] > self.subsample:
+            subset = rng.choice(X.shape[0], self.subsample, replace=False)
+            X = X.take(subset, axis=0)
+
+        self.bin_thresholds_ = [
+            _find_binning_thresholds(X[:, f_idx], max_bins)
+            for f_idx in range(X.shape[1])]
 
         self.n_bins_non_missing_ = np.array(
             [thresholds.shape[0] + 1 for thresholds in self.bin_thresholds_],
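A condensed, hypothetical sketch of the new fit() flow, reusing find_binning_thresholds from the sketch above; the 200,000-row subsample mirrors the int(2e5) default the old test helper used, and the rest is illustrative:

import numpy as np

rng = np.random.RandomState(0)
X = rng.normal(size=(300_000, 2))
subsample = 200_000  # mirrors the int(2e5) default

# 1) subsample rows once, up front (this logic now lives in fit itself)
if subsample is not None and X.shape[0] > subsample:
    subset = rng.choice(X.shape[0], subsample, replace=False)
    X = X.take(subset, axis=0)

# 2) then find thresholds one column at a time
bin_thresholds = [find_binning_thresholds(X[:, f_idx], max_bins=255)
                  for f_idx in range(X.shape[1])]
print([t.shape for t in bin_thresholds])  # [(254,), (254,)]

Moving the subsampling into fit keeps _find_binning_thresholds a pure per-column function, which is what makes the list-comprehension call pattern below possible.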
38 changes: 13 additions & 25 deletions sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py
@@ -4,7 +4,7 @@
 
 from sklearn.ensemble._hist_gradient_boosting.binning import (
     _BinMapper,
-    _find_binning_thresholds as _find_binning_thresholds_orig,
+    _find_binning_thresholds,
     _map_to_bins
 )
 from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE
@@ -17,45 +17,34 @@
 ).astype(X_DTYPE)
 
 
-def _find_binning_thresholds(data, max_bins=255, subsample=int(2e5),
-                             random_state=None):
-    # Just a redef to avoid having to pass arguments all the time (as the
-    # function is private we don't use default values for parameters)
-    return _find_binning_thresholds_orig(data, max_bins, subsample,
-                                         random_state)
-
-
 def test_find_binning_thresholds_regular_data():
     data = np.linspace(0, 10, 1001).reshape(-1, 1)
     bin_thresholds = _find_binning_thresholds(data, max_bins=10)
-    assert_allclose(bin_thresholds[0], [1, 2, 3, 4, 5, 6, 7, 8, 9])
-    assert len(bin_thresholds) == 1
+    assert_allclose(bin_thresholds, [1, 2, 3, 4, 5, 6, 7, 8, 9])
 
     bin_thresholds = _find_binning_thresholds(data, max_bins=5)
-    assert_allclose(bin_thresholds[0], [2, 4, 6, 8])
-    assert len(bin_thresholds) == 1
+    assert_allclose(bin_thresholds, [2, 4, 6, 8])
 
 
 def test_find_binning_thresholds_small_regular_data():
     data = np.linspace(0, 10, 11).reshape(-1, 1)
 
     bin_thresholds = _find_binning_thresholds(data, max_bins=5)
-    assert_allclose(bin_thresholds[0], [2, 4, 6, 8])
+    assert_allclose(bin_thresholds, [2, 4, 6, 8])
 
     bin_thresholds = _find_binning_thresholds(data, max_bins=10)
-    assert_allclose(bin_thresholds[0], [1, 2, 3, 4, 5, 6, 7, 8, 9])
+    assert_allclose(bin_thresholds, [1, 2, 3, 4, 5, 6, 7, 8, 9])
 
     bin_thresholds = _find_binning_thresholds(data, max_bins=11)
-    assert_allclose(bin_thresholds[0], np.arange(10) + .5)
+    assert_allclose(bin_thresholds, np.arange(10) + .5)
 
     bin_thresholds = _find_binning_thresholds(data, max_bins=255)
-    assert_allclose(bin_thresholds[0], np.arange(10) + .5)
+    assert_allclose(bin_thresholds, np.arange(10) + .5)
 
 
 def test_find_binning_thresholds_random_data():
-    bin_thresholds = _find_binning_thresholds(DATA, max_bins=255,
-                                              random_state=0)
-    assert len(bin_thresholds) == 2
+    bin_thresholds = [_find_binning_thresholds(DATA[:, i], max_bins=255)
+                      for i in range(2)]
     for i in range(len(bin_thresholds)):
         assert bin_thresholds[i].shape == (254,)  # 255 - 1
         assert bin_thresholds[i].dtype == DATA.dtype
@@ -68,9 +57,8 @@ def test_find_binning_thresholds_random_data():
 
 
 def test_find_binning_thresholds_low_n_bins():
-    bin_thresholds = _find_binning_thresholds(DATA, max_bins=128,
-                                              random_state=0)
-    assert len(bin_thresholds) == 2
+    bin_thresholds = [_find_binning_thresholds(DATA[:, i], max_bins=128)
+                      for i in range(2)]
     for i in range(len(bin_thresholds)):
         assert bin_thresholds[i].shape == (127,)  # 128 - 1
         assert bin_thresholds[i].dtype == DATA.dtype
@@ -94,8 +82,8 @@ def test_bin_mapper_n_features_transform():
 
 @pytest.mark.parametrize('max_bins', [16, 128, 255])
 def test_map_to_bins(max_bins):
-    bin_thresholds = _find_binning_thresholds(DATA, max_bins=max_bins,
-                                              random_state=0)
+    bin_thresholds = [_find_binning_thresholds(DATA[:, i], max_bins=max_bins)
+                      for i in range(2)]
     binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order='F')
     last_bin_idx = max_bins
     _map_to_bins(DATA, bin_thresholds, last_bin_idx, binned)
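For reference, a conceptual pure-NumPy equivalent of the binning rule that _map_to_bins applies (the real implementation is Cython; the function and parameter names here are illustrative): each value x goes to the smallest bin i with x <= thresholds[i], and NaNs go to a dedicated missing-values bin.

import numpy as np

def map_to_bins_reference(data, bin_thresholds, missing_values_bin_idx):
    binned = np.zeros(data.shape, dtype=np.uint8, order='F')
    for f_idx, thresholds in enumerate(bin_thresholds):
        col = data[:, f_idx]
        # smallest i with col <= thresholds[i], i.e.
        # thresholds[i - 1] < col <= thresholds[i]
        binned[:, f_idx] = np.searchsorted(thresholds, col, side='left')
        # NaNs are routed to their own bin
        binned[np.isnan(col), f_idx] = missing_values_bin_idx
    return binned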