Skip to content

FIX Fix RandomForestRegressor doesn't accept max_samples=1.0 #20156 #20159

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 23 commits into from
Jun 4, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions doc/whats_new/v1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,12 @@ Changelog
:class:`ensemble.StackingClassifier` and :class:`ensemble.StackingRegressor`.
:pr:`19564` by `Thomas Fan`_.

- |Fix| Fixed the range of the argument `max_samples` to be `(0.0, 1.0]`
in :class:`ensemble.RandomForestClassifier`,
:class:`ensemble.RandomForestRegressor`, where `max_samples=1.0` is
interpreted as using all `n_samples` for bootstrapping. :pr:`20159` by
:user:`murata-yu`.

:mod:`sklearn.feature_extraction`
.................................

Expand Down
17 changes: 10 additions & 7 deletions sklearn/ensemble/_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def _get_n_samples_bootstrap(n_samples, max_samples):
max_samples : int or float
The maximum number of samples to draw from the total available:
- if float, this indicates a fraction of the total and should be
the interval `(0, 1)`;
the interval `(0.0, 1.0]`;
- if int, this indicates the exact number of samples;
- if None, this indicates the total number of samples.

Expand All @@ -105,8 +105,8 @@ def _get_n_samples_bootstrap(n_samples, max_samples):
return max_samples

if isinstance(max_samples, numbers.Real):
if not (0 < max_samples < 1):
msg = "`max_samples` must be in range (0, 1) but got value {}"
if not (0 < max_samples <= 1):
msg = "`max_samples` must be in range (0.0, 1.0] but got value {}"
raise ValueError(msg.format(max_samples))
return round(n_samples * max_samples)

Expand Down Expand Up @@ -1163,7 +1163,7 @@ class RandomForestClassifier(ForestClassifier):
- If None (default), then draw `X.shape[0]` samples.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples. Thus,
`max_samples` should be in the interval `(0, 1)`.
`max_samples` should be in the interval `(0.0, 1.0]`.

.. versionadded:: 0.22

Expand Down Expand Up @@ -1473,7 +1473,7 @@ class RandomForestRegressor(ForestRegressor):
- If None (default), then draw `X.shape[0]` samples.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples. Thus,
`max_samples` should be in the interval `(0, 1)`.
`max_samples` should be in the interval `(0.0, 1.0]`.

.. versionadded:: 0.22

Expand Down Expand Up @@ -1557,6 +1557,7 @@ class RandomForestRegressor(ForestRegressor):
>>> print(regr.predict([[0, 0, 0, 0]]))
[-8.32987858]
"""

def __init__(self,
n_estimators=100, *,
criterion="squared_error",
Expand Down Expand Up @@ -1789,7 +1790,7 @@ class ExtraTreesClassifier(ForestClassifier):
- If None (default), then draw `X.shape[0]` samples.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples. Thus,
`max_samples` should be in the interval `(0, 1)`.
`max_samples` should be in the interval `(0.0, 1.0]`.

.. versionadded:: 0.22

Expand Down Expand Up @@ -1873,6 +1874,7 @@ class labels (multi-output problem).
>>> clf.predict([[0, 0, 0, 0]])
array([1])
"""

def __init__(self,
n_estimators=100, *,
criterion="gini",
Expand Down Expand Up @@ -2095,7 +2097,7 @@ class ExtraTreesRegressor(ForestRegressor):
- If None (default), then draw `X.shape[0]` samples.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples. Thus,
`max_samples` should be in the interval `(0, 1)`.
`max_samples` should be in the interval `(0.0, 1.0]`.

.. versionadded:: 0.22

Expand Down Expand Up @@ -2168,6 +2170,7 @@ class ExtraTreesRegressor(ForestRegressor):
>>> reg.score(X_test, y_test)
0.2708...
"""

def __init__(self,
n_estimators=100, *,
criterion="squared_error",
Expand Down
43 changes: 37 additions & 6 deletions sklearn/ensemble/tests/test_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@
from sklearn.svm import LinearSVC
from sklearn.utils.validation import check_random_state

from sklearn.metrics import mean_squared_error

from sklearn.tree._classes import SPARSE_SPLITTERS


Expand Down Expand Up @@ -1419,16 +1421,14 @@ def test_forest_degenerate_feature_importances():
'max_samples, exc_type, exc_msg',
[(int(1e9), ValueError,
"`max_samples` must be in range 1 to 6 but got value 1000000000"),
(1.0, ValueError,
r"`max_samples` must be in range \(0, 1\) but got value 1.0"),
(2.0, ValueError,
r"`max_samples` must be in range \(0, 1\) but got value 2.0"),
r"`max_samples` must be in range \(0.0, 1.0\] but got value 2.0"),
(0.0, ValueError,
r"`max_samples` must be in range \(0, 1\) but got value 0.0"),
r"`max_samples` must be in range \(0.0, 1.0\] but got value 0.0"),
(np.nan, ValueError,
r"`max_samples` must be in range \(0, 1\) but got value nan"),
r"`max_samples` must be in range \(0.0, 1.0\] but got value nan"),
(np.inf, ValueError,
r"`max_samples` must be in range \(0, 1\) but got value inf"),
r"`max_samples` must be in range \(0.0, 1.0\] but got value inf"),
('str max_samples?!', TypeError,
r"`max_samples` should be int or float, but got "
r"type '\<class 'str'\>'"),
Expand All @@ -1443,6 +1443,37 @@ def test_max_samples_exceptions(name, max_samples, exc_type, exc_msg):
est.fit(X, y)


@pytest.mark.parametrize('name', FOREST_REGRESSORS)
def test_max_samples_boundary_regressors(name):
    """Check that ``max_samples=1.0`` behaves like ``max_samples=None``.

    Both settings mean "use all n_samples for bootstrapping", so the two
    fitted forests (same random_state) must yield the same test error.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X_reg, y_reg, train_size=0.7, test_size=0.3, random_state=0)

    ms_1_model = FOREST_REGRESSORS[name](max_samples=1.0, random_state=0)
    ms_1_predict = ms_1_model.fit(X_train, y_train).predict(X_test)

    ms_None_model = FOREST_REGRESSORS[name](max_samples=None, random_state=0)
    ms_None_predict = ms_None_model.fit(X_train, y_train).predict(X_test)

    # mean_squared_error's signature is (y_true, y_pred); MSE is symmetric
    # so the value is unchanged, but follow the convention for clarity and
    # robustness if the metric is ever swapped for an asymmetric one.
    ms_1_ms = mean_squared_error(y_test, ms_1_predict)
    ms_None_ms = mean_squared_error(y_test, ms_None_predict)

    assert ms_1_ms == pytest.approx(ms_None_ms)


@pytest.mark.parametrize('name', FOREST_CLASSIFIERS)
def test_max_samples_boundary_classifiers(name):
    """Check that ``max_samples=1.0`` behaves like ``max_samples=None``.

    Both settings draw all n_samples for each bootstrap, so with a fixed
    random_state the predicted probabilities must match exactly.
    """
    X_train, X_test, y_train, _ = train_test_split(
        X_large, y_large, random_state=0, stratify=y_large)

    proba_full = FOREST_CLASSIFIERS[name](
        max_samples=1.0, random_state=0
    ).fit(X_train, y_train).predict_proba(X_test)

    proba_none = FOREST_CLASSIFIERS[name](
        max_samples=None, random_state=0
    ).fit(X_train, y_train).predict_proba(X_test)

    np.testing.assert_allclose(proba_full, proba_none)


def test_forest_y_sparse():
X = [[1, 2, 3]]
y = csr_matrix([4, 5, 6])
Expand Down