Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions doc/whats_new/v1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,12 @@ Changelog
:class:`ensemble.StackingClassifier` and :class:`ensemble.StackingRegressor`.
:pr:`19564` by `Thomas Fan`_.

- |Fix| Fixed the range of the argument `max_samples` to be `(0.0, 1.0]`
  in :class:`ensemble.RandomForestClassifier` and
  :class:`ensemble.RandomForestRegressor`, where `max_samples=1.0` is
  interpreted as using all `n_samples` for bootstrapping. :pr:`20159` by
  :user:`murata-yu`.

:mod:`sklearn.feature_extraction`
.................................

Expand Down
17 changes: 10 additions & 7 deletions sklearn/ensemble/_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def _get_n_samples_bootstrap(n_samples, max_samples):
max_samples : int or float
The maximum number of samples to draw from the total available:
- if float, this indicates a fraction of the total and should be
the interval `(0, 1)`;
the interval `(0.0, 1.0]`;
- if int, this indicates the exact number of samples;
- if None, this indicates the total number of samples.

Expand All @@ -105,8 +105,8 @@ def _get_n_samples_bootstrap(n_samples, max_samples):
return max_samples

if isinstance(max_samples, numbers.Real):
if not (0 < max_samples < 1):
msg = "`max_samples` must be in range (0, 1) but got value {}"
if not (0 < max_samples <= 1):
msg = "`max_samples` must be in range (0.0, 1.0] but got value {}"
raise ValueError(msg.format(max_samples))
return round(n_samples * max_samples)

Expand Down Expand Up @@ -1163,7 +1163,7 @@ class RandomForestClassifier(ForestClassifier):
- If None (default), then draw `X.shape[0]` samples.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples. Thus,
`max_samples` should be in the interval `(0, 1)`.
`max_samples` should be in the interval `(0.0, 1.0]`.

.. versionadded:: 0.22

Expand Down Expand Up @@ -1473,7 +1473,7 @@ class RandomForestRegressor(ForestRegressor):
- If None (default), then draw `X.shape[0]` samples.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples. Thus,
`max_samples` should be in the interval `(0, 1)`.
`max_samples` should be in the interval `(0.0, 1.0]`.

.. versionadded:: 0.22

Expand Down Expand Up @@ -1557,6 +1557,7 @@ class RandomForestRegressor(ForestRegressor):
>>> print(regr.predict([[0, 0, 0, 0]]))
[-8.32987858]
"""

def __init__(self,
n_estimators=100, *,
criterion="squared_error",
Expand Down Expand Up @@ -1789,7 +1790,7 @@ class ExtraTreesClassifier(ForestClassifier):
- If None (default), then draw `X.shape[0]` samples.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples. Thus,
`max_samples` should be in the interval `(0, 1)`.
`max_samples` should be in the interval `(0.0, 1.0]`.

.. versionadded:: 0.22

Expand Down Expand Up @@ -1873,6 +1874,7 @@ class labels (multi-output problem).
>>> clf.predict([[0, 0, 0, 0]])
array([1])
"""

def __init__(self,
n_estimators=100, *,
criterion="gini",
Expand Down Expand Up @@ -2095,7 +2097,7 @@ class ExtraTreesRegressor(ForestRegressor):
- If None (default), then draw `X.shape[0]` samples.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples. Thus,
`max_samples` should be in the interval `(0, 1)`.
`max_samples` should be in the interval `(0.0, 1.0]`.

.. versionadded:: 0.22

Expand Down Expand Up @@ -2168,6 +2170,7 @@ class ExtraTreesRegressor(ForestRegressor):
>>> reg.score(X_test, y_test)
0.2708...
"""

def __init__(self,
n_estimators=100, *,
criterion="squared_error",
Expand Down
43 changes: 37 additions & 6 deletions sklearn/ensemble/tests/test_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@
from sklearn.svm import LinearSVC
from sklearn.utils.validation import check_random_state

from sklearn.metrics import mean_squared_error

from sklearn.tree._classes import SPARSE_SPLITTERS


Expand Down Expand Up @@ -1419,16 +1421,14 @@ def test_forest_degenerate_feature_importances():
'max_samples, exc_type, exc_msg',
[(int(1e9), ValueError,
"`max_samples` must be in range 1 to 6 but got value 1000000000"),
(1.0, ValueError,
r"`max_samples` must be in range \(0, 1\) but got value 1.0"),
(2.0, ValueError,
r"`max_samples` must be in range \(0, 1\) but got value 2.0"),
r"`max_samples` must be in range \(0.0, 1.0\] but got value 2.0"),
(0.0, ValueError,
r"`max_samples` must be in range \(0, 1\) but got value 0.0"),
r"`max_samples` must be in range \(0.0, 1.0\] but got value 0.0"),
(np.nan, ValueError,
r"`max_samples` must be in range \(0, 1\) but got value nan"),
r"`max_samples` must be in range \(0.0, 1.0\] but got value nan"),
(np.inf, ValueError,
r"`max_samples` must be in range \(0, 1\) but got value inf"),
r"`max_samples` must be in range \(0.0, 1.0\] but got value inf"),
('str max_samples?!', TypeError,
r"`max_samples` should be int or float, but got "
r"type '\<class 'str'\>'"),
Expand All @@ -1443,6 +1443,37 @@ def test_max_samples_exceptions(name, max_samples, exc_type, exc_msg):
est.fit(X, y)


@pytest.mark.parametrize('name', FOREST_REGRESSORS)
def test_max_samples_boundary_regressors(name):
    # max_samples=1.0 should be equivalent to max_samples=None: both draw
    # n_samples bootstrap samples, so the fitted models must match.
    X_train, X_test, y_train, y_test = train_test_split(
        X_reg, y_reg, train_size=0.7, test_size=0.3, random_state=0)

    mse = {}
    for max_samples in (1.0, None):
        est = FOREST_REGRESSORS[name](max_samples=max_samples, random_state=0)
        pred = est.fit(X_train, y_train).predict(X_test)
        mse[max_samples] = mean_squared_error(pred, y_test)

    assert mse[1.0] == pytest.approx(mse[None])


@pytest.mark.parametrize('name', FOREST_CLASSIFIERS)
def test_max_samples_boundary_classifiers(name):
    # max_samples=1.0 should be equivalent to max_samples=None: both draw
    # n_samples bootstrap samples, so predicted probabilities must match.
    X_train, X_test, y_train, _ = train_test_split(
        X_large, y_large, random_state=0, stratify=y_large)

    probas = []
    for max_samples in (1.0, None):
        clf = FOREST_CLASSIFIERS[name](max_samples=max_samples,
                                       random_state=0)
        probas.append(clf.fit(X_train, y_train).predict_proba(X_test))

    np.testing.assert_allclose(probas[0], probas[1])


def test_forest_y_sparse():
X = [[1, 2, 3]]
y = csr_matrix([4, 5, 6])
Expand Down