Skip to content

FIX Fix RandomForestRegressor doesn't accept max_samples=1.0 #20156 #20159

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 23 commits into from
Jun 4, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions doc/whats_new/v1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,12 @@ Changelog
:class:`ensemble.StackingClassifier` and :class:`ensemble.StackingRegressor`.
:pr:`19564` by `Thomas Fan`_.

- |Fix| Fixed the range of the argument `max_samples` to be `(0.0, 1.0]`
in :class:`ensemble.RandomForestClassifier`,
:class:`ensemble.RandomForestRegressor`, where `max_samples=1.0` is
interpreted as using all `n_samples` for bootstrapping. :pr:`20159` by
:user:`murata-yu`.

:mod:`sklearn.feature_extraction`
.................................

Expand Down
17 changes: 10 additions & 7 deletions sklearn/ensemble/_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def _get_n_samples_bootstrap(n_samples, max_samples):
max_samples : int or float
The maximum number of samples to draw from the total available:
- if float, this indicates a fraction of the total and should be
the interval `(0, 1)`;
the interval `(0.0, 1.0]`;
- if int, this indicates the exact number of samples;
- if None, this indicates the total number of samples.

Expand All @@ -105,8 +105,8 @@ def _get_n_samples_bootstrap(n_samples, max_samples):
return max_samples

if isinstance(max_samples, numbers.Real):
if not (0 < max_samples < 1):
msg = "`max_samples` must be in range (0, 1) but got value {}"
if not (0 < max_samples <= 1):
msg = "`max_samples` must be in range (0.0, 1.0] but got value {}"
raise ValueError(msg.format(max_samples))
return round(n_samples * max_samples)

Expand Down Expand Up @@ -1163,7 +1163,7 @@ class RandomForestClassifier(ForestClassifier):
- If None (default), then draw `X.shape[0]` samples.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples. Thus,
`max_samples` should be in the interval `(0, 1)`.
`max_samples` should be in the interval `(0.0, 1.0]`.

.. versionadded:: 0.22

Expand Down Expand Up @@ -1473,7 +1473,7 @@ class RandomForestRegressor(ForestRegressor):
- If None (default), then draw `X.shape[0]` samples.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples. Thus,
`max_samples` should be in the interval `(0, 1)`.
`max_samples` should be in the interval `(0.0, 1.0]`.

.. versionadded:: 0.22

Expand Down Expand Up @@ -1557,6 +1557,7 @@ class RandomForestRegressor(ForestRegressor):
>>> print(regr.predict([[0, 0, 0, 0]]))
[-8.32987858]
"""

def __init__(self,
n_estimators=100, *,
criterion="squared_error",
Expand Down Expand Up @@ -1789,7 +1790,7 @@ class ExtraTreesClassifier(ForestClassifier):
- If None (default), then draw `X.shape[0]` samples.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples. Thus,
`max_samples` should be in the interval `(0, 1)`.
`max_samples` should be in the interval `(0.0, 1.0]`.

.. versionadded:: 0.22

Expand Down Expand Up @@ -1873,6 +1874,7 @@ class labels (multi-output problem).
>>> clf.predict([[0, 0, 0, 0]])
array([1])
"""

def __init__(self,
n_estimators=100, *,
criterion="gini",
Expand Down Expand Up @@ -2095,7 +2097,7 @@ class ExtraTreesRegressor(ForestRegressor):
- If None (default), then draw `X.shape[0]` samples.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples. Thus,
`max_samples` should be in the interval `(0, 1)`.
`max_samples` should be in the interval `(0.0, 1.0]`.

.. versionadded:: 0.22

Expand Down Expand Up @@ -2168,6 +2170,7 @@ class ExtraTreesRegressor(ForestRegressor):
>>> reg.score(X_test, y_test)
0.2708...
"""

def __init__(self,
n_estimators=100, *,
criterion="squared_error",
Expand Down
43 changes: 37 additions & 6 deletions sklearn/ensemble/tests/test_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@
from sklearn.svm import LinearSVC
from sklearn.utils.validation import check_random_state

from sklearn.metrics import mean_squared_error

from sklearn.tree._classes import SPARSE_SPLITTERS


Expand Down Expand Up @@ -1419,16 +1421,14 @@ def test_forest_degenerate_feature_importances():
'max_samples, exc_type, exc_msg',
[(int(1e9), ValueError,
"`max_samples` must be in range 1 to 6 but got value 1000000000"),
(1.0, ValueError,
r"`max_samples` must be in range \(0, 1\) but got value 1.0"),
(2.0, ValueError,
r"`max_samples` must be in range \(0, 1\) but got value 2.0"),
r"`max_samples` must be in range \(0.0, 1.0\] but got value 2.0"),
(0.0, ValueError,
r"`max_samples` must be in range \(0, 1\) but got value 0.0"),
r"`max_samples` must be in range \(0.0, 1.0\] but got value 0.0"),
(np.nan, ValueError,
r"`max_samples` must be in range \(0, 1\) but got value nan"),
r"`max_samples` must be in range \(0.0, 1.0\] but got value nan"),
(np.inf, ValueError,
r"`max_samples` must be in range \(0, 1\) but got value inf"),
r"`max_samples` must be in range \(0.0, 1.0\] but got value inf"),
('str max_samples?!', TypeError,
r"`max_samples` should be int or float, but got "
r"type '\<class 'str'\>'"),
Expand All @@ -1443,6 +1443,37 @@ def test_max_samples_exceptions(name, max_samples, exc_type, exc_msg):
est.fit(X, y)


@pytest.mark.parametrize('name', FOREST_REGRESSORS)
def test_max_samples_boundary_regressors(name):
    """Check that ``max_samples=1.0`` behaves like ``max_samples=None``.

    Both settings mean "use all n_samples for bootstrapping", so the two
    fitted forests (same random_state) must yield the same test error.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X_reg, y_reg, train_size=0.7, test_size=0.3, random_state=0)

    ms_1_model = FOREST_REGRESSORS[name](max_samples=1.0, random_state=0)
    ms_1_predict = ms_1_model.fit(X_train, y_train).predict(X_test)

    ms_None_model = FOREST_REGRESSORS[name](max_samples=None, random_state=0)
    ms_None_predict = ms_None_model.fit(X_train, y_train).predict(X_test)

    # mean_squared_error's signature is (y_true, y_pred); MSE is symmetric
    # so the value is unchanged, but follow the convention for clarity and
    # robustness if the metric is ever swapped for an asymmetric one.
    ms_1_ms = mean_squared_error(y_test, ms_1_predict)
    ms_None_ms = mean_squared_error(y_test, ms_None_predict)

    assert ms_1_ms == pytest.approx(ms_None_ms)


@pytest.mark.parametrize('name', FOREST_CLASSIFIERS)
def test_max_samples_boundary_classifiers(name):
    """Check that ``max_samples=1.0`` behaves like ``max_samples=None``.

    Both settings draw all n_samples for each bootstrap, so with a fixed
    random_state the predicted probabilities must match exactly.
    """
    X_train, X_test, y_train, _ = train_test_split(
        X_large, y_large, random_state=0, stratify=y_large)

    proba_full = FOREST_CLASSIFIERS[name](
        max_samples=1.0, random_state=0
    ).fit(X_train, y_train).predict_proba(X_test)

    proba_none = FOREST_CLASSIFIERS[name](
        max_samples=None, random_state=0
    ).fit(X_train, y_train).predict_proba(X_test)

    np.testing.assert_allclose(proba_full, proba_none)


def test_forest_y_sparse():
X = [[1, 2, 3]]
y = csr_matrix([4, 5, 6])
Expand Down