Skip to content

MAINT Clean-up deprecated max_features="auto" in trees/forests/gb #25941

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
15 changes: 0 additions & 15 deletions sklearn/ensemble/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,6 @@
from ..base import is_classifier, is_regressor
from ..base import BaseEstimator
from ..base import MetaEstimatorMixin
from ..tree import (
DecisionTreeRegressor,
BaseDecisionTree,
DecisionTreeClassifier,
)
from ..utils import Bunch, _print_elapsed_time, deprecated
from ..utils import check_random_state
from ..utils.metaestimators import _BaseComposition
Expand Down Expand Up @@ -192,16 +187,6 @@ def _make_estimator(self, append=True, random_state=None):
estimator = clone(self.estimator_)
estimator.set_params(**{p: getattr(self, p) for p in self.estimator_params})

# TODO(1.3): Remove
# max_features = 'auto' would cause warnings in every call to
# Tree.fit(..)
if isinstance(estimator, BaseDecisionTree):
if getattr(estimator, "max_features", None) == "auto":
if isinstance(estimator, DecisionTreeClassifier):
estimator.set_params(max_features="sqrt")
elif isinstance(estimator, DecisionTreeRegressor):
estimator.set_params(max_features=1.0)

if random_state is not None:
_set_random_states(estimator, random_state)

Expand Down
44 changes: 1 addition & 43 deletions sklearn/ensemble/_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,28 +408,6 @@ def fit(self, X, y, sample_weight=None):
n_samples_bootstrap = None

self._validate_estimator()
if isinstance(self, (RandomForestRegressor, ExtraTreesRegressor)):
# TODO(1.3): Remove "auto"
if self.max_features == "auto":
warn(
"`max_features='auto'` has been deprecated in 1.1 "
"and will be removed in 1.3. To keep the past behaviour, "
"explicitly set `max_features=1.0` or remove this "
"parameter as it is also the default value for "
"RandomForestRegressors and ExtraTreesRegressors.",
FutureWarning,
)
elif isinstance(self, (RandomForestClassifier, ExtraTreesClassifier)):
# TODO(1.3): Remove "auto"
if self.max_features == "auto":
warn(
"`max_features='auto'` has been deprecated in 1.1 "
"and will be removed in 1.3. To keep the past behaviour, "
"explicitly set `max_features='sqrt'` or remove this "
"parameter as it is also the default value for "
"RandomForestClassifiers and ExtraTreesClassifiers.",
FutureWarning,
)

if not self.bootstrap and self.oob_score:
raise ValueError("Out of bag estimation only available if bootstrap=True")
Expand Down Expand Up @@ -1172,18 +1150,13 @@ class RandomForestClassifier(ForestClassifier):
- If float, then `max_features` is a fraction and
`max(1, int(max_features * n_features_in_))` features are considered at each
split.
- If "auto", then `max_features=sqrt(n_features)`.
- If "sqrt", then `max_features=sqrt(n_features)`.
- If "log2", then `max_features=log2(n_features)`.
- If None, then `max_features=n_features`.

.. versionchanged:: 1.1
The default of `max_features` changed from `"auto"` to `"sqrt"`.

.. deprecated:: 1.1
The `"auto"` option was deprecated in 1.1 and will be removed
in 1.3.

Note: the search for a split does not stop until at least one
valid partition of the node samples is found, even if it requires to
effectively inspect more than ``max_features`` features.
Expand Down Expand Up @@ -1547,7 +1520,6 @@ class RandomForestRegressor(ForestRegressor):
- If float, then `max_features` is a fraction and
`max(1, int(max_features * n_features_in_))` features are considered at each
split.
- If "auto", then `max_features=n_features`.
- If "sqrt", then `max_features=sqrt(n_features)`.
- If "log2", then `max_features=log2(n_features)`.
- If None or 1.0, then `max_features=n_features`.
Expand All @@ -1559,10 +1531,6 @@ class RandomForestRegressor(ForestRegressor):
.. versionchanged:: 1.1
The default of `max_features` changed from `"auto"` to 1.0.

.. deprecated:: 1.1
The `"auto"` option was deprecated in 1.1 and will be removed
in 1.3.

Note: the search for a split does not stop until at least one
valid partition of the node samples is found, even if it requires to
effectively inspect more than ``max_features`` features.
Expand Down Expand Up @@ -1716,7 +1684,7 @@ class RandomForestRegressor(ForestRegressor):
search of the best split. To obtain a deterministic behaviour during
fitting, ``random_state`` has to be fixed.

The default value ``max_features="auto"`` uses ``n_features``
The default value ``max_features=1.0`` uses ``n_features``
rather than ``n_features / 3``. The latter was originally suggested in
[1], whereas the former was more recently justified empirically in [2].

Expand Down Expand Up @@ -1871,18 +1839,13 @@ class ExtraTreesClassifier(ForestClassifier):
- If float, then `max_features` is a fraction and
`max(1, int(max_features * n_features_in_))` features are considered at each
split.
- If "auto", then `max_features=sqrt(n_features)`.
- If "sqrt", then `max_features=sqrt(n_features)`.
- If "log2", then `max_features=log2(n_features)`.
- If None, then `max_features=n_features`.

.. versionchanged:: 1.1
The default of `max_features` changed from `"auto"` to `"sqrt"`.

.. deprecated:: 1.1
The `"auto"` option was deprecated in 1.1 and will be removed
in 1.3.

Note: the search for a split does not stop until at least one
valid partition of the node samples is found, even if it requires to
effectively inspect more than ``max_features`` features.
Expand Down Expand Up @@ -2237,7 +2200,6 @@ class ExtraTreesRegressor(ForestRegressor):
- If float, then `max_features` is a fraction and
`max(1, int(max_features * n_features_in_))` features are considered at each
split.
- If "auto", then `max_features=n_features`.
- If "sqrt", then `max_features=sqrt(n_features)`.
- If "log2", then `max_features=log2(n_features)`.
- If None or 1.0, then `max_features=n_features`.
Expand All @@ -2249,10 +2211,6 @@ class ExtraTreesRegressor(ForestRegressor):
.. versionchanged:: 1.1
The default of `max_features` changed from `"auto"` to 1.0.

.. deprecated:: 1.1
The `"auto"` option was deprecated in 1.1 and will be removed
in 1.3.

Note: the search for a split does not stop until at least one
valid partition of the node samples is found, even if it requires to
effectively inspect more than ``max_features`` features.
Expand Down
6 changes: 2 additions & 4 deletions sklearn/ensemble/_gb.py
Original file line number Diff line number Diff line change
Expand Up @@ -965,13 +965,12 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting):
Pass an int for reproducible output across multiple function calls.
See :term:`Glossary <random_state>`.

max_features : {'auto', 'sqrt', 'log2'}, int or float, default=None
max_features : {'sqrt', 'log2'}, int or float, default=None
The number of features to consider when looking for the best split:

- If int, values must be in the range `[1, inf)`.
- If float, values must be in the range `(0.0, 1.0]` and the features
considered at each split will be `max(1, int(max_features * n_features_in_))`.
- If 'auto', then `max_features=sqrt(n_features)`.
- If 'sqrt', then `max_features=sqrt(n_features)`.
- If 'log2', then `max_features=log2(n_features)`.
- If None, then `max_features=n_features`.
Expand Down Expand Up @@ -1531,13 +1530,12 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting):
Pass an int for reproducible output across multiple function calls.
See :term:`Glossary <random_state>`.

max_features : {'auto', 'sqrt', 'log2'}, int or float, default=None
max_features : {'sqrt', 'log2'}, int or float, default=None
The number of features to consider when looking for the best split:

- If int, values must be in the range `[1, inf)`.
- If float, values must be in the range `(0.0, 1.0]` and the features
considered at each split will be `max(1, int(max_features * n_features_in_))`.
- If "auto", then `max_features=n_features`.
- If "sqrt", then `max_features=sqrt(n_features)`.
- If "log2", then `max_features=log2(n_features)`.
- If None, then `max_features=n_features`.
Expand Down
29 changes: 0 additions & 29 deletions sklearn/ensemble/tests/test_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1700,35 +1700,6 @@ def test_little_tree_with_small_max_samples(ForestClass):
assert tree1.node_count > tree2.node_count, msg


# TODO: Remove in v1.3
@pytest.mark.parametrize(
"Estimator",
[
ExtraTreesClassifier,
ExtraTreesRegressor,
RandomForestClassifier,
RandomForestRegressor,
],
)
def test_max_features_deprecation(Estimator):
"""Check warning raised for max_features="auto" deprecation."""
X = np.array([[1, 2], [3, 4]])
y = np.array([1, 0])
est = Estimator(max_features="auto")

err_msg = (
r"`max_features='auto'` has been deprecated in 1.1 "
r"and will be removed in 1.3. To keep the past behaviour, "
r"explicitly set `max_features=(1.0|'sqrt')` or remove this "
r"parameter as it is also the default value for RandomForest"
r"(Regressors|Classifiers) and ExtraTrees(Regressors|"
r"Classifiers)\."
)

with pytest.warns(FutureWarning, match=err_msg):
est.fit(X, y)


@pytest.mark.parametrize("Forest", FOREST_REGRESSORS)
def test_mse_criterion_object_segfault_smoke_test(Forest):
# This is a smoke test to ensure that passing a mutable criterion
Expand Down
10 changes: 4 additions & 6 deletions sklearn/ensemble/tests/test_gradient_boosting.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,21 +345,19 @@ def test_feature_importance_regression(
assert set(sorted_features[1:4]) == {"Longitude", "AveOccup", "Latitude"}


# TODO(1.3): Remove warning filter
@pytest.mark.filterwarnings("ignore:`max_features='auto'` has been deprecated in 1.1")
def test_max_feature_auto():
def test_max_features():
# Test if max features is set properly for floats and str.
X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
_, n_features = X.shape

X_train = X[:2000]
y_train = y[:2000]

gbrt = GradientBoostingClassifier(n_estimators=1, max_features="auto")
gbrt = GradientBoostingClassifier(n_estimators=1, max_features=None)
gbrt.fit(X_train, y_train)
assert gbrt.max_features_ == int(np.sqrt(n_features))
assert gbrt.max_features_ == n_features

gbrt = GradientBoostingRegressor(n_estimators=1, max_features="auto")
gbrt = GradientBoostingRegressor(n_estimators=1, max_features=None)
gbrt.fit(X_train, y_train)
assert gbrt.max_features_ == n_features

Expand Down
22 changes: 1 addition & 21 deletions sklearn/tree/_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta):
"max_features": [
Interval(Integral, 1, None, closed="left"),
Interval(RealNotInt, 0.0, 1.0, closed="right"),
StrOptions({"auto", "sqrt", "log2"}, deprecated={"auto"}),
StrOptions({"sqrt", "log2"}),
None,
],
"random_state": ["random_state"],
Expand Down Expand Up @@ -653,15 +653,10 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree):
- If float, then `max_features` is a fraction and
`max(1, int(max_features * n_features_in_))` features are considered at
each split.
- If "auto", then `max_features=sqrt(n_features)`.
- If "sqrt", then `max_features=sqrt(n_features)`.
- If "log2", then `max_features=log2(n_features)`.
- If None, then `max_features=n_features`.

.. deprecated:: 1.1
The `"auto"` option was deprecated in 1.1 and will be removed
in 1.3.

Note: the search for a split does not stop until at least one
valid partition of the node samples is found, even if it requires to
effectively inspect more than ``max_features`` features.
Expand Down Expand Up @@ -1047,15 +1042,10 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree):
- If float, then `max_features` is a fraction and
`max(1, int(max_features * n_features_in_))` features are considered at each
split.
- If "auto", then `max_features=n_features`.
- If "sqrt", then `max_features=sqrt(n_features)`.
- If "log2", then `max_features=log2(n_features)`.
- If None, then `max_features=n_features`.

.. deprecated:: 1.1
The `"auto"` option was deprecated in 1.1 and will be removed
in 1.3.

Note: the search for a split does not stop until at least one
valid partition of the node samples is found, even if it requires to
effectively inspect more than ``max_features`` features.
Expand Down Expand Up @@ -1350,18 +1340,13 @@ class ExtraTreeClassifier(DecisionTreeClassifier):
- If float, then `max_features` is a fraction and
`max(1, int(max_features * n_features_in_))` features are considered at
each split.
- If "auto", then `max_features=sqrt(n_features)`.
- If "sqrt", then `max_features=sqrt(n_features)`.
- If "log2", then `max_features=log2(n_features)`.
- If None, then `max_features=n_features`.

.. versionchanged:: 1.1
The default of `max_features` changed from `"auto"` to `"sqrt"`.

.. deprecated:: 1.1
The `"auto"` option was deprecated in 1.1 and will be removed
in 1.3.

Note: the search for a split does not stop until at least one
valid partition of the node samples is found, even if it requires to
effectively inspect more than ``max_features`` features.
Expand Down Expand Up @@ -1620,18 +1605,13 @@ class ExtraTreeRegressor(DecisionTreeRegressor):
- If float, then `max_features` is a fraction and
`max(1, int(max_features * n_features_in_))` features are considered at each
split.
- If "auto", then `max_features=n_features`.
- If "sqrt", then `max_features=sqrt(n_features)`.
- If "log2", then `max_features=log2(n_features)`.
- If None, then `max_features=n_features`.

.. versionchanged:: 1.1
The default of `max_features` changed from `"auto"` to `1.0`.

.. deprecated:: 1.1
The `"auto"` option was deprecated in 1.1 and will be removed
in 1.3.

Note: the search for a split does not stop until at least one
valid partition of the node samples is found, even if it requires to
effectively inspect more than ``max_features`` features.
Expand Down
33 changes: 0 additions & 33 deletions sklearn/tree/tests/test_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -503,20 +503,8 @@ def test_importances_gini_equal_squared_error():
assert_array_equal(clf.tree_.n_node_samples, reg.tree_.n_node_samples)


# TODO(1.3): Remove warning filter
@pytest.mark.filterwarnings("ignore:`max_features='auto'` has been deprecated in 1.1")
def test_max_features():
# Check max_features.
for name, TreeRegressor in REG_TREES.items():
reg = TreeRegressor(max_features="auto")
reg.fit(diabetes.data, diabetes.target)
assert reg.max_features_ == diabetes.data.shape[1]

for name, TreeClassifier in CLF_TREES.items():
clf = TreeClassifier(max_features="auto")
clf.fit(iris.data, iris.target)
assert clf.max_features_ == 2

for name, TreeEstimator in ALL_TREES.items():
est = TreeEstimator(max_features="sqrt")
est.fit(iris.data, iris.target)
Expand Down Expand Up @@ -2369,27 +2357,6 @@ def test_check_node_ndarray():
_check_node_ndarray(problematic_node_ndarray, expected_dtype=expected_dtype)


# TODO(1.3): Remove
def test_max_features_auto_deprecated():
for Tree in CLF_TREES.values():
tree = Tree(max_features="auto")
msg = (
"`max_features='auto'` has been deprecated in 1.1 and will be removed in"
" 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'`."
)
with pytest.warns(FutureWarning, match=msg):
tree.fit(X, y)

for Tree in REG_TREES.values():
tree = Tree(max_features="auto")
msg = (
"`max_features='auto'` has been deprecated in 1.1 and will be removed in"
" 1.3. To keep the past behaviour, explicitly set `max_features=1.0'`."
)
with pytest.warns(FutureWarning, match=msg):
tree.fit(X, y)


@pytest.mark.parametrize(
"Splitter", chain(DENSE_SPLITTERS.values(), SPARSE_SPLITTERS.values())
)
Expand Down