ENH Adds Poisson criterion in RandomForestRegressor #19304 #19836

Merged 30 commits on Jun 6, 2021

Commits
- e380b5b  ENH adding handling for Poisson criterion in RandomForestRegressor #1… (bsun94, Apr 7, 2021)
- a4f7c3e  reformatted message raised by ValueError for the Poisson criterion as… (bsun94, Apr 10, 2021)
- 705a587  ENH updated formatting for #19304 (bsun94, Apr 10, 2021)
- 3b7b2f2  Resolved merge conflict with main for #19304 (bsun94, Apr 10, 2021)
- f20f904  slight adjustment made to docstring #19304 (bsun94, Apr 11, 2021)
- 168e8d3  more formatting changes made #19304 (bsun94, Apr 11, 2021)
- fd242b7  troubleshooting for CI tests for #19304 (bsun94, Apr 11, 2021)
- 44ab402  Removed blank line as per final review (bsun94, Apr 13, 2021)
- 02781e8  ENH added new test for verifying functioning of poisson criterion #19304 (bsun94, Apr 17, 2021)
- 3488d86  Merged with upstream to resolve merge conflicts (bsun94, Apr 17, 2021)
- 909e339  Slight typo fix #19304 (bsun94, Apr 17, 2021)
- 4b75001  split test cases apart for #19304 (bsun94, Apr 18, 2021)
- 326e379  blank line removed for #19304 (bsun94, Apr 24, 2021)
- 758d731  Refreshing branch with upstream to troubleshoot CI #19304 (bsun94, May 8, 2021)
- 211ffa9  Refreshing local repo (bsun94, May 27, 2021)
- b12dff1  ENH Updates made to test_poisson_vs_mse for #19304 (bsun94, May 27, 2021)
- caa3408  ENH typo fixed for #19304 (bsun94, May 27, 2021)
- 6270670  ENH updates made to clipping in test_poisson_vs_mse for #19304 (bsun94, May 31, 2021)
- 90f8ff9  TST Changes assert_raises to raises in sklearn/utils/test_estimator_c… (azihna, May 27, 2021)
- 53b07b1  DOC Update minimal versions for dependencies (#20143) (cmarmo, May 27, 2021)
- f5abaa4  MAINT silence spurious mypy error (#20147) (ogrisel, May 27, 2021)
- ad62527  Add missing link to user guide in PolynomialFeatures API documentatio… (naozin555, May 27, 2021)
- 4018fc8  ENH Allowing sparse inputs for prediction in AffinityPropagation (#20… (venkyyuvy, May 27, 2021)
- e026cdf  TST Fixes test and mis-matched pandas version (thomasjpfan, May 27, 2021)
- 3f511e8  CI Try numpy 1.14.6 (thomasjpfan, May 28, 2021)
- 5087036  DOC Upate to whatsnew to 1.14.6 (thomasjpfan, May 28, 2021)
- 659b382  Merge remote-tracking branch 'upstream/main' into pr/19836 (thomasjpfan, Jun 5, 2021)
- 70df2ec  DOC Fix up docstring (thomasjpfan, Jun 5, 2021)
- 35de51d  DOC Small fixes to docstring (thomasjpfan, Jun 5, 2021)
- cd66938  DOC Add period (thomasjpfan, Jun 6, 2021)

4 changes: 4 additions & 0 deletions doc/whats_new/v1.0.rst

@@ -270,6 +270,10 @@ Changelog
:class:`ensemble.StackingClassifier` and :class:`ensemble.StackingRegressor`.
:pr:`19564` by `Thomas Fan`_.

- |Enhancement| Documented and tested support of the Poisson criterion for
:class:`ensemble.RandomForestRegressor`. :pr:`19836` by
:user:`Brian Sun <bsun94>`.

- |Fix| Fixed the range of the argument max_samples to be (0.0, 1.0]
in :class:`ensemble.RandomForestClassifier`,
:class:`ensemble.RandomForestRegressor`, where `max_samples=1.0` is

18 changes: 15 additions & 3 deletions sklearn/ensemble/_forest.py

@@ -323,6 +323,14 @@ def fit(self, X, y, sample_weight=None):
# [:, np.newaxis] that does not.
y = np.reshape(y, (-1, 1))

if self.criterion == "poisson":
if np.any(y < 0):
raise ValueError("Some value(s) of y are negative which is "
"not allowed for Poisson regression.")
if np.sum(y) <= 0:
raise ValueError("Sum of y is not strictly positive which "
"is necessary for Poisson regression.")

self.n_outputs_ = y.shape[1]

y, expanded_class_weight = self._validate_y_class_weight(y)
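
A note on the two checks above, sketched from the half Poisson deviance that the
tree criterion minimizes (see the decision-tree user guide):

.. math::

   H(Q_m) = \frac{1}{n_m} \sum_{i \in Q_m}
            \left( y_i \log\frac{y_i}{\bar{y}_m} - y_i + \bar{y}_m \right)

A negative :math:`y_i` makes the :math:`y_i \log(y_i / \bar{y}_m)` term
undefined, and a non-positive :math:`\sum_i y_i` yields a root prediction
:math:`\bar{y}_m \le 0`, for which the deviance is infinite; hence the two
ValueErrors raised in ``fit``.
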
@@ -1324,16 +1332,20 @@ class RandomForestRegressor(ForestRegressor):
The default value of ``n_estimators`` changed from 10 to 100
in 0.22.

criterion : {"squared_error", "mse", "absolute_error", "mae"}, \
criterion : {"squared_error", "mse", "absolute_error", "poisson"}, \
default="squared_error"
The function to measure the quality of a split. Supported criteria
are "squared_error" for the mean squared error, which is equal to
variance reduction as feature selection criterion, and "absolute_error"
for the mean absolute error.
variance reduction as feature selection criterion, "absolute_error"
for the mean absolute error, and "poisson" which uses reduction in
Poisson deviance to find splits.

.. versionadded:: 0.18
Mean Absolute Error (MAE) criterion.

.. versionadded:: 1.0
Poisson criterion.

.. deprecated:: 1.0
Criterion "mse" was deprecated in v1.0 and will be removed in
version 1.2. Use `criterion="squared_error"` which is equivalent.
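
A minimal usage sketch of the new criterion (illustrative only, not part of the
diff; the data below is made up):

import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.RandomState(0)
X = rng.uniform(size=(200, 4))
# Non-negative, count-valued target from a log-linear Poisson model.
y = rng.poisson(lam=np.exp(X @ np.array([0.5, -0.5, 1.0, 0.0])))

reg = RandomForestRegressor(criterion="poisson", min_samples_leaf=10,
                            random_state=0).fit(X, y)
print(reg.predict(X[:3]))  # predictions are averaged leaf means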

89 changes: 89 additions & 0 deletions sklearn/ensemble/tests/test_forest.py

@@ -27,6 +27,8 @@
import joblib
from numpy.testing import assert_allclose

from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_poisson_deviance
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_array_equal
@@ -185,6 +187,76 @@ def test_regression(name, criterion):
check_regression_criterion(name, criterion)


def test_poisson_vs_mse():
"""Test that random forest with poisson criterion performs better than
mse for a poisson target."""
rng = np.random.RandomState(42)
n_train, n_test, n_features = 500, 500, 10
X = datasets.make_low_rank_matrix(n_samples=n_train + n_test,
n_features=n_features, random_state=rng)
X = np.abs(X)
X /= np.max(np.abs(X), axis=0)
# We create a log-linear Poisson model
coef = rng.uniform(low=-4, high=1, size=n_features)
y = rng.poisson(lam=np.exp(X @ coef))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test,
random_state=rng)

forest_poi = RandomForestRegressor(
criterion="poisson",
min_samples_leaf=10,
max_features="sqrt",
random_state=rng)
forest_mse = RandomForestRegressor(
criterion="squared_error",
min_samples_leaf=10,
max_features="sqrt",
random_state=rng)

forest_poi.fit(X_train, y_train)
forest_mse.fit(X_train, y_train)
dummy = DummyRegressor(strategy="mean").fit(X_train, y_train)

for X, y, val in [(X_train, y_train, "train"), (X_test, y_test, "test")]:
metric_poi = mean_poisson_deviance(y, forest_poi.predict(X))
# squared_error forest might produce non-positive predictions => clip
# If y = 0 for those, the poisson deviance gets too good.
# If we drew more samples, we would eventually get y > 0 and the
# poisson deviance would explode, i.e. be undefined. Therefore, we do
# not clip to a tiny value like 1e-15, but to 1e-6. This acts like a
# mild penalty to the non-positive predictions.
metric_mse = mean_poisson_deviance(
y,
np.clip(forest_mse.predict(X), 1e-6, None))
metric_dummy = mean_poisson_deviance(y, dummy.predict(X))
# As squared_error might correctly predict 0 in train set, its train
# score can be better than Poisson. This is no longer the case for the
# test set. But keep the above comment for clipping in mind.
if val == "test":
assert metric_poi < metric_mse
assert metric_poi < metric_dummy
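
To make the clipping comment concrete, a small illustration (with made-up
numbers, not part of the test) of how the deviance reacts to the clipping
floor when a prediction is non-positive:

import numpy as np
from sklearn.metrics import mean_poisson_deviance

y_true = np.array([3.0, 1.0, 2.0])
raw_pred = np.array([0.0, 1.0, 2.0])  # first prediction is non-positive
for floor in (1e-1, 1e-6, 1e-15):
    dev = mean_poisson_deviance(y_true, np.clip(raw_pred, floor, None))
    print(f"clip at {floor:g}: mean Poisson deviance = {dev:.3g}")
# The y * log(y / y_pred) term grows without bound as y_pred -> 0 while
# y > 0, so the clipping floor directly sets the size of the penalty.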


@pytest.mark.parametrize('criterion', ('poisson', 'squared_error'))
def test_balance_property_random_forest(criterion):
""""Test that sum(y_pred)==sum(y_true) on the training set."""
rng = np.random.RandomState(42)
n_train, n_test, n_features = 500, 500, 10
X = datasets.make_low_rank_matrix(n_samples=n_train + n_test,
n_features=n_features, random_state=rng)

coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0)
y = rng.poisson(lam=np.exp(X @ coef))

reg = RandomForestRegressor(criterion=criterion,
n_estimators=10,
bootstrap=False,
random_state=rng)
reg.fit(X, y)

assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y))
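
A sketch of why this identity should hold (assuming unweighted samples;
``bootstrap=False`` means every tree sees the full training set): with either
criterion a leaf :math:`L_k` predicts the arithmetic mean of its training
targets, so per tree

.. math::

   \sum_i \hat{y}_i = \sum_k \lvert L_k \rvert \cdot
   \frac{1}{\lvert L_k \rvert} \sum_{i \in L_k} y_i = \sum_i y_i

and the forest prediction, an average over trees, inherits the identity.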


def check_regressor_attributes(name):
# Regression models should not have a classes_ attribute.
r = FOREST_REGRESSORS[name](random_state=0)
@@ -1367,6 +1439,23 @@ def test_min_impurity_decrease():
assert tree.min_impurity_decrease == 0.1


def test_poisson_y_positive_check():
est = RandomForestRegressor(criterion="poisson")
X = np.zeros((3, 3))

y = [-1, 1, 3]
err_msg = (r"Some value\(s\) of y are negative which is "
r"not allowed for Poisson regression.")
with pytest.raises(ValueError, match=err_msg):
est.fit(X, y)

y = [0, 0, 0]
err_msg = (r"Sum of y is not strictly positive which "
r"is necessary for Poisson regression.")
with pytest.raises(ValueError, match=err_msg):
est.fit(X, y)


# mypy error: Variable "DEFAULT_JOBLIB_BACKEND" is not valid type
class MyBackend(DEFAULT_JOBLIB_BACKEND): # type: ignore
def __init__(self, *args, **kwargs):