From 3d350d080a843da425f11bd7d8c8a8c6ff073af3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 5 Dec 2019 15:35:32 +0100 Subject: [PATCH 1/3] bumpversion 0.7.0.dev0 --- doc/whats_new.rst | 2 ++ imblearn/_version.py | 2 +- setup.cfg | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 0461d6908..2d9e036a0 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -4,6 +4,8 @@ Release history =============== +.. include:: whats_new/v0.7.rst + .. include:: whats_new/v0.6.rst .. include:: whats_new/v0.5.rst diff --git a/imblearn/_version.py b/imblearn/_version.py index 2aca88897..76055aad3 100644 --- a/imblearn/_version.py +++ b/imblearn/_version.py @@ -22,4 +22,4 @@ # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = "0.6.0" +__version__ = "0.7.0.dev0" diff --git a/setup.cfg b/setup.cfg index d4fe170e2..c2bce0b86 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.6.0 +current_version = 0.7.0.dev0 tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? serialize = From 0ebce0cb4d692eee56ed438338585938dcfa1a5e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 6 Dec 2019 16:55:52 +0100 Subject: [PATCH 2/3] FIX max_samples was computed on X instead of X_resampled --- doc/whats_new/v0.6.rst | 21 +++++++++++++++++++++ imblearn/ensemble/_forest.py | 5 +++++ imblearn/ensemble/tests/test_forest.py | 8 +++++--- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v0.6.rst b/doc/whats_new/v0.6.rst index 6c6e03b5f..852b71b28 100644 --- a/doc/whats_new/v0.6.rst +++ b/doc/whats_new/v0.6.rst @@ -1,3 +1,24 @@ +.. _changes_0_6_1: + +Version 0.6.1 +============== + +**In Development** + +This is a bug-fix release to primarily resolve some packaging issues in version +0.6.0. It also includes minor documentation improvements and some bug fixes. + +Changelog +--------- + +Bug fixes +......... 
+ +- Fix a bug in :class:`imblearn.ensemble.BalancedRandomForestClassifier` + leading to a wrong number of samples used during fitting due to `max_samples` + and therefore a bad computation of the OOB score. + :pr:`656` by :user:`Guillaume Lemaitre `. + .. _changes_0_6: Version 0.6.0 diff --git a/imblearn/ensemble/_forest.py b/imblearn/ensemble/_forest.py index ba79b0105..f9417c649 100644 --- a/imblearn/ensemble/_forest.py +++ b/imblearn/ensemble/_forest.py @@ -53,6 +53,8 @@ def _local_parallel_build_trees( X_resampled, y_resampled = sampler.fit_resample(X, y) if sample_weight is not None: sample_weight = _safe_indexing(sample_weight, sampler.sample_indices_) + if _get_n_samples_bootstrap is not None: + n_samples_bootstrap = min(n_samples_bootstrap, X_resampled.shape[0]) tree = _parallel_build_trees( tree, forest, @@ -214,6 +216,9 @@ class BalancedRandomForestClassifier(RandomForestClassifier): - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. Thus, `max_samples` should be in the interval `(0, 1)`. + Be aware that the final number of samples used will be the minimum between + the number of samples given in `max_samples` and the number of samples + obtained after resampling. .. versionadded:: 0.22 Added in `scikit-learn` in 0.22 diff --git a/imblearn/ensemble/tests/test_forest.py b/imblearn/ensemble/tests/test_forest.py index 533cec425..3451be9b6 100644 --- a/imblearn/ensemble/tests/test_forest.py +++ b/imblearn/ensemble/tests/test_forest.py @@ -182,14 +182,16 @@ def test_balanced_random_forest_pruning(imbalanced_dataset): assert n_nodes_no_pruning > n_nodes_pruning -def test_balanced_random_forest_oob_binomial(): +@pytest.mark.parametrize("ratio", [0.5, 0.1]) +@pytest.mark.filterwarnings("ignore:Some inputs do not have OOB scores") +def test_balanced_random_forest_oob_binomial(ratio): # Regression test for #655: check that the oob score is closed to 0.5 # a binomial experiment. 
rng = np.random.RandomState(42) n_samples = 1000 X = np.arange(n_samples).reshape(-1, 1) - y = rng.binomial(1, 0.5, size=n_samples) + y = rng.binomial(1, ratio, size=n_samples) erf = BalancedRandomForestClassifier(oob_score=True, random_state=42) erf.fit(X, y) - assert np.abs(erf.oob_score_ - 0.5) < 0.05 + assert np.abs(erf.oob_score_ - 0.5) < 0.1 From a4e54529994f43465ce0df5f9dec1ccceff80528 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 6 Dec 2019 17:10:36 +0100 Subject: [PATCH 3/3] add more trees for a better estimate --- imblearn/ensemble/tests/test_forest.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/imblearn/ensemble/tests/test_forest.py b/imblearn/ensemble/tests/test_forest.py index 3451be9b6..944527fc1 100644 --- a/imblearn/ensemble/tests/test_forest.py +++ b/imblearn/ensemble/tests/test_forest.py @@ -115,7 +115,9 @@ def test_balanced_random_forest_oob(imbalanced_dataset): X_train, X_test, y_train, y_test = train_test_split( X, y, random_state=42, stratify=y ) - est = BalancedRandomForestClassifier(oob_score=True, random_state=0) + est = BalancedRandomForestClassifier( + oob_score=True, random_state=0, n_estimators=1000 + ) est.fit(X_train, y_train) test_score = est.score(X_test, y_test)