diff --git a/README.rst b/README.rst index dee6419cb..900d8dc67 100644 --- a/README.rst +++ b/README.rst @@ -30,7 +30,7 @@ .. |PythonMinVersion| replace:: 3.8 .. |NumPyMinVersion| replace:: 1.17.3 .. |SciPyMinVersion| replace:: 1.3.2 -.. |ScikitLearnMinVersion| replace:: 1.1.0 +.. |ScikitLearnMinVersion| replace:: 1.1.3 .. |MatplotlibMinVersion| replace:: 3.1.2 .. |PandasMinVersion| replace:: 1.0.5 .. |TensorflowMinVersion| replace:: 2.4.3 diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4605d2ced..e733ed93f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -51,7 +51,7 @@ jobs: black --check --diff . displayName: Run black - bash: | - ./build_tools/circle/linting.sh + ./build_tools/azure/linting.sh displayName: Run linting - bash: | mypy imblearn/ @@ -112,7 +112,7 @@ jobs: ne(variables['Build.Reason'], 'Schedule') ) matrix: - py37_conda_forge_openblas_ubuntu_1804: + py38_conda_forge_openblas_ubuntu_1804: DISTRIB: 'conda' CONDA_CHANNEL: 'conda-forge' PYTHON_VERSION: '3.8' @@ -141,12 +141,12 @@ jobs: THREADPOOLCTL_VERSION: 'min' COVERAGE: 'false' # Linux + Python 3.8 build with OpenBLAS and without SITE_JOBLIB - py37_conda_defaults_openblas: + py38_conda_defaults_openblas: DISTRIB: 'conda' CONDA_CHANNEL: 'conda-forge' PYTHON_VERSION: '3.8' BLAS: 'openblas' - NUMPY_VERSION: '1.19.5' # we cannot get an older version of the dependencies resolution + NUMPY_VERSION: '1.21.0' # we cannot get an older version of the dependencies resolution SCIPY_VERSION: 'min' SKLEARN_VERSION: 'min' MATPLOTLIB_VERSION: 'none' @@ -275,6 +275,3 @@ jobs: PYTHON_ARCH: '64' PYTEST_VERSION: '*' COVERAGE: 'true' - py38_pip_openblas_32bit: - PYTHON_VERSION: '3.8' - PYTHON_ARCH: '32' diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 932c5b150..250d56dea 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -67,7 +67,8 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then make_conda "python=$PYTHON_VERSION" python -m pip install -U pip - python -m pip install scikit-learn pandas matplotlib + python -m pip install pandas matplotlib + python -m pip install --pre scikit-learn elif [[ "$DISTRIB" == "conda-pip-latest-tensorflow" ]]; then make_conda "python=$PYTHON_VERSION" diff --git a/build_tools/azure/linting.sh b/build_tools/azure/linting.sh new file mode 100755 index 000000000..21ef53c80 --- /dev/null +++ b/build_tools/azure/linting.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +set -e +# pipefail is necessary to propagate exit codes +set -o pipefail + +flake8 --show-source . +echo -e "No problem detected by flake8\n" + +# For docstrings and warnings of deprecated attributes to be rendered +# properly, the property decorator must come before the deprecated decorator +# (else they are treated as functions) + +# do not error when grep -B1 "@property" finds nothing +set +e +bad_deprecation_property_order=`git grep -A 10 "@property" -- "*.py" | awk '/@property/,/def /' | grep -B1 "@deprecated"` + +if [ ! -z "$bad_deprecation_property_order" ] +then + echo "property decorator should come before deprecated decorator" + echo "found the following occurrences:" + echo $bad_deprecation_property_order + exit 1 +fi + +# Check for default doctest directives ELLIPSIS and NORMALIZE_WHITESPACE + +doctest_directive="$(git grep -nw -E "# doctest\: \+(ELLIPSIS|NORMALIZE_WHITESPACE)")" + +if [ ! 
-z "$doctest_directive" ] +then + echo "ELLIPSIS and NORMALIZE_WHITESPACE doctest directives are enabled by default, but were found in:" + echo "$doctest_directive" + exit 1 +fi + +joblib_import="$(git grep -l -A 10 -E "joblib import.+delayed" -- "*.py" ":!sklearn/utils/_joblib.py" ":!sklearn/utils/fixes.py")" + +if [ ! -z "$joblib_import" ]; then + echo "Use from sklearn.utils.fixes import delayed instead of joblib delayed. The following files contain imports to joblib.delayed:" + echo "$joblib_import" + exit 1 +fi diff --git a/doc/ensemble.rst b/doc/ensemble.rst index dabaee4a7..21d6a6e0c 100644 --- a/doc/ensemble.rst +++ b/doc/ensemble.rst @@ -35,10 +35,10 @@ data set, this classifier will favor the majority classes:: >>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) >>> bc = BaggingClassifier(base_estimator=DecisionTreeClassifier(), ... random_state=0) - >>> bc.fit(X_train, y_train) #doctest: +ELLIPSIS + >>> bc.fit(X_train, y_train) #doctest: BaggingClassifier(...) >>> y_pred = bc.predict(X_test) - >>> balanced_accuracy_score(y_test, y_pred) # doctest: +ELLIPSIS + >>> balanced_accuracy_score(y_test, y_pred) # doctest: 0.77... In :class:`BalancedBaggingClassifier`, each bootstrap sample will be further @@ -54,10 +54,10 @@ sampling is controlled by the parameter `sampler` or the two parameters ... sampling_strategy='auto', ... replacement=False, ... random_state=0) - >>> bbc.fit(X_train, y_train) # doctest: +ELLIPSIS + >>> bbc.fit(X_train, y_train) # doctest: BalancedBaggingClassifier(...) >>> y_pred = bbc.predict(X_test) - >>> balanced_accuracy_score(y_test, y_pred) # doctest: +ELLIPSIS + >>> balanced_accuracy_score(y_test, y_pred) # doctest: 0.8... Changing the `sampler` will give rise to different known implementation @@ -78,10 +78,10 @@ each tree of the forest will be provided a balanced bootstrap sample >>> from imblearn.ensemble import BalancedRandomForestClassifier >>> brf = BalancedRandomForestClassifier(n_estimators=100, random_state=0) - >>> brf.fit(X_train, y_train) # doctest: +ELLIPSIS + >>> brf.fit(X_train, y_train) # doctest: BalancedRandomForestClassifier(...) >>> y_pred = brf.predict(X_test) - >>> balanced_accuracy_score(y_test, y_pred) # doctest: +ELLIPSIS + >>> balanced_accuracy_score(y_test, y_pred) # doctest: 0.8... .. _boosting: @@ -97,10 +97,10 @@ a boosting iteration :cite:`seiffert2009rusboost`:: >>> from imblearn.ensemble import RUSBoostClassifier >>> rusboost = RUSBoostClassifier(n_estimators=200, algorithm='SAMME.R', ... random_state=0) - >>> rusboost.fit(X_train, y_train) # doctest: +ELLIPSIS + >>> rusboost.fit(X_train, y_train) # doctest: RUSBoostClassifier(...) >>> y_pred = rusboost.predict(X_test) - >>> balanced_accuracy_score(y_test, y_pred) # doctest: +ELLIPSIS + >>> balanced_accuracy_score(y_test, y_pred) # doctest: 0... A specific method which uses :class:`~sklearn.ensemble.AdaBoostClassifier` as @@ -111,10 +111,10 @@ the :class:`BalancedBaggingClassifier` API, one can construct the ensemble as:: >>> from imblearn.ensemble import EasyEnsembleClassifier >>> eec = EasyEnsembleClassifier(random_state=0) - >>> eec.fit(X_train, y_train) # doctest: +ELLIPSIS + >>> eec.fit(X_train, y_train) # doctest: EasyEnsembleClassifier(...) >>> y_pred = eec.predict(X_test) - >>> balanced_accuracy_score(y_test, y_pred) # doctest: +ELLIPSIS + >>> balanced_accuracy_score(y_test, y_pred) # doctest: 0.6... .. 
topic:: Examples diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst index 1c2b6a3fe..63cdc0dfa 100644 --- a/doc/over_sampling.rst +++ b/doc/over_sampling.rst @@ -40,7 +40,7 @@ a classifier:: >>> from sklearn.svm import LinearSVC >>> clf = LinearSVC() - >>> clf.fit(X_resampled, y_resampled) # doctest : +ELLIPSIS + >>> clf.fit(X_resampled, y_resampled) LinearSVC(...) In the figure below, we compare the decision functions of a classifier trained diff --git a/imblearn/_min_dependencies.py b/imblearn/_min_dependencies.py index a9e0d41eb..72976f2b1 100644 --- a/imblearn/_min_dependencies.py +++ b/imblearn/_min_dependencies.py @@ -4,10 +4,10 @@ NUMPY_MIN_VERSION = "1.17.3" SCIPY_MIN_VERSION = "1.3.2" PANDAS_MIN_VERSION = "1.0.5" -SKLEARN_MIN_VERSION = "1.1.0" +SKLEARN_MIN_VERSION = "1.1.3" TENSORFLOW_MIN_VERSION = "2.4.3" KERAS_MIN_VERSION = "2.4.3" -JOBLIB_MIN_VERSION = "1.0.0" +JOBLIB_MIN_VERSION = "1.1.1" THREADPOOLCTL_MIN_VERSION = "2.0.0" PYTEST_MIN_VERSION = "5.0.1" diff --git a/imblearn/combine/_smote_enn.py b/imblearn/combine/_smote_enn.py index 4d2e411e6..df84b07c9 100644 --- a/imblearn/combine/_smote_enn.py +++ b/imblearn/combine/_smote_enn.py @@ -91,7 +91,7 @@ class SMOTEENN(BaseSampler): >>> from collections import Counter >>> from sklearn.datasets import make_classification - >>> from imblearn.combine import SMOTEENN # doctest: +NORMALIZE_WHITESPACE + >>> from imblearn.combine import SMOTEENN # doctest: >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) diff --git a/imblearn/combine/_smote_tomek.py b/imblearn/combine/_smote_tomek.py index c0c9b6e80..ad2303ff0 100644 --- a/imblearn/combine/_smote_tomek.py +++ b/imblearn/combine/_smote_tomek.py @@ -90,7 +90,7 @@ class SMOTETomek(BaseSampler): >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.combine import \ -SMOTETomek # doctest: +NORMALIZE_WHITESPACE +SMOTETomek # doctest: >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) diff --git a/imblearn/ensemble/_bagging.py b/imblearn/ensemble/_bagging.py index b35e5e570..9afc4fc1d 100644 --- a/imblearn/ensemble/_bagging.py +++ b/imblearn/ensemble/_bagging.py @@ -4,7 +4,9 @@ # Christos Aridas # License: MIT +import inspect import numbers +import warnings import numpy as np @@ -41,10 +43,12 @@ class BalancedBaggingClassifier(BaggingClassifier): Parameters ---------- - base_estimator : estimator object, default=None + estimator : estimator object, default=None The base estimator to fit on random subsets of the dataset. If None, then the base estimator is a decision tree. + .. versionadded:: 0.10 + n_estimators : int, default=10 The number of base estimators in the ensemble. @@ -100,18 +104,37 @@ class BalancedBaggingClassifier(BaggingClassifier): .. versionadded:: 0.8 + base_estimator : estimator object, default=None + The base estimator to fit on random subsets of the dataset. + If None, then the base estimator is a decision tree. + + .. deprecated:: 0.10 + `base_estimator` was renamed to `estimator` in version 0.10 and + will be removed in 0.12. + Attributes ---------- + estimator_ : estimator + The base estimator from which the ensemble is grown. + + .. 
versionadded:: 0.10 + base_estimator_ : estimator The base estimator from which the ensemble is grown. + .. deprecated:: 1.2 + `base_estimator_` is deprecated in `scikit-learn` 1.2 and will be + removed in 1.4. Use `estimator_` instead. When the minimum version + of `scikit-learn` supported by `imbalanced-learn` will reach 1.4, + this attribute will be removed. + n_features_ : int The number of features when `fit` is performed. .. deprecated:: 1.0 `n_features_` is deprecated in `scikit-learn` 1.0 and will be removed - in version 1.2. Depending of the version of `scikit-learn` installed, - you will get be warned or not. + in version 1.2. When the minimum version of `scikit-learn` supported + by `imbalanced-learn` will reach 1.2, this attribute will be removed. estimators_ : list of estimators The collection of fitted base estimators. @@ -209,7 +232,7 @@ class BalancedBaggingClassifier(BaggingClassifier): >>> from sklearn.model_selection import train_test_split >>> from sklearn.metrics import confusion_matrix >>> from imblearn.ensemble import \ -BalancedBaggingClassifier # doctest: +NORMALIZE_WHITESPACE +BalancedBaggingClassifier # doctest: >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) @@ -218,7 +241,7 @@ class BalancedBaggingClassifier(BaggingClassifier): >>> X_train, X_test, y_train, y_test = train_test_split(X, y, ... random_state=0) >>> bbc = BalancedBaggingClassifier(random_state=42) - >>> bbc.fit(X_train, y_train) # doctest: +ELLIPSIS + >>> bbc.fit(X_train, y_train) # doctest: BalancedBaggingClassifier(...) >>> y_pred = bbc.predict(X_test) >>> print(confusion_matrix(y_test, y_pred)) @@ -229,7 +252,7 @@ class BalancedBaggingClassifier(BaggingClassifier): @_deprecate_positional_args def __init__( self, - base_estimator=None, + estimator=None, n_estimators=10, *, max_samples=1.0, @@ -244,10 +267,18 @@ def __init__( random_state=None, verbose=0, sampler=None, + base_estimator="deprecated", ): + # TODO: remove when supporting scikit-learn>=1.2 + bagging_classifier_signature = inspect.signature(super().__init__) + estimator_params = {"base_estimator": base_estimator} + if "estimator" in bagging_classifier_signature.parameters: + estimator_params["estimator"] = estimator + else: + self.estimator = estimator super().__init__( - base_estimator, + **estimator_params, n_estimators=n_estimators, max_samples=max_samples, max_features=max_features, @@ -294,7 +325,21 @@ def _validate_estimator(self, default=DecisionTreeClassifier()): f"n_estimators must be greater than zero, " f"got {self.n_estimators}." ) - if self.base_estimator is not None: + if self.estimator is not None and ( + self.base_estimator not in [None, "deprecated"] + ): + raise ValueError( + "Both `estimator` and `base_estimator` were set. Only set `estimator`." 
+            ) + +        if self.estimator is not None: +            base_estimator = clone(self.estimator) +        elif self.base_estimator not in [None, "deprecated"]: +            warnings.warn( +                "`base_estimator` was renamed to `estimator` in version 0.10 and " +                "will be removed in 0.12.", +                FutureWarning, +            ) base_estimator = clone(self.base_estimator) else: base_estimator = clone(default) @@ -302,12 +347,32 @@ def __init__( if self.sampler_._sampling_type != "bypass": self.sampler_.set_params(sampling_strategy=self._sampling_strategy) -        self.base_estimator_ = Pipeline( -            [ -                ("sampler", self.sampler_), -                ("classifier", base_estimator), -            ] +        self._estimator = Pipeline( +            [("sampler", self.sampler_), ("classifier", base_estimator)] +        ) +        try: +            # scikit-learn < 1.2 +            self.base_estimator_ = self._estimator +        except AttributeError: +            pass + +    # TODO: remove when supporting scikit-learn>=1.4 +    @property +    def estimator_(self): +        """Estimator used to grow the ensemble.""" +        return self._estimator + +    # TODO: remove when supporting scikit-learn>=1.2 +    @property +    def n_features_(self): +        """Number of features when ``fit`` is performed.""" +        warnings.warn( +            "`n_features_` was deprecated in scikit-learn 1.0. This attribute will " +            "not be accessible when the minimum supported version of scikit-learn " +            "is 1.2.", +            FutureWarning, ) +        return self.n_features_in_ def fit(self, X, y): """Build a Bagging ensemble of estimators from the training set (X, y). diff --git a/imblearn/ensemble/_easy_ensemble.py b/imblearn/ensemble/_easy_ensemble.py index 40b9f3953..16030020d 100644 --- a/imblearn/ensemble/_easy_ensemble.py +++ b/imblearn/ensemble/_easy_ensemble.py @@ -4,7 +4,9 @@ # Christos Aridas # License: MIT +import inspect import numbers +import warnings import numpy as np @@ -44,10 +46,12 @@ class EasyEnsembleClassifier(BaggingClassifier): n_estimators : int, default=10 Number of AdaBoost learners in the ensemble. - base_estimator : estimator object, default=AdaBoostClassifier() + estimator : estimator object, default=AdaBoostClassifier() The base AdaBoost classifier used in the inner ensemble. Note that you can set the number of inner learner by passing your own instance. + .. versionadded:: 0.10 + warm_start : bool, default=False When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit @@ -65,11 +69,30 @@ verbose : int, default=0 Controls the verbosity of the building process. + base_estimator : estimator object, default=AdaBoostClassifier() + The base AdaBoost classifier used in the inner ensemble. Note that you + can set the number of inner learners by passing your own instance. + + .. deprecated:: 0.10 + `base_estimator` was renamed to `estimator` in version 0.10 and will + be removed in 0.12. + Attributes ---------- + estimator_ : estimator + The base estimator from which the ensemble is grown. + + .. versionadded:: 0.10 + base_estimator_ : estimator The base estimator from which the ensemble is grown. + .. deprecated:: 1.2 + `base_estimator_` is deprecated in `scikit-learn` 1.2 and will be + removed in 1.4. Use `estimator_` instead. When the minimum version + of `scikit-learn` supported by `imbalanced-learn` will reach 1.4, + this attribute will be removed. + estimators_ : list of estimators The collection of fitted base estimators. @@ -86,12 +109,12 @@ class EasyEnsembleClassifier(BaggingClassifier): The number of classes. 
n_features_ : int - The number of features when ``fit`` is performed. + The number of features when `fit` is performed. .. deprecated:: 1.0 `n_features_` is deprecated in `scikit-learn` 1.0 and will be removed - in version 1.2. Depending of the version of `scikit-learn` installed, - you will get be warned or not. + in version 1.2. When the minimum version of `scikit-learn` supported + by `imbalanced-learn` will reach 1.2, this attribute will be removed. n_features_in_ : int Number of features in the input dataset. @@ -135,7 +158,7 @@ class EasyEnsembleClassifier(BaggingClassifier): >>> from sklearn.model_selection import train_test_split >>> from sklearn.metrics import confusion_matrix >>> from imblearn.ensemble import \ -EasyEnsembleClassifier # doctest: +NORMALIZE_WHITESPACE +EasyEnsembleClassifier # doctest: >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) @@ -144,7 +167,7 @@ class EasyEnsembleClassifier(BaggingClassifier): >>> X_train, X_test, y_train, y_test = train_test_split(X, y, ... random_state=0) >>> eec = EasyEnsembleClassifier(random_state=42) - >>> eec.fit(X_train, y_train) # doctest: +ELLIPSIS + >>> eec.fit(X_train, y_train) # doctest: EasyEnsembleClassifier(...) >>> y_pred = eec.predict(X_test) >>> print(confusion_matrix(y_test, y_pred)) @@ -156,7 +179,7 @@ class EasyEnsembleClassifier(BaggingClassifier): def __init__( self, n_estimators=10, - base_estimator=None, + estimator=None, *, warm_start=False, sampling_strategy="auto", @@ -164,9 +187,18 @@ def __init__( n_jobs=None, random_state=None, verbose=0, + base_estimator="deprecated", ): + # TODO: remove when supporting scikit-learn>=1.2 + bagging_classifier_signature = inspect.signature(super().__init__) + estimator_params = {"base_estimator": base_estimator} + if "estimator" in bagging_classifier_signature.parameters: + estimator_params["estimator"] = estimator + else: + self.estimator = estimator + super().__init__( - base_estimator, + **estimator_params, n_estimators=n_estimators, max_samples=1.0, max_features=1.0, @@ -209,23 +241,55 @@ def _validate_estimator(self, default=AdaBoostClassifier()): f"n_estimators must be greater than zero, " f"got {self.n_estimators}." ) - if self.base_estimator is not None: + if self.estimator is not None and ( + self.base_estimator not in [None, "deprecated"] + ): + raise ValueError( + "Both `estimator` and `base_estimator` were set. Only set `estimator`." 
+ ) + + if self.estimator is not None: + base_estimator = clone(self.estimator) + elif self.base_estimator not in [None, "deprecated"]: + warnings.warn( + "`base_estimator` was renamed to `estimator` in version 0.10 and " + "will be removed in 0.12.", + FutureWarning, + ) base_estimator = clone(self.base_estimator) else: base_estimator = clone(default) - self.base_estimator_ = Pipeline( - [ - ( - "sampler", - RandomUnderSampler( - sampling_strategy=self._sampling_strategy, - replacement=self.replacement, - ), - ), - ("classifier", base_estimator), - ] + sampler = RandomUnderSampler( + sampling_strategy=self._sampling_strategy, + replacement=self.replacement, + ) + self._estimator = Pipeline( + [("sampler", sampler), ("classifier", base_estimator)] + ) + try: + self.base_estimator_ = self._estimator + except AttributeError: + # scikit-learn < 1.2 + pass + + # TODO: remove when supporting scikit-learn>=1.4 + @property + def estimator_(self): + """Estimator used to grow the ensemble.""" + return self._estimator + + # TODO: remove when supporting scikit-learn>=1.2 + @property + def n_features_(self): + """Number of features when ``fit`` is performed.""" + warnings.warn( + "`n_features_` was deprecated in scikit-learn 1.0. This attribute will " + "not be accessible when the minimum supported version of scikit-learn " + "is 1.2.", + FutureWarning, ) + return self.n_features_in_ def fit(self, X, y): """Build a Bagging ensemble of estimators from the training set (X, y). diff --git a/imblearn/ensemble/_forest.py b/imblearn/ensemble/_forest.py index 77c2de908..3af09c8f7 100644 --- a/imblearn/ensemble/_forest.py +++ b/imblearn/ensemble/_forest.py @@ -12,7 +12,7 @@ from numpy import float64 as DOUBLE from scipy.sparse import issparse -from joblib import Parallel, delayed +from joblib import Parallel from sklearn.base import clone, is_classifier from sklearn.ensemble import RandomForestClassifier @@ -24,6 +24,7 @@ from sklearn.tree import DecisionTreeClassifier from sklearn.utils import check_random_state from sklearn.utils import _safe_indexing +from sklearn.utils.fixes import delayed from sklearn.utils.multiclass import type_of_target from sklearn.utils.validation import _check_sample_weight @@ -230,10 +231,22 @@ class BalancedRandomForestClassifier(RandomForestClassifier): Attributes ---------- + estimator_ : :class:`~sklearn.tree.DecisionTreeClassifier` instance + The child estimator template used to create the collection of fitted + sub-estimators. + + .. versionadded:: 0.10 + base_estimator_ : :class:`~sklearn.tree.DecisionTreeClassifier` instance The child estimator template used to create the collection of fitted sub-estimators. + .. deprecated:: 1.2 + `base_estimator_` is deprecated in `scikit-learn` 1.2 and will be + removed in 1.4. Use `estimator_` instead. When the minimum version + of `scikit-learn` supported by `imbalanced-learn` will reach 1.4, + this attribute will be removed. + estimators_ : list of :class:`~sklearn.tree.DecisionTreeClassifier` The collection of fitted sub-estimators. @@ -255,12 +268,12 @@ class labels (multi-output problem). number of classes for each output (multi-output problem). n_features_ : int - The number of features when ``fit`` is performed. + The number of features when `fit` is performed. .. deprecated:: 1.0 `n_features_` is deprecated in `scikit-learn` 1.0 and will be removed - in version 1.2. Depending of the version of `scikit-learn` installed, - you will get be warned or not. + in version 1.2. 
When the minimum version of `scikit-learn` supported + by `imbalanced-learn` will reach 1.2, this attribute will be removed. n_features_in_ : int Number of features in the input dataset. @@ -314,9 +327,9 @@ class labels (multi-output problem). ... n_informative=4, weights=[0.2, 0.3, 0.5], ... random_state=0) >>> clf = BalancedRandomForestClassifier(max_depth=2, random_state=0) - >>> clf.fit(X, y) # doctest: +ELLIPSIS + >>> clf.fit(X, y) # doctest: BalancedRandomForestClassifier(...) - >>> print(clf.feature_importances_) # doctest: +ELLIPSIS + >>> print(clf.feature_importances_) # doctest: [...] >>> print(clf.predict([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])) @@ -385,10 +398,21 @@ def _validate_estimator(self, default=DecisionTreeClassifier()): f"n_estimators must be greater than zero, " f"got {self.n_estimators}." ) - if self.base_estimator is not None: - self.base_estimator_ = clone(self.base_estimator) + if hasattr(self, "estimator"): + base_estimator = self.estimator else: - self.base_estimator_ = clone(default) + base_estimator = self.base_estimator + + if base_estimator is not None: + self._estimator = clone(base_estimator) + else: + self._estimator = clone(default) + + try: + # scikit-learn < 1.2 + self.base_estimator_ = self._estimator + except AttributeError: + pass self.base_sampler_ = RandomUnderSampler( sampling_strategy=self._sampling_strategy, @@ -400,7 +424,7 @@ def _make_sampler_estimator(self, random_state=None): Warning: This method should be used to properly instantiate new sub-estimators. """ - estimator = clone(self.base_estimator_) + estimator = clone(self._estimator) estimator.set_params(**{p: getattr(self, p) for p in self.estimator_params}) sampler = clone(self.base_sampler_) @@ -691,10 +715,23 @@ def _compute_oob_predictions(self, X, y): return oob_pred + # TODO: remove when supporting scikit-learn>=1.4 + @property + def estimator_(self): + """Estimator used to grow the ensemble.""" + return self._estimator + + # TODO: remove when supporting scikit-learn>=1.2 @property def n_features_(self): - """Number of features when fitting the estimator.""" - return getattr(self.n_features_in_, "n_features_", self._n_features) + """Number of features when ``fit`` is performed.""" + warn( + "`n_features_` was deprecated in scikit-learn 1.0. This attribute will " + "not be accessible when the minimum supported version of scikit-learn " + "is 1.2.", + FutureWarning, + ) + return self.n_features_in_ def _more_tags(self): return { diff --git a/imblearn/ensemble/_weight_boosting.py b/imblearn/ensemble/_weight_boosting.py index 6376c3865..b7fdd2f65 100644 --- a/imblearn/ensemble/_weight_boosting.py +++ b/imblearn/ensemble/_weight_boosting.py @@ -1,3 +1,6 @@ +import inspect +import numbers +import warnings from copy import deepcopy import numpy as np @@ -5,7 +8,9 @@ from sklearn.base import clone from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble._base import _set_random_states +from sklearn.tree import DecisionTreeClassifier from sklearn.utils import _safe_indexing +from sklearn.utils.validation import has_fit_parameter from ..under_sampling.base import BaseUnderSampler from ..under_sampling import RandomUnderSampler @@ -31,12 +36,14 @@ class RUSBoostClassifier(AdaBoostClassifier): Parameters ---------- - base_estimator : estimator object, default=None + estimator : estimator object, default=None The base estimator from which the boosted ensemble is built. 
Support for sample weighting is required, as well as proper ``classes_`` and ``n_classes_`` attributes. If ``None``, then the base estimator is ``DecisionTreeClassifier(max_depth=1)``. + .. versionadded:: 0.10 + n_estimators : int, default=50 The maximum number of estimators at which boosting is terminated. In case of perfect fit, the learning procedure is stopped early. @@ -60,11 +67,32 @@ {random_state} + base_estimator : estimator object, default=None + The base estimator from which the boosted ensemble is built. + Support for sample weighting is required, as well as proper + ``classes_`` and ``n_classes_`` attributes. If ``None``, then + the base estimator is ``DecisionTreeClassifier(max_depth=1)``. + + .. deprecated:: 0.10 + `base_estimator` is deprecated in version 0.10 and will be removed + in 0.12. Use `estimator` instead. + Attributes ---------- + estimator_ : estimator + The base estimator from which the ensemble is grown. + + .. versionadded:: 0.10 + base_estimator_ : estimator The base estimator from which the ensemble is grown. + .. deprecated:: 1.2 + `base_estimator_` is deprecated in `scikit-learn` 1.2 and will be + removed in 1.4. Use `estimator_` instead. When the minimum version + of `scikit-learn` supported by `imbalanced-learn` will reach 1.4, + this attribute will be removed. + estimators_ : list of classifiers The collection of fitted sub-estimators. @@ -125,16 +153,16 @@ class RUSBoostClassifier(AdaBoostClassifier): ... n_informative=4, weights=[0.2, 0.3, 0.5], ... random_state=0) >>> clf = RUSBoostClassifier(random_state=0) - >>> clf.fit(X, y) # doctest: +ELLIPSIS + >>> clf.fit(X, y) # doctest: RUSBoostClassifier(...) - >>> clf.predict(X) # doctest: +ELLIPSIS + >>> clf.predict(X) # doctest: array([...]) """ @_deprecate_positional_args def __init__( self, - base_estimator=None, + estimator=None, *, n_estimators=50, learning_rate=1.0, @@ -142,9 +170,18 @@ def __init__( sampling_strategy="auto", replacement=False, random_state=None, + base_estimator="deprecated", ): + # TODO: remove when supporting scikit-learn>=1.2 + bagging_classifier_signature = inspect.signature(super().__init__) + estimator_params = {"base_estimator": base_estimator} + if "estimator" in bagging_classifier_signature.parameters: + estimator_params["estimator"] = estimator + else: + self.estimator = estimator + super().__init__( - base_estimator=base_estimator, + **estimator_params, n_estimators=n_estimators, learning_rate=learning_rate, algorithm=algorithm, @@ -181,9 +218,65 @@ def fit(self, X, y, sample_weight=None): return self def _validate_estimator(self): - """Check the estimator and the n_estimator attribute, set the - `base_estimator_` attribute.""" - super()._validate_estimator() + """Check the estimator and the n_estimator attribute. + + Sets the `estimator_` attributes. + """ + if not isinstance(self.n_estimators, numbers.Integral): + raise ValueError( + "n_estimators must be an integer, got {0}.".format( + type(self.n_estimators) + ) + ) + + if self.n_estimators <= 0: + raise ValueError( + "n_estimators must be greater than zero, got {0}.".format( + self.n_estimators + ) + ) + + if self.estimator is not None and ( + self.base_estimator not in [None, "deprecated"] + ): + raise ValueError( + "Both `estimator` and `base_estimator` were set. Only set `estimator`." 
+ ) + + default = DecisionTreeClassifier(max_depth=1) + if self.estimator is not None: + base_estimator = clone(self.estimator) + elif self.base_estimator not in [None, "deprecated"]: + warnings.warn( + "`base_estimator` was renamed to `estimator` in version 0.10 and " + "will be removed in 0.12.", + FutureWarning, + ) + base_estimator = clone(self.base_estimator) + else: + base_estimator = clone(default) + + self._estimator = base_estimator + try: + # scikit-learn < 1.2 + self.base_estimator_ = self._estimator + except AttributeError: + pass + + # SAMME-R requires predict_proba-enabled estimators + if self.algorithm == "SAMME.R": + if not hasattr(self._estimator, "predict_proba"): + raise TypeError( + "AdaBoostClassifier with algorithm='SAMME.R' requires " + "that the weak learner supports the calculation of class " + "probabilities with a predict_proba method.\n" + "Please change the base estimator or set " + "algorithm='SAMME' instead." + ) + if not has_fit_parameter(self._estimator, "sample_weight"): + raise ValueError( + f"{self._estimator.__class__.__name__} doesn't support sample_weight." + ) self.base_sampler_ = RandomUnderSampler( sampling_strategy=self.sampling_strategy, @@ -195,7 +288,7 @@ def _make_sampler_estimator(self, append=True, random_state=None): Warning: This method should be used to properly instantiate new sub-estimators. """ - estimator = clone(self.base_estimator_) + estimator = clone(self._estimator) estimator.set_params(**{p: getattr(self, p) for p in self.estimator_params}) sampler = clone(self.base_sampler_) @@ -323,3 +416,9 @@ def _boost_discrete(self, iboost, X, y, sample_weight, random_state): sample_weight *= np.exp(estimator_weight * incorrect * (sample_weight > 0)) return sample_weight, estimator_weight, estimator_error + + # TODO: remove when supporting scikit-learn>=1.4 + @property + def estimator_(self): + """Estimator used to grow the ensemble.""" + return self._estimator diff --git a/imblearn/ensemble/tests/test_bagging.py b/imblearn/ensemble/tests/test_bagging.py index f3eff340a..c398f1ded 100644 --- a/imblearn/ensemble/tests/test_bagging.py +++ b/imblearn/ensemble/tests/test_bagging.py @@ -8,18 +8,21 @@ import numpy as np import pytest +import sklearn from sklearn.datasets import load_iris, make_hastie_10_2, make_classification from sklearn.model_selection import ( GridSearchCV, ParameterGrid, train_test_split, ) +from sklearn.cluster import KMeans from sklearn.dummy import DummyClassifier from sklearn.linear_model import Perceptron, LogisticRegression from sklearn.tree import DecisionTreeClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC from sklearn.feature_selection import SelectKBest +from sklearn.utils.fixes import parse_version from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose @@ -31,11 +34,12 @@ from imblearn.pipeline import make_pipeline from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler +sklearn_version = parse_version(sklearn.__version__) iris = load_iris() @pytest.mark.parametrize( - "base_estimator", + "estimator", [ None, DummyClassifier(strategy="prior"), @@ -56,7 +60,7 @@ } ), ) -def test_balanced_bagging_classifier(base_estimator, params): +def test_balanced_bagging_classifier(estimator, params): # Check classification for various parameter settings. 
X, y = make_imbalance( iris.data, @@ -66,9 +70,9 @@ def test_balanced_bagging_classifier(base_estimator, params): ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) - BalancedBaggingClassifier( - base_estimator=base_estimator, random_state=0, **params - ).fit(X_train, y_train).predict(X_test) + BalancedBaggingClassifier(estimator=estimator, random_state=0, **params).fit( + X_train, y_train + ).predict(X_test) def test_bootstrap_samples(): @@ -81,12 +85,12 @@ def test_bootstrap_samples(): ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) - base_estimator = DecisionTreeClassifier().fit(X_train, y_train) + estimator = DecisionTreeClassifier().fit(X_train, y_train) # without bootstrap, all trees are perfect on the training set # disable the resampling by passing an empty dictionary. ensemble = BalancedBaggingClassifier( - base_estimator=DecisionTreeClassifier(), + estimator=DecisionTreeClassifier(), max_samples=1.0, bootstrap=False, n_estimators=10, @@ -94,17 +98,17 @@ def test_bootstrap_samples(): random_state=0, ).fit(X_train, y_train) - assert ensemble.score(X_train, y_train) == base_estimator.score(X_train, y_train) + assert ensemble.score(X_train, y_train) == estimator.score(X_train, y_train) # with bootstrap, trees are no longer perfect on the training set ensemble = BalancedBaggingClassifier( - base_estimator=DecisionTreeClassifier(), + estimator=DecisionTreeClassifier(), max_samples=1.0, bootstrap=True, random_state=0, ).fit(X_train, y_train) - assert ensemble.score(X_train, y_train) < base_estimator.score(X_train, y_train) + assert ensemble.score(X_train, y_train) < estimator.score(X_train, y_train) def test_bootstrap_features(): @@ -118,7 +122,7 @@ def test_bootstrap_features(): X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) ensemble = BalancedBaggingClassifier( - base_estimator=DecisionTreeClassifier(), + estimator=DecisionTreeClassifier(), max_features=1.0, bootstrap_features=False, random_state=0, @@ -128,7 +132,7 @@ def test_bootstrap_features(): assert np.unique(features).shape[0] == X.shape[1] ensemble = BalancedBaggingClassifier( - base_estimator=DecisionTreeClassifier(), + estimator=DecisionTreeClassifier(), max_features=1.0, bootstrap_features=True, random_state=0, @@ -153,7 +157,7 @@ def test_probability(): with np.errstate(divide="ignore", invalid="ignore"): # Normal case ensemble = BalancedBaggingClassifier( - base_estimator=DecisionTreeClassifier(), random_state=0 + estimator=DecisionTreeClassifier(), random_state=0 ).fit(X_train, y_train) assert_array_almost_equal( @@ -168,7 +172,7 @@ def test_probability(): # Degenerate case, where some classes are missing ensemble = BalancedBaggingClassifier( - base_estimator=LogisticRegression(solver="lbfgs", multi_class="auto"), + estimator=LogisticRegression(solver="lbfgs", multi_class="auto"), random_state=0, max_samples=5, ) @@ -196,9 +200,9 @@ def test_oob_score_classification(): ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) - for base_estimator in [DecisionTreeClassifier(), SVC(gamma="scale")]: + for estimator in [DecisionTreeClassifier(), SVC(gamma="scale")]: clf = BalancedBaggingClassifier( - base_estimator=base_estimator, + estimator=estimator, n_estimators=100, bootstrap=True, oob_score=True, @@ -212,7 +216,7 @@ def test_oob_score_classification(): # Test with few estimators with pytest.warns(UserWarning): BalancedBaggingClassifier( - base_estimator=base_estimator, + estimator=estimator, n_estimators=1, 
bootstrap=True, oob_score=True, @@ -231,7 +235,7 @@ def test_single_estimator(): X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf1 = BalancedBaggingClassifier( - base_estimator=KNeighborsClassifier(), + estimator=KNeighborsClassifier(), n_estimators=1, bootstrap=False, bootstrap_features=False, @@ -269,7 +273,7 @@ def test_balanced_bagging_classifier_error(params): iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50} ) base = DecisionTreeClassifier() - clf = BalancedBaggingClassifier(base_estimator=base, **params) + clf = BalancedBaggingClassifier(estimator=base, **params) with pytest.raises(ValueError): clf.fit(X, y) @@ -284,7 +288,7 @@ def test_gridsearch(): y[y == 2] = 1 # Grid search with scoring based on decision_function - parameters = {"n_estimators": (1, 2), "base_estimator__C": (1, 2)} + parameters = {"n_estimators": (1, 2), "estimator__C": (1, 2)} GridSearchCV( BalancedBaggingClassifier(SVC(gamma="scale")), @@ -294,8 +298,8 @@ def test_gridsearch(): ).fit(X, y) -def test_base_estimator(): - # Check base_estimator and its default values. +def test_estimator(): + # Check estimator and its default values. X, y = make_imbalance( iris.data, iris.target, @@ -308,19 +312,19 @@ def test_base_estimator(): X_train, y_train ) - assert isinstance(ensemble.base_estimator_.steps[-1][1], DecisionTreeClassifier) + assert isinstance(ensemble.estimator_.steps[-1][1], DecisionTreeClassifier) ensemble = BalancedBaggingClassifier( DecisionTreeClassifier(), n_jobs=3, random_state=0 ).fit(X_train, y_train) - assert isinstance(ensemble.base_estimator_.steps[-1][1], DecisionTreeClassifier) + assert isinstance(ensemble.estimator_.steps[-1][1], DecisionTreeClassifier) ensemble = BalancedBaggingClassifier( Perceptron(max_iter=1000, tol=1e-3), n_jobs=3, random_state=0 ).fit(X_train, y_train) - assert isinstance(ensemble.base_estimator_.steps[-1][1], Perceptron) + assert isinstance(ensemble.estimator_.steps[-1][1], Perceptron) def test_bagging_with_pipeline(): @@ -518,12 +522,16 @@ def fit(self, X, y, sample_weight=None): return super().fit(X, y, sample_weight=sample_weight) +@pytest.mark.filterwarnings("ignore:Number of distinct clusters") @pytest.mark.parametrize( "sampler, n_samples_bootstrap", [ (None, 15), (RandomUnderSampler(), 15), # under-sampling with sample_indices_ - (ClusterCentroids(), 15), # under-sampling without sample_indices_ + ( + ClusterCentroids(estimator=KMeans(n_init=1)), + 15, + ), # under-sampling without sample_indices_ (RandomOverSampler(), 40), # over-sampling with sample_indices_ (SMOTE(), 40), # over-sampling without sample_indices_ ], @@ -538,7 +546,7 @@ def test_balanced_bagging_classifier_samplers(sampler, n_samples_bootstrap): ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf = BalancedBaggingClassifier( - base_estimator=CountDecisionTreeClassifier(), + estimator=CountDecisionTreeClassifier(), n_estimators=2, sampler=sampler, random_state=0, @@ -593,7 +601,7 @@ def roughly_balanced_bagging(X, y, replace=False): # Roughly Balanced Bagging rbb = BalancedBaggingClassifier( - base_estimator=CountDecisionTreeClassifier(), + estimator=CountDecisionTreeClassifier(), n_estimators=2, sampler=FunctionSampler( func=roughly_balanced_bagging, kw_args={"replace": replace} @@ -604,3 +612,33 @@ def roughly_balanced_bagging(X, y, replace=False): for estimator in rbb.estimators_: class_counts = estimator[-1].class_counts_ assert (class_counts[0] / class_counts[1]) > 0.8 + + +def test_balanced_bagging_classifier_n_features(): 
+ """Check that we raise a FutureWarning when accessing `n_features_`.""" + X, y = load_iris(return_X_y=True) + estimator = BalancedBaggingClassifier().fit(X, y) + with pytest.warns(FutureWarning, match="`n_features_` was deprecated"): + estimator.n_features_ + + +@pytest.mark.skipif( + sklearn_version < parse_version("1.2"), reason="requires scikit-learn>=1.2" +) +def test_balanced_bagging_classifier_base_estimator(): + """Check that we raise a FutureWarning when accessing `base_estimator_`.""" + X, y = load_iris(return_X_y=True) + estimator = BalancedBaggingClassifier().fit(X, y) + with pytest.warns(FutureWarning, match="`base_estimator_` was deprecated"): + estimator.base_estimator_ + + +def test_balanced_bagging_classifier_set_both_estimator_and_base_estimator(): + """Check that we raise a ValueError when setting both `estimator` and + `base_estimator`.""" + X, y = load_iris(return_X_y=True) + err_msg = "Both `estimator` and `base_estimator` were set. Only set `estimator`." + with pytest.raises(ValueError, match=err_msg): + BalancedBaggingClassifier( + estimator=KNeighborsClassifier(), base_estimator=KNeighborsClassifier() + ).fit(X, y) diff --git a/imblearn/ensemble/tests/test_easy_ensemble.py b/imblearn/ensemble/tests/test_easy_ensemble.py index fb118a92f..e5d8f6d93 100644 --- a/imblearn/ensemble/tests/test_easy_ensemble.py +++ b/imblearn/ensemble/tests/test_easy_ensemble.py @@ -6,11 +6,13 @@ import pytest import numpy as np +import sklearn from sklearn.datasets import load_iris, make_hastie_10_2 from sklearn.ensemble import AdaBoostClassifier from sklearn.model_selection import train_test_split from sklearn.model_selection import GridSearchCV from sklearn.feature_selection import SelectKBest +from sklearn.utils.fixes import parse_version from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_equal @@ -19,6 +21,7 @@ from imblearn.under_sampling import RandomUnderSampler from imblearn.pipeline import make_pipeline +sklearn_version = parse_version(sklearn.__version__) iris = load_iris() # Generate a global dataset to use @@ -42,10 +45,10 @@ @pytest.mark.parametrize("n_estimators", [10, 20]) @pytest.mark.parametrize( - "base_estimator", + "estimator", [AdaBoostClassifier(n_estimators=5), AdaBoostClassifier(n_estimators=10)], ) -def test_easy_ensemble_classifier(n_estimators, base_estimator): +def test_easy_ensemble_classifier(n_estimators, estimator): # Check classification for various parameter settings. X, y = make_imbalance( iris.data, @@ -57,14 +60,14 @@ def test_easy_ensemble_classifier(n_estimators, base_estimator): eec = EasyEnsembleClassifier( n_estimators=n_estimators, - base_estimator=base_estimator, + estimator=estimator, n_jobs=-1, random_state=RND_SEED, ) eec.fit(X_train, y_train).score(X_test, y_test) assert len(eec.estimators_) == n_estimators for est in eec.estimators_: - assert len(est.named_steps["classifier"]) == base_estimator.n_estimators + assert len(est.named_steps["classifier"]) == estimator.n_estimators # test the different prediction function eec.predict(X_test) eec.predict_proba(X_test) @@ -72,8 +75,8 @@ def test_easy_ensemble_classifier(n_estimators, base_estimator): eec.decision_function(X_test) -def test_base_estimator(): - # Check base_estimator and its default values. +def test_estimator(): + # Check estimator and its default values. 
X, y = make_imbalance( iris.data, iris.target, @@ -86,13 +89,13 @@ def test_base_estimator(): X_train, y_train ) - assert isinstance(ensemble.base_estimator_.steps[-1][1], AdaBoostClassifier) + assert isinstance(ensemble.estimator_.steps[-1][1], AdaBoostClassifier) ensemble = EasyEnsembleClassifier( 2, AdaBoostClassifier(), n_jobs=-1, random_state=0 ).fit(X_train, y_train) - assert isinstance(ensemble.base_estimator_.steps[-1][1], AdaBoostClassifier) + assert isinstance(ensemble.estimator_.steps[-1][1], AdaBoostClassifier) def test_bagging_with_pipeline(): @@ -104,7 +107,7 @@ def test_bagging_with_pipeline(): ) estimator = EasyEnsembleClassifier( n_estimators=2, - base_estimator=make_pipeline(SelectKBest(k=1), AdaBoostClassifier()), + estimator=make_pipeline(SelectKBest(k=1), AdaBoostClassifier()), ) estimator.fit(X, y).predict(X) @@ -184,21 +187,15 @@ def test_warm_start_equivalence(): assert_allclose(y1, y2) -@pytest.mark.parametrize( - "n_estimators, msg_error", - [ - (1.0, "n_estimators must be an integer"), - (-10, "n_estimators must be greater than zero"), - ], -) -def test_easy_ensemble_classifier_error(n_estimators, msg_error): +@pytest.mark.parametrize("n_estimators", [1.0, -10]) +def test_easy_ensemble_classifier_error(n_estimators): X, y = make_imbalance( iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50}, random_state=0, ) - with pytest.raises(ValueError, match=msg_error): + with pytest.raises(ValueError): eec = EasyEnsembleClassifier(n_estimators=n_estimators) eec.fit(X, y) @@ -230,11 +227,41 @@ def test_easy_ensemble_classifier_grid_search(): parameters = { "n_estimators": [1, 2], - "base_estimator__n_estimators": [3, 4], + "estimator__n_estimators": [3, 4], } grid_search = GridSearchCV( - EasyEnsembleClassifier(base_estimator=AdaBoostClassifier()), + EasyEnsembleClassifier(estimator=AdaBoostClassifier()), parameters, cv=5, ) grid_search.fit(X, y) + + +def test_easy_ensemble_classifier_n_features(): + """Check that we raise a FutureWarning when accessing `n_features_`.""" + X, y = load_iris(return_X_y=True) + estimator = EasyEnsembleClassifier().fit(X, y) + with pytest.warns(FutureWarning, match="`n_features_` was deprecated"): + estimator.n_features_ + + +@pytest.mark.skipif( + sklearn_version < parse_version("1.2"), reason="warns for scikit-learn>=1.2" +) +def test_easy_ensemble_classifier_base_estimator(): + """Check that we raise a FutureWarning when accessing `base_estimator_`.""" + X, y = load_iris(return_X_y=True) + estimator = EasyEnsembleClassifier().fit(X, y) + with pytest.warns(FutureWarning, match="`base_estimator_` was deprecated"): + estimator.base_estimator_ + + +def test_easy_ensemble_classifier_set_both_estimator_and_base_estimator(): + """Check that we raise a ValueError when setting both `estimator` and + `base_estimator`.""" + X, y = load_iris(return_X_y=True) + err_msg = "Both `estimator` and `base_estimator` were set. Only set `estimator`." 
+ with pytest.raises(ValueError, match=err_msg): + EasyEnsembleClassifier( + estimator=AdaBoostClassifier(), base_estimator=AdaBoostClassifier() + ).fit(X, y) diff --git a/imblearn/ensemble/tests/test_forest.py b/imblearn/ensemble/tests/test_forest.py index ad1420631..5e35eb43f 100644 --- a/imblearn/ensemble/tests/test_forest.py +++ b/imblearn/ensemble/tests/test_forest.py @@ -2,14 +2,18 @@ import numpy as np -from sklearn.datasets import make_classification +import sklearn +from sklearn.datasets import make_classification, load_iris from sklearn.model_selection import GridSearchCV from sklearn.model_selection import train_test_split +from sklearn.utils.fixes import parse_version from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_equal from imblearn.ensemble import BalancedRandomForestClassifier +sklearn_version = parse_version(sklearn.__version__) + @pytest.fixture def imbalanced_dataset(): @@ -188,3 +192,22 @@ def test_balanced_random_forest_oob_binomial(ratio): erf = BalancedRandomForestClassifier(oob_score=True, random_state=42) erf.fit(X, y) assert np.abs(erf.oob_score_ - 0.5) < 0.1 + + +def test_balanced_bagging_classifier_n_features(): + """Check that we raise a FutureWarning when accessing `n_features_`.""" + X, y = load_iris(return_X_y=True) + estimator = BalancedRandomForestClassifier().fit(X, y) + with pytest.warns(FutureWarning, match="`n_features_` was deprecated"): + estimator.n_features_ + + +@pytest.mark.skipif( + sklearn_version < parse_version("1.2"), reason="requires scikit-learn>=1.2" +) +def test_balanced_bagging_classifier_base_estimator(): + """Check that we raise a FutureWarning when accessing `base_estimator_`.""" + X, y = load_iris(return_X_y=True) + estimator = BalancedRandomForestClassifier().fit(X, y) + with pytest.warns(FutureWarning, match="`base_estimator_` was deprecated"): + estimator.base_estimator_ diff --git a/imblearn/ensemble/tests/test_weight_boosting.py b/imblearn/ensemble/tests/test_weight_boosting.py index e1394a2b5..c69e39500 100644 --- a/imblearn/ensemble/tests/test_weight_boosting.py +++ b/imblearn/ensemble/tests/test_weight_boosting.py @@ -2,12 +2,17 @@ import numpy as np -from sklearn.datasets import make_classification +import sklearn +from sklearn.datasets import make_classification, load_iris from sklearn.model_selection import train_test_split +from sklearn.tree import DecisionTreeClassifier +from sklearn.utils.fixes import parse_version from sklearn.utils._testing import assert_array_equal from imblearn.ensemble import RUSBoostClassifier +sklearn_version = parse_version(sklearn.__version__) + @pytest.fixture def imbalanced_dataset(): @@ -26,19 +31,11 @@ def imbalanced_dataset(): @pytest.mark.parametrize( - "boosting_params, err_type, err_msg", - [ - ( - {"n_estimators": "whatever"}, - TypeError, - "n_estimators must be an instance of int, not str.", - ), - ({"n_estimators": -100}, ValueError, "n_estimators == -100, must be >= 1."), - ], + "boosting_params", [{"n_estimators": "whatever"}, {"n_estimators": -100}] ) -def test_rusboost_error(imbalanced_dataset, boosting_params, err_type, err_msg): +def test_rusboost_error(imbalanced_dataset, boosting_params): rusboost = RUSBoostClassifier(**boosting_params) - with pytest.raises(err_type, match=err_msg): + with pytest.raises((ValueError, TypeError)): rusboost.fit(*imbalanced_dataset) @@ -105,3 +102,25 @@ def test_rusboost_sample_weight(imbalanced_dataset, algorithm): with pytest.raises(AssertionError): assert_array_equal(y_pred_no_sample_weight, 
y_pred_sample_weight) + + +@pytest.mark.skipif( + sklearn_version < parse_version("1.2"), reason="requires scikit-learn>=1.2" +) +def test_rus_boost_classifier_base_estimator(): + """Check that we raise a FutureWarning when accessing `base_estimator_`.""" + X, y = load_iris(return_X_y=True) + estimator = RUSBoostClassifier().fit(X, y) + with pytest.warns(FutureWarning, match="`base_estimator_` was deprecated"): + estimator.base_estimator_ + + +def test_rus_boost_classifier_set_both_estimator_and_base_estimator(): + """Check that we raise a ValueError when setting both `estimator` and + `base_estimator`.""" + X, y = load_iris(return_X_y=True) + err_msg = "Both `estimator` and `base_estimator` were set. Only set `estimator`." + with pytest.raises(ValueError, match=err_msg): + RUSBoostClassifier( + estimator=DecisionTreeClassifier(), base_estimator=DecisionTreeClassifier() + ).fit(X, y) diff --git a/imblearn/metrics/_classification.py b/imblearn/metrics/_classification.py index 0f6141f8f..49a59fadc 100644 --- a/imblearn/metrics/_classification.py +++ b/imblearn/metrics/_classification.py @@ -856,9 +856,8 @@ def classification_report_imbalanced( >>> import numpy as np >>> from imblearn.metrics import classification_report_imbalanced >>> y_true = [0, 1, 2, 2, 2] - >>> y_pred = [0, 0, 2, 2, 1] # doctest : +NORMALIZE_WHITESPACE - >>> target_names = ['class 0', 'class 1', \ - 'class 2'] # doctest : +NORMALIZE_WHITESPACE + >>> y_pred = [0, 0, 2, 2, 1] + >>> target_names = ['class 0', 'class 1', 'class 2'] >>> print(classification_report_imbalanced(y_true, y_pred, \ target_names=target_names)) pre rec spe f1 geo iba\ diff --git a/imblearn/over_sampling/_adasyn.py b/imblearn/over_sampling/_adasyn.py index 23939ef25..a0f4383ed 100644 --- a/imblearn/over_sampling/_adasyn.py +++ b/imblearn/over_sampling/_adasyn.py @@ -107,7 +107,7 @@ class ADASYN(BaseOverSampler): >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.over_sampling import \ -ADASYN # doctest: +NORMALIZE_WHITESPACE +ADASYN # doctest: >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index 8df574c20..6323e04da 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -121,7 +121,7 @@ class RandomOverSampler(BaseOverSampler): >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.over_sampling import \ -RandomOverSampler # doctest: +NORMALIZE_WHITESPACE +RandomOverSampler # doctest: >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py index 6249dc876..29dcabf6e 100644 --- a/imblearn/over_sampling/_smote/base.py +++ b/imblearn/over_sampling/_smote/base.py @@ -294,7 +294,7 @@ class SMOTE(BaseSMOTE): >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.over_sampling import \ -SMOTE # doctest: +NORMALIZE_WHITESPACE +SMOTE # doctest: >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... 
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) diff --git a/imblearn/over_sampling/_smote/filter.py b/imblearn/over_sampling/_smote/filter.py index df01e9b2b..b3ec3d46c 100644 --- a/imblearn/over_sampling/_smote/filter.py +++ b/imblearn/over_sampling/_smote/filter.py @@ -139,7 +139,7 @@ class BorderlineSMOTE(BaseSMOTE): >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.over_sampling import \ -BorderlineSMOTE # doctest: +NORMALIZE_WHITESPACE +BorderlineSMOTE # doctest: >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) @@ -393,7 +393,7 @@ class SVMSMOTE(BaseSMOTE): >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.over_sampling import \ -SVMSMOTE # doctest: +NORMALIZE_WHITESPACE +SVMSMOTE # doctest: >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) diff --git a/imblearn/over_sampling/base.py b/imblearn/over_sampling/base.py index abeecc729..7165ab9c9 100644 --- a/imblearn/over_sampling/base.py +++ b/imblearn/over_sampling/base.py @@ -52,4 +52,4 @@ class BaseOverSampler(BaseSampler): - When callable, function taking ``y`` and returns a ``dict``. The keys correspond to the targeted classes. The values correspond to the desired number of samples for each class. - """.strip() + """.strip() # noqa: E501 diff --git a/imblearn/pipeline.py b/imblearn/pipeline.py index b112a0a20..320f086d1 100644 --- a/imblearn/pipeline.py +++ b/imblearn/pipeline.py @@ -90,7 +90,7 @@ class Pipeline(pipeline.Pipeline): >>> from sklearn.neighbors import KNeighborsClassifier as KNN >>> from sklearn.metrics import classification_report >>> from imblearn.over_sampling import SMOTE - >>> from imblearn.pipeline import Pipeline # doctest: +NORMALIZE_WHITESPACE + >>> from imblearn.pipeline import Pipeline # doctest: >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) @@ -101,7 +101,7 @@ class Pipeline(pipeline.Pipeline): >>> knn = KNN() >>> pipeline = Pipeline([('smt', smt), ('pca', pca), ('knn', knn)]) >>> X_train, X_test, y_train, y_test = tts(X, y, random_state=42) - >>> pipeline.fit(X_train, y_train) # doctest: +ELLIPSIS + >>> pipeline.fit(X_train, y_train) # doctest: Pipeline(...) >>> y_hat = pipeline.predict(X_test) >>> print(classification_report(y_test, y_hat)) @@ -437,7 +437,7 @@ def make_pipeline(*steps, memory=None, verbose=False): >>> from sklearn.naive_bayes import GaussianNB >>> from sklearn.preprocessing import StandardScaler >>> make_pipeline(StandardScaler(), GaussianNB(priors=None)) - ... # doctest: +NORMALIZE_WHITESPACE + ... 
# doctest: Pipeline(steps=[('standardscaler', StandardScaler()), ('gaussiannb', GaussianNB())]) """ diff --git a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py index 556f0c980..8295adf84 100644 --- a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py @@ -95,7 +95,7 @@ class ClusterCentroids(BaseUnderSampler): >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.under_sampling import \ -ClusterCentroids # doctest: +NORMALIZE_WHITESPACE +ClusterCentroids # doctest: >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) @@ -104,7 +104,7 @@ class ClusterCentroids(BaseUnderSampler): >>> cc = ClusterCentroids(random_state=42) >>> X_res, y_res = cc.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) - ... # doctest: +ELLIPSIS + ... # doctest: Resampled dataset shape Counter({{...}}) """ diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py index d06a867be..be12d0bf0 100644 --- a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py @@ -104,7 +104,7 @@ class EditedNearestNeighbours(BaseCleaningSampler): >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.under_sampling import \ -EditedNearestNeighbours # doctest: +NORMALIZE_WHITESPACE +EditedNearestNeighbours # doctest: >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) @@ -272,8 +272,7 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler): -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification - >>> from imblearn.under_sampling import \ -RepeatedEditedNearestNeighbours # doctest : +NORMALIZE_WHITESPACE + >>> from imblearn.under_sampling import RepeatedEditedNearestNeighbours >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) @@ -473,7 +472,7 @@ class without early stopping. >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.under_sampling import \ -AllKNN # doctest: +NORMALIZE_WHITESPACE +AllKNN # doctest: >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... 
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index 65aceadb1..32e2d754c 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -101,7 +101,7 @@ class InstanceHardnessThreshold(BaseUnderSampler): Original dataset shape Counter({{1: 900, 0: 100}}) >>> iht = InstanceHardnessThreshold(random_state=42) >>> X_res, y_res = iht.fit_resample(X, y) - >>> print('Resampled dataset shape %s' % Counter(y_res)) # doctest: +ELLIPSIS + >>> print('Resampled dataset shape %s' % Counter(y_res)) # doctest: Resampled dataset shape Counter({{1: 5..., 0: 100}}) """ diff --git a/imblearn/under_sampling/_prototype_selection/_nearmiss.py b/imblearn/under_sampling/_prototype_selection/_nearmiss.py index 5246e54f0..8913b4c4f 100644 --- a/imblearn/under_sampling/_prototype_selection/_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/_nearmiss.py @@ -96,7 +96,7 @@ class NearMiss(BaseUnderSampler): >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.under_sampling import \ -NearMiss # doctest: +NORMALIZE_WHITESPACE +NearMiss # doctest: >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) diff --git a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py index 3cb3d5320..f73e1397e 100644 --- a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py @@ -105,7 +105,7 @@ class NeighbourhoodCleaningRule(BaseCleaningSampler): >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.under_sampling import \ -NeighbourhoodCleaningRule # doctest: +NORMALIZE_WHITESPACE +NeighbourhoodCleaningRule # doctest: >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) diff --git a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py index abe9484d9..bd7863791 100644 --- a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py +++ b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py @@ -92,7 +92,7 @@ class OneSidedSelection(BaseCleaningSampler): >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.under_sampling import \ - OneSidedSelection # doctest: +NORMALIZE_WHITESPACE + OneSidedSelection # doctest: >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... 
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index fe2b9b7a2..4447efbfb 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -69,7 +69,7 @@ class RandomUnderSampler(BaseUnderSampler): >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.under_sampling import \ -RandomUnderSampler # doctest: +NORMALIZE_WHITESPACE +RandomUnderSampler # doctest: >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) diff --git a/imblearn/under_sampling/_prototype_selection/_tomek_links.py b/imblearn/under_sampling/_prototype_selection/_tomek_links.py index 7ee8d31d9..3e7907939 100644 --- a/imblearn/under_sampling/_prototype_selection/_tomek_links.py +++ b/imblearn/under_sampling/_prototype_selection/_tomek_links.py @@ -72,7 +72,7 @@ class TomekLinks(BaseCleaningSampler): >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.under_sampling import \ -TomekLinks # doctest: +NORMALIZE_WHITESPACE +TomekLinks # doctest: >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) diff --git a/imblearn/under_sampling/base.py b/imblearn/under_sampling/base.py index fc1e09314..82db6bdd0 100644 --- a/imblearn/under_sampling/base.py +++ b/imblearn/under_sampling/base.py @@ -52,7 +52,7 @@ class BaseUnderSampler(BaseSampler): - When callable, function taking ``y`` and returns a ``dict``. The keys correspond to the targeted classes. The values correspond to the desired number of samples for each class. - """.rstrip() + """.rstrip() # noqa: E501 class BaseCleaningSampler(BaseSampler): diff --git a/setup.cfg b/setup.cfg index 8b15065af..05df1039a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,13 +2,13 @@ current_version = 0.10.0.dev0 tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? -serialize = +serialize = {major}.{minor}.{patch}.{release}{dev} {major}.{minor}.{patch} [bumpversion:part:release] optional_value = gamma -values = +values = dev gamma @@ -22,20 +22,55 @@ test = pytest [tool:pytest] doctest_optionflags = NORMALIZE_WHITESPACE ELLIPSIS testpaths = imblearn -addopts = +addopts = --doctest-modules --color=yes -rs -filterwarnings = +filterwarnings = ignore:the matrix subclass:PendingDeprecationWarning [flake8] +# max line length for black max-line-length = 88 -ignore = E121,E123,E126,E226,E24,E704,W503,W504,E203 -per-file-ignores = - examples/*: E402 +target-version = ['py37'] +# Default flake8 3.5 ignored flags +ignore= + # check ignored by default in flake8. Meaning unclear. 
+ E24, + # continuation line under-indented + E121, + # closing bracket does not match indentation + E123, + # continuation line over-indented for hanging indent + E126, + # space before : (needed for how black formats slicing) + E203, + # missing whitespace around arithmetic operator + E226, + # multiple statements on one line (def) + E704, + # do not assign a lambda expression, use a def + E731, + # do not use variables named 'l', 'O', or 'I' + E741, + # line break before binary operator + W503, + # line break after binary operator + W504 +exclude= + .git, + __pycache__, + dist, + doc/_build, + doc/auto_examples, + build, + +# It's fine not to put the import at the top of the file in the examples +# folder. +per-file-ignores = + examples/*: E402 + doc/conf.py: E402 [mypy] ignore_missing_imports = True allow_redefinition = True -
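Since ``[tool:pytest]`` keeps ``doctest_optionflags = NORMALIZE_WHITESPACE ELLIPSIS``, both flags apply to every doctest collected by ``pytest --doctest-modules``, which is why the inline ``# doctest: +ELLIPSIS`` and ``# doctest: +NORMALIZE_WHITESPACE`` directives can be dropped from the docstring examples touched above. A minimal sketch, with an illustrative estimator and dataset rather than any particular docstring, of an example that leans on the global flags::

    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.under_sampling import RandomUnderSampler
    >>> X, y = make_classification(n_classes=2, weights=[0.1, 0.9],
    ...                            n_samples=1000, random_state=10)
    >>> X_res, y_res = RandomUnderSampler(random_state=0).fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    Resampled dataset shape Counter({...})

The abbreviated ``Counter({...})`` output only matches because ELLIPSIS is enabled globally; no per-example directive is needed.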
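The strings now ending in ``# noqa: E501`` in ``imblearn/over_sampling/base.py`` and ``imblearn/under_sampling/base.py`` are the shared ``sampling_strategy`` parameter descriptions that get interpolated into each sampler's docstring through ``str.format``. A rough sketch of that pattern, where the ``substitution`` decorator is a stand-in rather than imblearn's actual helper::

    _sampling_strategy_docstring = """sampling_strategy : float, str, dict or callable, default='auto'
        Sampling information to resample the data set.
        """.rstrip()  # noqa: E501

    def substitution(**params):
        """Fill ``{placeholders}`` in a class docstring with shared text."""
        def decorate(cls):
            cls.__doc__ = cls.__doc__.format(**params)
            return cls
        return decorate

    @substitution(sampling_strategy=_sampling_strategy_docstring)
    class DemoSampler:
        """Demo sampler.

        Parameters
        ----------
        {sampling_strategy}
        """

Because the docstrings go through ``str.format``, literal braces in their doctest outputs have to be written doubled, which is what expected values such as ``Counter({{1: 900, 0: 100}})`` in the hunks above are doing.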
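The ``[bumpversion]`` ``parse`` pattern has to expose a named group for every field that the ``serialize`` templates reference (``{major}``, ``{minor}``, ``{patch}``, ``{release}``, ``{dev}``). A quick sanity check, assuming the conventional bump2version-style pattern for an ``X.Y.Z.devN`` scheme::

    >>> import re
    >>> pattern = re.compile(
    ...     r"(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)"
    ...     r"(\.(?P<release>[a-z]+)(?P<dev>\d+))?"
    ... )
    >>> pattern.match("0.10.0.dev0").groupdict()
    {'major': '0', 'minor': '10', 'patch': '0', 'release': 'dev', 'dev': '0'}
    >>> pattern.match("0.10.0").groupdict()
    {'major': '0', 'minor': '10', 'patch': '0', 'release': None, 'dev': None}

Both ``serialize`` forms (``{major}.{minor}.{patch}.{release}{dev}`` and ``{major}.{minor}.{patch}``) round-trip through these groups.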
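The ``per-file-ignores`` entries waive E402 (module level import not at top of file) for ``examples/*`` and ``doc/conf.py``, where imports legitimately follow other module-level code. A small sketch of the sphinx-gallery style layout that would otherwise trip the check; the file name and plot are illustrative only::

    # examples/plot_illustrative.py
    """
    =====================
    An illustrative title
    =====================

    Gallery examples start with a module docstring and commonly print it.
    """
    print(__doc__)

    # %%
    # Imports introduced in later cells sit below module-level code,
    # which flake8 reports as E402 without the per-file ignore.
    import matplotlib.pyplot as plt

    plt.plot([0, 1], [0, 1])
    plt.show()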