MAINT: cleanup deprecation warning in tests and source code #466

Merged: 12 commits, Sep 5, 2018
2 changes: 1 addition & 1 deletion .travis.yml
@@ -35,7 +35,7 @@ matrix:
     - env: DISTRIB="conda" PYTHON_VERSION="2.7"
            NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" SKLEARN_VERSION="0.20rc"
     - env: DISTRIB="conda" PYTHON_VERSION="3.6"
-           NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" SKLEARN_VERSION="0.20rc"
+           NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="0.20rc"
     - env: DISTRIB="conda" PYTHON_VERSION="3.7"
            NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" SKLEARN_VERSION="0.20rc"
     - env: DISTRIB="conda" PYTHON_VERSION="3.7"
2 changes: 1 addition & 1 deletion appveyor.yml
@@ -42,7 +42,7 @@ install:
   - activate testenv
   - conda install scipy numpy -y -q
   - pip install --pre scikit-learn
-  - "conda install %OPTIONAL_DEP% -y -q"
+  - conda install %OPTIONAL_DEP% -y -q
   - conda install pytest pytest-cov -y -q
  - pip install codecov
   - pip install .
5 changes: 3 additions & 2 deletions build_tools/circle/build_doc.sh
@@ -88,11 +88,12 @@ conda update --yes --quiet conda

 # Configure the conda environment and put it in the path using the
 # provided versions
-conda create -n $CONDA_ENV_NAME --yes --quiet python=3
+conda create -n $CONDA_ENV_NAME --yes --quiet python=3.6
 source activate $CONDA_ENV_NAME

-conda install --yes pip numpy scipy scikit-learn pillow matplotlib sphinx \
+conda install --yes pip numpy scipy pillow matplotlib sphinx \
     sphinx_rtd_theme numpydoc pandas keras
+pip install --pre scikit-learn
 pip install -U git+https://github.com/sphinx-gallery/sphinx-gallery.git

 # Build and install imbalanced-learn in dev mode
6 changes: 3 additions & 3 deletions build_tools/travis/install.sh
@@ -40,9 +40,9 @@ if [[ "$DISTRIB" == "conda" ]]; then
     source activate testenv
     conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION

-    if [[ $PYTHON_VERSION == "3.7" ]]; then
-        conda install --yes pandas
-        conda install --yes -c conda-forge keras
+    if [[ $PYTHON_VERSION == "3.6" ]]; then
+        # Tensorflow is not available in Python 3.7 yet.
+        conda install --yes pandas keras tensorflow
         KERAS_BACKEND=tensorflow
         python -c "import keras.backend"
         sed -i -e 's/"backend":[[:space:]]*"[^"]*/"backend":\ "'$KERAS_BACKEND'/g' ~/.keras/keras.json;
12 changes: 7 additions & 5 deletions doc/ensemble.rst
@@ -32,11 +32,11 @@ under-sampling the original set::
     >>> print(sorted(Counter(y).items()))
     [(0, 64), (1, 262), (2, 4674)]
     >>> from imblearn.ensemble import EasyEnsemble
-    >>> ee = EasyEnsemble(random_state=0, n_subsets=10)
-    >>> X_resampled, y_resampled = ee.fit_resample(X, y)
-    >>> print(X_resampled.shape)
+    >>> ee = EasyEnsemble(random_state=0, n_subsets=10)  # doctest: +SKIP
+    >>> X_resampled, y_resampled = ee.fit_resample(X, y)  # doctest: +SKIP
+    >>> print(X_resampled.shape)  # doctest: +SKIP
     (10, 192, 2)
-    >>> print(sorted(Counter(y_resampled[0]).items()))
+    >>> print(sorted(Counter(y_resampled[0]).items()))  # doctest: +SKIP
     [(0, 64), (1, 64), (2, 64)]

 :class:`EasyEnsemble` has two important parameters: (i) ``n_subsets`` will be
@@ -53,7 +53,9 @@ parameter ``n_max_subset`` and an additional bootstraping can be activated with
     >>> from imblearn.ensemble import BalanceCascade
     >>> from sklearn.linear_model import LogisticRegression
     >>> bc = BalanceCascade(random_state=0,
-    ...                     estimator=LogisticRegression(random_state=0),
+    ...                     estimator=LogisticRegression(solver='lbfgs',
+    ...                                                  multi_class='auto',
+    ...                                                  random_state=0),
     ...                     n_max_subset=4)
     >>> X_resampled, y_resampled = bc.fit_resample(X, y)
     >>> print(X_resampled.shape)
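
Note on the recurring `LogisticRegression(solver='lbfgs', multi_class='auto')` edits in this PR: under scikit-learn 0.20 the bare constructor emits FutureWarnings because the `solver` and `multi_class` defaults change in 0.22, which breaks doctest output. A minimal sketch of the difference (assumes scikit-learn 0.20; the dataset is illustrative):

```python
# Sketch: why solver/multi_class are now spelled out (scikit-learn 0.20).
import warnings
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_classes=3, n_informative=4, random_state=0)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    LogisticRegression().fit(X, y)   # relies on defaults -> FutureWarnings
print([w.category.__name__ for w in caught])

# Explicit future defaults: no warnings, same behavior on 0.22 and later.
LogisticRegression(solver='lbfgs', multi_class='auto').fit(X, y)
```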
5 changes: 3 additions & 2 deletions doc/under_sampling.rst
@@ -340,7 +340,7 @@ used as::
     >>> oss = OneSidedSelection(random_state=0)
     >>> X_resampled, y_resampled = oss.fit_resample(X, y)
     >>> print(sorted(Counter(y_resampled).items()))
-    [(0, 64), (1, 174), (2, 4403)]
+    [(0, 64), (1, 174), (2, 4404)]

 Our implementation offer to set the number of seeds to put in the set :math:`C`
 originally by setting the parameter ``n_seeds_S``.
@@ -379,7 +379,8 @@ removed. The class can be used as::
     >>> from sklearn.linear_model import LogisticRegression
     >>> from imblearn.under_sampling import InstanceHardnessThreshold
     >>> iht = InstanceHardnessThreshold(random_state=0,
-    ...                                 estimator=LogisticRegression())
+    ...                                 estimator=LogisticRegression(
+    ...                                     solver='lbfgs', multi_class='auto'))
     >>> X_resampled, y_resampled = iht.fit_resample(X, y)
     >>> print(sorted(Counter(y_resampled).items()))
     [(0, 64), (1, 64), (2, 64)]
3 changes: 3 additions & 0 deletions doc/whats_new/v0.0.4.rst
@@ -97,6 +97,9 @@ Maintenance
 - Upgrade requirements to scikit-learn 0.20.
   :issue:`379` by :user:`Guillaume Lemaitre <glemaitre>`.

+- Catch deprecation warning in testing.
+  :issue:`441` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 Documentation
 .............

10 changes: 7 additions & 3 deletions examples/plot_outlier_rejections.py
@@ -37,6 +37,7 @@ def plot_scatter(X, y, title):
     plt.legend()
     plt.title(title)

+
 ##############################################################################
 # Toy data generation
 ##############################################################################
@@ -82,11 +83,13 @@ def plot_scatter(X, y, title):
 # :class:`imblearn.FunctionSampler` will be called when using the method
 # ``fit_resample``.

+
 def outlier_rejection(X, y):
     """This will be our function used to resample our dataset."""
     model = IsolationForest(max_samples=100,
                             contamination=0.4,
-                            random_state=rng)
+                            random_state=rng,
+                            behaviour='new')
     model.fit(X)
     y_pred = model.predict(X)
     return X[y_pred == 1], y[y_pred == 1]
@@ -105,11 +108,12 @@ def outlier_rejection(X, y):
 # affected during the prediction.

 pipe = make_pipeline(FunctionSampler(func=outlier_rejection),
-                     LogisticRegression(random_state=rng))
+                     LogisticRegression(solver='lbfgs', multi_class='auto',
+                                        random_state=rng))
 y_pred = pipe.fit(X_train, y_train).predict(X_test)
 print(classification_report(y_test, y_pred))

-clf = LogisticRegression(random_state=rng)
+clf = LogisticRegression(solver='lbfgs', multi_class='auto', random_state=rng)
 y_pred = clf.fit(X_train, y_train).predict(X_test)
 print(classification_report(y_test, y_pred))

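
Context for `behaviour='new'` (assumes scikit-learn 0.20): `IsolationForest` warns that the default `behaviour='old'` semantics of `decision_function` change in 0.22, so the example opts into the new behavior explicitly. A minimal sketch:

```python
# Sketch (scikit-learn 0.20): behaviour='new' opts into the future
# decision_function semantics and avoids the deprecation warning.
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)
X = rng.randn(200, 2)
model = IsolationForest(max_samples=100, contamination=0.4,
                        random_state=rng, behaviour='new')
model.fit(X)
inlier_mask = model.predict(X) == 1  # +1 inliers, -1 outliers
print(inlier_mask.sum(), "inliers kept")
```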
5 changes: 3 additions & 2 deletions examples/under-sampling/plot_comparison_under_sampling.py
@@ -235,8 +235,9 @@ def plot_decision_function(X, y, clf, ax):
 clf = LinearSVC().fit(X, y)
 plot_decision_function(X, y, clf, ax1)
 ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
-sampler = InstanceHardnessThreshold(random_state=0,
-                                    estimator=LogisticRegression())
+sampler = InstanceHardnessThreshold(
+    random_state=0, estimator=LogisticRegression(solver='lbfgs',
+                                                 multi_class='auto'))
 clf = make_pipeline(sampler, LinearSVC())
 clf.fit(X, y)
 plot_decision_function(X, y, clf, ax2)
4 changes: 3 additions & 1 deletion examples/under-sampling/plot_instance_hardness_threshold.py
@@ -60,7 +60,9 @@ def plot_resampling(ax, X, y, title):
         c0, c1 = plot_resampling(ax, X_vis, y, 'Original set')
     else:
         iht = InstanceHardnessThreshold(sampling_strategy=sampling_strategy,
-                                        estimator=LogisticRegression(),
+                                        estimator=LogisticRegression(
+                                            solver='lbfgs',
+                                            multi_class='auto'),
                                         return_indices=True)
         X_res, y_res, idx_res = iht.fit_resample(X, y)
         X_res_vis = pca.transform(X_res)
6 changes: 2 additions & 4 deletions imblearn/combine/tests/test_smote_enn.py
@@ -48,8 +48,7 @@ def test_sample_regular():
 def test_sample_regular_pass_smote_enn():
     smote = SMOTEENN(
         smote=SMOTE(sampling_strategy='auto', random_state=RND_SEED),
-        enn=EditedNearestNeighbours(
-            sampling_strategy='all', random_state=RND_SEED),
+        enn=EditedNearestNeighbours(sampling_strategy='all'),
         random_state=RND_SEED)
     X_resampled, y_resampled = smote.fit_resample(X, Y)

@@ -77,8 +76,7 @@ def test_sample_regular_half():

 def test_validate_estimator_init():
     smote = SMOTE(random_state=RND_SEED)
-    enn = EditedNearestNeighbours(
-        random_state=RND_SEED, sampling_strategy='all')
+    enn = EditedNearestNeighbours(sampling_strategy='all')
     smt = SMOTEENN(smote=smote, enn=enn, random_state=RND_SEED)
     X_resampled, y_resampled = smt.fit_resample(X, Y)
     X_gt = np.array([[1.52091956, -0.49283504], [0.84976473, -0.15570176], [
2 changes: 1 addition & 1 deletion imblearn/combine/tests/test_smote_tomek.py
@@ -70,7 +70,7 @@ def test_sample_regular_half():

 def test_validate_estimator_init():
     smote = SMOTE(random_state=RND_SEED)
-    tomek = TomekLinks(random_state=RND_SEED, sampling_strategy='all')
+    tomek = TomekLinks(sampling_strategy='all')
     smt = SMOTETomek(smote=smote, tomek=tomek, random_state=RND_SEED)
     X_resampled, y_resampled = smt.fit_resample(X, Y)
     X_gt = np.array([[0.68481731, 0.51935141], [1.34192108, -0.13367336], [
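
Why `random_state` is dropped here and in the SMOTEENN tests above: the cleaning samplers (`EditedNearestNeighbours`, `TomekLinks`) are deterministic, and imbalanced-learn 0.4 deprecates their `random_state` parameter. A sketch of the updated construction, assuming imbalanced-learn 0.4 (only the stochastic SMOTE step keeps a seed):

```python
# Sketch: deterministic cleaning samplers no longer take random_state.
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours, TomekLinks

smote = SMOTE(random_state=0)
smote_enn = SMOTEENN(smote=smote,
                     enn=EditedNearestNeighbours(sampling_strategy='all'),
                     random_state=0)
smote_tomek = SMOTETomek(smote=smote,
                         tomek=TomekLinks(sampling_strategy='all'),
                         random_state=0)
```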
2 changes: 2 additions & 0 deletions imblearn/datasets/tests/test_imbalance.py
@@ -7,6 +7,7 @@

 from collections import Counter

+import pytest
 import numpy as np

 from pytest import raises
@@ -53,6 +54,7 @@ def test_make_imbalance_dict():
     assert Counter(y_) == {0: 10, 1: 20, 2: 50}


+@pytest.mark.filterwarnings("ignore:'ratio' has been deprecated in 0.4")
 def test_make_imbalance_ratio():
     # check that using 'ratio' is working
     sampling_strategy = {0: 10, 1: 20, 2: 30}
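
The `filterwarnings` mark scopes an expected deprecation to a single test instead of a global filter. A self-contained sketch of the pattern (the `resample_with_ratio` stub is hypothetical, standing in for the deprecated `ratio` code path):

```python
# Sketch: suppress one expected deprecation warning for one test only.
import warnings
import pytest

def resample_with_ratio():
    # Hypothetical stand-in for code still using the deprecated alias.
    warnings.warn("'ratio' has been deprecated in 0.4", DeprecationWarning)
    return {0: 10, 1: 20, 2: 30}

@pytest.mark.filterwarnings("ignore:'ratio' has been deprecated in 0.4")
def test_ratio_alias_still_works():
    assert resample_with_ratio() == {0: 10, 1: 20, 2: 30}
```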
2 changes: 1 addition & 1 deletion imblearn/ensemble/_balance_cascade.py
@@ -179,7 +179,7 @@ def _fit_resample(self, X, y):
             # fit and predict using cross validation
             X_subset = safe_indexing(X, subset_indices)
             y_subset = safe_indexing(y, subset_indices)
-            pred = cross_val_predict(self.estimator_, X_subset, y_subset)
+            pred = cross_val_predict(self.estimator_, X_subset, y_subset, cv=3)
             # extract the prediction about the targeted classes only
             pred_target = pred[:index_under_sample.size]
             index_classified = index_under_sample[pred_target == safe_indexing(
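
Pinning `cv=3` preserves the previous behavior while silencing scikit-learn 0.20's FutureWarning that the default number of folds moves from 3 to 5 in 0.22. A minimal sketch under that assumption:

```python
# Sketch (scikit-learn 0.20): passing cv explicitly avoids the FutureWarning
# about the default changing from 3-fold to 5-fold in 0.22.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

X, y = make_classification(random_state=0)
est = LogisticRegression(solver='lbfgs')
pred = cross_val_predict(est, X, y, cv=3)  # same folds as the old default
print(pred.shape)
```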
6 changes: 3 additions & 3 deletions imblearn/ensemble/_easy_ensemble.py
@@ -93,9 +93,9 @@ class EasyEnsemble(BaseEnsembleSampler):
     ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
     >>> print('Original dataset shape %s' % Counter(y))
     Original dataset shape Counter({{1: 900, 0: 100}})
-    >>> ee = EasyEnsemble(random_state=42)
-    >>> X_res, y_res = ee.fit_resample(X, y)
-    >>> print('Resampled dataset shape %s' % Counter(y_res[0]))
+    >>> ee = EasyEnsemble(random_state=42)  # doctest: +SKIP
+    >>> X_res, y_res = ee.fit_resample(X, y)  # doctest: +SKIP
+    >>> print('Resampled dataset shape %s' % Counter(y_res[0]))  # doctest: +SKIP
     Resampled dataset shape Counter({{0: 100, 1: 100}})

     """
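
The `+SKIP` markers here and in `doc/ensemble.rst` presumably keep the deprecated `EasyEnsemble` from failing the doctest run once it starts warning; version 0.4 points users to `EasyEnsembleClassifier` instead. A sketch of the replacement (assumes imbalanced-learn 0.4):

```python
# Sketch: EasyEnsembleClassifier is the documented replacement for the
# deprecated EasyEnsemble sampler.
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.ensemble import EasyEnsembleClassifier

X, y = make_classification(n_classes=2, weights=[0.1, 0.9],
                           n_samples=1000, random_state=10)
print('Original dataset shape %s' % Counter(y))
eec = EasyEnsembleClassifier(n_estimators=10, random_state=42)
eec.fit(X, y)  # each AdaBoost member trains on a balanced bootstrap subset
print(eec.predict(X[:5]))
```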
22 changes: 13 additions & 9 deletions imblearn/ensemble/tests/test_bagging.py
@@ -47,10 +47,10 @@ def test_balanced_bagging_classifier():
     for base_estimator in [
             None,
             DummyClassifier(),
-            Perceptron(),
+            Perceptron(max_iter=1000, tol=1e-3),
             DecisionTreeClassifier(),
             KNeighborsClassifier(),
-            SVC()
+            SVC(gamma='scale')
     ]:
         for params in grid:
             BalancedBaggingClassifier(
@@ -155,8 +155,10 @@ def test_probability():

     # Degenerate case, where some classes are missing
     ensemble = BalancedBaggingClassifier(
-        base_estimator=LogisticRegression(), random_state=0,
-        max_samples=5).fit(X_train, y_train)
+        base_estimator=LogisticRegression(solver='lbfgs',
+                                          multi_class='auto'),
+        random_state=0, max_samples=5)
+    ensemble.fit(X_train, y_train)

     assert_array_almost_equal(
         np.sum(ensemble.predict_proba(X_test), axis=1),
@@ -179,7 +181,7 @@ def test_oob_score_classification():
         random_state=0)
     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

-    for base_estimator in [DecisionTreeClassifier(), SVC()]:
+    for base_estimator in [DecisionTreeClassifier(), SVC(gamma='scale')]:
         clf = BalancedBaggingClassifier(
             base_estimator=base_estimator,
             n_estimators=100,
@@ -282,8 +284,8 @@ def test_gridsearch():
     parameters = {'n_estimators': (1, 2), 'base_estimator__C': (1, 2)}

     GridSearchCV(
-        BalancedBaggingClassifier(SVC()), parameters, scoring="roc_auc").fit(
-            X, y)
+        BalancedBaggingClassifier(SVC(gamma='scale')), parameters, cv=3,
+        scoring="roc_auc").fit(X, y)


 def test_base_estimator():
@@ -311,7 +313,8 @@ def test_base_estimator():
         DecisionTreeClassifier)

     ensemble = BalancedBaggingClassifier(
-        Perceptron(), n_jobs=3, random_state=0).fit(X_train, y_train)
+        Perceptron(max_iter=1000, tol=1e-3), n_jobs=3, random_state=0).fit(
+            X_train, y_train)

     assert isinstance(ensemble.base_estimator_.steps[-1][1], Perceptron)

@@ -445,7 +448,8 @@ def test_estimators_samples():

     # remap the y outside of the BalancedBaggingclassifier
     # _, y = np.unique(y, return_inverse=True)
-    bagging = BalancedBaggingClassifier(LogisticRegression(),
+    bagging = BalancedBaggingClassifier(LogisticRegression(solver='lbfgs',
+                                                           multi_class='auto'),
                                         max_samples=0.5,
                                         max_features=0.5, random_state=1,
                                         bootstrap=False)
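
The recurring constructor arguments in this test file pin scikit-learn defaults that 0.20 warns about. A compact sketch of the warning-free spellings (assumes scikit-learn 0.20):

```python
# Sketch: explicit values for defaults that change in later releases.
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC

Perceptron(max_iter=1000, tol=1e-3)  # old defaults (5 / None) changed in 0.21
SVC(gamma='scale')                   # gamma default moves from 'auto' in 0.22
```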
2 changes: 1 addition & 1 deletion imblearn/ensemble/tests/test_balance_cascade.py
@@ -118,7 +118,7 @@ def test_fit_resample_auto_early_stop():

 def test_give_classifier_obj():
     sampling_strategy = 'auto'
-    estimator = RandomForestClassifier(random_state=RND_SEED)
+    estimator = RandomForestClassifier(n_estimators=10, random_state=RND_SEED)
     bc = BalanceCascade(
         sampling_strategy=sampling_strategy,
         random_state=RND_SEED,
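
`RandomForestClassifier()` on scikit-learn 0.20 warns that the `n_estimators` default changes from 10 to 100 in 0.22; pinning it keeps the test fast and the old behavior explicit. Sketch:

```python
# Sketch (scikit-learn 0.20): pin n_estimators to silence the FutureWarning.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(random_state=0)
clf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)
print(clf.score(X, y))
```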
2 changes: 1 addition & 1 deletion imblearn/ensemble/tests/test_easy_ensemble.py
@@ -292,5 +292,5 @@ def test_easy_ensemble_classifier_grid_search():
                   'base_estimator__n_estimators': [3, 4]}
     grid_search = GridSearchCV(
         EasyEnsembleClassifier(base_estimator=AdaBoostClassifier()),
-        parameters)
+        parameters, cv=5, iid=False)
     grid_search.fit(X, y)
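
scikit-learn 0.20 warns when grid search relies on the `iid` default (changing in 0.22) and on the default `cv`, so both are now explicit. A sketch under that assumption:

```python
# Sketch (scikit-learn 0.20): explicit cv and iid silence two FutureWarnings.
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

X, y = make_classification(random_state=0)
grid = GridSearchCV(SVC(gamma='scale'), {'C': [0.1, 1.0]}, cv=5, iid=False)
grid.fit(X, y)
print(grid.best_params_)
```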
8 changes: 7 additions & 1 deletion imblearn/keras/_generator.py
@@ -24,6 +24,10 @@
 from ..utils._docstring import _random_state_docstring
 from ..tensorflow import balanced_batch_generator as tf_bbg

+DONT_HAVE_RANDOM_STATE = ('NearMiss', 'EditedNearestNeighbours',
+                          'RepeatedEditedNearestNeighbours', 'AllKNN',
+                          'NeighbourhoodCleaningRule', 'TomekLinks')
+

 class BalancedBatchGenerator(ParentClass):
     """Create balanced batches when training a keras model.
@@ -122,7 +126,9 @@ def _sample(self):
                     "which has an attribute 'return_indices'.")
         self.sampler_ = clone(self.sampler)
         self.sampler_.set_params(return_indices=True)
-        set_random_state(self.sampler_, random_state)
+        # FIXME: Remove in 0.6
+        if self.sampler_.__class__.__name__ not in DONT_HAVE_RANDOM_STATE:
+            set_random_state(self.sampler_, random_state)

         _, _, self.indices_ = self.sampler_.fit_resample(self.X, self.y)
         # shuffle the indices since the sampler are packing them by class
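
Context for the guard: `set_random_state` assigns `random_state` through `set_params`, which would trip the 0.4 deprecation warning on samplers that no longer take the parameter. A generic sketch of the pattern (`maybe_set_random_state` is a hypothetical helper, illustrative only):

```python
# Sketch of the guard: only touch random_state on samplers that still use it.
DETERMINISTIC_SAMPLERS = ('NearMiss', 'EditedNearestNeighbours',
                          'RepeatedEditedNearestNeighbours', 'AllKNN',
                          'NeighbourhoodCleaningRule', 'TomekLinks')

def maybe_set_random_state(sampler, random_state):
    # Hypothetical helper mirroring the FIXME'd branch above.
    if sampler.__class__.__name__ not in DETERMINISTIC_SAMPLERS:
        sampler.set_params(random_state=random_state)
```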
3 changes: 3 additions & 0 deletions imblearn/keras/tests/test_generator.py
@@ -13,6 +13,7 @@
 from imblearn.datasets import make_imbalance
 from imblearn.under_sampling import ClusterCentroids
 from imblearn.under_sampling import NearMiss
+from imblearn.over_sampling import RandomOverSampler

 from imblearn.keras import BalancedBatchGenerator
 from imblearn.keras import balanced_batch_generator
@@ -38,6 +39,7 @@ def test_balanced_batch_generator_class_no_return_indices():
 @pytest.mark.parametrize(
     "sampler, sample_weight",
     [(None, None),
+     (RandomOverSampler(), None),
      (NearMiss(), None),
      (None, np.random.uniform(size=(y.shape[0])))]
 )
@@ -75,6 +77,7 @@ def test_balanced_batch_generator_function_no_return_indices():
 @pytest.mark.parametrize(
     "sampler, sample_weight",
     [(None, None),
+     (RandomOverSampler(), None),
      (NearMiss(), None),
      (None, np.random.uniform(size=(y.shape[0])))]
 )
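
`RandomOverSampler` joins the parametrization since it keeps `random_state` and supports `return_indices`. A minimal usage sketch of the API under test (assumes imbalanced-learn 0.4 with keras installed; the data is illustrative):

```python
# Sketch: balanced mini-batches from an imbalanced set via the generator API.
import numpy as np
from imblearn.keras import balanced_batch_generator
from imblearn.over_sampling import RandomOverSampler

X = np.random.uniform(size=(1000, 10)).astype(np.float32)
y = np.array([0] * 900 + [1] * 100)
training_generator, steps_per_epoch = balanced_batch_generator(
    X, y, sampler=RandomOverSampler(), batch_size=40, random_state=42)
# model.fit_generator(generator=training_generator,
#                     steps_per_epoch=steps_per_epoch, epochs=10)
```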
8 changes: 5 additions & 3 deletions imblearn/metrics/_classification.py
@@ -632,11 +632,13 @@ class is unrecognized by the classifier, G-mean resolves to zero. To
         tp_sum = tp_sum[indices]
         true_sum = true_sum[indices]

-        recall = _prf_divide(tp_sum, true_sum, "recall", "true", None,
-                             "recall")
+        with np.errstate(divide='ignore', invalid='ignore'):
+            recall = _prf_divide(tp_sum, true_sum, "recall", "true", None,
+                                 "recall")
         recall[recall == 0] = correction

-        gmean = sp.stats.gmean(recall)
+        with np.errstate(divide='ignore', invalid='ignore'):
+            gmean = sp.stats.gmean(recall)
         # old version of scipy return MaskedConstant instead of 0.0
         if isinstance(gmean, np.ma.core.MaskedConstant):
             return 0.0
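
`np.errstate` scopes numpy's divide/invalid floating-point warnings to a single block instead of muting them globally. A generic sketch of the pattern:

```python
# Sketch: np.errstate scopes divide/invalid warnings to one block.
import numpy as np

tp_sum = np.array([0., 3., 5.])
true_sum = np.array([0., 4., 5.])
with np.errstate(divide='ignore', invalid='ignore'):
    recall = tp_sum / true_sum        # 0/0 -> nan, silently
recall[np.isnan(recall)] = 0.0        # corrected afterwards
print(recall)                         # [0.   0.75 1.  ]
```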