[MRG+2] FIX: make the deprecation in fit and not init in IsolationForest (#11574)

glemaitre · amueller · commit cba99e01330c · 2018-07-17T14:52:30.000-05:00
* FIX make the deprecation in fit instead than init

* DOC mention version changed

* FIX investigate weird failing test
diff --git a/sklearn/ensemble/iforest.py b/sklearn/ensemble/iforest.py
@@ -70,6 +70,10 @@ class IsolationForest(BaseBagging, OutlierMixin):
         on the decision function. If 'auto', the decision function threshold is
         determined as in the original paper.
 
+        .. versionchanged:: 0.20
+           The default value of ``contamination`` will change from 0.1 in 0.20
+           to ``'auto'`` in 0.22.
+
     max_features : int or float, optional (default=1.0)
         The number of features to draw from X to train each base estimator.
 
@@ -150,12 +154,6 @@ def __init__(self,
             n_jobs=n_jobs,
             random_state=random_state,
             verbose=verbose)
-
-        if contamination == "legacy":
-            warnings.warn('default contamination parameter 0.1 will change '
-                          'in version 0.22 to "auto". This will change the '
-                          'predict method behavior.',
-                          DeprecationWarning)
         self.contamination = contamination
 
     def _set_oob_score(self, X, y):
@@ -178,6 +176,15 @@ def fit(self, X, y=None, sample_weight=None):
         -------
         self : object
         """
+        if self.contamination == "legacy":
+            warnings.warn('default contamination parameter 0.1 will change '
+                          'in version 0.22 to "auto". This will change the '
+                          'predict method behavior.',
+                          FutureWarning)
+            self._contamination = 0.1
+        else:
+            self._contamination = self.contamination
+
         X = check_array(X, accept_sparse=['csc'])
         if issparse(X):
             # Pre-sort indices to avoid that each individual tree of the
@@ -219,19 +226,16 @@ def fit(self, X, y=None, sample_weight=None):
                                           max_depth=max_depth,
                                           sample_weight=sample_weight)
 
-        if self.contamination == "auto":
+        if self._contamination == "auto":
             # 0.5 plays a special role as described in the original paper.
             # we take the opposite as we consider the opposite of their score.
             self.offset_ = -0.5
             # need to save (depreciated) threshold_ in this case:
             self._threshold_ = sp.stats.scoreatpercentile(
                 self.score_samples(X), 100. * 0.1)
-        elif self.contamination == "legacy":  # to be rm in 0.22
-            self.offset_ = sp.stats.scoreatpercentile(
-                self.score_samples(X), 100. * 0.1)
         else:
             self.offset_ = sp.stats.scoreatpercentile(
-                self.score_samples(X), 100. * self.contamination)
+                self.score_samples(X), 100. * self._contamination)
 
         return self
 
diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py
@@ -62,6 +62,7 @@ def test_iforest():
                             **params).fit(X_train).predict(X_test)
 
 
+@pytest.mark.filterwarnings('ignore:default contamination')
 def test_iforest_sparse():
     """Check IForest for various parameter settings on sparse input."""
     rng = check_random_state(0)
@@ -89,6 +90,7 @@ def test_iforest_sparse():
             assert_array_equal(sparse_results, dense_results)
 
 
+@pytest.mark.filterwarnings('ignore:default contamination')
 def test_iforest_error():
     """Test that it gives proper exception on deficient input."""
     X = iris.data
@@ -127,6 +129,7 @@ def test_iforest_error():
     assert_raises(ValueError, IsolationForest().fit(X).predict, X[:, 1:])
 
 
+@pytest.mark.filterwarnings('ignore:default contamination')
 def test_recalculate_max_depth():
     """Check max_depth recalculation when max_samples is reset to n_samples"""
     X = iris.data
@@ -135,6 +138,7 @@ def test_recalculate_max_depth():
         assert_equal(est.max_depth, int(np.ceil(np.log2(X.shape[0]))))
 
 
+@pytest.mark.filterwarnings('ignore:default contamination')
 def test_max_samples_attribute():
     X = iris.data
     clf = IsolationForest().fit(X)
@@ -150,6 +154,7 @@ def test_max_samples_attribute():
     assert_equal(clf.max_samples_, 0.4*X.shape[0])
 
 
+@pytest.mark.filterwarnings('ignore:default contamination')
 def test_iforest_parallel_regression():
     """Check parallel regression."""
     rng = check_random_state(0)
@@ -174,6 +179,7 @@ def test_iforest_parallel_regression():
     assert_array_almost_equal(y1, y3)
 
 
+@pytest.mark.filterwarnings('ignore:default contamination')
 def test_iforest_performance():
     """Test Isolation Forest performs well"""
 
@@ -213,13 +219,15 @@ def test_iforest_works():
         assert_array_equal(pred, 6 * [1] + 2 * [-1])
 
 
+@pytest.mark.filterwarnings('ignore:default contamination')
 def test_max_samples_consistency():
     # Make sure validated max_samples in iforest and BaseBagging are identical
     X = iris.data
     clf = IsolationForest().fit(X)
     assert_equal(clf.max_samples_, clf._max_samples)
 
 
+@pytest.mark.filterwarnings('ignore:default contamination')
 def test_iforest_subsampled_features():
     # It tests non-regression for #5732 which failed at predict.
     rng = check_random_state(0)
@@ -244,6 +252,7 @@ def test_iforest_average_path_length():
                               [1., result_one, result_two], decimal=10)
 
 
+@pytest.mark.filterwarnings('ignore:default contamination')
 def test_score_samples():
     X_train = [[1, 1], [1, 2], [2, 1]]
     clf1 = IsolationForest(contamination=0.1).fit(X_train)
@@ -257,12 +266,15 @@ def test_score_samples():
 
 
 def test_deprecation():
-    assert_warns_message(DeprecationWarning,
+    X = [[0.0], [1.0]]
+    clf = IsolationForest()
+
+    assert_warns_message(FutureWarning,
                          'default contamination parameter 0.1 will change '
                          'in version 0.22 to "auto"',
-                         IsolationForest, )
-    X = [[0.0], [1.0]]
-    clf = IsolationForest().fit(X)
+                         clf.fit, X)
+
+    clf = IsolationForest(contamination='auto').fit(X)
     assert_warns_message(DeprecationWarning,
                          "threshold_ attribute is deprecated in 0.20 and will"
                          " be removed in 0.22.",
diff --git a/sklearn/linear_model/tests/test_sag.py b/sklearn/linear_model/tests/test_sag.py
@@ -17,6 +17,7 @@
 from sklearn.utils.extmath import row_norms
 from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_array_almost_equal
+from sklearn.utils.testing import assert_allclose
 from sklearn.utils.testing import assert_greater
 from sklearn.utils.testing import assert_raise_message
 from sklearn.utils.testing import ignore_warnings
@@ -269,7 +270,6 @@ def test_classifier_matching():
         assert_array_almost_equal(intercept2, clf.intercept_, decimal=9)
 
 
-@ignore_warnings
 def test_regressor_matching():
     n_samples = 10
     n_features = 5
@@ -295,10 +295,10 @@ def test_regressor_matching():
                                dloss=squared_dloss,
                                fit_intercept=fit_intercept)
 
-    assert_array_almost_equal(weights1, clf.coef_, decimal=10)
-    assert_array_almost_equal(intercept1, clf.intercept_, decimal=10)
-    assert_array_almost_equal(weights2, clf.coef_, decimal=10)
-    assert_array_almost_equal(intercept2, clf.intercept_, decimal=10)
+    assert_allclose(weights1, clf.coef_)
+    assert_allclose(intercept1, clf.intercept_)
+    assert_allclose(weights2, clf.coef_)
+    assert_allclose(intercept2, clf.intercept_)
 
 
 @ignore_warnings