
[MRG+1] iforest backward compatibility #11553

Merged: 8 commits, Jul 23, 2018
3 changes: 2 additions & 1 deletion benchmarks/bench_isolation_forest.py
@@ -119,7 +119,8 @@ def print_outlier_ratio(y):
y_test = y[n_samples_train:]

print('--- Fitting the IsolationForest estimator...')
model = IsolationForest(n_jobs=-1, random_state=random_state)
model = IsolationForest(behaviour='new', n_jobs=-1,
random_state=random_state)
tstart = time()
model.fit(X_train)
fit_time = time() - tstart
11 changes: 11 additions & 0 deletions doc/whats_new/v0.20.rst
@@ -901,6 +901,17 @@ Outlier Detection models
the ``fit_predict`` method is available.
By :user:`Albert Thomas <albertcthomas>`.

- A ``behaviour`` parameter has been introduced in :class:`ensemble.IsolationForest`
to ensure backward compatibility.
In the old behaviour, ``decision_function`` is independent of the ``contamination``
parameter; a threshold attribute that depends on the ``contamination`` parameter is
therefore used to detect outliers.
In the new behaviour, ``decision_function`` depends on the ``contamination``
parameter, in such a way that 0 becomes its natural threshold to detect outliers.
Setting ``behaviour`` to "old" is deprecated and will no longer be possible in
version 0.22. Besides, the ``behaviour`` parameter itself will be removed in 0.24.
:issue:`11553` by `Nicolas Goix`_.
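As a quick illustration of the new behaviour described above (a minimal sketch, assuming scikit-learn 0.20 with this change applied; the toy data is made up and not part of the changeset):

import numpy as np
from sklearn.ensemble import IsolationForest

X = np.array([[-1.1], [0.3], [0.5], [100.0]])

# behaviour='new': decision_function is shifted by offset_ so that 0 is the
# natural threshold; negative scores flag outliers.
clf = IsolationForest(behaviour='new', contamination='auto',
                      random_state=0).fit(X)
scores = clf.decision_function(X)   # outliers score below 0
labels = clf.predict(X)             # -1 for outliers, 1 for inliers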

Covariance

- The :func:`covariance.graph_lasso`, :class:`covariance.GraphLasso` and
3 changes: 2 additions & 1 deletion examples/ensemble/plot_isolation_forest.py
@@ -40,7 +40,8 @@
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))

# fit the model
clf = IsolationForest(max_samples=100, random_state=rng, contamination='auto')
clf = IsolationForest(behaviour='new', max_samples=100,
random_state=rng, contamination='auto')
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
3 changes: 2 additions & 1 deletion examples/plot_anomaly_comparison.py
@@ -80,7 +80,8 @@
("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)),
("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf",
gamma=0.1)),
("Isolation Forest", IsolationForest(contamination=outliers_fraction,
("Isolation Forest", IsolationForest(behaviour='new',
contamination=outliers_fraction,
random_state=42)),
("Local Outlier Factor", LocalOutlierFactor(
n_neighbors=35, contamination=outliers_fraction))]
69 changes: 59 additions & 10 deletions sklearn/ensemble/iforest.py
@@ -89,6 +89,26 @@ class IsolationForest(BaseBagging, OutlierMixin):
The number of jobs to run in parallel for both `fit` and `predict`.
If -1, then the number of jobs is set to the number of cores.

behaviour : str, default='old'
Behaviour of the ``decision_function``, which can be either 'old' or
'new'. Passing ``behaviour='new'`` makes the ``decision_function``
change to match the API of the other anomaly detection algorithms,
which will become the default behaviour in the future. As explained in
detail in the ``offset_`` attribute documentation, the
``decision_function`` becomes dependent on the ``contamination``
parameter, in such a way that 0 becomes its natural threshold to
detect outliers.

.. versionadded:: 0.20
``behaviour`` is added in 0.20 for backward-compatibility purposes.

.. deprecated:: 0.20
``behaviour='old'`` is deprecated in 0.20 and will not be possible
in 0.22.

.. deprecated:: 0.22
The ``behaviour`` parameter will be deprecated in 0.22 and removed
in 0.24.

random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
@@ -114,12 +134,16 @@ class IsolationForest(BaseBagging, OutlierMixin):
offset_ : float
Offset used to define the decision function from the raw scores.
We have the relation: ``decision_function = score_samples - offset_``.
Assuming behaviour == 'new', offset_ is defined as follows.
When the contamination parameter is set to "auto", the offset is equal
to -0.5 as the scores of inliers are close to 0 and the scores of
outliers are close to -1. When a contamination parameter different
than "auto" is provided, the offset is defined in such a way we obtain
the expected number of outliers (samples with decision function < 0)
in training.
Assuming the behaviour parameter is set to 'old', we always have
offset_ = -0.5, making the decision function independent from the
contamination parameter.

References
----------
@@ -138,6 +162,7 @@ def __init__(self,
max_features=1.,
bootstrap=False,
n_jobs=1,
behaviour='old',
random_state=None,
verbose=0):
super(IsolationForest, self).__init__(
@@ -154,6 +179,8 @@ def __init__(self,
n_jobs=n_jobs,
random_state=random_state,
verbose=verbose)

self.behaviour = behaviour
self.contamination = contamination

def _set_oob_score(self, X, y):
@@ -185,6 +212,13 @@ def fit(self, X, y=None, sample_weight=None):
else:
self._contamination = self.contamination

if self.behaviour == 'old':
warnings.warn('behaviour="old" is deprecated and will be removed '
'in version 0.22. Please use behaviour="new", which '
'makes the decision_function change to match '
'other anomaly detection algorithm API.',
FutureWarning)

X = check_array(X, accept_sparse=['csc'])
if issparse(X):
# Pre-sort indices to avoid that each individual tree of the
@@ -226,16 +260,29 @@ def fit(self, X, y=None, sample_weight=None):
max_depth=max_depth,
sample_weight=sample_weight)

if self.behaviour == 'old':
# in this case, decision_function = 0.5 + self.score_samples(X):
if self._contamination == "auto":
raise ValueError("contamination parameter cannot be set to "
"'auto' when behaviour == 'old'.")

self.offset_ = -0.5
self._threshold_ = sp.stats.scoreatpercentile(
self.decision_function(X), 100. * self._contamination)

return self

# else, self.behaviour == 'new':
if self._contamination == "auto":
# 0.5 plays a special role as described in the original paper.
# we take the opposite as we consider the opposite of their score.
self.offset_ = -0.5
# need to save (depreciated) threshold_ in this case:
self._threshold_ = sp.stats.scoreatpercentile(
self.score_samples(X), 100. * 0.1)
else:
self.offset_ = sp.stats.scoreatpercentile(
self.score_samples(X), 100. * self._contamination)
return self

# else, define offset_ wrt contamination parameter, so that the
# threshold_ attribute is implicitly 0 and is not needed anymore:
self.offset_ = sp.stats.scoreatpercentile(
self.score_samples(X), 100. * self._contamination)

return self
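To make the offset logic above concrete, here is a standalone sketch of how offset_ is chosen under behaviour='new' (an illustration of the percentile rule shown in the diff, not the library's exact code; np.percentile stands in for sp.stats.scoreatpercentile):

import numpy as np

def new_behaviour_offset(raw_scores, contamination):
    # raw_scores: score_samples(X) on the training data; inliers score
    # close to 0 and outliers close to -1.
    if contamination == 'auto':
        return -0.5
    # Otherwise pick the offset so that roughly a `contamination` fraction
    # of the training samples end up with decision_function < 0.
    return np.percentile(raw_scores, 100. * contamination)

# decision_function(X) is then score_samples(X) - offset_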

@@ -258,7 +305,8 @@ def predict(self, X):
check_is_fitted(self, ["offset_"])
X = check_array(X, accept_sparse='csr')
is_inlier = np.ones(X.shape[0], dtype=int)
is_inlier[self.decision_function(X) < 0] = -1
threshold = self.threshold_ if self.behaviour == 'old' else 0
is_inlier[self.decision_function(X) < threshold] = -1
return is_inlier
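A tiny numeric illustration of the thresholding rule above (made-up scores, not library code):

import numpy as np

scores = np.array([0.12, 0.03, -0.08, -0.21])  # decision_function output
threshold = 0   # behaviour='new'; behaviour='old' would use threshold_
is_inlier = np.ones(scores.shape[0], dtype=int)
is_inlier[scores < threshold] = -1
# is_inlier is now array([ 1,  1, -1, -1])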

def decision_function(self, X):
@@ -359,11 +407,12 @@ def score_samples(self, X):

@property
def threshold_(self):
if self.behaviour != 'old':
raise AttributeError("threshold_ attribute does not exist when "
"behaviour != 'old'")
warnings.warn("threshold_ attribute is deprecated in 0.20 and will"
" be removed in 0.22.", DeprecationWarning)
if self.contamination == 'auto':
return self._threshold_
return self.offset_
return self._threshold_
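A short sketch of how the deprecated attribute behaves after this change (assuming scikit-learn 0.20 with this patch; the warning and error are summarised in comments):

from sklearn.ensemble import IsolationForest

X = [[0.0], [0.1], [0.2], [10.0]]

old = IsolationForest(behaviour='old', contamination=0.1).fit(X)
old.threshold_   # DeprecationWarning, returns the stored percentile threshold

new = IsolationForest(behaviour='new', contamination='auto').fit(X)
new.threshold_   # AttributeError: threshold_ does not exist when behaviour != 'old'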


def _average_path_length(n_samples_leaf):
43 changes: 41 additions & 2 deletions sklearn/ensemble/tests/test_iforest.py
@@ -15,6 +15,7 @@
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_raises_regex
from sklearn.utils.testing import assert_warns_message
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_greater
@@ -47,6 +48,7 @@
boston.target = boston.target[perm]


@pytest.mark.filterwarnings('ignore:threshold_ attribute')
def test_iforest():
"""Check Isolation Forest for various parameter settings."""
X_train = np.array([[0, 1], [1, 2]])
@@ -63,6 +65,8 @@ def test_iforest():


@pytest.mark.filterwarnings('ignore:default contamination')
@pytest.mark.filterwarnings('ignore:threshold_ attribute')
@pytest.mark.filterwarnings('ignore:behaviour="old"')
def test_iforest_sparse():
"""Check IForest for various parameter settings on sparse input."""
rng = check_random_state(0)
@@ -91,6 +95,8 @@


@pytest.mark.filterwarnings('ignore:default contamination')
@pytest.mark.filterwarnings('ignore:threshold_ attribute')
@pytest.mark.filterwarnings('ignore:behaviour="old"')
def test_iforest_error():
"""Test that it gives proper exception on deficient input."""
X = iris.data
@@ -128,8 +134,14 @@ def test_iforest_error():
# test X_test n_features match X_train one:
assert_raises(ValueError, IsolationForest().fit(X).predict, X[:, 1:])

# test threshold_ attribute error when behaviour is not old:
msg = "threshold_ attribute does not exist when behaviour != 'old'"
assert_raises_regex(AttributeError, msg, getattr,
IsolationForest(behaviour='new'), 'threshold_')


@pytest.mark.filterwarnings('ignore:default contamination')
@pytest.mark.filterwarnings('ignore:behaviour="old"')
def test_recalculate_max_depth():
"""Check max_depth recalculation when max_samples is reset to n_samples"""
X = iris.data
@@ -139,6 +151,7 @@ def test_recalculate_max_depth():


@pytest.mark.filterwarnings('ignore:default contamination')
@pytest.mark.filterwarnings('ignore:behaviour="old"')
def test_max_samples_attribute():
X = iris.data
clf = IsolationForest().fit(X)
@@ -155,6 +168,8 @@


@pytest.mark.filterwarnings('ignore:default contamination')
@pytest.mark.filterwarnings('ignore:threshold_ attribute')
@pytest.mark.filterwarnings('ignore:behaviour="old"')
def test_iforest_parallel_regression():
"""Check parallel regression."""
rng = check_random_state(0)
@@ -180,6 +195,7 @@ def test_iforest_parallel_regression():


@pytest.mark.filterwarnings('ignore:default contamination')
@pytest.mark.filterwarnings('ignore:behaviour="old"')
def test_iforest_performance():
"""Test Isolation Forest performs well"""

@@ -204,13 +220,15 @@ def test_iforest_performance():
assert_greater(roc_auc_score(y_test, y_pred), 0.98)


@pytest.mark.filterwarnings('ignore:threshold_ attribute')
def test_iforest_works():
# toy sample (the last two samples are outliers)
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]

# Test IsolationForest
for contamination in [0.25, "auto"]:
clf = IsolationForest(random_state=rng, contamination=contamination)
clf = IsolationForest(behaviour='new', random_state=rng,
contamination=contamination)
clf.fit(X)
decision_func = - clf.decision_function(X)
pred = clf.predict(X)
@@ -220,6 +238,7 @@ def test_iforest_works():


@pytest.mark.filterwarnings('ignore:default contamination')
@pytest.mark.filterwarnings('ignore:behaviour="old"')
def test_max_samples_consistency():
# Make sure validated max_samples in iforest and BaseBagging are identical
X = iris.data
@@ -228,6 +247,8 @@ def test_max_samples_consistency():


@pytest.mark.filterwarnings('ignore:default contamination')
@pytest.mark.filterwarnings('ignore:threshold_ attribute')
@pytest.mark.filterwarnings('ignore:behaviour="old"')
def test_iforest_subsampled_features():
# It tests non-regression for #5732 which failed at predict.
rng = check_random_state(0)
@@ -253,6 +274,7 @@ def test_iforest_average_path_length():


@pytest.mark.filterwarnings('ignore:default contamination')
@pytest.mark.filterwarnings('ignore:behaviour="old"')
def test_score_samples():
X_train = [[1, 1], [1, 2], [2, 1]]
clf1 = IsolationForest(contamination=0.1).fit(X_train)
@@ -265,6 +287,8 @@ def test_score_samples():
clf2.score_samples([[2., 2.]]))


@pytest.mark.filterwarnings('ignore:default contamination')
@pytest.mark.filterwarnings('ignore:behaviour="old"')
def test_deprecation():
X = [[0.0], [1.0]]
clf = IsolationForest()
@@ -274,8 +298,23 @@ def test_deprecation():
'in version 0.22 to "auto"',
clf.fit, X)

clf = IsolationForest(contamination='auto').fit(X)
assert_warns_message(FutureWarning,
'behaviour="old" is deprecated and will be removed '
'in version 0.22',
clf.fit, X)

clf = IsolationForest().fit(X)
assert_warns_message(DeprecationWarning,
"threshold_ attribute is deprecated in 0.20 and will"
" be removed in 0.22.",
getattr, clf, "threshold_")


@pytest.mark.filterwarnings('ignore:default contamination')
@pytest.mark.filterwarnings('ignore:behaviour="old"')
def test_behaviour_param():
X_train = [[1, 1], [1, 2], [2, 1]]
clf1 = IsolationForest(behaviour='old').fit(X_train)
clf2 = IsolationForest(behaviour='new', contamination='auto').fit(X_train)
assert_array_equal(clf1.decision_function([[2., 2.]]),
clf2.decision_function([[2., 2.]]))
7 changes: 7 additions & 0 deletions sklearn/utils/estimator_checks.py
@@ -368,6 +368,13 @@ def set_checking_parameters(estimator):
if estimator.__class__.__name__ == "TheilSenRegressor":
estimator.max_subpopulation = 100

if estimator.__class__.__name__ == "IsolationForest":
# XXX to be removed in 0.22.
# this is used because the old IsolationForest does not
# respect the outlier detection API and thus does not
# pass the outlier detection common tests.
estimator.set_params(behaviour='new')

if isinstance(estimator, BaseRandomProjection):
# Due to the jl lemma and often very few samples, the number
# of components of the random matrix projection will be probably