iforest behaviour param

ngoix · ngoix · commit f11f657e2f23 · 2018-07-18T00:27:41.000+02:00
fix + cosmit

cosmit

common test

update examples

cosmit + fix travis

attribute error instead of value + mask warning in test

whatsnew
diff --git a/benchmarks/bench_isolation_forest.py b/benchmarks/bench_isolation_forest.py
@@ -119,7 +119,8 @@ def print_outlier_ratio(y):
     y_test = y[n_samples_train:]
 
     print('--- Fitting the IsolationForest estimator...')
-    model = IsolationForest(n_jobs=-1, random_state=random_state)
+    model = IsolationForest(behaviour='new', n_jobs=-1,
+                            random_state=random_state)
     tstart = time()
     model.fit(X_train)
     fit_time = time() - tstart
diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst
@@ -861,6 +861,15 @@ Outlier Detection models
   ``raw_values`` parameter is deprecated as the shifted Mahalanobis distance
   will be always returned in 0.22. :issue:`9015` by `Nicolas Goix`_.
 
+- A ``behaviour`` parameter has been introduced in :class:`ensemble.IsolationForest`
+  to ensure backward compatibility.
+  In the old behaviour, the ``decision_function`` is independent of the ``contamination``
+  parameter. A threshold attribute depending on the ``contamination`` parameter is thus
+  used.
+  In the new behaviour the ``decision_function`` is dependent on the ``contamination``
+  parameter, in such a way that 0 becomes its natural threshold to detect outliers.
+  :issue:`11553` by `Nicolas Goix`_.
+
 Covariance
 
 - The :func:`covariance.graph_lasso`, :class:`covariance.GraphLasso` and
diff --git a/examples/covariance/plot_outlier_detection.py b/examples/covariance/plot_outlier_detection.py
@@ -32,7 +32,6 @@
 """
 
 import numpy as np
-from scipy import stats
 import matplotlib.pyplot as plt
 import matplotlib.font_manager
 
@@ -58,7 +57,8 @@
     "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                                      kernel="rbf", gamma=0.1),
     "Robust covariance": EllipticEnvelope(contamination=outliers_fraction),
-    "Isolation Forest": IsolationForest(max_samples=n_samples,
+    "Isolation Forest": IsolationForest(behaviour='new',
+                                        max_samples=n_samples,
                                         contamination=outliers_fraction,
                                         random_state=rng),
     "Local Outlier Factor": LocalOutlierFactor(
diff --git a/examples/ensemble/plot_isolation_forest.py b/examples/ensemble/plot_isolation_forest.py
@@ -40,7 +40,8 @@
 X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
 
 # fit the model
-clf = IsolationForest(max_samples=100, random_state=rng, contamination='auto')
+clf = IsolationForest(behaviour='new', max_samples=100,
+                      random_state=rng, contamination='auto')
 clf.fit(X_train)
 y_pred_train = clf.predict(X_train)
 y_pred_test = clf.predict(X_test)
diff --git a/examples/plot_anomaly_comparison.py b/examples/plot_anomaly_comparison.py
@@ -54,7 +54,8 @@
     ("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)),
     ("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf",
                                       gamma=0.1)),
-    ("Isolation Forest", IsolationForest(contamination=outliers_fraction,
+    ("Isolation Forest", IsolationForest(behaviour='new',
+                                         contamination=outliers_fraction,
                                          random_state=42)),
     ("Local Outlier Factor", LocalOutlierFactor(
         n_neighbors=35, contamination=outliers_fraction))]
diff --git a/sklearn/ensemble/iforest.py b/sklearn/ensemble/iforest.py
@@ -89,6 +89,15 @@ class IsolationForest(BaseBagging, OutlierMixin):
         The number of jobs to run in parallel for both `fit` and `predict`.
         If -1, then the number of jobs is set to the number of cores.
 
+    behaviour: str, optional (default='old')
+        Accepted values are 'old' or 'new'. Behaviour of the decision_function.
+        Default "behaviour" parameter will change to "new" in version 0.22.
+        Passing behaviour="new" makes the decision_function change to match
+        other anomaly detection algorithm API, as explained in details in the
+        offset_ attribute documentation. Basically, the decision_function
+        becomes dependent on the contamination parameter, in such a way that
+        0 becomes its natural threshold to detect outliers.
+
     random_state : int, RandomState instance or None, optional (default=None)
         If int, random_state is the seed used by the random number generator;
         If RandomState instance, random_state is the random number generator;
@@ -114,12 +123,16 @@ class IsolationForest(BaseBagging, OutlierMixin):
     offset_ : float
         Offset used to define the decision function from the raw scores.
         We have the relation: ``decision_function = score_samples - offset_``.
+        Assuming behaviour == 'new', offset_ is defined as follows.
         When the contamination parameter is set to "auto", the offset is equal
         to -0.5 as the scores of inliers are close to 0 and the scores of
         outliers are close to -1. When a contamination parameter different
         than "auto" is provided, the offset is defined in such a way we obtain
         the expected number of outliers (samples with decision function < 0)
         in training.
+        Assuming the behaviour parameter is set to 'old', we always have
+        offset_ = -0.5, making the decision function independent from the
+        contamination parameter.
 
     References
     ----------
@@ -138,6 +151,7 @@ def __init__(self,
                  max_features=1.,
                  bootstrap=False,
                  n_jobs=1,
+                 behaviour='old',
                  random_state=None,
                  verbose=0):
         super(IsolationForest, self).__init__(
@@ -154,8 +168,17 @@ def __init__(self,
             n_jobs=n_jobs,
             random_state=random_state,
             verbose=verbose)
+
+        self.behaviour = behaviour
         self.contamination = contamination
 
+        if behaviour == 'old':
+            warnings.warn('Default "behaviour" parameter will change to "new" '
+                          'in version 0.22. Passing behaviour="new" makes '
+                          'IsolationForest decision_function change to match '
+                          'other anomaly detection algorithm API.',
+                          FutureWarning)
+
     def _set_oob_score(self, X, y):
         raise NotImplementedError("OOB score not supported by iforest")
 
@@ -226,16 +249,29 @@ def fit(self, X, y=None, sample_weight=None):
                                           max_depth=max_depth,
                                           sample_weight=sample_weight)
 
+        if self.behaviour == 'old':
+            # in this case, decision_function = 0.5 + self.score_samples(X):
+            if self._contamination == "auto":
+                raise ValueError("contamination parameter cannot be set to "
+                                 "'auto' when behaviour == 'old'.")
+
+            self.offset_ = -0.5
+            self._threshold_ = sp.stats.scoreatpercentile(
+                self.decision_function(X), 100. * self._contamination)
+
+            return self
+
+        # else, self.behaviour == 'new':
         if self._contamination == "auto":
             # 0.5 plays a special role as described in the original paper.
             # we take the opposite as we consider the opposite of their score.
             self.offset_ = -0.5
-            # need to save (depreciated) threshold_ in this case:
-            self._threshold_ = sp.stats.scoreatpercentile(
-                self.score_samples(X), 100. * 0.1)
-        else:
-            self.offset_ = sp.stats.scoreatpercentile(
-                self.score_samples(X), 100. * self._contamination)
+            return self
+
+        # else, define offset_ wrt contamination parameter, so that the
+        # threshold_ attribute is implicitly 0 and is not needed anymore:
+        self.offset_ = sp.stats.scoreatpercentile(
+            self.score_samples(X), 100. * self._contamination)
 
         return self
 
@@ -258,7 +294,8 @@ def predict(self, X):
         check_is_fitted(self, ["offset_"])
         X = check_array(X, accept_sparse='csr')
         is_inlier = np.ones(X.shape[0], dtype=int)
-        is_inlier[self.decision_function(X) < 0] = -1
+        threshold = self.threshold_ if self.behaviour == 'old' else 0
+        is_inlier[self.decision_function(X) < threshold] = -1
         return is_inlier
 
     def decision_function(self, X):
@@ -359,11 +396,12 @@ def score_samples(self, X):
 
     @property
     def threshold_(self):
+        if self.behaviour != 'old':
+            raise AttributeError("threshold_ attribute does not exist when "
+                                 "behaviour != 'old'")
         warnings.warn("threshold_ attribute is deprecated in 0.20 and will"
                       " be removed in 0.22.", DeprecationWarning)
-        if self.contamination == 'auto':
-            return self._threshold_
-        return self.offset_
+        return self._threshold_
 
 
 def _average_path_length(n_samples_leaf):
diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py
@@ -15,6 +15,7 @@
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_array_almost_equal
 from sklearn.utils.testing import assert_raises
+from sklearn.utils.testing import assert_raises_regex
 from sklearn.utils.testing import assert_warns_message
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_greater
@@ -47,6 +48,7 @@
 boston.target = boston.target[perm]
 
 
+@pytest.mark.filterwarnings('ignore:threshold_ attribute')
 def test_iforest():
     """Check Isolation Forest for various parameter settings."""
     X_train = np.array([[0, 1], [1, 2]])
@@ -63,6 +65,7 @@ def test_iforest():
 
 
 @pytest.mark.filterwarnings('ignore:default contamination')
+@pytest.mark.filterwarnings('ignore:threshold_ attribute')
 def test_iforest_sparse():
     """Check IForest for various parameter settings on sparse input."""
     rng = check_random_state(0)
@@ -91,6 +94,7 @@ def test_iforest_sparse():
 
 
 @pytest.mark.filterwarnings('ignore:default contamination')
+@pytest.mark.filterwarnings('ignore:threshold_ attribute')
 def test_iforest_error():
     """Test that it gives proper exception on deficient input."""
     X = iris.data
@@ -128,6 +132,11 @@ def test_iforest_error():
     # test X_test n_features match X_train one:
     assert_raises(ValueError, IsolationForest().fit(X).predict, X[:, 1:])
 
+    # test threshold_ attribute error when behaviour is not old:
+    msg = "threshold_ attribute does not exist when behaviour != 'old'"
+    assert_raises_regex(AttributeError, msg, getattr,
+                        IsolationForest(behaviour='new'), 'threshold_')
+
 
 @pytest.mark.filterwarnings('ignore:default contamination')
 def test_recalculate_max_depth():
@@ -155,6 +164,7 @@ def test_max_samples_attribute():
 
 
 @pytest.mark.filterwarnings('ignore:default contamination')
+@pytest.mark.filterwarnings('ignore:threshold_ attribute')
 def test_iforest_parallel_regression():
     """Check parallel regression."""
     rng = check_random_state(0)
@@ -204,13 +214,15 @@ def test_iforest_performance():
     assert_greater(roc_auc_score(y_test, y_pred), 0.98)
 
 
+@pytest.mark.filterwarnings('ignore:threshold_ attribute')
 def test_iforest_works():
     # toy sample (the last two samples are outliers)
     X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]
 
     # Test IsolationForest
     for contamination in [0.25, "auto"]:
-        clf = IsolationForest(random_state=rng, contamination=contamination)
+        clf = IsolationForest(behaviour='new', random_state=rng,
+                              contamination=contamination)
         clf.fit(X)
         decision_func = - clf.decision_function(X)
         pred = clf.predict(X)
@@ -228,6 +240,7 @@ def test_max_samples_consistency():
 
 
 @pytest.mark.filterwarnings('ignore:default contamination')
+@pytest.mark.filterwarnings('ignore:threshold_ attribute')
 def test_iforest_subsampled_features():
     # It tests non-regression for #5732 which failed at predict.
     rng = check_random_state(0)
@@ -274,8 +287,21 @@ def test_deprecation():
                          'in version 0.22 to "auto"',
                          clf.fit, X)
 
-    clf = IsolationForest(contamination='auto').fit(X)
+    assert_warns_message(FutureWarning,
+                         'Default "behaviour" parameter will change to "new" '
+                         'in version 0.22',
+                         IsolationForest, )
+
+    clf = IsolationForest().fit(X)
     assert_warns_message(DeprecationWarning,
                          "threshold_ attribute is deprecated in 0.20 and will"
                          " be removed in 0.22.",
                          getattr, clf, "threshold_")
+
+
+def test_behaviour_param():
+    X_train = [[1, 1], [1, 2], [2, 1]]
+    clf1 = IsolationForest(behaviour='old').fit(X_train)
+    clf2 = IsolationForest(behaviour='new', contamination='auto').fit(X_train)
+    assert_array_equal(clf1.decision_function([[2., 2.]]),
+                       clf2.decision_function([[2., 2.]]))
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
@@ -366,6 +366,13 @@ def set_checking_parameters(estimator):
     if estimator.__class__.__name__ == "TheilSenRegressor":
         estimator.max_subpopulation = 100
 
+    if estimator.__class__.__name__ == "IsolationForest":
+        # XXX to be removed in 0.22.
+        # this is used because the old IsolationForest does not
+        # respect the outlier detection API and thus and does not
+        # pass the outlier detection common tests.
+        estimator.set_params(behaviour='new')
+
     if isinstance(estimator, BaseRandomProjection):
         # Due to the jl lemma and often very few samples, the number
         # of components of the random matrix projection will be probably