@@ -89,6 +89,15 @@ class IsolationForest(BaseBagging, OutlierMixin):
         The number of jobs to run in parallel for both `fit` and `predict`.
         If -1, then the number of jobs is set to the number of cores.

+    behaviour : str, optional (default='old')
+        Accepted values are 'old' or 'new'. Behaviour of the decision_function.
+        Default "behaviour" parameter will change to "new" in version 0.22.
+        Passing behaviour="new" makes the decision_function change to match
+        other anomaly detection algorithms' API, as explained in detail in the
+        offset_ attribute documentation. Basically, the decision_function
+        becomes dependent on the contamination parameter, in such a way that
+        0 becomes its natural threshold to detect outliers.
+
     random_state : int, RandomState instance or None, optional (default=None)
         If int, random_state is the seed used by the random number generator;
         If RandomState instance, random_state is the random number generator;
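A minimal usage sketch of the behaviour documented above, assuming the scikit-learn version this patch targets (around 0.20) and the public `IsolationForest` API: with `behaviour='new'`, the labels returned by `predict` correspond to thresholding `decision_function` at 0.

```python
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)
X = rng.randn(100, 2)

# behaviour='new': decision_function is shifted so that 0 separates
# inliers (positive values) from outliers (negative values).
clf = IsolationForest(behaviour='new', contamination=0.1, random_state=rng)
clf.fit(X)

scores = clf.decision_function(X)
labels = clf.predict(X)            # +1 for inliers, -1 for outliers
assert np.array_equal(labels == -1, scores < 0)
```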
@@ -114,12 +123,16 @@ class IsolationForest(BaseBagging, OutlierMixin):
     offset_ : float
         Offset used to define the decision function from the raw scores.
         We have the relation: ``decision_function = score_samples - offset_``.
+        Assuming behaviour == 'new', offset_ is defined as follows.
         When the contamination parameter is set to "auto", the offset is equal
         to -0.5 as the scores of inliers are close to 0 and the scores of
         outliers are close to -1. When a contamination parameter different
         than "auto" is provided, the offset is defined in such a way we obtain
         the expected number of outliers (samples with decision function < 0)
         in training.
+        Assuming the behaviour parameter is set to 'old', we always have
+        offset_ = -0.5, making the decision function independent from the
+        contamination parameter.

     References
     ----------
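A short sketch of the offset_ relation documented in the hunk above, under the same assumptions as the previous example: `decision_function` is just `score_samples` shifted by `offset_`, and a float `contamination` picks the offset so that roughly that fraction of the training samples falls below 0.

```python
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(0)
X = rng.randn(200, 2)

clf = IsolationForest(behaviour='new', contamination=0.1,
                      random_state=rng).fit(X)

# decision_function = score_samples - offset_
assert np.allclose(clf.decision_function(X),
                   clf.score_samples(X) - clf.offset_)

# About 10% of the training samples fall below the 0 threshold.
print(np.mean(clf.decision_function(X) < 0))
```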
@@ -138,6 +151,7 @@ def __init__(self,
                  max_features=1.,
                  bootstrap=False,
                  n_jobs=1,
+                 behaviour='old',
                  random_state=None,
                  verbose=0):
         super(IsolationForest, self).__init__(
@@ -154,8 +168,17 @@ def __init__(self,
             n_jobs=n_jobs,
             random_state=random_state,
             verbose=verbose)
+
+        self.behaviour = behaviour
         self.contamination = contamination

+        if behaviour == 'old':
+            warnings.warn('Default "behaviour" parameter will change to "new" '
+                          'in version 0.22. Passing behaviour="new" makes '
+                          'IsolationForest decision_function change to match '
+                          'other anomaly detection algorithm API.',
+                          FutureWarning)
+
     def _set_oob_score(self, X, y):
         raise NotImplementedError("OOB score not supported by iforest")

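A hedged illustration of the deprecation path added to `__init__` above: leaving `behaviour` at its 'old' default raises a FutureWarning at construction time, while passing `behaviour='new'` explicitly should not (this assumes no other construction-time warnings exist in the targeted version).

```python
import warnings
from sklearn.ensemble import IsolationForest

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    IsolationForest()                  # default behaviour='old'
assert any(issubclass(w.category, FutureWarning) for w in caught)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    IsolationForest(behaviour='new')   # opt in to the future behaviour
assert not any(issubclass(w.category, FutureWarning) for w in caught)
```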
@@ -226,16 +249,29 @@ def fit(self, X, y=None, sample_weight=None):
                                           max_depth=max_depth,
                                           sample_weight=sample_weight)

+        if self.behaviour == 'old':
+            # in this case, decision_function = 0.5 + self.score_samples(X):
+            if self._contamination == "auto":
+                raise ValueError("contamination parameter cannot be set to "
+                                 "'auto' when behaviour == 'old'.")
+
+            self.offset_ = -0.5
+            self._threshold_ = sp.stats.scoreatpercentile(
+                self.decision_function(X), 100. * self._contamination)
+
+            return self
+
+        # else, self.behaviour == 'new':
         if self._contamination == "auto":
             # 0.5 plays a special role as described in the original paper.
             # we take the opposite as we consider the opposite of their score.
             self.offset_ = -0.5
-            # need to save (depreciated) threshold_ in this case:
-            self._threshold_ = sp.stats.scoreatpercentile(
-                self.score_samples(X), 100. * 0.1)
-        else:
-            self.offset_ = sp.stats.scoreatpercentile(
-                self.score_samples(X), 100. * self._contamination)
+            return self
+
+        # else, define offset_ wrt contamination parameter, so that the
+        # threshold_ attribute is implicitly 0 and is not needed anymore:
+        self.offset_ = sp.stats.scoreatpercentile(
            self.score_samples(X), 100. * self._contamination)

         return self

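A standalone sketch of the percentile logic used in the 'new' branch of `fit` above. Here `raw_scores` is a stand-in for `self.score_samples(X)` (not part of this patch): the offset is the contamination-quantile of the raw scores, so subtracting it leaves roughly that fraction of training samples with a negative decision function.

```python
import numpy as np
from scipy import stats

rng = np.random.RandomState(0)
raw_scores = -rng.rand(1000)   # stand-in for self.score_samples(X)
contamination = 0.05

# Same call as in fit: the offset is the `contamination` percentile
# of the raw scores.
offset = stats.scoreatpercentile(raw_scores, 100. * contamination)
decision = raw_scores - offset

print(np.mean(decision < 0))   # close to 0.05 by construction
```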
@@ -258,7 +294,8 @@ def predict(self, X):
         check_is_fitted(self, ["offset_"])
         X = check_array(X, accept_sparse='csr')
         is_inlier = np.ones(X.shape[0], dtype=int)
-        is_inlier[self.decision_function(X) < 0] = -1
+        threshold = self.threshold_ if self.behaviour == 'old' else 0
+        is_inlier[self.decision_function(X) < threshold] = -1
         return is_inlier

     def decision_function(self, X):
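The thresholding in `predict` above amounts to the small helper sketched below (`label_from_scores` is a hypothetical name, not part of the patch): start from all-inlier labels and flip to -1 wherever the decision function falls below the threshold, which is 0 for `behaviour='new'` and the deprecated `threshold_` for `behaviour='old'`.

```python
import numpy as np

def label_from_scores(decision_scores, threshold=0):
    """Map decision-function values to +1 (inlier) / -1 (outlier) labels."""
    is_inlier = np.ones(decision_scores.shape[0], dtype=int)
    is_inlier[decision_scores < threshold] = -1
    return is_inlier

print(label_from_scores(np.array([0.2, -0.1, 0.0, -0.3])))  # [ 1 -1  1 -1]
```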
@@ -359,11 +396,12 @@ def score_samples(self, X):

     @property
     def threshold_(self):
+        if self.behaviour != 'old':
+            raise AttributeError("threshold_ attribute does not exist when "
+                                 "behaviour != 'old'")
         warnings.warn("threshold_ attribute is deprecated in 0.20 and will"
                       " be removed in 0.22.", DeprecationWarning)
-        if self.contamination == 'auto':
-            return self._threshold_
-        return self.offset_
+        return self._threshold_


 def _average_path_length(n_samples_leaf):
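A hedged illustration of the `threshold_` property above (error and warning messages taken from the diff, scikit-learn version assumed as before): with `behaviour='new'` the attribute no longer exists and raises AttributeError, while with `behaviour='old'` it is still available but emits the DeprecationWarning.

```python
import warnings
import numpy as np
from sklearn.ensemble import IsolationForest

X = np.random.RandomState(0).randn(100, 2)

new = IsolationForest(behaviour='new', contamination=0.1).fit(X)
try:
    new.threshold_
except AttributeError as exc:
    print(exc)   # threshold_ attribute does not exist when behaviour != 'old'

old = IsolationForest(behaviour='old', contamination=0.1).fit(X)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    _ = old.threshold_
assert issubclass(caught[0].category, DeprecationWarning)
```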