Skip to content

[MRG+2] FIX: make the deprecation in fit and not init in IsolationForest #11574

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 17, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 15 additions & 11 deletions sklearn/ensemble/iforest.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ class IsolationForest(BaseBagging, OutlierMixin):
on the decision function. If 'auto', the decision function threshold is
determined as in the original paper.

.. versionchanged:: 0.20
The default value of ``contamination`` will change from 0.1 in 0.20
to ``'auto'`` in 0.22.

max_features : int or float, optional (default=1.0)
The number of features to draw from X to train each base estimator.

Expand Down Expand Up @@ -150,12 +154,6 @@ def __init__(self,
n_jobs=n_jobs,
random_state=random_state,
verbose=verbose)

if contamination == "legacy":
warnings.warn('default contamination parameter 0.1 will change '
'in version 0.22 to "auto". This will change the '
'predict method behavior.',
DeprecationWarning)
self.contamination = contamination

def _set_oob_score(self, X, y):
Expand All @@ -178,6 +176,15 @@ def fit(self, X, y=None, sample_weight=None):
-------
self : object
"""
if self.contamination == "legacy":
warnings.warn('default contamination parameter 0.1 will change '
'in version 0.22 to "auto". This will change the '
'predict method behavior.',
FutureWarning)
self._contamination = 0.1
else:
self._contamination = self.contamination

X = check_array(X, accept_sparse=['csc'])
if issparse(X):
# Pre-sort indices to avoid that each individual tree of the
Expand Down Expand Up @@ -219,19 +226,16 @@ def fit(self, X, y=None, sample_weight=None):
max_depth=max_depth,
sample_weight=sample_weight)

if self.contamination == "auto":
if self._contamination == "auto":
# 0.5 plays a special role as described in the original paper.
# we take the opposite as we consider the opposite of their score.
self.offset_ = -0.5
# need to save (deprecated) threshold_ in this case:
self._threshold_ = sp.stats.scoreatpercentile(
self.score_samples(X), 100. * 0.1)
elif self.contamination == "legacy": # to be rm in 0.22
self.offset_ = sp.stats.scoreatpercentile(
self.score_samples(X), 100. * 0.1)
else:
self.offset_ = sp.stats.scoreatpercentile(
self.score_samples(X), 100. * self.contamination)
self.score_samples(X), 100. * self._contamination)

return self

Expand Down
20 changes: 16 additions & 4 deletions sklearn/ensemble/tests/test_iforest.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def test_iforest():
**params).fit(X_train).predict(X_test)


@pytest.mark.filterwarnings('ignore:default contamination')
def test_iforest_sparse():
"""Check IForest for various parameter settings on sparse input."""
rng = check_random_state(0)
Expand Down Expand Up @@ -89,6 +90,7 @@ def test_iforest_sparse():
assert_array_equal(sparse_results, dense_results)


@pytest.mark.filterwarnings('ignore:default contamination')
def test_iforest_error():
"""Test that it gives proper exception on deficient input."""
X = iris.data
Expand Down Expand Up @@ -127,6 +129,7 @@ def test_iforest_error():
assert_raises(ValueError, IsolationForest().fit(X).predict, X[:, 1:])


@pytest.mark.filterwarnings('ignore:default contamination')
def test_recalculate_max_depth():
"""Check max_depth recalculation when max_samples is reset to n_samples"""
X = iris.data
Expand All @@ -135,6 +138,7 @@ def test_recalculate_max_depth():
assert_equal(est.max_depth, int(np.ceil(np.log2(X.shape[0]))))


@pytest.mark.filterwarnings('ignore:default contamination')
def test_max_samples_attribute():
X = iris.data
clf = IsolationForest().fit(X)
Expand All @@ -150,6 +154,7 @@ def test_max_samples_attribute():
assert_equal(clf.max_samples_, 0.4*X.shape[0])


@pytest.mark.filterwarnings('ignore:default contamination')
def test_iforest_parallel_regression():
"""Check parallel regression."""
rng = check_random_state(0)
Expand All @@ -174,6 +179,7 @@ def test_iforest_parallel_regression():
assert_array_almost_equal(y1, y3)


@pytest.mark.filterwarnings('ignore:default contamination')
def test_iforest_performance():
"""Test Isolation Forest performs well"""

Expand Down Expand Up @@ -213,13 +219,15 @@ def test_iforest_works():
assert_array_equal(pred, 6 * [1] + 2 * [-1])


@pytest.mark.filterwarnings('ignore:default contamination')
def test_max_samples_consistency():
# Make sure validated max_samples in iforest and BaseBagging are identical
X = iris.data
clf = IsolationForest().fit(X)
assert_equal(clf.max_samples_, clf._max_samples)


@pytest.mark.filterwarnings('ignore:default contamination')
def test_iforest_subsampled_features():
# It tests non-regression for #5732 which failed at predict.
rng = check_random_state(0)
Expand All @@ -244,6 +252,7 @@ def test_iforest_average_path_length():
[1., result_one, result_two], decimal=10)


@pytest.mark.filterwarnings('ignore:default contamination')
def test_score_samples():
X_train = [[1, 1], [1, 2], [2, 1]]
clf1 = IsolationForest(contamination=0.1).fit(X_train)
Expand All @@ -257,12 +266,15 @@ def test_score_samples():


def test_deprecation():
assert_warns_message(DeprecationWarning,
X = [[0.0], [1.0]]
clf = IsolationForest()

assert_warns_message(FutureWarning,
'default contamination parameter 0.1 will change '
'in version 0.22 to "auto"',
IsolationForest, )
X = [[0.0], [1.0]]
clf = IsolationForest().fit(X)
clf.fit, X)

clf = IsolationForest(contamination='auto').fit(X)
assert_warns_message(DeprecationWarning,
"threshold_ attribute is deprecated in 0.20 and will"
" be removed in 0.22.",
Expand Down
10 changes: 5 additions & 5 deletions sklearn/linear_model/tests/test_sag.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from sklearn.utils.extmath import row_norms
from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_allclose
from sklearn.utils.testing import assert_greater
from sklearn.utils.testing import assert_raise_message
from sklearn.utils.testing import ignore_warnings
Expand Down Expand Up @@ -269,7 +270,6 @@ def test_classifier_matching():
assert_array_almost_equal(intercept2, clf.intercept_, decimal=9)


@ignore_warnings
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I really do not get this weird side-effect-at-a-distance kind of thing... it feels like something is really fishy here. I'll try to reproduce locally.

def test_regressor_matching():
n_samples = 10
n_features = 5
Expand All @@ -295,10 +295,10 @@ def test_regressor_matching():
dloss=squared_dloss,
fit_intercept=fit_intercept)

assert_array_almost_equal(weights1, clf.coef_, decimal=10)
assert_array_almost_equal(intercept1, clf.intercept_, decimal=10)
assert_array_almost_equal(weights2, clf.coef_, decimal=10)
assert_array_almost_equal(intercept2, clf.intercept_, decimal=10)
assert_allclose(weights1, clf.coef_)
assert_allclose(intercept1, clf.intercept_)
assert_allclose(weights2, clf.coef_)
assert_allclose(intercept2, clf.intercept_)


@ignore_warnings
Expand Down