Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
16729ac
Fixed depth formula in iforest
Mar 12, 2017
86ab126
Added non-regression test for issue #8549
Mar 12, 2017
6de409d
reverted some whitespace changes
Mar 12, 2017
29fa2c0
Made changes to what's new and whitespace changes
Mar 12, 2017
0832c73
Update whats_new.rst
Mar 12, 2017
e5e40b3
Update whats_new.rst
Mar 12, 2017
5df8e14
fixed faulty whitespace
Mar 13, 2017
b42d763
Merge branch 'fix-8549' of https://github.com/PTRWang/scikit-learn in…
Mar 13, 2017
df5acc4
faulty whitespace fix and change to whats new
Mar 13, 2017
06225a9
added constants to iforest average_path_length and the according non …
Mar 13, 2017
aaaea54
COSMIT
jnothman Mar 13, 2017
c9256be
Update whats_new.rst
Mar 13, 2017
fd62429
Corrected IsolationForest average path formula and added integer arra…
Mar 13, 2017
c141c20
Merge branch 'fix-8549' of https://github.com/PTRWang/scikit-learn in…
Mar 13, 2017
7e3a0c5
changed line to under 80 char
Mar 13, 2017
96d0489
Update whats_new.rst
Mar 13, 2017
55f4001
Update whats_new.rst
Mar 13, 2017
3b4c511
reran tests
Mar 13, 2017
b85c25b
redefine np.euler_gamma
Mar 13, 2017
477f50e
added import statement for euler_gammma in iforest and test_iforest
Mar 13, 2017
9a37bad
changed np.euler_gamma to euler_gamma
Mar 13, 2017
41a4a32
fix small formatting issue
Mar 13, 2017
9cbae33
fix small formatting issue
Mar 13, 2017
c9bba59
modified average_path_length tests
Mar 13, 2017
a36870f
formatting fix + removed redundant tests
Mar 14, 2017
6d887f4
fix import error
Mar 14, 2017
e7f98a8
retry remote server error
Mar 14, 2017
68b40a7
retry remote server error
Mar 14, 2017
d3dc543
retry remote server error
Mar 14, 2017
2e040dc
re-added some iforest tests
Mar 14, 2017
d2084b4
re-added some iforest tests
Mar 14, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 13 additions & 9 deletions doc/whats_new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ parameters, may produce different models from the previous version. This often
occurs due to changes in the modelling logic (bug fixes or enhancements), or in
random sampling procedures.

* *to be listed*
* :class:`sklearn.ensemble.IsolationForest` (bug fix)

Details are listed in the changelog below.

Expand Down Expand Up @@ -156,7 +156,11 @@ Enhancements

Bug fixes
.........
- Fixed a bug where :class:`sklearn.cluster.DBSCAN` gives incorrect
- Fixed a bug where :class:`sklearn.ensemble.IsolationForest` uses an
incorrect formula for the average path length
:issue:`8549` by `Peter Wang <https://github.com/PTRWang>`_.

- Fixed a bug where :class:`sklearn.cluster.DBSCAN` gives incorrect
result when input is a precomputed sparse matrix with initial
rows all zero.
:issue:`8306` by :user:`Akshay Gupta <Akshay0724>`
Expand All @@ -167,7 +171,7 @@ Bug fixes

- Fixed a bug where :func:`sklearn.model_selection.BaseSearchCV.inverse_transform`
returns self.best_estimator_.transform() instead of self.best_estimator_.inverse_transform()
:issue:`8344` by :user:`Akshay Gupta <Akshay0724>`
:issue:`8344` by :user:`Akshay Gupta <Akshay0724>`

- Fixed a bug where :class:`sklearn.linear_model.RandomizedLasso` and
:class:`sklearn.linear_model.RandomizedLogisticRegression` break for
Expand Down Expand Up @@ -274,13 +278,13 @@ API changes summary
selection classes to be used with tools such as
:func:`sklearn.model_selection.cross_val_predict`.
:issue:`2879` by :user:`Stephen Hoover <stephen-hoover>`.
- Estimators with both methods ``decision_function`` and ``predict_proba``
are now required to have a monotonic relation between them. The
method ``check_decision_proba_consistency`` has been added in
**sklearn.utils.estimator_checks** to check their consistency.

- Estimators with both methods ``decision_function`` and ``predict_proba``
are now required to have a monotonic relation between them. The
method ``check_decision_proba_consistency`` has been added in
**sklearn.utils.estimator_checks** to check their consistency.
:issue:`7578` by :user:`Shubham Bhardwaj <shubham0704>`


.. _changes_0_18_1:

Expand Down
5 changes: 3 additions & 2 deletions sklearn/ensemble/iforest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import numpy as np
import scipy as sp
from warnings import warn
from sklearn.utils.fixes import euler_gamma

from scipy.sparse import issparse

Expand Down Expand Up @@ -300,7 +301,7 @@ def _average_path_length(n_samples_leaf):
if n_samples_leaf <= 1:
return 1.
else:
return 2. * (np.log(n_samples_leaf) + 0.5772156649) - 2. * (
return 2. * (np.log(n_samples_leaf - 1.) + euler_gamma) - 2. * (
n_samples_leaf - 1.) / n_samples_leaf

else:
Expand All @@ -314,7 +315,7 @@ def _average_path_length(n_samples_leaf):

average_path_length[mask] = 1.
average_path_length[not_mask] = 2. * (
np.log(n_samples_leaf[not_mask]) + 0.5772156649) - 2. * (
np.log(n_samples_leaf[not_mask] - 1.) + euler_gamma) - 2. * (
n_samples_leaf[not_mask] - 1.) / n_samples_leaf[not_mask]

return average_path_length.reshape(n_samples_leaf_shape)
16 changes: 16 additions & 0 deletions sklearn/ensemble/tests/test_iforest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

import numpy as np

from sklearn.utils.fixes import euler_gamma
from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_raises
Expand All @@ -19,6 +21,7 @@

from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import IsolationForest
from sklearn.ensemble.iforest import _average_path_length
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston, load_iris
from sklearn.utils import check_random_state
Expand Down Expand Up @@ -211,3 +214,16 @@ def test_iforest_subsampled_features():
clf = IsolationForest(max_features=0.8)
clf.fit(X_train, y_train)
clf.predict(X_test)


def test_iforest_average_path_length():
    # Non-regression test for #8549: the average path length used to be
    # computed with the wrong formula. Only integer inputs are checked here.
    expected_for_5 = 2. * (np.log(4.) + euler_gamma) - 2. * 4. / 5.
    expected_for_999 = 2. * (np.log(998.) + euler_gamma) - 2. * 998. / 999.
    scalar_cases = ((1, 1.), (5, expected_for_5), (999, expected_for_999))

    # Scalar inputs, one sample count at a time.
    for n_samples_leaf, expected in scalar_cases:
        assert_almost_equal(_average_path_length(n_samples_leaf), expected,
                            decimal=10)

    # The same sample counts passed together as an integer array.
    assert_array_almost_equal(_average_path_length(np.array([1, 5, 999])),
                              [1., expected_for_5, expected_for_999],
                              decimal=10)
2 changes: 2 additions & 0 deletions sklearn/utils/fixes.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ def _parse_version(version_string):
version.append(x)
return tuple(version)

# Euler-Mascheroni constant. Use NumPy's own value when the installed NumPy
# exposes ``np.euler_gamma``; otherwise fall back to a hard-coded literal.
try:
    euler_gamma = np.euler_gamma
except AttributeError:
    euler_gamma = 0.577215664901532860606512090082402431

np_version = _parse_version(np.__version__)
sp_version = _parse_version(scipy.__version__)
Expand Down