scikit-learn · raghavrv · Mar 14, 2017 · Mar 12, 2017 · Mar 12, 2017 · Mar 12, 2017
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -18,7 +18,7 @@ parameters, may produce different models from the previous version. This often
 occurs due to changes in the modelling logic (bug fixes or enhancements), or in
 random sampling procedures.
 
-* *to be listed*
+   * :class:`sklearn.ensemble.IsolationForest` (bug fix)
 
 Details are listed in the changelog below.
 
@@ -156,7 +156,11 @@ Enhancements
 
 Bug fixes
 .........
-   - Fixed a bug where :class:`sklearn.cluster.DBSCAN` gives incorrect 
+   - Fixed a bug where :class:`sklearn.ensemble.IsolationForest` uses an
+     incorrect formula for the average path length
+     :issue:`8549` by `Peter Wang <https://github.com/PTRWang>`_.
+
+   - Fixed a bug where :class:`sklearn.cluster.DBSCAN` gives incorrect
      result when input is a precomputed sparse matrix with initial
      rows all zero.
      :issue:`8306` by :user:`Akshay Gupta <Akshay0724>`
@@ -167,7 +171,7 @@ Bug fixes
 
    - Fixed a bug where :func:`sklearn.model_selection.BaseSearchCV.inverse_transform`
      returns self.best_estimator_.transform() instead of self.best_estimator_.inverse_transform()
-     :issue:`8344` by :user:`Akshay Gupta <Akshay0724>` 
+     :issue:`8344` by :user:`Akshay Gupta <Akshay0724>`
 
    - Fixed a bug where :class:`sklearn.linear_model.RandomizedLasso` and
      :class:`sklearn.linear_model.RandomizedLogisticRegression` breaks for
@@ -274,13 +278,13 @@ API changes summary
       selection classes to be used with tools such as
       :func:`sklearn.model_selection.cross_val_predict`.
       :issue:`2879` by :user:`Stephen Hoover <stephen-hoover>`.
-      
-   - Estimators with both methods ``decision_function`` and ``predict_proba`` 
-     are now required to have a monotonic relation between them. The 
-     method ``check_decision_proba_consistency`` has been added in 
-     **sklearn.utils.estimator_checks** to check their consistency. 
+
+   - Estimators with both methods ``decision_function`` and ``predict_proba``
+     are now required to have a monotonic relation between them. The
+     method ``check_decision_proba_consistency`` has been added in
+     **sklearn.utils.estimator_checks** to check their consistency.
      :issue:`7578` by :user:`Shubham Bhardwaj <shubham0704>`
-      
+
 
 .. _changes_0_18_1:
 

diff --git a/sklearn/ensemble/iforest.py b/sklearn/ensemble/iforest.py
@@ -7,6 +7,7 @@
 import numpy as np
 import scipy as sp
 from warnings import warn
+from sklearn.utils.fixes import euler_gamma
 
 from scipy.sparse import issparse
 
@@ -300,7 +301,7 @@ def _average_path_length(n_samples_leaf):
         if n_samples_leaf <= 1:
             return 1.
         else:
-            return 2. * (np.log(n_samples_leaf) + 0.5772156649) - 2. * (
+            return 2. * (np.log(n_samples_leaf - 1.) + euler_gamma) - 2. * (
                 n_samples_leaf - 1.) / n_samples_leaf
 
     else:
@@ -314,7 +315,7 @@ def _average_path_length(n_samples_leaf):
 
         average_path_length[mask] = 1.
         average_path_length[not_mask] = 2. * (
-            np.log(n_samples_leaf[not_mask]) + 0.5772156649) - 2. * (
+            np.log(n_samples_leaf[not_mask] - 1.) + euler_gamma) - 2. * (
                 n_samples_leaf[not_mask] - 1.) / n_samples_leaf[not_mask]
 
         return average_path_length.reshape(n_samples_leaf_shape)
diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py
@@ -8,6 +8,8 @@
 
 import numpy as np
 
+from sklearn.utils.fixes import euler_gamma
+from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_array_almost_equal
 from sklearn.utils.testing import assert_raises
@@ -19,6 +21,7 @@
 
 from sklearn.model_selection import ParameterGrid
 from sklearn.ensemble import IsolationForest
+from sklearn.ensemble.iforest import _average_path_length
 from sklearn.model_selection import train_test_split
 from sklearn.datasets import load_boston, load_iris
 from sklearn.utils import check_random_state
@@ -211,3 +214,16 @@ def test_iforest_subsampled_features():
     clf = IsolationForest(max_features=0.8)
     clf.fit(X_train, y_train)
     clf.predict(X_test)
+
+
+def test_iforest_average_path_length():
+    # Non-regression test for #8549, which used the wrong formula
+    # for the average path length, strictly for the integer case
+
+    result_one = 2. * (np.log(4.) + euler_gamma) - 2. * 4. / 5.
+    result_two = 2. * (np.log(998.) + euler_gamma) - 2. * 998. / 999.
+    assert_almost_equal(_average_path_length(1), 1., decimal=10)
+    assert_almost_equal(_average_path_length(5), result_one, decimal=10)
+    assert_almost_equal(_average_path_length(999), result_two, decimal=10)
+    assert_array_almost_equal(_average_path_length(np.array([1, 5, 999])),
+                              [1., result_one, result_two], decimal=10)
diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py
@@ -36,6 +36,8 @@ def _parse_version(version_string):
             version.append(x)
     return tuple(version)
 
+euler_gamma = getattr(np, 'euler_gamma',
+                      0.577215664901532860606512090082402431)
 
 np_version = _parse_version(np.__version__)
 sp_version = _parse_version(scipy.__version__)