Fix ARDRegression accuracy issue with scipy 1.3.0 (scikit-learn#14067)

timstaley · jnothman · commit dd2fa685d0b6 · 2019-07-28T22:28:46.000+10:00
diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
@@ -49,16 +49,20 @@ Changelog
 :mod:`sklearn.impute`
 .....................
 
-- |Fix| Fixed a bug in :class:`SimpleImputer` and :class:`IterativeImputer`
-  so that no errors are thrown when there are missing values in training data.
-  :pr:`13974` by `Frank Hoang <fhoang7>`.
+- |Fix| Fixed a bug in :class:`impute.SimpleImputer` and
+  :class:`impute.IterativeImputer` so that no errors are thrown when there are
+  missing values in training data. :pr:`13974` by `Frank Hoang <fhoang7>`.
 
 :mod:`sklearn.linear_model`
 ...........................
 - |Fix| Fixed a bug in :class:`linear_model.LogisticRegressionCV` where
   ``refit=False`` would fail depending on the ``'multiclass'`` and
   ``'penalty'`` parameters (regression introduced in 0.21). :pr:`14087` by
   `Nicolas Hug`_.
+- |Fix| Compatibility fix for :class:`linear_model.ARDRegression` and
+  Scipy>=1.3.0. Adapts to upstream changes to the default `pinvh` cutoff
+  threshold which otherwise results in poor accuracy in some cases.
+  :pr:`14067` by :user:`Tim Staley <timstaley>`.
 
 :mod:`sklearn.tree`
 ...................
diff --git a/sklearn/externals/_scipy_linalg.py b/sklearn/externals/_scipy_linalg.py
@@ -0,0 +1,118 @@
+# This should remained pinned to version 1.2 and not updated like other
+# externals.
+"""Copyright (c) 2001-2002 Enthought, Inc.  2003-2019, SciPy Developers.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following
+   disclaimer in the documentation and/or other materials provided
+   with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""
+
+import numpy as np
+import scipy.linalg.decomp as decomp
+
+
+def pinvh(a, cond=None, rcond=None, lower=True, return_rank=False,
+          check_finite=True):
+    """
+    Compute the (Moore-Penrose) pseudo-inverse of a Hermitian matrix.
+
+    Copied in from scipy==1.2.2, in order to preserve the default choice of the
+    `cond` and `above_cutoff` values which determine which values of the matrix
+    inversion lie below threshold and are so set to zero. Changes in scipy 1.3
+    resulted in a smaller default threshold and thus slower convergence of
+    dependent algorithms in some cases (see Sklearn github issue #14055).
+
+    Calculate a generalized inverse of a Hermitian or real symmetric matrix
+    using its eigenvalue decomposition and including all eigenvalues with
+    'large' absolute value.
+
+    Parameters
+    ----------
+    a : (N, N) array_like
+        Real symmetric or complex hermetian matrix to be pseudo-inverted
+    cond, rcond : float or None
+        Cutoff for 'small' eigenvalues.
+        Singular values smaller than rcond * largest_eigenvalue are considered
+        zero.
+
+        If None or -1, suitable machine precision is used.
+    lower : bool, optional
+        Whether the pertinent array data is taken from the lower or upper
+        triangle of a. (Default: lower)
+    return_rank : bool, optional
+        if True, return the effective rank of the matrix
+    check_finite : bool, optional
+        Whether to check that the input matrix contains only finite numbers.
+        Disabling may give a performance gain, but may result in problems
+        (crashes, non-termination) if the inputs do contain infinities or NaNs.
+
+    Returns
+    -------
+    B : (N, N) ndarray
+        The pseudo-inverse of matrix `a`.
+    rank : int
+        The effective rank of the matrix.  Returned if return_rank == True
+
+    Raises
+    ------
+    LinAlgError
+        If eigenvalue does not converge
+
+    Examples
+    --------
+    >>> from scipy.linalg import pinvh
+    >>> a = np.random.randn(9, 6)
+    >>> a = np.dot(a, a.T)
+    >>> B = pinvh(a)
+    >>> np.allclose(a, np.dot(a, np.dot(B, a)))
+    True
+    >>> np.allclose(B, np.dot(B, np.dot(a, B)))
+    True
+
+    """
+    a = decomp._asarray_validated(a, check_finite=check_finite)
+    s, u = decomp.eigh(a, lower=lower, check_finite=False)
+
+    if rcond is not None:
+        cond = rcond
+    if cond in [None, -1]:
+        t = u.dtype.char.lower()
+        factor = {'f': 1E3, 'd': 1E6}
+        cond = factor[t] * np.finfo(t).eps
+
+    # For Hermitian matrices, singular values equal abs(eigenvalues)
+    above_cutoff = (abs(s) > cond * np.max(abs(s)))
+    psigma_diag = 1.0 / s[above_cutoff]
+    u = u[:, above_cutoff]
+
+    B = np.dot(u * psigma_diag, np.conjugate(u).T)
+
+    if return_rank:
+        return B, len(psigma_diag)
+    else:
+        return B
diff --git a/sklearn/linear_model/bayes.py b/sklearn/linear_model/bayes.py
@@ -8,12 +8,12 @@
 from math import log
 import numpy as np
 from scipy import linalg
-from scipy.linalg import pinvh
 
 from .base import LinearModel, _rescale_data
 from ..base import RegressorMixin
 from ..utils.extmath import fast_logdet
 from ..utils import check_X_y
+from ..utils.fixes import pinvh
 
 
 ###############################################################################
diff --git a/sklearn/linear_model/tests/test_bayes.py b/sklearn/linear_model/tests/test_bayes.py
@@ -188,6 +188,24 @@ def test_toy_ard_object():
     assert_array_almost_equal(clf.predict(test), [1, 3, 4], 2)
 
 
+def test_ard_accuracy_on_easy_problem():
+    # Check that ARD converges with reasonable accuracy on an easy problem
+    # (Github issue #14055)
+    # This particular seed seems to converge poorly in the failure-case
+    # (scipy==1.3.0, sklearn==0.21.2)
+    seed = 45
+    X = np.random.RandomState(seed=seed).normal(size=(250, 3))
+    y = X[:, 1]
+
+    regressor = ARDRegression()
+    regressor.fit(X, y)
+
+    abs_coef_error = np.abs(1 - regressor.coef_[1])
+    # Expect an accuracy of better than 1E-4 in most cases -
+    # Failure-case produces 0.16!
+    assert abs_coef_error < 0.01
+
+
 def test_return_std():
     # Test return_std option for both Bayesian regressors
     def f(X):
diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py
@@ -39,6 +39,13 @@ def _parse_version(version_string):
     from scipy.misc import comb, logsumexp  # noqa
 
 
+if sp_version >= (1, 3):
+    # Preserves earlier default choice of pinvh cutoff `cond` value.
+    # Can be removed once issue #14055 is fully addressed.
+    from ..externals._scipy_linalg import pinvh
+else:
+    from scipy.linalg import pinvh # noqa
+
 if sp_version >= (0, 19):
     def _argmax(arr_or_spmatrix, axis=None):
         return arr_or_spmatrix.argmax(axis=axis)