MNT initialize weights when using ARPACK solver with a utility (scikit-learn#18302)

gauravkdesai · Ivan PANICO · summer-bebop · web-flow · commit da562b4fa58b · 2020-11-27T18:25:35.000+01:00
Co-authored-by: Ivan PANICO &lt;ivpanico@gmail.com&gt;
Co-authored-by: Ivan Panico &lt;iv.panico@gmail.com&gt;
Co-authored-by: Thomas J. Fan &lt;thomasjpfan@gmail.com&gt;
Co-authored-by: Guillaume Lemaitre &lt;g.lemaitre58@gmail.com&gt;
diff --git a/benchmarks/bench_plot_randomized_svd.py b/benchmarks/bench_plot_randomized_svd.py
@@ -74,6 +74,7 @@
 from collections import defaultdict
 import os.path
 
+from sklearn.utils._arpack import _init_arpack_v0
 from sklearn.utils import gen_batches
 from sklearn.utils.validation import check_random_state
 from sklearn.utils.extmath import randomized_svd
@@ -256,7 +257,7 @@ def svd_timing(X, n_comps, n_iter, n_oversamples,
     return U, mu, V, call_time
 
 
-def norm_diff(A, norm=2, msg=True):
+def norm_diff(A, norm=2, msg=True, random_state=None):
     """
     Compute the norm diff with the original matrix, when randomized
     SVD is called with *params.
@@ -268,7 +269,11 @@ def norm_diff(A, norm=2, msg=True):
         print("... computing %s norm ..." % norm)
     if norm == 2:
         # s = sp.linalg.norm(A, ord=2)  # slow
-        value = sp.sparse.linalg.svds(A, k=1, return_singular_vectors=False)
+        v0 = _init_arpack_v0(min(A.shape), random_state)
+        value = sp.sparse.linalg.svds(A,
+                                      k=1,
+                                      return_singular_vectors=False,
+                                      v0=v0)
     else:
         if sp.sparse.issparse(A):
             value = sp.sparse.linalg.norm(A, ord=norm)
@@ -298,7 +303,7 @@ def bench_a(X, dataset_name, power_iter, n_oversamples, n_comps):
     all_time = defaultdict(list)
     if enable_spectral_norm:
         all_spectral = defaultdict(list)
-        X_spectral_norm = norm_diff(X, norm=2, msg=False)
+        X_spectral_norm = norm_diff(X, norm=2, msg=False, random_state=0)
     all_frobenius = defaultdict(list)
     X_fro_norm = norm_diff(X, norm='fro', msg=False)
 
@@ -312,8 +317,9 @@ def bench_a(X, dataset_name, power_iter, n_oversamples, n_comps):
             all_time[label].append(time)
             if enable_spectral_norm:
                 A = U.dot(np.diag(s).dot(V))
-                all_spectral[label].append(norm_diff(X - A, norm=2) /
-                                           X_spectral_norm)
+                all_spectral[label].append(
+                    norm_diff(X - A, norm=2, random_state=0) / X_spectral_norm
+                )
             f = scalable_frobenius_norm_discrepancy(X, U, s, V)
             all_frobenius[label].append(f / X_fro_norm)
 
@@ -327,8 +333,9 @@ def bench_a(X, dataset_name, power_iter, n_oversamples, n_comps):
             all_time[label].append(time)
             if enable_spectral_norm:
                 A = U.dot(np.diag(s).dot(V))
-                all_spectral[label].append(norm_diff(X - A, norm=2) /
-                                           X_spectral_norm)
+                all_spectral[label].append(
+                    norm_diff(X - A, norm=2, random_state=0) / X_spectral_norm
+                )
             f = scalable_frobenius_norm_discrepancy(X, U, s, V)
             all_frobenius[label].append(f / X_fro_norm)
 
@@ -353,7 +360,7 @@ def bench_b(power_list):
     for rank in ranks:
         X = make_low_rank_matrix(effective_rank=rank, **data_params)
         if enable_spectral_norm:
-            X_spectral_norm = norm_diff(X, norm=2, msg=False)
+            X_spectral_norm = norm_diff(X, norm=2, msg=False, random_state=0)
         X_fro_norm = norm_diff(X, norm='fro', msg=False)
 
         for n_comp in [int(rank/2), rank, rank*2]:
@@ -364,8 +371,10 @@ def bench_b(power_list):
                                         power_iteration_normalizer='LU')
                 if enable_spectral_norm:
                     A = U.dot(np.diag(s).dot(V))
-                    all_spectral[label].append(norm_diff(X - A, norm=2) /
-                                               X_spectral_norm)
+                    all_spectral[label].append(
+                        norm_diff(X - A, norm=2, random_state=0) /
+                        X_spectral_norm
+                    )
                 f = scalable_frobenius_norm_discrepancy(X, U, s, V)
                 all_frobenius[label].append(f / X_fro_norm)
 
@@ -388,7 +397,7 @@ def bench_c(datasets, n_comps):
             continue
 
         if enable_spectral_norm:
-            X_spectral_norm = norm_diff(X, norm=2, msg=False)
+            X_spectral_norm = norm_diff(X, norm=2, msg=False, random_state=0)
         X_fro_norm = norm_diff(X, norm='fro', msg=False)
         n_comps = np.minimum(n_comps, np.min(X.shape))
 
@@ -401,8 +410,9 @@ def bench_c(datasets, n_comps):
         all_time[label].append(time)
         if enable_spectral_norm:
             A = U.dot(np.diag(s).dot(V))
-            all_spectral[label].append(norm_diff(X - A, norm=2) /
-                                       X_spectral_norm)
+            all_spectral[label].append(
+                norm_diff(X - A, norm=2, random_state=0) / X_spectral_norm
+            )
         f = scalable_frobenius_norm_discrepancy(X, U, s, V)
         all_frobenius[label].append(f / X_fro_norm)
 
@@ -415,8 +425,9 @@ def bench_c(datasets, n_comps):
             all_time[label].append(time)
             if enable_spectral_norm:
                 A = U.dot(np.diag(s).dot(V))
-                all_spectral[label].append(norm_diff(X - A, norm=2) /
-                                           X_spectral_norm)
+                all_spectral[label].append(
+                    norm_diff(X - A, norm=2, random_state=0) / X_spectral_norm
+                )
             f = scalable_frobenius_norm_discrepancy(X, U, s, V)
             all_frobenius[label].append(f / X_fro_norm)
 
diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
@@ -26,7 +26,10 @@ random sampling procedures.
   between 32-bits and 64-bits data when the kernel has small positive
   eigenvalues.
 
-- |Fix| :class:`linear_model.Perceptron` when `penalty='elasticnet'`
+- |Fix| :class:`decomposition.TruncatedSVD` becomes deterministic when
+  `algorithm='arpack'` by seeding the solver with `random_state`.
+
+- |Fix| :class:`linear_model.Perceptron` when `penalty='elasticnet'`.
 
 Details are listed in the changelog below.
 
@@ -169,6 +172,12 @@ Changelog
   `y_std_` attributes were deprecated and will be removed in 0.26.
   :pr:`18768` by :user:`Maren Westermann <marenwestermann>`.
 
+- |Fix| :class:`decomposition.TruncatedSVD` becomes deterministic by using the
+  `random_state`. It controls the initialization of the starting vector of
+  the underlying ARPACK solver.
+  :pr:`18302` by :user:`Gaurav Desai <gauravkdesai>` and
+  :user:`Ivan Panico <FollowKenny>`.
+
 :mod:`sklearn.datasets`
 .......................
 
diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py
@@ -7,7 +7,7 @@
 from scipy import linalg
 from scipy.sparse.linalg import eigsh
 
-from ..utils import check_random_state
+from ..utils._arpack import _init_arpack_v0
 from ..utils.extmath import svd_flip
 from ..utils.validation import check_is_fitted, _check_psd_eigenvalues
 from ..utils.deprecation import deprecated
@@ -209,9 +209,7 @@ def _fit_transform(self, K):
             self.lambdas_, self.alphas_ = linalg.eigh(
                 K, eigvals=(K.shape[0] - n_components, K.shape[0] - 1))
         elif eigen_solver == 'arpack':
-            random_state = check_random_state(self.random_state)
-            # initialize with [-1,1] as in ARPACK
-            v0 = random_state.uniform(-1, 1, K.shape[0])
+            v0 = _init_arpack_v0(K.shape[0], self.random_state)
             self.lambdas_, self.alphas_ = eigsh(K, n_components,
                                                 which="LA",
                                                 tol=self.tol,
diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
@@ -21,6 +21,7 @@
 
 from ._base import _BasePCA
 from ..utils import check_random_state
+from ..utils._arpack import _init_arpack_v0
 from ..utils.extmath import fast_logdet, randomized_svd, svd_flip
 from ..utils.extmath import stable_cumsum
 from ..utils.validation import check_is_fitted
@@ -527,8 +528,7 @@ def _fit_truncated(self, X, n_components, svd_solver):
         X -= self.mean_
 
         if svd_solver == 'arpack':
-            # random init solution, as ARPACK does it internally
-            v0 = random_state.uniform(-1, 1, size=min(X.shape))
+            v0 = _init_arpack_v0(min(X.shape), random_state)
             U, S, Vt = svds(X, k=n_components, tol=self.tol, v0=v0)
             # svds doesn't abide by scipy.linalg.svd/randomized_svd
             # conventions, so reverse its outputs.
diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py
@@ -12,6 +12,7 @@
 
 from ..base import BaseEstimator, TransformerMixin
 from ..utils import check_array, check_random_state
+from ..utils._arpack import _init_arpack_v0
 from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip
 from ..utils.sparsefuncs import mean_variance_axis
 from ..utils.validation import _deprecate_positional_args
@@ -165,7 +166,8 @@ def fit_transform(self, X, y=None):
         random_state = check_random_state(self.random_state)
 
         if self.algorithm == "arpack":
-            U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol)
+            v0 = _init_arpack_v0(min(X.shape), random_state)
+            U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol, v0=v0)
             # svds doesn't abide by scipy.linalg.svd/randomized_svd
             # conventions, so reverse its outputs.
             Sigma = Sigma[::-1]
diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py
@@ -11,6 +11,7 @@
 
 from ..base import BaseEstimator, TransformerMixin, _UnstableArchMixin
 from ..utils import check_random_state, check_array
+from ..utils._arpack import _init_arpack_v0
 from ..utils.extmath import stable_cumsum
 from ..utils.validation import check_is_fitted
 from ..utils.validation import FLOAT_DTYPES
@@ -162,9 +163,7 @@ def null_space(M, k, k_skip=1, eigen_solver='arpack', tol=1E-6, max_iter=100,
             eigen_solver = 'dense'
 
     if eigen_solver == 'arpack':
-        random_state = check_random_state(random_state)
-        # initialize with [-1,1] as in ARPACK
-        v0 = random_state.uniform(-1, 1, M.shape[0])
+        v0 = _init_arpack_v0(M.shape[0], random_state)
         try:
             eigen_values, eigen_vectors = eigsh(M, k + k_skip, sigma=0.0,
                                                 tol=tol, maxiter=max_iter,
diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py
@@ -15,7 +15,12 @@
 from scipy.sparse.csgraph import laplacian as csgraph_laplacian
 
 from ..base import BaseEstimator
-from ..utils import check_random_state, check_array, check_symmetric
+from ..utils import (
+    check_array,
+    check_random_state,
+    check_symmetric,
+)
+from ..utils._arpack import _init_arpack_v0
 from ..utils.extmath import _deterministic_vector_sign_flip
 from ..utils.fixes import lobpcg
 from ..metrics.pairwise import rbf_kernel
@@ -270,7 +275,7 @@ def spectral_embedding(adjacency, *, n_components=8, eigen_solver=None,
             # We are computing the opposite of the laplacian inplace so as
             # to spare a memory allocation of a possibly very large array
             laplacian *= -1
-            v0 = random_state.uniform(-1, 1, laplacian.shape[0])
+            v0 = _init_arpack_v0(laplacian.shape[0], random_state)
             _, diffusion_map = eigsh(
                 laplacian, k=n_components, sigma=1.0, which='LM',
                 tol=eigen_tol, v0=v0)
diff --git a/sklearn/utils/_arpack.py b/sklearn/utils/_arpack.py
@@ -0,0 +1,30 @@
+from .validation import check_random_state
+
+
+def _init_arpack_v0(size, random_state):
+    """Initialize the starting vector for iteration in ARPACK functions.
+
+    Initialize an ndarray with values sampled from the uniform distribution
+    on [-1, 1]. This initialization model has been chosen to be consistent with
+    the ARPACK one as another initialization can lead to convergence issues.
+
+    Parameters
+    ----------
+    size : int
+        The length of the starting vector to be initialized.
+
+    random_state : int, RandomState instance or None
+        The seed of the pseudo random number generator used to generate a
+        uniform distribution. If int, random_state is the seed used by the
+        random number generator; If RandomState instance, random_state is the
+        random number generator; If None, the random number generator is the
+        RandomState instance used by `np.random`.
+
+    Returns
+    -------
+    v0 : ndarray of shape (size,)
+        The initialized vector.
+    """
+    random_state = check_random_state(random_state)
+    v0 = random_state.uniform(-1, 1, size)
+    return v0
diff --git a/sklearn/utils/tests/test_arpack.py b/sklearn/utils/tests/test_arpack.py
@@ -0,0 +1,16 @@
+import pytest
+from numpy.testing import assert_allclose
+
+from sklearn.utils import check_random_state
+from sklearn.utils._arpack import _init_arpack_v0
+
+
+@pytest.mark.parametrize("seed", range(100))
+def test_init_arpack_v0(seed):
+    # check that the initialization is a sampling from a uniform distribution
+    # on [-1, 1] that is reproducible when the random state is fixed
+    size = 1000
+    v0 = _init_arpack_v0(size, seed)
+
+    rng = check_random_state(seed)
+    assert_allclose(v0, rng.uniform(-1, 1, size=size))