Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions doc/whats_new/v1.5.rst
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,10 @@ Changelog
- |Feature| A fitted :class:`compose.ColumnTransformer` now implements `__getitem__`
which returns the fitted transformers by name. :pr:`27990` by `Thomas Fan`_.

- |ENH| :class:`compose.TransformedTargetRegressor` now raises an error in `fit` if
only `inverse_func` is provided without `func` (that would default to identity) being
explicitly set as well. :pr:`28483` by :user:`Stefanie Senger <StefanieSenger>`.
- |Enhancement| :class:`compose.TransformedTargetRegressor` now raises an error in `fit`
if only `inverse_func` is provided without `func` (that would default to identity)
being explicitly set as well.
:pr:`28483` by :user:`Stefanie Senger <StefanieSenger>`.

:mod:`sklearn.datasets`
.......................
Expand All @@ -113,6 +114,13 @@ Changelog
By default, the functions will retry up to 3 times in case of network failures.
:pr:`28160` by :user:`Zhehao Liu <MaxwellLZH>` and :user:`Filip Karlo Došilović <fkdosilovic>`.

:mod:`sklearn.decomposition`
............................

- |Enhancement| :class:`~decomposition.PCA` now automatically selects the ARPACK solver
for sparse inputs when `svd_solver="auto"` instead of raising an error.
:pr:`28498` by :user:`Thanh Lam Dang <lamdang2k>`.

:mod:`sklearn.dummy`
....................

Expand Down Expand Up @@ -166,7 +174,7 @@ Changelog
:class:`linear_model.Lasso` and :class:`linear_model.LassoCV` now explicitly don't
accept large sparse data formats. :pr:`27576` by :user:`Stefanie Senger
<StefanieSenger>`.

- |API| :class:`linear_model.RidgeCV` and :class:`linear_model.RidgeClassifierCV`
will now allow `alpha=0` when `cv != None`, which is consistent with
:class:`linear_model.Ridge` and :class:`linear_model.RidgeClassifier`.
Expand Down
15 changes: 6 additions & 9 deletions sklearn/decomposition/_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,14 +465,12 @@ def _fit(self, X):
"""Dispatch to the right submethod depending on the chosen solver."""
xp, is_array_api_compliant = get_namespace(X)

# Raise an error for sparse input and unsupported svd_solver
if issparse(X) and self.svd_solver != "arpack":
if issparse(X) and self.svd_solver not in {"arpack", "auto"}:
raise TypeError(
'PCA only support sparse inputs with the "arpack" solver, while '
f'"{self.svd_solver}" was passed. See TruncatedSVD for a possible'
" alternative."
)
# Raise an error for non-Numpy input and arpack solver.
if self.svd_solver == "arpack" and is_array_api_compliant:
raise ValueError(
"PCA with svd_solver='arpack' is not supported for Array API inputs."
Expand All @@ -485,29 +483,28 @@ def _fit(self, X):
ensure_2d=True,
copy=self.copy,
)
self._fit_svd_solver = self.svd_solver
if self._fit_svd_solver == "auto" and issparse(X):
self._fit_svd_solver = "arpack"

# Handle n_components==None
if self.n_components is None:
if self.svd_solver != "arpack":
if self._fit_svd_solver != "arpack":
n_components = min(X.shape)
else:
n_components = min(X.shape) - 1
else:
n_components = self.n_components

# Handle svd_solver
self._fit_svd_solver = self.svd_solver
if self._fit_svd_solver == "auto":
# Small problem or n_components == 'mle', just call full PCA
if max(X.shape) <= 500 or n_components == "mle":
self._fit_svd_solver = "full"
elif 1 <= n_components < 0.8 * min(X.shape):
self._fit_svd_solver = "randomized"
# This is also the case of n_components in (0,1)
# This is also the case of n_components in (0, 1)
else:
self._fit_svd_solver = "full"

# Call different fits for either full or truncated SVD
if self._fit_svd_solver == "full":
return self._fit_full(X, n_components)
elif self._fit_svd_solver in ["arpack", "randomized"]:
Expand Down
20 changes: 19 additions & 1 deletion sklearn/decomposition/tests/test_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def test_pca_sparse_fit_transform(global_random_seed, sparse_container):
assert_allclose(pca_fit.transform(X2), pca_fit_transform.transform(X2), rtol=2e-9)


@pytest.mark.parametrize("svd_solver", ["randomized", "full", "auto"])
@pytest.mark.parametrize("svd_solver", ["randomized", "full"])
@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
def test_sparse_pca_solver_error(global_random_seed, svd_solver, sparse_container):
random_state = np.random.RandomState(global_random_seed)
Expand All @@ -179,6 +179,24 @@ def test_sparse_pca_solver_error(global_random_seed, svd_solver, sparse_containe
pca.fit(X)


@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
def test_sparse_pca_auto_arpack_singluar_values_consistency(
    global_random_seed, sparse_container
):
    """Check that "auto" and "arpack" solvers are equivalent for sparse inputs.

    With ``svd_solver="auto"`` a sparse ``X`` should be dispatched to the
    ARPACK solver, so the fitted singular values must match an explicit
    ``svd_solver="arpack"`` fit on the same data.
    """
    rng = np.random.RandomState(global_random_seed)
    X = sparse_container(sp.sparse.random(SPARSE_M, SPARSE_N, random_state=rng))

    # Fit once per solver and compare the resulting spectra.
    spectra = [
        PCA(n_components=10, svd_solver=solver).fit(X).singular_values_
        for solver in ("arpack", "auto")
    ]
    assert_allclose(spectra[0], spectra[1], rtol=5e-3)


def test_no_empty_slice_warning():
# test if we avoid numpy warnings for computing over empty arrays
n_components = 10
Expand Down