From 64823973694495d013d558b569e15d9dc78b7a01 Mon Sep 17 00:00:00 2001 From: tdang2k Date: Tue, 13 Feb 2024 17:02:10 +0100 Subject: [PATCH 01/16] Correct PCA documentation about sparse input --- sklearn/decomposition/_pca.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index d121c5e5c186f..515a234edac06 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -133,8 +133,8 @@ class PCA(_BasePCA): It can also use the scipy.sparse.linalg ARPACK implementation of the truncated SVD. - Notice that this class does not support sparse input. See - :class:`TruncatedSVD` for an alternative with sparse data. + This class now supports `scipy.sparse.sparray` and `scipy.sparse.spmatrix` + inputs when using the ARPACK solver. For a usage example, see :ref:`sphx_glr_auto_examples_decomposition_plot_pca_iris.py` From 22f9cd0b3e8e8de156cce315e264a066e4f1fc24 Mon Sep 17 00:00:00 2001 From: tdang2k Date: Thu, 15 Feb 2024 09:45:03 +0100 Subject: [PATCH 02/16] Merge two paragraphs --- sklearn/decomposition/_pca.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 515a234edac06..bf9b909dd2814 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -131,10 +131,9 @@ class PCA(_BasePCA): data and the number of components to extract. It can also use the scipy.sparse.linalg ARPACK implementation of the - truncated SVD. - - This class now supports `scipy.sparse.sparray` and `scipy.sparse.spmatrix` - inputs when using the ARPACK solver. + truncated SVD. With sparse inputs, the ARPACK implementation of the truncated + SVD can be used through :func:`scipy.sparse.linalg.svds`. Alternatively, one may + consider :class:`TruncatedSVD` where the data are not centered. 
For a usage example, see :ref:`sphx_glr_auto_examples_decomposition_plot_pca_iris.py` From a5ddb352c05c511800de7ed0b4a009fb3c0be2a3 Mon Sep 17 00:00:00 2001 From: tdang2k Date: Thu, 15 Feb 2024 10:58:22 +0100 Subject: [PATCH 03/16] Remove trailing space --- sklearn/decomposition/_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index bf9b909dd2814..6d019faae76e3 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -131,7 +131,7 @@ class PCA(_BasePCA): data and the number of components to extract. It can also use the scipy.sparse.linalg ARPACK implementation of the - truncated SVD. With sparse inputs, the ARPACK implementation of the truncated + truncated SVD. With sparse inputs, the ARPACK implementation of the truncated SVD can be used through :func:`scipy.sparse.linalg.svds`. Alternatively, one may consider :class:`TruncatedSVD` where the data are not centered. From b87380db0bdec22df69098d97996174c4134e8f4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 19 Feb 2024 11:41:21 +0100 Subject: [PATCH 04/16] remove redundant sentence --- sklearn/decomposition/_pca.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 6d019faae76e3..187e3b1067bee 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -130,10 +130,9 @@ class PCA(_BasePCA): SVD by the method of Halko et al. 2009, depending on the shape of the input data and the number of components to extract. - It can also use the scipy.sparse.linalg ARPACK implementation of the - truncated SVD. With sparse inputs, the ARPACK implementation of the truncated - SVD can be used through :func:`scipy.sparse.linalg.svds`. Alternatively, one may - consider :class:`TruncatedSVD` where the data are not centered. 
+ With sparse inputs, the ARPACK implementation of the truncated SVD can be + used (i.e. through :func:`scipy.sparse.linalg.svds`). Alternatively, one + may consider :class:`TruncatedSVD` where the data are not centered. For a usage example, see :ref:`sphx_glr_auto_examples_decomposition_plot_pca_iris.py` From 03b84b72bd52550ef682708df6c5ba6d94e763c1 Mon Sep 17 00:00:00 2001 From: tdang2k Date: Thu, 22 Feb 2024 11:03:13 +0100 Subject: [PATCH 05/16] Select arpack, remove error raising and adapt test --- sklearn/decomposition/_pca.py | 8 ++------ sklearn/decomposition/tests/test_pca.py | 19 +++++++++++-------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 187e3b1067bee..0527b80e97ec7 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -465,13 +465,9 @@ def _fit(self, X): """Dispatch to the right submethod depending on the chosen solver.""" xp, is_array_api_compliant = get_namespace(X) - # Raise an error for sparse input and unsupported svd_solver + # Automatically select "arpack" solver if the input is sparse if issparse(X) and self.svd_solver != "arpack": - raise TypeError( - 'PCA only support sparse inputs with the "arpack" solver, while ' - f'"{self.svd_solver}" was passed. See TruncatedSVD for a possible' - " alternative." - ) + self.svd_solver = "arpack" # Raise an error for non-Numpy input and arpack solver. 
if self.svd_solver == "arpack" and is_array_api_compliant: raise ValueError( diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 44281b9038697..ffd1c8f348c1d 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -161,7 +161,9 @@ def test_pca_sparse_fit_transform(global_random_seed, sparse_container): @pytest.mark.parametrize("svd_solver", ["randomized", "full", "auto"]) @pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) -def test_sparse_pca_solver_error(global_random_seed, svd_solver, sparse_container): +def test_sparse_pca_solver_automatically_select_arpack( + global_random_seed, svd_solver, sparse_container +): random_state = np.random.RandomState(global_random_seed) X = sparse_container( sp.sparse.random( @@ -170,13 +172,14 @@ def test_sparse_pca_solver_error(global_random_seed, svd_solver, sparse_containe random_state=random_state, ) ) - pca = PCA(n_components=30, svd_solver=svd_solver) - error_msg_pattern = ( - f'PCA only support sparse inputs with the "arpack" solver, while "{svd_solver}"' - " was passed" - ) - with pytest.raises(TypeError, match=error_msg_pattern): - pca.fit(X) + pca_arpack = PCA(n_components=10, svd_solver="arpack") + pca_others = PCA(n_components=10, svd_solver=svd_solver) + + # check the equivalence of pca_arpack.fit and pca_others.fit + X_arpack = pca_arpack.fit(X) + X_others = pca_others.fit(X) + + assert_allclose(X_arpack.singular_values_, X_others.singular_values_, rtol=5e-3) def test_no_empty_slice_warning(): From c8768cbed9d9335bc39212a74253be59baa7d949 Mon Sep 17 00:00:00 2001 From: tdang2k Date: Thu, 22 Feb 2024 11:52:50 +0100 Subject: [PATCH 06/16] Add changelog --- doc/whats_new/v1.5.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index f8588a89aeb7a..d898fbec6f24e 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -64,6 +64,13 @@ 
Changelog - |Feature| A fitted :class:`compose.ColumnTransformer` now implements `__getitem__` which returns the fitted transformers by name. :pr:`27990` by `Thomas Fan`_. +:mod:`sklearn.decomposition` +............................ + +- |Enhancement| :class:`decomposition.PCA` now automatically select `arpack` solver + for sparse inputs and does not raise error if other solvers are passed. + :pr:`28498` by :user:`Thanh Lam Dang <lamdang2k>`. + :mod:`sklearn.dummy` .................... From 6701cc4705a2af78c1d2814039125a901f5dd504 Mon Sep 17 00:00:00 2001 From: tdang2k Date: Fri, 23 Feb 2024 10:01:58 +0100 Subject: [PATCH 07/16] Force arpack only when auto is chosen, raise TypeError otherwise --- sklearn/decomposition/_pca.py | 14 ++++++++--- sklearn/decomposition/tests/test_pca.py | 33 +++++++++++++++++++------ 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 0527b80e97ec7..6ee6336527bcb 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -465,9 +465,13 @@ def _fit(self, X): """Dispatch to the right submethod depending on the chosen solver.""" xp, is_array_api_compliant = get_namespace(X) - # Automatically select "arpack" solver if the input is sparse - if issparse(X) and self.svd_solver != "arpack": - self.svd_solver = "arpack" + # Raise an error for sparse input and unsupported svd_solver + if issparse(X) and self.svd_solver not in {"arpack", "auto"}: + raise TypeError( + 'PCA only support sparse inputs with the "arpack" solver, while ' + f'"{self.svd_solver}" was passed. See TruncatedSVD for a possible' + " alternative." + ) # Raise an error for non-Numpy input and arpack solver. 
if self.svd_solver == "arpack" and is_array_api_compliant: raise ValueError( @@ -495,7 +499,9 @@ def _fit(self, X): self._fit_svd_solver = self.svd_solver if self._fit_svd_solver == "auto": # Small problem or n_components == 'mle', just call full PCA - if max(X.shape) <= 500 or n_components == "mle": + if issparse(X): + self._fit_svd_solver = "arpack" + elif max(X.shape) <= 500 or n_components == "mle": self._fit_svd_solver = "full" elif 1 <= n_components < 0.8 * min(X.shape): self._fit_svd_solver = "randomized" diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index ffd1c8f348c1d..1fab1dd92f463 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -159,10 +159,29 @@ def test_pca_sparse_fit_transform(global_random_seed, sparse_container): assert_allclose(pca_fit.transform(X2), pca_fit_transform.transform(X2), rtol=2e-9) -@pytest.mark.parametrize("svd_solver", ["randomized", "full", "auto"]) +@pytest.mark.parametrize("svd_solver", ["randomized", "full"]) @pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) -def test_sparse_pca_solver_automatically_select_arpack( - global_random_seed, svd_solver, sparse_container +def test_sparse_pca_solver_error(global_random_seed, svd_solver, sparse_container): + random_state = np.random.RandomState(global_random_seed) + X = sparse_container( + sp.sparse.random( + SPARSE_M, + SPARSE_N, + random_state=random_state, + ) + ) + pca = PCA(n_components=30, svd_solver=svd_solver) + error_msg_pattern = ( + f'PCA only support sparse inputs with the "arpack" solver, while "{svd_solver}"' + " was passed" + ) + with pytest.raises(TypeError, match=error_msg_pattern): + pca.fit(X) + + +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) +def test_sparse_pca_auto_arpack_singluar_values_consistency( + global_random_seed, sparse_container ): random_state = np.random.RandomState(global_random_seed) X = 
sparse_container( @@ -173,13 +192,13 @@ def test_sparse_pca_solver_automatically_select_arpack( ) ) pca_arpack = PCA(n_components=10, svd_solver="arpack") - pca_others = PCA(n_components=10, svd_solver=svd_solver) + pca_auto = PCA(n_components=10, svd_solver="auto") - # check the equivalence of pca_arpack.fit and pca_others.fit + # check the equivalence of pca_arpack.fit and pca_auto.fit X_arpack = pca_arpack.fit(X) - X_others = pca_others.fit(X) + X_auto = pca_auto.fit(X) - assert_allclose(X_arpack.singular_values_, X_others.singular_values_, rtol=5e-3) + assert_allclose(X_arpack.singular_values_, X_auto.singular_values_, rtol=5e-3) def test_no_empty_slice_warning(): From f35b5d13f5262b1b9eda4915fd4649f95c124e20 Mon Sep 17 00:00:00 2001 From: tdang2k Date: Fri, 23 Feb 2024 11:18:42 +0100 Subject: [PATCH 08/16] Fix error when n_components is unspecified, solver is auto and X sparse --- sklearn/decomposition/_pca.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 6ee6336527bcb..0d95de9c0b200 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -498,9 +498,13 @@ def _fit(self, X): # Handle svd_solver self._fit_svd_solver = self.svd_solver if self._fit_svd_solver == "auto": - # Small problem or n_components == 'mle', just call full PCA + # Automatically select arpack for sparse inputs + # Reduce n_components by 1 when passing from auto to arpack if issparse(X): self._fit_svd_solver = "arpack" + if self.n_components is None: + n_components = n_components - 1 + # Small problem or n_components == 'mle', just call full PCA elif max(X.shape) <= 500 or n_components == "mle": self._fit_svd_solver = "full" elif 1 <= n_components < 0.8 * min(X.shape): From 0c7e3a3f4ec4b6fa90e811df92f19c20ddb77c1a Mon Sep 17 00:00:00 2001 From: tdang2k Date: Sun, 25 Feb 2024 11:06:40 +0100 Subject: [PATCH 09/16] Apply comments --- doc/whats_new/v1.5.rst | 9 +++++---- 
sklearn/decomposition/_pca.py | 2 +- sklearn/decomposition/tests/test_pca.py | 12 ++++-------- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index d898fbec6f24e..93e562ad4d0a7 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -66,10 +66,11 @@ Changelog :mod:`sklearn.decomposition` ............................ - -- |Enhancement| :class:`decomposition.PCA` now automatically select `arpack` solver - for sparse inputs and does not raise error if other solvers are passed. - :pr:`28498` by :user:`Thanh Lam Dang <lamdang2k>`. +- |ENH| :class:`compose.TransformedTargetRegressor` now raises an error in `fit` if + only `inverse_func` is provided without `func` (that would default to identity) being + explicitly set as well. :pr:`28483` by :user:`Stefanie Senger `. +- |Enhancement| :class:`~decomposition.PCA` now automatically selects the ARPACK solver + for sparse inputs when `svd_solver="auto"` instead of raising an error. :mod:`sklearn.dummy` .................... 
diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 0d95de9c0b200..d767a6df40068 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -503,7 +503,7 @@ def _fit(self, X): if issparse(X): self._fit_svd_solver = "arpack" if self.n_components is None: - n_components = n_components - 1 + n_components -= 1 # Small problem or n_components == 'mle', just call full PCA elif max(X.shape) <= 500 or n_components == "mle": self._fit_svd_solver = "full" diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 1fab1dd92f463..1ec359a028f8c 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -183,6 +183,7 @@ def test_sparse_pca_solver_error(global_random_seed, svd_solver, sparse_containe def test_sparse_pca_auto_arpack_singluar_values_consistency( global_random_seed, sparse_container ): + """Check that "auto" and "arpack" solvers are equivalent for sparse inputs.""" random_state = np.random.RandomState(global_random_seed) X = sparse_container( sp.sparse.random( @@ -191,14 +192,9 @@ def test_sparse_pca_auto_arpack_singluar_values_consistency( random_state=random_state, ) ) - pca_arpack = PCA(n_components=10, svd_solver="arpack") - pca_auto = PCA(n_components=10, svd_solver="auto") - - # check the equivalence of pca_arpack.fit and pca_auto.fit - X_arpack = pca_arpack.fit(X) - X_auto = pca_auto.fit(X) - - assert_allclose(X_arpack.singular_values_, X_auto.singular_values_, rtol=5e-3) + pca_arpack = PCA(n_components=10, svd_solver="arpack").fit(X) + pca_auto = PCA(n_components=10, svd_solver="auto").fit(X) + assert_allclose(pca_arpack.singular_values_, pca_auto.singular_values_, rtol=5e-3) def test_no_empty_slice_warning(): From 4fc932ed2e399504dbfa1186f4bb9e53e3747e25 Mon Sep 17 00:00:00 2001 From: Thanh Lam DANG <70220760+lamdang2k@users.noreply.github.com> Date: Sun, 25 Feb 2024 11:23:10 +0100 Subject: [PATCH 10/16] Update v1.5.rst 
--- doc/whats_new/v1.5.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index 35f5547f4ff51..38690a2b65c9d 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -74,14 +74,16 @@ Changelog - |Feature| A fitted :class:`compose.ColumnTransformer` now implements `__getitem__` which returns the fitted transformers by name. :pr:`27990` by `Thomas Fan`_. - :mod:`sklearn.decomposition` ............................ + - |Enhancement| :class:`compose.TransformedTargetRegressor` now raises an error in `fit` if only `inverse_func` is provided without `func` (that would default to identity) being explicitly set as well. :pr:`28483` by :user:`Stefanie Senger `. + - |Enhancement| :class:`~decomposition.PCA` now automatically selects the ARPACK solver for sparse inputs when `svd_solver="auto"` instead of raising an error. + :pr:`28498` by :user:`Thanh Lam Dang <lamdang2k>`. :mod:`sklearn.dummy` .................... From 9156ad1768e3d72aefca70a9ebd8ad59fe3de8f6 Mon Sep 17 00:00:00 2001 From: tdang2k Date: Mon, 26 Feb 2024 20:32:07 +0100 Subject: [PATCH 11/16] Fix doc error --- doc/whats_new/v1.5.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index 7634b2787fcab..267dc56fc0efd 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -97,12 +97,12 @@ Changelog - |Enhancement| :class:`compose.TransformedTargetRegressor` now raises an error in `fit` if only `inverse_func` is provided without `func` (that would default to identity) being - explicitly set as well. :pr:`28483` by :user:`Stefanie Senger - + explicitly set as well. :pr:`28483` by :user:`Stefanie Senger `. + - |Enhancement| :class:`~decomposition.PCA` now automatically selects the ARPACK solver for sparse inputs when `svd_solver="auto"` instead of raising an error. :pr:`28498` by :user:`Thanh Lam Dang <lamdang2k>`. - + :mod:`sklearn.datasets` ....................... 
@@ -173,7 +173,7 @@ Changelog :class:`linear_model.Lasso` and :class:`linear_model.LassoCV` now explicitly don't accept large sparse data formats. :pr:`27576` by :user:`Stefanie Senger `. - + - |API| :class:`linear_model.RidgeCV` and :class:`linear_model.RidgeClassifierCV` will now allow `alpha=0` when `cv != None`, which is consistent with :class:`linear_model.Ridge` and :class:`linear_model.RidgeClassifier`. From b99d202d334ab303a8e6e9127e9414222e063e91 Mon Sep 17 00:00:00 2001 From: tdang2k Date: Tue, 27 Feb 2024 17:49:55 +0100 Subject: [PATCH 12/16] Fix order in changelog --- doc/whats_new/v1.5.rst | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index 267dc56fc0efd..08ce388a6f8ff 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -92,17 +92,6 @@ Changelog - |Feature| A fitted :class:`compose.ColumnTransformer` now implements `__getitem__` which returns the fitted transformers by name. :pr:`27990` by `Thomas Fan`_. -:mod:`sklearn.decomposition` -............................ - -- |Enhancement| :class:`compose.TransformedTargetRegressor` now raises an error in `fit` if - only `inverse_func` is provided without `func` (that would default to identity) being - explicitly set as well. :pr:`28483` by :user:`Stefanie Senger `. - -- |Enhancement| :class:`~decomposition.PCA` now automatically selects the ARPACK solver - for sparse inputs when `svd_solver="auto"` instead of raising an error. - :pr:`28498` by :user:`Thanh Lam Dang <lamdang2k>`. - :mod:`sklearn.datasets` ....................... @@ -120,6 +109,17 @@ Changelog By default, the functions will retry up to 3 times in case of network failures. :pr:`28160` by :user:`Zhehao Liu ` and :user:`Filip Karlo Došilović `. +:mod:`sklearn.decomposition` +............................ 
+ +- |Enhancement| :class:`compose.TransformedTargetRegressor` now raises an error in `fit` if + only `inverse_func` is provided without `func` (that would default to identity) being + explicitly set as well. :pr:`28483` by :user:`Stefanie Senger `. + +- |Enhancement| :class:`~decomposition.PCA` now automatically selects the ARPACK solver + for sparse inputs when `svd_solver="auto"` instead of raising an error. + :pr:`28498` by :user:`Thanh Lam Dang <lamdang2k>`. + :mod:`sklearn.dummy` .................... From aeb42a48e5a9dd6c8594fc20998a00389abe0b39 Mon Sep 17 00:00:00 2001 From: tdang2k Date: Wed, 28 Feb 2024 19:18:49 +0100 Subject: [PATCH 13/16] Refactor codes and fix module name in changelog --- doc/whats_new/v1.5.rst | 7 +++++-- sklearn/decomposition/_pca.py | 11 ++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index 08ce388a6f8ff..b8f407117a2d7 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -109,13 +109,16 @@ Changelog By default, the functions will retry up to 3 times in case of network failures. :pr:`28160` by :user:`Zhehao Liu ` and :user:`Filip Karlo Došilović `. -:mod:`sklearn.decomposition` -............................ +:mod:`sklearn.compose` +...................... - |Enhancement| :class:`compose.TransformedTargetRegressor` now raises an error in `fit` if only `inverse_func` is provided without `func` (that would default to identity) being explicitly set as well. :pr:`28483` by :user:`Stefanie Senger `. +:mod:`sklearn.decomposition` +............................ + - |Enhancement| :class:`~decomposition.PCA` now automatically selects the ARPACK solver for sparse inputs when `svd_solver="auto"` instead of raising an error. :pr:`28498` by :user:`Thanh Lam Dang <lamdang2k>`. 
diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index d767a6df40068..30a5fc13d8959 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -485,6 +485,9 @@ def _fit(self, X): ensure_2d=True, copy=self.copy, ) + # Handle sparse input, automatically select arpack solver + if self.svd_solver == "auto" and issparse(X): + self.svd_solver = "arpack" # Handle n_components==None if self.n_components is None: @@ -498,14 +501,8 @@ def _fit(self, X): # Handle svd_solver self._fit_svd_solver = self.svd_solver if self._fit_svd_solver == "auto": - # Automatically select arpack for sparse inputs - # Reduce n_components by 1 when passing from auto to arpack - if issparse(X): - self._fit_svd_solver = "arpack" - if self.n_components is None: - n_components -= 1 # Small problem or n_components == 'mle', just call full PCA - elif max(X.shape) <= 500 or n_components == "mle": + if max(X.shape) <= 500 or n_components == "mle": self._fit_svd_solver = "full" elif 1 <= n_components < 0.8 * min(X.shape): self._fit_svd_solver = "randomized" From 0bb9273a8eb5c642448751e5071655783b47a998 Mon Sep 17 00:00:00 2001 From: tdang2k Date: Wed, 28 Feb 2024 19:21:07 +0100 Subject: [PATCH 14/16] Remove redundant compose --- doc/whats_new/v1.5.rst | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index b8f407117a2d7..1128bb1f2338b 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -92,6 +92,10 @@ Changelog - |Feature| A fitted :class:`compose.ColumnTransformer` now implements `__getitem__` which returns the fitted transformers by name. :pr:`27990` by `Thomas Fan`_. +- |Enhancement| :class:`compose.TransformedTargetRegressor` now raises an error in `fit` if + only `inverse_func` is provided without `func` (that would default to identity) being + explicitly set as well. :pr:`28483` by :user:`Stefanie Senger `. + :mod:`sklearn.datasets` ....................... 
@@ -109,13 +113,6 @@ Changelog By default, the functions will retry up to 3 times in case of network failures. :pr:`28160` by :user:`Zhehao Liu ` and :user:`Filip Karlo Došilović `. -:mod:`sklearn.compose` -...................... - -- |Enhancement| :class:`compose.TransformedTargetRegressor` now raises an error in `fit` if - only `inverse_func` is provided without `func` (that would default to identity) being - explicitly set as well. :pr:`28483` by :user:`Stefanie Senger `. - :mod:`sklearn.decomposition` ............................ From a1af4242e2b4553f794d2137d248afac615625f2 Mon Sep 17 00:00:00 2001 From: tdang2k Date: Wed, 28 Feb 2024 20:02:37 +0100 Subject: [PATCH 15/16] Do not modify svd_solver --- sklearn/decomposition/_pca.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 30a5fc13d8959..ba8cdb992c899 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -486,12 +486,13 @@ def _fit(self, X): copy=self.copy, ) # Handle sparse input, automatically select arpack solver - if self.svd_solver == "auto" and issparse(X): - self.svd_solver = "arpack" + self._fit_svd_solver = self.svd_solver + if self._fit_svd_solver == "auto" and issparse(X): + self._fit_svd_solver = "arpack" # Handle n_components==None if self.n_components is None: - if self.svd_solver != "arpack": + if self._fit_svd_solver != "arpack": n_components = min(X.shape) else: n_components = min(X.shape) - 1 @@ -499,7 +500,6 @@ def _fit(self, X): n_components = self.n_components # Handle svd_solver - self._fit_svd_solver = self.svd_solver if self._fit_svd_solver == "auto": # Small problem or n_components == 'mle', just call full PCA if max(X.shape) <= 500 or n_components == "mle": From 69df867280c63fc54071044135240db17804e770 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 29 Feb 2024 15:45:24 +0100 Subject: [PATCH 16/16] cosmetic --- doc/whats_new/v1.5.rst | 7 ++++--- 
sklearn/decomposition/_pca.py | 8 +------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index 1128bb1f2338b..fc3c2337fc4e1 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -92,9 +92,10 @@ Changelog - |Feature| A fitted :class:`compose.ColumnTransformer` now implements `__getitem__` which returns the fitted transformers by name. :pr:`27990` by `Thomas Fan`_. -- |Enhancement| :class:`compose.TransformedTargetRegressor` now raises an error in `fit` if - only `inverse_func` is provided without `func` (that would default to identity) being - explicitly set as well. :pr:`28483` by :user:`Stefanie Senger `. +- |Enhancement| :class:`compose.TransformedTargetRegressor` now raises an error in `fit` + if only `inverse_func` is provided without `func` (that would default to identity) + being explicitly set as well. + :pr:`28483` by :user:`Stefanie Senger `. :mod:`sklearn.datasets` ....................... diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index ba8cdb992c899..abd2fda2d5d2f 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -465,14 +465,12 @@ def _fit(self, X): """Dispatch to the right submethod depending on the chosen solver.""" xp, is_array_api_compliant = get_namespace(X) - # Raise an error for sparse input and unsupported svd_solver if issparse(X) and self.svd_solver not in {"arpack", "auto"}: raise TypeError( 'PCA only support sparse inputs with the "arpack" solver, while ' f'"{self.svd_solver}" was passed. See TruncatedSVD for a possible' " alternative." ) - # Raise an error for non-Numpy input and arpack solver. if self.svd_solver == "arpack" and is_array_api_compliant: raise ValueError( "PCA with svd_solver='arpack' is not supported for Array API inputs." 
@@ -485,12 +483,10 @@ def _fit(self, X): ensure_2d=True, copy=self.copy, ) - # Handle sparse input, automatically select arpack solver self._fit_svd_solver = self.svd_solver if self._fit_svd_solver == "auto" and issparse(X): self._fit_svd_solver = "arpack" - # Handle n_components==None if self.n_components is None: if self._fit_svd_solver != "arpack": n_components = min(X.shape) @@ -499,18 +495,16 @@ def _fit(self, X): else: n_components = self.n_components - # Handle svd_solver if self._fit_svd_solver == "auto": # Small problem or n_components == 'mle', just call full PCA if max(X.shape) <= 500 or n_components == "mle": self._fit_svd_solver = "full" elif 1 <= n_components < 0.8 * min(X.shape): self._fit_svd_solver = "randomized" - # This is also the case of n_components in (0,1) + # This is also the case of n_components in (0, 1) else: self._fit_svd_solver = "full" - # Call different fits for either full or truncated SVD if self._fit_svd_solver == "full": return self._fit_full(X, n_components) elif self._fit_svd_solver in ["arpack", "randomized"]: