From 39867629f84a4671c060c9fa59d7056818f9eb53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien?= Date: Mon, 25 Feb 2019 12:18:37 +0100 Subject: [PATCH 1/8] enforce deterministic output in kernel PCA --- sklearn/decomposition/kernel_pca.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index bb91d0cbbad6c..72bfe0d4a5440 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -8,6 +8,7 @@ from scipy.sparse.linalg import eigsh from ..utils import check_random_state +from ..utils.extmath import svd_flip from ..utils.validation import check_is_fitted, check_array from ..exceptions import NotFittedError from ..base import BaseEstimator, TransformerMixin, _UnstableOn32BitMixin @@ -210,6 +211,11 @@ def _fit_transform(self, K): maxiter=self.max_iter, v0=v0) + # flip eigenvectors' sign to enforce deterministic output + # note: copying the second element is needed so that both inputs do + # not refer to the same object + self.alphas_, _ = svd_flip(self.alphas_, self.alphas_.copy().T) + # sort eigenvectors in descending order indices = self.lambdas_.argsort()[::-1] self.lambdas_ = self.lambdas_[indices] From 3b1590f4e1b967cb695733dacc863e4b47d33e24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien?= Date: Mon, 25 Feb 2019 14:43:59 +0100 Subject: [PATCH 2/8] add tests and update whats new --- doc/whats_new/v0.21.rst | 7 ++++--- .../decomposition/tests/test_kernel_pca.py | 19 ++++++++++++++++++- sklearn/decomposition/tests/test_pca.py | 16 ++++++++++++++++ 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 36582d834c708..fc79a4d859c18 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -76,9 +76,6 @@ Support for Python 3.4 and below has been officially dropped. the default value is used. :issue:`12988` by :user:`Zijie (ZJ) Poh `. -:mod:`sklearn.decomposition` -............................ - - |Fix| Fixed a bug in :class:`decomposition.NMF` where `init = 'nndsvd'`, `init = 'nndsvda'`, and `init = 'nndsvdar'` are allowed when `n_components < n_features` instead of @@ -86,6 +83,10 @@ Support for Python 3.4 and below has been officially dropped. :issue:`11650` by :user:`Hossein Pourbozorg ` and :user:`Zijie (ZJ) Poh `. +- |Enhancement| :class:`decomposition.KernelPCA` now has deterministic output + (resolved sign ambiguity in eigenvalue decomposition of the kernel matrix). + :issue:`13241` by :user:`Aurélien Bellet `. + :mod:`sklearn.discriminant_analysis` .................................... diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index 4e4c5cb2be4b5..5664b214ac63e 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -4,7 +4,7 @@ from sklearn.utils.testing import (assert_array_almost_equal, assert_less, assert_equal, assert_not_equal, - assert_raises) + assert_raises, assert_array_equal) from sklearn.decomposition import PCA, KernelPCA from sklearn.datasets import make_circles @@ -71,6 +71,23 @@ def test_kernel_pca_consistent_transform(): assert_array_almost_equal(transformed1, transformed2) +def test_kernel_pca_deterministic_output(): + state = np.random.RandomState(0) + X = state.rand(10, 10) + eigen_solver = ('arpack', 'dense') + transformed_X = np.zeros((10 * len(eigen_solver), 2)) + + i = 0 + for solver in eigen_solver: + for _ in range(10): + kpca = KernelPCA(n_components=2, eigen_solver=solver, + random_state=i) + transformed_X[i, :] = kpca.fit_transform(X)[0] + i += 1 + + assert_array_equal(np.isclose(transformed_X, transformed_X[0, :]), True) + + def test_kernel_pca_sparse(): rng = np.random.RandomState(0) X_fit = sp.csr_matrix(rng.random_sample((5, 4))) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index db562836cbab0..6a8bf9e23c7c1 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -6,6 +6,7 @@ from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_array_almost_equal +from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_raise_message @@ -703,6 +704,21 @@ def test_pca_dtype_preservation(svd_solver): check_pca_int_dtype_upcast_to_double(svd_solver) +def test_pca_deterministic_output(): + state = np.random.RandomState(0) + X = state.rand(10, 10) + transformed_X = np.zeros((10 * len(solver_list), 2)) + + i = 0 + for solver in solver_list: + for _ in range(10): + pca = PCA(n_components=2, svd_solver=solver, random_state=i) + transformed_X[i, :] = pca.fit_transform(X)[0] + i += 1 + + assert_array_equal(np.isclose(transformed_X, transformed_X[0, :]), True) + + def check_pca_float_dtype_preservation(svd_solver): # Ensure that PCA does not upscale the dtype when input is float32 X_64 = np.random.RandomState(0).rand(1000, 4).astype(np.float64) From 2ebeb985d7e86adfc330888e454f062ece3f3633 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien?= Date: Mon, 25 Feb 2019 15:02:24 +0100 Subject: [PATCH 3/8] replace state by rng --- sklearn/decomposition/tests/test_kernel_pca.py | 4 ++-- sklearn/decomposition/tests/test_pca.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index 5664b214ac63e..2ab7856d9419e 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -72,8 +72,8 @@ def test_kernel_pca_consistent_transform(): def test_kernel_pca_deterministic_output(): - state = np.random.RandomState(0) - X = state.rand(10, 10) + rng = np.random.RandomState(0) + X = rng.rand(10, 10) eigen_solver = ('arpack', 'dense') transformed_X = np.zeros((10 * len(eigen_solver), 2)) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 6a8bf9e23c7c1..7bfdf25eb4544 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -705,8 +705,8 @@ def test_pca_dtype_preservation(svd_solver): def test_pca_deterministic_output(): - state = np.random.RandomState(0) - X = state.rand(10, 10) + rng = np.random.RandomState(0) + X = rng.rand(10, 10) transformed_X = np.zeros((10 * len(solver_list), 2)) i = 0 From 348d9a7d7cc1ff57d601c13230523d9c9d9ee0a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien?= Date: Mon, 25 Feb 2019 15:38:30 +0100 Subject: [PATCH 4/8] simplified assert --- sklearn/decomposition/tests/test_kernel_pca.py | 4 ++-- sklearn/decomposition/tests/test_pca.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index 2ab7856d9419e..5420b36749c02 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -4,7 +4,7 @@ from sklearn.utils.testing import (assert_array_almost_equal, assert_less, assert_equal, assert_not_equal, - assert_raises, assert_array_equal) + assert_raises) from sklearn.decomposition import PCA, KernelPCA from sklearn.datasets import make_circles @@ -85,7 +85,7 @@ def test_kernel_pca_deterministic_output(): transformed_X[i, :] = kpca.fit_transform(X)[0] i += 1 - assert_array_equal(np.isclose(transformed_X, transformed_X[0, :]), True) + assert np.isclose(transformed_X, transformed_X[0, :]).all() def test_kernel_pca_sparse(): diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 7bfdf25eb4544..bff7a4395febe 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -6,7 +6,6 @@ from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_array_almost_equal -from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_raise_message @@ -716,7 +715,7 @@ def test_pca_deterministic_output(): transformed_X[i, :] = pca.fit_transform(X)[0] i += 1 - assert_array_equal(np.isclose(transformed_X, transformed_X[0, :]), True) + assert np.isclose(transformed_X, transformed_X[0, :]).all() def check_pca_float_dtype_preservation(svd_solver): From 3370c2136dfe8519d8f58d2464ebfce0c2817b06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien?= Date: Mon, 25 Feb 2019 16:18:34 +0100 Subject: [PATCH 5/8] avoid copy --- sklearn/decomposition/kernel_pca.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index 72bfe0d4a5440..3ffe1b1726452 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -214,7 +214,8 @@ def _fit_transform(self, K): # flip eigenvectors' sign to enforce deterministic output # note: copying the second element is needed so that both inputs do # not refer to the same object - self.alphas_, _ = svd_flip(self.alphas_, self.alphas_.copy().T) + self.alphas_, _ = svd_flip(self.alphas_, + np.empty_like(self.alphas_).T) # sort eigenvectors in descending order indices = self.lambdas_.argsort()[::-1] From 0e1c1dfd958f30b514637ed4837c01e3e1ffbf7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien?= Date: Mon, 25 Feb 2019 16:56:33 +0100 Subject: [PATCH 6/8] clarify tests --- sklearn/decomposition/tests/test_kernel_pca.py | 12 +++++------- sklearn/decomposition/tests/test_pca.py | 11 +++++------ 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index 5420b36749c02..847de5e820082 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -4,7 +4,7 @@ from sklearn.utils.testing import (assert_array_almost_equal, assert_less, assert_equal, assert_not_equal, - assert_raises) + assert_raises, assert_allclose) from sklearn.decomposition import PCA, KernelPCA from sklearn.datasets import make_circles @@ -75,17 +75,15 @@ def test_kernel_pca_deterministic_output(): rng = np.random.RandomState(0) X = rng.rand(10, 10) eigen_solver = ('arpack', 'dense') - transformed_X = np.zeros((10 * len(eigen_solver), 2)) - i = 0 for solver in eigen_solver: - for _ in range(10): + transformed_X = np.zeros((20, 2)) + for i in range(20): kpca = KernelPCA(n_components=2, eigen_solver=solver, random_state=i) transformed_X[i, :] = kpca.fit_transform(X)[0] - i += 1 - - assert np.isclose(transformed_X, transformed_X[0, :]).all() + assert_allclose( + transformed_X, np.tile(transformed_X[0, :], 20).reshape(20, 2)) def test_kernel_pca_sparse(): diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index bff7a4395febe..3c44148c2d323 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -6,6 +6,7 @@ from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_array_almost_equal +from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_raise_message @@ -706,16 +707,14 @@ def test_pca_dtype_preservation(svd_solver): def test_pca_deterministic_output(): rng = np.random.RandomState(0) X = rng.rand(10, 10) - transformed_X = np.zeros((10 * len(solver_list), 2)) - i = 0 for solver in solver_list: - for _ in range(10): + transformed_X = np.zeros((20, 2)) + for i in range(20): pca = PCA(n_components=2, svd_solver=solver, random_state=i) transformed_X[i, :] = pca.fit_transform(X)[0] - i += 1 - - assert np.isclose(transformed_X, transformed_X[0, :]).all() + assert_allclose( + transformed_X, np.tile(transformed_X[0, :], 20).reshape(20, 2)) def check_pca_float_dtype_preservation(svd_solver): From d4b46ee348d8c534e3a85887cb72b24b688c35cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien?= Date: Mon, 25 Feb 2019 16:57:01 +0100 Subject: [PATCH 7/8] remove now useless comment --- sklearn/decomposition/kernel_pca.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index 3ffe1b1726452..bff79be619be9 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -212,8 +212,6 @@ def _fit_transform(self, K): v0=v0) # flip eigenvectors' sign to enforce deterministic output - # note: copying the second element is needed so that both inputs do - # not refer to the same object self.alphas_, _ = svd_flip(self.alphas_, np.empty_like(self.alphas_).T) From ac5f543542ea3103101e3c499f4f3d1f1ebe2cd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien?= Date: Mon, 25 Feb 2019 16:59:17 +0100 Subject: [PATCH 8/8] use rng as seed everywhere --- sklearn/decomposition/tests/test_kernel_pca.py | 2 +- sklearn/decomposition/tests/test_pca.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index 847de5e820082..0b78f74a58143 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -80,7 +80,7 @@ def test_kernel_pca_deterministic_output(): transformed_X = np.zeros((20, 2)) for i in range(20): kpca = KernelPCA(n_components=2, eigen_solver=solver, - random_state=i) + random_state=rng) transformed_X[i, :] = kpca.fit_transform(X)[0] assert_allclose( transformed_X, np.tile(transformed_X[0, :], 20).reshape(20, 2)) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 3c44148c2d323..cbd9ca06143ba 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -711,7 +711,7 @@ def test_pca_deterministic_output(): for solver in solver_list: transformed_X = np.zeros((20, 2)) for i in range(20): - pca = PCA(n_components=2, svd_solver=solver, random_state=i) + pca = PCA(n_components=2, svd_solver=solver, random_state=rng) transformed_X[i, :] = pca.fit_transform(X)[0] assert_allclose( transformed_X, np.tile(transformed_X[0, :], 20).reshape(20, 2))