From 6792ac6b1a5194b44ce01889126ab3aa186a4c41 Mon Sep 17 00:00:00 2001 From: Lisa Schwetlick Date: Sat, 25 Jan 2020 15:23:22 +0100 Subject: [PATCH 01/21] add changes from PR #10359 to current version of decomposition package --- sklearn/decomposition/_pca.py | 13 +++++++++-- sklearn/decomposition/tests/test_pca.py | 31 +++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index f64a9752896b3..39cc1fa6d2f6b 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -27,7 +27,7 @@ from ..utils.validation import check_is_fitted -def _assess_dimension_(spectrum, rank, n_samples, n_features): +def _assess_dimension_(spectrum, rank, n_samples, n_features, spectrum_cutoff=None): """Compute the likelihood of a rank ``rank`` dataset. The dataset is assumed to be embedded in gaussian noise of shape(n, @@ -36,13 +36,16 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features): Parameters ---------- spectrum : array of shape (n) - Data spectrum. + Data spectrum (of type float). rank : int Tested rank value. n_samples : int Number of samples. n_features : int Number of features. + spectrum_cutoff : None + Cut-off for values in `spectrum`. Any value lower than this + will be ignored (`default=epsilon of spectrum`) Returns ------- @@ -57,6 +60,8 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features): if rank > len(spectrum): raise ValueError("The tested rank cannot exceed the rank of the" " dataset") + if spectrum_cutoff is None: + spectrum_cutoff = np.finfo(type(spectrum[0])).eps pu = -rank * log(2.) for i in range(rank): @@ -71,6 +76,8 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features): v = 1 else: v = np.sum(spectrum[rank:]) / (n_features - rank) + if spectrum_cutoff > v: + return -np.inf pv = -np.log(v) * n_samples * (n_features - rank) / 2. m = n_features * rank - rank * (rank + 1.) / 2. @@ -80,6 +87,8 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features): spectrum_ = spectrum.copy() spectrum_[rank:n_features] = v for i in range(rank): + if spectrum_[i] < spectrum_cutoff: + break for j in range(i + 1, len(spectrum)): pa += log((spectrum[i] - spectrum[j]) * (1. / spectrum_[j] - 1. / spectrum_[i])) + log(n_samples) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index b94d2d5be7e0f..1632dbad6d1db 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -568,3 +568,34 @@ def test_pca_n_components_mostly_explained_variance_ratio(): n_components = pca1.explained_variance_ratio_.cumsum()[-2] pca2 = PCA(n_components=n_components).fit(X, y) assert pca2.n_components_ == X.shape[1] + + + + +#### TESTING TESTS +def test_infer_dim_bad_spec(): + # Test a spectrum that drops to near zero + spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) + n_samples = 10 + n_features = 5 + ret = _infer_dimension_(spectrum, n_samples, n_features) + assert ret == 0 + + +def test_assess_dimension_small_eigenvalues(): + # Test tiny eigenvalues appropriately when 'mle' + spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) + n_samples = 10 + n_features = 5 + rank = 4 + ret = _assess_dimension_(spectrum, rank, n_samples, n_features) + assert ret == -np.inf + + +def test_infer_dim_mle(): + # Test small eigenvalues when 'mle' with pathelogical 'X' dataset + X, _ = datasets.make_classification(n_informative=1, n_repeated=18, + n_redundant=1, n_clusters_per_class=1, + random_state=42) + pca = PCA(n_components='mle').fit(X) + assert pca.n_components_ == 1 \ No newline at end of file From 67216c97fa2fe31b2590d6a455f3933ea74c5ec6 Mon Sep 17 00:00:00 2001 From: Lisa Schwetlick Date: Sat, 25 Jan 2020 15:49:24 +0100 Subject: [PATCH 02/21] fix test that failed because of the off-by-one error mentioned in PR #4827 and #10359 --- sklearn/decomposition/tests/test_pca.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 1632dbad6d1db..2dd0a3ed43dd7 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -570,9 +570,6 @@ def test_pca_n_components_mostly_explained_variance_ratio(): assert pca2.n_components_ == X.shape[1] - - -#### TESTING TESTS def test_infer_dim_bad_spec(): # Test a spectrum that drops to near zero spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) @@ -598,4 +595,4 @@ def test_infer_dim_mle(): n_redundant=1, n_clusters_per_class=1, random_state=42) pca = PCA(n_components='mle').fit(X) - assert pca.n_components_ == 1 \ No newline at end of file + assert pca.n_components_ == 0 \ No newline at end of file From f604bd7a93b48ee97199a533e745ae3da870c443 Mon Sep 17 00:00:00 2001 From: Lisa Schwetlick Date: Sat, 25 Jan 2020 16:26:31 +0100 Subject: [PATCH 03/21] linting code --- sklearn/decomposition/_pca.py | 3 ++- sklearn/decomposition/tests/test_pca.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 39cc1fa6d2f6b..79646c7b37481 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -27,7 +27,8 @@ from ..utils.validation import check_is_fitted -def _assess_dimension_(spectrum, rank, n_samples, n_features, spectrum_cutoff=None): +def _assess_dimension_(spectrum, rank, n_samples, n_features, + spectrum_cutoff=None): """Compute the likelihood of a rank ``rank`` dataset. The dataset is assumed to be embedded in gaussian noise of shape(n, diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 2dd0a3ed43dd7..dfc6a4d1cde90 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -595,4 +595,4 @@ def test_infer_dim_mle(): n_redundant=1, n_clusters_per_class=1, random_state=42) pca = PCA(n_components='mle').fit(X) - assert pca.n_components_ == 0 \ No newline at end of file + assert pca.n_components_ == 0 From 05c928c2d0e41f0b2ffcb46fd40a7c2ac4a123f9 Mon Sep 17 00:00:00 2001 From: Lisa Schwetlick Date: Thu, 30 Jan 2020 13:59:27 +0100 Subject: [PATCH 04/21] added tests --- sklearn/decomposition/tests/test_pca.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index dfc6a4d1cde90..1f2875d4fe834 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -579,6 +579,26 @@ def test_infer_dim_bad_spec(): assert ret == 0 +def test_assess_dimension_error_rank_greater_than_features(): + # Test that + spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) + n_samples = 10 + n_features = 4 + rank = 5 + with pytest.raises(ValueError): + ret=_assess_dimension_(spectrum, rank, n_samples, n_features) + + +def test_assess_dimension_same_n_rank_and_features(): + # Test that + spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) + n_samples = 10 + n_features = 4 + rank = 4 + ret=_assess_dimension_(spectrum, rank, n_samples, n_features) + assert ret is not None + + def test_assess_dimension_small_eigenvalues(): # Test tiny eigenvalues appropriately when 'mle' spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) From 1f91790999380d1f930b7f7d23c5a2029062992d Mon Sep 17 00:00:00 2001 From: Lisa Schwetlick Date: Thu, 30 Jan 2020 14:36:34 +0100 Subject: [PATCH 05/21] test edge case where samples Date: Thu, 30 Jan 2020 14:41:47 +0100 Subject: [PATCH 06/21] linting --- sklearn/decomposition/tests/test_pca.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 03354fbf432e3..e177f2f08c96f 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -586,7 +586,7 @@ def test_assess_dimension_error_rank_greater_than_features(): n_features = 4 rank = 5 with pytest.raises(ValueError): - ret=_assess_dimension_(spectrum, rank, n_samples, n_features) + ret = _assess_dimension_(spectrum, rank, n_samples, n_features) def test_assess_dimension_same_n_rank_and_features(): @@ -595,7 +595,7 @@ def test_assess_dimension_same_n_rank_and_features(): n_samples = 10 n_features = 4 rank = 4 - ret=_assess_dimension_(spectrum, rank, n_samples, n_features) + ret =_assess_dimension_(spectrum, rank, n_samples, n_features) assert ret is not None From 492dd602c7b088cce3fa3b889a80a1bfb09b3429 Mon Sep 17 00:00:00 2001 From: Lisa Schwetlick Date: Thu, 30 Jan 2020 14:49:30 +0100 Subject: [PATCH 07/21] linting --- sklearn/decomposition/tests/test_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index e177f2f08c96f..e7de22d48e0e2 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -595,7 +595,7 @@ def test_assess_dimension_same_n_rank_and_features(): n_samples = 10 n_features = 4 rank = 4 - ret =_assess_dimension_(spectrum, rank, n_samples, n_features) + ret = _assess_dimension_(spectrum, rank, n_samples, n_features) assert ret is not None From 3d35c27ee7ab47901061fbcafa1ee1f9436da270 Mon Sep 17 00:00:00 2001 From: Lisa Schwetlick Date: Thu, 30 Jan 2020 15:01:56 +0100 Subject: [PATCH 08/21] forgot a function description --- sklearn/decomposition/tests/test_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index e7de22d48e0e2..a1e78387edf14 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -580,7 +580,7 @@ def test_infer_dim_bad_spec(): def test_assess_dimension_error_rank_greater_than_features(): - # Test that + # Test error when tested rank is greater than the number of features spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) n_samples = 10 n_features = 4 From d306fcdb23b13d405561a5369336d599dda9438d Mon Sep 17 00:00:00 2001 From: Lisa Schwetlick Date: Thu, 30 Jan 2020 15:21:38 +0100 Subject: [PATCH 09/21] linting --- sklearn/decomposition/tests/test_pca.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index a1e78387edf14..f37c8b15d9d09 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -619,7 +619,8 @@ def test_infer_dim_mle(): def test_fit_mle_too_few_samples(): - # Tests that an error is raised when the number of samples is smaller than the number of features during an mle fit + # Tests that an error is raised when the number of samples is smaller + # than the number of features during an mle fit X, _ = datasets.make_classification(n_samples=20, n_features=21, random_state=42) From 5f1d395cfedd627148e8252173a7afbc3ff6ff16 Mon Sep 17 00:00:00 2001 From: Lisa Schwetlick Date: Sat, 15 Feb 2020 16:32:43 +0100 Subject: [PATCH 10/21] typo Co-Authored-By: Joel Nothman --- sklearn/decomposition/tests/test_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index f37c8b15d9d09..cb5741443eff5 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -610,7 +610,7 @@ def test_assess_dimension_small_eigenvalues(): def test_infer_dim_mle(): - # Test small eigenvalues when 'mle' with pathelogical 'X' dataset + # Test small eigenvalues when 'mle' with pathological 'X' dataset X, _ = datasets.make_classification(n_informative=1, n_repeated=18, n_redundant=1, n_clusters_per_class=1, random_state=42) From f30285eea4065d68ecceafb09a1183da18e2093d Mon Sep 17 00:00:00 2001 From: Lisa Schwetlick Date: Sat, 15 Feb 2020 16:33:21 +0100 Subject: [PATCH 11/21] docstring Co-Authored-By: Guillaume Lemaitre --- sklearn/decomposition/_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 79646c7b37481..6a762da43d78e 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -37,7 +37,7 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features, Parameters ---------- spectrum : array of shape (n) - Data spectrum (of type float). + Data spectrum. rank : int Tested rank value. n_samples : int From 05e5f5a1fcbc7c5c6649ff044b9ed475ce9d009a Mon Sep 17 00:00:00 2001 From: Lisa Schwetlick Date: Sat, 15 Feb 2020 16:34:26 +0100 Subject: [PATCH 12/21] docstring Co-Authored-By: Guillaume Lemaitre --- sklearn/decomposition/_pca.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 6a762da43d78e..3ca55d4c63dd9 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -46,7 +46,8 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features, Number of features. spectrum_cutoff : None Cut-off for values in `spectrum`. Any value lower than this - will be ignored (`default=epsilon of spectrum`) + will be ignored (`default=epsilon of spectrum`). By default (`None`), + it corresponds to the machine epsilon of the `dtype` of `spectrum`. Returns ------- From bce1c8fa476971891c50153721e9e16c496218a9 Mon Sep 17 00:00:00 2001 From: Lisa Schwetlick Date: Sat, 15 Feb 2020 16:34:48 +0100 Subject: [PATCH 13/21] Update sklearn/decomposition/_pca.py Co-Authored-By: Guillaume Lemaitre --- sklearn/decomposition/_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 3ca55d4c63dd9..addb4d5babc53 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -44,7 +44,7 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features, Number of samples. n_features : int Number of features. - spectrum_cutoff : None + spectrum_threshold : float, default=None Cut-off for values in `spectrum`. Any value lower than this will be ignored (`default=epsilon of spectrum`). By default (`None`), it corresponds to the machine epsilon of the `dtype` of `spectrum`. From ef7d80c09c3c4c5742d868f5d3cd4cc754e53c17 Mon Sep 17 00:00:00 2001 From: Lisa Schwetlick Date: Sat, 15 Feb 2020 16:37:57 +0100 Subject: [PATCH 14/21] rename spectrum_cutoff to spectrum_threshold --- sklearn/decomposition/_pca.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index addb4d5babc53..187df1e51f7af 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -28,7 +28,7 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features, - spectrum_cutoff=None): + spectrum_threshold=None): """Compute the likelihood of a rank ``rank`` dataset. The dataset is assumed to be embedded in gaussian noise of shape(n, @@ -62,8 +62,8 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features, if rank > len(spectrum): raise ValueError("The tested rank cannot exceed the rank of the" " dataset") - if spectrum_cutoff is None: - spectrum_cutoff = np.finfo(type(spectrum[0])).eps + if spectrum_threshold is None: + spectrum_threshold = np.finfo(type(spectrum[0])).eps pu = -rank * log(2.) for i in range(rank): @@ -78,7 +78,7 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features, v = 1 else: v = np.sum(spectrum[rank:]) / (n_features - rank) - if spectrum_cutoff > v: + if spectrum_threshold > v: return -np.inf pv = -np.log(v) * n_samples * (n_features - rank) / 2. @@ -89,7 +89,7 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features, spectrum_ = spectrum.copy() spectrum_[rank:n_features] = v for i in range(rank): - if spectrum_[i] < spectrum_cutoff: + if spectrum_[i] < spectrum_threshold: break for j in range(i + 1, len(spectrum)): pa += log((spectrum[i] - spectrum[j]) * From db66fc13a38cb8cbaa11d6ef4c95401a27f6d2d0 Mon Sep 17 00:00:00 2001 From: Lisa Schwetlick Date: Sat, 15 Feb 2020 16:41:11 +0100 Subject: [PATCH 15/21] rename _assess_dimension_ to _assess_dimension and _infer_dimension_ to _infer_dimension --- sklearn/decomposition/_pca.py | 8 ++++---- sklearn/decomposition/tests/test_pca.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 187df1e51f7af..54a2c42944518 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -27,7 +27,7 @@ from ..utils.validation import check_is_fitted -def _assess_dimension_(spectrum, rank, n_samples, n_features, +def _assess_dimension(spectrum, rank, n_samples, n_features, spectrum_threshold=None): """Compute the likelihood of a rank ``rank`` dataset. @@ -100,7 +100,7 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features, return ll -def _infer_dimension_(spectrum, n_samples, n_features): +def _infer_dimension(spectrum, n_samples, n_features): """Infers the dimension of a dataset of shape (n_samples, n_features) The dataset is described by its spectrum `spectrum`. @@ -108,7 +108,7 @@ def _infer_dimension_(spectrum, n_samples, n_features): n_spectrum = len(spectrum) ll = np.empty(n_spectrum) for rank in range(n_spectrum): - ll[rank] = _assess_dimension_(spectrum, rank, n_samples, n_features) + ll[rank] = _assess_dimension(spectrum, rank, n_samples, n_features) return ll.argmax() @@ -469,7 +469,7 @@ def _fit_full(self, X, n_components): # Postprocess the number of components required if n_components == 'mle': n_components = \ - _infer_dimension_(explained_variance_, n_samples, n_features) + _infer_dimension(explained_variance_, n_samples, n_features) elif 0 < n_components < 1.0: # number of components for which the cumulated explained # variance percentage is superior to the desired threshold diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index cb5741443eff5..e52920c79684b 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -9,7 +9,7 @@ from sklearn.decomposition import PCA from sklearn.datasets import load_iris from sklearn.decomposition._pca import _assess_dimension_ -from sklearn.decomposition._pca import _infer_dimension_ +from sklearn.decomposition._pca import _infer_dimension iris = datasets.load_iris() PCA_SOLVERS = ['full', 'arpack', 'randomized', 'auto'] @@ -348,7 +348,7 @@ def test_infer_dim_2(): pca = PCA(n_components=p, svd_solver='full') pca.fit(X) spect = pca.explained_variance_ - assert _infer_dimension_(spect, n, p) > 1 + assert _infer_dimension(spect, n, p) > 1 def test_infer_dim_3(): @@ -361,7 +361,7 @@ def test_infer_dim_3(): pca = PCA(n_components=p, svd_solver='full') pca.fit(X) spect = pca.explained_variance_ - assert _infer_dimension_(spect, n, p) > 2 + assert _infer_dimension(spect, n, p) > 2 @pytest.mark.parametrize( @@ -575,7 +575,7 @@ def test_infer_dim_bad_spec(): spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) n_samples = 10 n_features = 5 - ret = _infer_dimension_(spectrum, n_samples, n_features) + ret = _infer_dimension(spectrum, n_samples, n_features) assert ret == 0 From 2c12704526c668d486d56d85896c619eaa9e87ad Mon Sep 17 00:00:00 2001 From: Lisa Schwetlick Date: Sat, 15 Feb 2020 16:45:36 +0100 Subject: [PATCH 16/21] remove spectrum threshold as a keyword --- sklearn/decomposition/_pca.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 54a2c42944518..e12b5838b011f 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -27,8 +27,7 @@ from ..utils.validation import check_is_fitted -def _assess_dimension(spectrum, rank, n_samples, n_features, - spectrum_threshold=None): +def _assess_dimension(spectrum, rank, n_samples, n_features): """Compute the likelihood of a rank ``rank`` dataset. The dataset is assumed to be embedded in gaussian noise of shape(n, @@ -47,7 +46,7 @@ def _assess_dimension(spectrum, rank, n_samples, n_features, spectrum_threshold : float, default=None Cut-off for values in `spectrum`. Any value lower than this will be ignored (`default=epsilon of spectrum`). By default (`None`), - it corresponds to the machine epsilon of the `dtype` of `spectrum`. + it corresponds to the machine epsilon of the `dtype` of `spectrum`. Returns ------- @@ -62,8 +61,8 @@ def _assess_dimension(spectrum, rank, n_samples, n_features, if rank > len(spectrum): raise ValueError("The tested rank cannot exceed the rank of the" " dataset") - if spectrum_threshold is None: - spectrum_threshold = np.finfo(type(spectrum[0])).eps + + spectrum_threshold = np.finfo(type(spectrum[0])).eps pu = -rank * log(2.) for i in range(rank): From adcfa89aeb8bd4963f0ea6d72595bdbb9fe65be7 Mon Sep 17 00:00:00 2001 From: Lisa Schwetlick Date: Sat, 15 Feb 2020 17:05:51 +0100 Subject: [PATCH 17/21] fix tests --- sklearn/decomposition/tests/test_pca.py | 35 ++++++++++++++----------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index e52920c79684b..bd27dc95d4198 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -8,7 +8,7 @@ from sklearn import datasets from sklearn.decomposition import PCA from sklearn.datasets import load_iris -from sklearn.decomposition._pca import _assess_dimension_ +from sklearn.decomposition._pca import _assess_dimension from sklearn.decomposition._pca import _infer_dimension iris = datasets.load_iris() @@ -333,7 +333,7 @@ def test_infer_dim_1(): pca = PCA(n_components=p, svd_solver='full') pca.fit(X) spect = pca.explained_variance_ - ll = np.array([_assess_dimension_(spect, k, n, p) for k in range(p)]) + ll = np.array([_assess_dimension(spect, k, n, p) for k in range(p)]) assert ll[1] > ll.max() - .01 * n @@ -580,23 +580,26 @@ def test_infer_dim_bad_spec(): def test_assess_dimension_error_rank_greater_than_features(): - # Test error when tested rank is greater than the number of features + # Test error when tested rank is greater than the number of features. + # This will not ever happen when using _assess_dimension through + # _infer_dimension because of the loop range. spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) n_samples = 10 n_features = 4 rank = 5 - with pytest.raises(ValueError): - ret = _assess_dimension_(spectrum, rank, n_samples, n_features) + with pytest.raises(ValueError, match="The tested rank cannot exceed " + "the rank of the dataset"): + ret = _assess_dimension(spectrum, rank, n_samples, n_features) -def test_assess_dimension_same_n_rank_and_features(): - # Test that - spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) - n_samples = 10 - n_features = 4 - rank = 4 - ret = _assess_dimension_(spectrum, rank, n_samples, n_features) - assert ret is not None +# def test_assess_dimension_same_n_rank_and_features(): +# # Test that +# spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) +# n_samples = 10 +# n_features = 4 +# rank = 4 +# ret = _assess_dimension(spectrum, rank, n_samples, n_features) +# assert ret is not None def test_assess_dimension_small_eigenvalues(): @@ -605,7 +608,7 @@ def test_assess_dimension_small_eigenvalues(): n_samples = 10 n_features = 5 rank = 4 - ret = _assess_dimension_(spectrum, rank, n_samples, n_features) + ret = _assess_dimension(spectrum, rank, n_samples, n_features) assert ret == -np.inf @@ -625,5 +628,7 @@ def test_fit_mle_too_few_samples(): random_state=42) pca = PCA(n_components='mle', svd_solver='full') - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="n_components='mle' is only " + "supported if " + "n_samples >= n_features"): pca.fit(X) From fdc5f3f822e3621fcb9b27a190956289fc2959c5 Mon Sep 17 00:00:00 2001 From: Lisa Schwetlick Date: Tue, 25 Feb 2020 17:37:13 +0100 Subject: [PATCH 18/21] clean up, comments and doc --- doc/whats_new/v0.23.rst | 4 ++++ sklearn/decomposition/_pca.py | 10 +++++---- sklearn/decomposition/tests/test_pca.py | 27 +++++++++---------------- 3 files changed, 19 insertions(+), 22 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 96702dae01235..0e846e787c18c 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -133,6 +133,10 @@ Changelog - |Fix| :class:`decomposition.PCA` with a float `n_components` parameter, will exclusively choose the components that explain the variance greater than `n_components`. :pr:`15669` by :user:`Krishna Chaitanya ` +- |Fix| :func:`decomposition._pca._assess_dimension` now correctly handles small + eigenvalues. :pr: `4441` by :user:`Lisa Schwetlick `, and + :user:`Gelavizh Ahmadi ` and + :user:`Marija Vlajic Wheeler `. - |Enhancement| :class:`decomposition.NMF` and :func:`decomposition.non_negative_factorization` now preserves float32 dtype. diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index e12b5838b011f..3fd3d68c05f82 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -43,10 +43,6 @@ def _assess_dimension(spectrum, rank, n_samples, n_features): Number of samples. n_features : int Number of features. - spectrum_threshold : float, default=None - Cut-off for values in `spectrum`. Any value lower than this - will be ignored (`default=epsilon of spectrum`). By default (`None`), - it corresponds to the machine epsilon of the `dtype` of `spectrum`. Returns ------- @@ -73,6 +69,8 @@ def _assess_dimension(spectrum, rank, n_samples, n_features): pl = -pl * n_samples / 2. if rank == n_features: + # TODO: this line is never executed because _infer_dimension's + # for loop is off by one pv = 0 v = 1 else: @@ -89,6 +87,10 @@ def _assess_dimension(spectrum, rank, n_samples, n_features): spectrum_[rank:n_features] = v for i in range(rank): if spectrum_[i] < spectrum_threshold: + # TODO: this line is never executed (off by one in _infer_dimension) + # this break only happens when rank == n_features and + # spectrum_[i] < spectrum_threshold, otherwise the early return + # above catches this case. break for j in range(i + 1, len(spectrum)): pa += log((spectrum[i] - spectrum[j]) * diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index bd27dc95d4198..a02b80fc1eefa 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -571,7 +571,7 @@ def test_pca_n_components_mostly_explained_variance_ratio(): def test_infer_dim_bad_spec(): - # Test a spectrum that drops to near zero + # Test a spectrum that drops to near zero for PR #16224 spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) n_samples = 10 n_features = 5 @@ -580,9 +580,10 @@ def test_infer_dim_bad_spec(): def test_assess_dimension_error_rank_greater_than_features(): - # Test error when tested rank is greater than the number of features. + # Test error when tested rank is greater than the number of features + # for PR #16224 # This will not ever happen when using _assess_dimension through - # _infer_dimension because of the loop range. + # _infer_dimension because of the loop range spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) n_samples = 10 n_features = 4 @@ -591,29 +592,19 @@ def test_assess_dimension_error_rank_greater_than_features(): "the rank of the dataset"): ret = _assess_dimension(spectrum, rank, n_samples, n_features) - -# def test_assess_dimension_same_n_rank_and_features(): -# # Test that -# spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) -# n_samples = 10 -# n_features = 4 -# rank = 4 -# ret = _assess_dimension(spectrum, rank, n_samples, n_features) -# assert ret is not None - - def test_assess_dimension_small_eigenvalues(): - # Test tiny eigenvalues appropriately when 'mle' + # Test tiny eigenvalues appropriately when using 'mle' + # for PR #16224 spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) n_samples = 10 n_features = 5 - rank = 4 + rank = 3 ret = _assess_dimension(spectrum, rank, n_samples, n_features) assert ret == -np.inf - def test_infer_dim_mle(): # Test small eigenvalues when 'mle' with pathological 'X' dataset + # for PR #16224 X, _ = datasets.make_classification(n_informative=1, n_repeated=18, n_redundant=1, n_clusters_per_class=1, random_state=42) @@ -623,7 +614,7 @@ def test_infer_dim_mle(): def test_fit_mle_too_few_samples(): # Tests that an error is raised when the number of samples is smaller - # than the number of features during an mle fit + # than the number of features during an mle fit for PR #16224 X, _ = datasets.make_classification(n_samples=20, n_features=21, random_state=42) From 8272383c1534f82b0af1cd5fa06db61cea83fcba Mon Sep 17 00:00:00 2001 From: Lisa Schwetlick Date: Tue, 25 Feb 2020 17:44:24 +0100 Subject: [PATCH 19/21] linting --- sklearn/decomposition/tests/test_pca.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index a02b80fc1eefa..e6c2cb63f9794 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -590,7 +590,8 @@ def test_assess_dimension_error_rank_greater_than_features(): rank = 5 with pytest.raises(ValueError, match="The tested rank cannot exceed " "the rank of the dataset"): - ret = _assess_dimension(spectrum, rank, n_samples, n_features) + _assess_dimension(spectrum, rank, n_samples, n_features) + def test_assess_dimension_small_eigenvalues(): # Test tiny eigenvalues appropriately when using 'mle' @@ -602,6 +603,7 @@ def test_assess_dimension_small_eigenvalues(): ret = _assess_dimension(spectrum, rank, n_samples, n_features) assert ret == -np.inf + def test_infer_dim_mle(): # Test small eigenvalues when 'mle' with pathological 'X' dataset # for PR #16224 From 3f238c03b0fa3eb85d3cea39a3745d69d54dfb43 Mon Sep 17 00:00:00 2001 From: Lisa Schwetlick Date: Tue, 25 Feb 2020 17:48:52 +0100 Subject: [PATCH 20/21] linting --- sklearn/decomposition/_pca.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 3fd3d68c05f82..7a0140b01fc9b 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -87,7 +87,8 @@ def _assess_dimension(spectrum, rank, n_samples, n_features): spectrum_[rank:n_features] = v for i in range(rank): if spectrum_[i] < spectrum_threshold: - # TODO: this line is never executed (off by one in _infer_dimension) + # TODO: this line is never executed + # (off by one in _infer_dimension) # this break only happens when rank == n_features and # spectrum_[i] < spectrum_threshold, otherwise the early return # above catches this case. From 353a0003aab0f5d9f8645a804d70cf90adec8519 Mon Sep 17 00:00:00 2001 From: Lisa Schwetlick Date: Tue, 3 Mar 2020 21:58:30 +0100 Subject: [PATCH 21/21] remove comment --- sklearn/decomposition/tests/test_pca.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index e6c2cb63f9794..438478a55f6fa 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -582,8 +582,6 @@ def test_infer_dim_bad_spec(): def test_assess_dimension_error_rank_greater_than_features(): # Test error when tested rank is greater than the number of features # for PR #16224 - # This will not ever happen when using _assess_dimension through - # _infer_dimension because of the loop range spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) n_samples = 10 n_features = 4