From 10e8c3b0a5bc10eea9b8ca1336451f355bce31c4 Mon Sep 17 00:00:00 2001
From: "kumarashutosh.ee@gmail.com" <kumarashutosh.ee@gmail.com>
Date: Fri, 22 Dec 2017 00:05:52 +0530
Subject: [PATCH 1/3] conditions added and checked

---
 sklearn/decomposition/pca.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py
index 2b715b7e06824..3f7c90145f412 100644
--- a/sklearn/decomposition/pca.py
+++ b/sklearn/decomposition/pca.py
@@ -31,7 +31,7 @@ from ..utils.validation import check_is_fitted
 
 
-def _assess_dimension_(spectrum, rank, n_samples, n_features):
+def _assess_dimension_(spectrum, rank, n_samples, n_features, rcond=1e-15):
     """Compute the likelihood of a rank ``rank`` dataset
 
     The dataset is assumed to be embedded in gaussian noise of shape(n,
@@ -47,6 +47,9 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features):
         Number of samples.
     n_features : int
         Number of features.
+    rcond : float
+        Cut-off for values in `spectrum`. Any value lower than this
+        will be ignored (default=1e-15).
 
     Returns
     -------
@@ -75,6 +78,8 @@
         v = 1
     else:
         v = np.sum(spectrum[rank:]) / (n_features - rank)
+        if rcond > v:
+            return -np.inf
     pv = -np.log(v) * n_samples * (n_features - rank) / 2.
 
     m = n_features * rank - rank * (rank + 1.) / 2.
@@ -84,6 +89,8 @@
     spectrum_ = spectrum.copy()
     spectrum_[rank:n_features] = v
     for i in range(rank):
+        if spectrum_[i] < rcond:
+            break
         for j in range(i + 1, len(spectrum)):
             pa += log((spectrum[i] - spectrum[j]) *
                       (1. / spectrum_[j] - 1. / spectrum_[i])) + log(n_samples)

From 727e21f4fa7aa0c638d7045aa735db96908a8a42 Mon Sep 17 00:00:00 2001
From: "kumarashutosh.ee@gmail.com" <kumarashutosh.ee@gmail.com>
Date: Fri, 22 Dec 2017 08:46:05 +0530
Subject: [PATCH 2/3] tests added

---
 sklearn/decomposition/tests/test_pca.py | 28 +++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index f1889d1462d2b..31afb9e6e80d8 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -537,6 +537,34 @@ def test_infer_dim_3():
     assert_greater(_infer_dimension_(spect, n, p), 2)
 
 
+def test_infer_dim_bad_spec():
+    # Test a spectrum that drops to near zero
+    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
+    n_samples = 10
+    n_features = 5
+    ret = _infer_dimension_(spectrum, n_samples, n_features)
+    assert_equal(ret, 0)
+
+
+def test_assess_dimension_small_eigenvalues():
+    # Test that tiny eigenvalues are handled correctly when 'mle'
+    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
+    n_samples = 10
+    n_features = 5
+    rank = 4
+    ret = _assess_dimension_(spectrum, rank, n_samples, n_features)
+    assert_equal(ret, -np.inf)
+
+
+def test_infer_dim_mle():
+    # Test small eigenvalues when 'mle' with a pathological 'X' dataset
+    X, _ = datasets.make_classification(n_informative=1, n_repeated=18,
+                                        n_redundant=1, n_clusters_per_class=1,
+                                        random_state=42)
+    pca = PCA(n_components='mle').fit(X)
+    assert_equal(pca.n_components_, 0)
+
+
 def test_infer_dim_by_explained_variance():
     X = iris.data
     pca = PCA(n_components=0.95, svd_solver='full')

From 9cb74901b92e8f020816f995227276e990125028 Mon Sep 17 00:00:00 2001
From: "kumarashutosh.ee@gmail.com" <kumarashutosh.ee@gmail.com>
Date: Wed, 27 Dec 2017 00:00:19 +0530
Subject: [PATCH 3/3] changes added

---
 sklearn/decomposition/pca.py            | 3 ++-
 sklearn/decomposition/tests/test_pca.py | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py
index 3f7c90145f412..ddc8701550c9d 100644
--- a/sklearn/decomposition/pca.py
+++ b/sklearn/decomposition/pca.py
@@ -455,7 +455,8 @@ def _fit_full(self, X, n_components):
         # Postprocess the number of components required
         if n_components == 'mle':
             n_components = \
-                _infer_dimension_(explained_variance_, n_samples, n_features)
+                _infer_dimension_(explained_variance_, n_samples,
+                                  n_features) + 1
         elif 0 < n_components < 1.0:
             # number of components for which the cumulated explained
             # variance percentage is superior to the desired threshold
diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index 31afb9e6e80d8..e1b87f733304a 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -490,7 +490,7 @@ def test_pca_dim():
     X[:10] += np.array([3, 4, 5, 1, 2])
     pca = PCA(n_components='mle', svd_solver='full').fit(X)
     assert_equal(pca.n_components, 'mle')
-    assert_equal(pca.n_components_, 1)
+    assert_equal(pca.n_components_, 2)
 
 
 def test_infer_dim_1():
@@ -562,7 +562,7 @@ def test_infer_dim_mle():
                                         n_redundant=1, n_clusters_per_class=1,
                                         random_state=42)
     pca = PCA(n_components='mle').fit(X)
-    assert_equal(pca.n_components_, 0)
+    assert_equal(pca.n_components_, 1)
 
 
 def test_infer_dim_by_explained_variance():
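
Reviewer note (not part of the patch series): a minimal sketch of the behaviour the fix targets, assuming a scikit-learn checkout with these three patches applied. It mirrors the dataset in the new test_infer_dim_mle: the 18 repeated features drive the trailing eigenvalues of the spectrum to (near) zero, the case where the 'mle' dimension estimate previously broke down.

    from sklearn import datasets
    from sklearn.decomposition import PCA

    # Pathological data: 18 of the 20 features are exact repeats, so the
    # trailing eigenvalues of the covariance spectrum collapse toward zero.
    X, _ = datasets.make_classification(n_informative=1, n_repeated=18,
                                        n_redundant=1, n_clusters_per_class=1,
                                        random_state=42)

    # With the rcond cut-off in _assess_dimension_, any rank whose residual
    # noise variance falls below the threshold is scored -np.inf, so the
    # 'mle' option settles on a small finite dimension instead of failing.
    pca = PCA(n_components='mle').fit(X)
    print(pca.n_components_)  # expected: 1, per the updated test_infer_dim_mle

Patch 3's `+ 1` treats the value returned by _infer_dimension_ as a zero-based rank, which is why the expected n_components_ in test_pca_dim and test_infer_dim_mle each move up by one.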