[MRG] Fixes 'math domain error' in sklearn.decomposition.PCA with n_components='mle' #10359

Closed
12 changes: 10 additions & 2 deletions sklearn/decomposition/pca.py
@@ -31,7 +31,7 @@
from ..utils.validation import check_is_fitted


-def _assess_dimension_(spectrum, rank, n_samples, n_features):
+def _assess_dimension_(spectrum, rank, n_samples, n_features, rcond=1e-15):
    """Compute the likelihood of a rank ``rank`` dataset

    The dataset is assumed to be embedded in gaussian noise of shape (n,
Expand All @@ -47,6 +47,9 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features):
        Number of samples.
    n_features : int
        Number of features.
    rcond : float
        Cut-off for values in `spectrum`. Any value lower than this
        will be ignored (`default=1e-15`).

Member: why the name rcond?

Member: should the default be a function of dtype?

    Returns
    -------
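One reviewer asks whether the default cutoff should be a function of dtype. A hedged sketch (not part of this PR) of where a dtype-aware default could come from, using numpy's machine epsilon:

```python
# Machine epsilon differs by dtype: the fixed rcond=1e-15 default sits
# just above float64 eps but far below float32 eps, which is the crux
# of the reviewer's question.
import numpy as np

print(np.finfo(np.float64).eps)  # ~2.2e-16
print(np.finfo(np.float32).eps)  # ~1.2e-07
```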
@@ -75,6 +78,8 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features):
        v = 1
    else:
        v = np.sum(spectrum[rank:]) / (n_features - rank)
        if rcond > v:
            return -np.inf
    pv = -np.log(v) * n_samples * (n_features - rank) / 2.

    m = n_features * rank - rank * (rank + 1.) / 2.
@@ -84,6 +89,8 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features):
    spectrum_ = spectrum.copy()
    spectrum_[rank:n_features] = v
    for i in range(rank):
        if spectrum_[i] < rcond:
            break
        for j in range(i + 1, len(spectrum)):
            pa += log((spectrum[i] - spectrum[j]) *
                      (1. / spectrum_[j] - 1. / spectrum_[i])) + log(n_samples)
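For context, the guards added above target the original failure: Python's `math.log` raises `ValueError: math domain error` for non-positive arguments, and with near-zero eigenvalues the residual variance `v` and the differences `spectrum[i] - spectrum[j]` can underflow to exactly zero. A minimal sketch of the failure mode:

```python
# Minimal sketch of the failure this PR guards against: log() of a
# zero (or negative) argument raises "math domain error".
from math import log

try:
    log(0.0)  # e.g. the difference of two numerically identical
              # tiny eigenvalues underflowing to zero
except ValueError as exc:
    print(exc)  # math domain error
```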
@@ -448,7 +455,8 @@ def _fit_full(self, X, n_components):
        # Postprocess the number of components required
        if n_components == 'mle':
            n_components = \
-                _infer_dimension_(explained_variance_, n_samples, n_features)
+                _infer_dimension_(explained_variance_, n_samples,
+                                  n_features) + 1
Member: you cannot change the behavior like this in any case. Was it a bug? When I complained before about n_components=0 I was surprised, but was the behavior expected for the MLE option?

Contributor Author: I am not sure, but this off-by-one problem is discussed here: #4827 (comment)

Member: @vene are you able to comment here? I assume we should not be making this change together with the present fix, but I've not looked into it at all.

Member: I don't think there was an off-by-one error here formerly; the argmax seems to correspond correctly to the assessed rank. But if rank=0 is a bad answer, then we don't need to assess rank=0... If so, that's a separate bug, but this +1 should be removed, IMO.
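The disagreement above hinges on what `_infer_dimension_` returns. A hedged sketch with hypothetical values: if a log-likelihood is computed for each candidate rank 0..n_features-1, the `np.argmax` over that array already yields the assessed rank itself, so appending `+ 1` shifts the reported dimensionality rather than correcting an off-by-one.

```python
import numpy as np

# Hypothetical per-rank log-likelihoods for candidate ranks 0, 1, 2, 3.
ll = np.array([-50.0, -10.0, -30.0, -np.inf])
print(int(np.argmax(ll)))  # 1: the index IS the assessed rank
```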

        elif 0 < n_components < 1.0:
            # number of components for which the cumulated explained
            # variance percentage is superior to the desired threshold
30 changes: 29 additions & 1 deletion sklearn/decomposition/tests/test_pca.py
@@ -490,7 +490,7 @@ def test_pca_dim():
    X[:10] += np.array([3, 4, 5, 1, 2])
    pca = PCA(n_components='mle', svd_solver='full').fit(X)
    assert_equal(pca.n_components, 'mle')
-    assert_equal(pca.n_components_, 1)
+    assert_equal(pca.n_components_, 2)


def test_infer_dim_1():
@@ -537,6 +537,34 @@ def test_infer_dim_3():
    assert_greater(_infer_dimension_(spect, n, p), 2)


def test_infer_dim_bad_spec():
    # Test a spectrum that drops to near zero
    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
    n_samples = 10
    n_features = 5
    ret = _infer_dimension_(spectrum, n_samples, n_features)
    assert_equal(ret, 0)


def test_assess_dimension_small_eigenvalues():
    # Test that tiny eigenvalues are handled appropriately when
    # n_components='mle'
    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
    n_samples = 10
    n_features = 5
    rank = 4
    ret = _assess_dimension_(spectrum, rank, n_samples, n_features)
    assert_equal(ret, -np.inf)


def test_infer_dim_mle():
    # Test small eigenvalues when 'mle' with a pathological 'X' dataset
    X, _ = datasets.make_classification(n_informative=1, n_repeated=18,
                                        n_redundant=1, n_clusters_per_class=1,
                                        random_state=42)
    pca = PCA(n_components='mle').fit(X)
    assert_equal(pca.n_components_, 1)
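The `make_classification` call in `test_infer_dim_mle` deliberately builds a rank-deficient design. A hedged standalone sketch of the same pathology, showing how repeated columns drive most covariance eigenvalues to numerical zero:

```python
import numpy as np

rng = np.random.RandomState(42)
base = rng.randn(100, 2)                     # two informative columns
X = np.hstack([base] + [base[:, :1]] * 18)   # plus 18 copies of column 0
eigs = np.linalg.eigvalsh(np.cov(X.T))       # covariance has rank <= 2
print(np.sum(np.abs(eigs) < 1e-10))          # most eigenvalues are ~0
```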


def test_infer_dim_by_explained_variance():
    X = iris.data
    pca = PCA(n_components=0.95, svd_solver='full')