[MRG+1] Address decomposition.PCA mle option problem #16224

Merged: 21 commits, Mar 4, 2020
4 changes: 4 additions & 0 deletions doc/whats_new/v0.23.rst
@@ -133,6 +133,10 @@ Changelog
- |Fix| :class:`decomposition.PCA` with a float `n_components` parameter will
  exclusively choose the components that explain the variance greater than
  `n_components`. :pr:`15669` by :user:`Krishna Chaitanya <krishnachaitanya9>`
- |Fix| :func:`decomposition._pca._assess_dimension` now correctly handles small
  eigenvalues. :pr:`4441` by :user:`Lisa Schwetlick <lschwetlick>`,
  :user:`Gelavizh Ahmadi <gelavizh1>`, and
  :user:`Marija Vlajic Wheeler <marijavlajic>`.

- |Enhancement| :class:`decomposition.NMF` and
  :func:`decomposition.non_negative_factorization` now preserve float32 dtype.
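A hedged sketch of the float `n_components` behavior from the first entry above; the data, column scaling, and 0.9 threshold are illustrative assumptions, not from the PR:

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
# Columns with sharply decreasing scale, so the variance ranking is clear.
X = rng.randn(100, 5) * np.array([5.0, 3.0, 1.0, 0.1, 0.1])

pca = PCA(n_components=0.9, svd_solver='full').fit(X)
# After the fix, PCA keeps the smallest number of components whose
# cumulative explained-variance ratio strictly exceeds 0.9.
print(pca.n_components_)
print(pca.explained_variance_ratio_.cumsum())
```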
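And a minimal sketch of the NMF entry just above, assuming the 0.23 behavior it describes (shapes and seed are arbitrary):

```python
import numpy as np
from sklearn.decomposition import NMF

# Non-negative float32 input; NMF requires non-negative data.
X = np.abs(np.random.RandomState(0).randn(6, 5)).astype(np.float32)
W = NMF(n_components=2, init='random', random_state=0).fit_transform(X)
# Previously the result was upcast to float64; the enhancement keeps
# the input dtype.
print(W.dtype)
```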
21 changes: 17 additions & 4 deletions sklearn/decomposition/_pca.py
@@ -27,7 +27,7 @@
from ..utils.validation import check_is_fitted


def _assess_dimension_(spectrum, rank, n_samples, n_features):
def _assess_dimension(spectrum, rank, n_samples, n_features):
"""Compute the likelihood of a rank ``rank`` dataset.

The dataset is assumed to be embedded in gaussian noise of shape(n,
@@ -58,6 +58,8 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features):
        raise ValueError("The tested rank cannot exceed the rank of the"
                         " dataset")

    spectrum_threshold = np.finfo(type(spectrum[0])).eps

    pu = -rank * log(2.)
    for i in range(rank):
        pu += (gammaln((n_features - i) / 2.) -
Expand All @@ -67,10 +69,14 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features):
    pl = -pl * n_samples / 2.

    if rank == n_features:
        # TODO: this branch is never executed because _infer_dimension's
        # for loop is off by one
        pv = 0
        v = 1
    else:
        v = np.sum(spectrum[rank:]) / (n_features - rank)
        if spectrum_threshold > v:
            return -np.inf
        pv = -np.log(v) * n_samples * (n_features - rank) / 2.

    m = n_features * rank - rank * (rank + 1.) / 2.
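For context (not part of the diff), a minimal sketch of the failure mode the early return above guards against, assuming a float64 spectrum; the toy numbers are made up:

```python
import numpy as np

# Toy spectrum whose discarded tail is exactly zero.
spectrum = np.array([1.0, 0.5, 0.0, 0.0])
rank, n_features = 2, 4

v = spectrum[rank:].sum() / (n_features - rank)   # 0.0
with np.errstate(divide='ignore'):
    print(np.log(v))                              # -inf poisons the pv term

# The fix treats anything below machine epsilon as zero and scores the
# candidate rank -np.inf instead of producing non-finite terms.
print(v < np.finfo(spectrum.dtype).eps)           # True
```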
@@ -80,6 +86,13 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features):
    spectrum_ = spectrum.copy()
    spectrum_[rank:n_features] = v
    for i in range(rank):
        if spectrum_[i] < spectrum_threshold:
            # TODO: this line is never executed
            # (off by one in _infer_dimension)
            # this break only happens when rank == n_features and
            # spectrum_[i] < spectrum_threshold, otherwise the early return
            # above catches this case.
            break
        for j in range(i + 1, len(spectrum)):
            pa += log((spectrum[i] - spectrum[j]) *
                      (1. / spectrum_[j] - 1. / spectrum_[i])) + log(n_samples)
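Again as context rather than diff content: judging from the variable names (`pu`, `pl`, `pv`, `pa`), the log-likelihood assembled into `ll` in the next hunk appears to be Minka's Laplace approximation of the log-evidence for a rank-$k$ model. A hedged sketch of its overall shape, with $\lambda_i$ the spectrum, $\hat v$ the mean of the $d-k$ discarded eigenvalues, $d$ the number of features, and $n$ the number of samples:

$$\log p(X \mid k) \;\approx\; \log p(U) \;-\; \frac{n}{2}\sum_{i=1}^{k}\log\lambda_i \;-\; \frac{n(d-k)}{2}\log\hat{v} \;-\; \frac{1}{2}\log\lvert A_Z\rvert \;-\; \frac{k}{2}\log n \;+\; \mathrm{const}$$

The double loop above accumulates $\log\lvert A_Z\rvert$, and the early return replaces the whole expression with $-\infty$ once $\hat v$ is numerically zero.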
@@ -89,15 +102,15 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features):
    return ll


def _infer_dimension_(spectrum, n_samples, n_features):
def _infer_dimension(spectrum, n_samples, n_features):
    """Infers the dimension of a dataset of shape (n_samples, n_features)

    The dataset is described by its spectrum `spectrum`.
    """
    n_spectrum = len(spectrum)
    ll = np.empty(n_spectrum)
    for rank in range(n_spectrum):
        ll[rank] = _assess_dimension_(spectrum, rank, n_samples, n_features)
        ll[rank] = _assess_dimension(spectrum, rank, n_samples, n_features)
    return ll.argmax()
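A hedged usage sketch of the renamed private helpers, assuming the post-merge signatures shown in this diff (they are internal and may change): `_infer_dimension` simply scores every candidate rank with `_assess_dimension` and keeps the argmax.

```python
import numpy as np
from sklearn.decomposition._pca import _assess_dimension, _infer_dimension

spectrum = np.array([2.5, 1.2, 0.2, 0.05])  # made-up, well-behaved spectrum
n_samples, n_features = 100, 4

ll = np.array([_assess_dimension(spectrum, k, n_samples, n_features)
               for k in range(len(spectrum))])
assert ll.argmax() == _infer_dimension(spectrum, n_samples, n_features)
```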


@@ -458,7 +471,7 @@ def _fit_full(self, X, n_components):
        # Postprocess the number of components required
        if n_components == 'mle':
            n_components = \
                _infer_dimension_(explained_variance_, n_samples, n_features)
                _infer_dimension(explained_variance_, n_samples, n_features)
        elif 0 < n_components < 1.0:
            # number of components for which the cumulated explained
            # variance percentage is superior to the desired threshold
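An end-to-end sketch of the code path this hunk touches; the near-rank-5 construction and the printed value are assumptions, not asserted by the PR:

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randn(200, 10)
X[:, 5:] *= 1e-8            # shrink half the columns: nearly rank-5 data

# n_components='mle' with the full solver now routes through the renamed
# _infer_dimension helper.
pca = PCA(n_components='mle', svd_solver='full').fit(X)
print(pca.n_components_)    # plausibly 5 for this construction
```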
65 changes: 60 additions & 5 deletions sklearn/decomposition/tests/test_pca.py
@@ -8,8 +8,8 @@
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
from sklearn.decomposition._pca import _assess_dimension_
from sklearn.decomposition._pca import _infer_dimension_
from sklearn.decomposition._pca import _assess_dimension
from sklearn.decomposition._pca import _infer_dimension

iris = datasets.load_iris()
PCA_SOLVERS = ['full', 'arpack', 'randomized', 'auto']
@@ -333,7 +333,7 @@ def test_infer_dim_1():
    pca = PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    ll = np.array([_assess_dimension_(spect, k, n, p) for k in range(p)])
    ll = np.array([_assess_dimension(spect, k, n, p) for k in range(p)])
    assert ll[1] > ll.max() - .01 * n


@@ -348,7 +348,7 @@ def test_infer_dim_2():
    pca = PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    assert _infer_dimension_(spect, n, p) > 1
    assert _infer_dimension(spect, n, p) > 1


def test_infer_dim_3():
@@ -361,7 +361,7 @@ def test_infer_dim_3():
    pca = PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    assert _infer_dimension_(spect, n, p) > 2
    assert _infer_dimension(spect, n, p) > 2


@pytest.mark.parametrize(
@@ -568,3 +568,58 @@ def test_pca_n_components_mostly_explained_variance_ratio():
    n_components = pca1.explained_variance_ratio_.cumsum()[-2]
    pca2 = PCA(n_components=n_components).fit(X, y)
    assert pca2.n_components_ == X.shape[1]


def test_infer_dim_bad_spec():
    # Test a spectrum that drops to near zero for PR #16224: every rank > 0
    # hits the small-eigenvalue early return (-np.inf), so rank 0 wins.
    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
    n_samples = 10
    n_features = 5
    ret = _infer_dimension(spectrum, n_samples, n_features)
    assert ret == 0


def test_assess_dimension_error_rank_greater_than_features():
    # Test error when tested rank is greater than the number of features
    # for PR #16224
    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
    n_samples = 10
    n_features = 4
    rank = 5
    with pytest.raises(ValueError, match="The tested rank cannot exceed "
                                         "the rank of the dataset"):
        _assess_dimension(spectrum, rank, n_samples, n_features)


def test_assess_dimension_small_eigenvalues():
    # Test that tiny eigenvalues are handled correctly when using 'mle'
    # for PR #16224
    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
    n_samples = 10
    n_features = 5
    rank = 3
    ret = _assess_dimension(spectrum, rank, n_samples, n_features)
    assert ret == -np.inf
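A quick numeric check of why this assertion holds, under the same float64 assumption as the fix (not part of the test file):

```python
import numpy as np

spectrum = np.array([1.0, 1e-30, 1e-30, 1e-30])
rank, n_features = 3, 5
v = spectrum[rank:].sum() / (n_features - rank)  # 5e-31
# Far below float64 machine epsilon, so _assess_dimension returns -np.inf.
print(v < np.finfo(spectrum.dtype).eps)          # True
```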


def test_infer_dim_mle():
    # Test n_components='mle' on a pathological dataset X whose spectrum
    # collapses to tiny eigenvalues, for PR #16224
    X, _ = datasets.make_classification(n_informative=1, n_repeated=18,
                                        n_redundant=1, n_clusters_per_class=1,
                                        random_state=42)
    pca = PCA(n_components='mle').fit(X)
    assert pca.n_components_ == 0


def test_fit_mle_too_few_samples():
    # Test that an error is raised when the number of samples is smaller
    # than the number of features during an 'mle' fit, for PR #16224
    X, _ = datasets.make_classification(n_samples=20, n_features=21,
                                        random_state=42)

    pca = PCA(n_components='mle', svd_solver='full')
    with pytest.raises(ValueError, match="n_components='mle' is only "
                                         "supported if "
                                         "n_samples >= n_features"):
        pca.fit(X)