diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
index 96702dae01235..0e846e787c18c 100644
--- a/doc/whats_new/v0.23.rst
+++ b/doc/whats_new/v0.23.rst
@@ -133,6 +133,10 @@ Changelog
 - |Fix| :class:`decomposition.PCA` with a float `n_components` parameter, will
   exclusively choose the components that explain the variance greater than
   `n_components`. :pr:`15669` by :user:`Krishna Chaitanya`
+- |Fix| :func:`decomposition._pca._assess_dimension` now correctly handles
+  small eigenvalues. :pr:`16224` by :user:`Lisa Schwetlick`,
+  :user:`Gelavizh Ahmadi` and
+  :user:`Marija Vlajic Wheeler`.
 
 - |Enhancement| :class:`decomposition.NMF` and
   :func:`decomposition.non_negative_factorization` now preserves float32 dtype.
diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
index f64a9752896b3..7a0140b01fc9b 100644
--- a/sklearn/decomposition/_pca.py
+++ b/sklearn/decomposition/_pca.py
@@ -27,7 +27,7 @@
 from ..utils.validation import check_is_fitted
 
 
-def _assess_dimension_(spectrum, rank, n_samples, n_features):
+def _assess_dimension(spectrum, rank, n_samples, n_features):
     """Compute the likelihood of a rank ``rank`` dataset.
 
     The dataset is assumed to be embedded in gaussian noise of shape(n,
@@ -58,6 +58,8 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features):
         raise ValueError("The tested rank cannot exceed the rank of the"
                          " dataset")
 
+    spectrum_threshold = np.finfo(type(spectrum[0])).eps
+
     pu = -rank * log(2.)
     for i in range(rank):
         pu += (gammaln((n_features - i) / 2.) -
@@ -67,10 +69,14 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features):
     pl = -pl * n_samples / 2.
 
     if rank == n_features:
+        # TODO: this line is never executed because _infer_dimension's
+        # for loop is off by one
         pv = 0
         v = 1
     else:
         v = np.sum(spectrum[rank:]) / (n_features - rank)
+        if spectrum_threshold > v:
+            return -np.inf
         pv = -np.log(v) * n_samples * (n_features - rank) / 2.
 
     m = n_features * rank - rank * (rank + 1.) / 2.
@@ -80,6 +86,13 @@
     spectrum_ = spectrum.copy()
     spectrum_[rank:n_features] = v
     for i in range(rank):
+        if spectrum_[i] < spectrum_threshold:
+            # TODO: this line is never executed
+            # (off by one in _infer_dimension)
+            # this break only happens when rank == n_features and
+            # spectrum_[i] < spectrum_threshold, otherwise the early return
+            # above catches this case.
+            break
         for j in range(i + 1, len(spectrum)):
             pa += log((spectrum[i] - spectrum[j]) *
                       (1. / spectrum_[j] - 1. / spectrum_[i])) + log(n_samples)
@@ -89,7 +102,7 @@
     return ll
 
 
-def _infer_dimension_(spectrum, n_samples, n_features):
+def _infer_dimension(spectrum, n_samples, n_features):
     """Infers the dimension of a dataset of shape (n_samples, n_features)
 
     The dataset is described by its spectrum `spectrum`.
@@ -97,7 +110,7 @@ def _infer_dimension_(spectrum, n_samples, n_features):
     n_spectrum = len(spectrum)
     ll = np.empty(n_spectrum)
     for rank in range(n_spectrum):
-        ll[rank] = _assess_dimension_(spectrum, rank, n_samples, n_features)
+        ll[rank] = _assess_dimension(spectrum, rank, n_samples, n_features)
 
     return ll.argmax()
 
@@ -458,7 +471,7 @@ def _fit_full(self, X, n_components):
         # Postprocess the number of components required
         if n_components == 'mle':
             n_components = \
-                _infer_dimension_(explained_variance_, n_samples, n_features)
+                _infer_dimension(explained_variance_, n_samples, n_features)
         elif 0 < n_components < 1.0:
             # number of components for which the cumulated explained
             # variance percentage is superior to the desired threshold
diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index b94d2d5be7e0f..438478a55f6fa 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -8,8 +8,8 @@
 from sklearn import datasets
 from sklearn.decomposition import PCA
 from sklearn.datasets import load_iris
-from sklearn.decomposition._pca import _assess_dimension_
-from sklearn.decomposition._pca import _infer_dimension_
+from sklearn.decomposition._pca import _assess_dimension
+from sklearn.decomposition._pca import _infer_dimension
 
 iris = datasets.load_iris()
 PCA_SOLVERS = ['full', 'arpack', 'randomized', 'auto']
@@ -333,7 +333,7 @@ def test_infer_dim_1():
     pca = PCA(n_components=p, svd_solver='full')
     pca.fit(X)
     spect = pca.explained_variance_
-    ll = np.array([_assess_dimension_(spect, k, n, p) for k in range(p)])
+    ll = np.array([_assess_dimension(spect, k, n, p) for k in range(p)])
     assert ll[1] > ll.max() - .01 * n
 
 
@@ -348,7 +348,7 @@ def test_infer_dim_2():
     pca = PCA(n_components=p, svd_solver='full')
     pca.fit(X)
     spect = pca.explained_variance_
-    assert _infer_dimension_(spect, n, p) > 1
+    assert _infer_dimension(spect, n, p) > 1
 
 
 def test_infer_dim_3():
@@ -361,7 +361,7 @@ def test_infer_dim_3():
     pca = PCA(n_components=p, svd_solver='full')
     pca.fit(X)
     spect = pca.explained_variance_
-    assert _infer_dimension_(spect, n, p) > 2
+    assert _infer_dimension(spect, n, p) > 2
 
 
 @pytest.mark.parametrize(
@@ -568,3 +568,58 @@ def test_pca_n_components_mostly_explained_variance_ratio():
     n_components = pca1.explained_variance_ratio_.cumsum()[-2]
     pca2 = PCA(n_components=n_components).fit(X, y)
     assert pca2.n_components_ == X.shape[1]
+
+
+def test_infer_dim_bad_spec():
+    # Test a spectrum that drops to near zero for PR #16224
+    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
+    n_samples = 10
+    n_features = 5
+    ret = _infer_dimension(spectrum, n_samples, n_features)
+    assert ret == 0
+
+
+def test_assess_dimension_error_rank_greater_than_features():
+    # Test error when tested rank is greater than the number of features
+    # for PR #16224
+    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
+    n_samples = 10
+    n_features = 4
+    rank = 5
+    with pytest.raises(ValueError, match="The tested rank cannot exceed "
+                                         "the rank of the dataset"):
+        _assess_dimension(spectrum, rank, n_samples, n_features)
+
+
+def test_assess_dimension_small_eigenvalues():
+    # Test that tiny eigenvalues are handled appropriately when using 'mle'
+    # for PR #16224
+    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
+    n_samples = 10
+    n_features = 5
+    rank = 3
+    ret = _assess_dimension(spectrum, rank, n_samples, n_features)
+    assert ret == -np.inf
+
+
+def test_infer_dim_mle():
+    # Test small eigenvalues when using 'mle' with a pathological 'X' dataset
+    # for PR #16224
+    X, _ = datasets.make_classification(n_informative=1, n_repeated=18,
+                                        n_redundant=1, n_clusters_per_class=1,
+                                        random_state=42)
+    pca = PCA(n_components='mle').fit(X)
+    assert pca.n_components_ == 0
+
+
+def test_fit_mle_too_few_samples():
+    # Tests that an error is raised when the number of samples is smaller
+    # than the number of features during an mle fit for PR #16224
+    X, _ = datasets.make_classification(n_samples=20, n_features=21,
+                                        random_state=42)
+
+    pca = PCA(n_components='mle', svd_solver='full')
+    with pytest.raises(ValueError, match="n_components='mle' is only "
+                                         "supported if "
+                                         "n_samples >= n_features"):
+        pca.fit(X)
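
For reviewers who want to see the numerical issue the `spectrum_threshold` guard addresses, here is a minimal standalone NumPy sketch. It reuses the toy spectrum from the new tests and mirrors the variable names in `_assess_dimension`, but it is illustrative only and not part of the patch:

```python
import numpy as np

# Toy spectrum from the new tests: one genuine component followed by
# numerically-zero eigenvalues.
spectrum = np.array([1.0, 1e-30, 1e-30, 1e-30])
n_features = 5
rank = 3

# Tail variance v, computed exactly as in _assess_dimension.
v = np.sum(spectrum[rank:]) / (n_features - rank)
spectrum_threshold = np.finfo(type(spectrum[0])).eps

print(v)                       # 5e-31
print(spectrum_threshold > v)  # True -> patched code returns -np.inf

# Without the early return, the log-likelihood would mix terms like:
print(-np.log(v))         # ~69.8, scaled by n_samples * (n_features - rank) / 2
print(1.0 / spectrum[1])  # 1e+30, fed into the log inside the pa double loop
```

With the guard in place, any rank whose tail variance falls below machine epsilon scores `-np.inf`, so `ll.argmax()` in `_infer_dimension` can never select it; for this spectrum the inferred dimension is 0, which is exactly what `test_infer_dim_bad_spec` asserts.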