[MRG+1] Address decomposition.PCA mle option problem #16224

Merged: 21 commits, Mar 4, 2020
4 changes: 4 additions & 0 deletions doc/whats_new/v0.23.rst
@@ -133,6 +133,10 @@ Changelog
- |Fix| :class:`decomposition.PCA` with a float `n_components` parameter will
  exclusively choose the components that explain the variance greater than
  `n_components`. :pr:`15669` by :user:`Krishna Chaitanya <krishnachaitanya9>`
- |Fix| :func:`decomposition._pca._assess_dimension` now correctly handles small
  eigenvalues. :pr:`4441` by :user:`Lisa Schwetlick <lschwetlick>`,
  :user:`Gelavizh Ahmadi <gelavizh1>`, and
  :user:`Marija Vlajic Wheeler <marijavlajic>`.

- |Enhancement| :class:`decomposition.NMF` and
  :func:`decomposition.non_negative_factorization` now preserve float32 dtype.
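A hedged sketch of the float `n_components` behavior from the first entry above; the data, column scaling, and 0.9 threshold are illustrative assumptions, not from the PR:

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
# Columns with sharply decreasing scale, so the variance ranking is clear.
X = rng.randn(100, 5) * np.array([5.0, 3.0, 1.0, 0.1, 0.1])

pca = PCA(n_components=0.9, svd_solver='full').fit(X)
# After the fix, PCA keeps the smallest number of components whose
# cumulative explained-variance ratio strictly exceeds 0.9.
print(pca.n_components_)
print(pca.explained_variance_ratio_.cumsum())
```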
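And a minimal sketch of the NMF entry just above, assuming the 0.23 behavior it describes (shapes and seed are arbitrary):

```python
import numpy as np
from sklearn.decomposition import NMF

# Non-negative float32 input; NMF requires non-negative data.
X = np.abs(np.random.RandomState(0).randn(6, 5)).astype(np.float32)
W = NMF(n_components=2, init='random', random_state=0).fit_transform(X)
# Previously the result was upcast to float64; the enhancement keeps
# the input dtype.
print(W.dtype)
```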
21 changes: 17 additions & 4 deletions sklearn/decomposition/_pca.py
@@ -27,7 +27,7 @@
from ..utils.validation import check_is_fitted


def _assess_dimension_(spectrum, rank, n_samples, n_features):
def _assess_dimension(spectrum, rank, n_samples, n_features):
"""Compute the likelihood of a rank ``rank`` dataset.

The dataset is assumed to be embedded in gaussian noise of shape(n,
@@ -58,6 +58,8 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features):
        raise ValueError("The tested rank cannot exceed the rank of the"
                         " dataset")

    spectrum_threshold = np.finfo(type(spectrum[0])).eps

    pu = -rank * log(2.)
    for i in range(rank):
        pu += (gammaln((n_features - i) / 2.) -
Expand All @@ -67,10 +69,14 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features):
    pl = -pl * n_samples / 2.

    if rank == n_features:
        # TODO: this branch is never executed because _infer_dimension's
        # for loop is off by one
        pv = 0
        v = 1
    else:
        v = np.sum(spectrum[rank:]) / (n_features - rank)
        if spectrum_threshold > v:
            return -np.inf
        pv = -np.log(v) * n_samples * (n_features - rank) / 2.

    m = n_features * rank - rank * (rank + 1.) / 2.
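For context (not part of the diff), a minimal sketch of the failure mode the early return above guards against, assuming a float64 spectrum; the toy numbers are made up:

```python
import numpy as np

# Toy spectrum whose discarded tail is exactly zero.
spectrum = np.array([1.0, 0.5, 0.0, 0.0])
rank, n_features = 2, 4

v = spectrum[rank:].sum() / (n_features - rank)   # 0.0
with np.errstate(divide='ignore'):
    print(np.log(v))                              # -inf poisons the pv term

# The fix treats anything below machine epsilon as zero and scores the
# candidate rank -np.inf instead of producing non-finite terms.
print(v < np.finfo(spectrum.dtype).eps)           # True
```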
@@ -80,6 +86,13 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features):
    spectrum_ = spectrum.copy()
    spectrum_[rank:n_features] = v
    for i in range(rank):
        if spectrum_[i] < spectrum_threshold:
            # TODO: this line is never executed
            # (off by one in _infer_dimension)
            # this break only happens when rank == n_features and
            # spectrum_[i] < spectrum_threshold, otherwise the early return
            # above catches this case.
            break
        for j in range(i + 1, len(spectrum)):
            pa += log((spectrum[i] - spectrum[j]) *
                      (1. / spectrum_[j] - 1. / spectrum_[i])) + log(n_samples)
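Again as context rather than diff content: judging from the variable names (`pu`, `pl`, `pv`, `pa`), the log-likelihood assembled into `ll` in the next hunk appears to be Minka's Laplace approximation of the log-evidence for a rank-$k$ model. A hedged sketch of its overall shape, with $\lambda_i$ the spectrum, $\hat v$ the mean of the $d-k$ discarded eigenvalues, $d$ the number of features, and $n$ the number of samples:

$$\log p(X \mid k) \;\approx\; \log p(U) \;-\; \frac{n}{2}\sum_{i=1}^{k}\log\lambda_i \;-\; \frac{n(d-k)}{2}\log\hat{v} \;-\; \frac{1}{2}\log\lvert A_Z\rvert \;-\; \frac{k}{2}\log n \;+\; \mathrm{const}$$

The double loop above accumulates $\log\lvert A_Z\rvert$, and the early return replaces the whole expression with $-\infty$ once $\hat v$ is numerically zero.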
@@ -89,15 +102,15 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features):
    return ll


def _infer_dimension_(spectrum, n_samples, n_features):
def _infer_dimension(spectrum, n_samples, n_features):
    """Infers the dimension of a dataset of shape (n_samples, n_features)

    The dataset is described by its spectrum `spectrum`.
    """
    n_spectrum = len(spectrum)
    ll = np.empty(n_spectrum)
    for rank in range(n_spectrum):
        ll[rank] = _assess_dimension_(spectrum, rank, n_samples, n_features)
        ll[rank] = _assess_dimension(spectrum, rank, n_samples, n_features)
    return ll.argmax()
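A hedged usage sketch of the renamed private helpers, assuming the post-merge signatures shown in this diff (they are internal and may change): `_infer_dimension` simply scores every candidate rank with `_assess_dimension` and keeps the argmax.

```python
import numpy as np
from sklearn.decomposition._pca import _assess_dimension, _infer_dimension

spectrum = np.array([2.5, 1.2, 0.2, 0.05])  # made-up, well-behaved spectrum
n_samples, n_features = 100, 4

ll = np.array([_assess_dimension(spectrum, k, n_samples, n_features)
               for k in range(len(spectrum))])
assert ll.argmax() == _infer_dimension(spectrum, n_samples, n_features)
```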


@@ -458,7 +471,7 @@ def _fit_full(self, X, n_components):
        # Postprocess the number of components required
        if n_components == 'mle':
            n_components = \
                _infer_dimension_(explained_variance_, n_samples, n_features)
                _infer_dimension(explained_variance_, n_samples, n_features)
        elif 0 < n_components < 1.0:
            # number of components for which the cumulated explained
            # variance percentage is superior to the desired threshold
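An end-to-end sketch of the code path this hunk touches; the near-rank-5 construction and the printed value are assumptions, not asserted by the PR:

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randn(200, 10)
X[:, 5:] *= 1e-8            # shrink half the columns: nearly rank-5 data

# n_components='mle' with the full solver now routes through the renamed
# _infer_dimension helper.
pca = PCA(n_components='mle', svd_solver='full').fit(X)
print(pca.n_components_)    # plausibly 5 for this construction
```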
65 changes: 60 additions & 5 deletions sklearn/decomposition/tests/test_pca.py
@@ -8,8 +8,8 @@
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
from sklearn.decomposition._pca import _assess_dimension_
from sklearn.decomposition._pca import _infer_dimension_
from sklearn.decomposition._pca import _assess_dimension
from sklearn.decomposition._pca import _infer_dimension

iris = datasets.load_iris()
PCA_SOLVERS = ['full', 'arpack', 'randomized', 'auto']
@@ -333,7 +333,7 @@ def test_infer_dim_1():
    pca = PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    ll = np.array([_assess_dimension_(spect, k, n, p) for k in range(p)])
    ll = np.array([_assess_dimension(spect, k, n, p) for k in range(p)])
    assert ll[1] > ll.max() - .01 * n


@@ -348,7 +348,7 @@ def test_infer_dim_2():
    pca = PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    assert _infer_dimension_(spect, n, p) > 1
    assert _infer_dimension(spect, n, p) > 1


def test_infer_dim_3():
@@ -361,7 +361,7 @@ def test_infer_dim_3():
    pca = PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    assert _infer_dimension_(spect, n, p) > 2
    assert _infer_dimension(spect, n, p) > 2


@pytest.mark.parametrize(
@@ -568,3 +568,58 @@ def test_pca_n_components_mostly_explained_variance_ratio():
    n_components = pca1.explained_variance_ratio_.cumsum()[-2]
    pca2 = PCA(n_components=n_components).fit(X, y)
    assert pca2.n_components_ == X.shape[1]


def test_infer_dim_bad_spec():
    # Test a spectrum that drops to near zero for PR #16224: every rank > 0
    # hits the small-eigenvalue early return (-np.inf), so rank 0 wins.
    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
    n_samples = 10
    n_features = 5
    ret = _infer_dimension(spectrum, n_samples, n_features)
    assert ret == 0


def test_assess_dimension_error_rank_greater_than_features():
    # Test error when tested rank is greater than the number of features
    # for PR #16224
    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
    n_samples = 10
    n_features = 4
    rank = 5
    with pytest.raises(ValueError, match="The tested rank cannot exceed "
                                         "the rank of the dataset"):
        _assess_dimension(spectrum, rank, n_samples, n_features)


def test_assess_dimension_small_eigenvalues():
    # Test that tiny eigenvalues are handled correctly when using 'mle'
    # for PR #16224
    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
    n_samples = 10
    n_features = 5
    rank = 3
    ret = _assess_dimension(spectrum, rank, n_samples, n_features)
    assert ret == -np.inf
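A quick numeric check of why this assertion holds, under the same float64 assumption as the fix (not part of the test file):

```python
import numpy as np

spectrum = np.array([1.0, 1e-30, 1e-30, 1e-30])
rank, n_features = 3, 5
v = spectrum[rank:].sum() / (n_features - rank)  # 5e-31
# Far below float64 machine epsilon, so _assess_dimension returns -np.inf.
print(v < np.finfo(spectrum.dtype).eps)          # True
```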


def test_infer_dim_mle():
    # Test n_components='mle' on a pathological dataset X whose spectrum
    # collapses to tiny eigenvalues, for PR #16224
    X, _ = datasets.make_classification(n_informative=1, n_repeated=18,
                                        n_redundant=1, n_clusters_per_class=1,
                                        random_state=42)
    pca = PCA(n_components='mle').fit(X)
    assert pca.n_components_ == 0


def test_fit_mle_too_few_samples():
    # Test that an error is raised when the number of samples is smaller
    # than the number of features during an 'mle' fit, for PR #16224
    X, _ = datasets.make_classification(n_samples=20, n_features=21,
                                        random_state=42)

    pca = PCA(n_components='mle', svd_solver='full')
    with pytest.raises(ValueError, match="n_components='mle' is only "
                                         "supported if "
                                         "n_samples >= n_features"):
        pca.fit(X)