[WIP] PCA n_components='mle' instability. Issue 4441 #4827

Closed
wants to merge 9 commits
19 changes: 13 additions & 6 deletions sklearn/decomposition/pca.py
@@ -22,10 +22,10 @@
 from ..utils.validation import check_is_fitted


-def _assess_dimension_(spectrum, rank, n_samples, n_features):
+def _assess_dimension(spectrum, rank, n_samples, n_features, rcond=1e-15):
     """Compute the likelihood of a rank ``rank`` dataset

-    The dataset is assumed to be embedded in gaussian noise of shape(n,
+    The dataset is assumed to be embedded in Gaussian noise of shape(n,
     dimf) having spectrum ``spectrum``.

     Parameters
@@ -38,6 +38,9 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features):
         Number of samples.
     n_features: int
         Number of features.
+    rcond : float
+        Cut-off for values in `spectrum`. Any values lower than this
+        will be ignored (`default=1e-15`)

     Returns
     -------
@@ -66,6 +69,8 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features):
         v = 1
     else:
         v = np.sum(spectrum[rank:]) / (n_features - rank)
+        if rcond > v:
+            return -np.inf
         pv = -np.log(v) * n_samples * (n_features - rank) / 2.

     m = n_features * rank - rank * (rank + 1.) / 2.
@@ -75,6 +80,8 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features):
     spectrum_ = spectrum.copy()
     spectrum_[rank:n_features] = v
     for i in range(rank):
+        if spectrum_[i] < rcond:
Member (Vlad Niculae):
Can spectrum_ values ever be negative here? If not, then the check above doesn't need the abs either, right?

Author:
Spectrum comes from `S**2` with `_, S, _ = scipy.linalg.svd(X)`, so I think it should be non-negative. (We'll never see complex numbers, will we?)

I'll have a go at constructing an X that gets trapped by the new code paths for a full test, but I haven't been able to so far.
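A quick way to check that claim (a sketch, not part of the patch or the thread):

```python
import numpy as np
import scipy.linalg

# The spectrum handed to _assess_dimension is built from squared singular
# values; for real-valued X these are real and non-negative by construction.
rng = np.random.RandomState(0)
X = rng.randn(10, 5)
_, S, _ = scipy.linalg.svd(X)
spectrum = S ** 2
assert (spectrum >= 0).all()
```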


Member:
> we'll never see complex numbers will we?

PCA could be one of the algorithms that might work with complex numbers, but we don't support it AFAIK.

We can either remove the abs everywhere, or also add it inside the v summation. Couldn't say which is better.

+            break
         for j in range(i + 1, len(spectrum)):
             pa += log((spectrum[i] - spectrum[j]) *
                       (1. / spectrum_[j] - 1. / spectrum_[i])) + log(n_samples)
@@ -84,15 +91,15 @@ def _assess_dimension_(spectrum, rank, n_samples, n_features):
     return ll


-def _infer_dimension_(spectrum, n_samples, n_features):
+def _infer_dimension(spectrum, n_samples, n_features):
     """Infers the dimension of a dataset of shape (n_samples, n_features)

     The dataset is described by its spectrum `spectrum`.
     """
     n_spectrum = len(spectrum)
     ll = np.empty(n_spectrum)
     for rank in range(n_spectrum):
-        ll[rank] = _assess_dimension_(spectrum, rank, n_samples, n_features)
+        ll[rank] = _assess_dimension(spectrum, rank, n_samples, n_features)
     return ll.argmax()


@@ -285,8 +292,8 @@ def _fit(self, X):
                 raise ValueError("n_components='mle' is only supported "
                                  "if n_samples >= n_features")

-            n_components = _infer_dimension_(explained_variance_,
-                                             n_samples, n_features)
+            n_components = _infer_dimension(explained_variance_,
+                                            n_samples, n_features)
         elif not 0 <= n_components <= n_features:
             raise ValueError("n_components=%r invalid for n_features=%d"
                              % (n_components, n_features))
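To see why the guard is needed, here is a minimal sketch (not part of the patch) of the failure mode from issue 4441: with near-zero trailing eigenvalues, the estimated noise variance `v` collapses and `-log(v)` blows up, so the likelihood comparison in `_infer_dimension` becomes meaningless.

```python
import numpy as np

spectrum = np.array([1.0, 1e-30, 1e-30, 1e-30])  # spectrum from the new tests
n_samples, n_features, rank = 10, 5, 1

# Noise variance estimated from the trailing eigenvalues, as in the diff:
v = spectrum[rank:].sum() / (n_features - rank)        # ~7.5e-31
pv = -np.log(v) * n_samples * (n_features - rank) / 2.
print(v, pv)  # v is far below machine precision; pv is huge (~1.4e3)

# With the rcond cut-off, such ranks are scored -inf instead, so they can
# never win the argmax in _infer_dimension:
rcond = 1e-15
print(-np.inf if v < rcond else pv)  # -inf
```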
51 changes: 45 additions & 6 deletions sklearn/decomposition/tests/test_pca.py
@@ -10,8 +10,8 @@
 from sklearn import datasets
 from sklearn.decomposition import PCA
 from sklearn.decomposition import RandomizedPCA
-from sklearn.decomposition.pca import _assess_dimension_
-from sklearn.decomposition.pca import _infer_dimension_
+from sklearn.decomposition.pca import _assess_dimension
+from sklearn.decomposition.pca import _infer_dimension

 iris = datasets.load_iris()

@@ -185,7 +185,7 @@ def test_randomized_pca_check_list():


 def test_randomized_pca_inverse():
-    # Test that RandomizedPCA is inversible on dense data
+    # Test that RandomizedPCA is invertible on dense data
     rng = np.random.RandomState(0)
     n, p = 50, 3
     X = rng.randn(n, p)  # spherical data
@@ -231,7 +231,7 @@ def test_infer_dim_1():
     spect = pca.explained_variance_
     ll = []
     for k in range(p):
-        ll.append(_assess_dimension_(spect, k, n, p))
+        ll.append(_assess_dimension(spect, k, n, p))
     ll = np.array(ll)
     assert_greater(ll[1], ll.max() - .01 * n)

@@ -247,7 +247,7 @@ def test_infer_dim_2():
     pca = PCA(n_components=p)
     pca.fit(X)
     spect = pca.explained_variance_
-    assert_greater(_infer_dimension_(spect, n, p), 1)
+    assert_greater(_infer_dimension(spect, n, p), 1)


 def test_infer_dim_3():
@@ -260,7 +260,46 @@ def test_infer_dim_3():
     pca = PCA(n_components=p)
     pca.fit(X)
     spect = pca.explained_variance_
-    assert_greater(_infer_dimension_(spect, n, p), 2)
+    assert_greater(_infer_dimension(spect, n, p), 2)
+
+
+def test_infer_dim_bad_spec():
+    """
+    Test that we deal with a spectrum that drops to near zero
+    see issue https://github.com/scikit-learn/scikit-learn/issues/4441
+    """
+    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
+    n_samples = 10
+    n_features = 5
+    ret = _infer_dimension(spectrum, n_samples, n_features)
+    assert_equal(ret, 0)
+
+
+def test_assess_dimension_tiny_eigenvals():
+    """
+    Test that we deal with tiny eigenvalues appropriately when
+    `mle` inferring `n_components`
+    see issue https://github.com/scikit-learn/scikit-learn/issues/4441
+    """
+    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
+    n_samples = 10
+    n_features = 5
+    rank = 4
+    ret = _assess_dimension(spectrum, rank, n_samples, n_features)
+    assert_equal(ret, -np.inf)
+
+
+def test_infer_dim_mle():
+    """
+    Test that we deal with tiny eigenvalues appropriately when
+    `mle` inferring `n_components` with a pathological `X` dataset
+    see issue https://github.com/scikit-learn/scikit-learn/issues/4441
+    """
+    X, _ = datasets.make_classification(n_informative=1, n_repeated=18,
+                                        n_redundant=1, n_clusters_per_class=1,
+                                        random_state=20150609)
+    pca = PCA(n_components='mle').fit(X)
+    assert_equal(pca.n_components_, 0)
Member:
That seems really odd to me. Shouldn't it be 1?

Author:
Is this related to the off-by-one error that @alexis-mignon talked about in the original comment on 24 Mar?
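For context on that question (a sketch, not from the thread): `_infer_dimension` returns `ll.argmax()` over candidate ranks starting at 0, so a result of 0 means the rank-0 model scored best and `pca.n_components_` becomes 0; if ranks were meant to start at 1, every result would shift by one, which is the off-by-one suspicion above.

```python
import numpy as np

# Hypothetical log-likelihoods ll[rank] for rank = 0..3; with the new
# rcond guard the degenerate high ranks score -inf:
ll = np.array([-10.0, -50.0, -np.inf, -np.inf])
print(int(ll.argmax()))  # 0 -> n_components_ == 0, not 1
```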



def test_infer_dim_by_explained_variance():