[WIP] PCA n_components='mle' instability. Issue 4441 #4827
Changes from all commits
33bba18
de56f1a
ec65afb
2c0f5c7
6c6443e
6f3d594
ae53b60
3e73cb9
2e9f0c1
@@ -10,8 +10,8 @@
 from sklearn import datasets
 from sklearn.decomposition import PCA
 from sklearn.decomposition import RandomizedPCA
-from sklearn.decomposition.pca import _assess_dimension_
-from sklearn.decomposition.pca import _infer_dimension_
+from sklearn.decomposition.pca import _assess_dimension
+from sklearn.decomposition.pca import _infer_dimension
 
 iris = datasets.load_iris()
@@ -185,7 +185,7 @@ def test_randomized_pca_check_list():
 
 
 def test_randomized_pca_inverse():
-    # Test that RandomizedPCA is inversible on dense data
+    # Test that RandomizedPCA is invertible on dense data
     rng = np.random.RandomState(0)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
@@ -231,7 +231,7 @@ def test_infer_dim_1():
     spect = pca.explained_variance_
     ll = []
     for k in range(p):
-        ll.append(_assess_dimension_(spect, k, n, p))
+        ll.append(_assess_dimension(spect, k, n, p))
     ll = np.array(ll)
     assert_greater(ll[1], ll.max() - .01 * n)
@@ -247,7 +247,7 @@ def test_infer_dim_2():
     pca = PCA(n_components=p)
     pca.fit(X)
     spect = pca.explained_variance_
-    assert_greater(_infer_dimension_(spect, n, p), 1)
+    assert_greater(_infer_dimension(spect, n, p), 1)
 
 
 def test_infer_dim_3():
@@ -260,7 +260,46 @@ def test_infer_dim_3():
     pca = PCA(n_components=p)
     pca.fit(X)
     spect = pca.explained_variance_
-    assert_greater(_infer_dimension_(spect, n, p), 2)
+    assert_greater(_infer_dimension(spect, n, p), 2)
+
+
+def test_infer_dim_bad_spec():
+    """
+    Test that we deal with a spectrum that drops to near zero.
+    See issue https://github.com/scikit-learn/scikit-learn/issues/4441
+    """
+    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
+    n_samples = 10
+    n_features = 5
+    ret = _infer_dimension(spectrum, n_samples, n_features)
+    assert_equal(ret, 0)
+
+
+def test_assess_dimension_tiny_eigenvals():
+    """
+    Test that we deal with tiny eigenvalues appropriately when
+    inferring `n_components` with `mle`.
+    See issue https://github.com/scikit-learn/scikit-learn/issues/4441
+    """
+    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
+    n_samples = 10
+    n_features = 5
+    rank = 4
+    ret = _assess_dimension(spectrum, rank, n_samples, n_features)
+    assert_equal(ret, -np.inf)
+
+
+def test_infer_dim_mle():
+    """
+    Test that we deal with tiny eigenvalues appropriately when
+    inferring `n_components` with `mle` on a pathological `X` dataset.
+    See issue https://github.com/scikit-learn/scikit-learn/issues/4441
+    """
+    X, _ = datasets.make_classification(n_informative=1, n_repeated=18,
+                                        n_redundant=1, n_clusters_per_class=1,
+                                        random_state=20150609)
+    pca = PCA(n_components='mle').fit(X)
+    assert_equal(pca.n_components_, 0)
Review comment: That seems really odd to me. Shouldn't it be 1?

Review comment: Is this related to the off-by-one error that @alexis-mignon talked about in the original comment on 24 Mar?
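A minimal sketch of why the assertion expects 0 rather than 1 (my own illustration, not code from this PR), assuming `_infer_dimension` scores each candidate rank with `_assess_dimension` and returns the argmax of the resulting log-likelihoods: once this PR forces every candidate's score to `-inf` for a degenerate spectrum, `np.argmax` falls back to the first index.

```python
import numpy as np

# Hypothetical stand-in for the per-rank log-likelihoods that
# _infer_dimension would collect: this PR makes all of them -inf
# when the spectrum drops to tiny eigenvalues.
ll = np.full(4, -np.inf)

# np.argmax over an array that is entirely -inf returns the first
# index, 0, which matches pca.n_components_ == 0 above (rather than 1).
print(ll.argmax())  # -> 0
```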
 
 
 def test_infer_dim_by_explained_variance():
Review comment: Can `spectrum_` values ever be negative here? If not, then the check above doesn't need the abs either, right?

Review comment: Spectrum comes from `S**2` with `_, S, _ = scipy.linalg.svd(X)`, so I think it should be non-negative. [We'll never see complex numbers, will we?] I'll have a go at constructing an `X` that gets trapped by the new code paths for a full test, but I haven't been able to so far.

Review comment: PCA could be one of the algorithms that might work with complex numbers, but we don't support it AFAIK. We can either remove the abs everywhere, or add it also inside the `v` summation. Couldn't say which is better.
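A quick self-contained check of the non-negativity claim above (my own sketch, not part of this PR): `scipy.linalg.svd` returns singular values that are non-negative by construction, so a spectrum built from `S**2` cannot be negative for real-valued `X`.

```python
import numpy as np
from scipy import linalg

rng = np.random.RandomState(0)
X = rng.randn(10, 5)

# Singular values from an SVD are non-negative by definition.
_, S, _ = linalg.svd(X)
assert (S >= 0).all()

# Hence the spectrum S**2 (what explained_variance_ is built from)
# is non-negative as well, making the abs redundant for real input.
spectrum = S ** 2
assert (spectrum >= 0).all()
```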