[MRG] Remove preprocessing the data for RCA #194
Changes from all commits: e21e856, 3882b51, b6c216e, 32b86c0, d6cdb8f, 2497366, ea68d0d, a78767e, 27985ec, ddee294, 8021089, a05782f, 6d6a38a
```diff
@@ -17,6 +17,7 @@
 from six.moves import xrange
 from sklearn import decomposition
 from sklearn.base import TransformerMixin
+from sklearn.exceptions import ChangedBehaviorWarning

 from ._util import _check_n_components
 from .base_metric import MahalanobisMixin
```
```diff
@@ -48,7 +49,7 @@ class RCA(MahalanobisMixin, TransformerMixin):
   """

   def __init__(self, n_components=None, num_dims='deprecated',
-               pca_comps=None, preprocessor=None):
+               pca_comps='deprecated', preprocessor=None):
     """Initialize the learner.

     Parameters
```
```diff
@@ -62,12 +63,10 @@ def __init__(self, n_components=None, num_dims='deprecated',
         `num_dims` was deprecated in version 0.5.0 and will
         be removed in 0.6.0. Use `n_components` instead.

-    pca_comps : int, float, None or string
-      Number of components to keep during PCA preprocessing.
-      If None (default), does not perform PCA.
-      If ``0 < pca_comps < 1``, it is used as
-      the minimum explained variance ratio.
-      See sklearn.decomposition.PCA for more details.
+    pca_comps : Not used
+      .. deprecated:: 0.5.0
+        `pca_comps` was deprecated in version 0.5.0 and will
+        be removed in 0.6.0.

     preprocessor : array-like, shape=(n_samples, n_features) or callable
       The preprocessor to call to get tuples from indices. If array-like,
```
```diff
@@ -83,8 +82,9 @@ def _check_dimension(self, rank, X):
     if rank < d:
       warnings.warn('The inner covariance matrix is not invertible, '
                     'so the transformation matrix may contain Nan values. '
-                    'You should adjust pca_comps to remove noise and '
-                    'redundant information.')
+                    'You should reduce the dimensionality of your input, '
+                    'for instance using `sklearn.decomposition.PCA` as a '
+                    'preprocessing step.')

     dim = _check_n_components(d, self.n_components)
     return dim
```
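The updated warning now points users to external dimensionality reduction instead of `pca_comps`. A minimal sketch of what that could look like with the chunk-based `RCA` API; the data, chunk assignments, and component counts are illustrative, not part of this PR:

```python
import numpy as np
from sklearn.decomposition import PCA
from metric_learn import RCA

rng = np.random.RandomState(0)
X = rng.randn(50, 30)                 # high-dimensional illustrative data
chunks = np.repeat(np.arange(10), 5)  # 10 chunks of 5 points each

# Reduce dimensionality first so the inner covariance matrix stays
# invertible, then fit RCA on the reduced data.
X_reduced = PCA(n_components=10).fit_transform(X)
rca = RCA(n_components=5).fit(X_reduced, chunks)
X_rca = rca.transform(X_reduced)
```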
```diff
@@ -105,25 +105,33 @@ def fit(self, X, chunks):
           ' It has been deprecated in version 0.5.0 and will be'
           ' removed in 0.6.0. Use "n_components" instead',
           DeprecationWarning)

+    if self.pca_comps != 'deprecated':
+      warnings.warn(
+          '"pca_comps" parameter is not used. '
+          'It has been deprecated in version 0.5.0 and will be '
+          'removed in 0.6.0. RCA will not do PCA preprocessing anymore. If '
+          'you still want to do it, you could use '
+          '`sklearn.decomposition.PCA` and an `sklearn.pipeline.Pipeline`.',
+          DeprecationWarning)
+
     X, chunks = self._prepare_inputs(X, chunks, ensure_min_samples=2)

-    # PCA projection to remove noise and redundant information.
-    if self.pca_comps is not None:
-      pca = decomposition.PCA(n_components=self.pca_comps)
-      X_t = pca.fit_transform(X)
-      M_pca = pca.components_
-    else:
-      X_t = X - X.mean(axis=0)
-      M_pca = None
+    warnings.warn(
+        "RCA will no longer center the data before training. If you want "
+        "to do some preprocessing, you should do it manually (you can also "
+        "use an `sklearn.pipeline.Pipeline` for instance). This warning "
+        "will disappear in version 0.6.0.", ChangedBehaviorWarning)

     chunks = np.asanyarray(chunks, dtype=int)
-    chunk_mask, chunked_data = _chunk_mean_centering(X_t, chunks)
+    chunk_mask, chunked_data = _chunk_mean_centering(X, chunks)

     inner_cov = np.atleast_2d(np.cov(chunked_data, rowvar=0, bias=1))
-    dim = self._check_dimension(np.linalg.matrix_rank(inner_cov), X_t)
+    dim = self._check_dimension(np.linalg.matrix_rank(inner_cov), X)

     # Fisher Linear Discriminant projection
-    if dim < X_t.shape[1]:
-      total_cov = np.cov(X_t[chunk_mask], rowvar=0)
+    if dim < X.shape[1]:
+      total_cov = np.cov(X[chunk_mask], rowvar=0)
       tmp = np.linalg.lstsq(total_cov, inner_cov)[0]
       vals, vecs = np.linalg.eig(tmp)
       inds = np.argsort(vals)[:dim]
```

Review thread on the removed centering step (`X_t = X - X.mean(axis=0)`):

Comment: why is this centering step gone?

Reply: I guess because we should remove any pre-processing step, but I agree I didn't talk about it at all; maybe we should keep the centering step.

Reply: Fair enough (I double-checked and this centering is not part of standard RCA). Finally, have you checked the influence of removing the centering step on the examples?
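The deprecation message suggests `sklearn.decomposition.PCA` inside an `sklearn.pipeline.Pipeline`. A minimal sketch of that replacement, assuming the supervised wrapper `RCA_Supervised` exposed by metric-learn and purely illustrative data and parameter values; note that `PCA` also centers the data, so this covers the removed centering step as well:

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from metric_learn import RCA_Supervised

rng = np.random.RandomState(42)
X = rng.randn(100, 20)        # illustrative data
y = rng.randint(0, 5, 100)    # illustrative class labels

# PCA replaces the removed internal preprocessing: it centers the data and
# drops noisy/redundant dimensions before RCA is fit on the reduced input.
pipe = Pipeline([('pca', PCA(n_components=5)),
                 ('rca', RCA_Supervised(num_chunks=30, chunk_size=2))])
pipe.fit(X, y)
X_rca = pipe.transform(X)
```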
```diff
@@ -133,9 +141,6 @@ def fit(self, X, chunks):
     else:
       self.transformer_ = _inv_sqrtm(inner_cov).T

-    if M_pca is not None:
-      self.transformer_ = np.atleast_2d(self.transformer_.dot(M_pca))
-
     return self
```
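For reference, the removed `M_pca` lines folded the PCA projection into the learned transformation so that `transformer_` could be applied to raw inputs. Anyone running PCA externally can compose the matrices the same way; a sketch reusing the hypothetical `pipe` from the previous example:

```python
# Fold the fitted PCA into the learned RCA transformation, as the removed
# `transformer_.dot(M_pca)` line did internally.
pca = pipe.named_steps['pca']
rca = pipe.named_steps['rca']
L = rca.transformer_.dot(pca.components_)  # shape (dim, n_features)

# L acts on raw features. The PCA mean offset is dropped, which is harmless
# for a Mahalanobis metric: distances depend only on differences x1 - x2,
# where the constant centering term cancels.
```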
```diff
@@ -155,7 +160,7 @@ class RCA_Supervised(RCA):
   """

   def __init__(self, num_dims='deprecated', n_components=None,
-               pca_comps=None, num_chunks=100, chunk_size=2,
+               pca_comps='deprecated', num_chunks=100, chunk_size=2,
                preprocessor=None):
     """Initialize the supervised version of `RCA`.
```
Review comment: Note that this code was also providing a PCA initialization. For now we remove it, but I think I'll do the PR about initialization before merging this PR into master; we can then merge that into this PR to keep the same possibility of initializing with PCA.