Skip to content

[MRG] Remove preprocessing the data for RCA #194

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
55 changes: 30 additions & 25 deletions metric_learn/rca.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from six.moves import xrange
from sklearn import decomposition
from sklearn.base import TransformerMixin
from sklearn.exceptions import ChangedBehaviorWarning

from ._util import _check_n_components
from .base_metric import MahalanobisMixin
Expand Down Expand Up @@ -48,7 +49,7 @@ class RCA(MahalanobisMixin, TransformerMixin):
"""

def __init__(self, n_components=None, num_dims='deprecated',
pca_comps=None, preprocessor=None):
pca_comps='deprecated', preprocessor=None):
"""Initialize the learner.

Parameters
Expand All @@ -62,12 +63,10 @@ def __init__(self, n_components=None, num_dims='deprecated',
`num_dims` was deprecated in version 0.5.0 and will
be removed in 0.6.0. Use `n_components` instead.

pca_comps : int, float, None or string
Number of components to keep during PCA preprocessing.
If None (default), does not perform PCA.
If ``0 < pca_comps < 1``, it is used as
the minimum explained variance ratio.
See sklearn.decomposition.PCA for more details.
pca_comps : Not used
.. deprecated:: 0.5.0
`pca_comps` was deprecated in version 0.5.0 and will
be removed in 0.6.0.

preprocessor : array-like, shape=(n_samples, n_features) or callable
The preprocessor to call to get tuples from indices. If array-like,
Expand All @@ -83,8 +82,9 @@ def _check_dimension(self, rank, X):
if rank < d:
warnings.warn('The inner covariance matrix is not invertible, '
'so the transformation matrix may contain Nan values. '
'You should adjust pca_comps to remove noise and '
'redundant information.')
'You should reduce the dimensionality of your input,'
'for instance using `sklearn.decomposition.PCA` as a '
'preprocessing step.')

dim = _check_n_components(d, self.n_components)
return dim
Expand All @@ -105,25 +105,33 @@ def fit(self, X, chunks):
' It has been deprecated in version 0.5.0 and will be'
' removed in 0.6.0. Use "n_components" instead',
DeprecationWarning)

if self.pca_comps != 'deprecated':
warnings.warn(
'"pca_comps" parameter is not used. '
'It has been deprecated in version 0.5.0 and will be'
'removed in 0.6.0. RCA will not do PCA preprocessing anymore. If '
'you still want to do it, you could use '
'`sklearn.decomposition.PCA` and an `sklearn.pipeline.Pipeline`.',
DeprecationWarning)

X, chunks = self._prepare_inputs(X, chunks, ensure_min_samples=2)

# PCA projection to remove noise and redundant information.
if self.pca_comps is not None:
pca = decomposition.PCA(n_components=self.pca_comps)
X_t = pca.fit_transform(X)
M_pca = pca.components_
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that this code was giving a PCA initialization at the same time, so for now we'll remove it, but I think I'll do the PR about initialization before merging this PR into master, and then we can merge it into this PR to keep the same possibility of initialization with PCA

else:
X_t = X - X.mean(axis=0)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this centering step gone?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess because we should remove any pre-processing step, but I agree I didn't talk about it at all, maybe we should keep the ChangedBehaviorWarning message below, but rather replace "no longer trained on a preprocessed version" by "no longer trained on centered data by default", and encourage to use a "StandardScaler" if needed ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair enough (I double-checked and this centering is not part of standard RCA)
Maybe keep ChangedBehaviorWarning but change to "no longer center the data before training RCA" (no need to mention scaler I think)
And in the deprecation warning, add the fact that PCA preprocessing should now be done by the user

Finally, have you checked the influence of removing the centering step on the examples?

M_pca = None
warnings.warn(
"RCA will no longer center the data before training. If you want "
"to do some preprocessing, you should do it manually (you can also "
"use an `sklearn.pipeline.Pipeline` for instance). This warning "
"will disappear in version 0.6.0.", ChangedBehaviorWarning)

chunk_mask, chunked_data = _chunk_mean_centering(X_t, chunks)
chunks = np.asanyarray(chunks, dtype=int)
chunk_mask, chunked_data = _chunk_mean_centering(X, chunks)

inner_cov = np.atleast_2d(np.cov(chunked_data, rowvar=0, bias=1))
dim = self._check_dimension(np.linalg.matrix_rank(inner_cov), X_t)
dim = self._check_dimension(np.linalg.matrix_rank(inner_cov), X)

# Fisher Linear Discriminant projection
if dim < X_t.shape[1]:
total_cov = np.cov(X_t[chunk_mask], rowvar=0)
if dim < X.shape[1]:
total_cov = np.cov(X[chunk_mask], rowvar=0)
tmp = np.linalg.lstsq(total_cov, inner_cov)[0]
vals, vecs = np.linalg.eig(tmp)
inds = np.argsort(vals)[:dim]
Expand All @@ -133,9 +141,6 @@ def fit(self, X, chunks):
else:
self.transformer_ = _inv_sqrtm(inner_cov).T

if M_pca is not None:
self.transformer_ = np.atleast_2d(self.transformer_.dot(M_pca))

return self


Expand All @@ -155,7 +160,7 @@ class RCA_Supervised(RCA):
"""

def __init__(self, num_dims='deprecated', n_components=None,
pca_comps=None, num_chunks=100, chunk_size=2,
pca_comps='deprecated', num_chunks=100, chunk_size=2,
preprocessor=None):
"""Initialize the supervised version of `RCA`.

Expand Down
62 changes: 60 additions & 2 deletions test/metric_learn_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@
HAS_SKGGM = False
else:
HAS_SKGGM = True
from metric_learn import (LMNN, NCA, LFDA, Covariance, MLKR, MMC, RCA,
from metric_learn import (LMNN, NCA, LFDA, Covariance, MLKR, MMC,
LSML_Supervised, ITML_Supervised, SDML_Supervised,
RCA_Supervised, MMC_Supervised, SDML, ITML, LSML)
RCA_Supervised, MMC_Supervised, SDML, RCA, ITML,
LSML)
# Import this specially for testing.
from metric_learn.constraints import wrap_pairs
from metric_learn.lmnn import python_LMNN, _sum_outer_products
Expand Down Expand Up @@ -822,6 +823,63 @@ def test_feature_null_variance(self):
csep = class_separation(rca.transform(X), self.iris_labels)
self.assertLess(csep, 0.30)

def test_deprecation_pca_comps(self):
# test that a deprecation message is thrown if pca_comps is set at
# initialization, for both RCA_Supervised and plain RCA
# TODO: remove in v.0.6
# NOTE(review): the context manager filters on ChangedBehaviorWarning,
# yet the asserted text is the DeprecationWarning message. This works
# only because pytest.warns records *all* warnings raised inside the
# block and the DeprecationWarning for `pca_comps` is emitted first in
# fit(), so it lands at index [0] — a fragile ordering assumption.
X, y = make_classification(random_state=42, n_samples=100)
rca_supervised = RCA_Supervised(pca_comps=X.shape[1], num_chunks=20)
# NOTE(review): 'will be' + 'removed' concatenates with no space
# ("will beremoved"); it matches the equally spaceless warning string
# in rca.py, so the equality assert still passes.
msg = ('"pca_comps" parameter is not used. '
'It has been deprecated in version 0.5.0 and will be'
'removed in 0.6.0. RCA will not do PCA preprocessing anymore. If '
'you still want to do it, you could use '
'`sklearn.decomposition.PCA` and an `sklearn.pipeline.Pipeline`.')
with pytest.warns(ChangedBehaviorWarning) as expected_msg:
rca_supervised.fit(X, y)
assert str(expected_msg[0].message) == msg

# same check on the unsupervised estimator (y is used as chunk labels)
rca = RCA(pca_comps=X.shape[1])
with pytest.warns(ChangedBehaviorWarning) as expected_msg:
rca.fit(X, y)
assert str(expected_msg[0].message) == msg

def test_changedbehaviorwarning_preprocessing(self):
    """Check that fitting RCA (supervised then plain) emits the
    ChangedBehaviorWarning announcing that data is no longer centered
    before training.
    """
    # TODO: remove in v.0.6

    msg = ("RCA will no longer center the data before training. If you want "
           "to do some preprocessing, you should do it manually (you can also "
           "use an `sklearn.pipeline.Pipeline` for instance). This warning "
           "will disappear in version 0.6.0.")

    X, y = make_classification(random_state=42, n_samples=100)
    # Same two estimators, fitted in the same order as before.
    for estimator in (RCA_Supervised(num_chunks=20), RCA()):
        with pytest.warns(ChangedBehaviorWarning) as records:
            estimator.fit(X, y)
        assert str(records[0].message) == msg

def test_rank_deficient_returns_warning(self):
    """Checks that if the covariance matrix is not invertible, we raise a
    warning message advising to use PCA"""
    X, y = load_iris(return_X_y=True)
    # Make the last feature an exact linear combination of the first two,
    # so the inner covariance matrix becomes singular (rank deficient).
    X[:, 3] = X[:, 0] + 3 * X[:, 1]
    expected = ('The inner covariance matrix is not invertible, '
                'so the transformation matrix may contain Nan values. '
                'You should reduce the dimensionality of your input,'
                'for instance using `sklearn.decomposition.PCA` as a '
                'preprocessing step.')
    with pytest.warns(None) as caught:
        RCA().fit(X, y)
    # The non-invertibility warning must be among everything that was raised.
    messages = [str(warning.message) for warning in caught]
    assert expected in messages


@pytest.mark.parametrize('num_dims', [None, 2])
def test_deprecation_num_dims_rca(num_dims):
Expand Down
4 changes: 2 additions & 2 deletions test/test_base_metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,13 +88,13 @@ def test_rca(self):
self.assertEqual(remove_spaces(str(metric_learn.RCA())),
remove_spaces("RCA(n_components=None, "
"num_dims='deprecated', "
"pca_comps=None, "
"pca_comps='deprecated', "
"preprocessor=None)"))
self.assertEqual(remove_spaces(str(metric_learn.RCA_Supervised())),
remove_spaces(
"RCA_Supervised(chunk_size=2, "
"n_components=None, num_chunks=100, "
"num_dims='deprecated', pca_comps=None, "
"num_dims='deprecated', pca_comps='deprecated', "
"preprocessor=None)"))

def test_mlkr(self):
Expand Down