From a308cdb954a28204e23879ff17ee1c67221ed4cc Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 9 May 2013 01:19:18 +1000 Subject: [PATCH 1/5] FIX make Pipeline methods properties as per #1805 --- sklearn/pipeline.py | 242 +++++++++++++++++++++++++-------- sklearn/tests/test_pipeline.py | 22 +++ 2 files changed, 207 insertions(+), 57 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index e91faba0d61f4..5423e04359284 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -110,6 +110,19 @@ def get_params(self, deep=True): out['%s__%s' % (name, key)] = value return out + @property + def named_steps(self): + return dict(self.steps) + + @property + def _transforms(self): + """Non-final estimators in (name, est) tuples.""" + return self.steps[:-1] + + @property + def _final_estimator(self): + return self.steps[-1][1] + # Estimator interface def _pre_transform(self, X, y=None, **fit_params): @@ -118,7 +131,9 @@ def _pre_transform(self, X, y=None, **fit_params): step, param = pname.split('__', 1) fit_params_steps[step][param] = pval Xt = X - for name, transform in self.steps[:-1]: + for name, transform in self._transforms: + if transform is None: + continue if hasattr(transform, "fit_transform"): Xt = transform.fit_transform(Xt, y, **fit_params_steps[name]) else: @@ -129,79 +144,192 @@ def _pre_transform(self, X, y=None, **fit_params): def fit(self, X, y=None, **fit_params): """Fit all the transforms one after the other and transform the data, then fit the transformed data using the final estimator. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Training data, where n_samples in the number of samples and + n_features is the number of features. + y : array-like, shape = [n_samples], optional + Target vector relative to X for classification; + None for unsupervised learning. + fit_params : dict of string -> object + Parameters passed to the `fit` method of each step, where + each parameter name is prefixed such that parameter ``p`` for step + ``s`` has key ``s__p``. """ Xt, fit_params = self._pre_transform(X, y, **fit_params) - self.steps[-1][-1].fit(Xt, y, **fit_params) + self._final_estimator.fit(Xt, y, **fit_params) return self - def fit_transform(self, X, y=None, **fit_params): - """Fit all the transforms one after the other and transform the + @property + def fit_transform(self): + """Pipeline.fit_transform(X, y=None, **fit_params) + + Fit all the transforms one after the other and transform the data, then use fit_transform on transformed data using the final - estimator.""" - Xt, fit_params = self._pre_transform(X, y, **fit_params) - if hasattr(self.steps[-1][-1], 'fit_transform'): - return self.steps[-1][-1].fit_transform(Xt, y, **fit_params) - else: - return self.steps[-1][-1].fit(Xt, y, **fit_params).transform(Xt) + estimator. - def predict(self, X): - """Applies transforms to the data, and the predict method of the - final estimator. Valid only if the final estimator implements - predict.""" + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Training data, where n_samples in the number of samples and + n_features is the number of features. + y : array-like, shape = [n_samples], optional + Target vector relative to X for classification; + None for unsupervised learning. + fit_params : dict of string -> object + Parameters passed to the `fit` method of each step, where + each parameter name is prefixed such that parameter ``p`` for step + ``s`` has key ``s__p``. 
+ """ + last_step = self._final_estimator + if ( + not hasattr(last_step, 'fit_transform') + and not hasattr(last_step, 'transform')): + raise AttributeError( + 'last step has neither `transform` nor `fit_transform`') + + def fn(X, y=None, **fit_params): + Xt, fit_params = self._pre_transform(X, y, **fit_params) + if hasattr(last_step, 'fit_transform'): + return last_step.fit_transform(Xt, y, **fit_params) + else: + return last_step.fit(Xt, y, **fit_params).transform(Xt) + return fn + + def _run_pipeline(self, est_fn, X, *args, **kwargs): Xt = X - for name, transform in self.steps[:-1]: - Xt = transform.transform(Xt) - return self.steps[-1][-1].predict(Xt) + for name, transform in self._transforms: + if transform is not None: + Xt = transform.transform(Xt) + return est_fn(Xt, *args, **kwargs) + + @property + def predict(self): + """Pipeline.predict(X) - def predict_proba(self, X): - """Applies transforms to the data, and the predict_proba method of the + Applies transforms to the data, and the `predict` method of the final estimator. Valid only if the final estimator implements - predict_proba.""" - Xt = X - for name, transform in self.steps[:-1]: - Xt = transform.transform(Xt) - return self.steps[-1][-1].predict_proba(Xt) + predict. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Data samples, where n_samples in the number of samples and + n_features is the number of features. + """ + return partial(self._run_pipeline, self._final_estimator.predict) + + @property + def predict_proba(self): + """Pipeline.predict_proba(X) - def decision_function(self, X): - """Applies transforms to the data, and the decision_function method of + Applies transforms to the data, and the `predict_proba` method of the final estimator. Valid only if the final estimator implements - decision_function.""" - Xt = X - for name, transform in self.steps[:-1]: - Xt = transform.transform(Xt) - return self.steps[-1][-1].decision_function(Xt) + predict_proba. - def predict_log_proba(self, X): - Xt = X - for name, transform in self.steps[:-1]: - Xt = transform.transform(Xt) - return self.steps[-1][-1].predict_log_proba(Xt) + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Data samples, where n_samples in the number of samples and + n_features is the number of features. + """ + return partial(self._run_pipeline, self._final_estimator.predict_proba) - def transform(self, X): - """Applies transforms to the data, and the transform method of the + @property + def predict_log_proba(self): + """Pipeline.predict_log_proba(X) + + Applies transforms to the data, and the `predict_log_proba` method + of the final estimator. Valid only if the final estimator implements + predict_log_proba. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Data samples, where n_samples in the number of samples and + n_features is the number of features. + """ + return partial(self._run_pipeline, + self._final_estimator.predict_log_proba) + + @property + def decision_function(self): + """Pipeline.decision_function(X) + + Applies transforms to the data, and the `decision_function` method + of the final estimator. Valid only if the final estimator implements + decision_function. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Data samples, where n_samples in the number of samples and + n_features is the number of features. 
+ """ + return partial(self._run_pipeline, + self._final_estimator.decision_function) + + @property + def transform(self): + """Pipeline.transform(X) + + Applies transforms to the data, and the `transform` method of the final estimator. Valid only if the final estimator implements - transform.""" - Xt = X - for name, transform in self.steps: - Xt = transform.transform(Xt) - return Xt + transform. - def inverse_transform(self, X): - if X.ndim == 1: - X = X[None, :] - Xt = X - for name, step in self.steps[::-1]: - Xt = step.inverse_transform(Xt) - return Xt + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Data samples, where n_samples in the number of samples and + n_features is the number of features. + """ + return partial(self._run_pipeline, self._final_estimator.transform) + + @property + def inverse_transform(self): + """Pipeline.inverse_transform(X) - def score(self, X, y=None): - """Applies transforms to the data, and the score method of the + Applies inverse transforms to the data from the last step to the + first. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Data samples, where n_samples in the number of samples and + n_features is the number of features. + """ + inverse_transforms = [step.inverse_transform + for name, step in self.steps[::-1] if step is not None] + + def fn(X): + if X.ndim == 1: + X = X[None, :] + Xt = X + for inv_transform in inverse_transforms: + Xt = inv_transform(Xt) + return Xt + return fn + + @property + def score(self): + """Pipeline.score(X, y=None) + + Applies transforms to the data, and the `score` method of the final estimator. Valid only if the final estimator implements - score.""" - Xt = X - for name, transform in self.steps[:-1]: - Xt = transform.transform(Xt) - return self.steps[-1][-1].score(Xt, y) + score. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Data samples, where n_samples in the number of samples and + n_features is the number of features. + y : array-like, shape = [n_samples], optional + Target vector relative to X; + None for unsupervised learning. 
+ """ + return partial(self._run_pipeline, self._final_estimator.score) @property def _pairwise(self): diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 87bb7b813cae8..3953ef642bbb4 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -289,6 +289,28 @@ def test_make_pipeline(): assert_equal(pipe.steps[2][0], "fitparamt") +def test_pipeline_attributes(): + """Ensure that the Pipeline only provides post-fit methods that are present + on the last step""" + + def make(method): + """Make a pipeline whose estimator has specified method""" + transf = TransfT() + setattr(transf, method, lambda *args, **kwargs: True) + return Pipeline([('est', transf)]).fit([[1]], [1]) + + attribs = ['predict_proba', 'predict_log_proba', 'predict', + 'decision_function', 'score', 'inverse_transform'] + + for attrib in attribs: + pipeline = make(attrib) + getattr(pipeline, attrib)(np.asarray([[1]])) + for attrib2 in attribs: + if attrib2 != attrib: + assert_false(hasattr(pipeline, attrib2)) +>>>>>>> FIX make Pipeline methods properties as per #1805 + + def test_feature_union_weights(): # test feature union with transformer weights iris = load_iris() From ece02bbd885ca9ac7289dd84a02767efd00cde14 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 30 May 2013 11:14:13 +1000 Subject: [PATCH 2/5] TST test delegated ducktyping of metaestimators --- sklearn/pipeline.py | 4 +- sklearn/tests/test_metaestimators.py | 100 +++++++ sklearn/tests/test_pipeline.py | 410 +++++++++++++-------------- 3 files changed, 297 insertions(+), 217 deletions(-) create mode 100644 sklearn/tests/test_metaestimators.py diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 5423e04359284..e361f4ce2af3e 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -11,6 +11,8 @@ from collections import defaultdict +from functools import partial + import numpy as np from scipy import sparse @@ -78,7 +80,7 @@ class Pipeline(BaseEstimator): # BaseEstimator interface def __init__(self, steps): - self.named_steps = dict(steps) + self.steps = steps names, estimators = zip(*steps) if len(self.named_steps) != len(steps): raise ValueError("Names provided are not unique: %s" % (names,)) diff --git a/sklearn/tests/test_metaestimators.py b/sklearn/tests/test_metaestimators.py new file mode 100644 index 0000000000000..5da01390d1a00 --- /dev/null +++ b/sklearn/tests/test_metaestimators.py @@ -0,0 +1,100 @@ +"""Common tests for metaestimators""" + +import functools + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.externals.six import iterkeys +from sklearn.datasets import make_classification +from sklearn.utils.testing import assert_true, assert_false +from sklearn.pipeline import Pipeline + + +class DelegatorData(object): + def __init__(self, name, construct, skip_methods=(), + fit_args=make_classification()): + self.name = name + self.construct = construct + self.fit_args = fit_args + self.skip_methods = skip_methods + + +DELEGATING_METAESTIMATORS = { + DelegatorData('pipeline', lambda est: Pipeline([('est', est)])), + DelegatorData('pipeline', lambda est: Pipeline([('est', est)])), +} + + +def test_metaestimator_delegation(): + def hides(method): + @property + def wrapper(obj): + if obj.hidden_method == method.__name__: + raise AttributeError + return functools.partial(method, obj) + return wrapper + + class SubEstimator(BaseEstimator): + def __init__(self, param=1, hidden_method=None): + self.param = param + self.hidden_method = hidden_method + + def 
fit(self, X, y=None, *args, **kwargs): + return True + + @hides + def inverse_transform(self, X, *args, **kwargs): + return X + + @hides + def transform(self, X, *args, **kwargs): + return X + + @hides + def predict(self, X, *args, **kwargs): + return np.ones(X.shape[0]) + + @hides + def predict_proba(self, X, *args, **kwargs): + return np.ones(X.shape[0]) + + @hides + def predict_log_proba(self, X, *args, **kwargs): + return np.ones(X.shape[0]) + + @hides + def decision_function(self, X, *args, **kwargs): + return np.ones(X.shape[0]) + + @hides + def score(self, X, *args, **kwargs): + return 1.0 + + + methods = [k for k in iterkeys(SubEstimator.__dict__) + if not k.startswith('_') and not k.startswith('fit')] + + for delegator_data in DELEGATING_METAESTIMATORS: + delegate = SubEstimator() + delegator = delegator_data.construct(delegate) + delegator.fit(*delegator_data.fit_args) + for method in methods: + if method in delegator_data.skip_methods: + continue + assert_true(hasattr(delegate, method)) + assert_true(hasattr(delegator, method), + msg="%s does not have method %r when its delegate does" + % (delegator_data.name, method)) + + for method in methods: + if method in delegator_data.skip_methods: + continue + delegate = SubEstimator(hidden_method=method) + delegator = delegator_data.construct(delegate) + delegator.fit(*delegator_data.fit_args) + assert_false(hasattr(delegate, method)) + assert_false(hasattr(delegator, method), + msg="%s has method %r when its delegate does not" + % (delegator_data.name, method)) + diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 3953ef642bbb4..aa4ab41e8582a 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -23,254 +23,254 @@ JUNK_FOOD_DOCS = ( - "the pizza pizza beer copyright", - "the pizza burger beer copyright", - "the the pizza beer beer copyright", - "the burger beer beer copyright", - "the coke burger coke copyright", - "the coke burger burger", +"the pizza pizza beer copyright", +"the pizza burger beer copyright", +"the the pizza beer beer copyright", +"the burger beer beer copyright", +"the coke burger coke copyright", +"the coke burger burger", ) class IncorrectT(BaseEstimator): - """Small class to test parameter dispatching. - """ +"""Small class to test parameter dispatching. +""" - def __init__(self, a=None, b=None): - self.a = a - self.b = b +def __init__(self, a=None, b=None): + self.a = a + self.b = b class T(IncorrectT): - def fit(self, X, y): - return self +def fit(self, X, y): + return self class TransfT(T): - def transform(self, X, y=None): - return X +def transform(self, X, y=None): + return X class FitParamT(BaseEstimator): - """Mock classifier - """ +"""Mock classifier +""" - def __init__(self): - self.successful = False - pass +def __init__(self): + self.successful = False + pass - def fit(self, X, y, should_succeed=False): - self.successful = should_succeed +def fit(self, X, y, should_succeed=False): + self.successful = should_succeed - def predict(self, X): - return self.successful +def predict(self, X): + return self.successful def test_pipeline_init(): - """ Test the various init parameters of the pipeline. 
- """ - assert_raises(TypeError, Pipeline) - # Check that we can't instantiate pipelines with objects without fit - # method - pipe = assert_raises(TypeError, Pipeline, [('svc', IncorrectT)]) - # Smoke test with only an estimator - clf = T() - pipe = Pipeline([('svc', clf)]) - assert_equal(pipe.get_params(deep=True), - dict(svc__a=None, svc__b=None, svc=clf)) - - # Check that params are set - pipe.set_params(svc__a=0.1) - assert_equal(clf.a, 0.1) - # Smoke test the repr: - repr(pipe) - - # Test with two objects - clf = SVC() - filter1 = SelectKBest(f_classif) - pipe = Pipeline([('anova', filter1), ('svc', clf)]) - - # Check that we can't use the same stage name twice - assert_raises(ValueError, Pipeline, [('svc', SVC()), ('svc', SVC())]) - - # Check that params are set - pipe.set_params(svc__C=0.1) - assert_equal(clf.C, 0.1) - # Smoke test the repr: - repr(pipe) - - # Check that params are not set when naming them wrong - assert_raises(ValueError, pipe.set_params, anova__C=0.1) - - # Test clone - pipe2 = clone(pipe) - assert_false(pipe.named_steps['svc'] is pipe2.named_steps['svc']) - - # Check that apart from estimators, the parameters are the same - params = pipe.get_params() - params2 = pipe2.get_params() - # Remove estimators that where copied - params.pop('svc') - params.pop('anova') - params2.pop('svc') - params2.pop('anova') - assert_equal(params, params2) +""" Test the various init parameters of the pipeline. +""" +assert_raises(TypeError, Pipeline) +# Check that we can't instantiate pipelines with objects without fit +# method +pipe = assert_raises(TypeError, Pipeline, [('svc', IncorrectT)]) +# Smoke test with only an estimator +clf = T() +pipe = Pipeline([('svc', clf)]) +assert_equal(pipe.get_params(deep=True), + dict(svc__a=None, svc__b=None, svc=clf)) + +# Check that params are set +pipe.set_params(svc__a=0.1) +assert_equal(clf.a, 0.1) +# Smoke test the repr: +repr(pipe) + +# Test with two objects +clf = SVC() +filter1 = SelectKBest(f_classif) +pipe = Pipeline([('anova', filter1), ('svc', clf)]) + +# Check that we can't use the same stage name twice +assert_raises(ValueError, Pipeline, [('svc', SVC()), ('svc', SVC())]) + +# Check that params are set +pipe.set_params(svc__C=0.1) +assert_equal(clf.C, 0.1) +# Smoke test the repr: +repr(pipe) + +# Check that params are not set when naming them wrong +assert_raises(ValueError, pipe.set_params, anova__C=0.1) + +# Test clone +pipe2 = clone(pipe) +assert_false(pipe.named_steps['svc'] is pipe2.named_steps['svc']) + +# Check that apart from estimators, the parameters are the same +params = pipe.get_params() +params2 = pipe2.get_params() +# Remove estimators that where copied +params.pop('svc') +params.pop('anova') +params2.pop('svc') +params2.pop('anova') +assert_equal(params, params2) def test_pipeline_methods_anova(): - """ Test the various methods of the pipeline (anova). - """ - iris = load_iris() - X = iris.data - y = iris.target - # Test with Anova + LogisticRegression - clf = LogisticRegression() - filter1 = SelectKBest(f_classif, k=2) - pipe = Pipeline([('anova', filter1), ('logistic', clf)]) - pipe.fit(X, y) - pipe.predict(X) - pipe.predict_proba(X) - pipe.predict_log_proba(X) - pipe.score(X, y) +""" Test the various methods of the pipeline (anova). 
+""" +iris = load_iris() +X = iris.data +y = iris.target +# Test with Anova + LogisticRegression +clf = LogisticRegression() +filter1 = SelectKBest(f_classif, k=2) +pipe = Pipeline([('anova', filter1), ('logistic', clf)]) +pipe.fit(X, y) +pipe.predict(X) +pipe.predict_proba(X) +pipe.predict_log_proba(X) +pipe.score(X, y) def test_pipeline_fit_params(): - """Test that the pipeline can take fit parameters - """ - pipe = Pipeline([('transf', TransfT()), ('clf', FitParamT())]) - pipe.fit(X=None, y=None, clf__should_succeed=True) - # classifier should return True - assert_true(pipe.predict(None)) - # and transformer params should not be changed - assert_true(pipe.named_steps['transf'].a is None) - assert_true(pipe.named_steps['transf'].b is None) +"""Test that the pipeline can take fit parameters +""" +pipe = Pipeline([('transf', TransfT()), ('clf', FitParamT())]) +pipe.fit(X=None, y=None, clf__should_succeed=True) +# classifier should return True +assert_true(pipe.predict(None)) +# and transformer params should not be changed +assert_true(pipe.named_steps['transf'].a is None) +assert_true(pipe.named_steps['transf'].b is None) def test_pipeline_methods_pca_svm(): - """Test the various methods of the pipeline (pca + svm).""" - iris = load_iris() - X = iris.data - y = iris.target - # Test with PCA + SVC - clf = SVC(probability=True, random_state=0) - pca = PCA(n_components='mle', whiten=True) - pipe = Pipeline([('pca', pca), ('svc', clf)]) - pipe.fit(X, y) - pipe.predict(X) - pipe.predict_proba(X) - pipe.predict_log_proba(X) - pipe.score(X, y) +"""Test the various methods of the pipeline (pca + svm).""" +iris = load_iris() +X = iris.data +y = iris.target +# Test with PCA + SVC +clf = SVC(probability=True, random_state=0) +pca = PCA(n_components='mle', whiten=True) +pipe = Pipeline([('pca', pca), ('svc', clf)]) +pipe.fit(X, y) +pipe.predict(X) +pipe.predict_proba(X) +pipe.predict_log_proba(X) +pipe.score(X, y) def test_pipeline_methods_preprocessing_svm(): - """Test the various methods of the pipeline (preprocessing + svm).""" - iris = load_iris() - X = iris.data - y = iris.target - n_samples = X.shape[0] - n_classes = len(np.unique(y)) - scaler = StandardScaler() - pca = RandomizedPCA(n_components=2, whiten=True) - clf = SVC(probability=True, random_state=0) - - for preprocessing in [scaler, pca]: - pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)]) - pipe.fit(X, y) +"""Test the various methods of the pipeline (preprocessing + svm).""" +iris = load_iris() +X = iris.data +y = iris.target +n_samples = X.shape[0] +n_classes = len(np.unique(y)) +scaler = StandardScaler() +pca = RandomizedPCA(n_components=2, whiten=True) +clf = SVC(probability=True, random_state=0) + +for preprocessing in [scaler, pca]: + pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)]) + pipe.fit(X, y) - # check shapes of various prediction functions - predict = pipe.predict(X) - assert_equal(predict.shape, (n_samples,)) + # check shapes of various prediction functions + predict = pipe.predict(X) + assert_equal(predict.shape, (n_samples,)) - proba = pipe.predict_proba(X) - assert_equal(proba.shape, (n_samples, n_classes)) + proba = pipe.predict_proba(X) + assert_equal(proba.shape, (n_samples, n_classes)) - log_proba = pipe.predict_log_proba(X) - assert_equal(log_proba.shape, (n_samples, n_classes)) + log_proba = pipe.predict_log_proba(X) + assert_equal(log_proba.shape, (n_samples, n_classes)) - decision_function = pipe.decision_function(X) - assert_equal(decision_function.shape, (n_samples, n_classes)) + 
decision_function = pipe.decision_function(X) + assert_equal(decision_function.shape, (n_samples, n_classes)) - pipe.score(X, y) + pipe.score(X, y) def test_feature_union(): - # basic sanity check for feature union - iris = load_iris() - X = iris.data - X -= X.mean(axis=0) - y = iris.target - svd = TruncatedSVD(n_components=2, random_state=0) - select = SelectKBest(k=1) - fs = FeatureUnion([("svd", svd), ("select", select)]) - fs.fit(X, y) - X_transformed = fs.transform(X) - assert_equal(X_transformed.shape, (X.shape[0], 3)) - - # check if it does the expected thing - assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X)) - assert_array_equal(X_transformed[:, -1], - select.fit_transform(X, y).ravel()) - - # test if it also works for sparse input - # We use a different svd object to control the random_state stream - fs = FeatureUnion([("svd", svd), ("select", select)]) - X_sp = sparse.csr_matrix(X) - X_sp_transformed = fs.fit_transform(X_sp, y) - assert_array_almost_equal(X_transformed, X_sp_transformed.toarray()) - - # test setting parameters - fs.set_params(select__k=2) - assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4)) - - # test it works with transformers missing fit_transform - fs = FeatureUnion([("mock", TransfT()), ("svd", svd), ("select", select)]) - X_transformed = fs.fit_transform(X, y) - assert_equal(X_transformed.shape, (X.shape[0], 8)) +# basic sanity check for feature union +iris = load_iris() +X = iris.data +X -= X.mean(axis=0) +y = iris.target +svd = TruncatedSVD(n_components=2, random_state=0) +select = SelectKBest(k=1) +fs = FeatureUnion([("svd", svd), ("select", select)]) +fs.fit(X, y) +X_transformed = fs.transform(X) +assert_equal(X_transformed.shape, (X.shape[0], 3)) + +# check if it does the expected thing +assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X)) +assert_array_equal(X_transformed[:, -1], + select.fit_transform(X, y).ravel()) + +# test if it also works for sparse input +# We use a different svd object to control the random_state stream +fs = FeatureUnion([("svd", svd), ("select", select)]) +X_sp = sparse.csr_matrix(X) +X_sp_transformed = fs.fit_transform(X_sp, y) +assert_array_almost_equal(X_transformed, X_sp_transformed.toarray()) + +# test setting parameters +fs.set_params(select__k=2) +assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4)) + +# test it works with transformers missing fit_transform +fs = FeatureUnion([("mock", TransfT()), ("svd", svd), ("select", select)]) +X_transformed = fs.fit_transform(X, y) +assert_equal(X_transformed.shape, (X.shape[0], 8)) def test_make_union(): - pca = PCA() - mock = TransfT() - fu = make_union(pca, mock) - names, transformers = zip(*fu.transformer_list) - assert_equal(names, ("pca", "transft")) - assert_equal(transformers, (pca, mock)) +pca = PCA() +mock = TransfT() +fu = make_union(pca, mock) +names, transformers = zip(*fu.transformer_list) +assert_equal(names, ("pca", "transft")) +assert_equal(transformers, (pca, mock)) def test_pipeline_transform(): - # Test whether pipeline works with a transformer at the end. - # Also test pipeline.transform and pipeline.inverse_transform - iris = load_iris() - X = iris.data - pca = PCA(n_components=2) - pipeline = Pipeline([('pca', pca)]) +# Test whether pipeline works with a transformer at the end. 
+# Also test pipeline.transform and pipeline.inverse_transform +iris = load_iris() +X = iris.data +pca = PCA(n_components=2) +pipeline = Pipeline([('pca', pca)]) - # test transform and fit_transform: - X_trans = pipeline.fit(X).transform(X) - X_trans2 = pipeline.fit_transform(X) - X_trans3 = pca.fit_transform(X) - assert_array_almost_equal(X_trans, X_trans2) - assert_array_almost_equal(X_trans, X_trans3) +# test transform and fit_transform: +X_trans = pipeline.fit(X).transform(X) +X_trans2 = pipeline.fit_transform(X) +X_trans3 = pca.fit_transform(X) +assert_array_almost_equal(X_trans, X_trans2) +assert_array_almost_equal(X_trans, X_trans3) - X_back = pipeline.inverse_transform(X_trans) - X_back2 = pca.inverse_transform(X_trans) - assert_array_almost_equal(X_back, X_back2) +X_back = pipeline.inverse_transform(X_trans) +X_back2 = pca.inverse_transform(X_trans) +assert_array_almost_equal(X_back, X_back2) def test_pipeline_fit_transform(): - # Test whether pipeline works with a transformer missing fit_transform - iris = load_iris() - X = iris.data - y = iris.target - transft = TransfT() - pipeline = Pipeline([('mock', transft)]) +# Test whether pipeline works with a transformer missing fit_transform +iris = load_iris() +X = iris.data +y = iris.target +transft = TransfT() +pipeline = Pipeline([('mock', transft)]) - # test fit_transform: - X_trans = pipeline.fit_transform(X, y) - X_trans2 = transft.fit(X, y).transform(X) - assert_array_almost_equal(X_trans, X_trans2) +# test fit_transform: +X_trans = pipeline.fit_transform(X, y) +X_trans2 = transft.fit(X, y).transform(X) +assert_array_almost_equal(X_trans, X_trans2) def test_make_pipeline(): @@ -289,28 +289,6 @@ def test_make_pipeline(): assert_equal(pipe.steps[2][0], "fitparamt") -def test_pipeline_attributes(): - """Ensure that the Pipeline only provides post-fit methods that are present - on the last step""" - - def make(method): - """Make a pipeline whose estimator has specified method""" - transf = TransfT() - setattr(transf, method, lambda *args, **kwargs: True) - return Pipeline([('est', transf)]).fit([[1]], [1]) - - attribs = ['predict_proba', 'predict_log_proba', 'predict', - 'decision_function', 'score', 'inverse_transform'] - - for attrib in attribs: - pipeline = make(attrib) - getattr(pipeline, attrib)(np.asarray([[1]])) - for attrib2 in attribs: - if attrib2 != attrib: - assert_false(hasattr(pipeline, attrib2)) ->>>>>>> FIX make Pipeline methods properties as per #1805 - - def test_feature_union_weights(): # test feature union with transformer weights iris = load_iris() From 470f45d3bbf482b7605db7a61080309d4a4dc29c Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 30 May 2013 11:27:53 +1000 Subject: [PATCH 3/5] TST extra tests, pep8, comment --- sklearn/tests/test_metaestimators.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/sklearn/tests/test_metaestimators.py b/sklearn/tests/test_metaestimators.py index 5da01390d1a00..af4a581a0510c 100644 --- a/sklearn/tests/test_metaestimators.py +++ b/sklearn/tests/test_metaestimators.py @@ -9,6 +9,8 @@ from sklearn.datasets import make_classification from sklearn.utils.testing import assert_true, assert_false from sklearn.pipeline import Pipeline +from sklearn.grid_search import GridSearchCV, RandomizedSearchCV +from sklearn.feature_selection import RFECV class DelegatorData(object): @@ -21,12 +23,22 @@ def __init__(self, name, construct, skip_methods=(), DELEGATING_METAESTIMATORS = { - DelegatorData('pipeline', lambda est: 
Pipeline([('est', est)])), - DelegatorData('pipeline', lambda est: Pipeline([('est', est)])), + DelegatorData('Pipeline', lambda est: Pipeline([('est', est)])), + DelegatorData('GridSearchCV', + lambda est: GridSearchCV( + est, param_grid={'param': [5]}, cv=2), + skip_methods=['score']), + DelegatorData('RandomizedSearchCV', + lambda est: RandomizedSearchCV( + est, param_grid={'param': [5]}, cv=2), + skip_methods=['score']), + DelegatorData('RFECV', RFECV, + skip_methods=['transform', 'inverse_transform']), } def test_metaestimator_delegation(): + """Ensures specified metaestimators have methods iff subestimator does""" def hides(method): @property def wrapper(obj): @@ -39,10 +51,10 @@ class SubEstimator(BaseEstimator): def __init__(self, param=1, hidden_method=None): self.param = param self.hidden_method = hidden_method - + def fit(self, X, y=None, *args, **kwargs): return True - + @hides def inverse_transform(self, X, *args, **kwargs): return X @@ -71,7 +83,6 @@ def decision_function(self, X, *args, **kwargs): def score(self, X, *args, **kwargs): return 1.0 - methods = [k for k in iterkeys(SubEstimator.__dict__) if not k.startswith('_') and not k.startswith('fit')] @@ -86,7 +97,7 @@ def score(self, X, *args, **kwargs): assert_true(hasattr(delegator, method), msg="%s does not have method %r when its delegate does" % (delegator_data.name, method)) - + for method in methods: if method in delegator_data.skip_methods: continue @@ -95,6 +106,5 @@ def score(self, X, *args, **kwargs): delegator.fit(*delegator_data.fit_args) assert_false(hasattr(delegate, method)) assert_false(hasattr(delegator, method), - msg="%s has method %r when its delegate does not" - % (delegator_data.name, method)) - + msg="%s has method %r when its delegate does not" + % (delegator_data.name, method)) From a2d37810758a220dfd3c6006cfbd0f0d3cd61652 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 10 Jun 2013 12:43:34 +1000 Subject: [PATCH 4/5] Fix issues in test --- sklearn/tests/test_metaestimators.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/tests/test_metaestimators.py b/sklearn/tests/test_metaestimators.py index af4a581a0510c..c193c84d18db5 100644 --- a/sklearn/tests/test_metaestimators.py +++ b/sklearn/tests/test_metaestimators.py @@ -22,7 +22,7 @@ def __init__(self, name, construct, skip_methods=(), self.skip_methods = skip_methods -DELEGATING_METAESTIMATORS = { +DELEGATING_METAESTIMATORS = [ DelegatorData('Pipeline', lambda est: Pipeline([('est', est)])), DelegatorData('GridSearchCV', lambda est: GridSearchCV( @@ -30,11 +30,11 @@ def __init__(self, name, construct, skip_methods=(), skip_methods=['score']), DelegatorData('RandomizedSearchCV', lambda est: RandomizedSearchCV( - est, param_grid={'param': [5]}, cv=2), + est, param_distributions={'param': [5]}, cv=2), skip_methods=['score']), DelegatorData('RFECV', RFECV, skip_methods=['transform', 'inverse_transform']), -} +] def test_metaestimator_delegation(): @@ -53,6 +53,7 @@ def __init__(self, param=1, hidden_method=None): self.hidden_method = hidden_method def fit(self, X, y=None, *args, **kwargs): + self.coef_ = np.arange(X.shape[1]) return True @hides From 159bb7b4c63ff67fbe3d1dfc2b7a4380456211de Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 10 Jun 2013 13:14:54 +1000 Subject: [PATCH 5/5] FIX properties of #1805 for BaseSearchCV and RFE --- sklearn/feature_selection/rfe.py | 31 ++- sklearn/grid_search.py | 31 +++ sklearn/tests/test_metaestimators.py | 11 +- sklearn/tests/test_pipeline.py | 390 
+++++++++++++-------------- 4 files changed, 257 insertions(+), 206 deletions(-) diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index 01c99ceb526f4..4b98106fda483 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -6,6 +6,7 @@ """Recursive feature elimination for feature ranking""" +from functools import wraps import numpy as np from ..utils import check_arrays, safe_sqr from ..base import BaseEstimator @@ -36,6 +37,7 @@ class RFE(BaseEstimator, MetaEstimatorMixin, SelectorMixin): A supervised learning estimator with a `fit` method that updates a `coef_` attribute that holds the fitted parameters. Important features must correspond to high absolute values in the `coef_` array. + The estimator must also implement a `score` method. For instance, this is the case for most supervised learning algorithms such as Support Vector Classifiers and Generalized @@ -169,7 +171,13 @@ def fit(self, X, y): return self - def predict(self, X): + def _delegate_wrapper(self, delegate): + def wrapper(X, *args, **kwargs): + return delegate(self.transform(X), *args, **kwargs) + return wrapper + + @property + def predict(self): """Reduce X to the selected features and then predict using the underlying estimator. @@ -183,9 +191,10 @@ def predict(self, X): y : array of shape [n_samples] The predicted target values. """ - return self.estimator_.predict(self.transform(X)) + return self._delegate_wrapper(self.estimator_.predict) - def score(self, X, y): + @property + def score(self): """Reduce X to the selected features and then return the score of the underlying estimator. @@ -197,16 +206,22 @@ def score(self, X, y): y : array of shape [n_samples] The target values. """ - return self.estimator_.score(self.transform(X), y) + return self._delegate_wrapper(self.estimator_.score) def _get_support_mask(self): return self.support_ - def decision_function(self, X): - return self.estimator_.decision_function(self.transform(X)) + @property + def decision_function(self): + return self._delegate_wrapper(self.estimator_.decision_function) + + @property + def predict_proba(self): + return self._delegate_wrapper(self.estimator_.predict_proba) - def predict_proba(self, X): - return self.estimator_.predict_proba(self.transform(X)) + @property + def predict_log_proba(self): + return self._delegate_wrapper(self.estimator_.predict_log_proba) class RFECV(RFE, MetaEstimatorMixin): diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 8d217521f1269..2904030e77c4a 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -325,20 +325,51 @@ def score(self, X, y=None): @property def predict(self): + """Call predict on the best estimator""" return self.best_estimator_.predict @property def predict_proba(self): + """Call predict_proba on the best estimator""" return self.best_estimator_.predict_proba + @property + def predict_log_proba(self): + """Call predict_log_proba on the best estimator""" + return self.best_estimator_.predict_log_proba + @property def decision_function(self): + """Call decision_function on the best estimator""" return self.best_estimator_.decision_function @property def transform(self): + """Call transform on the best estimator""" return self.best_estimator_.transform + @property + def inverse_transform(self): + """Call inverse_transform on the best estimator""" + return self.best_estimator_.inverse_transform + + def _check_estimator(self): + """Check that estimator can be fitted and score can be computed.""" + if (not 
hasattr(self.estimator, 'fit') or + not (hasattr(self.estimator, 'predict') + or hasattr(self.estimator, 'score'))): + raise TypeError("estimator should a be an estimator implementing" + " 'fit' and 'predict' or 'score' methods," + " %s (type %s) was passed" % + (self.estimator, type(self.estimator))) + if (self.scoring is None and self.loss_func is None and self.score_func + is None): + if not hasattr(self.estimator, 'score'): + raise TypeError( + "If no scoring is specified, the estimator passed " + "should have a 'score' method. The estimator %s " + "does not." % self.estimator) + def _fit(self, X, y, parameter_iterable): """Actual fitting, performing the search over parameters.""" diff --git a/sklearn/tests/test_metaestimators.py b/sklearn/tests/test_metaestimators.py index c193c84d18db5..0a36125c07b2a 100644 --- a/sklearn/tests/test_metaestimators.py +++ b/sklearn/tests/test_metaestimators.py @@ -10,7 +10,7 @@ from sklearn.utils.testing import assert_true, assert_false from sklearn.pipeline import Pipeline from sklearn.grid_search import GridSearchCV, RandomizedSearchCV -from sklearn.feature_selection import RFECV +from sklearn.feature_selection import RFE, RFECV class DelegatorData(object): @@ -32,8 +32,10 @@ def __init__(self, name, construct, skip_methods=(), lambda est: RandomizedSearchCV( est, param_distributions={'param': [5]}, cv=2), skip_methods=['score']), + DelegatorData('RFE', RFE, + skip_methods=['transform', 'inverse_transform', 'score']), DelegatorData('RFECV', RFECV, - skip_methods=['transform', 'inverse_transform']), + skip_methods=['transform', 'inverse_transform', 'score']), ] @@ -43,7 +45,7 @@ def hides(method): @property def wrapper(obj): if obj.hidden_method == method.__name__: - raise AttributeError + raise AttributeError('%r is hidden' % obj.hidden_method) return functools.partial(method, obj) return wrapper @@ -86,6 +88,7 @@ def score(self, X, *args, **kwargs): methods = [k for k in iterkeys(SubEstimator.__dict__) if not k.startswith('_') and not k.startswith('fit')] + methods.sort() for delegator_data in DELEGATING_METAESTIMATORS: delegate = SubEstimator() @@ -98,6 +101,8 @@ def score(self, X, *args, **kwargs): assert_true(hasattr(delegator, method), msg="%s does not have method %r when its delegate does" % (delegator_data.name, method)) + # smoke test delegation + getattr(delegator, method)(delegator_data.fit_args[0]) for method in methods: if method in delegator_data.skip_methods: diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index aa4ab41e8582a..87bb7b813cae8 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -23,254 +23,254 @@ JUNK_FOOD_DOCS = ( -"the pizza pizza beer copyright", -"the pizza burger beer copyright", -"the the pizza beer beer copyright", -"the burger beer beer copyright", -"the coke burger coke copyright", -"the coke burger burger", + "the pizza pizza beer copyright", + "the pizza burger beer copyright", + "the the pizza beer beer copyright", + "the burger beer beer copyright", + "the coke burger coke copyright", + "the coke burger burger", ) class IncorrectT(BaseEstimator): -"""Small class to test parameter dispatching. -""" + """Small class to test parameter dispatching. 
+ """ -def __init__(self, a=None, b=None): - self.a = a - self.b = b + def __init__(self, a=None, b=None): + self.a = a + self.b = b class T(IncorrectT): -def fit(self, X, y): - return self + def fit(self, X, y): + return self class TransfT(T): -def transform(self, X, y=None): - return X + def transform(self, X, y=None): + return X class FitParamT(BaseEstimator): -"""Mock classifier -""" + """Mock classifier + """ -def __init__(self): - self.successful = False - pass + def __init__(self): + self.successful = False + pass -def fit(self, X, y, should_succeed=False): - self.successful = should_succeed + def fit(self, X, y, should_succeed=False): + self.successful = should_succeed -def predict(self, X): - return self.successful + def predict(self, X): + return self.successful def test_pipeline_init(): -""" Test the various init parameters of the pipeline. -""" -assert_raises(TypeError, Pipeline) -# Check that we can't instantiate pipelines with objects without fit -# method -pipe = assert_raises(TypeError, Pipeline, [('svc', IncorrectT)]) -# Smoke test with only an estimator -clf = T() -pipe = Pipeline([('svc', clf)]) -assert_equal(pipe.get_params(deep=True), - dict(svc__a=None, svc__b=None, svc=clf)) - -# Check that params are set -pipe.set_params(svc__a=0.1) -assert_equal(clf.a, 0.1) -# Smoke test the repr: -repr(pipe) - -# Test with two objects -clf = SVC() -filter1 = SelectKBest(f_classif) -pipe = Pipeline([('anova', filter1), ('svc', clf)]) - -# Check that we can't use the same stage name twice -assert_raises(ValueError, Pipeline, [('svc', SVC()), ('svc', SVC())]) - -# Check that params are set -pipe.set_params(svc__C=0.1) -assert_equal(clf.C, 0.1) -# Smoke test the repr: -repr(pipe) - -# Check that params are not set when naming them wrong -assert_raises(ValueError, pipe.set_params, anova__C=0.1) - -# Test clone -pipe2 = clone(pipe) -assert_false(pipe.named_steps['svc'] is pipe2.named_steps['svc']) - -# Check that apart from estimators, the parameters are the same -params = pipe.get_params() -params2 = pipe2.get_params() -# Remove estimators that where copied -params.pop('svc') -params.pop('anova') -params2.pop('svc') -params2.pop('anova') -assert_equal(params, params2) + """ Test the various init parameters of the pipeline. 
+ """ + assert_raises(TypeError, Pipeline) + # Check that we can't instantiate pipelines with objects without fit + # method + pipe = assert_raises(TypeError, Pipeline, [('svc', IncorrectT)]) + # Smoke test with only an estimator + clf = T() + pipe = Pipeline([('svc', clf)]) + assert_equal(pipe.get_params(deep=True), + dict(svc__a=None, svc__b=None, svc=clf)) + + # Check that params are set + pipe.set_params(svc__a=0.1) + assert_equal(clf.a, 0.1) + # Smoke test the repr: + repr(pipe) + + # Test with two objects + clf = SVC() + filter1 = SelectKBest(f_classif) + pipe = Pipeline([('anova', filter1), ('svc', clf)]) + + # Check that we can't use the same stage name twice + assert_raises(ValueError, Pipeline, [('svc', SVC()), ('svc', SVC())]) + + # Check that params are set + pipe.set_params(svc__C=0.1) + assert_equal(clf.C, 0.1) + # Smoke test the repr: + repr(pipe) + + # Check that params are not set when naming them wrong + assert_raises(ValueError, pipe.set_params, anova__C=0.1) + + # Test clone + pipe2 = clone(pipe) + assert_false(pipe.named_steps['svc'] is pipe2.named_steps['svc']) + + # Check that apart from estimators, the parameters are the same + params = pipe.get_params() + params2 = pipe2.get_params() + # Remove estimators that where copied + params.pop('svc') + params.pop('anova') + params2.pop('svc') + params2.pop('anova') + assert_equal(params, params2) def test_pipeline_methods_anova(): -""" Test the various methods of the pipeline (anova). -""" -iris = load_iris() -X = iris.data -y = iris.target -# Test with Anova + LogisticRegression -clf = LogisticRegression() -filter1 = SelectKBest(f_classif, k=2) -pipe = Pipeline([('anova', filter1), ('logistic', clf)]) -pipe.fit(X, y) -pipe.predict(X) -pipe.predict_proba(X) -pipe.predict_log_proba(X) -pipe.score(X, y) + """ Test the various methods of the pipeline (anova). 
+ """ + iris = load_iris() + X = iris.data + y = iris.target + # Test with Anova + LogisticRegression + clf = LogisticRegression() + filter1 = SelectKBest(f_classif, k=2) + pipe = Pipeline([('anova', filter1), ('logistic', clf)]) + pipe.fit(X, y) + pipe.predict(X) + pipe.predict_proba(X) + pipe.predict_log_proba(X) + pipe.score(X, y) def test_pipeline_fit_params(): -"""Test that the pipeline can take fit parameters -""" -pipe = Pipeline([('transf', TransfT()), ('clf', FitParamT())]) -pipe.fit(X=None, y=None, clf__should_succeed=True) -# classifier should return True -assert_true(pipe.predict(None)) -# and transformer params should not be changed -assert_true(pipe.named_steps['transf'].a is None) -assert_true(pipe.named_steps['transf'].b is None) + """Test that the pipeline can take fit parameters + """ + pipe = Pipeline([('transf', TransfT()), ('clf', FitParamT())]) + pipe.fit(X=None, y=None, clf__should_succeed=True) + # classifier should return True + assert_true(pipe.predict(None)) + # and transformer params should not be changed + assert_true(pipe.named_steps['transf'].a is None) + assert_true(pipe.named_steps['transf'].b is None) def test_pipeline_methods_pca_svm(): -"""Test the various methods of the pipeline (pca + svm).""" -iris = load_iris() -X = iris.data -y = iris.target -# Test with PCA + SVC -clf = SVC(probability=True, random_state=0) -pca = PCA(n_components='mle', whiten=True) -pipe = Pipeline([('pca', pca), ('svc', clf)]) -pipe.fit(X, y) -pipe.predict(X) -pipe.predict_proba(X) -pipe.predict_log_proba(X) -pipe.score(X, y) + """Test the various methods of the pipeline (pca + svm).""" + iris = load_iris() + X = iris.data + y = iris.target + # Test with PCA + SVC + clf = SVC(probability=True, random_state=0) + pca = PCA(n_components='mle', whiten=True) + pipe = Pipeline([('pca', pca), ('svc', clf)]) + pipe.fit(X, y) + pipe.predict(X) + pipe.predict_proba(X) + pipe.predict_log_proba(X) + pipe.score(X, y) def test_pipeline_methods_preprocessing_svm(): -"""Test the various methods of the pipeline (preprocessing + svm).""" -iris = load_iris() -X = iris.data -y = iris.target -n_samples = X.shape[0] -n_classes = len(np.unique(y)) -scaler = StandardScaler() -pca = RandomizedPCA(n_components=2, whiten=True) -clf = SVC(probability=True, random_state=0) - -for preprocessing in [scaler, pca]: - pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)]) - pipe.fit(X, y) + """Test the various methods of the pipeline (preprocessing + svm).""" + iris = load_iris() + X = iris.data + y = iris.target + n_samples = X.shape[0] + n_classes = len(np.unique(y)) + scaler = StandardScaler() + pca = RandomizedPCA(n_components=2, whiten=True) + clf = SVC(probability=True, random_state=0) - # check shapes of various prediction functions - predict = pipe.predict(X) - assert_equal(predict.shape, (n_samples,)) + for preprocessing in [scaler, pca]: + pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)]) + pipe.fit(X, y) - proba = pipe.predict_proba(X) - assert_equal(proba.shape, (n_samples, n_classes)) + # check shapes of various prediction functions + predict = pipe.predict(X) + assert_equal(predict.shape, (n_samples,)) - log_proba = pipe.predict_log_proba(X) - assert_equal(log_proba.shape, (n_samples, n_classes)) + proba = pipe.predict_proba(X) + assert_equal(proba.shape, (n_samples, n_classes)) - decision_function = pipe.decision_function(X) - assert_equal(decision_function.shape, (n_samples, n_classes)) + log_proba = pipe.predict_log_proba(X) + assert_equal(log_proba.shape, (n_samples, 
n_classes)) - pipe.score(X, y) + decision_function = pipe.decision_function(X) + assert_equal(decision_function.shape, (n_samples, n_classes)) + + pipe.score(X, y) def test_feature_union(): -# basic sanity check for feature union -iris = load_iris() -X = iris.data -X -= X.mean(axis=0) -y = iris.target -svd = TruncatedSVD(n_components=2, random_state=0) -select = SelectKBest(k=1) -fs = FeatureUnion([("svd", svd), ("select", select)]) -fs.fit(X, y) -X_transformed = fs.transform(X) -assert_equal(X_transformed.shape, (X.shape[0], 3)) - -# check if it does the expected thing -assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X)) -assert_array_equal(X_transformed[:, -1], - select.fit_transform(X, y).ravel()) - -# test if it also works for sparse input -# We use a different svd object to control the random_state stream -fs = FeatureUnion([("svd", svd), ("select", select)]) -X_sp = sparse.csr_matrix(X) -X_sp_transformed = fs.fit_transform(X_sp, y) -assert_array_almost_equal(X_transformed, X_sp_transformed.toarray()) - -# test setting parameters -fs.set_params(select__k=2) -assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4)) - -# test it works with transformers missing fit_transform -fs = FeatureUnion([("mock", TransfT()), ("svd", svd), ("select", select)]) -X_transformed = fs.fit_transform(X, y) -assert_equal(X_transformed.shape, (X.shape[0], 8)) + # basic sanity check for feature union + iris = load_iris() + X = iris.data + X -= X.mean(axis=0) + y = iris.target + svd = TruncatedSVD(n_components=2, random_state=0) + select = SelectKBest(k=1) + fs = FeatureUnion([("svd", svd), ("select", select)]) + fs.fit(X, y) + X_transformed = fs.transform(X) + assert_equal(X_transformed.shape, (X.shape[0], 3)) + + # check if it does the expected thing + assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X)) + assert_array_equal(X_transformed[:, -1], + select.fit_transform(X, y).ravel()) + + # test if it also works for sparse input + # We use a different svd object to control the random_state stream + fs = FeatureUnion([("svd", svd), ("select", select)]) + X_sp = sparse.csr_matrix(X) + X_sp_transformed = fs.fit_transform(X_sp, y) + assert_array_almost_equal(X_transformed, X_sp_transformed.toarray()) + + # test setting parameters + fs.set_params(select__k=2) + assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4)) + + # test it works with transformers missing fit_transform + fs = FeatureUnion([("mock", TransfT()), ("svd", svd), ("select", select)]) + X_transformed = fs.fit_transform(X, y) + assert_equal(X_transformed.shape, (X.shape[0], 8)) def test_make_union(): -pca = PCA() -mock = TransfT() -fu = make_union(pca, mock) -names, transformers = zip(*fu.transformer_list) -assert_equal(names, ("pca", "transft")) -assert_equal(transformers, (pca, mock)) + pca = PCA() + mock = TransfT() + fu = make_union(pca, mock) + names, transformers = zip(*fu.transformer_list) + assert_equal(names, ("pca", "transft")) + assert_equal(transformers, (pca, mock)) def test_pipeline_transform(): -# Test whether pipeline works with a transformer at the end. -# Also test pipeline.transform and pipeline.inverse_transform -iris = load_iris() -X = iris.data -pca = PCA(n_components=2) -pipeline = Pipeline([('pca', pca)]) + # Test whether pipeline works with a transformer at the end. 
+ # Also test pipeline.transform and pipeline.inverse_transform + iris = load_iris() + X = iris.data + pca = PCA(n_components=2) + pipeline = Pipeline([('pca', pca)]) -# test transform and fit_transform: -X_trans = pipeline.fit(X).transform(X) -X_trans2 = pipeline.fit_transform(X) -X_trans3 = pca.fit_transform(X) -assert_array_almost_equal(X_trans, X_trans2) -assert_array_almost_equal(X_trans, X_trans3) + # test transform and fit_transform: + X_trans = pipeline.fit(X).transform(X) + X_trans2 = pipeline.fit_transform(X) + X_trans3 = pca.fit_transform(X) + assert_array_almost_equal(X_trans, X_trans2) + assert_array_almost_equal(X_trans, X_trans3) -X_back = pipeline.inverse_transform(X_trans) -X_back2 = pca.inverse_transform(X_trans) -assert_array_almost_equal(X_back, X_back2) + X_back = pipeline.inverse_transform(X_trans) + X_back2 = pca.inverse_transform(X_trans) + assert_array_almost_equal(X_back, X_back2) def test_pipeline_fit_transform(): -# Test whether pipeline works with a transformer missing fit_transform -iris = load_iris() -X = iris.data -y = iris.target -transft = TransfT() -pipeline = Pipeline([('mock', transft)]) - -# test fit_transform: -X_trans = pipeline.fit_transform(X, y) -X_trans2 = transft.fit(X, y).transform(X) -assert_array_almost_equal(X_trans, X_trans2) + # Test whether pipeline works with a transformer missing fit_transform + iris = load_iris() + X = iris.data + y = iris.target + transft = TransfT() + pipeline = Pipeline([('mock', transft)]) + + # test fit_transform: + X_trans = pipeline.fit_transform(X, y) + X_trans2 = transft.fit(X, y).transform(X) + assert_array_almost_equal(X_trans, X_trans2) def test_make_pipeline():