diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index 0145842b88e16..c7e4afe5f359a 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -45,6 +45,9 @@ The last estimator may be any type (transformer, classifier, etc.). Usage ----- +Construction +............ + The :class:`Pipeline` is built using a list of ``(key, value)`` pairs, where the ``key`` is a string containing the name you want to give this step and ``value`` is an estimator object:: @@ -74,17 +77,41 @@ filling in the names automatically:: class_prior=None, fit_prior=True))]) -The estimators of a pipeline are stored as a list in the ``steps`` attribute:: +Accessing steps +............... + +The estimators of a pipeline are stored as a list in the ``steps`` attribute, +but can be accessed by index or name by indexing (with ``[idx]``) the +Pipeline:: >>> pipe.steps[0] # doctest: +NORMALIZE_WHITESPACE - ('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None, - svd_solver='auto', tol=0.0, whiten=False)) + ('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=None, + random_state=None, svd_solver='auto', tol=0.0, + whiten=False)) + >>> pipe[0] # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + PCA(copy=True, iterated_power='auto', n_components=None, random_state=None, + svd_solver='auto', tol=0.0, whiten=False) + >>> pipe['reduce_dim'] # doctest: +NORMALIZE_WHITESPACE + PCA(copy=True, ...) 
-and as a ``dict`` in ``named_steps``:: +Pipeline's `named_steps` attribute allows accessing steps by name with tab +completion in interactive environments:: - >>> pipe.named_steps['reduce_dim'] # doctest: +NORMALIZE_WHITESPACE - PCA(copy=True, iterated_power='auto', n_components=None, random_state=None, - svd_solver='auto', tol=0.0, whiten=False) + >>> pipe.named_steps.reduce_dim is pipe['reduce_dim'] + True + +A sub-pipeline can also be extracted using the slicing notation commonly used +for Python Sequences such as lists or strings (although only a step of 1 is +permitted). This is convenient for performing only some of the transformations +(or their inverse): + + >>> pipe[:1] # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + Pipeline(memory=None, steps=[('reduce_dim', PCA(copy=True, ...))]) + >>> pipe[-1:] # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + Pipeline(memory=None, steps=[('clf', SVC(C=1.0, ...))]) + +Nested parameters +................. Parameters of the estimators in the pipeline can be accessed using the ``__`` syntax:: @@ -94,11 +121,6 @@ Parameters of the estimators in the pipeline can be accessed using the steps=[('reduce_dim', PCA(copy=True, iterated_power='auto',...)), ('clf', SVC(C=10, cache_size=200, class_weight=None,...))]) -Attributes of named_steps map to keys, enabling tab completion in interactive environments:: - - >>> pipe.named_steps.reduce_dim is pipe.named_steps['reduce_dim'] - True - This is particularly important for doing grid searches:: >>> from sklearn.model_selection import GridSearchCV @@ -115,6 +137,16 @@ ignored by setting them to ``'passthrough'``:: ... clf__C=[0.1, 10, 100]) >>> grid_search = GridSearchCV(pipe, param_grid=param_grid) +The estimators of the pipeline can be retrieved by index: + + >>> pipe[0] # doctest: +ELLIPSIS + PCA(copy=True, ...) + +or by name:: + + >>> pipe['reduce_dim'] # doctest: +ELLIPSIS + PCA(copy=True, ...) + .. 
topic:: Examples: * :ref:`sphx_glr_auto_examples_feature_selection_plot_feature_selection_pipeline.py` diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 4cbb42c569e1b..1aa738f9b8a1c 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -298,6 +298,12 @@ Support for Python 3.4 and below has been officially dropped. :mod:`sklearn.pipeline` ....................... +- |Feature| :class:`pipeline.Pipeline` can now use indexing notation (e.g. + ``my_pipeline[0:-1]``) to extract a subsequence of steps as another Pipeline + instance. A Pipeline can also be indexed directly to extract a particular + step (e.g. ``my_pipeline['svc']``), rather than accessing ``named_steps``. + :issue:`2568` by `Joel Nothman`_. + - |API| :class:`pipeline.Pipeline` now supports using ``'passthrough'`` as a transformer. :issue:`11144` by :user:`Thomas Fan <thomasjpfan>`. diff --git a/examples/feature_selection/plot_feature_selection_pipeline.py b/examples/feature_selection/plot_feature_selection_pipeline.py index c4b61990ef6e5..47d4fb82e46ee 100644 --- a/examples/feature_selection/plot_feature_selection_pipeline.py +++ b/examples/feature_selection/plot_feature_selection_pipeline.py @@ -4,7 +4,10 @@ ================== Simple usage of Pipeline that runs successively a univariate -feature selection with anova and then a C-SVM of the selected features. +feature selection with anova and then an SVM of the selected features. + +Using a sub-pipeline, the fitted coefficients can be mapped back into +the original feature space. 
""" from sklearn import svm from sklearn.datasets import samples_generator @@ -26,9 +29,12 @@ # 1) anova filter, take 3 best ranked features anova_filter = SelectKBest(f_regression, k=3) # 2) svm -clf = svm.SVC(kernel='linear') +clf = svm.LinearSVC() anova_svm = make_pipeline(anova_filter, clf) anova_svm.fit(X_train, y_train) y_pred = anova_svm.predict(X_test) print(classification_report(y_test, y_pred)) + +coef = anova_svm[:-1].inverse_transform(anova_svm['linearsvc'].coef_) +print(coef) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index eeba9857205af..7eaf9a46f09e9 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -99,17 +99,28 @@ class Pipeline(_BaseComposition): >>> anova_svm.score(X, y) # doctest: +ELLIPSIS 0.83 >>> # getting the selected features chosen by anova_filter - >>> anova_svm.named_steps['anova'].get_support() + >>> anova_svm['anova'].get_support() ... # doctest: +NORMALIZE_WHITESPACE - array([False, False, True, True, False, False, True, True, False, - True, False, True, True, False, True, False, True, True, + array([False, False, True, True, False, False, True, True, False, + True, False, True, True, False, True, False, True, True, False, False]) >>> # Another way to get selected features chosen by anova_filter >>> anova_svm.named_steps.anova.get_support() ... # doctest: +NORMALIZE_WHITESPACE - array([False, False, True, True, False, False, True, True, False, - True, False, True, True, False, True, False, True, True, + array([False, False, True, True, False, False, True, True, False, + True, False, True, True, False, True, False, True, True, False, False]) + >>> # Indexing can also be used to extract a sub-pipeline. 
+ >>> sub_pipeline = anova_svm[:1] + >>> sub_pipeline # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + Pipeline(memory=None, steps=[('anova', ...)]) + >>> coef = anova_svm[-1].coef_ + >>> anova_svm['svc'] is anova_svm[-1] + True + >>> coef.shape + (1, 10) + >>> sub_pipeline.inverse_transform(coef).shape + (1, 20) """ # BaseEstimator interface @@ -188,6 +199,26 @@ def _iter(self, with_final=True): if trans is not None and trans != 'passthrough': yield idx, name, trans + def __getitem__(self, ind): + """Returns a sub-pipeline or a single estimator in the pipeline + + Indexing with an integer will return an estimator; using a slice + returns another Pipeline instance which copies a slice of this + Pipeline. This copy is shallow: modifying (or fitting) estimators in + the sub-pipeline will affect the larger pipeline and vice-versa. + However, replacing a value in `steps` will not affect a copy. + """ + if isinstance(ind, slice): + if ind.step not in (1, None): + raise ValueError('Pipeline slicing only supports a step of 1') + return self.__class__(self.steps[ind]) + try: + name, est = self.steps[ind] + except TypeError: + # Not an int, try to get step by name + return self.named_steps[ind] + return est + @property def _estimator_type(self): return self.steps[-1][1]._estimator_type diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 259876acd1a42..8d6fe8f70374e 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -529,6 +529,29 @@ def test_pipeline_fit_transform(): assert_array_almost_equal(X_trans, X_trans2) +def test_pipeline_slice(): + pipe = Pipeline([('transf1', Transf()), + ('transf2', Transf()), + ('clf', FitParamT())]) + pipe2 = pipe[:-1] + assert isinstance(pipe2, Pipeline) + assert pipe2.steps == pipe.steps[:-1] + assert 2 == len(pipe2.named_steps) + assert_raises(ValueError, lambda: pipe[::-1]) + + +def test_pipeline_index(): + transf = Transf() + clf = FitParamT() + pipe = Pipeline([('transf', transf), 
('clf', clf)]) + assert pipe[0] == transf + assert pipe['transf'] == transf + assert pipe[-1] == clf + assert pipe['clf'] == clf + assert_raises(IndexError, lambda: pipe[3]) + assert_raises(KeyError, lambda: pipe['foobar']) + + def test_set_pipeline_steps(): transf1 = Transf() transf2 = Transf()