Commit de70e29

Author: Guillaume Lemaitre
Commit message: Fix the different tests
Parent: 3338171

File tree: 3 files changed, +39 -55 lines


doc/modules/pipeline.rst

Lines changed: 0 additions & 42 deletions
@@ -166,48 +166,6 @@ object::
     >>> # Clear the cache directory when you don't need it anymore
     >>> rmtree(cachedir)
 
-.. warning:: **Side effect of caching transfomers**
-
-   Using a :class:`Pipeline` without cache enabled, it is possible to
-   inspect the original instance such as::
-
-     >>> from sklearn.datasets import load_digits
-     >>> digits = load_digits()
-     >>> pca1 = PCA()
-     >>> svm1 = SVC()
-     >>> pipe = Pipeline([('reduce_dim', pca1), ('clf', svm1)])
-     >>> pipe.fit(digits.data, digits.target)
-     ... # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
-     Pipeline(memory=None,
-              steps=[('reduce_dim', PCA(...)), ('clf', SVC(...))])
-     >>> # The pca instance can be inspected directly
-     >>> print(pca1.components_) # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
-     [[ -1.77484909e-19 ... 4.07058917e-18]]
-
-   Enabling caching triggers a clone of the transformers before fitting.
-   Therefore, the transformer instance given to the pipeline cannot be
-   inspected directly.
-   In following example, accessing the :class:`PCA` instance ``pca2``
-   will raise an ``AttributeError`` since ``pca2`` will be an unfitted
-   transformer.
-   Instead, use the attribute ``named_steps_`` to inspect estimators within
-   the pipeline::
-
-     >>> cachedir = mkdtemp()
-     >>> pca2 = PCA()
-     >>> svm2 = SVC()
-     >>> cached_pipe = Pipeline([('reduce_dim', pca2), ('clf', svm2)],
-     ...                        memory=cachedir)
-     >>> cached_pipe.fit(digits.data, digits.target)
-     ... # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
-     Pipeline(memory=...,
-              steps=[('reduce_dim', PCA(...)), ('clf', SVC(...))])
-     >>> print(cached_pipe.named_steps_['reduce_dim'].components_)
-     ... # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
-     [[ -1.77484909e-19 ... 4.07058917e-18]]
-     >>> # Remove the cache directory
-     >>> rmtree(cachedir)
-
 .. topic:: Examples:
 
   * :ref:`sphx_glr_auto_examples_plot_compare_reduction.py`
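
The warning deleted above documented why a cached :class:`Pipeline` leaves the original transformer instance unfitted: caching memoizes the fit of a *clone*, so only the clone acquires fitted attributes. Below is a minimal standalone sketch of that mechanism, not the pipeline internals, using plain ``joblib.Memory`` (the branch reaches joblib through ``sklearn.externals.joblib``) and a made-up ``_fit_one`` helper.

# Sketch only: illustrates why caching leaves the original estimator unfitted.
# ``_fit_one`` is a hypothetical helper, not part of scikit-learn.
from shutil import rmtree
from tempfile import mkdtemp

import numpy as np
from joblib import Memory
from sklearn.base import clone
from sklearn.decomposition import PCA

cachedir = mkdtemp()
memory = Memory(location=cachedir, verbose=0)


def _fit_one(transformer, X):
    # The memoized unit of work: fit a transformer and return it.
    return transformer.fit(X)


cached_fit = memory.cache(_fit_one)

X = np.random.RandomState(0).rand(30, 5)
pca = PCA(n_components=2)
fitted_pca = cached_fit(clone(pca), X)    # the clone gets fitted, not ``pca``

print(hasattr(pca, 'components_'))        # False: the original stays untouched
print(hasattr(fitted_pca, 'components_')) # True: inspect the fitted copy instead

rmtree(cachedir)                          # clear the cache directory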

sklearn/pipeline.py

Lines changed: 5 additions & 1 deletion
@@ -381,7 +381,11 @@ def fit_transform(self, X, y=None, **fit_params):
         """
         last_step = self._final_estimator
         Xt, fit_params = self._fit(X, y, **fit_params)
-        if last_step is None:
+        if hasattr(last_step, 'fit_transform'):
+            Xt = last_step.fit_transform(Xt, y, **fit_params)
+            self.steps_[-1] = (self.steps_[-1][0], last_step)
+            return Xt
+        elif last_step is None:
             return Xt
         else:
             fitted_transformer = last_step.fit(Xt, y, **fit_params)
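
The new first branch lets ``fit_transform`` use the final estimator's own ``fit_transform`` when it has one, while a ``None`` last step still passes the data through and everything else falls back to ``fit``. A simplified, hypothetical sketch of that dispatch (not the pipeline method itself, which also records the fitted step in ``steps_``):

# Hypothetical helper mirroring the three branches added above; it omits the
# bookkeeping the real method does on ``self.steps_``.
def fit_transform_last_step(last_step, Xt, y=None, **fit_params):
    if hasattr(last_step, 'fit_transform'):
        # Prefer the estimator's own fit_transform when available.
        return last_step.fit_transform(Xt, y, **fit_params)
    elif last_step is None:
        # A None step is a no-op: return the data unchanged.
        return Xt
    else:
        # Otherwise fit first, then transform with the fitted estimator.
        return last_step.fit(Xt, y, **fit_params).transform(Xt)


if __name__ == '__main__':
    import numpy as np
    from sklearn.preprocessing import StandardScaler

    X = np.arange(6, dtype=float).reshape(3, 2)
    print(fit_transform_last_step(StandardScaler(), X))  # uses fit_transform
    print(fit_transform_last_step(None, X))               # passthrough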

sklearn/tests/test_pipeline.py

Lines changed: 34 additions & 12 deletions
@@ -18,7 +18,6 @@
 from sklearn.utils.testing import assert_true
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_array_almost_equal
-from sklearn.utils.testing import assert_dict_equal
 
 from sklearn.base import clone, BaseEstimator
 from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
@@ -32,6 +31,8 @@
 from sklearn.preprocessing import StandardScaler
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.externals.joblib import Memory
+from sklearn.externals.joblib import hash
+from sklearn.utils.validation import check_is_fitted
 
 
 JUNK_FOOD_DOCS = (
@@ -537,17 +538,22 @@ def make():
     assert_array_equal([[exp]], pipeline.fit_transform(X, y))
     assert_array_equal([exp], pipeline.fit(X).predict(X))
     assert_array_equal(X, pipeline.inverse_transform([[exp]]))
-    print(pipeline.get_params(deep=True))
-    assert_dict_equal(pipeline.get_params(deep=True),
-                      {'last': Mult(mult=5),
-                       'memory': None,
-                       'last__mult': 5,
-                       'steps': [('m2', Mult(mult=2)),
-                                 ('m3', None),
-                                 ('last', Mult(mult=5))],
-                       'm2__mult': 2,
-                       'm3': None,
-                       'm2': Mult(mult=2)})
+
+    pipeline_params = pipeline.get_params(deep=True)
+    pipeline_params2 = {'steps': pipeline.steps,
+                        'm2': mult2,
+                        'm3': None,
+                        'last': mult5,
+                        'memory': None,
+                        'm2__mult': 2,
+                        'last__mult': 5}
+    # check if the keys are the same
+    assert_equal(sorted(pipeline_params.keys()),
+                 sorted(pipeline_params2.keys()))
+    # check if the arrays are the same using joblib.hash
+    for k in pipeline_params.keys():
+        assert_equal(hash(pipeline_params[k]),
+                     hash(pipeline_params2[k]))
 
     pipeline.set_params(m2=None)
     exp = 5
@@ -621,6 +627,22 @@ def test_pipeline_ducktyping():
     assert_false(hasattr(pipeline, 'inverse_transform'))
 
 
+def test_pipeline_steps():
+    iris = load_iris()
+    X = iris.data
+    y = iris.target
+    clf = SVC(probability=True, random_state=0)
+    pca = PCA(svd_solver='full', n_components='mle', whiten=True)
+    pipe = Pipeline([('pca', pca), ('svc', clf)])
+    pipe.fit(X, y)
+
+    # check that _steps was not change after fitting
+    assert_equal(pca, pipe._steps[0][1])
+    assert_equal(clf, pipe._steps[1][1])
+    # check that the estimators have been fitted in steps_
+    check_is_fitted(pipe.named_steps_['pca'], 'n_components_')
+    check_is_fitted(pipe.named_steps_['svc'], 'support_vectors_')
+
 def test_make_pipeline():
     t1 = Transf()
     t2 = Transf()
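
The rewritten assertions compare ``get_params(deep=True)`` key by key with ``joblib.hash`` instead of ``assert_dict_equal``; per the test's own comment this is to cope with array-like values, for which ``==`` is element-wise and a plain dict comparison fails. A small standalone sketch of the pattern, using made-up dictionaries and plain ``joblib`` (the test reaches the same helper through ``sklearn.externals.joblib``):

import numpy as np
from joblib import hash  # deliberately shadows the builtin, as in the test

params_a = {'alpha': 0.5, 'coef': np.array([1.0, 2.0, 3.0])}
params_b = {'alpha': 0.5, 'coef': np.array([1.0, 2.0, 3.0])}

# ``params_a == params_b`` would raise: comparing the arrays with ``==`` is
# element-wise and its truth value is ambiguous. Comparing sorted keys plus a
# per-value joblib hash works for arrays and estimators alike.
assert sorted(params_a) == sorted(params_b)
for key in params_a:
    assert hash(params_a[key]) == hash(params_b[key])

In the new ``test_pipeline_steps``, ``check_is_fitted`` is the ``sklearn.utils.validation`` helper that raises ``NotFittedError`` when the named fitted attribute (here ``n_components_`` or ``support_vectors_``) is absent.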

0 commit comments
