From 025c1181de8e5616136e5b4f20c0b3bdac851749 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 9 May 2013 09:35:28 +1000 Subject: [PATCH] ENH store per-transformer index into feature space in FeatureUnion --- doc/modules/pipeline.rst | 8 +++++--- sklearn/pipeline.py | 9 +++++++++ sklearn/tests/test_pipeline.py | 22 ++++++++++++++++++---- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/doc/modules/pipeline.rst b/doc/modules/pipeline.rst index bf142a3a86255..af2eaceee331a 100644 --- a/doc/modules/pipeline.rst +++ b/doc/modules/pipeline.rst @@ -109,9 +109,8 @@ convenience and joint parameter estimation and validation. create complex models. (A :class:`FeatureUnion` has no way of checking whether two transformers -might produce identical features. It only produces a union when the -feature sets are disjoint, and making sure they are is the caller's -responsibility.) +might produce identical features. Making sure the features are disjoint +is the caller's responsibility.) Usage @@ -135,6 +134,9 @@ and ``value`` is an estimator object:: n_components=None, remove_zero_eig=False, tol=0))], transformer_weights=None) +After ``fit_transform`` is called, ``FeatureUnion`` will store a `feature_ptr_` +attribute indicating which slices of the transformed matrix's features +correspond to which constituent transformers. .. topic:: Examples: diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index c01de183bff38..7b1e2bbbcdd6f 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -252,6 +252,13 @@ class FeatureUnion(BaseEstimator, TransformerMixin): Multiplicative weights for features per transformer. Keys are transformer names, values the weights. + Attributes + ---------- + feature_ptr_: array of shape (len(transformers) + 1) + Stores the feature slice corresponding to each transformer. + Transformer `i` generates feature columns `k` where + `feature_ptr_[i] <= k < feature_ptr_[i + 1]`. + Only available if `fit_transform` is used. 
""" def __init__(self, transformer_list, n_jobs=1, transformer_weights=None): self.transformer_list = transformer_list @@ -307,6 +314,8 @@ def fit_transform(self, X, y=None, **fit_params): delayed(_fit_transform_one)(trans, name, X, y, self.transformer_weights, **fit_params) for name, trans in self.transformer_list) + print(len(Xs), len(self.transformer_list), [f.shape for f in Xs]) + self.feature_ptr_ = np.hstack([0, np.cumsum([f.shape[1] for f in Xs])]) if any(sparse.issparse(f) for f in Xs): Xs = sparse.hstack(Xs).tocsr() else: diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index ed2e28530755a..d93da3a6696b6 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -196,25 +196,39 @@ def test_feature_union(): assert_equal(X_transformed.shape, (X.shape[0], 3)) # check if it does the expected thing - assert_array_almost_equal(X_transformed[:, :-1], pca.fit_transform(X)) - assert_array_equal(X_transformed[:, -1], - select.fit_transform(X, y).ravel()) + pca_expected = pca.fit_transform(X) + select_expected = select.fit_transform(X, y) + assert_array_almost_equal(X_transformed[:, :-1], pca_expected) + assert_array_equal(X_transformed[:, -1:], select_expected) + + # use fit_transform as an alternative, providing feature_ptr_ + X_transformed = fs.fit_transform(X, y) + assert_equal(len(fs.feature_ptr_), len(fs.transformer_list) + 1) + pca_slice = slice(*fs.feature_ptr_[0:0 + 2]) + select_slice = slice(*fs.feature_ptr_[1:1 + 2]) + print(fs.feature_ptr_, pca_slice, select_slice) + assert_array_almost_equal(X_transformed[:, pca_slice], pca_expected) + assert_array_equal(X_transformed[:, select_slice], select_expected) # test if it also works for sparse input # We use a different pca object to control the random_state stream fs = FeatureUnion([("pca", pca), ("select", select)]) X_sp = sparse.csr_matrix(X) X_sp_transformed = fs.fit_transform(X_sp, y) - assert_array_almost_equal(X_transformed, X_sp_transformed.toarray()) 
+ assert_array_almost_equal(X_transformed, + X_sp_transformed.tocsr().todense()) # test setting parameters fs.set_params(select__k=2) assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4)) + assert_equal(2, fs.feature_ptr_[2] - fs.feature_ptr_[1]) # test it works with transformers missing fit_transform fs = FeatureUnion([("mock", TransfT()), ("pca", pca), ("select", select)]) X_transformed = fs.fit_transform(X, y) assert_equal(X_transformed.shape, (X.shape[0], 8)) + assert_array_equal(fs.feature_ptr_, [0, 4, 6, 8]) + assert_equal(len(fs.feature_ptr_), len(fs.transformer_list) + 1) def test_pipeline_transform():