From 025c1181de8e5616136e5b4f20c0b3bdac851749 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 9 May 2013 09:35:28 +1000 Subject: [PATCH] ENH store per-transformer index into feature space in FeatureUnion --- doc/modules/pipeline.rst | 8 +++++--- sklearn/pipeline.py | 9 +++++++++ sklearn/tests/test_pipeline.py | 22 ++++++++++++++++++---- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/doc/modules/pipeline.rst b/doc/modules/pipeline.rst index bf142a3a86255..af2eaceee331a 100644 --- a/doc/modules/pipeline.rst +++ b/doc/modules/pipeline.rst @@ -109,9 +109,8 @@ convenience and joint parameter estimation and validation. create complex models. (A :class:`FeatureUnion` has no way of checking whether two transformers -might produce identical features. It only produces a union when the -feature sets are disjoint, and making sure they are is the caller's -responsibility.) +might produce identical features. Making sure the features are disjoint +is the caller's responsibility.) Usage @@ -135,6 +134,9 @@ and ``value`` is an estimator object:: n_components=None, remove_zero_eig=False, tol=0))], transformer_weights=None) +After ``fit_transform`` is called, ``FeatureUnion`` will store a `feature_ptr_` +attribute indicating which slices of the transformed matrix's features +correspond to which constituent transformers. .. topic:: Examples: diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index c01de183bff38..7b1e2bbbcdd6f 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -252,6 +252,13 @@ class FeatureUnion(BaseEstimator, TransformerMixin): Multiplicative weights for features per transformer. Keys are transformer names, values the weights. + Attributes + ---------- + feature_ptr_: array of shape (len(transformers) + 1) + Stores the feature slice corresponding to each transformer. + Transformer `i` generates feature columns `k` where + `feature_ptr_[i] <= k < feature_ptr_[i + 1]`. + Only available if `fit_transform` is used. 
""" def __init__(self, transformer_list, n_jobs=1, transformer_weights=None): self.transformer_list = transformer_list @@ -307,6 +314,8 @@ def fit_transform(self, X, y=None, **fit_params): delayed(_fit_transform_one)(trans, name, X, y, self.transformer_weights, **fit_params) for name, trans in self.transformer_list) + print(len(Xs), len(self.transformer_list), [f.shape for f in Xs]) + self.feature_ptr_ = np.hstack([0, np.cumsum([f.shape[1] for f in Xs])]) if any(sparse.issparse(f) for f in Xs): Xs = sparse.hstack(Xs).tocsr() else: diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index ed2e28530755a..d93da3a6696b6 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -196,25 +196,39 @@ def test_feature_union(): assert_equal(X_transformed.shape, (X.shape[0], 3)) # check if it does the expected thing - assert_array_almost_equal(X_transformed[:, :-1], pca.fit_transform(X)) - assert_array_equal(X_transformed[:, -1], - select.fit_transform(X, y).ravel()) + pca_expected = pca.fit_transform(X) + select_expected = select.fit_transform(X, y) + assert_array_almost_equal(X_transformed[:, :-1], pca_expected) + assert_array_equal(X_transformed[:, -1:], select_expected) + + # use fit_transform as an alternative, providing feature_ptr_ + X_transformed = fs.fit_transform(X, y) + assert_equal(len(fs.feature_ptr_), len(fs.transformer_list) + 1) + pca_slice = slice(*fs.feature_ptr_[0:0 + 2]) + select_slice = slice(*fs.feature_ptr_[1:1 + 2]) + print(fs.feature_ptr_, pca_slice, select_slice) + assert_array_almost_equal(X_transformed[:, pca_slice], pca_expected) + assert_array_equal(X_transformed[:, select_slice], select_expected) # test if it also works for sparse input # We use a different pca object to control the random_state stream fs = FeatureUnion([("pca", pca), ("select", select)]) X_sp = sparse.csr_matrix(X) X_sp_transformed = fs.fit_transform(X_sp, y) - assert_array_almost_equal(X_transformed, X_sp_transformed.toarray()) 
+ assert_array_almost_equal(X_transformed, + X_sp_transformed.tocsr().todense()) # test setting parameters fs.set_params(select__k=2) assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4)) + assert_equal(2, fs.feature_ptr_[2] - fs.feature_ptr_[1]) # test it works with transformers missing fit_transform fs = FeatureUnion([("mock", TransfT()), ("pca", pca), ("select", select)]) X_transformed = fs.fit_transform(X, y) assert_equal(X_transformed.shape, (X.shape[0], 8)) + assert_array_equal(fs.feature_ptr_, [0, 4, 6, 8]) + assert_equal(len(fs.feature_ptr_), len(fs.transformer_list) + 1) def test_pipeline_transform():