scikit-learn · jnothman · May 29, 2018 · Jun 5, 2015 · Jun 6, 2017 · Jun 6, 2017
diff --git a/doc/conftest.py b/doc/conftest.py
@@ -55,6 +55,13 @@ def setup_working_with_text_data():
     check_skip_network()
 
 
+def setup_compose():
+    try:
+        import pandas  # noqa
+    except ImportError:
+        raise SkipTest("Skipping compose.rst, pandas not installed")
+
+
 def pytest_runtest_setup(item):
     fname = item.fspath.strpath
     if fname.endswith('datasets/labeled_faces.rst'):
@@ -67,6 +74,8 @@ def pytest_runtest_setup(item):
         setup_twenty_newsgroups()
     elif fname.endswith('tutorial/text_analytics/working_with_text_data.rst'):
         setup_working_with_text_data()
+    elif fname.endswith('modules/compose.rst'):
+        setup_compose()
 
 
 def pytest_runtest_teardown(item):

diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
@@ -158,8 +158,15 @@ details.
     :toctree: generated
     :template: class.rst
 
+    compose.ColumnTransformer
     compose.TransformedTargetRegressor
 
+.. autosummary::
+   :toctree: generated/
+   :template: function.rst
+
+   compose.make_column_transformer
+
 .. _covariance_ref:
 
 :mod:`sklearn.covariance`: Covariance Estimators
@@ -1461,6 +1468,7 @@ Low-level methods
    utils.testing.assert_raise_message
    utils.testing.all_estimators
 
+
 Recently deprecated
 ===================
 

diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst
@@ -304,9 +304,13 @@ FeatureUnion: composite feature spaces
 :class:`FeatureUnion` combines several transformer objects into a new
 transformer that combines their output. A :class:`FeatureUnion` takes
 a list of transformer objects. During fitting, each of these
-is fit to the data independently. For transforming data, the
-transformers are applied in parallel, and the sample vectors they output
-are concatenated end-to-end into larger vectors.
+is fit to the data independently. The transformers are applied in parallel,
+and the feature matrices they output are concatenated side-by-side into a
+larger matrix.
+
+When you want to apply different transformations to each field of the data,
+see the related class :class:`sklearn.compose.ColumnTransformer`
+(see :ref:`user guide <column_transformer>`).
 
 :class:`FeatureUnion` serves the same purposes as :class:`Pipeline` -
 convenience and joint parameter estimation and validation.
@@ -357,4 +361,102 @@ and ignored by setting to ``None``::
 .. topic:: Examples:
 
  * :ref:`sphx_glr_auto_examples_plot_feature_stacker.py`
- * :ref:`sphx_glr_auto_examples_hetero_feature_union.py`
+
+
+.. _column_transformer:
+
+ColumnTransformer for heterogeneous data
+========================================
+
+.. warning::
+
+    The :class:`compose.ColumnTransformer <sklearn.compose.ColumnTransformer>`
+    class is experimental and the API is subject to change.
+
+Many datasets contain features of different types, say text, floats, and dates,
+where each type of feature requires separate preprocessing or feature
+extraction steps.  Often it is easiest to preprocess data before applying
+scikit-learn methods, for example using `pandas <http://pandas.pydata.org/>`__.
+Processing your data before passing it to scikit-learn might be problematic for
+one of the following reasons:
+
+1. Incorporating statistics from test data into the preprocessors makes
+   cross-validation scores unreliable (known as *data leakage*),
+   for example in the case of scalers or imputing missing values.
+2. You may want to include the parameters of the preprocessors in a
+   :ref:`parameter search <grid_search>`.
+
+The :class:`~sklearn.compose.ColumnTransformer` helps perform different
+transformations for different columns of the data, within a
+:class:`~sklearn.pipeline.Pipeline` that is safe from data leakage and that can
+be parametrized. :class:`~sklearn.compose.ColumnTransformer` works on
+arrays, sparse matrices, and
+`pandas DataFrames <http://pandas.pydata.org/pandas-docs/stable/>`__.
+
+To each column, a different transformation can be applied, such as
+preprocessing or a specific feature extraction method::
+
+  >>> import pandas as pd
+  >>> X = pd.DataFrame(
+  ...     {'city': ['London', 'London', 'Paris', 'Sallisaw'],
+  ...      'title': ["His Last Bow", "How Watson Learned the Trick",
+  ...                "A Moveable Feast", "The Grapes of Wrath"]})
+
+For this data, we might want to encode the ``'city'`` column as a categorical
+variable, but apply a :class:`feature_extraction.text.CountVectorizer
+<sklearn.feature_extraction.text.CountVectorizer>` to the ``'title'`` column.
+As we might use multiple feature extraction methods on the same column, we give
+each transformer a unique name, say ``'city_category'`` and ``'title_bow'``::
+
+  >>> from sklearn.compose import ColumnTransformer
+  >>> from sklearn.feature_extraction.text import CountVectorizer
+  >>> column_trans = ColumnTransformer(
+  ...     [('city_category', CountVectorizer(analyzer=lambda x: [x]), 'city'),
+  ...      ('title_bow', CountVectorizer(), 'title')])
+
+  >>> column_trans.fit(X) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
+  ColumnTransformer(n_jobs=1, remainder='passthrough', transformer_weights=None,
+      transformers=...)
+
+  >>> column_trans.get_feature_names()
+  ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
+  ['city_category__London', 'city_category__Paris', 'city_category__Sallisaw',
+  'title_bow__bow', 'title_bow__feast', 'title_bow__grapes', 'title_bow__his',
+  'title_bow__how', 'title_bow__last', 'title_bow__learned', 'title_bow__moveable',
+  'title_bow__of', 'title_bow__the', 'title_bow__trick', 'title_bow__watson',
+  'title_bow__wrath']
+
+  >>> column_trans.transform(X).toarray()
+  ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
+  array([[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
+         [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0],
+         [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
+         [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1]]...)
+
+In the above example, the
+:class:`~sklearn.feature_extraction.text.CountVectorizer` expects a 1D array as
+input and therefore the columns were specified as a string (``'city'``).
+However, other transformers generally expect 2D data, and in that case you need
+to specify the column as a list of strings (``['city']``).
+
+Apart from a scalar or a single item list, the column selection can be specified
+as a list of multiple items, an integer array, a slice, or a boolean mask.
+Strings can reference columns if the input is a DataFrame; integers are always
+interpreted as positional columns.
+
+The :func:`~sklearn.compose.make_column_transformer` function is available
+to more easily create a :class:`~sklearn.compose.ColumnTransformer` object.
+Specifically, the names will be given automatically. The equivalent for the
+above example would be::
+
+  >>> from sklearn.compose import make_column_transformer
+  >>> column_trans = make_column_transformer(
+  ...     ('city', CountVectorizer(analyzer=lambda x: [x])),
+  ...     ('title', CountVectorizer()))
+  >>> column_trans # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
+  ColumnTransformer(n_jobs=1, remainder='passthrough', transformer_weights=None,
+           transformers=[('countvectorizer-1', ...)
+
+.. topic:: Examples:
+
+ * :ref:`sphx_glr_auto_examples_column_transformer.py`
diff --git a/doc/modules/feature_extraction.rst b/doc/modules/feature_extraction.rst
@@ -916,7 +916,7 @@ Some tips and tricks:
     (Note that this will not filter out punctuation.)
 
 
-    The following example will, for instance, transform some British spelling 
+    The following example will, for instance, transform some British spelling
     to American spelling::
 
         >>> import re

diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst
@@ -74,6 +74,10 @@ Preprocessing
   the maximum value in the features. :issue:`9151` by
   :user:`Vighnesh Birodkar <vighneshbirodkar>` and `Joris Van den Bossche`_.
 
+- Added :class:`compose.ColumnTransformer`, which allows applying
+  different transformers to different columns of arrays or pandas
+  DataFrames. By `Andreas Müller`_ and `Joris Van den Bossche`_.
+
 - Added :class:`preprocessing.PowerTransformer`, which implements the Box-Cox
   power transformation, allowing users to map data from any distribution to a
   Gaussian distribution. This is useful as a variance-stabilizing transformation

diff --git a/examples/hetero_feature_union.py → examples/column_transformer.py b/examples/hetero_feature_union.py → examples/column_transformer.py
@@ -1,7 +1,7 @@
 """
-=============================================
-Feature Union with Heterogeneous Data Sources
-=============================================
+==================================================
+Column Transformer with Heterogeneous Data Sources
+==================================================
 
 Datasets can often contain components of that require different feature
 extraction and processing pipelines.  This scenario might occur when:
@@ -12,12 +12,12 @@
    require different processing pipelines.
 
 This example demonstrates how to use
-:class:`sklearn.feature_extraction.FeatureUnion` on a dataset containing
+:class:`sklearn.compose.ColumnTransformer` on a dataset containing
 different types of features.  We use the 20-newsgroups dataset and compute
 standard bag-of-words features for the subject line and body in separate
 pipelines as well as ad hoc features on the body. We combine them (with
-weights) using a FeatureUnion and finally train a classifier on the combined
-set of features.
+weights) using a ColumnTransformer and finally train a classifier on the
+combined set of features.
 
 The choice of features is not particularly helpful, but serves to illustrate
 the technique.
@@ -38,50 +38,11 @@
 from sklearn.feature_extraction import DictVectorizer
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics import classification_report
-from sklearn.pipeline import FeatureUnion
 from sklearn.pipeline import Pipeline
+from sklearn.compose import ColumnTransformer
 from sklearn.svm import SVC
 
 
-class ItemSelector(BaseEstimator, TransformerMixin):
-    """For data grouped by feature, select subset of data at a provided key.
-
-    The data is expected to be stored in a 2D data structure, where the first
-    index is over features and the second is over samples.  i.e.
-
-    >> len(data[key]) == n_samples
-
-    Please note that this is the opposite convention to scikit-learn feature
-    matrixes (where the first index corresponds to sample).
-
-    ItemSelector only requires that the collection implement getitem
-    (data[key]).  Examples include: a dict of lists, 2D numpy array, Pandas
-    DataFrame, numpy record array, etc.
-
-    >> data = {'a': [1, 5, 2, 5, 2, 8],
-               'b': [9, 4, 1, 4, 1, 3]}
-    >> ds = ItemSelector(key='a')
-    >> data['a'] == ds.transform(data)
-
-    ItemSelector is not designed to handle data grouped by sample.  (e.g. a
-    list of dicts).  If your data is structured this way, consider a
-    transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.
-
-    Parameters
-    ----------
-    key : hashable, required
-        The key corresponding to the desired value in a mappable.
-    """
-    def __init__(self, key):
-        self.key = key
-
-    def fit(self, x, y=None):
-        return self
-
-    def transform(self, data_dict):
-        return data_dict[self.key]
-
-
 class TextStats(BaseEstimator, TransformerMixin):
     """Extract features from each document for DictVectorizer"""
 
@@ -104,21 +65,22 @@ def fit(self, x, y=None):
         return self
 
     def transform(self, posts):
-        features = np.recarray(shape=(len(posts),),
-                               dtype=[('subject', object), ('body', object)])
+        # construct object dtype array with two columns
+        # first column = 'subject' and second column = 'body'
+        features = np.empty(shape=(len(posts), 2), dtype=object)
         for i, text in enumerate(posts):
             headers, _, bod = text.partition('\n\n')
             bod = strip_newsgroup_footer(bod)
             bod = strip_newsgroup_quoting(bod)
-            features['body'][i] = bod
+            features[i, 1] = bod
 
             prefix = 'Subject:'
             sub = ''
             for line in headers.split('\n'):
                 if line.startswith(prefix):
                     sub = line[len(prefix):]
                     break
-            features['subject'][i] = sub
+            features[i, 0] = sub
 
         return features
 
@@ -127,38 +89,31 @@ def transform(self, posts):
     # Extract the subject & body
     ('subjectbody', SubjectBodyExtractor()),
 
-    # Use FeatureUnion to combine the features from subject and body
-    ('union', FeatureUnion(
-        transformer_list=[
+    # Use ColumnTransformer to combine the features from subject and body
+    ('union', ColumnTransformer(
+        [
+            # Pulling features from the post's subject line (first column)
+            ('subject', TfidfVectorizer(min_df=50), 0),
 
-            # Pipeline for pulling features from the post's subject line
-            ('subject', Pipeline([
-                ('selector', ItemSelector(key='subject')),
-                ('tfidf', TfidfVectorizer(min_df=50)),
-            ])),
-
-            # Pipeline for standard bag-of-words model for body
+            # Pipeline for standard bag-of-words model for body (second column)
             ('body_bow', Pipeline([
-                ('selector', ItemSelector(key='body')),
                 ('tfidf', TfidfVectorizer()),
                 ('best', TruncatedSVD(n_components=50)),
-            ])),
+            ]), 1),
 
             # Pipeline for pulling ad hoc features from post's body
             ('body_stats', Pipeline([
-                ('selector', ItemSelector(key='body')),
                 ('stats', TextStats()),  # returns a list of dicts
                 ('vect', DictVectorizer()),  # list of dicts -> feature matrix
-            ])),
-
+            ]), 1),
         ],
 
-        # weight components in FeatureUnion
+        # weight components in ColumnTransformer
         transformer_weights={
             'subject': 0.8,
             'body_bow': 0.5,
             'body_stats': 1.0,
-        },
+        }
     )),
 
     # Use a SVC classifier on the combined features

diff --git a/sklearn/compose/__init__.py b/sklearn/compose/__init__.py
@@ -5,8 +5,12 @@
 
 """
 
+from ._column_transformer import ColumnTransformer, make_column_transformer
 from ._target import TransformedTargetRegressor
 
+
 __all__ = [
+    'ColumnTransformer',
+    'make_column_transformer',
     'TransformedTargetRegressor',
 ]