scikit-learn · thomasjpfan · May 19, 2021 · May 19, 2021 · May 19, 2021 · May 19, 2021
diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
@@ -191,6 +191,10 @@ Changelog
 :mod:`sklearn.compose`
 ......................
 
+- |Enhancement| :meth:`compose.ColumnTransformer.transform` returns a pandas
+  DataFrame when every transformer outputs a DataFrame from :meth:`transform`
+  and their indexes are equal. :pr:`20110` by `Thomas Fan`_.
+
 - |Enhancement| :class:`compose.ColumnTransformer` now records the output
   of each transformer in `output_indices_`. :pr:`18393` by
   :user:`Luca Bittarello <lbittarello>`.

diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
@@ -7,6 +7,7 @@
 #         Joris Van den Bossche
 # License: BSD
 from itertools import chain
+import warnings
 
 import numpy as np
 from scipy import sparse
@@ -653,10 +654,32 @@ def _hstack(self, Xs):
         This allows subclasses to control the stacking behavior, while reusing
         everything else from ColumnTransformer.
 
+        If Xs are all DataFrames and their indexes are the same then a
+        DataFrame is returned.
+
         Parameters
         ----------
         Xs : list of {array-like, sparse matrix, dataframe}
         """
+        # Keep a DataFrame result only when every transformer returned one.
+        if all(hasattr(X, "iloc") for X in Xs):
+            if any(not Xs[0].index.equals(X.index) for X in Xs[1:]):
+                warnings.warn(
+                    "The DataFrame's indexes for each transformer do not "
+                    "match. A numpy array or a sparse matrix will be "
+                    "returned."
+                )
+                return self._hstack_np(Xs)
+
+            import pandas as pd
+
+            # Indexes are equal here, so concat joins the columns side by side.
+            return pd.concat(Xs, axis=1)
+
+        return self._hstack_np(Xs)
+
+    def _hstack_np(self, Xs):
+        """Stack Xs horizontally into a numpy array or sparse matrix."""
         if self.sparse_output_:
             try:
                 # since all columns should be numeric before stacking them

diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py
@@ -12,8 +12,10 @@
 from sklearn.utils._testing import assert_array_equal
 from sklearn.utils._testing import assert_allclose_dense_sparse
 from sklearn.utils._testing import assert_almost_equal
+from sklearn.utils import _safe_indexing
 
 from sklearn.base import BaseEstimator
+from sklearn.base import TransformerMixin
 from sklearn.compose import (
     ColumnTransformer,
     make_column_transformer,
@@ -374,9 +376,18 @@ def test_column_transformer_output_indices_df():
         "trans2": slice(1, 2),
         "remainder": slice(0, 0),
     }
-    assert_array_equal(X_trans[:, [0]], X_trans[:, ct.output_indices_["trans1"]])
-    assert_array_equal(X_trans[:, [1]], X_trans[:, ct.output_indices_["trans2"]])
-    assert_array_equal(X_trans[:, []], X_trans[:, ct.output_indices_["remainder"]])
+    assert_array_equal(
+        _safe_indexing(X_trans, [0]),
+        _safe_indexing(X_trans, ct.output_indices_["trans1"]),
+    )
+    assert_array_equal(
+        _safe_indexing(X_trans, [1]),
+        _safe_indexing(X_trans, ct.output_indices_["trans2"]),
+    )
+    assert_array_equal(
+        _safe_indexing(X_trans, []),
+        _safe_indexing(X_trans, ct.output_indices_["remainder"]),
+    )
 
     ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", Trans(), [1])])
     X_trans = ct.fit_transform(X_df)
@@ -385,9 +396,18 @@ def test_column_transformer_output_indices_df():
         "trans2": slice(1, 2),
         "remainder": slice(0, 0),
     }
-    assert_array_equal(X_trans[:, [0]], X_trans[:, ct.output_indices_["trans1"]])
-    assert_array_equal(X_trans[:, [1]], X_trans[:, ct.output_indices_["trans2"]])
-    assert_array_equal(X_trans[:, []], X_trans[:, ct.output_indices_["remainder"]])
+    assert_array_equal(
+        _safe_indexing(X_trans, [0]),
+        _safe_indexing(X_trans, ct.output_indices_["trans1"]),
+    )
+    assert_array_equal(
+        _safe_indexing(X_trans, [1]),
+        _safe_indexing(X_trans, ct.output_indices_["trans2"]),
+    )
+    assert_array_equal(
+        _safe_indexing(X_trans, []),
+        _safe_indexing(X_trans, ct.output_indices_["remainder"]),
+    )
 
 
 def test_column_transformer_sparse_array():
@@ -1595,3 +1615,103 @@ def test_get_feature_names_empty_selection(selector):
     ct = ColumnTransformer([("ohe", OneHotEncoder(drop="first"), selector)])
     ct.fit([[1, 2], [3, 4]])
     assert ct.get_feature_names() == []
+
+
+class DataFrameTransformer(TransformerMixin, BaseEstimator):
+    """Test helper: echo the input frame with renamed columns and a new index."""
+
+    def __init__(
+        self, prefix="X", index_start=0, reverse_index=False, sparse_output=False
+    ):
+        self.prefix = prefix
+        self.index_start = index_start
+        self.reverse_index = reverse_index
+        self.sparse_output = sparse_output
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        import pandas as pd
+
+        n_rows, n_cols = X.shape
+        labels = [self.index_start + r for r in range(n_rows)]
+        if self.reverse_index:
+            labels.reverse()
+        names = [f"{self.prefix}{c}" for c in range(n_cols)]
+        if not self.sparse_output:
+            return pd.DataFrame(X.values, columns=names, index=labels)
+        csr = sparse.csr_matrix(X.values)
+        return pd.DataFrame.sparse.from_spmatrix(csr, columns=names, index=labels)
+
+
+@pytest.mark.parametrize("first_sparse_output", [True, False])
+def test_pandas_index_aligned_hstack(first_sparse_output):
+    """Index-aligned DataFrame outputs are concatenated into one DataFrame.
+
+    Covers both a dense and a sparse-backed first transformer output.
+    """
+    pd = pytest.importorskip("pandas")
+    X = pd.DataFrame({"X": [1, 0, 3], "Y": [4, 5, 0]}, columns=["X", "Y"])
+
+    # Both transformers relabel the rows 1..3, so their outputs are aligned.
+    first = DataFrameTransformer(
+        prefix="first", index_start=1, sparse_output=first_sparse_output
+    )
+    second = DataFrameTransformer(prefix="second", index_start=1)
+    ct = ColumnTransformer(
+        [
+            ("first", first, ["X"]),
+            ("second", second, ["X", "Y"]),
+        ]
+    )
+    ct.fit(X)
+
+    expected_columns = {
+        "first0": [1, 0, 3],
+        "second0": [1, 0, 3],
+        "second1": [4, 5, 0],
+    }
+    X_expected = pd.DataFrame(
+        expected_columns,
+        columns=["first0", "second0", "second1"],
+        index=[1, 2, 3],
+    )
+    if first_sparse_output:
+        # Only the first transformer emits sparse columns, so only "first0"
+        # is expected to carry a sparse dtype after concatenation.
+        sparse_int = pd.SparseDtype("int", 0)
+        X_expected["first0"] = X_expected["first0"].astype(sparse_int)
+
+    X_trans = ct.transform(X)
+    pd.testing.assert_frame_equal(X_trans, X_expected)
+
+
+@pytest.mark.parametrize("first_kwargs", [{"index_start": 2}, {"reverse_index": True}])
+@pytest.mark.parametrize("first_sparse_output", [True, False])
+def test_pandas_index_not_aligned_warns(first_kwargs, first_sparse_output):
+    """When transformers output DataFrames whose indexes differ (shifted or
+    reversed), ColumnTransformer warns and falls back to a numpy array."""
+    pd = pytest.importorskip("pandas")
+    X = pd.DataFrame({"X": [1, 0, 3], "Y": [4, 5, 0]}, columns=["X", "Y"])
+
+    first = DataFrameTransformer(
+        prefix="first", sparse_output=first_sparse_output, **first_kwargs
+    )
+    second = DataFrameTransformer(prefix="second")
+    ct = ColumnTransformer(
+        [
+            ("first", first, ["X"]),
+            ("second", second, ["X", "Y"]),
+        ]
+    )
+
+    expected_warning = "The DataFrame's indexes for each transformer do not match"
+    with pytest.warns(UserWarning, match=expected_warning):
+        X_trans = ct.fit_transform(X)
+
+    # The expected array keeps the original row order even when the first
+    # index is reversed: values are stacked positionally, not by label.
+    assert isinstance(X_trans, np.ndarray)
+    stacked = np.array([[1, 1, 4], [0, 0, 5], [3, 3, 0]])
+    assert_array_equal(X_trans, stacked)