Skip to content
4 changes: 4 additions & 0 deletions doc/whats_new/v1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,10 @@ Changelog
:mod:`sklearn.compose`
......................

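- |Enhancement| :meth:`compose.ColumnTransformer.transform` returns a pandas
DataFrame when every transformer outputs a DataFrame from :meth:`transform`
and all of those DataFrames share the same index; otherwise a numpy array or
a sparse matrix is returned. :pr:`20110` by `Thomas Fan`_.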

- |Enhancement| :class:`compose.ColumnTransformer` now records the output
of each transformer in `output_indices_`. :pr:`18393` by
:user:`Luca Bittarello <lbittarello>`.
Expand Down
23 changes: 23 additions & 0 deletions sklearn/compose/_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# Joris Van den Bossche
# License: BSD
from itertools import chain
import warnings

import numpy as np
from scipy import sparse
Expand Down Expand Up @@ -653,10 +654,32 @@ def _hstack(self, Xs):
This allows subclasses to control the stacking behavior, while reusing
everything else from ColumnTransformer.

If Xs are all DataFrames and their indexes are the same then a
DataFrame is returned.

Parameters
----------
Xs : list of {array-like, sparse matrix, dataframe}
"""
# Check that all output from transformers are pandas DataFrames
if all(hasattr(X, "iloc") for X in Xs):
if any(not Xs[0].index.equals(X.index) for X in Xs[1:]):
warnings.warn(
"The DataFrame's indexes for each transformer do not "
"match. A numpy array or a sparse matrix will be "
"returned."
)
return self._hstack_np(Xs)

import pandas as pd

print([X.dtypes for X in Xs])
return pd.concat(Xs, axis=1)

return self._hstack_np(Xs)

def _hstack_np(self, Xs):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be a purist, sparse is not really np but coming from scipy :)

I am wondering if we should have 2 functions here. I would prefer to have an if/else statement in the _hstack.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the _safe_indexing we indeed have 3 functions: _array_indexing, _pandas_indexing, and _list_indexing. We could indeed have something similar then.

"""Stack Xs horizontally into a numpy array or sparse matrix."""
if self.sparse_output_:
try:
# since all columns should be numeric before stacking them
Expand Down
132 changes: 126 additions & 6 deletions sklearn/compose/tests/test_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_allclose_dense_sparse
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils import _safe_indexing

from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.compose import (
ColumnTransformer,
make_column_transformer,
Expand Down Expand Up @@ -374,9 +376,18 @@ def test_column_transformer_output_indices_df():
"trans2": slice(1, 2),
"remainder": slice(0, 0),
}
assert_array_equal(X_trans[:, [0]], X_trans[:, ct.output_indices_["trans1"]])
assert_array_equal(X_trans[:, [1]], X_trans[:, ct.output_indices_["trans2"]])
assert_array_equal(X_trans[:, []], X_trans[:, ct.output_indices_["remainder"]])
assert_array_equal(
_safe_indexing(X_trans, [0]),
_safe_indexing(X_trans, ct.output_indices_["trans1"]),
)
assert_array_equal(
_safe_indexing(X_trans, [1]),
_safe_indexing(X_trans, ct.output_indices_["trans2"]),
)
assert_array_equal(
_safe_indexing(X_trans, []),
_safe_indexing(X_trans, ct.output_indices_["remainder"]),
)

ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", Trans(), [1])])
X_trans = ct.fit_transform(X_df)
Expand All @@ -385,9 +396,18 @@ def test_column_transformer_output_indices_df():
"trans2": slice(1, 2),
"remainder": slice(0, 0),
}
assert_array_equal(X_trans[:, [0]], X_trans[:, ct.output_indices_["trans1"]])
assert_array_equal(X_trans[:, [1]], X_trans[:, ct.output_indices_["trans2"]])
assert_array_equal(X_trans[:, []], X_trans[:, ct.output_indices_["remainder"]])
assert_array_equal(
_safe_indexing(X_trans, [0]),
_safe_indexing(X_trans, ct.output_indices_["trans1"]),
)
assert_array_equal(
_safe_indexing(X_trans, [1]),
_safe_indexing(X_trans, ct.output_indices_["trans2"]),
)
assert_array_equal(
_safe_indexing(X_trans, []),
_safe_indexing(X_trans, ct.output_indices_["remainder"]),
)


def test_column_transformer_sparse_array():
Expand Down Expand Up @@ -1595,3 +1615,103 @@ def test_get_feature_names_empty_selection(selector):
ct = ColumnTransformer([("ohe", OneHotEncoder(drop="first"), selector)])
ct.fit([[1, 2], [3, 4]])
assert ct.get_feature_names() == []


class DataFrameTransformer(TransformerMixin, BaseEstimator):
    """Test helper that re-emits its input as a pandas DataFrame.

    Parameters
    ----------
    prefix : str, default="X"
        Output columns are named ``f"{prefix}{i}"`` where ``i`` is the
        position of the column in the input.

    index_start : int, default=0
        Label given to the first row of the output index; following rows
        are numbered consecutively from it.

    reverse_index : bool, default=False
        If True, the generated row labels are assigned in reverse order.

    sparse_output : bool, default=False
        If True, the output DataFrame is built from a CSR matrix through
        ``pd.DataFrame.sparse.from_spmatrix``.
    """

    def __init__(
        self, prefix="X", index_start=0, reverse_index=False, sparse_output=False
    ):
        self.prefix = prefix
        self.index_start = index_start
        self.reverse_index = reverse_index
        self.sparse_output = sparse_output

    def fit(self, X, y=None):
        """Do nothing and return the estimator unchanged."""
        return self

    def transform(self, X):
        """Return the values of `X` in a DataFrame with new row/column labels.

        `X` must expose `.shape` and `.values`.
        """
        import pandas as pd

        n_rows, n_cols = X.shape
        column_names = [f"{self.prefix}{col}" for col in range(n_cols)]
        row_labels = [row + self.index_start for row in range(n_rows)]
        if self.reverse_index:
            row_labels.reverse()

        values = X.values
        if not self.sparse_output:
            return pd.DataFrame(values, columns=column_names, index=row_labels)

        return pd.DataFrame.sparse.from_spmatrix(
            sparse.csr_matrix(values), columns=column_names, index=row_labels
        )


@pytest.mark.parametrize("first_sparse_output", [True, False])
def test_pandas_index_aligned_hstack(first_sparse_output):
    """Check that ColumnTransformer outputs a DataFrame when every
    transformer outputs a DataFrame and all of them share the same index."""
    pd = pytest.importorskip("pandas")
    X = pd.DataFrame({"X": [1, 0, 3], "Y": [4, 5, 0]}, columns=["X", "Y"])

    # Both transformers label their rows 1, 2, 3 so the indexes are aligned.
    first_trans = DataFrameTransformer(
        prefix="first",
        index_start=1,
        sparse_output=first_sparse_output,
    )
    second_trans = DataFrameTransformer(prefix="second", index_start=1)
    ct = ColumnTransformer(
        [
            ("first", first_trans, ["X"]),
            ("second", second_trans, ["X", "Y"]),
        ]
    ).fit(X)

    expected_columns = {
        "first0": [1, 0, 3],
        "second0": [1, 0, 3],
        "second1": [4, 5, 0],
    }
    X_expected = pd.DataFrame(
        expected_columns,
        columns=["first0", "second0", "second1"],
        index=[1, 2, 3],
    )
    if first_sparse_output:
        # The first transformer emitted a sparse column; mirror its dtype.
        sparse_int = pd.SparseDtype("int", 0)
        X_expected["first0"] = X_expected["first0"].astype(sparse_int)

    X_trans = ct.transform(X)
    pd.testing.assert_frame_equal(X_trans, X_expected)


@pytest.mark.parametrize("first_kwargs", [{"index_start": 2}, {"reverse_index": True}])
@pytest.mark.parametrize("first_sparse_output", [True, False])
def test_pandas_index_not_aligned_warns(first_kwargs, first_sparse_output):
    """Check that ColumnTransformer warns and outputs a numpy array when the
    transformers output DataFrames whose indexes do not match."""
    pd = pytest.importorskip("pandas")
    X = pd.DataFrame({"X": [1, 0, 3], "Y": [4, 5, 0]}, columns=["X", "Y"])

    # `first_kwargs` shifts or reverses the first transformer's index so that
    # it differs from the second transformer's default index.
    misaligned_trans = DataFrameTransformer(
        prefix="first", sparse_output=first_sparse_output, **first_kwargs
    )
    default_trans = DataFrameTransformer(prefix="second")
    ct = ColumnTransformer(
        [
            ("first", misaligned_trans, ["X"]),
            ("second", default_trans, ["X", "Y"]),
        ]
    )

    expected_msg = "The DataFrame's indexes for each transformer do not match"
    with pytest.warns(UserWarning, match=expected_msg):
        X_trans = ct.fit_transform(X)

    assert isinstance(X_trans, np.ndarray)
    assert_array_equal(X_trans, np.array([[1, 1, 4], [0, 0, 5], [3, 3, 0]]))