Skip to content

ENH ColumnTransformer.transform returns dataframes when transformers output them #20110

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
4 changes: 4 additions & 0 deletions doc/whats_new/v1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,10 @@ Changelog
:mod:`sklearn.compose`
......................

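- |Enhancement| :meth:`compose.ColumnTransformer.transform` returns a pandas
DataFrame when every transformer outputs a DataFrame from :meth:`transform`
and all of those DataFrames share the same index. If the indexes differ, a
warning is issued and a numpy array or sparse matrix is returned instead.
:pr:`20110` by `Thomas Fan`_.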

- |Enhancement| :class:`compose.ColumnTransformer` now records the output
of each transformer in `output_indices_`. :pr:`18393` by
:user:`Luca Bittarello <lbittarello>`.
Expand Down
23 changes: 23 additions & 0 deletions sklearn/compose/_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# Joris Van den Bossche
# License: BSD
from itertools import chain
import warnings

import numpy as np
from scipy import sparse
Expand Down Expand Up @@ -653,10 +654,32 @@ def _hstack(self, Xs):
This allows subclasses to control the stacking behavior, while reusing
everything else from ColumnTransformer.

If Xs are all DataFrames and their indexes are the same then a
DataFrame is returned.

Parameters
----------
Xs : list of {array-like, sparse matrix, dataframe}
"""
# Check that all output from transformers are pandas DataFrames
if all(hasattr(X, "iloc") for X in Xs):
if any(not Xs[0].index.equals(X.index) for X in Xs[1:]):
warnings.warn(
"The DataFrame's indexes for each transformer do not "
"match. A numpy array or a sparse matrix will be "
"returned."
)
return self._hstack_np(Xs)

import pandas as pd

print([X.dtypes for X in Xs])
return pd.concat(Xs, axis=1)

return self._hstack_np(Xs)

def _hstack_np(self, Xs):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be a purist, sparse is not really np but coming from scipy :)

I am wondering if we should have 2 functions here. I would prefer to have an if/else statement in the _hstack.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the _safe_indexing we indeed have 3 functions: _array_indexing, _pandas_indexing, and _list_indexing. We could indeed have something similar then.

"""Stack Xs horizontally into a numpy array or sparse matrix."""
if self.sparse_output_:
try:
# since all columns should be numeric before stacking them
Expand Down
132 changes: 126 additions & 6 deletions sklearn/compose/tests/test_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_allclose_dense_sparse
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils import _safe_indexing

from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.compose import (
ColumnTransformer,
make_column_transformer,
Expand Down Expand Up @@ -374,9 +376,18 @@ def test_column_transformer_output_indices_df():
"trans2": slice(1, 2),
"remainder": slice(0, 0),
}
assert_array_equal(X_trans[:, [0]], X_trans[:, ct.output_indices_["trans1"]])
assert_array_equal(X_trans[:, [1]], X_trans[:, ct.output_indices_["trans2"]])
assert_array_equal(X_trans[:, []], X_trans[:, ct.output_indices_["remainder"]])
assert_array_equal(
_safe_indexing(X_trans, [0]),
_safe_indexing(X_trans, ct.output_indices_["trans1"]),
)
assert_array_equal(
_safe_indexing(X_trans, [1]),
_safe_indexing(X_trans, ct.output_indices_["trans2"]),
)
assert_array_equal(
_safe_indexing(X_trans, []),
_safe_indexing(X_trans, ct.output_indices_["remainder"]),
)

ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", Trans(), [1])])
X_trans = ct.fit_transform(X_df)
Expand All @@ -385,9 +396,18 @@ def test_column_transformer_output_indices_df():
"trans2": slice(1, 2),
"remainder": slice(0, 0),
}
assert_array_equal(X_trans[:, [0]], X_trans[:, ct.output_indices_["trans1"]])
assert_array_equal(X_trans[:, [1]], X_trans[:, ct.output_indices_["trans2"]])
assert_array_equal(X_trans[:, []], X_trans[:, ct.output_indices_["remainder"]])
assert_array_equal(
_safe_indexing(X_trans, [0]),
_safe_indexing(X_trans, ct.output_indices_["trans1"]),
)
assert_array_equal(
_safe_indexing(X_trans, [1]),
_safe_indexing(X_trans, ct.output_indices_["trans2"]),
)
assert_array_equal(
_safe_indexing(X_trans, []),
_safe_indexing(X_trans, ct.output_indices_["remainder"]),
)


def test_column_transformer_sparse_array():
Expand Down Expand Up @@ -1595,3 +1615,103 @@ def test_get_feature_names_empty_selection(selector):
ct = ColumnTransformer([("ohe", OneHotEncoder(drop="first"), selector)])
ct.fit([[1, 2], [3, 4]])
assert ct.get_feature_names() == []


class DataFrameTransformer(TransformerMixin, BaseEstimator):
    """Test helper whose ``transform`` re-wraps its input in a new DataFrame.

    The returned frame holds ``X.values`` unchanged; only the column names
    and the index labels are regenerated.

    Parameters
    ----------
    prefix : str, default="X"
        Output columns are named ``f"{prefix}{i}"`` for each input column
        position ``i``.

    index_start : int, default=0
        Value added to each row position to build the output index labels.

    reverse_index : bool, default=False
        When True, the generated index labels are assigned in reverse order.

    sparse_output : bool, default=False
        When True, the frame is built with
        ``pd.DataFrame.sparse.from_spmatrix`` from a CSR copy of the values.
    """

    def __init__(
        self, prefix="X", index_start=0, reverse_index=False, sparse_output=False
    ):
        self.prefix = prefix
        self.index_start = index_start
        self.reverse_index = reverse_index
        self.sparse_output = sparse_output

    def fit(self, X, y=None):
        """Do nothing and return ``self``."""
        return self

    def transform(self, X):
        """Return the values of ``X`` in a DataFrame with new labels."""
        # pandas is imported lazily, inside the method, rather than at
        # module level.
        import pandas as pd

        n_samples, n_features = X.shape

        columns = [f"{self.prefix}{col}" for col in range(n_features)]
        index = [row + self.index_start for row in range(n_samples)]
        if self.reverse_index:
            index.reverse()

        if not self.sparse_output:
            return pd.DataFrame(X.values, columns=columns, index=index)

        return pd.DataFrame.sparse.from_spmatrix(
            sparse.csr_matrix(X.values), columns=columns, index=index
        )


@pytest.mark.parametrize("first_sparse_output", [True, False])
def test_pandas_index_aligned_hstack(first_sparse_output):
    """Check that DataFrame outputs sharing one index yield a DataFrame.

    Both transformers emit frames indexed ``[1, 2, 3]``, so the result of
    ``ColumnTransformer.transform`` is compared against a DataFrame with
    that index. The first transformer may emit a sparse-backed frame.
    """
    pd = pytest.importorskip("pandas")
    X = pd.DataFrame({"X": [1, 0, 3], "Y": [4, 5, 0]}, columns=["X", "Y"])

    first = DataFrameTransformer(
        prefix="first",
        index_start=1,
        sparse_output=first_sparse_output,
    )
    second = DataFrameTransformer(prefix="second", index_start=1)

    ct = ColumnTransformer(
        [
            ("first", first, ["X"]),
            ("second", second, ["X", "Y"]),
        ]
    )
    ct.fit(X)

    expected_data = {
        "first0": [1, 0, 3],
        "second0": [1, 0, 3],
        "second1": [4, 5, 0],
    }
    X_expected = pd.DataFrame(
        expected_data,
        columns=["first0", "second0", "second1"],
        index=[1, 2, 3],
    )
    if first_sparse_output:
        # The sparse transformer's column is expected to keep a sparse dtype
        # in the combined frame.
        sparse_int = pd.SparseDtype("int", 0)
        X_expected["first0"] = X_expected["first0"].astype(sparse_int)

    pd.testing.assert_frame_equal(ct.transform(X), X_expected)


@pytest.mark.parametrize("first_kwargs", [{"index_start": 2}, {"reverse_index": True}])
@pytest.mark.parametrize("first_sparse_output", [True, False])
def test_pandas_index_not_aligned_warns(first_kwargs, first_sparse_output):
    """Check the fallback when DataFrame outputs have differing indexes.

    The first transformer's index is either shifted or reversed relative to
    the second one's. ``fit_transform`` must then emit a ``UserWarning`` and
    return a numpy array whose rows follow the original row order.
    """
    pd = pytest.importorskip("pandas")
    X = pd.DataFrame({"X": [1, 0, 3], "Y": [4, 5, 0]}, columns=["X", "Y"])

    misaligned = DataFrameTransformer(
        prefix="first", sparse_output=first_sparse_output, **first_kwargs
    )
    reference = DataFrameTransformer(prefix="second")
    ct = ColumnTransformer(
        [
            ("first", misaligned, ["X"]),
            ("second", reference, ["X", "Y"]),
        ]
    )

    with pytest.warns(
        UserWarning,
        match="The DataFrame's indexes for each transformer do not match",
    ):
        X_trans = ct.fit_transform(X)

    assert isinstance(X_trans, np.ndarray)
    assert_array_equal(X_trans, np.array([[1, 1, 4], [0, 0, 5], [3, 3, 0]]))