Skip to content

ENH Improve set_output compatibility in ColumnTransformer #24699

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/whats_new/v1.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ Changes impacting all modules
or :class:`compose.ColumnTransformer` also define a `set_output`.
For details, see
`SLEP018 <https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep018/proposal.html>`__.
:pr:`23734` by `Thomas Fan`_.
:pr:`23734` and :pr:`24699` by `Thomas Fan`_.

- |Enhancement| Finiteness checks (detection of NaN and infinite values) in all
estimators are now significantly more efficient for float32 data by leveraging
Expand Down
38 changes: 37 additions & 1 deletion sklearn/compose/_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,23 @@ def get_feature_names_out(self, input_features=None):
# No feature names
return np.array([], dtype=object)

return self._add_prefix_for_feature_names_out(
transformer_with_feature_names_out
)

def _add_prefix_for_feature_names_out(self, transformer_with_feature_names_out):
"""Add prefix for feature names out that includes the transformer names.

Parameters
----------
transformer_with_feature_names_out : list of tuples of (str, array-like of str)
Each tuple consists of the transformer's name and its feature names out.

Returns
-------
feature_names_out : ndarray of shape (n_features,), dtype=str
Transformed feature names.
"""
if self.verbose_feature_names_out:
# Prefix the feature names out with the transformers name
names = list(
Expand Down Expand Up @@ -825,7 +842,26 @@ def _hstack(self, Xs):
config = _get_output_config("transform", self)
if config["dense"] == "pandas" and all(hasattr(X, "iloc") for X in Xs):
pd = check_pandas_support("transform")
return pd.concat(Xs, axis=1)
output = pd.concat(Xs, axis=1)

# If all transformers define `get_feature_names_out`, then transform
# will adjust the column names to be consistent with
# verbose_feature_names_out. Here we prefix the feature names if
# verbose_feature_names_out=True.

if not self.verbose_feature_names_out:
return output

transformer_names = [
t[0] for t in self._iter(fitted=True, replace_strings=True)
]
feature_names_outs = [X.columns for X in Xs]
names_out = self._add_prefix_for_feature_names_out(
list(zip(transformer_names, feature_names_outs))
)
output.columns = names_out
return output

return np.hstack(Xs)

def _sk_visual_block_(self):
Expand Down
77 changes: 77 additions & 0 deletions sklearn/compose/tests/test_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2053,3 +2053,80 @@ def test_column_transform_set_output_after_fitting(remainder):
}
for col, dtype in X_trans_df.dtypes.items():
assert dtype == expected_dtypes[col]


# PandasOutTransformer that does not define get_feature_names_out and always expects
# the input to be a DataFrame.
class PandasOutTransformer(BaseEstimator):
    """Test helper that only accepts DataFrames and returns a DataFrame.

    The class intentionally does not define `get_feature_names_out`.

    Parameters
    ----------
    offset : float, default=1.0
        Value subtracted from the input in `transform`.
    """

    def __init__(self, offset=1.0):
        self.offset = offset

    @staticmethod
    def _assert_dataframe(X):
        # Skip the calling test when pandas is missing; otherwise require
        # that the input is a DataFrame.
        pandas = pytest.importorskip("pandas")
        assert isinstance(X, pandas.DataFrame)

    def fit(self, X, y=None):
        """Check that `X` is a DataFrame and return the estimator itself."""
        self._assert_dataframe(X)
        return self

    def transform(self, X, y=None):
        """Check that `X` is a DataFrame and return `X` shifted by `offset`."""
        self._assert_dataframe(X)
        shifted = X - self.offset
        return shifted

    def set_output(self, transform=None):
        """Ignore the requested container and return the estimator itself.

        This transformer always outputs a DataFrame regardless of the
        configuration.
        """
        return self


@pytest.mark.parametrize(
    "trans_1, expected_verbose_names, expected_non_verbose_names",
    [
        (
            PandasOutTransformer(offset=2.0),
            ["trans_0__feat1", "trans_1__feat0"],
            ["feat1", "feat0"],
        ),
        ("drop", ["trans_0__feat1"], ["feat1"]),
        (
            "passthrough",
            ["trans_0__feat1", "trans_1__feat0"],
            ["feat1", "feat0"],
        ),
    ],
)
def test_transformers_with_pandas_out_but_not_feature_names_out(
    trans_1, expected_verbose_names, expected_non_verbose_names
):
    """Check `set_output(transform="pandas")` with DataFrame-only transformers.

    Specifically, the case where a transformer returns a DataFrame but does
    not define `get_feature_names_out`.
    """
    pd = pytest.importorskip("pandas")

    frame = pd.DataFrame({"feat0": [1.0, 2.0, 3.0], "feat1": [2.0, 3.0, 4.0]})
    steps = [
        ("trans_0", PandasOutTransformer(offset=3.0), ["feat1"]),
        ("trans_1", trans_1, ["feat0"]),
    ]
    ct = ColumnTransformer(steps)

    # Before `set_output` is called, the result is expected to be an ndarray.
    default_out = ct.fit_transform(frame)
    assert isinstance(default_out, np.ndarray)

    # Asking `ct` for feature names must fail because `PandasOutTransformer`
    # does not define `get_feature_names_out`.
    with pytest.raises(AttributeError, match="not provide get_feature_names_out"):
        ct.get_feature_names_out()

    # With the default verbose_feature_names_out=True, the column names are
    # prefixed with the transformer names.
    ct.set_output(transform="pandas")
    verbose_out = ct.fit_transform(frame)
    assert_array_equal(verbose_out.columns, expected_verbose_names)

    # Turning the prefix off leaves the transformers' own column names.
    ct.set_params(verbose_feature_names_out=False)
    plain_out = ct.fit_transform(frame)
    assert_array_equal(plain_out.columns, expected_non_verbose_names)
13 changes: 9 additions & 4 deletions sklearn/utils/_set_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,11 @@ def _wrap_in_pandas_container(
columns : callable, ndarray, or None
The column names or a callable that returns the column names. The
callable is useful if the column names require some computation.
If `None` and `data_to_wrap` is already a dataframe, then the column
names are not changed. If `None` and `data_to_wrap` is **not** a
dataframe, then columns are `range(n_features)`.
If `columns` is a callable that raises an error, `columns` will have
the same semantics as `None`. If `None` and `data_to_wrap` is already a
dataframe, then the column names are not changed. If `None` and
`data_to_wrap` is **not** a dataframe, then columns are
`range(n_features)`.

index : array-like, default=None
Index for data.
Expand All @@ -43,7 +45,10 @@ def _wrap_in_pandas_container(
raise ValueError("Pandas output does not support sparse data.")

if callable(columns):
columns = columns()
try:
columns = columns()
except Exception:
columns = None

pd = check_pandas_support("Setting output container to 'pandas'")

Expand Down
17 changes: 17 additions & 0 deletions sklearn/utils/tests/test_set_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,3 +220,20 @@ def get_feature_names_out(self, input_features=None):
est = BothMixinEstimator()
assert est.custom_parameter == 123
assert hasattr(est, "set_output")


def test__wrap_in_pandas_container_column_errors():
"""If a callable `columns` errors, it has the same semantics as columns=None."""
pd = pytest.importorskip("pandas")

def get_columns():
raise ValueError("No feature names defined")

X_df = pd.DataFrame({"feat1": [1, 2, 3], "feat2": [3, 4, 5]})

X_wrapped = _wrap_in_pandas_container(X_df, columns=get_columns)
assert_array_equal(X_wrapped.columns, X_df.columns)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since the documentation mentions that raising an error is equivalent to `None`, I think we should also test the case where the callable raises an error and we pass something other than a dataframe, to check that the returned columns are `range(X.shape[1])`.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added test in 0fc62e6 (#24699) and adjusted it slightly in 2fb935f (#24699)


X_np = np.asarray([[1, 3], [2, 4], [3, 5]])
X_wrapped = _wrap_in_pandas_container(X_np, columns=get_columns)
assert_array_equal(X_wrapped.columns, range(X_np.shape[1]))