Skip to content

Commit 7dcb5ef

Browse files
ENH Improve set_output compatibility in ColumnTransformer (#24699)
Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
1 parent d47b1c8 commit 7dcb5ef

File tree

5 files changed

+141
-6
lines changed

5 files changed

+141
-6
lines changed

doc/whats_new/v1.2.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ Changes impacting all modules
6666
or :class:`compose.ColumnTransformer` also define a `set_output`.
6767
For details, see
6868
`SLEP018 <https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep018/proposal.html>`__.
69-
:pr:`23734` by `Thomas Fan`_.
69+
:pr:`23734` and :pr:`24699` by `Thomas Fan`_.
7070

7171
- |Enhancement| Finiteness checks (detection of NaN and infinite values) in all
7272
estimators are now significantly more efficient for float32 data by leveraging

sklearn/compose/_column_transformer.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -518,6 +518,23 @@ def get_feature_names_out(self, input_features=None):
518518
# No feature names
519519
return np.array([], dtype=object)
520520

521+
return self._add_prefix_for_feature_names_out(
522+
transformer_with_feature_names_out
523+
)
524+
525+
def _add_prefix_for_feature_names_out(self, transformer_with_feature_names_out):
526+
"""Add prefix for feature names out that includes the transformer names.
527+
528+
Parameters
529+
----------
530+
transformer_with_feature_names_out : list of tuples of (str, array-like of str)
531+
Each tuple consists of the transformer's name and its feature names out.
532+
533+
Returns
534+
-------
535+
feature_names_out : ndarray of shape (n_features,), dtype=str
536+
Transformed feature names.
537+
"""
521538
if self.verbose_feature_names_out:
522539
# Prefix the feature names out with the transformers name
523540
names = list(
@@ -825,7 +842,26 @@ def _hstack(self, Xs):
825842
config = _get_output_config("transform", self)
826843
if config["dense"] == "pandas" and all(hasattr(X, "iloc") for X in Xs):
827844
pd = check_pandas_support("transform")
828-
return pd.concat(Xs, axis=1)
845+
output = pd.concat(Xs, axis=1)
846+
847+
# If all transformers define `get_feature_names_out`, then transform
848+
# will adjust the column names to be consistent with
849+
# verbose_feature_names_out. Here we prefix the feature names if
850+
# verbose_feature_names_out=True.
851+
852+
if not self.verbose_feature_names_out:
853+
return output
854+
855+
transformer_names = [
856+
t[0] for t in self._iter(fitted=True, replace_strings=True)
857+
]
858+
feature_names_outs = [X.columns for X in Xs]
859+
names_out = self._add_prefix_for_feature_names_out(
860+
list(zip(transformer_names, feature_names_outs))
861+
)
862+
output.columns = names_out
863+
return output
864+
829865
return np.hstack(Xs)
830866

831867
def _sk_visual_block_(self):

sklearn/compose/tests/test_column_transformer.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2053,3 +2053,80 @@ def test_column_transform_set_output_after_fitting(remainder):
20532053
}
20542054
for col, dtype in X_trans_df.dtypes.items():
20552055
assert dtype == expected_dtypes[col]
2056+
2057+
2058+
# PandasOutTransformer that does not define get_feature_names_out and always expects
2059+
# the input to be a DataFrame.
2060+
class PandasOutTransformer(BaseEstimator):
    """Test helper that insists on DataFrame input and subtracts ``offset``.

    The class intentionally does not define ``get_feature_names_out``, and
    its ``set_output`` ignores the requested configuration.

    Parameters
    ----------
    offset : float, default=1.0
        Value subtracted from the input in ``transform``.
    """

    def __init__(self, offset=1.0):
        self.offset = offset

    @staticmethod
    def _require_dataframe(X):
        # pandas is looked up lazily so that the test is skipped, rather than
        # failing, when pandas is not installed.
        pandas = pytest.importorskip("pandas")
        assert isinstance(X, pandas.DataFrame)

    def fit(self, X, y=None):
        """Check that ``X`` is a DataFrame and return ``self``."""
        self._require_dataframe(X)
        return self

    def transform(self, X, y=None):
        """Check that ``X`` is a DataFrame and return ``X - offset``."""
        self._require_dataframe(X)
        shifted = X - self.offset
        return shifted

    def set_output(self, transform=None):
        """Accept the ``transform`` argument but leave the estimator unchanged."""
        # Nothing is configured here: `transform` above returns `X - offset`
        # whatever value was requested.
        return self
2078+
2079+
2080+
@pytest.mark.parametrize(
    "trans_1, expected_verbose_names, expected_non_verbose_names",
    [
        (
            PandasOutTransformer(offset=2.0),
            ["trans_0__feat1", "trans_1__feat0"],
            ["feat1", "feat0"],
        ),
        (
            "drop",
            ["trans_0__feat1"],
            ["feat1"],
        ),
        (
            "passthrough",
            ["trans_0__feat1", "trans_1__feat0"],
            ["feat1", "feat0"],
        ),
    ],
)
def test_transformers_with_pandas_out_but_not_feature_names_out(
    trans_1, expected_verbose_names, expected_non_verbose_names
):
    """Check pandas output with transformers lacking `get_feature_names_out`.

    Specifically, the case where a transformer returns a DataFrame but does
    not define `get_feature_names_out`.
    """
    pd = pytest.importorskip("pandas")

    frame = pd.DataFrame({"feat0": [1.0, 2.0, 3.0], "feat1": [2.0, 3.0, 4.0]})
    transformers = [
        ("trans_0", PandasOutTransformer(offset=3.0), ["feat1"]),
        ("trans_1", trans_1, ["feat0"]),
    ]
    ct = ColumnTransformer(transformers)

    # Before `set_output` is called, the result is expected to be an ndarray.
    default_out = ct.fit_transform(frame)
    assert isinstance(default_out, np.ndarray)

    # `PandasOutTransformer` has no `get_feature_names_out`, so asking `ct`
    # for feature names is expected to raise.
    with pytest.raises(AttributeError, match="not provide get_feature_names_out"):
        ct.get_feature_names_out()

    # With pandas output and the default verbose_feature_names_out, the
    # columns are expected to carry the transformer-name prefix.
    ct.set_output(transform="pandas")
    verbose_out = ct.fit_transform(frame)
    assert_array_equal(verbose_out.columns, expected_verbose_names)

    # Turning the verbose names off should leave the bare column names.
    ct.set_params(verbose_feature_names_out=False)
    plain_out = ct.fit_transform(frame)
    assert_array_equal(plain_out.columns, expected_non_verbose_names)

sklearn/utils/_set_output.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,11 @@ def _wrap_in_pandas_container(
2727
columns : callable, ndarray, or None
2828
The column names or a callable that returns the column names. The
2929
callable is useful if the column names require some computation.
30-
If `None` and `data_to_wrap` is already a dataframe, then the column
31-
names are not changed. If `None` and `data_to_wrap` is **not** a
32-
dataframe, then columns are `range(n_features)`.
30+
If `columns` is a callable that raises an error, `columns` will have
31+
the same semantics as `None`. If `None` and `data_to_wrap` is already a
32+
dataframe, then the column names are not changed. If `None` and
33+
`data_to_wrap` is **not** a dataframe, then columns are
34+
`range(n_features)`.
3335
3436
index : array-like, default=None
3537
Index for data.
@@ -43,7 +45,10 @@ def _wrap_in_pandas_container(
4345
raise ValueError("Pandas output does not support sparse data.")
4446

4547
if callable(columns):
46-
columns = columns()
48+
try:
49+
columns = columns()
50+
except Exception:
51+
columns = None
4752

4853
pd = check_pandas_support("Setting output container to 'pandas'")
4954

sklearn/utils/tests/test_set_output.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,3 +220,20 @@ def get_feature_names_out(self, input_features=None):
220220
est = BothMixinEstimator()
221221
assert est.custom_parameter == 123
222222
assert hasattr(est, "set_output")
223+
224+
225+
def test__wrap_in_pandas_container_column_errors():
    """A `columns` callable that raises is treated the same as `columns=None`."""
    pd = pytest.importorskip("pandas")

    def get_columns():
        raise ValueError("No feature names defined")

    # DataFrame input: the wrapped result should keep the frame's own columns.
    frame = pd.DataFrame({"feat1": [1, 2, 3], "feat2": [3, 4, 5]})
    wrapped_frame = _wrap_in_pandas_container(frame, columns=get_columns)
    assert_array_equal(wrapped_frame.columns, frame.columns)

    # ndarray input: the wrapped result should get positional column labels.
    array = np.asarray([[1, 3], [2, 4], [3, 5]])
    wrapped_array = _wrap_in_pandas_container(array, columns=get_columns)
    assert_array_equal(wrapped_array.columns, range(array.shape[1]))

0 commit comments

Comments
 (0)