diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 5645872a4d8bf..fadbc634a87c0 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -66,7 +66,7 @@ Changes impacting all modules or :class:`compose.ColumnTransformer` also define a `set_output`. For details, see `SLEP018 `__. - :pr:`23734` by `Thomas Fan`_. + :pr:`23734` and :pr:`24699` by `Thomas Fan`_. - |Enhancement| Finiteness checks (detection of NaN and infinite values) in all estimators are now significantly more efficient for float32 data by leveraging diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 3b0a95d7ed51c..bf73976cd883d 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -518,6 +518,23 @@ def get_feature_names_out(self, input_features=None): # No feature names return np.array([], dtype=object) + return self._add_prefix_for_feature_names_out( + transformer_with_feature_names_out + ) + + def _add_prefix_for_feature_names_out(self, transformer_with_feature_names_out): + """Add prefix for feature names out that includes the transformer names. + + Parameters + ---------- + transformer_with_feature_names_out : list of tuples of (str, array-like of str) + The tuple consists of the transformer's name and its feature names out. + + Returns + ------- + feature_names_out : ndarray of shape (n_features,), dtype=str + Transformed feature names. + """ if self.verbose_feature_names_out: # Prefix the feature names out with the transformers name names = list( @@ -825,7 +842,26 @@ def _hstack(self, Xs): config = _get_output_config("transform", self) if config["dense"] == "pandas" and all(hasattr(X, "iloc") for X in Xs): pd = check_pandas_support("transform") - return pd.concat(Xs, axis=1) + output = pd.concat(Xs, axis=1) + + # If all transformers define `get_feature_names_out`, then transform + # will adjust the column names to be consistent with + # verbose_feature_names_out. 
Here we prefix the feature names if + # verbose_feature_names_out=True. + + if not self.verbose_feature_names_out: + return output + + transformer_names = [ + t[0] for t in self._iter(fitted=True, replace_strings=True) + ] + feature_names_outs = [X.columns for X in Xs] + names_out = self._add_prefix_for_feature_names_out( + list(zip(transformer_names, feature_names_outs)) + ) + output.columns = names_out + return output + return np.hstack(Xs) def _sk_visual_block_(self): diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index bade093b89c1f..9b8bcdb80fabe 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -2053,3 +2053,80 @@ def test_column_transform_set_output_after_fitting(remainder): } for col, dtype in X_trans_df.dtypes.items(): assert dtype == expected_dtypes[col] + + +# PandasOutTransformer that does not define get_feature_names_out and always expects +# the input to be a DataFrame. +class PandasOutTransformer(BaseEstimator): + def __init__(self, offset=1.0): + self.offset = offset + + def fit(self, X, y=None): + pd = pytest.importorskip("pandas") + assert isinstance(X, pd.DataFrame) + return self + + def transform(self, X, y=None): + pd = pytest.importorskip("pandas") + assert isinstance(X, pd.DataFrame) + return X - self.offset + + def set_output(self, transform=None): + # This transformer will always output a DataFrame regardless of the + # configuration. 
+ return self + + +@pytest.mark.parametrize( + "trans_1, expected_verbose_names, expected_non_verbose_names", + [ + ( + PandasOutTransformer(offset=2.0), + ["trans_0__feat1", "trans_1__feat0"], + ["feat1", "feat0"], + ), + ( + "drop", + ["trans_0__feat1"], + ["feat1"], + ), + ( + "passthrough", + ["trans_0__feat1", "trans_1__feat0"], + ["feat1", "feat0"], + ), + ], +) +def test_transformers_with_pandas_out_but_not_feature_names_out( + trans_1, expected_verbose_names, expected_non_verbose_names +): + """Check that set_output(transform="pandas") is compatible with more transformers. + + Specifically, transformers that return a DataFrame but do not define + `get_feature_names_out`. + """ + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame({"feat0": [1.0, 2.0, 3.0], "feat1": [2.0, 3.0, 4.0]}) + ct = ColumnTransformer( + [ + ("trans_0", PandasOutTransformer(offset=3.0), ["feat1"]), + ("trans_1", trans_1, ["feat0"]), + ] + ) + X_trans_np = ct.fit_transform(X_df) + assert isinstance(X_trans_np, np.ndarray) + + # `ct` does not have `get_feature_names_out` because `PandasOutTransformer` does + # not define the method. + with pytest.raises(AttributeError, match="not provide get_feature_names_out"): + ct.get_feature_names_out() + + # The feature names are prefixed because verbose_feature_names_out=True is default + ct.set_output(transform="pandas") + X_trans_df0 = ct.fit_transform(X_df) + assert_array_equal(X_trans_df0.columns, expected_verbose_names) + + ct.set_params(verbose_feature_names_out=False) + X_trans_df1 = ct.fit_transform(X_df) + assert_array_equal(X_trans_df1.columns, expected_non_verbose_names) diff --git a/sklearn/utils/_set_output.py b/sklearn/utils/_set_output.py index 28c73575f31da..ef70f8efdde03 100644 --- a/sklearn/utils/_set_output.py +++ b/sklearn/utils/_set_output.py @@ -27,9 +27,11 @@ def _wrap_in_pandas_container( columns : callable, ndarray, or None The column names or a callable that returns the column names. 
The callable is useful if the column names require some computation. - If `None` and `data_to_wrap` is already a dataframe, then the column - names are not changed. If `None` and `data_to_wrap` is **not** a - dataframe, then columns are `range(n_features)`. + If `columns` is a callable that raises an error, `columns` will have + the same semantics as `None`. If `None` and `data_to_wrap` is already a + dataframe, then the column names are not changed. If `None` and + `data_to_wrap` is **not** a dataframe, then columns are + `range(n_features)`. index : array-like, default=None Index for data. @@ -43,7 +45,10 @@ def _wrap_in_pandas_container( raise ValueError("Pandas output does not support sparse data.") if callable(columns): - columns = columns() + try: + columns = columns() + except Exception: + columns = None pd = check_pandas_support("Setting output container to 'pandas'") diff --git a/sklearn/utils/tests/test_set_output.py b/sklearn/utils/tests/test_set_output.py index ca953170644f6..ae33b75f65c4c 100644 --- a/sklearn/utils/tests/test_set_output.py +++ b/sklearn/utils/tests/test_set_output.py @@ -220,3 +220,20 @@ def get_feature_names_out(self, input_features=None): est = BothMixinEstimator() assert est.custom_parameter == 123 assert hasattr(est, "set_output") + + +def test__wrap_in_pandas_container_column_errors(): + """If a callable `columns` errors, it has the same semantics as columns=None.""" + pd = pytest.importorskip("pandas") + + def get_columns(): + raise ValueError("No feature names defined") + + X_df = pd.DataFrame({"feat1": [1, 2, 3], "feat2": [3, 4, 5]}) + + X_wrapped = _wrap_in_pandas_container(X_df, columns=get_columns) + assert_array_equal(X_wrapped.columns, X_df.columns) + + X_np = np.asarray([[1, 3], [2, 4], [3, 5]]) + X_wrapped = _wrap_in_pandas_container(X_np, columns=get_columns) + assert_array_equal(X_wrapped.columns, range(X_np.shape[1]))