ENH Improve set_output compatibility in ColumnTransformer (#24699)

thomasjpfan · glemaitre · web-flow · commit 7dcb5efb47da · 2022-11-17T19:03:34.000+01:00
Co-authored-by: Guillaume Lemaitre &lt;g.lemaitre58@gmail.com&gt;
diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst
@@ -66,7 +66,7 @@ Changes impacting all modules
   or :class:`compose.ColumnTransformer` also define a `set_output`.
   For details, see
   `SLEP018 <https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep018/proposal.html>`__.
-  :pr:`23734` by `Thomas Fan`_.
+  :pr:`23734` and :pr:`24699` by `Thomas Fan`_.
 
 - |Enhancement| Finiteness checks (detection of NaN and infinite values) in all
   estimators are now significantly more efficient for float32 data by leveraging
diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
@@ -518,6 +518,23 @@ def get_feature_names_out(self, input_features=None):
             # No feature names
             return np.array([], dtype=object)
 
+        return self._add_prefix_for_feature_names_out(
+            transformer_with_feature_names_out
+        )
+
+    def _add_prefix_for_feature_names_out(self, transformer_with_feature_names_out):
+        """Add prefix for feature names out that includes the transformer names.
+
+        Parameters
+        ----------
+        transformer_with_feature_names_out : list of tuples of (str, array-like of str)
+            The tuple consistent of the transformer's name and its feature names out.
+
+        Returns
+        -------
+        feature_names_out : ndarray of shape (n_features,), dtype=str
+            Transformed feature names.
+        """
         if self.verbose_feature_names_out:
             # Prefix the feature names out with the transformers name
             names = list(
@@ -825,7 +842,26 @@ def _hstack(self, Xs):
             config = _get_output_config("transform", self)
             if config["dense"] == "pandas" and all(hasattr(X, "iloc") for X in Xs):
                 pd = check_pandas_support("transform")
-                return pd.concat(Xs, axis=1)
+                output = pd.concat(Xs, axis=1)
+
+                # If all transformers define `get_feature_names_out`, then transform
+                # will adjust the column names to be consistent with
+                # verbose_feature_names_out. Here we prefix the feature names if
+                # verbose_feature_names_out=True.
+
+                if not self.verbose_feature_names_out:
+                    return output
+
+                transformer_names = [
+                    t[0] for t in self._iter(fitted=True, replace_strings=True)
+                ]
+                feature_names_outs = [X.columns for X in Xs]
+                names_out = self._add_prefix_for_feature_names_out(
+                    list(zip(transformer_names, feature_names_outs))
+                )
+                output.columns = names_out
+                return output
+
             return np.hstack(Xs)
 
     def _sk_visual_block_(self):
diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py
@@ -2053,3 +2053,80 @@ def test_column_transform_set_output_after_fitting(remainder):
     }
     for col, dtype in X_trans_df.dtypes.items():
         assert dtype == expected_dtypes[col]
+
+
+# PandasOutTransformer that does not define get_feature_names_out and always expects
+# the input to be a DataFrame.
+class PandasOutTransformer(BaseEstimator):
+    def __init__(self, offset=1.0):
+        self.offset = offset
+
+    def fit(self, X, y=None):
+        pd = pytest.importorskip("pandas")
+        assert isinstance(X, pd.DataFrame)
+        return self
+
+    def transform(self, X, y=None):
+        pd = pytest.importorskip("pandas")
+        assert isinstance(X, pd.DataFrame)
+        return X - self.offset
+
+    def set_output(self, transform=None):
+        # This transformer will always output a DataFrame regardless of the
+        # configuration.
+        return self
+
+
+@pytest.mark.parametrize(
+    "trans_1, expected_verbose_names, expected_non_verbose_names",
+    [
+        (
+            PandasOutTransformer(offset=2.0),
+            ["trans_0__feat1", "trans_1__feat0"],
+            ["feat1", "feat0"],
+        ),
+        (
+            "drop",
+            ["trans_0__feat1"],
+            ["feat1"],
+        ),
+        (
+            "passthrough",
+            ["trans_0__feat1", "trans_1__feat0"],
+            ["feat1", "feat0"],
+        ),
+    ],
+)
+def test_transformers_with_pandas_out_but_not_feature_names_out(
+    trans_1, expected_verbose_names, expected_non_verbose_names
+):
+    """Check that set_config(transform="pandas") is compatible with more transformers.
+
+    Specifically, if transformers returns a DataFrame, but does not define
+    `get_feature_names_out`.
+    """
+    pd = pytest.importorskip("pandas")
+
+    X_df = pd.DataFrame({"feat0": [1.0, 2.0, 3.0], "feat1": [2.0, 3.0, 4.0]})
+    ct = ColumnTransformer(
+        [
+            ("trans_0", PandasOutTransformer(offset=3.0), ["feat1"]),
+            ("trans_1", trans_1, ["feat0"]),
+        ]
+    )
+    X_trans_np = ct.fit_transform(X_df)
+    assert isinstance(X_trans_np, np.ndarray)
+
+    # `ct` does not have `get_feature_names_out` because `PandasOutTransformer` does
+    # not define the method.
+    with pytest.raises(AttributeError, match="not provide get_feature_names_out"):
+        ct.get_feature_names_out()
+
+    # The feature names are prefixed because verbose_feature_names_out=True is default
+    ct.set_output(transform="pandas")
+    X_trans_df0 = ct.fit_transform(X_df)
+    assert_array_equal(X_trans_df0.columns, expected_verbose_names)
+
+    ct.set_params(verbose_feature_names_out=False)
+    X_trans_df1 = ct.fit_transform(X_df)
+    assert_array_equal(X_trans_df1.columns, expected_non_verbose_names)
diff --git a/sklearn/utils/_set_output.py b/sklearn/utils/_set_output.py
@@ -27,9 +27,11 @@ def _wrap_in_pandas_container(
     columns : callable, ndarray, or None
         The column names or a callable that returns the column names. The
         callable is useful if the column names require some computation.
-        If `None` and `data_to_wrap` is already a dataframe, then the column
-        names are not changed. If `None` and `data_to_wrap` is **not** a
-        dataframe, then columns are `range(n_features)`.
+        If `columns` is a callable that raises an error, `columns` will have
+        the same semantics as `None`. If `None` and `data_to_wrap` is already a
+        dataframe, then the column names are not changed. If `None` and
+        `data_to_wrap` is **not** a dataframe, then columns are
+        `range(n_features)`.
 
     index : array-like, default=None
         Index for data.
@@ -43,7 +45,10 @@ def _wrap_in_pandas_container(
         raise ValueError("Pandas output does not support sparse data.")
 
     if callable(columns):
-        columns = columns()
+        try:
+            columns = columns()
+        except Exception:
+            columns = None
 
     pd = check_pandas_support("Setting output container to 'pandas'")
 
diff --git a/sklearn/utils/tests/test_set_output.py b/sklearn/utils/tests/test_set_output.py
@@ -220,3 +220,20 @@ def get_feature_names_out(self, input_features=None):
     est = BothMixinEstimator()
     assert est.custom_parameter == 123
     assert hasattr(est, "set_output")
+
+
+def test__wrap_in_pandas_container_column_errors():
+    """If a callable `columns` errors, it has the same semantics as columns=None."""
+    pd = pytest.importorskip("pandas")
+
+    def get_columns():
+        raise ValueError("No feature names defined")
+
+    X_df = pd.DataFrame({"feat1": [1, 2, 3], "feat2": [3, 4, 5]})
+
+    X_wrapped = _wrap_in_pandas_container(X_df, columns=get_columns)
+    assert_array_equal(X_wrapped.columns, X_df.columns)
+
+    X_np = np.asarray([[1, 3], [2, 4], [3, 5]])
+    X_wrapped = _wrap_in_pandas_container(X_np, columns=get_columns)
+    assert_array_equal(X_wrapped.columns, range(X_np.shape[1]))