From 92383cda36bd5de516c905fba96f732a046bc8b3 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Tue, 18 Oct 2022 17:50:21 -0400
Subject: [PATCH 1/7] ENH Improve set_output compatibility in ColumnTransformer

---
 doc/whats_new/v1.2.rst                        |  2 +-
 sklearn/compose/_column_transformer.py        | 38 +++++++++-
 .../compose/tests/test_column_transformer.py  | 76 +++++++++++++++++++
 sklearn/utils/_set_output.py                  | 13 +++-
 sklearn/utils/tests/test_set_output.py        | 13 ++++
 5 files changed, 136 insertions(+), 6 deletions(-)

diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst
index 7b7c33cb71cfd..47ae277954758 100644
--- a/doc/whats_new/v1.2.rst
+++ b/doc/whats_new/v1.2.rst
@@ -62,7 +62,7 @@ Changes impacting all modules
   or :class:`compose.ColumnTransformer` also define a `set_output`.
   For details, see
   `SLEP018 <https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep018/proposal.html>`__.
-  :pr:`23734` by `Thomas Fan`_.
+  :pr:`23734` and :pr:`xxxxx` by `Thomas Fan`_.
 
 - |Enhancement| Finiteness checks (detection of NaN and infinite values) in all
   estimators are now significantly more efficient for float32 data by leveraging
diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
index db7a7016c83ab..f33a443ab0d9e 100644
--- a/sklearn/compose/_column_transformer.py
+++ b/sklearn/compose/_column_transformer.py
@@ -514,6 +514,23 @@ def get_feature_names_out(self, input_features=None):
             # No feature names
             return np.array([], dtype=object)
 
+        return self._add_prefix_for_feature_names_out(
+            transformer_with_feature_names_out
+        )
+
+    def _add_prefix_for_feature_names_out(self, transformer_with_feature_names_out):
+        """Add prefix for feature names out that includes the transformer names.
+
+        Parameters
+        ----------
+        transformer_with_feature_names_out : list of tuples of (str, array-like of str)
+            The tuple consists of the transformer's name and its feature names out.
+
+        Returns
+        -------
+        feature_names_out : ndarray of str objects
+            Transformed feature names.
+        """
         if self.verbose_feature_names_out:
             # Prefix the feature names out with the transformers name
             names = list(
@@ -821,7 +838,26 @@ def _hstack(self, Xs):
             config = _get_output_config("transform", self)
             if config["dense"] == "pandas" and all(hasattr(X, "iloc") for X in Xs):
                 pd = check_pandas_support("transform")
-                return pd.concat(Xs, axis=1)
+                output = pd.concat(Xs, axis=1)
+
+                # If all transformers define `get_feature_names_out`, then transform
+                # will adjust the column names to be consistent with
+                # verbose_feature_names_out. Here we prefix the feature names if
+                # verbose_feature_names_out=True.
+
+                if not self.verbose_feature_names_out:
+                    return output
+
+                transformer_names = [
+                    t[0] for t in self._iter(fitted=True, replace_strings=True)
+                ]
+                feature_names_outs = [X.columns for X in Xs]
+                names_out = self._add_prefix_for_feature_names_out(
+                    list(zip(transformer_names, feature_names_outs))
+                )
+                output.columns = names_out
+                return output
+
             return np.hstack(Xs)
 
     def _sk_visual_block_(self):
diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py
index bade093b89c1f..e03722e5b480d 100644
--- a/sklearn/compose/tests/test_column_transformer.py
+++ b/sklearn/compose/tests/test_column_transformer.py
@@ -2053,3 +2053,79 @@ def test_column_transform_set_output_after_fitting(remainder):
     }
     for col, dtype in X_trans_df.dtypes.items():
         assert dtype == expected_dtypes[col]
+
+
+# Transformer that does not define get_feature_names_out. This transformer
+# always expects the input to be a DataFrame.
+class PandasOutTransformer(BaseEstimator):
+    def __init__(self, offset=1.0):
+        self.offset = offset
+
+    def fit(self, X, y=None):
+        pd = pytest.importorskip("pandas")
+        assert isinstance(X, pd.DataFrame)
+        return self
+
+    def transform(self, X, y=None):
+        pd = pytest.importorskip("pandas")
+        assert isinstance(X, pd.DataFrame)
+        return X - self.offset
+
+    def set_output(self, transform=None):
+        # This transformer will always output a DataFrame regardless of the
+        # configuration.
+        return self
+
+
+@pytest.mark.parametrize(
+    "trans_1, expected_verbose_names, expected_non_verbose_names",
+    [
+        (
+            PandasOutTransformer(offset=2.0),
+            ["trans_0__feat1", "trans_1__feat0"],
+            ["feat1", "feat0"],
+        ),
+        (
+            "drop",
+            ["trans_0__feat1"],
+            ["feat1"],
+        ),
+        (
+            "passthrough",
+            ["trans_0__feat1", "trans_1__feat0"],
+            ["feat1", "feat0"],
+        ),
+    ],
+)
+def test_transformers_with_pandas_out_but_not_feature_names_out(
+    trans_1, expected_verbose_names, expected_non_verbose_names
+):
+    """Check that set_output(transform="pandas") is compatible with more transformers.
+
+    Specifically, if a transformer returns a DataFrame, but does not define
+    `get_feature_names_out`.
+    """
+    pd = pytest.importorskip("pandas")
+
+    X_df = pd.DataFrame({"feat0": [1.0, 2.0, 3.0], "feat1": [2.0, 3.0, 4.0]})
+    ct = ColumnTransformer(
+        [
+            ("trans_0", PandasOutTransformer(offset=3.0), ["feat1"]),
+            ("trans_1", trans_1, ["feat0"]),
+        ]
+    )
+    X_trans_np = ct.fit_transform(X_df)
+    assert isinstance(X_trans_np, np.ndarray)
+
+    # `ct` does not have `get_feature_names_out`
+    with pytest.raises(AttributeError, match="not provide get_feature_names_out"):
+        ct.get_feature_names_out()
+
+    # The feature names are prefixed because verbose_feature_names_out=True is default
+    ct.set_output(transform="pandas")
+    X_trans_df0 = ct.fit_transform(X_df)
+    assert_array_equal(X_trans_df0.columns, expected_verbose_names)
+
+    ct.set_params(verbose_feature_names_out=False)
+    X_trans_df1 = ct.fit_transform(X_df)
+    assert_array_equal(X_trans_df1.columns, expected_non_verbose_names)
diff --git a/sklearn/utils/_set_output.py b/sklearn/utils/_set_output.py
index 525c6e0fe0118..be97c3734927f 100644
--- a/sklearn/utils/_set_output.py
+++ b/sklearn/utils/_set_output.py
@@ -27,9 +27,11 @@ def _wrap_in_pandas_container(
     columns : callable, ndarray, or None
         The column names or a callable that returns the column names. The
         callable is useful if the column names require some computation.
-        If `None` and `data_to_wrap` is already a dataframe, then the column
-        names are not changed. If `None` and `data_to_wrap` is **not** a
-        dataframe, then columns are `range(n_features)`.
+        If `columns` is a callable that raises an error, `columns` will have
+        the same semantics as `None`. If `None` and `data_to_wrap` is already a
+        dataframe, then the column names are not changed. If `None` and
+        `data_to_wrap` is **not** a dataframe, then columns are
+        `range(n_features)`.
 
     index : array-like, default=None
         Index for data.
@@ -43,7 +45,10 @@ def _wrap_in_pandas_container(
         raise ValueError("Pandas output does not support sparse data.")
 
     if callable(columns):
-        columns = columns()
+        try:
+            columns = columns()
+        except Exception:
+            columns = None
 
     pd = check_pandas_support("Setting output container to 'pandas'")
 
diff --git a/sklearn/utils/tests/test_set_output.py b/sklearn/utils/tests/test_set_output.py
index d20a4634f885d..1158d3ad47707 100644
--- a/sklearn/utils/tests/test_set_output.py
+++ b/sklearn/utils/tests/test_set_output.py
@@ -199,3 +199,16 @@ def test_auto_wrap_output_keys_errors_with_incorrect_input():
 
         class BadEstimator(_SetOutputMixin, auto_wrap_output_keys="bad_parameter"):
             pass
+
+
+def test__wrap_in_pandas_container_column_errors():
+    """If a callable `columns` errors, it has the same semantics as columns=None."""
+    pd = pytest.importorskip("pandas")
+
+    def get_columns():
+        raise ValueError("No feature names defined")
+
+    X_df = pd.DataFrame({"feat1": [1, 2, 3], "feat2": [3, 4, 5]})
+
+    X_wrapped = _wrap_in_pandas_container(X_df, columns=get_columns)
+    assert_array_equal(X_wrapped.columns, X_df.columns)

From 1ac26b2e851594be9d33c80c0d2f6eddb4451e91 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Tue, 18 Oct 2022 18:02:38 -0400
Subject: [PATCH 2/7] DOC Adds PR number

---
 doc/whats_new/v1.2.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst
index 47ae277954758..1da5fa9d63756 100644
--- a/doc/whats_new/v1.2.rst
+++ b/doc/whats_new/v1.2.rst
@@ -62,7 +62,7 @@ Changes impacting all modules
   or :class:`compose.ColumnTransformer` also define a `set_output`.
   For details, see
   `SLEP018 <https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep018/proposal.html>`__.
-  :pr:`23734` and :pr:`xxxxx` by `Thomas Fan`_.
+  :pr:`23734` and :pr:`24699` by `Thomas Fan`_.
 
 - |Enhancement| Finiteness checks (detection of NaN and infinite values) in all
   estimators are now significantly more efficient for float32 data by leveraging

From b16559088eb1d7d7244943f152d9e0e3a48db3f2 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Wed, 19 Oct 2022 14:37:55 -0400
Subject: [PATCH 3/7] Apply suggestions from code review

Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
---
 sklearn/compose/_column_transformer.py           | 2 +-
 sklearn/compose/tests/test_column_transformer.py | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
index f33a443ab0d9e..78c012ceec66a 100644
--- a/sklearn/compose/_column_transformer.py
+++ b/sklearn/compose/_column_transformer.py
@@ -528,7 +528,7 @@ def _add_prefix_for_feature_names_out(self, transformer_with_feature_names_out):
 
         Returns
         -------
-        feature_names_out : ndarray of str objects
+        feature_names_out : ndarray of shape (n_features,), dtype=str
             Transformed feature names.
         """
         if self.verbose_feature_names_out:
diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py
index e03722e5b480d..74be853adf280 100644
--- a/sklearn/compose/tests/test_column_transformer.py
+++ b/sklearn/compose/tests/test_column_transformer.py
@@ -2117,7 +2117,8 @@ def test_transformers_with_pandas_out_but_not_feature_names_out(
     X_trans_np = ct.fit_transform(X_df)
     assert isinstance(X_trans_np, np.ndarray)
 
-    # `ct` does not have `get_feature_names_out`
+    # `ct` does not have `get_feature_names_out` because `PandasOutTransformer` does define
+    # the method.
     with pytest.raises(AttributeError, match="not provide get_feature_names_out"):
         ct.get_feature_names_out()
 

From 0fc62e6a4550632e8b23d2b910b501f175a18b35 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Wed, 19 Oct 2022 14:39:41 -0400
Subject: [PATCH 4/7] TST Adds test for NumPy arrays

---
 sklearn/utils/tests/test_set_output.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/sklearn/utils/tests/test_set_output.py b/sklearn/utils/tests/test_set_output.py
index 1158d3ad47707..86b6ccc57f1c3 100644
--- a/sklearn/utils/tests/test_set_output.py
+++ b/sklearn/utils/tests/test_set_output.py
@@ -212,3 +212,7 @@ def get_columns():
 
     X_wrapped = _wrap_in_pandas_container(X_df, columns=get_columns)
     assert_array_equal(X_wrapped.columns, X_df.columns)
+
+    X_np = np.asarray([[1, 3], [2, 4], [3, 5]])
+    X_wrapped = _wrap_in_pandas_container(X_np, columns=get_columns)
+    assert_array_equal(X_wrapped.columns, [0, 1])

From 9317c08ec8383f20cfeec5e31151952a58f454cc Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Wed, 19 Oct 2022 14:40:19 -0400
Subject: [PATCH 5/7] STY Fixes style

---
 sklearn/compose/tests/test_column_transformer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py
index 74be853adf280..e0daca0b9c4fb 100644
--- a/sklearn/compose/tests/test_column_transformer.py
+++ b/sklearn/compose/tests/test_column_transformer.py
@@ -2117,8 +2117,8 @@ def test_transformers_with_pandas_out_but_not_feature_names_out(
     X_trans_np = ct.fit_transform(X_df)
     assert isinstance(X_trans_np, np.ndarray)
 
-    # `ct` does not have `get_feature_names_out` because `PandasOutTransformer` does define
-    # the method.
+    # `ct` does not have `get_feature_names_out` because `PandasOutTransformer` does
+    # not define the method.
     with pytest.raises(AttributeError, match="not provide get_feature_names_out"):
         ct.get_feature_names_out()
 

From abf9b5ef4c729e0f44b90d815b33cd423570ec9e Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Wed, 19 Oct 2022 14:41:09 -0400
Subject: [PATCH 6/7] DOC Minor rewording

---
 sklearn/compose/tests/test_column_transformer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py
index e0daca0b9c4fb..9b8bcdb80fabe 100644
--- a/sklearn/compose/tests/test_column_transformer.py
+++ b/sklearn/compose/tests/test_column_transformer.py
@@ -2055,8 +2055,8 @@ def test_column_transform_set_output_after_fitting(remainder):
         assert dtype == expected_dtypes[col]
 
 
-# Transformer that does not define get_feature_names_out. This transformer
-# always expects the input to be a DataFrame.
+# PandasOutTransformer that does not define get_feature_names_out and always expects
+# the input to be a DataFrame.
 class PandasOutTransformer(BaseEstimator):
     def __init__(self, offset=1.0):
         self.offset = offset

From 2fb935f83649f7b385f8ebade15836a464127f51 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Wed, 19 Oct 2022 14:42:24 -0400
Subject: [PATCH 7/7] TST Have test depend on X_np directly

---
 sklearn/utils/tests/test_set_output.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/utils/tests/test_set_output.py b/sklearn/utils/tests/test_set_output.py
index 86b6ccc57f1c3..cbc8ed8364865 100644
--- a/sklearn/utils/tests/test_set_output.py
+++ b/sklearn/utils/tests/test_set_output.py
@@ -215,4 +215,4 @@ def get_columns():
 
     X_np = np.asarray([[1, 3], [2, 4], [3, 5]])
     X_wrapped = _wrap_in_pandas_container(X_np, columns=get_columns)
-    assert_array_equal(X_wrapped.columns, [0, 1])
+    assert_array_equal(X_wrapped.columns, range(X_np.shape[1]))