ENH Introduces set_output API for pandas output #23734

Merged: 86 commits, Oct 12, 2022
Commits (86)
e1ea0a9
ENH Introduces set_output API
thomasjpfan Jun 22, 2022
07078a1
CLN Reduces API surface
thomasjpfan Jun 22, 2022
1faf347
ENH Expand test for failing case
thomasjpfan Jun 22, 2022
9f9680a
CLN Rename mixin
thomasjpfan Jun 22, 2022
a6a4b59
ENH Add full support for get_output in preprocessing
thomasjpfan Jun 22, 2022
4ae72c5
DOC Adds comment in clone
thomasjpfan Jun 23, 2022
beca084
DOC Adds whats new number
thomasjpfan Jun 23, 2022
021d36c
CLN Less diff
thomasjpfan Jun 23, 2022
de0db34
ENH Use keyword only arguments for public API
thomasjpfan Jun 27, 2022
63c4204
CLN Address comments
thomasjpfan Jul 4, 2022
ee4cdff
Merge remote-tracking branch 'upstream/main' into pandas_out_prototyp…
thomasjpfan Jul 4, 2022
9d318b1
FIX Fixes typo
thomasjpfan Jul 4, 2022
609f4f0
CLN Use dictionary instead
thomasjpfan Jul 5, 2022
64c761a
CLN Better error message
thomasjpfan Jul 5, 2022
471e2d5
TST Adds more code coverage
thomasjpfan Jul 5, 2022
63c2011
CLN Simplifies implementation
thomasjpfan Jul 9, 2022
89a854e
CLN Remove unneeded parameter
thomasjpfan Jul 9, 2022
20fed9e
CLN Simplify validation
thomasjpfan Jul 9, 2022
91e2448
Merge remote-tracking branch 'upstream/main' into pandas_out_prototyp…
thomasjpfan Jul 17, 2022
d63f059
CLN Rename output_transform transform_output
thomasjpfan Jul 17, 2022
32d9252
CLN Fix name
thomasjpfan Jul 17, 2022
126a9aa
DOC Update whats new
thomasjpfan Jul 17, 2022
0d02e50
Merge remote-tracking branch 'upstream/main' into pandas_out_prototyp…
thomasjpfan Aug 17, 2022
fb0abaa
ENH More flexible pandas output
thomasjpfan Aug 18, 2022
1c5c2ef
ENH Fixes set_output in other transformers
thomasjpfan Aug 18, 2022
e4a663f
TST Adds failing test for column transformer
thomasjpfan Aug 18, 2022
c8667b9
ENH Give column transformer desired behavior
thomasjpfan Aug 19, 2022
390e257
Merge remote-tracking branch 'upstream/main' into pandas_out_prototyp…
thomasjpfan Aug 19, 2022
19b6032
TST Fixes future warning
thomasjpfan Aug 19, 2022
0d2610a
CLN Refactor wrapper to take original input
thomasjpfan Aug 20, 2022
531c9c7
CLN Clean up column transformer
thomasjpfan Aug 20, 2022
321ede0
CLN Improve function and parameter names
thomasjpfan Aug 20, 2022
865edf5
ENH Adds support for cross decomposition
thomasjpfan Aug 20, 2022
110e50d
CLN Smaller diff by removing unneeded feature
thomasjpfan Aug 20, 2022
1c658ed
TST Adds tests for no get_feature_names_out
thomasjpfan Aug 20, 2022
5ae531f
CLN Make get_output_config more flexible
thomasjpfan Aug 20, 2022
4c7fefa
Merge remote-tracking branch 'upstream/main' into pandas_out_prototyp…
thomasjpfan Aug 26, 2022
4f8c2ac
Merge remote-tracking branch 'upstream/main' into pandas_out_prototyp…
thomasjpfan Sep 9, 2022
50fd9c1
DOC Update whats new
thomasjpfan Sep 9, 2022
0f63fa2
CLN Reduce diff
thomasjpfan Sep 9, 2022
c9fc072
Merge remote-tracking branch 'upstream/main' into pandas_out_prototyp…
thomasjpfan Sep 12, 2022
09d2359
WIP Move avaliable_if it its own file
thomasjpfan Sep 14, 2022
c59d800
CLN Be more strict about columns existing
thomasjpfan Sep 14, 2022
128ee66
CLN Address comments
thomasjpfan Sep 20, 2022
3477d51
CLN Address more comments
thomasjpfan Sep 20, 2022
f94870e
REV Revert unneeded change
thomasjpfan Sep 21, 2022
9cbb47c
TST Ust assert_from_equal
thomasjpfan Sep 21, 2022
cf0c916
Merge remote-tracking branch 'upstream/main' into pandas_out_prototyp…
thomasjpfan Sep 21, 2022
94c4ff5
TST Adds OneHotEncoder(sparse_output=True) to common test
thomasjpfan Sep 21, 2022
4e56880
CLN Better column transformer logic
thomasjpfan Sep 21, 2022
980caf3
CLN More strict language in error message
thomasjpfan Sep 21, 2022
9888bdd
DOC Remove docstring for unused parameter
thomasjpfan Sep 21, 2022
2b238aa
TST More tests
thomasjpfan Sep 21, 2022
2db0dd4
DOC Better docstring
thomasjpfan Sep 22, 2022
77511b5
Merge remote-tracking branch 'upstream/main' into pandas_out_prototyp…
thomasjpfan Sep 22, 2022
903ad04
TST Fix failing tests
thomasjpfan Sep 22, 2022
4d7f594
DOC Adds example on using set_output API
thomasjpfan Sep 24, 2022
26853ab
DOC Add developer docs for set_output
thomasjpfan Sep 25, 2022
7f13efb
DOC Adds more developer docs
thomasjpfan Sep 26, 2022
f64b2f5
FIX Fixes bug with set_output and transform=None
thomasjpfan Sep 26, 2022
96ae074
Merge remote-tracking branch 'upstream/main' into pandas_out_prototyp…
thomasjpfan Sep 26, 2022
99f9497
ENH Moves set_output mixin into BaseEstimator
thomasjpfan Sep 26, 2022
2fc486d
API Make more API private
thomasjpfan Sep 26, 2022
fe87f71
DOC Remove unneeded link
thomasjpfan Sep 26, 2022
cca5548
API Move back to TransformerMixin
thomasjpfan Sep 26, 2022
54964dd
ENH FunctionTransformer.set_output warns about func when transform=pa…
thomasjpfan Sep 27, 2022
3f56922
Merge remote-tracking branch 'upstream/main' into pandas_out_prototyp…
thomasjpfan Sep 27, 2022
88e17ff
Apply suggestions from code review
thomasjpfan Oct 5, 2022
072b1a3
Merge remote-tracking branch 'upstream/main' into pandas_out_prototyp…
thomasjpfan Oct 5, 2022
78f4a8b
CLN Smaller diff compared to main
thomasjpfan Oct 5, 2022
598a94f
STY Flake8 error
thomasjpfan Oct 5, 2022
08e01d0
Merge remote-tracking branch 'upstream/main' into pandas_out_prototyp…
thomasjpfan Oct 6, 2022
0009dc3
FIX Merge conflict
thomasjpfan Oct 6, 2022
244c002
CLN Address more comments
thomasjpfan Oct 6, 2022
4b1f1e5
Merge remote-tracking branch 'upstream/main' into pandas_out_prototyp…
thomasjpfan Oct 6, 2022
c33a307
FIX Fixes doc build
thomasjpfan Oct 6, 2022
51fc045
FIX Fixes test for new message
thomasjpfan Oct 6, 2022
fad1fa5
CLN Address comments
thomasjpfan Oct 7, 2022
c8bb076
DOC Adds comments about data_to_wrap
thomasjpfan Oct 7, 2022
d25ba8b
DOC Adds behavior with multiple array output
thomasjpfan Oct 7, 2022
01771ae
STY Slight formatting
thomasjpfan Oct 7, 2022
1421e8a
TST Adds testing for fit_transform and fit.transform
thomasjpfan Oct 7, 2022
24c3fc1
Update doc/developers/develop.rst
thomasjpfan Oct 10, 2022
b87ad84
Merge remote-tracking branch 'upstream/main' into pandas_out_prototyp…
thomasjpfan Oct 11, 2022
5313958
ENH Rename 'default' to 'native'
thomasjpfan Oct 11, 2022
48add35
Revert "ENH Rename 'default' to 'native'"
thomasjpfan Oct 12, 2022
29 changes: 29 additions & 0 deletions doc/developers/develop.rst
@@ -635,6 +635,35 @@ instantiated with an instance of ``LogisticRegression`` (or
of these two models is somewhat idiosyncratic but both should provide robust
closed-form solutions.

.. _developer_api_set_output:

Developer API for `set_output`
==============================

With
`SLEP018 <https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep018/proposal.html>`__,
scikit-learn introduces the `set_output` API for configuring transformers to
output pandas DataFrames. The `set_output` API is automatically defined if the
transformer defines :term:`get_feature_names_out` and subclasses
:class:`base.TransformerMixin`. :term:`get_feature_names_out` is used to get the
column names of pandas output. You can opt out of the `set_output` API by
setting `auto_wrap_output_keys=None` when defining a custom subclass::

    class MyTransformer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None):

        def fit(self, X, y=None):
            return self

        def transform(self, X, y=None):
            return X

        def get_feature_names_out(self, input_features=None):
            ...

For transformers that return multiple arrays in `transform`, auto wrapping will
only wrap the first array and not alter the other arrays.

See :ref:`sphx_glr_auto_examples_miscellaneous_plot_set_output.py`
for an example on how to use the API.
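For completeness, a sketch of the opposite case (not part of the diff): a transformer
that defines :term:`get_feature_names_out` and subclasses :class:`base.TransformerMixin`
gets `set_output` automatically. The class and feature names below are illustrative and
assume a scikit-learn version with this PR merged and pandas installed::

    import numpy as np
    from sklearn.base import BaseEstimator, TransformerMixin

    class AddOne(TransformerMixin, BaseEstimator):

        def fit(self, X, y=None):
            # Record the number of input features for get_feature_names_out.
            self.n_features_in_ = X.shape[1]
            return self

        def transform(self, X, y=None):
            return np.asarray(X) + 1

        def get_feature_names_out(self, input_features=None):
            return np.asarray(
                [f"x{i}" for i in range(self.n_features_in_)], dtype=object
            )

    # `set_output` is available because get_feature_names_out is defined; after
    # this call, transform and fit_transform return pandas DataFrames.
    adder = AddOne().set_output(transform="pandas")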

.. _coding-guidelines:

Coding guidelines
7 changes: 7 additions & 0 deletions doc/whats_new/v1.2.rst
@@ -52,6 +52,13 @@ random sampling procedures.
Changes impacting all modules
-----------------------------

- |MajorFeature| The `set_output` API has been adopted by all transformers.
Meta-estimators that contain transformers such as :class:`pipeline.Pipeline`
or :class:`compose.ColumnTransformer` also define a `set_output`.
For details, see
`SLEP018 <https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep018/proposal.html>`__.
:pr:`23734` by `Thomas Fan`_.

- |Enhancement| Finiteness checks (detection of NaN and infinite values) in all
estimators are now significantly more efficient for float32 data by leveraging
NumPy's SIMD optimized primitives.
111 changes: 111 additions & 0 deletions examples/miscellaneous/plot_set_output.py
@@ -0,0 +1,111 @@
"""
================================
Introducing the `set_output` API
================================

.. currentmodule:: sklearn

This example demonstrates the `set_output` API, which configures transformers to
output pandas DataFrames. The output can be configured per estimator by calling
the `set_output` method, or globally by calling `set_config(transform_output="pandas")`.
For details, see
`SLEP018 <https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep018/proposal.html>`__.
""" # noqa

# %%
# First, we load the iris dataset as a DataFrame to demonstrate the `set_output` API.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(as_frame=True, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
X_train.head()

# %%
# To configure an estimator such as :class:`preprocessing.StandardScaler` to return
# DataFrames, call `set_output`. This feature requires pandas to be installed.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().set_output(transform="pandas")

scaler.fit(X_train)
X_test_scaled = scaler.transform(X_test)
X_test_scaled.head()

# %%
# `set_output` can be called after `fit` to configure `transform` after the fact.
scaler2 = StandardScaler()

scaler2.fit(X_train)
X_test_np = scaler2.transform(X_test)
print(f"Default output type: {type(X_test_np).__name__}")

scaler2.set_output(transform="pandas")
X_test_df = scaler2.transform(X_test)
print(f"Configured pandas output type: {type(X_test_df).__name__}")

# %%
# In a :class:`pipeline.Pipeline`, `set_output` configures all steps to output
# DataFrames.
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectPercentile

clf = make_pipeline(
    StandardScaler(), SelectPercentile(percentile=75), LogisticRegression()
)
clf.set_output(transform="pandas")
clf.fit(X_train, y_train)

# %%
# Each transformer in the pipeline is configured to return DataFrames. This
# means that the final logistic regression step contains the feature names.
clf[-1].feature_names_in_

# %%
# Next we load the titanic dataset to demonstrate `set_output` with
# :class:`compose.ColumnTransformer` and heterogeneous data.
from sklearn.datasets import fetch_openml

X, y = fetch_openml(
    "titanic", version=1, as_frame=True, return_X_y=True, parser="pandas"
)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# %%
# The `set_output` API can be configured globally by using :func:`set_config` and
# setting `transform_output` to `"pandas"`.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn import set_config

set_config(transform_output="pandas")

num_pipe = make_pipeline(SimpleImputer(), StandardScaler())
ct = ColumnTransformer(
    (
        ("numerical", num_pipe, ["age", "fare"]),
        (
            "categorical",
            OneHotEncoder(
                sparse_output=False, drop="if_binary", handle_unknown="ignore"
            ),
            ["embarked", "sex", "pclass"],
        ),
    ),
    verbose_feature_names_out=False,
)
clf = make_pipeline(ct, SelectPercentile(percentile=50), LogisticRegression())
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

# %%
# With the global configuration, all transformers output DataFrames. This allows us to
# easily plot the logistic regression coefficients with the corresponding feature names.
import pandas as pd

log_reg = clf[-1]
coef = pd.Series(log_reg.coef_.ravel(), index=log_reg.feature_names_in_)
_ = coef.sort_values().plot.barh()
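As a small usage sketch (not part of the example file): the same API accepts the
"default" option to go back to the unwrapped output container, either per estimator
or globally. This reuses the `scaler` and `set_config` names defined above and
assumes scikit-learn >= 1.2.

scaler.set_output(transform="default")   # this estimator returns NumPy arrays again
set_config(transform_output="default")   # restore the global default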
16 changes: 16 additions & 0 deletions sklearn/_config.py
@@ -14,6 +14,7 @@
    ),
    "enable_cython_pairwise_dist": True,
    "array_api_dispatch": False,
    "transform_output": "default",
Review thread:

Member: Could we use a more descriptive name than "default" as the default value? I know it's written in SLEP 018, but something like "numpy" or "array-like" would better describe this option, IMHO.

Member Author: This option used to be "numpy", but that was inconsistent with transformers that can output sparse data. I was also thinking about third-party transformers that already output DataFrames, where a "numpy" default would be strange.

I can get behind "array-like". My only concern is that sparse data is a weird "array-like", because asarray on a sparse matrix returns an object dtype:

from scipy.sparse import csr_matrix
import numpy as np

mat = csr_matrix([[1, 2, 0]])
print(np.asarray(mat).dtype)
# object

Member Author: Looking at our glossary entry for "array-like", we exclude "sparse matrix" from "array-like". In that case, "array-like" would not be a good default, because it does not cover sparse matrices.

Member: Do you have a proposal? Naming the default "default" just seems wrong to me. What if we change it in the future?

Member Author: The semantics of "default" is "the transformer does anything it wants". Here are some options:

  1. None
  2. "undefined"
  3. "unmodified"
  4. "unchanged"

I am in favor of None. I think it's the most pythonic way to say "use the default".

Member: I'm fine with None. What do others think?

}
_threadlocal = threading.local()

@@ -52,6 +53,7 @@ def set_config(
    pairwise_dist_chunk_size=None,
    enable_cython_pairwise_dist=None,
    array_api_dispatch=None,
    transform_output=None,
):
    """Set global scikit-learn configuration

@@ -120,6 +122,11 @@

        .. versionadded:: 1.2

    transform_output : str, default=None
        Configure the output container for transform.

        .. versionadded:: 1.2

    See Also
    --------
    config_context : Context manager for global scikit-learn configuration.
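A minimal usage sketch of the new `transform_output` option of `set_config` (not part
of the diff; assumes scikit-learn >= 1.2 and pandas installed):

from sklearn import set_config

set_config(transform_output="pandas")   # transformers now return DataFrames
set_config(transform_output="default")  # restore the default behavior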
@@ -141,6 +148,8 @@
        local_config["enable_cython_pairwise_dist"] = enable_cython_pairwise_dist
    if array_api_dispatch is not None:
        local_config["array_api_dispatch"] = array_api_dispatch
    if transform_output is not None:
        local_config["transform_output"] = transform_output


@contextmanager
Expand All @@ -153,6 +162,7 @@ def config_context(
pairwise_dist_chunk_size=None,
enable_cython_pairwise_dist=None,
array_api_dispatch=None,
transform_output=None,
):
"""Context manager for global scikit-learn configuration.

@@ -220,6 +230,11 @@

        .. versionadded:: 1.2

    transform_output : str, default=None
        Configure the output container for transform.

        .. versionadded:: 1.2

    Yields
    ------
    None.
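The same option can also be scoped with the context manager; a small sketch (not part
of the diff; assumes scikit-learn >= 1.2 and pandas installed):

import numpy as np
from sklearn import config_context
from sklearn.preprocessing import StandardScaler

X = np.array([[1.0, 2.0], [3.0, 4.0]])
with config_context(transform_output="pandas"):
    # Inside the context, fit_transform returns a DataFrame.
    X_df = StandardScaler().fit_transform(X)
# Outside the context, the previous global setting applies again.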
@@ -256,6 +271,7 @@
        pairwise_dist_chunk_size=pairwise_dist_chunk_size,
        enable_cython_pairwise_dist=enable_cython_pairwise_dist,
        array_api_dispatch=array_api_dispatch,
        transform_output=transform_output,
    )

    try:
17 changes: 15 additions & 2 deletions sklearn/base.py
@@ -15,6 +15,7 @@
from . import __version__
from ._config import get_config
from .utils import _IS_32BIT
from .utils._set_output import _SetOutputMixin
from .utils._tags import (
    _DEFAULT_TAGS,
)
@@ -98,6 +99,13 @@ def clone(estimator, *, safe=True):
                "Cannot clone object %s, as the constructor "
                "either does not set or modifies parameter %s" % (estimator, name)
            )

    # _sklearn_output_config is used by `set_output` to configure the output
    # container of an estimator.
    if hasattr(estimator, "_sklearn_output_config"):
        new_object._sklearn_output_config = copy.deepcopy(
            estimator._sklearn_output_config
        )
    return new_object
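A small sketch of what the change above means in practice, namely that the
per-estimator output configuration survives `clone` (not part of the diff; the
estimator choice is illustrative and assumes scikit-learn >= 1.2):

from sklearn.base import clone
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().set_output(transform="pandas")
cloned = clone(scaler)
# _sklearn_output_config was deep-copied, so the clone also returns
# DataFrames from transform once it has been fitted.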


@@ -798,8 +806,13 @@ def get_submatrix(self, i, data):
        return data[row_ind[:, np.newaxis], col_ind]


class TransformerMixin:
    """Mixin class for all transformers in scikit-learn."""
class TransformerMixin(_SetOutputMixin):
    """Mixin class for all transformers in scikit-learn.

    If :term:`get_feature_names_out` is defined and `auto_wrap_output` is True,
    then `BaseEstimator` will automatically wrap `transform` and `fit_transform` to
    follow the `set_output` API. See the :ref:`developer_api_set_output` for details.
    """

    def fit_transform(self, X, y=None, **fit_params):
        """
63 changes: 60 additions & 3 deletions sklearn/compose/_column_transformer.py
@@ -20,6 +20,8 @@
from ..utils import Bunch
from ..utils import _safe_indexing
from ..utils import _get_column_indices
from ..utils._set_output import _get_output_config, _safe_set_output
from ..utils import check_pandas_support
from ..utils.metaestimators import _BaseComposition
from ..utils.validation import check_array, check_is_fitted, _check_feature_names_in
from ..utils.fixes import delayed
@@ -252,6 +254,35 @@ def _transformers(self, value):
        except (TypeError, ValueError):
            self.transformers = value

    def set_output(self, transform=None):
        """Set the output container when `"transform"` and `"fit_transform"` are called.

        Calling `set_output` will set the output of all estimators in `transformers`
        and `transformers_`.

        Parameters
        ----------
        transform : {"default", "pandas"}, default=None
            Configure output of `transform` and `fit_transform`.

        Returns
        -------
        self : estimator instance
            Estimator instance.
        """
        super().set_output(transform=transform)
        transformers = (
            trans
            for _, trans, _ in chain(
                self.transformers, getattr(self, "transformers_", [])
            )
            if trans not in {"passthrough", "drop"}
        )
        for trans in transformers:
            _safe_set_output(trans, transform=transform)

        return self
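A usage sketch of the propagation described above (not part of the diff; the
transformers and column names are illustrative, assumes scikit-learn >= 1.2):

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

ct = ColumnTransformer(
    [
        ("num", StandardScaler(), ["age"]),
        ("cat", OneHotEncoder(sparse_output=False), ["city"]),
    ]
)
# Calls set_output on the ColumnTransformer itself and forwards it to
# StandardScaler and OneHotEncoder (and to transformers_ once fitted).
ct.set_output(transform="pandas")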

    def get_params(self, deep=True):
        """Get parameters for this estimator.

@@ -302,7 +333,19 @@ def _iter(self, fitted=False, replace_strings=False, column_as_strings=False):

        """
        if fitted:
            transformers = self.transformers_
            if replace_strings:
                # Replace "passthrough" with the fitted version in
                # _name_to_fitted_passthrough
                def replace_passthrough(name, trans, columns):
                    if name not in self._name_to_fitted_passthrough:
                        return name, trans, columns
                    return name, self._name_to_fitted_passthrough[name], columns

                transformers = [
                    replace_passthrough(*trans) for trans in self.transformers_
                ]
            else:
                transformers = self.transformers_
        else:
            # interleave the validated column specifiers
            transformers = [
@@ -314,12 +357,17 @@ def _iter(self, fitted=False, replace_strings=False, column_as_strings=False):
            transformers = chain(transformers, [self._remainder])
        get_weight = (self.transformer_weights or {}).get

        output_config = _get_output_config("transform", self)
        for name, trans, columns in transformers:
            if replace_strings:
                # replace 'passthrough' with identity transformer and
                # skip in case of 'drop'
                if trans == "passthrough":
                    trans = FunctionTransformer(accept_sparse=True, check_inverse=False)
                    trans = FunctionTransformer(
                        accept_sparse=True,
                        check_inverse=False,
                        feature_names_out="one-to-one",
                    ).set_output(transform=output_config["dense"])
                elif trans == "drop":
                    continue
                elif _is_empty_column_selection(columns):
@@ -505,15 +553,20 @@ def _update_fitted_transformers(self, transformers):
        # transformers are fitted; excludes 'drop' cases
        fitted_transformers = iter(transformers)
        transformers_ = []
        self._name_to_fitted_passthrough = {}

        for name, old, column, _ in self._iter():
            if old == "drop":
                trans = "drop"
            elif old == "passthrough":
                # FunctionTransformer is present in list of transformers,
                # so get next transformer, but save original string
                next(fitted_transformers)
                func_transformer = next(fitted_transformers)
                trans = "passthrough"

                # The fitted FunctionTransformer is saved in another attribute,
                # so it can be used during transform for set_output.
                self._name_to_fitted_passthrough[name] = func_transformer
            elif _is_empty_column_selection(column):
                trans = old
            else:
@@ -765,6 +818,10 @@ def _hstack(self, Xs):
            return sparse.hstack(converted_Xs).tocsr()
        else:
            Xs = [f.toarray() if sparse.issparse(f) else f for f in Xs]
            config = _get_output_config("transform", self)
            if config["dense"] == "pandas" and all(hasattr(X, "iloc") for X in Xs):
                pd = check_pandas_support("transform")
                return pd.concat(Xs, axis=1)
            return np.hstack(Xs)

    def _sk_visual_block_(self):