ENH Adds array_out="pandas" to transformers in preprocessing module #20100

Closed
4 changes: 4 additions & 0 deletions doc/whats_new/v1.0.rst
@@ -438,6 +438,10 @@ Changelog
- |Feature| :class:`preprocessing.OrdinalEncoder` supports passing through
missing values by default. :pr:`19069` by `Thomas Fan`_.

- |Feature| Transformers in the :mod:`sklearn.preprocessing` module have an `array_out`
Reviewer comment (member):
is array_out a good name? that seems to imply... arrays. output_format? or output?

Reply (member):

I'd be happy with either output or output_format

kwarg in :term:`transform` that can be set to `'pandas'` to output
DataFrames. :pr:`20100` by `Thomas Fan`_.
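
A minimal usage sketch of the new kwarg, assuming a pandas input (the transformer choice here is illustrative, based on the `StandardScaler` diff below):

```python
import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({"a": [0.0, 1.0, 2.0], "b": [10.0, 20.0, 30.0]})

X_default = StandardScaler().fit_transform(df)                     # ndarray
X_pandas = StandardScaler().fit_transform(df, array_out="pandas")  # DataFrame
print(X_pandas.columns.tolist())  # ['a', 'b'], input feature names preserved
```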

- |Fix| :func:`preprocessing.scale`, :class:`preprocessing.StandardScaler`
and similar scalers detect near-constant features to avoid scaling them to
very large values. This problem happens in particular when using a scaler on
50 changes: 47 additions & 3 deletions sklearn/base.py
@@ -23,6 +23,7 @@
from .utils.validation import check_array
from .utils.validation import _num_features
from .utils._estimator_html_repr import estimator_html_repr
from .utils._array_out import _get_feature_names


def clone(estimator, *, safe=True):
@@ -376,6 +377,33 @@ def _check_n_features(self, X, reset):
f"X has {n_features} features, but {self.__class__.__name__} "
f"is expecting {self.n_features_in_} features as input.")

def _check_feature_names(self, X, reset=True):
"""Set the `feature_names_in_` attribute, or check against it.

Parameters
----------
X : array-like
The input samples.
reset : bool, default=True
If True, the `n_feature_names_` attribute is set to the feature
Reviewer comment (member), suggested change:
- If True, the `n_feature_names_` attribute is set to the feature
+ If True, the `feature_names_in_` attribute is set to the feature

names of `X`.
Else, the attribute must already exist and the function checks
that it is equal to the feature names of `X`.
"""
feature_names = _get_feature_names(X)
if reset:
self.feature_names_in_ = feature_names
return

if (not hasattr(self, 'feature_names_in_') or
self.feature_names_in_ is None or
feature_names is None):
return

if any(feature_names != self.feature_names_in_):
raise ValueError("The input's feature names do not match the "
"feature_names_in_ attribute.")

def _validate_data(self, X, y='no_validation', reset=True,
validate_separately=False, **check_params):
"""Validate input data and set or check the `n_features_in_` attribute.
@@ -418,6 +446,7 @@ def _validate_data(self, X, y='no_validation', reset=True,
out : {ndarray, sparse matrix} or tuple of these
The validated input. A tuple is returned if `y` is not None.
"""
self._check_feature_names(X, reset=reset)

if y is None:
if self._get_tags()['requires_y']:
@@ -678,7 +707,7 @@ def get_submatrix(self, i, data):
class TransformerMixin:
"""Mixin class for all transformers in scikit-learn."""

def fit_transform(self, X, y=None, **fit_params):
def fit_transform(self, X, y=None, array_out="default", **fit_params):
"""
Fit to data, then transform it.

@@ -694,6 +723,11 @@ def fit_transform(self, X, y=None, **fit_params):
default=None
Target values (None for unsupervised transformations).

array_out : {"default", "pandas"}, default="default"
Specify the output array type. If "pandas", a pandas DataFrame is
returned. If "default", an array-like without feature names is
returned.
Reviewer comment (member) on lines +728 to +729:
it's probably worth explaining why this is not a numpy array, and sometimes a sparse array.


**fit_params : dict
Additional fit parameters.

@@ -706,10 +740,20 @@ def fit_transform(self, X, y=None, **fit_params):
# method is possible for a given clustering algorithm
if y is None:
# fit method of arity 1 (unsupervised transformation)
return self.fit(X, **fit_params).transform(X)
fitted = self.fit(X, **fit_params)
Reviewer comment (member):

is fitted ever not self?

Reply (member):

Yes that also confused me. I think it's always self, right? Or at least should be.

else:
# fit method of arity 2 (supervised transformation)
return self.fit(X, y, **fit_params).transform(X)
fitted = self.fit(X, y, **fit_params)

if array_out == "default":
return fitted.transform(X)

# array_out != "default"
transform_params = inspect.signature(fitted.transform).parameters
Reviewer comment (member):

Can we not just pass it, or is the error message not good in that case?

Reply (member):

do you mean "can we just pass it"? If we don't pass it when it doesn't exist, it's a silent bug, and I think just passing it gives an error message which can be confusing to many people.

if "array_out" not in transform_params:
raise ValueError("Transform does not support array_out")

return fitted.transform(X, array_out=array_out)
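
A short sketch of how this dispatch behaves for a transformer whose `transform` has not adopted the new kwarg (`NoopTransformer` is a made-up example, not part of the PR):

```python
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class NoopTransformer(TransformerMixin, BaseEstimator):
    def fit(self, X, y=None):
        return self

    def transform(self, X):  # no array_out parameter
        return X

X = np.arange(6.0).reshape(3, 2)
NoopTransformer().fit_transform(X)  # fine, array_out defaults to "default"
try:
    NoopTransformer().fit_transform(X, array_out="pandas")
except ValueError as exc:
    print(exc)  # Transform does not support array_out
```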


class DensityMixin:
75 changes: 65 additions & 10 deletions sklearn/preprocessing/_data.py
@@ -29,6 +29,7 @@
from ..utils.validation import (check_is_fitted, check_random_state,
_check_sample_weight,
FLOAT_DTYPES)
from ..utils._array_out import _array_out_wrap

from ._encoders import OneHotEncoder
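
`_array_out_wrap` is also defined in the new `sklearn/utils/_array_out.py`, which this diff does not show. Inferring from its call sites below, a rough sketch might look like the following; note that the real decorator must also handle sparse outputs and index propagation, which this sketch glosses over:

```python
from functools import wraps

import pandas as pd

def _array_out_wrap(feature_names_out):
    # Hypothetical sketch, inferred from usage such as
    # @_array_out_wrap("one_to_one") on the transform methods below.
    def decorator(transform):
        @wraps(transform)
        def wrapped(self, X, *args, array_out="default", **kwargs):
            X_out = transform(self, X, *args, **kwargs)
            if array_out == "default":
                return X_out
            if array_out != "pandas":
                raise ValueError(f"Unsupported array_out: {array_out!r}")
            if feature_names_out == "one_to_one":
                # output columns mirror the input columns recorded at fit time
                columns = getattr(self, "feature_names_in_", None)
            else:
                # e.g. KBinsDiscretizer passes a callable that resolves to the
                # internal OneHotEncoder's get_feature_names
                columns = feature_names_out(self)()
            return pd.DataFrame(X_out, columns=columns)
        return wrapped
    return decorator
```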

Expand Down Expand Up @@ -440,14 +441,20 @@ def partial_fit(self, X, y=None):
self.data_range_ = data_range
return self

def transform(self, X):
@_array_out_wrap("one_to_one")
def transform(self, X, array_out="default"):
"""Scale features of X according to feature_range.

Parameters
----------
X : array-like of shape (n_samples, n_features)
Input data that will be transformed.

array_out : {"default", "pandas"}, default="default"
Specify the output array type. If "pandas", a pandas DataFrame is
returned. If "default", an array-like without feature names is
returned.

Returns
-------
Xt : ndarray of shape (n_samples, n_features)
@@ -884,7 +891,8 @@ def partial_fit(self, X, y=None, sample_weight=None):

return self

def transform(self, X, copy=None):
@_array_out_wrap("one_to_one")
def transform(self, X, copy=None, array_out="default"):
"""Perform standardization by centering and scaling

Parameters
@@ -893,6 +901,10 @@ def transform(self, X, copy=None):
The data used to scale along the features axis.
copy : bool, default=None
Copy the input X or not.
array_out : {"default", "pandas"}, default="default"
Specify the output array type. If "pandas", a pandas DataFrame is
returned. If "default", an array-like without feature names is
returned.

Returns
-------
@@ -1103,14 +1115,20 @@ def partial_fit(self, X, y=None):
self.scale_ = _handle_zeros_in_scale(max_abs, copy=True)
return self

def transform(self, X):
@_array_out_wrap("one_to_one")
def transform(self, X, array_out="default"):
"""Scale the data

Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The data that should be scaled.

array_out : {"default", "pandas"}, default="default"
Specify the output array type. If "pandas", a pandas DataFrame is
returned. If "default", an array-like without feature names is
returned.

Returns
-------
X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
@@ -1403,14 +1421,20 @@ def fit(self, X, y=None):

return self

def transform(self, X):
@_array_out_wrap("one_to_one")
def transform(self, X, array_out="default"):
"""Center and scale the data.

Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The data used to scale along the specified axis.

array_out : {"default", "pandas"}, default="default"
Specify the output array type. If "pandas", a pandas DataFrame is
returned. If "default", an array-like without feature names is
returned.

Returns
-------
X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
@@ -1755,7 +1779,8 @@ def fit(self, X, y=None):
self._validate_data(X, accept_sparse='csr')
return self

def transform(self, X, copy=None):
@_array_out_wrap("one_to_one")
def transform(self, X, copy=None, array_out="default"):
"""Scale each non zero row of X to unit norm

Parameters
@@ -1767,6 +1792,11 @@ def transform(self, X, copy=None):
copy : bool, default=None
Copy the input X or not.

array_out : {"default", "pandas"}, default="default"
Specify the output array type. If "pandas", a pandas DataFrame is
returned. If "default", an array-like without feature names is
returned.

Returns
-------
X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
@@ -1909,7 +1939,8 @@ def fit(self, X, y=None):
self._validate_data(X, accept_sparse='csr')
return self

def transform(self, X, copy=None):
@_array_out_wrap("one_to_one")
def transform(self, X, copy=None, array_out="default"):
"""Binarize each element of X.

Parameters
@@ -1922,6 +1953,11 @@ def transform(self, X, copy=None):
copy : bool
Copy the input X or not.

array_out : {"default", "pandas"}, default="default"
Specify the output array type. If "pandas", a pandas DataFrame is
returned. If "default", an array-like without feature names is
returned.

Returns
-------
X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
@@ -2033,7 +2069,8 @@ def fit(self, K, y=None):
self.K_fit_all_ = self.K_fit_rows_.sum() / n_samples
return self

def transform(self, K, copy=True):
@_array_out_wrap("one_to_one")
def transform(self, K, copy=True, array_out="default"):
"""Center kernel matrix.

Parameters
@@ -2044,6 +2081,11 @@ def transform(self, K, copy=True):
copy : bool, default=True
Set to False to perform inplace computation.

array_out : {"default", "pandas"}, default="default"
Specify the output array type. If "pandas", a pandas DataFrame is
returned. If "default", an array-like without feature names is
returned.

Returns
-------
K_new : ndarray of shape (n_samples1, n_samples2)
@@ -2500,7 +2542,8 @@ def _transform(self, X, inverse=False):

return X

def transform(self, X):
@_array_out_wrap("one_to_one")
def transform(self, X, array_out="default"):
"""Feature-wise transformation of the data.

Parameters
@@ -2511,6 +2554,11 @@ def transform(self, X):
``csc_matrix``. Additionally, the sparse matrix needs to be
nonnegative if `ignore_implicit_zeros` is False.

array_out : {"default", "pandas"}, default="default"
Specify the output array type. If "pandas", a pandas DataFrame is
returned. If "default", an array-like without feature names is
returned.

Returns
-------
Xt : {ndarray, sparse matrix} of shape (n_samples, n_features)
@@ -2792,7 +2840,8 @@ def fit(self, X, y=None):
self._fit(X, y=y, force_transform=False)
return self

def fit_transform(self, X, y=None):
@_array_out_wrap("one_to_one")
def fit_transform(self, X, y=None, array_out="default"):
return self._fit(X, y, force_transform=True)

def _fit(self, X, y=None, force_transform=False):
@@ -2825,14 +2874,20 @@ def _fit(self, X, y=None, force_transform=False):

return X

def transform(self, X):
@_array_out_wrap("one_to_one")
def transform(self, X, array_out="default"):
"""Apply the power transform to each feature using the fitted lambdas.

Parameters
----------
X : array-like of shape (n_samples, n_features)
The data to be transformed using a power transformation.

array_out : {"default", "pandas"}, default="default"
Specify the output array type. If "pandas", a pandas DataFrame is
returned. If "default", an array-like without feature names is
returned.

Returns
-------
X_trans : ndarray of shape (n_samples, n_features)
9 changes: 8 additions & 1 deletion sklearn/preprocessing/_discretization.py
@@ -15,6 +15,7 @@
from ..base import BaseEstimator, TransformerMixin
from ..utils.validation import check_array
from ..utils.validation import check_is_fitted
from ..utils._array_out import _array_out_wrap


class KBinsDiscretizer(TransformerMixin, BaseEstimator):
@@ -269,7 +270,8 @@ def _validate_n_bins(self, n_features):
.format(KBinsDiscretizer.__name__, indices))
return n_bins

def transform(self, X):
@_array_out_wrap(lambda self: self._encoder.get_feature_names)
def transform(self, X, array_out="default"):
"""
Discretize the data.

@@ -278,6 +280,11 @@ def transform(self, X):
X : array-like of shape (n_samples, n_features)
Data to be discretized.

array_out : {"default", "pandas"}, default="default"
Specify the output array type. If "pandas", a pandas DataFrame is
returned. If "default", an array-like without feature names is
returned.

Returns
-------
Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64}
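
Unlike the scalers above, KBinsDiscretizer is not one-to-one: with one-hot encoding, each input column expands into one output column per bin, hence the callable handed to the decorator instead of `"one_to_one"`. A hedged usage sketch (the exact output column names are an assumption):

```python
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer

df = pd.DataFrame({"age": [20.0, 35.0, 50.0, 65.0],
                   "height": [150.0, 160.0, 170.0, 180.0]})

est = KBinsDiscretizer(n_bins=2, encode="onehot-dense")
out = est.fit_transform(df, array_out="pandas")
# 2 features x 2 bins -> 4 one-hot columns, named by the internal encoder,
# along the lines of ['x0_0.0', 'x0_1.0', 'x1_0.0', 'x1_1.0']
print(out.shape, out.columns.tolist())
```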