Skip to content

ColumnTransformers should use get_feature_names_out() when columns attribute is not available #21452

@ageron

Description

@ageron

Describe the bug

Now that transformers have a get_feature_names_out() method, ColumnTransformers should make use of it when column names are lost in a pipeline.

Steps/Code to Reproduce

Suppose we have a DataFrame with 3 columns A, B, C. We want to use a SimpleImputer for both A and B, followed by a StandardScaler for A and a MinMaxScaler for B. And we want a OneHotEncoder for C.

A direct mapping from this description to pipelines and column transformers looks like this (a runnable gist reproducing the issue is linked from the original report):

import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler

df = pd.DataFrame({"A": [1.,2.,np.nan], 
                   "B": [4.,np.nan,6.],
                   "C": ["G1", "G2", "G1"]})

num_pipeline = make_pipeline(
    SimpleImputer(),
    make_column_transformer(
        (StandardScaler(), ["A"]),
        (MinMaxScaler(), ["B"]),
    )
)

preprocessing = make_column_transformer(
    (num_pipeline, ["A", "B"]),
    (OneHotEncoder(), ["C"]),
)

X = preprocessing.fit_transform(df)

Expected Results

I expect this code to work, since all the required information is available to the Pipeline and ColumnTransformers: they could use get_feature_names_out() whenever they cannot rely on the columns attribute.

Actual Results

The code above raises a ValueError (caused by an underlying AttributeError, as the chained traceback below shows):

Stacktrace:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/sklearn/utils/__init__.py in _get_column_indices(X, key)
    407         try:
--> 408             all_columns = X.columns
    409         except AttributeError:

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
15 frames
<ipython-input-2-ef089cab1885> in <module>()
     23 )
     24 
---> 25 X = preprocessing.fit_transform(df)

/usr/local/lib/python3.7/dist-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
    673         self._validate_remainder(X)
    674 
--> 675         result = self._fit_transform(X, y, _fit_transform_one)
    676 
    677         if not result:

/usr/local/lib/python3.7/dist-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted, column_as_strings)
    613                     message=self._log_message(name, idx, len(transformers)),
    614                 )
--> 615                 for idx, (name, trans, column, weight) in enumerate(transformers, 1)
    616             )
    617         except ValueError as e:

/usr/local/lib/python3.7/dist-packages/joblib/parallel.py in __call__(self, iterable)
   1039             # remaining jobs.
   1040             self._iterating = False
-> 1041             if self.dispatch_one_batch(iterator):
   1042                 self._iterating = self._original_iterator is not None
   1043 

/usr/local/lib/python3.7/dist-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    857                 return False
    858             else:
--> 859                 self._dispatch(tasks)
    860                 return True
    861 

/usr/local/lib/python3.7/dist-packages/joblib/parallel.py in _dispatch(self, batch)
    775         with self._lock:
    776             job_idx = len(self._jobs)
--> 777             job = self._backend.apply_async(batch, callback=cb)
    778             # A job can complete so quickly than its callback is
    779             # called before we get here, causing self._jobs to

/usr/local/lib/python3.7/dist-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
    206     def apply_async(self, func, callback=None):
    207         """Schedule a func to be run"""
--> 208         result = ImmediateResult(func)
    209         if callback:
    210             callback(result)

/usr/local/lib/python3.7/dist-packages/joblib/_parallel_backends.py in __init__(self, batch)
    570         # Don't delay the application, to avoid keeping the input
    571         # arguments in memory
--> 572         self.results = batch()
    573 
    574     def get(self):

/usr/local/lib/python3.7/dist-packages/joblib/parallel.py in __call__(self)
    261         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    262             return [func(*args, **kwargs)
--> 263                     for func, args, kwargs in self.items]
    264 
    265     def __reduce__(self):

/usr/local/lib/python3.7/dist-packages/joblib/parallel.py in <listcomp>(.0)
    261         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    262             return [func(*args, **kwargs)
--> 263                     for func, args, kwargs in self.items]
    264 
    265     def __reduce__(self):

/usr/local/lib/python3.7/dist-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
    207     def __call__(self, *args, **kwargs):
    208         with config_context(**self.config):
--> 209             return self.function(*args, **kwargs)
    210 
    211 

/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    889     with _print_elapsed_time(message_clsname, message):
    890         if hasattr(transformer, "fit_transform"):
--> 891             res = transformer.fit_transform(X, y, **fit_params)
    892         else:
    893             res = transformer.fit(X, y, **fit_params).transform(X)

/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    432             fit_params_last_step = fit_params_steps[self.steps[-1][0]]
    433             if hasattr(last_step, "fit_transform"):
--> 434                 return last_step.fit_transform(Xt, y, **fit_params_last_step)
    435             else:
    436                 return last_step.fit(Xt, y, **fit_params_last_step).transform(Xt)

/usr/local/lib/python3.7/dist-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
    670         self._check_n_features(X, reset=True)
    671         self._validate_transformers()
--> 672         self._validate_column_callables(X)
    673         self._validate_remainder(X)
    674 

/usr/local/lib/python3.7/dist-packages/sklearn/compose/_column_transformer.py in _validate_column_callables(self, X)
    350                 columns = columns(X)
    351             all_columns.append(columns)
--> 352             transformer_to_input_indices[name] = _get_column_indices(X, columns)
    353 
    354         self._columns = all_columns

/usr/local/lib/python3.7/dist-packages/sklearn/utils/__init__.py in _get_column_indices(X, key)
    409         except AttributeError:
    410             raise ValueError(
--> 411                 "Specifying the columns using strings is only "
    412                 "supported for pandas DataFrames"
    413             )

ValueError: Specifying the columns using strings is only supported for pandas DataFrames

Versions

System:
python: 3.7.12 (default, Sep 10 2021, 00:21:48) [GCC 7.5.0]
executable: /usr/bin/python3
machine: Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic

Python dependencies:
pip: 21.1.3
setuptools: 57.4.0
sklearn: 1.0
numpy: 1.19.5
scipy: 1.4.1
Cython: 0.29.24
pandas: 1.1.5
matplotlib: 3.2.2
joblib: 1.0.1
threadpoolctl: 3.0.0

Built with OpenMP: True

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions