-
-
Notifications
You must be signed in to change notification settings - Fork 26.2k
Description
Describe the bug
Now that transformers have a `get_feature_names_out()` method, `ColumnTransformer`s should make use of it when column names are lost in a pipeline.
Steps/Code to Reproduce
Suppose we have a DataFrame with 3 columns A, B, C. We want to use a SimpleImputer for both A and B, followed by a StandardScaler for A and a MinMaxScaler for B. And we want a OneHotEncoder for C.
A direct mapping from this description to pipelines and column transformers looks like this (see this gist):
import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
df = pd.DataFrame({"A": [1.,2.,np.nan],
"B": [4.,np.nan,6.],
"C": ["G1", "G2", "G1"]})
num_pipeline = make_pipeline(
SimpleImputer(),
make_column_transformer(
(StandardScaler(), ["A"]),
(MinMaxScaler(), ["B"]),
)
)
preprocessing = make_column_transformer(
(num_pipeline, ["A", "B"]),
(OneHotEncoder(), ["C"]),
)
X = preprocessing.fit_transform(df)
Expected Results
I expect this code to work, since all the required information is available to the `Pipeline` and `ColumnTransformer`s: they could use `get_feature_names_out()` whenever they cannot rely on the `columns` attribute.
Actual Results
The code above raises a `ValueError` (triggered by an internal `AttributeError`, as shown in the traceback below):
Stacktrace:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/sklearn/utils/__init__.py in _get_column_indices(X, key)
407 try:
--> 408 all_columns = X.columns
409 except AttributeError:
AttributeError: 'numpy.ndarray' object has no attribute 'columns'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
15 frames
<ipython-input-2-ef089cab1885> in <module>()
23 )
24
---> 25 X = preprocessing.fit_transform(df)
/usr/local/lib/python3.7/dist-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
673 self._validate_remainder(X)
674
--> 675 result = self._fit_transform(X, y, _fit_transform_one)
676
677 if not result:
/usr/local/lib/python3.7/dist-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted, column_as_strings)
613 message=self._log_message(name, idx, len(transformers)),
614 )
--> 615 for idx, (name, trans, column, weight) in enumerate(transformers, 1)
616 )
617 except ValueError as e:
/usr/local/lib/python3.7/dist-packages/joblib/parallel.py in __call__(self, iterable)
1039 # remaining jobs.
1040 self._iterating = False
-> 1041 if self.dispatch_one_batch(iterator):
1042 self._iterating = self._original_iterator is not None
1043
/usr/local/lib/python3.7/dist-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
857 return False
858 else:
--> 859 self._dispatch(tasks)
860 return True
861
/usr/local/lib/python3.7/dist-packages/joblib/parallel.py in _dispatch(self, batch)
775 with self._lock:
776 job_idx = len(self._jobs)
--> 777 job = self._backend.apply_async(batch, callback=cb)
778 # A job can complete so quickly than its callback is
779 # called before we get here, causing self._jobs to
/usr/local/lib/python3.7/dist-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
/usr/local/lib/python3.7/dist-packages/joblib/_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
/usr/local/lib/python3.7/dist-packages/joblib/parallel.py in __call__(self)
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
262 return [func(*args, **kwargs)
--> 263 for func, args, kwargs in self.items]
264
265 def __reduce__(self):
/usr/local/lib/python3.7/dist-packages/joblib/parallel.py in <listcomp>(.0)
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
262 return [func(*args, **kwargs)
--> 263 for func, args, kwargs in self.items]
264
265 def __reduce__(self):
/usr/local/lib/python3.7/dist-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
207 def __call__(self, *args, **kwargs):
208 with config_context(**self.config):
--> 209 return self.function(*args, **kwargs)
210
211
/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
889 with _print_elapsed_time(message_clsname, message):
890 if hasattr(transformer, "fit_transform"):
--> 891 res = transformer.fit_transform(X, y, **fit_params)
892 else:
893 res = transformer.fit(X, y, **fit_params).transform(X)
/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
432 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
433 if hasattr(last_step, "fit_transform"):
--> 434 return last_step.fit_transform(Xt, y, **fit_params_last_step)
435 else:
436 return last_step.fit(Xt, y, **fit_params_last_step).transform(Xt)
/usr/local/lib/python3.7/dist-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
670 self._check_n_features(X, reset=True)
671 self._validate_transformers()
--> 672 self._validate_column_callables(X)
673 self._validate_remainder(X)
674
/usr/local/lib/python3.7/dist-packages/sklearn/compose/_column_transformer.py in _validate_column_callables(self, X)
350 columns = columns(X)
351 all_columns.append(columns)
--> 352 transformer_to_input_indices[name] = _get_column_indices(X, columns)
353
354 self._columns = all_columns
/usr/local/lib/python3.7/dist-packages/sklearn/utils/__init__.py in _get_column_indices(X, key)
409 except AttributeError:
410 raise ValueError(
--> 411 "Specifying the columns using strings is only "
412 "supported for pandas DataFrames"
413 )
ValueError: Specifying the columns using strings is only supported for pandas DataFrames
Versions
System:
python: 3.7.12 (default, Sep 10 2021, 00:21:48) [GCC 7.5.0]
executable: /usr/bin/python3
machine: Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic
Python dependencies:
pip: 21.1.3
setuptools: 57.4.0
sklearn: 1.0
numpy: 1.19.5
scipy: 1.4.1
Cython: 0.29.24
pandas: 1.1.5
matplotlib: 3.2.2
joblib: 1.0.1
threadpoolctl: 3.0.0
Built with OpenMP: True