Closed
Description
Describe the bug
A while back @thomasjpfan and @lorentzenchr contributed #17317 which enabled missing value support in OneHotEncoder
For object dtypes, None and np.nan are supported as missing values.
Pandas 2.0 now supports an Arrow backend which uses pandas._libs.missing.NAType
instead of either of the currently supported options (None
or np.nan
) to represent its missing values. This causes OneHotEncoder
to fail.
Steps/Code to Reproduce
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
pipe = Pipeline(
[
(
"preprocess",
ColumnTransformer(
[
(
"categorical_features",
OneHotEncoder(),
["category"],
)
]
),
),
(
"model",
XGBClassifier(n_estimators=3)
)
]
)
# Native Pandas types work
df_native = pd.DataFrame(
{
"category": ["a", "b", np.nan, "d"],
"label": [0, 1, 0, 1],
}
)
pipe.fit(df_native[["category"]], df_native["label"])
# Arrow types do not work
# TypeError: Encoders require their input argument must be uniformly strings or numbers.
# Got ['NAType', 'str']
df_arrow = df_native.convert_dtypes(dtype_backend="pyarrow")
pipe.fit(df_arrow[["category"]], df_arrow["label"])
# On inspection, the null value has different representations
null_idx = 2
print(type(df_native["category"].iloc[null_idx])) # <class 'float'>
print(type(df_arrow["category"].iloc[null_idx])) # <class 'pandas._libs.missing.NAType'>
Expected Results
scikit-learn should work with the new pandas Arrow backend.
Actual Results
Traceback (most recent call last):
File "sklearn/utils/_encode.py", line 174, in _unique_python
uniques = sorted(uniques_set)
^^^^^^^^^^^^^^^^^^^
File "missing.pyx", line 419, in pandas._libs.missing.NAType.__bool__
TypeError: boolean value of NA is ambiguous
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "<input>", line 1, in <module>
File "sklearn/base.py", line 1152, in wrapper
return fit_method(estimator, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "sklearn/pipeline.py", line 423, in fit
Xt = self._fit(X, y, **fit_params_steps)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "sklearn/pipeline.py", line 377, in _fit
X, fitted_transformer = fit_transform_one_cached(
^^^^^^^^^^^^^^^^^^^^^^^^^
File "joblib/memory.py", line 353, in __call__
return self.func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "sklearn/pipeline.py", line 957, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "sklearn/utils/_set_output.py", line 157, in wrapped
data_to_wrap = f(self, X, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "sklearn/base.py", line 1152, in wrapper
return fit_method(estimator, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "sklearn/compose/_column_transformer.py", line 754, in fit_transform
result = self._fit_transform(X, y, _fit_transform_one)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "sklearn/compose/_column_transformer.py", line 681, in _fit_transform
return Parallel(n_jobs=self.n_jobs)(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "sklearn/utils/parallel.py", line 65, in __call__
return super().__call__(iterable_with_config)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "joblib/parallel.py", line 1863, in __call__
return output if self.return_generator else list(output)
^^^^^^^^^^^^
File "joblib/parallel.py", line 1792, in _get_sequential_output
res = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "sklearn/utils/parallel.py", line 127, in __call__
return self.function(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "sklearn/pipeline.py", line 957, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "sklearn/utils/_set_output.py", line 157, in wrapped
data_to_wrap = f(self, X, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "sklearn/base.py", line 919, in fit_transform
return self.fit(X, y, **fit_params).transform(X)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "sklearn/base.py", line 1152, in wrapper
return fit_method(estimator, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "sklearn/preprocessing/_encoders.py", line 985, in fit
self._fit(
File "sklearn/preprocessing/_encoders.py", line 98, in _fit
result = _unique(Xi, return_counts=compute_counts)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "sklearn/utils/_encode.py", line 42, in _unique
return _unique_python(
^^^^^^^^^^^^^^^
File "sklearn/utils/_encode.py", line 179, in _unique_python
raise TypeError(
TypeError: Encoders require their input argument must be uniformly strings or numbers. Got ['NAType', 'str']
Versions
System:
python: 3.11.6 | packaged by conda-forge | (main, Oct 3 2023, 10:37:07) [Clang 15.0.7 ]
executable: /var/lib/conda/envs/lightning-2023-10/bin/python3.11
machine: macOS-13.5.2-arm64-arm-64bit
Python dependencies:
sklearn: 1.3.1
pip: 23.3
setuptools: 68.2.2
numpy: 1.24.4
scipy: 1.11.3
Cython: None
pandas: 2.1.1
matplotlib: 3.8.0
joblib: 1.3.2
threadpoolctl: 3.2.0
Built with OpenMP: True
threadpoolctl info:
user_api: blas
internal_api: openblas
num_threads: 10
prefix: libopenblas
filepath: /private/var/lib/conda/envs/lightning-2023-10/lib/libopenblas.0.dylib
version: 0.3.24
threading_layer: openmp
architecture: VORTEX
user_api: openmp
internal_api: openmp
num_threads: 10
prefix: libomp
filepath: /private/var/lib/conda/envs/lightning-2023-10/lib/libomp.dylib
version: None