
Attempt to speed up unique value discovery in _BaseEncoder for polars and pandas series #27911

Status: Open. Wants to merge 14 commits into base: main.
9 changes: 9 additions & 0 deletions doc/whats_new/v1.5.rst
@@ -44,3 +44,12 @@ TODO: update at the time of the release.

- |Feature| A fitted :class:`compose.ColumnTransformer` now implements `__getitem__`
  which returns the fitted transformers by name. :pr:`27990` by `Thomas Fan`_.

:mod:`sklearn.preprocessing`
............................

- |Efficiency| :class:`preprocessing.OrdinalEncoder`,
  :class:`preprocessing.OneHotEncoder` and :class:`preprocessing.TargetEncoder`
  can be faster on pandas or polars DataFrames by using a more efficient way of
  finding unique values in the categorical columns. :pr:`27911` by :user:`Jérôme
  Dockès <jeromedockes>`.
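
A minimal sketch of what this entry describes (not part of the .rst diff; assumes pandas is installed). Per the code below, the fast path covers non-object pandas columns such as categorical dtype:

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

X = pd.DataFrame({"city": pd.Categorical(["Paris", "Lyon", "Paris", "Nice"])})
enc = OrdinalEncoder().fit(X)  # unique categories found via pandas, not numpy
print(enc.categories_)  # expected: [array(['Lyon', 'Nice', 'Paris'], dtype=object)]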
15 changes: 13 additions & 2 deletions sklearn/preprocessing/_encoders.py
@@ -15,7 +15,12 @@
from ..utils._mask import _get_mask
from ..utils._param_validation import Interval, RealNotInt, StrOptions
from ..utils._set_output import _get_output_config
-from ..utils.validation import _check_feature_names_in, check_is_fitted
+from ..utils.validation import (
+    _check_feature_names_in,
+    _is_pandas_df,
+    _is_polars_df,
+    check_is_fitted,
+)

__all__ = ["OneHotEncoder", "OrdinalEncoder"]

@@ -91,11 +96,17 @@ def _fit(
        category_counts = []
        compute_counts = return_counts or self._infrequent_enabled

        is_pandas = _is_pandas_df(X)
        is_polars = _is_polars_df(X)
        for i in range(n_features):
            Xi = X_list[i]

            if self.categories == "auto":
-                result = _unique(Xi, return_counts=compute_counts)
+                if is_pandas or is_polars:
+                    values = _safe_indexing(X, i, axis=1)
+                else:
+                    values = Xi
+                result = _unique(values, return_counts=compute_counts)
                if compute_counts:
                    cats, counts = result
                    category_counts.append(counts)
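The key change in this hunk is passing the original pandas/polars column instead of the numpy-converted Xi. A hedged sketch of what _safe_indexing(X, i, axis=1) returns (the DataFrame and column name are illustrative, not from the diff):

import pandas as pd
from sklearn.utils import _safe_indexing

X = pd.DataFrame({"a": pd.Categorical(["x", "y", "x"])})
col = _safe_indexing(X, 0, axis=1)
print(type(col))  # <class 'pandas.core.series.Series'>, so _unique can dispatch on it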
96 changes: 95 additions & 1 deletion sklearn/utils/_encode.py
@@ -4,7 +4,9 @@

import numpy as np

-from . import is_scalar_nan
+from ..utils import is_scalar_nan
+from ..utils.fixes import parse_version
+from ..utils.validation import _is_pandas_series, _is_polars_series


def _unique(values, *, return_inverse=False, return_counts=False):
@@ -38,6 +40,22 @@ def _unique(values, *, return_inverse=False, return_counts=False):
        The number of times each of the unique values comes up in the original
        array. Only provided if `return_counts` is True.
    """
    if not return_inverse:
        # _unique_python is faster for object dtype
        if _is_pandas_series(values) and values.dtype != object:
            if not return_counts:
                return _unique_pandas(values, return_counts=return_counts)
            # before pandas 1.4.0 value_counts would replace None and NaT with NaN
            # https://github.com/pandas-dev/pandas/pull/42743
            import pandas as pd

            if parse_version("1.4.0") <= parse_version(pd.__version__):
                return _unique_pandas(values, return_counts=return_counts)
Review comment (Member): At this point, if the pandas version is too old, then we return None.

    if _is_polars_series(values):
        # polars unique, arg_sort not supported for polars.Object dtype.
        if str(values.dtype) != "Object":
            return _unique_polars(values, return_counts=return_counts)
Review comment (Member): Same here regarding return None.

    values = np.asarray(values)
    if values.dtype == object:
        return _unique_python(
            values, return_inverse=return_inverse, return_counts=return_counts
@@ -48,9 +66,85 @@
        )
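
A hedged sketch of the pandas < 1.4 quirk gated on above, restating the code comment (exact index repr varies by pandas version): value_counts(dropna=False) used to coerce None and NaT to NaN in the index, which is why the counts path requires pandas >= 1.4.0.

import pandas as pd

s = pd.Series(pd.to_datetime(["2020-01-01", None]))
print(s.value_counts(dropna=False).index)
# pandas >= 1.4 keeps NaT in the index; per the comment above, older pandas
# reported it as NaN, breaking the distinction _unique needs to preserve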


def _unique_pandas(values, *, return_counts=False):
    if return_counts:
        value_counts = values.value_counts(dropna=False, sort=False)
        # sort categorical columns in lexical order to be consistent with order
        # obtained when sorting after conversion to numpy array
        try:
            value_counts.index = value_counts.index.reorder_categories(
                value_counts.index.categories.sort_values()
            )
        except AttributeError:
            pass
        value_counts = value_counts.sort_index()
        return value_counts.index.to_numpy(), value_counts.to_numpy()
    unique = values.unique()
    # unique returns a NumpyExtensionArray for extension dtypes and a numpy
    # array for other dtypes
Review comment (Member) on lines +83 to +84: Is this comment true? For this categorical dtype the output is not NumpyExtensionArray or a np.ndarray:

import pandas as pd
from pandas.arrays import NumpyExtensionArray
import numpy as np

x = pd.Series(["a", "b", "c", "b", "a", "b"], dtype="category")
uniques = x.unique()

assert not isinstance(uniques, NumpyExtensionArray)
assert not isinstance(uniques, np.ndarray)

    if hasattr(unique, "sort_values"):
        # sort categorical columns in lexical order to be consistent with order
        # obtained when sorting after conversion to numpy array
        try:
            unique = unique.reorder_categories(unique.categories.sort_values())
Review comment (Member) on lines +88 to +89: I do not see the need to reorder_categories here if we end up running unique.sort_values() afterwards. Can you provide an example where this is required?

        except AttributeError:
            pass
        return unique.sort_values().to_numpy()
    if unique.dtype != object:
        return np.sort(unique)
    return _unique_python(unique, return_counts=False, return_inverse=False)
Review comment (Member): This code is not covered based on codecov.
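
A hedged usage sketch of _unique_pandas as defined above (expected outputs inferred from the code; exact dtypes may vary):

import pandas as pd

s = pd.Series(["b", "a", "b", "a"], dtype="category")
print(_unique_pandas(s))
# expected: array(['a', 'b'], dtype=object), sorted lexically
print(_unique_pandas(s, return_counts=True))
# expected: (array(['a', 'b'], dtype=object), array([2, 2]))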



def _polars_arg_sort(values):
    # polars categorical variables may use physical ordering (order by the
    # encoded value), here we want to sort by lexical order (the category's
    # value) to be consistent with other containers
    #
    # we rely on arg_sort because it has the nulls_last parameter whereas sort
    # does not
    try:
        values = values.cat.set_ordering("lexical")
    except Exception:
        # non-categorical dtype, ordering does not apply
        pass
    return values.arg_sort(nulls_last=True)
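
A hedged illustration of why the lexical switch matters (polars API as of this PR; set_ordering was reportedly deprecated in later polars): a categorical otherwise sorts by its physical encoding, i.e. order of first appearance.

import polars as pl

s = pl.Series(["b", "a", "c"], dtype=pl.Categorical)
print(s.sort().to_list())                       # ['b', 'a', 'c'] (physical order)
print(s.gather(_polars_arg_sort(s)).to_list())  # ['a', 'b', 'c'] (lexical order)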


def _polars_merge_null_nan(values, counts=None):
    # polars unique() may contain both null and NaN; after converting to numpy
    # they are both np.nan so we remove the duplicate.
    if len(values) < 2 or not is_scalar_nan(values[-2]):
        if counts is None:
            return values
        else:
            return values, counts
    values = values[:-1]
Review comment (Member): According to codecov, everything below this line is not covered.

    if counts is None:
        return values
    counts = counts.copy()
    counts[-2] += counts[-1]
    counts = counts[:-1]
    return values, counts


def _unique_polars(values, *, return_counts=False):
    if return_counts:
        value_counts = values.value_counts(sort=False)
        val_col, count_col = value_counts.columns
        order = _polars_arg_sort(value_counts[val_col])
        values = value_counts[val_col].gather(order).to_numpy()
        counts = value_counts[count_col].gather(order).to_numpy()
        return _polars_merge_null_nan(values, counts)
    unique = values.unique()
    order = _polars_arg_sort(unique)
    return _polars_merge_null_nan(unique.gather(order).to_numpy())
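
A hedged sketch of the null/NaN merge in practice (assumes polars is installed; null and NaN are distinct in polars but both become np.nan after to_numpy):

import polars as pl

s = pl.Series([1.0, None, float("nan"), 1.0])
print(_unique_polars(s))  # expected: array([ 1., nan]), a single trailing nan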


def _unique_np(values, return_inverse=False, return_counts=False):
    """Helper function to find unique values for numpy arrays that correctly
    accounts for nans. See `_unique` documentation for details."""
    # TODO: remove this function and replace with np.unique once oldest
    # supported numpy is 1.24 (added the equal_nan parameter to np.unique)
    uniques = np.unique(
        values, return_inverse=return_inverse, return_counts=return_counts
    )
33 changes: 26 additions & 7 deletions sklearn/utils/tests/test_encode.py
@@ -5,8 +5,10 @@
from numpy.testing import assert_array_equal

from sklearn.utils._encode import _check_unknown, _encode, _get_counts, _unique
from sklearn.utils._testing import _convert_container


@pytest.mark.parametrize("container_lib", ["numpy", "pandas", "polars"])
@pytest.mark.parametrize(
    "values, expected",
    [
@@ -27,15 +29,17 @@
    ],
    ids=["int64", "float32-nan", "object", "object-None", "str"],
)
-def test_encode_util(values, expected):
+def test_encode_util(container_lib, values, expected):
    pytest.importorskip(container_lib)
    if container_lib != "numpy":
        values = _convert_container(values, container_lib, columns_name=["A"])["A"]
    uniques = _unique(values)
    assert_array_equal(uniques, expected)

    result, encoded = _unique(values, return_inverse=True)
    assert_array_equal(result, expected)
    assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))

-    encoded = _encode(values, uniques=uniques)
+    encoded = _encode(np.asarray(values), uniques=uniques)
    assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))

    result, counts = _unique(values, return_counts=True)
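
A hedged sketch of the _convert_container helper these tests rely on (from sklearn.utils._testing; behavior assumed from its use above): it wraps the array in the requested container, and indexing column "A" back out yields a Series, so each test exercises _unique on numpy, pandas, and polars inputs alike.

import numpy as np
from sklearn.utils._testing import _convert_container

values = np.array([3, 1, np.nan], dtype=float)
series = _convert_container(values, "pandas", columns_name=["A"])["A"]
print(type(series))  # <class 'pandas.core.series.Series'>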
@@ -163,11 +168,17 @@ def test_check_unknown_missing_values(missing_value, pickle_uniques):
    _assert_check_unknown(values, uniques, expected_diff, expected_mask)


@pytest.mark.parametrize("container_lib", ["numpy", "pandas", "polars"])
@pytest.mark.parametrize("missing_value", [np.nan, None, float("nan")])
@pytest.mark.parametrize("pickle_uniques", [True, False])
-def test_unique_util_missing_values_objects(missing_value, pickle_uniques):
+def test_unique_util_missing_values_objects(
+    container_lib, missing_value, pickle_uniques
+):
    # check for _unique and _encode with missing values with object dtypes
    pytest.importorskip(container_lib)
    values = np.array(["a", "c", "c", missing_value, "b"], dtype=object)
    if container_lib != "numpy":
        values = _convert_container(values, container_lib, columns_name=["A"])["A"]
    expected_uniques = np.array(["a", "b", "c", missing_value], dtype=object)

    uniques = _unique(values)
@@ -181,13 +192,17 @@
    if pickle_uniques:
        uniques = pickle.loads(pickle.dumps(uniques))

-    encoded = _encode(values, uniques=uniques)
+    encoded = _encode(np.asarray(values), uniques=uniques)
    assert_array_equal(encoded, np.array([0, 2, 2, 3, 1]))


-def test_unique_util_missing_values_numeric():
+@pytest.mark.parametrize("container_lib", ["numpy", "pandas", "polars"])
+def test_unique_util_missing_values_numeric(container_lib):
    # Check missing values in numerical values
    pytest.importorskip(container_lib)
    values = np.array([3, 1, np.nan, 5, 3, np.nan], dtype=float)
    if container_lib != "numpy":
        values = _convert_container(values, container_lib, columns_name=["A"])["A"]
    expected_uniques = np.array([1, 3, 5, np.nan], dtype=float)
    expected_inverse = np.array([1, 0, 3, 2, 1, 3])

@@ -198,13 +213,17 @@ def test_unique_util_missing_values_numeric(container_lib):
    assert_array_equal(uniques, expected_uniques)
    assert_array_equal(inverse, expected_inverse)

-    encoded = _encode(values, uniques=uniques)
+    encoded = _encode(np.asarray(values), uniques=uniques)
    assert_array_equal(encoded, expected_inverse)


-def test_unique_util_with_all_missing_values():
+@pytest.mark.parametrize("container_lib", ["numpy", "pandas", "polars"])
+def test_unique_util_with_all_missing_values(container_lib):
    # test for all types of missing values for object dtype
    pytest.importorskip(container_lib)
    values = np.array([np.nan, "a", "c", "c", None, float("nan"), None], dtype=object)
    if container_lib != "numpy":
        values = _convert_container(values, container_lib, columns_name=["A"])["A"]

    uniques = _unique(values)
    assert_array_equal(uniques[:-1], ["a", "c", None])
20 changes: 20 additions & 0 deletions sklearn/utils/validation.py
@@ -2026,6 +2026,16 @@ def _is_pandas_df(X):
    return False


def _is_pandas_series(x):
    """Return True if x is a pandas Series."""
    if hasattr(x, "iloc") and hasattr(x, "dtype"):
        try:
            pd = sys.modules["pandas"]
        except KeyError:
            return False
        return isinstance(x, pd.Series)
    return False


def _is_polars_df(X):
    """Return True if the X is a polars dataframe."""
    if hasattr(X, "columns") and hasattr(X, "schema"):
@@ -2038,6 +2048,16 @@ def _is_polars_df(X):
    return False


def _is_polars_series(x):
    """Return True if x is a polars Series."""
    if hasattr(x, "gather"):
        try:
            pl = sys.modules["polars"]
        except KeyError:
            return False
        return isinstance(x, pl.Series)
    return False
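
A small sketch of the pattern both helpers use: sys.modules is only read, never imported into, so an object that merely looks Series-like returns False without pulling in pandas or polars (the Fake class is illustrative):

class Fake:
    gather = None  # duck-typed attribute, but not a real polars Series

# False whether or not polars has been imported; no import is triggered either way
assert not _is_polars_series(Fake())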


def _get_feature_names(X):
    """Get feature names from X.