-
-
Notifications
You must be signed in to change notification settings - Fork 25.8k
Attempt to speed up unique value discovery in _BaseEncoder
for polars and pandas series
#27911
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
5add48f
9112779
bd86ed4
2f29f3d
4237de1
b339650
9a4dae1
9fa2e0b
e6322e8
2a881fc
c0bde6f
c53dff1
2d7542b
b21d596
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,7 +4,9 @@ | |
|
||
import numpy as np | ||
|
||
from . import is_scalar_nan | ||
from ..utils import is_scalar_nan | ||
from ..utils.fixes import parse_version | ||
from ..utils.validation import _is_pandas_series, _is_polars_series | ||
|
||
|
||
def _unique(values, *, return_inverse=False, return_counts=False): | ||
|
@@ -38,6 +40,22 @@ def _unique(values, *, return_inverse=False, return_counts=False): | |
The number of times each of the unique values comes up in the original | ||
array. Only provided if `return_counts` is True. | ||
""" | ||
if not return_inverse: | ||
# _unique_python is faster for object dtype | ||
if _is_pandas_series(values) and values.dtype != object: | ||
if not return_counts: | ||
return _unique_pandas(values, return_counts=return_counts) | ||
# before pandas 1.4.0 value_counts would replace None and NaT with Nan | ||
# https://github.com/pandas-dev/pandas/pull/42743 | ||
import pandas as pd | ||
|
||
if parse_version("1.4.0") <= parse_version(pd.__version__): | ||
return _unique_pandas(values, return_counts=return_counts) | ||
if _is_polars_series(values): | ||
# polars unique, arg_sort not supported for polars.Object dtype. | ||
if str(values.dtype) != "Object": | ||
return _unique_polars(values, return_counts=return_counts) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same here regarding return |
||
values = np.asarray(values) | ||
if values.dtype == object: | ||
return _unique_python( | ||
values, return_inverse=return_inverse, return_counts=return_counts | ||
|
@@ -48,9 +66,85 @@ def _unique(values, *, return_inverse=False, return_counts=False): | |
) | ||
|
||
|
||
def _unique_pandas(values, *, return_counts=False): | ||
if return_counts: | ||
value_counts = values.value_counts(dropna=False, sort=False) | ||
# sort categorical columns in lexical order to be consistent with order | ||
# obtained when sorting after conversion to numpy array | ||
try: | ||
value_counts.index = value_counts.index.reorder_categories( | ||
value_counts.index.categories.sort_values() | ||
) | ||
except AttributeError: | ||
pass | ||
value_counts = value_counts.sort_index() | ||
return value_counts.index.to_numpy(), value_counts.to_numpy() | ||
unique = values.unique() | ||
# unique returns a NumpyExtensionArray for extension dtypes and a numpy | ||
# array for other dtypes | ||
Comment on lines
+83
to
+84
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this comment true? For this categorical dtype the output is not import pandas as pd
from pandas.arrays import NumpyExtensionArray
import numpy as np
x = pd.Series(["a", "b", "c", "b", "a", "b"],dtype="category")
uniques = x.unique()
assert not isinstance(uniques, NumpyExtensionArray)
assert not isinstance(uniques, np.ndarray) |
||
if hasattr(unique, "sort_values"): | ||
# sort categorical columns in lexical order to be consistent with order | ||
# obtained when sorting after conversion to numpy array | ||
try: | ||
unique = unique.reorder_categories(unique.categories.sort_values()) | ||
Comment on lines
+88
to
+89
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I do not see the need to |
||
except AttributeError: | ||
pass | ||
return unique.sort_values().to_numpy() | ||
if unique.dtype != object: | ||
return np.sort(unique) | ||
return _unique_python(unique, return_counts=False, return_inverse=False) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This code is not covered based on |
||
|
||
|
||
def _polars_arg_sort(values): | ||
# polars categorical variables may use physical ordering (order by the | ||
# encoded value), here we want to sort by lexical order (the category's | ||
# value) to be consistent with other containers | ||
# | ||
# we rely on arg_sort because it hase the nulls_last parameter whereas sort | ||
# does not | ||
try: | ||
values = values.cat.set_ordering("lexical") | ||
except Exception: | ||
# non-categorical dtype, ordering does not apply | ||
pass | ||
return values.arg_sort(nulls_last=True) | ||
|
||
|
||
def _polars_merge_null_nan(values, counts=None): | ||
# polars unique() may contain both null and NaN; after converting to numpy | ||
# they are both np.nan so we remove the duplicate. | ||
if len(values) < 2 or not is_scalar_nan(values[-2]): | ||
if counts is None: | ||
return values | ||
else: | ||
return values, counts | ||
values = values[:-1] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. According to |
||
if counts is None: | ||
return values | ||
counts = counts.copy() | ||
counts[-2] += counts[-1] | ||
counts = counts[:-1] | ||
return values, counts | ||
|
||
|
||
def _unique_polars(values, *, return_counts=False): | ||
if return_counts: | ||
value_counts = values.value_counts(sort=False) | ||
val_col, count_col = value_counts.columns | ||
order = _polars_arg_sort(value_counts[val_col]) | ||
values = value_counts[val_col].gather(order).to_numpy() | ||
counts = value_counts[count_col].gather(order).to_numpy() | ||
return _polars_merge_null_nan(values, counts) | ||
unique = values.unique() | ||
order = _polars_arg_sort(unique) | ||
return _polars_merge_null_nan(unique.gather(order).to_numpy()) | ||
|
||
|
||
def _unique_np(values, return_inverse=False, return_counts=False): | ||
"""Helper function to find unique values for numpy arrays that correctly | ||
accounts for nans. See `_unique` documentation for details.""" | ||
# TODO: remove this function and replace with np.unique once oldest | ||
# supported numpy is 1.24 (added the nan_equal parameter to np.unique) | ||
uniques = np.unique( | ||
values, return_inverse=return_inverse, return_counts=return_counts | ||
) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
At this point, if the pandas version is too old, then we return
None
.