diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index b4d26e07dffc0..aff2ea2b011da 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -39,7 +39,8 @@ See :ref:`array_api` for more details. **Classes:** -- +- :class:`preprocessing.LabelEncoder` now supports Array API compatible inputs. + :pr:`27381` by :user:`Omar Salman `. Metadata Routing ---------------- diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index 301dc19bb1985..ecf0c400a2c2f 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -17,6 +17,7 @@ from ..base import BaseEstimator, TransformerMixin, _fit_context from ..utils import column_or_1d +from ..utils._array_api import _setdiff1d, device, get_namespace from ..utils._encode import _encode, _unique from ..utils._param_validation import Interval, validate_params from ..utils.multiclass import type_of_target, unique_labels @@ -129,10 +130,11 @@ def transform(self, y): Labels as normalized encodings. """ check_is_fitted(self) + xp, _ = get_namespace(y) y = column_or_1d(y, dtype=self.classes_.dtype, warn=True) # transform of empty array is empty array if _num_samples(y) == 0: - return np.array([]) + return xp.asarray([]) return _encode(y, uniques=self.classes_) @@ -141,7 +143,7 @@ def inverse_transform(self, y): Parameters ---------- - y : ndarray of shape (n_samples,) + y : array-like of shape (n_samples,) Target values. Returns @@ -150,19 +152,24 @@ def inverse_transform(self, y): Original encoding. """ check_is_fitted(self) + xp, _ = get_namespace(y) y = column_or_1d(y, warn=True) # inverse transform of empty array is empty array if _num_samples(y) == 0: - return np.array([]) + return xp.asarray([]) - diff = np.setdiff1d(y, np.arange(len(self.classes_))) - if len(diff): + diff = _setdiff1d( + ar1=y, + ar2=xp.arange(self.classes_.shape[0], device=device(y)), + xp=xp, + ) + if diff.shape[0]: raise ValueError("y contains previously unseen labels: %s" % str(diff)) - y = np.asarray(y) - return self.classes_[y] + y = xp.asarray(y) + return xp.take(self.classes_, y, axis=0) def _more_tags(self): - return {"X_types": ["1dlabels"]} + return {"X_types": ["1dlabels"], "array_api_support": True} class LabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None): diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index e438805df1254..90e3aa210eebb 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -2,7 +2,7 @@ import pytest from scipy.sparse import issparse -from sklearn import datasets +from sklearn import config_context, datasets from sklearn.preprocessing._label import ( LabelBinarizer, LabelEncoder, @@ -11,7 +11,16 @@ _inverse_binarize_thresholding, label_binarize, ) -from sklearn.utils._testing import assert_array_equal, ignore_warnings +from sklearn.utils._array_api import ( + _convert_to_numpy, + get_namespace, + yield_namespace_device_dtype_combinations, +) +from sklearn.utils._testing import ( + _array_api_for_tests, + assert_array_equal, + ignore_warnings, +) from sklearn.utils.fixes import ( COO_CONTAINERS, CSC_CONTAINERS, @@ -697,3 +706,42 @@ def test_label_encoders_do_not_have_set_output(encoder): y_encoded_with_kwarg = encoder.fit_transform(y=["a", "b", "c"]) y_encoded_positional = encoder.fit_transform(["a", "b", "c"]) assert_array_equal(y_encoded_with_kwarg, y_encoded_positional) + + +@pytest.mark.parametrize( + "array_namespace, device, dtype", yield_namespace_device_dtype_combinations() +) +@pytest.mark.parametrize( + "y", + [ + np.array([2, 1, 3, 1, 3]), + np.array([1, 1, 4, 5, -1, 0]), + np.array([3, 5, 9, 5, 9, 3]), + ], +) +def test_label_encoder_array_api_compliance(y, array_namespace, device, dtype): + xp = _array_api_for_tests(array_namespace, device) + xp_y = xp.asarray(y, device=device) + with config_context(array_api_dispatch=True): + xp_label = LabelEncoder() + np_label = LabelEncoder() + xp_label = xp_label.fit(xp_y) + xp_transformed = xp_label.transform(xp_y) + xp_inv_transformed = xp_label.inverse_transform(xp_transformed) + np_label = np_label.fit(y) + np_transformed = np_label.transform(y) + assert get_namespace(xp_transformed)[0].__name__ == xp.__name__ + assert get_namespace(xp_inv_transformed)[0].__name__ == xp.__name__ + assert get_namespace(xp_label.classes_)[0].__name__ == xp.__name__ + assert_array_equal(_convert_to_numpy(xp_transformed, xp), np_transformed) + assert_array_equal(_convert_to_numpy(xp_inv_transformed, xp), y) + assert_array_equal(_convert_to_numpy(xp_label.classes_, xp), np_label.classes_) + + xp_label = LabelEncoder() + np_label = LabelEncoder() + xp_transformed = xp_label.fit_transform(xp_y) + np_transformed = np_label.fit_transform(y) + assert get_namespace(xp_transformed)[0].__name__ == xp.__name__ + assert get_namespace(xp_label.classes_)[0].__name__ == xp.__name__ + assert_array_equal(_convert_to_numpy(xp_transformed, xp), np_transformed) + assert_array_equal(_convert_to_numpy(xp_label.classes_, xp), np_label.classes_) diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index a8b0363c0af38..8374dc35ff4f0 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -406,6 +406,11 @@ def unique_counts(self, x): def unique_values(self, x): return numpy.unique(x) + def unique_all(self, x): + return numpy.unique( + x, return_index=True, return_inverse=True, return_counts=True + ) + def concat(self, arrays, *, axis=None): return numpy.concatenate(arrays, axis=axis) @@ -839,3 +844,121 @@ def indexing_dtype(xp): # TODO: once sufficiently adopted, we might want to instead rely on the # newer inspection API: https://github.com/data-apis/array-api/issues/640 return xp.asarray(0).dtype + + +def _searchsorted(xp, a, v, *, side="left", sorter=None): + # Temporary workaround needed as long as searchsorted is not widely + # adopted by implementers of the Array API spec. This is a quite + # recent addition to the spec: + # https://data-apis.org/array-api/latest/API_specification/generated/array_api.searchsorted.html # noqa + if hasattr(xp, "searchsorted"): + return xp.searchsorted(a, v, side=side, sorter=sorter) + + a_np = _convert_to_numpy(a, xp=xp) + v_np = _convert_to_numpy(v, xp=xp) + indices = numpy.searchsorted(a_np, v_np, side=side, sorter=sorter) + return xp.asarray(indices, device=device(a)) + + +def _setdiff1d(ar1, ar2, xp, assume_unique=False): + """Find the set difference of two arrays. + + Return the unique values in `ar1` that are not in `ar2`. + """ + if _is_numpy_namespace(xp): + return xp.asarray( + numpy.setdiff1d( + ar1=ar1, + ar2=ar2, + assume_unique=assume_unique, + ) + ) + + if assume_unique: + ar1 = xp.reshape(ar1, (-1,)) + else: + ar1 = xp.unique_values(ar1) + ar2 = xp.unique_values(ar2) + return ar1[_in1d(ar1=ar1, ar2=ar2, xp=xp, assume_unique=True, invert=True)] + + +def _isin(element, test_elements, xp, assume_unique=False, invert=False): + """Calculates ``element in test_elements``, broadcasting over `element` + only. + + Returns a boolean array of the same shape as `element` that is True + where an element of `element` is in `test_elements` and False otherwise. + """ + if _is_numpy_namespace(xp): + return xp.asarray( + numpy.isin( + element=element, + test_elements=test_elements, + assume_unique=assume_unique, + invert=invert, + ) + ) + + original_element_shape = element.shape + element = xp.reshape(element, (-1,)) + test_elements = xp.reshape(test_elements, (-1,)) + return xp.reshape( + _in1d( + ar1=element, + ar2=test_elements, + xp=xp, + assume_unique=assume_unique, + invert=invert, + ), + original_element_shape, + ) + + +# Note: This is a helper for the functions `_isin` and +# `_setdiff1d`. It is not meant to be called directly. +def _in1d(ar1, ar2, xp, assume_unique=False, invert=False): + """Checks whether each element of an array is also present in a + second array. + + Returns a boolean array the same length as `ar1` that is True + where an element of `ar1` is in `ar2` and False otherwise. + + This function has been adapted using the original implementation + present in numpy: + https://github.com/numpy/numpy/blob/v1.26.0/numpy/lib/arraysetops.py#L524-L758 + """ + xp, _ = get_namespace(ar1, ar2, xp=xp) + + # This code is run to make the code significantly faster + if ar2.shape[0] < 10 * ar1.shape[0] ** 0.145: + if invert: + mask = xp.ones(ar1.shape[0], dtype=xp.bool, device=device(ar1)) + for a in ar2: + mask &= ar1 != a + else: + mask = xp.zeros(ar1.shape[0], dtype=xp.bool, device=device(ar1)) + for a in ar2: + mask |= ar1 == a + return mask + + if not assume_unique: + ar1, rev_idx = xp.unique_inverse(ar1) + ar2 = xp.unique_values(ar2) + + ar = xp.concat((ar1, ar2)) + device_ = device(ar) + # We need this to be a stable sort. + order = xp.argsort(ar, stable=True) + reverse_order = xp.argsort(order, stable=True) + sar = xp.take(ar, order, axis=0) + if invert: + bool_ar = sar[1:] != sar[:-1] + else: + bool_ar = sar[1:] == sar[:-1] + flag = xp.concat((bool_ar, xp.asarray([invert], device=device_))) + ret = xp.take(flag, reverse_order, axis=0) + + if assume_unique: + return ret[: ar1.shape[0]] + else: + return xp.take(ret, rev_idx, axis=0) diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py index a468af43f857d..3fd4d45f522e6 100644 --- a/sklearn/utils/_encode.py +++ b/sklearn/utils/_encode.py @@ -4,6 +4,13 @@ import numpy as np +from ._array_api import ( + _isin, + _searchsorted, + _setdiff1d, + device, + get_namespace, +) from ._missing import is_scalar_nan @@ -51,31 +58,29 @@ def _unique(values, *, return_inverse=False, return_counts=False): def _unique_np(values, return_inverse=False, return_counts=False): """Helper function to find unique values for numpy arrays that correctly accounts for nans. See `_unique` documentation for details.""" - uniques = np.unique( - values, return_inverse=return_inverse, return_counts=return_counts - ) + xp, _ = get_namespace(values) inverse, counts = None, None - if return_counts: - *uniques, counts = uniques - - if return_inverse: - *uniques, inverse = uniques - - if return_counts or return_inverse: - uniques = uniques[0] + if return_inverse and return_counts: + uniques, _, inverse, counts = xp.unique_all(values) + elif return_inverse: + uniques, inverse = xp.unique_inverse(values) + elif return_counts: + uniques, counts = xp.unique_counts(values) + else: + uniques = xp.unique_values(values) # np.unique will have duplicate missing values at the end of `uniques` # here we clip the nans and remove it from uniques if uniques.size and is_scalar_nan(uniques[-1]): - nan_idx = np.searchsorted(uniques, np.nan) + nan_idx = _searchsorted(xp, uniques, xp.nan) uniques = uniques[: nan_idx + 1] if return_inverse: inverse[inverse > nan_idx] = nan_idx if return_counts: - counts[nan_idx] = np.sum(counts[nan_idx:]) + counts[nan_idx] = xp.sum(counts[nan_idx:]) counts = counts[: nan_idx + 1] ret = (uniques,) @@ -161,8 +166,9 @@ def __missing__(self, key): def _map_to_integer(values, uniques): """Map values based on its position in uniques.""" + xp, _ = get_namespace(values, uniques) table = _nandict({val: i for i, val in enumerate(uniques)}) - return np.array([table[v] for v in values]) + return xp.asarray([table[v] for v in values], device=device(values)) def _unique_python(values, *, return_inverse, return_counts): @@ -220,7 +226,8 @@ def _encode(values, *, uniques, check_unknown=True): encoded : ndarray Encoded values """ - if values.dtype.kind in "OUS": + xp, _ = get_namespace(values, uniques) + if not xp.isdtype(values.dtype, "numeric"): try: return _map_to_integer(values, uniques) except KeyError as e: @@ -230,7 +237,7 @@ def _encode(values, *, uniques, check_unknown=True): diff = _check_unknown(values, uniques) if diff: raise ValueError(f"y contains previously unseen labels: {str(diff)}") - return np.searchsorted(uniques, values) + return _searchsorted(xp, uniques, values) def _check_unknown(values, known_values, return_mask=False): @@ -258,9 +265,10 @@ def _check_unknown(values, known_values, return_mask=False): Additionally returned if ``return_mask=True``. """ + xp, _ = get_namespace(values, known_values) valid_mask = None - if values.dtype.kind in "OUS": + if not xp.isdtype(values.dtype, "numeric"): values_set = set(values) values_set, missing_in_values = _extract_missing(values_set) @@ -282,9 +290,9 @@ def is_valid(value): if return_mask: if diff or nan_in_diff or none_in_diff: - valid_mask = np.array([is_valid(value) for value in values]) + valid_mask = xp.array([is_valid(value) for value in values]) else: - valid_mask = np.ones(len(values), dtype=bool) + valid_mask = xp.ones(len(values), dtype=xp.bool) diff = list(diff) if none_in_diff: @@ -292,21 +300,21 @@ def is_valid(value): if nan_in_diff: diff.append(np.nan) else: - unique_values = np.unique(values) - diff = np.setdiff1d(unique_values, known_values, assume_unique=True) + unique_values = xp.unique_values(values) + diff = _setdiff1d(unique_values, known_values, xp, assume_unique=True) if return_mask: if diff.size: - valid_mask = np.isin(values, known_values) + valid_mask = _isin(values, known_values, xp) else: - valid_mask = np.ones(len(values), dtype=bool) + valid_mask = xp.ones(len(values), dtype=xp.bool) # check for nans in the known_values - if np.isnan(known_values).any(): - diff_is_nan = np.isnan(diff) - if diff_is_nan.any(): + if xp.any(xp.isnan(known_values)): + diff_is_nan = xp.isnan(diff) + if xp.any(diff_is_nan): # removes nan from valid_mask if diff.size and return_mask: - is_nan = np.isnan(values) + is_nan = xp.isnan(values) valid_mask[is_nan] = 1 # remove nan from diff diff --git a/sklearn/utils/tests/test_array_api.py b/sklearn/utils/tests/test_array_api.py index d0b368cd7fe91..30fc88c539fc8 100644 --- a/sklearn/utils/tests/test_array_api.py +++ b/sklearn/utils/tests/test_array_api.py @@ -15,6 +15,7 @@ _convert_to_numpy, _estimator_with_converted_arrays, _is_numpy_namespace, + _isin, _nanmax, _nanmin, _NumPyAPIWrapper, @@ -27,6 +28,7 @@ ) from sklearn.utils._testing import ( _array_api_for_tests, + assert_array_equal, skip_if_array_api_compat_not_configured, ) from sklearn.utils.fixes import _IS_32BIT @@ -504,3 +506,37 @@ def test_indexing_dtype(namespace, _device, _dtype): assert indexing_dtype(xp) == xp.int32 else: assert indexing_dtype(xp) == xp.int64 + + +@pytest.mark.parametrize( + "array_namespace, device, _", yield_namespace_device_dtype_combinations() +) +@pytest.mark.parametrize("invert", [True, False]) +@pytest.mark.parametrize("assume_unique", [True, False]) +@pytest.mark.parametrize("element_size", [6, 10, 14]) +@pytest.mark.parametrize("int_dtype", ["int16", "int32", "int64", "uint8"]) +def test_isin( + array_namespace, device, _, invert, assume_unique, element_size, int_dtype +): + xp = _array_api_for_tests(array_namespace, device) + r = element_size // 2 + element = 2 * numpy.arange(element_size).reshape((r, 2)).astype(int_dtype) + test_elements = numpy.array(numpy.arange(14), dtype=int_dtype) + element_xp = xp.asarray(element, device=device) + test_elements_xp = xp.asarray(test_elements, device=device) + expected = numpy.isin( + element=element, + test_elements=test_elements, + assume_unique=assume_unique, + invert=invert, + ) + with config_context(array_api_dispatch=True): + result = _isin( + element=element_xp, + test_elements=test_elements_xp, + xp=xp, + assume_unique=assume_unique, + invert=invert, + ) + + assert_array_equal(_convert_to_numpy(result, xp=xp), expected)