Skip to content

FIX sparse arrays not included in array API dispatch #29476

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Sep 11, 2024
Merged
43 changes: 27 additions & 16 deletions sklearn/utils/_array_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import numpy
import scipy
import scipy.sparse as sp
import scipy.special as special

from .._config import get_config
Expand Down Expand Up @@ -162,7 +163,9 @@ def device(*array_list, remove_none=True, remove_types=(str,)):
*array_list, remove_none=remove_none, remove_types=remove_types
)

# Note that _remove_non_arrays ensures that array_list is not empty.
if not array_list:
return None
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the Returns section of the docstring needs to be updated to mention this case.

Copy link
Member

@ogrisel ogrisel Jul 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, I we might argue that returning None in this case can be quite confusing.

When reviewing, this PR, I found out that get_namespace_and_device is not even consistent with device (independently of the changes in this PR).

Consider the following code on a single non-array input that is not filtered out by any of the remove_ arguments (note that the array API dispatch is never enabled) .

>>> from sklearn.utils._array_api import device, get_namespace, get_namespace_and_device
>>> some_list = [0, 1, 2]
>>> get_namespace_and_device(some_list)
(<sklearn.utils._array_api._NumPyAPIWrapper object at 0x10ff31750>, False, None)
>>> xp, is_enabled, device_ = get_namespace_and_device(some_list)
>>> device(some_list)
'cpu'
>>> device(xp.asarray(some_list))
'cpu'

I think we should at least make the output of device(some_list) and get_namespace_and_device(some_list) consistent, either by returning None in both cases, or by returning "cpu" consistently as is done for NumPy arrays, either with the sklearn.utils._array_api._NumPyAPIWrapper when array API dispatch is disabled, or when using the array_api_compat.numpy wrapper for numpy 1.x or when using numpy 2 arrays who naturally expose a .device == "cpu" attribute (at least in the dev version).

Returning None for non-array inputs (such as lists or scipy sparse matrices when accepted) would be the cleanest. However that might render our code more verbose by having to scatter around if device_ is not None in various places. So maybe, always returning "cpu", both in device(...) and get_namespace_and_device(...) for non-arrays inputs might be more pragmatic.


device_ = _single_array_device(array_list[0])

# Note: here we cannot simply use a Python `set` as it requires
Expand Down Expand Up @@ -451,6 +454,8 @@ def _remove_non_arrays(*arrays, remove_none=True, remove_types=(str,)):

Raise ValueError if no arrays are left after filtering.

Sparse arrays are always filtered out.

Parameters
----------
*arrays : array objects
Expand All @@ -465,7 +470,8 @@ def _remove_non_arrays(*arrays, remove_none=True, remove_types=(str,)):
Returns
-------
filtered_arrays : list
List of arrays with None and typoe
List of arrays filtered as requested. An empty list is returned if no input
passes the filters.
"""
filtered_arrays = []
remove_types = tuple(remove_types)
Expand All @@ -474,14 +480,10 @@ def _remove_non_arrays(*arrays, remove_none=True, remove_types=(str,)):
continue
if isinstance(array, remove_types):
continue
if sp.issparse(array):
continue
filtered_arrays.append(array)

if not filtered_arrays:
raise ValueError(
f"At least one input array expected after filtering with {remove_none=}, "
f"remove_types=[{', '.join(t.__name__ for t in remove_types)}]. Got none. "
f"Original types: [{', '.join(type(a).__name__ for a in arrays)}]."
)
return filtered_arrays


Expand All @@ -491,6 +493,8 @@ def get_namespace(*arrays, remove_none=True, remove_types=(str,), xp=None):
Introspect `arrays` arguments and return their common Array API compatible
namespace object, if any.

Note that sparse arrays are filtered by default.

See: https://numpy.org/neps/nep-0047-array-api-standard.html

If `arrays` are regular numpy arrays, an instance of the `_NumPyAPIWrapper`
Expand All @@ -509,6 +513,9 @@ def get_namespace(*arrays, remove_none=True, remove_types=(str,), xp=None):
always returned irrespective of the fact that arrays implement the
`__array_namespace__` protocol or not.

Note that if no arrays pass the set filters, ``_NUMPY_API_WRAPPER_INSTANCE, False``
is returned.

Parameters
----------
*arrays : array objects
Expand Down Expand Up @@ -546,9 +553,14 @@ def get_namespace(*arrays, remove_none=True, remove_types=(str,), xp=None):
return xp, True

arrays = _remove_non_arrays(
*arrays, remove_none=remove_none, remove_types=remove_types
*arrays,
remove_none=remove_none,
remove_types=remove_types,
)

if not arrays:
return _NUMPY_API_WRAPPER_INSTANCE, False

_check_array_api_dispatch(array_api_dispatch)

# array-api-compat is a required dependency of scikit-learn only when
Expand Down Expand Up @@ -591,20 +603,19 @@ def get_namespace_and_device(*array_list, remove_none=True, remove_types=(str,))
`device` object (see the "Device Support" section of the array API spec).
"""
array_list = _remove_non_arrays(
*array_list, remove_none=remove_none, remove_types=remove_types
*array_list,
remove_none=remove_none,
remove_types=remove_types,
)

skip_remove_kwargs = dict(remove_none=False, remove_types=[])

xp, is_array_api = get_namespace(*array_list, **skip_remove_kwargs)
arrays_device = device(*array_list, **skip_remove_kwargs)
if is_array_api:
return (
xp,
is_array_api,
device(*array_list, **skip_remove_kwargs),
)
return xp, is_array_api, arrays_device
else:
return xp, False, None
return xp, False, arrays_device


def _expit(X, xp=None):
Expand Down
36 changes: 20 additions & 16 deletions sklearn/utils/tests/test_array_api.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import os
import re
from functools import partial

import numpy
Expand Down Expand Up @@ -31,6 +30,7 @@
yield_namespace_device_dtype_combinations,
)
from sklearn.utils._testing import (
SkipTest,
_array_api_for_tests,
assert_array_equal,
skip_if_array_api_compat_not_configured,
Expand Down Expand Up @@ -238,20 +238,10 @@ def test_average_raises_with_invalid_parameters(
_average(array_in, axis=axis, weights=weights)


def test_device_raises_if_no_input():
err_msg = re.escape(
"At least one input array expected after filtering with remove_none=True, "
"remove_types=[str]. Got none. Original types: []."
)
with pytest.raises(ValueError, match=err_msg):
device()
def test_device_none_if_no_input():
assert device() is None

err_msg = re.escape(
"At least one input array expected after filtering with remove_none=True, "
"remove_types=[str]. Got none. Original types: [NoneType, str]."
)
with pytest.raises(ValueError, match=err_msg):
device(None, "name")
assert device(None, "name") is None


def test_device_inspection():
Expand Down Expand Up @@ -540,13 +530,13 @@ def test_get_namespace_and_device():
some_numpy_array = numpy.arange(3)

# When dispatch is disabled, get_namespace_and_device should return the
# default NumPy wrapper namespace and no device. Our code will handle such
# default NumPy wrapper namespace and "cpu" device. Our code will handle such
# inputs via the usual __array__ interface without attempting to dispatch
# via the array API.
namespace, is_array_api, device = get_namespace_and_device(some_torch_tensor)
assert namespace is get_namespace(some_numpy_array)[0]
assert not is_array_api
assert device is None
assert device.type == "cpu"

# Otherwise, expose the torch namespace and device via array API compat
# wrapper.
Expand Down Expand Up @@ -605,3 +595,17 @@ def test_fill_or_add_to_diagonal(array_namespace, device_, dtype_name, wrap):
_fill_or_add_to_diagonal(array_xp, value=1, xp=xp, add_value=False, wrap=wrap)
numpy.fill_diagonal(array_np, val=1, wrap=wrap)
assert_array_equal(_convert_to_numpy(array_xp, xp=xp), array_np)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
@pytest.mark.parametrize("dispatch", [True, False])
def test_sparse_device(csr_container, dispatch):
a, b = csr_container(numpy.array([[1]])), csr_container(numpy.array([[2]]))
try:
with config_context(array_api_dispatch=dispatch):
assert device(a, b) is None
assert device(a, numpy.array([1])) == "cpu"
assert get_namespace_and_device(a, b)[2] is None
assert get_namespace_and_device(a, numpy.array([1]))[2] == "cpu"
except ImportError:
raise SkipTest("array_api_compat is not installed")
10 changes: 10 additions & 0 deletions sklearn/utils/tests/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2200,6 +2200,16 @@ def test_check_array_writeable_df():
assert out.flags.writeable


@skip_if_array_api_compat_not_configured
def test_check_array_on_sparse_inputs_with_array_api_enabled():
X_sp = sp.csr_array([[0, 1, 0], [1, 0, 1]])
with config_context(array_api_dispatch=True):
assert sp.issparse(check_array(X_sp, accept_sparse=True))

with pytest.raises(TypeError):
check_array(X_sp)


# TODO(1.8): remove
def test_force_all_finite_rename_warning():
X = np.random.uniform(size=(10, 10))
Expand Down