diff --git a/build_tools/azure/pylatest_conda_forge_mkl_linux-64_conda.lock b/build_tools/azure/pylatest_conda_forge_mkl_linux-64_conda.lock
index 3f948012a3f94..61229cd6ccf73 100644
--- a/build_tools/azure/pylatest_conda_forge_mkl_linux-64_conda.lock
+++ b/build_tools/azure/pylatest_conda_forge_mkl_linux-64_conda.lock
@@ -1,6 +1,6 @@
 # Generated by conda-lock.
 # platform: linux-64
-# input_hash: 0ef2318a417ecd9806c39a466da49a53e8dab8b199cceb5bffcdd59c0a293907
+# input_hash: cdc27128862fe4a9c586b6961e094936bdb919a4b142282ae0370d7e4cb2c9a4
 @EXPLICIT
 https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81
 https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.2.2-hbcca054_0.conda#2f4327a1cbe7f022401b236e915a5fef
@@ -200,6 +200,7 @@ https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-16_linux64_mkl.t
 https://conda.anaconda.org/conda-forge/linux-64/aws-sdk-cpp-1.10.57-h85b1a90_19.conda#0605d3d60857fc07bd6a11e878fe0f08
 https://conda.anaconda.org/conda-forge/linux-64/numpy-1.26.4-py311h64a7726_0.conda#a502d7aad449a1206efb366d6a12c52d
 https://conda.anaconda.org/conda-forge/linux-64/qt-main-5.15.8-h5810be5_19.conda#54866f708d43002a514d0b9b0f84bc11
+https://conda.anaconda.org/conda-forge/noarch/array-api-strict-1.1-pyhd8ed1ab_0.conda#db4260fac4412db30bd5213b6c9f6ecc
 https://conda.anaconda.org/conda-forge/linux-64/contourpy-1.2.0-py311h9547e67_0.conda#40828c5b36ef52433e21f89943e09f33
 https://conda.anaconda.org/conda-forge/linux-64/libarrow-12.0.1-hb87d912_8_cpu.conda#3f3b11398fe79b578e3c44dd00a44e4a
 https://conda.anaconda.org/conda-forge/linux-64/pandas-2.2.1-py311h320fe9a_0.conda#aac8d7137fedc2fd5f8320bf50e4204c
diff --git a/build_tools/azure/pylatest_conda_forge_mkl_linux-64_environment.yml b/build_tools/azure/pylatest_conda_forge_mkl_linux-64_environment.yml
index 8ad97b91f8fce..f4dc622417040 100644
--- a/build_tools/azure/pylatest_conda_forge_mkl_linux-64_environment.yml
+++ b/build_tools/azure/pylatest_conda_forge_mkl_linux-64_environment.yml
@@ -28,3 +28,4 @@ dependencies:
   - polars
   - pyarrow
   - array-api-compat
+  - array-api-strict
diff --git a/build_tools/update_environments_and_lock_files.py b/build_tools/update_environments_and_lock_files.py
index add176b93ba82..dbb23f53a362b 100644
--- a/build_tools/update_environments_and_lock_files.py
+++ b/build_tools/update_environments_and_lock_files.py
@@ -104,6 +104,7 @@ def remove_from(alist, to_remove):
             "polars",
             "pyarrow",
             "array-api-compat",
+            "array-api-strict",
         ],
         "package_constraints": {
             "blas": "[build=mkl]",
diff --git a/doc/modules/array_api.rst b/doc/modules/array_api.rst
index 9a114cad152b4..6037d644d3f7d 100644
--- a/doc/modules/array_api.rst
+++ b/doc/modules/array_api.rst
@@ -25,7 +25,7 @@ At this stage, this support is **considered experimental** and must be enabled
 explicitly as explained in the following.
 
 .. note::
-    Currently, only `cupy.array_api`, `numpy.array_api`, `cupy`, and `PyTorch`
+    Currently, only `cupy.array_api`, `array-api-strict`, `cupy`, and `PyTorch`
     are known to work with scikit-learn's estimators.
 
 Example usage
diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index 1ec359a028f8c..b0fd32d1cbf62 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -975,7 +975,7 @@ def test_pca_mle_array_api_compliance(
 def test_array_api_error_and_warnings_on_unsupported_params():
     pytest.importorskip("array_api_compat")
 
-    xp = pytest.importorskip("numpy.array_api")
+    xp = pytest.importorskip("array_api_strict")
     iris_xp = xp.asarray(iris.data)
 
     pca = PCA(n_components=2, svd_solver="arpack", random_state=0)
diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py
index f07e974542a5b..be8c9097332eb 100644
--- a/sklearn/linear_model/_base.py
+++ b/sklearn/linear_model/_base.py
@@ -33,7 +33,7 @@
     _fit_context,
 )
 from ..utils import check_array, check_random_state
-from ..utils._array_api import get_namespace
+from ..utils._array_api import get_namespace, indexing_dtype
 from ..utils._seq_dataset import (
     ArrayDataset32,
     ArrayDataset64,
@@ -350,7 +350,7 @@ def predict(self, X):
         xp, _ = get_namespace(X)
         scores = self.decision_function(X)
         if len(scores.shape) == 1:
-            indices = xp.astype(scores > 0, int)
+            indices = xp.astype(scores > 0, indexing_dtype(xp))
         else:
             indices = xp.argmax(scores, axis=1)
 
diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py
index 009db4d7b0e71..70e210e7e913e 100644
--- a/sklearn/utils/_array_api.py
+++ b/sklearn/utils/_array_api.py
@@ -10,7 +10,7 @@
 from .._config import get_config
 from .fixes import parse_version
 
-_NUMPY_NAMESPACE_NAMES = {"numpy", "array_api_compat.numpy", "numpy.array_api"}
+_NUMPY_NAMESPACE_NAMES = {"numpy", "array_api_compat.numpy"}
 
 
 def yield_namespace_device_dtype_combinations(include_numpy_namespaces=True):
@@ -42,8 +42,8 @@ def yield_namespace_device_dtype_combinations(include_numpy_namespaces=True):
         # tests are regular numpy arrays without any "device" attribute.
         "numpy",
         # Stricter NumPy-based Array API implementation. The
-        # numpy.array_api.Array instances always a dummy "device" attribute.
-        "numpy.array_api",
+        # array_api_strict.Array instances always have a dummy "device" attribute.
+        "array_api_strict",
         "cupy",
         "cupy.array_api",
         "torch",
@@ -194,7 +194,6 @@ def _isdtype_single(dtype, kind, *, xp):
         return dtype in supported_float_dtypes(xp)
     elif kind == "complex floating":
         # Some name spaces do not have complex, such as cupy.array_api
-        # and numpy.array_api
         complex_dtypes = set()
         if hasattr(xp, "complex64"):
             complex_dtypes.add(xp.complex64)
@@ -300,14 +299,20 @@ def wrapped_func(*args, **kwargs):
 class _NumPyAPIWrapper:
     """Array API compat wrapper for any numpy version
 
-    NumPy < 1.22 does not expose the numpy.array_api namespace. This
-    wrapper makes it possible to write code that uses the standard
-    Array API while working with any version of NumPy supported by
-    scikit-learn.
+    NumPy < 2 does not implement the Array API namespace. NumPy 2 and later
+    should progressively implement more and more of the latest Array API
+    spec, but this is still a work in progress at this time.
+
+    This wrapper makes it possible to write code that uses the standard Array
+    API while working with any version of NumPy supported by scikit-learn.
 
     See the `get_namespace()` public function for more details.
     """
 
+    # TODO: once scikit-learn drops support for NumPy < 2, this class can be
+    # removed, assuming Array API compliance of NumPy 2 is actually sufficient
+    # for scikit-learn's needs.
+
     # Creation functions in spec:
     # https://data-apis.org/array-api/latest/API_specification/creation_functions.html
     _CREATION_FUNCS = {
@@ -447,18 +452,15 @@ def _remove_non_arrays(*arrays, remove_none=True, remove_types=(str,)):
 def get_namespace(*arrays, remove_none=True, remove_types=(str,), xp=None):
     """Get namespace of arrays.
 
-    Introspect `arrays` arguments and return their common Array API
-    compatible namespace object, if any. NumPy 1.22 and later can
-    construct such containers using the `numpy.array_api` namespace
-    for instance.
+    Introspect `arrays` arguments and return their common Array API compatible
+    namespace object, if any.
 
     See: https://numpy.org/neps/nep-0047-array-api-standard.html
 
-    If `arrays` are regular numpy arrays, an instance of the
-    `_NumPyAPIWrapper` compatibility wrapper is returned instead.
+    If `arrays` are regular numpy arrays, an instance of the `_NumPyAPIWrapper`
+    compatibility wrapper is returned instead.
 
-    Namespace support is not enabled by default. To enabled it
-    call:
+    Namespace support is not enabled by default. To enable it, call:
 
         sklearn.set_config(array_api_dispatch=True)
 
@@ -467,10 +469,9 @@
         with sklearn.config_context(array_api_dispatch=True):
             # your code here
 
-    Otherwise an instance of the `_NumPyAPIWrapper`
-    compatibility wrapper is always returned irrespective of
-    the fact that arrays implement the `__array_namespace__`
-    protocol or not.
+    Otherwise an instance of the `_NumPyAPIWrapper` compatibility wrapper is
+    always returned regardless of whether the arrays implement the
+    `__array_namespace__` protocol.
 
     Parameters
     ----------
@@ -524,7 +525,7 @@ def get_namespace(*arrays, remove_none=True, remove_types=(str,), xp=None):
 
     # These namespaces need additional wrapping to smooth out small differences
     # between implementations
-    if namespace.__name__ in {"numpy.array_api", "cupy.array_api"}:
+    if namespace.__name__ in {"cupy.array_api"}:
         namespace = _ArrayAPIWrapper(namespace)
 
     return namespace, is_array_api_compliant
@@ -763,3 +764,27 @@ def _estimator_with_converted_arrays(estimator, converter):
 def _atol_for_type(dtype):
     """Return the absolute tolerance for a given numpy dtype."""
     return numpy.finfo(dtype).eps * 100
+
+
+def indexing_dtype(xp):
+    """Return a platform-specific integer dtype suitable for indexing.
+
+    On 32-bit platforms, this will typically return int32, and int64 otherwise.
+
+    Note: using this dtype is recommended for indexing transient array
+    data structures. For long-lived arrays, such as the fitted attributes of
+    estimators, it is instead recommended to use the platform-independent int32
+    if we do not expect to index more than 2B elements. Using fixed dtypes
+    simplifies the handling of serialized models, e.g. to deploy a model fit on
+    a 64-bit platform to a target 32-bit platform such as WASM/pyodide.
+    """
+    # Currently this is implemented with a simple hack that assumes that the
+    # following "may be" statements in the Array API spec always hold:
+    # > The default integer data type should be the same across platforms, but
+    # > the default may vary depending on whether Python is 32-bit or 64-bit.
+    # > The default array index data type may be int32 on 32-bit platforms, but
+    # > the default should be int64 otherwise.
+    # https://data-apis.org/array-api/latest/API_specification/data_types.html#default-data-types
+    # TODO: once sufficiently adopted, we might want to instead rely on the
+    # newer inspection API: https://github.com/data-apis/array-api/issues/640
+    return xp.asarray(0).dtype
diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py
index d21e4bf1d0ccc..f8c58004dbbcd 100644
--- a/sklearn/utils/_testing.py
+++ b/sklearn/utils/_testing.py
@@ -1054,13 +1054,7 @@ def fit_transform(self, X, y=None):
 
 def _array_api_for_tests(array_namespace, device):
     try:
-        if array_namespace == "numpy.array_api":
-            # FIXME: once it is not experimental anymore
-            with ignore_warnings(category=UserWarning):
-                # UserWarning: numpy.array_api submodule is still experimental.
-                array_mod = importlib.import_module(array_namespace)
-        else:
-            array_mod = importlib.import_module(array_namespace)
+        array_mod = importlib.import_module(array_namespace)
     except ModuleNotFoundError:
         raise SkipTest(
             f"{array_namespace} is not installed: not checking array_api input"
diff --git a/sklearn/utils/tests/test_array_api.py b/sklearn/utils/tests/test_array_api.py
index 6b5f600fb01c2..98ba444bf98b0 100644
--- a/sklearn/utils/tests/test_array_api.py
+++ b/sklearn/utils/tests/test_array_api.py
@@ -7,6 +7,7 @@
 
 from sklearn._config import config_context
 from sklearn.base import BaseEstimator
+from sklearn.utils import _IS_32BIT
 from sklearn.utils._array_api import (
     _ArrayAPIWrapper,
     _asarray_with_order,
@@ -19,6 +20,7 @@
     _NumPyAPIWrapper,
     device,
     get_namespace,
+    indexing_dtype,
     supported_float_dtypes,
     yield_namespace_device_dtype_combinations,
 )
@@ -27,10 +29,6 @@
     skip_if_array_api_compat_not_configured,
 )
 
-pytestmark = pytest.mark.filterwarnings(
-    "ignore:The numpy.array_api submodule:UserWarning"
-)
-
 
 @pytest.mark.parametrize("X", [numpy.asarray([1, 2, 3]), [1, 2, 3]])
 def test_get_namespace_ndarray_default(X):
@@ -68,14 +66,13 @@ def test_get_namespace_ndarray_with_dispatch():
 @skip_if_array_api_compat_not_configured
 def test_get_namespace_array_api():
     """Test get_namespace for ArrayAPI arrays."""
-    xp = pytest.importorskip("numpy.array_api")
+    xp = pytest.importorskip("array_api_strict")
 
     X_np = numpy.asarray([[1, 2, 3]])
     X_xp = xp.asarray(X_np)
     with config_context(array_api_dispatch=True):
         xp_out, is_array_api_compliant = get_namespace(X_xp)
         assert is_array_api_compliant
-        assert isinstance(xp_out, _ArrayAPIWrapper)
 
         with pytest.raises(TypeError):
             xp_out, is_array_api_compliant = get_namespace(X_xp, X_np)
@@ -91,8 +88,8 @@ def __init__(self, array_namespace, name):
 
 def test_array_api_wrapper_astype():
     """Test _ArrayAPIWrapper for ArrayAPIs that is not NumPy."""
-    numpy_array_api = pytest.importorskip("numpy.array_api")
-    xp_ = _AdjustableNameAPITestWrapper(numpy_array_api, "wrapped_numpy.array_api")
+    array_api_strict = pytest.importorskip("array_api_strict")
+    xp_ = _AdjustableNameAPITestWrapper(array_api_strict, "array_api_strict")
     xp = _ArrayAPIWrapper(xp_)
 
     X = xp.asarray(([[1, 2, 3], [3, 4, 5]]), dtype=xp.float64)
@@ -103,7 +100,7 @@
     assert X_converted.dtype == xp.float32
 
 
-@pytest.mark.parametrize("array_api", ["numpy", "numpy.array_api"])
+@pytest.mark.parametrize("array_api", ["numpy", "array_api_strict"])
 def test_asarray_with_order(array_api):
     """Test _asarray_with_order passes along order for NumPy arrays."""
     xp = pytest.importorskip(array_api)
@@ -117,8 +114,8 @@
 
 def test_asarray_with_order_ignored():
     """Test _asarray_with_order ignores order for Generic ArrayAPI."""
-    xp = pytest.importorskip("numpy.array_api")
-    xp_ = _AdjustableNameAPITestWrapper(xp, "wrapped.array_api")
+    xp = pytest.importorskip("array_api_strict")
+    xp_ = _AdjustableNameAPITestWrapper(xp, "array_api_strict")
 
     X = numpy.asarray([[1.2, 3.4, 5.1], [3.4, 5.5, 1.2]], order="C")
     X = xp_.asarray(X)
@@ -308,7 +305,7 @@ def __init__(self, device_name):
 # the following upstream issue has been fixed:
 # https://github.com/cupy/cupy/issues/8180
 @skip_if_array_api_compat_not_configured
-@pytest.mark.parametrize("library", ["numpy", "numpy.array_api", "torch"])
+@pytest.mark.parametrize("library", ["numpy", "array_api_strict", "torch"])
 @pytest.mark.parametrize(
     "X,reduction,expected",
     [
@@ -391,7 +388,7 @@ def fit(self, X, y=None):
     "array_namespace, converter",
     [
         ("torch", lambda array: array.cpu().numpy()),
-        ("numpy.array_api", lambda array: numpy.asarray(array)),
+        ("array_api_strict", lambda array: numpy.asarray(array)),
         ("cupy.array_api", lambda array: array._array.get()),
     ],
 )
@@ -409,7 +406,7 @@ def test_convert_estimator_to_ndarray(array_namespace, converter):
 @skip_if_array_api_compat_not_configured
 def test_convert_estimator_to_array_api():
     """Convert estimator attributes to ArrayAPI arrays."""
-    xp = pytest.importorskip("numpy.array_api")
+    xp = pytest.importorskip("array_api_strict")
 
     X_np = numpy.asarray([[1.3, 4.5]])
     est = SimpleEstimator().fit(X_np)
@@ -438,7 +435,7 @@ def test_get_namespace_array_api_isdtype(wrapper):
     """Test isdtype implementation from _ArrayAPIWrapper and _NumPyAPIWrapper."""
     if wrapper == _ArrayAPIWrapper:
-        xp_ = pytest.importorskip("numpy.array_api")
+        xp_ = pytest.importorskip("array_api_strict")
         xp = _ArrayAPIWrapper(xp_)
     else:
         xp = _NumPyAPIWrapper()
 
@@ -473,3 +470,15 @@ def test_get_namespace_array_api_isdtype(wrapper):
 
     with pytest.raises(ValueError, match="Unrecognized data type"):
         assert xp.isdtype(xp.int16, "unknown")
+
+
+@pytest.mark.parametrize(
+    "namespace, _device, _dtype", yield_namespace_device_dtype_combinations()
+)
+def test_indexing_dtype(namespace, _device, _dtype):
+    xp = _array_api_for_tests(namespace, _device)
+
+    if _IS_32BIT:
+        assert indexing_dtype(xp) == xp.int32
+    else:
+        assert indexing_dtype(xp) == xp.int64
diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py
index 1e0a083a9c989..8ac7ac9db2e9a 100644
--- a/sklearn/utils/tests/test_estimator_checks.py
+++ b/sklearn/utils/tests/test_estimator_checks.py
@@ -529,15 +529,15 @@ def test_check_array_api_input():
     except ModuleNotFoundError:
         raise SkipTest("array_api_compat is required to run this test")
     try:
-        importlib.import_module("numpy.array_api")
+        importlib.import_module("array_api_strict")
     except ModuleNotFoundError:  # pragma: nocover
-        raise SkipTest("numpy.array_api is required to run this test")
+        raise SkipTest("array-api-strict is required to run this test")
 
     with raises(AssertionError, match="Not equal to tolerance"):
         check_array_api_input(
             "BrokenArrayAPI",
             BrokenArrayAPI(),
-            array_namespace="numpy.array_api",
+            array_namespace="array_api_strict",
             check_values=True,
         )
 
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index 0cd65685c0a85..715adfb3b003d 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -1972,7 +1972,7 @@ def test_pandas_array_returns_ndarray(input_values):
 
 
 @skip_if_array_api_compat_not_configured
-@pytest.mark.parametrize("array_namespace", ["numpy.array_api", "cupy.array_api"])
+@pytest.mark.parametrize("array_namespace", ["array_api_strict", "cupy.array_api"])
 def test_check_array_array_api_has_non_finite(array_namespace):
     """Checks that Array API arrays checks non-finite correctly."""
     xp = pytest.importorskip(array_namespace)
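
Usage sketch (illustrative only, not part of the patch above): the snippet below shows how the new `indexing_dtype` helper and the namespace handling changed in `sklearn/utils/_array_api.py` fit together, assuming `array-api-strict` and `array-api-compat` are installed as in the updated environment and lock files. `get_namespace` and `indexing_dtype` are private scikit-learn utilities; they are used here only to mirror what `test_get_namespace_array_api` and `test_indexing_dtype` exercise.

    import array_api_strict as xp

    import sklearn
    from sklearn.utils._array_api import get_namespace, indexing_dtype

    X = xp.asarray([[1.0, 2.0], [3.0, 4.0]])

    with sklearn.config_context(array_api_dispatch=True):
        # Resolve the common Array API namespace of the inputs. After this
        # change, array_api_strict no longer goes through _ArrayAPIWrapper.
        ns, is_compliant = get_namespace(X)
        print(ns.__name__, is_compliant)  # expected: array_api_strict True

        # indexing_dtype follows the namespace's default integer dtype:
        # int64 on 64-bit platforms, int32 on 32-bit ones (e.g. WASM/pyodide).
        print(indexing_dtype(ns))  # expected: int64 on a 64-bit platform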
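A second, equally hypothetical sketch, assuming the installed scikit-learn supports Array API dispatch for `PCA` with `svd_solver="full"` (the configuration exercised by the compliance tests in `test_pca.py`): an estimator can be fitted directly on `array_api_strict` arrays once dispatch is enabled.

    import array_api_strict as xp

    from sklearn import config_context
    from sklearn.datasets import make_classification
    from sklearn.decomposition import PCA

    # Build a small dataset with NumPy, then move it into the strict namespace.
    X_np, _ = make_classification(n_samples=100, n_features=10, random_state=0)
    X_strict = xp.asarray(X_np)

    with config_context(array_api_dispatch=True):
        pca = PCA(n_components=2, svd_solver="full").fit(X_strict)
        # Fitted attributes follow the input namespace, so components_ should
        # be an array_api_strict array rather than a NumPy ndarray.
        print(type(pca.components_))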