ENH Add Array API compatibility tests for *SearchCV classes #27096

Merged
merged 14 commits into from
Jun 7, 2024
11 changes: 11 additions & 0 deletions doc/modules/array_api.rst
@@ -98,6 +98,17 @@ Estimators
- :class:`preprocessing.MinMaxScaler`
- :class:`preprocessing.Normalizer`

Meta-estimators
---------------

Meta-estimators that accept Array API inputs when their base estimator does:

- :class:`model_selection.GridSearchCV`
- :class:`model_selection.RandomizedSearchCV`
- :class:`model_selection.HalvingGridSearchCV`
- :class:`model_selection.HalvingRandomSearchCV`

Metrics
-------

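The documented behavior can be exercised roughly as follows. This is a hedged sketch that mirrors the test added later in this PR: it uses plain NumPy inputs, which run unchanged; under `config_context(array_api_dispatch=True)` the same code would accept e.g. torch tensors because the base estimator (`LinearDiscriminantAnalysis`) supports the Array API.

```python
# Sketch: a *SearchCV accepts Array API inputs when its base estimator does.
# With NumPy inputs this runs as-is; with array_api_dispatch=True, X and y
# could instead be arrays from another Array API namespace.
import numpy as np

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV

X = np.arange(100, dtype=np.float64).reshape(10, 10)
y = np.array([0] * 5 + [1] * 5)

searcher = GridSearchCV(
    LinearDiscriminantAnalysis(), {"tol": [1e-4, 1e-5]}, cv=2
)
searcher.fit(X, y)
print(searcher.best_params_)
```

The grid and data shapes are illustrative only; the point is that the search CV wrapper defers Array API handling entirely to the wrapped estimator.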
6 changes: 6 additions & 0 deletions doc/whats_new/v1.6.rst
@@ -42,6 +42,12 @@ See :ref:`array_api` for more details.

- :class:`preprocessing.LabelEncoder` now supports Array API compatible inputs.
:pr:`27381` by :user:`Omar Salman <OmarManzoor>`.
- :class:`model_selection.GridSearchCV`,
:class:`model_selection.RandomizedSearchCV`,
:class:`model_selection.HalvingGridSearchCV` and
:class:`model_selection.HalvingRandomSearchCV` now support Array API
compatible inputs when their base estimators do. :pr:`27096` by :user:`Tim
Head <betatim>` and :user:`Olivier Grisel <ogrisel>`.

Metadata Routing
----------------
1 change: 1 addition & 0 deletions sklearn/model_selection/_search.py
@@ -440,6 +440,7 @@ def _more_tags(self):
"_xfail_checks": {
"check_supervised_y_2d": "DataConversionWarning not caught"
},
"array_api_support": _safe_tags(self.estimator, "array_api_support"),
}

def score(self, X, y=None, **params):
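The one-line tag change above can be modeled with a stripped-down, hypothetical version of the estimator-tag machinery (`safe_tags`, `Meta`, `Wrapped`, and `PlainWrapped` are illustration-only names, not scikit-learn's): the meta-estimator simply mirrors whatever `array_api_support` value its wrapped estimator reports.

```python
# Hypothetical minimal model of estimator-tag propagation: the
# meta-estimator's array_api_support tag is read from the wrapped
# estimator, defaulting to False when the tag is absent.
DEFAULT_TAGS = {"array_api_support": False}


def safe_tags(estimator, key):
    # Merge defaults with the estimator's own tags, then look up the key
    # (loosely modeled on sklearn's internal _safe_tags helper).
    tags = {**DEFAULT_TAGS, **getattr(estimator, "_more_tags", dict)()}
    return tags[key]


class Wrapped:
    def _more_tags(self):
        return {"array_api_support": True}


class PlainWrapped:
    pass  # no tags: falls back to the default (False)


class Meta:
    def __init__(self, estimator):
        self.estimator = estimator

    def _more_tags(self):
        return {"array_api_support": safe_tags(self.estimator, "array_api_support")}


print(Meta(Wrapped())._more_tags()["array_api_support"])       # True
print(Meta(PlainWrapped())._more_tags()["array_api_support"])  # False
```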
10 changes: 9 additions & 1 deletion sklearn/model_selection/_split.py
@@ -745,7 +745,15 @@ def __init__(self, n_splits=5, *, shuffle=False, random_state=None):

def _make_test_folds(self, X, y=None):
rng = check_random_state(self.random_state)
y = np.asarray(y)
# XXX: as of now, cross-validation splitters only operate in NumPy-land
# without attempting to leverage array API namespace features. However
# they might be fed by array API inputs, e.g. in CV-enabled estimators so
# we need the following explicit conversion:
xp, is_array_api = get_namespace(y)
if is_array_api:
y = _convert_to_numpy(y, xp)
Contributor:
Would it be possible to add a test to cover this?

Member Author:
I think we had coverage for it via the test I removed in the last commit. But I thought that we tested the same thing via a common test. So I am a bit puzzled why that doesn't happen :-/

Member:
I understand what's going on:

  • the common tests run the search CV meta-estimators using LogisticRegression and Ridge as base estimators;
  • the LogisticRegression test is skipped because LogisticRegression does not have the array API estimator tag, so the wrapping search CV estimators do not have it either;
  • the stratified k-fold CV splitter is only used for classification problems, so the Ridge-based common test does not cover it;
  • the previous hard-coded test of this PR used LinearDiscriminantAnalysis, which is a classifier and supports the Array API, and hence could cover this line.

Since adding array API support to LogisticRegression might take a bit of time, I would be in favor of re-adding the previous non-common test that was removed from this PR.

Contributor:
Yeah that seems right. Thanks for the explanation! I think the test seems valid for now and not redundant.

Contributor:
@ogrisel I added the test back. Can you have a look and then we can probably merge.

else:
y = np.asarray(y)
type_of_target_y = type_of_target(y)
allowed_target_types = ("binary", "multiclass")
if type_of_target_y not in allowed_target_types:
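The conversion guard in `_make_test_folds` can be illustrated with a toy wrapper (`ToyArray` and `as_numpy_labels` are hypothetical, illustration-only names): NumPy-only splitter code needs an explicit conversion step for inputs that merely implement the Array API namespace protocol, because `np.asarray` alone would not handle, for example, arrays living on an accelerator device.

```python
import numpy as np


class ToyArray:
    """Pretend Array API array wrapping a host buffer (illustration only)."""

    def __init__(self, data):
        self._data = np.asarray(data)

    def __array_namespace__(self):
        # Real Array API arrays return their namespace module here.
        return np

    def to_host_numpy(self):
        # Stand-in for a _convert_to_numpy-style helper; a real
        # implementation would also move data off an accelerator device.
        return self._data


def as_numpy_labels(y):
    # Mirrors the diff: detect Array API inputs via the namespace protocol
    # and convert explicitly; otherwise fall back to np.asarray.
    if hasattr(y, "__array_namespace__") and not isinstance(y, np.ndarray):
        return y.to_host_numpy()
    return np.asarray(y)


y = as_numpy_labels(ToyArray([0, 0, 1, 1]))
print(type(y).__name__, y.tolist())  # ndarray [0, 0, 1, 1]
```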
8 changes: 8 additions & 0 deletions sklearn/model_selection/_validation.py
@@ -30,6 +30,7 @@
from ..metrics._scorer import _MultimetricScorer
from ..preprocessing import LabelEncoder
from ..utils import Bunch, _safe_indexing, check_random_state, indexable
from ..utils._array_api import device, get_namespace
from ..utils._param_validation import (
HasMethods,
Integral,
@@ -830,6 +831,13 @@ def _fit_and_score(
fit_error : str or None
Traceback str if the fit failed, None if the fit succeeded.
"""
xp, _ = get_namespace(X)
X_device = device(X)

# Make sure that we can fancy index X even if train and test are provided
# as NumPy arrays by NumPy only cross-validation splitters.
train, test = xp.asarray(train, device=X_device), xp.asarray(test, device=X_device)

if not isinstance(error_score, numbers.Number) and error_score != "raise":
raise ValueError(
"error_score must be the string 'raise' or a numeric value. "
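The index-conversion step in `_fit_and_score` can be sketched as follows; NumPy stands in for an arbitrary Array API namespace (`align_cv_indices` is our name for the pattern, not a scikit-learn helper). With e.g. torch as the namespace, the same `xp.asarray(..., device=...)` call would move the integer indices to X's device so that fancy indexing works.

```python
import numpy as np


def align_cv_indices(xp, train, test, device=None):
    # CV splitters always yield NumPy index arrays; convert them into X's
    # namespace (and device, when one is given) so X[train] works even when
    # X is not a NumPy array.
    kwargs = {"device": device} if device is not None else {}
    return xp.asarray(train, **kwargs), xp.asarray(test, **kwargs)


X = np.arange(20, dtype=np.float64).reshape(10, 2)
train, test = np.arange(0, 5), np.arange(5, 9)
train, test = align_cv_indices(np, train, test)
print(X[train].shape, X[test].shape)  # (5, 2) (4, 2)
```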
28 changes: 28 additions & 0 deletions sklearn/model_selection/tests/test_search.py
@@ -23,6 +23,7 @@
make_classification,
make_multilabel_classification,
)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.exceptions import FitFailedWarning
@@ -73,11 +74,13 @@
check_recorded_metadata,
)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils._array_api import yield_namespace_device_dtype_combinations
from sklearn.utils._mocking import CheckingClassifier, MockDataFrame
from sklearn.utils._testing import (
MinimalClassifier,
MinimalRegressor,
MinimalTransformer,
_array_api_for_tests,
assert_allclose,
assert_almost_equal,
assert_array_almost_equal,
@@ -2718,3 +2721,28 @@ def test_search_with_estimators_issue_29157():
grid_search = GridSearchCV(pipe, grid_params, cv=2)
grid_search.fit(X, y)
assert grid_search.cv_results_["param_enc__enc"].dtype == object


@pytest.mark.parametrize(
"array_namespace, device, dtype", yield_namespace_device_dtype_combinations()
)
@pytest.mark.parametrize("SearchCV", [GridSearchCV, RandomizedSearchCV])
def test_array_api_search_cv_classifier(SearchCV, array_namespace, device, dtype):
xp = _array_api_for_tests(array_namespace, device)

X = np.arange(100).reshape((10, 10))
X_np = X.astype(dtype)
X_xp = xp.asarray(X_np, device=device)

# y should always be an integer, no matter what `dtype` is
y_np = np.array([0] * 5 + [1] * 5)
y_xp = xp.asarray(y_np, device=device)

with config_context(array_api_dispatch=True):
searcher = SearchCV(
LinearDiscriminantAnalysis(),
{"tol": [1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7]},
cv=2,
)
searcher.fit(X_xp, y_xp)
searcher.score(X_xp, y_xp)
5 changes: 3 additions & 2 deletions sklearn/model_selection/tests/test_validation.py
@@ -2100,13 +2100,14 @@ def test_fit_and_score_failing():
failing_clf = FailingClassifier(FailingClassifier.FAILING_PARAMETER)
# dummy X data
X = np.arange(1, 10)
train, test = np.arange(0, 5), np.arange(5, 9)
fit_and_score_args = dict(
estimator=failing_clf,
X=X,
y=None,
scorer=dict(),
train=None,
test=None,
train=train,
test=test,
Member:
Not sure why this did not fail on main, but based on the docstrings _fit_and_score was never supposed to accept None for its train and test arguments, so it is better to pass valid integer index arrays in this test instead.

verbose=0,
parameters=None,
fit_params=None,
4 changes: 3 additions & 1 deletion sklearn/tests/test_common.py
@@ -330,7 +330,9 @@ def _generate_search_cv_instances():
extra_params = (
{"min_resources": "smallest"} if "min_resources" in init_params else {}
)
search_cv = SearchCV(Estimator(), param_grid, cv=2, **extra_params)
search_cv = SearchCV(
Estimator(), param_grid, cv=2, error_score="raise", **extra_params
Member:
Setting error_score="raise" makes reading pytest tracebacks much more direct, especially in CI logs.

)
set_random_state(search_cv)
yield search_cv

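The reviewer's point about `error_score="raise"` can be demonstrated with a deliberately failing estimator (the `AlwaysFails` class is ours, for illustration only): by default a failing fit is swallowed, the candidate scores `nan`, and a `FitFailedWarning` is emitted, whereas `"raise"` surfaces the original exception and traceback immediately.

```python
import numpy as np

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import GridSearchCV


class AlwaysFails(ClassifierMixin, BaseEstimator):
    """Illustration-only estimator whose fit always raises."""

    def __init__(self, boom=True):
        self.boom = boom

    def fit(self, X, y):
        raise ValueError("boom")


X = np.arange(20, dtype=np.float64).reshape(10, 2)
y = np.array([0] * 5 + [1] * 5)

gs = GridSearchCV(AlwaysFails(), {"boom": [True]}, cv=2, error_score="raise")
try:
    gs.fit(X, y)
except ValueError as exc:
    # With error_score="raise" the original fit error propagates untouched.
    caught = str(exc)
print(caught)  # boom
```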