BUG: ensure list of tuples results in 1d masked array in cv_results, as opposed to 2d array (#28571)

MarcoGorelli · lesteve · web-flow · commit 33cde65c44b1 · 2024-03-04T11:46:13.000-05:00
Co-authored-by: Loïc Estève <loic.esteve@ymail.com>
diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py
@@ -1083,7 +1083,9 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
                 arr_dtype = np.result_type(*param_list)
             except TypeError:
                 arr_dtype = object
-            if len(param_list) == n_candidates:
+            if len(param_list) == n_candidates and arr_dtype != object:
+                # Exclude `object`; otherwise the numpy constructor might infer a
+                # list of tuples to be a 2d array.
                 results[key] = MaskedArray(param_list, mask=False, dtype=arr_dtype)
             else:
                 # Use one MaskedArray and mask all the places where the param is not
diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
@@ -23,6 +23,7 @@
 from sklearn.ensemble import HistGradientBoostingClassifier
 from sklearn.exceptions import FitFailedWarning
 from sklearn.experimental import enable_halving_search_cv  # noqa
+from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.impute import SimpleImputer
 from sklearn.linear_model import (
     LinearRegression,
@@ -56,6 +57,7 @@
 )
 from sklearn.model_selection._search import BaseSearchCV
 from sklearn.model_selection.tests.common import OneTimeSplitter
+from sklearn.naive_bayes import ComplementNB
 from sklearn.neighbors import KernelDensity, KNeighborsClassifier, LocalOutlierFactor
 from sklearn.pipeline import Pipeline
 from sklearn.svm import SVC, LinearSVC
@@ -2492,6 +2494,35 @@ def test_search_estimator_param(SearchCV, param_search):
     assert gs.best_estimator_.named_steps["clf"].C == 0.01
 
 
+def test_search_with_2d_array():
+    parameter_grid = {
+        "vect__ngram_range": ((1, 1), (1, 2)),  # unigrams or bigrams
+        "vect__norm": ("l1", "l2"),
+    }
+    pipeline = Pipeline(
+        [
+            ("vect", TfidfVectorizer()),
+            ("clf", ComplementNB()),
+        ]
+    )
+    random_search = RandomizedSearchCV(
+        estimator=pipeline,
+        param_distributions=parameter_grid,
+        n_iter=3,
+        random_state=0,
+        n_jobs=2,
+        verbose=1,
+        cv=3,
+    )
+    data_train = ["one", "two", "three", "four", "five"]
+    data_target = [0, 0, 1, 0, 1]
+    random_search.fit(data_train, data_target)
+    result = random_search.cv_results_["param_vect__ngram_range"]
+    expected_data = np.empty(3, dtype=object)
+    expected_data[:] = [(1, 2), (1, 2), (1, 1)]
+    np.testing.assert_array_equal(result.data, expected_data)
+
+
 # Metadata Routing Tests
 # ======================