Skip to content

Fix a regression in GridSearchCV for parameter grids that have arrays of different sizes as parameter values #29314

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Jul 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats_new/v1.5.rst
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ Changelog
grids that have estimators as parameter values.
:pr:`29179` by :user:`Marco Gorelli<MarcoGorelli>`.

- |Fix| Fix a regression in :class:`model_selection.GridSearchCV` for parameter
grids that have arrays of different sizes as parameter values.
:pr:`29314` by :user:`Marco Gorelli<MarcoGorelli>`.

:mod:`sklearn.tree`
...................

Expand Down
90 changes: 52 additions & 38 deletions sklearn/model_selection/_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,56 @@ def check(self):
return check


def _yield_masked_array_for_each_param(candidate_params):
"""
Yield a masked array for each candidate param.

`candidate_params` is a sequence of params which were used in
a `GridSearchCV`. We use masked arrays for the results, as not
all params are necessarily present in each element of
`candidate_params`. For example, if using `GridSearchCV` with
a `SVC` model, then one might search over params like:

- kernel=["rbf"], gamma=[0.1, 1]
- kernel=["poly"], degree=[1, 2]

and then param `'gamma'` would not be present in entries of
`candidate_params` corresponding to `kernel='poly'`.
"""
n_candidates = len(candidate_params)
param_results = defaultdict(dict)

for cand_idx, params in enumerate(candidate_params):
for name, value in params.items():
param_results["param_%s" % name][cand_idx] = value

for key, param_result in param_results.items():
param_list = list(param_result.values())
try:
arr = np.array(param_list)
except ValueError:
# This can happen when param_list contains lists of different
# lengths, for example:
# param_list=[[1], [2, 3]]
arr_dtype = np.dtype(object)
else:
# There are two cases when we don't use the automatically inferred
# dtype when creating the array and we use object instead:
# - string dtype
# - when array.ndim > 1, that means that param_list was something
# like a list of same-size sequences, which gets turned into a
# multi-dimensional array but we want a 1d array
arr_dtype = arr.dtype if arr.dtype.kind != "U" and arr.ndim == 1 else object

# Use one MaskedArray and mask all the places where the param is not
# applicable for that candidate (which may not contain all the params).
ma = MaskedArray(np.empty(n_candidates), mask=True, dtype=arr_dtype)
for index, value in param_result.items():
# Setting the value at an index unmasks that index
ma[index] = value
yield (key, ma)


class BaseSearchCV(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta):
"""Abstract base class for hyper parameter search with cross-validation."""

Expand Down Expand Up @@ -1079,45 +1129,9 @@ def _store(key_name, array, weights=None, splits=False, rank=False):

_store("fit_time", out["fit_time"])
_store("score_time", out["score_time"])
param_results = defaultdict(dict)
for cand_idx, params in enumerate(candidate_params):
for name, value in params.items():
param_results["param_%s" % name][cand_idx] = value
for key, param_result in param_results.items():
param_list = list(param_result.values())
try:
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
message="in the future the `.dtype` attribute",
category=DeprecationWarning,
)
# Warning raised by NumPy 1.20+
arr_dtype = np.result_type(*param_list)
except (TypeError, ValueError):
arr_dtype = np.dtype(object)
else:
if any(np.min_scalar_type(x) == object for x in param_list):
# `np.result_type` might get thrown off by `.dtype` properties
# (which some estimators have).
# If finding the result dtype this way would give object,
# then we use object.
# https://github.com/scikit-learn/scikit-learn/issues/29157
arr_dtype = np.dtype(object)
if len(param_list) == n_candidates and arr_dtype != object:
# Exclude `object` else the numpy constructor might infer a list of
# tuples to be a 2d array.
results[key] = MaskedArray(param_list, mask=False, dtype=arr_dtype)
else:
# Use one MaskedArray and mask all the places where the param is not
# applicable for that candidate (which may not contain all the params).
ma = MaskedArray(np.empty(n_candidates), mask=True, dtype=arr_dtype)
for index, value in param_result.items():
# Setting the value at an index unmasks that index
ma[index] = value
results[key] = ma

# Store a list of param dicts at the key 'params'
for param, ma in _yield_masked_array_for_each_param(candidate_params):
results[param] = ma
results["params"] = candidate_params

test_scores_dict = _normalize_score_results(out["test_scores"])
Expand Down
119 changes: 116 additions & 3 deletions sklearn/model_selection/tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,20 @@
StratifiedShuffleSplit,
train_test_split,
)
from sklearn.model_selection._search import BaseSearchCV
from sklearn.model_selection._search import (
BaseSearchCV,
_yield_masked_array_for_each_param,
)
from sklearn.model_selection.tests.common import OneTimeSplitter
from sklearn.naive_bayes import ComplementNB
from sklearn.neighbors import KernelDensity, KNeighborsClassifier, LocalOutlierFactor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
OneHotEncoder,
OrdinalEncoder,
SplineTransformer,
StandardScaler,
)
from sklearn.svm import SVC, LinearSVC
from sklearn.tests.metadata_routing_common import (
ConsumingScorer,
Expand Down Expand Up @@ -2724,6 +2732,37 @@ def test_search_with_estimators_issue_29157():
assert grid_search.cv_results_["param_enc__enc"].dtype == object


def test_cv_results_multi_size_array():
    """Check that GridSearchCV works with params that are arrays of different sizes.

    Non-regression test for #29277.

    Each candidate value for `splinetransformer__knots` is a 2d array with a
    different number of rows, so the candidates cannot be stacked into one
    regular ndarray and `cv_results_` must fall back to an object-dtype
    masked array.
    """
    n_features = 10
    # Use the named constant instead of repeating the literal, so the knot
    # shapes below stay consistent with the generated data.
    X, y = make_classification(n_features=n_features)

    spline_reg_pipe = make_pipeline(
        SplineTransformer(extrapolation="periodic"),
        LogisticRegression(),
    )

    # Knot arrays of shapes (100, 10), (110, 10) and (120, 10): same number
    # of columns (features), different numbers of rows (knots).
    n_knots_list = [n_features * i for i in [10, 11, 12]]
    knots_list = [
        np.linspace(0, np.pi * 2, n_knots).reshape((-1, n_features))
        for n_knots in n_knots_list
    ]
    spline_reg_pipe_cv = GridSearchCV(
        estimator=spline_reg_pipe,
        param_grid={
            "splinetransformer__knots": knots_list,
        },
    )

    spline_reg_pipe_cv.fit(X, y)
    assert (
        spline_reg_pipe_cv.cv_results_["param_splinetransformer__knots"].dtype == object
    )


@pytest.mark.parametrize(
"array_namespace, device, dtype", yield_namespace_device_dtype_combinations()
)
Expand All @@ -2747,3 +2786,77 @@ def test_array_api_search_cv_classifier(SearchCV, array_namespace, device, dtype
)
searcher.fit(X_xp, y_xp)
searcher.score(X_xp, y_xp)


# Construct these outside the tests so that the same object is used
# for both input and `expected`
one_hot_encoder = OneHotEncoder()
ordinal_encoder = OrdinalEncoder()

# If we construct this directly via `MaskedArray`, the list of tuples
# gets auto-converted to a 2D array. Instead, start fully masked and
# assign element-wise: each assignment unmasks that position while
# keeping the tuples as scalar objects in a 1d object array.
ma_with_tuples = np.ma.MaskedArray(np.empty(2), mask=True, dtype=object)
ma_with_tuples[0] = (1, 2)
ma_with_tuples[1] = (3, 4)


@pytest.mark.parametrize(
    ("candidate_params", "expected"),
    [
        pytest.param(
            [{"foo": 1}, {"foo": 2}],
            [
                ("param_foo", np.ma.MaskedArray(np.array([1, 2]))),
            ],
            id="simple numeric, single param",
        ),
        pytest.param(
            [{"foo": 1, "bar": 3}, {"foo": 2, "bar": 4}, {"foo": 3}],
            [
                ("param_foo", np.ma.MaskedArray(np.array([1, 2, 3]))),
                (
                    "param_bar",
                    np.ma.MaskedArray(np.array([3, 4, 0]), mask=[False, False, True]),
                ),
            ],
            id="simple numeric, one param is missing in one round",
        ),
        pytest.param(
            [{"foo": [[1], [2], [3]]}, {"foo": [[1], [2]]}],
            [
                (
                    "param_foo",
                    np.ma.MaskedArray([[[1], [2], [3]], [[1], [2]]], dtype=object),
                ),
            ],
            id="lists of different lengths",
        ),
        pytest.param(
            [{"foo": (1, 2)}, {"foo": (3, 4)}],
            [
                (
                    "param_foo",
                    ma_with_tuples,
                ),
            ],
            id="lists tuples",
        ),
        pytest.param(
            [{"foo": ordinal_encoder}, {"foo": one_hot_encoder}],
            [
                (
                    "param_foo",
                    np.ma.MaskedArray([ordinal_encoder, one_hot_encoder], dtype=object),
                ),
            ],
            id="estimators",
        ),
    ],
)
def test_yield_masked_array_for_each_param(candidate_params, expected):
    """Check masked-array construction for a variety of param value types."""
    result = list(_yield_masked_array_for_each_param(candidate_params))
    # `zip` would silently truncate if one side yielded fewer entries than
    # the other; assert the lengths match so a missing or extra param
    # column fails loudly instead of being skipped.
    assert len(result) == len(expected)
    for (key, value), (expected_key, expected_value) in zip(result, expected):
        assert key == expected_key
        assert value.dtype == expected_value.dtype
        np.testing.assert_array_equal(value, expected_value)
        np.testing.assert_array_equal(value.mask, expected_value.mask)
np.testing.assert_array_equal(value.mask, expected_value.mask)