Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats_new/upcoming_changes/sklearn.utils/31584.fix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
- Fixed a spurious warning (about the number of unique classes being
greater than 50% of the number of samples) that could occur when
passing `classes` :func:`utils.multiclass.type_of_target`.
By :user:`Sascha D. Krauss <saskra>`.
2 changes: 1 addition & 1 deletion sklearn/utils/multiclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,7 @@ def _raise_or_return():
if issparse(first_row_or_val):
first_row_or_val = first_row_or_val.data
classes = cached_unique(y)
if y.shape[0] > 20 and classes.shape[0] > round(0.5 * y.shape[0]):
if y.shape[0] > 20 and y.shape[0] > classes.shape[0] > round(0.5 * y.shape[0]):
# Only raise the warning when we have at least 20 samples.
warnings.warn(
"The number of unique classes is greater than 50% of the number "
Expand Down
14 changes: 13 additions & 1 deletion sklearn/utils/tests/test_multiclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,11 @@ def test_type_of_target_too_many_unique_classes():
We need to check that we don't raise if we have less than 20 samples.
"""

y = np.arange(25)
# Create array of unique labels, except '0', which appears twice.
# This does raise a warning.
# Note warning would not be raised if we passed only unique
# labels, which happens when `type_of_target` is passed `classes_`.
y = np.hstack((np.arange(20), [0]))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we add a note here to explain why we add the '0' at the end?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And maybe we could keep the original y = np.arange(25) and check that this does not raise a warning?

Copy link
Contributor Author

@saskra saskra Jul 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

However, the original y = np.arange(25) now raises a warning because the condition y.shape[0] > classes.shape[0] is not fulfilled, but y.shape[0] == classes.shape[0]. Hence the second zero in the new array in the test, so that you at least have a single class with two samples. The whole point was to prevent the warning from being issued incorrectly if the function is only called with the unique class list instead of the real list of class labels per sample - hence the original idea was also an argument to suppress the warning for this case.

So should we rather adapt the test or treat the case y.shape[0] == classes.shape[0] in type_of_target differently?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we add a note here to explain why we add the '0' at the end?

+1
Let's add a comment to explain that we create an array with almost all classes represented only once except 0 represented twice.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And maybe we could keep the original y = np.arange(25) and check that this does not raise a warning?

However, the original y = np.arange(25) now raises a warning because the condition y.shape[0] > classes.shape[0] is not fulfilled, but y.shape[0] == classes.shape[0]

@saskra I think @lucyleeow is correct. Now the np.arange(25) case won't raise the warning because we special cased y.shape[0] == classes.shape[0] so we can check that indeed no warning is raised. Something like the following that we'd put at the end of the test.

# More than 20 samples but only unique classes, no warning should be raised
y = np.arange(25)
with warnings.catch_warnings():
    warnings.simplefilter("ignore", UserWarning)
    type_of_target(y)

msg = r"The number of unique classes is greater than 50% of the number of samples."
with pytest.warns(UserWarning, match=msg):
type_of_target(y)
Expand All @@ -313,6 +317,14 @@ def test_type_of_target_too_many_unique_classes():
warnings.simplefilter("error")
type_of_target(y)

# More than 20 samples but only unique classes, simulating passing
# `classes_` to `type_of_target` (when number of classes is large).
# No warning should be raised
y = np.arange(25)
with warnings.catch_warnings():
warnings.simplefilter("ignore", UserWarning)
type_of_target(y)


def test_unique_labels_non_specific():
# Test unique_labels with a variety of collected examples
Expand Down
23 changes: 23 additions & 0 deletions sklearn/utils/tests/test_response.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import warnings

import numpy as np
import pytest

Expand Down Expand Up @@ -369,3 +371,24 @@ def test_get_response_values_multilabel_indicator(response_method):
assert (y_pred > 1).sum() > 0
else: # response_method == "predict"
assert np.logical_or(y_pred == 0, y_pred == 1).all()


def test_response_values_type_of_target_on_classes_no_warning():
"""
Ensure `_get_response_values` doesn't raise spurious warning.

"The number of unique classes is greater than > 50% of samples"
warning should not be raised when calling `type_of_target(classes_)`.

Non-regression test for issue #31583.
"""
X = np.random.RandomState(0).randn(120, 3)
# 30 classes, less than 50% of number of samples
y = np.repeat(np.arange(30), 4)

clf = LogisticRegression().fit(X, y)

with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)

_get_response_values(clf, X, response_method="predict_proba")