From 4f5483fd467516e1e917cb43201a7bb9aa021dc2 Mon Sep 17 00:00:00 2001
From: saskra
Date: Wed, 18 Jun 2025 15:04:03 +0200
Subject: [PATCH 01/13] Fix spurious warning from type_of_target when called on estimator.classes_

---
 sklearn/utils/_response.py | 2 +-
 sklearn/utils/multiclass.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/sklearn/utils/_response.py b/sklearn/utils/_response.py
index 9003699d4351d..0523bbaaed079 100644
--- a/sklearn/utils/_response.py
+++ b/sklearn/utils/_response.py
@@ -200,7 +200,7 @@ def _get_response_values(
     if is_classifier(estimator):
         prediction_method = _check_response_method(estimator, response_method)
         classes = estimator.classes_
-        target_type = type_of_target(classes)
+        target_type = type_of_target(classes, suppress_warning=True)

         if target_type in ("binary", "multiclass"):
             if pos_label is not None and pos_label not in classes.tolist():
diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py
index 3a81e2b9eb6fe..1c97e403e5805 100644
--- a/sklearn/utils/multiclass.py
+++ b/sklearn/utils/multiclass.py
@@ -225,7 +225,7 @@ def check_classification_targets(y):
         )


-def type_of_target(y, input_name="", raise_unknown=False):
+def type_of_target(y, input_name="", raise_unknown=False, suppress_warning=False):
     """Determine the type of data indicated by the target.

     Note that this type is the most specific type that can be inferred.
@@ -414,7 +414,7 @@ def _raise_or_return():
     if issparse(first_row_or_val):
         first_row_or_val = first_row_or_val.data
     classes = cached_unique(y)
-    if y.shape[0] > 20 and classes.shape[0] > round(0.5 * y.shape[0]):
+    if not suppress_warning and y.shape[0] > 20 and classes.shape[0] > round(0.5 * y.shape[0]):
         # Only raise the warning when we have at least 20 samples.
         warnings.warn(
             "The number of unique classes is greater than 50% of the number "

From 0b5cfc5ac59cb5f463ae9b8ddbde7175c1baf404 Mon Sep 17 00:00:00 2001
From: saskra
Date: Wed, 18 Jun 2025 15:19:53 +0200
Subject: [PATCH 02/13] Fix line length for E501 and ensure formatting with ruff

---
 sklearn/utils/multiclass.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py
index 1c97e403e5805..04ff7b41152d9 100644
--- a/sklearn/utils/multiclass.py
+++ b/sklearn/utils/multiclass.py
@@ -414,7 +414,11 @@ def _raise_or_return():
     if issparse(first_row_or_val):
         first_row_or_val = first_row_or_val.data
     classes = cached_unique(y)
-    if not suppress_warning and y.shape[0] > 20 and classes.shape[0] > round(0.5 * y.shape[0]):
+    if (
+        not suppress_warning
+        and y.shape[0] > 20
+        and classes.shape[0] > round(0.5 * y.shape[0])
+    ):
         # Only raise the warning when we have at least 20 samples.
         warnings.warn(
             "The number of unique classes is greater than 50% of the number "

From eb90ddd217a18264dea857d095648a30148fa94d Mon Sep 17 00:00:00 2001
From: saskra
Date: Wed, 2 Jul 2025 15:37:20 +0200
Subject: [PATCH 03/13] Fix test_type_of_target_too_many_unique_classes; add test_response_values_type_of_target_on_classes_no_warning; do not add suppress_warning

---
 sklearn/utils/_response.py | 2 +-
 sklearn/utils/multiclass.py | 8 ++------
 sklearn/utils/tests/test_multiclass.py | 2 +-
 sklearn/utils/tests/test_response.py | 21 +++++++++++++++++++++
 4 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/sklearn/utils/_response.py b/sklearn/utils/_response.py
index 0523bbaaed079..9003699d4351d 100644
--- a/sklearn/utils/_response.py
+++ b/sklearn/utils/_response.py
@@ -200,7 +200,7 @@ def _get_response_values(
     if is_classifier(estimator):
         prediction_method = _check_response_method(estimator, response_method)
         classes = estimator.classes_
-        target_type = type_of_target(classes, suppress_warning=True)
+        target_type = type_of_target(classes)

         if target_type in ("binary", "multiclass"):
             if pos_label is not None and pos_label not in classes.tolist():
diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py
index 04ff7b41152d9..d7c81a6f51624 100644
--- a/sklearn/utils/multiclass.py
+++ b/sklearn/utils/multiclass.py
@@ -225,7 +225,7 @@ def check_classification_targets(y):
         )


-def type_of_target(y, input_name="", raise_unknown=False, suppress_warning=False):
+def type_of_target(y, input_name="", raise_unknown=False):
     """Determine the type of data indicated by the target.

     Note that this type is the most specific type that can be inferred.
@@ -414,11 +414,7 @@ def _raise_or_return():
     if issparse(first_row_or_val):
         first_row_or_val = first_row_or_val.data
     classes = cached_unique(y)
-    if (
-        not suppress_warning
-        and y.shape[0] > 20
-        and classes.shape[0] > round(0.5 * y.shape[0])
-    ):
+    if y.shape[0] > 20 and y.shape[0] > classes.shape[0] > round(0.5 * y.shape[0]):
         # Only raise the warning when we have at least 20 samples.
         warnings.warn(
             "The number of unique classes is greater than 50% of the number "
diff --git a/sklearn/utils/tests/test_multiclass.py b/sklearn/utils/tests/test_multiclass.py
index 433e8118923fb..1ec7e6cfb46cd 100644
--- a/sklearn/utils/tests/test_multiclass.py
+++ b/sklearn/utils/tests/test_multiclass.py
@@ -302,7 +302,7 @@ def test_type_of_target_too_many_unique_classes():

     We need to check that we don't raise if we have less than 20 samples.
     """
-    y = np.arange(25)
+    y = np.concat((np.arange(20), [0]))
     msg = r"The number of unique classes is greater than 50% of the number of samples."
     with pytest.warns(UserWarning, match=msg):
         type_of_target(y)
diff --git a/sklearn/utils/tests/test_response.py b/sklearn/utils/tests/test_response.py
index 858c16cca4df1..7a80672fb8217 100644
--- a/sklearn/utils/tests/test_response.py
+++ b/sklearn/utils/tests/test_response.py
@@ -1,3 +1,5 @@
+import warnings
+
 import numpy as np
 import pytest

@@ -369,3 +371,22 @@ def test_get_response_values_multilabel_indicator(response_method):
         assert (y_pred > 1).sum() > 0
     else:  # response_method == "predict"
         assert np.logical_or(y_pred == 0, y_pred == 1).all()
+
+
+def test_response_values_type_of_target_on_classes_no_warning():
+    """
+    Ensure that _get_response_values doesn't raise the "unique classes > 50% of samples"
+    warning when calling `type_of_target(classes_)`.
+
+    non-regression test for issue #31583.
+    """
+    X = np.random.RandomState(0).randn(120, 3)
+    # 30 classes, less than 50% of number of samples
+    y = np.repeat(np.arange(30), 4)
+
+    clf = LogisticRegression().fit(X, y)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("error", UserWarning)
+
+        _get_response_values(clf, X, response_method="predict_proba")

From a58664908d24a13be3a8ad0c1c62675f5c10cc1d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
Date: Mon, 7 Jul 2025 11:34:35 +0200
Subject: [PATCH 04/13] use hstack instead

---
 sklearn/utils/tests/test_multiclass.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/utils/tests/test_multiclass.py b/sklearn/utils/tests/test_multiclass.py
index 1ec7e6cfb46cd..81c4dd08076ff 100644
--- a/sklearn/utils/tests/test_multiclass.py
+++ b/sklearn/utils/tests/test_multiclass.py
@@ -302,7 +302,7 @@ def test_type_of_target_too_many_unique_classes():

     We need to check that we don't raise if we have less than 20 samples.
     """
-    y = np.concat((np.arange(20), [0]))
+    y = np.hstack((np.arange(20), [0]))
     msg = r"The number of unique classes is greater than 50% of the number of samples."
     with pytest.warns(UserWarning, match=msg):
         type_of_target(y)

From 46bd286bbbb5088816d5612c3cbc31dcc60fba43 Mon Sep 17 00:00:00 2001
From: saskra
Date: Mon, 7 Jul 2025 15:59:11 +0200
Subject: [PATCH 05/13] Add changelog entry

---
 doc/whats_new/upcoming_changes/sklearn.utils/31584.fix.rst | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 doc/whats_new/upcoming_changes/sklearn.utils/31584.fix.rst

diff --git a/doc/whats_new/upcoming_changes/sklearn.utils/31584.fix.rst b/doc/whats_new/upcoming_changes/sklearn.utils/31584.fix.rst
new file mode 100644
index 0000000000000..0e2a4f6f0a236
--- /dev/null
+++ b/doc/whats_new/upcoming_changes/sklearn.utils/31584.fix.rst
@@ -0,0 +1,3 @@
+- Fixed a spurious warning in :func:`utils.multiclass.type_of_target` that could be triggered
+  when called with `estimator.classes_` in certain classification setups.
+  By :user:`Sascha D. Krauss `

From 0155642261d5fe3fbd1ad9b4a04ce59471715847 Mon Sep 17 00:00:00 2001
From: saskra
Date: Fri, 11 Jul 2025 08:36:58 +0200
Subject: [PATCH 06/13] Update sklearn/utils/tests/test_response.py

Co-authored-by: Lucy Liu
---
 sklearn/utils/tests/test_response.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/sklearn/utils/tests/test_response.py b/sklearn/utils/tests/test_response.py
index 7a80672fb8217..9404d0a20cddc 100644
--- a/sklearn/utils/tests/test_response.py
+++ b/sklearn/utils/tests/test_response.py
@@ -375,10 +375,12 @@ def test_get_response_values_multilabel_indicator(response_method):

 def test_response_values_type_of_target_on_classes_no_warning():
     """
-    Ensure that _get_response_values doesn't raise the "unique classes > 50% of samples"
-    warning when calling `type_of_target(classes_)`.
+    Ensure `_get_response_values` doesn't raise spurious warning.
+
+    "The number of unique classes is greater than 50% of samples"
+    warning should not be raised when calling `type_of_target(classes_)`.

-    non-regression test for issue #31583.
+    Non-regression test for issue #31583.
""" X = np.random.RandomState(0).randn(120, 3) # 30 classes, less than 50% of number of samples From cd60bf5a03c96e0d0f6387ac01d7dbc5f9eab6fd Mon Sep 17 00:00:00 2001 From: saskra Date: Fri, 11 Jul 2025 11:33:29 +0200 Subject: [PATCH 07/13] Longer changelog entry; third test in test_type_of_target_too_many_unique_classes plus explanatory comments --- .../sklearn.utils/31584.fix.rst | 15 +- sklearn/utils/tests/test_multiclass.py | 1010 +++++++++-------- 2 files changed, 522 insertions(+), 503 deletions(-) diff --git a/doc/whats_new/upcoming_changes/sklearn.utils/31584.fix.rst b/doc/whats_new/upcoming_changes/sklearn.utils/31584.fix.rst index 0e2a4f6f0a236..9e37883103377 100644 --- a/doc/whats_new/upcoming_changes/sklearn.utils/31584.fix.rst +++ b/doc/whats_new/upcoming_changes/sklearn.utils/31584.fix.rst @@ -1,3 +1,14 @@ -- Fixed a spurious warning in :func:`utils.multiclass.type_of_target` that could be triggered - when called with `estimator.classes_` in certain classification setups. +- Fixed a spurious warning that could occur when passing ``estimator.classes_`` + or similar arrays with many unique values to classification utilities. + The warning came from :func:`sklearn.utils.multiclass.type_of_target` and has + now been suppressed when the input is not a true target vector. + The warning message was: "The number of unique classes is greater than 50% + of the number of samples." + This could appear in tools that internally validate classification outputs, + such as :class:`~sklearn.model_selection.GridSearchCV`, + :func:`~sklearn.model_selection.cross_val_score`, + :func:`~sklearn.metrics.make_scorer`, + :class:`~sklearn.multioutput.MultiOutputClassifier`, + and :class:`~sklearn.calibration.CalibratedClassifierCV`. + By :user:`Sascha D. Krauss ` diff --git a/sklearn/utils/tests/test_multiclass.py b/sklearn/utils/tests/test_multiclass.py index 81c4dd08076ff..afbd42f4f9f8d 100644 --- a/sklearn/utils/tests/test_multiclass.py +++ b/sklearn/utils/tests/test_multiclass.py @@ -9,32 +9,32 @@ from sklearn.model_selection import ShuffleSplit from sklearn.svm import SVC from sklearn.utils._array_api import ( - _get_namespace_device_dtype_ids, - yield_namespace_device_dtype_combinations, + _get_namespace_device_dtype_ids, + yield_namespace_device_dtype_combinations, ) from sklearn.utils._testing import ( - _array_api_for_tests, - _convert_container, - assert_allclose, - assert_array_almost_equal, - assert_array_equal, + _array_api_for_tests, + _convert_container, + assert_allclose, + assert_array_almost_equal, + assert_array_equal, ) from sklearn.utils.estimator_checks import _NotAnArray from sklearn.utils.fixes import ( - COO_CONTAINERS, - CSC_CONTAINERS, - CSR_CONTAINERS, - DOK_CONTAINERS, - LIL_CONTAINERS, + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + DOK_CONTAINERS, + LIL_CONTAINERS, ) from sklearn.utils.metaestimators import _safe_split from sklearn.utils.multiclass import ( - _ovr_decision_function, - check_classification_targets, - class_distribution, - is_multilabel, - type_of_target, - unique_labels, + _ovr_decision_function, + check_classification_targets, + class_distribution, + is_multilabel, + type_of_target, + unique_labels, ) multilabel_explicit_zero = np.array([[0, 1], [1, 0]]) @@ -42,592 +42,600 @@ def _generate_sparse( - data, - sparse_containers=tuple( - COO_CONTAINERS - + CSC_CONTAINERS - + CSR_CONTAINERS - + DOK_CONTAINERS - + LIL_CONTAINERS - ), - dtypes=(bool, int, np.int8, np.uint8, float, np.float32), + data, + sparse_containers=tuple( + COO_CONTAINERS + + 
CSC_CONTAINERS + + CSR_CONTAINERS + + DOK_CONTAINERS + + LIL_CONTAINERS + ), + dtypes=(bool, int, np.int8, np.uint8, float, np.float32), ): - return [ - sparse_container(data, dtype=dtype) - for sparse_container in sparse_containers - for dtype in dtypes - ] + return [ + sparse_container(data, dtype=dtype) + for sparse_container in sparse_containers + for dtype in dtypes + ] EXAMPLES = { - "multilabel-indicator": [ - # valid when the data is formatted as sparse or dense, identified - # by CSR format when the testing takes place - *_generate_sparse( - np.random.RandomState(42).randint(2, size=(10, 10)), - sparse_containers=CSR_CONTAINERS, - dtypes=(int,), - ), - [[0, 1], [1, 0]], - [[0, 1]], - *_generate_sparse( - multilabel_explicit_zero, sparse_containers=CSC_CONTAINERS, dtypes=(int,) - ), - *_generate_sparse([[0, 1], [1, 0]]), - *_generate_sparse([[0, 0], [0, 0]]), - *_generate_sparse([[0, 1]]), - # Only valid when data is dense - [[-1, 1], [1, -1]], - np.array([[-1, 1], [1, -1]]), - np.array([[-3, 3], [3, -3]]), - _NotAnArray(np.array([[-3, 3], [3, -3]])), - ], - "multiclass": [ - [1, 0, 2, 2, 1, 4, 2, 4, 4, 4], - np.array([1, 0, 2]), - np.array([1, 0, 2], dtype=np.int8), - np.array([1, 0, 2], dtype=np.uint8), - np.array([1, 0, 2], dtype=float), - np.array([1, 0, 2], dtype=np.float32), - np.array([[1], [0], [2]]), - _NotAnArray(np.array([1, 0, 2])), - [0, 1, 2], - ["a", "b", "c"], - np.array(["a", "b", "c"]), - np.array(["a", "b", "c"], dtype=object), - np.array(["a", "b", "c"], dtype=object), - ], - "multiclass-multioutput": [ - [[1, 0, 2, 2], [1, 4, 2, 4]], - [["a", "b"], ["c", "d"]], - np.array([[1, 0, 2, 2], [1, 4, 2, 4]]), - np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.int8), - np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8), - np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=float), - np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32), - *_generate_sparse( - [[1, 0, 2, 2], [1, 4, 2, 4]], - sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS, - dtypes=(int, np.int8, np.uint8, float, np.float32), - ), - np.array([["a", "b"], ["c", "d"]]), - np.array([["a", "b"], ["c", "d"]]), - np.array([["a", "b"], ["c", "d"]], dtype=object), - np.array([[1, 0, 2]]), - _NotAnArray(np.array([[1, 0, 2]])), - ], - "binary": [ - [0, 1], - [1, 1], - [], - [0], - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]), - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=bool), - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.int8), - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.uint8), - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=float), - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float32), - np.array([[0], [1]]), - _NotAnArray(np.array([[0], [1]])), - [1, -1], - [3, 5], - ["a"], - ["a", "b"], - ["abc", "def"], - np.array(["abc", "def"]), - ["a", "b"], - np.array(["abc", "def"], dtype=object), - ], - "continuous": [ - [1e-5], - [0, 0.5], - np.array([[0], [0.5]]), - np.array([[0], [0.5]], dtype=np.float32), - ], - "continuous-multioutput": [ - np.array([[0, 0.5], [0.5, 0]]), - np.array([[0, 0.5], [0.5, 0]], dtype=np.float32), - np.array([[0, 0.5]]), - *_generate_sparse( - [[0, 0.5], [0.5, 0]], - sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS, - dtypes=(float, np.float32), - ), - *_generate_sparse( - [[0, 0.5]], - sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS, - dtypes=(float, np.float32), - ), - ], - "unknown": [ - [[]], - np.array([[]], dtype=object), - [()], - # sequence of sequences that weren't supported even before deprecation - np.array([np.array([]), np.array([1, 2, 3])], 
dtype=object), - [np.array([]), np.array([1, 2, 3])], - [{1, 2, 3}, {1, 2}], - [frozenset([1, 2, 3]), frozenset([1, 2])], - # and also confusable as sequences of sequences - [{0: "a", 1: "b"}, {0: "a"}], - # ndim 0 - np.array(0), - # empty second dimension - np.array([[], []]), - # 3d - np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]), - ], + "multilabel-indicator": [ + # valid when the data is formatted as sparse or dense, identified + # by CSR format when the testing takes place + *_generate_sparse( + np.random.RandomState(42).randint(2, size=(10, 10)), + sparse_containers=CSR_CONTAINERS, + dtypes=(int,), + ), + [[0, 1], [1, 0]], + [[0, 1]], + *_generate_sparse( + multilabel_explicit_zero, sparse_containers=CSC_CONTAINERS, dtypes=(int,) + ), + *_generate_sparse([[0, 1], [1, 0]]), + *_generate_sparse([[0, 0], [0, 0]]), + *_generate_sparse([[0, 1]]), + # Only valid when data is dense + [[-1, 1], [1, -1]], + np.array([[-1, 1], [1, -1]]), + np.array([[-3, 3], [3, -3]]), + _NotAnArray(np.array([[-3, 3], [3, -3]])), + ], + "multiclass": [ + [1, 0, 2, 2, 1, 4, 2, 4, 4, 4], + np.array([1, 0, 2]), + np.array([1, 0, 2], dtype=np.int8), + np.array([1, 0, 2], dtype=np.uint8), + np.array([1, 0, 2], dtype=float), + np.array([1, 0, 2], dtype=np.float32), + np.array([[1], [0], [2]]), + _NotAnArray(np.array([1, 0, 2])), + [0, 1, 2], + ["a", "b", "c"], + np.array(["a", "b", "c"]), + np.array(["a", "b", "c"], dtype=object), + np.array(["a", "b", "c"], dtype=object), + ], + "multiclass-multioutput": [ + [[1, 0, 2, 2], [1, 4, 2, 4]], + [["a", "b"], ["c", "d"]], + np.array([[1, 0, 2, 2], [1, 4, 2, 4]]), + np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.int8), + np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8), + np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=float), + np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32), + *_generate_sparse( + [[1, 0, 2, 2], [1, 4, 2, 4]], + sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS, + dtypes=(int, np.int8, np.uint8, float, np.float32), + ), + np.array([["a", "b"], ["c", "d"]]), + np.array([["a", "b"], ["c", "d"]]), + np.array([["a", "b"], ["c", "d"]], dtype=object), + np.array([[1, 0, 2]]), + _NotAnArray(np.array([[1, 0, 2]])), + ], + "binary": [ + [0, 1], + [1, 1], + [], + [0], + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=bool), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.int8), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.uint8), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=float), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float32), + np.array([[0], [1]]), + _NotAnArray(np.array([[0], [1]])), + [1, -1], + [3, 5], + ["a"], + ["a", "b"], + ["abc", "def"], + np.array(["abc", "def"]), + ["a", "b"], + np.array(["abc", "def"], dtype=object), + ], + "continuous": [ + [1e-5], + [0, 0.5], + np.array([[0], [0.5]]), + np.array([[0], [0.5]], dtype=np.float32), + ], + "continuous-multioutput": [ + np.array([[0, 0.5], [0.5, 0]]), + np.array([[0, 0.5], [0.5, 0]], dtype=np.float32), + np.array([[0, 0.5]]), + *_generate_sparse( + [[0, 0.5], [0.5, 0]], + sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS, + dtypes=(float, np.float32), + ), + *_generate_sparse( + [[0, 0.5]], + sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS, + dtypes=(float, np.float32), + ), + ], + "unknown": [ + [[]], + np.array([[]], dtype=object), + [()], + # sequence of sequences that weren't supported even before deprecation + np.array([np.array([]), np.array([1, 2, 3])], dtype=object), + [np.array([]), np.array([1, 2, 3])], + 
[{1, 2, 3}, {1, 2}], + [frozenset([1, 2, 3]), frozenset([1, 2])], + # and also confusable as sequences of sequences + [{0: "a", 1: "b"}, {0: "a"}], + # ndim 0 + np.array(0), + # empty second dimension + np.array([[], []]), + # 3d + np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]), + ], } ARRAY_API_EXAMPLES = { - "multilabel-indicator": [ - np.random.RandomState(42).randint(2, size=(10, 10)), - [[0, 1], [1, 0]], - [[0, 1]], - multilabel_explicit_zero, - [[0, 0], [0, 0]], - [[-1, 1], [1, -1]], - np.array([[-1, 1], [1, -1]]), - np.array([[-3, 3], [3, -3]]), - _NotAnArray(np.array([[-3, 3], [3, -3]])), - ], - "multiclass": [ - [1, 0, 2, 2, 1, 4, 2, 4, 4, 4], - np.array([1, 0, 2]), - np.array([1, 0, 2], dtype=np.int8), - np.array([1, 0, 2], dtype=np.uint8), - np.array([1, 0, 2], dtype=float), - np.array([1, 0, 2], dtype=np.float32), - np.array([[1], [0], [2]]), - _NotAnArray(np.array([1, 0, 2])), - [0, 1, 2], - ], - "multiclass-multioutput": [ - [[1, 0, 2, 2], [1, 4, 2, 4]], - np.array([[1, 0, 2, 2], [1, 4, 2, 4]]), - np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.int8), - np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8), - np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=float), - np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32), - np.array([[1, 0, 2]]), - _NotAnArray(np.array([[1, 0, 2]])), - ], - "binary": [ - [0, 1], - [1, 1], - [], - [0], - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]), - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=bool), - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.int8), - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.uint8), - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=float), - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float32), - np.array([[0], [1]]), - _NotAnArray(np.array([[0], [1]])), - [1, -1], - [3, 5], - ], - "continuous": [ - [1e-5], - [0, 0.5], - np.array([[0], [0.5]]), - np.array([[0], [0.5]], dtype=np.float32), - ], - "continuous-multioutput": [ - np.array([[0, 0.5], [0.5, 0]]), - np.array([[0, 0.5], [0.5, 0]], dtype=np.float32), - np.array([[0, 0.5]]), - ], - "unknown": [ - [[]], - [()], - np.array(0), - np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]), - ], + "multilabel-indicator": [ + np.random.RandomState(42).randint(2, size=(10, 10)), + [[0, 1], [1, 0]], + [[0, 1]], + multilabel_explicit_zero, + [[0, 0], [0, 0]], + [[-1, 1], [1, -1]], + np.array([[-1, 1], [1, -1]]), + np.array([[-3, 3], [3, -3]]), + _NotAnArray(np.array([[-3, 3], [3, -3]])), + ], + "multiclass": [ + [1, 0, 2, 2, 1, 4, 2, 4, 4, 4], + np.array([1, 0, 2]), + np.array([1, 0, 2], dtype=np.int8), + np.array([1, 0, 2], dtype=np.uint8), + np.array([1, 0, 2], dtype=float), + np.array([1, 0, 2], dtype=np.float32), + np.array([[1], [0], [2]]), + _NotAnArray(np.array([1, 0, 2])), + [0, 1, 2], + ], + "multiclass-multioutput": [ + [[1, 0, 2, 2], [1, 4, 2, 4]], + np.array([[1, 0, 2, 2], [1, 4, 2, 4]]), + np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.int8), + np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8), + np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=float), + np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32), + np.array([[1, 0, 2]]), + _NotAnArray(np.array([[1, 0, 2]])), + ], + "binary": [ + [0, 1], + [1, 1], + [], + [0], + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=bool), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.int8), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.uint8), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=float), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float32), 
+ np.array([[0], [1]]), + _NotAnArray(np.array([[0], [1]])), + [1, -1], + [3, 5], + ], + "continuous": [ + [1e-5], + [0, 0.5], + np.array([[0], [0.5]]), + np.array([[0], [0.5]], dtype=np.float32), + ], + "continuous-multioutput": [ + np.array([[0, 0.5], [0.5, 0]]), + np.array([[0, 0.5], [0.5, 0]], dtype=np.float32), + np.array([[0, 0.5]]), + ], + "unknown": [ + [[]], + [()], + np.array(0), + np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]), + ], } - NON_ARRAY_LIKE_EXAMPLES = [ - {1, 2, 3}, - {0: "a", 1: "b"}, - {0: [5], 1: [5]}, - "abc", - frozenset([1, 2, 3]), - None, + {1, 2, 3}, + {0: "a", 1: "b"}, + {0: [5], 1: [5]}, + "abc", + frozenset([1, 2, 3]), + None, ] MULTILABEL_SEQUENCES = [ - [[1], [2], [0, 1]], - [(), (2), (0, 1)], - np.array([[], [1, 2]], dtype="object"), - _NotAnArray(np.array([[], [1, 2]], dtype="object")), + [[1], [2], [0, 1]], + [(), (2), (0, 1)], + np.array([[], [1, 2]], dtype="object"), + _NotAnArray(np.array([[], [1, 2]], dtype="object")), ] def test_unique_labels(): - # Empty iterable - with pytest.raises(ValueError): - unique_labels() + # Empty iterable + with pytest.raises(ValueError): + unique_labels() - # Multiclass problem - assert_array_equal(unique_labels(range(10)), np.arange(10)) - assert_array_equal(unique_labels(np.arange(10)), np.arange(10)) - assert_array_equal(unique_labels([4, 0, 2]), np.array([0, 2, 4])) + # Multiclass problem + assert_array_equal(unique_labels(range(10)), np.arange(10)) + assert_array_equal(unique_labels(np.arange(10)), np.arange(10)) + assert_array_equal(unique_labels([4, 0, 2]), np.array([0, 2, 4])) - # Multilabel indicator - assert_array_equal( - unique_labels(np.array([[0, 0, 1], [1, 0, 1], [0, 0, 0]])), np.arange(3) - ) + # Multilabel indicator + assert_array_equal( + unique_labels(np.array([[0, 0, 1], [1, 0, 1], [0, 0, 0]])), np.arange(3) + ) - assert_array_equal(unique_labels(np.array([[0, 0, 1], [0, 0, 0]])), np.arange(3)) + assert_array_equal(unique_labels(np.array([[0, 0, 1], [0, 0, 0]])), np.arange(3)) - # Several arrays passed - assert_array_equal(unique_labels([4, 0, 2], range(5)), np.arange(5)) - assert_array_equal(unique_labels((0, 1, 2), (0,), (2, 1)), np.arange(3)) + # Several arrays passed + assert_array_equal(unique_labels([4, 0, 2], range(5)), np.arange(5)) + assert_array_equal(unique_labels((0, 1, 2), (0,), (2, 1)), np.arange(3)) - # Border line case with binary indicator matrix - with pytest.raises(ValueError): - unique_labels([4, 0, 2], np.ones((5, 5))) - with pytest.raises(ValueError): - unique_labels(np.ones((5, 4)), np.ones((5, 5))) + # Border line case with binary indicator matrix + with pytest.raises(ValueError): + unique_labels([4, 0, 2], np.ones((5, 5))) + with pytest.raises(ValueError): + unique_labels(np.ones((5, 4)), np.ones((5, 5))) - assert_array_equal(unique_labels(np.ones((4, 5)), np.ones((5, 5))), np.arange(5)) + assert_array_equal(unique_labels(np.ones((4, 5)), np.ones((5, 5))), np.arange(5)) def test_type_of_target_too_many_unique_classes(): - """Check that we raise a warning when the number of unique classes is greater than - 50% of the number of samples. + """Check that we raise a warning when the number of unique classes is greater than + 50% of the number of samples. + + We need to check that we don't raise if we have less than 20 samples. + """ - We need to check that we don't raise if we have less than 20 samples. - """ + # Create a label array where each class appears only once, except '0' appears twice. 
+ # This simulates misuse of `type_of_target` with unique class labels, + # ensuring it doesn't raise a warning in such cases. + y = np.hstack((np.arange(20), [0])) + msg = r"The number of unique classes is greater than 50% of the number of samples." + with pytest.warns(UserWarning, match=msg): + type_of_target(y) - y = np.hstack((np.arange(20), [0])) - msg = r"The number of unique classes is greater than 50% of the number of samples." - with pytest.warns(UserWarning, match=msg): - type_of_target(y) + # less than 20 samples, no warning should be raised + y = np.arange(10) + with warnings.catch_warnings(): + warnings.simplefilter("error") + type_of_target(y) - # less than 20 samples, no warning should be raised - y = np.arange(10) - with warnings.catch_warnings(): - warnings.simplefilter("error") - type_of_target(y) + # More than 20 samples but only unique classes, no warning should be raised + y = np.arange(25) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + type_of_target(y) def test_unique_labels_non_specific(): - # Test unique_labels with a variety of collected examples + # Test unique_labels with a variety of collected examples - # Smoke test for all supported format - for format in ["binary", "multiclass", "multilabel-indicator"]: - for y in EXAMPLES[format]: - unique_labels(y) + # Smoke test for all supported format + for format in ["binary", "multiclass", "multilabel-indicator"]: + for y in EXAMPLES[format]: + unique_labels(y) - # We don't support those format at the moment - for example in NON_ARRAY_LIKE_EXAMPLES: - with pytest.raises(ValueError): - unique_labels(example) + # We don't support those format at the moment + for example in NON_ARRAY_LIKE_EXAMPLES: + with pytest.raises(ValueError): + unique_labels(example) - for y_type in [ - "unknown", - "continuous", - "continuous-multioutput", - "multiclass-multioutput", - ]: - for example in EXAMPLES[y_type]: - with pytest.raises(ValueError): - unique_labels(example) + for y_type in [ + "unknown", + "continuous", + "continuous-multioutput", + "multiclass-multioutput", + ]: + for example in EXAMPLES[y_type]: + with pytest.raises(ValueError): + unique_labels(example) def test_unique_labels_mixed_types(): - # Mix with binary or multiclass and multilabel - mix_clf_format = product( - EXAMPLES["multilabel-indicator"], EXAMPLES["multiclass"] + EXAMPLES["binary"] - ) + # Mix with binary or multiclass and multilabel + mix_clf_format = product( + EXAMPLES["multilabel-indicator"], EXAMPLES["multiclass"] + EXAMPLES["binary"] + ) - for y_multilabel, y_multiclass in mix_clf_format: - with pytest.raises(ValueError): - unique_labels(y_multiclass, y_multilabel) - with pytest.raises(ValueError): - unique_labels(y_multilabel, y_multiclass) + for y_multilabel, y_multiclass in mix_clf_format: + with pytest.raises(ValueError): + unique_labels(y_multiclass, y_multilabel) + with pytest.raises(ValueError): + unique_labels(y_multilabel, y_multiclass) - with pytest.raises(ValueError): - unique_labels([[1, 2]], [["a", "d"]]) + with pytest.raises(ValueError): + unique_labels([[1, 2]], [["a", "d"]]) - with pytest.raises(ValueError): - unique_labels(["1", 2]) + with pytest.raises(ValueError): + unique_labels(["1", 2]) - with pytest.raises(ValueError): - unique_labels([["1", 2], [1, 3]]) + with pytest.raises(ValueError): + unique_labels([["1", 2], [1, 3]]) - with pytest.raises(ValueError): - unique_labels([["1", "2"], [2, 3]]) + with pytest.raises(ValueError): + unique_labels([["1", "2"], [2, 3]]) def test_is_multilabel(): - for 
group, group_examples in EXAMPLES.items(): - dense_exp = group == "multilabel-indicator" - - for example in group_examples: - # Only mark explicitly defined sparse examples as valid sparse - # multilabel-indicators - sparse_exp = dense_exp and issparse(example) - - if issparse(example) or ( - hasattr(example, "__array__") - and np.asarray(example).ndim == 2 - and np.asarray(example).dtype.kind in "biuf" - and np.asarray(example).shape[1] > 0 - ): - examples_sparse = [ - sparse_container(example) - for sparse_container in ( - COO_CONTAINERS - + CSC_CONTAINERS - + CSR_CONTAINERS - + DOK_CONTAINERS - + LIL_CONTAINERS - ) - ] - for exmpl_sparse in examples_sparse: - assert sparse_exp == is_multilabel(exmpl_sparse), ( - f"is_multilabel({exmpl_sparse!r}) should be {sparse_exp}" - ) - - # Densify sparse examples before testing - if issparse(example): - example = example.toarray() - - assert dense_exp == is_multilabel(example), ( - f"is_multilabel({example!r}) should be {dense_exp}" - ) + for group, group_examples in EXAMPLES.items(): + dense_exp = group == "multilabel-indicator" + + for example in group_examples: + # Only mark explicitly defined sparse examples as valid sparse + # multilabel-indicators + sparse_exp = dense_exp and issparse(example) + + if issparse(example) or ( + hasattr(example, "__array__") + and np.asarray(example).ndim == 2 + and np.asarray(example).dtype.kind in "biuf" + and np.asarray(example).shape[1] > 0 + ): + examples_sparse = [ + sparse_container(example) + for sparse_container in ( + COO_CONTAINERS + + CSC_CONTAINERS + + CSR_CONTAINERS + + DOK_CONTAINERS + + LIL_CONTAINERS + ) + ] + for exmpl_sparse in examples_sparse: + assert sparse_exp == is_multilabel(exmpl_sparse), ( + f"is_multilabel({exmpl_sparse!r}) should be {sparse_exp}" + ) + + # Densify sparse examples before testing + if issparse(example): + example = example.toarray() + + assert dense_exp == is_multilabel(example), ( + f"is_multilabel({example!r}) should be {dense_exp}" + ) @pytest.mark.parametrize( - "array_namespace, device, dtype_name", - yield_namespace_device_dtype_combinations(), - ids=_get_namespace_device_dtype_ids, + "array_namespace, device, dtype_name", + yield_namespace_device_dtype_combinations(), + ids=_get_namespace_device_dtype_ids, ) def test_is_multilabel_array_api_compliance(array_namespace, device, dtype_name): - xp = _array_api_for_tests(array_namespace, device) + xp = _array_api_for_tests(array_namespace, device) - for group, group_examples in ARRAY_API_EXAMPLES.items(): - dense_exp = group == "multilabel-indicator" - for example in group_examples: - if np.asarray(example).dtype.kind == "f": - example = np.asarray(example, dtype=dtype_name) - else: - example = np.asarray(example) - example = xp.asarray(example, device=device) + for group, group_examples in ARRAY_API_EXAMPLES.items(): + dense_exp = group == "multilabel-indicator" + for example in group_examples: + if np.asarray(example).dtype.kind == "f": + example = np.asarray(example, dtype=dtype_name) + else: + example = np.asarray(example) + example = xp.asarray(example, device=device) - with config_context(array_api_dispatch=True): - assert dense_exp == is_multilabel(example), ( - f"is_multilabel({example!r}) should be {dense_exp}" - ) + with config_context(array_api_dispatch=True): + assert dense_exp == is_multilabel(example), ( + f"is_multilabel({example!r}) should be {dense_exp}" + ) def test_check_classification_targets(): - for y_type in EXAMPLES.keys(): - if y_type in ["unknown", "continuous", "continuous-multioutput"]: - 
for example in EXAMPLES[y_type]: - msg = "Unknown label type: " - with pytest.raises(ValueError, match=msg): - check_classification_targets(example) - else: - for example in EXAMPLES[y_type]: - check_classification_targets(example) + for y_type in EXAMPLES.keys(): + if y_type in ["unknown", "continuous", "continuous-multioutput"]: + for example in EXAMPLES[y_type]: + msg = "Unknown label type: " + with pytest.raises(ValueError, match=msg): + check_classification_targets(example) + else: + for example in EXAMPLES[y_type]: + check_classification_targets(example) def test_type_of_target(): - for group, group_examples in EXAMPLES.items(): - for example in group_examples: - assert type_of_target(example) == group, ( - "type_of_target(%r) should be %r, got %r" - % ( - example, - group, - type_of_target(example), - ) - ) - - for example in NON_ARRAY_LIKE_EXAMPLES: - msg_regex = r"Expected array-like \(array or non-string sequence\).*" - with pytest.raises(ValueError, match=msg_regex): - type_of_target(example) - - for example in MULTILABEL_SEQUENCES: - msg = ( - "You appear to be using a legacy multi-label data " - "representation. Sequence of sequences are no longer supported;" - " use a binary array or sparse matrix instead." - ) - with pytest.raises(ValueError, match=msg): - type_of_target(example) + for group, group_examples in EXAMPLES.items(): + for example in group_examples: + assert type_of_target(example) == group, ( + "type_of_target(%r) should be %r, got %r" + % ( + example, + group, + type_of_target(example), + ) + ) + + for example in NON_ARRAY_LIKE_EXAMPLES: + msg_regex = r"Expected array-like \(array or non-string sequence\).*" + with pytest.raises(ValueError, match=msg_regex): + type_of_target(example) + + for example in MULTILABEL_SEQUENCES: + msg = ( + "You appear to be using a legacy multi-label data " + "representation. Sequence of sequences are no longer supported;" + " use a binary array or sparse matrix instead." 
+ ) + with pytest.raises(ValueError, match=msg): + type_of_target(example) def test_type_of_target_pandas_sparse(): - pd = pytest.importorskip("pandas") + pd = pytest.importorskip("pandas") - y = pd.arrays.SparseArray([1, np.nan, np.nan, 1, np.nan]) - msg = "y cannot be class 'SparseSeries' or 'SparseArray'" - with pytest.raises(ValueError, match=msg): - type_of_target(y) + y = pd.arrays.SparseArray([1, np.nan, np.nan, 1, np.nan]) + msg = "y cannot be class 'SparseSeries' or 'SparseArray'" + with pytest.raises(ValueError, match=msg): + type_of_target(y) def test_type_of_target_pandas_nullable(): - """Check that type_of_target works with pandas nullable dtypes.""" - pd = pytest.importorskip("pandas") + """Check that type_of_target works with pandas nullable dtypes.""" + pd = pytest.importorskip("pandas") - for dtype in ["Int32", "Float32"]: - y_true = pd.Series([1, 0, 2, 3, 4], dtype=dtype) - assert type_of_target(y_true) == "multiclass" + for dtype in ["Int32", "Float32"]: + y_true = pd.Series([1, 0, 2, 3, 4], dtype=dtype) + assert type_of_target(y_true) == "multiclass" - y_true = pd.Series([1, 0, 1, 0], dtype=dtype) - assert type_of_target(y_true) == "binary" + y_true = pd.Series([1, 0, 1, 0], dtype=dtype) + assert type_of_target(y_true) == "binary" - y_true = pd.DataFrame([[1.4, 3.1], [3.1, 1.4]], dtype="Float32") - assert type_of_target(y_true) == "continuous-multioutput" + y_true = pd.DataFrame([[1.4, 3.1], [3.1, 1.4]], dtype="Float32") + assert type_of_target(y_true) == "continuous-multioutput" - y_true = pd.DataFrame([[0, 1], [1, 1]], dtype="Int32") - assert type_of_target(y_true) == "multilabel-indicator" + y_true = pd.DataFrame([[0, 1], [1, 1]], dtype="Int32") + assert type_of_target(y_true) == "multilabel-indicator" - y_true = pd.DataFrame([[1, 2], [3, 1]], dtype="Int32") - assert type_of_target(y_true) == "multiclass-multioutput" + y_true = pd.DataFrame([[1, 2], [3, 1]], dtype="Int32") + assert type_of_target(y_true) == "multiclass-multioutput" @pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) def test_unique_labels_pandas_nullable(dtype): - """Checks that unique_labels work with pandas nullable dtypes. + """Checks that unique_labels work with pandas nullable dtypes. - Non-regression test for gh-25634. - """ - pd = pytest.importorskip("pandas") + Non-regression test for gh-25634. 
+ """ + pd = pytest.importorskip("pandas") - y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype) - y_predicted = pd.Series([0, 0, 1, 1, 0, 1, 1, 1, 1], dtype="int64") + y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype) + y_predicted = pd.Series([0, 0, 1, 1, 0, 1, 1, 1, 1], dtype="int64") - labels = unique_labels(y_true, y_predicted) - assert_array_equal(labels, [0, 1]) + labels = unique_labels(y_true, y_predicted) + assert_array_equal(labels, [0, 1]) @pytest.mark.parametrize("csc_container", CSC_CONTAINERS) def test_class_distribution(csc_container): - y = np.array( - [ - [1, 0, 0, 1], - [2, 2, 0, 1], - [1, 3, 0, 1], - [4, 2, 0, 1], - [2, 0, 0, 1], - [1, 3, 0, 1], - ] - ) - # Define the sparse matrix with a mix of implicit and explicit zeros - data = np.array([1, 2, 1, 4, 2, 1, 0, 2, 3, 2, 3, 1, 1, 1, 1, 1, 1]) - indices = np.array([0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 5, 0, 1, 2, 3, 4, 5]) - indptr = np.array([0, 6, 11, 11, 17]) - y_sp = csc_container((data, indices, indptr), shape=(6, 4)) - - classes, n_classes, class_prior = class_distribution(y) - classes_sp, n_classes_sp, class_prior_sp = class_distribution(y_sp) - classes_expected = [[1, 2, 4], [0, 2, 3], [0], [1]] - n_classes_expected = [3, 3, 1, 1] - class_prior_expected = [[3 / 6, 2 / 6, 1 / 6], [1 / 3, 1 / 3, 1 / 3], [1.0], [1.0]] - - for k in range(y.shape[1]): - assert_array_almost_equal(classes[k], classes_expected[k]) - assert_array_almost_equal(n_classes[k], n_classes_expected[k]) - assert_array_almost_equal(class_prior[k], class_prior_expected[k]) - - assert_array_almost_equal(classes_sp[k], classes_expected[k]) - assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k]) - assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k]) - - # Test again with explicit sample weights - (classes, n_classes, class_prior) = class_distribution( - y, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0] - ) - (classes_sp, n_classes_sp, class_prior_sp) = class_distribution( - y, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0] - ) - class_prior_expected = [[4 / 9, 3 / 9, 2 / 9], [2 / 9, 4 / 9, 3 / 9], [1.0], [1.0]] - - for k in range(y.shape[1]): - assert_array_almost_equal(classes[k], classes_expected[k]) - assert_array_almost_equal(n_classes[k], n_classes_expected[k]) - assert_array_almost_equal(class_prior[k], class_prior_expected[k]) - - assert_array_almost_equal(classes_sp[k], classes_expected[k]) - assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k]) - assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k]) + y = np.array( + [ + [1, 0, 0, 1], + [2, 2, 0, 1], + [1, 3, 0, 1], + [4, 2, 0, 1], + [2, 0, 0, 1], + [1, 3, 0, 1], + ] + ) + # Define the sparse matrix with a mix of implicit and explicit zeros + data = np.array([1, 2, 1, 4, 2, 1, 0, 2, 3, 2, 3, 1, 1, 1, 1, 1, 1]) + indices = np.array([0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 5, 0, 1, 2, 3, 4, 5]) + indptr = np.array([0, 6, 11, 11, 17]) + y_sp = csc_container((data, indices, indptr), shape=(6, 4)) + + classes, n_classes, class_prior = class_distribution(y) + classes_sp, n_classes_sp, class_prior_sp = class_distribution(y_sp) + classes_expected = [[1, 2, 4], [0, 2, 3], [0], [1]] + n_classes_expected = [3, 3, 1, 1] + class_prior_expected = [[3 / 6, 2 / 6, 1 / 6], [1 / 3, 1 / 3, 1 / 3], [1.0], [1.0]] + + for k in range(y.shape[1]): + assert_array_almost_equal(classes[k], classes_expected[k]) + assert_array_almost_equal(n_classes[k], n_classes_expected[k]) + assert_array_almost_equal(class_prior[k], class_prior_expected[k]) + + assert_array_almost_equal(classes_sp[k], 
classes_expected[k]) + assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k]) + assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k]) + + # Test again with explicit sample weights + (classes, n_classes, class_prior) = class_distribution( + y, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0] + ) + (classes_sp, n_classes_sp, class_prior_sp) = class_distribution( + y, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0] + ) + class_prior_expected = [[4 / 9, 3 / 9, 2 / 9], [2 / 9, 4 / 9, 3 / 9], [1.0], [1.0]] + + for k in range(y.shape[1]): + assert_array_almost_equal(classes[k], classes_expected[k]) + assert_array_almost_equal(n_classes[k], n_classes_expected[k]) + assert_array_almost_equal(class_prior[k], class_prior_expected[k]) + + assert_array_almost_equal(classes_sp[k], classes_expected[k]) + assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k]) + assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k]) def test_safe_split_with_precomputed_kernel(): - clf = SVC() - clfp = SVC(kernel="precomputed") + clf = SVC() + clfp = SVC(kernel="precomputed") - iris = datasets.load_iris() - X, y = iris.data, iris.target - K = np.dot(X, X.T) + iris = datasets.load_iris() + X, y = iris.data, iris.target + K = np.dot(X, X.T) - cv = ShuffleSplit(test_size=0.25, random_state=0) - train, test = next(iter(cv.split(X))) + cv = ShuffleSplit(test_size=0.25, random_state=0) + train, test = next(iter(cv.split(X))) - X_train, y_train = _safe_split(clf, X, y, train) - K_train, y_train2 = _safe_split(clfp, K, y, train) - assert_array_almost_equal(K_train, np.dot(X_train, X_train.T)) - assert_array_almost_equal(y_train, y_train2) + X_train, y_train = _safe_split(clf, X, y, train) + K_train, y_train2 = _safe_split(clfp, K, y, train) + assert_array_almost_equal(K_train, np.dot(X_train, X_train.T)) + assert_array_almost_equal(y_train, y_train2) - X_test, y_test = _safe_split(clf, X, y, test, train) - K_test, y_test2 = _safe_split(clfp, K, y, test, train) - assert_array_almost_equal(K_test, np.dot(X_test, X_train.T)) - assert_array_almost_equal(y_test, y_test2) + X_test, y_test = _safe_split(clf, X, y, test, train) + K_test, y_test2 = _safe_split(clfp, K, y, test, train) + assert_array_almost_equal(K_test, np.dot(X_test, X_train.T)) + assert_array_almost_equal(y_test, y_test2) def test_ovr_decision_function(): - # test properties for ovr decision function + # test properties for ovr decision function - predictions = np.array([[0, 1, 1], [0, 1, 0], [0, 1, 1], [0, 1, 1]]) + predictions = np.array([[0, 1, 1], [0, 1, 0], [0, 1, 1], [0, 1, 1]]) - confidences = np.array( - [[-1e16, 0, -1e16], [1.0, 2.0, -3.0], [-5.0, 2.0, 5.0], [-0.5, 0.2, 0.5]] - ) + confidences = np.array( + [[-1e16, 0, -1e16], [1.0, 2.0, -3.0], [-5.0, 2.0, 5.0], [-0.5, 0.2, 0.5]] + ) - n_classes = 3 + n_classes = 3 - dec_values = _ovr_decision_function(predictions, confidences, n_classes) + dec_values = _ovr_decision_function(predictions, confidences, n_classes) - # check that the decision values are within 0.5 range of the votes - votes = np.array([[1, 0, 2], [1, 1, 1], [1, 0, 2], [1, 0, 2]]) + # check that the decision values are within 0.5 range of the votes + votes = np.array([[1, 0, 2], [1, 1, 1], [1, 0, 2], [1, 0, 2]]) - assert_allclose(votes, dec_values, atol=0.5) + assert_allclose(votes, dec_values, atol=0.5) - # check that the prediction are what we expect - # highest vote or highest confidence if there is a tie. 
- # for the second sample we have a tie (should be won by 1) - expected_prediction = np.array([2, 1, 2, 2]) - assert_array_equal(np.argmax(dec_values, axis=1), expected_prediction) + # check that the prediction are what we expect + # highest vote or highest confidence if there is a tie. + # for the second sample we have a tie (should be won by 1) + expected_prediction = np.array([2, 1, 2, 2]) + assert_array_equal(np.argmax(dec_values, axis=1), expected_prediction) - # third and fourth sample have the same vote but third sample - # has higher confidence, this should reflect on the decision values - assert dec_values[2, 2] > dec_values[3, 2] + # third and fourth sample have the same vote but third sample + # has higher confidence, this should reflect on the decision values + assert dec_values[2, 2] > dec_values[3, 2] - # assert subset invariance. - dec_values_one = [ - _ovr_decision_function( - np.array([predictions[i]]), np.array([confidences[i]]), n_classes - )[0] - for i in range(4) - ] + # assert subset invariance. + dec_values_one = [ + _ovr_decision_function( + np.array([predictions[i]]), np.array([confidences[i]]), n_classes + )[0] + for i in range(4) + ] - assert_allclose(dec_values, dec_values_one, atol=1e-6) + assert_allclose(dec_values, dec_values_one, atol=1e-6) @pytest.mark.parametrize("input_type", ["list", "array"]) def test_labels_in_bytes_format_error(input_type): - # check that we raise an error with bytes encoded labels - # non-regression test for: - # https://github.com/scikit-learn/scikit-learn/issues/16980 - target = _convert_container([b"a", b"b"], input_type) - err_msg = "Support for labels represented as bytes is not supported" - with pytest.raises(TypeError, match=err_msg): - type_of_target(target) + # check that we raise an error with bytes encoded labels + # non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/16980 + target = _convert_container([b"a", b"b"], input_type) + err_msg = "Support for labels represented as bytes is not supported" + with pytest.raises(TypeError, match=err_msg): + type_of_target(target) From 9302e60f2a88efc89e769bf5b2242c1debe3f29e Mon Sep 17 00:00:00 2001 From: saskra Date: Fri, 11 Jul 2025 11:37:10 +0200 Subject: [PATCH 08/13] Make changelog shorter again --- .../sklearn.utils/31584.fix.rst | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/doc/whats_new/upcoming_changes/sklearn.utils/31584.fix.rst b/doc/whats_new/upcoming_changes/sklearn.utils/31584.fix.rst index 9e37883103377..cee8984984d9a 100644 --- a/doc/whats_new/upcoming_changes/sklearn.utils/31584.fix.rst +++ b/doc/whats_new/upcoming_changes/sklearn.utils/31584.fix.rst @@ -1,14 +1,3 @@ -- Fixed a spurious warning that could occur when passing ``estimator.classes_`` - or similar arrays with many unique values to classification utilities. - The warning came from :func:`sklearn.utils.multiclass.type_of_target` and has - now been suppressed when the input is not a true target vector. - The warning message was: "The number of unique classes is greater than 50% - of the number of samples." - This could appear in tools that internally validate classification outputs, - such as :class:`~sklearn.model_selection.GridSearchCV`, - :func:`~sklearn.model_selection.cross_val_score`, - :func:`~sklearn.metrics.make_scorer`, - :class:`~sklearn.multioutput.MultiOutputClassifier`, - and :class:`~sklearn.calibration.CalibratedClassifierCV`. - - By :user:`Sascha D. 
Krauss ` +- Fixed a spurious warning (about the number of unique classes being + greater than 50% of the number of samples) that could occur when + passing `classes` :func:`utils.multiclass.type_of_target`. From d4736e94024e9bb5154ce129b1e67d0f3af4f0ca Mon Sep 17 00:00:00 2001 From: saskra Date: Fri, 11 Jul 2025 11:40:21 +0200 Subject: [PATCH 09/13] Ruff --- sklearn/utils/tests/test_multiclass.py | 1016 ++++++++++++------------ sklearn/utils/tests/test_response.py | 2 +- 2 files changed, 509 insertions(+), 509 deletions(-) diff --git a/sklearn/utils/tests/test_multiclass.py b/sklearn/utils/tests/test_multiclass.py index afbd42f4f9f8d..8f53b3e9b3d1d 100644 --- a/sklearn/utils/tests/test_multiclass.py +++ b/sklearn/utils/tests/test_multiclass.py @@ -9,32 +9,32 @@ from sklearn.model_selection import ShuffleSplit from sklearn.svm import SVC from sklearn.utils._array_api import ( - _get_namespace_device_dtype_ids, - yield_namespace_device_dtype_combinations, + _get_namespace_device_dtype_ids, + yield_namespace_device_dtype_combinations, ) from sklearn.utils._testing import ( - _array_api_for_tests, - _convert_container, - assert_allclose, - assert_array_almost_equal, - assert_array_equal, + _array_api_for_tests, + _convert_container, + assert_allclose, + assert_array_almost_equal, + assert_array_equal, ) from sklearn.utils.estimator_checks import _NotAnArray from sklearn.utils.fixes import ( - COO_CONTAINERS, - CSC_CONTAINERS, - CSR_CONTAINERS, - DOK_CONTAINERS, - LIL_CONTAINERS, + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + DOK_CONTAINERS, + LIL_CONTAINERS, ) from sklearn.utils.metaestimators import _safe_split from sklearn.utils.multiclass import ( - _ovr_decision_function, - check_classification_targets, - class_distribution, - is_multilabel, - type_of_target, - unique_labels, + _ovr_decision_function, + check_classification_targets, + class_distribution, + is_multilabel, + type_of_target, + unique_labels, ) multilabel_explicit_zero = np.array([[0, 1], [1, 0]]) @@ -42,600 +42,600 @@ def _generate_sparse( - data, - sparse_containers=tuple( - COO_CONTAINERS - + CSC_CONTAINERS - + CSR_CONTAINERS - + DOK_CONTAINERS - + LIL_CONTAINERS - ), - dtypes=(bool, int, np.int8, np.uint8, float, np.float32), + data, + sparse_containers=tuple( + COO_CONTAINERS + + CSC_CONTAINERS + + CSR_CONTAINERS + + DOK_CONTAINERS + + LIL_CONTAINERS + ), + dtypes=(bool, int, np.int8, np.uint8, float, np.float32), ): - return [ - sparse_container(data, dtype=dtype) - for sparse_container in sparse_containers - for dtype in dtypes - ] + return [ + sparse_container(data, dtype=dtype) + for sparse_container in sparse_containers + for dtype in dtypes + ] EXAMPLES = { - "multilabel-indicator": [ - # valid when the data is formatted as sparse or dense, identified - # by CSR format when the testing takes place - *_generate_sparse( - np.random.RandomState(42).randint(2, size=(10, 10)), - sparse_containers=CSR_CONTAINERS, - dtypes=(int,), - ), - [[0, 1], [1, 0]], - [[0, 1]], - *_generate_sparse( - multilabel_explicit_zero, sparse_containers=CSC_CONTAINERS, dtypes=(int,) - ), - *_generate_sparse([[0, 1], [1, 0]]), - *_generate_sparse([[0, 0], [0, 0]]), - *_generate_sparse([[0, 1]]), - # Only valid when data is dense - [[-1, 1], [1, -1]], - np.array([[-1, 1], [1, -1]]), - np.array([[-3, 3], [3, -3]]), - _NotAnArray(np.array([[-3, 3], [3, -3]])), - ], - "multiclass": [ - [1, 0, 2, 2, 1, 4, 2, 4, 4, 4], - np.array([1, 0, 2]), - np.array([1, 0, 2], dtype=np.int8), - np.array([1, 0, 2], dtype=np.uint8), - np.array([1, 0, 2], 
dtype=float), - np.array([1, 0, 2], dtype=np.float32), - np.array([[1], [0], [2]]), - _NotAnArray(np.array([1, 0, 2])), - [0, 1, 2], - ["a", "b", "c"], - np.array(["a", "b", "c"]), - np.array(["a", "b", "c"], dtype=object), - np.array(["a", "b", "c"], dtype=object), - ], - "multiclass-multioutput": [ - [[1, 0, 2, 2], [1, 4, 2, 4]], - [["a", "b"], ["c", "d"]], - np.array([[1, 0, 2, 2], [1, 4, 2, 4]]), - np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.int8), - np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8), - np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=float), - np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32), - *_generate_sparse( - [[1, 0, 2, 2], [1, 4, 2, 4]], - sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS, - dtypes=(int, np.int8, np.uint8, float, np.float32), - ), - np.array([["a", "b"], ["c", "d"]]), - np.array([["a", "b"], ["c", "d"]]), - np.array([["a", "b"], ["c", "d"]], dtype=object), - np.array([[1, 0, 2]]), - _NotAnArray(np.array([[1, 0, 2]])), - ], - "binary": [ - [0, 1], - [1, 1], - [], - [0], - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]), - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=bool), - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.int8), - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.uint8), - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=float), - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float32), - np.array([[0], [1]]), - _NotAnArray(np.array([[0], [1]])), - [1, -1], - [3, 5], - ["a"], - ["a", "b"], - ["abc", "def"], - np.array(["abc", "def"]), - ["a", "b"], - np.array(["abc", "def"], dtype=object), - ], - "continuous": [ - [1e-5], - [0, 0.5], - np.array([[0], [0.5]]), - np.array([[0], [0.5]], dtype=np.float32), - ], - "continuous-multioutput": [ - np.array([[0, 0.5], [0.5, 0]]), - np.array([[0, 0.5], [0.5, 0]], dtype=np.float32), - np.array([[0, 0.5]]), - *_generate_sparse( - [[0, 0.5], [0.5, 0]], - sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS, - dtypes=(float, np.float32), - ), - *_generate_sparse( - [[0, 0.5]], - sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS, - dtypes=(float, np.float32), - ), - ], - "unknown": [ - [[]], - np.array([[]], dtype=object), - [()], - # sequence of sequences that weren't supported even before deprecation - np.array([np.array([]), np.array([1, 2, 3])], dtype=object), - [np.array([]), np.array([1, 2, 3])], - [{1, 2, 3}, {1, 2}], - [frozenset([1, 2, 3]), frozenset([1, 2])], - # and also confusable as sequences of sequences - [{0: "a", 1: "b"}, {0: "a"}], - # ndim 0 - np.array(0), - # empty second dimension - np.array([[], []]), - # 3d - np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]), - ], + "multilabel-indicator": [ + # valid when the data is formatted as sparse or dense, identified + # by CSR format when the testing takes place + *_generate_sparse( + np.random.RandomState(42).randint(2, size=(10, 10)), + sparse_containers=CSR_CONTAINERS, + dtypes=(int,), + ), + [[0, 1], [1, 0]], + [[0, 1]], + *_generate_sparse( + multilabel_explicit_zero, sparse_containers=CSC_CONTAINERS, dtypes=(int,) + ), + *_generate_sparse([[0, 1], [1, 0]]), + *_generate_sparse([[0, 0], [0, 0]]), + *_generate_sparse([[0, 1]]), + # Only valid when data is dense + [[-1, 1], [1, -1]], + np.array([[-1, 1], [1, -1]]), + np.array([[-3, 3], [3, -3]]), + _NotAnArray(np.array([[-3, 3], [3, -3]])), + ], + "multiclass": [ + [1, 0, 2, 2, 1, 4, 2, 4, 4, 4], + np.array([1, 0, 2]), + np.array([1, 0, 2], dtype=np.int8), + np.array([1, 0, 2], dtype=np.uint8), + np.array([1, 0, 2], dtype=float), + np.array([1, 0, 2], dtype=np.float32), + 
np.array([[1], [0], [2]]), + _NotAnArray(np.array([1, 0, 2])), + [0, 1, 2], + ["a", "b", "c"], + np.array(["a", "b", "c"]), + np.array(["a", "b", "c"], dtype=object), + np.array(["a", "b", "c"], dtype=object), + ], + "multiclass-multioutput": [ + [[1, 0, 2, 2], [1, 4, 2, 4]], + [["a", "b"], ["c", "d"]], + np.array([[1, 0, 2, 2], [1, 4, 2, 4]]), + np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.int8), + np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8), + np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=float), + np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32), + *_generate_sparse( + [[1, 0, 2, 2], [1, 4, 2, 4]], + sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS, + dtypes=(int, np.int8, np.uint8, float, np.float32), + ), + np.array([["a", "b"], ["c", "d"]]), + np.array([["a", "b"], ["c", "d"]]), + np.array([["a", "b"], ["c", "d"]], dtype=object), + np.array([[1, 0, 2]]), + _NotAnArray(np.array([[1, 0, 2]])), + ], + "binary": [ + [0, 1], + [1, 1], + [], + [0], + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=bool), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.int8), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.uint8), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=float), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float32), + np.array([[0], [1]]), + _NotAnArray(np.array([[0], [1]])), + [1, -1], + [3, 5], + ["a"], + ["a", "b"], + ["abc", "def"], + np.array(["abc", "def"]), + ["a", "b"], + np.array(["abc", "def"], dtype=object), + ], + "continuous": [ + [1e-5], + [0, 0.5], + np.array([[0], [0.5]]), + np.array([[0], [0.5]], dtype=np.float32), + ], + "continuous-multioutput": [ + np.array([[0, 0.5], [0.5, 0]]), + np.array([[0, 0.5], [0.5, 0]], dtype=np.float32), + np.array([[0, 0.5]]), + *_generate_sparse( + [[0, 0.5], [0.5, 0]], + sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS, + dtypes=(float, np.float32), + ), + *_generate_sparse( + [[0, 0.5]], + sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS, + dtypes=(float, np.float32), + ), + ], + "unknown": [ + [[]], + np.array([[]], dtype=object), + [()], + # sequence of sequences that weren't supported even before deprecation + np.array([np.array([]), np.array([1, 2, 3])], dtype=object), + [np.array([]), np.array([1, 2, 3])], + [{1, 2, 3}, {1, 2}], + [frozenset([1, 2, 3]), frozenset([1, 2])], + # and also confusable as sequences of sequences + [{0: "a", 1: "b"}, {0: "a"}], + # ndim 0 + np.array(0), + # empty second dimension + np.array([[], []]), + # 3d + np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]), + ], } ARRAY_API_EXAMPLES = { - "multilabel-indicator": [ - np.random.RandomState(42).randint(2, size=(10, 10)), - [[0, 1], [1, 0]], - [[0, 1]], - multilabel_explicit_zero, - [[0, 0], [0, 0]], - [[-1, 1], [1, -1]], - np.array([[-1, 1], [1, -1]]), - np.array([[-3, 3], [3, -3]]), - _NotAnArray(np.array([[-3, 3], [3, -3]])), - ], - "multiclass": [ - [1, 0, 2, 2, 1, 4, 2, 4, 4, 4], - np.array([1, 0, 2]), - np.array([1, 0, 2], dtype=np.int8), - np.array([1, 0, 2], dtype=np.uint8), - np.array([1, 0, 2], dtype=float), - np.array([1, 0, 2], dtype=np.float32), - np.array([[1], [0], [2]]), - _NotAnArray(np.array([1, 0, 2])), - [0, 1, 2], - ], - "multiclass-multioutput": [ - [[1, 0, 2, 2], [1, 4, 2, 4]], - np.array([[1, 0, 2, 2], [1, 4, 2, 4]]), - np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.int8), - np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8), - np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=float), - np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32), - 
np.array([[1, 0, 2]]), - _NotAnArray(np.array([[1, 0, 2]])), - ], - "binary": [ - [0, 1], - [1, 1], - [], - [0], - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]), - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=bool), - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.int8), - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.uint8), - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=float), - np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float32), - np.array([[0], [1]]), - _NotAnArray(np.array([[0], [1]])), - [1, -1], - [3, 5], - ], - "continuous": [ - [1e-5], - [0, 0.5], - np.array([[0], [0.5]]), - np.array([[0], [0.5]], dtype=np.float32), - ], - "continuous-multioutput": [ - np.array([[0, 0.5], [0.5, 0]]), - np.array([[0, 0.5], [0.5, 0]], dtype=np.float32), - np.array([[0, 0.5]]), - ], - "unknown": [ - [[]], - [()], - np.array(0), - np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]), - ], + "multilabel-indicator": [ + np.random.RandomState(42).randint(2, size=(10, 10)), + [[0, 1], [1, 0]], + [[0, 1]], + multilabel_explicit_zero, + [[0, 0], [0, 0]], + [[-1, 1], [1, -1]], + np.array([[-1, 1], [1, -1]]), + np.array([[-3, 3], [3, -3]]), + _NotAnArray(np.array([[-3, 3], [3, -3]])), + ], + "multiclass": [ + [1, 0, 2, 2, 1, 4, 2, 4, 4, 4], + np.array([1, 0, 2]), + np.array([1, 0, 2], dtype=np.int8), + np.array([1, 0, 2], dtype=np.uint8), + np.array([1, 0, 2], dtype=float), + np.array([1, 0, 2], dtype=np.float32), + np.array([[1], [0], [2]]), + _NotAnArray(np.array([1, 0, 2])), + [0, 1, 2], + ], + "multiclass-multioutput": [ + [[1, 0, 2, 2], [1, 4, 2, 4]], + np.array([[1, 0, 2, 2], [1, 4, 2, 4]]), + np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.int8), + np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8), + np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=float), + np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32), + np.array([[1, 0, 2]]), + _NotAnArray(np.array([[1, 0, 2]])), + ], + "binary": [ + [0, 1], + [1, 1], + [], + [0], + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=bool), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.int8), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.uint8), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=float), + np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float32), + np.array([[0], [1]]), + _NotAnArray(np.array([[0], [1]])), + [1, -1], + [3, 5], + ], + "continuous": [ + [1e-5], + [0, 0.5], + np.array([[0], [0.5]]), + np.array([[0], [0.5]], dtype=np.float32), + ], + "continuous-multioutput": [ + np.array([[0, 0.5], [0.5, 0]]), + np.array([[0, 0.5], [0.5, 0]], dtype=np.float32), + np.array([[0, 0.5]]), + ], + "unknown": [ + [[]], + [()], + np.array(0), + np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]), + ], } NON_ARRAY_LIKE_EXAMPLES = [ - {1, 2, 3}, - {0: "a", 1: "b"}, - {0: [5], 1: [5]}, - "abc", - frozenset([1, 2, 3]), - None, + {1, 2, 3}, + {0: "a", 1: "b"}, + {0: [5], 1: [5]}, + "abc", + frozenset([1, 2, 3]), + None, ] MULTILABEL_SEQUENCES = [ - [[1], [2], [0, 1]], - [(), (2), (0, 1)], - np.array([[], [1, 2]], dtype="object"), - _NotAnArray(np.array([[], [1, 2]], dtype="object")), + [[1], [2], [0, 1]], + [(), (2), (0, 1)], + np.array([[], [1, 2]], dtype="object"), + _NotAnArray(np.array([[], [1, 2]], dtype="object")), ] def test_unique_labels(): - # Empty iterable - with pytest.raises(ValueError): - unique_labels() + # Empty iterable + with pytest.raises(ValueError): + unique_labels() - # Multiclass problem - assert_array_equal(unique_labels(range(10)), np.arange(10)) - 
assert_array_equal(unique_labels(np.arange(10)), np.arange(10)) - assert_array_equal(unique_labels([4, 0, 2]), np.array([0, 2, 4])) + # Multiclass problem + assert_array_equal(unique_labels(range(10)), np.arange(10)) + assert_array_equal(unique_labels(np.arange(10)), np.arange(10)) + assert_array_equal(unique_labels([4, 0, 2]), np.array([0, 2, 4])) - # Multilabel indicator - assert_array_equal( - unique_labels(np.array([[0, 0, 1], [1, 0, 1], [0, 0, 0]])), np.arange(3) - ) + # Multilabel indicator + assert_array_equal( + unique_labels(np.array([[0, 0, 1], [1, 0, 1], [0, 0, 0]])), np.arange(3) + ) - assert_array_equal(unique_labels(np.array([[0, 0, 1], [0, 0, 0]])), np.arange(3)) + assert_array_equal(unique_labels(np.array([[0, 0, 1], [0, 0, 0]])), np.arange(3)) - # Several arrays passed - assert_array_equal(unique_labels([4, 0, 2], range(5)), np.arange(5)) - assert_array_equal(unique_labels((0, 1, 2), (0,), (2, 1)), np.arange(3)) + # Several arrays passed + assert_array_equal(unique_labels([4, 0, 2], range(5)), np.arange(5)) + assert_array_equal(unique_labels((0, 1, 2), (0,), (2, 1)), np.arange(3)) - # Border line case with binary indicator matrix - with pytest.raises(ValueError): - unique_labels([4, 0, 2], np.ones((5, 5))) - with pytest.raises(ValueError): - unique_labels(np.ones((5, 4)), np.ones((5, 5))) + # Border line case with binary indicator matrix + with pytest.raises(ValueError): + unique_labels([4, 0, 2], np.ones((5, 5))) + with pytest.raises(ValueError): + unique_labels(np.ones((5, 4)), np.ones((5, 5))) - assert_array_equal(unique_labels(np.ones((4, 5)), np.ones((5, 5))), np.arange(5)) + assert_array_equal(unique_labels(np.ones((4, 5)), np.ones((5, 5))), np.arange(5)) def test_type_of_target_too_many_unique_classes(): - """Check that we raise a warning when the number of unique classes is greater than - 50% of the number of samples. + """Check that we raise a warning when the number of unique classes is greater than + 50% of the number of samples. - We need to check that we don't raise if we have less than 20 samples. - """ + We need to check that we don't raise if we have less than 20 samples. + """ - # Create a label array where each class appears only once, except '0' appears twice. - # This simulates misuse of `type_of_target` with unique class labels, - # ensuring it doesn't raise a warning in such cases. - y = np.hstack((np.arange(20), [0])) - msg = r"The number of unique classes is greater than 50% of the number of samples." - with pytest.warns(UserWarning, match=msg): - type_of_target(y) + # Create a label array where each class appears only once, except '0' appears twice. + # This simulates misuse of `type_of_target` with unique class labels, + # ensuring it doesn't raise a warning in such cases. + y = np.hstack((np.arange(20), [0])) + msg = r"The number of unique classes is greater than 50% of the number of samples." 
+ with pytest.warns(UserWarning, match=msg): + type_of_target(y) - # less than 20 samples, no warning should be raised - y = np.arange(10) - with warnings.catch_warnings(): - warnings.simplefilter("error") - type_of_target(y) + # less than 20 samples, no warning should be raised + y = np.arange(10) + with warnings.catch_warnings(): + warnings.simplefilter("error") + type_of_target(y) - # More than 20 samples but only unique classes, no warning should be raised - y = np.arange(25) - with warnings.catch_warnings(): - warnings.simplefilter("ignore", UserWarning) - type_of_target(y) + # More than 20 samples but only unique classes, no warning should be raised + y = np.arange(25) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + type_of_target(y) def test_unique_labels_non_specific(): - # Test unique_labels with a variety of collected examples + # Test unique_labels with a variety of collected examples - # Smoke test for all supported format - for format in ["binary", "multiclass", "multilabel-indicator"]: - for y in EXAMPLES[format]: - unique_labels(y) + # Smoke test for all supported format + for format in ["binary", "multiclass", "multilabel-indicator"]: + for y in EXAMPLES[format]: + unique_labels(y) - # We don't support those format at the moment - for example in NON_ARRAY_LIKE_EXAMPLES: - with pytest.raises(ValueError): - unique_labels(example) + # We don't support those format at the moment + for example in NON_ARRAY_LIKE_EXAMPLES: + with pytest.raises(ValueError): + unique_labels(example) - for y_type in [ - "unknown", - "continuous", - "continuous-multioutput", - "multiclass-multioutput", - ]: - for example in EXAMPLES[y_type]: - with pytest.raises(ValueError): - unique_labels(example) + for y_type in [ + "unknown", + "continuous", + "continuous-multioutput", + "multiclass-multioutput", + ]: + for example in EXAMPLES[y_type]: + with pytest.raises(ValueError): + unique_labels(example) def test_unique_labels_mixed_types(): - # Mix with binary or multiclass and multilabel - mix_clf_format = product( - EXAMPLES["multilabel-indicator"], EXAMPLES["multiclass"] + EXAMPLES["binary"] - ) + # Mix with binary or multiclass and multilabel + mix_clf_format = product( + EXAMPLES["multilabel-indicator"], EXAMPLES["multiclass"] + EXAMPLES["binary"] + ) - for y_multilabel, y_multiclass in mix_clf_format: - with pytest.raises(ValueError): - unique_labels(y_multiclass, y_multilabel) - with pytest.raises(ValueError): - unique_labels(y_multilabel, y_multiclass) + for y_multilabel, y_multiclass in mix_clf_format: + with pytest.raises(ValueError): + unique_labels(y_multiclass, y_multilabel) + with pytest.raises(ValueError): + unique_labels(y_multilabel, y_multiclass) - with pytest.raises(ValueError): - unique_labels([[1, 2]], [["a", "d"]]) + with pytest.raises(ValueError): + unique_labels([[1, 2]], [["a", "d"]]) - with pytest.raises(ValueError): - unique_labels(["1", 2]) + with pytest.raises(ValueError): + unique_labels(["1", 2]) - with pytest.raises(ValueError): - unique_labels([["1", 2], [1, 3]]) + with pytest.raises(ValueError): + unique_labels([["1", 2], [1, 3]]) - with pytest.raises(ValueError): - unique_labels([["1", "2"], [2, 3]]) + with pytest.raises(ValueError): + unique_labels([["1", "2"], [2, 3]]) def test_is_multilabel(): - for group, group_examples in EXAMPLES.items(): - dense_exp = group == "multilabel-indicator" - - for example in group_examples: - # Only mark explicitly defined sparse examples as valid sparse - # multilabel-indicators - sparse_exp = dense_exp 
and issparse(example) - - if issparse(example) or ( - hasattr(example, "__array__") - and np.asarray(example).ndim == 2 - and np.asarray(example).dtype.kind in "biuf" - and np.asarray(example).shape[1] > 0 - ): - examples_sparse = [ - sparse_container(example) - for sparse_container in ( - COO_CONTAINERS - + CSC_CONTAINERS - + CSR_CONTAINERS - + DOK_CONTAINERS - + LIL_CONTAINERS - ) - ] - for exmpl_sparse in examples_sparse: - assert sparse_exp == is_multilabel(exmpl_sparse), ( - f"is_multilabel({exmpl_sparse!r}) should be {sparse_exp}" - ) - - # Densify sparse examples before testing - if issparse(example): - example = example.toarray() - - assert dense_exp == is_multilabel(example), ( - f"is_multilabel({example!r}) should be {dense_exp}" - ) + for group, group_examples in EXAMPLES.items(): + dense_exp = group == "multilabel-indicator" + + for example in group_examples: + # Only mark explicitly defined sparse examples as valid sparse + # multilabel-indicators + sparse_exp = dense_exp and issparse(example) + + if issparse(example) or ( + hasattr(example, "__array__") + and np.asarray(example).ndim == 2 + and np.asarray(example).dtype.kind in "biuf" + and np.asarray(example).shape[1] > 0 + ): + examples_sparse = [ + sparse_container(example) + for sparse_container in ( + COO_CONTAINERS + + CSC_CONTAINERS + + CSR_CONTAINERS + + DOK_CONTAINERS + + LIL_CONTAINERS + ) + ] + for exmpl_sparse in examples_sparse: + assert sparse_exp == is_multilabel(exmpl_sparse), ( + f"is_multilabel({exmpl_sparse!r}) should be {sparse_exp}" + ) + + # Densify sparse examples before testing + if issparse(example): + example = example.toarray() + + assert dense_exp == is_multilabel(example), ( + f"is_multilabel({example!r}) should be {dense_exp}" + ) @pytest.mark.parametrize( - "array_namespace, device, dtype_name", - yield_namespace_device_dtype_combinations(), - ids=_get_namespace_device_dtype_ids, + "array_namespace, device, dtype_name", + yield_namespace_device_dtype_combinations(), + ids=_get_namespace_device_dtype_ids, ) def test_is_multilabel_array_api_compliance(array_namespace, device, dtype_name): - xp = _array_api_for_tests(array_namespace, device) + xp = _array_api_for_tests(array_namespace, device) - for group, group_examples in ARRAY_API_EXAMPLES.items(): - dense_exp = group == "multilabel-indicator" - for example in group_examples: - if np.asarray(example).dtype.kind == "f": - example = np.asarray(example, dtype=dtype_name) - else: - example = np.asarray(example) - example = xp.asarray(example, device=device) + for group, group_examples in ARRAY_API_EXAMPLES.items(): + dense_exp = group == "multilabel-indicator" + for example in group_examples: + if np.asarray(example).dtype.kind == "f": + example = np.asarray(example, dtype=dtype_name) + else: + example = np.asarray(example) + example = xp.asarray(example, device=device) - with config_context(array_api_dispatch=True): - assert dense_exp == is_multilabel(example), ( - f"is_multilabel({example!r}) should be {dense_exp}" - ) + with config_context(array_api_dispatch=True): + assert dense_exp == is_multilabel(example), ( + f"is_multilabel({example!r}) should be {dense_exp}" + ) def test_check_classification_targets(): - for y_type in EXAMPLES.keys(): - if y_type in ["unknown", "continuous", "continuous-multioutput"]: - for example in EXAMPLES[y_type]: - msg = "Unknown label type: " - with pytest.raises(ValueError, match=msg): - check_classification_targets(example) - else: - for example in EXAMPLES[y_type]: - check_classification_targets(example) + for 
y_type in EXAMPLES.keys(): + if y_type in ["unknown", "continuous", "continuous-multioutput"]: + for example in EXAMPLES[y_type]: + msg = "Unknown label type: " + with pytest.raises(ValueError, match=msg): + check_classification_targets(example) + else: + for example in EXAMPLES[y_type]: + check_classification_targets(example) def test_type_of_target(): - for group, group_examples in EXAMPLES.items(): - for example in group_examples: - assert type_of_target(example) == group, ( - "type_of_target(%r) should be %r, got %r" - % ( - example, - group, - type_of_target(example), - ) - ) - - for example in NON_ARRAY_LIKE_EXAMPLES: - msg_regex = r"Expected array-like \(array or non-string sequence\).*" - with pytest.raises(ValueError, match=msg_regex): - type_of_target(example) - - for example in MULTILABEL_SEQUENCES: - msg = ( - "You appear to be using a legacy multi-label data " - "representation. Sequence of sequences are no longer supported;" - " use a binary array or sparse matrix instead." - ) - with pytest.raises(ValueError, match=msg): - type_of_target(example) + for group, group_examples in EXAMPLES.items(): + for example in group_examples: + assert type_of_target(example) == group, ( + "type_of_target(%r) should be %r, got %r" + % ( + example, + group, + type_of_target(example), + ) + ) + + for example in NON_ARRAY_LIKE_EXAMPLES: + msg_regex = r"Expected array-like \(array or non-string sequence\).*" + with pytest.raises(ValueError, match=msg_regex): + type_of_target(example) + + for example in MULTILABEL_SEQUENCES: + msg = ( + "You appear to be using a legacy multi-label data " + "representation. Sequence of sequences are no longer supported;" + " use a binary array or sparse matrix instead." + ) + with pytest.raises(ValueError, match=msg): + type_of_target(example) def test_type_of_target_pandas_sparse(): - pd = pytest.importorskip("pandas") + pd = pytest.importorskip("pandas") - y = pd.arrays.SparseArray([1, np.nan, np.nan, 1, np.nan]) - msg = "y cannot be class 'SparseSeries' or 'SparseArray'" - with pytest.raises(ValueError, match=msg): - type_of_target(y) + y = pd.arrays.SparseArray([1, np.nan, np.nan, 1, np.nan]) + msg = "y cannot be class 'SparseSeries' or 'SparseArray'" + with pytest.raises(ValueError, match=msg): + type_of_target(y) def test_type_of_target_pandas_nullable(): - """Check that type_of_target works with pandas nullable dtypes.""" - pd = pytest.importorskip("pandas") + """Check that type_of_target works with pandas nullable dtypes.""" + pd = pytest.importorskip("pandas") - for dtype in ["Int32", "Float32"]: - y_true = pd.Series([1, 0, 2, 3, 4], dtype=dtype) - assert type_of_target(y_true) == "multiclass" + for dtype in ["Int32", "Float32"]: + y_true = pd.Series([1, 0, 2, 3, 4], dtype=dtype) + assert type_of_target(y_true) == "multiclass" - y_true = pd.Series([1, 0, 1, 0], dtype=dtype) - assert type_of_target(y_true) == "binary" + y_true = pd.Series([1, 0, 1, 0], dtype=dtype) + assert type_of_target(y_true) == "binary" - y_true = pd.DataFrame([[1.4, 3.1], [3.1, 1.4]], dtype="Float32") - assert type_of_target(y_true) == "continuous-multioutput" + y_true = pd.DataFrame([[1.4, 3.1], [3.1, 1.4]], dtype="Float32") + assert type_of_target(y_true) == "continuous-multioutput" - y_true = pd.DataFrame([[0, 1], [1, 1]], dtype="Int32") - assert type_of_target(y_true) == "multilabel-indicator" + y_true = pd.DataFrame([[0, 1], [1, 1]], dtype="Int32") + assert type_of_target(y_true) == "multilabel-indicator" - y_true = pd.DataFrame([[1, 2], [3, 1]], dtype="Int32") - assert 
type_of_target(y_true) == "multiclass-multioutput" + y_true = pd.DataFrame([[1, 2], [3, 1]], dtype="Int32") + assert type_of_target(y_true) == "multiclass-multioutput" @pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) def test_unique_labels_pandas_nullable(dtype): - """Checks that unique_labels work with pandas nullable dtypes. + """Checks that unique_labels work with pandas nullable dtypes. - Non-regression test for gh-25634. - """ - pd = pytest.importorskip("pandas") + Non-regression test for gh-25634. + """ + pd = pytest.importorskip("pandas") - y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype) - y_predicted = pd.Series([0, 0, 1, 1, 0, 1, 1, 1, 1], dtype="int64") + y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype) + y_predicted = pd.Series([0, 0, 1, 1, 0, 1, 1, 1, 1], dtype="int64") - labels = unique_labels(y_true, y_predicted) - assert_array_equal(labels, [0, 1]) + labels = unique_labels(y_true, y_predicted) + assert_array_equal(labels, [0, 1]) @pytest.mark.parametrize("csc_container", CSC_CONTAINERS) def test_class_distribution(csc_container): - y = np.array( - [ - [1, 0, 0, 1], - [2, 2, 0, 1], - [1, 3, 0, 1], - [4, 2, 0, 1], - [2, 0, 0, 1], - [1, 3, 0, 1], - ] - ) - # Define the sparse matrix with a mix of implicit and explicit zeros - data = np.array([1, 2, 1, 4, 2, 1, 0, 2, 3, 2, 3, 1, 1, 1, 1, 1, 1]) - indices = np.array([0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 5, 0, 1, 2, 3, 4, 5]) - indptr = np.array([0, 6, 11, 11, 17]) - y_sp = csc_container((data, indices, indptr), shape=(6, 4)) - - classes, n_classes, class_prior = class_distribution(y) - classes_sp, n_classes_sp, class_prior_sp = class_distribution(y_sp) - classes_expected = [[1, 2, 4], [0, 2, 3], [0], [1]] - n_classes_expected = [3, 3, 1, 1] - class_prior_expected = [[3 / 6, 2 / 6, 1 / 6], [1 / 3, 1 / 3, 1 / 3], [1.0], [1.0]] - - for k in range(y.shape[1]): - assert_array_almost_equal(classes[k], classes_expected[k]) - assert_array_almost_equal(n_classes[k], n_classes_expected[k]) - assert_array_almost_equal(class_prior[k], class_prior_expected[k]) - - assert_array_almost_equal(classes_sp[k], classes_expected[k]) - assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k]) - assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k]) - - # Test again with explicit sample weights - (classes, n_classes, class_prior) = class_distribution( - y, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0] - ) - (classes_sp, n_classes_sp, class_prior_sp) = class_distribution( - y, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0] - ) - class_prior_expected = [[4 / 9, 3 / 9, 2 / 9], [2 / 9, 4 / 9, 3 / 9], [1.0], [1.0]] - - for k in range(y.shape[1]): - assert_array_almost_equal(classes[k], classes_expected[k]) - assert_array_almost_equal(n_classes[k], n_classes_expected[k]) - assert_array_almost_equal(class_prior[k], class_prior_expected[k]) - - assert_array_almost_equal(classes_sp[k], classes_expected[k]) - assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k]) - assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k]) + y = np.array( + [ + [1, 0, 0, 1], + [2, 2, 0, 1], + [1, 3, 0, 1], + [4, 2, 0, 1], + [2, 0, 0, 1], + [1, 3, 0, 1], + ] + ) + # Define the sparse matrix with a mix of implicit and explicit zeros + data = np.array([1, 2, 1, 4, 2, 1, 0, 2, 3, 2, 3, 1, 1, 1, 1, 1, 1]) + indices = np.array([0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 5, 0, 1, 2, 3, 4, 5]) + indptr = np.array([0, 6, 11, 11, 17]) + y_sp = csc_container((data, indices, indptr), shape=(6, 4)) + + classes, n_classes, class_prior = 
class_distribution(y) + classes_sp, n_classes_sp, class_prior_sp = class_distribution(y_sp) + classes_expected = [[1, 2, 4], [0, 2, 3], [0], [1]] + n_classes_expected = [3, 3, 1, 1] + class_prior_expected = [[3 / 6, 2 / 6, 1 / 6], [1 / 3, 1 / 3, 1 / 3], [1.0], [1.0]] + + for k in range(y.shape[1]): + assert_array_almost_equal(classes[k], classes_expected[k]) + assert_array_almost_equal(n_classes[k], n_classes_expected[k]) + assert_array_almost_equal(class_prior[k], class_prior_expected[k]) + + assert_array_almost_equal(classes_sp[k], classes_expected[k]) + assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k]) + assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k]) + + # Test again with explicit sample weights + (classes, n_classes, class_prior) = class_distribution( + y, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0] + ) + (classes_sp, n_classes_sp, class_prior_sp) = class_distribution( + y, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0] + ) + class_prior_expected = [[4 / 9, 3 / 9, 2 / 9], [2 / 9, 4 / 9, 3 / 9], [1.0], [1.0]] + + for k in range(y.shape[1]): + assert_array_almost_equal(classes[k], classes_expected[k]) + assert_array_almost_equal(n_classes[k], n_classes_expected[k]) + assert_array_almost_equal(class_prior[k], class_prior_expected[k]) + + assert_array_almost_equal(classes_sp[k], classes_expected[k]) + assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k]) + assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k]) def test_safe_split_with_precomputed_kernel(): - clf = SVC() - clfp = SVC(kernel="precomputed") + clf = SVC() + clfp = SVC(kernel="precomputed") - iris = datasets.load_iris() - X, y = iris.data, iris.target - K = np.dot(X, X.T) + iris = datasets.load_iris() + X, y = iris.data, iris.target + K = np.dot(X, X.T) - cv = ShuffleSplit(test_size=0.25, random_state=0) - train, test = next(iter(cv.split(X))) + cv = ShuffleSplit(test_size=0.25, random_state=0) + train, test = next(iter(cv.split(X))) - X_train, y_train = _safe_split(clf, X, y, train) - K_train, y_train2 = _safe_split(clfp, K, y, train) - assert_array_almost_equal(K_train, np.dot(X_train, X_train.T)) - assert_array_almost_equal(y_train, y_train2) + X_train, y_train = _safe_split(clf, X, y, train) + K_train, y_train2 = _safe_split(clfp, K, y, train) + assert_array_almost_equal(K_train, np.dot(X_train, X_train.T)) + assert_array_almost_equal(y_train, y_train2) - X_test, y_test = _safe_split(clf, X, y, test, train) - K_test, y_test2 = _safe_split(clfp, K, y, test, train) - assert_array_almost_equal(K_test, np.dot(X_test, X_train.T)) - assert_array_almost_equal(y_test, y_test2) + X_test, y_test = _safe_split(clf, X, y, test, train) + K_test, y_test2 = _safe_split(clfp, K, y, test, train) + assert_array_almost_equal(K_test, np.dot(X_test, X_train.T)) + assert_array_almost_equal(y_test, y_test2) def test_ovr_decision_function(): - # test properties for ovr decision function + # test properties for ovr decision function - predictions = np.array([[0, 1, 1], [0, 1, 0], [0, 1, 1], [0, 1, 1]]) + predictions = np.array([[0, 1, 1], [0, 1, 0], [0, 1, 1], [0, 1, 1]]) - confidences = np.array( - [[-1e16, 0, -1e16], [1.0, 2.0, -3.0], [-5.0, 2.0, 5.0], [-0.5, 0.2, 0.5]] - ) + confidences = np.array( + [[-1e16, 0, -1e16], [1.0, 2.0, -3.0], [-5.0, 2.0, 5.0], [-0.5, 0.2, 0.5]] + ) - n_classes = 3 + n_classes = 3 - dec_values = _ovr_decision_function(predictions, confidences, n_classes) + dec_values = _ovr_decision_function(predictions, confidences, n_classes) - # check that the decision values are within 0.5 
range of the votes - votes = np.array([[1, 0, 2], [1, 1, 1], [1, 0, 2], [1, 0, 2]]) + # check that the decision values are within 0.5 range of the votes + votes = np.array([[1, 0, 2], [1, 1, 1], [1, 0, 2], [1, 0, 2]]) - assert_allclose(votes, dec_values, atol=0.5) + assert_allclose(votes, dec_values, atol=0.5) - # check that the prediction are what we expect - # highest vote or highest confidence if there is a tie. - # for the second sample we have a tie (should be won by 1) - expected_prediction = np.array([2, 1, 2, 2]) - assert_array_equal(np.argmax(dec_values, axis=1), expected_prediction) + # check that the prediction are what we expect + # highest vote or highest confidence if there is a tie. + # for the second sample we have a tie (should be won by 1) + expected_prediction = np.array([2, 1, 2, 2]) + assert_array_equal(np.argmax(dec_values, axis=1), expected_prediction) - # third and fourth sample have the same vote but third sample - # has higher confidence, this should reflect on the decision values - assert dec_values[2, 2] > dec_values[3, 2] + # third and fourth sample have the same vote but third sample + # has higher confidence, this should reflect on the decision values + assert dec_values[2, 2] > dec_values[3, 2] - # assert subset invariance. - dec_values_one = [ - _ovr_decision_function( - np.array([predictions[i]]), np.array([confidences[i]]), n_classes - )[0] - for i in range(4) - ] + # assert subset invariance. + dec_values_one = [ + _ovr_decision_function( + np.array([predictions[i]]), np.array([confidences[i]]), n_classes + )[0] + for i in range(4) + ] - assert_allclose(dec_values, dec_values_one, atol=1e-6) + assert_allclose(dec_values, dec_values_one, atol=1e-6) @pytest.mark.parametrize("input_type", ["list", "array"]) def test_labels_in_bytes_format_error(input_type): - # check that we raise an error with bytes encoded labels - # non-regression test for: - # https://github.com/scikit-learn/scikit-learn/issues/16980 - target = _convert_container([b"a", b"b"], input_type) - err_msg = "Support for labels represented as bytes is not supported" - with pytest.raises(TypeError, match=err_msg): - type_of_target(target) + # check that we raise an error with bytes encoded labels + # non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/16980 + target = _convert_container([b"a", b"b"], input_type) + err_msg = "Support for labels represented as bytes is not supported" + with pytest.raises(TypeError, match=err_msg): + type_of_target(target) diff --git a/sklearn/utils/tests/test_response.py b/sklearn/utils/tests/test_response.py index 9404d0a20cddc..5f791b59dfaa3 100644 --- a/sklearn/utils/tests/test_response.py +++ b/sklearn/utils/tests/test_response.py @@ -376,7 +376,7 @@ def test_get_response_values_multilabel_indicator(response_method): def test_response_values_type_of_target_on_classes_no_warning(): """ Ensure `_get_response_values` doesn't raise spurious warning. - + "The number of unique classes is greater than > 50% of samples" warning should not be raised when calling `type_of_target(classes_)`. 
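For reference, a minimal sketch of the intended behaviour once the preceding patches are applied. It is an illustration only, not part of any commit: it exercises the public `type_of_target` helper with the same arrays used in the updated tests, and assumes no new API beyond that.

import warnings

import numpy as np

from sklearn.utils.multiclass import type_of_target

# 21 samples but only 20 unique labels: the warning is still wanted here, since
# such a `y` is more likely a continuous target passed to a classification check.
y = np.hstack((np.arange(20), [0]))
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    type_of_target(y)
assert any("unique classes" in str(w.message) for w in caught)

# Only unique values, i.e. what `estimator.classes_` looks like for a classifier
# with many classes: after the fix, no "unique classes" warning is expected.
classes = np.arange(25)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    type_of_target(classes)
assert not any("unique classes" in str(w.message) for w in caught)

The unique-values case is the one `_get_response_values` hits internally when it inspects `estimator.classes_`, which is why the spurious warning disappears there as well.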
From fef3f26928476fa1fb6de949b216d3c9dc2a6fe0 Mon Sep 17 00:00:00 2001 From: saskra Date: Fri, 11 Jul 2025 12:44:19 +0200 Subject: [PATCH 10/13] Update doc/whats_new/upcoming_changes/sklearn.utils/31584.fix.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger --- doc/whats_new/upcoming_changes/sklearn.utils/31584.fix.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/whats_new/upcoming_changes/sklearn.utils/31584.fix.rst b/doc/whats_new/upcoming_changes/sklearn.utils/31584.fix.rst index cee8984984d9a..5417dd80df975 100644 --- a/doc/whats_new/upcoming_changes/sklearn.utils/31584.fix.rst +++ b/doc/whats_new/upcoming_changes/sklearn.utils/31584.fix.rst @@ -1,3 +1,4 @@ - Fixed a spurious warning (about the number of unique classes being greater than 50% of the number of samples) that could occur when passing `classes` :func:`utils.multiclass.type_of_target`. + By :user:`Sascha D. Krauss `. From 7022e82324a31cb4c276f8da25b8aa64ce4c8a4f Mon Sep 17 00:00:00 2001 From: saskra Date: Fri, 11 Jul 2025 12:50:04 +0200 Subject: [PATCH 11/13] Reverses unintentional removal of a line --- sklearn/utils/tests/test_multiclass.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/utils/tests/test_multiclass.py b/sklearn/utils/tests/test_multiclass.py index 8f53b3e9b3d1d..5ad87d126ba68 100644 --- a/sklearn/utils/tests/test_multiclass.py +++ b/sklearn/utils/tests/test_multiclass.py @@ -247,6 +247,7 @@ def _generate_sparse( ], } + NON_ARRAY_LIKE_EXAMPLES = [ {1, 2, 3}, {0: "a", 1: "b"}, From 5dade50211e37198efde8c4ded135c6ab3136578 Mon Sep 17 00:00:00 2001 From: saskra Date: Sun, 13 Jul 2025 14:43:08 +0200 Subject: [PATCH 12/13] Update sklearn/utils/tests/test_multiclass.py Co-authored-by: Lucy Liu --- sklearn/utils/tests/test_multiclass.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_multiclass.py b/sklearn/utils/tests/test_multiclass.py index 5ad87d126ba68..fc200dee0d120 100644 --- a/sklearn/utils/tests/test_multiclass.py +++ b/sklearn/utils/tests/test_multiclass.py @@ -316,7 +316,9 @@ def test_type_of_target_too_many_unique_classes(): warnings.simplefilter("error") type_of_target(y) - # More than 20 samples but only unique classes, no warning should be raised + # More than 20 samples but only unique classes, simulating passing + # `classes_` to `type_of_target` (when number of classes is large). + # No warning should be raised y = np.arange(25) with warnings.catch_warnings(): warnings.simplefilter("ignore", UserWarning) From 421728c31932e3b09dd34088fc2214330c7e24fa Mon Sep 17 00:00:00 2001 From: saskra Date: Sun, 13 Jul 2025 14:43:37 +0200 Subject: [PATCH 13/13] Update sklearn/utils/tests/test_multiclass.py Co-authored-by: Lucy Liu --- sklearn/utils/tests/test_multiclass.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/tests/test_multiclass.py b/sklearn/utils/tests/test_multiclass.py index fc200dee0d120..a686b721f2393 100644 --- a/sklearn/utils/tests/test_multiclass.py +++ b/sklearn/utils/tests/test_multiclass.py @@ -302,9 +302,10 @@ def test_type_of_target_too_many_unique_classes(): We need to check that we don't raise if we have less than 20 samples. """ - # Create a label array where each class appears only once, except '0' appears twice. - # This simulates misuse of `type_of_target` with unique class labels, - # ensuring it doesn't raise a warning in such cases. 
+ # Create array of unique labels, except '0', which appears twice. + # This does raise a warning. + # Note warning would not be raised if we passed only unique + # labels, which happens when `type_of_target` is passed `classes_`. y = np.hstack((np.arange(20), [0])) msg = r"The number of unique classes is greater than 50% of the number of samples." with pytest.warns(UserWarning, match=msg):