From 78d2a657f79b71fa05140b23144e2bbb71f0cb6b Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Sat, 7 Dec 2024 23:51:34 +0100 Subject: [PATCH 01/34] ENH Array API for confusion_matrix --- doc/modules/array_api.rst | 1 + sklearn/metrics/_classification.py | 76 +++++++++++++++++++--------- sklearn/metrics/tests/test_common.py | 4 ++ 3 files changed, 56 insertions(+), 25 deletions(-) diff --git a/doc/modules/array_api.rst b/doc/modules/array_api.rst index 82eb64dec08c6..171230d64d12f 100644 --- a/doc/modules/array_api.rst +++ b/doc/modules/array_api.rst @@ -115,6 +115,7 @@ Metrics - :func:`sklearn.metrics.cluster.entropy` - :func:`sklearn.metrics.accuracy_score` +- :func:`sklearn.metrics.confusion_matrix` - :func:`sklearn.metrics.d2_tweedie_score` - :func:`sklearn.metrics.explained_variance_score` - :func:`sklearn.metrics.f1_score` diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index dc9252c2c9fda..7c0d60fdbc4db 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -292,7 +292,7 @@ def confusion_matrix( Returns ------- - C : ndarray of shape (n_classes, n_classes) + C : array of shape (n_classes, n_classes) Confusion matrix whose i-th row and j-th column entry indicates the number of samples with true label being i-th class @@ -337,6 +337,8 @@ def confusion_matrix( (np.int64(0), np.int64(2), np.int64(1), np.int64(1)) """ y_true, y_pred = attach_unique(y_true, y_pred) + xp, _ = get_namespace(y_true, y_pred, labels, sample_weight) + device_ = device(y_true, y_pred, labels, sample_weight) y_type, y_true, y_pred = _check_targets(y_true, y_pred) if y_type not in ("binary", "multiclass"): raise ValueError("%s is not supported" % y_type) @@ -344,56 +346,70 @@ def confusion_matrix( if labels is None: labels = unique_labels(y_true, y_pred) else: - labels = np.asarray(labels) + labels = xp.asarray(labels) n_labels = labels.size if n_labels == 0: raise ValueError("'labels' should contains at least one label.") elif y_true.size == 0: - return np.zeros((n_labels, n_labels), dtype=int) - elif len(np.intersect1d(y_true, labels)) == 0: + return xp.zeros((n_labels, n_labels), dtype=int, device=device_) + # This is not tested other than for numpy; it seems xp.isin is not existing in + # array_api_compat: + elif not xp.isin(labels, y_true).any(): raise ValueError("At least one label specified must be in y_true") if sample_weight is None: - sample_weight = np.ones(y_true.shape[0], dtype=np.int64) + sample_weight = xp.ones(y_true.shape[0], dtype=xp.int64) else: - sample_weight = np.asarray(sample_weight) + sample_weight = xp.asarray(sample_weight) check_consistent_length(y_true, y_pred, sample_weight) - n_labels = labels.size + # TODO: remove condition when torch supports the size attribute + if xp.__name__ == "array_api_compat.torch": + n_labels = xp.size(labels) + else: + n_labels = labels.size # If labels are not consecutive integers starting from zero, then # y_true and y_pred must be converted into index form need_index_conversion = not ( - labels.dtype.kind in {"i", "u", "b"} - and np.all(labels == np.arange(n_labels)) - and y_true.min() >= 0 - and y_pred.min() >= 0 + xp.isdtype(labels.dtype, ("signed integer", "unsigned integer", "bool")) + and xp.all(labels == xp.arange(n_labels, device=device_)) + and xp.min(y_true) >= 0 + and xp.min(y_pred) >= 0 ) if need_index_conversion: label_to_ind = {y: x for x, y in enumerate(labels)} - y_pred = np.array([label_to_ind.get(x, n_labels + 1) for x in y_pred]) - y_true = np.array([label_to_ind.get(x, n_labels + 1) for x in y_true]) + y_pred = xp.array([label_to_ind.get(x, n_labels + 1) for x in y_pred]) + y_true = xp.array([label_to_ind.get(x, n_labels + 1) for x in y_true]) # intersect y_pred, y_true with labels, eliminate items not in labels - ind = np.logical_and(y_pred < n_labels, y_true < n_labels) - if not np.all(ind): + ind = xp.logical_and(y_pred < n_labels, y_true < n_labels) + if not xp.all(ind): y_pred = y_pred[ind] y_true = y_true[ind] # also eliminate weights of eliminated items sample_weight = sample_weight[ind] # Choose the accumulator dtype to always have high precision - if sample_weight.dtype.kind in {"i", "u", "b"}: - dtype = np.int64 + if xp.isdtype(sample_weight.dtype, ("signed integer", "unsigned integer", "bool")): + dtype = xp.int64 else: - dtype = np.float64 - - cm = coo_matrix( - (sample_weight, (y_true, y_pred)), - shape=(n_labels, n_labels), - dtype=dtype, - ).toarray() + dtype = xp.float64 + + if _is_numpy_namespace(xp): + cm = coo_matrix( + (sample_weight, (y_true, y_pred)), + shape=(n_labels, n_labels), + dtype=dtype, + ).toarray() + else: + cm = xp.zeros((n_labels, n_labels), dtype=dtype) + # that is probably not very performant? + for true, pred, weight in zip(y_true, y_pred, sample_weight): + cm[true, pred] += weight + # does only numpy warn for divisions by zero or do we have to handle warnings from + # other libraries as well? with np.errstate(all="ignore"): if normalize == "true": cm = cm / cm.sum(axis=1, keepdims=True) @@ -401,7 +417,17 @@ def confusion_matrix( cm = cm / cm.sum(axis=0, keepdims=True) elif normalize == "all": cm = cm / cm.sum() - cm = np.nan_to_num(cm) + + if xp.__name__ == "array_api_strict": + cm[xp.isnan(cm)] = 0 + if isinstance(cm.dtype, float): # type checking not working properly !!!! + cm[xp.isinf(cm) & (cm > 0)] = xp.finfo(cm.dtype).max + cm[xp.isinf(cm) & (cm < 0)] = xp.finfo(cm.dtype).min + elif isinstance(cm.dtype, int): + cm[xp.isinf(cm) & (cm > 0)] = xp.iinfo(cm.dtype).max + cm[xp.isinf(cm) & (cm < 0)] = xp.iinfo(cm.dtype).min + else: + cm = xp.nan_to_num(cm) if cm.shape == (1, 1): warnings.warn( diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 0b7a47b0f12da..fcc711f1d3124 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -2061,6 +2061,10 @@ def check_array_api_metric_pairwise(metric, array_namespace, device, dtype_name) check_array_api_multiclass_classification_metric, check_array_api_multilabel_classification_metric, ], + confusion_matrix: [ + check_array_api_binary_classification_metric, + check_array_api_multiclass_classification_metric, + ], f1_score: [ check_array_api_binary_classification_metric, check_array_api_multiclass_classification_metric, From 770e638ec2dda4f090c1a92f80178390b96fde73 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Sun, 8 Dec 2024 09:14:07 +0100 Subject: [PATCH 02/34] fix dtype checking --- sklearn/metrics/_classification.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 7c0d60fdbc4db..f5966af603188 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -351,16 +351,15 @@ def confusion_matrix( if n_labels == 0: raise ValueError("'labels' should contains at least one label.") elif y_true.size == 0: - return xp.zeros((n_labels, n_labels), dtype=int, device=device_) - # This is not tested other than for numpy; it seems xp.isin is not existing in - # array_api_compat: + return xp.zeros((n_labels, n_labels), dtype=xp.int64, device=device_) + # xp.isin is not existing in array_api_strict; not tested other than for numpy: elif not xp.isin(labels, y_true).any(): raise ValueError("At least one label specified must be in y_true") if sample_weight is None: - sample_weight = xp.ones(y_true.shape[0], dtype=xp.int64) + sample_weight = xp.ones(y_true.shape[0], dtype=xp.int64, device=device_) else: - sample_weight = xp.asarray(sample_weight) + sample_weight = xp.asarray(sample_weight, device=device_) check_consistent_length(y_true, y_pred, sample_weight) @@ -378,9 +377,14 @@ def confusion_matrix( and xp.min(y_pred) >= 0 ) if need_index_conversion: + # only tested for numpy so far: label_to_ind = {y: x for x, y in enumerate(labels)} - y_pred = xp.array([label_to_ind.get(x, n_labels + 1) for x in y_pred]) - y_true = xp.array([label_to_ind.get(x, n_labels + 1) for x in y_true]) + y_pred = xp.asarray( + [label_to_ind.get(x, n_labels + 1) for x in y_pred], device=device_ + ) + y_true = xp.asarray( + [label_to_ind.get(x, n_labels + 1) for x in y_true], device=device_ + ) # intersect y_pred, y_true with labels, eliminate items not in labels ind = xp.logical_and(y_pred < n_labels, y_true < n_labels) @@ -403,7 +407,7 @@ def confusion_matrix( dtype=dtype, ).toarray() else: - cm = xp.zeros((n_labels, n_labels), dtype=dtype) + cm = xp.zeros((n_labels, n_labels), dtype=dtype, device=device_) # that is probably not very performant? for true, pred, weight in zip(y_true, y_pred, sample_weight): cm[true, pred] += weight @@ -420,10 +424,10 @@ def confusion_matrix( if xp.__name__ == "array_api_strict": cm[xp.isnan(cm)] = 0 - if isinstance(cm.dtype, float): # type checking not working properly !!!! + if xp.isdtype(cm.dtype, "real floating"): cm[xp.isinf(cm) & (cm > 0)] = xp.finfo(cm.dtype).max cm[xp.isinf(cm) & (cm < 0)] = xp.finfo(cm.dtype).min - elif isinstance(cm.dtype, int): + else: # xp.isdtype(cm.dtype, "integral") cm[xp.isinf(cm) & (cm > 0)] = xp.iinfo(cm.dtype).max cm[xp.isinf(cm) & (cm < 0)] = xp.iinfo(cm.dtype).min else: From af440cab4e1143755edc54788f77695aba332206 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Mon, 9 Dec 2024 10:25:26 +0100 Subject: [PATCH 03/34] prepare for PR --- sklearn/metrics/_classification.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index f5966af603188..35b776685187c 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -337,8 +337,7 @@ def confusion_matrix( (np.int64(0), np.int64(2), np.int64(1), np.int64(1)) """ y_true, y_pred = attach_unique(y_true, y_pred) - xp, _ = get_namespace(y_true, y_pred, labels, sample_weight) - device_ = device(y_true, y_pred, labels, sample_weight) + xp, _, device_ = get_namespace_and_device(y_true, y_pred, labels, sample_weight) y_type, y_true, y_pred = _check_targets(y_true, y_pred) if y_type not in ("binary", "multiclass"): raise ValueError("%s is not supported" % y_type) @@ -352,7 +351,6 @@ def confusion_matrix( raise ValueError("'labels' should contains at least one label.") elif y_true.size == 0: return xp.zeros((n_labels, n_labels), dtype=xp.int64, device=device_) - # xp.isin is not existing in array_api_strict; not tested other than for numpy: elif not xp.isin(labels, y_true).any(): raise ValueError("At least one label specified must be in y_true") @@ -377,7 +375,6 @@ def confusion_matrix( and xp.min(y_pred) >= 0 ) if need_index_conversion: - # only tested for numpy so far: label_to_ind = {y: x for x, y in enumerate(labels)} y_pred = xp.asarray( [label_to_ind.get(x, n_labels + 1) for x in y_pred], device=device_ @@ -408,12 +405,9 @@ def confusion_matrix( ).toarray() else: cm = xp.zeros((n_labels, n_labels), dtype=dtype, device=device_) - # that is probably not very performant? for true, pred, weight in zip(y_true, y_pred, sample_weight): cm[true, pred] += weight - # does only numpy warn for divisions by zero or do we have to handle warnings from - # other libraries as well? with np.errstate(all="ignore"): if normalize == "true": cm = cm / cm.sum(axis=1, keepdims=True) From b45646e39b183134ed3da75213397f880575fbad Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Mon, 9 Dec 2024 10:49:51 +0100 Subject: [PATCH 04/34] change log --- doc/whats_new/upcoming_changes/array-api/30440.feature.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 doc/whats_new/upcoming_changes/array-api/30440.feature.rst diff --git a/doc/whats_new/upcoming_changes/array-api/30440.feature.rst b/doc/whats_new/upcoming_changes/array-api/30440.feature.rst new file mode 100644 index 0000000000000..d1f1374f28577 --- /dev/null +++ b/doc/whats_new/upcoming_changes/array-api/30440.feature.rst @@ -0,0 +1,2 @@ +- :func:`sklearn.metrics.confusion_matrix` now supports Array API compatible inputs. + by :user:`Stefanie Senger ` From 3db7054e0a66540efd65c7dd5cf9b5977f09f43d Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Mon, 9 Dec 2024 11:50:51 +0100 Subject: [PATCH 05/34] use our _isin --- sklearn/metrics/_classification.py | 3 ++- sklearn/utils/_array_api.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 35b776685187c..14cb175d2c1a3 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -32,6 +32,7 @@ _count_nonzero, _find_matching_floating_dtype, _is_numpy_namespace, + _isin, _searchsorted, _setdiff1d, _tolist, @@ -351,7 +352,7 @@ def confusion_matrix( raise ValueError("'labels' should contains at least one label.") elif y_true.size == 0: return xp.zeros((n_labels, n_labels), dtype=xp.int64, device=device_) - elif not xp.isin(labels, y_true).any(): + elif not _isin(labels, y_true, xp=xp).any(): raise ValueError("At least one label specified must be in y_true") if sample_weight is None: diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index b2b4f88fa218f..48e7959d0fb63 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -854,7 +854,7 @@ def _ravel(array, xp=None): def _convert_to_numpy(array, xp): - """Convert X into a NumPy ndarray on the CPU.""" + """Convert array into a NumPy ndarray on the CPU.""" xp_name = xp.__name__ if xp_name in {"array_api_compat.torch", "torch"}: From abab5ead6d77d52d2c17e7820d1503ccf0fa40db Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Tue, 10 Dec 2024 14:40:35 +0100 Subject: [PATCH 06/34] changes after review --- sklearn/metrics/_classification.py | 20 ++++---------------- sklearn/utils/_array_api.py | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 14cb175d2c1a3..e81aadee9af12 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -40,6 +40,7 @@ device, get_namespace, get_namespace_and_device, + size, ) from ..utils._param_validation import ( Hidden, @@ -362,11 +363,7 @@ def confusion_matrix( check_consistent_length(y_true, y_pred, sample_weight) - # TODO: remove condition when torch supports the size attribute - if xp.__name__ == "array_api_compat.torch": - n_labels = xp.size(labels) - else: - n_labels = labels.size + n_labels = size(labels) # If labels are not consecutive integers starting from zero, then # y_true and y_pred must be converted into index form need_index_conversion = not ( @@ -416,17 +413,8 @@ def confusion_matrix( cm = cm / cm.sum(axis=0, keepdims=True) elif normalize == "all": cm = cm / cm.sum() - - if xp.__name__ == "array_api_strict": - cm[xp.isnan(cm)] = 0 - if xp.isdtype(cm.dtype, "real floating"): - cm[xp.isinf(cm) & (cm > 0)] = xp.finfo(cm.dtype).max - cm[xp.isinf(cm) & (cm < 0)] = xp.finfo(cm.dtype).min - else: # xp.isdtype(cm.dtype, "integral") - cm[xp.isinf(cm) & (cm > 0)] = xp.iinfo(cm.dtype).max - cm[xp.isinf(cm) & (cm < 0)] = xp.iinfo(cm.dtype).min - else: - cm = xp.nan_to_num(cm) + # cm = _nan_to_num(cm) + cm = xp.nan_to_num(cm) if cm.shape == (1, 1): warnings.warn( diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 48e7959d0fb63..acdfb5e98eb8e 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -1101,3 +1101,21 @@ def _tolist(array, xp=None): return array.tolist() array_np = _convert_to_numpy(array, xp=xp) return [element.item() for element in array_np] + + +def _nan_to_num(array, xp=None): + """Substitutes NaN values with 0 and inf values with the maximum or minimum + numbers available for the dtype respectively; like np.nan_to_num.""" + if xp is None: + xp, _ = get_namespace(array, xp=xp) + try: + array = xp.nan_to_num(array) + except AttributeError: # currently catching exceptions from array_api_strict + array[xp.isnan(array)] = 0 + if xp.isdtype(array.dtype, "real floating"): + array[xp.isinf(array) & (array > 0)] = xp.finfo(array.dtype).max + array[xp.isinf(array) & (array < 0)] = xp.finfo(array.dtype).min + else: # xp.isdtype(array.dtype, "integral") + array[xp.isinf(array) & (array > 0)] = xp.iinfo(array.dtype).max + array[xp.isinf(array) & (array < 0)] = xp.iinfo(array.dtype).min + return array From abc39818383897bef2c8c4247b582ada3c865845 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Tue, 10 Dec 2024 14:46:11 +0100 Subject: [PATCH 07/34] forgot to push that before --- sklearn/metrics/_classification.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index e81aadee9af12..dcbb3b3db0e72 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -33,6 +33,7 @@ _find_matching_floating_dtype, _is_numpy_namespace, _isin, + _nan_to_num, _searchsorted, _setdiff1d, _tolist, @@ -413,8 +414,7 @@ def confusion_matrix( cm = cm / cm.sum(axis=0, keepdims=True) elif normalize == "all": cm = cm / cm.sum() - # cm = _nan_to_num(cm) - cm = xp.nan_to_num(cm) + cm = _nan_to_num(cm) if cm.shape == (1, 1): warnings.warn( From 09cec5d72b6286530c957f771902ba6953d30fa6 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Wed, 11 Dec 2024 15:42:10 +0100 Subject: [PATCH 08/34] add test --- sklearn/metrics/_classification.py | 4 ++-- sklearn/metrics/tests/test_classification.py | 18 ++++++++++++++++++ sklearn/utils/_array_api.py | 7 +++---- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index dcbb3b3db0e72..5df10234805c2 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -277,7 +277,7 @@ def confusion_matrix( y_pred : array-like of shape (n_samples,) Estimated targets as returned by a classifier. - labels : array-like of shape (n_classes), default=None + labels : array-like of shape (n_classes,), default=None List of labels to index the matrix. This may be used to reorder or select a subset of labels. If ``None`` is given, those that appear at least once @@ -374,7 +374,7 @@ def confusion_matrix( and xp.min(y_pred) >= 0 ) if need_index_conversion: - label_to_ind = {y: x for x, y in enumerate(labels)} + label_to_ind = {entry: idx for idx, entry in enumerate(labels)} y_pred = xp.asarray( [label_to_ind.get(x, n_labels + 1) for x in y_pred], device=device_ ) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 0e69719da1445..bb245d5b33d71 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -10,6 +10,7 @@ from scipy.stats import bernoulli from sklearn import datasets, svm +from sklearn.base import config_context from sklearn.datasets import make_multilabel_classification from sklearn.exceptions import UndefinedMetricWarning from sklearn.metrics import ( @@ -39,8 +40,10 @@ from sklearn.model_selection import cross_val_score from sklearn.preprocessing import LabelBinarizer, label_binarize from sklearn.tree import DecisionTreeClassifier +from sklearn.utils._array_api import yield_namespace_device_dtype_combinations from sklearn.utils._mocking import MockDataFrame from sklearn.utils._testing import ( + _array_api_for_tests, assert_allclose, assert_almost_equal, assert_array_almost_equal, @@ -3095,3 +3098,18 @@ def test_d2_log_loss_score_raises(): err = "The labels array needs to contain at least two" with pytest.raises(ValueError, match=err): d2_log_loss_score(y_true, y_pred, labels=labels) + + +@pytest.mark.parametrize( + "array_namespace, device, _", yield_namespace_device_dtype_combinations() +) +def test_confusion_matrix_array_api(array_namespace, device, _): + """Test that confusion_matrix works for all array types index conversion is done + and that it raises if not at least one label from `y_pred` is in `y_true`.""" + xp = _array_api_for_tests(array_namespace, device) + + y_true = xp.asarray([1, 2, 3], device=device) + y_pred = xp.asarray([4, 5, 6], device=device) + + with config_context(array_api_dispatch=True): + confusion_matrix(y_true, y_pred) diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index acdfb5e98eb8e..3ff5e25f3a97e 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -1104,10 +1104,9 @@ def _tolist(array, xp=None): def _nan_to_num(array, xp=None): - """Substitutes NaN values with 0 and inf values with the maximum or minimum - numbers available for the dtype respectively; like np.nan_to_num.""" - if xp is None: - xp, _ = get_namespace(array, xp=xp) + """Substitutes NaN values of an array with 0 and inf values with the maximum or + minimum numbers available for the dtype respectively; like np.nan_to_num.""" + xp, _ = get_namespace(array, xp=xp) try: array = xp.nan_to_num(array) except AttributeError: # currently catching exceptions from array_api_strict From fdb25f6bfd55fedb52548c6ba8ae946ff220fe45 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Thu, 12 Dec 2024 01:36:20 +0100 Subject: [PATCH 09/34] fix sclar dtype --- sklearn/metrics/_classification.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 5df10234805c2..aef25578df9f0 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -374,12 +374,19 @@ def confusion_matrix( and xp.min(y_pred) >= 0 ) if need_index_conversion: - label_to_ind = {entry: idx for idx, entry in enumerate(labels)} + # convert 0D array into scalar type, see https://github.com/data-apis/array-api-strict/issues/109: + if xp.isdtype(labels.dtype, ("real floating")): + scalar_dtype = float + else: + scalar_dtype = str + label_to_ind = {scalar_dtype(entry): idx for idx, entry in enumerate(labels)} y_pred = xp.asarray( - [label_to_ind.get(x, n_labels + 1) for x in y_pred], device=device_ + [label_to_ind.get(scalar_dtype(x), n_labels + 1) for x in y_pred], + device=device_, ) y_true = xp.asarray( - [label_to_ind.get(x, n_labels + 1) for x in y_true], device=device_ + [label_to_ind.get(scalar_dtype(x), n_labels + 1) for x in y_true], + device=device_, ) # intersect y_pred, y_true with labels, eliminate items not in labels From 49f75b7d8b997f60e3c4ebdb6e84c525de12d5eb Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Thu, 12 Dec 2024 01:48:29 +0100 Subject: [PATCH 10/34] fix typos --- sklearn/metrics/_classification.py | 2 +- sklearn/metrics/tests/test_classification.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index aef25578df9f0..f9caf087cd773 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -351,7 +351,7 @@ def confusion_matrix( labels = xp.asarray(labels) n_labels = labels.size if n_labels == 0: - raise ValueError("'labels' should contains at least one label.") + raise ValueError("'labels' should contain at least one label.") elif y_true.size == 0: return xp.zeros((n_labels, n_labels), dtype=xp.int64, device=device_) elif not _isin(labels, y_true, xp=xp).any(): diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index bb245d5b33d71..a9f3b9a268864 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -1145,7 +1145,7 @@ def test_confusion_matrix_multiclass_subset_labels(): @pytest.mark.parametrize( "labels, err_msg", [ - ([], "'labels' should contains at least one label."), + ([], "'labels' should contain at least one label."), ([3, 4], "At least one label specified must be in y_true"), ], ids=["empty list", "unknown labels"], @@ -3104,8 +3104,9 @@ def test_d2_log_loss_score_raises(): "array_namespace, device, _", yield_namespace_device_dtype_combinations() ) def test_confusion_matrix_array_api(array_namespace, device, _): - """Test that confusion_matrix works for all array types index conversion is done - and that it raises if not at least one label from `y_pred` is in `y_true`.""" + """Test that `confusion_matrix` works for all array types if need_index_conversion + evaluates to `True`and that it raises if not at least one label from `y_pred` is in + `y_true`.""" xp = _array_api_for_tests(array_namespace, device) y_true = xp.asarray([1, 2, 3], device=device) From 914bb630255991e20bac3063802c863dc1b7a910 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Wed, 18 Dec 2024 12:33:54 +0100 Subject: [PATCH 11/34] convert_to_numpy and coo_matrix instead of python loop --- sklearn/metrics/_classification.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index f9caf087cd773..154e2bae38ee7 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -29,6 +29,7 @@ from ..utils._array_api import ( _average, _bincount, + _convert_to_numpy, _count_nonzero, _find_matching_floating_dtype, _is_numpy_namespace, @@ -399,20 +400,18 @@ def confusion_matrix( # Choose the accumulator dtype to always have high precision if xp.isdtype(sample_weight.dtype, ("signed integer", "unsigned integer", "bool")): - dtype = xp.int64 + dtype = np.int64 else: - dtype = xp.float64 - - if _is_numpy_namespace(xp): - cm = coo_matrix( - (sample_weight, (y_true, y_pred)), - shape=(n_labels, n_labels), - dtype=dtype, - ).toarray() - else: - cm = xp.zeros((n_labels, n_labels), dtype=dtype, device=device_) - for true, pred, weight in zip(y_true, y_pred, sample_weight): - cm[true, pred] += weight + dtype = np.float64 + cm = coo_matrix( + ( + _convert_to_numpy(sample_weight, xp=xp), + (_convert_to_numpy(y_true, xp=xp), _convert_to_numpy(y_pred, xp=xp)), + ), + shape=(n_labels, n_labels), + dtype=dtype, + ).toarray() + cm = xp.asarray(cm) with np.errstate(all="ignore"): if normalize == "true": From 6da1d0698550b389fa2b91dc507ac37e87b96a94 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Mon, 23 Dec 2024 11:13:04 +0100 Subject: [PATCH 12/34] experiment with convert_to_numpy --- sklearn/metrics/_classification.py | 60 +++++++++++++----------------- 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 154e2bae38ee7..1756c66ed3dec 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -34,7 +34,6 @@ _find_matching_floating_dtype, _is_numpy_namespace, _isin, - _nan_to_num, _searchsorted, _setdiff1d, _tolist, @@ -42,7 +41,6 @@ device, get_namespace, get_namespace_and_device, - size, ) from ..utils._param_validation import ( Hidden, @@ -342,6 +340,8 @@ def confusion_matrix( """ y_true, y_pred = attach_unique(y_true, y_pred) xp, _, device_ = get_namespace_and_device(y_true, y_pred, labels, sample_weight) + y_true = _convert_to_numpy(y_true, xp) + y_pred = _convert_to_numpy(y_pred, xp) y_type, y_true, y_pred = _check_targets(y_true, y_pred) if y_type not in ("binary", "multiclass"): raise ValueError("%s is not supported" % y_type) @@ -349,69 +349,59 @@ def confusion_matrix( if labels is None: labels = unique_labels(y_true, y_pred) else: - labels = xp.asarray(labels) + labels = np.asarray(labels) n_labels = labels.size if n_labels == 0: raise ValueError("'labels' should contain at least one label.") elif y_true.size == 0: - return xp.zeros((n_labels, n_labels), dtype=xp.int64, device=device_) + return np.zeros((n_labels, n_labels), dtype=int) elif not _isin(labels, y_true, xp=xp).any(): raise ValueError("At least one label specified must be in y_true") + if not _is_numpy_namespace(get_namespace(labels)[0]): + labels = _convert_to_numpy(labels, xp) if sample_weight is None: - sample_weight = xp.ones(y_true.shape[0], dtype=xp.int64, device=device_) + sample_weight = np.ones(y_true.shape[0], dtype=np.int64) else: - sample_weight = xp.asarray(sample_weight, device=device_) + sample_weight = np.asarray(sample_weight) + if not _is_numpy_namespace(get_namespace(sample_weight)[0]): + sample_weight = _convert_to_numpy(sample_weight, xp) check_consistent_length(y_true, y_pred, sample_weight) - n_labels = size(labels) + n_labels = labels.size # If labels are not consecutive integers starting from zero, then # y_true and y_pred must be converted into index form need_index_conversion = not ( - xp.isdtype(labels.dtype, ("signed integer", "unsigned integer", "bool")) - and xp.all(labels == xp.arange(n_labels, device=device_)) - and xp.min(y_true) >= 0 - and xp.min(y_pred) >= 0 + labels.dtype.kind in {"i", "u", "b"} + and np.all(labels == np.arange(n_labels)) + and y_true.min() >= 0 + and y_pred.min() >= 0 ) if need_index_conversion: - # convert 0D array into scalar type, see https://github.com/data-apis/array-api-strict/issues/109: - if xp.isdtype(labels.dtype, ("real floating")): - scalar_dtype = float - else: - scalar_dtype = str - label_to_ind = {scalar_dtype(entry): idx for idx, entry in enumerate(labels)} - y_pred = xp.asarray( - [label_to_ind.get(scalar_dtype(x), n_labels + 1) for x in y_pred], - device=device_, - ) - y_true = xp.asarray( - [label_to_ind.get(scalar_dtype(x), n_labels + 1) for x in y_true], - device=device_, - ) + label_to_ind = {y: x for x, y in enumerate(labels)} + y_pred = np.array([label_to_ind.get(x, n_labels + 1) for x in y_pred]) + y_true = np.array([label_to_ind.get(x, n_labels + 1) for x in y_true]) # intersect y_pred, y_true with labels, eliminate items not in labels - ind = xp.logical_and(y_pred < n_labels, y_true < n_labels) - if not xp.all(ind): + ind = np.logical_and(y_pred < n_labels, y_true < n_labels) + if not np.all(ind): y_pred = y_pred[ind] y_true = y_true[ind] # also eliminate weights of eliminated items sample_weight = sample_weight[ind] # Choose the accumulator dtype to always have high precision - if xp.isdtype(sample_weight.dtype, ("signed integer", "unsigned integer", "bool")): + if sample_weight.dtype.kind in {"i", "u", "b"}: dtype = np.int64 else: dtype = np.float64 + cm = coo_matrix( - ( - _convert_to_numpy(sample_weight, xp=xp), - (_convert_to_numpy(y_true, xp=xp), _convert_to_numpy(y_pred, xp=xp)), - ), + (sample_weight, (y_true, y_pred)), shape=(n_labels, n_labels), dtype=dtype, ).toarray() - cm = xp.asarray(cm) with np.errstate(all="ignore"): if normalize == "true": @@ -420,7 +410,7 @@ def confusion_matrix( cm = cm / cm.sum(axis=0, keepdims=True) elif normalize == "all": cm = cm / cm.sum() - cm = _nan_to_num(cm) + cm = np.nan_to_num(cm) if cm.shape == (1, 1): warnings.warn( @@ -432,7 +422,7 @@ def confusion_matrix( UserWarning, ) - return cm + return xp.asarray(cm, device=device_) @validate_params( From 1f23f634d55a9f9b7182584ddb6e6afd37548d98 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Mon, 23 Dec 2024 11:51:46 +0100 Subject: [PATCH 13/34] np.intersect1d can stay as it is --- sklearn/metrics/_classification.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 1756c66ed3dec..ab342b45a2a8f 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -33,7 +33,6 @@ _count_nonzero, _find_matching_floating_dtype, _is_numpy_namespace, - _isin, _searchsorted, _setdiff1d, _tolist, @@ -355,7 +354,7 @@ def confusion_matrix( raise ValueError("'labels' should contain at least one label.") elif y_true.size == 0: return np.zeros((n_labels, n_labels), dtype=int) - elif not _isin(labels, y_true, xp=xp).any(): + elif len(np.intersect1d(y_true, labels)) == 0: raise ValueError("At least one label specified must be in y_true") if not _is_numpy_namespace(get_namespace(labels)[0]): labels = _convert_to_numpy(labels, xp) From 6a43bc3bdde8419c357f99e2032db315f7e6e6f9 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Mon, 30 Dec 2024 10:55:21 +0100 Subject: [PATCH 14/34] return cm as numpy array --- sklearn/metrics/_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index ab342b45a2a8f..89f257d50834f 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -421,7 +421,7 @@ def confusion_matrix( UserWarning, ) - return xp.asarray(cm, device=device_) + return cm @validate_params( From 2000a0096b5e73090efe4aed6681b7844a93217e Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Mon, 30 Dec 2024 11:10:08 +0100 Subject: [PATCH 15/34] move attach unique to after conversion to numpy --- .../array-api/{30440.feature.rst => 30562.feature.rst} | 2 +- sklearn/metrics/_classification.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename doc/whats_new/upcoming_changes/array-api/{30440.feature.rst => 30562.feature.rst} (64%) diff --git a/doc/whats_new/upcoming_changes/array-api/30440.feature.rst b/doc/whats_new/upcoming_changes/array-api/30562.feature.rst similarity index 64% rename from doc/whats_new/upcoming_changes/array-api/30440.feature.rst rename to doc/whats_new/upcoming_changes/array-api/30562.feature.rst index d1f1374f28577..3c1a58d90bfe5 100644 --- a/doc/whats_new/upcoming_changes/array-api/30440.feature.rst +++ b/doc/whats_new/upcoming_changes/array-api/30562.feature.rst @@ -1,2 +1,2 @@ - :func:`sklearn.metrics.confusion_matrix` now supports Array API compatible inputs. - by :user:`Stefanie Senger ` + By :user:`Stefanie Senger ` diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 89f257d50834f..600a40490e922 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -337,10 +337,10 @@ def confusion_matrix( >>> (tn, fp, fn, tp) (np.int64(0), np.int64(2), np.int64(1), np.int64(1)) """ - y_true, y_pred = attach_unique(y_true, y_pred) xp, _, device_ = get_namespace_and_device(y_true, y_pred, labels, sample_weight) y_true = _convert_to_numpy(y_true, xp) y_pred = _convert_to_numpy(y_pred, xp) + y_true, y_pred = attach_unique(y_true, y_pred) y_type, y_true, y_pred = _check_targets(y_true, y_pred) if y_type not in ("binary", "multiclass"): raise ValueError("%s is not supported" % y_type) From 5963e0fa6a971521179f326828ed8bcf58020d67 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Mon, 30 Dec 2024 12:28:26 +0100 Subject: [PATCH 16/34] adjust test --- sklearn/metrics/tests/test_classification.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index a9f3b9a268864..638554dca66f7 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -40,7 +40,11 @@ from sklearn.model_selection import cross_val_score from sklearn.preprocessing import LabelBinarizer, label_binarize from sklearn.tree import DecisionTreeClassifier -from sklearn.utils._array_api import yield_namespace_device_dtype_combinations +from sklearn.utils._array_api import ( + _is_numpy_namespace, + get_namespace, + yield_namespace_device_dtype_combinations, +) from sklearn.utils._mocking import MockDataFrame from sklearn.utils._testing import ( _array_api_for_tests, @@ -3113,4 +3117,11 @@ def test_confusion_matrix_array_api(array_namespace, device, _): y_pred = xp.asarray([4, 5, 6], device=device) with config_context(array_api_dispatch=True): - confusion_matrix(y_true, y_pred) + result = confusion_matrix(y_true, y_pred) + xp_result, _ = get_namespace(result) + assert _is_numpy_namespace(xp_result) + + # Since the computation always happens with NumPy / SciPy on the CPU, this + # function is expected to return an array allocated on the CPU even when it does + # not match the input array's device. + assert result.device == "cpu" From ef84e042650c4d461a8e497766607e86bf8899c9 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Mon, 30 Dec 2024 12:46:08 +0100 Subject: [PATCH 17/34] document return array type --- sklearn/metrics/_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 600a40490e922..b923f40b556ae 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -293,7 +293,7 @@ def confusion_matrix( Returns ------- - C : array of shape (n_classes, n_classes) + C : ndarray of shape (n_classes, n_classes) Confusion matrix whose i-th row and j-th column entry indicates the number of samples with true label being i-th class From 1cf525ecb217436537c6657de63f1ebec33cbf3c Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Thu, 2 Jan 2025 09:36:33 +0100 Subject: [PATCH 18/34] use get_namespace --- sklearn/metrics/_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index b923f40b556ae..4b0b9eced0e7a 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -337,7 +337,7 @@ def confusion_matrix( >>> (tn, fp, fn, tp) (np.int64(0), np.int64(2), np.int64(1), np.int64(1)) """ - xp, _, device_ = get_namespace_and_device(y_true, y_pred, labels, sample_weight) + xp, _ = get_namespace(y_true, y_pred, labels, sample_weight) y_true = _convert_to_numpy(y_true, xp) y_pred = _convert_to_numpy(y_pred, xp) y_true, y_pred = attach_unique(y_true, y_pred) From f50f3eae63101a2e687811149cb5312c06a5b0b0 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Thu, 2 Jan 2025 15:23:13 +0100 Subject: [PATCH 19/34] fix issue with nullable dtypes with pandas==1.1.5 --- sklearn/utils/_array_api.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index c98f9275de3ff..cadcc1fb8f684 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -9,6 +9,7 @@ from functools import wraps import numpy +import pandas as pd import scipy import scipy.sparse as sp import scipy.special as special @@ -868,10 +869,29 @@ def _convert_to_numpy(array, xp): return array.cpu().numpy() elif xp_name in {"array_api_compat.cupy", "cupy"}: # pragma: nocover return array.get() - + if isinstance(array, pd.Series) and isinstance( + array.dtype, pd.api.extensions.ExtensionDtype + ): + array = convert_pandas_nullable_dtypes(array) return numpy.asarray(array) +# TODO: remove when minimum pandas version is pandas==1.2.0, when +# `numpy.asarray(pd.Series)` with nullable dtypes no longer returns nd.arrays with +# `object` dtypes: +def convert_pandas_nullable_dtypes(pandas_series): + """Convert from pandas nullable extension dtypes to numpy dtypes. Without this + conversion, numpy.asarray(array) creates a numpy array with dtype `object` for older + pandas versions. + """ + dtype_mapping = { + **{f"pd.Int{x}Dtype()": f"int{x}" for x in [8, 16, 32, 64]}, + **{f"pd.Float{x}Dtype()": f"float{x}" for x in [32, 64]}, + "pd.BooleanDtype()": "bool", + } + return pandas_series.astype(dtype_mapping.get(pandas_series.dtype), None) + + def _estimator_with_converted_arrays(estimator, converter): """Create new estimator which converting all attributes that are arrays. From 84038e428d619cda4e5cbcc8f436f3692a89a966 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Thu, 2 Jan 2025 15:40:59 +0100 Subject: [PATCH 20/34] private function --- sklearn/utils/_array_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index cadcc1fb8f684..197175098e87e 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -872,14 +872,14 @@ def _convert_to_numpy(array, xp): if isinstance(array, pd.Series) and isinstance( array.dtype, pd.api.extensions.ExtensionDtype ): - array = convert_pandas_nullable_dtypes(array) + array = _convert_pandas_nullable_dtypes(array) return numpy.asarray(array) # TODO: remove when minimum pandas version is pandas==1.2.0, when # `numpy.asarray(pd.Series)` with nullable dtypes no longer returns nd.arrays with # `object` dtypes: -def convert_pandas_nullable_dtypes(pandas_series): +def _convert_pandas_nullable_dtypes(pandas_series): """Convert from pandas nullable extension dtypes to numpy dtypes. Without this conversion, numpy.asarray(array) creates a numpy array with dtype `object` for older pandas versions. From b47fdc7c5d3a98d0153fd7df1e1365862f74444c Mon Sep 17 00:00:00 2001 From: Stefanie Senger <91849487+StefanieSenger@users.noreply.github.com> Date: Fri, 3 Jan 2025 09:24:01 +0100 Subject: [PATCH 21/34] Update sklearn/metrics/_classification.py Co-authored-by: Virgil Chan --- sklearn/metrics/_classification.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 4b0b9eced0e7a..64d68e60c07f4 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -362,8 +362,6 @@ def confusion_matrix( if sample_weight is None: sample_weight = np.ones(y_true.shape[0], dtype=np.int64) else: - sample_weight = np.asarray(sample_weight) - if not _is_numpy_namespace(get_namespace(sample_weight)[0]): sample_weight = _convert_to_numpy(sample_weight, xp) check_consistent_length(y_true, y_pred, sample_weight) From 1abc30831cfb69e007ae897e07200a2aa8e252bf Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Fri, 3 Jan 2025 10:24:15 +0100 Subject: [PATCH 22/34] fix tests when pandas not installed --- sklearn/utils/_array_api.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 197175098e87e..0e0ff15614fe5 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -9,7 +9,7 @@ from functools import wraps import numpy -import pandas as pd +import pytest import scipy import scipy.sparse as sp import scipy.special as special @@ -865,6 +865,8 @@ def _convert_to_numpy(array, xp): """Convert array into a NumPy ndarray on the CPU.""" xp_name = xp.__name__ + pd = pytest.importorskip("pandas") + if xp_name in {"array_api_compat.torch", "torch"}: return array.cpu().numpy() elif xp_name in {"array_api_compat.cupy", "cupy"}: # pragma: nocover @@ -884,6 +886,7 @@ def _convert_pandas_nullable_dtypes(pandas_series): conversion, numpy.asarray(array) creates a numpy array with dtype `object` for older pandas versions. """ + pd = pytest.importorskip("pandas") dtype_mapping = { **{f"pd.Int{x}Dtype()": f"int{x}" for x in [8, 16, 32, 64]}, **{f"pd.Float{x}Dtype()": f"float{x}" for x in [32, 64]}, From 7325cdf8a1d8088fab2a350c253b16ec7dedcc80 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Fri, 3 Jan 2025 15:13:06 +0100 Subject: [PATCH 23/34] better fix for environments without pandas --- sklearn/utils/_array_api.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 0e0ff15614fe5..fa4f25b4d1250 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -9,7 +9,6 @@ from functools import wraps import numpy -import pytest import scipy import scipy.sparse as sp import scipy.special as special @@ -865,14 +864,19 @@ def _convert_to_numpy(array, xp): """Convert array into a NumPy ndarray on the CPU.""" xp_name = xp.__name__ - pd = pytest.importorskip("pandas") + try: + import pandas as pd + except ImportError: + pd = None if xp_name in {"array_api_compat.torch", "torch"}: return array.cpu().numpy() elif xp_name in {"array_api_compat.cupy", "cupy"}: # pragma: nocover return array.get() - if isinstance(array, pd.Series) and isinstance( - array.dtype, pd.api.extensions.ExtensionDtype + if ( + pd + and isinstance(array, pd.Series) + and isinstance(array.dtype, pd.api.extensions.ExtensionDtype) ): array = _convert_pandas_nullable_dtypes(array) return numpy.asarray(array) @@ -886,7 +890,6 @@ def _convert_pandas_nullable_dtypes(pandas_series): conversion, numpy.asarray(array) creates a numpy array with dtype `object` for older pandas versions. """ - pd = pytest.importorskip("pandas") dtype_mapping = { **{f"pd.Int{x}Dtype()": f"int{x}" for x in [8, 16, 32, 64]}, **{f"pd.Float{x}Dtype()": f"float{x}" for x in [32, 64]}, From ba06676378dda49dcf9802278b391a8c5cc0b5ee Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Thu, 9 Jan 2025 15:16:06 +0100 Subject: [PATCH 24/34] remove _nan_to_num --- sklearn/utils/_array_api.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index fa4f25b4d1250..93177dabf201f 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -1134,20 +1134,3 @@ def _tolist(array, xp=None): return array.tolist() array_np = _convert_to_numpy(array, xp=xp) return [element.item() for element in array_np] - - -def _nan_to_num(array, xp=None): - """Substitutes NaN values of an array with 0 and inf values with the maximum or - minimum numbers available for the dtype respectively; like np.nan_to_num.""" - xp, _ = get_namespace(array, xp=xp) - try: - array = xp.nan_to_num(array) - except AttributeError: # currently catching exceptions from array_api_strict - array[xp.isnan(array)] = 0 - if xp.isdtype(array.dtype, "real floating"): - array[xp.isinf(array) & (array > 0)] = xp.finfo(array.dtype).max - array[xp.isinf(array) & (array < 0)] = xp.finfo(array.dtype).min - else: # xp.isdtype(array.dtype, "integral") - array[xp.isinf(array) & (array > 0)] = xp.iinfo(array.dtype).max - array[xp.isinf(array) & (array < 0)] = xp.iinfo(array.dtype).min - return array From 5b21ad6443f50cce86441b1ae11f839955cf4f87 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Mon, 13 Jan 2025 12:29:25 +0100 Subject: [PATCH 25/34] remove handling for pandas < 1.2.0 --- sklearn/utils/_array_api.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 93177dabf201f..3b3fb42c0fdfe 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -873,31 +873,9 @@ def _convert_to_numpy(array, xp): return array.cpu().numpy() elif xp_name in {"array_api_compat.cupy", "cupy"}: # pragma: nocover return array.get() - if ( - pd - and isinstance(array, pd.Series) - and isinstance(array.dtype, pd.api.extensions.ExtensionDtype) - ): - array = _convert_pandas_nullable_dtypes(array) return numpy.asarray(array) -# TODO: remove when minimum pandas version is pandas==1.2.0, when -# `numpy.asarray(pd.Series)` with nullable dtypes no longer returns nd.arrays with -# `object` dtypes: -def _convert_pandas_nullable_dtypes(pandas_series): - """Convert from pandas nullable extension dtypes to numpy dtypes. Without this - conversion, numpy.asarray(array) creates a numpy array with dtype `object` for older - pandas versions. - """ - dtype_mapping = { - **{f"pd.Int{x}Dtype()": f"int{x}" for x in [8, 16, 32, 64]}, - **{f"pd.Float{x}Dtype()": f"float{x}" for x in [32, 64]}, - "pd.BooleanDtype()": "bool", - } - return pandas_series.astype(dtype_mapping.get(pandas_series.dtype), None) - - def _estimator_with_converted_arrays(estimator, converter): """Create new estimator which converting all attributes that are arrays. From 6776a112119fbb22f62af3707da3b5bc1cdbf07f Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Mon, 13 Jan 2025 12:32:46 +0100 Subject: [PATCH 26/34] remove handling of pandas if pandas not installed --- sklearn/utils/_array_api.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 3b3fb42c0fdfe..4996e2818a623 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -864,15 +864,11 @@ def _convert_to_numpy(array, xp): """Convert array into a NumPy ndarray on the CPU.""" xp_name = xp.__name__ - try: - import pandas as pd - except ImportError: - pd = None - if xp_name in {"array_api_compat.torch", "torch"}: return array.cpu().numpy() elif xp_name in {"array_api_compat.cupy", "cupy"}: # pragma: nocover return array.get() + return numpy.asarray(array) From fa7564d7c4a6dcb0f6c6a74550e9f52eb82387d0 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Thu, 30 Jan 2025 14:24:37 +0100 Subject: [PATCH 27/34] use check_array for handling pandas extension dtypes --- sklearn/metrics/_classification.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index ce156be5644fa..2bb05f4301270 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -339,6 +339,8 @@ def confusion_matrix( (np.int64(0), np.int64(2), np.int64(1), np.int64(1)) """ xp, _ = get_namespace(y_true, y_pred, labels, sample_weight) + y_true = check_array(y_true, dtype=None, ensure_2d=False, ensure_min_samples=0) + y_pred = check_array(y_pred, dtype=None, ensure_2d=False, ensure_min_samples=0) y_true = _convert_to_numpy(y_true, xp) y_pred = _convert_to_numpy(y_pred, xp) y_true, y_pred = attach_unique(y_true, y_pred) From 6ee6afc39a470fc18f505be7ed7745e4ce574e44 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Thu, 30 Jan 2025 14:43:50 +0100 Subject: [PATCH 28/34] ensure_all_finite=False --- sklearn/metrics/_classification.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 2bb05f4301270..0c0edaac6dcb3 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -339,8 +339,20 @@ def confusion_matrix( (np.int64(0), np.int64(2), np.int64(1), np.int64(1)) """ xp, _ = get_namespace(y_true, y_pred, labels, sample_weight) - y_true = check_array(y_true, dtype=None, ensure_2d=False, ensure_min_samples=0) - y_pred = check_array(y_pred, dtype=None, ensure_2d=False, ensure_min_samples=0) + y_true = check_array( + y_true, + dtype=None, + ensure_2d=False, + ensure_all_finite=False, + ensure_min_samples=0, + ) + y_pred = check_array( + y_pred, + dtype=None, + ensure_2d=False, + ensure_all_finite=False, + ensure_min_samples=0, + ) y_true = _convert_to_numpy(y_true, xp) y_pred = _convert_to_numpy(y_pred, xp) y_true, y_pred = attach_unique(y_true, y_pred) From 32ea61e3947805160c16985a76c2b5ac9f57a808 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Fri, 31 Jan 2025 12:12:35 +0100 Subject: [PATCH 29/34] add label passing to test to archive CodeCov --- sklearn/metrics/_classification.py | 6 +++--- sklearn/metrics/tests/test_classification.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 0c0edaac6dcb3..a33ecdd494495 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -391,9 +391,9 @@ def confusion_matrix( and y_pred.min() >= 0 ) if need_index_conversion: - label_to_ind = {y: x for x, y in enumerate(labels)} - y_pred = np.array([label_to_ind.get(x, n_labels + 1) for x in y_pred]) - y_true = np.array([label_to_ind.get(x, n_labels + 1) for x in y_true]) + label_to_ind = {label: index for index, label in enumerate(labels)} + y_pred = np.array([label_to_ind.get(index, n_labels + 1) for index in y_pred]) + y_true = np.array([label_to_ind.get(index, n_labels + 1) for index in y_true]) # intersect y_pred, y_true with labels, eliminate items not in labels ind = np.logical_and(y_pred < n_labels, y_true < n_labels) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 638554dca66f7..ae826108abdc1 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -3109,15 +3109,15 @@ def test_d2_log_loss_score_raises(): ) def test_confusion_matrix_array_api(array_namespace, device, _): """Test that `confusion_matrix` works for all array types if need_index_conversion - evaluates to `True`and that it raises if not at least one label from `y_pred` is in - `y_true`.""" + evaluates to `True` and with labels passed.""" xp = _array_api_for_tests(array_namespace, device) y_true = xp.asarray([1, 2, 3], device=device) y_pred = xp.asarray([4, 5, 6], device=device) + labels = xp.asarray([1, 2, 3], device=device) with config_context(array_api_dispatch=True): - result = confusion_matrix(y_true, y_pred) + result = confusion_matrix(y_true, y_pred, labels=labels) xp_result, _ = get_namespace(result) assert _is_numpy_namespace(xp_result) From e59cd7ffdf6179b4ceacb2f79ad584d4bf4f69ea Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Fri, 31 Jan 2025 12:19:55 +0100 Subject: [PATCH 30/34] fix naming --- sklearn/metrics/_classification.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index a33ecdd494495..8a7501355c256 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -392,8 +392,8 @@ def confusion_matrix( ) if need_index_conversion: label_to_ind = {label: index for index, label in enumerate(labels)} - y_pred = np.array([label_to_ind.get(index, n_labels + 1) for index in y_pred]) - y_true = np.array([label_to_ind.get(index, n_labels + 1) for index in y_true]) + y_pred = np.array([label_to_ind.get(label, n_labels + 1) for label in y_pred]) + y_true = np.array([label_to_ind.get(label, n_labels + 1) for label in y_true]) # intersect y_pred, y_true with labels, eliminate items not in labels ind = np.logical_and(y_pred < n_labels, y_true < n_labels) From 5124f9893c52f793f2089da0b3aba525c6e6983f Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Fri, 31 Jan 2025 13:35:37 +0100 Subject: [PATCH 31/34] experiment - need to push so I can test on GPU --- sklearn/metrics/_classification.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 8a7501355c256..53c4d7b3b14d0 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -371,8 +371,6 @@ def confusion_matrix( return np.zeros((n_labels, n_labels), dtype=int) elif len(np.intersect1d(y_true, labels)) == 0: raise ValueError("At least one label specified must be in y_true") - if not _is_numpy_namespace(get_namespace(labels)[0]): - labels = _convert_to_numpy(labels, xp) if sample_weight is None: sample_weight = np.ones(y_true.shape[0], dtype=np.int64) From 5034d017bfe11d62e392a20b302a4ba110271d4c Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Fri, 31 Jan 2025 13:40:09 +0100 Subject: [PATCH 32/34] convert labels to numpy --- sklearn/metrics/_classification.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 53c4d7b3b14d0..0b59f443020ef 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -363,7 +363,10 @@ def confusion_matrix( if labels is None: labels = unique_labels(y_true, y_pred) else: - labels = np.asarray(labels) + if not _is_numpy_namespace(get_namespace(labels)[0]): + labels = _convert_to_numpy(labels, xp) + else: # input is a list + labels = np.asarray(labels) n_labels = labels.size if n_labels == 0: raise ValueError("'labels' should contain at least one label.") From d42caa622f16c56a0e80a9eec6f188c971f80494 Mon Sep 17 00:00:00 2001 From: Stefanie Senger <91849487+StefanieSenger@users.noreply.github.com> Date: Mon, 10 Feb 2025 12:28:18 +0100 Subject: [PATCH 33/34] Update sklearn/metrics/tests/test_classification.py Co-authored-by: Omar Salman --- sklearn/metrics/tests/test_classification.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index e589092547659..4fdea34144002 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -3239,8 +3239,8 @@ def test_d2_log_loss_score_raises(): "array_namespace, device, _", yield_namespace_device_dtype_combinations() ) def test_confusion_matrix_array_api(array_namespace, device, _): - """Test that `confusion_matrix` works for all array types if need_index_conversion - evaluates to `True` and with labels passed.""" + """Test that `confusion_matrix` works for all array types when `labels` are passed + such that the inner boolean `need_index_conversion` evaluates to `True`.""" xp = _array_api_for_tests(array_namespace, device) y_true = xp.asarray([1, 2, 3], device=device) From 094ca6d902832c7ac1effff4177432f7104f8133 Mon Sep 17 00:00:00 2001 From: Stefanie Senger Date: Mon, 10 Feb 2025 12:32:23 +0100 Subject: [PATCH 34/34] remove unhelpful comment --- sklearn/metrics/_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 2d31da39a552f..b45c0ce8058f4 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -365,7 +365,7 @@ def confusion_matrix( else: if not _is_numpy_namespace(get_namespace(labels)[0]): labels = _convert_to_numpy(labels, xp) - else: # input is a list + else: labels = np.asarray(labels) n_labels = labels.size if n_labels == 0: