-
-
Notifications
You must be signed in to change notification settings - Fork 26.2k
Open
Labels
Description
While working on #31829, I noticed that although most metrics in `_classification.py` say they support sparse input in their docstrings (and include "sparse matrix" in `validate_params`), you get an error when you actually try to pass sparse input.
Essentially, in `_check_targets`, we do:
scikit-learn/sklearn/metrics/_classification.py
Lines 128 to 131 in 726ed18
if y_type in ["binary", "multiclass"]:
    xp, _ = get_namespace(y_true, y_pred)
    y_true = column_or_1d(y_true)
    y_pred = column_or_1d(y_pred)
`column_or_1d` then calls `check_array` with `accept_sparse` set to the default `False`.
from sklearn.metrics import accuracy_score
from scipy import sparse
import numpy as np
y = [0, 2, 1, 3]
y_sparse = sparse.csr_matrix(np.array(y).reshape(-1, 1))
accuracy_score(y_sparse, y_sparse)
Gives the following error:
Error
TypeError Traceback (most recent call last)
Cell In[11], line 1
----> 1 accuracy_score(sparse_col, sparse_col)
File ~/Documents/dev/scikit-learn/sklearn/utils/_param_validation.py:218, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
212 try:
213 with config_context(
214 skip_parameter_validation=(
215 prefer_skip_nested_validation or global_skip_validation
216 )
217 ):
--> 218 return func(*args, **kwargs)
219 except InvalidParameterError as e:
220 # When the function is just a wrapper around an estimator, we allow
221 # the function to delegate validation to the estimator, but we replace
222 # the name of the estimator by the name of the function in the error
223 # message to avoid confusion.
224 msg = re.sub(
225 r"parameter of \w+ must be",
226 f"parameter of {func.__qualname__} must be",
227 str(e),
228 )
File ~/Documents/dev/scikit-learn/sklearn/metrics/_classification.py:373, in accuracy_score(y_true, y_pred, normalize, sample_weight)
371 # Compute accuracy for each possible representation
372 y_true, y_pred = attach_unique(y_true, y_pred)
--> 373 y_type, y_true, y_pred, sample_weight = _check_targets(
374 y_true, y_pred, sample_weight
375 )
377 if y_type.startswith("multilabel"):
378 differing_labels = _count_nonzero(y_true - y_pred, xp=xp, device=device, axis=1)
File ~/Documents/dev/scikit-learn/sklearn/metrics/_classification.py:131, in _check_targets(y_true, y_pred, sample_weight)
129 if y_type in ["binary", "multiclass"]:
130 xp, _ = get_namespace(y_true, y_pred)
--> 131 y_true = column_or_1d(y_true)
132 y_pred = column_or_1d(y_pred)
133 if y_type == "binary":
File ~/Documents/dev/scikit-learn/sklearn/utils/validation.py:1469, in column_or_1d(y, dtype, warn, device)
1431 """Ravel column or 1d numpy array, else raises an error.
1432
1433 Parameters
(...)
1466 array([1, 1])
1467 """
1468 xp, _ = get_namespace(y)
-> 1469 y = check_array(
1470 y,
1471 ensure_2d=False,
1472 dtype=dtype,
1473 input_name="y",
1474 ensure_all_finite=False,
1475 ensure_min_samples=0,
1476 )
1478 shape = y.shape
1479 if len(shape) == 1:
File ~/Documents/dev/scikit-learn/sklearn/utils/validation.py:1027, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_all_finite, ensure_non_negative, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
1025 if sp.issparse(array):
1026 _ensure_no_complex_data(array)
-> 1027 array = _ensure_sparse_format(
1028 array,
1029 accept_sparse=accept_sparse,
1030 dtype=dtype,
1031 copy=copy,
1032 ensure_all_finite=ensure_all_finite,
1033 accept_large_sparse=accept_large_sparse,
1034 estimator_name=estimator_name,
1035 input_name=input_name,
1036 )
1037 if ensure_2d and array.ndim < 2:
1038 raise ValueError(
1039 f"Expected 2D input, got input with shape {array.shape}.\n"
1040 "Reshape your data either using array.reshape(-1, 1) if "
1041 "your data has a single feature or array.reshape(1, -1) "
1042 "if it contains a single sample."
1043 )
File ~/Documents/dev/scikit-learn/sklearn/utils/validation.py:626, in _ensure_sparse_format(sparse_container, accept_sparse, dtype, copy, ensure_all_finite, accept_large_sparse, estimator_name, input_name)
624 if accept_sparse is False:
625 padded_input = " for " + input_name if input_name else ""
--> 626 raise TypeError(
627 f"Sparse data was passed{padded_input}, but dense data is required. "
628 "Use '.toarray()' to convert to a dense numpy array."
629 )
630 elif isinstance(accept_sparse, (list, tuple)):
631 if len(accept_sparse) == 0:
TypeError: Sparse data was passed for y, but dense data is required. Use '.toarray()' to convert to a dense numpy array.
I also could not find any sparse tests for the classification metrics. This seems like a big oversight, so I feel like I have missed something?