Skip to content

Classification metrics don't seem to support sparse? #32036

@lucyleeow

Description

@lucyleeow

While working on #31829, I noticed that although most metrics in _classification.py say they support sparse input in the docstring (and include "sparse matrix" in validate_params), when you actually pass a sparse matrix, you get an error.

Essentially in _check_targets, we do:

if y_type in ["binary", "multiclass"]:
    xp, _ = get_namespace(y_true, y_pred)
    y_true = column_or_1d(y_true)
    y_pred = column_or_1d(y_pred)

column_or_1d then calls check_array with accept_sparse set to the default False.

from sklearn.metrics import accuracy_score
from scipy import sparse
import numpy as np

y = [0, 2, 1, 3]
y_sparse = sparse.csr_matrix(np.array(y).reshape(-1, 1))

accuracy_score(y_sparse, y_sparse)

Gives the following error:

Error
TypeError                                 Traceback (most recent call last)
Cell In[11], line 1
----> 1 accuracy_score(sparse_col, sparse_col)

File ~/Documents/dev/scikit-learn/sklearn/utils/_param_validation.py:218, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    212 try:
    213     with config_context(
    214         skip_parameter_validation=(
    215             prefer_skip_nested_validation or global_skip_validation
    216         )
    217     ):
--> 218         return func(*args, **kwargs)
    219 except InvalidParameterError as e:
    220     # When the function is just a wrapper around an estimator, we allow
    221     # the function to delegate validation to the estimator, but we replace
    222     # the name of the estimator by the name of the function in the error
    223     # message to avoid confusion.
    224     msg = re.sub(
    225         r"parameter of \w+ must be",
    226         f"parameter of {func.__qualname__} must be",
    227         str(e),
    228     )

File ~/Documents/dev/scikit-learn/sklearn/metrics/_classification.py:373, in accuracy_score(y_true, y_pred, normalize, sample_weight)
    371 # Compute accuracy for each possible representation
    372 y_true, y_pred = attach_unique(y_true, y_pred)
--> 373 y_type, y_true, y_pred, sample_weight = _check_targets(
    374     y_true, y_pred, sample_weight
    375 )
    377 if y_type.startswith("multilabel"):
    378     differing_labels = _count_nonzero(y_true - y_pred, xp=xp, device=device, axis=1)

File ~/Documents/dev/scikit-learn/sklearn/metrics/_classification.py:131, in _check_targets(y_true, y_pred, sample_weight)
    129 if y_type in ["binary", "multiclass"]:
    130     xp, _ = get_namespace(y_true, y_pred)
--> 131     y_true = column_or_1d(y_true)
    132     y_pred = column_or_1d(y_pred)
    133     if y_type == "binary":

File ~/Documents/dev/scikit-learn/sklearn/utils/validation.py:1469, in column_or_1d(y, dtype, warn, device)
   1431 """Ravel column or 1d numpy array, else raises an error.
   1432 
   1433 Parameters
   (...)
   1466 array([1, 1])
   1467 """
   1468 xp, _ = get_namespace(y)
-> 1469 y = check_array(
   1470     y,
   1471     ensure_2d=False,
   1472     dtype=dtype,
   1473     input_name="y",
   1474     ensure_all_finite=False,
   1475     ensure_min_samples=0,
   1476 )
   1478 shape = y.shape
   1479 if len(shape) == 1:

File ~/Documents/dev/scikit-learn/sklearn/utils/validation.py:1027, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_all_finite, ensure_non_negative, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
   1025 if sp.issparse(array):
   1026     _ensure_no_complex_data(array)
-> 1027     array = _ensure_sparse_format(
   1028         array,
   1029         accept_sparse=accept_sparse,
   1030         dtype=dtype,
   1031         copy=copy,
   1032         ensure_all_finite=ensure_all_finite,
   1033         accept_large_sparse=accept_large_sparse,
   1034         estimator_name=estimator_name,
   1035         input_name=input_name,
   1036     )
   1037     if ensure_2d and array.ndim < 2:
   1038         raise ValueError(
   1039             f"Expected 2D input, got input with shape {array.shape}.\n"
   1040             "Reshape your data either using array.reshape(-1, 1) if "
   1041             "your data has a single feature or array.reshape(1, -1) "
   1042             "if it contains a single sample."
   1043         )

File ~/Documents/dev/scikit-learn/sklearn/utils/validation.py:626, in _ensure_sparse_format(sparse_container, accept_sparse, dtype, copy, ensure_all_finite, accept_large_sparse, estimator_name, input_name)
    624 if accept_sparse is False:
    625     padded_input = " for " + input_name if input_name else ""
--> 626     raise TypeError(
    627         f"Sparse data was passed{padded_input}, but dense data is required. "
    628         "Use '.toarray()' to convert to a dense numpy array."
    629     )
    630 elif isinstance(accept_sparse, (list, tuple)):
    631     if len(accept_sparse) == 0:

TypeError: Sparse data was passed for y, but dense data is required. Use '.toarray()' to convert to a dense numpy array.

I also could not find any sparse tests for the classification metrics. This seems like a big oversight, so I feel like I have missed something?

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions