
PERF speedup classification_report by attaching unique values to dtype.metadata #29738

Merged · 15 commits · Sep 5, 2024
4 changes: 4 additions & 0 deletions doc/whats_new/v1.6.rst
@@ -271,6 +271,10 @@ Changelog
:pr:`29210` by :user:`Marc Torrellas Socastro <marctorsoc>` and
:user:`Stefanie Senger <StefanieSenger>`.

- |Efficiency| :func:`sklearn.metrics.classification_report` is now faster by caching
classification labels.
:pr:`29738` by `Adrin Jalali`_.

- |API| scoring="neg_max_error" should be used instead of
scoring="max_error" which is now deprecated.
:pr:`29462` by :user:`Farid "Freddie" Taba <artificialfintelligence>`.
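
For context on the mechanism behind this changelog entry: NumPy dtypes can carry an arbitrary `metadata` mapping, and the PR uses it to cache an array's unique values on a view of the array. A minimal sketch of the idea, using only the NumPy calls that appear in the diff below:

```python
import numpy as np

y = np.array([0, 1, 1, 2])
# Build a dtype that carries the precomputed uniques as metadata.
cached_dtype = np.dtype(y.dtype, metadata={"unique": np.unique(y)})
# A view of y with that dtype; the data buffer is shared, not copied.
y_view = y.view(dtype=cached_dtype)
# Downstream code can read the cache instead of recomputing np.unique.
print(y_view.dtype.metadata["unique"])  # [0 1 2]
```
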
11 changes: 9 additions & 2 deletions sklearn/metrics/_classification.py
@@ -41,6 +41,7 @@
StrOptions,
validate_params,
)
from ..utils._unique import attach_unique
from ..utils.extmath import _nanaverage
from ..utils.multiclass import type_of_target, unique_labels
from ..utils.sparsefuncs import count_nonzero
@@ -216,6 +217,7 @@ def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None):
"""
xp, _, device = get_namespace_and_device(y_true, y_pred, sample_weight)
# Compute accuracy for each possible representation
y_true, y_pred = attach_unique(y_true, y_pred)
y_type, y_true, y_pred = _check_targets(y_true, y_pred)
check_consistent_length(y_true, y_pred, sample_weight)
if y_type.startswith("multilabel"):
@@ -327,6 +329,7 @@ def confusion_matrix(
>>> (tn, fp, fn, tp)
(np.int64(0), np.int64(2), np.int64(1), np.int64(1))
"""
y_true, y_pred = attach_unique(y_true, y_pred)
y_type, y_true, y_pred = _check_targets(y_true, y_pred)
if y_type not in ("binary", "multiclass"):
raise ValueError("%s is not supported" % y_type)
@@ -516,6 +519,7 @@ def multilabel_confusion_matrix(
[[2, 1],
[1, 2]]])
"""
y_true, y_pred = attach_unique(y_true, y_pred)
y_type, y_true, y_pred = _check_targets(y_true, y_pred)
if sample_weight is not None:
sample_weight = column_or_1d(sample_weight)
@@ -1054,6 +1058,7 @@ def matthews_corrcoef(y_true, y_pred, *, sample_weight=None):
>>> matthews_corrcoef(y_true, y_pred)
np.float64(-0.33...)
"""
y_true, y_pred = attach_unique(y_true, y_pred)
y_type, y_true, y_pred = _check_targets(y_true, y_pred)
check_consistent_length(y_true, y_pred, sample_weight)
if y_type not in {"binary", "multiclass"}:
@@ -1612,6 +1617,7 @@ def _check_set_wise_labels(y_true, y_pred, average, labels, pos_label):
if average not in average_options and average != "binary":
raise ValueError("average has to be one of " + str(average_options))

y_true, y_pred = attach_unique(y_true, y_pred)
y_type, y_true, y_pred = _check_targets(y_true, y_pred)
# Convert to Python primitive type to avoid NumPy type / Python str
# comparison. See https://github.com/numpy/numpy/issues/6784
@@ -2031,7 +2037,7 @@ class after being classified as negative. This is the case when the
>>> class_likelihood_ratios(y_true, y_pred, labels=["non-cat", "cat"])
(np.float64(1.5), np.float64(0.75))
"""

y_true, y_pred = attach_unique(y_true, y_pred)
y_type, y_true, y_pred = _check_targets(y_true, y_pred)
if y_type != "binary":
raise ValueError(
@@ -2681,6 +2687,7 @@ class 2 1.00 0.67 0.80 3
<BLANKLINE>
"""

y_true, y_pred = attach_unique(y_true, y_pred)
y_type, y_true, y_pred = _check_targets(y_true, y_pred)

if labels is None:
@@ -2869,7 +2876,7 @@ def hamming_loss(y_true, y_pred, *, sample_weight=None):
>>> hamming_loss(np.array([[0, 1], [1, 1]]), np.zeros((2, 2)))
0.75
"""

y_true, y_pred = attach_unique(y_true, y_pred)
y_type, y_true, y_pred = _check_targets(y_true, y_pred)
check_consistent_length(y_true, y_pred, sample_weight)

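
The edits to `_classification.py` all follow one pattern: each public metric calls `attach_unique` once at its entry point, so that nested helpers such as `_check_targets` and `unique_labels` reuse the cached uniques instead of re-sorting the targets on every call. A hedged sketch of that pattern, using the PR's own helpers:

```python
import numpy as np
from sklearn.utils._unique import attach_unique, cached_unique

y_true = np.array([0, 1, 2, 2, 0])
y_pred = np.array([0, 0, 2, 2, 1])

# One O(n log n) pass over each array here...
y_true, y_pred = attach_unique(y_true, y_pred)

# ...then any nested call reads the cached result instead of re-sorting.
labels = cached_unique(y_true)  # array([0, 1, 2])
```
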
6 changes: 5 additions & 1 deletion sklearn/utils/_array_api.py
@@ -208,7 +208,11 @@ def _is_numpy_namespace(xp):

def _union1d(a, b, xp):
if _is_numpy_namespace(xp):
return xp.asarray(numpy.union1d(a, b))
# avoid circular import
from ._unique import cached_unique

a_unique, b_unique = cached_unique(a, b, xp=xp)
return xp.asarray(numpy.union1d(a_unique, b_unique))
assert a.ndim == b.ndim == 1
return xp.unique_values(xp.concat([xp.unique_values(a), xp.unique_values(b)]))

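
The `_union1d` change is safe because a union is unchanged by first reducing each operand to its unique values; with cached uniques attached, that reduction comes for free. A quick check of the equivalence:

```python
import numpy as np

a = np.array([3, 1, 3, 2])
b = np.array([2, 4, 4])
# union1d over the pre-reduced uniques equals union1d over the raw arrays.
assert np.array_equal(
    np.union1d(np.unique(a), np.unique(b)),
    np.union1d(a, b),
)
```
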
108 changes: 108 additions & 0 deletions sklearn/utils/_unique.py
@@ -0,0 +1,108 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import numpy as np

from sklearn.utils._array_api import get_namespace


def _attach_unique(y):
"""Attach unique values of y to y and return the result.

The result is a view of y, and the metadata (unique) is not attached to y.
"""
if not isinstance(y, np.ndarray):
return y
try:
# avoid recalculating unique in nested calls.
if "unique" in y.dtype.metadata:
return y
except (AttributeError, TypeError):
pass

unique = np.unique(y)
unique_dtype = np.dtype(y.dtype, metadata={"unique": unique})
return y.view(dtype=unique_dtype)


def attach_unique(*ys, return_tuple=False):
"""Attach unique values of ys to ys and return the results.

The result is a view of y, and the metadata (unique) is not attached to y.
[Review comment — Member] Can we include a comment here stating that the output of attach_unique should never be returned from a public function?
[Reply — Member, author] Yep, added a comment for this.

IMPORTANT: The output of this function should NEVER be returned in functions.
This is to avoid this pattern:

.. code:: python

y = np.array([1, 2, 3])
y = attach_unique(y)
y[1] = -1
# now np.unique(y) will be different from cached_unique(y)

Parameters
----------
*ys : sequence of array-like
Input data arrays.

return_tuple : bool, default=False
If True, always return a tuple even if there is only one array.

Returns
-------
ys : tuple of array-like or array-like
Input data with unique values attached.
"""
res = tuple(_attach_unique(y) for y in ys)
if len(res) == 1 and not return_tuple:
return res[0]
return res


def _cached_unique(y, xp=None):
"""Return the unique values of y.

Use the cached values from dtype.metadata if present.

This function does NOT cache the values in y, i.e. it doesn't change y.

Call `attach_unique` to attach the unique values to y.
"""
try:
if y.dtype.metadata is not None and "unique" in y.dtype.metadata:
return y.dtype.metadata["unique"]
except AttributeError:
# in case y is not a numpy array
pass
xp, _ = get_namespace(y, xp=xp)
return xp.unique_values(y)


def cached_unique(*ys, xp=None):
"""Return the unique values of ys.

Use the cached values from dtype.metadata if present.

This function does NOT cache the values in y, i.e. it doesn't change y.

Call `attach_unique` to attach the unique values to y.

Parameters
----------
*ys : sequence of array-like
Input data arrays.

xp : module, default=None
Precomputed array namespace module. When passed, typically from a caller
that has already performed inspection of its own inputs, skips array
namespace inspection.

Returns
-------
res : tuple of array-like or array-like
Unique values of ys.
"""
res = tuple(_cached_unique(y, xp=xp) for y in ys)
if len(res) == 1:
return res[0]
return res
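
The IMPORTANT note in the docstring above can be demonstrated directly: mutating an array after `attach_unique` leaves the cached metadata stale, which is why the attached view must never escape through a public return value. A short illustration using the module's helpers:

```python
import numpy as np
from sklearn.utils._unique import attach_unique, cached_unique

y = attach_unique(np.array([1, 2, 3]))
y[1] = -1
# The cache still holds the uniques computed before the mutation.
print(cached_unique(y))  # [1 2 3]  (stale)
print(np.unique(y))      # [-1  1  3]
```
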
24 changes: 14 additions & 10 deletions sklearn/utils/multiclass.py
@@ -12,19 +12,20 @@

from ..utils._array_api import get_namespace
from ..utils.fixes import VisibleDeprecationWarning
from ._unique import attach_unique, cached_unique
from .validation import _assert_all_finite, check_array


def _unique_multiclass(y):
xp, is_array_api_compliant = get_namespace(y)
def _unique_multiclass(y, xp=None):
xp, is_array_api_compliant = get_namespace(y, xp=xp)
if hasattr(y, "__array__") or is_array_api_compliant:
return xp.unique_values(xp.asarray(y))
return cached_unique(xp.asarray(y), xp=xp)
else:
return set(y)


def _unique_indicator(y):
xp, _ = get_namespace(y)
def _unique_indicator(y, xp=None):
xp, _ = get_namespace(y, xp=xp)
return xp.arange(
check_array(y, input_name="y", accept_sparse=["csr", "csc", "coo"]).shape[1]
)
@@ -69,8 +70,9 @@ def unique_labels(*ys):
>>> unique_labels([1, 2, 10], [5, 11])
array([ 1, 2, 5, 10, 11])
"""
ys = attach_unique(*ys, return_tuple=True)
xp, is_array_api_compliant = get_namespace(*ys)
if not ys:
if len(ys) == 0:
raise ValueError("No argument has been passed.")
# Check that we don't mix label format

@@ -104,10 +106,12 @@

if is_array_api_compliant:
# array_api does not allow for mixed dtypes
unique_ys = xp.concat([_unique_labels(y) for y in ys])
unique_ys = xp.concat([_unique_labels(y, xp=xp) for y in ys])
return xp.unique_values(unique_ys)

ys_labels = set(chain.from_iterable((i for i in _unique_labels(y)) for y in ys))
ys_labels = set(
chain.from_iterable((i for i in _unique_labels(y, xp=xp)) for y in ys)
)
# Check that we don't mix string type with number type
if len(set(isinstance(label, str) for label in ys_labels)) > 1:
raise ValueError("Mix of label input types (string and number)")
@@ -187,7 +191,7 @@ def is_multilabel(y):
and (y.dtype.kind in "biu" or _is_integral_float(labels)) # bool, int, uint
)
else:
labels = xp.unique_values(y)
labels = cached_unique(y, xp=xp)

return labels.shape[0] < 3 and (
xp.isdtype(y.dtype, ("bool", "signed integer", "unsigned integer"))
@@ -400,7 +404,7 @@ def type_of_target(y, input_name=""):
# Check multiclass
if issparse(first_row_or_val):
first_row_or_val = first_row_or_val.data
if xp.unique_values(y).shape[0] > 2 or (y.ndim == 2 and len(first_row_or_val) > 1):
if cached_unique(y).shape[0] > 2 or (y.ndim == 2 and len(first_row_or_val) > 1):
# [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
return "multiclass" + suffix
else:
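
The new `xp=None` parameters on `_unique_multiclass` and `_unique_indicator` thread a precomputed array namespace down from `unique_labels`, so `get_namespace` inspection runs once per public call rather than once per helper. A sketch of the same pattern, assuming the private `get_namespace` utility used in the diff:

```python
import numpy as np
from sklearn.utils._array_api import get_namespace
from sklearn.utils._unique import cached_unique

ys = [np.array([0, 1]), np.array([1, 2])]
xp, _ = get_namespace(*ys)  # inspect the namespace once, at the top level
# Pass xp down so each helper call skips its own inspection.
uniques = [cached_unique(y, xp=xp) for y in ys]
```
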
54 changes: 54 additions & 0 deletions sklearn/utils/tests/test_unique.py
@@ -0,0 +1,54 @@
import numpy as np
from numpy.testing import assert_array_equal

from sklearn.utils._unique import attach_unique, cached_unique
from sklearn.utils.validation import check_array


def test_attach_unique_attaches_unique_to_array():
arr = np.array([1, 2, 2, 3, 4, 4, 5])
arr_ = attach_unique(arr)
assert_array_equal(arr_.dtype.metadata["unique"], np.array([1, 2, 3, 4, 5]))
assert_array_equal(arr_, arr)


def test_cached_unique_returns_cached_unique():
my_dtype = np.dtype(np.float64, metadata={"unique": np.array([1, 2])})
arr = np.array([1, 2, 2, 3, 4, 4, 5], dtype=my_dtype)
assert_array_equal(cached_unique(arr), np.array([1, 2]))


def test_attach_unique_not_ndarray():
"""Test that when not np.ndarray, we don't touch the array."""
arr = [1, 2, 2, 3, 4, 4, 5]
arr_ = attach_unique(arr)
assert arr_ is arr


def test_attach_unique_returns_view():
"""Test that attach_unique returns a view of the array."""
arr = np.array([1, 2, 2, 3, 4, 4, 5])
arr_ = attach_unique(arr)
assert arr_.base is arr


def test_attach_unique_return_tuple():
"""Test return_tuple argument of the function."""
arr = np.array([1, 2, 2, 3, 4, 4, 5])
arr_tuple = attach_unique(arr, return_tuple=True)
assert isinstance(arr_tuple, tuple)
assert len(arr_tuple) == 1
assert_array_equal(arr_tuple[0], arr)

arr_single = attach_unique(arr, return_tuple=False)
assert isinstance(arr_single, np.ndarray)
assert_array_equal(arr_single, arr)


def test_check_array_keeps_unique():
"""Test that check_array keeps the unique metadata."""
arr = np.array([[1, 2, 2, 3, 4, 4, 5]])
arr_ = attach_unique(arr)
arr_ = check_array(arr_)
assert_array_equal(arr_.dtype.metadata["unique"], np.array([1, 2, 3, 4, 5]))
assert_array_equal(arr_, arr)