
ENH Add Array API compatibility for entropy #29141

Merged · 18 commits · Jun 14, 2024
1 change: 1 addition & 0 deletions doc/modules/array_api.rst
@@ -112,6 +112,7 @@ base estimator also does:
 Metrics
 -------
 
+- :func:`sklearn.metrics.cluster.entropy`
 - :func:`sklearn.metrics.accuracy_score`
 - :func:`sklearn.metrics.d2_tweedie_score`
 - :func:`sklearn.metrics.max_error`
1 change: 1 addition & 0 deletions doc/whats_new/v1.6.rst
@@ -32,6 +32,7 @@ See :ref:`array_api` for more details.

 **Functions:**
 
+- :func:`sklearn.metrics.cluster.entropy` :pr:`29141` by :user:`Yaroslav Korobko <Tialo>`;
 - :func:`sklearn.metrics.d2_tweedie_score` :pr:`29207` by :user:`Emily Chen <EmilyXinyi>`;
 - :func:`sklearn.metrics.max_error` :pr:`29212` by :user:`Edoardo Abati <EdAbati>`;
 - :func:`sklearn.metrics.mean_absolute_error` :pr:`27736` by :user:`Edoardo Abati <EdAbati>`;
16 changes: 10 additions & 6 deletions sklearn/metrics/cluster/_supervised.py
@@ -23,6 +23,7 @@
 import numpy as np
 from scipy import sparse as sp
 
+from ...utils._array_api import get_namespace
 from ...utils._param_validation import Interval, StrOptions, validate_params
 from ...utils.multiclass import type_of_target
 from ...utils.validation import check_array, check_consistent_length
@@ -1282,17 +1283,20 @@ def entropy(labels):
     -----
     The logarithm used is the natural logarithm (base-e).
     """
-    if len(labels) == 0:
+    xp, is_array_api_compliant = get_namespace(labels)
+    labels_len = labels.shape[0] if is_array_api_compliant else len(labels)
+    if labels_len == 0:
         return 1.0
-    label_idx = np.unique(labels, return_inverse=True)[1]
-    pi = np.bincount(label_idx).astype(np.float64)
-    pi = pi[pi > 0]
+
+    pi = xp.astype(xp.unique_counts(labels)[1], xp.float64)
Review comment (Member):

Nice code simplification by the way :)
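For context on that simplification (an illustration, not part of the diff): the Array API's `unique_counts` returns each distinct value together with its count in one call, and those counts are never zero, which is why the old `unique`/`bincount`/`pi[pi > 0]` three-step pattern could be dropped. A minimal sketch of the equivalence, assuming NumPy >= 2.0 (which ships `np.unique_counts`):

import numpy as np

labels = np.asarray([0, 0, 42.0])

# Old pattern: map each label to the index of its unique value, then count.
label_idx = np.unique(labels, return_inverse=True)[1]
old_counts = np.bincount(label_idx)

# New pattern: per-value counts directly; no zeros, so nothing to filter.
new_counts = np.unique_counts(labels)[1]

assert (old_counts == new_counts).all()  # both are [2, 1]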
     # single cluster => zero entropy
     if pi.size == 1:
         return 0.0
 
-    pi_sum = np.sum(pi)
+    pi_sum = xp.sum(pi)
     # log(a / b) should be calculated as log(a) - log(b) to avoid
     # possible loss of precision
-    return -np.sum((pi / pi_sum) * (np.log(pi) - log(pi_sum)))
+    # Always convert the result to a Python scalar (on CPU) instead of a
+    # device-specific scalar array.
+    return float(-xp.sum((pi / pi_sum) * (xp.log(pi) - log(pi_sum))))
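Taken together, the rewrite lets `entropy` consume any Array-API-compliant input when dispatch is enabled. A minimal usage sketch (illustration only, not taken from this PR; assumes PyTorch and `array-api-compat` are installed):

import torch

import sklearn
from sklearn.metrics.cluster import entropy

labels = torch.asarray([0.0, 0.0, 42.0])

# With dispatch enabled, the computation runs in the torch namespace, while
# the final float(...) still returns a plain Python scalar on CPU.
with sklearn.config_context(array_api_dispatch=True):
    print(entropy(labels))  # ~0.6365141, i.e. ln(3) - (2/3) * ln(2)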
21 changes: 18 additions & 3 deletions sklearn/metrics/cluster/tests/test_supervised.py
@@ -4,6 +4,7 @@
 import pytest
 from numpy.testing import assert_allclose, assert_array_almost_equal, assert_array_equal
 
+from sklearn.base import config_context
 from sklearn.metrics.cluster import (
     adjusted_mutual_info_score,
     adjusted_rand_score,
@@ -22,7 +23,8 @@
 )
 from sklearn.metrics.cluster._supervised import _generalized_average, check_clusterings
 from sklearn.utils import assert_all_finite
-from sklearn.utils._testing import assert_almost_equal
+from sklearn.utils._array_api import yield_namespace_device_dtype_combinations
+from sklearn.utils._testing import _array_api_for_tests, assert_almost_equal
 
 score_funcs = [
     adjusted_rand_score,
@@ -254,12 +256,25 @@ def test_int_overflow_mutual_info_fowlkes_mallows_score():


 def test_entropy():
-    ent = entropy([0, 0, 42.0])
-    assert_almost_equal(ent, 0.6365141, 5)
+    assert_almost_equal(entropy([0, 0, 42.0]), 0.6365141, 5)
     assert_almost_equal(entropy([]), 1)
     assert entropy([1, 1, 1, 1]) == 0
 
 
+@pytest.mark.parametrize(
+    "array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations()
+)
+def test_entropy_array_api(array_namespace, device, dtype_name):
+    xp = _array_api_for_tests(array_namespace, device)
+    float_labels = xp.asarray(np.asarray([0, 0, 42.0], dtype=dtype_name), device=device)
+    empty_int32_labels = xp.asarray([], dtype=xp.int32, device=device)
+    int_labels = xp.asarray([1, 1, 1, 1], device=device)
+    with config_context(array_api_dispatch=True):
+        assert entropy(float_labels) == pytest.approx(0.6365141, abs=1e-5)
+        assert entropy(empty_int32_labels) == 1
+        assert entropy(int_labels) == 0


 def test_contingency_matrix():
     labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
     labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
2 changes: 1 addition & 1 deletion sklearn/utils/_array_api.py
@@ -250,7 +250,7 @@ def supported_float_dtypes(xp):
 def ensure_common_namespace_device(reference, *arrays):
     """Ensure that all arrays use the same namespace and device as reference.
 
-    If neccessary the arrays are moved to the same namespace and device as
+    If necessary the arrays are moved to the same namespace and device as
     the reference array.
 
     Parameters