diff --git a/doc/modules/array_api.rst b/doc/modules/array_api.rst index 26359f963963d..cc7831f094f61 100644 --- a/doc/modules/array_api.rst +++ b/doc/modules/array_api.rst @@ -112,6 +112,7 @@ base estimator also does: Metrics ------- +- :func:`sklearn.metrics.cluster.entropy` - :func:`sklearn.metrics.accuracy_score` - :func:`sklearn.metrics.d2_tweedie_score` - :func:`sklearn.metrics.max_error` diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index 7f96f169ad638..fd43347cf7ac8 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -32,6 +32,7 @@ See :ref:`array_api` for more details. **Functions:** +- :func:`sklearn.metrics.cluster.entropy` :pr:`29141` by :user:`Yaroslav Korobko `; - :func:`sklearn.metrics.d2_tweedie_score` :pr:`29207` by :user:`Emily Chen `; - :func:`sklearn.metrics.max_error` :pr:`29212` by :user:`Edoardo Abati `; - :func:`sklearn.metrics.mean_absolute_error` :pr:`27736` by :user:`Edoardo Abati `; diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 8e178e21a55ec..208f2732d06a1 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -23,6 +23,7 @@ import numpy as np from scipy import sparse as sp +from ...utils._array_api import get_namespace from ...utils._param_validation import Interval, StrOptions, validate_params from ...utils.multiclass import type_of_target from ...utils.validation import check_array, check_consistent_length @@ -1282,17 +1283,20 @@ def entropy(labels): ----- The logarithm used is the natural logarithm (base-e). """ - if len(labels) == 0: + xp, is_array_api_compliant = get_namespace(labels) + labels_len = labels.shape[0] if is_array_api_compliant else len(labels) + if labels_len == 0: return 1.0 - label_idx = np.unique(labels, return_inverse=True)[1] - pi = np.bincount(label_idx).astype(np.float64) - pi = pi[pi > 0] + + pi = xp.astype(xp.unique_counts(labels)[1], xp.float64) # single cluster => zero entropy if pi.size == 1: return 0.0 - pi_sum = np.sum(pi) + pi_sum = xp.sum(pi) # log(a / b) should be calculated as log(a) - log(b) for # possible loss of precision - return -np.sum((pi / pi_sum) * (np.log(pi) - log(pi_sum))) + # Always convert the result as a Python scalar (on CPU) instead of a device + # specific scalar array. + return float(-xp.sum((pi / pi_sum) * (xp.log(pi) - log(pi_sum)))) diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index dfaa58ff62c01..077dca0854a01 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -4,6 +4,7 @@ import pytest from numpy.testing import assert_allclose, assert_array_almost_equal, assert_array_equal +from sklearn.base import config_context from sklearn.metrics.cluster import ( adjusted_mutual_info_score, adjusted_rand_score, @@ -22,7 +23,8 @@ ) from sklearn.metrics.cluster._supervised import _generalized_average, check_clusterings from sklearn.utils import assert_all_finite -from sklearn.utils._testing import assert_almost_equal +from sklearn.utils._array_api import yield_namespace_device_dtype_combinations +from sklearn.utils._testing import _array_api_for_tests, assert_almost_equal score_funcs = [ adjusted_rand_score, @@ -254,12 +256,25 @@ def test_int_overflow_mutual_info_fowlkes_mallows_score(): def test_entropy(): - ent = entropy([0, 0, 42.0]) - assert_almost_equal(ent, 0.6365141, 5) + assert_almost_equal(entropy([0, 0, 42.0]), 0.6365141, 5) assert_almost_equal(entropy([]), 1) assert entropy([1, 1, 1, 1]) == 0 +@pytest.mark.parametrize( + "array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations() +) +def test_entropy_array_api(array_namespace, device, dtype_name): + xp = _array_api_for_tests(array_namespace, device) + float_labels = xp.asarray(np.asarray([0, 0, 42.0], dtype=dtype_name), device=device) + empty_int32_labels = xp.asarray([], dtype=xp.int32, device=device) + int_labels = xp.asarray([1, 1, 1, 1], device=device) + with config_context(array_api_dispatch=True): + assert entropy(float_labels) == pytest.approx(0.6365141, abs=1e-5) + assert entropy(empty_int32_labels) == 1 + assert entropy(int_labels) == 0 + + def test_contingency_matrix(): labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]) labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2]) diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 7bf9183c80772..c222e26fcc82c 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -250,7 +250,7 @@ def supported_float_dtypes(xp): def ensure_common_namespace_device(reference, *arrays): """Ensure that all arrays use the same namespace and device as reference. - If neccessary the arrays are moved to the same namespace and device as + If necessary the arrays are moved to the same namespace and device as the reference array. Parameters