From 3034d2e570f956d234c1ceeda515f45394b5fc6b Mon Sep 17 00:00:00 2001 From: Tialo Date: Sat, 25 May 2024 02:04:52 +0300 Subject: [PATCH 01/12] array-api for entropy --- sklearn/metrics/cluster/_supervised.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 1f72eae3725f6..100bd15800e25 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -23,6 +23,7 @@ import numpy as np from scipy import sparse as sp +from ...utils._array_api import get_namespace from ...utils._param_validation import Interval, StrOptions, validate_params from ...utils.multiclass import type_of_target from ...utils.validation import check_array, check_consistent_length @@ -1282,17 +1283,27 @@ def entropy(labels): ----- The logarithm used is the natural logarithm (base-e). """ - if len(labels) == 0: - return 1.0 - label_idx = np.unique(labels, return_inverse=True)[1] - pi = np.bincount(label_idx).astype(np.float64) - pi = pi[pi > 0] + if isinstance(labels, list): + xp, is_array_api_compliant = np, False + else: + xp, is_array_api_compliant = get_namespace(labels) + + shape_0 = labels.shape[0] if is_array_api_compliant else len(labels) + if shape_0 == 0: + return 0.0 + + if is_array_api_compliant: + unique_counts = xp.unique_counts(labels) + else: + unique_counts = np.unique(labels, return_counts=True) + + pi = xp.asarray(unique_counts[1], dtype=xp.float64) # single cluster => zero entropy if pi.size == 1: return 0.0 - pi_sum = np.sum(pi) + pi_sum = xp.sum(pi) # log(a / b) should be calculated as log(a) - log(b) for # possible loss of precision - return -np.sum((pi / pi_sum) * (np.log(pi) - log(pi_sum))) + return -xp.sum((pi / pi_sum) * (xp.log(pi) - log(pi_sum))) From 149487c429a01813545245877f1b828218fcfef4 Mon Sep 17 00:00:00 2001 From: Tialo Date: Thu, 30 May 2024 22:47:31 +0300 Subject: [PATCH 02/12] iter --- sklearn/metrics/cluster/_supervised.py | 22 +++++-------------- .../metrics/cluster/tests/test_supervised.py | 21 +++++++++++++++--- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 100bd15800e25..f32e356fa6c14 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -23,7 +23,6 @@ import numpy as np from scipy import sparse as sp -from ...utils._array_api import get_namespace from ...utils._param_validation import Interval, StrOptions, validate_params from ...utils.multiclass import type_of_target from ...utils.validation import check_array, check_consistent_length @@ -1283,27 +1282,18 @@ def entropy(labels): ----- The logarithm used is the natural logarithm (base-e). """ - if isinstance(labels, list): - xp, is_array_api_compliant = np, False - else: - xp, is_array_api_compliant = get_namespace(labels) - - shape_0 = labels.shape[0] if is_array_api_compliant else len(labels) - if shape_0 == 0: + labels = np.asarray(labels) + if labels.shape[0] == 0: return 0.0 - if is_array_api_compliant: - unique_counts = xp.unique_counts(labels) - else: - unique_counts = np.unique(labels, return_counts=True) - - pi = xp.asarray(unique_counts[1], dtype=xp.float64) + pi = np.unique(labels, return_counts=True)[1] + pi = pi.astype(np.float64) # single cluster => zero entropy if pi.size == 1: return 0.0 - pi_sum = xp.sum(pi) + pi_sum = np.sum(pi) # log(a / b) should be calculated as log(a) - log(b) for # possible loss of precision - return -xp.sum((pi / pi_sum) * (xp.log(pi) - log(pi_sum))) + return -np.sum((pi / pi_sum) * (np.log(pi) - log(pi_sum))) diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index dfaa58ff62c01..23b6f9f20732b 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -4,6 +4,7 @@ import pytest from numpy.testing import assert_allclose, assert_array_almost_equal, assert_array_equal +from sklearn.base import config_context from sklearn.metrics.cluster import ( adjusted_mutual_info_score, adjusted_rand_score, @@ -22,7 +23,8 @@ ) from sklearn.metrics.cluster._supervised import _generalized_average, check_clusterings from sklearn.utils import assert_all_finite -from sklearn.utils._testing import assert_almost_equal +from sklearn.utils._array_api import yield_namespace_device_dtype_combinations +from sklearn.utils._testing import _array_api_for_tests, assert_almost_equal score_funcs = [ adjusted_rand_score, @@ -254,12 +256,25 @@ def test_int_overflow_mutual_info_fowlkes_mallows_score(): def test_entropy(): - ent = entropy([0, 0, 42.0]) - assert_almost_equal(ent, 0.6365141, 5) + assert_almost_equal(entropy([0, 0, 42.0]), 0.6365141, 5) assert_almost_equal(entropy([]), 1) assert entropy([1, 1, 1, 1]) == 0 +@pytest.mark.parametrize( + "array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations() +) +def test_entropy_array_api(array_namespace, device, dtype_name): + xp = _array_api_for_tests(array_namespace, device) + labels1 = xp.asarray([0, 0, 42.0]) + labels2 = xp.asarray([]) + labels3 = xp.asarray([1, 1, 1, 1]) + with config_context(array_api_dispatch=True): + assert_almost_equal(entropy(labels1), 0.6365141, 5) + assert entropy(labels2) == 1 + assert entropy(labels3) == 0 + + def test_contingency_matrix(): labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]) labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2]) From 76f574fa64d78942654e26c52d013ab29c098b18 Mon Sep 17 00:00:00 2001 From: Tialo Date: Thu, 30 May 2024 22:53:46 +0300 Subject: [PATCH 03/12] whatsnew --- doc/whats_new/v1.6.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index bc27f894ff9a1..d09ec62bedfe3 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -36,7 +36,8 @@ See :ref:`array_api` for more details. inputs. :pr:`28106` by :user:`Thomas Li `; - :func:`sklearn.metrics.mean_absolute_error` :pr:`27736` by :user:`Edoardo Abati `; -- :func:`sklearn.metrics.pairwise.cosine_similarity` :pr:`29014` by :user:`Edoardo Abati `. +- :func:`sklearn.metrics.pairwise.cosine_similarity` :pr:`29014` by :user:`Edoardo Abati `; +- :func:`sklearn.metrics.cluster.entropy` :pr:`29141` by :user:`Yaroslav Korobko `. **Classes:** From ed3ca4ff8703b84839883667f7bda311fadbb644 Mon Sep 17 00:00:00 2001 From: Tialo Date: Thu, 30 May 2024 23:00:41 +0300 Subject: [PATCH 04/12] dont convert to numpy --- sklearn/metrics/cluster/_supervised.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index f32e356fa6c14..8a7a574cb631e 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -23,6 +23,7 @@ import numpy as np from scipy import sparse as sp +from ...utils._array_api import get_namespace from ...utils._param_validation import Interval, StrOptions, validate_params from ...utils.multiclass import type_of_target from ...utils.validation import check_array, check_consistent_length @@ -1282,9 +1283,10 @@ def entropy(labels): ----- The logarithm used is the natural logarithm (base-e). """ - labels = np.asarray(labels) - if labels.shape[0] == 0: - return 0.0 + xp, is_array_api_compliant = get_namespace(labels, remove_types=(list,)) + labels_len = labels.shape[0] if is_array_api_compliant else len(labels) + if labels_len == 0: + return 1.0 pi = np.unique(labels, return_counts=True)[1] pi = pi.astype(np.float64) From 0eb7015085f45c4bd6f707d3a00fcdaf6d656fa2 Mon Sep 17 00:00:00 2001 From: Tialo Date: Fri, 31 May 2024 02:17:26 +0300 Subject: [PATCH 05/12] remove remove_types --- sklearn/metrics/cluster/_supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 8a7a574cb631e..cc7f105288cc5 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -1283,7 +1283,7 @@ def entropy(labels): ----- The logarithm used is the natural logarithm (base-e). """ - xp, is_array_api_compliant = get_namespace(labels, remove_types=(list,)) + xp, is_array_api_compliant = get_namespace(labels) labels_len = labels.shape[0] if is_array_api_compliant else len(labels) if labels_len == 0: return 1.0 From 616426c66a3bcab88aa0334f351d4f8a2e83170c Mon Sep 17 00:00:00 2001 From: Tialo Date: Fri, 31 May 2024 02:20:37 +0300 Subject: [PATCH 06/12] underscore unused variable --- sklearn/metrics/cluster/_supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index cc7f105288cc5..9282644d6e676 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -1283,7 +1283,7 @@ def entropy(labels): ----- The logarithm used is the natural logarithm (base-e). """ - xp, is_array_api_compliant = get_namespace(labels) + _, is_array_api_compliant = get_namespace(labels) labels_len = labels.shape[0] if is_array_api_compliant else len(labels) if labels_len == 0: return 1.0 From 8e07021318fc769302884df6684bf0f49668f5d0 Mon Sep 17 00:00:00 2001 From: Tialo Date: Thu, 6 Jun 2024 14:53:56 +0300 Subject: [PATCH 07/12] sorted --- doc/whats_new/v1.6.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index 884eb61e991b6..8f80ec556ad64 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -32,12 +32,10 @@ See :ref:`array_api` for more details. **Functions:** -- :func:`sklearn.metrics.mean_tweedie_deviance` now supports Array API compatible - inputs. - :pr:`28106` by :user:`Thomas Li `; +- :func:`sklearn.metrics.cluster.entropy` :pr:`29141` by :user:`Yaroslav Korobko `. - :func:`sklearn.metrics.mean_absolute_error` :pr:`27736` by :user:`Edoardo Abati `; +- :func:`sklearn.metrics.mean_tweedie_deviance` :pr:`28106` by :user:`Thomas Li `; - :func:`sklearn.metrics.pairwise.cosine_similarity` :pr:`29014` by :user:`Edoardo Abati `; -- :func:`sklearn.metrics.cluster.entropy` :pr:`29141` by :user:`Yaroslav Korobko `. **Classes:** From eaa644b41b1297915e8dc07b9217bf213e63e5ce Mon Sep 17 00:00:00 2001 From: Tialo Date: Thu, 6 Jun 2024 18:01:15 +0300 Subject: [PATCH 08/12] do not convert to numpy --- sklearn/metrics/cluster/_supervised.py | 9 ++++----- sklearn/metrics/cluster/tests/test_supervised.py | 12 ++++++------ sklearn/utils/_array_api.py | 2 +- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 9282644d6e676..727bd3c2af13d 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -1283,19 +1283,18 @@ def entropy(labels): ----- The logarithm used is the natural logarithm (base-e). """ - _, is_array_api_compliant = get_namespace(labels) + xp, is_array_api_compliant = get_namespace(labels) labels_len = labels.shape[0] if is_array_api_compliant else len(labels) if labels_len == 0: return 1.0 - pi = np.unique(labels, return_counts=True)[1] - pi = pi.astype(np.float64) + pi = xp.astype(xp.unique_counts(labels)[1], xp.float64) # single cluster => zero entropy if pi.size == 1: return 0.0 - pi_sum = np.sum(pi) + pi_sum = xp.sum(pi) # log(a / b) should be calculated as log(a) - log(b) for # possible loss of precision - return -np.sum((pi / pi_sum) * (np.log(pi) - log(pi_sum))) + return -xp.sum((pi / pi_sum) * (xp.log(pi) - log(pi_sum))) diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index 23b6f9f20732b..3116bf58b5513 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -266,13 +266,13 @@ def test_entropy(): ) def test_entropy_array_api(array_namespace, device, dtype_name): xp = _array_api_for_tests(array_namespace, device) - labels1 = xp.asarray([0, 0, 42.0]) - labels2 = xp.asarray([]) - labels3 = xp.asarray([1, 1, 1, 1]) + float_labels = xp.asarray([0, 0, 42.0], device=device) + empty_int32_labels = xp.asarray([], dtype=xp.int32, device=device) + int_labels = xp.asarray([1, 1, 1, 1], device=device) with config_context(array_api_dispatch=True): - assert_almost_equal(entropy(labels1), 0.6365141, 5) - assert entropy(labels2) == 1 - assert entropy(labels3) == 0 + assert_almost_equal(entropy(float_labels), 0.6365141, 5) + assert entropy(empty_int32_labels) == 1 + assert entropy(int_labels) == 0 def test_contingency_matrix(): diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py index 7bf9183c80772..c222e26fcc82c 100644 --- a/sklearn/utils/_array_api.py +++ b/sklearn/utils/_array_api.py @@ -250,7 +250,7 @@ def supported_float_dtypes(xp): def ensure_common_namespace_device(reference, *arrays): """Ensure that all arrays use the same namespace and device as reference. - If neccessary the arrays are moved to the same namespace and device as + If necessary the arrays are moved to the same namespace and device as the reference array. Parameters From f02e736b1144fb54389b33d9f64adf624dd449ad Mon Sep 17 00:00:00 2001 From: Tialo Date: Thu, 6 Jun 2024 19:26:36 +0300 Subject: [PATCH 09/12] use dtype_name --- sklearn/metrics/cluster/tests/test_supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index 3116bf58b5513..c45dd9a20858e 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -266,7 +266,7 @@ def test_entropy(): ) def test_entropy_array_api(array_namespace, device, dtype_name): xp = _array_api_for_tests(array_namespace, device) - float_labels = xp.asarray([0, 0, 42.0], device=device) + float_labels = xp.asarray(np.asarray([0, 0, 42.0], dtype=dtype_name), device=device) empty_int32_labels = xp.asarray([], dtype=xp.int32, device=device) int_labels = xp.asarray([1, 1, 1, 1], device=device) with config_context(array_api_dispatch=True): From a6fe06cd1d5aef31864d4a40fefefb5e90281f57 Mon Sep 17 00:00:00 2001 From: Tialo Date: Thu, 6 Jun 2024 19:30:33 +0300 Subject: [PATCH 10/12] array_api.rst --- doc/modules/array_api.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/modules/array_api.rst b/doc/modules/array_api.rst index 9b58cde56b09a..e665b4c36d9db 100644 --- a/doc/modules/array_api.rst +++ b/doc/modules/array_api.rst @@ -101,6 +101,7 @@ Estimators Metrics ------- +- :func:`sklearn.metrics.cluster.entropy` - :func:`sklearn.metrics.accuracy_score` - :func:`sklearn.metrics.mean_absolute_error` - :func:`sklearn.metrics.mean_tweedie_deviance` From cacf3b4b71bdcfe0e91b5d0c92feeb172ab9b9f2 Mon Sep 17 00:00:00 2001 From: Tialo <65392801+Tialo@users.noreply.github.com> Date: Fri, 7 Jun 2024 11:54:46 +0300 Subject: [PATCH 11/12] Update sklearn/metrics/cluster/_supervised.py Co-authored-by: Olivier Grisel --- sklearn/metrics/cluster/_supervised.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 727bd3c2af13d..9084ee3cd3df1 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -1297,4 +1297,6 @@ def entropy(labels): pi_sum = xp.sum(pi) # log(a / b) should be calculated as log(a) - log(b) for # possible loss of precision - return -xp.sum((pi / pi_sum) * (xp.log(pi) - log(pi_sum))) + # Always convert the result as a Python scalar (on CPU) instead of a device + # specific scalar array. + return float(-xp.sum((pi / pi_sum) * (xp.log(pi) - log(pi_sum)))) From bf5c517a3503262c74025ea3faf7fb28d9ec8a9e Mon Sep 17 00:00:00 2001 From: Tialo Date: Wed, 12 Jun 2024 14:59:22 +0300 Subject: [PATCH 12/12] tests --- sklearn/metrics/cluster/tests/test_supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index c45dd9a20858e..077dca0854a01 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -270,7 +270,7 @@ def test_entropy_array_api(array_namespace, device, dtype_name): empty_int32_labels = xp.asarray([], dtype=xp.int32, device=device) int_labels = xp.asarray([1, 1, 1, 1], device=device) with config_context(array_api_dispatch=True): - assert_almost_equal(entropy(float_labels), 0.6365141, 5) + assert entropy(float_labels) == pytest.approx(0.6365141, abs=1e-5) assert entropy(empty_int32_labels) == 1 assert entropy(int_labels) == 0