From 5a000a04278b20ab70815ab955f31fb2c4a79ef6 Mon Sep 17 00:00:00 2001
From: Stefanie Senger <stefanie.senger@posteo.de>
Date: Thu, 10 Apr 2025 16:07:58 +0200
Subject: [PATCH 01/14] ENH Add zero division handling to cohen_kappa_score

---
 sklearn/metrics/_classification.py           | 36 ++++++++++--
 sklearn/metrics/tests/test_classification.py | 60 ++++++++++++++++++++
 2 files changed, 91 insertions(+), 5 deletions(-)

diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index 30dd53bc16109..2dd591443aa90 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -799,10 +799,13 @@ def multilabel_confusion_matrix(
         "labels": ["array-like", None],
         "weights": [StrOptions({"linear", "quadratic"}), None],
         "sample_weight": ["array-like", None],
+        "replace_undefined_by": [Interval(Real, -1.0, 1.0, closed="both"), np.nan],
     },
     prefer_skip_nested_validation=True,
 )
-def cohen_kappa_score(y1, y2, *, labels=None, weights=None, sample_weight=None):
+def cohen_kappa_score(
+    y1, y2, *, labels=None, weights=None, sample_weight=None, replace_undefined_by=0.0
+):
     r"""Compute Cohen's kappa: a statistic that measures inter-annotator agreement.
 
     This function computes Cohen's kappa [1]_, a score that expresses the level
@@ -841,11 +844,24 @@ class labels [2]_.
     sample_weight : array-like of shape (n_samples,), default=None
         Sample weights.
 
+    replace_undefined_by : np.nan, float in [-1.0, 1.0], default=0.0
+        Sets the return value when a division by zero would occur. This can happen for
+        instance on empty input arrays, or when no label of interest (as defined in the
+        `labels` param) is assigned by the second annotator, or when both `y1` and `y2`
+        only have one label in common that is also in `labels`. Can take the
+        following values:
+
+        - `np.nan` to return `np.nan`
+        - a floating point value in the range of [-1.0, 1.0] to return a specific value
+
+        .. versionadded:: 1.8
+
     Returns
     -------
     kappa : float
-        The kappa statistic, which is a number between -1 and 1. The maximum
-        value means complete agreement; zero or lower means chance agreement.
+        The kappa statistic, which is a number between -1.0 and 1.0. The maximum value
+        means complete agreement; the minimum value means complete disagreement; 0.0
+        indicates no agreement beyond what would be expected by chance.
 
     References
     ----------
@@ -870,7 +886,12 @@ class labels [2]_.
     n_classes = confusion.shape[0]
     sum0 = np.sum(confusion, axis=0)
     sum1 = np.sum(confusion, axis=1)
-    expected = np.outer(sum0, sum1) / np.sum(sum0)
+
+    numerator = np.outer(sum0, sum1)
+    denominator = np.sum(sum0)
+    if np.isclose(denominator, 0):
+        return replace_undefined_by
+    expected = numerator / denominator
 
     if weights is None:
         w_mat = np.ones([n_classes, n_classes], dtype=int)
@@ -883,7 +904,12 @@ class labels [2]_.
         else:
             w_mat = (w_mat - w_mat.T) ** 2
 
-    k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
+    numerator = np.sum(w_mat * confusion)
+    denominator = np.sum(w_mat * expected)
+    if np.isclose(denominator, 0):
+        return replace_undefined_by
+    k = numerator / denominator
+
     return float(1 - k)
 
 
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index 13fe8b3deb88e..0f102d27356c4 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -926,6 +926,66 @@ def test_cohen_kappa():
     )
 
 
+@pytest.mark.parametrize("replace_undefined_by", [0.0, np.nan])
+def test_cohen_kappa_zero_division(replace_undefined_by):
+    """Test that cohen_kappa_score handles divisions by 0 correctly returning the
+    `replace_undefined_by` param."""
+
+    def check_equal(res, exp):
+        if np.isnan(res) and np.isnan(exp):
+            return True
+        return res == exp
+
+    # test case: empty inputs
+    y1 = np.array([])
+    y2 = np.array([])
+    assert check_equal(
+        cohen_kappa_score(y1, y2, replace_undefined_by=replace_undefined_by),
+        replace_undefined_by,
+    )
+
+    # test case: annotator y2 does not assign any label specified in `labels`
+    labels = [1, 2]
+    y1 = np.array([1] * 5 + [2] * 5)
+    y2 = np.array([3] * 10)
+    assert check_equal(
+        cohen_kappa_score(
+            y1, y2, labels=labels, replace_undefined_by=replace_undefined_by
+        ),
+        replace_undefined_by,
+    )
+
+    # test case: both inputs only have one label
+    y1 = np.array([3] * 10)
+    y2 = np.array([3] * 10)
+    assert check_equal(
+        cohen_kappa_score(y1, y2, replace_undefined_by=replace_undefined_by),
+        replace_undefined_by,
+    )
+
+    # test case: both inputs only have one label in common with `labels`
+    labels = [1]
+    y1 = np.array([1] * 5 + [2] * 5)
+    y2 = np.array([1] * 5 + [3] * 5)
+    assert check_equal(
+        cohen_kappa_score(
+            y1, y2, labels=labels, replace_undefined_by=replace_undefined_by
+        ),
+        replace_undefined_by,
+    )
+    # with weights="quadratic" it is almost the same test: skipped here
+    assert check_equal(
+        cohen_kappa_score(
+            y1,
+            y2,
+            labels=labels,
+            weights="linear",
+            replace_undefined_by=replace_undefined_by,
+        ),
+        replace_undefined_by,
+    )
+
+
 @pytest.mark.parametrize("zero_division", [0, 1, np.nan])
 @pytest.mark.parametrize("y_true, y_pred", [([0], [0])])
 @pytest.mark.parametrize(

From 02fd573b711c247df9c55644d38a5e206f936cd4 Mon Sep 17 00:00:00 2001
From: Stefanie Senger <stefanie.senger@posteo.de>
Date: Thu, 10 Apr 2025 16:23:16 +0200
Subject: [PATCH 02/14] add changelog

---
 .../upcoming_changes/sklearn.metrics/31172.enhancement.rst     | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 doc/whats_new/upcoming_changes/sklearn.metrics/31172.enhancement.rst

diff --git a/doc/whats_new/upcoming_changes/sklearn.metrics/31172.enhancement.rst b/doc/whats_new/upcoming_changes/sklearn.metrics/31172.enhancement.rst
new file mode 100644
index 0000000000000..8caa3169d63d6
--- /dev/null
+++ b/doc/whats_new/upcoming_changes/sklearn.metrics/31172.enhancement.rst
@@ -0,0 +1,3 @@
+- :func:`~metrics.cohen_kappa_score` now has a `replace_undefined_by` param, that can be
+  set to define the function's behaviour when there would be a division by zero.
+  By :user:`Stefanie Senger <StefanieSenger>`

From 2d84ded7c9b4b0ecc047748be457fa2b1363ca99 Mon Sep 17 00:00:00 2001
From: Stefanie Senger <stefanie.senger@posteo.de>
Date: Thu, 10 Apr 2025 17:06:36 +0200
Subject: [PATCH 03/14] add warnings raised in case of zero division

---
 sklearn/metrics/_classification.py           | 15 +++++++++-
 sklearn/metrics/tests/test_classification.py | 29 ++++++++++++++++++--
 2 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index 2dd591443aa90..092606ac6aef8 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -848,7 +848,8 @@ class labels [2]_.
         Sets the return value when a division by zero would occur. This can happen for
         instance on empty input arrays, or when no label of interest (as defined in the
         `labels` param) is assigned by the second annotator, or when both `y1` and `y2`
-        only have one label in common that is also in `labels`. Can take the
+        only have one label in common that is also in `labels`. In these cases, an
+        :class:`~sklearn.exceptions.UndefinedMetricWarning` is raised. Can take the
         following values:
 
         - `np.nan` to return `np.nan`
@@ -890,6 +891,12 @@ class labels [2]_.
     numerator = np.outer(sum0, sum1)
     denominator = np.sum(sum0)
     if np.isclose(denominator, 0):
+        msg = (
+            "`y2` does not contain any label that is also both present in `y1` and in "
+            "`labels`. cohen_kappa_score is undefined and set to the value defined in "
+            "the `replace_undefined_by` param, which defaults to 0.0."
+        )
+        warnings.warn(msg, UndefinedMetricWarning, stacklevel=2)
         return replace_undefined_by
     expected = numerator / denominator
 
@@ -907,6 +914,12 @@ class labels [2]_.
     numerator = np.sum(w_mat * confusion)
     denominator = np.sum(w_mat * expected)
     if np.isclose(denominator, 0):
+        msg = (
+            "`y1` and `y2` only have one label in common that is also in `labels`. "
+            "cohen_kappa_score is undefined and set to the value defined in the "
+            "`replace_undefined_by` param, which defaults to 0.0."
+        )
+        warnings.warn(msg, UndefinedMetricWarning, stacklevel=2)
         return replace_undefined_by
     k = numerator / denominator
 
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index 0f102d27356c4..82a67ba0a6eea 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -928,7 +928,7 @@ def test_cohen_kappa():
 
 @pytest.mark.parametrize("replace_undefined_by", [0.0, np.nan])
 def test_cohen_kappa_zero_division(replace_undefined_by):
-    """Test that cohen_kappa_score handles divisions by 0 correctly returning the
+    """Test that cohen_kappa_score handles divisions by 0 correctly by returning the
     `replace_undefined_by` param."""
 
     def check_equal(res, exp):
@@ -964,7 +964,7 @@ def check_equal(res, exp):
     )
 
     # test case: both inputs only have one label in common with `labels`
-    labels = [1]
+    labels = [1, 2]
     y1 = np.array([1] * 5 + [2] * 5)
     y2 = np.array([1] * 5 + [3] * 5)
     assert check_equal(
@@ -986,6 +986,31 @@ def check_equal(res, exp):
     )
 
 
+def test_cohen_kappa_zero_division_warning():
+    """Test that cohen_kappa_score raises UndefinedMetricWarning when a division by 0
+    occurs."""
+
+    # test first place to raise warning
+    labels = [1, 2]
+    y1 = np.array([1] * 5 + [2] * 5)
+    y2 = np.array([3] * 10)
+    with pytest.warns(
+        UndefinedMetricWarning,
+        match="`y2` does not contain any label that is also both present in",
+    ):
+        cohen_kappa_score(y1, y2, labels=labels)
+
+    # test second place to raise warning
+    labels = [1, 2]
+    y1 = np.array([1] * 5 + [2] * 5)
+    y2 = np.array([1] * 5 + [3] * 5)
+    with pytest.warns(
+        UndefinedMetricWarning,
+        match="`y1` and `y2` only have one label in common that is also in `labels`.",
+    ):
+        cohen_kappa_score(y1, y2, labels=labels)
+
+
 @pytest.mark.parametrize("zero_division", [0, 1, np.nan])
 @pytest.mark.parametrize("y_true, y_pred", [([0], [0])])
 @pytest.mark.parametrize(

From 4b00d9fbdcdd5656098dc396bdcf2b642e9812d4 Mon Sep 17 00:00:00 2001
From: Stefanie Senger <stefanie.senger@posteo.de>
Date: Thu, 10 Apr 2025 17:45:14 +0200
Subject: [PATCH 04/14] refine test comments

---
 sklearn/metrics/tests/test_classification.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index 82a67ba0a6eea..cacbcd478259c 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -944,7 +944,9 @@ def check_equal(res, exp):
         replace_undefined_by,
     )
 
-    # test case: annotator y2 does not assign any label specified in `labels`
+    # test case: annotator y2 does not assign any label specified in `labels` (note:
+    # also applicable if labels is default and y2 does not contain any label that is in
+    # y1)
     labels = [1, 2]
     y1 = np.array([1] * 5 + [2] * 5)
     y2 = np.array([3] * 10)
@@ -963,7 +965,9 @@ def check_equal(res, exp):
         replace_undefined_by,
     )
 
-    # test case: both inputs only have one label in common with `labels`
+    # test case: both inputs only have one label in common that is also in `labels`
+    # (note: weights="linear" and weights="quadratic" are different branches, though the
+    # latter is so similar to the former that the test is skipped here)
     labels = [1, 2]
     y1 = np.array([1] * 5 + [2] * 5)
     y2 = np.array([1] * 5 + [3] * 5)
@@ -973,7 +977,6 @@ def check_equal(res, exp):
         ),
         replace_undefined_by,
     )
-    # with weights="quadratic" it is almost the same test: skipped here
     assert check_equal(
         cohen_kappa_score(
             y1,

From f58492a6bebeb366b49489c1db99a962d7fe1274 Mon Sep 17 00:00:00 2001
From: Stefanie Senger <stefanie.senger@posteo.de>
Date: Thu, 10 Apr 2025 23:02:37 +0200
Subject: [PATCH 05/14] correct version

---
 sklearn/metrics/_classification.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index 092606ac6aef8..f2c30794cd5f4 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -855,7 +855,7 @@ class labels [2]_.
         - `np.nan` to return `np.nan`
         - a floating point value in the range of [-1.0, 1.0] to return a specific value
 
-        .. versionadded:: 1.8
+        .. versionadded:: 1.7
 
     Returns
     -------

From 245da3e77dbf315b1c360063c9c60747415fe50f Mon Sep 17 00:00:00 2001
From: Stefanie Senger <stefanie.senger@posteo.de>
Date: Fri, 11 Apr 2025 13:11:27 +0200
Subject: [PATCH 06/14] improve docstring of test

---
 sklearn/metrics/tests/test_classification.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index cacbcd478259c..8ef9305d1c12c 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -929,7 +929,9 @@ def test_cohen_kappa():
 @pytest.mark.parametrize("replace_undefined_by", [0.0, np.nan])
 def test_cohen_kappa_zero_division(replace_undefined_by):
     """Test that cohen_kappa_score handles divisions by 0 correctly by returning the
-    `replace_undefined_by` param."""
+    `replace_undefined_by` param. (The fist two tests cover the first possible location
+    in the function for an occurrence of a division by zero, the second two tests in the
+    the second possible location in the function."""
 
     def check_equal(res, exp):
         if np.isnan(res) and np.isnan(exp):

From ede386e4d91fd6cc880b3594c8381e44f6d60cd0 Mon Sep 17 00:00:00 2001
From: Stefanie Senger <stefanie.senger@posteo.de>
Date: Sat, 12 Apr 2025 17:51:48 +0200
Subject: [PATCH 07/14] wording

---
 .../upcoming_changes/sklearn.metrics/31172.enhancement.rst      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats_new/upcoming_changes/sklearn.metrics/31172.enhancement.rst b/doc/whats_new/upcoming_changes/sklearn.metrics/31172.enhancement.rst
index 8caa3169d63d6..606f629655465 100644
--- a/doc/whats_new/upcoming_changes/sklearn.metrics/31172.enhancement.rst
+++ b/doc/whats_new/upcoming_changes/sklearn.metrics/31172.enhancement.rst
@@ -1,3 +1,3 @@
 - :func:`~metrics.cohen_kappa_score` now has a `replace_undefined_by` param, that can be
-  set to define the function's behaviour when there would be a division by zero.
+  set to define the function's return value when there would be a division by zero.
   By :user:`Stefanie Senger <StefanieSenger>`

From b93b44553f10fd5bd1b031f742e0a4a963f36691 Mon Sep 17 00:00:00 2001
From: Stefanie Senger <stefanie.senger@posteo.de>
Date: Sat, 12 Apr 2025 18:43:01 +0200
Subject: [PATCH 08/14] add deprecation cycle for default behaviour if zero
 division

---
 sklearn/metrics/_classification.py           | 31 ++++++++++++++++++--
 sklearn/metrics/tests/test_classification.py | 27 +++++++++++++++++
 2 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index f2c30794cd5f4..f6bbd8a439db0 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -799,12 +799,24 @@ def multilabel_confusion_matrix(
         "labels": ["array-like", None],
         "weights": [StrOptions({"linear", "quadratic"}), None],
         "sample_weight": ["array-like", None],
-        "replace_undefined_by": [Interval(Real, -1.0, 1.0, closed="both"), np.nan],
+        "replace_undefined_by": [
+            Interval(Real, -1.0, 1.0, closed="both"),
+            np.nan,
+            Hidden(StrOptions({"deprecated"})),
+        ],
     },
     prefer_skip_nested_validation=True,
 )
+# TODO(1.9): Change default value for `replace_undefined_by` param to 0.0 and remove
+# FutureWarnings.
 def cohen_kappa_score(
-    y1, y2, *, labels=None, weights=None, sample_weight=None, replace_undefined_by=0.0
+    y1,
+    y2,
+    *,
+    labels=None,
+    weights=None,
+    sample_weight=None,
+    replace_undefined_by="deprecated",
 ):
     r"""Compute Cohen's kappa: a statistic that measures inter-annotator agreement.
 
@@ -844,7 +856,7 @@ class labels [2]_.
     sample_weight : array-like of shape (n_samples,), default=None
         Sample weights.
 
-    replace_undefined_by : np.nan, float in [-1.0, 1.0], default=0.0
+    replace_undefined_by : np.nan, float in [-1.0, 1.0], default=np.nan
         Sets the return value when a division by zero would occur. This can happen for
         instance on empty input arrays, or when no label of interest (as defined in the
         `labels` param) is assigned by the second annotator, or when both `y1` and `y2`
@@ -888,9 +900,19 @@ class labels [2]_.
     sum0 = np.sum(confusion, axis=0)
     sum1 = np.sum(confusion, axis=1)
 
+    mgs_changing_default = (
+        "The default return value of `cohen_kappa_score` in case of a division "
+        "by zero has been deprecated in 1.7 and will be changed to 0.0 in version "
+        "1.9. Set `replace_undefined_by=0.0` to use the new default and to silence "
+        "this Warning."
+    )
+
     numerator = np.outer(sum0, sum1)
     denominator = np.sum(sum0)
     if np.isclose(denominator, 0):
+        if replace_undefined_by == "deprecated":
+            replace_undefined_by = np.nan
+            warnings.warn(mgs_changing_default, FutureWarning)
         msg = (
             "`y2` does not contain any label that is also both present in `y1` and in "
             "`labels`. cohen_kappa_score is undefined and set to the value defined in "
@@ -914,6 +936,9 @@ class labels [2]_.
     numerator = np.sum(w_mat * confusion)
     denominator = np.sum(w_mat * expected)
     if np.isclose(denominator, 0):
+        if replace_undefined_by == "deprecated":
+            replace_undefined_by = np.nan
+            warnings.warn(mgs_changing_default, FutureWarning)
         msg = (
             "`y1` and `y2` only have one label in common that is also in `labels`. "
             "cohen_kappa_score is undefined and set to the value defined in the "
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index 8ef9305d1c12c..646d8d343ca2c 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -991,6 +991,8 @@ def check_equal(res, exp):
     )
 
 
+# TODO(1.9): remove the @ignore_warnings of the FutureWarning
+@ignore_warnings(category=FutureWarning)
 def test_cohen_kappa_zero_division_warning():
     """Test that cohen_kappa_score raises UndefinedMetricWarning when a division by 0
     occurs."""
@@ -1016,6 +1018,31 @@ def test_cohen_kappa_zero_division_warning():
         cohen_kappa_score(y1, y2, labels=labels)
 
 
+# TODO(1.9): remove test when deprecation cycle is over
+def test_cohen_kappa_score_raise_warning_deprecation():
+    """Test that `cohen_kappa_score` raises a `FutureWarning` for the changing default
+    of the `replace_undefined_by` param."""
+    # test first place to raise warning
+    labels = [1, 2]
+    y1 = np.array([1] * 5 + [2] * 5)
+    y2 = np.array([3] * 10)
+    with pytest.warns(
+        FutureWarning,
+        match="The default return value of `cohen_kappa_score` in case of a division",
+    ):
+        cohen_kappa_score(y1, y2, labels=labels)
+
+    # test second place to raise warning
+    labels = [1, 2]
+    y1 = np.array([1] * 5 + [2] * 5)
+    y2 = np.array([1] * 5 + [3] * 5)
+    with pytest.warns(
+        FutureWarning,
+        match="The default return value of `cohen_kappa_score` in case of a division",
+    ):
+        cohen_kappa_score(y1, y2, labels=labels)
+
+
 @pytest.mark.parametrize("zero_division", [0, 1, np.nan])
 @pytest.mark.parametrize("y_true, y_pred", [([0], [0])])
 @pytest.mark.parametrize(

From a7f4ba674668cdadad3264c950754c4b52a55587 Mon Sep 17 00:00:00 2001
From: Stefanie Senger <stefanie.senger@posteo.de>
Date: Sat, 19 Apr 2025 07:27:26 +0200
Subject: [PATCH 09/14] fix linting

---
 sklearn/metrics/tests/test_classification.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index 2faa324c73a46..b8a4cf7921644 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -1042,6 +1042,7 @@ def test_cohen_kappa_score_raise_warning_deprecation():
     ):
         cohen_kappa_score(y1, y2, labels=labels)
 
+
 def test_cohen_kappa_score_error_wrong_label():
     """Test that correct error is raised when users pass labels that are not in y1."""
     labels = [1, 2]

From 6d8e59bd16e3fecbaf29c601098b3fd0be6133d2 Mon Sep 17 00:00:00 2001
From: Stefanie Senger <91849487+StefanieSenger@users.noreply.github.com>
Date: Wed, 30 Apr 2025 16:06:24 +0200
Subject: [PATCH 10/14] Apply suggestions from code review

Co-authored-by: Virgil Chan <virchan.math@gmail.com>
---
 sklearn/metrics/_classification.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index 919ed010149e5..2c35fc0cf0277 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -927,7 +927,7 @@ class labels [2]_.
             replace_undefined_by = np.nan
             warnings.warn(mgs_changing_default, FutureWarning)
         msg = (
-            "`y2` does not contain any label that is also both present in `y1` and in "
+            "`y2` contains no labels that are presented in both `y1` and "
             "`labels`. cohen_kappa_score is undefined and set to the value defined in "
             "the `replace_undefined_by` param, which defaults to 0.0."
         )
@@ -953,7 +953,7 @@ class labels [2]_.
             replace_undefined_by = np.nan
             warnings.warn(mgs_changing_default, FutureWarning)
         msg = (
-            "`y1` and `y2` only have one label in common that is also in `labels`. "
+            "`y1`, `y2` and `labels` have only one label in common. "
             "cohen_kappa_score is undefined and set to the value defined in the "
             "`replace_undefined_by` param, which defaults to 0.0."
         )

From 973b219342a2a15b1a7cc536e331a93d804d79fb Mon Sep 17 00:00:00 2001
From: Stefanie Senger <stefanie.senger@posteo.de>
Date: Wed, 30 Apr 2025 19:48:50 +0200
Subject: [PATCH 11/14] clean up test and correct warning message

---
 sklearn/cluster/_agglomerative.py            |  2 +-
 sklearn/metrics/_classification.py           | 10 +++++-----
 sklearn/metrics/tests/test_classification.py |  4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py
index a2365da3669c4..f068dc934151d 100644
--- a/sklearn/cluster/_agglomerative.py
+++ b/sklearn/cluster/_agglomerative.py
@@ -36,7 +36,7 @@
 from ..utils.validation import check_memory, validate_data
 
 # mypy error: Module 'sklearn.cluster' has no attribute '_hierarchical_fast'
-from . import _hierarchical_fast as _hierarchical
+from . import _hierarchical_fast as _hierarchical  # type: ignore[attr-defined]
 from ._feature_agglomeration import AgglomerationTransform
 
 ###############################################################################
diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index 2c35fc0cf0277..efe9149204bbf 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -808,7 +808,7 @@ def multilabel_confusion_matrix(
     prefer_skip_nested_validation=True,
 )
 # TODO(1.9): Change default value for `replace_undefined_by` param to 0.0 and remove
-# FutureWarnings.
+# FutureWarnings; also the defaults in the warning messages need to be updated.
 def cohen_kappa_score(
     y1,
     y2,
@@ -927,9 +927,9 @@ class labels [2]_.
             replace_undefined_by = np.nan
             warnings.warn(mgs_changing_default, FutureWarning)
         msg = (
-            "`y2` contains no labels that are presented in both `y1` and "
-            "`labels`. cohen_kappa_score is undefined and set to the value defined in "
-            "the `replace_undefined_by` param, which defaults to 0.0."
+            "`y2` contains no labels that are presented in both `y1` and `labels`."
+            "cohen_kappa_score is undefined and set to the value defined in "
+            "the `replace_undefined_by` param, which defaults to `np.nan`."
         )
         warnings.warn(msg, UndefinedMetricWarning, stacklevel=2)
         return replace_undefined_by
@@ -955,7 +955,7 @@ class labels [2]_.
         msg = (
             "`y1`, `y2` and `labels` have only one label in common. "
             "cohen_kappa_score is undefined and set to the value defined in the "
-            "`replace_undefined_by` param, which defaults to 0.0."
+            "`replace_undefined_by` param, which defaults to `np.nan`."
         )
         warnings.warn(msg, UndefinedMetricWarning, stacklevel=2)
         return replace_undefined_by
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index b8a4cf7921644..855129b85ba64 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -1003,7 +1003,7 @@ def test_cohen_kappa_zero_division_warning():
     y2 = np.array([3] * 10)
     with pytest.warns(
         UndefinedMetricWarning,
-        match="`y2` does not contain any label that is also both present in",
+        match="`y2` contains no labels that are presented in both `y1` and `labels`.",
     ):
         cohen_kappa_score(y1, y2, labels=labels)
 
@@ -1013,7 +1013,7 @@ def test_cohen_kappa_zero_division_warning():
     y2 = np.array([1] * 5 + [3] * 5)
     with pytest.warns(
         UndefinedMetricWarning,
-        match="`y1` and `y2` only have one label in common that is also in `labels`.",
+        match="`y1`, `y2` and `labels` have only one label in common.",
     ):
         cohen_kappa_score(y1, y2, labels=labels)
 

From 2ee10a39d9d1cb0ce856314a6c78dfac1f2ab312 Mon Sep 17 00:00:00 2001
From: Stefanie Senger <stefanie.senger@posteo.de>
Date: Wed, 30 Apr 2025 20:18:56 +0200
Subject: [PATCH 12/14] leaner test

---
 sklearn/metrics/tests/test_classification.py | 76 ++++++++------------
 1 file changed, 29 insertions(+), 47 deletions(-)

diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index 855129b85ba64..574445008b02c 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -926,65 +926,47 @@ def test_cohen_kappa():
     )
 
 
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        # empty inputs:
+        ([], [], None, None),
+        # annotator y2 does not assign any label specified in `labels` (note: also
+        # applicable if `labels` is default and `y2` does not contain any label that is
+        # in `y1`):
+        ([1] * 5 + [2] * 5, [3] * 10, [1, 2], None),
+        # both inputs (`y1` and `y2`) only have one label:
+        ([3] * 10, [3] * 10, None, None),
+        # both inputs only have one label in common that is also in `labels`:
+        ([1] * 5 + [2] * 5, [1] * 5 + [3] * 5, [1, 2], None),
+        # like the last test case, but with `weights="linear"` (note that
+        # weights="linear" and weights="quadratic" are different branches, though the
+        # latter is so similar to the former that the test case is skipped here):
+        ([1] * 5 + [2] * 5, [1] * 5 + [3] * 5, [1, 2], "linear"),
+    ],
+)
 @pytest.mark.parametrize("replace_undefined_by", [0.0, np.nan])
-def test_cohen_kappa_zero_division(replace_undefined_by):
+def test_cohen_kappa_zero_division(test_case, replace_undefined_by):
     """Test that cohen_kappa_score handles divisions by 0 correctly by returning the
-    `replace_undefined_by` param. (The fist two tests cover the first possible location
-    in the function for an occurrence of a division by zero, the second two tests in the
-    the second possible location in the function."""
+    `replace_undefined_by` param. (The first two test cases cover the first possible
+    location in the function for an occurrence of a division by zero, the last three
+    test cases cover a zero division in the second possible location in the
+    function.)"""
 
-    def check_equal(res, exp):
+    def _check_equal(res, exp):
         if np.isnan(res) and np.isnan(exp):
             return True
         return res == exp
 
-    # test case: empty inputs
-    y1 = np.array([])
-    y2 = np.array([])
-    assert check_equal(
-        cohen_kappa_score(y1, y2, replace_undefined_by=replace_undefined_by),
-        replace_undefined_by,
-    )
+    y1, y2, labels, weights = test_case
+    y1, y2 = np.array(y1), np.array(y2)
 
-    # test case: annotator y2 does not assign any label specified in `labels` (note:
-    # also applicable if labels is default and y2 does not contain any label that is in
-    # y1)
-    labels = [1, 2]
-    y1 = np.array([1] * 5 + [2] * 5)
-    y2 = np.array([3] * 10)
-    assert check_equal(
-        cohen_kappa_score(
-            y1, y2, labels=labels, replace_undefined_by=replace_undefined_by
-        ),
-        replace_undefined_by,
-    )
-
-    # test case: both inputs only have one label
-    y1 = np.array([3] * 10)
-    y2 = np.array([3] * 10)
-    assert check_equal(
-        cohen_kappa_score(y1, y2, replace_undefined_by=replace_undefined_by),
-        replace_undefined_by,
-    )
-
-    # test case: both inputs only have one label in common that is also in `labels`
-    # (note: weights="linear" and weights="quadratic" are different branches, though the
-    # latter is so similar to the former that the test is skipped here)
-    labels = [1, 2]
-    y1 = np.array([1] * 5 + [2] * 5)
-    y2 = np.array([1] * 5 + [3] * 5)
-    assert check_equal(
-        cohen_kappa_score(
-            y1, y2, labels=labels, replace_undefined_by=replace_undefined_by
-        ),
-        replace_undefined_by,
-    )
-    assert check_equal(
+    assert _check_equal(
         cohen_kappa_score(
             y1,
             y2,
             labels=labels,
-            weights="linear",
+            weights=weights,
             replace_undefined_by=replace_undefined_by,
         ),
         replace_undefined_by,

From 13af4c8b2cb4b9f59b737f58c31e597107c86670 Mon Sep 17 00:00:00 2001
From: Stefanie Senger <91849487+StefanieSenger@users.noreply.github.com>
Date: Mon, 5 May 2025 12:11:03 +0200
Subject: [PATCH 13/14] Apply suggestions from code review

Co-authored-by: Adrin Jalali <adrin.jalali@gmail.com>
---
 sklearn/metrics/_classification.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index efe9149204bbf..8b5bd45ee488a 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -914,7 +914,7 @@ class labels [2]_.
     sum1 = np.sum(confusion, axis=1)
 
     mgs_changing_default = (
-        "The default return value of `cohen_kappa_score` in case of a division "
+        "`np.nan` as the default return value of `cohen_kappa_score` in case of a division "
         "by zero has been deprecated in 1.7 and will be changed to 0.0 in version "
         "1.9. Set `replace_undefined_by=0.0` to use the new default and to silence "
         "this Warning."

From 703eaae958e005ca85376ebb04ce857c3b984640 Mon Sep 17 00:00:00 2001
From: Stefanie Senger <stefanie.senger@posteo.de>
Date: Mon, 5 May 2025 12:51:47 +0200
Subject: [PATCH 14/14] deal with zero division in helper function

---
 sklearn/metrics/_classification.py           | 51 +++++++++++---------
 sklearn/metrics/tests/test_classification.py |  4 +-
 2 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index 8b5bd45ee488a..fd2c375abf3a8 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -897,6 +897,47 @@ class labels [2]_.
     >>> cohen_kappa_score(y1, y2)
     0.6875
     """
+
+    def _check_zero_division(denominator, _replace_undefined_by, msg):
+        """Warn and report whether a division by `denominator` is undefined.
+
+        If `denominator` is close to zero, the enclosing function's
+        `replace_undefined_by` is resolved in place (the `"deprecated"` sentinel
+        becomes `np.nan`), so that the caller's `return replace_undefined_by`
+        yields the resolved value rather than the sentinel string.
+
+        Parameters
+        ----------
+        denominator : float
+            Denominator of the division that is about to be performed.
+
+        _replace_undefined_by : object
+            Ignored. Only kept so that the existing three-argument calls keep
+            working; the enclosing function's variable is used instead.
+
+        msg : str
+            Message of the `UndefinedMetricWarning` emitted on zero division.
+
+        Returns
+        -------
+        is_undefined : bool
+            True if `denominator` is close to zero, False otherwise.
+        """
+        # Rebinding a parameter would only change the helper's own local name,
+        # hence `nonlocal` on the enclosing function's variable.
+        nonlocal replace_undefined_by
+        if not np.isclose(denominator, 0):
+            return False
+        if replace_undefined_by == "deprecated":
+            replace_undefined_by = np.nan
+            # `mgs_changing_default` is assigned in the enclosing function ahead
+            # of both call sites of this helper.
+            warnings.warn(mgs_changing_default, FutureWarning)
+        # One level deeper than before the refactoring into a helper, so that the
+        # warning is still attributed to the same frame as `stacklevel=2` was.
+        warnings.warn(msg, UndefinedMetricWarning, stacklevel=3)
+        return True
+
     try:
         confusion = confusion_matrix(y1, y2, labels=labels, sample_weight=sample_weight)
     except ValueError as e:
@@ -914,25 +923,22 @@ class labels [2]_.
     sum1 = np.sum(confusion, axis=1)
 
     mgs_changing_default = (
-        "`np.nan` as the default return value of `cohen_kappa_score` in case of a division "
-        "by zero has been deprecated in 1.7 and will be changed to 0.0 in version "
-        "1.9. Set `replace_undefined_by=0.0` to use the new default and to silence "
-        "this Warning."
+        "`np.nan` as the default return value of `cohen_kappa_score` in case of a "
+        "division by zero has been deprecated in 1.7 and will be changed to 0.0 in "
+        "version 1.9. Set `replace_undefined_by=0.0` to use the new default and to "
+        "silence this Warning."
     )
 
     numerator = np.outer(sum0, sum1)
     denominator = np.sum(sum0)
-    if np.isclose(denominator, 0):
-        if replace_undefined_by == "deprecated":
-            replace_undefined_by = np.nan
-            warnings.warn(mgs_changing_default, FutureWarning)
-        msg = (
-            "`y2` contains no labels that are presented in both `y1` and `labels`."
-            "cohen_kappa_score is undefined and set to the value defined in "
-            "the `replace_undefined_by` param, which defaults to `np.nan`."
-        )
-        warnings.warn(msg, UndefinedMetricWarning, stacklevel=2)
+    msg_zero_division = (
+        "`y2` contains no labels that are presented in both `y1` and `labels`. "
+        "`cohen_kappa_score` is undefined and set to the value defined by "
+        "the `replace_undefined_by` param, which defaults to `np.nan`."
+    )
+    if _check_zero_division(denominator, replace_undefined_by, msg_zero_division):
         return replace_undefined_by
+
     expected = numerator / denominator
 
     if weights is None:
@@ -948,17 +954,14 @@ class labels [2]_.
 
     numerator = np.sum(w_mat * confusion)
     denominator = np.sum(w_mat * expected)
-    if np.isclose(denominator, 0):
-        if replace_undefined_by == "deprecated":
-            replace_undefined_by = np.nan
-            warnings.warn(mgs_changing_default, FutureWarning)
-        msg = (
-            "`y1`, `y2` and `labels` have only one label in common. "
-            "cohen_kappa_score is undefined and set to the value defined in the "
-            "`replace_undefined_by` param, which defaults to `np.nan`."
-        )
-        warnings.warn(msg, UndefinedMetricWarning, stacklevel=2)
+    msg_zero_division = (
+        "`y1`, `y2` and `labels` have only one label in common. "
+        "`cohen_kappa_score` is undefined and set to the value defined by the "
+        "`replace_undefined_by` param, which defaults to `np.nan`."
+    )
+    if _check_zero_division(denominator, replace_undefined_by, msg_zero_division):
         return replace_undefined_by
+
     k = numerator / denominator
 
     return float(1 - k)
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index 574445008b02c..da163aaef5016 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -1010,7 +1010,7 @@ def test_cohen_kappa_score_raise_warning_deprecation():
     y2 = np.array([3] * 10)
     with pytest.warns(
         FutureWarning,
-        match="The default return value of `cohen_kappa_score` in case of a division",
+        match="`np.nan` as the default return value of `cohen_kappa_score` in case of",
     ):
         cohen_kappa_score(y1, y2, labels=labels)
 
@@ -1020,7 +1020,7 @@ def test_cohen_kappa_score_raise_warning_deprecation():
     y2 = np.array([1] * 5 + [3] * 5)
     with pytest.warns(
         FutureWarning,
-        match="The default return value of `cohen_kappa_score` in case of a division",
+        match="`np.nan` as the default return value of `cohen_kappa_score` in case of",
     ):
         cohen_kappa_score(y1, y2, labels=labels)