
ENH Add zero division handling to cohen_kappa_score #31172


Open: wants to merge 15 commits into main
@@ -0,0 +1,3 @@
- :func:`~metrics.cohen_kappa_score` now has a `replace_undefined_by` param that can be
  set to define the function's return value when a division by zero would otherwise occur.
  By :user:`Stefanie Senger <StefanieSenger>`
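
A rough usage sketch of the parameter described in this entry, as proposed in this PR (the inputs are made up for illustration; exact messages and defaults follow the diff below):

```python
import numpy as np
from sklearn.metrics import cohen_kappa_score

# The second annotator assigns none of the labels of interest, so the score is
# undefined and an UndefinedMetricWarning is raised in either case.
y1 = [1, 1, 1, 2, 2]
y2 = [3, 3, 3, 3, 3]

# Deprecated default: returns np.nan and additionally raises a FutureWarning.
cohen_kappa_score(y1, y2, labels=[1, 2])

# An explicit replacement value (np.nan or a float in [-1.0, 1.0]) is returned as-is.
cohen_kappa_score(y1, y2, labels=[1, 2], replace_undefined_by=0.0)  # -> 0.0
```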
74 changes: 69 additions & 5 deletions sklearn/metrics/_classification.py
@@ -799,10 +799,25 @@ def multilabel_confusion_matrix(
"labels": ["array-like", None],
"weights": [StrOptions({"linear", "quadratic"}), None],
"sample_weight": ["array-like", None],
"replace_undefined_by": [
Interval(Real, -1.0, 1.0, closed="both"),
np.nan,
Hidden(StrOptions({"deprecated"})),
],
},
prefer_skip_nested_validation=True,
)
def cohen_kappa_score(y1, y2, *, labels=None, weights=None, sample_weight=None):
# TODO(1.9): Change default value for `replace_undefined_by` param to 0.0 and remove
# FutureWarnings; also the defaults in the warning messages need to be updated.
def cohen_kappa_score(
y1,
y2,
*,
labels=None,
weights=None,
sample_weight=None,
replace_undefined_by="deprecated",
):
r"""Compute Cohen's kappa: a statistic that measures inter-annotator agreement.

This function computes Cohen's kappa [1]_, a score that expresses the level
@@ -843,11 +858,25 @@ class labels [2]_.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.

replace_undefined_by : np.nan, float in [-1.0, 1.0], default=np.nan
Sets the return value when a division by zero would occur. This can happen for
instance on empty input arrays, or when no label of interest (as defined in the
`labels` param) is assigned by the second annotator, or when both `y1` and `y2`
only have one label in common that is also in `labels`. In these cases, an
:class:`~sklearn.exceptions.UndefinedMetricWarning` is raised. Can take the
following values:

- `np.nan` to return `np.nan`
- a floating point value in the range of [-1.0, 1.0] to return a specific value

.. versionadded:: 1.7

Returns
-------
kappa : float
The kappa statistic, which is a number between -1 and 1. The maximum
value means complete agreement; zero or lower means chance agreement.
The kappa statistic, which is a number between -1.0 and 1.0. The maximum value
means complete agreement; the minimum value means complete disagreement; 0.0
indicates no agreement beyond what would be expected by chance.

References
----------
@@ -883,7 +912,28 @@ class labels [2]_.
n_classes = confusion.shape[0]
sum0 = np.sum(confusion, axis=0)
sum1 = np.sum(confusion, axis=1)
expected = np.outer(sum0, sum1) / np.sum(sum0)

msg_changing_default = (
"The default return value of `cohen_kappa_score` in case of a division "
"by zero has been deprecated in 1.7 and will be changed to 0.0 in version "
"1.9. Set `replace_undefined_by=0.0` to use the new default and to silence "
"this warning."
)
Comment on lines +916 to +921 (Contributor Author):
I would pick 0.0 as the future default (instead of -1.0, which is the worst score), because it is the least expressive of the scores, representing label agreement no better than chance.

If users use cohen_kappa_score as part of a custom metric that calculates the mean over several cohen_kappa_scores, 0.0 would be a neutral element, like the "ignore" option that we have talked about in this comment: #29048 (comment) (see the sketch below).
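
A small sketch of this argument (the aggregation code is hypothetical and not part of this PR; it only assumes the `replace_undefined_by` parameter added here): with `np.nan`, one undefined score propagates through a plain mean, whereas 0.0 keeps the aggregate finite, and `np.nanmean` corresponds to the "ignore" behavior.

```python
import numpy as np
from sklearn.metrics import cohen_kappa_score

# Hypothetical annotation batches; the last one is degenerate (a single label),
# so its kappa score is undefined.
batches = [
    ([1, 2, 1, 2], [1, 2, 2, 2]),
    ([1, 1, 2, 2], [1, 2, 1, 2]),
    ([3, 3, 3, 3], [3, 3, 3, 3]),
]

as_nan = [cohen_kappa_score(a, b, replace_undefined_by=np.nan) for a, b in batches]
as_zero = [cohen_kappa_score(a, b, replace_undefined_by=0.0) for a, b in batches]

print(np.mean(as_nan))     # nan -- the undefined batch poisons the mean
print(np.nanmean(as_nan))  # mean over the defined batches only ("ignore")
print(np.mean(as_zero))    # finite -- the undefined batch counts as chance-level agreement
```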


numerator = np.outer(sum0, sum1)
denominator = np.sum(sum0)
if np.isclose(denominator, 0):
if replace_undefined_by == "deprecated":
replace_undefined_by = np.nan
warnings.warn(msg_changing_default, FutureWarning)
msg = (
"`y2` contains no labels that are present in both `y1` and `labels`. "
"cohen_kappa_score is undefined and set to the value defined in "
"the `replace_undefined_by` param, which defaults to `np.nan`."
)
warnings.warn(msg, UndefinedMetricWarning, stacklevel=2)
return replace_undefined_by
expected = numerator / denominator

if weights is None:
w_mat = np.ones([n_classes, n_classes], dtype=int)
@@ -896,7 +946,21 @@ class labels [2]_.
else:
w_mat = (w_mat - w_mat.T) ** 2

k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
numerator = np.sum(w_mat * confusion)
denominator = np.sum(w_mat * expected)
if np.isclose(denominator, 0):
if replace_undefined_by == "deprecated":
replace_undefined_by = np.nan
warnings.warn(msg_changing_default, FutureWarning)
msg = (
"`y1`, `y2` and `labels` have only one label in common. "
"cohen_kappa_score is undefined and set to the value defined in the "
"`replace_undefined_by` param, which defaults to `np.nan`."
)
Comment (Contributor Author):
In my view, this message also fits the cases where y1 and y2 only contain a single label (as in the test case "both inputs only have one label").

warnings.warn(msg, UndefinedMetricWarning, stacklevel=2)
return replace_undefined_by
k = numerator / denominator

return float(1 - k)
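
For reference, a minimal sketch of inputs that reach each of the two guarded divisions above, mirroring the test cases added below; both calls raise an UndefinedMetricWarning, and omitting `replace_undefined_by` would additionally trigger the FutureWarning during the deprecation period.

```python
import numpy as np
from sklearn.metrics import cohen_kappa_score

# First guarded division: y2 assigns none of the labels of interest, so the
# confusion matrix restricted to `labels` sums to zero.
cohen_kappa_score(
    [1] * 5 + [2] * 5, [3] * 10, labels=[1, 2], replace_undefined_by=np.nan
)

# Second guarded division: y1 and y2 contain only a single label, so the
# off-diagonal weight matrix zeroes out the expected-agreement sum.
cohen_kappa_score([3] * 10, [3] * 10, replace_undefined_by=0.0)
```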


99 changes: 99 additions & 0 deletions sklearn/metrics/tests/test_classification.py
@@ -926,6 +926,105 @@ def test_cohen_kappa():
)


@pytest.mark.parametrize(
"test_case",
[
# empty inputs:
([], [], None, None),
# annotator y2 does not assign any label specified in `labels` (note: also
# applicable if `labels` is default and `y2` does not contain any label that is
# in `y1`):
([1] * 5 + [2] * 5, [3] * 10, [1, 2], None),
# both inputs (`y1` and `y2`) only have one label:
([3] * 10, [3] * 10, None, None),
# both inputs only have one label in common that is also in `labels`:
([1] * 5 + [2] * 5, [1] * 5 + [3] * 5, [1, 2], None),
# like the last test case, but with `weights="linear"` (note that
# weights="linear" and weights="quadratic" are different branches, though the
# latter is so similar to the former that the test case is skipped here):
([1] * 5 + [2] * 5, [1] * 5 + [3] * 5, [1, 2], "linear"),
],
)
@pytest.mark.parametrize("replace_undefined_by", [0.0, np.nan])
def test_cohen_kappa_zero_division(test_case, replace_undefined_by):
"""Test that cohen_kappa_score handles divisions by 0 correctly by returning the
value of the `replace_undefined_by` param. (The first two test cases cover the
first possible location in the function for an occurrence of a division by zero;
the last three test cases cover a zero division in the second possible location
in the function.)"""

def _check_equal(res, exp):
if np.isnan(res) and np.isnan(exp):
return True
return res == exp

y1, y2, labels, weights = test_case
y1, y2 = np.array(y1), np.array(y2)

assert _check_equal(
cohen_kappa_score(
y1,
y2,
labels=labels,
weights=weights,
replace_undefined_by=replace_undefined_by,
),
replace_undefined_by,
)


# TODO(1.9): remove the @ignore_warnings decorator that silences the FutureWarning
@ignore_warnings(category=FutureWarning)
def test_cohen_kappa_zero_division_warning():
"""Test that cohen_kappa_score raises UndefinedMetricWarning when a division by 0
occurs."""

# test first place to raise warning
labels = [1, 2]
y1 = np.array([1] * 5 + [2] * 5)
y2 = np.array([3] * 10)
with pytest.warns(
UndefinedMetricWarning,
match="`y2` contains no labels that are present in both `y1` and `labels`.",
):
cohen_kappa_score(y1, y2, labels=labels)

# test second place to raise warning
labels = [1, 2]
y1 = np.array([1] * 5 + [2] * 5)
y2 = np.array([1] * 5 + [3] * 5)
with pytest.warns(
UndefinedMetricWarning,
match="`y1`, `y2` and `labels` have only one label in common.",
):
cohen_kappa_score(y1, y2, labels=labels)


# TODO(1.9): remove test when deprecation cycle is over
def test_cohen_kappa_score_raise_warning_deprecation():
"""Test that `cohen_kappa_score` raises a `FutureWarning` for the changing default
of the `replace_undefined_by` param."""
# test first place to raise warning
labels = [1, 2]
y1 = np.array([1] * 5 + [2] * 5)
y2 = np.array([3] * 10)
with pytest.warns(
FutureWarning,
match="The default return value of `cohen_kappa_score` in case of a division",
):
cohen_kappa_score(y1, y2, labels=labels)

# test second place to raise warning
labels = [1, 2]
y1 = np.array([1] * 5 + [2] * 5)
y2 = np.array([1] * 5 + [3] * 5)
with pytest.warns(
FutureWarning,
match="The default return value of `cohen_kappa_score` in case of a division",
):
cohen_kappa_score(y1, y2, labels=labels)


def test_cohen_kappa_score_error_wrong_label():
"""Test that correct error is raised when users pass labels that are not in y1."""
labels = [1, 2]