diff --git a/doc/whats_new/upcoming_changes/sklearn.metrics/31172.enhancement.rst b/doc/whats_new/upcoming_changes/sklearn.metrics/31172.enhancement.rst
new file mode 100644
index 0000000000000..606f629655465
--- /dev/null
+++ b/doc/whats_new/upcoming_changes/sklearn.metrics/31172.enhancement.rst
@@ -0,0 +1,3 @@
+- :func:`~metrics.cohen_kappa_score` now has a `replace_undefined_by` parameter that
+  defines the function's return value when a division by zero would otherwise occur.
+  By :user:`Stefanie Senger `
diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index 13f2f5dc89208..fd2c375abf3a8 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -799,10 +799,25 @@ def multilabel_confusion_matrix(
         "labels": ["array-like", None],
         "weights": [StrOptions({"linear", "quadratic"}), None],
         "sample_weight": ["array-like", None],
+        "replace_undefined_by": [
+            Interval(Real, -1.0, 1.0, closed="both"),
+            np.nan,
+            Hidden(StrOptions({"deprecated"})),
+        ],
     },
     prefer_skip_nested_validation=True,
 )
-def cohen_kappa_score(y1, y2, *, labels=None, weights=None, sample_weight=None):
+# TODO(1.9): Change the default value of the `replace_undefined_by` param to 0.0 and
+# remove the FutureWarnings; the defaults stated in the warning messages also need to
+# be updated.
+def cohen_kappa_score(
+    y1,
+    y2,
+    *,
+    labels=None,
+    weights=None,
+    sample_weight=None,
+    replace_undefined_by="deprecated",
+):
     r"""Compute Cohen's kappa: a statistic that measures inter-annotator agreement.
 
     This function computes Cohen's kappa [1]_, a score that expresses the level
@@ -843,11 +858,25 @@ class labels [2]_.
     sample_weight : array-like of shape (n_samples,), default=None
         Sample weights.
 
+    replace_undefined_by : np.nan or float in [-1.0, 1.0], default=np.nan
+        Sets the return value when a division by zero would occur. This can happen
+        for instance on empty input arrays, or when no label of interest (as defined
+        in the `labels` param) is assigned by the second annotator, or when `y1` and
+        `y2` have only one label in common that is also in `labels`. In these cases,
+        an :class:`~sklearn.exceptions.UndefinedMetricWarning` is raised. Can take
+        the following values:
+
+        - `np.nan` to return `np.nan`
+        - a floating point value in the range of [-1.0, 1.0] to return that value
+
+        .. versionadded:: 1.7
+
     Returns
     -------
     kappa : float
-        The kappa statistic, which is a number between -1 and 1. The maximum
-        value means complete agreement; zero or lower means chance agreement.
+        The kappa statistic, which is a number between -1.0 and 1.0. The maximum
+        value means complete agreement; the minimum value means complete
+        disagreement; 0.0 indicates no agreement beyond what would be expected by
+        chance.
 
     References
     ----------
@@ -868,6 +897,15 @@ class labels [2]_.
     >>> cohen_kappa_score(y1, y2)
     0.6875
     """
+
+    def _check_zero_division(denominator, replace_undefined_by, msg):
+        if np.isclose(denominator, 0):
+            if replace_undefined_by == "deprecated":
+                warnings.warn(msg_changing_default, FutureWarning)
+            warnings.warn(msg, UndefinedMetricWarning, stacklevel=2)
+            return True
+        return False
+
     try:
         confusion = confusion_matrix(y1, y2, labels=labels, sample_weight=sample_weight)
     except ValueError as e:
@@ -883,7 +921,25 @@ class labels [2]_.
     n_classes = confusion.shape[0]
     sum0 = np.sum(confusion, axis=0)
     sum1 = np.sum(confusion, axis=1)
-    expected = np.outer(sum0, sum1) / np.sum(sum0)
+
+    msg_changing_default = (
+        "`np.nan` as the default return value of `cohen_kappa_score` in case of a "
+        "division by zero has been deprecated in 1.7 and will be changed to 0.0 in "
+        "version 1.9. Set `replace_undefined_by=0.0` to use the new default and to "
+        "silence this Warning."
+    )
+
+    numerator = np.outer(sum0, sum1)
+    denominator = np.sum(sum0)
+    msg_zero_division = (
+        "`y2` contains no labels that are present in both `y1` and `labels`. "
+        "`cohen_kappa_score` is undefined and set to the value defined by "
+        "the `replace_undefined_by` param, which defaults to `np.nan`."
+    )
+    if _check_zero_division(denominator, replace_undefined_by, msg_zero_division):
+        if replace_undefined_by == "deprecated":
+            replace_undefined_by = np.nan
+        return replace_undefined_by
+
+    expected = numerator / denominator
 
     if weights is None:
         w_mat = np.ones([n_classes, n_classes], dtype=int)
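The hunk above guards the first denominator, `np.sum(sum0)`, which is zero exactly when the confusion matrix restricted to `labels` ends up empty. A minimal sketch of how that situation arises, assuming only scikit-learn's public `confusion_matrix` (the inputs mirror the tests further down):

    import numpy as np
    from sklearn.metrics import confusion_matrix

    # Annotator 2 never uses a label from `labels`, so every sample is
    # dropped when the confusion matrix is restricted to `labels`.
    y1 = np.array([1] * 5 + [2] * 5)
    y2 = np.array([3] * 10)
    confusion = confusion_matrix(y1, y2, labels=[1, 2])
    print(confusion)                  # [[0 0]
                                      #  [0 0]]
    sum0 = np.sum(confusion, axis=0)
    print(np.sum(sum0))               # 0 -> the guarded denominator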
@@ -896,7 +952,18 @@ class labels [2]_.
     else:
         w_mat = (w_mat - w_mat.T) ** 2
 
-    k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
+    numerator = np.sum(w_mat * confusion)
+    denominator = np.sum(w_mat * expected)
+    msg_zero_division = (
+        "`y1`, `y2` and `labels` have only one label in common. "
+        "`cohen_kappa_score` is undefined and set to the value defined by the "
+        "`replace_undefined_by` param, which defaults to `np.nan`."
+    )
+    if _check_zero_division(denominator, replace_undefined_by, msg_zero_division):
+        if replace_undefined_by == "deprecated":
+            replace_undefined_by = np.nan
+        return replace_undefined_by
+
+    k = numerator / denominator
 
     return float(1 - k)
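With the change to `_classification.py` applied, the new parameter would be used as in the following sketch; it assumes a scikit-learn build that already includes this patch:

    import numpy as np
    from sklearn.metrics import cohen_kappa_score

    y1 = [1] * 5 + [2] * 5
    y2 = [3] * 10  # y2 uses none of the labels in `labels`

    # An UndefinedMetricWarning is still emitted; only the return value changes.
    print(cohen_kappa_score(y1, y2, labels=[1, 2], replace_undefined_by=0.0))
    # 0.0
    print(cohen_kappa_score(y1, y2, labels=[1, 2], replace_undefined_by=np.nan))
    # nan

Leaving `replace_undefined_by` unset additionally triggers the FutureWarning about the default changing from `np.nan` to 0.0 in version 1.9.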
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index 19a326ff184f8..da163aaef5016 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -926,6 +926,105 @@ def test_cohen_kappa():
     )
 
 
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        # empty inputs:
+        ([], [], None, None),
+        # annotator y2 does not assign any label specified in `labels` (note: also
+        # applicable if `labels` is default and `y2` does not contain any label that
+        # is in `y1`):
+        ([1] * 5 + [2] * 5, [3] * 10, [1, 2], None),
+        # both inputs (`y1` and `y2`) only have one label:
+        ([3] * 10, [3] * 10, None, None),
+        # both inputs only have one label in common that is also in `labels`:
+        ([1] * 5 + [2] * 5, [1] * 5 + [3] * 5, [1, 2], None),
+        # like the last test case, but with `weights="linear"` (note that
+        # weights="linear" and weights="quadratic" are different branches, though the
+        # latter is so similar to the former that the test case is skipped here):
+        ([1] * 5 + [2] * 5, [1] * 5 + [3] * 5, [1, 2], "linear"),
+    ],
+)
+@pytest.mark.parametrize("replace_undefined_by", [0.0, np.nan])
+def test_cohen_kappa_zero_division(test_case, replace_undefined_by):
+    """Test that cohen_kappa_score handles divisions by zero correctly by returning
+    the value of the `replace_undefined_by` param. (The first two test cases cover
+    the first possible location in the function where a division by zero can occur;
+    the last three cover the second possible location.)"""
+
+    def _check_equal(res, exp):
+        if np.isnan(res) and np.isnan(exp):
+            return True
+        return res == exp
+
+    y1, y2, labels, weights = test_case
+    y1, y2 = np.array(y1), np.array(y2)
+
+    assert _check_equal(
+        cohen_kappa_score(
+            y1,
+            y2,
+            labels=labels,
+            weights=weights,
+            replace_undefined_by=replace_undefined_by,
+        ),
+        replace_undefined_by,
+    )
+
+
+# TODO(1.9): remove the @ignore_warnings of the FutureWarning
+@ignore_warnings(category=FutureWarning)
+def test_cohen_kappa_zero_division_warning():
+    """Test that cohen_kappa_score raises UndefinedMetricWarning when a division by
+    zero occurs."""
+
+    # test the first place that can raise the warning
+    labels = [1, 2]
+    y1 = np.array([1] * 5 + [2] * 5)
+    y2 = np.array([3] * 10)
+    with pytest.warns(
+        UndefinedMetricWarning,
+        match="`y2` contains no labels that are present in both `y1` and `labels`.",
+    ):
+        cohen_kappa_score(y1, y2, labels=labels)
+
+    # test the second place that can raise the warning
+    labels = [1, 2]
+    y1 = np.array([1] * 5 + [2] * 5)
+    y2 = np.array([1] * 5 + [3] * 5)
+    with pytest.warns(
+        UndefinedMetricWarning,
+        match="`y1`, `y2` and `labels` have only one label in common.",
+    ):
+        cohen_kappa_score(y1, y2, labels=labels)
+
+
+# TODO(1.9): remove test when deprecation cycle is over
+def test_cohen_kappa_score_raise_warning_deprecation():
+    """Test that `cohen_kappa_score` raises a `FutureWarning` for the changing
+    default of the `replace_undefined_by` param."""
+    # test the first place that can raise the warning
+    labels = [1, 2]
+    y1 = np.array([1] * 5 + [2] * 5)
+    y2 = np.array([3] * 10)
+    with pytest.warns(
+        FutureWarning,
+        match="`np.nan` as the default return value of `cohen_kappa_score` in case of",
+    ):
+        cohen_kappa_score(y1, y2, labels=labels)
+
+    # test the second place that can raise the warning
+    labels = [1, 2]
+    y1 = np.array([1] * 5 + [2] * 5)
+    y2 = np.array([1] * 5 + [3] * 5)
+    with pytest.warns(
+        FutureWarning,
+        match="`np.nan` as the default return value of `cohen_kappa_score` in case of",
+    ):
+        cohen_kappa_score(y1, y2, labels=labels)
+
+
 def test_cohen_kappa_score_error_wrong_label():
     """Test that correct error is raised when users pass labels that are not in y1."""
     labels = [1, 2]
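For inputs where the score is well defined, the quantities guarded in the patch reduce to the usual computation of Cohen's kappa. A worked sketch with made-up labels (not taken from the test suite), replicating the unweighted branch; the diagonal-zeroing step of `w_mat` is assumed from the pre-existing code, which the diff does not show:

    import numpy as np
    from sklearn.metrics import cohen_kappa_score, confusion_matrix

    y1 = [0, 0, 1, 1, 1, 2]
    y2 = [0, 1, 1, 1, 2, 2]

    confusion = confusion_matrix(y1, y2)
    sum0 = confusion.sum(axis=0)                  # how often y2 used each label
    sum1 = confusion.sum(axis=1)                  # how often y1 used each label
    expected = np.outer(sum0, sum1) / sum0.sum()  # agreement expected by chance

    # Unweighted case: every disagreement is penalised equally.
    n_classes = confusion.shape[0]
    w_mat = np.ones([n_classes, n_classes], dtype=int)
    w_mat.flat[:: n_classes + 1] = 0  # assumed: zero out the diagonal

    k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
    print(1 - k)                      # 0.4782608...
    print(cohen_kappa_score(y1, y2))  # same value

Here both denominators, `sum0.sum()` and `np.sum(w_mat * expected)`, are nonzero, so neither guard fires.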