From e13de28c06353fd1f01398a18027fc010a4531f1 Mon Sep 17 00:00:00 2001
From: shubhraneel
Date: Sat, 28 Aug 2021 23:22:40 +0530
Subject: [PATCH 1/5] Add precision_at_recall_k and recall_at_precision_k
 functions

---
 sklearn/metrics/__init__.py        |  4 ++++
 sklearn/metrics/_classification.py | 29 +++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py
index a0b06a02ad6d1..4e10d644abf2d 100644
--- a/sklearn/metrics/__init__.py
+++ b/sklearn/metrics/__init__.py
@@ -35,6 +35,8 @@
 from ._classification import zero_one_loss
 from ._classification import brier_score_loss
 from ._classification import multilabel_confusion_matrix
+from ._classification import precision_at_recall_k
+from ._classification import recall_at_precision_k
 
 from . import cluster
 from .cluster import adjusted_mutual_info_score
@@ -171,4 +173,6 @@
     "v_measure_score",
     "zero_one_loss",
     "brier_score_loss",
+    "precision_at_recall_k",
+    "recall_at_precision_k",
 ]

diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index 1a23ec01f4536..832aafcab46b6 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -42,6 +42,7 @@
 from ..exceptions import UndefinedMetricWarning
 
 from ._base import _check_pos_label_consistency
+from ._ranking import precision_recall_curve
 
 
 def _check_zero_division(zero_division):
@@ -2649,3 +2650,31 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None):
         raise
     y_true = np.array(y_true == pos_label, int)
     return np.average((y_true - y_prob) ** 2, weights=sample_weight)
+
+
+def recall_at_precision_k(y_true, y_prob, k, *, pos_label=None, sample_weight=None):
+
+    precisions, recalls, thresholds = precision_recall_curve(
+        y_true, y_prob, pos_label=pos_label, sample_weight=sample_weight
+    )
+
+    try:
+        value, _ = max((r, p) for p, r in zip(precisions, recalls) if p >= k)
+    except ValueError:
+        value = 0
+
+    return value
+
+
+def precision_at_recall_k(y_true, y_prob, k, *, pos_label=None, sample_weight=None):
+
+    precisions, recalls, thresholds = precision_recall_curve(
+        y_true, y_prob, pos_label=pos_label, sample_weight=sample_weight
+    )
+
+    try:
+        value, _ = max((p, r) for p, r in zip(precisions, recalls) if r >= k)
+    except ValueError:
+        value = 0
+
+    return value

From 788527c21ea5ee03d475cc1a7ef8ff9133f13da8 Mon Sep 17 00:00:00 2001
From: shubhraneel
Date: Sun, 29 Aug 2021 00:22:33 +0530
Subject: [PATCH 2/5] Add tests

---
 sklearn/metrics/tests/test_classification.py | 32 ++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index 4f29c127defb5..c2413c93aba7a 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -42,6 +42,8 @@
 from sklearn.metrics import zero_one_loss
 from sklearn.metrics import brier_score_loss
 from sklearn.metrics import multilabel_confusion_matrix
+from sklearn.metrics import precision_at_recall_k
+from sklearn.metrics import recall_at_precision_k
 
 from sklearn.metrics._classification import _check_targets
 from sklearn.exceptions import UndefinedMetricWarning
@@ -2509,3 +2511,33 @@ def test_balanced_accuracy_score(y_true, y_pred):
     adjusted = balanced_accuracy_score(y_true, y_pred, adjusted=True)
     chance = balanced_accuracy_score(y_true, np.full_like(y_true, y_true[0]))
     assert adjusted == (balanced - chance) / (1 - chance)
+
+
+def test_precision_at_recall_k():
+    y_true = np.array([0, 0, 1, 1, 1, 1])
+    y_prob = np.array([0.1, 0.8, 0.9, 0.3, 1.0, 0.95])
+    y_multi = np.array([0, 2, 1, 1, 1, 1])
+
+    assert_almost_equal(precision_at_recall_k(y_true, y_prob, 0.8), 0.8)
+    assert_almost_equal(precision_at_recall_k(y_true, y_prob, 0.6), 1)
+    assert_almost_equal(precision_at_recall_k(y_true * 2 - 1, y_prob, 0.8), 0.8)
+
+    with pytest.raises(ValueError):
+        precision_at_recall_k(y_multi, y_prob, 0.8)
+
+    assert_almost_equal(precision_at_recall_k(y_true, y_prob, 0.8, pos_label=1), 0.8)
+
+
+def test_recall_at_precision_k():
+    y_true = np.array([0, 0, 1, 1, 1, 1])
+    y_prob = np.array([0.1, 0.8, 0.9, 0.3, 1.0, 0.95])
+    y_multi = np.array([0, 2, 1, 1, 1, 1])
+
+    assert_almost_equal(recall_at_precision_k(y_true, y_prob, 1), 0.75)
+    assert_almost_equal(recall_at_precision_k(y_true, y_prob, 0.8), 1)
+    assert_almost_equal(recall_at_precision_k(y_true * 2 - 1, y_prob, 1), 0.75)
+
+    with pytest.raises(ValueError):
+        recall_at_precision_k(y_multi, y_prob, 1)
+
+    assert_almost_equal(recall_at_precision_k(y_true, y_prob, 1, pos_label=1), 0.75)

From 95f25a6aa255c79a9e90d256f092ce6b91e7b493 Mon Sep 17 00:00:00 2001
From: shubhraneel
Date: Sun, 29 Aug 2021 00:33:15 +0530
Subject: [PATCH 3/5] Refactor code: no need to take list of tuples and avoid
 try block

---
 sklearn/metrics/_classification.py | 38 ++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 12 deletions(-)

diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index 832aafcab46b6..3ba428864d344 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -2654,27 +2654,41 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None):
 
 def recall_at_precision_k(y_true, y_prob, k, *, pos_label=None, sample_weight=None):
 
-    precisions, recalls, thresholds = precision_recall_curve(
+    precisions, recalls, _ = precision_recall_curve(
         y_true, y_prob, pos_label=pos_label, sample_weight=sample_weight
     )
 
-    try:
-        value, _ = max((r, p) for p, r in zip(precisions, recalls) if p >= k)
-    except ValueError:
-        value = 0
-
+    valid_positions = precisions >= k
+    valid_recalls = recalls[valid_positions]
+    value = 0
+    if valid_recalls.shape[0] > 0:
+        value = np.max(valid_recalls)
     return value
 
+    # try:
+    #     value, _ = max((r, p) for p, r in zip(precisions, recalls) if p >= k)
+    # except ValueError:
+    #     value = 0
+
+    # return value
+
 
 def precision_at_recall_k(y_true, y_prob, k, *, pos_label=None, sample_weight=None):
 
-    precisions, recalls, thresholds = precision_recall_curve(
+    precisions, recalls, _ = precision_recall_curve(
         y_true, y_prob, pos_label=pos_label, sample_weight=sample_weight
    )
 
-    try:
-        value, _ = max((p, r) for p, r in zip(precisions, recalls) if r >= k)
-    except ValueError:
-        value = 0
-
+    valid_positions = recalls >= k
+    valid_precisions = precisions[valid_positions]
+    value = 0
+    if valid_precisions.shape[0] > 0:
+        value = np.max(valid_precisions)
     return value
+
+    # try:
+    #     value, _ = max((p, r) for p, r in zip(precisions, recalls) if r >= k)
+    # except ValueError:
+    #     value = 0
+
+    # return value

From f143ee328f6d096d7f350751ffb7b6a09b3970a1 Mon Sep 17 00:00:00 2001
From: shubhraneel
Date: Sun, 29 Aug 2021 08:39:39 +0530
Subject: [PATCH 4/5] Add more tests and documentation

---
 sklearn/metrics/_classification.py           | 142 +++++++++++++++++--
 sklearn/metrics/tests/test_classification.py |  10 ++
 2 files changed, 137 insertions(+), 15 deletions(-)

diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index 3ba428864d344..81d9138c126f4 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -2653,6 +2653,69 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None):
 
 
 def recall_at_precision_k(y_true, y_prob, k, *, pos_label=None, sample_weight=None):
+    """Compute the maximum recall over the thresholds at which precision is
+    greater than or equal to ``k``.
+
+    Note: this implementation is restricted to the binary classification task.
+
+    The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of
+    true positives and ``fp`` the number of false positives. The precision is
+    intuitively the ability of the classifier not to label as positive a sample
+    that is negative.
+
+    The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of
+    true positives and ``fn`` the number of false negatives. The recall is
+    intuitively the ability of the classifier to find all the positive samples.
+
+    Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`.
+
+    Parameters
+    ----------
+    y_true : ndarray of shape (n_samples,)
+        True binary labels. If labels are not either {-1, 1} or {0, 1}, then
+        pos_label should be explicitly given.
+
+    y_prob : ndarray of shape (n_samples,)
+        Target scores, can either be probability estimates of the positive
+        class, or non-thresholded measure of decisions (as returned by
+        `decision_function` on some classifiers).
+
+    k : float
+        The precision level at or above which the recall is maximized.
+
+    pos_label : int or str, default=None
+        The label of the positive class.
+        When ``pos_label=None``, if y_true is in {-1, 1} or {0, 1},
+        ``pos_label`` is set to 1, otherwise an error will be raised.
+
+    sample_weight : array-like of shape (n_samples,), default=None
+        Sample weights.
+
+    Returns
+    -------
+    recall_at_precision_k : float
+        Maximum recall over the thresholds at which precision is greater
+        than or equal to ``k``, with thresholds applied to the ``pos_label``,
+        or to the label 1 if ``pos_label=None``.
+
+    See Also
+    --------
+    precision_recall_curve : Compute precision-recall curve.
+    plot_precision_recall_curve : Plot Precision Recall Curve for binary
+        classifiers.
+    PrecisionRecallDisplay : Precision Recall visualization.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.metrics import recall_at_precision_k
+    >>> y_true = np.array([0, 0, 1, 1, 1, 1])
+    >>> y_prob = np.array([0.1, 0.8, 0.9, 0.3, 1.0, 0.95])
+    >>> k = 0.75
+    >>> recall_at_precision_k(y_true, y_prob, k)
+    1.0
+
+    """
 
     precisions, recalls, _ = precision_recall_curve(
         y_true, y_prob, pos_label=pos_label, sample_weight=sample_weight
@@ -2660,20 +2723,76 @@ def recall_at_precision_k(y_true, y_prob, k, *, pos_label=None, sample_weight=No
     )
 
     valid_positions = precisions >= k
     valid_recalls = recalls[valid_positions]
-    value = 0
+    value = 0.0
     if valid_recalls.shape[0] > 0:
         value = np.max(valid_recalls)
     return value
 
-    # try:
-    #     value, _ = max((r, p) for p, r in zip(precisions, recalls) if p >= k)
-    # except ValueError:
-    #     value = 0
 
-    # return value
+def precision_at_recall_k(y_true, y_prob, k, *, pos_label=None, sample_weight=None):
+    """Compute the maximum precision over the thresholds at which recall is
+    greater than or equal to ``k``.
+
+    Note: this implementation is restricted to the binary classification task.
+
+    The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of
+    true positives and ``fp`` the number of false positives. The precision is
+    intuitively the ability of the classifier not to label as positive a sample
+    that is negative.
-def precision_at_recall_k(y_true, y_prob, k, *, pos_label=None, sample_weight=None):
+
+    The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of
+    true positives and ``fn`` the number of false negatives. The recall is
+    intuitively the ability of the classifier to find all the positive samples.
+
+    Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`.
+
+    Parameters
+    ----------
+    y_true : ndarray of shape (n_samples,)
+        True binary labels. If labels are not either {-1, 1} or {0, 1}, then
+        pos_label should be explicitly given.
+
+    y_prob : ndarray of shape (n_samples,)
+        Target scores, can either be probability estimates of the positive
+        class, or non-thresholded measure of decisions (as returned by
+        `decision_function` on some classifiers).
+
+    k : float
+        The recall level at or above which the precision is maximized.
+
+    pos_label : int or str, default=None
+        The label of the positive class.
+        When ``pos_label=None``, if y_true is in {-1, 1} or {0, 1},
+        ``pos_label`` is set to 1, otherwise an error will be raised.
+
+    sample_weight : array-like of shape (n_samples,), default=None
+        Sample weights.
+
+    Returns
+    -------
+    precision_at_recall_k : float
+        Maximum precision over the thresholds at which recall is greater
+        than or equal to ``k``, with thresholds applied to the ``pos_label``,
+        or to the label 1 if ``pos_label=None``.
+
+    See Also
+    --------
+    precision_recall_curve : Compute precision-recall curve.
+    plot_precision_recall_curve : Plot Precision Recall Curve for binary
+        classifiers.
+    PrecisionRecallDisplay : Precision Recall visualization.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.metrics import precision_at_recall_k
+    >>> y_true = np.array([0, 0, 1, 1, 1, 1])
+    >>> y_prob = np.array([0.1, 0.8, 0.9, 0.3, 1.0, 0.95])
+    >>> k = 0.8
+    >>> precision_at_recall_k(y_true, y_prob, k)
+    0.8
+
+    """
 
     precisions, recalls, _ = precision_recall_curve(
         y_true, y_prob, pos_label=pos_label, sample_weight=sample_weight
@@ -2681,14 +2800,7 @@ def precision_at_recall_k(y_true, y_prob, k, *, pos_label=None, sample_weight=No
     )
 
     valid_positions = recalls >= k
     valid_precisions = precisions[valid_positions]
-    value = 0
+    value = 0.0
     if valid_precisions.shape[0] > 0:
         value = np.max(valid_precisions)
     return value
-
-    # try:
-    #     value, _ = max((p, r) for p, r in zip(precisions, recalls) if r >= k)
-    # except ValueError:
-    #     value = 0
-
-    # return value
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index c2413c93aba7a..cee8a7d0cdc2b 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -2527,6 +2527,11 @@ def test_precision_at_recall_k():
 
     assert_almost_equal(precision_at_recall_k(y_true, y_prob, 0.8, pos_label=1), 0.8)
 
+    y_true = np.array([0])
+    y_prob = np.array([0.4])
+    with ignore_warnings():
+        assert_almost_equal(precision_at_recall_k(y_true, y_prob, 0.1), 0)
+
 
 def test_recall_at_precision_k():
     y_true = np.array([0, 0, 1, 1, 1, 1])
@@ -2541,3 +2546,8 @@ def test_recall_at_precision_k():
         recall_at_precision_k(y_multi, y_prob, 1)
 
     assert_almost_equal(recall_at_precision_k(y_true, y_prob, 1, pos_label=1), 0.75)
+
+    y_true = np.array([0])
+    y_prob = np.array([0.4])
+    with ignore_warnings():
+        assert_almost_equal(recall_at_precision_k(y_true, y_prob, 0.1), 0)

From c9cddf0b57bb0b84a3c68299f8d888dd0c7617b0 Mon Sep 17 00:00:00 2001
From: shubhraneel
Date: Mon, 30 Aug 2021 17:12:33 +0530
Subject: [PATCH 5/5] Add changelog entry

---
 doc/whats_new/v1.0.rst | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 7d8175a3b5046..2a28f009dd19a 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -623,6 +623,11 @@ Changelog
   amount of data.
   :pr:`20312` by :user:`Divyanshu Deoli `.
 
+- |Feature| Added :func:`precision_at_recall_k` and :func:`recall_at_precision_k`
+  to calculate the 'maximum precision for thresholds where recall >= k' and
+  'maximum recall for thresholds where precision >= k' respectively.
+  :pr:`20877` by :user:`Shubhraneel Pal <shubhraneel>`.
+
 :mod:`sklearn.mixture`
 ......................
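
-- 
Note, not part of the patch series: a minimal usage sketch of the two new
metrics, assuming the patches above are applied to a scikit-learn checkout.
The expected outputs mirror the values asserted in the tests.

    import numpy as np

    # Only available once the patch series is applied.
    from sklearn.metrics import precision_at_recall_k, recall_at_precision_k

    # Toy data from the tests. Sweeping the decision threshold over y_prob
    # yields the operating points (precision, recall):
    # (2/3, 1.0), (0.8, 1.0), (0.75, 0.75), (1.0, 0.75), (1.0, 0.5), (1.0, 0.25)
    y_true = np.array([0, 0, 1, 1, 1, 1])
    y_prob = np.array([0.1, 0.8, 0.9, 0.3, 1.0, 0.95])

    # Best precision among operating points with recall >= 0.8
    print(precision_at_recall_k(y_true, y_prob, 0.8))  # 0.8

    # Best recall among operating points with precision >= 1.0
    print(recall_at_precision_k(y_true, y_prob, 1.0))  # 0.75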