From 2f9b2d514d051cc0dc8b52a53786e15bd25db671 Mon Sep 17 00:00:00 2001
From: Anthony Gitter
Date: Fri, 18 Aug 2017 10:42:01 -0500
Subject: [PATCH 1/9] Add average precision definitions and cross references

---
 doc/modules/model_evaluation.rst              |  9 +++++-
 .../model_selection/plot_precision_recall.py  |  3 +-
 sklearn/metrics/ranking.py                    | 29 +++++++++++++------
 3 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 4800569556758..da12016835654 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -636,7 +636,14 @@ by varying a decision threshold.
 The :func:`average_precision_score` function computes the average precision
 (AP) from prediction scores. This score corresponds to the area under the
 precision-recall curve. The value is between 0 and 1 and higher is better.
-With random predictions, the AP is the fraction of positive samples.
+With random predictions, the AP is the fraction of positive samples. AP is
+defined as
+
+.. math::
+    \\text{AP} = \\sum_n (R_n - R_{n-1}) P_n
+
+where :math:`P_n` and :math:`R_n` are the precision and recall at the
+nth threshold.
 
 Several functions allow you to analyze the precision, recall and F-measures
 score:
diff --git a/examples/model_selection/plot_precision_recall.py b/examples/model_selection/plot_precision_recall.py
index dae720336dec8..985671a7dcdca 100644
--- a/examples/model_selection/plot_precision_recall.py
+++ b/examples/model_selection/plot_precision_recall.py
@@ -69,7 +69,8 @@
 
 where :math:`P_n` and :math:`R_n` are the precision and recall at the
 nth threshold. A pair :math:`(R_k, P_k)` is referred to as an
-*operating point*.
+*operating point*. When summarizing a precision-recall curve, AP is preferable
+to computing the trapezoidal area under the operating points.
 
 Precision-recall curves are typically used in binary classification to study
 the output of a classifier. In order to extend the precision-recall curve and
diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py
index 9755732a4f910..c9669f1bae27f 100644
--- a/sklearn/metrics/ranking.py
+++ b/sklearn/metrics/ranking.py
@@ -40,7 +40,8 @@ def auc(x, y, reorder=False):
     """Compute Area Under the Curve (AUC) using the trapezoidal rule
 
     This is a general function, given points on a curve. For computing the
-    area under the ROC-curve, see :func:`roc_auc_score`.
+    area under the ROC-curve, see :func:`roc_auc_score`. For summarizing a
+    precision-recall curve, see :func:`average_precision_score`.
 
     Parameters
     ----------
@@ -68,7 +69,8 @@ def auc(x, y, reorder=False):
 
     See also
     --------
-    roc_auc_score : Computes the area under the ROC curve
+    roc_auc_score : Compute the area under the ROC curve
+    average_precision_score : Compute average precision from prediction scores
     precision_recall_curve : Compute precision-recall pairs for different
         probability thresholds
     """
@@ -108,6 +110,16 @@ def average_precision_score(y_true, y_score, average="macro",
                             sample_weight=None):
     """Compute average precision (AP) from prediction scores
 
+    AP summarizes a precision-recall curve as the weighted mean of precisions
+    achieved at each threshold, with the increase in recall from the previous
+    threshold used as the weight:
+
+    .. math::
+        \\text{AP} = \\sum_n (R_n - R_{n-1}) P_n
+
+    where :math:`P_n` and :math:`R_n` are the precision and recall at the nth
+    threshold [1]_.
+
     Note: this implementation is restricted to the binary classification task
     or multilabel classification task.
 
@@ -149,13 +161,8 @@ def average_precision_score(y_true, y_score, average="macro",
     References
     ----------
     .. [1] `Wikipedia entry for the Average precision
-            `_
-    .. [2] `Stanford Information Retrieval book
-            `_
-    .. [3] `The PASCAL Visual Object Classes (VOC) Challenge
-            `_
+            `_
 
     See also
     --------
@@ -396,6 +403,10 @@ def precision_recall_curve(y_true, probas_pred, pos_label=None,
         Increasing thresholds on the decision function used to compute
         precision and recall.
 
+    See also
+    --------
+    average_precision_score : Compute average precision from prediction scores
+
     Examples
     --------
     >>> import numpy as np

From 6b2f66c6d01fae8c2c7f4ec5bbdd048b1a7834b5 Mon Sep 17 00:00:00 2001
From: Anthony Gitter
Date: Fri, 18 Aug 2017 12:03:29 -0500
Subject: [PATCH 2/9] Fix formatting error in AP formula

---
 doc/modules/model_evaluation.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index da12016835654..21003458d6140 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -640,7 +640,7 @@ With random predictions, the AP is the fraction of positive samples. AP is
 defined as
 
 .. math::
-    \\text{AP} = \\sum_n (R_n - R_{n-1}) P_n
+    \text{AP} = \sum_n (R_n - R_{n-1}) P_n
 
 where :math:`P_n` and :math:`R_n` are the precision and recall at the
 nth threshold.

From 879dfd74289f510a77cb0edd1fb7530956646b81 Mon Sep 17 00:00:00 2001
From: Anthony Gitter
Date: Tue, 22 Aug 2017 10:22:29 -0500
Subject: [PATCH 3/9] Add precision recall references

---
 doc/modules/model_evaluation.rst | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 21003458d6140..72d375f42c47a 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -678,6 +678,24 @@ binary classification and multilabel indicator format.
   for an example of :func:`precision_recall_curve` usage to evaluate
   classifier output quality.
 
+
+.. topic:: References:
+
+  * C.D. Manning, P. Raghavan, H. Schütze, `Introduction to Information Retrieval
+      `_,
+      2008.
+  * M. Everingham, L. Van Gool, C.K.I. Williams, J. Winn, A. Zisserman,
+      `The Pascal Visual Object Classes (VOC) Challenge
+      `_,
+      IJCV 2010.
+  * J. Davis, M. Goadrich, `The Relationship Between Precision-Recall and ROC Curves
+      `_,
+      ICML 2006.
+  * P.A. Flach, M. Kull, `Precision-Recall-Gain Curves: PR Analysis Done Right
+      `_,
+      NIPS 2015.
+
+
 Binary classification
 ^^^^^^^^^^^^^^^^^^^^^

From 3daba6a9d8ef8a0748a9648e3be08b5e2d243cbc Mon Sep 17 00:00:00 2001
From: Anthony Gitter
Date: Tue, 22 Aug 2017 10:43:08 -0500
Subject: [PATCH 4/9] Describe precision recall references and interpolation

---
 doc/modules/model_evaluation.rst | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 72d375f42c47a..8a41f9dcf102a 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -634,16 +634,22 @@ from the ground truth label and a score given by the classifier
 by varying a decision threshold.
 
 The :func:`average_precision_score` function computes the average precision
-(AP) from prediction scores. This score corresponds to the area under the
-precision-recall curve. The value is between 0 and 1 and higher is better.
-With random predictions, the AP is the fraction of positive samples. AP is
-defined as
+(AP) from prediction scores. The value is between 0 and 1 and higher is better.
+AP is defined as
 
 .. math::
     \text{AP} = \sum_n (R_n - R_{n-1}) P_n
 
 where :math:`P_n` and :math:`R_n` are the precision and recall at the
-nth threshold.
+nth threshold. With random predictions, the AP is the fraction of positive
+samples.
+
+The references below present alternative variants of AP that interpolate the
+precision-recall curve, which are not implemented in
+:func:`average_precision_score`. They also describe why a linear interpolation
+of points on the precision-recall curve provides an overly-optimistic measure
+of classifier performance. This linear interpolation is used when computing
+area under the curve with the trapezoidal rule in :func:`auc`.
 
 Several functions allow you to analyze the precision, recall and F-measures
 score:

From ec471207a70afc4f722a8becf877de658372352b Mon Sep 17 00:00:00 2001
From: Anthony Gitter
Date: Tue, 22 Aug 2017 11:45:25 -0500
Subject: [PATCH 5/9] Fix reference list formatting

---
 doc/modules/model_evaluation.rst | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 8a41f9dcf102a..0cf672a04555c 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -688,18 +688,18 @@ binary classification and multilabel indicator format.
 .. topic:: References:
 
   * C.D. Manning, P. Raghavan, H. Schütze, `Introduction to Information Retrieval
-      `_,
-      2008.
+    `_,
+    2008.
   * M. Everingham, L. Van Gool, C.K.I. Williams, J. Winn, A. Zisserman,
-      `The Pascal Visual Object Classes (VOC) Challenge
-      `_,
-      IJCV 2010.
+    `The Pascal Visual Object Classes (VOC) Challenge
+    `_,
+    IJCV 2010.
   * J. Davis, M. Goadrich, `The Relationship Between Precision-Recall and ROC Curves
-      `_,
-      ICML 2006.
+    `_,
+    ICML 2006.
   * P.A. Flach, M. Kull, `Precision-Recall-Gain Curves: PR Analysis Done Right
-      `_,
-      NIPS 2015.
+    `_,
+    NIPS 2015.
 
 Binary classification
 ^^^^^^^^^^^^^^^^^^^^^

From f7e7cc191a2795fcd66e1a6c3fce4a91383e3bd5 Mon Sep 17 00:00:00 2001
From: Anthony Gitter
Date: Wed, 23 Aug 2017 09:05:17 -0500
Subject: [PATCH 6/9] Change AUC to AP in precision recall example figure titles

---
 examples/model_selection/plot_precision_recall.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/model_selection/plot_precision_recall.py b/examples/model_selection/plot_precision_recall.py
index 985671a7dcdca..c12177db2f08d 100644
--- a/examples/model_selection/plot_precision_recall.py
+++ b/examples/model_selection/plot_precision_recall.py
@@ -145,7 +145,7 @@
 plt.ylabel('Precision')
 plt.ylim([0.0, 1.05])
 plt.xlim([0.0, 1.0])
-plt.title('2-class Precision-Recall curve: AUC={0:0.2f}'.format(
+plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(
           average_precision))
 
 ###############################################################################
@@ -216,7 +216,7 @@
 plt.ylim([0.0, 1.05])
 plt.xlim([0.0, 1.0])
 plt.title(
-    'Average precision score, micro-averaged over all classes: AUC={0:0.2f}'
+    'Average precision score, micro-averaged over all classes: AP={0:0.2f}'
     .format(average_precision["micro"]))
 
 ###############################################################################

From 169a2a68b39da89e6a3dc7b83954e79573d9be55 Mon Sep 17 00:00:00 2001
From: Anthony Gitter
Date: Wed, 23 Aug 2017 09:18:40 -0500
Subject: [PATCH 7/9] Present both AUC and AP as reasonable ways to summarize a PR curve

---
 examples/model_selection/plot_precision_recall.py | 13 ++++++++-----
 sklearn/metrics/ranking.py                        |  5 +++--
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/examples/model_selection/plot_precision_recall.py b/examples/model_selection/plot_precision_recall.py
index c12177db2f08d..25fe72a866143 100644
--- a/examples/model_selection/plot_precision_recall.py
+++ b/examples/model_selection/plot_precision_recall.py
@@ -61,16 +61,19 @@
 in the threshold considerably reduces precision, with only a minor gain in
 recall.
 
-**Average precision** summarizes such a plot as the weighted mean of precisions
-achieved at each threshold, with the increase in recall from the previous
-threshold used as the weight:
+**Average precision** (AP) summarizes such a plot as the weighted mean of
+precisions achieved at each threshold, with the increase in recall from the
+previous threshold used as the weight:
 
 :math:`\\text{AP} = \\sum_n (R_n - R_{n-1}) P_n`
 
 where :math:`P_n` and :math:`R_n` are the precision and recall at the
 nth threshold. A pair :math:`(R_k, P_k)` is referred to as an
-*operating point*. When summarizing a precision-recall curve, AP is preferable
-to computing the trapezoidal area under the operating points.
+*operating point*.
+
+AP and the trapezoidal area under the operating points
+(:func:`sklearn.metrics.auc`) are common ways to summarize a precision-recall
+curve. Read more in the :ref:`User Guide `.
 
 Precision-recall curves are typically used in binary classification to study
 the output of a classifier. In order to extend the precision-recall curve and
diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py
index c9669f1bae27f..c39b34882e823 100644
--- a/sklearn/metrics/ranking.py
+++ b/sklearn/metrics/ranking.py
@@ -40,8 +40,9 @@ def auc(x, y, reorder=False):
     """Compute Area Under the Curve (AUC) using the trapezoidal rule
 
     This is a general function, given points on a curve. For computing the
-    area under the ROC-curve, see :func:`roc_auc_score`. For summarizing a
-    precision-recall curve, see :func:`average_precision_score`.
+    area under the ROC-curve, see :func:`roc_auc_score`. For an alternative
+    way to summarize a precision-recall curve, see
+    :func:`average_precision_score`.
 
     Parameters
     ----------

From 0c0f4bf046e501f63a24611f8a18d8fd220d62ad Mon Sep 17 00:00:00 2001
From: Anthony Gitter
Date: Sat, 23 Sep 2017 16:11:49 -0500
Subject: [PATCH 8/9] Add citation links to model evaluation guide and refine ranking function definitions

---
 doc/modules/model_evaluation.rst              | 24 ++++++++++---------
 .../model_selection/plot_precision_recall.py  |  3 ++-
 sklearn/metrics/ranking.py                    | 16 +++++++++----
 3 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 0cf672a04555c..e19da0f310c90 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -633,7 +633,8 @@ The :func:`precision_recall_curve` computes a precision-recall curve
 from the ground truth label and a score given by the classifier
 by varying a decision threshold.
 
-The :func:`average_precision_score` function computes the average precision
+The :func:`average_precision_score` function computes the
+`average precision `_
 (AP) from prediction scores. The value is between 0 and 1 and higher is better.
 AP is defined as
 
@@ -644,12 +645,13 @@ where :math:`P_n` and :math:`R_n` are the precision and recall at the
 nth threshold. With random predictions, the AP is the fraction of positive
 samples.
 
-The references below present alternative variants of AP that interpolate the
-precision-recall curve, which are not implemented in
-:func:`average_precision_score`. They also describe why a linear interpolation
-of points on the precision-recall curve provides an overly-optimistic measure
-of classifier performance. This linear interpolation is used when computing
-area under the curve with the trapezoidal rule in :func:`auc`.
+References [Manning2008]_ and [Everingham2010]_ present alternative variants of
+AP that interpolate the precision-recall curve. Currently,
+:func:`average_precision_score` does not implement any interpolated variant.
+References [Davis2006]_ and [Flach2015]_ describe why a linear interpolation of
+points on the precision-recall curve provides an overly-optimistic measure of
+classifier performance. This linear interpolation is used when computing area
+under the curve with the trapezoidal rule in :func:`auc`.
 
 Several functions allow you to analyze the precision, recall and F-measures
 score:
@@ -687,17 +689,17 @@ binary classification and multilabel indicator format.
 
 .. topic:: References:
 
-  * C.D. Manning, P. Raghavan, H. Schütze, `Introduction to Information Retrieval
+  * [Manning2008] C.D. Manning, P. Raghavan, H. Schütze, `Introduction to Information Retrieval
     `_,
     2008.
-  * M. Everingham, L. Van Gool, C.K.I. Williams, J. Winn, A. Zisserman,
+  * [Everingham2010] M. Everingham, L. Van Gool, C.K.I. Williams, J. Winn, A. Zisserman,
     `The Pascal Visual Object Classes (VOC) Challenge
     `_,
     IJCV 2010.
-  * J. Davis, M. Goadrich, `The Relationship Between Precision-Recall and ROC Curves
+  * [Davis2006] J. Davis, M. Goadrich, `The Relationship Between Precision-Recall and ROC Curves
    `_,
     ICML 2006.
-  * P.A. Flach, M. Kull, `Precision-Recall-Gain Curves: PR Analysis Done Right
+  * [Flach2015] P.A. Flach, M. Kull, `Precision-Recall-Gain Curves: PR Analysis Done Right
     `_,
     NIPS 2015.
diff --git a/examples/model_selection/plot_precision_recall.py b/examples/model_selection/plot_precision_recall.py
index 25fe72a866143..633ceea85db53 100644
--- a/examples/model_selection/plot_precision_recall.py
+++ b/examples/model_selection/plot_precision_recall.py
@@ -73,7 +73,8 @@
 
 AP and the trapezoidal area under the operating points
 (:func:`sklearn.metrics.auc`) are common ways to summarize a precision-recall
-curve. Read more in the :ref:`User Guide `.
+curve that lead to different results. Read more in the
+:ref:`User Guide `.
 
 Precision-recall curves are typically used in binary classification to study
 the output of a classifier. In order to extend the precision-recall curve and
diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py
index c39b34882e823..9eba8178bc956 100644
--- a/sklearn/metrics/ranking.py
+++ b/sklearn/metrics/ranking.py
@@ -119,7 +119,10 @@ def average_precision_score(y_true, y_score, average="macro",
         \\text{AP} = \\sum_n (R_n - R_{n-1}) P_n
 
     where :math:`P_n` and :math:`R_n` are the precision and recall at the nth
-    threshold [1]_.
+    threshold [1]_. This implementation is not interpolated and is different
+    from computing the area under the precision-recall curve with the
+    trapezoidal rule, which uses linear interpolation and can be too
+    optimistic.
 
     Note: this implementation is restricted to the binary classification task
     or multilabel classification task.
@@ -167,7 +170,7 @@ def average_precision_score(y_true, y_score, average="macro",
 
     See also
     --------
-    roc_auc_score : Area under the ROC curve
+    roc_auc_score : Compute the area under the ROC curve
 
     precision_recall_curve : Compute precision-recall pairs for different
        probability thresholds
@@ -198,7 +201,8 @@ def _binary_uninterpolated_average_precision(
 
 
 def roc_auc_score(y_true, y_score, average="macro", sample_weight=None):
-    """Compute Area Under the Curve (AUC) from prediction scores
+    """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
+    from prediction scores.
 
     Note: this implementation is restricted to the binary classification task
     or multilabel classification task in label indicator format.
@@ -247,7 +251,7 @@ def roc_auc_score(y_true, y_score, average="macro", sample_weight=None):
     --------
     average_precision_score : Area under the precision-recall curve
 
-    roc_curve : Compute Receiver operating characteristic (ROC)
+    roc_curve : Compute Receiver operating characteristic (ROC) curve
 
     Examples
     --------
@@ -408,6 +412,8 @@ def precision_recall_curve(y_true, probas_pred, pos_label=None,
     --------
     average_precision_score : Compute average precision from prediction scores
 
+    roc_curve : Compute Receiver operating characteristic (ROC) curve
+
     Examples
     --------
     >>> import numpy as np
@@ -489,7 +495,7 @@ def roc_curve(y_true, y_score, pos_label=None, sample_weight=None,
 
     See also
     --------
-    roc_auc_score : Compute Area Under the Curve (AUC) from prediction scores
+    roc_auc_score : Compute the area under the ROC curve
 
     Notes
     -----

From c46a4d5ec058d52af2811c83b5048804aedb1e24 Mon Sep 17 00:00:00 2001
From: Anthony Gitter
Date: Sun, 24 Sep 2017 06:46:41 -0500
Subject: [PATCH 9/9] Fix RST citation syntax

---
 doc/modules/model_evaluation.rst | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index e19da0f310c90..c5384d49fa658 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -689,19 +689,19 @@ binary classification and multilabel indicator format.
 
 .. topic:: References:
 
-  * [Manning2008] C.D. Manning, P. Raghavan, H. Schütze, `Introduction to Information Retrieval
-    `_,
-    2008.
-  * [Everingham2010] M. Everingham, L. Van Gool, C.K.I. Williams, J. Winn, A. Zisserman,
-    `The Pascal Visual Object Classes (VOC) Challenge
-    `_,
-    IJCV 2010.
-  * [Davis2006] J. Davis, M. Goadrich, `The Relationship Between Precision-Recall and ROC Curves
-    `_,
-    ICML 2006.
-  * [Flach2015] P.A. Flach, M. Kull, `Precision-Recall-Gain Curves: PR Analysis Done Right
-    `_,
-    NIPS 2015.
+  .. [Manning2008] C.D. Manning, P. Raghavan, H. Schütze, `Introduction to Information Retrieval
+     `_,
+     2008.
+  .. [Everingham2010] M. Everingham, L. Van Gool, C.K.I. Williams, J. Winn, A. Zisserman,
+     `The Pascal Visual Object Classes (VOC) Challenge
+     `_,
+     IJCV 2010.
+  .. [Davis2006] J. Davis, M. Goadrich, `The Relationship Between Precision-Recall and ROC Curves
+     `_,
+     ICML 2006.
+  .. [Flach2015] P.A. Flach, M. Kull, `Precision-Recall-Gain Curves: PR Analysis Done Right
+     `_,
+     NIPS 2015.
 
 
 Binary classification
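
The short sketch below is an illustration added alongside the patch series, not part of it. It contrasts the two summaries the patches discuss: the non-interpolated AP sum computed by sklearn.metrics.average_precision_score and the trapezoidal area computed by sklearn.metrics.auc over the same operating points. The labels and scores are made up for the example, and the hand-computed sum simply follows the AP = sum_n (R_n - R_{n-1}) P_n definition given above.

# Illustrative sketch only (not part of the patches): compare the
# non-interpolated AP sum with the trapezoidal area over the same
# precision-recall operating points. The toy labels and scores below
# are invented for the example.
import numpy as np

from sklearn.metrics import auc, average_precision_score, precision_recall_curve

y_true = np.array([0, 0, 1, 1, 0, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8, 0.65, 0.7])

precision, recall, thresholds = precision_recall_curve(y_true, y_score)

# AP = sum_n (R_n - R_{n-1}) P_n over the operating points.
# precision_recall_curve returns the points with recall decreasing,
# so the recall differences are negated to recover R_n - R_{n-1}.
ap_by_hand = -np.sum(np.diff(recall) * precision[:-1])

ap = average_precision_score(y_true, y_score)   # step-wise weighted mean of precisions
trapezoid = auc(recall, precision)              # linear interpolation between the points

print("AP from the formula     :", ap_by_hand)
print("average_precision_score :", ap)
print("trapezoidal auc(R, P)   :", trapezoid)

The two numbers need not agree; that difference is exactly why the patches present AP and the trapezoidal area as distinct ways of summarizing a precision-recall curve.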