From 2f9b2d514d051cc0dc8b52a53786e15bd25db671 Mon Sep 17 00:00:00 2001
From: Anthony Gitter
Date: Fri, 18 Aug 2017 10:42:01 -0500
Subject: [PATCH 1/9] Add average precision definitions and cross references

---
 doc/modules/model_evaluation.rst              |  9 +++++-
 .../model_selection/plot_precision_recall.py  |  3 +-
 sklearn/metrics/ranking.py                    | 29 +++++++++++++------
 3 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 4800569556758..da12016835654 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -636,7 +636,14 @@ by varying a decision threshold.
 The :func:`average_precision_score` function computes the average precision
 (AP) from prediction scores. This score corresponds to the area under the
 precision-recall curve. The value is between 0 and 1 and higher is better.
-With random predictions, the AP is the fraction of positive samples.
+With random predictions, the AP is the fraction of positive samples. AP is
+defined as
+
+.. math::
+    \\text{AP} = \\sum_n (R_n - R_{n-1}) P_n
+
+where :math:`P_n` and :math:`R_n` are the precision and recall at the
+nth threshold.
 
 Several functions allow you to analyze the precision, recall and F-measures
 score:
diff --git a/examples/model_selection/plot_precision_recall.py b/examples/model_selection/plot_precision_recall.py
index dae720336dec8..985671a7dcdca 100644
--- a/examples/model_selection/plot_precision_recall.py
+++ b/examples/model_selection/plot_precision_recall.py
@@ -69,7 +69,8 @@
 
 where :math:`P_n` and :math:`R_n` are the precision and recall at the
 nth threshold. A pair :math:`(R_k, P_k)` is referred to as an
-*operating point*.
+*operating point*. When summarizing a precision-recall curve, AP is preferable
+to computing the trapezoidal area under the operating points.
 
 Precision-recall curves are typically used in binary classification to study
 the output of a classifier. In order to extend the precision-recall curve and
diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py
index 9755732a4f910..c9669f1bae27f 100644
--- a/sklearn/metrics/ranking.py
+++ b/sklearn/metrics/ranking.py
@@ -40,7 +40,8 @@ def auc(x, y, reorder=False):
     """Compute Area Under the Curve (AUC) using the trapezoidal rule
 
     This is a general function, given points on a curve. For computing the
-    area under the ROC-curve, see :func:`roc_auc_score`.
+    area under the ROC-curve, see :func:`roc_auc_score`. For summarizing a
+    precision-recall curve, see :func:`average_precision_score`.
 
     Parameters
     ----------
@@ -68,7 +69,8 @@ def auc(x, y, reorder=False):
 
     See also
     --------
-    roc_auc_score : Computes the area under the ROC curve
+    roc_auc_score : Compute the area under the ROC curve
+    average_precision_score : Compute average precision from prediction scores
     precision_recall_curve : Compute precision-recall pairs for different
         probability thresholds
     """
@@ -108,6 +110,16 @@ def average_precision_score(y_true, y_score, average="macro",
                             sample_weight=None):
     """Compute average precision (AP) from prediction scores
 
+    AP summarizes a precision-recall curve as the weighted mean of precisions
+    achieved at each threshold, with the increase in recall from the previous
+    threshold used as the weight:
+
+    .. math::
+        \\text{AP} = \\sum_n (R_n - R_{n-1}) P_n
+
+    where :math:`P_n` and :math:`R_n` are the precision and recall at the nth
+    threshold [1]_.
+
     Note: this implementation is restricted to the binary classification task
     or multilabel classification task.
 
@@ -149,13 +161,8 @@ def average_precision_score(y_true, y_score, average="macro",
     References
     ----------
     .. [1] `Wikipedia entry for the Average precision
-            `_
-    .. [2] `Stanford Information Retrieval book
-            `_
-    .. [3] `The PASCAL Visual Object Classes (VOC) Challenge
-            `_
+            `_
 
     See also
     --------
@@ -396,6 +403,10 @@ def precision_recall_curve(y_true, probas_pred, pos_label=None,
         Increasing thresholds on the decision function used to compute
         precision and recall.
 
+    See also
+    --------
+    average_precision_score : Compute average precision from prediction scores
+
     Examples
     --------
     >>> import numpy as np

From 6b2f66c6d01fae8c2c7f4ec5bbdd048b1a7834b5 Mon Sep 17 00:00:00 2001
From: Anthony Gitter
Date: Fri, 18 Aug 2017 12:03:29 -0500
Subject: [PATCH 2/9] Fix formatting error in AP formula

---
 doc/modules/model_evaluation.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index da12016835654..21003458d6140 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -640,7 +640,7 @@ With random predictions, the AP is the fraction of positive samples. AP is
 defined as
 
 .. math::
-    \\text{AP} = \\sum_n (R_n - R_{n-1}) P_n
+    \text{AP} = \sum_n (R_n - R_{n-1}) P_n
 
 where :math:`P_n` and :math:`R_n` are the precision and recall at the
 nth threshold.

From 879dfd74289f510a77cb0edd1fb7530956646b81 Mon Sep 17 00:00:00 2001
From: Anthony Gitter
Date: Tue, 22 Aug 2017 10:22:29 -0500
Subject: [PATCH 3/9] Add precision recall references

---
 doc/modules/model_evaluation.rst | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 21003458d6140..72d375f42c47a 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -678,6 +678,24 @@ binary classification and multilabel indicator format.
   for an example of :func:`precision_recall_curve` usage to evaluate
   classifier output quality.
 
+
+.. topic:: References:
+
+  * C.D. Manning, P. Raghavan, H. Schütze, `Introduction to Information Retrieval
+      `_,
+      2008.
+  * M. Everingham, L. Van Gool, C.K.I. Williams, J. Winn, A. Zisserman,
+      `The Pascal Visual Object Classes (VOC) Challenge
+      `_,
+      IJCV 2010.
+  * J. Davis, M. Goadrich, `The Relationship Between Precision-Recall and ROC Curves
+      `_,
+      ICML 2006.
+  * P.A. Flach, M. Kull, `Precision-Recall-Gain Curves: PR Analysis Done Right
+      `_,
+      NIPS 2015.
+
+
 Binary classification
 ^^^^^^^^^^^^^^^^^^^^^

From 3daba6a9d8ef8a0748a9648e3be08b5e2d243cbc Mon Sep 17 00:00:00 2001
From: Anthony Gitter
Date: Tue, 22 Aug 2017 10:43:08 -0500
Subject: [PATCH 4/9] Describe precision recall references and interpolation

---
 doc/modules/model_evaluation.rst | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 72d375f42c47a..8a41f9dcf102a 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -634,16 +634,22 @@ from the ground truth label and a score given by the classifier
 by varying a decision threshold.
 
 The :func:`average_precision_score` function computes the average precision
-(AP) from prediction scores. This score corresponds to the area under the
-precision-recall curve. The value is between 0 and 1 and higher is better.
-With random predictions, the AP is the fraction of positive samples. AP is
-defined as
+(AP) from prediction scores. The value is between 0 and 1 and higher is better.
+AP is defined as
 
 .. math::
     \text{AP} = \sum_n (R_n - R_{n-1}) P_n
 
 where :math:`P_n` and :math:`R_n` are the precision and recall at the
-nth threshold.
+nth threshold. With random predictions, the AP is the fraction of positive
+samples.
+
+The references below present alternative variants of AP that interpolate the
+precision-recall curve, which are not implemented in
+:func:`average_precision_score`. They also describe why a linear interpolation
+of points on the precision-recall curve provides an overly-optimistic measure
+of classifier performance. This linear interpolation is used when computing
+area under the curve with the trapezoidal rule in :func:`auc`.
 
 Several functions allow you to analyze the precision, recall and F-measures
 score:

From ec471207a70afc4f722a8becf877de658372352b Mon Sep 17 00:00:00 2001
From: Anthony Gitter
Date: Tue, 22 Aug 2017 11:45:25 -0500
Subject: [PATCH 5/9] Fix reference list formatting

---
 doc/modules/model_evaluation.rst | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 8a41f9dcf102a..0cf672a04555c 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -688,18 +688,18 @@ binary classification and multilabel indicator format.
 .. topic:: References:
 
   * C.D. Manning, P. Raghavan, H. Schütze, `Introduction to Information Retrieval
-      `_,
-      2008.
+    `_,
+    2008.
   * M. Everingham, L. Van Gool, C.K.I. Williams, J. Winn, A. Zisserman,
-      `The Pascal Visual Object Classes (VOC) Challenge
-      `_,
-      IJCV 2010.
+    `The Pascal Visual Object Classes (VOC) Challenge
+    `_,
+    IJCV 2010.
   * J. Davis, M. Goadrich, `The Relationship Between Precision-Recall and ROC Curves
-      `_,
-      ICML 2006.
+    `_,
+    ICML 2006.
   * P.A. Flach, M. Kull, `Precision-Recall-Gain Curves: PR Analysis Done Right
-      `_,
-      NIPS 2015.
+    `_,
+    NIPS 2015.
 
 Binary classification
 ^^^^^^^^^^^^^^^^^^^^^

From f7e7cc191a2795fcd66e1a6c3fce4a91383e3bd5 Mon Sep 17 00:00:00 2001
From: Anthony Gitter
Date: Wed, 23 Aug 2017 09:05:17 -0500
Subject: [PATCH 6/9] Change AUC to AP in precision recall example figure titles

---
 examples/model_selection/plot_precision_recall.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/model_selection/plot_precision_recall.py b/examples/model_selection/plot_precision_recall.py
index 985671a7dcdca..c12177db2f08d 100644
--- a/examples/model_selection/plot_precision_recall.py
+++ b/examples/model_selection/plot_precision_recall.py
@@ -145,7 +145,7 @@
 plt.ylabel('Precision')
 plt.ylim([0.0, 1.05])
 plt.xlim([0.0, 1.0])
-plt.title('2-class Precision-Recall curve: AUC={0:0.2f}'.format(
+plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(
           average_precision))
 
 ###############################################################################
@@ -216,7 +216,7 @@
 plt.ylim([0.0, 1.05])
 plt.xlim([0.0, 1.0])
 plt.title(
-    'Average precision score, micro-averaged over all classes: AUC={0:0.2f}'
+    'Average precision score, micro-averaged over all classes: AP={0:0.2f}'
     .format(average_precision["micro"]))
 
 ###############################################################################

From 169a2a68b39da89e6a3dc7b83954e79573d9be55 Mon Sep 17 00:00:00 2001
From: Anthony Gitter
Date: Wed, 23 Aug 2017 09:18:40 -0500
Subject: [PATCH 7/9] Present both AUC and AP as reasonable ways to summarize a PR curve

---
 examples/model_selection/plot_precision_recall.py | 13 ++++++++-----
 sklearn/metrics/ranking.py                        |  5 +++--
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/examples/model_selection/plot_precision_recall.py b/examples/model_selection/plot_precision_recall.py
index c12177db2f08d..25fe72a866143 100644
--- a/examples/model_selection/plot_precision_recall.py
+++ b/examples/model_selection/plot_precision_recall.py
@@ -61,16 +61,19 @@
 in the threshold considerably reduces precision, with only a minor gain in
 recall.
 
-**Average precision** summarizes such a plot as the weighted mean of precisions
-achieved at each threshold, with the increase in recall from the previous
-threshold used as the weight:
+**Average precision** (AP) summarizes such a plot as the weighted mean of
+precisions achieved at each threshold, with the increase in recall from the
+previous threshold used as the weight:
 
 :math:`\\text{AP} = \\sum_n (R_n - R_{n-1}) P_n`
 
 where :math:`P_n` and :math:`R_n` are the precision and recall at the
 nth threshold. A pair :math:`(R_k, P_k)` is referred to as an
-*operating point*. When summarizing a precision-recall curve, AP is preferable
-to computing the trapezoidal area under the operating points.
+*operating point*.
+
+AP and the trapezoidal area under the operating points
+(:func:`sklearn.metrics.auc`) are common ways to summarize a precision-recall
+curve. Read more in the :ref:`User Guide `.
 
 Precision-recall curves are typically used in binary classification to study
 the output of a classifier. In order to extend the precision-recall curve and
diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py
index c9669f1bae27f..c39b34882e823 100644
--- a/sklearn/metrics/ranking.py
+++ b/sklearn/metrics/ranking.py
@@ -40,8 +40,9 @@ def auc(x, y, reorder=False):
     """Compute Area Under the Curve (AUC) using the trapezoidal rule
 
     This is a general function, given points on a curve. For computing the
-    area under the ROC-curve, see :func:`roc_auc_score`. For summarizing a
-    precision-recall curve, see :func:`average_precision_score`.
+    area under the ROC-curve, see :func:`roc_auc_score`. For an alternative
+    way to summarize a precision-recall curve, see
+    :func:`average_precision_score`.
 
     Parameters
     ----------

From 0c0f4bf046e501f63a24611f8a18d8fd220d62ad Mon Sep 17 00:00:00 2001
From: Anthony Gitter
Date: Sat, 23 Sep 2017 16:11:49 -0500
Subject: [PATCH 8/9] Add citation links to model evaluation guide and refine ranking function definitions

---
 doc/modules/model_evaluation.rst              | 24 ++++++++++---------
 .../model_selection/plot_precision_recall.py  |  3 ++-
 sklearn/metrics/ranking.py                    | 16 +++++++++----
 3 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index 0cf672a04555c..e19da0f310c90 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -633,7 +633,8 @@ The :func:`precision_recall_curve` computes a precision-recall curve
 from the ground truth label and a score given by the classifier
 by varying a decision threshold.
 
-The :func:`average_precision_score` function computes the average precision
+The :func:`average_precision_score` function computes the
+`average precision `_
 (AP) from prediction scores. The value is between 0 and 1 and higher is better.
 AP is defined as
 
@@ -644,12 +645,13 @@ where :math:`P_n` and :math:`R_n` are the precision and recall at the
 nth threshold. With random predictions, the AP is the fraction of positive
 samples.
 
-The references below present alternative variants of AP that interpolate the
-precision-recall curve, which are not implemented in
-:func:`average_precision_score`. They also describe why a linear interpolation
-of points on the precision-recall curve provides an overly-optimistic measure
-of classifier performance. This linear interpolation is used when computing
-area under the curve with the trapezoidal rule in :func:`auc`.
+References [Manning2008]_ and [Everingham2010]_ present alternative variants of
+AP that interpolate the precision-recall curve. Currently,
+:func:`average_precision_score` does not implement any interpolated variant.
+References [Davis2006]_ and [Flach2015]_ describe why a linear interpolation of
+points on the precision-recall curve provides an overly-optimistic measure of
+classifier performance. This linear interpolation is used when computing area
+under the curve with the trapezoidal rule in :func:`auc`.
 
 Several functions allow you to analyze the precision, recall and F-measures
 score:
@@ -687,17 +689,17 @@ binary classification and multilabel indicator format.
 
 .. topic:: References:
 
-  * C.D. Manning, P. Raghavan, H. Schütze, `Introduction to Information Retrieval
+  * [Manning2008] C.D. Manning, P. Raghavan, H. Schütze, `Introduction to Information Retrieval
     `_,
     2008.
-  * M. Everingham, L. Van Gool, C.K.I. Williams, J. Winn, A. Zisserman,
+  * [Everingham2010] M. Everingham, L. Van Gool, C.K.I. Williams, J. Winn, A. Zisserman,
     `The Pascal Visual Object Classes (VOC) Challenge
     `_,
     IJCV 2010.
-  * J. Davis, M. Goadrich, `The Relationship Between Precision-Recall and ROC Curves
+  * [Davis2006] J. Davis, M. Goadrich, `The Relationship Between Precision-Recall and ROC Curves
    `_,
     ICML 2006.
-  * P.A. Flach, M. Kull, `Precision-Recall-Gain Curves: PR Analysis Done Right
+  * [Flach2015] P.A. Flach, M. Kull, `Precision-Recall-Gain Curves: PR Analysis Done Right
     `_,
     NIPS 2015.
diff --git a/examples/model_selection/plot_precision_recall.py b/examples/model_selection/plot_precision_recall.py
index 25fe72a866143..633ceea85db53 100644
--- a/examples/model_selection/plot_precision_recall.py
+++ b/examples/model_selection/plot_precision_recall.py
@@ -73,7 +73,8 @@
 
 AP and the trapezoidal area under the operating points
 (:func:`sklearn.metrics.auc`) are common ways to summarize a precision-recall
-curve. Read more in the :ref:`User Guide `.
+curve that lead to different results. Read more in the
+:ref:`User Guide `.
 
 Precision-recall curves are typically used in binary classification to study
 the output of a classifier. In order to extend the precision-recall curve and
diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py
index c39b34882e823..9eba8178bc956 100644
--- a/sklearn/metrics/ranking.py
+++ b/sklearn/metrics/ranking.py
@@ -119,7 +119,10 @@ def average_precision_score(y_true, y_score, average="macro",
         \\text{AP} = \\sum_n (R_n - R_{n-1}) P_n
 
     where :math:`P_n` and :math:`R_n` are the precision and recall at the nth
-    threshold [1]_.
+    threshold [1]_. This implementation is not interpolated and is different
+    from computing the area under the precision-recall curve with the
+    trapezoidal rule, which uses linear interpolation and can be too
+    optimistic.
 
     Note: this implementation is restricted to the binary classification task
     or multilabel classification task.
@@ -167,7 +170,7 @@ def average_precision_score(y_true, y_score, average="macro",
 
     See also
     --------
-    roc_auc_score : Area under the ROC curve
+    roc_auc_score : Compute the area under the ROC curve
 
     precision_recall_curve : Compute precision-recall pairs for different
        probability thresholds
@@ -198,7 +201,8 @@ def _binary_uninterpolated_average_precision(
 
 
 def roc_auc_score(y_true, y_score, average="macro", sample_weight=None):
-    """Compute Area Under the Curve (AUC) from prediction scores
+    """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
+    from prediction scores.
 
     Note: this implementation is restricted to the binary classification task
     or multilabel classification task in label indicator format.
@@ -247,7 +251,7 @@ def roc_auc_score(y_true, y_score, average="macro", sample_weight=None):
     --------
     average_precision_score : Area under the precision-recall curve
 
-    roc_curve : Compute Receiver operating characteristic (ROC)
+    roc_curve : Compute Receiver operating characteristic (ROC) curve
 
     Examples
     --------
@@ -408,6 +412,8 @@ def precision_recall_curve(y_true, probas_pred, pos_label=None,
     --------
     average_precision_score : Compute average precision from prediction scores
 
+    roc_curve : Compute Receiver operating characteristic (ROC) curve
+
     Examples
     --------
     >>> import numpy as np
@@ -489,7 +495,7 @@ def roc_curve(y_true, y_score, pos_label=None, sample_weight=None,
 
     See also
     --------
-    roc_auc_score : Compute Area Under the Curve (AUC) from prediction scores
+    roc_auc_score : Compute the area under the ROC curve
 
     Notes
     -----

From c46a4d5ec058d52af2811c83b5048804aedb1e24 Mon Sep 17 00:00:00 2001
From: Anthony Gitter
Date: Sun, 24 Sep 2017 06:46:41 -0500
Subject: [PATCH 9/9] Fix RST citation syntax

---
 doc/modules/model_evaluation.rst | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst
index e19da0f310c90..c5384d49fa658 100644
--- a/doc/modules/model_evaluation.rst
+++ b/doc/modules/model_evaluation.rst
@@ -689,19 +689,19 @@ binary classification and multilabel indicator format.
 
 .. topic:: References:
 
-  * [Manning2008] C.D. Manning, P. Raghavan, H. Schütze, `Introduction to Information Retrieval
-    `_,
-    2008.
-  * [Everingham2010] M. Everingham, L. Van Gool, C.K.I. Williams, J. Winn, A. Zisserman,
-    `The Pascal Visual Object Classes (VOC) Challenge
-    `_,
-    IJCV 2010.
-  * [Davis2006] J. Davis, M. Goadrich, `The Relationship Between Precision-Recall and ROC Curves
-    `_,
-    ICML 2006.
-  * [Flach2015] P.A. Flach, M. Kull, `Precision-Recall-Gain Curves: PR Analysis Done Right
-    `_,
-    NIPS 2015.
+  .. [Manning2008] C.D. Manning, P. Raghavan, H. Schütze, `Introduction to Information Retrieval
+     `_,
+     2008.
+  .. [Everingham2010] M. Everingham, L. Van Gool, C.K.I. Williams, J. Winn, A. Zisserman,
+     `The Pascal Visual Object Classes (VOC) Challenge
+     `_,
+     IJCV 2010.
+  .. [Davis2006] J. Davis, M. Goadrich, `The Relationship Between Precision-Recall and ROC Curves
+     `_,
+     ICML 2006.
+  .. [Flach2015] P.A. Flach, M. Kull, `Precision-Recall-Gain Curves: PR Analysis Done Right
+     `_,
+     NIPS 2015.
 
 
 Binary classification
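
The short sketch below is an illustration added alongside the patch series, not part of it. It contrasts the two summaries the patches discuss: the non-interpolated AP sum computed by sklearn.metrics.average_precision_score and the trapezoidal area computed by sklearn.metrics.auc over the same operating points. The labels and scores are made up for the example, and the hand-computed sum simply follows the AP = sum_n (R_n - R_{n-1}) P_n definition given above.

# Illustrative sketch only (not part of the patches): compare the
# non-interpolated AP sum with the trapezoidal area over the same
# precision-recall operating points. The toy labels and scores below
# are invented for the example.
import numpy as np

from sklearn.metrics import auc, average_precision_score, precision_recall_curve

y_true = np.array([0, 0, 1, 1, 0, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8, 0.65, 0.7])

precision, recall, thresholds = precision_recall_curve(y_true, y_score)

# AP = sum_n (R_n - R_{n-1}) P_n over the operating points.
# precision_recall_curve returns the points with recall decreasing,
# so the recall differences are negated to recover R_n - R_{n-1}.
ap_by_hand = -np.sum(np.diff(recall) * precision[:-1])

ap = average_precision_score(y_true, y_score)   # step-wise weighted mean of precisions
trapezoid = auc(recall, precision)              # linear interpolation between the points

print("AP from the formula     :", ap_by_hand)
print("average_precision_score :", ap)
print("trapezoidal auc(R, P)   :", trapezoid)

The two numbers need not agree; that difference is exactly why the patches present AP and the trapezoidal area as distinct ways of summarizing a precision-recall curve.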