From 1743375b8cd7135e7e98318e395b477e0a337381 Mon Sep 17 00:00:00 2001 From: Jeremy Karnowski Date: Wed, 15 Jul 2015 09:08:25 -0700 Subject: [PATCH 01/36] Initial add DET curve to classification metrics --- doc/modules/model_evaluation.rst | 27 + sklearn/metrics/__init__.py | 1 + sklearn/metrics/_ranking.py | 78 +++ sklearn/metrics/ranking.py | 907 +++++++++++++++++++++++++++++++ 4 files changed, 1013 insertions(+) create mode 100644 sklearn/metrics/ranking.py diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index f8874869a0274..5e8ab9a583fce 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1437,6 +1437,33 @@ to the given limit. In Data Mining, 2001. Proceedings IEEE International Conference, pp. 131-138. +.. _det_curve: + +Detection error tradeoff (DET) +--------------------------------------- + +The function :func:`detection_error_tradeoff` computes the +`detection error tradeoff curve, or DET curve `_. +Quoting Wikipedia : + + "A detection error tradeoff (DET) graph is a graphical plot of error rates for binary classification systems, plotting false reject rate vs. false accept rate. The x- and y-axes are scaled non-linearly by their standard normal deviates (or just by logarithmic transformation), yielding tradeoff curves that are more linear than ROC curves, and use most of the image area to highlight the differences of importance in the critical operating region." + +This function requires the true binary +value and the target scores, which can either be probability estimates of the +positive class, confidence values, or binary decisions. +Here is a small example of how to use the :func:`detection_error_tradeoff` function:: + + >>> import numpy as np + >>> from sklearn.metrics import det_error_tradeoff + >>> y = np.array([1, 1, 2, 2]) + >>> scores = np.array([0.1, 0.4, 0.35, 0.8]) + >>> fpr, tpr, thresholds = error_detection_tradeoff(y, scores, pos_label=2) + >>> fpr + array([ 0.5, 0.5, 0. ]) + >>> fnr + array([ 0. , 0.5, 0.5]) + >>> thresholds + array([ 0.35, 0.4 , 0.8 ]) .. _zero_one_loss: diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index be28005631963..7b9bd5bd10e5d 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -7,6 +7,7 @@ from ._ranking import auc from ._ranking import average_precision_score from ._ranking import coverage_error +from ._ranking import detection_error_tradeoff from ._ranking import dcg_score from ._ranking import label_ranking_average_precision_score from ._ranking import label_ranking_loss diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index e07f61a92d478..e7a0eaeb024e0 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -218,6 +218,84 @@ def _binary_uninterpolated_average_precision( average, sample_weight=sample_weight) +def detection_error_tradeoff(y_true, probas_pred, pos_label=None, + sample_weight=None): + """Compute error rates for different probability thresholds + + Note: this implementation is restricted to the binary classification task. + + Parameters + ---------- + y_true : array, shape = [n_samples] + True targets of binary classification in range {-1, 1} or {0, 1}. + + probas_pred : array, shape = [n_samples] + Estimated probabilities or decision function. + + pos_label : int, optional (default=None) + The label of the positive class + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. 
+ + Returns + ------- + fps : array, shape = [n_thresholds] + A count of false positives, at index i being the number of negative + samples assigned a score >= thresholds[i]. The total number of + negative samples is equal to fps[-1] (thus true negatives are given by + fps[-1] - fps). + + fns : array, shape = [n_thresholds] + A count of false negatives, at index i being the number of positive + samples assigned a score < thresholds[i]. The total number of + positive samples is equal to tps[-1] (thus false negatives are given by + tps[-1] - tps). + + thresholds : array, shape = [n_thresholds] + Decreasing score values. + + References + ---------- + .. [1] `Wikipedia entry for Detection error tradeoff + `_ + .. [2] `The DET Curve in Assessment of Detection Task Performance + `_ + .. [3] `2008 NIST Speaker Recognition Evaluation Results + `_ + .. [4] `DET-Curve Plotting software for use with MATLAB + `_ + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import detection_error_tradeoff + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) + >>> fps, fns, thresholds = detection_error_tradeoff(y_true, y_scores) + >>> fps + array([ 0.5, 0.5, 0. ]) + >>> fns + array([ 0. , 0.5, 0.5]) + >>> thresholds + array([ 0.35, 0.4 , 0.8 ]) + + """ + fps, tps, thresholds = _binary_clf_curve(y_true, probas_pred, + pos_label=pos_label, + sample_weight=sample_weight) + fns = tps[-1] - tps + tp_count = tps[-1] + tn_count = (fps[-1] - fps)[0] + + # start with false positives is zero and stop with false negatives zero + # and reverse the outputs so list of false positives is decreasing + last_ind = tps.searchsorted(tps[-1]) + 1 + first_ind = fps[::-1].searchsorted(fps[0]) + sl = range(first_ind, last_ind)[::-1] + return fps[sl] / tp_count, fns[sl] / tn_count, thresholds[sl] + + def _binary_roc_auc_score(y_true, y_score, sample_weight=None, max_fpr=None): """Binary roc auc score""" if len(np.unique(y_true)) != 2: diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py new file mode 100644 index 0000000000000..64c8ddbfe2b38 --- /dev/null +++ b/sklearn/metrics/ranking.py @@ -0,0 +1,907 @@ +"""Metrics to assess performance on classification task given scores + +Functions named as ``*_score`` return a scalar value to maximize: the higher +the better + +Function named as ``*_error`` or ``*_loss`` return a scalar value to minimize: +the lower the better +""" + +# Authors: Alexandre Gramfort +# Mathieu Blondel +# Olivier Grisel +# Arnaud Joly +# Jochen Wersdorfer +# Lars Buitinck +# Joel Nothman +# Noel Dawe +# Jeremy Karnowski +# License: BSD 3 clause + +from __future__ import division + +import warnings +import numpy as np +from scipy.sparse import csr_matrix +from scipy.stats import rankdata + +from ..utils import assert_all_finite +from ..utils import check_consistent_length +from ..utils import column_or_1d, check_array +from ..utils.multiclass import type_of_target +from ..utils.extmath import stable_cumsum +from ..utils.sparsefuncs import count_nonzero +from ..exceptions import UndefinedMetricWarning +from ..preprocessing import label_binarize + +from .base import _average_binary_score + + +def auc(x, y, reorder='deprecated'): + """Compute Area Under the Curve (AUC) using the trapezoidal rule + + This is a general function, given points on a curve. For computing the + area under the ROC-curve, see :func:`roc_auc_score`. For an alternative + way to summarize a precision-recall curve, see + :func:`average_precision_score`. 
+ + Parameters + ---------- + x : array, shape = [n] + x coordinates. These must be either monotonic increasing or monotonic + decreasing. + y : array, shape = [n] + y coordinates. + reorder : boolean, optional (default='deprecated') + Whether to sort x before computing. If False, assume that x must be + either monotonic increasing or monotonic decreasing. If True, y is + used to break ties when sorting x. Make sure that y has a monotonic + relation to x when setting reorder to True. + + .. deprecated:: 0.20 + Parameter ``reorder`` has been deprecated in version 0.20 and will + be removed in 0.22. It's introduced for roc_auc_score (not for + general use) and is no longer used there. What's more, the result + from auc will be significantly influenced if x is sorted + unexpectedly due to slight floating point error (See issue #9786). + Future (and default) behavior is equivalent to ``reorder=False``. + + Returns + ------- + auc : float + + Examples + -------- + >>> import numpy as np + >>> from sklearn import metrics + >>> y = np.array([1, 1, 2, 2]) + >>> pred = np.array([0.1, 0.4, 0.35, 0.8]) + >>> fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2) + >>> metrics.auc(fpr, tpr) + 0.75 + + See also + -------- + roc_auc_score : Compute the area under the ROC curve + average_precision_score : Compute average precision from prediction scores + precision_recall_curve : + Compute precision-recall pairs for different probability thresholds + """ + check_consistent_length(x, y) + x = column_or_1d(x) + y = column_or_1d(y) + + if x.shape[0] < 2: + raise ValueError('At least 2 points are needed to compute' + ' area under curve, but x.shape = %s' % x.shape) + + if reorder != 'deprecated': + warnings.warn("The 'reorder' parameter has been deprecated in " + "version 0.20 and will be removed in 0.22. It is " + "recommended not to set 'reorder' and ensure that x " + "is monotonic increasing or monotonic decreasing.", + DeprecationWarning) + + direction = 1 + if reorder is True: + # reorder the data points according to the x axis and using y to + # break ties + order = np.lexsort((y, x)) + x, y = x[order], y[order] + else: + dx = np.diff(x) + if np.any(dx < 0): + if np.all(dx <= 0): + direction = -1 + else: + raise ValueError("x is neither increasing nor decreasing " + ": {}.".format(x)) + + area = direction * np.trapz(y, x) + if isinstance(area, np.memmap): + # Reductions such as .sum used internally in np.trapz do not return a + # scalar by default for numpy.memmap instances contrary to + # regular numpy.ndarray instances. + area = area.dtype.type(area) + return area + + +def average_precision_score(y_true, y_score, average="macro", + sample_weight=None): + """Compute average precision (AP) from prediction scores + + AP summarizes a precision-recall curve as the weighted mean of precisions + achieved at each threshold, with the increase in recall from the previous + threshold used as the weight: + + .. math:: + \\text{AP} = \\sum_n (R_n - R_{n-1}) P_n + + where :math:`P_n` and :math:`R_n` are the precision and recall at the nth + threshold [1]_. This implementation is not interpolated and is different + from computing the area under the precision-recall curve with the + trapezoidal rule, which uses linear interpolation and can be too + optimistic. + + Note: this implementation is restricted to the binary classification task + or multilabel classification task. + + Read more in the :ref:`User Guide `. 
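
(A quick numeric check of the uninterpolated AP sum above, using the precision/recall
values that appear in the ``precision_recall_curve`` doctest further down in this file;
a standalone sketch with the arrays written out by hand, not part of the patch)::

    import numpy as np

    precision = np.array([2 / 3, 0.5, 1.0, 1.0])
    recall = np.array([1.0, 0.5, 0.5, 0.0])

    # step-function integral: sum_n (R_n - R_{n-1}) * P_n
    ap = -np.sum(np.diff(recall) * precision[:-1])
    print(ap)  # 0.8333..., consistent with the 0.83... doctest value below
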
+ + Parameters + ---------- + y_true : array, shape = [n_samples] or [n_samples, n_classes] + True binary labels (either {0, 1} or {-1, 1}). + + y_score : array, shape = [n_samples] or [n_samples, n_classes] + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). + + average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] + If ``None``, the scores for each class are returned. Otherwise, + this determines the type of averaging performed on the data: + + ``'micro'``: + Calculate metrics globally by considering each element of the label + indicator matrix as a label. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). + ``'samples'``: + Calculate metrics for each instance, and find their average. + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. + + Returns + ------- + average_precision : float + + References + ---------- + .. [1] `Wikipedia entry for the Average precision + `_ + + See also + -------- + roc_auc_score : Compute the area under the ROC curve + + precision_recall_curve : + Compute precision-recall pairs for different probability thresholds + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import average_precision_score + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) + >>> average_precision_score(y_true, y_scores) # doctest: +ELLIPSIS + 0.83... + + """ + def _binary_uninterpolated_average_precision( + y_true, y_score, sample_weight=None): + precision, recall, thresholds = precision_recall_curve( + y_true, y_score, sample_weight=sample_weight) + # Return the step function integral + # The following works because the last entry of precision is + # guaranteed to be 1, as returned by precision_recall_curve + return -np.sum(np.diff(recall) * np.array(precision)[:-1]) + + return _average_binary_score(_binary_uninterpolated_average_precision, + y_true, y_score, average, + sample_weight=sample_weight) + + +def detection_error_tradeoff(y_true, probas_pred, pos_label=None, + sample_weight=None): + """Compute error rates for different probability thresholds + + Note: this implementation is restricted to the binary classification task. + + Parameters + ---------- + y_true : array, shape = [n_samples] + True targets of binary classification in range {-1, 1} or {0, 1}. + + probas_pred : array, shape = [n_samples] + Estimated probabilities or decision function. + + pos_label : int, optional (default=None) + The label of the positive class + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. + + Returns + ------- + fps : array, shape = [n_thresholds] + A count of false positives, at index i being the number of negative + samples assigned a score >= thresholds[i]. The total number of + negative samples is equal to fps[-1] (thus true negatives are given by + fps[-1] - fps). + + fns : array, shape = [n_thresholds] + A count of false negatives, at index i being the number of positive + samples assigned a score < thresholds[i]. The total number of + positive samples is equal to tps[-1] (thus false negatives are given by + tps[-1] - tps). 
+ + thresholds : array, shape = [n_thresholds] + Decreasing score values. + + References + ---------- + .. [1] `Wikipedia entry for Detection error tradeoff + `_ + .. [2] `The DET Curve in Assessment of Detection Task Performance + `_ + .. [3] `2008 NIST Speaker Recognition Evaluation Results + `_ + .. [4] `DET-Curve Plotting software for use with MATLAB + `_ + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import detection_error_tradeoff + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) + >>> fps, fns, thresholds = detection_error_tradeoff(y_true, y_scores) + >>> fps + array([ 0.5, 0.5, 0. ]) + >>> fns + array([ 0. , 0.5, 0.5]) + >>> thresholds + array([ 0.35, 0.4 , 0.8 ]) + + """ + fps, tps, thresholds = _binary_clf_curve(y_true, probas_pred, + pos_label=pos_label, + sample_weight=sample_weight) + fns = tps[-1] - tps + tp_count = tps[-1] + tn_count = (fps[-1] - fps)[0] + + # start with false positives is zero and stop with false negatives zero + # and reverse the outputs so list of false positives is decreasing + last_ind = tps.searchsorted(tps[-1]) + 1 + first_ind = fps[::-1].searchsorted(fps[0]) + sl = range(first_ind, last_ind)[::-1] + return fps[sl] / tp_count, fns[sl] / tn_count, thresholds[sl] + + +def roc_auc_score(y_true, y_score, average="macro", sample_weight=None): + """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) + from prediction scores. + + Note: this implementation is restricted to the binary classification task + or multilabel classification task in label indicator format. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array, shape = [n_samples] or [n_samples, n_classes] + True binary labels or binary label indicators. + + y_score : array, shape = [n_samples] or [n_samples, n_classes] + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). For binary + y_true, y_score is supposed to be the score of the class with greater + label. + + average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] + If ``None``, the scores for each class are returned. Otherwise, + this determines the type of averaging performed on the data: + + ``'micro'``: + Calculate metrics globally by considering each element of the label + indicator matrix as a label. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). + ``'samples'``: + Calculate metrics for each instance, and find their average. + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. + + Returns + ------- + auc : float + + References + ---------- + .. [1] `Wikipedia entry for the Receiver operating characteristic + `_ + + .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition + Letters, 2006, 27(8):861-874. 
+ + See also + -------- + average_precision_score : Area under the precision-recall curve + + roc_curve : Compute Receiver operating characteristic (ROC) curve + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import roc_auc_score + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) + >>> roc_auc_score(y_true, y_scores) + 0.75 + + """ + def _binary_roc_auc_score(y_true, y_score, sample_weight=None): + if len(np.unique(y_true)) != 2: + raise ValueError("Only one class present in y_true. ROC AUC score " + "is not defined in that case.") + + fpr, tpr, tresholds = roc_curve(y_true, y_score, + sample_weight=sample_weight) + return auc(fpr, tpr) + + y_type = type_of_target(y_true) + if y_type == "binary": + labels = np.unique(y_true) + y_true = label_binarize(y_true, labels)[:, 0] + + return _average_binary_score( + _binary_roc_auc_score, y_true, y_score, average, + sample_weight=sample_weight) + + +def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): + """Calculate true and false positives per binary classification threshold. + + Parameters + ---------- + y_true : array, shape = [n_samples] + True targets of binary classification + + y_score : array, shape = [n_samples] + Estimated probabilities or decision function + + pos_label : int or str, default=None + The label of the positive class + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. + + Returns + ------- + fps : array, shape = [n_thresholds] + A count of false positives, at index i being the number of negative + samples assigned a score >= thresholds[i]. The total number of + negative samples is equal to fps[-1] (thus true negatives are given by + fps[-1] - fps). + + tps : array, shape = [n_thresholds <= len(np.unique(y_score))] + An increasing count of true positives, at index i being the number + of positive samples assigned a score >= thresholds[i]. The total + number of positive samples is equal to tps[-1] (thus false negatives + are given by tps[-1] - tps). + + thresholds : array, shape = [n_thresholds] + Decreasing score values. + """ + # Check to make sure y_true is valid + y_type = type_of_target(y_true) + if not (y_type == "binary" or + (y_type == "multiclass" and pos_label is not None)): + raise ValueError("{0} format is not supported".format(y_type)) + + check_consistent_length(y_true, y_score, sample_weight) + y_true = column_or_1d(y_true) + y_score = column_or_1d(y_score) + assert_all_finite(y_true) + assert_all_finite(y_score) + + if sample_weight is not None: + sample_weight = column_or_1d(sample_weight) + + # ensure binary classification if pos_label is not specified + classes = np.unique(y_true) + if (pos_label is None and + not (np.array_equal(classes, [0, 1]) or + np.array_equal(classes, [-1, 1]) or + np.array_equal(classes, [0]) or + np.array_equal(classes, [-1]) or + np.array_equal(classes, [1]))): + raise ValueError("Data is not binary and pos_label is not specified") + elif pos_label is None: + pos_label = 1. + + # make y_true a boolean vector + y_true = (y_true == pos_label) + + # sort scores and corresponding truth values + desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1] + y_score = y_score[desc_score_indices] + y_true = y_true[desc_score_indices] + if sample_weight is not None: + weight = sample_weight[desc_score_indices] + else: + weight = 1. + + # y_score typically has many tied values. Here we extract + # the indices associated with the distinct values. 
We also + # concatenate a value for the end of the curve. + distinct_value_indices = np.where(np.diff(y_score))[0] + threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1] + + # accumulate the true positives with decreasing threshold + tps = stable_cumsum(y_true * weight)[threshold_idxs] + if sample_weight is not None: + # express fps as a cumsum to ensure fps is increasing even in + # the presense of floating point errors + fps = stable_cumsum((1 - y_true) * weight)[threshold_idxs] + else: + fps = 1 + threshold_idxs - tps + return fps, tps, y_score[threshold_idxs] + + +def precision_recall_curve(y_true, probas_pred, pos_label=None, + sample_weight=None): + """Compute precision-recall pairs for different probability thresholds + + Note: this implementation is restricted to the binary classification task. + + The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of + true positives and ``fp`` the number of false positives. The precision is + intuitively the ability of the classifier not to label as positive a sample + that is negative. + + The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of + true positives and ``fn`` the number of false negatives. The recall is + intuitively the ability of the classifier to find all the positive samples. + + The last precision and recall values are 1. and 0. respectively and do not + have a corresponding threshold. This ensures that the graph starts on the + y axis. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array, shape = [n_samples] + True targets of binary classification in range {-1, 1} or {0, 1}. + + probas_pred : array, shape = [n_samples] + Estimated probabilities or decision function. + + pos_label : int or str, default=None + The label of the positive class + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. + + Returns + ------- + precision : array, shape = [n_thresholds + 1] + Precision values such that element i is the precision of + predictions with score >= thresholds[i] and the last element is 1. + + recall : array, shape = [n_thresholds + 1] + Decreasing recall values such that element i is the recall of + predictions with score >= thresholds[i] and the last element is 0. + + thresholds : array, shape = [n_thresholds <= len(np.unique(probas_pred))] + Increasing thresholds on the decision function used to compute + precision and recall. + + See also + -------- + average_precision_score : Compute average precision from prediction scores + + roc_curve : Compute Receiver operating characteristic (ROC) curve + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import precision_recall_curve + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) + >>> precision, recall, thresholds = precision_recall_curve( + ... y_true, y_scores) + >>> precision # doctest: +ELLIPSIS + array([ 0.66..., 0.5 , 1. , 1. ]) + >>> recall + array([ 1. , 0.5, 0.5, 0. 
]) + >>> thresholds + array([ 0.35, 0.4 , 0.8 ]) + + """ + fps, tps, thresholds = _binary_clf_curve(y_true, probas_pred, + pos_label=pos_label, + sample_weight=sample_weight) + + precision = tps / (tps + fps) + recall = tps / tps[-1] + + # stop when full recall attained + # and reverse the outputs so recall is decreasing + last_ind = tps.searchsorted(tps[-1]) + sl = slice(last_ind, None, -1) + return np.r_[precision[sl], 1], np.r_[recall[sl], 0], thresholds[sl] + + +def roc_curve(y_true, y_score, pos_label=None, sample_weight=None, + drop_intermediate=True): + """Compute Receiver operating characteristic (ROC) + + Note: this implementation is restricted to the binary classification task. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + + y_true : array, shape = [n_samples] + True binary labels. If labels are not either {-1, 1} or {0, 1}, then + pos_label should be explicitly given. + + y_score : array, shape = [n_samples] + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). + + pos_label : int or str, default=None + Label considered as positive and others are considered negative. + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. + + drop_intermediate : boolean, optional (default=True) + Whether to drop some suboptimal thresholds which would not appear + on a plotted ROC curve. This is useful in order to create lighter + ROC curves. + + .. versionadded:: 0.17 + parameter *drop_intermediate*. + + Returns + ------- + fpr : array, shape = [>2] + Increasing false positive rates such that element i is the false + positive rate of predictions with score >= thresholds[i]. + + tpr : array, shape = [>2] + Increasing true positive rates such that element i is the true + positive rate of predictions with score >= thresholds[i]. + + thresholds : array, shape = [n_thresholds] + Decreasing thresholds on the decision function used to compute + fpr and tpr. `thresholds[0]` represents no instances being predicted + and is arbitrarily set to `max(y_score) + 1`. + + See also + -------- + roc_auc_score : Compute the area under the ROC curve + + Notes + ----- + Since the thresholds are sorted from low to high values, they + are reversed upon returning them to ensure they correspond to both ``fpr`` + and ``tpr``, which are sorted in reversed order during their calculation. + + References + ---------- + .. [1] `Wikipedia entry for the Receiver operating characteristic + `_ + + .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition + Letters, 2006, 27(8):861-874. + + Examples + -------- + >>> import numpy as np + >>> from sklearn import metrics + >>> y = np.array([1, 1, 2, 2]) + >>> scores = np.array([0.1, 0.4, 0.35, 0.8]) + >>> fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2) + >>> fpr + array([ 0. , 0. , 0.5, 0.5, 1. ]) + >>> tpr + array([ 0. , 0.5, 0.5, 1. , 1. ]) + >>> thresholds + array([ 1.8 , 0.8 , 0.4 , 0.35, 0.1 ]) + + """ + fps, tps, thresholds = _binary_clf_curve( + y_true, y_score, pos_label=pos_label, sample_weight=sample_weight) + + # Attempt to drop thresholds corresponding to points in between and + # collinear with other points. These are always suboptimal and do not + # appear on a plotted ROC curve (and thus do not affect the AUC). + # Here np.diff(_, 2) is used as a "second derivative" to tell if there + # is a corner at the point. 
Both fps and tps must be tested to handle + # thresholds with multiple data points (which are combined in + # _binary_clf_curve). This keeps all cases where the point should be kept, + # but does not drop more complicated cases like fps = [1, 3, 7], + # tps = [1, 2, 4]; there is no harm in keeping too many thresholds. + if drop_intermediate and len(fps) > 2: + optimal_idxs = np.where(np.r_[True, + np.logical_or(np.diff(fps, 2), + np.diff(tps, 2)), + True])[0] + fps = fps[optimal_idxs] + tps = tps[optimal_idxs] + thresholds = thresholds[optimal_idxs] + + if tps.size == 0 or fps[0] != 0 or tps[0] != 0: + # Add an extra threshold position if necessary + # to make sure that the curve starts at (0, 0) + tps = np.r_[0, tps] + fps = np.r_[0, fps] + thresholds = np.r_[thresholds[0] + 1, thresholds] + + if fps[-1] <= 0: + warnings.warn("No negative samples in y_true, " + "false positive value should be meaningless", + UndefinedMetricWarning) + fpr = np.repeat(np.nan, fps.shape) + else: + fpr = fps / fps[-1] + + if tps[-1] <= 0: + warnings.warn("No positive samples in y_true, " + "true positive value should be meaningless", + UndefinedMetricWarning) + tpr = np.repeat(np.nan, tps.shape) + else: + tpr = tps / tps[-1] + + return fpr, tpr, thresholds + + +def label_ranking_average_precision_score(y_true, y_score): + """Compute ranking-based average precision + + Label ranking average precision (LRAP) is the average over each ground + truth label assigned to each sample, of the ratio of true vs. total + labels with lower score. + + This metric is used in multilabel ranking problem, where the goal + is to give better rank to the labels associated to each sample. + + The obtained score is always strictly greater than 0 and + the best value is 1. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array or sparse matrix, shape = [n_samples, n_labels] + True binary labels in binary indicator format. + + y_score : array, shape = [n_samples, n_labels] + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). + + Returns + ------- + score : float + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import label_ranking_average_precision_score + >>> y_true = np.array([[1, 0, 0], [0, 0, 1]]) + >>> y_score = np.array([[0.75, 0.5, 1], [1, 0.2, 0.1]]) + >>> label_ranking_average_precision_score(y_true, y_score) \ + # doctest: +ELLIPSIS + 0.416... + + """ + check_consistent_length(y_true, y_score) + y_true = check_array(y_true, ensure_2d=False) + y_score = check_array(y_score, ensure_2d=False) + + if y_true.shape != y_score.shape: + raise ValueError("y_true and y_score have different shape") + + # Handle badly formatted array and the degenerate case with one label + y_type = type_of_target(y_true) + if (y_type != "multilabel-indicator" and + not (y_type == "binary" and y_true.ndim == 2)): + raise ValueError("{0} format is not supported".format(y_type)) + + y_true = csr_matrix(y_true) + y_score = -y_score + + n_samples, n_labels = y_true.shape + + out = 0. + for i, (start, stop) in enumerate(zip(y_true.indptr, y_true.indptr[1:])): + relevant = y_true.indices[start:stop] + + if (relevant.size == 0 or relevant.size == n_labels): + # If all labels are relevant or unrelevant, the score is also + # equal to 1. The label ranking has no meaning. + out += 1. 
+ continue + + scores_i = y_score[i] + rank = rankdata(scores_i, 'max')[relevant] + L = rankdata(scores_i[relevant], 'max') + out += (L / rank).mean() + + return out / n_samples + + +def coverage_error(y_true, y_score, sample_weight=None): + """Coverage error measure + + Compute how far we need to go through the ranked scores to cover all + true labels. The best value is equal to the average number + of labels in ``y_true`` per sample. + + Ties in ``y_scores`` are broken by giving maximal rank that would have + been assigned to all tied values. + + Note: Our implementation's score is 1 greater than the one given in + Tsoumakas et al., 2010. This extends it to handle the degenerate case + in which an instance has 0 true labels. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array, shape = [n_samples, n_labels] + True binary labels in binary indicator format. + + y_score : array, shape = [n_samples, n_labels] + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. + + Returns + ------- + coverage_error : float + + References + ---------- + .. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010). + Mining multi-label data. In Data mining and knowledge discovery + handbook (pp. 667-685). Springer US. + + """ + y_true = check_array(y_true, ensure_2d=False) + y_score = check_array(y_score, ensure_2d=False) + check_consistent_length(y_true, y_score, sample_weight) + + y_type = type_of_target(y_true) + if y_type != "multilabel-indicator": + raise ValueError("{0} format is not supported".format(y_type)) + + if y_true.shape != y_score.shape: + raise ValueError("y_true and y_score have different shape") + + y_score_mask = np.ma.masked_array(y_score, mask=np.logical_not(y_true)) + y_min_relevant = y_score_mask.min(axis=1).reshape((-1, 1)) + coverage = (y_score >= y_min_relevant).sum(axis=1) + coverage = coverage.filled(0) + + return np.average(coverage, weights=sample_weight) + + +def label_ranking_loss(y_true, y_score, sample_weight=None): + """Compute Ranking loss measure + + Compute the average number of label pairs that are incorrectly ordered + given y_score weighted by the size of the label set and the number of + labels not in the label set. + + This is similar to the error set size, but weighted by the number of + relevant and irrelevant labels. The best performance is achieved with + a ranking loss of zero. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.17 + A function *label_ranking_loss* + + Parameters + ---------- + y_true : array or sparse matrix, shape = [n_samples, n_labels] + True binary labels in binary indicator format. + + y_score : array, shape = [n_samples, n_labels] + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. + + Returns + ------- + loss : float + + References + ---------- + .. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010). + Mining multi-label data. In Data mining and knowledge discovery + handbook (pp. 667-685). Springer US. 
+ + """ + y_true = check_array(y_true, ensure_2d=False, accept_sparse='csr') + y_score = check_array(y_score, ensure_2d=False) + check_consistent_length(y_true, y_score, sample_weight) + + y_type = type_of_target(y_true) + if y_type not in ("multilabel-indicator",): + raise ValueError("{0} format is not supported".format(y_type)) + + if y_true.shape != y_score.shape: + raise ValueError("y_true and y_score have different shape") + + n_samples, n_labels = y_true.shape + + y_true = csr_matrix(y_true) + + loss = np.zeros(n_samples) + for i, (start, stop) in enumerate(zip(y_true.indptr, y_true.indptr[1:])): + # Sort and bin the label scores + unique_scores, unique_inverse = np.unique(y_score[i], + return_inverse=True) + true_at_reversed_rank = np.bincount( + unique_inverse[y_true.indices[start:stop]], + minlength=len(unique_scores)) + all_at_reversed_rank = np.bincount(unique_inverse, + minlength=len(unique_scores)) + false_at_reversed_rank = all_at_reversed_rank - true_at_reversed_rank + + # if the scores are ordered, it's possible to count the number of + # incorrectly ordered paires in linear time by cumulatively counting + # how many false labels of a given score have a score higher than the + # accumulated true labels with lower score. + loss[i] = np.dot(true_at_reversed_rank.cumsum(), + false_at_reversed_rank) + + n_positives = count_nonzero(y_true, axis=1) + with np.errstate(divide="ignore", invalid="ignore"): + loss /= ((n_labels - n_positives) * n_positives) + + # When there is no positive or no negative labels, those values should + # be consider as correct, i.e. the ranking doesn't matter. + loss[np.logical_or(n_positives == 0, n_positives == n_labels)] = 0. + + return np.average(loss, weights=sample_weight) From 4f65f963a7f23cb21047e3a00055cde871d44692 Mon Sep 17 00:00:00 2001 From: Julien Cornebise Date: Sun, 9 Apr 2017 17:11:08 +0100 Subject: [PATCH 02/36] Add DET to exports --- sklearn/metrics/__init__.py | 1 + sklearn/metrics/ranking.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 7b9bd5bd10e5d..1011614510c14 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -105,6 +105,7 @@ 'coverage_error', 'dcg_score', 'davies_bouldin_score', + 'detection_error_tradeoff', 'euclidean_distances', 'explained_variance_score', 'f1_score', diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 64c8ddbfe2b38..99f0650c435d2 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -218,7 +218,7 @@ def _binary_uninterpolated_average_precision( sample_weight=sample_weight) -def detection_error_tradeoff(y_true, probas_pred, pos_label=None, +def detection_error_tradeoff(y_true, y_score, pos_label=None, sample_weight=None): """Compute error rates for different probability thresholds @@ -229,7 +229,7 @@ def detection_error_tradeoff(y_true, probas_pred, pos_label=None, y_true : array, shape = [n_samples] True targets of binary classification in range {-1, 1} or {0, 1}. - probas_pred : array, shape = [n_samples] + y_score : array, shape = [n_samples] Estimated probabilities or decision function. 
pos_label : int, optional (default=None) @@ -281,7 +281,7 @@ def detection_error_tradeoff(y_true, probas_pred, pos_label=None, array([ 0.35, 0.4 , 0.8 ]) """ - fps, tps, thresholds = _binary_clf_curve(y_true, probas_pred, + fps, tps, thresholds = _binary_clf_curve(y_true, y_score, pos_label=pos_label, sample_weight=sample_weight) fns = tps[-1] - tps From ec2973d56c76db80760cf5a1dbcdea68339e126d Mon Sep 17 00:00:00 2001 From: Daniel Mohns Date: Fri, 28 Jul 2017 15:39:59 +0200 Subject: [PATCH 03/36] Fix DET-curve doctest errors - Sample snippet in model_evaluation documentation was outdated. --- doc/modules/model_evaluation.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 5e8ab9a583fce..3745c909c1647 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1454,13 +1454,13 @@ positive class, confidence values, or binary decisions. Here is a small example of how to use the :func:`detection_error_tradeoff` function:: >>> import numpy as np - >>> from sklearn.metrics import det_error_tradeoff - >>> y = np.array([1, 1, 2, 2]) - >>> scores = np.array([0.1, 0.4, 0.35, 0.8]) - >>> fpr, tpr, thresholds = error_detection_tradeoff(y, scores, pos_label=2) - >>> fpr + >>> from sklearn.metrics import detection_error_tradeoff + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) + >>> fps, fns, thresholds = detection_error_tradeoff(y_true, y_scores) + >>> fps array([ 0.5, 0.5, 0. ]) - >>> fnr + >>> fns array([ 0. , 0.5, 0.5]) >>> thresholds array([ 0.35, 0.4 , 0.8 ]) From 90d681d74bf5abc1ad70e8d8685124cd3b9ddb62 Mon Sep 17 00:00:00 2001 From: daniel Date: Mon, 7 Aug 2017 19:15:34 +0200 Subject: [PATCH 04/36] Clarify wording in DET-curve computation - Align to the wording of ranking module to make it consistent. - Add correct describtion of input and outputs. - Update and fix non-existent links --- doc/modules/classes.rst | 1 + doc/modules/model_evaluation.rst | 25 ++++++++++---- sklearn/metrics/ranking.py | 59 +++++++++++++++----------------- 3 files changed, 47 insertions(+), 38 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 2e54d000a13aa..2fd1366e18434 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -946,6 +946,7 @@ details. metrics.cohen_kappa_score metrics.confusion_matrix metrics.dcg_score + metrics.detection_error_tradeoff_curve metrics.f1_score metrics.fbeta_score metrics.hamming_loss diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 3745c909c1647..b29a38a8042b7 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -306,6 +306,8 @@ Some of these are restricted to the binary classification case: precision_recall_curve roc_curve + balanced_accuracy_score + detection_error_tradeoff_curve Others also work in the multiclass case: @@ -1442,7 +1444,7 @@ to the given limit. Detection error tradeoff (DET) --------------------------------------- -The function :func:`detection_error_tradeoff` computes the +The function :func:`detection_error_tradeoff_curve` computes the `detection error tradeoff curve, or DET curve `_. Quoting Wikipedia : @@ -1451,20 +1453,31 @@ Quoting Wikipedia : This function requires the true binary value and the target scores, which can either be probability estimates of the positive class, confidence values, or binary decisions. 
-Here is a small example of how to use the :func:`detection_error_tradeoff` function:: +Here is a small example of how to use the :func:`detection_error_tradeoff_curve` function:: >>> import numpy as np - >>> from sklearn.metrics import detection_error_tradeoff + >>> from sklearn.metrics import detection_error_tradeoff_curve >>> y_true = np.array([0, 0, 1, 1]) >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) - >>> fps, fns, thresholds = detection_error_tradeoff(y_true, y_scores) - >>> fps + >>> fpr, fnr, thresholds = detection_error_tradeoff_curve(y_true, y_scores) + >>> fpr array([ 0.5, 0.5, 0. ]) - >>> fns + >>> fnr array([ 0. , 0.5, 0.5]) >>> thresholds array([ 0.35, 0.4 , 0.8 ]) + +.. topic:: References: + + * `Wikipedia entry for Detection error tradeoff + `_ + * A. Martin, G. Doddington, T. Kamm, M. Ordowski, and M. Przybocki, + `The DET Curve in Assessment of Detection Task Performance `_, + NIST 1997. + * `2008 NIST Speaker Recognition Evaluation Results `_ + * `DET-Curve Plotting software for use with MATLAB `_ + .. _zero_one_loss: Zero one loss diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 99f0650c435d2..38cd944334110 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -15,7 +15,6 @@ # Lars Buitinck # Joel Nothman # Noel Dawe -# Jeremy Karnowski # License: BSD 3 clause from __future__ import division @@ -218,11 +217,13 @@ def _binary_uninterpolated_average_precision( sample_weight=sample_weight) -def detection_error_tradeoff(y_true, y_score, pos_label=None, - sample_weight=None): - """Compute error rates for different probability thresholds +def detection_error_tradeoff_curve(y_true, y_score, pos_label=None, + sample_weight=None): + """Compute error rates for different probability thresholds. - Note: this implementation is restricted to the binary classification task. + Note: This implementation is restricted to the binary classification task. + + Read more in the :ref:`User Guide `. Parameters ---------- @@ -240,42 +241,35 @@ def detection_error_tradeoff(y_true, y_score, pos_label=None, Returns ------- - fps : array, shape = [n_thresholds] - A count of false positives, at index i being the number of negative - samples assigned a score >= thresholds[i]. The total number of - negative samples is equal to fps[-1] (thus true negatives are given by - fps[-1] - fps). + fpr : array, shape = [n_thresholds] + False positive rate (FPR) such that element i is the false positive + rate of predictions with score >= thresholds[i]. This is occasionally + referred to as false acceptance propability or fall-out. - fns : array, shape = [n_thresholds] - A count of false negatives, at index i being the number of positive - samples assigned a score < thresholds[i]. The total number of - positive samples is equal to tps[-1] (thus false negatives are given by - tps[-1] - tps). + fnr : array, shape = [n_thresholds] + False negative rate (FNR) such that element i is the false negative + rate of predictions with score >= thresholds[i]. This is occasionally + referred to as false rejection or miss rate. thresholds : array, shape = [n_thresholds] Decreasing score values. - References - ---------- - .. [1] `Wikipedia entry for Detection error tradeoff - `_ - .. [2] `The DET Curve in Assessment of Detection Task Performance - `_ - .. [3] `2008 NIST Speaker Recognition Evaluation Results - `_ - .. 
[4] `DET-Curve Plotting software for use with MATLAB - `_ + See also + -------- + roc_curve : Compute Receiver operating characteristic (ROC) curve + + precision_recall_curve : Compute precision-recall curve Examples -------- >>> import numpy as np - >>> from sklearn.metrics import detection_error_tradeoff + >>> from sklearn.metrics import detection_error_tradeoff_curve >>> y_true = np.array([0, 0, 1, 1]) >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) - >>> fps, fns, thresholds = detection_error_tradeoff(y_true, y_scores) - >>> fps + >>> fpr, fnr, thresholds = detection_error_tradeoff_curve(y_true, y_scores) + >>> fpr array([ 0.5, 0.5, 0. ]) - >>> fns + >>> fnr array([ 0. , 0.5, 0.5]) >>> thresholds array([ 0.35, 0.4 , 0.8 ]) @@ -284,16 +278,17 @@ def detection_error_tradeoff(y_true, y_score, pos_label=None, fps, tps, thresholds = _binary_clf_curve(y_true, y_score, pos_label=pos_label, sample_weight=sample_weight) + fns = tps[-1] - tps - tp_count = tps[-1] - tn_count = (fps[-1] - fps)[0] + p_count = tps[-1] + n_count = fps[-1] # start with false positives is zero and stop with false negatives zero # and reverse the outputs so list of false positives is decreasing last_ind = tps.searchsorted(tps[-1]) + 1 first_ind = fps[::-1].searchsorted(fps[0]) sl = range(first_ind, last_ind)[::-1] - return fps[sl] / tp_count, fns[sl] / tn_count, thresholds[sl] + return fps[sl] / n_count, fns[sl] / p_count, thresholds[sl] def roc_auc_score(y_true, y_score, average="macro", sample_weight=None): From b594b9021f9e7b4b392ed33943195dad9b1d75f3 Mon Sep 17 00:00:00 2001 From: daniel Date: Sun, 11 Feb 2018 16:17:51 +0100 Subject: [PATCH 05/36] Beautify DET curve documentation source - Limit line length to 80 characters. --- doc/modules/model_evaluation.rst | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index b29a38a8042b7..a5f22423416a7 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1445,15 +1445,23 @@ Detection error tradeoff (DET) --------------------------------------- The function :func:`detection_error_tradeoff_curve` computes the -`detection error tradeoff curve, or DET curve `_. +`detection error tradeoff curve (DET) curve +`_. Quoting Wikipedia : - "A detection error tradeoff (DET) graph is a graphical plot of error rates for binary classification systems, plotting false reject rate vs. false accept rate. The x- and y-axes are scaled non-linearly by their standard normal deviates (or just by logarithmic transformation), yielding tradeoff curves that are more linear than ROC curves, and use most of the image area to highlight the differences of importance in the critical operating region." + "A detection error tradeoff (DET) graph is a graphical plot of error rates for + binary classification systems, plotting false reject rate vs. false accept + rate. The x- and y-axes are scaled non-linearly by their standard normal + deviates (or just by logarithmic transformation), yielding tradeoff curves + that are more linear than ROC curves, and use most of the image area to + highlight the differences of importance in the critical operating region." -This function requires the true binary -value and the target scores, which can either be probability estimates of the -positive class, confidence values, or binary decisions. 
-Here is a small example of how to use the :func:`detection_error_tradeoff_curve` function:: +This function requires the true binary value and the target scores, which can +either be probability estimates of the positive class, confidence values, or +binary decisions. + +Here is a small example of how to use the +:func:`detection_error_tradeoff_curve` function:: >>> import numpy as np >>> from sklearn.metrics import detection_error_tradeoff_curve @@ -1473,10 +1481,13 @@ Here is a small example of how to use the :func:`detection_error_tradeoff_curve` * `Wikipedia entry for Detection error tradeoff `_ * A. Martin, G. Doddington, T. Kamm, M. Ordowski, and M. Przybocki, - `The DET Curve in Assessment of Detection Task Performance `_, + `The DET Curve in Assessment of Detection Task Performance + `_, NIST 1997. - * `2008 NIST Speaker Recognition Evaluation Results `_ - * `DET-Curve Plotting software for use with MATLAB `_ + * `2008 NIST Speaker Recognition Evaluation Results + `_ + * `DET-Curve Plotting software for use with MATLAB + `_ .. _zero_one_loss: From c588aa16c41e8f789ea0bb6231c97033169923b2 Mon Sep 17 00:00:00 2001 From: daniel Date: Sun, 18 Feb 2018 18:37:34 +0100 Subject: [PATCH 06/36] Expand DET curve documentation - Add an example plot to show difference between ROC and DET curves. - Expand Usage Note section with background information and properties of DET curves. --- doc/modules/model_evaluation.rst | 76 ++++++++++++----- examples/model_selection/plot_det.py | 121 +++++++++++++++++++++++++++ 2 files changed, 176 insertions(+), 21 deletions(-) create mode 100644 examples/model_selection/plot_det.py diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index a5f22423416a7..81dafaf99028d 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1442,7 +1442,7 @@ to the given limit. .. _det_curve: Detection error tradeoff (DET) ---------------------------------------- +------------------------------ The function :func:`detection_error_tradeoff_curve` computes the `detection error tradeoff curve (DET) curve @@ -1456,37 +1456,71 @@ Quoting Wikipedia : that are more linear than ROC curves, and use most of the image area to highlight the differences of importance in the critical operating region." -This function requires the true binary value and the target scores, which can -either be probability estimates of the positive class, confidence values, or -binary decisions. +DET curves are a variation of receiver operating characteristic (ROC) curves +where False Negative Rate is plotted on the ordinate instead of True Positive +Rate. +DET curves are commonly plotted in normal deviate scale by transformation with +:math:`\phi^{-1}` (with :math:`\phi` being the cumulative distribution function) +. +The resulting performance curves explicitly visualize the tradeoff of error +types for given classification algorithms. -Here is a small example of how to use the -:func:`detection_error_tradeoff_curve` function:: +This figure compares the ROC and DET curves of two example classifiers on the +same classification task: - >>> import numpy as np - >>> from sklearn.metrics import detection_error_tradeoff_curve - >>> y_true = np.array([0, 0, 1, 1]) - >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) - >>> fpr, fnr, thresholds = detection_error_tradeoff_curve(y_true, y_scores) - >>> fpr - array([ 0.5, 0.5, 0. ]) - >>> fnr - array([ 0. , 0.5, 0.5]) - >>> thresholds - array([ 0.35, 0.4 , 0.8 ]) +.. 
image:: ../auto_examples/model_selection/images/sphx_glr_plot_det_001.png + :target: ../auto_examples/model_selection/plot_det.html + :scale: 75 + :align: center + +**Properties:** + +* DET curves form a linear curve in normal deviate scale if the detection + scores are normally (or close-to normally) distributed. + It was shown by [4]_ that the reverse it not necessarily true and even more + general distributions are able produce linear DET curves. + +* The normal deviate scale transformation spreads out the points such that a + comparatively larger space of plot is occupied. + Therefor curves with similar classification performance might be easier to + distinguish on a DET plot. + +* With False Negative Rate being "inverse" to True Positive Rate the point + of perfection for DET curves is the origin (in contrast to the top left corner + for ROC curves). + +**Applications and limitations:** + +DET curves are intuitive to read and hence allow quick visual assessment of a +classifiers performance. +Additionally DET curves can be consulted for threshold and operating point +analysis if an comparison error types is required. + +One the other hand DET curves do not provide their metric as a single number. +Therefor for either automated evaluation or comparison to other +classification tasks metrics like the derived area under ROC curve might be +better suited. + +.. topic:: Examples: + * See :ref:`sphx_glr_auto_examples_model_selection_plot_det.py` + for an example comparison between receiver operating characteristic (ROC) + curves and Detection error tradeoff (DET) curves. .. topic:: References: - * `Wikipedia entry for Detection error tradeoff + .. [1] `Wikipedia entry for Detection error tradeoff `_ - * A. Martin, G. Doddington, T. Kamm, M. Ordowski, and M. Przybocki, + .. [2] A. Martin, G. Doddington, T. Kamm, M. Ordowski, and M. Przybocki, `The DET Curve in Assessment of Detection Task Performance `_, NIST 1997. - * `2008 NIST Speaker Recognition Evaluation Results + .. [3] `2008 NIST Speaker Recognition Evaluation Results `_ - * `DET-Curve Plotting software for use with MATLAB + .. [4] Navratil, Jiri & Klusacek, Dalibor. (2007). On Linear DETs. + Acoustics, Speech, and Signal Processing, 1988. ICASSP-88., + 1988 International Conference on. 4. IV-229 . 10.1109/ICASSP.2007.367205. + .. [5] `DET-Curve Plotting software for use with MATLAB `_ .. _zero_one_loss: diff --git a/examples/model_selection/plot_det.py b/examples/model_selection/plot_det.py new file mode 100644 index 0000000000000..f04191da83271 --- /dev/null +++ b/examples/model_selection/plot_det.py @@ -0,0 +1,121 @@ +""" +======================================= +Detection error tradeoff (DET) curve +======================================= + +In this example we compare receiver operating characteristic (ROC) and +detection error tradeoff (DET) curves to demonstrate how DET curves can help +to asses the performance of different classification algorithms. + +DET curves are commonly plotted in normal deviate scale. +To achieve this we transform the errors rates as returned by the +``detection_error_tradeoff_curve`` function and the axis scale using +``scipy.stats.norm``. + +.. note:: + + - See :func:`sklearn.metrics.roc_curve` for further information about ROC + curves. + + - This example is loosely based on + :ref:`sphx_glr_auto_examples_classification_plot_classifier_comparison.py` + . 
+ +""" +import matplotlib.pyplot as plt + +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler +from sklearn.datasets import make_classification +from sklearn.svm import SVC +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import detection_error_tradeoff_curve +from sklearn.metrics import roc_curve + +from scipy.stats import norm +from matplotlib.ticker import FuncFormatter + +N_SAMPLES = 1000 + +names = [ + "Linear SVM", + "Random Forest", +] + +classifiers = [ + SVC(kernel="linear", C=0.025), + RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), +] + +X, y = make_classification( + n_samples=N_SAMPLES, n_features=2, n_redundant=0, n_informative=2, + random_state=1, n_clusters_per_class=1) + +figure = plt.figure(figsize=(10, 5)) + +# preprocess dataset, split into training and test part +X = StandardScaler().fit_transform(X) + +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=.4, random_state=0) + +# prepare plots + +# first plot the ROC curve +ax_roc = plt.subplot(1, 2, 1) +ax_roc.set_title('Receiver Operating Characteristic (ROC) curves') +ax_roc.set_xlabel('False Positive Rate') +ax_roc.set_ylabel('True Positive Rate') +ax_roc.set_xlim(0, 1) +ax_roc.set_ylim(0, 1) +ax_roc.grid(linestyle='--') +ax_roc.yaxis.set_major_formatter( + FuncFormatter(lambda y, _: '{:.0%}'.format(y))) +ax_roc.xaxis.set_major_formatter( + FuncFormatter(lambda y, _: '{:.0%}'.format(y))) + +# second plot the DET curve +ax_det = plt.subplot(1, 2, 2) +ax_det.set_title('Detection Error Tradeoff (DET) curves') +ax_det.set_xlabel('False Positive Rate') +ax_det.set_ylabel('False Negative Rate') +ax_det.set_xlim(-3, 3) +ax_det.set_ylim(-3, 3) +ax_det.grid(linestyle='--') +# customized ticks to represent normal deviate scale +ticks = [0.001, 0.01, 0.05, 0.20, 0.5, 0.80, 0.95, 0.99, 0.999] +tick_locs = norm.ppf(ticks) +tick_lbls = [ + '{:.0%}'.format(s) if (100*s).is_integer() else '{:.1%}'.format(s) + for s in ticks +] + +plt.xticks(tick_locs, tick_lbls) +plt.yticks(tick_locs, tick_lbls) + +# iterate over classifiers +for name, clf in zip(names, classifiers): + clf.fit(X_train, y_train) + + if hasattr(clf, "decision_function"): + y_score = clf.decision_function(X_test) + else: + y_score = clf.predict_proba(X_test)[:, 1] + + roc_fpr, roc_tpr, _ = roc_curve(y_test, y_score) + det_fpr, det_fnr, _ = detection_error_tradeoff_curve(y_test, y_score) + + ax_roc.plot(roc_fpr, roc_tpr) + + # transform errors into normal deviate scale + ax_det.plot( + norm.ppf(det_fpr), + norm.ppf(det_fnr) + ) + +# finally add legend +ax_det = plt.subplot(1, 2, 2) +plt.legend(names, loc="upper right") + +plt.tight_layout() +plt.show() From dc41e08e57ac0576f9bcaaf23574b4d154ab4fb8 Mon Sep 17 00:00:00 2001 From: daniel Date: Mon, 19 Feb 2018 09:55:32 +0100 Subject: [PATCH 07/36] Update DET-curve documentation - Fix typos and some grammar improvements. - Use named references to avoid potential conflicts with other sections. - Remove unneeded references and improved existing ones by using e.g. using versioned links. 
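
(A standalone sketch of the normal-deviate scaling used in the ``plot_det.py`` example
above: error rates are passed through the inverse of the standard normal CDF,
``scipy.stats.norm.ppf``; the error-rate values below are toy numbers, not taken from
the patch)::

    import numpy as np
    from scipy.stats import norm

    error_rates = np.array([0.001, 0.05, 0.5, 0.95])
    deviates = norm.ppf(error_rates)  # inverse CDF maps (0, 1) onto the real line
    # 0.5 maps to 0.0, while small error rates are pushed far out,
    # which spreads the critical operating region across the DET plot
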
--- doc/modules/model_evaluation.rst | 43 ++++++++++++++++---------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 81dafaf99028d..daa07627e834d 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1445,9 +1445,8 @@ Detection error tradeoff (DET) ------------------------------ The function :func:`detection_error_tradeoff_curve` computes the -`detection error tradeoff curve (DET) curve -`_. -Quoting Wikipedia : +detection error tradeoff curve (DET) curve [WikipediaDET2017]_. +Quoting Wikipedia: "A detection error tradeoff (DET) graph is a graphical plot of error rates for binary classification systems, plotting false reject rate vs. false accept @@ -1477,12 +1476,12 @@ same classification task: * DET curves form a linear curve in normal deviate scale if the detection scores are normally (or close-to normally) distributed. - It was shown by [4]_ that the reverse it not necessarily true and even more + It was shown by [Navratil2007]_ that the reverse it not necessarily true and even more general distributions are able produce linear DET curves. * The normal deviate scale transformation spreads out the points such that a comparatively larger space of plot is occupied. - Therefor curves with similar classification performance might be easier to + Therefore curves with similar classification performance might be easier to distinguish on a DET plot. * With False Negative Rate being "inverse" to True Positive Rate the point @@ -1493,11 +1492,12 @@ same classification task: DET curves are intuitive to read and hence allow quick visual assessment of a classifiers performance. -Additionally DET curves can be consulted for threshold and operating point -analysis if an comparison error types is required. +Additionally DET curves can be consulted for threshold analysis and operating +point selection. +This is particularly helpful if a comparison of error types is required. One the other hand DET curves do not provide their metric as a single number. -Therefor for either automated evaluation or comparison to other +Therefore for either automated evaluation or comparison to other classification tasks metrics like the derived area under ROC curve might be better suited. @@ -1509,19 +1509,20 @@ better suited. .. topic:: References: - .. [1] `Wikipedia entry for Detection error tradeoff - `_ - .. [2] A. Martin, G. Doddington, T. Kamm, M. Ordowski, and M. Przybocki, - `The DET Curve in Assessment of Detection Task Performance - `_, - NIST 1997. - .. [3] `2008 NIST Speaker Recognition Evaluation Results - `_ - .. [4] Navratil, Jiri & Klusacek, Dalibor. (2007). On Linear DETs. - Acoustics, Speech, and Signal Processing, 1988. ICASSP-88., - 1988 International Conference on. 4. IV-229 . 10.1109/ICASSP.2007.367205. - .. [5] `DET-Curve Plotting software for use with MATLAB - `_ + .. [WikipediaDET2017] Wikipedia contributors. Detection error tradeoff. + Wikipedia, The Free Encyclopedia. September 4, 2017, 23:33 UTC. + Available at: https://en.wikipedia.org/w/index.php?title=Detection_error_tradeoff&oldid=798982054. + Accessed February 19, 2018. + .. [Martin1997] A. Martin, G. Doddington, T. Kamm, M. Ordowski, and M. Przybocki, + `The DET Curve in Assessment of Detection Task Performance + `_, + NIST 1997. + .. [Navratil2007] J. Navractil and D. 
Klusacek, + "`On Linear DETs, + `_" + 2007 IEEE International Conference on Acoustics, + Speech and Signal Processing - ICASSP '07, Honolulu, + HI, 2007, pp. IV-229-IV-232. .. _zero_one_loss: From 6a2fc6064d7fc41d485555901c5fb257eae32f5f Mon Sep 17 00:00:00 2001 From: daniel Date: Mon, 19 Feb 2018 13:25:45 +0100 Subject: [PATCH 08/36] Select relevant DET points using slice object --- sklearn/metrics/ranking.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 38cd944334110..6b935d8ea5bf3 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -283,12 +283,22 @@ def detection_error_tradeoff_curve(y_true, y_score, pos_label=None, p_count = tps[-1] n_count = fps[-1] - # start with false positives is zero and stop with false negatives zero - # and reverse the outputs so list of false positives is decreasing + # start with false positives zero + first_ind = ( + fps.searchsorted(fps[0], side='right') - 1 + if fps.searchsorted(fps[0], side='right') > 0 + else None + ) + # stop with false negatives zero last_ind = tps.searchsorted(tps[-1]) + 1 - first_ind = fps[::-1].searchsorted(fps[0]) - sl = range(first_ind, last_ind)[::-1] - return fps[sl] / n_count, fns[sl] / p_count, thresholds[sl] + sl = slice(first_ind, last_ind) + + # reverse the output such that list of false positives is decreasing + return ( + fps[sl][::-1] / n_count, + fns[sl][::-1] / p_count, + thresholds[sl][::-1] + ) def roc_auc_score(y_true, y_score, average="macro", sample_weight=None): From d4d3c4c2e8e7eb9af64bdeb4c306ba2816bbc590 Mon Sep 17 00:00:00 2001 From: daniel Date: Tue, 20 Feb 2018 19:41:44 +0100 Subject: [PATCH 09/36] Remove some dubiety from DET curve doc-string --- sklearn/metrics/ranking.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 6b935d8ea5bf3..b40aaa15e9b15 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -221,7 +221,8 @@ def detection_error_tradeoff_curve(y_true, y_score, pos_label=None, sample_weight=None): """Compute error rates for different probability thresholds. - Note: This implementation is restricted to the binary classification task. + Note: This metrics is used for ranking evaluation of the binary + classification task. Read more in the :ref:`User Guide `. From 293512618d3bbbc9b37bece97254ab5aa3e97347 Mon Sep 17 00:00:00 2001 From: daniel Date: Wed, 21 Feb 2018 18:23:44 +0100 Subject: [PATCH 10/36] Add DET curve contributors --- doc/whats_new/v0.20.rst | 12 ++++++++++++ doc/whats_new/v0.24.rst | 5 +++++ 2 files changed, 17 insertions(+) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 2eaf3199fbc3c..f6044c7ca6140 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -852,8 +852,20 @@ Support for Python 3.3 has been officially dropped. :user:`Guillaume Lemaitre `. +<<<<<<< HEAD :mod:`sklearn.feature_selection` ................................ +======= +- Added the :func:`metrics.balanced_accuracy_score` metric and a corresponding + ``'balanced_accuracy'`` scorer for binary classification. + :issue:`8066` by :user:`xyguo` and :user:`Aman Dalmia `. +- Added :class:`multioutput.RegressorChain` for multi-target + regression. :issue:`9257` by :user:`Kumar Ashutosh `. +- Added :func:`metrics.detection_error_tradeoff_curve` to compute + Detection Error Tradeoff curve classification metric. 
+ :issue:`10591` by :user:`Jeremy Karnowski ` and + :user:`Daniel Mohns `. +>>>>>>> Add DET curve contributors - |Feature| Added select K best features functionality to :class:`feature_selection.SelectFromModel`. diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index cf3347c0ee8cd..e834be6b1f962 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -212,6 +212,11 @@ Changelog :mod:`sklearn.metrics` ...................... +- |Feature| Added :func:`metrics.detection_error_tradeoff_curve` to compute + Detection Error Tradeoff curve classification metric. + :issue:`10591` by :user:`Jeremy Karnowski ` and + :user:`Daniel Mohns `. + - |Feature| Added :func:`metrics.mean_absolute_percentage_error` metric and the associated scorer for regression problems. :issue:`10708` fixed with the PR :pr:`15007` by :user:`Ashutosh Hathidara `. The scorer and From 3b198b26cf155e224efba7e8f501ab9a4e1eef3c Mon Sep 17 00:00:00 2001 From: daniel Date: Sun, 25 Feb 2018 20:29:33 +0100 Subject: [PATCH 11/36] Add tests for DET curves --- sklearn/metrics/tests/test_ranking.py | 95 +++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 3daafa8d196d3..92fa35f9f102a 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -21,6 +21,7 @@ from sklearn.metrics import auc from sklearn.metrics import average_precision_score from sklearn.metrics import coverage_error +from sklearn.metrics import detection_error_tradeoff_curve from sklearn.metrics import label_ranking_average_precision_score from sklearn.metrics import precision_recall_curve from sklearn.metrics import label_ranking_loss @@ -925,6 +926,100 @@ def test_score_scale_invariance(): assert pr_auc == pr_auc_shifted +def check_detection_error_tradeoff_curve(test_case): + # Test detection_error_tradeoff_curve on an array of: + # [y_true, y_score, correct_fpr ,correct_fnr] + fpr, fnr, _ = detection_error_tradeoff_curve( + test_case[0], test_case[1]) + + assert_array_almost_equal(fpr, test_case[2]) + assert_array_almost_equal(fnr, test_case[3]) + + +def test_detection_error_tradeoff_curve_toydata(): + # Check on a batch of small examples. 
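+    # For example, with y_true = [0, 1, 0] and y_score = [0, 0.5, 1] the
+    # reported operating point is the threshold 0.5: the negative sample
+    # scored 1 is accepted (1 of 2 negatives, so fpr = 0.5) while the single
+    # positive sample scored 0.5 is also accepted (fnr = 0).
+    # Perfectly ranked cases such as y_true = [0, 0, 1], y_score = [0, 0.5, 1]
+    # reduce to a single error-free point, hence fpr = [0] and fnr = [0].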
+ test_cases = [ + [[0, 0, 1], [0, 0.5, 1], [0], [0]], + [[0, 0, 1], [0, 0.25, 0.5], [0], [0]], + [[0, 0, 1], [0.5, 0.75, 1], [0], [0]], + [[0, 0, 1], [0.25, 0.5, 0.75], [0], [0]], + [[0, 1, 0], [0, 0.5, 1], [0.5], [0]], + [[0, 1, 0], [0, 0.25, 0.5], [0.5], [0]], + [[0, 1, 0], [0.5, 0.75, 1], [0.5], [0]], + [[0, 1, 0], [0.25, 0.5, 0.75], [0.5], [0]], + [[0, 1, 1], [0, 0.5, 1], [0.0], [0]], + [[0, 1, 1], [0, 0.25, 0.5], [0], [0]], + [[0, 1, 1], [0.5, 0.75, 1], [0], [0]], + [[0, 1, 1], [0.25, 0.5, 0.75], [0], [0]], + [[1, 0, 0], [0, 0.5, 1], [1, 1, 0.5], [0, 1, 1]], + [[1, 0, 0], [0, 0.25, 0.5], [1, 1, 0.5], [0, 1, 1]], + [[1, 0, 0], [0.5, 0.75, 1], [1, 1, 0.5], [0, 1, 1]], + [[1, 0, 0], [0.25, 0.5, 0.75], [1, 1, 0.5], [0, 1, 1]], + [[1, 0, 1], [0, 0.5, 1], [1, 1, 0], [0, 0.5, 0.5]], + [[1, 0, 1], [0, 0.25, 0.5], [1, 1, 0], [0, 0.5, 0.5]], + [[1, 0, 1], [0.5, 0.75, 1], [1, 1, 0], [0, 0.5, 0.5]], + [[1, 0, 1], [0.25, 0.5, 0.75], [1, 1, 0], [0, 0.5, 0.5]], + ] + + for test_case in test_cases: + check_detection_error_tradeoff_curve(test_case) + + +def test_detection_error_tradeoff_curve_tie_handling(): + test_cases = [ + [[1, 0], [0.5, 0.5], [1], [0]], + [[0, 1], [0.5, 0.5], [1], [0]], + [[0, 0, 1], [0.25, 0.5, 0.5], [0.5], [0]], + [[0, 1, 0], [0.25, 0.5, 0.5], [0.5], [0]], + [[0, 1, 1], [0.25, 0.5, 0.5], [0], [0]], + [[1, 0, 0], [0.25, 0.5, 0.5], [1], [0]], + [[1, 0, 1], [0.25, 0.5, 0.5], [1], [0]], + [[1, 1, 0], [0.25, 0.5, 0.5], [1], [0]], + ] + + for test_case in test_cases: + check_detection_error_tradeoff_curve(test_case) + + # Special case: exactly duplicated inputs yield the same result. + assert_array_almost_equal( + detection_error_tradeoff_curve([0, 0, 1], [0, 0.5, 1]), + detection_error_tradeoff_curve( + [0, 0, 0, 0, 1, 1], [0, 0, 0.5, 0.5, 1, 1]) + ) + + # Check computation with perfect and constant scores. + for score in [0, 0.25, 0.5, 0.75, 1]: + check_detection_error_tradeoff_curve([ + [0, 1, 0, 1, 0, 1], score * np.ones(6), [1], [0] + ]) + + +def test_detection_error_tradeoff_curve_bad_input(): + # input variables with inconsistent numbers of samples + assert_raises( + ValueError, + detection_error_tradeoff_curve, + [0, 1], [0, 0.5, 1] + ) + assert_raises( + ValueError, + detection_error_tradeoff_curve, + [0, 1, 1], [0, 0.5] + ) + + # When the true_y values are all the same a detection error tradeoff cannot + # be computed. 
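+    # (With a single class in y_true, either the positive or the negative
+    # total used to normalise the error counts is zero, so the division
+    # inside detection_error_tradeoff_curve is invalid; np.errstate(all="raise")
+    # surfaces that as a FloatingPointError.)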
+ with np.errstate(all="raise"): + assert_raises( + FloatingPointError, + detection_error_tradeoff_curve, [0, 0, 0], [0, 0.5, 1] + ) + assert_raises( + FloatingPointError, + detection_error_tradeoff_curve, [1, 1, 1], [0, 0.5, 1] + ) + + def check_lrap_toy(lrap_score): # Check on several small example that it works assert_almost_equal(lrap_score([[0, 1]], [[0.25, 0.75]]), 1) From d2c4a7ee25c2bd804e6052590855726820f1f73d Mon Sep 17 00:00:00 2001 From: daniel Date: Sun, 25 Feb 2018 23:31:14 +0100 Subject: [PATCH 12/36] Streamline DET test by using parametrization --- sklearn/metrics/tests/test_ranking.py | 123 ++++++++++++++------------ 1 file changed, 66 insertions(+), 57 deletions(-) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 92fa35f9f102a..b46c01fee8577 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -4,6 +4,8 @@ import warnings from scipy.sparse import csr_matrix +import pytest + from sklearn import datasets from sklearn import svm @@ -926,72 +928,79 @@ def test_score_scale_invariance(): assert pr_auc == pr_auc_shifted -def check_detection_error_tradeoff_curve(test_case): - # Test detection_error_tradeoff_curve on an array of: - # [y_true, y_score, correct_fpr ,correct_fnr] - fpr, fnr, _ = detection_error_tradeoff_curve( - test_case[0], test_case[1]) +@pytest.mark.parametrize("y_true,y_score,expected_fpr,expected_fnr", [ + ([0, 0, 1], [0, 0.5, 1], [0], [0]), + ([0, 0, 1], [0, 0.25, 0.5], [0], [0]), + ([0, 0, 1], [0.5, 0.75, 1], [0], [0]), + ([0, 0, 1], [0.25, 0.5, 0.75], [0], [0]), + ([0, 1, 0], [0, 0.5, 1], [0.5], [0]), + ([0, 1, 0], [0, 0.25, 0.5], [0.5], [0]), + ([0, 1, 0], [0.5, 0.75, 1], [0.5], [0]), + ([0, 1, 0], [0.25, 0.5, 0.75], [0.5], [0]), + ([0, 1, 1], [0, 0.5, 1], [0.0], [0]), + ([0, 1, 1], [0, 0.25, 0.5], [0], [0]), + ([0, 1, 1], [0.5, 0.75, 1], [0], [0]), + ([0, 1, 1], [0.25, 0.5, 0.75], [0], [0]), + ([1, 0, 0], [0, 0.5, 1], [1, 1, 0.5], [0, 1, 1]), + ([1, 0, 0], [0, 0.25, 0.5], [1, 1, 0.5], [0, 1, 1]), + ([1, 0, 0], [0.5, 0.75, 1], [1, 1, 0.5], [0, 1, 1]), + ([1, 0, 0], [0.25, 0.5, 0.75], [1, 1, 0.5], [0, 1, 1]), + ([1, 0, 1], [0, 0.5, 1], [1, 1, 0], [0, 0.5, 0.5]), + ([1, 0, 1], [0, 0.25, 0.5], [1, 1, 0], [0, 0.5, 0.5]), + ([1, 0, 1], [0.5, 0.75, 1], [1, 1, 0], [0, 0.5, 0.5]), + ([1, 0, 1], [0.25, 0.5, 0.75], [1, 1, 0], [0, 0.5, 0.5]), +]) +def test_detection_error_tradeoff_curve_toydata( + y_true, y_score, expected_fpr, expected_fnr +): + # Check on a batch of small examples. + fpr, fnr, _ = detection_error_tradeoff_curve(y_true, y_score) - assert_array_almost_equal(fpr, test_case[2]) - assert_array_almost_equal(fnr, test_case[3]) + assert_array_almost_equal(fpr, expected_fpr) + assert_array_almost_equal(fnr, expected_fnr) -def test_detection_error_tradeoff_curve_toydata(): - # Check on a batch of small examples. 
- test_cases = [ - [[0, 0, 1], [0, 0.5, 1], [0], [0]], - [[0, 0, 1], [0, 0.25, 0.5], [0], [0]], - [[0, 0, 1], [0.5, 0.75, 1], [0], [0]], - [[0, 0, 1], [0.25, 0.5, 0.75], [0], [0]], - [[0, 1, 0], [0, 0.5, 1], [0.5], [0]], - [[0, 1, 0], [0, 0.25, 0.5], [0.5], [0]], - [[0, 1, 0], [0.5, 0.75, 1], [0.5], [0]], - [[0, 1, 0], [0.25, 0.5, 0.75], [0.5], [0]], - [[0, 1, 1], [0, 0.5, 1], [0.0], [0]], - [[0, 1, 1], [0, 0.25, 0.5], [0], [0]], - [[0, 1, 1], [0.5, 0.75, 1], [0], [0]], - [[0, 1, 1], [0.25, 0.5, 0.75], [0], [0]], - [[1, 0, 0], [0, 0.5, 1], [1, 1, 0.5], [0, 1, 1]], - [[1, 0, 0], [0, 0.25, 0.5], [1, 1, 0.5], [0, 1, 1]], - [[1, 0, 0], [0.5, 0.75, 1], [1, 1, 0.5], [0, 1, 1]], - [[1, 0, 0], [0.25, 0.5, 0.75], [1, 1, 0.5], [0, 1, 1]], - [[1, 0, 1], [0, 0.5, 1], [1, 1, 0], [0, 0.5, 0.5]], - [[1, 0, 1], [0, 0.25, 0.5], [1, 1, 0], [0, 0.5, 0.5]], - [[1, 0, 1], [0.5, 0.75, 1], [1, 1, 0], [0, 0.5, 0.5]], - [[1, 0, 1], [0.25, 0.5, 0.75], [1, 1, 0], [0, 0.5, 0.5]], - ] - - for test_case in test_cases: - check_detection_error_tradeoff_curve(test_case) - - -def test_detection_error_tradeoff_curve_tie_handling(): - test_cases = [ - [[1, 0], [0.5, 0.5], [1], [0]], - [[0, 1], [0.5, 0.5], [1], [0]], - [[0, 0, 1], [0.25, 0.5, 0.5], [0.5], [0]], - [[0, 1, 0], [0.25, 0.5, 0.5], [0.5], [0]], - [[0, 1, 1], [0.25, 0.5, 0.5], [0], [0]], - [[1, 0, 0], [0.25, 0.5, 0.5], [1], [0]], - [[1, 0, 1], [0.25, 0.5, 0.5], [1], [0]], - [[1, 1, 0], [0.25, 0.5, 0.5], [1], [0]], - ] - - for test_case in test_cases: - check_detection_error_tradeoff_curve(test_case) - - # Special case: exactly duplicated inputs yield the same result. +@pytest.mark.parametrize("y_true,y_score,expected_fpr,expected_fnr", [ + ([1, 0], [0.5, 0.5], [1], [0]), + ([0, 1], [0.5, 0.5], [1], [0]), + ([0, 0, 1], [0.25, 0.5, 0.5], [0.5], [0]), + ([0, 1, 0], [0.25, 0.5, 0.5], [0.5], [0]), + ([0, 1, 1], [0.25, 0.5, 0.5], [0], [0]), + ([1, 0, 0], [0.25, 0.5, 0.5], [1], [0]), + ([1, 0, 1], [0.25, 0.5, 0.5], [1], [0]), + ([1, 1, 0], [0.25, 0.5, 0.5], [1], [0]), +]) +def test_detection_error_tradeoff_curve_tie_handling( + y_true, y_score, expected_fpr, expected_fnr +): + fpr, fnr, _ = detection_error_tradeoff_curve(y_true, y_score) + + assert_array_almost_equal(fpr, expected_fpr) + assert_array_almost_equal(fnr, expected_fnr) + + +def test_detection_error_tradeoff_curve_sanity_check(): + # Exactly duplicated inputs yield the same result. assert_array_almost_equal( detection_error_tradeoff_curve([0, 0, 1], [0, 0.5, 1]), detection_error_tradeoff_curve( [0, 0, 0, 0, 1, 1], [0, 0, 0.5, 0.5, 1, 1]) ) + +@pytest.mark.parametrize("y_score", [ + (0), (0.25), (0.5), (0.75), (1) +]) +def test_detection_error_tradeoff_curve_constant_scores(y_score): # Check computation with perfect and constant scores. - for score in [0, 0.25, 0.5, 0.75, 1]: - check_detection_error_tradeoff_curve([ - [0, 1, 0, 1, 0, 1], score * np.ones(6), [1], [0] - ]) + fpr, fnr, threshold = detection_error_tradeoff_curve( + y_true=[0, 1, 0, 1, 0, 1], + y_score=y_score * np.ones(6) + ) + + assert_array_almost_equal(fpr, [1]) + assert_array_almost_equal(fnr, [0]) + assert_array_almost_equal(threshold, [y_score]) def test_detection_error_tradeoff_curve_bad_input(): @@ -1007,7 +1016,7 @@ def test_detection_error_tradeoff_curve_bad_input(): [0, 1, 1], [0, 0.5] ) - # When the true_y values are all the same a detection error tradeoff cannot + # When the y_true values are all the same a detection error tradeoff cannot # be computed. 
with np.errstate(all="raise"): assert_raises( From 82b488eb5438f8b94732c19381f298167bc9cdb3 Mon Sep 17 00:00:00 2001 From: daniel Date: Mon, 26 Feb 2018 09:11:56 +0100 Subject: [PATCH 13/36] Increase verbosity of DET curve error handling - Explicitly sanity check input before computing a DET curve. - Add test for perfect scores. - Adapt indentation style to match the test module. --- sklearn/metrics/ranking.py | 4 ++ sklearn/metrics/tests/test_ranking.py | 56 ++++++++++++++------------- 2 files changed, 34 insertions(+), 26 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index b40aaa15e9b15..2b2a59db7f048 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -276,6 +276,10 @@ def detection_error_tradeoff_curve(y_true, y_score, pos_label=None, array([ 0.35, 0.4 , 0.8 ]) """ + if len(np.unique(y_true)) != 2: + raise ValueError("Only one class present in y_true. Detection error " + "tradeoff curve is not defined in that case.") + fps, tps, thresholds = _binary_clf_curve(y_true, y_score, pos_label=pos_label, sample_weight=sample_weight) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index b46c01fee8577..46c75f3993d8a 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -950,9 +950,8 @@ def test_score_scale_invariance(): ([1, 0, 1], [0.5, 0.75, 1], [1, 1, 0], [0, 0.5, 0.5]), ([1, 0, 1], [0.25, 0.5, 0.75], [1, 1, 0], [0, 0.5, 0.5]), ]) -def test_detection_error_tradeoff_curve_toydata( - y_true, y_score, expected_fpr, expected_fnr -): +def test_detection_error_tradeoff_curve_toydata(y_true, y_score, + expected_fpr, expected_fnr): # Check on a batch of small examples. fpr, fnr, _ = detection_error_tradeoff_curve(y_true, y_score) @@ -970,9 +969,9 @@ def test_detection_error_tradeoff_curve_toydata( ([1, 0, 1], [0.25, 0.5, 0.5], [1], [0]), ([1, 1, 0], [0.25, 0.5, 0.5], [1], [0]), ]) -def test_detection_error_tradeoff_curve_tie_handling( - y_true, y_score, expected_fpr, expected_fnr -): +def test_detection_error_tradeoff_curve_tie_handling(y_true, y_score, + expected_fpr, + expected_fnr): fpr, fnr, _ = detection_error_tradeoff_curve(y_true, y_score) assert_array_almost_equal(fpr, expected_fpr) @@ -992,7 +991,6 @@ def test_detection_error_tradeoff_curve_sanity_check(): (0), (0.25), (0.5), (0.75), (1) ]) def test_detection_error_tradeoff_curve_constant_scores(y_score): - # Check computation with perfect and constant scores. 
fpr, fnr, threshold = detection_error_tradeoff_curve( y_true=[0, 1, 0, 1, 0, 1], y_score=y_score * np.ones(6) @@ -1003,30 +1001,36 @@ def test_detection_error_tradeoff_curve_constant_scores(y_score): assert_array_almost_equal(threshold, [y_score]) +@pytest.mark.parametrize("y_true", [ + ([0, 0, 0, 0, 0, 1]), + ([0, 0, 0, 0, 1, 1]), + ([0, 0, 0, 1, 1, 1]), + ([0, 0, 1, 1, 1, 1]), + ([0, 1, 1, 1, 1, 1]), +]) +def test_detection_error_tradeoff_curve_perfect_scores(y_true): + fpr, fnr, _ = detection_error_tradeoff_curve( + y_true=y_true, + y_score=y_true + ) + + assert_array_almost_equal(fpr, [0]) + assert_array_almost_equal(fnr, [0]) + + def test_detection_error_tradeoff_curve_bad_input(): # input variables with inconsistent numbers of samples - assert_raises( - ValueError, - detection_error_tradeoff_curve, - [0, 1], [0, 0.5, 1] - ) - assert_raises( - ValueError, - detection_error_tradeoff_curve, - [0, 1, 1], [0, 0.5] - ) + assert_raises(ValueError, detection_error_tradeoff_curve, + [0, 1], [0, 0.5, 1]) + assert_raises(ValueError, detection_error_tradeoff_curve, + [0, 1, 1], [0, 0.5]) # When the y_true values are all the same a detection error tradeoff cannot # be computed. - with np.errstate(all="raise"): - assert_raises( - FloatingPointError, - detection_error_tradeoff_curve, [0, 0, 0], [0, 0.5, 1] - ) - assert_raises( - FloatingPointError, - detection_error_tradeoff_curve, [1, 1, 1], [0, 0.5, 1] - ) + assert_raises(ValueError, detection_error_tradeoff_curve, + [0, 0, 0], [0, 0.5, 1]) + assert_raises(ValueError, detection_error_tradeoff_curve, + [1, 1, 1], [0, 0.5, 1]) def check_lrap_toy(lrap_score): From 4dbaaa327b71c5d6f28f277f4c86feefef4a2943 Mon Sep 17 00:00:00 2001 From: daniel Date: Mon, 26 Feb 2018 10:44:11 +0100 Subject: [PATCH 14/36] Add reference for DET curves in invariance test --- sklearn/metrics/tests/test_common.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 3f2ba83b474c7..e03fd07fe03ea 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -58,7 +58,17 @@ from sklearn.metrics import ndcg_score from sklearn.metrics import dcg_score +<<<<<<< HEAD from sklearn.metrics._base import _average_binary_score +======= +# TODO Curve are currently not covered by invariance test +# from sklearn.metrics import precision_recall_curve +# from sklearn.metrics import roc_curve +# from sklearn.metrics import detection_error_tradeoff_curve + + +from sklearn.metrics.base import _average_binary_score +>>>>>>> Add reference for DET curves in invariance test # Note toward developers about metric testing From 3e4f7c22732210115aee11ed31fbdb30e2e0beee Mon Sep 17 00:00:00 2001 From: Daniel Mohns Date: Thu, 29 Nov 2018 16:40:24 +0100 Subject: [PATCH 15/36] Add automated invariance checks for DET curves --- sklearn/metrics/tests/test_common.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index e03fd07fe03ea..08e82d864832b 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -29,6 +29,7 @@ from sklearn.metrics import cohen_kappa_score from sklearn.metrics import confusion_matrix from sklearn.metrics import coverage_error +from sklearn.metrics import detection_error_tradeoff_curve from sklearn.metrics import explained_variance_score from sklearn.metrics import f1_score from sklearn.metrics import fbeta_score @@ -215,6 +216,7 @@ def 
precision_recall_curve_padded_thresholds(*args, **kwargs): CURVE_METRICS = { "roc_curve": roc_curve, "precision_recall_curve": precision_recall_curve_padded_thresholds, + "detection_error_tradeoff_curve": detection_error_tradeoff_curve, } THRESHOLDED_METRICS = { @@ -311,6 +313,7 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): # curves "roc_curve", "precision_recall_curve", + "detection_error_tradeoff_curve", } # Metric undefined with "binary" or "multiclass" input @@ -332,6 +335,7 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): METRICS_WITH_POS_LABEL = { "roc_curve", "precision_recall_curve", + "detection_error_tradeoff_curve", "brier_score_loss", @@ -362,6 +366,7 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): "normalized_confusion_matrix", "roc_curve", "precision_recall_curve", + "detection_error_tradeoff_curve", "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score", "jaccard_score", @@ -474,6 +479,7 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): "normalized_confusion_matrix", "roc_curve", "precision_recall_curve", + "detection_error_tradeoff_curve", "precision_score", "recall_score", "f2_score", "f0.5_score", From aede9550949acb8db75ce55f2c49e31291a40144 Mon Sep 17 00:00:00 2001 From: Daniel Mohns Date: Thu, 29 Nov 2018 16:50:57 +0100 Subject: [PATCH 16/36] Resolve merge artifacts --- sklearn/metrics/tests/test_ranking.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 46c75f3993d8a..b24a848147155 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -4,8 +4,6 @@ import warnings from scipy.sparse import csr_matrix -import pytest - from sklearn import datasets from sklearn import svm From 3207a7f5f525cb4e97fb226a7492b6199a0a4312 Mon Sep 17 00:00:00 2001 From: Daniel Mohns Date: Thu, 29 Nov 2018 17:51:00 +0100 Subject: [PATCH 17/36] Make doctest happy --- sklearn/metrics/ranking.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 2b2a59db7f048..e288643ee4070 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -269,11 +269,11 @@ def detection_error_tradeoff_curve(y_true, y_score, pos_label=None, >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) >>> fpr, fnr, thresholds = detection_error_tradeoff_curve(y_true, y_scores) >>> fpr - array([ 0.5, 0.5, 0. ]) + array([0.5, 0.5, 0. ]) >>> fnr - array([ 0. , 0.5, 0.5]) + array([0. , 0.5, 0.5 ]) >>> thresholds - array([ 0.35, 0.4 , 0.8 ]) + array([0.35, 0.4 , 0.8 ]) """ if len(np.unique(y_true)) != 2: From 6cdc535c8b76de2c00b60fbb8f512717d5d04251 Mon Sep 17 00:00:00 2001 From: Daniel Mohns Date: Fri, 30 Nov 2018 10:42:27 +0100 Subject: [PATCH 18/36] Fix whitespaces for doctest --- sklearn/metrics/ranking.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index e288643ee4070..588e77acb4b86 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -269,11 +269,11 @@ def detection_error_tradeoff_curve(y_true, y_score, pos_label=None, >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) >>> fpr, fnr, thresholds = detection_error_tradeoff_curve(y_true, y_scores) >>> fpr - array([0.5, 0.5, 0. ]) + array([0.5, 0.5, 0. ]) >>> fnr - array([0. , 0.5, 0.5 ]) + array([0. 
, 0.5, 0.5]) >>> thresholds - array([0.35, 0.4 , 0.8 ]) + array([0.35, 0.4 , 0.8 ]) """ if len(np.unique(y_true)) != 2: From 68ebbd967a66e5b6c2799e44f7ae1f674105add0 Mon Sep 17 00:00:00 2001 From: Daniel Mohns Date: Sun, 30 Dec 2018 16:34:43 +0100 Subject: [PATCH 19/36] Revert unintended whitespace changes --- doc/whats_new/v0.21.rst | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 94099723dd0ec..b2dd1a595119e 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -259,6 +259,7 @@ Changelog Support for Python 3.4 and below has been officially dropped. .. +<<<<<<< HEAD Entries should be grouped by module (in alphabetic order) and prefixed with one of the labels: |MajorFeature|, |Feature|, |Efficiency|, |Enhancement|, |Fix| or |API| (see whats_new.rst for descriptions). @@ -291,6 +292,15 @@ Support for Python 3.4 and below has been officially dropped. `calibration.CalibratedClassifierCV`. :pr:`13485` by :user:`William de Vazelhes `. +======= + See version doc/whats_new/v0.20.rst for structure. Entries should be + prefixed with one of the labels: |MajorFeature|, |Feature|, |Efficiency|, + |Enhancement|, |Fix| or |API|. They should be under a heading for the + relevant module (or *Multiple Modules* or *Miscellaneous*), and within each + section should be ordered according to the label ordering above. Entries + should end with: :issue:`123456` by :user:`Joe Bloggs `. + +>>>>>>> Revert unintended whitespace changes :mod:`sklearn.cluster` ...................... @@ -781,6 +791,7 @@ Support for Python 3.4 and below has been officially dropped. :pr:`13447` by :user:`Dan Ellis `. - |API| The parameter ``labels`` in :func:`metrics.hamming_loss` is deprecated +<<<<<<< HEAD in version 0.21 and will be removed in version 0.23. :pr:`10580` by :user:`Reshama Shaikh ` and :user:`Sandra Mitrovic `. @@ -803,6 +814,11 @@ Support for Python 3.4 and below has been officially dropped. :class:`mixture.BayesianGaussianMixture`, where ``fit_predict`` and ``fit.predict`` were not equivalent. :pr:`13142` by :user:`Jérémie du Boisberranger `. +======= + in version 0.21 and will be removed in version 0.23. + :issue:`10580` by :user:`Reshama Shaikh ` and `Sandra + Mitrovic `. +>>>>>>> Revert unintended whitespace changes :mod:`sklearn.model_selection` From 394bd47899b5e0e308b61ea1f29218e5a8b70326 Mon Sep 17 00:00:00 2001 From: Daniel Mohns Date: Sun, 30 Dec 2018 16:38:00 +0100 Subject: [PATCH 20/36] Revert unintended white space changes #2 --- doc/whats_new/v0.21.rst | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index b2dd1a595119e..94099723dd0ec 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -259,7 +259,6 @@ Changelog Support for Python 3.4 and below has been officially dropped. .. -<<<<<<< HEAD Entries should be grouped by module (in alphabetic order) and prefixed with one of the labels: |MajorFeature|, |Feature|, |Efficiency|, |Enhancement|, |Fix| or |API| (see whats_new.rst for descriptions). @@ -292,15 +291,6 @@ Support for Python 3.4 and below has been officially dropped. `calibration.CalibratedClassifierCV`. :pr:`13485` by :user:`William de Vazelhes `. -======= - See version doc/whats_new/v0.20.rst for structure. Entries should be - prefixed with one of the labels: |MajorFeature|, |Feature|, |Efficiency|, - |Enhancement|, |Fix| or |API|. 
They should be under a heading for the - relevant module (or *Multiple Modules* or *Miscellaneous*), and within each - section should be ordered according to the label ordering above. Entries - should end with: :issue:`123456` by :user:`Joe Bloggs `. - ->>>>>>> Revert unintended whitespace changes :mod:`sklearn.cluster` ...................... @@ -791,7 +781,6 @@ Support for Python 3.4 and below has been officially dropped. :pr:`13447` by :user:`Dan Ellis `. - |API| The parameter ``labels`` in :func:`metrics.hamming_loss` is deprecated -<<<<<<< HEAD in version 0.21 and will be removed in version 0.23. :pr:`10580` by :user:`Reshama Shaikh ` and :user:`Sandra Mitrovic `. @@ -814,11 +803,6 @@ Support for Python 3.4 and below has been officially dropped. :class:`mixture.BayesianGaussianMixture`, where ``fit_predict`` and ``fit.predict`` were not equivalent. :pr:`13142` by :user:`Jérémie du Boisberranger `. -======= - in version 0.21 and will be removed in version 0.23. - :issue:`10580` by :user:`Reshama Shaikh ` and `Sandra - Mitrovic `. ->>>>>>> Revert unintended whitespace changes :mod:`sklearn.model_selection` From d10be269e552fb1f227b0ed9fdd0164a7b5e3c86 Mon Sep 17 00:00:00 2001 From: Daniel Mohns Date: Sun, 30 Dec 2018 16:43:40 +0100 Subject: [PATCH 21/36] Fix typos and grammar --- doc/modules/model_evaluation.rst | 2 +- examples/model_selection/plot_det.py | 4 ++-- sklearn/metrics/ranking.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index daa07627e834d..1ddd81a62833a 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1491,7 +1491,7 @@ same classification task: **Applications and limitations:** DET curves are intuitive to read and hence allow quick visual assessment of a -classifiers performance. +classifier's performance. Additionally DET curves can be consulted for threshold analysis and operating point selection. This is particularly helpful if a comparison of error types is required. diff --git a/examples/model_selection/plot_det.py b/examples/model_selection/plot_det.py index f04191da83271..8511f1d7cac96 100644 --- a/examples/model_selection/plot_det.py +++ b/examples/model_selection/plot_det.py @@ -61,7 +61,7 @@ # prepare plots -# first plot the ROC curve +# first prepare the ROC curve ax_roc = plt.subplot(1, 2, 1) ax_roc.set_title('Receiver Operating Characteristic (ROC) curves') ax_roc.set_xlabel('False Positive Rate') @@ -74,7 +74,7 @@ ax_roc.xaxis.set_major_formatter( FuncFormatter(lambda y, _: '{:.0%}'.format(y))) -# second plot the DET curve +# second prepare the DET curve ax_det = plt.subplot(1, 2, 2) ax_det.set_title('Detection Error Tradeoff (DET) curves') ax_det.set_xlabel('False Positive Rate') diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 588e77acb4b86..ab9285ce38f63 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -221,7 +221,7 @@ def detection_error_tradeoff_curve(y_true, y_score, pos_label=None, sample_weight=None): """Compute error rates for different probability thresholds. - Note: This metrics is used for ranking evaluation of the binary + Note: This metrics is used for ranking evaluation of a binary classification task. Read more in the :ref:`User Guide `. 
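Since the false negative rate is simply the complement of the true positive
rate (FNR = 1 - TPR at every threshold), DET-style coordinates can also be
sketched from ``roc_curve`` output. A rough illustration on the toy data used
throughout (note that ``detection_error_tradeoff_curve`` trims and orders its
arrays differently, so the outputs are not expected to match element for
element)::

    import numpy as np
    from sklearn.metrics import roc_curve

    y_true = np.array([0, 0, 1, 1])
    y_score = np.array([0.1, 0.4, 0.35, 0.8])

    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    fnr = 1 - tpr  # false negative rate as the complement of the true positive rate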
From b0c267e452bef9e25855fec7498c902343407882 Mon Sep 17 00:00:00 2001 From: Daniel Mohns Date: Sun, 30 Dec 2018 16:46:23 +0100 Subject: [PATCH 22/36] Fix white space in doc --- doc/modules/model_evaluation.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 1ddd81a62833a..c8168682ebfe4 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1459,8 +1459,8 @@ DET curves are a variation of receiver operating characteristic (ROC) curves where False Negative Rate is plotted on the ordinate instead of True Positive Rate. DET curves are commonly plotted in normal deviate scale by transformation with -:math:`\phi^{-1}` (with :math:`\phi` being the cumulative distribution function) -. +:math:`\phi^{-1}` (with :math:`\phi` being the cumulative distribution +function). The resulting performance curves explicitly visualize the tradeoff of error types for given classification algorithms. From f4381284c75645a741a5b1d6eb5997491d354e2e Mon Sep 17 00:00:00 2001 From: Daniel Mohns Date: Sun, 30 Dec 2018 16:55:26 +0100 Subject: [PATCH 23/36] Streamline test code --- sklearn/metrics/tests/test_ranking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index b24a848147155..7f1d7f38c4caf 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -991,7 +991,7 @@ def test_detection_error_tradeoff_curve_sanity_check(): def test_detection_error_tradeoff_curve_constant_scores(y_score): fpr, fnr, threshold = detection_error_tradeoff_curve( y_true=[0, 1, 0, 1, 0, 1], - y_score=y_score * np.ones(6) + y_score=np.full(6, y_score) ) assert_array_almost_equal(fpr, [1]) From e733ff5b80bfbce5f55ddb1fdf43a97d7197b73b Mon Sep 17 00:00:00 2001 From: daniel Date: Sat, 18 Jul 2020 18:33:20 +0200 Subject: [PATCH 24/36] Remove rebase artifacts --- doc/modules/model_evaluation.rst | 1 - doc/whats_new/v0.20.rst | 12 - sklearn/metrics/ranking.py | 917 --------------------------- sklearn/metrics/tests/test_common.py | 10 - 4 files changed, 940 deletions(-) delete mode 100644 sklearn/metrics/ranking.py diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index c8168682ebfe4..2592ba8db732d 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -306,7 +306,6 @@ Some of these are restricted to the binary classification case: precision_recall_curve roc_curve - balanced_accuracy_score detection_error_tradeoff_curve diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index f6044c7ca6140..2eaf3199fbc3c 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -852,20 +852,8 @@ Support for Python 3.3 has been officially dropped. :user:`Guillaume Lemaitre `. -<<<<<<< HEAD :mod:`sklearn.feature_selection` ................................ -======= -- Added the :func:`metrics.balanced_accuracy_score` metric and a corresponding - ``'balanced_accuracy'`` scorer for binary classification. - :issue:`8066` by :user:`xyguo` and :user:`Aman Dalmia `. -- Added :class:`multioutput.RegressorChain` for multi-target - regression. :issue:`9257` by :user:`Kumar Ashutosh `. -- Added :func:`metrics.detection_error_tradeoff_curve` to compute - Detection Error Tradeoff curve classification metric. - :issue:`10591` by :user:`Jeremy Karnowski ` and - :user:`Daniel Mohns `. 
->>>>>>> Add DET curve contributors - |Feature| Added select K best features functionality to :class:`feature_selection.SelectFromModel`. diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py deleted file mode 100644 index ab9285ce38f63..0000000000000 --- a/sklearn/metrics/ranking.py +++ /dev/null @@ -1,917 +0,0 @@ -"""Metrics to assess performance on classification task given scores - -Functions named as ``*_score`` return a scalar value to maximize: the higher -the better - -Function named as ``*_error`` or ``*_loss`` return a scalar value to minimize: -the lower the better -""" - -# Authors: Alexandre Gramfort -# Mathieu Blondel -# Olivier Grisel -# Arnaud Joly -# Jochen Wersdorfer -# Lars Buitinck -# Joel Nothman -# Noel Dawe -# License: BSD 3 clause - -from __future__ import division - -import warnings -import numpy as np -from scipy.sparse import csr_matrix -from scipy.stats import rankdata - -from ..utils import assert_all_finite -from ..utils import check_consistent_length -from ..utils import column_or_1d, check_array -from ..utils.multiclass import type_of_target -from ..utils.extmath import stable_cumsum -from ..utils.sparsefuncs import count_nonzero -from ..exceptions import UndefinedMetricWarning -from ..preprocessing import label_binarize - -from .base import _average_binary_score - - -def auc(x, y, reorder='deprecated'): - """Compute Area Under the Curve (AUC) using the trapezoidal rule - - This is a general function, given points on a curve. For computing the - area under the ROC-curve, see :func:`roc_auc_score`. For an alternative - way to summarize a precision-recall curve, see - :func:`average_precision_score`. - - Parameters - ---------- - x : array, shape = [n] - x coordinates. These must be either monotonic increasing or monotonic - decreasing. - y : array, shape = [n] - y coordinates. - reorder : boolean, optional (default='deprecated') - Whether to sort x before computing. If False, assume that x must be - either monotonic increasing or monotonic decreasing. If True, y is - used to break ties when sorting x. Make sure that y has a monotonic - relation to x when setting reorder to True. - - .. deprecated:: 0.20 - Parameter ``reorder`` has been deprecated in version 0.20 and will - be removed in 0.22. It's introduced for roc_auc_score (not for - general use) and is no longer used there. What's more, the result - from auc will be significantly influenced if x is sorted - unexpectedly due to slight floating point error (See issue #9786). - Future (and default) behavior is equivalent to ``reorder=False``. - - Returns - ------- - auc : float - - Examples - -------- - >>> import numpy as np - >>> from sklearn import metrics - >>> y = np.array([1, 1, 2, 2]) - >>> pred = np.array([0.1, 0.4, 0.35, 0.8]) - >>> fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2) - >>> metrics.auc(fpr, tpr) - 0.75 - - See also - -------- - roc_auc_score : Compute the area under the ROC curve - average_precision_score : Compute average precision from prediction scores - precision_recall_curve : - Compute precision-recall pairs for different probability thresholds - """ - check_consistent_length(x, y) - x = column_or_1d(x) - y = column_or_1d(y) - - if x.shape[0] < 2: - raise ValueError('At least 2 points are needed to compute' - ' area under curve, but x.shape = %s' % x.shape) - - if reorder != 'deprecated': - warnings.warn("The 'reorder' parameter has been deprecated in " - "version 0.20 and will be removed in 0.22. 
It is " - "recommended not to set 'reorder' and ensure that x " - "is monotonic increasing or monotonic decreasing.", - DeprecationWarning) - - direction = 1 - if reorder is True: - # reorder the data points according to the x axis and using y to - # break ties - order = np.lexsort((y, x)) - x, y = x[order], y[order] - else: - dx = np.diff(x) - if np.any(dx < 0): - if np.all(dx <= 0): - direction = -1 - else: - raise ValueError("x is neither increasing nor decreasing " - ": {}.".format(x)) - - area = direction * np.trapz(y, x) - if isinstance(area, np.memmap): - # Reductions such as .sum used internally in np.trapz do not return a - # scalar by default for numpy.memmap instances contrary to - # regular numpy.ndarray instances. - area = area.dtype.type(area) - return area - - -def average_precision_score(y_true, y_score, average="macro", - sample_weight=None): - """Compute average precision (AP) from prediction scores - - AP summarizes a precision-recall curve as the weighted mean of precisions - achieved at each threshold, with the increase in recall from the previous - threshold used as the weight: - - .. math:: - \\text{AP} = \\sum_n (R_n - R_{n-1}) P_n - - where :math:`P_n` and :math:`R_n` are the precision and recall at the nth - threshold [1]_. This implementation is not interpolated and is different - from computing the area under the precision-recall curve with the - trapezoidal rule, which uses linear interpolation and can be too - optimistic. - - Note: this implementation is restricted to the binary classification task - or multilabel classification task. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - y_true : array, shape = [n_samples] or [n_samples, n_classes] - True binary labels (either {0, 1} or {-1, 1}). - - y_score : array, shape = [n_samples] or [n_samples, n_classes] - Target scores, can either be probability estimates of the positive - class, confidence values, or non-thresholded measure of decisions - (as returned by "decision_function" on some classifiers). - - average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] - If ``None``, the scores for each class are returned. Otherwise, - this determines the type of averaging performed on the data: - - ``'micro'``: - Calculate metrics globally by considering each element of the label - indicator matrix as a label. - ``'macro'``: - Calculate metrics for each label, and find their unweighted - mean. This does not take label imbalance into account. - ``'weighted'``: - Calculate metrics for each label, and find their average, weighted - by support (the number of true instances for each label). - ``'samples'``: - Calculate metrics for each instance, and find their average. - - sample_weight : array-like of shape = [n_samples], optional - Sample weights. - - Returns - ------- - average_precision : float - - References - ---------- - .. [1] `Wikipedia entry for the Average precision - `_ - - See also - -------- - roc_auc_score : Compute the area under the ROC curve - - precision_recall_curve : - Compute precision-recall pairs for different probability thresholds - - Examples - -------- - >>> import numpy as np - >>> from sklearn.metrics import average_precision_score - >>> y_true = np.array([0, 0, 1, 1]) - >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) - >>> average_precision_score(y_true, y_scores) # doctest: +ELLIPSIS - 0.83... 
- - """ - def _binary_uninterpolated_average_precision( - y_true, y_score, sample_weight=None): - precision, recall, thresholds = precision_recall_curve( - y_true, y_score, sample_weight=sample_weight) - # Return the step function integral - # The following works because the last entry of precision is - # guaranteed to be 1, as returned by precision_recall_curve - return -np.sum(np.diff(recall) * np.array(precision)[:-1]) - - return _average_binary_score(_binary_uninterpolated_average_precision, - y_true, y_score, average, - sample_weight=sample_weight) - - -def detection_error_tradeoff_curve(y_true, y_score, pos_label=None, - sample_weight=None): - """Compute error rates for different probability thresholds. - - Note: This metrics is used for ranking evaluation of a binary - classification task. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - y_true : array, shape = [n_samples] - True targets of binary classification in range {-1, 1} or {0, 1}. - - y_score : array, shape = [n_samples] - Estimated probabilities or decision function. - - pos_label : int, optional (default=None) - The label of the positive class - - sample_weight : array-like of shape = [n_samples], optional - Sample weights. - - Returns - ------- - fpr : array, shape = [n_thresholds] - False positive rate (FPR) such that element i is the false positive - rate of predictions with score >= thresholds[i]. This is occasionally - referred to as false acceptance propability or fall-out. - - fnr : array, shape = [n_thresholds] - False negative rate (FNR) such that element i is the false negative - rate of predictions with score >= thresholds[i]. This is occasionally - referred to as false rejection or miss rate. - - thresholds : array, shape = [n_thresholds] - Decreasing score values. - - See also - -------- - roc_curve : Compute Receiver operating characteristic (ROC) curve - - precision_recall_curve : Compute precision-recall curve - - Examples - -------- - >>> import numpy as np - >>> from sklearn.metrics import detection_error_tradeoff_curve - >>> y_true = np.array([0, 0, 1, 1]) - >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) - >>> fpr, fnr, thresholds = detection_error_tradeoff_curve(y_true, y_scores) - >>> fpr - array([0.5, 0.5, 0. ]) - >>> fnr - array([0. , 0.5, 0.5]) - >>> thresholds - array([0.35, 0.4 , 0.8 ]) - - """ - if len(np.unique(y_true)) != 2: - raise ValueError("Only one class present in y_true. Detection error " - "tradeoff curve is not defined in that case.") - - fps, tps, thresholds = _binary_clf_curve(y_true, y_score, - pos_label=pos_label, - sample_weight=sample_weight) - - fns = tps[-1] - tps - p_count = tps[-1] - n_count = fps[-1] - - # start with false positives zero - first_ind = ( - fps.searchsorted(fps[0], side='right') - 1 - if fps.searchsorted(fps[0], side='right') > 0 - else None - ) - # stop with false negatives zero - last_ind = tps.searchsorted(tps[-1]) + 1 - sl = slice(first_ind, last_ind) - - # reverse the output such that list of false positives is decreasing - return ( - fps[sl][::-1] / n_count, - fns[sl][::-1] / p_count, - thresholds[sl][::-1] - ) - - -def roc_auc_score(y_true, y_score, average="macro", sample_weight=None): - """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) - from prediction scores. - - Note: this implementation is restricted to the binary classification task - or multilabel classification task in label indicator format. - - Read more in the :ref:`User Guide `. 
- - Parameters - ---------- - y_true : array, shape = [n_samples] or [n_samples, n_classes] - True binary labels or binary label indicators. - - y_score : array, shape = [n_samples] or [n_samples, n_classes] - Target scores, can either be probability estimates of the positive - class, confidence values, or non-thresholded measure of decisions - (as returned by "decision_function" on some classifiers). For binary - y_true, y_score is supposed to be the score of the class with greater - label. - - average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] - If ``None``, the scores for each class are returned. Otherwise, - this determines the type of averaging performed on the data: - - ``'micro'``: - Calculate metrics globally by considering each element of the label - indicator matrix as a label. - ``'macro'``: - Calculate metrics for each label, and find their unweighted - mean. This does not take label imbalance into account. - ``'weighted'``: - Calculate metrics for each label, and find their average, weighted - by support (the number of true instances for each label). - ``'samples'``: - Calculate metrics for each instance, and find their average. - - sample_weight : array-like of shape = [n_samples], optional - Sample weights. - - Returns - ------- - auc : float - - References - ---------- - .. [1] `Wikipedia entry for the Receiver operating characteristic - `_ - - .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition - Letters, 2006, 27(8):861-874. - - See also - -------- - average_precision_score : Area under the precision-recall curve - - roc_curve : Compute Receiver operating characteristic (ROC) curve - - Examples - -------- - >>> import numpy as np - >>> from sklearn.metrics import roc_auc_score - >>> y_true = np.array([0, 0, 1, 1]) - >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) - >>> roc_auc_score(y_true, y_scores) - 0.75 - - """ - def _binary_roc_auc_score(y_true, y_score, sample_weight=None): - if len(np.unique(y_true)) != 2: - raise ValueError("Only one class present in y_true. ROC AUC score " - "is not defined in that case.") - - fpr, tpr, tresholds = roc_curve(y_true, y_score, - sample_weight=sample_weight) - return auc(fpr, tpr) - - y_type = type_of_target(y_true) - if y_type == "binary": - labels = np.unique(y_true) - y_true = label_binarize(y_true, labels)[:, 0] - - return _average_binary_score( - _binary_roc_auc_score, y_true, y_score, average, - sample_weight=sample_weight) - - -def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): - """Calculate true and false positives per binary classification threshold. - - Parameters - ---------- - y_true : array, shape = [n_samples] - True targets of binary classification - - y_score : array, shape = [n_samples] - Estimated probabilities or decision function - - pos_label : int or str, default=None - The label of the positive class - - sample_weight : array-like of shape = [n_samples], optional - Sample weights. - - Returns - ------- - fps : array, shape = [n_thresholds] - A count of false positives, at index i being the number of negative - samples assigned a score >= thresholds[i]. The total number of - negative samples is equal to fps[-1] (thus true negatives are given by - fps[-1] - fps). - - tps : array, shape = [n_thresholds <= len(np.unique(y_score))] - An increasing count of true positives, at index i being the number - of positive samples assigned a score >= thresholds[i]. 
The total - number of positive samples is equal to tps[-1] (thus false negatives - are given by tps[-1] - tps). - - thresholds : array, shape = [n_thresholds] - Decreasing score values. - """ - # Check to make sure y_true is valid - y_type = type_of_target(y_true) - if not (y_type == "binary" or - (y_type == "multiclass" and pos_label is not None)): - raise ValueError("{0} format is not supported".format(y_type)) - - check_consistent_length(y_true, y_score, sample_weight) - y_true = column_or_1d(y_true) - y_score = column_or_1d(y_score) - assert_all_finite(y_true) - assert_all_finite(y_score) - - if sample_weight is not None: - sample_weight = column_or_1d(sample_weight) - - # ensure binary classification if pos_label is not specified - classes = np.unique(y_true) - if (pos_label is None and - not (np.array_equal(classes, [0, 1]) or - np.array_equal(classes, [-1, 1]) or - np.array_equal(classes, [0]) or - np.array_equal(classes, [-1]) or - np.array_equal(classes, [1]))): - raise ValueError("Data is not binary and pos_label is not specified") - elif pos_label is None: - pos_label = 1. - - # make y_true a boolean vector - y_true = (y_true == pos_label) - - # sort scores and corresponding truth values - desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1] - y_score = y_score[desc_score_indices] - y_true = y_true[desc_score_indices] - if sample_weight is not None: - weight = sample_weight[desc_score_indices] - else: - weight = 1. - - # y_score typically has many tied values. Here we extract - # the indices associated with the distinct values. We also - # concatenate a value for the end of the curve. - distinct_value_indices = np.where(np.diff(y_score))[0] - threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1] - - # accumulate the true positives with decreasing threshold - tps = stable_cumsum(y_true * weight)[threshold_idxs] - if sample_weight is not None: - # express fps as a cumsum to ensure fps is increasing even in - # the presense of floating point errors - fps = stable_cumsum((1 - y_true) * weight)[threshold_idxs] - else: - fps = 1 + threshold_idxs - tps - return fps, tps, y_score[threshold_idxs] - - -def precision_recall_curve(y_true, probas_pred, pos_label=None, - sample_weight=None): - """Compute precision-recall pairs for different probability thresholds - - Note: this implementation is restricted to the binary classification task. - - The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of - true positives and ``fp`` the number of false positives. The precision is - intuitively the ability of the classifier not to label as positive a sample - that is negative. - - The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of - true positives and ``fn`` the number of false negatives. The recall is - intuitively the ability of the classifier to find all the positive samples. - - The last precision and recall values are 1. and 0. respectively and do not - have a corresponding threshold. This ensures that the graph starts on the - y axis. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - y_true : array, shape = [n_samples] - True targets of binary classification in range {-1, 1} or {0, 1}. - - probas_pred : array, shape = [n_samples] - Estimated probabilities or decision function. - - pos_label : int or str, default=None - The label of the positive class - - sample_weight : array-like of shape = [n_samples], optional - Sample weights. 
- - Returns - ------- - precision : array, shape = [n_thresholds + 1] - Precision values such that element i is the precision of - predictions with score >= thresholds[i] and the last element is 1. - - recall : array, shape = [n_thresholds + 1] - Decreasing recall values such that element i is the recall of - predictions with score >= thresholds[i] and the last element is 0. - - thresholds : array, shape = [n_thresholds <= len(np.unique(probas_pred))] - Increasing thresholds on the decision function used to compute - precision and recall. - - See also - -------- - average_precision_score : Compute average precision from prediction scores - - roc_curve : Compute Receiver operating characteristic (ROC) curve - - Examples - -------- - >>> import numpy as np - >>> from sklearn.metrics import precision_recall_curve - >>> y_true = np.array([0, 0, 1, 1]) - >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) - >>> precision, recall, thresholds = precision_recall_curve( - ... y_true, y_scores) - >>> precision # doctest: +ELLIPSIS - array([ 0.66..., 0.5 , 1. , 1. ]) - >>> recall - array([ 1. , 0.5, 0.5, 0. ]) - >>> thresholds - array([ 0.35, 0.4 , 0.8 ]) - - """ - fps, tps, thresholds = _binary_clf_curve(y_true, probas_pred, - pos_label=pos_label, - sample_weight=sample_weight) - - precision = tps / (tps + fps) - recall = tps / tps[-1] - - # stop when full recall attained - # and reverse the outputs so recall is decreasing - last_ind = tps.searchsorted(tps[-1]) - sl = slice(last_ind, None, -1) - return np.r_[precision[sl], 1], np.r_[recall[sl], 0], thresholds[sl] - - -def roc_curve(y_true, y_score, pos_label=None, sample_weight=None, - drop_intermediate=True): - """Compute Receiver operating characteristic (ROC) - - Note: this implementation is restricted to the binary classification task. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - - y_true : array, shape = [n_samples] - True binary labels. If labels are not either {-1, 1} or {0, 1}, then - pos_label should be explicitly given. - - y_score : array, shape = [n_samples] - Target scores, can either be probability estimates of the positive - class, confidence values, or non-thresholded measure of decisions - (as returned by "decision_function" on some classifiers). - - pos_label : int or str, default=None - Label considered as positive and others are considered negative. - - sample_weight : array-like of shape = [n_samples], optional - Sample weights. - - drop_intermediate : boolean, optional (default=True) - Whether to drop some suboptimal thresholds which would not appear - on a plotted ROC curve. This is useful in order to create lighter - ROC curves. - - .. versionadded:: 0.17 - parameter *drop_intermediate*. - - Returns - ------- - fpr : array, shape = [>2] - Increasing false positive rates such that element i is the false - positive rate of predictions with score >= thresholds[i]. - - tpr : array, shape = [>2] - Increasing true positive rates such that element i is the true - positive rate of predictions with score >= thresholds[i]. - - thresholds : array, shape = [n_thresholds] - Decreasing thresholds on the decision function used to compute - fpr and tpr. `thresholds[0]` represents no instances being predicted - and is arbitrarily set to `max(y_score) + 1`. 
- - See also - -------- - roc_auc_score : Compute the area under the ROC curve - - Notes - ----- - Since the thresholds are sorted from low to high values, they - are reversed upon returning them to ensure they correspond to both ``fpr`` - and ``tpr``, which are sorted in reversed order during their calculation. - - References - ---------- - .. [1] `Wikipedia entry for the Receiver operating characteristic - `_ - - .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition - Letters, 2006, 27(8):861-874. - - Examples - -------- - >>> import numpy as np - >>> from sklearn import metrics - >>> y = np.array([1, 1, 2, 2]) - >>> scores = np.array([0.1, 0.4, 0.35, 0.8]) - >>> fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2) - >>> fpr - array([ 0. , 0. , 0.5, 0.5, 1. ]) - >>> tpr - array([ 0. , 0.5, 0.5, 1. , 1. ]) - >>> thresholds - array([ 1.8 , 0.8 , 0.4 , 0.35, 0.1 ]) - - """ - fps, tps, thresholds = _binary_clf_curve( - y_true, y_score, pos_label=pos_label, sample_weight=sample_weight) - - # Attempt to drop thresholds corresponding to points in between and - # collinear with other points. These are always suboptimal and do not - # appear on a plotted ROC curve (and thus do not affect the AUC). - # Here np.diff(_, 2) is used as a "second derivative" to tell if there - # is a corner at the point. Both fps and tps must be tested to handle - # thresholds with multiple data points (which are combined in - # _binary_clf_curve). This keeps all cases where the point should be kept, - # but does not drop more complicated cases like fps = [1, 3, 7], - # tps = [1, 2, 4]; there is no harm in keeping too many thresholds. - if drop_intermediate and len(fps) > 2: - optimal_idxs = np.where(np.r_[True, - np.logical_or(np.diff(fps, 2), - np.diff(tps, 2)), - True])[0] - fps = fps[optimal_idxs] - tps = tps[optimal_idxs] - thresholds = thresholds[optimal_idxs] - - if tps.size == 0 or fps[0] != 0 or tps[0] != 0: - # Add an extra threshold position if necessary - # to make sure that the curve starts at (0, 0) - tps = np.r_[0, tps] - fps = np.r_[0, fps] - thresholds = np.r_[thresholds[0] + 1, thresholds] - - if fps[-1] <= 0: - warnings.warn("No negative samples in y_true, " - "false positive value should be meaningless", - UndefinedMetricWarning) - fpr = np.repeat(np.nan, fps.shape) - else: - fpr = fps / fps[-1] - - if tps[-1] <= 0: - warnings.warn("No positive samples in y_true, " - "true positive value should be meaningless", - UndefinedMetricWarning) - tpr = np.repeat(np.nan, tps.shape) - else: - tpr = tps / tps[-1] - - return fpr, tpr, thresholds - - -def label_ranking_average_precision_score(y_true, y_score): - """Compute ranking-based average precision - - Label ranking average precision (LRAP) is the average over each ground - truth label assigned to each sample, of the ratio of true vs. total - labels with lower score. - - This metric is used in multilabel ranking problem, where the goal - is to give better rank to the labels associated to each sample. - - The obtained score is always strictly greater than 0 and - the best value is 1. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - y_true : array or sparse matrix, shape = [n_samples, n_labels] - True binary labels in binary indicator format. - - y_score : array, shape = [n_samples, n_labels] - Target scores, can either be probability estimates of the positive - class, confidence values, or non-thresholded measure of decisions - (as returned by "decision_function" on some classifiers). 
- - Returns - ------- - score : float - - Examples - -------- - >>> import numpy as np - >>> from sklearn.metrics import label_ranking_average_precision_score - >>> y_true = np.array([[1, 0, 0], [0, 0, 1]]) - >>> y_score = np.array([[0.75, 0.5, 1], [1, 0.2, 0.1]]) - >>> label_ranking_average_precision_score(y_true, y_score) \ - # doctest: +ELLIPSIS - 0.416... - - """ - check_consistent_length(y_true, y_score) - y_true = check_array(y_true, ensure_2d=False) - y_score = check_array(y_score, ensure_2d=False) - - if y_true.shape != y_score.shape: - raise ValueError("y_true and y_score have different shape") - - # Handle badly formatted array and the degenerate case with one label - y_type = type_of_target(y_true) - if (y_type != "multilabel-indicator" and - not (y_type == "binary" and y_true.ndim == 2)): - raise ValueError("{0} format is not supported".format(y_type)) - - y_true = csr_matrix(y_true) - y_score = -y_score - - n_samples, n_labels = y_true.shape - - out = 0. - for i, (start, stop) in enumerate(zip(y_true.indptr, y_true.indptr[1:])): - relevant = y_true.indices[start:stop] - - if (relevant.size == 0 or relevant.size == n_labels): - # If all labels are relevant or unrelevant, the score is also - # equal to 1. The label ranking has no meaning. - out += 1. - continue - - scores_i = y_score[i] - rank = rankdata(scores_i, 'max')[relevant] - L = rankdata(scores_i[relevant], 'max') - out += (L / rank).mean() - - return out / n_samples - - -def coverage_error(y_true, y_score, sample_weight=None): - """Coverage error measure - - Compute how far we need to go through the ranked scores to cover all - true labels. The best value is equal to the average number - of labels in ``y_true`` per sample. - - Ties in ``y_scores`` are broken by giving maximal rank that would have - been assigned to all tied values. - - Note: Our implementation's score is 1 greater than the one given in - Tsoumakas et al., 2010. This extends it to handle the degenerate case - in which an instance has 0 true labels. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - y_true : array, shape = [n_samples, n_labels] - True binary labels in binary indicator format. - - y_score : array, shape = [n_samples, n_labels] - Target scores, can either be probability estimates of the positive - class, confidence values, or non-thresholded measure of decisions - (as returned by "decision_function" on some classifiers). - - sample_weight : array-like of shape = [n_samples], optional - Sample weights. - - Returns - ------- - coverage_error : float - - References - ---------- - .. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010). - Mining multi-label data. In Data mining and knowledge discovery - handbook (pp. 667-685). Springer US. 
- - """ - y_true = check_array(y_true, ensure_2d=False) - y_score = check_array(y_score, ensure_2d=False) - check_consistent_length(y_true, y_score, sample_weight) - - y_type = type_of_target(y_true) - if y_type != "multilabel-indicator": - raise ValueError("{0} format is not supported".format(y_type)) - - if y_true.shape != y_score.shape: - raise ValueError("y_true and y_score have different shape") - - y_score_mask = np.ma.masked_array(y_score, mask=np.logical_not(y_true)) - y_min_relevant = y_score_mask.min(axis=1).reshape((-1, 1)) - coverage = (y_score >= y_min_relevant).sum(axis=1) - coverage = coverage.filled(0) - - return np.average(coverage, weights=sample_weight) - - -def label_ranking_loss(y_true, y_score, sample_weight=None): - """Compute Ranking loss measure - - Compute the average number of label pairs that are incorrectly ordered - given y_score weighted by the size of the label set and the number of - labels not in the label set. - - This is similar to the error set size, but weighted by the number of - relevant and irrelevant labels. The best performance is achieved with - a ranking loss of zero. - - Read more in the :ref:`User Guide `. - - .. versionadded:: 0.17 - A function *label_ranking_loss* - - Parameters - ---------- - y_true : array or sparse matrix, shape = [n_samples, n_labels] - True binary labels in binary indicator format. - - y_score : array, shape = [n_samples, n_labels] - Target scores, can either be probability estimates of the positive - class, confidence values, or non-thresholded measure of decisions - (as returned by "decision_function" on some classifiers). - - sample_weight : array-like of shape = [n_samples], optional - Sample weights. - - Returns - ------- - loss : float - - References - ---------- - .. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010). - Mining multi-label data. In Data mining and knowledge discovery - handbook (pp. 667-685). Springer US. - - """ - y_true = check_array(y_true, ensure_2d=False, accept_sparse='csr') - y_score = check_array(y_score, ensure_2d=False) - check_consistent_length(y_true, y_score, sample_weight) - - y_type = type_of_target(y_true) - if y_type not in ("multilabel-indicator",): - raise ValueError("{0} format is not supported".format(y_type)) - - if y_true.shape != y_score.shape: - raise ValueError("y_true and y_score have different shape") - - n_samples, n_labels = y_true.shape - - y_true = csr_matrix(y_true) - - loss = np.zeros(n_samples) - for i, (start, stop) in enumerate(zip(y_true.indptr, y_true.indptr[1:])): - # Sort and bin the label scores - unique_scores, unique_inverse = np.unique(y_score[i], - return_inverse=True) - true_at_reversed_rank = np.bincount( - unique_inverse[y_true.indices[start:stop]], - minlength=len(unique_scores)) - all_at_reversed_rank = np.bincount(unique_inverse, - minlength=len(unique_scores)) - false_at_reversed_rank = all_at_reversed_rank - true_at_reversed_rank - - # if the scores are ordered, it's possible to count the number of - # incorrectly ordered paires in linear time by cumulatively counting - # how many false labels of a given score have a score higher than the - # accumulated true labels with lower score. - loss[i] = np.dot(true_at_reversed_rank.cumsum(), - false_at_reversed_rank) - - n_positives = count_nonzero(y_true, axis=1) - with np.errstate(divide="ignore", invalid="ignore"): - loss /= ((n_labels - n_positives) * n_positives) - - # When there is no positive or no negative labels, those values should - # be consider as correct, i.e. 
the ranking doesn't matter. - loss[np.logical_or(n_positives == 0, n_positives == n_labels)] = 0. - - return np.average(loss, weights=sample_weight) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 08e82d864832b..24f01d46610a7 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -59,17 +59,7 @@ from sklearn.metrics import ndcg_score from sklearn.metrics import dcg_score -<<<<<<< HEAD from sklearn.metrics._base import _average_binary_score -======= -# TODO Curve are currently not covered by invariance test -# from sklearn.metrics import precision_recall_curve -# from sklearn.metrics import roc_curve -# from sklearn.metrics import detection_error_tradeoff_curve - - -from sklearn.metrics.base import _average_binary_score ->>>>>>> Add reference for DET curves in invariance test # Note toward developers about metric testing From 6ebff5b7be63650aa9e36cb7b49ab25e09109396 Mon Sep 17 00:00:00 2001 From: daniel Date: Sat, 18 Jul 2020 18:36:55 +0200 Subject: [PATCH 25/36] Fix PR link in doc --- doc/whats_new/v0.24.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index e834be6b1f962..ff8149142f3b3 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -214,7 +214,7 @@ Changelog - |Feature| Added :func:`metrics.detection_error_tradeoff_curve` to compute Detection Error Tradeoff curve classification metric. - :issue:`10591` by :user:`Jeremy Karnowski ` and + :pr:`10591` by :user:`Jeremy Karnowski ` and :user:`Daniel Mohns `. - |Feature| Added :func:`metrics.mean_absolute_percentage_error` metric and From d0a2f5c45b7185857a9172a3df53ff8df3ef4970 Mon Sep 17 00:00:00 2001 From: daniel Date: Sat, 18 Jul 2020 18:47:39 +0200 Subject: [PATCH 26/36] Fix test_ranking --- sklearn/metrics/tests/test_ranking.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 7f1d7f38c4caf..e08a8909cfe72 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -16,6 +16,7 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal +from sklearn.utils._testing import assert_raises from sklearn.utils._testing import assert_warns from sklearn.metrics import auc From 3ff579269b4587eb45d5a80aa4b48054b45f28ec Mon Sep 17 00:00:00 2001 From: daniel Date: Sat, 18 Jul 2020 18:56:49 +0200 Subject: [PATCH 27/36] Fix rebase errors --- sklearn/metrics/_ranking.py | 102 ++++++++++++++++++------------------ 1 file changed, 50 insertions(+), 52 deletions(-) diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index e7a0eaeb024e0..feb08e893b452 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -218,82 +218,80 @@ def _binary_uninterpolated_average_precision( average, sample_weight=sample_weight) -def detection_error_tradeoff(y_true, probas_pred, pos_label=None, - sample_weight=None): - """Compute error rates for different probability thresholds - - Note: this implementation is restricted to the binary classification task. - +def detection_error_tradeoff_curve(y_true, y_score, pos_label=None, + sample_weight=None): + """Compute error rates for different probability thresholds. + Note: This metrics is used for ranking evaluation of a binary + classification task. + Read more in the :ref:`User Guide `. 
Parameters ---------- y_true : array, shape = [n_samples] True targets of binary classification in range {-1, 1} or {0, 1}. - - probas_pred : array, shape = [n_samples] + y_score : array, shape = [n_samples] Estimated probabilities or decision function. - pos_label : int, optional (default=None) The label of the positive class - sample_weight : array-like of shape = [n_samples], optional Sample weights. - Returns ------- - fps : array, shape = [n_thresholds] - A count of false positives, at index i being the number of negative - samples assigned a score >= thresholds[i]. The total number of - negative samples is equal to fps[-1] (thus true negatives are given by - fps[-1] - fps). - - fns : array, shape = [n_thresholds] - A count of false negatives, at index i being the number of positive - samples assigned a score < thresholds[i]. The total number of - positive samples is equal to tps[-1] (thus false negatives are given by - tps[-1] - tps). - + fpr : array, shape = [n_thresholds] + False positive rate (FPR) such that element i is the false positive + rate of predictions with score >= thresholds[i]. This is occasionally + referred to as false acceptance probability or fall-out. + fnr : array, shape = [n_thresholds] + False negative rate (FNR) such that element i is the false negative + rate of predictions with score >= thresholds[i]. This is occasionally + referred to as false rejection or miss rate. thresholds : array, shape = [n_thresholds] Decreasing score values. - - References - ---------- - .. [1] `Wikipedia entry for Detection error tradeoff - `_ - .. [2] `The DET Curve in Assessment of Detection Task Performance - `_ - .. [3] `2008 NIST Speaker Recognition Evaluation Results - `_ - .. [4] `DET-Curve Plotting software for use with MATLAB - `_ - + See also + -------- + roc_curve : Compute Receiver operating characteristic (ROC) curve + precision_recall_curve : Compute precision-recall curve Examples -------- >>> import numpy as np - >>> from sklearn.metrics import detection_error_tradeoff + >>> from sklearn.metrics import detection_error_tradeoff_curve >>> y_true = np.array([0, 0, 1, 1]) >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) - >>> fps, fns, thresholds = detection_error_tradeoff(y_true, y_scores) - >>> fps - array([ 0.5, 0.5, 0. ]) - >>> fns - array([ 0. , 0.5, 0.5]) + >>> fpr, fnr, thresholds = detection_error_tradeoff_curve(y_true, y_scores) + >>> fpr + array([0.5, 0.5, 0. ]) + >>> fnr + array([0. , 0.5, 0.5]) >>> thresholds - array([ 0.35, 0.4 , 0.8 ]) - + array([0.35, 0.4 , 0.8 ]) """ - fps, tps, thresholds = _binary_clf_curve(y_true, probas_pred, + if len(np.unique(y_true)) != 2: + raise ValueError("Only one class present in y_true. 
Detection error " + "tradeoff curve is not defined in that case.") + + fps, tps, thresholds = _binary_clf_curve(y_true, y_score, pos_label=pos_label, sample_weight=sample_weight) - fns = tps[-1] - tps - tp_count = tps[-1] - tn_count = (fps[-1] - fps)[0] - # start with false positives is zero and stop with false negatives zero - # and reverse the outputs so list of false positives is decreasing + fns = tps[-1] - tps + p_count = tps[-1] + n_count = fps[-1] + + # start with false positives zero + first_ind = ( + fps.searchsorted(fps[0], side='right') - 1 + if fps.searchsorted(fps[0], side='right') > 0 + else None + ) + # stop with false negatives zero last_ind = tps.searchsorted(tps[-1]) + 1 - first_ind = fps[::-1].searchsorted(fps[0]) - sl = range(first_ind, last_ind)[::-1] - return fps[sl] / tp_count, fns[sl] / tn_count, thresholds[sl] + sl = slice(first_ind, last_ind) + + # reverse the output such that list of false positives is decreasing + return ( + fps[sl][::-1] / n_count, + fns[sl][::-1] / p_count, + thresholds[sl][::-1] + ) def _binary_roc_auc_score(y_true, y_score, sample_weight=None, max_fpr=None): From a3776d8202c4cf86fd3396f3a1335ef063ccc521 Mon Sep 17 00:00:00 2001 From: daniel Date: Sat, 18 Jul 2020 19:05:12 +0200 Subject: [PATCH 28/36] Fix import --- sklearn/metrics/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 1011614510c14..a69d5c618c20f 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -7,7 +7,7 @@ from ._ranking import auc from ._ranking import average_precision_score from ._ranking import coverage_error -from ._ranking import detection_error_tradeoff +from ._ranking import detection_error_tradeoff_curve from ._ranking import dcg_score from ._ranking import label_ranking_average_precision_score from ._ranking import label_ranking_loss @@ -105,7 +105,7 @@ 'coverage_error', 'dcg_score', 'davies_bouldin_score', - 'detection_error_tradeoff', + 'detection_error_tradeoff_curve', 'euclidean_distances', 'explained_variance_score', 'f1_score', From d29d4748076e815b010e5d2d1eee746ab11b8f0a Mon Sep 17 00:00:00 2001 From: daniel Date: Sun, 19 Jul 2020 15:38:46 +0200 Subject: [PATCH 29/36] Bring back newlines - Swallowed by copy/paste --- sklearn/metrics/_ranking.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index feb08e893b452..5c58920e3ffd4 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -221,35 +221,46 @@ def _binary_uninterpolated_average_precision( def detection_error_tradeoff_curve(y_true, y_score, pos_label=None, sample_weight=None): """Compute error rates for different probability thresholds. + Note: This metrics is used for ranking evaluation of a binary classification task. + Read more in the :ref:`User Guide `. + Parameters ---------- y_true : array, shape = [n_samples] True targets of binary classification in range {-1, 1} or {0, 1}. + y_score : array, shape = [n_samples] Estimated probabilities or decision function. + pos_label : int, optional (default=None) The label of the positive class + sample_weight : array-like of shape = [n_samples], optional Sample weights. + Returns ------- fpr : array, shape = [n_thresholds] False positive rate (FPR) such that element i is the false positive rate of predictions with score >= thresholds[i]. This is occasionally referred to as false acceptance propability or fall-out. 
+ fnr : array, shape = [n_thresholds] False negative rate (FNR) such that element i is the false negative rate of predictions with score >= thresholds[i]. This is occasionally referred to as false rejection or miss rate. + thresholds : array, shape = [n_thresholds] Decreasing score values. + See also -------- roc_curve : Compute Receiver operating characteristic (ROC) curve precision_recall_curve : Compute precision-recall curve + Examples -------- >>> import numpy as np @@ -263,6 +274,7 @@ def detection_error_tradeoff_curve(y_true, y_score, pos_label=None, array([0. , 0.5, 0.5]) >>> thresholds array([0.35, 0.4 , 0.8 ]) + """ if len(np.unique(y_true)) != 2: raise ValueError("Only one class present in y_true. Detection error " From 0d31c770483af35b159909bc43bd7cff45948b51 Mon Sep 17 00:00:00 2001 From: daniel Date: Sun, 19 Jul 2020 15:40:49 +0200 Subject: [PATCH 30/36] Remove uncited ref link --- doc/modules/model_evaluation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 2592ba8db732d..67f4195986a95 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1512,7 +1512,7 @@ better suited. Wikipedia, The Free Encyclopedia. September 4, 2017, 23:33 UTC. Available at: https://en.wikipedia.org/w/index.php?title=Detection_error_tradeoff&oldid=798982054. Accessed February 19, 2018. - .. [Martin1997] A. Martin, G. Doddington, T. Kamm, M. Ordowski, and M. Przybocki, + .. A. Martin, G. Doddington, T. Kamm, M. Ordowski, and M. Przybocki, `The DET Curve in Assessment of Detection Task Performance `_, NIST 1997. From bdc2608e6d82662b0192486d50bcf03383eae546 Mon Sep 17 00:00:00 2001 From: daniel Date: Sun, 26 Jul 2020 13:17:46 +0200 Subject: [PATCH 31/36] Remove matplotlib deprecation warning --- examples/model_selection/plot_det.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/model_selection/plot_det.py b/examples/model_selection/plot_det.py index 8511f1d7cac96..3206d85a21c54 100644 --- a/examples/model_selection/plot_det.py +++ b/examples/model_selection/plot_det.py @@ -62,7 +62,7 @@ # prepare plots # first prepare the ROC curve -ax_roc = plt.subplot(1, 2, 1) +ax_roc = plt.subplot(1, 2, 1, label='roc_curve') ax_roc.set_title('Receiver Operating Characteristic (ROC) curves') ax_roc.set_xlabel('False Positive Rate') ax_roc.set_ylabel('True Positive Rate') @@ -75,7 +75,7 @@ FuncFormatter(lambda y, _: '{:.0%}'.format(y))) # second prepare the DET curve -ax_det = plt.subplot(1, 2, 2) +ax_det = plt.subplot(1, 2, 2, label='det_curve') ax_det.set_title('Detection Error Tradeoff (DET) curves') ax_det.set_xlabel('False Positive Rate') ax_det.set_ylabel('False Negative Rate') @@ -114,7 +114,6 @@ ) # finally add legend -ax_det = plt.subplot(1, 2, 2) plt.legend(names, loc="upper right") plt.tight_layout() From 51a08fea37061b111562836a72c66dcfec42346f Mon Sep 17 00:00:00 2001 From: daniel Date: Sun, 26 Jul 2020 13:18:10 +0200 Subject: [PATCH 32/36] Bring back hidden reference --- doc/modules/model_evaluation.rst | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 67f4195986a95..d530d79b0e9a9 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1512,10 +1512,7 @@ better suited. Wikipedia, The Free Encyclopedia. September 4, 2017, 23:33 UTC. 
Available at: https://en.wikipedia.org/w/index.php?title=Detection_error_tradeoff&oldid=798982054. Accessed February 19, 2018. - .. A. Martin, G. Doddington, T. Kamm, M. Ordowski, and M. Przybocki, - `The DET Curve in Assessment of Detection Task Performance - `_, - NIST 1997. + .. [Navratil2007] J. Navractil and D. Klusacek, "`On Linear DETs, `_" @@ -1523,6 +1520,11 @@ better suited. Speech and Signal Processing - ICASSP '07, Honolulu, HI, 2007, pp. IV-229-IV-232. + A. Martin, G. Doddington, T. Kamm, M. Ordowski, and M. Przybocki, + `The DET Curve in Assessment of Detection Task Performance + `_, + NIST 1997. + .. _zero_one_loss: Zero one loss From a662f44b3aa0faa22c960d2b143f8b1bb125c3ba Mon Sep 17 00:00:00 2001 From: daniel Date: Sun, 26 Jul 2020 15:20:43 +0200 Subject: [PATCH 33/36] Add motivation to DET example --- examples/model_selection/plot_det.py | 31 +++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/examples/model_selection/plot_det.py b/examples/model_selection/plot_det.py index 3206d85a21c54..51f253e49225c 100644 --- a/examples/model_selection/plot_det.py +++ b/examples/model_selection/plot_det.py @@ -3,20 +3,45 @@ Detection error tradeoff (DET) curve ======================================= -In this example we compare receiver operating characteristic (ROC) and -detection error tradeoff (DET) curves to demonstrate how DET curves can help -to asses the performance of different classification algorithms. +In this example, we compare receiver operating characteristic (ROC) and +detection error tradeoff (DET) curves for different classification algorithms +for the same classification task. DET curves are commonly plotted in normal deviate scale. To achieve this we transform the errors rates as returned by the ``detection_error_tradeoff_curve`` function and the axis scale using ``scipy.stats.norm``. +The point of this example is to demonstrate two properties of DET curves, +namely: + +1. It might be easier to visually assess the overall performance of different + classification algorithms using DET curves over ROC curves. + Due to the linear scale used for plotting ROC curves, different classifiers + usually only differ in the top left corner of the graph and appear similar + for a large part of the plot. On the other hand, because DET curves represent + straight lines in normal deviate scale, they tend to be + distinguishable as a whole and the area of interest spans a large part of the + plot. +2. DET curves give the user direct feedback of the detection error tradeoff to + aid in operating point analysis. + The user can deduce directly from the DET-curve plot at which rate + false-negative error rate will improve when willing to accept an increase in + false-positive error rate (or vice-versa). + +The plots in this example compare ROC curves on the left side to corresponding +DET curves on the right. +There is no particular reason why these classifiers have been chosen for the +example plot over other classifiers available in scikit-learn. + .. note:: - See :func:`sklearn.metrics.roc_curve` for further information about ROC curves. + - See :func:`sklearn.metrics.detection_error_tradeoff_curve` for further + information about DET curves. + - This example is loosely based on :ref:`sphx_glr_auto_examples_classification_plot_classifier_comparison.py` . 
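The following is an illustrative, self-contained sketch (not part of the patch series) of the normal-deviate plotting that the example docstring above describes; the toy error rates and every variable name below are made up for demonstration, and only ``scipy.stats.norm.ppf`` plus standard matplotlib calls are assumed:

    import numpy as np
    import matplotlib.pyplot as plt
    from scipy.stats import norm

    # hypothetical false positive / false negative rates for one classifier,
    # e.g. as returned by the detection_error_tradeoff_curve function that
    # this patch series adds
    fpr = np.array([0.30, 0.20, 0.10, 0.05, 0.02])
    fnr = np.array([0.02, 0.05, 0.12, 0.25, 0.40])

    # norm.ppf maps an error rate in (0, 1) to its standard normal deviate;
    # rates of exactly 0 or 1 would map to -inf/+inf, so clip them first
    eps = 1e-4
    fpr_nd = norm.ppf(np.clip(fpr, eps, 1 - eps))
    fnr_nd = norm.ppf(np.clip(fnr, eps, 1 - eps))

    fig, ax = plt.subplots()
    ax.plot(fpr_nd, fnr_nd)

    # relabel both axes with the original percentages, as the example does
    ticks = [0.01, 0.05, 0.20, 0.50, 0.80, 0.95, 0.99]
    ax.set_xticks(norm.ppf(ticks))
    ax.set_xticklabels(['{:.0%}'.format(t) for t in ticks])
    ax.set_yticks(norm.ppf(ticks))
    ax.set_yticklabels(['{:.0%}'.format(t) for t in ticks])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('False Negative Rate')
    plt.show()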
From 8d492e84fe4ddf9634af4431897b1574b2bb8640 Mon Sep 17 00:00:00 2001 From: daniel Date: Sun, 26 Jul 2020 15:48:32 +0200 Subject: [PATCH 34/36] Fix lint --- examples/model_selection/plot_det.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/model_selection/plot_det.py b/examples/model_selection/plot_det.py index 51f253e49225c..8f8ebd3b856b6 100644 --- a/examples/model_selection/plot_det.py +++ b/examples/model_selection/plot_det.py @@ -19,10 +19,10 @@ classification algorithms using DET curves over ROC curves. Due to the linear scale used for plotting ROC curves, different classifiers usually only differ in the top left corner of the graph and appear similar - for a large part of the plot. On the other hand, because DET curves represent - straight lines in normal deviate scale, they tend to be - distinguishable as a whole and the area of interest spans a large part of the - plot. + for a large part of the plot. On the other hand, because DET curves + represent straight lines in normal deviate scale, they tend to be + distinguishable as a whole and the area of interest spans a large part of + the plot. 2. DET curves give the user direct feedback of the detection error tradeoff to aid in operating point analysis. The user can deduce directly from the DET-curve plot at which rate From 596819ec4d9f6419b9a5fcf69a516a3febb71de0 Mon Sep 17 00:00:00 2001 From: daniel Date: Sat, 1 Aug 2020 12:35:24 +0200 Subject: [PATCH 35/36] Add citation --- doc/modules/model_evaluation.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index d530d79b0e9a9..decd0f42383eb 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1462,6 +1462,7 @@ DET curves are commonly plotted in normal deviate scale by transformation with function). The resulting performance curves explicitly visualize the tradeoff of error types for given classification algorithms. +See [Martin1997]_ for examples and further motivation. This figure compares the ROC and DET curves of two example classifiers on the same classification task: @@ -1513,6 +1514,11 @@ better suited. Available at: https://en.wikipedia.org/w/index.php?title=Detection_error_tradeoff&oldid=798982054. Accessed February 19, 2018. + .. [Martin1997] A. Martin, G. Doddington, T. Kamm, M. Ordowski, and M. Przybocki, + `The DET Curve in Assessment of Detection Task Performance + `_, + NIST 1997. + .. [Navratil2007] J. Navractil and D. Klusacek, "`On Linear DETs, `_" @@ -1520,11 +1526,6 @@ better suited. Speech and Signal Processing - ICASSP '07, Honolulu, HI, 2007, pp. IV-229-IV-232. - A. Martin, G. Doddington, T. Kamm, M. Ordowski, and M. Przybocki, - `The DET Curve in Assessment of Detection Task Performance - `_, - NIST 1997. - .. 
_zero_one_loss: Zero one loss From a54aca279bc128656cd06b5e11b0a5920c281c79 Mon Sep 17 00:00:00 2001 From: daniel Date: Sat, 1 Aug 2020 12:46:23 +0200 Subject: [PATCH 36/36] Use modern matplotlib API --- examples/model_selection/plot_det.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/model_selection/plot_det.py b/examples/model_selection/plot_det.py index 8f8ebd3b856b6..6cfac7e5ce0ca 100644 --- a/examples/model_selection/plot_det.py +++ b/examples/model_selection/plot_det.py @@ -76,8 +76,6 @@ n_samples=N_SAMPLES, n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1) -figure = plt.figure(figsize=(10, 5)) - # preprocess dataset, split into training and test part X = StandardScaler().fit_transform(X) @@ -85,9 +83,9 @@ X, y, test_size=.4, random_state=0) # prepare plots +fig, [ax_roc, ax_det] = plt.subplots(1, 2, figsize=(10, 5)) # first prepare the ROC curve -ax_roc = plt.subplot(1, 2, 1, label='roc_curve') ax_roc.set_title('Receiver Operating Characteristic (ROC) curves') ax_roc.set_xlabel('False Positive Rate') ax_roc.set_ylabel('True Positive Rate') @@ -100,21 +98,21 @@ FuncFormatter(lambda y, _: '{:.0%}'.format(y))) # second prepare the DET curve -ax_det = plt.subplot(1, 2, 2, label='det_curve') ax_det.set_title('Detection Error Tradeoff (DET) curves') ax_det.set_xlabel('False Positive Rate') ax_det.set_ylabel('False Negative Rate') ax_det.set_xlim(-3, 3) ax_det.set_ylim(-3, 3) ax_det.grid(linestyle='--') -# customized ticks to represent normal deviate scale + +# customized ticks for DET curve plot to represent normal deviate scale ticks = [0.001, 0.01, 0.05, 0.20, 0.5, 0.80, 0.95, 0.99, 0.999] tick_locs = norm.ppf(ticks) tick_lbls = [ '{:.0%}'.format(s) if (100*s).is_integer() else '{:.1%}'.format(s) for s in ticks ] - +plt.sca(ax_det) plt.xticks(tick_locs, tick_lbls) plt.yticks(tick_locs, tick_lbls) @@ -138,8 +136,10 @@ norm.ppf(det_fnr) ) -# finally add legend +# add a single legend +plt.sca(ax_det) plt.legend(names, loc="upper right") +# plot plt.tight_layout() plt.show()
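As a closing aside (not part of the patch series), the doctest numbers in the new ``detection_error_tradeoff_curve`` docstring can be reproduced with plain NumPy; the sketch below mirrors the cumulative-count logic of the final implementation under the simplifying assumptions of no sample weights and no tied scores, so it is a hand-check rather than the actual scikit-learn code:

    import numpy as np

    y_true = np.array([0, 0, 1, 1])
    y_score = np.array([0.1, 0.4, 0.35, 0.8])

    # sort by decreasing score; each score acts as a candidate threshold
    order = np.argsort(y_score)[::-1]
    y_sorted = y_true[order]
    thresholds = y_score[order]

    tps = np.cumsum(y_sorted)          # positives with score >= threshold
    fps = np.cumsum(1 - y_sorted)      # negatives with score >= threshold
    fns = tps[-1] - tps                # positives still below the threshold

    fpr = fps / fps[-1]
    fnr = fns / tps[-1]

    # keep the curve only until all positives have been recovered
    # (the real implementation also trims leading points that share the
    # initial false positive count; that trim is a no-op for this toy data)
    stop = int(np.searchsorted(tps, tps[-1])) + 1

    # reverse so the false positive rate is decreasing, as in the docstring
    print(fpr[:stop][::-1])         # [0.5 0.5 0. ]
    print(fnr[:stop][::-1])         # [0.  0.5 0.5]
    print(thresholds[:stop][::-1])  # [0.35 0.4  0.8 ]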