[MRG] Multi-class roc_auc_score #10481

Closed

127 changes: 126 additions & 1 deletion sklearn/metrics/base.py
@@ -13,11 +13,13 @@
# License: BSD 3 clause

from __future__ import division
import itertools

import numpy as np

from ..utils import check_array, check_consistent_length
from ..utils.multiclass import type_of_target
from ..preprocessing import LabelBinarizer


def _average_binary_score(binary_metric, y_true, y_score, average,
@@ -33,7 +35,8 @@ def _average_binary_score(binary_metric, y_true, y_score, average,
Target scores, can either be probability estimates of the positive
class, confidence values, or binary decisions.

average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted']
average : string, {None, 'micro', 'macro', 'samples', 'weighted'},
default 'macro'
If ``None``, the scores for each class are returned. Otherwise,
this determines the type of averaging performed on the data:

@@ -122,3 +125,125 @@ def _average_binary_score(binary_metric, y_true, y_score, average,
return np.average(score, weights=average_weight)
else:
return score


def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average):
"""Uses the binary metric for one-vs-one multiclass classification,
where the score is computed according to the Hand & Till (2001) algorithm.

Parameters
----------
y_true : array, shape = [n_samples]
True multiclass labels.
Assumes labels have been recoded to the integers 0 to n_classes - 1.

y_score : array, shape = [n_samples, n_classes]
Target scores corresponding to probability estimates of a sample
belonging to a particular class.

average : 'macro' or 'weighted', default='macro'
``'macro'``:
Calculate metrics for each label, and find their unweighted
mean. This does not take label imbalance into account. Classes
are assumed to be uniformly distributed.
``'weighted'``:
Calculate metrics for each label, taking into account the
prevalence of the classes.

binary_metric : callable, the binary metric function to use.
Accepts the following as input
y_true_target : array, shape = [n_samples_target]
Some sub-array of y_true for a pair of classes designated
positive and negative in the one-vs-one scheme.
y_score_target : array, shape = [n_samples_target]
Scores corresponding to the probability estimates
of a sample belonging to the designated positive class label

Returns
-------
score : float
Average of the pairwise binary metric scores.
"""
n_classes = len(np.unique(y_true))
n_pairs = n_classes * (n_classes - 1) // 2
prevalence = np.empty(n_pairs)
pair_scores = np.empty(n_pairs)

for ix, (a, b) in enumerate(itertools.combinations(range(n_classes), 2)):
a_mask = y_true == a
ab_mask = np.logical_or(a_mask, y_true == b)

prevalence[ix] = np.sum(ab_mask) / len(y_true)

y_score_filtered = y_score[ab_mask]

a_true = a_mask[ab_mask]
b_true = np.logical_not(a_true)

a_true_score = binary_metric(
a_true, y_score_filtered[:, a])
b_true_score = binary_metric(
b_true, y_score_filtered[:, b])
binary_avg_score = (a_true_score + b_true_score) / 2
pair_scores[ix] = binary_avg_score

return (np.average(pair_scores, weights=prevalence)
if average == "weighted" else np.average(pair_scores))


def _average_multiclass_ovr_score(binary_metric, y_true, y_score, average):

Review comment (Member): is this not the same as _average_binary_score?

"""Uses the binary metric for one-vs-rest multi-class classification,
where the score is computed according to the Provost & Domingos (2001)
definition of the AUC in multi-class settings (when the `average` parameter
is set to `weighted`).

For each class, the ROC curve is generated and the AUC computed.
The output is the average of the individual AUCs weighted by the prevalence
of the classes in the data.

Parameters
----------
y_true : array, shape = [n_samples]
True multiclass labels.
Assumes labels have been recoded to the integers 0 to n_classes - 1.

y_score : array, shape = [n_samples, n_classes]
Target scores corresponding to probability estimates of a sample
belonging to a particular class.

average : 'macro' or 'weighted', default='macro'
``'macro'``:
Calculate metrics for each label, and find their unweighted
mean. This does not take label imbalance into account. Classes
are assumed to be uniformly distributed.
``'weighted'``:
Calculate metrics for each label, taking into account the
prevalence of the classes in the dataset.

binary_metric : callable, the binary metric function to use.
Accepts the following as input
y_true_target : array, shape = [n_samples]
The binary indicator of y_true for the class designated positive
in the one-vs-rest scheme (all other classes are treated as negative).
y_score_target : array, shape = [n_samples]
Scores corresponding to the probability estimates of a sample
belonging to the designated positive class.

Returns
-------
score : float
Average of the per-class binary metric scores.
"""
n_classes = len(np.unique(y_true))
scores = np.zeros((n_classes,))

y_true_multilabel = LabelBinarizer().fit_transform(y_true)
prevalence = np.sum(y_true_multilabel, axis=0) / y_true_multilabel.shape[0]

for c in range(n_classes):
y_true_c = y_true_multilabel.take([c], axis=1).ravel()
y_score_c = y_score.take([c], axis=1).ravel()
scores[c] = binary_metric(y_true_c, y_score_c)

return (np.average(scores, weights=prevalence)
if average == "weighted" else np.average(scores))
110 changes: 88 additions & 22 deletions sklearn/metrics/ranking.py
@@ -31,9 +31,10 @@
from ..utils.extmath import stable_cumsum
from ..utils.sparsefuncs import count_nonzero
from ..exceptions import UndefinedMetricWarning
from ..preprocessing import label_binarize
from ..preprocessing import LabelBinarizer, label_binarize

from .base import _average_binary_score
from .base import _average_binary_score, _average_multiclass_ovo_score, \
_average_multiclass_ovr_score


def auc(x, y, reorder='deprecated'):
@@ -157,7 +158,8 @@ def average_precision_score(y_true, y_score, average="macro",
class, confidence values, or non-thresholded measure of decisions
(as returned by "decision_function" on some classifiers).

average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted']
average : string, {None, 'micro', 'macro', 'samples', 'weighted'},
default 'macro'
If ``None``, the scores for each class are returned. Otherwise,
this determines the type of averaging performed on the data:

@@ -222,29 +224,39 @@ def _binary_uninterpolated_average_precision(
sample_weight=sample_weight)


def roc_auc_score(y_true, y_score, average="macro", sample_weight=None,
max_fpr=None):
"""Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
from prediction scores.

Note: this implementation is restricted to the binary classification task
or multilabel classification task in label indicator format.
def roc_auc_score(y_true, y_score, multiclass="ovr", average="macro",
sample_weight=None, max_fpr=None):
"""Compute Area Under the Curve (AUC) from prediction scores.

Read more in the :ref:`User Guide <roc_metrics>`.

Parameters
----------
y_true : array, shape = [n_samples] or [n_samples, n_classes]
True binary labels or binary label indicators.
The multiclass case expects shape = [n_samples] and labels
with values from 0 to (n_classes-1), inclusive.

y_score : array, shape = [n_samples] or [n_samples, n_classes]
Target scores, can either be probability estimates of the positive
class, confidence values, or non-thresholded measure of decisions
(as returned by "decision_function" on some classifiers). For binary
y_true, y_score is supposed to be the score of the class with greater
label.

average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted']
(as returned by "decision_function" on some classifiers).
The multiclass case expects shape = [n_samples, n_classes]
where the scores correspond to probability estimates.

multiclass : string, 'ovr' or 'ovo', default 'ovr'
Note: multiclass ROC AUC currently only handles the 'macro' and
'weighted' averages.

``'ovr'``:
Calculate metrics for the multiclass case using the one-vs-rest
approach.
``'ovo'``:
Calculate metrics for the multiclass case using the one-vs-one
approach.

average : string, {None, 'micro', 'macro', 'samples', 'weighted'},
default 'macro'
If ``None``, the scores for each class are returned. Otherwise,
this determines the type of averaging performed on the data:

@@ -265,7 +277,9 @@ def roc_auc_score(y_true, y_score, average="macro", sample_weight=None,

max_fpr : float > 0 and <= 1, optional
If not ``None``, the standardized partial AUC [3]_ over the range
[0, max_fpr] is returned.
[0, max_fpr] is returned. For the multiclass case, ``max_fpr`` should
be either ``None`` or ``1.0``, since partial AUC computation is not
currently supported in that setting.

Returns
-------
@@ -326,13 +340,65 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None):
return 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area))

y_type = type_of_target(y_true)
if y_type == "binary":
y_true = check_array(y_true, ensure_2d=False, dtype=None)
y_score = check_array(y_score, ensure_2d=False)

if y_type == "multiclass" or (y_type == "binary" and
y_score.ndim == 2 and
y_score.shape[1] > 2):
# validation of the input y_score
if not np.allclose(1, y_score.sum(axis=1)):
raise ValueError("Target scores should sum up to 1.0 for all"

Review comment (Member): space missing between "all" and "samples"

Review comment (Member): We only need this for OvO, not for OvR, right?

"samples.")

# do not support partial ROC computation for multiclass
if max_fpr is not None and max_fpr != 1.:
raise ValueError("Partial AUC computation not available in "
"multiclass setting. Parameter 'max_fpr' must be"

Review comment (Member): Please be consistent within a string about whether whitespace appears at the end or start of a line.

" set to `None`. Received `max_fpr={0}` "
"instead.".format(max_fpr))

# validation for multiclass parameter specifications
average_options = ("macro", "weighted")
if average not in average_options:
raise ValueError("Parameter 'average' must be one of {0} for"
" multiclass problems.".format(average_options))
multiclass_options = ("ovo", "ovr")
if multiclass not in multiclass_options:
raise ValueError("Parameter multiclass='{0}' is not supported"
" for multiclass ROC AUC. 'multiclass' must be"
" one of {1}.".format(
multiclass, multiclass_options))
if sample_weight is not None:
# TODO: check if only in ovo case, if yes, do not raise when ovr
raise ValueError("Parameter 'sample_weight' is not supported"
" for multiclass one-vs-one ROC AUC."
" 'sample_weight' must be None in this case.")

if multiclass == "ovo":
# Hand & Till (2001) implementation
return _average_multiclass_ovo_score(
_binary_roc_auc_score, y_true, y_score, average)
elif multiclass == "ovr" and average == "weighted":

Review comment (Contributor Author): Is it the best way to use the P&D definition? Should we state in the docstring that if one wants to use the P&D implementation, they should set multiclass="ovr" and average="weighted"?

Review comment (Member): What happens if someone sets multiclass='ovr' and average='macro' right now?

# Provost & Domingos (2001) implementation
return _average_multiclass_ovr_score(
_binary_roc_auc_score, y_true, y_score, average)
else:
y_true = y_true.reshape((-1, 1))
y_true_multilabel = LabelBinarizer().fit_transform(y_true)
return _average_binary_score(
_binary_roc_auc_score, y_true_multilabel, y_score, average,
sample_weight=sample_weight)
elif y_type == "binary":
labels = np.unique(y_true)
y_true = label_binarize(y_true, labels)[:, 0]

return _average_binary_score(
_binary_roc_auc_score, y_true, y_score, average,
sample_weight=sample_weight)
else:
return _average_binary_score(
_binary_roc_auc_score, y_true, y_score, average,
sample_weight=sample_weight)
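
As a usage sketch, this is how the API proposed in this pull request would be called once merged. The multiclass= parameter and the multiclass branch exercised here exist only on this branch (released scikit-learn at this point only accepts binary or multilabel-indicator input, and the final parameter name may differ), and the data is made up.

import numpy as np
from sklearn.metrics import roc_auc_score  # the version from this branch

y_true = np.array([0, 1, 2, 2, 1, 0])
y_prob = np.array([[0.7, 0.2, 0.1],   # rows must sum to 1.0
                   [0.1, 0.8, 0.1],
                   [0.2, 0.2, 0.6],
                   [0.3, 0.3, 0.4],
                   [0.2, 0.7, 0.1],
                   [0.6, 0.3, 0.1]])

# Hand & Till (2001): unweighted average over all class pairs.
ovo_macro = roc_auc_score(y_true, y_prob, multiclass="ovo", average="macro")

# Provost & Domingos (2001): one-vs-rest AUCs weighted by class prevalence.
ovr_weighted = roc_auc_score(y_true, y_prob, multiclass="ovr", average="weighted")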


def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None):