diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index b8bbab30930b4..0d0fba50e8737 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -13,11 +13,13 @@ # License: BSD 3 clause from __future__ import division +import itertools import numpy as np from ..utils import check_array, check_consistent_length from ..utils.multiclass import type_of_target +from ..preprocessing import LabelBinarizer def _average_binary_score(binary_metric, y_true, y_score, average, @@ -33,7 +35,8 @@ def _average_binary_score(binary_metric, y_true, y_score, average, Target scores, can either be probability estimates of the positive class, confidence values, or binary decisions. - average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] + average : string, {None, 'micro', 'macro', 'samples', 'weighted'}, + default 'macro' If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: @@ -122,3 +125,125 @@ def _average_binary_score(binary_metric, y_true, y_score, average, return np.average(score, weights=average_weight) else: return score + + +def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): + """Uses the binary metric for one-vs-one multiclass classification, + where the score is computed according to the Hand & Till (2001) algorithm. + + Parameters + ---------- + y_true : array, shape = [n_samples] + True multiclass labels. + Assumes labels have been recoded to 0 to n_classes - 1. + + y_score : array, shape = [n_samples, n_classes] + Target scores corresponding to probability estimates of a sample + belonging to a particular class. + + average : 'macro' or 'weighted', default='macro' + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. Classes + are assumed to be uniformly distributed. + ``'weighted'``: + Calculate metrics for each label, taking into account the + prevalence of the classes. + + binary_metric : callable, the binary metric function to use. + Accepts the following as input: + y_true_target : array, shape = [n_samples_target] + Some sub-array of y_true for a pair of classes designated + positive and negative in the one-vs-one scheme.
+ y_score_target : array, shape = [n_samples_target] + Scores corresponding to the probability estimates + of a sample belonging to the designated positive class label. + + Returns + ------- + score : float + Average of the pairwise binary metric scores. + """ + n_classes = len(np.unique(y_true)) + n_pairs = n_classes * (n_classes - 1) // 2 + prevalence = np.empty(n_pairs) + pair_scores = np.empty(n_pairs) + + for ix, (a, b) in enumerate(itertools.combinations(range(n_classes), 2)): + a_mask = y_true == a + ab_mask = np.logical_or(a_mask, y_true == b) + + prevalence[ix] = np.sum(ab_mask) / len(y_true) + + y_score_filtered = y_score[ab_mask] + + a_true = a_mask[ab_mask] + b_true = np.logical_not(a_true) + + a_true_score = binary_metric( + a_true, y_score_filtered[:, a]) + b_true_score = binary_metric( + b_true, y_score_filtered[:, b]) + binary_avg_score = (a_true_score + b_true_score) / 2 + pair_scores[ix] = binary_avg_score + + return (np.average(pair_scores, weights=prevalence) + if average == "weighted" else np.average(pair_scores)) + + +def _average_multiclass_ovr_score(binary_metric, y_true, y_score, average): + """Uses the binary metric for one-vs-rest multi-class classification, + where the score is computed according to the Provost & Domingos (2001) + definition of the AUC in multi-class settings (when the `average` parameter + is set to `weighted`). + + For each class, the ROC curve is generated and the AUC computed. + The output is the average of the individual AUCs weighted by the prevalence + of the classes in the data. + + Parameters + ---------- + y_true : array, shape = [n_samples] + True multiclass labels. + Assumes labels have been recoded to 0 to n_classes - 1. + + y_score : array, shape = [n_samples, n_classes] + Target scores corresponding to probability estimates of a sample + belonging to a particular class. + + average : 'macro' or 'weighted', default='macro' + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. Classes + are assumed to be uniformly distributed. + ``'weighted'``: + Calculate metrics for each label, taking into account the + prevalence of the classes in the dataset. + + binary_metric : callable, the binary metric function to use. + Accepts the following as input: + y_true_target : array, shape = [n_samples_target] + Binary indicator of y_true for the class designated positive + in the one-vs-rest scheme.
+ y_score_target : array, shape = [n_samples_target] + Scores corresponding to the probability estimates + of a sample belonging to the designated positive class label. + + Returns + ------- + score : float + Average of the per-class binary metric scores. + """ + n_classes = len(np.unique(y_true)) + scores = np.zeros((n_classes,)) + + y_true_multilabel = LabelBinarizer().fit_transform(y_true) + prevalence = np.sum(y_true_multilabel, axis=0) / y_true_multilabel.shape[0] + + for c in range(n_classes): + y_true_c = y_true_multilabel.take([c], axis=1).ravel() + y_score_c = y_score.take([c], axis=1).ravel() + scores[c] = binary_metric(y_true_c, y_score_c) + + return (np.average(scores, weights=prevalence) + if average == "weighted" else np.average(scores)) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index d612e913d2a06..505149749cb55 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -31,9 +31,10 @@ from ..utils.extmath import stable_cumsum from ..utils.sparsefuncs import count_nonzero from ..exceptions import UndefinedMetricWarning -from ..preprocessing import label_binarize +from ..preprocessing import LabelBinarizer, label_binarize -from .base import _average_binary_score +from .base import _average_binary_score, _average_multiclass_ovo_score, \ + _average_multiclass_ovr_score def auc(x, y, reorder='deprecated'): @@ -157,7 +158,8 @@ def average_precision_score(y_true, y_score, average="macro", class, confidence values, or non-thresholded measure of decisions (as returned by "decision_function" on some classifiers). - average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] + average : string, {None, 'micro', 'macro', 'samples', 'weighted'}, + default 'macro' If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: @@ -222,29 +224,39 @@ def _binary_uninterpolated_average_precision( sample_weight=sample_weight) -def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, - max_fpr=None): - """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) - from prediction scores. - - Note: this implementation is restricted to the binary classification task - or multilabel classification task in label indicator format. +def roc_auc_score(y_true, y_score, multiclass="ovr", average="macro", + sample_weight=None, max_fpr=None): + """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) + from prediction scores. Read more in the :ref:`User Guide <roc_metrics>`. Parameters ---------- y_true : array, shape = [n_samples] or [n_samples, n_classes] - True binary labels or binary label indicators. + True binary labels or binary label indicators. + The multiclass case expects shape = [n_samples] and labels + with values from 0 to (n_classes-1), inclusive. y_score : array, shape = [n_samples] or [n_samples, n_classes] Target scores, can either be probability estimates of the positive class, confidence values, or non-thresholded measure of decisions - (as returned by "decision_function" on some classifiers). For binary - y_true, y_score is supposed to be the score of the class with greater - label. - - average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] + (as returned by "decision_function" on some classifiers). + The multiclass case expects shape = [n_samples, n_classes] + where the scores correspond to probability estimates. + + multiclass : string, 'ovr' or 'ovo', default 'ovr' + Note: multiclass ROC AUC currently only handles the 'macro' and + 'weighted' averages.
+ + ``'ovr'``: + Calculate metrics for the multiclass case using the one-vs-rest + approach. + ``'ovo'``: + Calculate metrics for the multiclass case using the one-vs-one + approach. + + average : string, {None, 'micro', 'macro', 'samples', 'weighted'}, + default 'macro' If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: @@ -265,7 +277,9 @@ def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, max_fpr : float > 0 and <= 1, optional If not ``None``, the standardized partial AUC [3]_ over the range - [0, max_fpr] is returned. + [0, max_fpr] is returned. For the multiclass case, this should be + either ``None`` or ``1.0``, as partial AUC computation is not + currently supported in that setting. Returns ------- @@ -326,13 +340,65 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): return 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area)) y_type = type_of_target(y_true) - if y_type == "binary": + y_true = check_array(y_true, ensure_2d=False, dtype=None) + y_score = check_array(y_score, ensure_2d=False) + + if y_type == "multiclass" or (y_type == "binary" and + y_score.ndim == 2 and + y_score.shape[1] > 2): + # validation of the input y_score + if not np.allclose(1, y_score.sum(axis=1)): + raise ValueError("Target scores should sum up to 1.0 for all" + " samples.") + + # do not support partial ROC computation for multiclass + if max_fpr is not None and max_fpr != 1.: + raise ValueError("Partial AUC computation not available in " + "multiclass setting. Parameter 'max_fpr' must be" + " set to `None`. Received `max_fpr={0}` " + "instead.".format(max_fpr)) + + # validation for multiclass parameter specifications + average_options = ("macro", "weighted") + if average not in average_options: + raise ValueError("Parameter 'average' must be one of {0} for" + " multiclass problems.".format(average_options)) + multiclass_options = ("ovo", "ovr") + if multiclass not in multiclass_options: + raise ValueError("Parameter multiclass='{0}' is not supported" + " for multiclass ROC AUC. 'multiclass' must be" + " one of {1}.".format( + multiclass, multiclass_options)) + if sample_weight is not None: + # TODO: check whether this is needed only in the ovo case; + # if so, do not raise for ovr + raise ValueError("Parameter 'sample_weight' is not supported" + " for multiclass one-vs-one ROC AUC."
+ " 'sample_weight' must be None in this case.") + + if multiclass == "ovo": + # Hand & Till (2001) implementation + return _average_multiclass_ovo_score( + _binary_roc_auc_score, y_true, y_score, average) + elif multiclass == "ovr" and average == "weighted": + # Provost & Domingos (2001) implementation + return _average_multiclass_ovr_score( + _binary_roc_auc_score, y_true, y_score, average) + else: + y_true = y_true.reshape((-1, 1)) + y_true_multilabel = LabelBinarizer().fit_transform(y_true) + return _average_binary_score( + _binary_roc_auc_score, y_true_multilabel, y_score, average, + sample_weight=sample_weight) + elif y_type == "binary": labels = np.unique(y_true) y_true = label_binarize(y_true, labels)[:, 0] - - return _average_binary_score( - _binary_roc_auc_score, y_true, y_score, average, - sample_weight=sample_weight) + return _average_binary_score( + _binary_roc_auc_score, y_true, y_score, average, + sample_weight=sample_weight) + else: + return _average_binary_score( + _binary_roc_auc_score, y_true, y_score, average, + sample_weight=sample_weight) def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 07c35c609358d..a503b0b84078e 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -471,6 +471,136 @@ def test_deprecated_auc_reorder(): [1, 2], [2, 3], reorder=True) +def test_multi_ovo_auc_toydata(): + # Tests the one-vs-one multiclass ROC AUC algorithm + # on a small example, representative of an expected use case. + y_true = np.array([0, 1, 0, 2]) + n_labels = len(np.unique(y_true)) + y_scores = np.array( + [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]]) + + # Used to compute the expected output. + # Consider labels 0 and 1: + # positive label is 0, negative label is 1 + score_01 = roc_auc_score([1, 0, 1], [0.1, 0.3, 0.35]) + # positive label is 1, negative label is 0 + score_10 = roc_auc_score([0, 1, 0], [0.8, 0.4, 0.5]) + average_score_01 = (score_01 + score_10) / 2. + + # Consider labels 0 and 2: + score_02 = roc_auc_score([1, 1, 0], [0.1, 0.35, 0]) + score_20 = roc_auc_score([0, 0, 1], [0.1, 0.15, 0.8]) + average_score_02 = (score_02 + score_20) / 2. + + # Consider labels 1 and 2: + score_12 = roc_auc_score([1, 0], [0.4, 0.2]) + score_21 = roc_auc_score([0, 1], [0.3, 0.8]) + average_score_12 = (score_12 + score_21) / 2. + + # Unweighted, one-vs-one multiclass ROC AUC algorithm + sum_avg_scores = average_score_01 + average_score_02 + average_score_12 + ovo_unweighted_coefficient = 2. / (n_labels * (n_labels - 1)) + ovo_unweighted_score = ovo_unweighted_coefficient * sum_avg_scores + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovo"), + ovo_unweighted_score) + + # Weighted, one-vs-one multiclass ROC AUC algorithm + # Each term is weighted by the prevalence for the positive label. + pair_scores = [average_score_01, average_score_02, average_score_12] + prevalence = [0.75, 0.75, 0.50] + ovo_weighted_score = np.average(pair_scores, weights=prevalence) + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovo", average="weighted"), + ovo_weighted_score) + + +def test_multi_ovr_auc_toydata(): + # Tests the unweighted, one-vs-rest multiclass ROC AUC algorithm + # on a small example, representative of an expected use case. 
+ y_true = np.array([0, 1, 2, 2]) + y_scores = np.array( + [[1.0, 0.0, 0.0], [0.1, 0.5, 0.4], [0.1, 0.1, 0.8], [0.3, 0.3, 0.4]]) + # Compute the expected result by individually computing the 'one-vs-rest' + # ROC AUC scores for classes 0, 1, and 2. + out_0 = roc_auc_score([1, 0, 0, 0], y_scores[:, 0]) + out_1 = roc_auc_score([0, 1, 0, 0], y_scores[:, 1]) + out_2 = roc_auc_score([0, 0, 1, 1], y_scores[:, 2]) + result_unweighted = (out_0 + out_1 + out_2) / 3. + + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovr"), + result_unweighted) + + # Tests the weighted, one-vs-rest multiclass ROC AUC algorithm + # on the same input (Provost & Domingos, 2001) + result_weighted = out_0 * 0.25 + out_1 * 0.25 + out_2 * 0.5 + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovr", average="weighted"), + result_weighted) + + +def test_multi_auc_score_under_permutation(): + y_score = np.random.rand(100, 3) + # Normalize the scores for each row + row_sums = y_score.sum(axis=1) + y_score = y_score / row_sums[:, np.newaxis] + # Generate the true labels + y_true = np.argmax(y_score, axis=1) + y_true[np.random.randint(len(y_score), size=20)] = np.random.randint( + 2, size=20) + for multiclass in ['ovr', 'ovo']: + for average in ['macro', 'weighted']: + same_score_under_permutation = None + for perm in [[0, 1, 2], [0, 2, 1], [1, 0, 2], + [1, 2, 0], [2, 0, 1], [2, 1, 0]]: + inv_perm = np.zeros(3, dtype=int) + inv_perm[perm] = np.arange(3) + y_score_perm = y_score[:, inv_perm] + y_true_perm = np.take(perm, y_true) + score = roc_auc_score(y_true_perm, y_score_perm, + multiclass=multiclass, average=average) + if same_score_under_permutation is None: + same_score_under_permutation = score + else: + assert_almost_equal(score, same_score_under_permutation) + + +def test_auc_score_multi_error(): + # Test that roc_auc_score function returns an error when trying + # to compute multiclass AUC for parameters where an output + # is not defined. + rng = check_random_state(404) + y_pred = rng.rand(10, 3) + row_sums = y_pred.sum(axis=1) + y_pred = y_pred / row_sums[:, np.newaxis] + y_true = rng.randint(0, 3, size=10) + average_error_msg = ("Parameter 'average' must be one of " + "('macro', 'weighted') for multiclass problems.") + assert_raise_message(ValueError, average_error_msg, + roc_auc_score, y_true, y_pred, average="samples") + assert_raise_message(ValueError, average_error_msg, + roc_auc_score, y_true, y_pred, average="micro") + multiclass_error_msg = ("Parameter multiclass='invalid' is not " + "supported for multiclass ROC AUC. 'multiclass' " + "must be one of ('ovo', 'ovr').") + assert_raise_message(ValueError, multiclass_error_msg, + roc_auc_score, y_true, y_pred, multiclass="invalid") + sample_weight_error_msg = ("Parameter 'sample_weight' is not supported " + "for multiclass one-vs-one ROC AUC. " + "'sample_weight' must be None in this case.") + assert_raise_message(ValueError, sample_weight_error_msg, + roc_auc_score, y_true, y_pred, + multiclass="ovo", sample_weight=[]) + partial_comp_error_msg = ("Partial AUC computation not available in " + "multiclass setting. Parameter 'max_fpr' must " + "be set to `None`. Received `max_fpr=0.5` " + "instead.") + assert_raise_message(ValueError, partial_comp_error_msg, + roc_auc_score, y_true, y_pred, + multiclass="ovo", max_fpr=0.5) + + def test_auc_score_non_binary_class(): # Test that roc_auc_score function returns an error when trying # to compute AUC for non-binary class values. 
@@ -486,10 +616,6 @@ def test_auc_score_non_binary_class(): y_true = -np.ones(10, dtype="int") assert_raise_message(ValueError, "ROC AUC score is not defined", roc_auc_score, y_true, y_pred) - # y_true contains three different class values - y_true = rng.randint(0, 3, size=10) - assert_raise_message(ValueError, "multiclass format is not supported", - roc_auc_score, y_true, y_pred) clean_warning_registry() with warnings.catch_warnings(record=True): @@ -506,11 +632,6 @@ def test_auc_score_non_binary_class(): assert_raise_message(ValueError, "ROC AUC score is not defined", roc_auc_score, y_true, y_pred) - # y_true contains three different class values - y_true = rng.randint(0, 3, size=10) - assert_raise_message(ValueError, "multiclass format is not supported", - roc_auc_score, y_true, y_pred) - def test_binary_clf_curve(): rng = check_random_state(404) @@ -520,6 +641,7 @@ def test_binary_clf_curve(): assert_raise_message(ValueError, msg, precision_recall_curve, y_true, y_pred) + def test_precision_recall_curve(): y_true, _, probas_pred = make_prediction(binary=True) _test_precision_recall_curve(y_true, probas_pred) @@ -704,7 +826,6 @@ def test_score_scale_invariance(): # issue #3864 (and others), where overly aggressive rounding was causing # problems for users with very small y_score values y_true, _, probas_pred = make_prediction(binary=True) - roc_auc = roc_auc_score(y_true, probas_pred) roc_auc_scaled_up = roc_auc_score(y_true, 100 * probas_pred) roc_auc_scaled_down = roc_auc_score(y_true, 1e-6 * probas_pred)
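Reviewer note (not part of the patch): a minimal usage sketch of the API added above, assuming a scikit-learn build that includes this branch. The ``multiclass`` and ``average`` keyword arguments are exactly the ones introduced in ``roc_auc_score`` in this diff, and the toy inputs mirror ``test_multi_ovo_auc_toydata``; everything else is stock NumPy.

import numpy as np
from sklearn.metrics import roc_auc_score  # the patched version from this branch

# Labels must be the integers 0..n_classes-1 and every row of y_score must
# sum to 1.0, per the validation added to roc_auc_score above.
y_true = np.array([0, 1, 0, 2])
y_score = np.array([[0.10, 0.80, 0.10],
                    [0.30, 0.40, 0.30],
                    [0.35, 0.50, 0.15],
                    [0.00, 0.20, 0.80]])

# One-vs-rest averaging (Provost & Domingos 2001 when average="weighted").
print(roc_auc_score(y_true, y_score, multiclass="ovr", average="macro"))
print(roc_auc_score(y_true, y_score, multiclass="ovr", average="weighted"))

# One-vs-one averaging (Hand & Till 2001).
print(roc_auc_score(y_true, y_score, multiclass="ovo", average="macro"))
print(roc_auc_score(y_true, y_score, multiclass="ovo", average="weighted"))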
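For comparison, the following is a self-contained sketch of the one-vs-one averaging performed by ``_average_multiclass_ovo_score``, written against only the stock binary ``roc_auc_score``. The helper name ``ovo_roc_auc`` is illustrative, not part of the patch, and it assumes the labels are already the integers 0 to n_classes - 1 so they can index the columns of ``y_score``.

import itertools

import numpy as np
from sklearn.metrics import roc_auc_score  # stock binary ROC AUC


def ovo_roc_auc(y_true, y_score, average="macro"):
    # One-vs-one averaging in the spirit of Hand & Till (2001), mirroring
    # _average_multiclass_ovo_score above.
    classes = np.unique(y_true)
    pair_scores, prevalence = [], []
    for a, b in itertools.combinations(classes, 2):
        ab_mask = np.isin(y_true, [a, b])
        prevalence.append(ab_mask.mean())   # weight used when average="weighted"
        a_true = y_true[ab_mask] == a       # class a plays the positive role
        # Average the two directed AUCs, A(a | b) and A(b | a).
        score_a = roc_auc_score(a_true, y_score[ab_mask, a])
        score_b = roc_auc_score(~a_true, y_score[ab_mask, b])
        pair_scores.append((score_a + score_b) / 2)
    weights = prevalence if average == "weighted" else None
    return np.average(pair_scores, weights=weights)

On the toy data above, this should reproduce the values returned by ``roc_auc_score(..., multiclass="ovo")``, since it follows the same pairwise masking and averaging as the patched helper.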
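And the one-vs-rest counterpart, mirroring ``_average_multiclass_ovr_score`` with the stock ``label_binarize``; again, ``ovr_roc_auc`` is only an illustrative name, and the sketch assumes three or more classes so that binarization yields one indicator column per class.

import numpy as np
from sklearn.metrics import roc_auc_score          # stock binary ROC AUC
from sklearn.preprocessing import label_binarize   # stock scikit-learn


def ovr_roc_auc(y_true, y_score, average="macro"):
    # One-vs-rest averaging, mirroring _average_multiclass_ovr_score above:
    # one binary AUC per class column, optionally weighted by class prevalence
    # (the Provost & Domingos 2001 definition when average="weighted").
    classes = np.unique(y_true)
    y_true_bin = label_binarize(y_true, classes=classes)  # shape (n_samples, n_classes)
    scores = [roc_auc_score(y_true_bin[:, c], y_score[:, c])
              for c in range(len(classes))]
    if average == "weighted":
        return np.average(scores, weights=y_true_bin.mean(axis=0))
    return float(np.mean(scores))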