diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 64a83ec7acd9f..8486c0e9686bc 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -7,7 +7,7 @@ recall_score, fbeta_score, f1_score, zero_one_score, \ precision_recall_fscore_support, classification_report, \ precision_recall_curve, explained_variance_score, r2_score, \ - zero_one, mean_square_error, hinge_loss + zero_one, mean_square_error, hinge_loss, matthews_corrcoef from . import cluster from .cluster import adjusted_rand_score diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index bf4dc9572d22f..5b64e21c0e5cf 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -212,7 +212,7 @@ def auc(x, y): return area -def precision_score(y_true, y_pred, pos_label=1): +def precision_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): """Compute the precision The precision is the ratio :math:`tp / (tp + fp)` where tp is the @@ -230,28 +230,40 @@ def precision_score(y_true, y_pred, pos_label=1): y_pred : array, shape = [n_samples] predicted targets + labels : array + integer array of labels + pos_label : int - in the binary classification case, give the label of the - positive class (default is 1). Everything else but 'pos_label' + in the binary classification case, give the label of the positive + class (default is 1). Everything else but 'pos_label' is considered to belong to the negative class. - Not used in the case of multiclass classification. + Set to None in the case of multiclass classification. + + average : string, ['micro', 'macro', 'weighted'(default)] + in the multiclass classification case, this determines the + type of averaging performed on the data. + macro: average over classes (does not take imbalance into account) + micro: average over instances (takes imbalance into account) + implies that precision == recall == f1 + weighted: average weighted by support (takes imbalance into account) + can have f1 score that is not between precision and recall Returns ------- precision : float precision of the positive class in binary classification or - weighted avergage of the precision of each class for the + weighted average of the precision of each class for the multiclass task """ - p, _, _, s = precision_recall_fscore_support(y_true, y_pred) - if p.shape[0] == 2: - return p[pos_label] - else: - return np.average(p, weights=s) + p, _, _, _ = precision_recall_fscore_support(y_true, y_pred, + labels=labels, + pos_label=pos_label, + average=average) + return p -def recall_score(y_true, y_pred, pos_label=1): +def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): """Compute the recall The recall is the ratio :math:`tp / (tp + fn)` where tp is the number of @@ -268,27 +280,40 @@ def recall_score(y_true, y_pred, pos_label=1): y_pred : array, shape = [n_samples] predicted targets + labels : array + integer array of labels + pos_label : int in the binary classification case, give the label of the positive class (default is 1). Everything else but 'pos_label' is considered to belong to the negative class. - Not used in the case of multiclass classification. + Set to None in the case of multiclass classification. + + average : string, [None, 'micro', 'macro', 'weighted'(default)] + in the multiclass classification case, this determines the + type of averaging performed on the data. 
+ macro: average over classes (does not take imbalance into account) + micro: average over instances (takes imbalance into account) + implies that precision == recall == f1 + weighted: average weighted by support (takes imbalance into account) + can have f1 score that is not between precision and recall Returns ------- recall : float recall of the positive class in binary classification or weighted - avergage of the recall of each class for the multiclass task. + average of the recall of each class for the multiclass task. """ - _, r, _, s = precision_recall_fscore_support(y_true, y_pred) - if r.shape[0] == 2: - return r[pos_label] - else: - return np.average(r, weights=s) + _, r, _, _ = precision_recall_fscore_support(y_true, y_pred, + labels=labels, + pos_label=pos_label, + average=average) + return r -def fbeta_score(y_true, y_pred, beta, pos_label=1): +def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, + average='weighted'): """Compute fbeta score The F_beta score is the weighted harmonic mean of precision and recall, @@ -308,17 +333,29 @@ def fbeta_score(y_true, y_pred, beta, pos_label=1): beta: float + labels : array + integer array of labels + pos_label : int in the binary classification case, give the label of the positive class (default is 1). Everything else but 'pos_label' is considered to belong to the negative class. - Not used in the case of multiclass classification. + Set to None in the case of multiclass classification. + + average : string, [None, 'micro', 'macro', 'weighted'(default)] + in the multiclass classification case, this determines the + type of averaging performed on the data. + macro: average over classes (does not take imbalance into account) + micro: average over instances (takes imbalance into account) + implies that precision == recall == f1 + weighted: average weighted by support (takes imbalance into account) + can have f1 score that is not between precision and recall Returns ------- fbeta_score : float fbeta_score of the positive class in binary classification or weighted - avergage of the fbeta_score of each class for the multiclass task. + average of the fbeta_score of each class for the multiclass task. See also -------- @@ -328,14 +365,15 @@ class (default is 1). Everything else but 'pos_label' http://en.wikipedia.org/wiki/F1_score """ - _, _, f, s = precision_recall_fscore_support(y_true, y_pred, beta=beta) - if f.shape[0] == 2: - return f[pos_label] - else: - return np.average(f, weights=s) + _, _, f, _ = precision_recall_fscore_support(y_true, y_pred, + beta=beta, + labels=labels, + pos_label=pos_label, + average=average) + return f -def f1_score(y_true, y_pred, pos_label=1): +def f1_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): """Compute f1 score The F1 score can be interpreted as a weighted average of the precision @@ -358,17 +396,29 @@ def f1_score(y_true, y_pred, pos_label=1): y_pred : array, shape = [n_samples] predicted targets + labels : array + integer array of labels + pos_label : int - in the binary classification case, give the label of the positive class - (default is 1). Everything else but 'pos_label' + in the binary classification case, give the label of the positive + class (default is 1). Everything else but 'pos_label' is considered to belong to the negative class. - Not used in the case of multiclass classification. + Set to None in the case of multiclass classification. 
+
+    average : string, [None, 'micro', 'macro', 'weighted'(default)]
+        in the multiclass classification case, this determines the
+        type of averaging performed on the data.
+        macro: average over classes (does not take imbalance into account)
+        micro: average over instances (takes imbalance into account)
+               implies that precision == recall == f1
+        weighted: average weighted by support (takes imbalance into account)
+                  can have f1 score that is not between precision and recall

     Returns
     -------
     f1_score : float
         f1_score of the positive class in binary classification or weighted
-        avergage of the f1_scores of each class for the multiclass task
+        average of the f1_scores of each class for the multiclass task

     Notes
     -----
@@ -376,10 +426,12 @@ def f1_score(y_true, y_pred, pos_label=1):
     http://en.wikipedia.org/wiki/F1_score

     """
-    return fbeta_score(y_true, y_pred, 1, pos_label=pos_label)
+    return fbeta_score(y_true, y_pred, 1, labels=labels,
+                       pos_label=pos_label, average=average)


-def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None):
+def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
+                                    pos_label=1, average="weighted"):
     """Compute precisions, recalls, f-measures and support for each class

     The precision is the ratio :math:`tp / (tp + fp)` where tp is the number of
@@ -400,6 +452,9 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None):

     The support is the number of occurrences of each class in y_true.

+    If `average` is not None, the averaged precision, recall and f-measure
+    for the multiclass task are returned ('micro', 'macro' or 'weighted').
+
     Parameters
     ----------
     y_true : array, shape = [n_samples]
@@ -411,6 +466,24 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None):
     beta : float, 1.0 by default
         the strength of recall versus precision in the f-score

+    labels : array
+        integer array of labels
+
+    pos_label : int
+        in the binary classification case, give the label of the positive
+        class (default is 1). Everything else but 'pos_label'
+        is considered to belong to the negative class.
+        Set to None in the case of multiclass classification.
+
+    average : string, [None, 'micro', 'macro', 'weighted'(default)]
+        in the multiclass classification case, this determines the
+        type of averaging performed on the data.
+        macro: average over classes (does not take imbalance into account)
+        micro: average over instances (takes imbalance into account)
+               implies that precision == recall == f1
+        weighted: average weighted by support (takes imbalance into account)
+                  can have f1 score that is not between precision and recall
+
     Returns
     -------
     precision: array, shape = [n_unique_labels], dtype = np.double
@@ -467,7 +540,79 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None):
     finally:
         np.seterr(**old_err_settings)

-    return precision, recall, fscore, support
+    if not average:
+        return precision, recall, fscore, support
+
+    elif n_labels == 2:
+        if pos_label not in labels:
+            raise ValueError("pos_label=%r is not a valid label: %r" %
+                             (pos_label, labels))
+        pos_label_idx = list(labels).index(pos_label)
+        return (precision[pos_label_idx], recall[pos_label_idx],
+                fscore[pos_label_idx], support[pos_label_idx])
+    else:
+        average_options = (None, 'micro', 'macro', 'weighted')
+        if average == 'micro':
+            avg_precision = true_pos.sum() / (true_pos.sum() +
+                                              false_pos.sum())
+            avg_recall = true_pos.sum() / (true_pos.sum() + false_neg.sum())
+            avg_fscore = (1 + beta2) * (avg_precision * avg_recall) / \
+                         (beta2 * avg_precision + avg_recall)
+        elif average == 'macro':
+            avg_precision = np.mean(precision)
+            avg_recall = np.mean(recall)
+            avg_fscore = np.mean(fscore)
+        elif average == 'weighted':
+            avg_precision = np.average(precision, weights=support)
+            avg_recall = np.average(recall, weights=support)
+            avg_fscore = np.average(fscore, weights=support)
+        else:
+            raise ValueError('average has to be one of ' +
+                             str(average_options))
+
+        return avg_precision, avg_recall, avg_fscore, None
+
+
+def matthews_corrcoef(y_true, y_pred):
+    """Compute the Matthews correlation coefficient for binary classes
+
+    The Matthews correlation coefficient is used in machine learning as a
+    measure of the quality of binary (two-class) classifications. It takes
+    into account true and false positives and negatives and is generally
+    regarded as a balanced measure which can be used even if the classes are
+    of very different sizes. The MCC is in essence a correlation coefficient
+    value between -1 and +1. A coefficient of +1 represents a perfect
+    prediction, 0 an average random prediction and -1 an inverse prediction.
+    The statistic is also known as the phi coefficient. [source: Wikipedia]
+
+    Only in the binary case does this relate to information about true and
+    false positives and negatives. See references below.
+
+    Parameters
+    ----------
+    y_true : array, shape = [n_samples]
+        true targets
+
+    y_pred : array, shape = [n_samples]
+        estimated targets
+
+    Returns
+    -------
+    mcc : float
+        Matthews correlation coefficient (+1 represents a perfect prediction,
+        0 an average random prediction and -1 an inverse prediction).
+
+    References
+    ----------
+    http://en.wikipedia.org/wiki/Matthews_correlation_coefficient
+    http://dx.doi.org/10.1093/bioinformatics/16.5.412
+
+    """
+    mcc = np.corrcoef(y_true, y_pred)[0, 1]
+    if np.isnan(mcc):
+        return 0.
+ else: + return mcc def classification_report(y_true, y_pred, labels=None, target_names=None): @@ -519,7 +664,9 @@ def classification_report(y_true, y_pred, labels=None, target_names=None): report += '\n' p, r, f1, s = precision_recall_fscore_support(y_true, y_pred, - labels=labels) + labels=labels, + average=None) + for i, label in enumerate(labels): values = [target_names[i]] for v in (p[i], r[i], f1[i]): diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index 237fade95525c..41af3b9f679a4 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -4,7 +4,7 @@ from nose.tools import raises from nose.tools import assert_true from numpy.testing import assert_array_almost_equal -from numpy.testing import assert_array_equal +from numpy.testing import assert_array_equal, assert_raises from numpy.testing import assert_equal, assert_almost_equal from ... import datasets @@ -15,6 +15,7 @@ from ..metrics import explained_variance_score from ..metrics import r2_score from ..metrics import f1_score +from ..metrics import matthews_corrcoef from ..metrics import mean_square_error from ..metrics import precision_recall_curve from ..metrics import precision_recall_fscore_support @@ -138,7 +139,7 @@ def test_precision_recall_f1_score_binary(): y_true, y_pred, _ = make_prediction(binary=True) # detailed measures for each class - p, r, f, s = precision_recall_fscore_support(y_true, y_pred) + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None) assert_array_almost_equal(p, [0.73, 0.75], 2) assert_array_almost_equal(r, [0.76, 0.72], 2) assert_array_almost_equal(f, [0.75, 0.74], 2) @@ -164,34 +165,63 @@ def test_confusion_matrix_binary(): cm = confusion_matrix(y_true, y_pred) assert_array_equal(cm, [[19, 6], [7, 18]]) + tp = cm[0, 0] + tn = cm[1, 1] + fp = cm[0, 1] + fn = cm[1, 0] + num = (tp * tn - fp * fn) + den = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) + if den == 0.: + true_mcc = 0 + else: + true_mcc = num / den + mcc = matthews_corrcoef(y_true, y_pred) + assert_array_almost_equal(mcc, true_mcc, decimal=2) + assert_array_almost_equal(mcc, 0.48, decimal=2) + def test_precision_recall_f1_score_multiclass(): """Test Precision Recall and F1 Score for multiclass classification task""" y_true, y_pred, _ = make_prediction(binary=False) # compute scores with default labels introspection - p, r, f, s = precision_recall_fscore_support(y_true, y_pred) + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None) assert_array_almost_equal(p, [0.82, 0.55, 0.47], 2) assert_array_almost_equal(r, [0.92, 0.17, 0.90], 2) assert_array_almost_equal(f, [0.87, 0.26, 0.62], 2) assert_array_equal(s, [25, 30, 20]) - # individual scoring function that can be used for grid search: in the - # multiclass case the score is the wieghthed average of the individual - # class values hence f1_score is not necessary between precision_score and - # recall_score - ps = precision_score(y_true, y_pred) + # averaging tests + ps = precision_score(y_true, y_pred, pos_label=1, average='micro') + assert_array_almost_equal(ps, 0.61, 2) + + rs = recall_score(y_true, y_pred, average='micro') + assert_array_almost_equal(rs, 0.61, 2) + + fs = f1_score(y_true, y_pred, average='micro') + assert_array_almost_equal(fs, 0.61, 2) + + ps = precision_score(y_true, y_pred, average='macro') assert_array_almost_equal(ps, 0.62, 2) - rs = recall_score(y_true, y_pred) + rs = recall_score(y_true, y_pred, average='macro') + 
assert_array_almost_equal(rs, 0.66, 2) + + fs = f1_score(y_true, y_pred, average='macro') + assert_array_almost_equal(fs, 0.58, 2) + + ps = precision_score(y_true, y_pred, average='weighted') + assert_array_almost_equal(ps, 0.62, 2) + + rs = recall_score(y_true, y_pred, average='weighted') assert_array_almost_equal(rs, 0.61, 2) - fs = f1_score(y_true, y_pred) - assert_array_almost_equal(fs, 0.56, 2) + fs = f1_score(y_true, y_pred, average='weighted') + assert_array_almost_equal(fs, 0.55, 2) # same prediction but with and explicit label ordering p, r, f, s = precision_recall_fscore_support( - y_true, y_pred, labels=[0, 2, 1]) + y_true, y_pred, labels=[0, 2, 1], average=None) assert_array_almost_equal(p, [0.82, 0.47, 0.55], 2) assert_array_almost_equal(r, [0.92, 0.90, 0.17], 2) assert_array_almost_equal(f, [0.87, 0.62, 0.26], 2) @@ -207,9 +237,12 @@ def test_zero_precision_recall(): y_true = np.array([0, 1, 2, 0, 1, 2]) y_pred = np.array([2, 0, 1, 1, 2, 0]) - assert_almost_equal(precision_score(y_true, y_pred), 0.0, 2) - assert_almost_equal(recall_score(y_true, y_pred), 0.0, 2) - assert_almost_equal(f1_score(y_true, y_pred), 0.0, 2) + assert_almost_equal(precision_score(y_true, y_pred, average='weighted'), + 0.0, 2) + assert_almost_equal(recall_score(y_true, y_pred, average='weighted'), + 0.0, 2) + assert_almost_equal(f1_score(y_true, y_pred, average='weighted'), + 0.0, 2) finally: np.seterr(**old_error_settings)
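For reviewers who want to exercise the patch locally, here is a minimal usage sketch. It is not part of the diff; it assumes the patched sklearn.metrics is importable, and the toy label arrays are made up purely for illustration.

from __future__ import print_function  # keeps the sketch runnable on Python 2 or 3
import numpy as np
from sklearn.metrics import (precision_score, recall_score, f1_score,
                             precision_recall_fscore_support,
                             matthews_corrcoef)

# Made-up multiclass targets and predictions, for illustration only.
y_true = np.array([0, 1, 2, 2, 1, 0, 1, 2, 0])
y_pred = np.array([0, 2, 2, 2, 1, 0, 1, 0, 0])

# average=None keeps the pre-patch behaviour: one value per class.
p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)

# The new `average` keyword collapses the per-class values into one scalar.
for avg in ('micro', 'macro', 'weighted'):
    print(avg,
          precision_score(y_true, y_pred, average=avg),
          recall_score(y_true, y_pred, average=avg),
          f1_score(y_true, y_pred, average=avg))

# matthews_corrcoef is defined for binary targets only.
yb_true = np.array([0, 1, 1, 0, 1, 0, 1, 1])
yb_pred = np.array([0, 1, 0, 0, 1, 1, 1, 1])
print(matthews_corrcoef(yb_true, yb_pred))

As noted in the docstrings, 'micro' averaging makes precision, recall and f1 coincide, while 'weighted' averaging can yield an f1 score that does not lie between the weighted precision and recall.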