Commit de84a06

implement balanced_accuracy_score
1 parent 0117fb5 commit de84a06

5 files changed (+112, -1 lines changed)

sklearn/metrics/__init__.py (+2)

@@ -13,6 +13,7 @@
 from .ranking import roc_curve

 from .classification import accuracy_score
+from .classification import balanced_accuracy_score
 from .classification import classification_report
 from .classification import confusion_matrix
 from .classification import f1_score
@@ -59,6 +60,7 @@

 __all__ = [
     'accuracy_score',
+    'balanced_accuracy_score',
     'adjusted_mutual_info_score',
     'adjusted_rand_score',
     'auc',

sklearn/metrics/classification.py (+64)

@@ -186,6 +186,70 @@ def accuracy_score(y_true, y_pred, normalize=True, sample_weight=None):
     return _weighted_sum(score, sample_weight, normalize)


+def balanced_accuracy_score(y_true, y_pred):
+    """Balanced accuracy score
+
+    The balanced accuracy score is defined as
+    0.5 * true positives / (true positives + false negatives) +
+    0.5 * true negatives / (true negatives + false positives)
+
+    This is equal to the average of the recall on the positive class
+    and the recall on the negative class.
+
+    Parameters
+    ----------
+    y_true : array, shape = [n_samples]
+        Ground truth (correct) target values.
+
+    y_pred : array, shape = [n_samples]
+        Estimated targets as returned by a classifier.
+
+    Returns
+    -------
+    score : float
+        The balanced accuracy score.
+
+        The best performance is 1.
+
+    References
+    ----------
+    .. [1] `Wikipedia entry for balanced accuracy
+           <http://en.wikipedia.org/wiki/Accuracy_and_precision#In_binary_classification>`_
+
+    Examples
+    --------
+    >>> from sklearn.metrics import balanced_accuracy_score
+    >>> y_true = [0, 0, 1, 1]
+    >>> y_pred = [0, 1, 1, 1]
+    >>> balanced_accuracy_score(y_true, y_pred)  # doctest: +ELLIPSIS
+    0.75...
+
+    >>> y_true = [0, 1, 1, 1, 1]
+    >>> y_pred = [1, 1, 1, 1, 1]
+    >>> balanced_accuracy_score(y_true, y_pred)  # doctest: +ELLIPSIS
+    0.5...
+
+    >>> y_true = ['b', 'a', 'a', 'a']
+    >>> y_pred = ['a', 'a', 'b', 'a']
+    >>> balanced_accuracy_score(y_true, y_pred)  # doctest: +ELLIPSIS
+    0.33...
+
+    """
+    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
+    if y_type != 'binary':
+        raise ValueError("%s is not supported" % y_type)
+
+    # Label encoding
+    lb = LabelBinarizer()
+    y_true_binary = lb.fit_transform(y_true)
+    y_pred_binary = lb.transform(y_pred)
+
+    pos_recall = recall_score(y_true_binary, y_pred_binary)
+    neg_recall = recall_score(1 - y_true_binary, 1 - y_pred_binary)
+
+    return np.average([pos_recall, neg_recall])
+
+
 def confusion_matrix(y_true, y_pred, labels=None):
     """Compute confusion matrix to evaluate the accuracy of a classification
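As a sanity check on the formula in the new docstring, the snippet below (a minimal sketch, not part of the commit; the helper name check_balanced_accuracy is made up here) recomputes the score from the four confusion-matrix cells and compares it with the average of the per-class recalls:

    import numpy as np
    from sklearn.metrics import recall_score

    def check_balanced_accuracy(y_true, y_pred):
        # Hypothetical helper, not part of scikit-learn: counts the four
        # confusion-matrix cells for 0/1 labels and applies the docstring formula.
        y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
        tp = np.sum((y_true == 1) & (y_pred == 1))
        tn = np.sum((y_true == 0) & (y_pred == 0))
        fp = np.sum((y_true == 0) & (y_pred == 1))
        fn = np.sum((y_true == 1) & (y_pred == 0))
        return 0.5 * tp / (tp + fn) + 0.5 * tn / (tn + fp)

    y_true = [0, 0, 1, 1]
    y_pred = [0, 1, 1, 1]

    # Same value as the average of the positive-class and negative-class recall.
    pos_recall = recall_score(y_true, y_pred, pos_label=1)
    neg_recall = recall_score(y_true, y_pred, pos_label=0)
    print(check_balanced_accuracy(y_true, y_pred))  # 0.75
    print(0.5 * (pos_recall + neg_recall))          # 0.75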

sklearn/metrics/metrics.py (+1)

@@ -12,6 +12,7 @@
 from .ranking import roc_curve

 from .classification import accuracy_score
+from .classification import balanced_accuracy_score
 from .classification import classification_report
 from .classification import confusion_matrix
 from .classification import f1_score

sklearn/metrics/tests/test_classification.py (+38)

@@ -27,6 +27,7 @@
 from sklearn.utils.testing import ignore_warnings

 from sklearn.metrics import accuracy_score
+from sklearn.metrics import balanced_accuracy_score
 from sklearn.metrics import average_precision_score
 from sklearn.metrics import classification_report
 from sklearn.metrics import confusion_matrix
@@ -127,6 +128,43 @@ def test_multilabel_accuracy_score_subset_accuracy():
     assert_equal(accuracy_score(y2, [(), ()], normalize=False), 0)


+def test_balanced_accuracy_score():
+    # Test balanced accuracy score for a binary classification task
+
+    # test on an imbalanced data set
+    y_true = np.array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1])
+    y_pred = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
+
+    assert_equal(balanced_accuracy_score(y_true, y_pred), 0.5)
+
+    # test the function against the equation defined as
+    # 0.5 * true positives / (true positives + false negatives) +
+    # 0.5 * true negatives / (true negatives + false positives)
+    y_true, y_pred, _ = make_prediction(binary=True)
+    tn, fp, fn, tp = np.bincount(y_true * 2 + y_pred, minlength=4)
+    bas = 0.5 * tp / (tp + fn) + 0.5 * tn / (tn + fp)
+    assert_equal(balanced_accuracy_score(y_true, y_pred), bas)
+
+    # test using string labels
+    y_true = np.array(['a', 'b', 'a', 'b'])
+    y_pred = np.array(['a', 'b', 'a', 'a'])
+
+    assert_equal(balanced_accuracy_score(y_true, y_pred), 0.75)
+
+
+def test_balanced_accuracy_score_on_non_binary_class():
+    # Test that balanced_accuracy_score raises an error when trying
+    # to compute balanced_accuracy_score for a multiclass task.
+    rng = check_random_state(404)
+    y_pred = rng.randint(0, 3, size=10)
+
+    # y_true contains three different class values
+    y_true = rng.randint(0, 3, size=10)
+
+    assert_raise_message(ValueError, "multiclass is not supported",
+                         balanced_accuracy_score, y_true, y_pred)
+
+
 def test_precision_recall_f1_score_binary():
     # Test Precision Recall and F1 Score for binary classification task
     y_true, y_pred, _ = make_prediction(binary=True)
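One detail worth noting in test_balanced_accuracy_score: for 0/1 labels, y_true * 2 + y_pred maps every sample to a unique confusion-matrix cell (0 = TN, 1 = FP, 2 = FN, 3 = TP), so a single np.bincount call recovers all four counts. A quick sketch (not part of the commit) checking the trick against confusion_matrix:

    import numpy as np
    from sklearn.metrics import confusion_matrix

    y_true = np.array([0, 0, 1, 1, 1, 0])
    y_pred = np.array([0, 1, 1, 0, 1, 0])

    # Each sample lands in one of four bins: 0 -> TN, 1 -> FP, 2 -> FN, 3 -> TP.
    tn, fp, fn, tp = np.bincount(y_true * 2 + y_pred, minlength=4)

    # confusion_matrix returns [[TN, FP], [FN, TP]] for binary 0/1 labels.
    assert np.array_equal(np.array([[tn, fp], [fn, tp]]),
                          confusion_matrix(y_true, y_pred))
    print(tn, fp, fn, tp)  # 2 1 1 2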

sklearn/metrics/tests/test_common.py (+7, -1)

@@ -24,6 +24,7 @@
 from sklearn.utils.testing import ignore_warnings

 from sklearn.metrics import accuracy_score
+from sklearn.metrics import balanced_accuracy_score
 from sklearn.metrics import average_precision_score
 from sklearn.metrics import brier_score_loss
 from sklearn.metrics import confusion_matrix
@@ -97,6 +98,7 @@

 CLASSIFICATION_METRICS = {
     "accuracy_score": accuracy_score,
+    "balanced_accuracy_score": balanced_accuracy_score,
     "unnormalized_accuracy_score": partial(accuracy_score, normalize=False),
     "confusion_matrix": confusion_matrix,
     "hamming_loss": hamming_loss,
@@ -190,6 +192,7 @@
     "samples_precision_score", "samples_recall_score",

     # Those metrics don't support multiclass outputs
+    "balanced_accuracy_score",
     "average_precision_score", "weighted_average_precision_score",
     "micro_average_precision_score", "macro_average_precision_score",
     "samples_average_precision_score",
@@ -331,7 +334,9 @@
     "micro_recall_score",

     "macro_f0.5_score", "macro_f2_score", "macro_precision_score",
-    "macro_recall_score", "log_loss", "hinge_loss"
+    "macro_recall_score", "log_loss", "hinge_loss",
+
+    "balanced_accuracy_score",
 ]


@@ -341,6 +346,7 @@
     "hamming_loss",
     "matthews_corrcoef_score",
     "median_absolute_error",
+    "balanced_accuracy_score",
 ]

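The last two hunks add the metric to further bookkeeping lists in test_common.py whose names fall outside the visible context; judging by the neighbouring entries, one of them collects metrics that are not symmetric in (y_true, y_pred). A small sketch (not from the commit) showing that balanced accuracy is indeed order-dependent, using the average-of-recalls form from the docstring:

    import numpy as np
    from sklearn.metrics import recall_score

    def bas(y_true, y_pred):
        # Average of positive-class and negative-class recall for 0/1 labels,
        # mirroring the new balanced_accuracy_score (illustrative only).
        y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
        return 0.5 * (recall_score(y_true, y_pred) +
                      recall_score(1 - y_true, 1 - y_pred))

    y_true = [0, 0, 1, 1]
    y_pred = [0, 1, 1, 1]
    print(bas(y_true, y_pred))  # 0.75
    print(bas(y_pred, y_true))  # 0.8333..., so swapping the arguments changes the score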
