diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 64a83ec7acd9f..8486c0e9686bc 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -7,7 +7,7 @@ recall_score, fbeta_score, f1_score, zero_one_score, \ precision_recall_fscore_support, classification_report, \ precision_recall_curve, explained_variance_score, r2_score, \ - zero_one, mean_square_error, hinge_loss + zero_one, mean_square_error, hinge_loss, matthews_corrcoef from . import cluster from .cluster import adjusted_rand_score diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py index bf4dc9572d22f..5b64e21c0e5cf 100644 --- a/sklearn/metrics/metrics.py +++ b/sklearn/metrics/metrics.py @@ -212,7 +212,7 @@ def auc(x, y): return area -def precision_score(y_true, y_pred, pos_label=1): +def precision_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): """Compute the precision The precision is the ratio :math:`tp / (tp + fp)` where tp is the @@ -230,28 +230,40 @@ def precision_score(y_true, y_pred, pos_label=1): y_pred : array, shape = [n_samples] predicted targets + labels : array + integer array of labels + pos_label : int - in the binary classification case, give the label of the - positive class (default is 1). Everything else but 'pos_label' + in the binary classification case, give the label of the positive + class (default is 1). Everything else but 'pos_label' is considered to belong to the negative class. - Not used in the case of multiclass classification. + Set to None in the case of multiclass classification. + + average : string, ['micro', 'macro', 'weighted'(default)] + in the multiclass classification case, this determines the + type of averaging performed on the data. + macro: average over classes (does not take imbalance into account) + micro: average over instances (takes imbalance into account) + implies that precision == recall == f1 + weighted: average weighted by support (takes imbalance into account) + can have f1 score that is not between precision and recall Returns ------- precision : float precision of the positive class in binary classification or - weighted avergage of the precision of each class for the + weighted average of the precision of each class for the multiclass task """ - p, _, _, s = precision_recall_fscore_support(y_true, y_pred) - if p.shape[0] == 2: - return p[pos_label] - else: - return np.average(p, weights=s) + p, _, _, _ = precision_recall_fscore_support(y_true, y_pred, + labels=labels, + pos_label=pos_label, + average=average) + return p -def recall_score(y_true, y_pred, pos_label=1): +def recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): """Compute the recall The recall is the ratio :math:`tp / (tp + fn)` where tp is the number of @@ -268,27 +280,40 @@ def recall_score(y_true, y_pred, pos_label=1): y_pred : array, shape = [n_samples] predicted targets + labels : array + integer array of labels + pos_label : int in the binary classification case, give the label of the positive class (default is 1). Everything else but 'pos_label' is considered to belong to the negative class. - Not used in the case of multiclass classification. + Set to None in the case of multiclass classification. + + average : string, [None, 'micro', 'macro', 'weighted'(default)] + in the multiclass classification case, this determines the + type of averaging performed on the data. 
+ macro: average over classes (does not take imbalance into account) + micro: average over instances (takes imbalance into account) + implies that precision == recall == f1 + weighted: average weighted by support (takes imbalance into account) + can have f1 score that is not between precision and recall Returns ------- recall : float recall of the positive class in binary classification or weighted - avergage of the recall of each class for the multiclass task. + average of the recall of each class for the multiclass task. """ - _, r, _, s = precision_recall_fscore_support(y_true, y_pred) - if r.shape[0] == 2: - return r[pos_label] - else: - return np.average(r, weights=s) + _, r, _, _ = precision_recall_fscore_support(y_true, y_pred, + labels=labels, + pos_label=pos_label, + average=average) + return r -def fbeta_score(y_true, y_pred, beta, pos_label=1): +def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, + average='weighted'): """Compute fbeta score The F_beta score is the weighted harmonic mean of precision and recall, @@ -308,17 +333,29 @@ def fbeta_score(y_true, y_pred, beta, pos_label=1): beta: float + labels : array + integer array of labels + pos_label : int in the binary classification case, give the label of the positive class (default is 1). Everything else but 'pos_label' is considered to belong to the negative class. - Not used in the case of multiclass classification. + Set to None in the case of multiclass classification. + + average : string, [None, 'micro', 'macro', 'weighted'(default)] + in the multiclass classification case, this determines the + type of averaging performed on the data. + macro: average over classes (does not take imbalance into account) + micro: average over instances (takes imbalance into account) + implies that precision == recall == f1 + weighted: average weighted by support (takes imbalance into account) + can have f1 score that is not between precision and recall Returns ------- fbeta_score : float fbeta_score of the positive class in binary classification or weighted - avergage of the fbeta_score of each class for the multiclass task. + average of the fbeta_score of each class for the multiclass task. See also -------- @@ -328,14 +365,15 @@ class (default is 1). Everything else but 'pos_label' http://en.wikipedia.org/wiki/F1_score """ - _, _, f, s = precision_recall_fscore_support(y_true, y_pred, beta=beta) - if f.shape[0] == 2: - return f[pos_label] - else: - return np.average(f, weights=s) + _, _, f, _ = precision_recall_fscore_support(y_true, y_pred, + beta=beta, + labels=labels, + pos_label=pos_label, + average=average) + return f -def f1_score(y_true, y_pred, pos_label=1): +def f1_score(y_true, y_pred, labels=None, pos_label=1, average='weighted'): """Compute f1 score The F1 score can be interpreted as a weighted average of the precision @@ -358,17 +396,29 @@ def f1_score(y_true, y_pred, pos_label=1): y_pred : array, shape = [n_samples] predicted targets + labels : array + integer array of labels + pos_label : int - in the binary classification case, give the label of the positive class - (default is 1). Everything else but 'pos_label' + in the binary classification case, give the label of the positive + class (default is 1). Everything else but 'pos_label' is considered to belong to the negative class. - Not used in the case of multiclass classification. + Set to None in the case of multiclass classification. 
+
+    average : string, [None, 'micro', 'macro', 'weighted'(default)]
+        in the multiclass classification case, this determines the
+        type of averaging performed on the data.
+        macro: average over classes (does not take imbalance into account)
+        micro: average over instances (takes imbalance into account)
+               implies that precision == recall == f1
+        weighted: average weighted by support (takes imbalance into account)
+                  can have f1 score that is not between precision and recall

     Returns
     -------
     f1_score : float
         f1_score of the positive class in binary classification or weighted
-        avergage of the f1_scores of each class for the multiclass task
+        average of the f1_scores of each class for the multiclass task

     Notes
     -----
@@ -376,10 +426,12 @@ def f1_score(y_true, y_pred, pos_label=1):
     http://en.wikipedia.org/wiki/F1_score

     """
-    return fbeta_score(y_true, y_pred, 1, pos_label=pos_label)
+    return fbeta_score(y_true, y_pred, 1, labels=labels,
+                       pos_label=pos_label, average=average)


-def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None):
+def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
+                                    pos_label=1, average="weighted"):
     """Compute precisions, recalls, f-measures and support for each class

     The precision is the ratio :math:`tp / (tp + fp)` where tp is the number of
@@ -400,6 +452,9 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None):

     The support is the number of occurrences of each class in y_true.

+    If `average` is not None, the averaged precision, recall and f-measure
+    for the multiclass task are returned ('micro', 'macro' or 'weighted').
+
     Parameters
     ----------
     y_true : array, shape = [n_samples]
@@ -411,6 +466,24 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None):
     beta : float, 1.0 by default
         the strength of recall versus precision in the f-score

+    labels : array
+        integer array of labels
+
+    pos_label : int
+        in the binary classification case, give the label of the positive
+        class (default is 1). Everything else but 'pos_label'
+        is considered to belong to the negative class.
+        Set to None in the case of multiclass classification.
+
+    average : string, [None, 'micro', 'macro', 'weighted'(default)]
+        in the multiclass classification case, this determines the
+        type of averaging performed on the data.
+        macro: average over classes (does not take imbalance into account)
+        micro: average over instances (takes imbalance into account)
+               implies that precision == recall == f1
+        weighted: average weighted by support (takes imbalance into account)
+                  can have f1 score that is not between precision and recall
+
     Returns
     -------
     precision: array, shape = [n_unique_labels], dtype = np.double
@@ -467,7 +540,79 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None):
     finally:
         np.seterr(**old_err_settings)

-    return precision, recall, fscore, support
+    if not average:
+        return precision, recall, fscore, support
+
+    elif n_labels == 2:
+        if pos_label not in labels:
+            raise ValueError("pos_label=%r is not a valid label: %r" %
+                             (pos_label, labels))
+        pos_label_idx = list(labels).index(pos_label)
+        return (precision[pos_label_idx], recall[pos_label_idx],
+                fscore[pos_label_idx], support[pos_label_idx])
+    else:
+        average_options = (None, 'micro', 'macro', 'weighted')
+        if average == 'micro':
+            avg_precision = true_pos.sum() / (true_pos.sum() +
+                                              false_pos.sum())
+            avg_recall = true_pos.sum() / (true_pos.sum() + false_neg.sum())
+            avg_fscore = (1 + beta2) * (avg_precision * avg_recall) / \
+                         (beta2 * avg_precision + avg_recall)
+        elif average == 'macro':
+            avg_precision = np.mean(precision)
+            avg_recall = np.mean(recall)
+            avg_fscore = np.mean(fscore)
+        elif average == 'weighted':
+            avg_precision = np.average(precision, weights=support)
+            avg_recall = np.average(recall, weights=support)
+            avg_fscore = np.average(fscore, weights=support)
+        else:
+            raise ValueError('average has to be one of ' +
+                             str(average_options))
+
+        return avg_precision, avg_recall, avg_fscore, None
+
+
+def matthews_corrcoef(y_true, y_pred):
+    """Compute the Matthews correlation coefficient for binary classes
+
+    The Matthews correlation coefficient is used in machine learning as a
+    measure of the quality of binary (two-class) classifications. It takes
+    into account true and false positives and negatives and is generally
+    regarded as a balanced measure which can be used even if the classes are
+    of very different sizes. The MCC is in essence a correlation coefficient
+    value between -1 and +1. A coefficient of +1 represents a perfect
+    prediction, 0 an average random prediction and -1 an inverse prediction.
+    The statistic is also known as the phi coefficient. [source: Wikipedia]
+
+    Only in the binary case does this relate to information about true and
+    false positives and negatives. See references below.
+
+    Parameters
+    ----------
+    y_true : array, shape = [n_samples]
+        true targets
+
+    y_pred : array, shape = [n_samples]
+        estimated targets
+
+    Returns
+    -------
+    mcc : float
+        Matthews correlation coefficient (+1 represents a perfect prediction,
+        0 an average random prediction and -1 an inverse prediction).
+
+    References
+    ----------
+    http://en.wikipedia.org/wiki/Matthews_correlation_coefficient
+    http://dx.doi.org/10.1093/bioinformatics/16.5.412
+
+    """
+    mcc = np.corrcoef(y_true, y_pred)[0, 1]
+    if np.isnan(mcc):
+        return 0.
+ else: + return mcc def classification_report(y_true, y_pred, labels=None, target_names=None): @@ -519,7 +664,9 @@ def classification_report(y_true, y_pred, labels=None, target_names=None): report += '\n' p, r, f1, s = precision_recall_fscore_support(y_true, y_pred, - labels=labels) + labels=labels, + average=None) + for i, label in enumerate(labels): values = [target_names[i]] for v in (p[i], r[i], f1[i]): diff --git a/sklearn/metrics/tests/test_metrics.py b/sklearn/metrics/tests/test_metrics.py index 237fade95525c..41af3b9f679a4 100644 --- a/sklearn/metrics/tests/test_metrics.py +++ b/sklearn/metrics/tests/test_metrics.py @@ -4,7 +4,7 @@ from nose.tools import raises from nose.tools import assert_true from numpy.testing import assert_array_almost_equal -from numpy.testing import assert_array_equal +from numpy.testing import assert_array_equal, assert_raises from numpy.testing import assert_equal, assert_almost_equal from ... import datasets @@ -15,6 +15,7 @@ from ..metrics import explained_variance_score from ..metrics import r2_score from ..metrics import f1_score +from ..metrics import matthews_corrcoef from ..metrics import mean_square_error from ..metrics import precision_recall_curve from ..metrics import precision_recall_fscore_support @@ -138,7 +139,7 @@ def test_precision_recall_f1_score_binary(): y_true, y_pred, _ = make_prediction(binary=True) # detailed measures for each class - p, r, f, s = precision_recall_fscore_support(y_true, y_pred) + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None) assert_array_almost_equal(p, [0.73, 0.75], 2) assert_array_almost_equal(r, [0.76, 0.72], 2) assert_array_almost_equal(f, [0.75, 0.74], 2) @@ -164,34 +165,63 @@ def test_confusion_matrix_binary(): cm = confusion_matrix(y_true, y_pred) assert_array_equal(cm, [[19, 6], [7, 18]]) + tp = cm[0, 0] + tn = cm[1, 1] + fp = cm[0, 1] + fn = cm[1, 0] + num = (tp * tn - fp * fn) + den = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) + if den == 0.: + true_mcc = 0 + else: + true_mcc = num / den + mcc = matthews_corrcoef(y_true, y_pred) + assert_array_almost_equal(mcc, true_mcc, decimal=2) + assert_array_almost_equal(mcc, 0.48, decimal=2) + def test_precision_recall_f1_score_multiclass(): """Test Precision Recall and F1 Score for multiclass classification task""" y_true, y_pred, _ = make_prediction(binary=False) # compute scores with default labels introspection - p, r, f, s = precision_recall_fscore_support(y_true, y_pred) + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None) assert_array_almost_equal(p, [0.82, 0.55, 0.47], 2) assert_array_almost_equal(r, [0.92, 0.17, 0.90], 2) assert_array_almost_equal(f, [0.87, 0.26, 0.62], 2) assert_array_equal(s, [25, 30, 20]) - # individual scoring function that can be used for grid search: in the - # multiclass case the score is the wieghthed average of the individual - # class values hence f1_score is not necessary between precision_score and - # recall_score - ps = precision_score(y_true, y_pred) + # averaging tests + ps = precision_score(y_true, y_pred, pos_label=1, average='micro') + assert_array_almost_equal(ps, 0.61, 2) + + rs = recall_score(y_true, y_pred, average='micro') + assert_array_almost_equal(rs, 0.61, 2) + + fs = f1_score(y_true, y_pred, average='micro') + assert_array_almost_equal(fs, 0.61, 2) + + ps = precision_score(y_true, y_pred, average='macro') assert_array_almost_equal(ps, 0.62, 2) - rs = recall_score(y_true, y_pred) + rs = recall_score(y_true, y_pred, average='macro') + 
assert_array_almost_equal(rs, 0.66, 2) + + fs = f1_score(y_true, y_pred, average='macro') + assert_array_almost_equal(fs, 0.58, 2) + + ps = precision_score(y_true, y_pred, average='weighted') + assert_array_almost_equal(ps, 0.62, 2) + + rs = recall_score(y_true, y_pred, average='weighted') assert_array_almost_equal(rs, 0.61, 2) - fs = f1_score(y_true, y_pred) - assert_array_almost_equal(fs, 0.56, 2) + fs = f1_score(y_true, y_pred, average='weighted') + assert_array_almost_equal(fs, 0.55, 2) # same prediction but with and explicit label ordering p, r, f, s = precision_recall_fscore_support( - y_true, y_pred, labels=[0, 2, 1]) + y_true, y_pred, labels=[0, 2, 1], average=None) assert_array_almost_equal(p, [0.82, 0.47, 0.55], 2) assert_array_almost_equal(r, [0.92, 0.90, 0.17], 2) assert_array_almost_equal(f, [0.87, 0.62, 0.26], 2) @@ -207,9 +237,12 @@ def test_zero_precision_recall(): y_true = np.array([0, 1, 2, 0, 1, 2]) y_pred = np.array([2, 0, 1, 1, 2, 0]) - assert_almost_equal(precision_score(y_true, y_pred), 0.0, 2) - assert_almost_equal(recall_score(y_true, y_pred), 0.0, 2) - assert_almost_equal(f1_score(y_true, y_pred), 0.0, 2) + assert_almost_equal(precision_score(y_true, y_pred, average='weighted'), + 0.0, 2) + assert_almost_equal(recall_score(y_true, y_pred, average='weighted'), + 0.0, 2) + assert_almost_equal(f1_score(y_true, y_pred, average='weighted'), + 0.0, 2) finally: np.seterr(**old_error_settings)
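For reviewers who want to exercise the patch locally, here is a minimal usage sketch. It is not part of the diff; it assumes the patched sklearn.metrics is importable, and the toy label arrays are made up purely for illustration.

from __future__ import print_function  # keeps the sketch runnable on Python 2 or 3
import numpy as np
from sklearn.metrics import (precision_score, recall_score, f1_score,
                             precision_recall_fscore_support,
                             matthews_corrcoef)

# Made-up multiclass targets and predictions, for illustration only.
y_true = np.array([0, 1, 2, 2, 1, 0, 1, 2, 0])
y_pred = np.array([0, 2, 2, 2, 1, 0, 1, 0, 0])

# average=None keeps the pre-patch behaviour: one value per class.
p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)

# The new `average` keyword collapses the per-class values into one scalar.
for avg in ('micro', 'macro', 'weighted'):
    print(avg,
          precision_score(y_true, y_pred, average=avg),
          recall_score(y_true, y_pred, average=avg),
          f1_score(y_true, y_pred, average=avg))

# matthews_corrcoef is defined for binary targets only.
yb_true = np.array([0, 1, 1, 0, 1, 0, 1, 1])
yb_pred = np.array([0, 1, 0, 0, 1, 1, 1, 1])
print(matthews_corrcoef(yb_true, yb_pred))

As noted in the docstrings, 'micro' averaging makes precision, recall and f1 coincide, while 'weighted' averaging can yield an f1 score that does not lie between the weighted precision and recall.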