From f40724133de5be68a2ab9268a3112f89dc85c3d8 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen
Date: Mon, 18 Aug 2014 19:51:12 +0200
Subject: [PATCH 1/2] ADD new ranking metric "reliability_curve"

---
 sklearn/metrics/__init__.py |  2 +
 sklearn/metrics/metrics.py  |  1 +
 sklearn/metrics/ranking.py  | 86 ++++++++++++++++++++++++++++++++++++-
 3 files changed, 88 insertions(+), 1 deletion(-)

diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py
index ead9a092b90d2..554c9eba5c0ba 100644
--- a/sklearn/metrics/__init__.py
+++ b/sklearn/metrics/__init__.py
@@ -9,6 +9,7 @@
 from .ranking import precision_recall_curve
 from .ranking import roc_auc_score
 from .ranking import roc_curve
+from .ranking import reliability_curve
 
 from .classification import accuracy_score
 from .classification import classification_report
@@ -93,6 +94,7 @@
     'precision_score',
     'r2_score',
     'recall_score',
+    'reliability_curve',
     'roc_auc_score',
     'roc_curve',
     'SCORERS',
diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py
index efb86a158d917..e4afe84cfa22b 100644
--- a/sklearn/metrics/metrics.py
+++ b/sklearn/metrics/metrics.py
@@ -10,6 +10,7 @@
 from .ranking import precision_recall_curve
 from .ranking import roc_auc_score
 from .ranking import roc_curve
+from .ranking import reliability_curve
 
 from .classification import accuracy_score
 from .classification import classification_report
diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py
index 47efce7520137..717feada510a0 100644
--- a/sklearn/metrics/ranking.py
+++ b/sklearn/metrics/ranking.py
@@ -15,6 +15,7 @@
 # Lars Buitinck
 # Joel Nothman
 # Noel Dawe
+# Jan Hendrik Metzen
 # License: BSD 3 clause
 
 from __future__ import division
@@ -23,7 +24,6 @@
 import numpy as np
 from scipy.sparse import csr_matrix
 
-from ..preprocessing import LabelBinarizer
 from ..utils import check_consistent_length
 from ..utils import deprecated
 from ..utils import column_or_1d, check_array
@@ -619,3 +619,87 @@ def label_ranking_average_precision_score(y_true, y_score):
         out += np.divide(L, rank, dtype=float).mean()
 
     return out / n_samples
+
+
+def reliability_curve(y_true, y_score, bins=10, normalize=False):
+    """Compute reliability curve.
+
+    Reliability curves allow checking whether the predicted probabilities of a
+    binary classifier are well calibrated. This function returns two arrays
+    which encode a mapping from predicted probability to empirical probability.
+    For this, the predicted probabilities are partitioned into equally sized
+    bins and the mean predicted probability and the mean empirical probability
+    in each bin are computed. For perfectly calibrated predictions, both
+    quantities should be approximately equal (for sufficiently many test
+    samples).
+
+    Note: this implementation is restricted to binary classification.
+
+    Parameters
+    ----------
+
+    y_true : array, shape = [n_samples]
+        True binary labels (0 or 1).
+
+    y_score : array, shape = [n_samples]
+        Target scores, can either be probability estimates of the positive
+        class or confidence values. If normalize is False, y_score must be in
+        the interval [0, 1].
+
+    bins : int, optional, default=10
+        The number of bins into which the y_score values are partitioned.
+        Note: n_samples should be considerably larger than bins so that there
+              is enough data in each bin to get a reliable estimate of the
+              empirical probability.
+
+    normalize : bool, optional, default=False
+        Whether y_score needs to be normalized into the interval [0, 1]. If
+        True, the smallest value in y_score is mapped onto 0 and the largest
+        one onto 1.
+
+    Returns
+    -------
+    y_score_bin_mean : array, shape = [bins]
+        The mean predicted y_score in the respective bins.
+
+    empirical_prob_pos : array, shape = [bins]
+        The empirical probability (frequency) of the positive class (label 1)
+        in the respective bins.
+
+    References
+    ----------
+    .. [1] `Predicting Good Probabilities with Supervised Learning
+           <http://machinelearning.wustl.edu/mlpapers/paper_files/icml2005_Niculescu-MizilC05.pdf>`_
+
+    """
+    check_consistent_length(y_true, y_score)
+    y_true = check_array(y_true, ensure_2d=False)
+    y_score = check_array(y_score, ensure_2d=False)
+
+    if y_true.shape != y_score.shape:
+        raise ValueError("y_true and y_score have different shape")
+
+    if normalize:  # Normalize scores into the interval [0, 1]
+        y_score = (y_score - y_score.min()) / (y_score.max() - y_score.min())
+    elif y_score.min() < 0 or y_score.max() > 1:
+        raise ValueError("y_score has values outside [0, 1] and normalize is "
+                         "set to False.")
+
+    bin_width = 1.0 / bins
+    bin_centers = np.linspace(0, 1.0 - bin_width, bins) + bin_width / 2
+
+    y_score_bin_mean = np.empty(bins)
+    empirical_prob_pos = np.empty(bins)
+    for i, threshold in enumerate(bin_centers):
+        # Determine all samples whose y_score falls into the i-th bin
+        bin_idx = np.logical_and(threshold - bin_width / 2 < y_score,
+                                 y_score <= threshold + bin_width / 2)
+        # Store mean y_score and mean empirical probability of positive class
+        if np.any(bin_idx):
+            y_score_bin_mean[i] = y_score[bin_idx].mean()
+            empirical_prob_pos[i] = y_true[bin_idx].mean()
+        else:
+            y_score_bin_mean[i] = np.nan
+            empirical_prob_pos[i] = np.nan
+    return y_score_bin_mean, empirical_prob_pos

From 77f3a4eaa2d0d64f6d46734003d3602f7b76c3d6 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen
Date: Mon, 18 Aug 2014 19:51:36 +0200
Subject: [PATCH 2/2] ADD Example script illustrating reliability diagram

---
 .../plot_reliability_diagram.py | 168 ++++++++++++++++++
 1 file changed, 168 insertions(+)
 create mode 100644 examples/classification/plot_reliability_diagram.py

diff --git a/examples/classification/plot_reliability_diagram.py b/examples/classification/plot_reliability_diagram.py
new file mode 100644
index 0000000000000..f0dc6230196bb
--- /dev/null
+++ b/examples/classification/plot_reliability_diagram.py
@@ -0,0 +1,168 @@
+"""
+=========================
+Plot reliability diagram
+=========================
+
+This script generates reliability diagrams for some classifiers on an
+artificial dataset. Reliability diagrams allow checking whether the predicted
+probabilities of a binary classifier are well calibrated. For perfectly
+calibrated predictions, the curve in a reliability diagram should be as close
+as possible to the diagonal/identity. This would correspond to a situation in
+which, among $N$ instances for which a classifier predicts probability $p$ for
+class $A$, the fraction of instances that actually belong to class $A$ is
+approximately $p$ (for any $p$ and sufficiently large $N$).
+
+This script reproduces some of the results from the paper "Predicting Good
+Probabilities with Supervised Learning" by Niculescu-Mizil and Caruana:
+http://machinelearning.wustl.edu/mlpapers/paper_files/
+icml2005_Niculescu-MizilC05.pdf
+
+The following observations can be made:
+  * Logistic regression returns well-calibrated probabilities close to the
+    "perfect" line.
+  * Naive Bayes tends to push probabilities to 0 or 1 (note the counts in the
+    histograms). This is mainly because it makes the assumption that features
+    are conditionally independent given the class, which is not the case for
+    this dataset, which contains 2 redundant features.
+  * Random Forest shows the opposite behavior: the histograms show peaks at
+    approx. 0.1 and 0.9 probability, while probabilities close to 0 or 1 are
+    very rare. An explanation for this is given by Niculescu-Mizil and Caruana:
+    "Methods such as bagging and random forests that average predictions from
+    a base set of models can have difficulty making predictions near 0
+    and 1 because variance in the underlying base models will
+    bias predictions that should be near zero or one away from
+    these values. Because predictions are restricted to the interval [0,1],
+    errors caused by variance tend to be one-sided
+    near zero and one. For example, if a model should predict
+    p = 0 for a case, the only way bagging can achieve this is
+    if all bagged trees predict zero. If we add noise to the trees
+    that bagging is averaging over, this noise will cause some
+    trees to predict values larger than 0 for this case, thus moving the
+    average prediction of the bagged ensemble away
+    from 0. We observe this effect most strongly with random
+    forests because the base-level trees trained with random
+    forests have relatively high variance due to feature subsetting."
+    As a result, the calibration curve shows a characteristic sigmoid shape,
+    indicating that the classifier could trust its "intuition" more and
+    typically return probabilities closer to 0 or 1. A post-processing step
+    such as Platt calibration, which fits a sigmoid to the probabilities on a
+    separate calibration dataset, would typically help if the calibration
+    curve is sigmoid.
+  * The scores of a Support Vector Classification (SVC), which are linearly
+    related to the distance of the sample from the hyperplane, show a similar
+    but even stronger effect than the Random Forest. This is not too
+    surprising, as the scores are in no sense probabilities and, as the curve
+    shows, must not be interpreted as such.
+  * One alternative to Platt calibration is Isotonic Regression. While
+    Platt calibration fits a sigmoid, Isotonic Regression fits an arbitrary
+    increasing (isotonic) function. Thus, it has a weaker inductive bias and
+    can be applied more broadly (also in situations where the calibration
+    curve is not sigmoid). The downside is that it typically requires more
+    calibration data because of this weaker inductive bias. This can also be
+    seen in the SVC + IR curve: while the sigmoid shape of the pure SVC scores
+    is removed and the calibration curve does not show a clear bias, it is
+    quite noisy, indicating high variance. Thus, the calibration dataset used
+    here (even though it is 4 times larger than the training data) is too
+    small in this case.
+"""
+print(__doc__)
+
+# Author: Jan Hendrik Metzen
+# License: BSD 3 clause
+
+import numpy as np
+np.random.seed(0)
+import matplotlib.pyplot as plt
+
+from sklearn import datasets
+from sklearn.svm import SVC
+from sklearn.naive_bayes import GaussianNB
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.isotonic import IsotonicRegression
+
+from sklearn.metrics import reliability_curve
+
+#### Training data
+# Generate a *toy dataset* on which different classifiers are compared.
+# Among the 20 features, only 2 are actually informative. Two further features
+# are redundant, i.e., linear combinations of the 2 informative features.
+# The remaining features are just noise.
+
+X, y = datasets.make_classification(n_samples=100000, n_features=20,
+                                    n_informative=2, n_redundant=2)
+
+bins = 25
+# Samples used for training the models
+train_samples = 100
+# Additional samples for calibration using Isotonic Regression
+calibration_samples = 400
+
+X_train = X[:train_samples]
+X_calibration = X[train_samples:train_samples + calibration_samples]
+X_test = X[train_samples + calibration_samples:]
+y_train = y[:train_samples]
+y_calibration = y[train_samples:train_samples + calibration_samples]
+y_test = y[train_samples + calibration_samples:]
+
+#### Compute reliability curves for different classifiers
+
+classifiers = {"Logistic regression": LogisticRegression(),
+               "Naive Bayes": GaussianNB(),
+               "Random Forest": RandomForestClassifier(n_estimators=100),
+               "SVC": SVC(kernel='linear', C=1.0),
+               "SVC + IR": SVC(kernel='linear', C=1.0)}
+
+reliability_scores = {}
+y_score = {}
+for method, clf in classifiers.items():
+    clf.fit(X_train, y_train)
+    if method == "SVC + IR":  # Calibrate SVC scores using isotonic regression.
+        n_plus = (y_calibration == 1.0).sum()  # number of positive examples
+        n_minus = (y_calibration == 0.0).sum()  # number of negative examples
+        # Determine target values for isotonic calibration. See
+        # "Predicting Good Probabilities with Supervised Learning"
+        # for details.
+        y_target = np.where(y_calibration == 0.0,
+                            1.0 / (n_minus + 2.0),
+                            (n_plus + 1.0) / (n_plus + 2.0))
+        # Perform the actual calibration using isotonic regression
+        svm_score = clf.decision_function(X_calibration)
+        ir = IsotonicRegression(out_of_bounds='clip').fit(svm_score, y_target)
+        y_score[method] = ir.transform(clf.decision_function(X_test))
+        reliability_scores[method] = \
+            reliability_curve(y_test, y_score[method], bins=bins,
+                              normalize=False)
+    elif method == "SVC":
+        # Use the raw SVC decision-function scores. These are not
+        # probabilities, so they are rescaled into [0, 1] via normalize=True.
+        y_score[method] = clf.decision_function(X_test)
+        reliability_scores[method] = \
+            reliability_curve(y_test, y_score[method], bins=bins,
+                              normalize=True)
+    else:
+        y_score[method] = clf.predict_proba(X_test)[:, 1]
+        reliability_scores[method] = \
+            reliability_curve(y_test, y_score[method], bins=bins,
+                              normalize=False)
+
+#### Plot reliability diagram
+
+plt.figure(0, figsize=(8, 8))
+plt.subplot2grid((3, 1), (0, 0), rowspan=2)
+plt.plot([0.0, 1.0], [0.0, 1.0], 'k', label="Perfect")
+for method, (bin_mean, empirical_prob_pos) in reliability_scores.items():
+    scores_not_nan = np.logical_not(np.isnan(empirical_prob_pos))
+    plt.plot(bin_mean[scores_not_nan],
+             empirical_prob_pos[scores_not_nan], label=method)
+plt.ylabel("Empirical probability")
+plt.legend(loc=0)
+
+plt.subplot2grid((3, 1), (2, 0))
+for method, y_score_ in y_score.items():
+    y_score_ = (y_score_ - y_score_.min()) / (y_score_.max() - y_score_.min())
+    plt.hist(y_score_, range=(0, 1), bins=bins, label=method,
+             histtype="step", lw=2)
+plt.xlabel("Predicted probability")
+plt.ylabel("Count")
+plt.legend(loc='upper center', ncol=2)
+plt.show()
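
Note for reviewers (not part of the patches): below is a minimal usage sketch of the new reliability_curve function, much smaller than the full example script. It assumes this branch is installed so that the import resolves; the classifier choice and the sample and bin counts are arbitrary illustration values, not part of the proposed API.

import numpy as np

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import reliability_curve  # added in PATCH 1/2

# Small synthetic binary classification problem.
X, y = make_classification(n_samples=5000, n_features=20, n_informative=2,
                           random_state=0)
X_train, X_test = X[:1000], X[1000:]
y_train, y_test = y[:1000], y[1000:]

# Predicted probability of the positive class; it already lies in [0, 1],
# so the default normalize=False is sufficient.
clf = LogisticRegression().fit(X_train, y_train)
prob_pos = clf.predict_proba(X_test)[:, 1]

y_score_bin_mean, empirical_prob_pos = reliability_curve(y_test, prob_pos,
                                                         bins=10)

# For a well-calibrated classifier the two columns should roughly agree;
# bins that received no samples are reported as NaN.
print(np.c_[y_score_bin_mean, empirical_prob_pos])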