diff --git a/doc/modules/calibration.rst b/doc/modules/calibration.rst
new file mode 100644
index 0000000000000..f215a8628dbc1
--- /dev/null
+++ b/doc/modules/calibration.rst
@@ -0,0 +1,195 @@
+.. _calibration:
+
+=======================
+Probability calibration
+=======================
+
+.. currentmodule:: sklearn.calibration
+
+
+When performing classification you often want not only to predict the class
+label, but also to obtain a probability for the respective label. This
+probability gives you some kind of confidence on the prediction. Some models
+give poor estimates of the class probabilities and some do not support
+probability prediction at all. The calibration module allows you to better
+calibrate the probabilities of a given model, or to add support for
+probability prediction.
+
+Well calibrated classifiers are probabilistic classifiers for which the output
+of the predict_proba method can be directly interpreted as a confidence level.
+For instance, a well calibrated (binary) classifier should classify the samples
+such that among the samples to which it gave a predict_proba value close to 0.8,
+approximately 80% actually belong to the positive class. The following plot compares
+how well the probabilistic predictions of different classifiers are calibrated:
+
+.. figure:: ../auto_examples/calibration/images/plot_compare_calibration_001.png
+   :target: ../auto_examples/calibration/plot_compare_calibration.html
+   :align: center
+
+.. currentmodule:: sklearn.linear_model
+:class:`LogisticRegression` returns well calibrated predictions by default as it
+directly optimizes log-loss. In contrast, the other methods return biased
+probabilities, with different biases per method:
+
+ * .. currentmodule:: sklearn.naive_bayes
+   :class:`GaussianNB` tends to push probabilities to 0 or 1 (note the
+   counts in the histograms). This is mainly because it makes the assumption
+   that features are conditionally independent given the class, which is not
+   the case in this dataset, which contains 2 redundant features.
+
+ * .. currentmodule:: sklearn.ensemble
+   :class:`RandomForestClassifier` shows the opposite behavior: the histograms
+   show peaks at approximately 0.2 and 0.9 probability, while probabilities close to
+   0 or 1 are very rare. An explanation for this is given by Niculescu-Mizil
+   and Caruana [4]: "Methods such as bagging and random forests that average
+   predictions from a base set of models can have difficulty making predictions
+   near 0 and 1 because variance in the underlying base models will bias
+   predictions that should be near zero or one away from these values. Because
+   predictions are restricted to the interval [0,1], errors caused by variance
+   tend to be one-sided near zero and one. For example, if a model should
+   predict p = 0 for a case, the only way bagging can achieve this is if all
+   bagged trees predict zero. If we add noise to the trees that bagging is
+   averaging over, this noise will cause some trees to predict values larger
+   than 0 for this case, thus moving the average prediction of the bagged
+   ensemble away from 0. We observe this effect most strongly with random
+   forests because the base-level trees trained with random forests have
+   relatively high variance due to feature subsetting." As a result, the
+   calibration curve shows a characteristic sigmoid shape, indicating that the
+   classifier could trust its "intuition" more and typically return
+   probabilities closer to 0 or 1.
+
+ * .. currentmodule:: sklearn.svm
+   Linear Support Vector Classification (:class:`LinearSVC`) shows an even more
+   sigmoid curve than the RandomForestClassifier, which is typical for
+   maximum-margin methods (compare Niculescu-Mizil and Caruana [4]), which
+   focus on hard samples that are close to the decision boundary (the support
+   vectors).
+
+.. currentmodule:: sklearn.calibration
+Two approaches for performing calibration of probabilistic predictions are
+provided: a parametric approach based on Platt's sigmoid model and a
+non-parametric approach based on isotonic regression (:mod:`sklearn.isotonic`).
+Probability calibration should be done on new data not used for model fitting.
+The class :class:`CalibratedClassifierCV` uses a cross-validation generator:
+for each split, the model parameters are estimated on the train samples and the
+calibration is fitted on the test samples. The probabilities predicted for
+the folds are then averaged. Already fitted classifiers can be calibrated by
+:class:`CalibratedClassifierCV` via the parameter cv="prefit". In this case,
+the user has to take care manually that the data used for model fitting and for
+calibration are disjoint. A minimal usage sketch for this workflow is given at
+the end of this section.
+
+The following images demonstrate the benefit of probability calibration.
+The first image presents a dataset with 2 classes and 3 blobs of
+data. The blob in the middle contains random samples of each class.
+The probability for the samples in this blob should be 0.5.
+
+.. figure:: ../auto_examples/calibration/images/plot_calibration_001.png
+   :target: ../auto_examples/calibration/plot_calibration.html
+   :align: center
+
+The following image shows, for the data above, the probabilities estimated
+using a Gaussian naive Bayes classifier without calibration,
+with a sigmoid calibration and with a non-parametric isotonic
+calibration. One can observe that the non-parametric model
+provides the most accurate probability estimates for samples
+in the middle, i.e., close to 0.5.
+
+.. figure:: ../auto_examples/calibration/images/plot_calibration_002.png
+   :target: ../auto_examples/calibration/plot_calibration.html
+   :align: center
+
+.. currentmodule:: sklearn.metrics
+The following experiment is performed on an artificial dataset for binary
+classification with 100,000 samples (1,000 of them are used for model fitting)
+and 20 features. Of the 20 features, only 2 are informative and 10 are
+redundant. The figure shows the estimated probabilities obtained with
+logistic regression, a linear support-vector classifier (SVC), and linear SVC with
+both isotonic calibration and sigmoid calibration. The calibration performance
+is evaluated with the Brier score :func:`brier_score_loss`, reported in the legend
+(the smaller the better).
+
+.. figure:: ../auto_examples/calibration/images/plot_calibration_curve_002.png
+   :target: ../auto_examples/calibration/plot_calibration_curve.html
+   :align: center
+
+One can observe here that logistic regression is well calibrated as its curve is
+nearly diagonal. Linear SVC's calibration curve has a sigmoid shape, which is
+typical for an under-confident classifier. In the case of LinearSVC, this is
+caused by the margin property of the hinge loss, which lets the model focus on
+hard samples that are close to the decision boundary (the support vectors). Both
+kinds of calibration can fix this issue and yield nearly identical results.
+The next figure shows the calibration curve of Gaussian naive Bayes on
+the same data, with both kinds of calibration and also without calibration.
+
+.. figure:: ../auto_examples/calibration/images/plot_calibration_curve_001.png
+   :target: ../auto_examples/calibration/plot_calibration_curve.html
+   :align: center
+
+One can see that Gaussian naive Bayes performs very badly, but does so in
+another way than linear SVC: while linear SVC exhibited a sigmoid calibration
+curve, Gaussian naive Bayes' calibration curve has a transposed-sigmoid shape.
+This is typical for an over-confident classifier. In this case, the classifier's
+overconfidence is caused by the redundant features, which violate the naive Bayes
+assumption of feature independence.
+
+Calibration of the probabilities of Gaussian naive Bayes with isotonic
+regression can fix this issue, as can be seen from the nearly diagonal
+calibration curve. Sigmoid calibration also improves the Brier score slightly,
+albeit not as strongly as the non-parametric isotonic calibration. This is an
+intrinsic limitation of sigmoid calibration, whose parametric form assumes a
+sigmoid rather than a transposed-sigmoid curve. The non-parametric isotonic
+calibration model, however, makes no such strong assumptions and can deal with
+either shape, provided that there is sufficient calibration data. In general,
+sigmoid calibration is preferable when the calibration curve is sigmoid and
+there is little calibration data, while isotonic calibration is preferable for
+non-sigmoid calibration curves and in situations where large amounts of
+additional data are available for calibration.
+
+.. currentmodule:: sklearn.calibration
+:class:`CalibratedClassifierCV` can also deal with classification tasks that
+involve more than two classes if the base estimator can do so. In this case,
+the classifier is calibrated first for each class separately in a one-vs-rest
+fashion. When predicting probabilities for unseen data, the calibrated
+probabilities for each class are predicted separately. As those probabilities
+do not necessarily sum to one, they are normalized in a postprocessing step.
+
+The next image illustrates how sigmoid calibration changes predicted
+probabilities for a 3-class classification problem. Illustrated is the standard
+2-simplex, where the three corners correspond to the three classes. Arrows point
+from the probability vectors predicted by an uncalibrated classifier to the
+probability vectors predicted by the same classifier after sigmoid calibration
+on a hold-out validation set. Colors indicate the true class of an instance
+(red: class 1, green: class 2, blue: class 3).
+
+.. figure:: ../auto_examples/calibration/images/plot_calibration_multiclass_000.png
+   :target: ../auto_examples/calibration/plot_calibration_multiclass.html
+   :align: center
+
+The base classifier is a random forest classifier with 25 base estimators
+(trees). If this classifier is trained on all 800 training datapoints, it is
+overly confident in its predictions and thus incurs a large log-loss.
+Calibrating an identical classifier, which was trained on 600 datapoints, with
+method='sigmoid' on the remaining 200 datapoints reduces the confidence of the
+predictions, i.e., moves the probability vectors from the edges of the simplex
+towards the center:
+
+.. figure:: ../auto_examples/calibration/images/plot_calibration_multiclass_001.png
+   :target: ../auto_examples/calibration/plot_calibration_multiclass.html
+   :align: center
+
+This calibration results in a lower log-loss. Note that an alternative would
+have been to increase the number of base estimators, which would have resulted
+in a similar decrease in log-loss.
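+
+The following minimal sketch shows the typical workflow with an already fitted
+classifier and ``cv="prefit"``; the classifier choice, dataset and split sizes
+below are only illustrative::
+
+    from sklearn.calibration import CalibratedClassifierCV, calibration_curve
+    from sklearn.datasets import make_classification
+    from sklearn.svm import LinearSVC
+
+    X, y = make_classification(n_samples=3000, random_state=42)
+    X_train, X_calib, X_test = X[:1000], X[1000:2000], X[2000:]
+    y_train, y_calib, y_test = y[:1000], y[1000:2000], y[2000:]
+
+    # fit the base classifier on the training split only
+    base_clf = LinearSVC().fit(X_train, y_train)
+
+    # calibrate on data that was not used for fitting (cv="prefit")
+    calibrated = CalibratedClassifierCV(base_clf, method="sigmoid", cv="prefit")
+    calibrated.fit(X_calib, y_calib)
+
+    # evaluate calibration on a third, held-out set
+    prob_pos = calibrated.predict_proba(X_test)[:, 1]
+    prob_true, prob_pred = calibration_curve(y_test, prob_pos, n_bins=10)
+
+
+..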
topic:: References: + + .. [1] Obtaining calibrated probability estimates from decision trees + and naive Bayesian classifiers, B. Zadrozny & C. Elkan, ICML 2001 + + .. [2] Transforming Classifier Scores into Accurate Multiclass + Probability Estimates, B. Zadrozny & C. Elkan, (KDD 2002) + + .. [3] Probabilistic Outputs for Support Vector Machines and Comparisons to + Regularized Likelihood Methods, J. Platt, (1999) + + .. [4] Predicting Good Probabilities with Supervised Learning, + A. Niculescu-Mizil & R. Caruana, ICML 2005 diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 3df61164ab870..eefe6d6a6f3d4 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -768,6 +768,7 @@ details. metrics.accuracy_score metrics.auc metrics.average_precision_score + metrics.brier_score_loss metrics.classification_report metrics.confusion_matrix metrics.f1_score @@ -784,6 +785,7 @@ details. metrics.roc_auc_score metrics.roc_curve metrics.zero_one_loss + metrics.brier_score_loss Regression metrics ------------------ @@ -1009,6 +1011,33 @@ See the :ref:`metrics` section of the user guide for further details. neural_network.BernoulliRBM +.. _calibration_ref: + +:mod:`sklearn.calibration`: Probability Calibration +=================================================== + +.. automodule:: sklearn.calibration + :no-members: + :no-inherited-members: + +**User guide:** See the :ref:`calibration` section for further details. + +.. currentmodule:: sklearn + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + calibration.CalibratedClassifierCV + + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + calibration.calibration_curve + + .. _cross_decomposition_ref: :mod:`sklearn.cross_decomposition`: Cross decomposition diff --git a/doc/supervised_learning.rst b/doc/supervised_learning.rst index 4fc42bd1e4a1c..cde197287bee9 100644 --- a/doc/supervised_learning.rst +++ b/doc/supervised_learning.rst @@ -22,3 +22,4 @@ Supervised learning modules/feature_selection.rst modules/label_propagation.rst modules/isotonic.rst + modules/calibration.rst diff --git a/doc/whats_new.rst b/doc/whats_new.rst index c49769567f960..9e454fb84d2a7 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -75,6 +75,10 @@ New features for fixed user-provided cross-validation folds. By `untom `_. + - Added :class:`calibration.CalibratedClassifierCV`, an approach for + calibrating the predicted probabilities of a classifier. + By `Alexandre Gramfort`_ and `Jan Hendrik Metzen`_. + Enhancements ............ diff --git a/examples/calibration/README.txt b/examples/calibration/README.txt new file mode 100644 index 0000000000000..5e4a31b966b50 --- /dev/null +++ b/examples/calibration/README.txt @@ -0,0 +1,6 @@ +.. _calibration_examples: + +Calibration +----------------------- + +Examples illustrating the calibration of predicted probabilities of classifiers. diff --git a/examples/calibration/plot_calibration.py b/examples/calibration/plot_calibration.py new file mode 100644 index 0000000000000..2267f02dd0022 --- /dev/null +++ b/examples/calibration/plot_calibration.py @@ -0,0 +1,116 @@ +""" +====================================== +Probability calibration of classifiers +====================================== + +When performing classification you often want to predict not only +the class label, but also the associated probability. This probability +gives you some kind of confidence on the prediction. 
However, not all +classifiers provide well-calibrated probabilities, some being over-confident +while others being under-confident. Thus, a separate calibration of predicted +probabilities is often desirable as a postprocessing. This example illustrates +two different methods for this calibration and evaluates the quality of the +returned probabilities using Brier's score +(see http://en.wikipedia.org/wiki/Brier_score). + +Compared are the estimated probability using a Gaussian naive Bayes classifier +without calibration, with a sigmoid calibration, and with a non-parametric +isotonic calibration. One can observe that only the non-parametric model is able +to provide a probability calibration that returns probabilities close to the +expected 0.5 for most of the samples belonging to the middle cluster with +heterogeneous labels. This results in a significantly improved Brier score. +""" +print(__doc__) + +# Author: Mathieu Blondel +# Alexandre Gramfort +# Balazs Kegl +# Jan Hendrik Metzen +# License: BSD Style. + +import numpy as np +import matplotlib.pyplot as plt +from matplotlib import cm + +from sklearn.datasets import make_blobs +from sklearn.naive_bayes import GaussianNB +from sklearn.metrics import brier_score_loss +from sklearn.calibration import CalibratedClassifierCV +from sklearn.cross_validation import train_test_split + + +n_samples = 50000 +n_bins = 3 # use 3 bins for calibration_curve as we have 3 clusters here + +# Generate 3 blobs with 2 classes where the second blob contains +# half positive samples and half negative samples. Probability in this +# blob is therefore 0.5. +centers = [(-5, -5), (0, 0), (5, 5)] +X, y = make_blobs(n_samples=n_samples, n_features=2, cluster_std=1.0, + centers=centers, shuffle=False, random_state=42) + +y[:n_samples // 2] = 0 +y[n_samples // 2:] = 1 +sample_weight = np.random.RandomState(42).rand(y.shape[0]) + +# split train, test for calibration +X_train, X_test, y_train, y_test, sw_train, sw_test = \ + train_test_split(X, y, sample_weight, test_size=0.9, random_state=42) + +# Gaussian Naive-Bayes with no calibration +clf = GaussianNB() +clf.fit(X_train, y_train) # GaussianNB itself does not support sample-weights +prob_pos_clf = clf.predict_proba(X_test)[:, 1] + +# Gaussian Naive-Bayes with isotonic calibration +clf_isotonic = CalibratedClassifierCV(clf, cv=2, method='isotonic') +clf_isotonic.fit(X_train, y_train, sw_train) +prob_pos_isotonic = clf_isotonic.predict_proba(X_test)[:, 1] + +# Gaussian Naive-Bayes with sigmoid calibration +clf_sigmoid = CalibratedClassifierCV(clf, cv=2, method='sigmoid') +clf_sigmoid.fit(X_train, y_train, sw_train) +prob_pos_sigmoid = clf_sigmoid.predict_proba(X_test)[:, 1] + +print("Brier scores: (the smaller the better)") + +clf_score = brier_score_loss(y_test, prob_pos_clf, sw_test) +print("No calibration: %1.3f" % clf_score) + +clf_isotonic_score = brier_score_loss(y_test, prob_pos_isotonic, sw_test) +print("With isotonic calibration: %1.3f" % clf_isotonic_score) + +clf_sigmoid_score = brier_score_loss(y_test, prob_pos_sigmoid, sw_test) +print("With sigmoid calibration: %1.3f" % clf_sigmoid_score) + +############################################################################### +# Plot the data and the predicted probabilities +plt.figure() +y_unique = np.unique(y) +colors = cm.rainbow(np.linspace(0.0, 1.0, y_unique.size)) +for this_y, color in zip(y_unique, colors): + this_X = X_train[y_train == this_y] + this_sw = sw_train[y_train == this_y] + plt.scatter(this_X[:, 0], this_X[:, 1], s=this_sw * 50, c=color, 
alpha=0.5, + label="Class %s" % this_y) +plt.legend(loc="best") +plt.title("Data") + +plt.figure() +order = np.lexsort((prob_pos_clf, )) +plt.plot(prob_pos_clf[order], 'r', label='No calibration (%1.3f)' % clf_score) +plt.plot(prob_pos_isotonic[order], 'g', linewidth=3, + label='Isotonic calibration (%1.3f)' % clf_isotonic_score) +plt.plot(prob_pos_sigmoid[order], 'b', linewidth=3, + label='Sigmoid calibration (%1.3f)' % clf_sigmoid_score) +plt.plot(np.linspace(0, y_test.size, 51)[1::2], + y_test[order].reshape(25, -1).mean(1), + 'k', linewidth=3, label=r'Empirical') +plt.ylim([-0.05, 1.05]) +plt.xlabel("Instances sorted according to predicted probability " + "(uncalibrated GNB)") +plt.ylabel("P(y=1)") +plt.legend(loc="upper left") +plt.title("Gaussian naive Bayes probabilities") + +plt.show() diff --git a/examples/calibration/plot_calibration_curve.py b/examples/calibration/plot_calibration_curve.py new file mode 100644 index 0000000000000..42dc8473e6c30 --- /dev/null +++ b/examples/calibration/plot_calibration_curve.py @@ -0,0 +1,134 @@ +""" +============================== +Probability Calibration curves +============================== + +When performing classification one often wants to predict not only the class +label, but also the associated probability. This probability gives some +kind of confidence on the prediction. This example demonstrates how to display +how well calibrated the predicted probabilities are and how to calibrate an +uncalibrated classifier. + +The experiment is performed on an artificial dataset for binary classification +with 100.000 samples (1.000 of them are used for model fitting) with 20 +features. Of the 20 features, only 2 are informative and 10 are redundant. The +first figure shows the estimated probabilities obtained with logistic +regression, Gaussian naive Bayes, and Gaussian naive Bayes with both isotonic +calibration and sigmoid calibration. The calibration performance is evaluated +with Brier score, reported in the legend (the smaller the better). One can +observe here that logistic regression is well calibrated while raw Gaussian +naive Bayes performs very badly. This is because of the redundant features +which violate the assumption of feature-independence and result in an overly +confident classifier, which is indicated by the typical transposed-sigmoid +curve. + +Calibration of the probabilities of Gaussian naive Bayes with isotonic +regression can fix this issue as can be seen from the nearly diagonal +calibration curve. Sigmoid calibration also improves the brier score slightly, +albeit not as strongly as the non-parametric isotonic regression. This can be +attributed to the fact that we have plenty of calibration data such that the +greater flexibility of the non-parametric model can be exploited. + +The second figure shows the calibration curve of a linear support-vector +classifier (LinearSVC). LinearSVC shows the opposite behavior as Gaussian +naive Bayes: the calibration curve has a sigmoid curve, which is typical for +an under-confident classifier. In the case of LinearSVC, this is caused by the +margin property of the hinge loss, which lets the model focus on hard samples +that are close to the decision boundary (the support vectors). + +Both kinds of calibration can fix this issue and yield nearly identical +results. This shows that sigmoid calibration can deal with situations where +the calibration curve of the base classifier is sigmoid (e.g., for LinearSVC) +but not where it is transposed-sigmoid (e.g., Gaussian naive Bayes). 
+""" +print(__doc__) + +# Author: Alexandre Gramfort +# Jan Hendrik Metzen +# License: BSD Style. + +import matplotlib.pyplot as plt + +from sklearn import datasets +from sklearn.naive_bayes import GaussianNB +from sklearn.svm import LinearSVC +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import (brier_score_loss, precision_score, recall_score, + f1_score) +from sklearn.calibration import CalibratedClassifierCV, calibration_curve +from sklearn.cross_validation import train_test_split + + +# Create dataset of classification task with many redundant and few +# informative features +X, y = datasets.make_classification(n_samples=100000, n_features=20, + n_informative=2, n_redundant=10, + random_state=42) + +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.99, + random_state=42) + + +def plot_calibration_curve(est, name, fig_index): + """Plot calibration curve for est w/o and with calibration. """ + # Calibrated with isotonic calibration + isotonic = CalibratedClassifierCV(est, cv=2, method='isotonic') + + # Calibrated with sigmoid calibration + sigmoid = CalibratedClassifierCV(est, cv=2, method='sigmoid') + + # Logistic regression with no calibration as baseline + lr = LogisticRegression(C=1., solver='lbfgs') + + fig = plt.figure(fig_index, figsize=(10, 10)) + ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2) + ax2 = plt.subplot2grid((3, 1), (2, 0)) + + ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated") + for clf, name in [(lr, 'Logistic'), + (est, name), + (isotonic, name + ' + Isotonic'), + (sigmoid, name + ' + Sigmoid')]: + clf.fit(X_train, y_train) + y_pred = clf.predict(X_test) + if hasattr(clf, "predict_proba"): + prob_pos = clf.predict_proba(X_test)[:, 1] + else: # use decision function + prob_pos = clf.decision_function(X_test) + prob_pos = \ + (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min()) + + clf_score = brier_score_loss(y_test, prob_pos, pos_label=y.max()) + print("%s:" % name) + print("\tBrier: %1.3f" % (clf_score)) + print("\tPrecision: %1.3f" % precision_score(y_test, y_pred)) + print("\tRecall: %1.3f" % recall_score(y_test, y_pred)) + print("\tF1: %1.3f\n" % f1_score(y_test, y_pred)) + + fraction_of_positives, mean_predicted_value = \ + calibration_curve(y_test, prob_pos, n_bins=10) + + ax1.plot(mean_predicted_value, fraction_of_positives, "s-", + label="%s (%1.3f)" % (name, clf_score)) + + ax2.hist(prob_pos, range=(0, 1), bins=10, label=name, + histtype="step", lw=2) + + ax1.set_ylabel("Fraction of positives") + ax1.set_ylim([-0.05, 1.05]) + ax1.legend(loc="lower right") + ax1.set_title('Calibration plots (reliability curve)') + + ax2.set_xlabel("Mean predicted value") + ax2.set_ylabel("Count") + ax2.legend(loc="upper center", ncol=2) + + plt.tight_layout() + +# Plot calibration cuve for Gaussian Naive Bayes +plot_calibration_curve(GaussianNB(), "Naive Bayes", 1) + +# Plot calibration cuve for Linear SVC +plot_calibration_curve(LinearSVC(), "SVC", 2) + +plt.show() diff --git a/examples/calibration/plot_calibration_multiclass.py b/examples/calibration/plot_calibration_multiclass.py new file mode 100644 index 0000000000000..7843b84fa35cb --- /dev/null +++ b/examples/calibration/plot_calibration_multiclass.py @@ -0,0 +1,168 @@ +""" +================================================== +Probability Calibration for 3-class classification +================================================== + +This example illustrates how sigmoid calibration changes predicted +probabilities for a 3-class classification problem. 
Illustrated is the +standard 2-simplex, where the three corners correspond to the three classes. +Arrows point from the probability vectors predicted by an uncalibrated +classifier to the probability vectors predicted by the same classifier after +sigmoid calibration on a hold-out validation set. Colors indicate the true +class of an instance (red: class 1, green: class 2, blue: class 3). + +The base classifier is a random forest classifier with 25 base estimators +(trees). If this classifier is trained on all 800 training datapoints, it is +overly confident in its predictions and thus incurs a large log-loss. +Calibrating an identical classifier, which was trained on 600 datapoints, with +method='sigmoid' on the remaining 200 datapoints reduces the confidence of the +predictions, i.e., moves the probability vectors from the edges of the simplex +towards the center. This calibration results in a lower log-loss. Note that an +alternative would have been to increase the number of base estimators which +would have resulted in a similar decrease in log-loss. +""" +print(__doc__) + +# Author: Jan Hendrik Metzen +# License: BSD Style. + + +import matplotlib.pyplot as plt + +import numpy as np + +from sklearn.datasets import make_blobs +from sklearn.ensemble import RandomForestClassifier +from sklearn.calibration import CalibratedClassifierCV +from sklearn.metrics import log_loss + +np.random.seed(0) + +# Generate data +X, y = make_blobs(n_samples=1000, n_features=2, random_state=42, + cluster_std=5.0) +X_train, y_train = X[:600], y[:600] +X_valid, y_valid = X[600:800], y[600:800] +X_train_valid, y_train_valid = X[:800], y[:800] +X_test, y_test = X[800:], y[800:] + +# Train uncalibrated random forest classifier on whole train and validation +# data and evaluate on test data +clf = RandomForestClassifier(n_estimators=25) +clf.fit(X_train_valid, y_train_valid) +clf_probs = clf.predict_proba(X_test) +score = log_loss(y_test, clf_probs) + +# Train random forest classifier, calibrate on validation data and evaluate +# on test data +clf = RandomForestClassifier(n_estimators=25) +clf.fit(X_train, y_train) +clf_probs = clf.predict_proba(X_test) +sig_clf = CalibratedClassifierCV(clf, method="sigmoid", cv="prefit") +sig_clf.fit(X_valid, y_valid) +sig_clf_probs = sig_clf.predict_proba(X_test) +sig_score = log_loss(y_test, sig_clf_probs) + +# Plot changes in predicted probabilities via arrows +plt.figure(0) +colors = ["r", "g", "b"] +for i in range(clf_probs.shape[0]): + plt.arrow(clf_probs[i, 0], clf_probs[i, 1], + sig_clf_probs[i, 0] - clf_probs[i, 0], + sig_clf_probs[i, 1] - clf_probs[i, 1], + color=colors[y_test[i]], head_width=1e-2) + +# Plot perfect predictions +plt.plot([1.0], [0.0], 'ro', ms=20, label="Class 1") +plt.plot([0.0], [1.0], 'go', ms=20, label="Class 2") +plt.plot([0.0], [0.0], 'bo', ms=20, label="Class 3") + +# Plot boundaries of unit simplex +plt.plot([0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0], 'k', label="Simplex") + +# Annotate points on the simplex +plt.annotate(r'($\frac{1}{3}$, $\frac{1}{3}$, $\frac{1}{3}$)', + xy=(1.0/3, 1.0/3), xytext=(1.0/3, .23), xycoords='data', + arrowprops=dict(facecolor='black', shrink=0.05), + horizontalalignment='center', verticalalignment='center') +plt.plot([1.0/3], [1.0/3], 'ko', ms=5) +plt.annotate(r'($\frac{1}{2}$, $0$, $\frac{1}{2}$)', + xy=(.5, .0), xytext=(.5, .1), xycoords='data', + arrowprops=dict(facecolor='black', shrink=0.05), + horizontalalignment='center', verticalalignment='center') +plt.annotate(r'($0$, $\frac{1}{2}$, $\frac{1}{2}$)', + 
xy=(.0, .5), xytext=(.1, .5), xycoords='data', + arrowprops=dict(facecolor='black', shrink=0.05), + horizontalalignment='center', verticalalignment='center') +plt.annotate(r'($\frac{1}{2}$, $\frac{1}{2}$, $0$)', + xy=(.5, .5), xytext=(.6, .6), xycoords='data', + arrowprops=dict(facecolor='black', shrink=0.05), + horizontalalignment='center', verticalalignment='center') +plt.annotate(r'($0$, $0$, $1$)', + xy=(0, 0), xytext=(.1, .1), xycoords='data', + arrowprops=dict(facecolor='black', shrink=0.05), + horizontalalignment='center', verticalalignment='center') +plt.annotate(r'($1$, $0$, $0$)', + xy=(1, 0), xytext=(1, .1), xycoords='data', + arrowprops=dict(facecolor='black', shrink=0.05), + horizontalalignment='center', verticalalignment='center') +plt.annotate(r'($0$, $1$, $0$)', + xy=(0, 1), xytext=(.1, 1), xycoords='data', + arrowprops=dict(facecolor='black', shrink=0.05), + horizontalalignment='center', verticalalignment='center') +# Add grid +plt.grid("off") +for x in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]: + plt.plot([0, x], [x, 0], 'k', alpha=0.2) + plt.plot([0, 0 + (1-x)/2], [x, x + (1-x)/2], 'k', alpha=0.2) + plt.plot([x, x + (1-x)/2], [0, 0 + (1-x)/2], 'k', alpha=0.2) + +plt.title("Change of predicted probabilities after sigmoid calibration") +plt.xlabel("Probability class 1") +plt.ylabel("Probability class 2") +plt.xlim(-0.05, 1.05) +plt.ylim(-0.05, 1.05) +plt.legend(loc="best") + +print("Log-loss of") +print(" * uncalibrated classifier trained on 800 datapoints: %.3f " + % score) +print(" * classifier trained on 600 datapoints and calibrated on " + "200 datapoint: %.3f" % sig_score) + +# Illustrate calibrator +plt.figure(1) +# generate grid over 2-simplex +p1d = np.linspace(0, 1, 20) +p0, p1 = np.meshgrid(p1d, p1d) +p2 = 1 - p0 - p1 +p = np.c_[p0.ravel(), p1.ravel(), p2.ravel()] +p = p[p[:, 2] >= 0] + +calibrated_classifier = sig_clf.calibrated_classifiers_[0] +prediction = np.vstack([calibrator.predict(this_p) + for calibrator, this_p in + zip(calibrated_classifier.calibrators_, p.T)]).T +prediction /= prediction.sum(axis=1)[:, None] + +# Ploit modifications of calibrator +for i in range(prediction.shape[0]): + plt.arrow(p[i, 0], p[i, 1], + prediction[i, 0] - p[i, 0], prediction[i, 1] - p[i, 1], + head_width=1e-2, color=colors[np.argmax(p[i])]) +# Plot boundaries of unit simplex +plt.plot([0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0], 'k', label="Simplex") + +plt.grid("off") +for x in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]: + plt.plot([0, x], [x, 0], 'k', alpha=0.2) + plt.plot([0, 0 + (1-x)/2], [x, x + (1-x)/2], 'k', alpha=0.2) + plt.plot([x, x + (1-x)/2], [0, 0 + (1-x)/2], 'k', alpha=0.2) + +plt.title("Illustration of sigmoid calibrator") +plt.xlabel("Probability class 1") +plt.ylabel("Probability class 2") +plt.xlim(-0.05, 1.05) +plt.ylim(-0.05, 1.05) + +plt.show() diff --git a/examples/calibration/plot_compare_calibration.py b/examples/calibration/plot_compare_calibration.py new file mode 100644 index 0000000000000..dc40ff108b8c5 --- /dev/null +++ b/examples/calibration/plot_compare_calibration.py @@ -0,0 +1,124 @@ +""" +======================================= +Comparison of Calibration of Classifiers +======================================= + +Well calibrated classifiers are probabilistic classifiers for which the output +of the predict_proba method can be directly interpreted as a confidence level. 
+For instance, a well calibrated (binary) classifier should classify the samples
+such that among the samples to which it gave a predict_proba value close to
+0.8, approx. 80% actually belong to the positive class.
+
+LogisticRegression returns well calibrated predictions as it directly
+optimizes log-loss. In contrast, the other methods return biased probabilities,
+with different biases per method:
+
+* GaussianNB tends to push probabilities to 0 or 1 (note the counts in
+  the histograms). This is mainly because it makes the assumption that features
+  are conditionally independent given the class, which is not the case in this
+  dataset, which contains 2 redundant features.
+
+* RandomForestClassifier shows the opposite behavior: the histograms show
+  peaks at approx. 0.2 and 0.9 probability, while probabilities close to 0 or 1
+  are very rare. An explanation for this is given by Niculescu-Mizil and Caruana
+  [1]: "Methods such as bagging and random forests that average predictions from
+  a base set of models can have difficulty making predictions near 0 and 1
+  because variance in the underlying base models will bias predictions that
+  should be near zero or one away from these values. Because predictions are
+  restricted to the interval [0,1], errors caused by variance tend to be one-
+  sided near zero and one. For example, if a model should predict p = 0 for a
+  case, the only way bagging can achieve this is if all bagged trees predict
+  zero. If we add noise to the trees that bagging is averaging over, this noise
+  will cause some trees to predict values larger than 0 for this case, thus
+  moving the average prediction of the bagged ensemble away from 0. We observe
+  this effect most strongly with random forests because the base-level trees
+  trained with random forests have relatively high variance due to feature
+  subsetting." As a result, the calibration curve shows a characteristic sigmoid
+  shape, indicating that the classifier could trust its "intuition" more and
+  typically return probabilities closer to 0 or 1.
+
+* Support Vector Classification (SVC) shows an even more sigmoid curve than
+  the RandomForestClassifier, which is typical for maximum-margin methods
+  (compare Niculescu-Mizil and Caruana [1]), which focus on hard samples
+  that are close to the decision boundary (the support vectors).
+
+.. topic:: References:
+
+    .. [1] Predicting Good Probabilities with Supervised Learning,
+           A. Niculescu-Mizil & R. Caruana, ICML 2005
+"""
+print(__doc__)
+
+# Author: Jan Hendrik Metzen
+# License: BSD Style.
+ +import numpy as np +np.random.seed(0) + +import matplotlib.pyplot as plt + +from sklearn import datasets +from sklearn.svm import SVC +from sklearn.naive_bayes import GaussianNB +from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import RandomForestClassifier +from sklearn.svm import LinearSVC +from sklearn.metrics import brier_score_loss +from sklearn.calibration import calibration_curve + +X, y = datasets.make_classification(n_samples=100000, n_features=20, + n_informative=2, n_redundant=2) + +train_samples = 100 # Samples used for training the models + +X_train = X[:train_samples] +X_test = X[train_samples:] +y_train = y[:train_samples] +y_test = y[train_samples:] + +# Create classifiers +lr = LogisticRegression() +gnb = GaussianNB() +svc = LinearSVC(C=1.0) +rfc = RandomForestClassifier(n_estimators=100) + + +############################################################################### +# Plot calibration plots + +plt.figure(figsize=(10, 10)) +ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2) +ax2 = plt.subplot2grid((3, 1), (2, 0)) + +ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated") +for clf, name in [(lr, 'Logistic'), + (gnb, 'Naive Bayes'), + (svc, 'Support Vector Classification'), + (rfc, 'Random Forest')]: + clf.fit(X_train, y_train) + if hasattr(clf, "predict_proba"): + prob_pos = clf.predict_proba(X_test)[:, 1] + else: # use decision function + prob_pos = clf.decision_function(X_test) + prob_pos = \ + (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min()) + fraction_of_positives, mean_predicted_value = \ + calibration_curve(y_test, prob_pos, n_bins=10) + + ax1.plot(mean_predicted_value, fraction_of_positives, "s-", + label="%s" % (name, )) + + ax2.hist(prob_pos, range=(0, 1), bins=10, label=name, + histtype="step", lw=2) + +ax1.set_ylabel("Fraction of positives") +ax1.set_ylim([-0.05, 1.05]) +ax1.legend(loc="lower right") +ax1.set_title('Calibration plots (reliability curve)') + +ax2.set_xlabel("Mean predicted value") +ax2.set_ylabel("Count") +ax2.legend(loc="upper center", ncol=2) + +plt.tight_layout() +plt.show() diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 6ad049e1f637c..f3462531d8987 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -56,7 +56,7 @@ from .base import clone __check_build # avoid flakes unused variable error - __all__ = ['cluster', 'covariance', 'cross_decomposition', + __all__ = ['calibration', 'cluster', 'covariance', 'cross_decomposition', 'cross_validation', 'datasets', 'decomposition', 'dummy', 'ensemble', 'externals', 'feature_extraction', 'feature_selection', 'gaussian_process', 'grid_search', 'hmm', diff --git a/sklearn/calibration.py b/sklearn/calibration.py new file mode 100644 index 0000000000000..d6a8e290c42c7 --- /dev/null +++ b/sklearn/calibration.py @@ -0,0 +1,522 @@ +"""Calibration of predicted probabilities.""" + +# Author: Alexandre Gramfort +# Balazs Kegl +# Jan Hendrik Metzen +# Mathieu Blondel +# +# License: BSD 3 clause + +from __future__ import division +import inspect + +from math import log +import numpy as np + +from scipy.optimize import fmin_bfgs + +from .base import BaseEstimator, ClassifierMixin, RegressorMixin, clone +from .preprocessing import LabelBinarizer +from .utils import check_X_y, check_array, indexable, column_or_1d +from .utils.validation import check_is_fitted +from .isotonic import IsotonicRegression +from .naive_bayes import GaussianNB +from .cross_validation import _check_cv +from .metrics.classification import 
_check_binary_probabilistic_predictions
+
+
+class CalibratedClassifierCV(BaseEstimator, ClassifierMixin):
+    """Probability calibration with isotonic regression or sigmoid.
+
+    With this class, the base_estimator is fit on the train set of the
+    cross-validation generator and the test set is used for calibration.
+    The probabilities for each of the folds are then averaged
+    for prediction. If cv="prefit" is passed to __init__,
+    it is assumed that base_estimator has already been
+    fitted and all data is used for calibration. Note that
+    data for fitting the classifier and for calibrating it must be disjoint.
+
+    Parameters
+    ----------
+    base_estimator : instance BaseEstimator
+        The classifier whose output decision function needs to be calibrated
+        to offer more accurate predict_proba outputs. If cv="prefit", the
+        classifier must have been fit already on data.
+
+    method : 'sigmoid' | 'isotonic'
+        The method to use for calibration. Can be 'sigmoid' which
+        corresponds to Platt's method or 'isotonic' which is a
+        non-parametric approach. It is not advised to use isotonic calibration
+        with too few calibration samples (<<1000) since it tends to overfit.
+        Use sigmoid (Platt's calibration) in this case.
+
+    cv : integer or cross-validation generator or "prefit", optional
+        If an integer is passed, it is the number of folds (default 3).
+        Specific cross-validation objects can be passed, see
+        sklearn.cross_validation module for the list of possible objects.
+        If "prefit" is passed, it is assumed that base_estimator has been
+        fitted already and all data is used for calibration.
+
+    Attributes
+    ----------
+    classes_ : array, shape (n_classes)
+        The class labels.
+
+    calibrated_classifiers_ : list (len() equal to cv or 1 if cv == "prefit")
+        The list of calibrated classifiers, one for each cross-validation fold,
+        which has been fitted on all but the validation fold and calibrated
+        on the validation fold.
+
+    References
+    ----------
+    .. [1] Obtaining calibrated probability estimates from decision trees
+           and naive Bayesian classifiers, B. Zadrozny & C. Elkan, ICML 2001
+
+    .. [2] Transforming Classifier Scores into Accurate Multiclass
+           Probability Estimates, B. Zadrozny & C. Elkan, (KDD 2002)
+
+    .. [3] Probabilistic Outputs for Support Vector Machines and Comparisons to
+           Regularized Likelihood Methods, J. Platt, (1999)
+
+    .. [4] Predicting Good Probabilities with Supervised Learning,
+           A. Niculescu-Mizil & R. Caruana, ICML 2005
+    """
+    def __init__(self, base_estimator=GaussianNB(), method='sigmoid', cv=3):
+        self.base_estimator = base_estimator
+        self.method = method
+        self.cv = cv
+
+    def fit(self, X, y, sample_weight=None):
+        """Fit the calibrated model
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            Training data.
+
+        y : array-like, shape (n_samples,)
+            Target values.
+
+        sample_weight : array-like, shape = [n_samples] or None
+            Sample weights. If None, then samples are equally weighted.
+
+        Returns
+        -------
+        self : object
+            Returns an instance of self.
+ """ + X, y = check_X_y(X, y, accept_sparse=['csc', 'csr', 'coo']) + X, y = indexable(X, y) + lb = LabelBinarizer().fit(y) + self.classes_ = lb.classes_ + + # Check that we each cross-validation fold can have at least one + # example per class + n_folds = self.cv if isinstance(self.cv, int) \ + else self.cv.n_folds if hasattr(self.cv, "n_folds") else None + if n_folds and \ + np.any([np.sum(y == class_) < n_folds for class_ in self.classes_]): + raise ValueError("Requesting %d-fold cross-validation but provided" + " less than %d examples for at least one class." + % (n_folds, n_folds)) + + self.calibrated_classifiers_ = [] + if self.cv == "prefit": + calibrated_classifier = _CalibratedClassifier(self.base_estimator, + method=self.method) + if sample_weight is not None: + calibrated_classifier.fit(X, y, sample_weight) + else: + calibrated_classifier.fit(X, y) + self.calibrated_classifiers_.append(calibrated_classifier) + else: + cv = _check_cv(self.cv, X, y, classifier=True) + for train, test in cv: + this_estimator = clone(self.base_estimator) + if sample_weight is not None and \ + "sample_weight" in inspect.getargspec( + this_estimator.fit)[0]: + this_estimator.fit(X[train], y[train], + sample_weight[train]) + else: + this_estimator.fit(X[train], y[train]) + + calibrated_classifier = \ + _CalibratedClassifier(this_estimator, method=self.method) + if sample_weight is not None: + calibrated_classifier.fit(X[test], y[test], + sample_weight[test]) + else: + calibrated_classifier.fit(X[test], y[test]) + self.calibrated_classifiers_.append(calibrated_classifier) + + return self + + def predict_proba(self, X): + """Posterior probabilities of classification + + This function returns posterior probabilities of classification + according to each class on an array of test vectors X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The samples. + + Returns + ------- + C : array, shape (n_samples, n_classes) + The predicted probas. + """ + check_is_fitted(self, ["classes_", "calibrated_classifiers_"]) + X = check_array(X, accept_sparse=['csc', 'csr', 'coo']) + # Compute the arithmetic mean of the predictions of the calibrated + # classfiers + mean_proba = np.zeros((X.shape[0], len(self.classes_))) + for calibrated_classifier in self.calibrated_classifiers_: + proba = calibrated_classifier.predict_proba(X) + mean_proba += proba + + mean_proba /= len(self.calibrated_classifiers_) + + return mean_proba + + def predict(self, X): + """Predict the target of new samples. Can be different from the + prediction of the uncalibrated classifier. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The samples. + + Returns + ------- + C : array, shape (n_samples,) + The predicted class. + """ + check_is_fitted(self, ["classes_", "calibrated_classifiers_"]) + return self.classes_[np.argmax(self.predict_proba(X), axis=1)] + + +class _CalibratedClassifier(object): + """Probability calibration with isotonic regression or sigmoid. + + It assumes that base_estimator has already been fit, and trains the + calibration on the input set of the fit function. Note that this class + should not be used as an estimator directly. Use CalibratedClassifierCV + with cv="prefit" instead. + + Parameters + ---------- + base_estimator : instance BaseEstimator + The classifier whose output decision function needs to be calibrated + to offer more accurate predict_proba outputs. No default value since + it has to be an already fitted estimator. 
+ + method : 'sigmoid' | 'isotonic' + The method to use for calibration. Can be 'sigmoid' which + corresponds to Platt's method or 'isotonic' which is a + non-parameteric approach based on isotonic regression. + + References + ---------- + .. [1] Obtaining calibrated probability estimates from decision trees + and naive Bayesian classifiers, B. Zadrozny & C. Elkan, ICML 2001 + + .. [2] Transforming Classifier Scores into Accurate Multiclass + Probability Estimates, B. Zadrozny & C. Elkan, (KDD 2002) + + .. [3] Probabilistic Outputs for Support Vector Machines and Comparisons to + Regularized Likelihood Methods, J. Platt, (1999) + + .. [4] Predicting Good Probabilities with Supervised Learning, + A. Niculescu-Mizil & R. Caruana, ICML 2005 + """ + def __init__(self, base_estimator, method='sigmoid'): + self.base_estimator = base_estimator + self.method = method + + def _preproc(self, X): + n_classes = len(self.classes_) + if hasattr(self.base_estimator, "decision_function"): + df = self.base_estimator.decision_function(X) + if df.ndim == 1: + df = df[:, np.newaxis] + elif hasattr(self.base_estimator, "predict_proba"): + df = self.base_estimator.predict_proba(X) + if n_classes == 2: + df = df[:, 1:] + else: + raise RuntimeError('classifier has no decision_function or ' + 'predict_proba method.') + + idx_pos_class = np.arange(df.shape[1]) + + return df, idx_pos_class + + def fit(self, X, y, sample_weight=None): + """Calibrate the fitted model + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Training data. + + y : array-like, shape (n_samples,) + Target values. + + sample_weight : array-like, shape = [n_samples] or None + Sample weights. If None, then samples are equally weighted. + + Returns + ------- + self : object + Returns an instance of self. + """ + lb = LabelBinarizer() + Y = lb.fit_transform(y) + self.classes_ = lb.classes_ + + df, idx_pos_class = self._preproc(X) + self.calibrators_ = [] + + for k, this_df in zip(idx_pos_class, df.T): + if self.method == 'isotonic': + calibrator = IsotonicRegression(out_of_bounds='clip') + # XXX: isotonic regression cannot deal correctly with + # situations in which multiple inputs are identical but + # have different outputs. Since this is not untypical + # when calibrating, we add some small random jitter to + # the inputs. + this_df = \ + this_df + np.random.normal(0, 1e-10, this_df.shape[0]) + elif self.method == 'sigmoid': + calibrator = _SigmoidCalibration() + else: + raise ValueError('method should be "sigmoid" or ' + '"isotonic". Got %s.' % self.method) + calibrator.fit(this_df, Y[:, k], sample_weight) + self.calibrators_.append(calibrator) + + return self + + def predict_proba(self, X): + """Posterior probabilities of classification + + This function returns posterior probabilities of classification + according to each class on an array of test vectors X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The samples. + + Returns + ------- + C : array, shape (n_samples, n_classes) + The predicted probas. Can be exact zeros. + """ + n_classes = len(self.classes_) + proba = np.zeros((X.shape[0], n_classes)) + + df, idx_pos_class = self._preproc(X) + + for k, this_df, calibrator in \ + zip(idx_pos_class, df.T, self.calibrators_): + if n_classes == 2: + k += 1 + proba[:, k] = calibrator.predict(this_df) + + # Normalize the probabilities + if n_classes == 2: + proba[:, 0] = 1. 
- proba[:, 1] + else: + proba /= np.sum(proba, axis=1)[:, np.newaxis] + + # XXX : for some reason all probas can be 0 + proba[np.isnan(proba)] = 1. / n_classes + + # Deal with cases where the predicted probability minimally exceeds 1.0 + proba[(1.0 < proba) & (proba <= 1.0 + 1e-5)] = 1.0 + + return proba + + +def sigmoid_calibration(df, y, sample_weight=None): + """Probability Calibration with sigmoid method (Platt 2000) + + Parameters + ---------- + df : ndarray, shape (n_samples,) + The decision function or predict proba for the samples. + + y : ndarray, shape (n_samples,) + The targets. + + sample_weight : array-like, shape = [n_samples] or None + Sample weights. If None, then samples are equally weighted. + + Returns + ------- + a : float + The slope. + + b : float + The intercept. + + References + ---------- + Platt, "Probabilistic Outputs for Support Vector Machines" + """ + df = column_or_1d(df) + y = column_or_1d(y) + + F = df # F follows Platt's notations + tiny = np.finfo(np.float).tiny # to avoid division by 0 warning + + # Bayesian priors (see Platt end of section 2.2) + prior0 = float(np.sum(y <= 0)) + prior1 = y.shape[0] - prior0 + T = np.zeros(y.shape) + T[y > 0] = (prior1 + 1.) / (prior1 + 2.) + T[y <= 0] = 1. / (prior0 + 2.) + T1 = 1. - T + + def objective(AB): + # From Platt (beginning of Section 2.2) + E = np.exp(AB[0] * F + AB[1]) + P = 1. / (1. + E) + l = -(T * np.log(P + tiny) + T1 * np.log(1. - P + tiny)) + if sample_weight is not None: + return (sample_weight * l).sum() + else: + return l.sum() + + def grad(AB): + # gradient of the objective function + E = np.exp(AB[0] * F + AB[1]) + P = 1. / (1. + E) + TEP_minus_T1P = P * (T * E - T1) + if sample_weight is not None: + TEP_minus_T1P *= sample_weight + dA = np.dot(TEP_minus_T1P, F) + dB = np.sum(TEP_minus_T1P) + return np.array([dA, dB]) + + AB0 = np.array([0., log((prior0 + 1.) / (prior1 + 1.))]) + AB_ = fmin_bfgs(objective, AB0, fprime=grad, disp=False) + return AB_[0], AB_[1] + + +class _SigmoidCalibration(BaseEstimator, RegressorMixin): + """Sigmoid regression model. + + Attributes + ---------- + `a_` : float + The slope. + + `b_` : float + The intercept. + """ + def fit(self, X, y, sample_weight=None): + """Fit the model using X, y as training data. + + Parameters + ---------- + X : array-like, shape (n_samples,) + Training data. + + y : array-like, shape (n_samples,) + Training target. + + sample_weight : array-like, shape = [n_samples] or None + Sample weights. If None, then samples are equally weighted. + + Returns + ------- + self : object + Returns an instance of self. + """ + X = column_or_1d(X) + y = column_or_1d(y) + X, y = indexable(X, y) + + self.a_, self.b_ = sigmoid_calibration(X, y, sample_weight) + return self + + def predict(self, T): + """Predict new data by linear interpolation. + + Parameters + ---------- + T : array-like, shape (n_samples,) + Data to predict from. + + Returns + ------- + `T_` : array, shape (n_samples,) + The predicted data. + """ + T = column_or_1d(T) + return 1. / (1. + np.exp(self.a_ * T + self.b_)) + + +def calibration_curve(y_true, y_prob, normalize=False, n_bins=5): + """Compute true and predicted probabilities for a calibration curve. + + Parameters + ---------- + y_true : array, shape (n_samples,) + True targets. + + y_prob : array, shape (n_samples,) + Probabilities of the positive class. + + normalize : bool, optional, default=False + Whether y_prob needs to be normalized into the bin [0, 1], i.e. is not + a proper probability. 
If True, the smallest value in y_prob is mapped + onto 0 and the largest one onto 1. + + n_bins : int + Number of bins. A bigger number requires more data. + + Returns + ------- + prob_true : array, shape (n_bins,) + The true probability in each bin (fraction of positives). + + prob_pred : array, shape (n_bins,) + The mean predicted probability in each bin. + + References + ---------- + Alexandru Niculescu-Mizil and Rich Caruana (2005) Predicting Good + Probabilities With Supervised Learning, in Proceedings of the 22nd + International Conference on Machine Learning (ICML). + See section 4 (Qualitative Analysis of Predictions). + """ + y_true = column_or_1d(y_true) + y_prob = column_or_1d(y_prob) + + if normalize: # Normalize predicted values into interval [0, 1] + y_prob = (y_prob - y_prob.min()) / (y_prob.max() - y_prob.min()) + elif y_prob.min() < 0 or y_prob.max() > 1: + raise ValueError("y_prob has values outside [0, 1] and normalize is " + "set to False.") + + y_true = _check_binary_probabilistic_predictions(y_true, y_prob) + + bins = np.linspace(0., 1. + 1e-8, n_bins + 1) + binids = np.digitize(y_prob, bins) - 1 + + bin_sums = np.bincount(binids, weights=y_prob, minlength=len(bins)) + bin_true = np.bincount(binids, weights=y_true, minlength=len(bins)) + bin_total = np.bincount(binids, minlength=len(bins)) + + nonzero = bin_total != 0 + prob_true = (bin_true[nonzero] / bin_total[nonzero]) + prob_pred = (bin_sums[nonzero] / bin_total[nonzero]) + + return prob_true, prob_pred diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 3cd4d2afe3308..c455f9e8c4d81 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -3,6 +3,7 @@ and pairwise metrics and distance computations. """ + from .ranking import auc from .ranking import average_precision_score from .ranking import coverage_error @@ -25,6 +26,7 @@ from .classification import precision_score from .classification import recall_score from .classification import zero_one_loss +from .classification import brier_score_loss from . import cluster from .cluster import adjusted_mutual_info_score @@ -103,4 +105,5 @@ 'silhouette_score', 'v_measure_score', 'zero_one_loss', + 'brier_score_loss', ] diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 99a7e509fe69e..441af936d53d0 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -28,12 +28,13 @@ from scipy.sparse import csr_matrix from scipy.spatial.distance import hamming as sp_hamming -from ..preprocessing import LabelBinarizer +from ..preprocessing import LabelBinarizer, label_binarize from ..preprocessing import LabelEncoder from ..utils import check_array from ..utils import check_consistent_length from ..preprocessing import MultiLabelBinarizer from ..utils import column_or_1d +from ..utils import check_consistent_length from ..utils.multiclass import unique_labels from ..utils.multiclass import type_of_target from ..utils.validation import _num_samples @@ -1515,3 +1516,95 @@ def hinge_loss(y_true, pred_decision, labels=None, sample_weight=None): # The hinge_loss doesn't penalize good enough predictions. losses[losses <= 0] = 0 return np.average(losses, weights=sample_weight) + + +def _check_binary_probabilistic_predictions(y_true, y_prob): + """Check that y_true is binary and y_prob contains valid probabilities""" + check_consistent_length(y_true, y_prob) + + labels = np.unique(y_true) + + if len(labels) != 2: + raise ValueError("Only binary classification is supported. 
" + "Provided labels %s." % labels) + + if y_prob.max() > 1: + raise ValueError("y_prob contains values greater than 1.") + + if y_prob.min() < 0: + raise ValueError("y_prob contains values less than 0.") + + return label_binarize(y_true, labels)[:, 0] + + +def brier_score_loss(y_true, y_prob, sample_weight=None, pos_label=None): + """Compute the Brier score. + + The smaller the Brier score, the better, hence the naming with "loss". + + Across all items in a set N predictions, the Brier score measures the + mean squared difference between (1) the predicted probability assigned + to the possible outcomes for item i, and (2) the actual outcome. + Therefore, the lower the Brier score is for a set of predictions, the + better the predictions are calibrated. Note that the Brier score always + takes on a value between zero and one, since this is the largest + possible difference between a predicted probability (which must be + between zero and one) and the actual outcome (which can take on values + of only 0 and 1). + + The Brier score is appropriate for binary and categorical outcomes that + can be structured as true or false, but is inappropriate for ordinal + variables which can take on three or more values (this is because the + Brier score assumes that all possible outcomes are equivalently + "distant" from one another). Which label is considered to be the positive + label is controlled via the parameter pos_label, which defaults to 1. + + + Parameters + ---------- + y_true : array, shape (n_samples,) + True targets. + + y_prob : array, shape (n_samples,) + Probabilities of the positive class. + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. + + pos_label : int (default: None) + Label of the positive class. If None, the maximum label is used as + positive class + + Returns + ------- + score : float + Brier score + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import brier_score_loss + >>> y_true = np.array([0, 1, 1, 0]) + >>> y_true_categorical = np.array(["spam", "ham", "ham", "spam"]) + >>> y_prob = np.array([0.1, 0.9, 0.8, 0.3]) + >>> brier_score_loss(y_true, y_prob) # doctest: +ELLIPSIS + 0.037... + >>> brier_score_loss(y_true, 1-y_prob, pos_label=0) # doctest: +ELLIPSIS + 0.037... + >>> brier_score_loss(y_true_categorical, y_prob, \ + pos_label="ham") # doctest: +ELLIPSIS + 0.037... 
+ >>> brier_score_loss(y_true, np.array(y_prob) > 0.5) + 0.0 + + References + ---------- + http://en.wikipedia.org/wiki/Brier_score + """ + y_true = column_or_1d(y_true) + y_prob = column_or_1d(y_prob) + if pos_label is None: + pos_label = y_true.max() + y_true = np.array(y_true == pos_label, int) + y_true = _check_binary_probabilistic_predictions(y_true, y_prob) + return np.average((y_true - y_prob) ** 2, weights=sample_weight) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index fe48585fddb16..d658169bd20ef 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -1,6 +1,7 @@ from __future__ import division, print_function import numpy as np +from scipy import linalg from functools import partial from itertools import product import warnings @@ -40,6 +41,7 @@ from sklearn.metrics import precision_score from sklearn.metrics import recall_score from sklearn.metrics import zero_one_loss +from sklearn.metrics import brier_score_loss from sklearn.metrics.classification import _check_targets @@ -1252,3 +1254,20 @@ def test_log_loss(): y_pred = [[0.2, 0.7], [0.6, 0.5], [0.4, 0.1], [0.7, 0.2]] loss = log_loss(y_true, y_pred) assert_almost_equal(loss, 1.0383217, decimal=6) + + +def test_brier_score_loss(): + """Check brier_score_loss function""" + y_true = np.array([0, 1, 1, 0, 1, 1]) + y_pred = np.array([0.1, 0.8, 0.9, 0.3, 1., 0.95]) + true_score = linalg.norm(y_true - y_pred) ** 2 / len(y_true) + + assert_almost_equal(brier_score_loss(y_true, y_true), 0.0) + assert_almost_equal(brier_score_loss(y_true, y_pred), true_score) + assert_almost_equal(brier_score_loss(1. + y_true, y_pred), + true_score) + assert_almost_equal(brier_score_loss(2 * y_true - 1, y_pred), + true_score) + assert_raises(ValueError, brier_score_loss, y_true, y_pred[1:]) + assert_raises(ValueError, brier_score_loss, y_true, y_pred + 1.) + assert_raises(ValueError, brier_score_loss, y_true, y_pred - 1.) 
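As a quick sanity check on the new metric: the Brier score of binary targets is just the mean squared difference between the outcomes and the predicted positive-class probabilities, which is the identity that ``test_brier_score_loss`` asserts via ``linalg.norm``. The following snippet is illustrative only; it is not part of the patch and assumes the patched ``sklearn.metrics`` is importable::

    import numpy as np
    from sklearn.metrics import brier_score_loss

    y_true = np.array([0, 1, 1, 0])
    y_prob = np.array([0.1, 0.9, 0.8, 0.3])

    # Mean squared gap between the binary outcome and the predicted
    # probability of the positive class.
    manual = np.mean((y_true - y_prob) ** 2)  # 0.0375, the 0.037... above
    assert np.isclose(manual, brier_score_loss(y_true, y_prob))

    # pos_label selects which class the probabilities refer to, so scoring
    # the complementary probabilities against label 0 gives the same value.
    assert np.isclose(manual, brier_score_loss(y_true, 1 - y_prob, pos_label=0))
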
diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 34e30e0c7640d..f443e776ad14d 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -25,6 +25,7 @@ from sklearn.metrics import accuracy_score from sklearn.metrics import average_precision_score +from sklearn.metrics import brier_score_loss from sklearn.metrics import confusion_matrix from sklearn.metrics import coverage_error from sklearn.metrics import explained_variance_score @@ -148,6 +149,8 @@ "hinge_loss": hinge_loss, + "brier_score_loss": brier_score_loss, + "roc_auc_score": roc_auc_score, "weighted_roc_auc": partial(roc_auc_score, average="weighted"), "samples_roc_auc": partial(roc_auc_score, average="samples"), @@ -197,6 +200,7 @@ "macro_roc_auc", "samples_roc_auc", "coverage_error", + "brier_score_loss" ] # Metrics with an "average" argument @@ -211,7 +215,9 @@ # Metrics with a "pos_label" argument METRICS_WITH_POS_LABEL = [ - "roc_curve", "hinge_loss", + "roc_curve", + + "brier_score_loss", "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score", @@ -554,9 +560,15 @@ def test_invariance_string_vs_numbers_labels(): "invariance test".format(name)) for name, metric in THRESHOLDED_METRICS.items(): - if name in ("log_loss", "hinge_loss", "unnormalized_log_loss"): + if name in ("log_loss", "hinge_loss", "unnormalized_log_loss", + "brier_score_loss"): + # Ugly, but handle case with a pos_label and label + metric_str = metric + if name in METRICS_WITH_POS_LABEL: + metric_str = partial(metric_str, pos_label=pos_label_str) + measure_with_number = metric(y1, y2) - measure_with_str = metric(y1_str, y2) + measure_with_str = metric_str(y1_str, y2) assert_array_equal(measure_with_number, measure_with_str, err_msg="{0} failed string vs number " "invariance test".format(name)) diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py new file mode 100644 index 0000000000000..98e3d63fc70e5 --- /dev/null +++ b/sklearn/tests/test_calibration.py @@ -0,0 +1,234 @@ +# Authors: Alexandre Gramfort +# License: BSD 3 clause + +import numpy as np +from scipy import sparse + +from sklearn.utils.testing import (assert_array_almost_equal, assert_equal, + assert_greater, assert_almost_equal, + assert_greater_equal, + assert_array_equal, + assert_raises) +from sklearn.datasets import make_classification, make_blobs +from sklearn.naive_bayes import MultinomialNB +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.svm import LinearSVC +from sklearn.linear_model import Ridge +from sklearn.metrics import brier_score_loss, log_loss +from sklearn.calibration import CalibratedClassifierCV +from sklearn.calibration import sigmoid_calibration, _SigmoidCalibration +from sklearn.calibration import calibration_curve + + +def test_calibration(): + """Test calibration objects with isotonic and sigmoid""" + n_samples = 100 + X, y = make_classification(n_samples=2 * n_samples, n_features=6, + random_state=42) + sample_weight = np.random.RandomState(seed=42).uniform(size=y.size) + + X -= X.min() # MultinomialNB only allows positive X + + # split train and test + X_train, y_train, sw_train = \ + X[:n_samples], y[:n_samples], sample_weight[:n_samples] + X_test, y_test = X[n_samples:], y[n_samples:] + + # Naive-Bayes + clf = MultinomialNB() + clf.fit(X_train, y_train, sw_train) + prob_pos_clf = clf.predict_proba(X_test)[:, 1] + + pc_clf = CalibratedClassifierCV(clf, cv=y.size + 1) + assert_raises(ValueError, pc_clf.fit, 
X, y) + + # Naive Bayes with calibration + for this_X_train, this_X_test in [(X_train, X_test), + (sparse.csr_matrix(X_train), + sparse.csr_matrix(X_test))]: + for method in ['isotonic', 'sigmoid']: + pc_clf = CalibratedClassifierCV(clf, method=method, cv=2) + # Note that this fit overwrites the fit on the entire training + # set + pc_clf.fit(this_X_train, y_train, sample_weight=sw_train) + prob_pos_pc_clf = pc_clf.predict_proba(this_X_test)[:, 1] + + # Check that brier score has improved after calibration + assert_greater(brier_score_loss(y_test, prob_pos_clf), + brier_score_loss(y_test, prob_pos_pc_clf)) + + # Check invariance against relabeling [0, 1] -> [1, 2] + pc_clf.fit(this_X_train, y_train + 1, sample_weight=sw_train) + prob_pos_pc_clf_relabeled = pc_clf.predict_proba(this_X_test)[:, 1] + assert_array_almost_equal(prob_pos_pc_clf, + prob_pos_pc_clf_relabeled) + + # Check invariance against relabeling [0, 1] -> [-1, 1] + pc_clf.fit(this_X_train, 2 * y_train - 1, sample_weight=sw_train) + prob_pos_pc_clf_relabeled = pc_clf.predict_proba(this_X_test)[:, 1] + assert_array_almost_equal(prob_pos_pc_clf, + prob_pos_pc_clf_relabeled) + + # Check invariance against relabeling [0, 1] -> [1, 0] + pc_clf.fit(this_X_train, (y_train + 1) % 2, + sample_weight=sw_train) + prob_pos_pc_clf_relabeled = \ + pc_clf.predict_proba(this_X_test)[:, 1] + if method == "sigmoid": + assert_array_almost_equal(prob_pos_pc_clf, + 1 - prob_pos_pc_clf_relabeled) + else: + # Isotonic calibration is not invariant against relabeling + # but should improve in both cases + assert_greater(brier_score_loss(y_test, prob_pos_clf), + brier_score_loss((y_test + 1) % 2, + prob_pos_pc_clf_relabeled)) + + # check that calibration can also deal with regressors that have + # a decision_function + clf_base_regressor = CalibratedClassifierCV(Ridge(), method="sigmoid") + clf_base_regressor.fit(X_train, y_train) + clf_base_regressor.predict(X_test) + + # Check failure cases: + # only "isotonic" and "sigmoid" should be accepted as methods + clf_invalid_method = CalibratedClassifierCV(clf, method="foo") + assert_raises(ValueError, clf_invalid_method.fit, X_train, y_train) + + # base-estimators should provide either decision_function or + # predict_proba (most regressors, for instance, should fail) + clf_base_regressor = \ + CalibratedClassifierCV(RandomForestRegressor(), method="sigmoid") + assert_raises(RuntimeError, clf_base_regressor.fit, X_train, y_train) + + +def test_calibration_multiclass(): + """Test calibration for multiclass """ + # test multi-class setting with classifier that implements + # only decision function + clf = LinearSVC() + X, y_idx = make_blobs(n_samples=100, n_features=2, random_state=42, + centers=3, cluster_std=3.0) + + # Use categorical labels to check that CalibratedClassifierCV supports + # them correctly + target_names = np.array(['a', 'b', 'c']) + y = target_names[y_idx] + + X_train, y_train = X[::2], y[::2] + X_test, y_test = X[1::2], y[1::2] + + clf.fit(X_train, y_train) + for method in ['isotonic', 'sigmoid']: + cal_clf = CalibratedClassifierCV(clf, method=method, cv=2) + cal_clf.fit(X_train, y_train) + probas = cal_clf.predict_proba(X_test) + assert_array_almost_equal(np.sum(probas, axis=1), np.ones(len(X_test))) + + # Check that log-loss of calibrated classifier is smaller than + # log-loss of naively turned OvR decision function to probabilities + # via softmax + def softmax(y_pred): + e = np.exp(-y_pred) + return e / e.sum(axis=1).reshape(-1, 1) + uncalibrated_log_loss = \ + log_loss(y_test, 
softmax(clf.decision_function(X_test))) + calibrated_log_loss = log_loss(y_test, probas) + assert_greater_equal(uncalibrated_log_loss, calibrated_log_loss) + + # Test that calibration of a multiclass classifier decreases log-loss + # for RandomForestClassifier + X, y = make_blobs(n_samples=100, n_features=2, random_state=42, + cluster_std=3.0) + X_train, y_train = X[::2], y[::2] + X_test, y_test = X[1::2], y[1::2] + + clf = RandomForestClassifier(n_estimators=10, random_state=42) + clf.fit(X_train, y_train) + clf_probs = clf.predict_proba(X_test) + loss = log_loss(y_test, clf_probs) + + for method in ['isotonic', 'sigmoid']: + cal_clf = CalibratedClassifierCV(clf, method=method, cv=3) + cal_clf.fit(X_train, y_train) + cal_clf_probs = cal_clf.predict_proba(X_test) + cal_loss = log_loss(y_test, cal_clf_probs) + assert_greater(loss, cal_loss) + + +def test_calibration_prefit(): + """Test calibration for prefitted classifiers""" + n_samples = 50 + X, y = make_classification(n_samples=3 * n_samples, n_features=6, + random_state=42) + sample_weight = np.random.RandomState(seed=42).uniform(size=y.size) + + X -= X.min() # MultinomialNB only allows positive X + + # split train and test + X_train, y_train, sw_train = \ + X[:n_samples], y[:n_samples], sample_weight[:n_samples] + X_calib, y_calib, sw_calib = \ + X[n_samples:2 * n_samples], y[n_samples:2 * n_samples], \ + sample_weight[n_samples:2 * n_samples] + X_test, y_test = X[2 * n_samples:], y[2 * n_samples:] + + # Naive-Bayes + clf = MultinomialNB() + clf.fit(X_train, y_train, sw_train) + prob_pos_clf = clf.predict_proba(X_test)[:, 1] + + # Naive Bayes with calibration + for this_X_calib, this_X_test in [(X_calib, X_test), + (sparse.csr_matrix(X_calib), + sparse.csr_matrix(X_test))]: + for method in ['isotonic', 'sigmoid']: + pc_clf = CalibratedClassifierCV(clf, method=method, cv="prefit") + + for sw in [sw_calib, None]: + pc_clf.fit(this_X_calib, y_calib, sample_weight=sw) + y_prob = pc_clf.predict_proba(this_X_test) + y_pred = pc_clf.predict(this_X_test) + prob_pos_pc_clf = y_prob[:, 1] + assert_array_equal(y_pred, + np.array([0, 1])[np.argmax(y_prob, axis=1)]) + + assert_greater(brier_score_loss(y_test, prob_pos_clf), + brier_score_loss(y_test, prob_pos_pc_clf)) + + +def test_sigmoid_calibration(): + """Test calibration values with Platt sigmoid model""" + exF = np.array([5, -4, 1.0]) + exY = np.array([1, -1, -1]) + # computed from my python port of the C++ code in LibSVM + AB_lin_libsvm = np.array([-0.20261354391187855, 0.65236314980010512]) + assert_array_almost_equal(AB_lin_libsvm, sigmoid_calibration(exF, exY), 3) + lin_prob = 1. / (1. 
+ np.exp(AB_lin_libsvm[0] * exF + AB_lin_libsvm[1])) + sk_prob = _SigmoidCalibration().fit(exF, exY).predict(exF) + assert_array_almost_equal(lin_prob, sk_prob, 6) + + # check that _SigmoidCalibration().fit only accepts 1d array or 2d column + # arrays + assert_raises(ValueError, _SigmoidCalibration().fit, + np.vstack((exF, exF)), exY) + + +def test_calibration_curve(): + """Check calibration_curve function""" + y_true = np.array([0, 0, 0, 1, 1, 1]) + y_pred = np.array([0., 0.1, 0.2, 0.8, 0.9, 1.]) + prob_true, prob_pred = calibration_curve(y_true, y_pred, n_bins=2) + prob_true_unnormalized, prob_pred_unnormalized = \ + calibration_curve(y_true, y_pred * 2, n_bins=2, normalize=True) + assert_equal(len(prob_true), len(prob_pred)) + assert_equal(len(prob_true), 2) + assert_almost_equal(prob_true, [0, 1]) + assert_almost_equal(prob_pred, [0.1, 0.9]) + assert_almost_equal(prob_true, prob_true_unnormalized) + assert_almost_equal(prob_pred, prob_pred_unnormalized) + + # probabilities outside [0, 1] should not be accepted when normalize + # is set to False + assert_raises(ValueError, calibration_curve, [1.1], [-0.1], + normalize=False) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 2ca110a17d6c0..edfd6dd4d2757 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -277,7 +277,7 @@ def check_fit_score_takes_y(name, Estimator): # in fit and score so they can be used in pipelines rnd = np.random.RandomState(0) X = rnd.uniform(size=(10, 3)) - y = (X[:, 0] * 4).astype(np.int) + y = np.arange(10) % 3 y = multioutput_estimator_convert_y_2d(name, y) estimator = Estimator() set_fast_parameters(estimator) @@ -294,7 +294,7 @@ def check_fit_score_takes_y(name, Estimator): def check_estimators_dtypes(name, Estimator): rnd = np.random.RandomState(0) - X_train_32 = 4 * rnd.uniform(size=(10, 3)).astype(np.float32) + X_train_32 = 3 * rnd.uniform(size=(20, 5)).astype(np.float32) X_train_64 = X_train_32.astype(np.float64) X_train_int_64 = X_train_32.astype(np.int64) X_train_int_32 = X_train_32.astype(np.int32) @@ -634,6 +634,7 @@ def check_classifiers_input_shapes(name, Classifier): # raised with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always", DataConversionWarning) + warnings.simplefilter("ignore", RuntimeWarning) classifier.fit(X, y[:, np.newaxis]) msg = "expected 1 DataConversionWarning, got: %s" % ( ", ".join([str(w_x) for w_x in w])) diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 704b67399abb4..a0ae8fb01b63d 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -509,7 +509,8 @@ def uninstall_mldata_mock(): # exclude them in another way 'ZeroEstimator', 'ScaledLogOddsEstimator', 'QuantileEstimator', 'MeanEstimator', - 'LogOddsEstimator', 'PriorProbabilityEstimator'] + 'LogOddsEstimator', 'PriorProbabilityEstimator', + '_SigmoidCalibration'] def all_estimators(include_meta_estimators=False,
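For completeness, here is one possible end-to-end use of the new public API exercised by the tests above. This sketch is not part of the patch; the ``GaussianNB`` base estimator, the dataset sizes, and the 3-fold ``cv`` are arbitrary illustrative choices, and whether the Brier score actually improves depends on the data::

    import numpy as np
    from sklearn.calibration import CalibratedClassifierCV, calibration_curve
    from sklearn.datasets import make_classification
    from sklearn.metrics import brier_score_loss
    from sklearn.naive_bayes import GaussianNB

    X, y = make_classification(n_samples=2000, n_features=6, random_state=42)
    X_train, y_train = X[:1000], y[:1000]
    X_test, y_test = X[1000:], y[1000:]

    # Plain Gaussian naive Bayes probabilities (often over-confident).
    clf = GaussianNB().fit(X_train, y_train)
    prob_raw = clf.predict_proba(X_test)[:, 1]

    # Same model wrapped in CalibratedClassifierCV: each internal CV split
    # fits the base estimator and a sigmoid calibrator, and the calibrated
    # probabilities of the splits are averaged at predict time.
    cal_clf = CalibratedClassifierCV(GaussianNB(), method='sigmoid', cv=3)
    cal_clf.fit(X_train, y_train)
    prob_cal = cal_clf.predict_proba(X_test)[:, 1]

    # Brier score (lower is better) before and after calibration.
    print(brier_score_loss(y_test, prob_raw), brier_score_loss(y_test, prob_cal))

    # Reliability diagram coordinates: observed frequency of the positive
    # class per bin versus the mean predicted probability in that bin.
    frac_pos, mean_pred = calibration_curve(y_test, prob_cal, n_bins=5)
    print(np.c_[mean_pred, frac_pos])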