diff --git a/examples/calibration/plot_compare_calibration.py b/examples/calibration/plot_compare_calibration.py
index b2ec219c9ca1c..389e231c8f07c 100644
--- a/examples/calibration/plot_compare_calibration.py
+++ b/examples/calibration/plot_compare_calibration.py
@@ -26,8 +26,14 @@
 # We will use a synthetic binary classification dataset with 100,000 samples
 # and 20 features. Of the 20 features, only 2 are informative, 2 are
 # redundant (random combinations of the informative features) and the
-# remaining 16 are uninformative (random numbers). Of the 100,000 samples,
-# 100 will be used for model fitting and the remaining for testing.
+# remaining 16 are uninformative (random numbers).
+#
+# Of the 100,000 samples, 100 will be used for model fitting and the remaining
+# for testing. Note that this split is quite unusual: the goal is to obtain
+# stable calibration curve estimates for models that are potentially prone to
+# overfitting. In practice, one should rather use cross-validation with more
+# balanced splits, but this would make the code of this example more
+# complicated to follow.
 from sklearn.datasets import make_classification
 from sklearn.model_selection import train_test_split
@@ -86,17 +92,26 @@ def predict_proba(self, X):
 from sklearn.calibration import CalibrationDisplay
 from sklearn.ensemble import RandomForestClassifier
-from sklearn.linear_model import LogisticRegression
+from sklearn.linear_model import LogisticRegressionCV
 from sklearn.naive_bayes import GaussianNB
-# Create classifiers
-lr = LogisticRegression()
+# Define the classifiers to be compared in the study.
+#
+# Note that we use a variant of the logistic regression model that can
+# automatically tune its regularization parameter.
+#
+# For a fair comparison, we should run a hyper-parameter search for all the
+# classifiers, but we don't do it here for the sake of keeping the example
+# code concise and fast to execute.
+lr = LogisticRegressionCV(
+    Cs=np.logspace(-6, 6, 101), cv=10, scoring="neg_log_loss", max_iter=1_000
+)
 gnb = GaussianNB()
 svc = NaivelyCalibratedLinearSVC(C=1.0, dual="auto")
-rfc = RandomForestClassifier()
+rfc = RandomForestClassifier(random_state=42)
 clf_list = [
-    (lr, "Logistic"),
+    (lr, "Logistic Regression"),
     (gnb, "Naive Bayes"),
     (svc, "SVC"),
     (rfc, "Random forest"),
@@ -150,60 +165,116 @@ def predict_proba(self, X):
 plt.show()
 # %%
-# :class:`~sklearn.linear_model.LogisticRegression` returns well calibrated
-# predictions as it directly optimizes log-loss. In contrast, the other methods
-# return biased probabilities, with different biases for each method:
 #
-# * :class:`~sklearn.naive_bayes.GaussianNB` tends to push
-# probabilities to 0 or 1 (see histogram). This is mainly
-# because the naive Bayes equation only provides correct estimate of
+# Analysis of the results
+# -----------------------
+#
+# :class:`~sklearn.linear_model.LogisticRegressionCV` returns reasonably well
+# calibrated predictions despite the small training set size: its reliability
+# curve is the closest to the diagonal among the four models.
+#
+# Logistic regression is trained by minimizing the log-loss, which is a
+# strictly proper scoring rule: in the limit of infinite training data,
+# strictly proper scoring rules are minimized by the model that predicts the
+# true conditional probabilities. That (hypothetical) model would therefore be
+# perfectly calibrated.
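+#
+# As a quick, illustrative sanity check of this claim, the cell below computes
+# the expected log-loss of a constant predicted probability for a binary
+# outcome whose true positive probability is 0.8: the expectation is minimized
+# by predicting exactly 0.8. The only assumption is that NumPy is available as
+# `np`, which is already the case in this example (see the `np.logspace` call
+# in the classifier definitions above).
+candidate_probabilities = np.linspace(0.01, 0.99, 99)
+expected_log_loss = -(
+    0.8 * np.log(candidate_probabilities)
+    + 0.2 * np.log(1 - candidate_probabilities)
+)
+# The minimizer on this grid is the grid point closest to 0.8.
+print(candidate_probabilities[np.argmin(expected_log_loss)])
+
+# %%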
+# However, using a proper scoring rule as training objective is not
+# sufficient to guarantee a well-calibrated model by itself: even with a very
+# large training set, logistic regression could still be poorly calibrated if
+# it were too strongly regularized or if the choice and preprocessing of input
+# features made this model misspecified (e.g. if the true decision boundary of
+# the dataset is a highly non-linear function of the input features).
+#
+# In this example, the training set was intentionally kept very small. In this
+# setting, optimizing the log-loss can still lead to poorly calibrated models
+# because of overfitting. To mitigate this, the
+# :class:`~sklearn.linear_model.LogisticRegressionCV` class was configured to
+# tune the `C` regularization parameter to also minimize the log-loss via inner
+# cross-validation so as to find the best compromise for this model in the
+# small training set setting.
+#
+# Because of the finite training set size, and because there is no guarantee
+# that the model is well-specified, we observe that the calibration curve of
+# the logistic regression model is close to but not perfectly on the diagonal.
+# The shape of the calibration curve of this model can be interpreted as
+# slightly under-confident: the predicted probabilities are a bit too close to
+# 0.5 compared to the true fraction of positive samples.
+#
+# The other methods all output less well calibrated probabilities:
+#
+# * :class:`~sklearn.naive_bayes.GaussianNB` tends to push probabilities to 0
+# or 1 (see histogram) on this particular dataset (over-confidence). This is
+# mainly because the naive Bayes equation only provides correct estimates of
 # probabilities when the assumption that features are conditionally
-# independent holds [2]_. However, features tend to be positively correlated
-# and is the case with this dataset, which contains 2 features
-# generated as random linear combinations of the informative features. These
-# correlated features are effectively being 'counted twice', resulting in
-# pushing the predicted probabilities towards 0 and 1 [3]_.
-#
-# * :class:`~sklearn.ensemble.RandomForestClassifier` shows the opposite
-# behavior: the histograms show peaks at approx. 0.2 and 0.9 probability,
-# while probabilities close to 0 or 1 are very rare. An explanation for this
-# is given by Niculescu-Mizil and Caruana [1]_: "Methods such as bagging and
-# random forests that average predictions from a base set of models can have
-# difficulty making predictions near 0 and 1 because variance in the
-# underlying base models will bias predictions that should be near zero or
-# one away from these values. Because predictions are restricted to the
-# interval [0,1], errors caused by variance tend to be one- sided near zero
-# and one. For example, if a model should predict p = 0 for a case, the only
-# way bagging can achieve this is if all bagged trees predict zero. If we add
-# noise to the trees that bagging is averaging over, this noise will cause
-# some trees to predict values larger than 0 for this case, thus moving the
-# average prediction of the bagged ensemble away from 0. We observe this
-# effect most strongly with random forests because the base-level trees
-# trained with random forests have relatively high variance due to feature
-# subsetting." As a result, the calibration curve shows a characteristic
-# sigmoid shape, indicating that the classifier is under-confident
-# and could return probabilities closer to 0 or 1.
-#
-# * To show the performance of :class:`~sklearn.svm.LinearSVC`, we naively
-# scale the output of the :term:`decision_function` into [0, 1] by applying
-# min-max scaling, since SVC does not output probabilities by default.
-# :class:`~sklearn.svm.LinearSVC` shows an
-# even more sigmoid curve than the
-# :class:`~sklearn.ensemble.RandomForestClassifier`, which is typical for
-# maximum-margin methods [1]_ as they focus on difficult to classify samples
-# that are close to the decision boundary (the support vectors).
+# independent holds [2]_. However, features can be correlated and this is the case
+# with this dataset, which contains 2 features generated as random linear
+# combinations of the informative features. These correlated features are
+# effectively being 'counted twice', resulting in pushing the predicted
+# probabilities towards 0 and 1 [3]_. Note, however, that changing the seed
+# used to generate the dataset can lead to widely varying results for the
+# naive Bayes estimator.
+#
+# * :class:`~sklearn.svm.LinearSVC` is not a natural probabilistic classifier.
+# In order to interpret its predictions as such, we naively scaled the output
+# of the :term:`decision_function` into [0, 1] by applying min-max scaling in
+# the `NaivelyCalibratedLinearSVC` wrapper class defined above. This
+# estimator shows a typical sigmoid-shaped calibration curve on this data:
+# predictions larger than 0.5 correspond to samples with an even larger
+# effective positive class fraction (above the diagonal), while predictions
+# below 0.5 correspond to even lower positive class fractions (below the
+# diagonal). Such under-confident predictions are typical for maximum-margin
+# methods [1]_.
+#
+# * :class:`~sklearn.ensemble.RandomForestClassifier`'s prediction histogram
+# shows peaks at approx. 0.2 and 0.9 probability, while probabilities close to
+# 0 or 1 are very rare. An explanation for this is given by [1]_:
+# "Methods such as bagging and random forests that average
+# predictions from a base set of models can have difficulty making
+# predictions near 0 and 1 because variance in the underlying base models
+# will bias predictions that should be near zero or one away from these
+# values. Because predictions are restricted to the interval [0, 1], errors
+# caused by variance tend to be one-sided near zero and one. For example, if
+# a model should predict p = 0 for a case, the only way bagging can achieve
+# this is if all bagged trees predict zero. If we add noise to the trees that
+# bagging is averaging over, this noise will cause some trees to predict
+# values larger than 0 for this case, thus moving the average prediction of
+# the bagged ensemble away from 0. We observe this effect most strongly with
+# random forests because the base-level trees trained with random forests
+# have relatively high variance due to feature subsetting." This effect can
+# make random forests under-confident. Despite this possible bias, note that
+# the trees themselves are fit by minimizing either the Gini or Entropy
+# criterion, both of which lead to splits that minimize proper scoring rules:
+# the Brier score or the log-loss, respectively. See the user guide for more
+# details. This can explain why this model shows a good enough calibration
+# curve on this particular example dataset. Indeed, the Random Forest model is
+# not significantly more under-confident than the Logistic Regression model.
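+#
+# As a complementary, more quantitative check of the analysis above, one can
+# also compare the models with proper scoring rules evaluated on the held-out
+# data. Such metrics summarize both calibration and discriminative power, so
+# they complement rather than replace the reliability diagrams. The sketch
+# below assumes that `clf_list` holds the (classifier, name) pairs defined
+# earlier and that `X_train`, `y_train`, `X_test` and `y_test` are the arrays
+# produced by the train/test split at the beginning of this example (the
+# exact variable names may differ); lower values are better for both metrics.
+from sklearn.metrics import brier_score_loss, log_loss
+
+for clf, name in clf_list:
+    # Re-fit on the small training set and score the predicted probabilities
+    # of the positive class on the large test set.
+    clf.fit(X_train, y_train)
+    y_prob = clf.predict_proba(X_test)[:, 1]
+    print(
+        f"{name}: log-loss={log_loss(y_test, y_prob):.3f}, "
+        f"Brier score={brier_score_loss(y_test, y_prob):.3f}"
+    )
+
+# %%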
+#
+# Feel free to re-run this example with different random seeds and other
+# dataset generation parameters to see how different the calibration plots can
+# look (a short sketch at the end of this example shows one way to regenerate
+# the dataset with a different seed). In general, Logistic Regression and
+# Random Forest will tend to be the best calibrated classifiers, while SVC
+# will often display the typical under-confident miscalibration. The naive
+# Bayes model is also often poorly calibrated, but the general shape of its
+# calibration curve can vary widely depending on the dataset.
+#
+# Finally, note that for some dataset seeds, all models are poorly calibrated,
+# even when tuning the regularization parameter as above. This is bound to
+# happen when the training set size is too small or when the model is severely
+# misspecified.
 #
 # References
 # ----------
 #
 # .. [1] `Predicting Good Probabilities with Supervised Learning
-# `_,
-# A. Niculescu-Mizil & R. Caruana, ICML 2005
+# `_, A.
+# Niculescu-Mizil & R. Caruana, ICML 2005
+#
 # .. [2] `Beyond independence: Conditions for the optimality of the simple
 # bayesian classifier
 # `_
 # Domingos, P., & Pazzani, M., Proc. 13th Intl. Conf. Machine Learning.
 # 1996.
+#
 # .. [3] `Obtaining calibrated probability estimates from decision trees and
 # naive Bayesian classifiers
 # `_
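+
+# %%
+# For instance, to explore the effect of the dataset seed mentioned above, one
+# could regenerate the data along the following lines and then re-run the
+# fitting and plotting cells. This is only an illustrative sketch: it uses the
+# dataset parameters described at the top of this example (100,000 samples, 20
+# features, 2 informative, 2 redundant, 100 training samples), but the exact
+# keyword arguments and variable names used earlier may differ slightly.
+X, y = make_classification(
+    n_samples=100_000,
+    n_features=20,
+    n_informative=2,
+    n_redundant=2,
+    random_state=7,  # try a few different seeds here
+)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, train_size=100, random_state=42
+)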