diff --git a/examples/calibration/plot_compare_calibration.py b/examples/calibration/plot_compare_calibration.py
index b2ec219c9ca1c..389e231c8f07c 100644
--- a/examples/calibration/plot_compare_calibration.py
+++ b/examples/calibration/plot_compare_calibration.py
@@ -26,8 +26,14 @@
 # We will use a synthetic binary classification dataset with 100,000 samples
 # and 20 features. Of the 20 features, only 2 are informative, 2 are
 # redundant (random combinations of the informative features) and the
-# remaining 16 are uninformative (random numbers). Of the 100,000 samples,
-# 100 will be used for model fitting and the remaining for testing.
+# remaining 16 are uninformative (random numbers).
+#
+# Of the 100,000 samples, 100 will be used for model fitting and the remaining
+# for testing. Note that this split is quite unusual: the goal is to obtain
+# stable calibration curve estimates for models that are potentially prone to
+# overfitting. In practice, one should rather use cross-validation with more
+# balanced splits, but this would make the code of this example more
+# complicated to follow.
 from sklearn.datasets import make_classification
 from sklearn.model_selection import train_test_split
@@ -86,17 +92,26 @@ def predict_proba(self, X):
 from sklearn.calibration import CalibrationDisplay
 from sklearn.ensemble import RandomForestClassifier
-from sklearn.linear_model import LogisticRegression
+from sklearn.linear_model import LogisticRegressionCV
 from sklearn.naive_bayes import GaussianNB
-# Create classifiers
-lr = LogisticRegression()
+# Define the classifiers to be compared in the study.
+#
+# Note that we use a variant of the logistic regression model that can
+# automatically tune its regularization parameter.
+#
+# For a fair comparison, we should run a hyper-parameter search for all the
+# classifiers, but we don't do it here for the sake of keeping the example
+# code concise and fast to execute.
+lr = LogisticRegressionCV(
+    Cs=np.logspace(-6, 6, 101), cv=10, scoring="neg_log_loss", max_iter=1_000
+)
 gnb = GaussianNB()
 svc = NaivelyCalibratedLinearSVC(C=1.0, dual="auto")
-rfc = RandomForestClassifier()
+rfc = RandomForestClassifier(random_state=42)
 clf_list = [
-    (lr, "Logistic"),
+    (lr, "Logistic Regression"),
     (gnb, "Naive Bayes"),
     (svc, "SVC"),
     (rfc, "Random forest"),
@@ -150,60 +165,116 @@ def predict_proba(self, X):
 plt.show()
 # %%
-# :class:`~sklearn.linear_model.LogisticRegression` returns well calibrated
-# predictions as it directly optimizes log-loss. In contrast, the other methods
-# return biased probabilities, with different biases for each method:
 #
-# * :class:`~sklearn.naive_bayes.GaussianNB` tends to push
-# probabilities to 0 or 1 (see histogram). This is mainly
-# because the naive Bayes equation only provides correct estimate of
+# Analysis of the results
+# -----------------------
+#
+# :class:`~sklearn.linear_model.LogisticRegressionCV` returns reasonably well
+# calibrated predictions despite the small training set size: its reliability
+# curve is the closest to the diagonal among the four models.
+#
+# Logistic regression is trained by minimizing the log-loss, which is a
+# strictly proper scoring rule: in the limit of infinite training data,
+# strictly proper scoring rules are minimized by the model that predicts the
+# true conditional probabilities. That (hypothetical) model would therefore be
+# perfectly calibrated.
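+#
+# As a quick, illustrative sanity check of this claim, the cell below computes
+# the expected log-loss of a constant predicted probability for a binary
+# outcome whose true positive probability is 0.8: the expectation is minimized
+# by predicting exactly 0.8. The only assumption is that NumPy is available as
+# `np`, which is already the case in this example (see the `np.logspace` call
+# in the classifier definitions above).
+candidate_probabilities = np.linspace(0.01, 0.99, 99)
+expected_log_loss = -(
+    0.8 * np.log(candidate_probabilities)
+    + 0.2 * np.log(1 - candidate_probabilities)
+)
+# The minimizer on this grid is the grid point closest to 0.8.
+print(candidate_probabilities[np.argmin(expected_log_loss)])
+
+# %%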
+# However, using a proper scoring rule as training objective is not
+# sufficient to guarantee a well-calibrated model by itself: even with a very
+# large training set, logistic regression could still be poorly calibrated if
+# it were too strongly regularized or if the choice and preprocessing of input
+# features made this model misspecified (e.g. if the true decision boundary of
+# the dataset is a highly non-linear function of the input features).
+#
+# In this example, the training set was intentionally kept very small. In this
+# setting, optimizing the log-loss can still lead to poorly calibrated models
+# because of overfitting. To mitigate this, the
+# :class:`~sklearn.linear_model.LogisticRegressionCV` class was configured to
+# tune the `C` regularization parameter to also minimize the log-loss via inner
+# cross-validation so as to find the best compromise for this model in the
+# small training set setting.
+#
+# Because of the finite training set size, and because there is no guarantee
+# that the model is well-specified, we observe that the calibration curve of
+# the logistic regression model is close to but not perfectly on the diagonal.
+# The shape of the calibration curve of this model can be interpreted as
+# slightly under-confident: the predicted probabilities are a bit too close to
+# 0.5 compared to the true fraction of positive samples.
+#
+# The other methods all output less well calibrated probabilities:
+#
+# * :class:`~sklearn.naive_bayes.GaussianNB` tends to push probabilities to 0
+# or 1 (see histogram) on this particular dataset (over-confidence). This is
+# mainly because the naive Bayes equation only provides correct estimates of
 # probabilities when the assumption that features are conditionally
-# independent holds [2]_. However, features tend to be positively correlated
-# and is the case with this dataset, which contains 2 features
-# generated as random linear combinations of the informative features. These
-# correlated features are effectively being 'counted twice', resulting in
-# pushing the predicted probabilities towards 0 and 1 [3]_.
-#
-# * :class:`~sklearn.ensemble.RandomForestClassifier` shows the opposite
-# behavior: the histograms show peaks at approx. 0.2 and 0.9 probability,
-# while probabilities close to 0 or 1 are very rare. An explanation for this
-# is given by Niculescu-Mizil and Caruana [1]_: "Methods such as bagging and
-# random forests that average predictions from a base set of models can have
-# difficulty making predictions near 0 and 1 because variance in the
-# underlying base models will bias predictions that should be near zero or
-# one away from these values. Because predictions are restricted to the
-# interval [0,1], errors caused by variance tend to be one- sided near zero
-# and one. For example, if a model should predict p = 0 for a case, the only
-# way bagging can achieve this is if all bagged trees predict zero. If we add
-# noise to the trees that bagging is averaging over, this noise will cause
-# some trees to predict values larger than 0 for this case, thus moving the
-# average prediction of the bagged ensemble away from 0. We observe this
-# effect most strongly with random forests because the base-level trees
-# trained with random forests have relatively high variance due to feature
-# subsetting." As a result, the calibration curve shows a characteristic
-# sigmoid shape, indicating that the classifier is under-confident
-# and could return probabilities closer to 0 or 1.
-#
-# * To show the performance of :class:`~sklearn.svm.LinearSVC`, we naively
-# scale the output of the :term:`decision_function` into [0, 1] by applying
-# min-max scaling, since SVC does not output probabilities by default.
-# :class:`~sklearn.svm.LinearSVC` shows an
-# even more sigmoid curve than the
-# :class:`~sklearn.ensemble.RandomForestClassifier`, which is typical for
-# maximum-margin methods [1]_ as they focus on difficult to classify samples
-# that are close to the decision boundary (the support vectors).
+# independent holds [2]_. However, features can be correlated and this is the case
+# with this dataset, which contains 2 features generated as random linear
+# combinations of the informative features. These correlated features are
+# effectively being 'counted twice', resulting in pushing the predicted
+# probabilities towards 0 and 1 [3]_. Note, however, that changing the seed
+# used to generate the dataset can lead to widely varying results for the
+# naive Bayes estimator.
+#
+# * :class:`~sklearn.svm.LinearSVC` is not a natural probabilistic classifier.
+# In order to interpret its predictions as such, we naively scaled the output
+# of the :term:`decision_function` into [0, 1] by applying min-max scaling in
+# the `NaivelyCalibratedLinearSVC` wrapper class defined above. This
+# estimator shows a typical sigmoid-shaped calibration curve on this data:
+# predictions larger than 0.5 correspond to samples with an even larger
+# effective positive class fraction (above the diagonal), while predictions
+# below 0.5 correspond to even lower positive class fractions (below the
+# diagonal). Such under-confident predictions are typical for maximum-margin
+# methods [1]_.
+#
+# * :class:`~sklearn.ensemble.RandomForestClassifier`'s prediction histogram
+# shows peaks at approx. 0.2 and 0.9 probability, while probabilities close to
+# 0 or 1 are very rare. An explanation for this is given by [1]_:
+# "Methods such as bagging and random forests that average
+# predictions from a base set of models can have difficulty making
+# predictions near 0 and 1 because variance in the underlying base models
+# will bias predictions that should be near zero or one away from these
+# values. Because predictions are restricted to the interval [0, 1], errors
+# caused by variance tend to be one-sided near zero and one. For example, if
+# a model should predict p = 0 for a case, the only way bagging can achieve
+# this is if all bagged trees predict zero. If we add noise to the trees that
+# bagging is averaging over, this noise will cause some trees to predict
+# values larger than 0 for this case, thus moving the average prediction of
+# the bagged ensemble away from 0. We observe this effect most strongly with
+# random forests because the base-level trees trained with random forests
+# have relatively high variance due to feature subsetting." This effect can
+# make random forests under-confident. Despite this possible bias, note that
+# the trees themselves are fit by minimizing either the Gini or Entropy
+# criterion, both of which lead to splits that minimize proper scoring rules:
+# the Brier score or the log-loss, respectively. See the user guide for more
+# details. This can explain why this model shows a good enough calibration
+# curve on this particular example dataset. Indeed, the Random Forest model is
+# not significantly more under-confident than the Logistic Regression model.
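+#
+# As a complementary, more quantitative check of the analysis above, one can
+# also compare the models with proper scoring rules evaluated on the held-out
+# data. Such metrics summarize both calibration and discriminative power, so
+# they complement rather than replace the reliability diagrams. The sketch
+# below assumes that `clf_list` holds the (classifier, name) pairs defined
+# earlier and that `X_train`, `y_train`, `X_test` and `y_test` are the arrays
+# produced by the train/test split at the beginning of this example (the
+# exact variable names may differ); lower values are better for both metrics.
+from sklearn.metrics import brier_score_loss, log_loss
+
+for clf, name in clf_list:
+    # Re-fit on the small training set and score the predicted probabilities
+    # of the positive class on the large test set.
+    clf.fit(X_train, y_train)
+    y_prob = clf.predict_proba(X_test)[:, 1]
+    print(
+        f"{name}: log-loss={log_loss(y_test, y_prob):.3f}, "
+        f"Brier score={brier_score_loss(y_test, y_prob):.3f}"
+    )
+
+# %%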
+#
+# Feel free to re-run this example with different random seeds and other
+# dataset generation parameters to see how different the calibration plots can
+# look (a short sketch at the end of this example shows one way to regenerate
+# the dataset with a different seed). In general, Logistic Regression and
+# Random Forest will tend to be the best calibrated classifiers, while SVC
+# will often display the typical under-confident miscalibration. The naive
+# Bayes model is also often poorly calibrated, but the general shape of its
+# calibration curve can vary widely depending on the dataset.
+#
+# Finally, note that for some dataset seeds, all models are poorly calibrated,
+# even when tuning the regularization parameter as above. This is bound to
+# happen when the training set size is too small or when the model is severely
+# misspecified.
 #
 # References
 # ----------
 #
 # .. [1] `Predicting Good Probabilities with Supervised Learning
-# `_,
-# A. Niculescu-Mizil & R. Caruana, ICML 2005
+# `_, A.
+# Niculescu-Mizil & R. Caruana, ICML 2005
+#
 # .. [2] `Beyond independence: Conditions for the optimality of the simple
 # bayesian classifier
 # `_
 # Domingos, P., & Pazzani, M., Proc. 13th Intl. Conf. Machine Learning.
 # 1996.
+#
 # .. [3] `Obtaining calibrated probability estimates from decision trees and
 # naive Bayesian classifiers
 # `_
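+
+# %%
+# For instance, to explore the effect of the dataset seed mentioned above, one
+# could regenerate the data along the following lines and then re-run the
+# fitting and plotting cells. This is only an illustrative sketch: it uses the
+# dataset parameters described at the top of this example (100,000 samples, 20
+# features, 2 informative, 2 redundant, 100 training samples), but the exact
+# keyword arguments and variable names used earlier may differ slightly.
+X, y = make_classification(
+    n_samples=100_000,
+    n_features=20,
+    n_informative=2,
+    n_redundant=2,
+    random_state=7,  # try a few different seeds here
+)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, train_size=100, random_state=42
+)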