diff --git a/doc/modules/calibration.rst b/doc/modules/calibration.rst
index d0a9737dac612..1fcd1d501d100 100644
--- a/doc/modules/calibration.rst
+++ b/doc/modules/calibration.rst
@@ -30,11 +30,25 @@ approximately 80% actually belong to the positive class.
 Calibration curves
 ------------------
 
-The following plot compares how well the probabilistic predictions of
-different classifiers are calibrated, using :func:`calibration_curve`.
+Calibration curves (also known as reliability diagrams) compare how well the
+probabilistic predictions of a binary classifier are calibrated. They plot
+the true frequency of the positive label against its predicted probability,
+for binned predictions.
 The x axis represents the average predicted probability in each bin. The
 y axis is the *fraction of positives*, i.e. the proportion of samples whose
-class is the positive class (in each bin).
+class is the positive class (in each bin). The top calibration curve plot
+is created with :func:`CalibrationDisplay.from_estimator`, which uses
+:func:`calibration_curve` to calculate the per-bin average predicted
+probability and fraction of positives.
+:func:`CalibrationDisplay.from_estimator`
+takes as input a fitted classifier, which is used to calculate the predicted
+probabilities. The classifier thus must have a :term:`predict_proba` method.
+For the few classifiers that do not have a :term:`predict_proba` method, it is
+possible to use :class:`CalibratedClassifierCV` to calibrate the classifier
+outputs to probabilities.
+
+The bottom histogram gives some insight into the behavior of each classifier
+by showing the number of samples in each predicted probability bin.
 
 .. figure:: ../auto_examples/calibration/images/sphx_glr_plot_compare_calibration_001.png
    :target: ../auto_examples/calibration/plot_compare_calibration.html
@@ -161,6 +175,8 @@ mean a better calibrated model.
 :class:`CalibratedClassifierCV` supports the use of two 'calibration'
 regressors: 'sigmoid' and 'isotonic'.
 
+.. _sigmoid_regressor:
+
 Sigmoid
 ^^^^^^^
 
diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 3edd8adee8191..3848a189c35d4 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -1123,7 +1123,7 @@ See the :ref:`visualizations` section of the user guide for further details.
    metrics.DetCurveDisplay
    metrics.PrecisionRecallDisplay
    metrics.RocCurveDisplay
-
+   calibration.CalibrationDisplay
 
 .. _mixture_ref:
 
diff --git a/doc/visualizations.rst b/doc/visualizations.rst
index a2d40408b403f..65612b2787d84 100644
--- a/doc/visualizations.rst
+++ b/doc/visualizations.rst
@@ -65,6 +65,7 @@ values of the curves.
   * :ref:`sphx_glr_auto_examples_miscellaneous_plot_roc_curve_visualization_api.py`
   * :ref:`sphx_glr_auto_examples_miscellaneous_plot_partial_dependence_visualization_api.py`
   * :ref:`sphx_glr_auto_examples_miscellaneous_plot_display_object_visualization.py`
+  * :ref:`sphx_glr_auto_examples_calibration_plot_compare_calibration.py`
 
 Available Plotting Utilities
 ============================
@@ -90,6 +91,7 @@ Display Objects
 
 .. autosummary::
 
+   calibration.CalibrationDisplay
    inspection.PartialDependenceDisplay
    metrics.ConfusionMatrixDisplay
    metrics.DetCurveDisplay
diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index d9f63cc62add4..001c3350fb056 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -152,6 +152,9 @@ Changelog
   :class:`calibration.CalibratedClassifierCV` can now properly be used on
   prefitted pipelines. :pr:`19641` by :user:`Alek Lefebvre `.
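For readers who want to try the display object directly, here is a minimal usage sketch of the :func:`CalibrationDisplay.from_estimator` API described in the user guide section above; the dataset and parameter values are arbitrary and only illustrative::

    import matplotlib.pyplot as plt
    from sklearn.calibration import CalibrationDisplay
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=10_000, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    clf = LogisticRegression().fit(X_train, y_train)
    # Bins the predicted probabilities of the positive class and plots the
    # fraction of positives against the mean predicted probability per bin.
    disp = CalibrationDisplay.from_estimator(clf, X_test, y_test, n_bins=10)
    plt.show()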
+- |Feature| :func:`calibration.CalibrationDisplay` added to plot + calibration curves. :pr:`17443` by :user:`Lucy Liu `. + - |Fix| Fixed an error when using a ::class:`ensemble.VotingClassifier` as `base_estimator` in ::class:`calibration.CalibratedClassifierCV`. :pr:`20087` by :user:`Clément Fauchereau `. diff --git a/examples/calibration/plot_calibration_curve.py b/examples/calibration/plot_calibration_curve.py index b397bb79a2ba2..d4bfda5a3a55d 100644 --- a/examples/calibration/plot_calibration_curve.py +++ b/examples/calibration/plot_calibration_curve.py @@ -5,131 +5,305 @@ When performing classification one often wants to predict not only the class label, but also the associated probability. This probability gives some -kind of confidence on the prediction. This example demonstrates how to display -how well calibrated the predicted probabilities are and how to calibrate an -uncalibrated classifier. - -The experiment is performed on an artificial dataset for binary classification -with 100,000 samples (1,000 of them are used for model fitting) with 20 -features. Of the 20 features, only 2 are informative and 10 are redundant. The -first figure shows the estimated probabilities obtained with logistic -regression, Gaussian naive Bayes, and Gaussian naive Bayes with both isotonic -calibration and sigmoid calibration. The calibration performance is evaluated -with Brier score, reported in the legend (the smaller the better). One can -observe here that logistic regression is well calibrated while raw Gaussian -naive Bayes performs very badly. This is because of the redundant features -which violate the assumption of feature-independence and result in an overly -confident classifier, which is indicated by the typical transposed-sigmoid -curve. - -Calibration of the probabilities of Gaussian naive Bayes with isotonic -regression can fix this issue as can be seen from the nearly diagonal -calibration curve. Sigmoid calibration also improves the brier score slightly, -albeit not as strongly as the non-parametric isotonic regression. This can be -attributed to the fact that we have plenty of calibration data such that the -greater flexibility of the non-parametric model can be exploited. - -The second figure shows the calibration curve of a linear support-vector -classifier (LinearSVC). LinearSVC shows the opposite behavior as Gaussian -naive Bayes: the calibration curve has a sigmoid curve, which is typical for -an under-confident classifier. In the case of LinearSVC, this is caused by the -margin property of the hinge loss, which lets the model focus on hard samples -that are close to the decision boundary (the support vectors). - -Both kinds of calibration can fix this issue and yield nearly identical -results. This shows that sigmoid calibration can deal with situations where -the calibration curve of the base classifier is sigmoid (e.g., for LinearSVC) -but not where it is transposed-sigmoid (e.g., Gaussian naive Bayes). +kind of confidence on the prediction. This example demonstrates how to +visualize how well calibrated the predicted probabilities are using calibration +curves, also known as reliability diagrams. Calibration of an uncalibrated +classifier will also be demonstrated. """ print(__doc__) +# %% # Author: Alexandre Gramfort # Jan Hendrik Metzen -# License: BSD Style. +# License: BSD 3 clause. +# +# Dataset +# ------- +# +# We will use a synthetic binary classification dataset with 100,000 samples +# and 20 features. 
Of the 20 features, only 2 are informative, 10 are +# redundant (random combinations of the informative features) and the +# remaining 8 are uninformative (random numbers). Of the 100,000 samples, 1,000 +# will be used for model fitting and the rest for testing. + +from sklearn.datasets import make_classification +from sklearn.model_selection import train_test_split + +X, y = make_classification(n_samples=100_000, n_features=20, n_informative=2, + n_redundant=10, random_state=42) + +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.99, + random_state=42) + +# %% +# Calibration curves +# ------------------ +# +# Gaussian Naive Bayes +# ^^^^^^^^^^^^^^^^^^^^ +# +# First, we will compare: +# +# * :class:`~sklearn.linear_model.LogisticRegression` (used as baseline +# since very often, properly regularized logistic regression is well +# calibrated by default thanks to the use of the log-loss) +# * Uncalibrated :class:`~sklearn.naive_bayes.GaussianNB` +# * :class:`~sklearn.naive_bayes.GaussianNB` with isotonic and sigmoid +# calibration (see :ref:`User Guide `) +# +# Calibration curves for all 4 conditions are plotted below, with the average +# predicted probability for each bin on the x-axis and the fraction of positive +# classes in each bin on the y-axis. import matplotlib.pyplot as plt +from matplotlib.gridspec import GridSpec -from sklearn import datasets -from sklearn.naive_bayes import GaussianNB -from sklearn.svm import LinearSVC +from sklearn.calibration import CalibratedClassifierCV, CalibrationDisplay from sklearn.linear_model import LogisticRegression -from sklearn.metrics import (brier_score_loss, precision_score, recall_score, - f1_score) -from sklearn.calibration import CalibratedClassifierCV, calibration_curve -from sklearn.model_selection import train_test_split +from sklearn.naive_bayes import GaussianNB +lr = LogisticRegression(C=1.) 
+gnb = GaussianNB() +gnb_isotonic = CalibratedClassifierCV(gnb, cv=2, method='isotonic') +gnb_sigmoid = CalibratedClassifierCV(gnb, cv=2, method='sigmoid') -# Create dataset of classification task with many redundant and few -# informative features -X, y = datasets.make_classification(n_samples=100000, n_features=20, - n_informative=2, n_redundant=10, - random_state=42) +clf_list = [(lr, 'Logistic'), + (gnb, 'Naive Bayes'), + (gnb_isotonic, 'Naive Bayes + Isotonic'), + (gnb_sigmoid, 'Naive Bayes + Sigmoid')] -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.99, - random_state=42) +# %% +fig = plt.figure(figsize=(10, 10)) +gs = GridSpec(4, 2) +colors = plt.cm.get_cmap('Dark2') + +ax_calibration_curve = fig.add_subplot(gs[:2, :2]) +calibration_displays = {} +for i, (clf, name) in enumerate(clf_list): + clf.fit(X_train, y_train) + display = CalibrationDisplay.from_estimator( + clf, X_test, y_test, n_bins=10, name=name, ax=ax_calibration_curve, + color=colors(i) + ) + calibration_displays[name] = display + +ax_calibration_curve.grid() +ax_calibration_curve.set_title('Calibration plots (Naive Bayes)') + +# Add histogram +grid_positions = [(2, 0), (2, 1), (3, 0), (3, 1)] +for i, (_, name) in enumerate(clf_list): + row, col = grid_positions[i] + ax = fig.add_subplot(gs[row, col]) + + ax.hist( + calibration_displays[name].y_prob, range=(0, 1), bins=10, label=name, + color=colors(i) + ) + ax.set(title=name, xlabel="Mean predicted probability", ylabel="Count") + +plt.tight_layout() +plt.show() + +# %% +# Uncalibrated :class:`~sklearn.naive_bayes.GaussianNB` is poorly calibrated +# because of +# the redundant features which violate the assumption of feature-independence +# and result in an overly confident classifier, which is indicated by the +# typical transposed-sigmoid curve. Calibration of the probabilities of +# :class:`~sklearn.naive_bayes.GaussianNB` with :ref:`isotonic` can fix +# this issue as can be seen from the nearly diagonal calibration curve. +# :ref:sigmoid regression `` also improves calibration +# slightly, +# albeit not as strongly as the non-parametric isotonic regression. This can be +# attributed to the fact that we have plenty of calibration data such that the +# greater flexibility of the non-parametric model can be exploited. +# +# Below we will make a quantitative analysis considering several classification +# metrics: :ref:`brier_score_loss`, :ref:`log_loss`, +# :ref:`precision, recall, F1 score ` and +# :ref:`ROC AUC `. + +from collections import defaultdict + +import pandas as pd + +from sklearn.metrics import (precision_score, recall_score, f1_score, + brier_score_loss, log_loss, roc_auc_score) + +scores = defaultdict(list) +for i, (clf, name) in enumerate(clf_list): + clf.fit(X_train, y_train) + y_prob = clf.predict_proba(X_test) + y_pred = clf.predict(X_test) + scores["Classifier"].append(name) + + for metric in [brier_score_loss, log_loss]: + score_name = metric.__name__.replace("_", " ").replace("score", "").capitalize() + scores[score_name].append(metric(y_test, y_prob[:, 1])) + + for metric in [precision_score, recall_score, f1_score, roc_auc_score]: + score_name = metric.__name__.replace("_", " ").replace("score", "").capitalize() + scores[score_name].append(metric(y_test, y_pred)) + + score_df = pd.DataFrame(scores).set_index("Classifier") + score_df.round(decimals=3) +score_df -def plot_calibration_curve(est, name, fig_index): - """Plot calibration curve for est w/o and with calibration. 
""" - # Calibrated with isotonic calibration - isotonic = CalibratedClassifierCV(est, cv=2, method='isotonic') +# %% +# Notice that although calibration improves the :ref:`brier_score_loss` (a +# metric composed +# of calibration term and refinement term) and :ref:`log_loss`, it does not +# significantly alter the prediction accuracy measures (precision, recall and +# F1 score). +# This is because calibration should not significantly change prediction +# probabilities at the location of the decision threshold (at x = 0.5 on the +# graph). Calibration should however, make the predicted probabilities more +# accurate and thus more useful for making allocation decisions under +# uncertainty. +# Further, ROC AUC, should not change at all because calibration is a +# monotonic transformation. Indeed, no rank metrics are affected by +# calibration. +# +# Linear support vector classifier +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# Next, we will compare: +# +# * :class:`~sklearn.linear_model.LogisticRegression` (baseline) +# * Uncalibrated :class:`~sklearn.svm.LinearSVC`. Since SVC does not output +# probabilities by default, we naively scale the output of the +# :term:`decision_function` into [0, 1] by applying min-max scaling. +# * :class:`~sklearn.svm.LinearSVC` with isotonic and sigmoid +# calibration (see :ref:`User Guide `) - # Calibrated with sigmoid calibration - sigmoid = CalibratedClassifierCV(est, cv=2, method='sigmoid') +import numpy as np + +from sklearn.svm import LinearSVC - # Logistic regression with no calibration as baseline - lr = LogisticRegression(C=1.) - fig = plt.figure(fig_index, figsize=(10, 10)) - ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2, fig=fig) - ax2 = plt.subplot2grid((3, 1), (2, 0), fig=fig) +class NaivelyCalibratedLinearSVC(LinearSVC): + """LinearSVC with `predict_proba` method that naively scales + `decision_function` output for binary classification.""" - ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated") - for clf, name in [(lr, 'Logistic'), - (est, name), - (isotonic, name + ' + Isotonic'), - (sigmoid, name + ' + Sigmoid')]: - clf.fit(X_train, y_train) - y_pred = clf.predict(X_test) - if hasattr(clf, "predict_proba"): - prob_pos = clf.predict_proba(X_test)[:, 1] - else: # use decision function - prob_pos = clf.decision_function(X_test) - prob_pos = \ - (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min()) + def fit(self, X, y): + super().fit(X, y) + df = self.decision_function(X) + self.df_min_ = df.min() + self.df_max_ = df.max() - clf_score = brier_score_loss(y_test, prob_pos, pos_label=y.max()) - print("%s:" % name) - print("\tBrier: %1.3f" % (clf_score)) - print("\tPrecision: %1.3f" % precision_score(y_test, y_pred)) - print("\tRecall: %1.3f" % recall_score(y_test, y_pred)) - print("\tF1: %1.3f\n" % f1_score(y_test, y_pred)) + def predict_proba(self, X): + """Min-max scale output of `decision_function` to [0, 1].""" + df = self.decision_function(X) + calibrated_df = (df - self.df_min_) / (self.df_max_ - self.df_min_) + proba_pos_class = np.clip(calibrated_df, 0, 1) + proba_neg_class = 1 - proba_pos_class + proba = np.c_[proba_neg_class, proba_pos_class] + return proba - fraction_of_positives, mean_predicted_value = \ - calibration_curve(y_test, prob_pos, n_bins=10) - ax1.plot(mean_predicted_value, fraction_of_positives, "s-", - label="%s (%1.3f)" % (name, clf_score)) +# %% - ax2.hist(prob_pos, range=(0, 1), bins=10, label=name, - histtype="step", lw=2) +lr = LogisticRegression(C=1.) 
+svc = NaivelyCalibratedLinearSVC(max_iter=10_000) +svc_isotonic = CalibratedClassifierCV(svc, cv=2, method='isotonic') +svc_sigmoid = CalibratedClassifierCV(svc, cv=2, method='sigmoid') - ax1.set_ylabel("Fraction of positives") - ax1.set_ylim([-0.05, 1.05]) - ax1.legend(loc="lower right") - ax1.set_title('Calibration plots (reliability curve)') +clf_list = [(lr, 'Logistic'), + (svc, 'SVC'), + (svc_isotonic, 'SVC + Isotonic'), + (svc_sigmoid, 'SVC + Sigmoid')] - ax2.set_xlabel("Mean predicted value") - ax2.set_ylabel("Count") - ax2.legend(loc="upper center", ncol=2) +# %% +fig = plt.figure(figsize=(10, 10)) +gs = GridSpec(4, 2) - plt.tight_layout() +ax_calibration_curve = fig.add_subplot(gs[:2, :2]) +calibration_displays = {} +for i, (clf, name) in enumerate(clf_list): + clf.fit(X_train, y_train) + display = CalibrationDisplay.from_estimator( + clf, X_test, y_test, n_bins=10, name=name, ax=ax_calibration_curve, + color=colors(i) + ) + calibration_displays[name] = display +ax_calibration_curve.grid() +ax_calibration_curve.set_title('Calibration plots (SVC)') -# Plot calibration curve for Gaussian Naive Bayes -plot_calibration_curve(GaussianNB(), "Naive Bayes", 1) +# Add histogram +grid_positions = [(2, 0), (2, 1), (3, 0), (3, 1)] +for i, (_, name) in enumerate(clf_list): + row, col = grid_positions[i] + ax = fig.add_subplot(gs[row, col]) -# Plot calibration curve for Linear SVC -plot_calibration_curve(LinearSVC(max_iter=10000), "SVC", 2) + ax.hist( + calibration_displays[name].y_prob, range=(0, 1), bins=10, label=name, + color=colors(i) + ) + ax.set(title=name, xlabel="Mean predicted probability", ylabel="Count") +plt.tight_layout() plt.show() + +# %% +# :class:`~sklearn.svm.LinearSVC` shows the opposite +# behavior to :class:`~sklearn.naive_bayes.GaussianNB`; the calibration +# curve has a sigmoid shape, which is typical for an under-confident +# classifier. In the case of :class:`~sklearn.svm.LinearSVC`, this is caused +# by the margin property of the hinge loss, which focuses on samples that are +# close to the decision boundary (support vectors). Samples that are far +# away from the decision boundary do not impact the hinge loss. It thus makes +# sense that :class:`~sklearn.svm.LinearSVC` does not try to separate samples +# in the high confidence region regions. This leads to flatter calibration +# curves near 0 and 1 and is empirically shown with a variety of datasets +# in Niculescu-Mizil & Caruana [1]_. +# +# Both kinds of calibration (sigmoid and isotonic) can fix this issue and +# yield similar results. +# +# As before, we show the :ref:`brier_score_loss`, :ref:`log_loss`, +# :ref:`precision, recall, F1 score ` and +# :ref:`ROC AUC `. 
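The earlier claim that rank-based metrics such as ROC AUC are untouched by calibration can be checked directly: any strictly increasing transformation of the scores leaves their ranking, and hence the ROC AUC, unchanged. A minimal self-contained check with random labels and scores (unrelated to the dataset used in this example)::

    import numpy as np
    from sklearn.metrics import roc_auc_score

    rng = np.random.RandomState(0)
    y_true = rng.randint(0, 2, size=1_000)   # arbitrary binary labels
    scores = rng.rand(1_000)                 # arbitrary scores in [0, 1]
    # A strictly increasing remapping (here a sigmoid) preserves the ranking
    # of the scores, so the ROC AUC is exactly the same.
    remapped = 1.0 / (1.0 + np.exp(-5.0 * (scores - 0.5)))
    assert np.isclose(roc_auc_score(y_true, scores),
                      roc_auc_score(y_true, remapped))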
+ +scores = defaultdict(list) +for i, (clf, name) in enumerate(clf_list): + clf.fit(X_train, y_train) + y_prob = clf.predict_proba(X_test) + y_pred = clf.predict(X_test) + scores["Classifier"].append(name) + + for metric in [brier_score_loss, log_loss]: + score_name = metric.__name__.replace("_", " ").replace("score", "").capitalize() + scores[score_name].append(metric(y_test, y_prob[:, 1])) + + for metric in [precision_score, recall_score, f1_score, roc_auc_score]: + score_name = metric.__name__.replace("_", " ").replace("score", "").capitalize() + scores[score_name].append(metric(y_test, y_pred)) + + score_df = pd.DataFrame(scores).set_index("Classifier") + score_df.round(decimals=3) + +score_df + +# %% +# As with :class:`~sklearn.naive_bayes.GaussianNB` above, calibration improves +# both :ref:`brier_score_loss` and :ref:`log_loss` but does not alter the +# prediction accuracy measures (precision, recall and F1 score) much. +# +# Summary +# ------- +# +# Parametric sigmoid calibration can deal with situations where the calibration +# curve of the base classifier is sigmoid (e.g., for +# :class:`~sklearn.svm.LinearSVC`) but not where it is transposed-sigmoid +# (e.g., :class:`~sklearn.naive_bayes.GaussianNB`). Non-parametric +# isotonic calibration can deal with both situations but may require more +# data to produce good results. +# +# References +# ---------- +# +# .. [1] `Predicting Good Probabilities with Supervised Learning +# `_, +# A. Niculescu-Mizil & R. Caruana, ICML 2005 diff --git a/examples/calibration/plot_compare_calibration.py b/examples/calibration/plot_compare_calibration.py index a8599aecc16af..7ee4eaf4da7df 100644 --- a/examples/calibration/plot_compare_calibration.py +++ b/examples/calibration/plot_compare_calibration.py @@ -4,119 +4,192 @@ ======================================== Well calibrated classifiers are probabilistic classifiers for which the output -of the predict_proba method can be directly interpreted as a confidence level. -For instance a well calibrated (binary) classifier should classify the samples -such that among the samples to which it gave a predict_proba value close to -0.8, approx. 80% actually belong to the positive class. - -LogisticRegression returns well calibrated predictions as it directly -optimizes log-loss. In contrast, the other methods return biased probabilities, -with different biases per method: - -* GaussianNaiveBayes tends to push probabilities to 0 or 1 (note the counts in - the histograms). This is mainly because it makes the assumption that features - are conditionally independent given the class, which is not the case in this - dataset which contains 2 redundant features. - -* RandomForestClassifier shows the opposite behavior: the histograms show - peaks at approx. 0.2 and 0.9 probability, while probabilities close to 0 or 1 - are very rare. An explanation for this is given by Niculescu-Mizil and Caruana - [1]_: "Methods such as bagging and random forests that average predictions - from a base set of models can have difficulty making predictions near 0 and 1 - because variance in the underlying base models will bias predictions that - should be near zero or one away from these values. Because predictions are - restricted to the interval [0,1], errors caused by variance tend to be one- - sided near zero and one. For example, if a model should predict p = 0 for a - case, the only way bagging can achieve this is if all bagged trees predict - zero. 
If we add noise to the trees that bagging is averaging over, this noise - will cause some trees to predict values larger than 0 for this case, thus - moving the average prediction of the bagged ensemble away from 0. We observe - this effect most strongly with random forests because the base-level trees - trained with random forests have relatively high variance due to feature - subsetting." As a result, the calibration curve shows a characteristic - sigmoid shape, indicating that the classifier could trust its "intuition" - more and return probabilities closer to 0 or 1 typically. - -* Support Vector Classification (SVC) shows an even more sigmoid curve as - the RandomForestClassifier, which is typical for maximum-margin methods - (compare Niculescu-Mizil and Caruana [1]_), which focus on hard samples - that are close to the decision boundary (the support vectors). - -.. topic:: References: - - .. [1] Predicting Good Probabilities with Supervised Learning, - A. Niculescu-Mizil & R. Caruana, ICML 2005 +of :term:`predict_proba` can be directly interpreted as a confidence level. +For instance, a well calibrated (binary) classifier should classify the samples +such that for the samples to which it gave a :term:`predict_proba` value close +to 0.8, approximately 80% actually belong to the positive class. + +In this example we will compare the calibration of four different +models: :ref:`Logistic_regression`, :ref:`gaussian_naive_bayes`, +:ref:`Random Forest Classifier ` and :ref:`Linear SVM +`. """ -print(__doc__) +# %% # Author: Jan Hendrik Metzen -# License: BSD Style. +# License: BSD 3 clause. +# +# Dataset +# ------- +# +# We will use a synthetic binary classification dataset with 100,000 samples +# and 20 features. Of the 20 features, only 2 are informative, 2 are +# redundant (random combinations of the informative features) and the +# remaining 16 are uninformative (random numbers). Of the 100,000 samples, +# 100 will be used for model fitting and the remaining for testing. + +from sklearn.datasets import make_classification +from sklearn.model_selection import train_test_split + +X, y = make_classification( + n_samples=100_000, n_features=20, n_informative=2, n_redundant=2, + random_state=42 +) -import numpy as np -np.random.seed(0) +train_samples = 100 # Samples used for training the models +X_train, X_test, y_train, y_test = train_test_split( + X, y, shuffle=False, test_size=100_000 - train_samples, +) + +# %% +# Calibration curves +# ------------------ +# +# Below, we train each of the four models with the small training dataset, then +# plot calibration curves (also known as reliability diagrams) using +# predicted probabilities of the test dataset. Calibration curves are created +# by binning predicted probabilities, then plotting the mean predicted +# probability in each bin against the observed frequency ('fraction of +# positives'). Below the calibration curve, we plot a histogram showing +# the distribution of the predicted probabilities or more specifically, +# the number of samples in each predicted probability bin. 
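To make the binning just described concrete, below is a rough NumPy sketch of what :func:`~sklearn.calibration.calibration_curve` computes for uniform-width bins. It is illustrative only; the library function is the reference implementation and its bin-edge handling may differ slightly::

    import numpy as np

    def manual_calibration_curve(y_true, y_prob, n_bins=10):
        """Uniform-width binning of predicted probabilities (rough sketch)."""
        bins = np.linspace(0.0, 1.0, n_bins + 1)
        # Index of the bin each predicted probability falls into.
        bin_ids = np.searchsorted(bins[1:-1], y_prob)
        bin_total = np.bincount(bin_ids, minlength=n_bins)
        bin_true = np.bincount(bin_ids, weights=y_true, minlength=n_bins)
        bin_pred = np.bincount(bin_ids, weights=y_prob, minlength=n_bins)
        nonzero = bin_total > 0
        prob_true = bin_true[nonzero] / bin_total[nonzero]  # fraction of positives
        prob_pred = bin_pred[nonzero] / bin_total[nonzero]  # mean predicted prob.
        return prob_true, prob_pred

The resulting ``(prob_pred, prob_true)`` pairs are what end up on the x and y axes of the calibration curve.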
-import matplotlib.pyplot as plt +import numpy as np -from sklearn import datasets -from sklearn.naive_bayes import GaussianNB -from sklearn.linear_model import LogisticRegression -from sklearn.ensemble import RandomForestClassifier from sklearn.svm import LinearSVC -from sklearn.calibration import calibration_curve -X, y = datasets.make_classification(n_samples=100000, n_features=20, - n_informative=2, n_redundant=2) -train_samples = 100 # Samples used for training the models +class NaivelyCalibratedLinearSVC(LinearSVC): + """LinearSVC with `predict_proba` method that naively scales + `decision_function` output.""" + + def fit(self, X, y): + super().fit(X, y) + df = self.decision_function(X) + self.df_min_ = df.min() + self.df_max_ = df.max() -X_train = X[:train_samples] -X_test = X[train_samples:] -y_train = y[:train_samples] -y_test = y[train_samples:] + def predict_proba(self, X): + """Min-max scale output of `decision_function` to [0,1].""" + df = self.decision_function(X) + calibrated_df = (df - self.df_min_) / (self.df_max_ - self.df_min_) + proba_pos_class = np.clip(calibrated_df, 0, 1) + proba_neg_class = 1 - proba_pos_class + proba = np.c_[proba_neg_class, proba_pos_class] + return proba + + +# %% + +from sklearn.calibration import CalibrationDisplay +from sklearn.ensemble import RandomForestClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.naive_bayes import GaussianNB # Create classifiers lr = LogisticRegression() gnb = GaussianNB() -svc = LinearSVC(C=1.0) +svc = NaivelyCalibratedLinearSVC(C=1.0) rfc = RandomForestClassifier() +clf_list = [(lr, 'Logistic'), + (gnb, 'Naive Bayes'), + (svc, 'SVC'), + (rfc, 'Random forest')] + +# %% -# ############################################################################# -# Plot calibration plots +import matplotlib.pyplot as plt +from matplotlib.gridspec import GridSpec -plt.figure(figsize=(10, 10)) -ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2) -ax2 = plt.subplot2grid((3, 1), (2, 0)) +fig = plt.figure(figsize=(10, 10)) +gs = GridSpec(4, 2) +colors = plt.cm.get_cmap('Dark2') -ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated") -for clf, name in [(lr, 'Logistic'), - (gnb, 'Naive Bayes'), - (svc, 'Support Vector Classification'), - (rfc, 'Random Forest')]: +ax_calibration_curve = fig.add_subplot(gs[:2, :2]) +calibration_displays = {} +for i, (clf, name) in enumerate(clf_list): clf.fit(X_train, y_train) - if hasattr(clf, "predict_proba"): - prob_pos = clf.predict_proba(X_test)[:, 1] - else: # use decision function - prob_pos = clf.decision_function(X_test) - prob_pos = \ - (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min()) - fraction_of_positives, mean_predicted_value = \ - calibration_curve(y_test, prob_pos, n_bins=10) - - ax1.plot(mean_predicted_value, fraction_of_positives, "s-", - label="%s" % (name, )) - - ax2.hist(prob_pos, range=(0, 1), bins=10, label=name, - histtype="step", lw=2) - -ax1.set_ylabel("Fraction of positives") -ax1.set_ylim([-0.05, 1.05]) -ax1.legend(loc="lower right") -ax1.set_title('Calibration plots (reliability curve)') - -ax2.set_xlabel("Mean predicted value") -ax2.set_ylabel("Count") -ax2.legend(loc="upper center", ncol=2) + display = CalibrationDisplay.from_estimator( + clf, X_test, y_test, n_bins=10, name=name, ax=ax_calibration_curve, + color=colors(i) + ) + calibration_displays[name] = display + +ax_calibration_curve.grid() +ax_calibration_curve.set_title('Calibration plots') + +# Add histogram +grid_positions = [(2, 0), (2, 1), (3, 0), (3, 1)] +for i, (_, 
name) in enumerate(clf_list): + row, col = grid_positions[i] + ax = fig.add_subplot(gs[row, col]) + + ax.hist( + calibration_displays[name].y_prob, range=(0, 1), bins=10, label=name, + color=colors(i) + ) + ax.set(title=name, xlabel="Mean predicted probability", ylabel="Count") plt.tight_layout() plt.show() + +# %% +# :class:`~sklearn.linear_model.LogisticRegression` returns well calibrated +# predictions as it directly optimizes log-loss. In contrast, the other methods +# return biased probabilities, with different biases for each method: +# +# * :class:`~sklearn.naive_bayes.GaussianNB` tends to push +# probabilities to 0 or 1 (see histogram). This is mainly +# because the naive Bayes equation only provides correct estimate of +# probabilities when the assumption that features are conditionally +# independent holds [2]_. However, features tend to be positively correlated +# and is the case with this dataset, which contains 2 features +# generated as random linear combinations of the informative features. These +# correlated features are effectively being 'counted twice', resulting in +# pushing the predicted probabilities towards 0 and 1 [3]_. +# +# * :class:`~sklearn.ensemble.RandomForestClassifier` shows the opposite +# behavior: the histograms show peaks at approx. 0.2 and 0.9 probability, +# while probabilities close to 0 or 1 are very rare. An explanation for this +# is given by Niculescu-Mizil and Caruana [1]_: "Methods such as bagging and +# random forests that average predictions from a base set of models can have +# difficulty making predictions near 0 and 1 because variance in the +# underlying base models will bias predictions that should be near zero or +# one away from these values. Because predictions are restricted to the +# interval [0,1], errors caused by variance tend to be one- sided near zero +# and one. For example, if a model should predict p = 0 for a case, the only +# way bagging can achieve this is if all bagged trees predict zero. If we add +# noise to the trees that bagging is averaging over, this noise will cause +# some trees to predict values larger than 0 for this case, thus moving the +# average prediction of the bagged ensemble away from 0. We observe this +# effect most strongly with random forests because the base-level trees +# trained with random forests have relatively high variance due to feature +# subsetting." As a result, the calibration curve shows a characteristic +# sigmoid shape, indicating that the classifier is under-confident +# and could return probabilities closer to 0 or 1. +# +# * To show the performance of :class:`~sklearn.svm.LinearSVC`, we naively +# scale the output of the :term:`decision_function` into [0, 1] by applying +# min-max scaling, since SVC does not output probabilities by default. +# :class:`~sklearn.svm.LinearSVC` shows an +# even more sigmoid curve than the +# :class:`~sklearn.ensemble.RandomForestClassifier`, which is typical for +# maximum-margin methods [1]_ as they focus on difficult to classify samples +# that are close to the decision boundary (the support vectors). +# +# References +# ---------- +# +# .. [1] `Predicting Good Probabilities with Supervised Learning +# `_, +# A. Niculescu-Mizil & R. Caruana, ICML 2005 +# .. [2] `Beyond independence: Conditions for the optimality of the simple +# bayesian classifier +# `_ +# Domingos, P., & Pazzani, M., Proc. 13th Intl. Conf. Machine Learning. +# 1996. +# .. 
[3] `Obtaining calibrated probability estimates from decision trees and +# naive Bayesian classifiers +# `_ +# Zadrozny, Bianca, and Charles Elkan. Icml. Vol. 1. 2001. diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 126cbbcbe9c88..95cd6731bb182 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -25,12 +25,14 @@ RegressorMixin, clone, MetaEstimatorMixin, + is_classifier, ) from .preprocessing import label_binarize, LabelEncoder from .utils import ( column_or_1d, deprecated, indexable, + check_matplotlib_support, ) from .utils.multiclass import check_classification_targets @@ -41,6 +43,7 @@ from .isotonic import IsotonicRegression from .svm import LinearSVC from .model_selection import check_cv, cross_val_predict +from .metrics._plot.base import _get_response class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): @@ -943,3 +946,351 @@ def calibration_curve(y_true, y_prob, *, normalize=False, n_bins=5, strategy="un prob_pred = bin_sums[nonzero] / bin_total[nonzero] return prob_true, prob_pred + + +class CalibrationDisplay: + """Calibration curve (also known as reliability diagram) visualization. + + It is recommended to use + :func:`~sklearn.calibration.CalibrationDisplay.from_estimator` or + :func:`~sklearn.calibration.CalibrationDisplay.from_predictions` + to create a `CalibrationDisplay`. All parameters are stored as attributes. + + Read more about calibration in the :ref:`User Guide ` and + more about the scikit-learn visualization API in :ref:`visualizations`. + + .. versionadded:: 1.0 + + Parameters + ----------- + prob_true : ndarray of shape (n_bins,) + The proportion of samples whose class is the positive class (fraction + of positives), in each bin. + + prob_pred : ndarray of shape (n_bins,) + The mean predicted probability in each bin. + + y_prob : ndarray of shape (n_samples,) + Probability estimates for the positive class, for each sample. + + name : str, default=None + Name for labeling curve. + + Attributes + ---------- + line_ : matplotlib Artist + Calibration curve. + + ax_ : matplotlib Axes + Axes with calibration curve. + + figure_ : matplotlib Figure + Figure containing the curve. + + See Also + -------- + calibration_curve : Compute true and predicted probabilities for a + calibration curve. + CalibrationDisplay.from_predictions : Plot calibration curve using true + and predicted labels. + CalibrationDisplay.from_estimator : Plot calibration curve using an + estimator and data. + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.calibration import calibration_curve, CalibrationDisplay + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> clf = LogisticRegression(random_state=0) + >>> clf.fit(X_train, y_train) + LogisticRegression(random_state=0) + >>> y_prob = clf.predict_proba(X_test)[:, 1] + >>> prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10) + >>> disp = CalibrationDisplay(prob_true, prob_pred, y_prob) + >>> disp.plot() + <...> + """ + + def __init__(self, prob_true, prob_pred, y_prob, *, name=None): + self.prob_true = prob_true + self.prob_pred = prob_pred + self.y_prob = y_prob + self.name = name + + def plot(self, *, ax=None, name=None, ref_line=True, **kwargs): + """Plot visualization. 
+ + Extra keyword arguments will be passed to + :func:`matplotlib.pyplot.plot`. + + Parameters + ---------- + ax : Matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + name : str, default=None + Name for labeling curve. + + ref_line : bool, default=True + If `True`, plots a reference line representing a perfectly + calibrated classifier. + + **kwargs : dict + Keyword arguments to be passed to :func:`matplotlib.pyplot.plot`. + + Returns + ------- + display : :class:`~sklearn.calibration.CalibrationDisplay` + Object that stores computed values. + """ + check_matplotlib_support("CalibrationDisplay.plot") + import matplotlib.pyplot as plt + + if ax is None: + fig, ax = plt.subplots() + + name = self.name if name is None else name + self.name = name + + line_kwargs = {} + if name is not None: + line_kwargs["label"] = name + line_kwargs.update(**kwargs) + + ref_line_label = "Perfectly calibrated" + existing_ref_line = ref_line_label in ax.get_legend_handles_labels()[1] + if ref_line and not existing_ref_line: + ax.plot([0, 1], [0, 1], "k:", label=ref_line_label) + self.line_ = ax.plot(self.prob_pred, self.prob_true, "s-", **line_kwargs)[0] + + if "label" in line_kwargs: + ax.legend(loc="lower right") + + ax.set(xlabel="Mean predicted probability", ylabel="Fraction of positives") + + self.ax_ = ax + self.figure_ = ax.figure + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + y, + *, + n_bins=5, + strategy="uniform", + name=None, + ref_line=True, + ax=None, + **kwargs, + ): + """Plot calibration curve using an binary classifier and data. + + Calibration curve, also known as reliability diagram, uses inputs + from a binary classifier and plots the average predicted probability + for each bin against the fraction of positive classes, on the + y-axis. + + Extra keyword arguments will be passed to + :func:`matplotlib.pyplot.plot`. + + Read more about calibration in the :ref:`User Guide ` and + more about the scikit-learn visualization API in :ref:`visualizations`. + + .. versionadded:: 1.0 + + Parameters + ---------- + estimator : estimator instance + Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline` + in which the last estimator is a classifier. The classifier must + have a :term:`predict_proba` method. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input values. + + y : array-like of shape (n_samples,) + Binary target values. + + n_bins : int, default=5 + Number of bins to discretize the [0, 1] interval into when + calculating the calibration curve. A bigger number requires more + data. + + strategy : {'uniform', 'quantile'}, default='uniform' + Strategy used to define the widths of the bins. + + - `'uniform'`: The bins have identical widths. + - `'quantile'`: The bins have the same number of samples and depend + on predicted probabilities. + + name : str, default=None + Name for labeling curve. If `None`, the name of the estimator is + used. + + ref_line : bool, default=True + If `True`, plots a reference line representing a perfectly + calibrated classifier. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + **kwargs : dict + Keyword arguments to be passed to :func:`matplotlib.pyplot.plot`. + + Returns + ------- + display : :class:`~sklearn.calibration.CalibrationDisplay`. + Object that stores computed values. 
+ + See Also + -------- + CalibrationDisplay.from_predictions : Plot calibration curve using true + and predicted labels. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.calibration import CalibrationDisplay + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> clf = LogisticRegression(random_state=0) + >>> clf.fit(X_train, y_train) + LogisticRegression(random_state=0) + >>> disp = CalibrationDisplay.from_estimator(clf, X_test, y_test) + >>> plt.show() + """ + method_name = f"{cls.__name__}.from_estimator" + check_matplotlib_support(method_name) + + if not is_classifier(estimator): + raise ValueError("'estimator' should be a fitted classifier.") + + # FIXME: `pos_label` should not be set to None + # We should allow any int or string in `calibration_curve`. + y_prob, _ = _get_response( + X, estimator, response_method="predict_proba", pos_label=None + ) + + name = name if name is not None else estimator.__class__.__name__ + return cls.from_predictions( + y, + y_prob, + n_bins=n_bins, + strategy=strategy, + name=name, + ref_line=ref_line, + ax=ax, + **kwargs, + ) + + @classmethod + def from_predictions( + cls, + y_true, + y_prob, + *, + n_bins=5, + strategy="uniform", + name=None, + ref_line=True, + ax=None, + **kwargs, + ): + """Plot calibration curve using true labels and predicted probabilities. + + Calibration curve, also known as reliability diagram, uses inputs + from a binary classifier and plots the average predicted probability + for each bin against the fraction of positive classes, on the + y-axis. + + Extra keyword arguments will be passed to + :func:`matplotlib.pyplot.plot`. + + Read more about calibration in the :ref:`User Guide ` and + more about the scikit-learn visualization API in :ref:`visualizations`. + + .. versionadded:: 1.0 + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True labels. + + y_prob : array-like of shape (n_samples,) + The predicted probabilities of the positive class. + + n_bins : int, default=5 + Number of bins to discretize the [0, 1] interval into when + calculating the calibration curve. A bigger number requires more + data. + + strategy : {'uniform', 'quantile'}, default='uniform' + Strategy used to define the widths of the bins. + + - `'uniform'`: The bins have identical widths. + - `'quantile'`: The bins have the same number of samples and depend + on predicted probabilities. + + name : str, default=None + Name for labeling curve. + + ref_line : bool, default=True + If `True`, plots a reference line representing a perfectly + calibrated classifier. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + **kwargs : dict + Keyword arguments to be passed to :func:`matplotlib.pyplot.plot`. + + Returns + ------- + display : :class:`~sklearn.calibration.CalibrationDisplay`. + Object that stores computed values. + + See Also + -------- + CalibrationDisplay.from_estimator : Plot calibration curve using an + estimator and data. 
+ + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.calibration import CalibrationDisplay + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> clf = LogisticRegression(random_state=0) + >>> clf.fit(X_train, y_train) + LogisticRegression(random_state=0) + >>> y_prob = clf.predict_proba(X_test)[:, 1] + >>> disp = CalibrationDisplay.from_predictions(y_test, y_prob) + >>> plt.show() + """ + method_name = f"{cls.__name__}.from_estimator" + check_matplotlib_support(method_name) + + prob_true, prob_pred = calibration_curve( + y_true, y_prob, n_bins=n_bins, strategy=strategy + ) + + disp = cls(prob_true=prob_true, prob_pred=prob_pred, y_prob=y_prob, name=name) + return disp.plot(ax=ax, ref_line=ref_line, **kwargs) diff --git a/sklearn/metrics/_plot/base.py b/sklearn/metrics/_plot/base.py index 103fcffbd9187..8f5552ffd6808 100644 --- a/sklearn/metrics/_plot/base.py +++ b/sklearn/metrics/_plot/base.py @@ -101,8 +101,12 @@ def _get_response(X, estimator, response_method, pos_label=None): ) if y_pred.ndim != 1: # `predict_proba` - if y_pred.shape[1] != 2: - raise ValueError(classification_error) + y_pred_shape = y_pred.shape[1] + if y_pred_shape != 2: + raise ValueError( + f"{classification_error} fit on multiclass ({y_pred_shape} classes)" + " data" + ) if pos_label is None: pos_label = estimator.classes_[1] y_pred = y_pred[:, 1] diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py index 4fe08c27fb19e..8decff0cc96d5 100644 --- a/sklearn/tests/test_calibration.py +++ b/sklearn/tests/test_calibration.py @@ -18,7 +18,7 @@ ) from sklearn.utils.extmath import softmax from sklearn.exceptions import NotFittedError -from sklearn.datasets import make_classification, make_blobs +from sklearn.datasets import make_classification, make_blobs, load_iris from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import KFold, cross_val_predict from sklearn.naive_bayes import MultinomialNB @@ -27,15 +27,18 @@ RandomForestRegressor, VotingClassifier, ) +from sklearn.linear_model import LogisticRegression, LinearRegression +from sklearn.tree import DecisionTreeClassifier from sklearn.svm import LinearSVC +from sklearn.pipeline import Pipeline, make_pipeline +from sklearn.preprocessing import StandardScaler from sklearn.isotonic import IsotonicRegression from sklearn.feature_extraction import DictVectorizer -from sklearn.pipeline import Pipeline from sklearn.impute import SimpleImputer from sklearn.metrics import brier_score_loss from sklearn.calibration import CalibratedClassifierCV, _CalibratedClassifier from sklearn.calibration import _sigmoid_calibration, _SigmoidCalibration -from sklearn.calibration import calibration_curve +from sklearn.calibration import calibration_curve, CalibrationDisplay @pytest.fixture(scope="module") @@ -618,3 +621,167 @@ def test_calibration_votingclassifier(): calib_clf = CalibratedClassifierCV(base_estimator=vote, cv="prefit") # smoke test: should not raise an error calib_clf.fit(X, y) + + +@pytest.fixture(scope="module") +def iris_data(): + return load_iris(return_X_y=True) + + +@pytest.fixture(scope="module") +def iris_data_binary(iris_data): + X, y = iris_data + return X[y < 2], y[y < 2] + + +def test_calibration_display_validation(pyplot, iris_data, 
iris_data_binary): + X, y = iris_data + X_binary, y_binary = iris_data_binary + + reg = LinearRegression().fit(X, y) + msg = "'estimator' should be a fitted classifier" + with pytest.raises(ValueError, match=msg): + CalibrationDisplay.from_estimator(reg, X, y) + + clf = LinearSVC().fit(X, y) + msg = "response method predict_proba is not defined in" + with pytest.raises(ValueError, match=msg): + CalibrationDisplay.from_estimator(clf, X, y) + + clf = LogisticRegression() + with pytest.raises(NotFittedError): + CalibrationDisplay.from_estimator(clf, X, y) + + +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +def test_calibration_display_non_binary(pyplot, iris_data, constructor_name): + X, y = iris_data + clf = DecisionTreeClassifier() + clf.fit(X, y) + y_prob = clf.predict_proba(X) + + if constructor_name == "from_estimator": + msg = "to be a binary classifier, but got" + with pytest.raises(ValueError, match=msg): + CalibrationDisplay.from_estimator(clf, X, y) + else: + msg = "y should be a 1d array, got an array of shape" + with pytest.raises(ValueError, match=msg): + CalibrationDisplay.from_predictions(y, y_prob) + + +@pytest.mark.parametrize("n_bins", [5, 10]) +@pytest.mark.parametrize("strategy", ["uniform", "quantile"]) +def test_calibration_display_compute(pyplot, iris_data_binary, n_bins, strategy): + # Ensure `CalibrationDisplay.from_predictions` and `calibration_curve` + # compute the same results. Also checks attributes of the + # CalibrationDisplay object. + X, y = iris_data_binary + + lr = LogisticRegression().fit(X, y) + + viz = CalibrationDisplay.from_estimator( + lr, X, y, n_bins=n_bins, strategy=strategy, alpha=0.8 + ) + + y_prob = lr.predict_proba(X)[:, 1] + prob_true, prob_pred = calibration_curve( + y, y_prob, n_bins=n_bins, strategy=strategy + ) + + assert_allclose(viz.prob_true, prob_true) + assert_allclose(viz.prob_pred, prob_pred) + assert_allclose(viz.y_prob, y_prob) + + assert viz.name == "LogisticRegression" + + # cannot fail thanks to pyplot fixture + import matplotlib as mpl # noqa + + assert isinstance(viz.line_, mpl.lines.Line2D) + assert viz.line_.get_alpha() == 0.8 + assert isinstance(viz.ax_, mpl.axes.Axes) + assert isinstance(viz.figure_, mpl.figure.Figure) + + assert viz.ax_.get_xlabel() == "Mean predicted probability" + assert viz.ax_.get_ylabel() == "Fraction of positives" + assert viz.line_.get_label() == "LogisticRegression" + + +def test_plot_calibration_curve_pipeline(pyplot, iris_data_binary): + # Ensure pipelines are supported by CalibrationDisplay.from_estimator + X, y = iris_data_binary + clf = make_pipeline(StandardScaler(), LogisticRegression()) + clf.fit(X, y) + viz = CalibrationDisplay.from_estimator(clf, X, y) + assert clf.__class__.__name__ in viz.line_.get_label() + assert viz.name == clf.__class__.__name__ + + +@pytest.mark.parametrize( + "name, expected_label", [(None, "_line1"), ("my_est", "my_est")] +) +def test_calibration_display_default_labels(pyplot, name, expected_label): + prob_true = np.array([0, 1, 1, 0]) + prob_pred = np.array([0.2, 0.8, 0.8, 0.4]) + y_prob = np.array([]) + + viz = CalibrationDisplay(prob_true, prob_pred, y_prob, name=name) + viz.plot() + assert viz.line_.get_label() == expected_label + + +def test_calibration_display_label_class_plot(pyplot): + # Checks that when instantiating `CalibrationDisplay` class then calling + # `plot`, `self.name` is the one given in `plot` + prob_true = np.array([0, 1, 1, 0]) + prob_pred = np.array([0.2, 0.8, 0.8, 0.4]) + y_prob = np.array([]) + + 
name = "name one" + viz = CalibrationDisplay(prob_true, prob_pred, y_prob, name=name) + assert viz.name == name + name = "name two" + viz.plot(name=name) + assert viz.name == name + assert viz.line_.get_label() == name + + +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +def test_calibration_display_name_multiple_calls( + constructor_name, pyplot, iris_data_binary +): + # Check that the `name` used when calling + # `CalibrationDisplay.from_predictions` or + # `CalibrationDisplay.from_estimator` is used when multiple + # `CalibrationDisplay.viz.plot()` calls are made. + X, y = iris_data_binary + clf_name = "my hand-crafted name" + clf = LogisticRegression().fit(X, y) + y_prob = clf.predict_proba(X)[:, 1] + + constructor = getattr(CalibrationDisplay, constructor_name) + params = (clf, X, y) if constructor_name == "from_estimator" else (y, y_prob) + + viz = constructor(*params, name=clf_name) + assert viz.name == clf_name + pyplot.close("all") + viz.plot() + assert clf_name == viz.line_.get_label() + pyplot.close("all") + clf_name = "another_name" + viz.plot(name=clf_name) + assert clf_name == viz.line_.get_label() + + +def test_calibration_display_ref_line(pyplot, iris_data_binary): + # Check that `ref_line` only appears once + X, y = iris_data_binary + lr = LogisticRegression().fit(X, y) + dt = DecisionTreeClassifier().fit(X, y) + + viz = CalibrationDisplay.from_estimator(lr, X, y) + viz2 = CalibrationDisplay.from_estimator(dt, X, y, ax=viz.ax_) + + labels = viz2.ax_.get_legend_handles_labels()[1] + assert labels.count("Perfectly calibrated") == 1