diff --git a/examples/ensemble/plot_gradient_boosting_categorical.py b/examples/ensemble/plot_gradient_boosting_categorical.py
index e80c0fb6fdc6e..2e1132584fcc2 100644
--- a/examples/ensemble/plot_gradient_boosting_categorical.py
+++ b/examples/ensemble/plot_gradient_boosting_categorical.py
@@ -10,12 +10,12 @@
 different encoding strategies for categorical features. In particular, we will
 evaluate:
 
-- dropping the categorical features
-- using a :class:`~preprocessing.OneHotEncoder`
-- using an :class:`~preprocessing.OrdinalEncoder` and treat categories as
-  ordered, equidistant quantities
-- using an :class:`~preprocessing.OrdinalEncoder` and rely on the :ref:`native
-  category support <categorical_support_gbdt>` of the
+- "Dropped": dropping the categorical features;
+- "One Hot": using a :class:`~preprocessing.OneHotEncoder`;
+- "Ordinal": using an :class:`~preprocessing.OrdinalEncoder` and treating
+  categories as ordered, equidistant quantities;
+- "Native": using an :class:`~preprocessing.OrdinalEncoder` and relying on
+  the :ref:`native category support <categorical_support_gbdt>` of the
   :class:`~ensemble.HistGradientBoostingRegressor` estimator.
 
 We will work with the Ames Iowa Housing dataset which consists of numerical
@@ -92,6 +92,7 @@
     ("drop", make_column_selector(dtype_include="category")), remainder="passthrough"
 )
 hist_dropped = make_pipeline(dropper, HistGradientBoostingRegressor(random_state=42))
+hist_dropped
 
 # %%
 # Gradient boosting estimator with one-hot encoding
@@ -112,6 +113,7 @@
 hist_one_hot = make_pipeline(
     one_hot_encoder, HistGradientBoostingRegressor(random_state=42)
 )
+hist_one_hot
 
 # %%
 # Gradient boosting estimator with ordinal encoding
@@ -139,6 +141,7 @@
 hist_ordinal = make_pipeline(
     ordinal_encoder, HistGradientBoostingRegressor(random_state=42)
 )
+hist_ordinal
 
 # %%
 # Gradient boosting estimator with native categorical support
@@ -156,67 +159,105 @@
 hist_native = HistGradientBoostingRegressor(
     random_state=42, categorical_features="from_dtype"
 )
+hist_native
 
 # %%
 # Model comparison
 # ----------------
-# Finally, we evaluate the models using cross validation. Here we compare the
-# models performance in terms of
-# :func:`~metrics.mean_absolute_percentage_error` and fit times.
+# Here we use :term:`cross validation` to compare the models' performance in
+# terms of :func:`~metrics.mean_absolute_percentage_error` and fit times. In
+# the upcoming plots, error bars represent one standard deviation as computed
+# across folds.
+from sklearn.model_selection import cross_validate
+
+common_params = {"cv": 5, "scoring": "neg_mean_absolute_percentage_error", "n_jobs": -1}
+
+dropped_result = cross_validate(hist_dropped, X, y, **common_params)
+one_hot_result = cross_validate(hist_one_hot, X, y, **common_params)
+ordinal_result = cross_validate(hist_ordinal, X, y, **common_params)
+native_result = cross_validate(hist_native, X, y, **common_params)
+results = [
+    ("Dropped", dropped_result),
+    ("One Hot", one_hot_result),
+    ("Ordinal", ordinal_result),
+    ("Native", native_result),
+]
+
+# %%
 import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
 
-from sklearn.model_selection import cross_validate
-
-scoring = "neg_mean_absolute_percentage_error"
-n_cv_folds = 3
-
-dropped_result = cross_validate(hist_dropped, X, y, cv=n_cv_folds, scoring=scoring)
-one_hot_result = cross_validate(hist_one_hot, X, y, cv=n_cv_folds, scoring=scoring)
-ordinal_result = cross_validate(hist_ordinal, X, y, cv=n_cv_folds, scoring=scoring)
-native_result = cross_validate(hist_native, X, y, cv=n_cv_folds, scoring=scoring)
-
-
-def plot_results(figure_title):
-    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
-
-    plot_info = [
-        ("fit_time", "Fit times (s)", ax1, None),
-        ("test_score", "Mean Absolute Percentage Error", ax2, None),
-    ]
-
-    x, width = np.arange(4), 0.9
-    for key, title, ax, y_limit in plot_info:
-        items = [
-            dropped_result[key],
-            one_hot_result[key],
-            ordinal_result[key],
-            native_result[key],
-        ]
-
-        mape_cv_mean = [np.mean(np.abs(item)) for item in items]
-        mape_cv_std = [np.std(item) for item in items]
-
-        ax.bar(
-            x=x,
-            height=mape_cv_mean,
-            width=width,
-            yerr=mape_cv_std,
-            color=["C0", "C1", "C2", "C3"],
+def plot_performance_tradeoff(results, title):
+    fig, ax = plt.subplots()
+    markers = ["s", "o", "^", "x"]
+
+    for idx, (name, result) in enumerate(results):
+        test_error = -result["test_score"]
+        mean_fit_time = np.mean(result["fit_time"])
+        mean_score = np.mean(test_error)
+        std_fit_time = np.std(result["fit_time"])
+        std_score = np.std(test_error)
+
+        ax.scatter(
+            result["fit_time"],
+            test_error,
+            label=name,
+            marker=markers[idx],
+        )
+        ax.scatter(
+            mean_fit_time,
+            mean_score,
+            color="k",
+            marker=markers[idx],
         )
-        ax.set(
-            xlabel="Model",
-            title=title,
-            xticks=x,
-            xticklabels=["Dropped", "One Hot", "Ordinal", "Native"],
-            ylim=y_limit,
+        ax.errorbar(
+            x=mean_fit_time,
+            y=mean_score,
+            yerr=std_score,
+            c="k",
+            capsize=2,
         )
-    fig.suptitle(figure_title)
+        ax.errorbar(
+            x=mean_fit_time,
+            y=mean_score,
+            xerr=std_fit_time,
+            c="k",
+            capsize=2,
+        )
+
+    ax.set_xscale("log")
+
+    nticks = 7
+    x0, x1 = np.log10(ax.get_xlim())
+    ticks = np.logspace(x0, x1, nticks)
+    ax.set_xticks(ticks)
+    ax.xaxis.set_major_formatter(ticker.FormatStrFormatter("%1.1e"))
+    ax.minorticks_off()
+    ax.annotate(
+        " best\nmodels",
+        xy=(0.05, 0.05),
+        xycoords="axes fraction",
+        xytext=(0.1, 0.15),
+        textcoords="axes fraction",
+        arrowprops=dict(arrowstyle="->", lw=1.5),
+    )
+    ax.set_xlabel("Time to fit (seconds)")
+    ax.set_ylabel("Mean Absolute Percentage Error")
+    ax.set_title(title)
+    ax.legend()
+    plt.show()
 
 
-plot_results("Gradient Boosting on Ames Housing")
+
+plot_performance_tradeoff(results, "Gradient Boosting on Ames Housing")
 
 # %%
+# In the plot above, the "best models" are those that are closer to the
+# lower-left corner, as indicated by the arrow: those models combine fast
+# fitting with low prediction error.
+#
 # We see that the model with one-hot-encoded data is by far the slowest. This
 # is to be expected, since one-hot-encoding creates one additional feature per
 # category value (for each categorical feature), and thus more split points
 # need to be considered during fit time.
 
@@ -264,14 +305,21 @@ def plot_results(figure_title):
     histgradientboostingregressor__max_iter=15,
 )
 
-dropped_result = cross_validate(hist_dropped, X, y, cv=n_cv_folds, scoring=scoring)
-one_hot_result = cross_validate(hist_one_hot, X, y, cv=n_cv_folds, scoring=scoring)
-ordinal_result = cross_validate(hist_ordinal, X, y, cv=n_cv_folds, scoring=scoring)
-native_result = cross_validate(hist_native, X, y, cv=n_cv_folds, scoring=scoring)
-
-plot_results("Gradient Boosting on Ames Housing (few and small trees)")
+dropped_result = cross_validate(hist_dropped, X, y, **common_params)
+one_hot_result = cross_validate(hist_one_hot, X, y, **common_params)
+ordinal_result = cross_validate(hist_ordinal, X, y, **common_params)
+native_result = cross_validate(hist_native, X, y, **common_params)
+results_underfit = [
+    ("Dropped", dropped_result),
+    ("One Hot", one_hot_result),
+    ("Ordinal", ordinal_result),
+    ("Native", native_result),
+]
 
-plt.show()
+# %%
+plot_performance_tradeoff(
+    results_underfit, "Gradient Boosting on Ames Housing (few and shallow trees)"
+)
 
 # %%
 # The results for these under-fitting models confirm our previous intuition:
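
For reference, below is a minimal, self-contained sketch of the cross-validation
comparison pattern this diff introduces. It is not part of the diff: the
make_regression data and the two HistGradientBoostingRegressor configurations
("Shallow", "Default") are illustrative stand-ins for the example's four Ames
Housing pipelines; only the cross_validate setup and the fit-time versus error
plot mirror the changed code.

import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import make_regression
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_validate

# Synthetic stand-in for the Ames Housing data; shift the target away from
# zero so that MAPE stays well-behaved.
X, y = make_regression(n_samples=500, n_features=10, noise=10, random_state=0)
y = y - y.min() + 100

# Same cross-validation setup as the diff: 5 folds, negated MAPE scoring.
common_params = {"cv": 5, "scoring": "neg_mean_absolute_percentage_error", "n_jobs": -1}

models = [
    ("Shallow", HistGradientBoostingRegressor(max_depth=2, random_state=42)),
    ("Default", HistGradientBoostingRegressor(random_state=42)),
]

fig, ax = plt.subplots()
for name, model in models:
    result = cross_validate(model, X, y, **common_params)
    test_error = -result["test_score"]  # the scorer is negated, so flip the sign
    # One point per fold, plus a black mean marker with +/- 1 std error bars,
    # mirroring plot_performance_tradeoff in the diff.
    ax.scatter(result["fit_time"], test_error, label=name)
    ax.errorbar(
        x=np.mean(result["fit_time"]),
        y=np.mean(test_error),
        xerr=np.std(result["fit_time"]),
        yerr=np.std(test_error),
        c="k",
        capsize=2,
    )
ax.set_xscale("log")
ax.set_xlabel("Time to fit (seconds)")
ax.set_ylabel("Mean Absolute Percentage Error")
ax.legend()
plt.show()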