diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst
index e58796fa85b46..eac8f063be258 100644
--- a/doc/modules/decomposition.rst
+++ b/doc/modules/decomposition.rst
@@ -273,26 +273,39 @@ Exact Kernel PCA
 ----------------
 
 :class:`KernelPCA` is an extension of PCA which achieves non-linear
-dimensionality reduction through the use of kernels (see :ref:`metrics`). It
+dimensionality reduction through the use of kernels (see :ref:`metrics`) [Scholkopf1997]_. It
 has many applications including denoising, compression and structured
 prediction (kernel dependency estimation). :class:`KernelPCA` supports both
 ``transform`` and ``inverse_transform``.
 
-.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_kernel_pca_001.png
+.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_kernel_pca_002.png
    :target: ../auto_examples/decomposition/plot_kernel_pca.html
    :align: center
    :scale: 75%
 
+.. note::
+    :meth:`KernelPCA.inverse_transform` relies on a kernel ridge to learn the
+    function mapping samples from the PCA basis into the original feature
+    space [Bakir2004]_. Thus, the reconstruction obtained with
+    :meth:`KernelPCA.inverse_transform` is an approximation. See the example
+    linked below for more details.
+
 .. topic:: Examples:
 
     * :ref:`sphx_glr_auto_examples_decomposition_plot_kernel_pca.py`
 
 .. topic:: References:
 
-    * Kernel PCA was introduced in "Kernel principal component analysis"
-      Bernhard Schoelkopf, Alexander J. Smola, and Klaus-Robert Mueller. 1999.
-      In Advances in kernel methods, MIT Press, Cambridge, MA, USA 327-352.
+    .. [Scholkopf1997] Schölkopf, Bernhard, Alexander Smola, and Klaus-Robert Müller.
+       `"Kernel principal component analysis."
+       `_
+       International conference on artificial neural networks.
+       Springer, Berlin, Heidelberg, 1997.
+    .. [Bakir2004] Bakır, Gökhan H., Jason Weston, and Bernhard Schölkopf.
+       `"Learning to find pre-images."
+       `_
+       Advances in neural information processing systems 16 (2004): 449-456.
 
 .. _kPCA_Solvers:
diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py
index 3ec0958a9e602..fe6d63240523e 100644
--- a/examples/decomposition/plot_kernel_pca.py
+++ b/examples/decomposition/plot_kernel_pca.py
@@ -3,70 +3,163 @@
 Kernel PCA
 ==========
 
-This example shows that Kernel PCA is able to find a projection of the data
-that makes data linearly separable.
+This example shows the difference between Principal Component Analysis
+(:class:`~sklearn.decomposition.PCA`) and its kernelized version
+(:class:`~sklearn.decomposition.KernelPCA`).
+First, we show that :class:`~sklearn.decomposition.KernelPCA` is able to find
+a projection of the data which makes it linearly separable, which is not the
+case with :class:`~sklearn.decomposition.PCA`.
+
+Then, we show that inverting this projection is only an approximation with
+:class:`~sklearn.decomposition.KernelPCA`, while it is exact with
+:class:`~sklearn.decomposition.PCA`.
 """
 
 # Authors: Mathieu Blondel
 #          Andreas Mueller
+#          Guillaume Lemaitre
 # License: BSD 3 clause
 
-import numpy as np
+# %%
+# Projecting data: `PCA` vs. `KernelPCA`
+# --------------------------------------
+#
+# In this section, we show the advantages of using a kernel when
+# projecting data with Principal Component Analysis (PCA). We create a
+# dataset made of two nested circles.
+from sklearn.datasets import make_circles
+from sklearn.model_selection import train_test_split
+
+X, y = make_circles(n_samples=1_000, factor=0.3, noise=0.05, random_state=0)
+X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
+
+# %%
+# Let's have a quick first look at the generated dataset.
 import matplotlib.pyplot as plt
 
+_, (train_ax, test_ax) = plt.subplots(ncols=2, sharex=True, sharey=True, figsize=(8, 4))
+
+train_ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train)
+train_ax.set_ylabel("Feature #1")
+train_ax.set_xlabel("Feature #0")
+train_ax.set_title("Training data")
+
+test_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
+test_ax.set_xlabel("Feature #0")
+_ = test_ax.set_title("Testing data")
+
+# %%
+# The samples from each class cannot be linearly separated: there is no
+# straight line that can split the samples of the inner set from the outer
+# set.
+#
+# Now, we will use PCA with and without a kernel to see the effect of using
+# such a kernel. The kernel used here is a radial basis function (RBF) kernel.
 from sklearn.decomposition import PCA, KernelPCA
-from sklearn.datasets import make_circles
 
-np.random.seed(0)
-
-X, y = make_circles(n_samples=400, factor=0.3, noise=0.05)
-
-kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10)
-X_kpca = kpca.fit_transform(X)
-X_back = kpca.inverse_transform(X_kpca)
-pca = PCA()
-X_pca = pca.fit_transform(X)
-
-# Plot results
-
-plt.figure()
-plt.subplot(2, 2, 1, aspect="equal")
-plt.title("Original space")
-reds = y == 0
-blues = y == 1
-
-plt.scatter(X[reds, 0], X[reds, 1], c="red", s=20, edgecolor="k")
-plt.scatter(X[blues, 0], X[blues, 1], c="blue", s=20, edgecolor="k")
-plt.xlabel("$x_1$")
-plt.ylabel("$x_2$")
-
-X1, X2 = np.meshgrid(np.linspace(-1.5, 1.5, 50), np.linspace(-1.5, 1.5, 50))
-X_grid = np.array([np.ravel(X1), np.ravel(X2)]).T
-# projection on the first principal component (in the phi space)
-Z_grid = kpca.transform(X_grid)[:, 0].reshape(X1.shape)
-plt.contour(X1, X2, Z_grid, colors="grey", linewidths=1, origin="lower")
-
-plt.subplot(2, 2, 2, aspect="equal")
-plt.scatter(X_pca[reds, 0], X_pca[reds, 1], c="red", s=20, edgecolor="k")
-plt.scatter(X_pca[blues, 0], X_pca[blues, 1], c="blue", s=20, edgecolor="k")
-plt.title("Projection by PCA")
-plt.xlabel("1st principal component")
-plt.ylabel("2nd component")
-
-plt.subplot(2, 2, 3, aspect="equal")
-plt.scatter(X_kpca[reds, 0], X_kpca[reds, 1], c="red", s=20, edgecolor="k")
-plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c="blue", s=20, edgecolor="k")
-plt.title("Projection by KPCA")
-plt.xlabel(r"1st principal component in space induced by $\phi$")
-plt.ylabel("2nd component")
-
-plt.subplot(2, 2, 4, aspect="equal")
-plt.scatter(X_back[reds, 0], X_back[reds, 1], c="red", s=20, edgecolor="k")
-plt.scatter(X_back[blues, 0], X_back[blues, 1], c="blue", s=20, edgecolor="k")
-plt.title("Original space after inverse transform")
-plt.xlabel("$x_1$")
-plt.ylabel("$x_2$")
-
-plt.tight_layout()
-plt.show()
+pca = PCA(n_components=2)
+kernel_pca = KernelPCA(
+    n_components=None, kernel="rbf", gamma=10, fit_inverse_transform=True, alpha=0.1
+)
+
+X_test_pca = pca.fit(X_train).transform(X_test)
+X_test_kernel_pca = kernel_pca.fit(X_train).transform(X_test)
+
+# %%
+fig, (orig_data_ax, pca_proj_ax, kernel_pca_proj_ax) = plt.subplots(
+    ncols=3, figsize=(14, 4)
+)
+
+orig_data_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
+orig_data_ax.set_ylabel("Feature #1")
+orig_data_ax.set_xlabel("Feature #0")
+orig_data_ax.set_title("Testing data")
+
+pca_proj_ax.scatter(X_test_pca[:, 0], X_test_pca[:, 1], c=y_test)
+pca_proj_ax.set_ylabel("Principal component #1")
+pca_proj_ax.set_xlabel("Principal component #0")
+pca_proj_ax.set_title("Projection of testing data\n using PCA")
+
+kernel_pca_proj_ax.scatter(X_test_kernel_pca[:, 0], X_test_kernel_pca[:, 1], c=y_test)
+kernel_pca_proj_ax.set_ylabel("Principal component #1")
+kernel_pca_proj_ax.set_xlabel("Principal component #0")
+_ = kernel_pca_proj_ax.set_title("Projection of testing data\n using KernelPCA")
+
+# %%
+# We recall that PCA transforms the data linearly. Intuitively, this means that
+# the coordinate system will be centered, rescaled along each component with
+# respect to its variance, and finally rotated.
+# The data obtained from this transformation is isotropic and can now be
+# projected on its *principal components*.
+#
+# Thus, looking at the projection made using PCA (i.e. the middle figure), we
+# see that there is no change regarding the scaling; indeed, since the data
+# consists of two concentric circles centered at zero, it is already isotropic.
+# However, we can see that the data have been rotated. As a consequence, such
+# a projection would not help if we define a linear classifier to distinguish
+# samples from both classes.
+#
+# Using a kernel allows for a non-linear projection. Here, by using an RBF
+# kernel, we expect that the projection will unfold the dataset while
+# approximately preserving the relative distances of pairs of data points that
+# are close to one another in the original space.
+#
+# We observe such behaviour in the figure on the right: the samples of a given
+# class are closer to each other than to the samples of the opposite class,
+# untangling both sample sets. Now, we can use a linear classifier to separate
+# the samples from the two classes.
+#
+# Projecting into the original feature space
+# ------------------------------------------
+#
+# One particularity to keep in mind when using
+# :class:`~sklearn.decomposition.KernelPCA` is related to the reconstruction
+# (i.e. the back projection into the original feature space). With
+# :class:`~sklearn.decomposition.PCA`, the reconstruction will be exact if
+# `n_components` is the same as the number of original features.
+# This is the case in this example.
+#
+# We can investigate whether we get the original dataset when back projecting
+# with :class:`~sklearn.decomposition.KernelPCA`.
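+#
+# Before plotting, we can quantify the reconstruction quality with a mean
+# squared error. The short sketch below is only illustrative: it reuses the
+# `pca` and `kernel_pca` estimators fitted above, and the variable names are
+# arbitrary. PCA should be exact up to floating point error, unlike KernelPCA.
+import numpy as np
+
+error_pca = np.mean((X_test - pca.inverse_transform(pca.transform(X_test))) ** 2)
+error_kernel_pca = np.mean(
+    (X_test - kernel_pca.inverse_transform(kernel_pca.transform(X_test))) ** 2
+)
+print(f"Mean squared reconstruction error (PCA): {error_pca:.2e}")
+print(f"Mean squared reconstruction error (KernelPCA): {error_kernel_pca:.2e}")
+
+# %%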
+X_reconstructed_pca = pca.inverse_transform(pca.transform(X_test))
+X_reconstructed_kernel_pca = kernel_pca.inverse_transform(kernel_pca.transform(X_test))
+
+# %%
+fig, (orig_data_ax, pca_back_proj_ax, kernel_pca_back_proj_ax) = plt.subplots(
+    ncols=3, sharex=True, sharey=True, figsize=(13, 4)
+)
+
+orig_data_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
+orig_data_ax.set_ylabel("Feature #1")
+orig_data_ax.set_xlabel("Feature #0")
+orig_data_ax.set_title("Original test data")
+
+pca_back_proj_ax.scatter(X_reconstructed_pca[:, 0], X_reconstructed_pca[:, 1], c=y_test)
+pca_back_proj_ax.set_xlabel("Feature #0")
+pca_back_proj_ax.set_title("Reconstruction via PCA")
+
+kernel_pca_back_proj_ax.scatter(
+    X_reconstructed_kernel_pca[:, 0], X_reconstructed_kernel_pca[:, 1], c=y_test
+)
+kernel_pca_back_proj_ax.set_xlabel("Feature #0")
+_ = kernel_pca_back_proj_ax.set_title("Reconstruction via KernelPCA")
+
+# %%
+# While we see a perfect reconstruction with
+# :class:`~sklearn.decomposition.PCA`, we observe a different result for
+# :class:`~sklearn.decomposition.KernelPCA`.
+#
+# Indeed, :meth:`~sklearn.decomposition.KernelPCA.inverse_transform` cannot
+# rely on an analytical back-projection and thus cannot provide an exact
+# reconstruction.
+# Instead, a :class:`~sklearn.kernel_ridge.KernelRidge` is internally trained
+# to learn a mapping from the kernelized PCA basis to the original feature
+# space. This method therefore comes with an approximation introducing small
+# differences when back projecting into the original feature space.
+#
+# To improve the reconstruction using
+# :meth:`~sklearn.decomposition.KernelPCA.inverse_transform`, one can tune
+# `alpha` in :class:`~sklearn.decomposition.KernelPCA`, the regularization term
+# which controls the reliance on the training data during the training of
+# the mapping.
diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py
index ff0fd223512c1..0efcf2d4fd341 100644
--- a/sklearn/decomposition/_kernel_pca.py
+++ b/sklearn/decomposition/_kernel_pca.py
@@ -22,16 +22,16 @@
 class KernelPCA(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
-    """Kernel Principal component analysis (KPCA).
+    """Kernel Principal component analysis (KPCA) [1]_.
 
     Non-linear dimensionality reduction through the use of kernels (see
     :ref:`metrics`).
 
-    It uses the `scipy.linalg.eigh` LAPACK implementation of the full SVD or
-    the `scipy.sparse.linalg.eigsh` ARPACK implementation of the truncated SVD,
-    depending on the shape of the input data and the number of components to
-    extract. It can also use a randomized truncated SVD by the method of
-    Halko et al. 2009, see `eigen_solver`.
+    It uses the :func:`scipy.linalg.eigh` LAPACK implementation of the full SVD
+    or the :func:`scipy.sparse.linalg.eigsh` ARPACK implementation of the
+    truncated SVD, depending on the shape of the input data and the number of
+    components to extract. It can also use a randomized truncated SVD by the
+    method proposed in [3]_, see `eigen_solver`.
 
     Read more in the :ref:`User Guide `.
 
@@ -66,14 +66,16 @@ class KernelPCA(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimato
 
     fit_inverse_transform : bool, default=False
         Learn the inverse transform for non-precomputed kernels
-        (i.e. learn to find the pre-image of a point).
+        (i.e. learn to find the pre-image of a point). This method is based
+        on [2]_.
 
     eigen_solver : {'auto', 'dense', 'arpack', 'randomized'}, \
-            default='auto'
+        default='auto'
         Select eigensolver to use. If `n_components` is much
         less than the number of training samples, randomized (or arpack to a
         smaller extend) may be more efficient than the dense eigensolver.
-        Randomized SVD is performed according to the method of Halko et al.
+        Randomized SVD is performed according to the method of Halko et al.
+        [3]_.
 
         auto :
             the solver is selected by a default policy based on n_samples
@@ -92,10 +94,10 @@ class KernelPCA(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimato
             `scipy.sparse.linalg.eigsh`. It requires strictly
             0 < n_components < n_samples
         randomized :
-            run randomized SVD by the method of Halko et al. The current
+            run randomized SVD by the method of Halko et al. [3]_. The current
             implementation selects eigenvalues based on their module; therefore
             using this method can lead to unexpected results if the kernel is
-            not positive semi-definite.
+            not positive semi-definite. See also [4]_.
 
             .. versionchanged:: 1.0
                `'randomized'` was added.
@@ -203,20 +205,26 @@ class KernelPCA(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimato
     References
     ----------
-    Kernel PCA was introduced in:
-        Bernhard Schoelkopf, Alexander J. Smola,
-        and Klaus-Robert Mueller. 1999. Kernel principal
-        component analysis. In Advances in kernel methods,
-        MIT Press, Cambridge, MA, USA 327-352.
-
-    For eigen_solver == 'arpack', refer to `scipy.sparse.linalg.eigsh`.
-
-    For eigen_solver == 'randomized', see:
-        Finding structure with randomness: Stochastic algorithms
-        for constructing approximate matrix decompositions Halko, et al., 2009
-        (arXiv:909)
-        A randomized algorithm for the decomposition of matrices
-        Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert
+    .. [1] `Schölkopf, Bernhard, Alexander Smola, and Klaus-Robert Müller.
+       "Kernel principal component analysis."
+       International conference on artificial neural networks.
+       Springer, Berlin, Heidelberg, 1997.
+       `_
+
+    .. [2] `Bakır, Gökhan H., Jason Weston, and Bernhard Schölkopf.
+       "Learning to find pre-images."
+       Advances in neural information processing systems 16 (2004): 449-456.
+       `_
+
+    .. [3] :arxiv:`Halko, Nathan, Per-Gunnar Martinsson, and Joel A. Tropp.
+       "Finding structure with randomness: Probabilistic algorithms for
+       constructing approximate matrix decompositions."
+       SIAM review 53.2 (2011): 217-288. <0909.4061>`
+
+    .. [4] `Martinsson, Per-Gunnar, Vladimir Rokhlin, and Mark Tygert.
+       "A randomized algorithm for the decomposition of matrices."
+       Applied and Computational Harmonic Analysis 30.1 (2011): 47-68.
+       `_
 
     Examples
     --------
@@ -532,7 +540,10 @@ def inverse_transform(self, X):
 
         References
         ----------
-        "Learning to Find Pre-Images", G BakIr et al, 2004.
+        `Bakır, Gökhan H., Jason Weston, and Bernhard Schölkopf.
+        "Learning to find pre-images."
+        Advances in neural information processing systems 16 (2004): 449-456.
+        `_
         """
         if not self.fit_inverse_transform:
             raise NotFittedError(