diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst
index e58796fa85b46..eac8f063be258 100644
--- a/doc/modules/decomposition.rst
+++ b/doc/modules/decomposition.rst
@@ -273,26 +273,39 @@ Exact Kernel PCA
 ----------------
 
 :class:`KernelPCA` is an extension of PCA which achieves non-linear
-dimensionality reduction through the use of kernels (see :ref:`metrics`). It
+dimensionality reduction through the use of kernels (see :ref:`metrics`) [Scholkopf1997]_. It
 has many applications including denoising, compression and structured
 prediction (kernel dependency estimation). :class:`KernelPCA` supports both
 ``transform`` and ``inverse_transform``.
 
-.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_kernel_pca_001.png
+.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_kernel_pca_002.png
    :target: ../auto_examples/decomposition/plot_kernel_pca.html
    :align: center
    :scale: 75%
 
+.. note::
+    :meth:`KernelPCA.inverse_transform` relies on a kernel ridge to learn the
+    function mapping samples from the PCA basis into the original feature
+    space [Bakir2004]_. Thus, the reconstruction obtained with
+    :meth:`KernelPCA.inverse_transform` is an approximation. See the example
+    linked below for more details.
+
 .. topic:: Examples:
 
     * :ref:`sphx_glr_auto_examples_decomposition_plot_kernel_pca.py`
 
 .. topic:: References:
 
-    * Kernel PCA was introduced in "Kernel principal component analysis"
-      Bernhard Schoelkopf, Alexander J. Smola, and Klaus-Robert Mueller. 1999.
-      In Advances in kernel methods, MIT Press, Cambridge, MA, USA 327-352.
+    .. [Scholkopf1997] Schölkopf, Bernhard, Alexander Smola, and Klaus-Robert Müller.
+       `"Kernel principal component analysis."
+       `_
+       International conference on artificial neural networks.
+       Springer, Berlin, Heidelberg, 1997.
+    .. [Bakir2004] Bakır, Gökhan H., Jason Weston, and Bernhard Schölkopf.
+       `"Learning to find pre-images."
+       `_
+       Advances in neural information processing systems 16 (2004): 449-456.
 
 .. _kPCA_Solvers:
diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py
index 3ec0958a9e602..fe6d63240523e 100644
--- a/examples/decomposition/plot_kernel_pca.py
+++ b/examples/decomposition/plot_kernel_pca.py
@@ -3,70 +3,163 @@
 Kernel PCA
 ==========
 
-This example shows that Kernel PCA is able to find a projection of the data
-that makes data linearly separable.
+This example shows the difference between Principal Component Analysis
+(:class:`~sklearn.decomposition.PCA`) and its kernelized version
+(:class:`~sklearn.decomposition.KernelPCA`).
+First, we show that :class:`~sklearn.decomposition.KernelPCA` is able to find
+a projection of the data which makes it linearly separable, which is not the
+case with :class:`~sklearn.decomposition.PCA`.
+
+Then, we show that inverting this projection is only an approximation with
+:class:`~sklearn.decomposition.KernelPCA`, while it is exact with
+:class:`~sklearn.decomposition.PCA`.
 """
 
 # Authors: Mathieu Blondel
 #          Andreas Mueller
+#          Guillaume Lemaitre
 # License: BSD 3 clause
 
-import numpy as np
+# %%
+# Projecting data: `PCA` vs. `KernelPCA`
+# --------------------------------------
+#
+# In this section, we show the advantages of using a kernel when
+# projecting data with Principal Component Analysis (PCA). We create a
+# dataset made of two nested circles.
+from sklearn.datasets import make_circles
+from sklearn.model_selection import train_test_split
+
+X, y = make_circles(n_samples=1_000, factor=0.3, noise=0.05, random_state=0)
+X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
+
+# %%
+# Let's have a quick first look at the generated dataset.
 import matplotlib.pyplot as plt
 
+_, (train_ax, test_ax) = plt.subplots(ncols=2, sharex=True, sharey=True, figsize=(8, 4))
+
+train_ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train)
+train_ax.set_ylabel("Feature #1")
+train_ax.set_xlabel("Feature #0")
+train_ax.set_title("Training data")
+
+test_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
+test_ax.set_xlabel("Feature #0")
+_ = test_ax.set_title("Testing data")
+
+# %%
+# The samples from each class cannot be linearly separated: there is no
+# straight line that can split the samples of the inner set from the outer
+# set.
+#
+# Now, we will use PCA with and without a kernel to see the effect of using
+# such a kernel. The kernel used here is a radial basis function (RBF) kernel.
 from sklearn.decomposition import PCA, KernelPCA
-from sklearn.datasets import make_circles
 
-np.random.seed(0)
-
-X, y = make_circles(n_samples=400, factor=0.3, noise=0.05)
-
-kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10)
-X_kpca = kpca.fit_transform(X)
-X_back = kpca.inverse_transform(X_kpca)
-pca = PCA()
-X_pca = pca.fit_transform(X)
-
-# Plot results
-
-plt.figure()
-plt.subplot(2, 2, 1, aspect="equal")
-plt.title("Original space")
-reds = y == 0
-blues = y == 1
-
-plt.scatter(X[reds, 0], X[reds, 1], c="red", s=20, edgecolor="k")
-plt.scatter(X[blues, 0], X[blues, 1], c="blue", s=20, edgecolor="k")
-plt.xlabel("$x_1$")
-plt.ylabel("$x_2$")
-
-X1, X2 = np.meshgrid(np.linspace(-1.5, 1.5, 50), np.linspace(-1.5, 1.5, 50))
-X_grid = np.array([np.ravel(X1), np.ravel(X2)]).T
-# projection on the first principal component (in the phi space)
-Z_grid = kpca.transform(X_grid)[:, 0].reshape(X1.shape)
-plt.contour(X1, X2, Z_grid, colors="grey", linewidths=1, origin="lower")
-
-plt.subplot(2, 2, 2, aspect="equal")
-plt.scatter(X_pca[reds, 0], X_pca[reds, 1], c="red", s=20, edgecolor="k")
-plt.scatter(X_pca[blues, 0], X_pca[blues, 1], c="blue", s=20, edgecolor="k")
-plt.title("Projection by PCA")
-plt.xlabel("1st principal component")
-plt.ylabel("2nd component")
-
-plt.subplot(2, 2, 3, aspect="equal")
-plt.scatter(X_kpca[reds, 0], X_kpca[reds, 1], c="red", s=20, edgecolor="k")
-plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c="blue", s=20, edgecolor="k")
-plt.title("Projection by KPCA")
-plt.xlabel(r"1st principal component in space induced by $\phi$")
-plt.ylabel("2nd component")
-
-plt.subplot(2, 2, 4, aspect="equal")
-plt.scatter(X_back[reds, 0], X_back[reds, 1], c="red", s=20, edgecolor="k")
-plt.scatter(X_back[blues, 0], X_back[blues, 1], c="blue", s=20, edgecolor="k")
-plt.title("Original space after inverse transform")
-plt.xlabel("$x_1$")
-plt.ylabel("$x_2$")
-
-plt.tight_layout()
-plt.show()
+pca = PCA(n_components=2)
+kernel_pca = KernelPCA(
+    n_components=None, kernel="rbf", gamma=10, fit_inverse_transform=True, alpha=0.1
+)
+
+X_test_pca = pca.fit(X_train).transform(X_test)
+X_test_kernel_pca = kernel_pca.fit(X_train).transform(X_test)
+
+# %%
+fig, (orig_data_ax, pca_proj_ax, kernel_pca_proj_ax) = plt.subplots(
+    ncols=3, figsize=(14, 4)
+)
+
+orig_data_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
+orig_data_ax.set_ylabel("Feature #1")
+orig_data_ax.set_xlabel("Feature #0")
+orig_data_ax.set_title("Testing data")
+
+pca_proj_ax.scatter(X_test_pca[:, 0], X_test_pca[:, 1], c=y_test)
+pca_proj_ax.set_ylabel("Principal component #1")
+pca_proj_ax.set_xlabel("Principal component #0")
+pca_proj_ax.set_title("Projection of testing data\n using PCA")
+
+kernel_pca_proj_ax.scatter(X_test_kernel_pca[:, 0], X_test_kernel_pca[:, 1], c=y_test)
+kernel_pca_proj_ax.set_ylabel("Principal component #1")
+kernel_pca_proj_ax.set_xlabel("Principal component #0")
+_ = kernel_pca_proj_ax.set_title("Projection of testing data\n using KernelPCA")
+
+# %%
+# We recall that PCA transforms the data linearly. Intuitively, this means that
+# the coordinate system will be centered, rescaled along each component with
+# respect to its variance, and finally rotated.
+# The data obtained from this transformation is isotropic and can now be
+# projected on its *principal components*.
+#
+# Thus, looking at the projection made using PCA (i.e. the middle figure), we
+# see that there is no change regarding the scaling; indeed, since the data
+# consists of two concentric circles centered at zero, it is already isotropic.
+# However, we can see that the data have been rotated. As a consequence, such
+# a projection would not help if we define a linear classifier to distinguish
+# samples from both classes.
+#
+# Using a kernel allows for a non-linear projection. Here, by using an RBF
+# kernel, we expect that the projection will unfold the dataset while
+# approximately preserving the relative distances of pairs of data points that
+# are close to one another in the original space.
+#
+# We observe such behaviour in the figure on the right: the samples of a given
+# class are closer to each other than to the samples of the opposite class,
+# untangling both sample sets. Now, we can use a linear classifier to separate
+# the samples from the two classes.
+#
+# Projecting into the original feature space
+# ------------------------------------------
+#
+# One particularity to keep in mind when using
+# :class:`~sklearn.decomposition.KernelPCA` is related to the reconstruction
+# (i.e. the back projection into the original feature space). With
+# :class:`~sklearn.decomposition.PCA`, the reconstruction will be exact if
+# `n_components` is the same as the number of original features.
+# This is the case in this example.
+#
+# We can investigate whether we get the original dataset when back projecting
+# with :class:`~sklearn.decomposition.KernelPCA`.
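+#
+# Before plotting, we can quantify the reconstruction quality with a mean
+# squared error. The short sketch below is only illustrative: it reuses the
+# `pca` and `kernel_pca` estimators fitted above, and the variable names are
+# arbitrary. PCA should be exact up to floating point error, unlike KernelPCA.
+import numpy as np
+
+error_pca = np.mean((X_test - pca.inverse_transform(pca.transform(X_test))) ** 2)
+error_kernel_pca = np.mean(
+    (X_test - kernel_pca.inverse_transform(kernel_pca.transform(X_test))) ** 2
+)
+print(f"Mean squared reconstruction error (PCA): {error_pca:.2e}")
+print(f"Mean squared reconstruction error (KernelPCA): {error_kernel_pca:.2e}")
+
+# %%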
+X_reconstructed_pca = pca.inverse_transform(pca.transform(X_test))
+X_reconstructed_kernel_pca = kernel_pca.inverse_transform(kernel_pca.transform(X_test))
+
+# %%
+fig, (orig_data_ax, pca_back_proj_ax, kernel_pca_back_proj_ax) = plt.subplots(
+    ncols=3, sharex=True, sharey=True, figsize=(13, 4)
+)
+
+orig_data_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
+orig_data_ax.set_ylabel("Feature #1")
+orig_data_ax.set_xlabel("Feature #0")
+orig_data_ax.set_title("Original test data")
+
+pca_back_proj_ax.scatter(X_reconstructed_pca[:, 0], X_reconstructed_pca[:, 1], c=y_test)
+pca_back_proj_ax.set_xlabel("Feature #0")
+pca_back_proj_ax.set_title("Reconstruction via PCA")
+
+kernel_pca_back_proj_ax.scatter(
+    X_reconstructed_kernel_pca[:, 0], X_reconstructed_kernel_pca[:, 1], c=y_test
+)
+kernel_pca_back_proj_ax.set_xlabel("Feature #0")
+_ = kernel_pca_back_proj_ax.set_title("Reconstruction via KernelPCA")
+
+# %%
+# While we see a perfect reconstruction with
+# :class:`~sklearn.decomposition.PCA`, we observe a different result for
+# :class:`~sklearn.decomposition.KernelPCA`.
+#
+# Indeed, :meth:`~sklearn.decomposition.KernelPCA.inverse_transform` cannot
+# rely on an analytical back-projection and thus cannot provide an exact
+# reconstruction.
+# Instead, a :class:`~sklearn.kernel_ridge.KernelRidge` is internally trained
+# to learn a mapping from the kernelized PCA basis to the original feature
+# space. This method therefore comes with an approximation introducing small
+# differences when back projecting into the original feature space.
+#
+# To improve the reconstruction using
+# :meth:`~sklearn.decomposition.KernelPCA.inverse_transform`, one can tune
+# `alpha` in :class:`~sklearn.decomposition.KernelPCA`, the regularization term
+# which controls the reliance on the training data during the training of
+# the mapping.
diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py
index ff0fd223512c1..0efcf2d4fd341 100644
--- a/sklearn/decomposition/_kernel_pca.py
+++ b/sklearn/decomposition/_kernel_pca.py
@@ -22,16 +22,16 @@
 class KernelPCA(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
-    """Kernel Principal component analysis (KPCA).
+    """Kernel Principal component analysis (KPCA) [1]_.
 
     Non-linear dimensionality reduction through the use of kernels (see
     :ref:`metrics`).
 
-    It uses the `scipy.linalg.eigh` LAPACK implementation of the full SVD or
-    the `scipy.sparse.linalg.eigsh` ARPACK implementation of the truncated SVD,
-    depending on the shape of the input data and the number of components to
-    extract. It can also use a randomized truncated SVD by the method of
-    Halko et al. 2009, see `eigen_solver`.
+    It uses the :func:`scipy.linalg.eigh` LAPACK implementation of the full SVD
+    or the :func:`scipy.sparse.linalg.eigsh` ARPACK implementation of the
+    truncated SVD, depending on the shape of the input data and the number of
+    components to extract. It can also use a randomized truncated SVD by the
+    method proposed in [3]_, see `eigen_solver`.
 
     Read more in the :ref:`User Guide `.
 
@@ -66,14 +66,16 @@ class KernelPCA(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimato
 
     fit_inverse_transform : bool, default=False
         Learn the inverse transform for non-precomputed kernels
-        (i.e. learn to find the pre-image of a point).
+        (i.e. learn to find the pre-image of a point). This method is based
+        on [2]_.
 
     eigen_solver : {'auto', 'dense', 'arpack', 'randomized'}, \
-            default='auto'
+        default='auto'
         Select eigensolver to use. If `n_components` is much
         less than the number of training samples, randomized (or arpack to a
         smaller extend) may be more efficient than the dense eigensolver.
-        Randomized SVD is performed according to the method of Halko et al.
+        Randomized SVD is performed according to the method of Halko et al.
+        [3]_.
 
         auto :
             the solver is selected by a default policy based on n_samples
@@ -92,10 +94,10 @@ class KernelPCA(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimato
             `scipy.sparse.linalg.eigsh`. It requires strictly
             0 < n_components < n_samples
         randomized :
-            run randomized SVD by the method of Halko et al. The current
+            run randomized SVD by the method of Halko et al. [3]_. The current
             implementation selects eigenvalues based on their module; therefore
             using this method can lead to unexpected results if the kernel is
-            not positive semi-definite.
+            not positive semi-definite. See also [4]_.
 
             .. versionchanged:: 1.0
                `'randomized'` was added.
@@ -203,20 +205,26 @@ class KernelPCA(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimato
     References
     ----------
-    Kernel PCA was introduced in:
-        Bernhard Schoelkopf, Alexander J. Smola,
-        and Klaus-Robert Mueller. 1999. Kernel principal
-        component analysis. In Advances in kernel methods,
-        MIT Press, Cambridge, MA, USA 327-352.
-
-    For eigen_solver == 'arpack', refer to `scipy.sparse.linalg.eigsh`.
-
-    For eigen_solver == 'randomized', see:
-        Finding structure with randomness: Stochastic algorithms
-        for constructing approximate matrix decompositions Halko, et al., 2009
-        (arXiv:909)
-        A randomized algorithm for the decomposition of matrices
-        Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert
+    .. [1] `Schölkopf, Bernhard, Alexander Smola, and Klaus-Robert Müller.
+       "Kernel principal component analysis."
+       International conference on artificial neural networks.
+       Springer, Berlin, Heidelberg, 1997.
+       `_
+
+    .. [2] `Bakır, Gökhan H., Jason Weston, and Bernhard Schölkopf.
+       "Learning to find pre-images."
+       Advances in neural information processing systems 16 (2004): 449-456.
+       `_
+
+    .. [3] :arxiv:`Halko, Nathan, Per-Gunnar Martinsson, and Joel A. Tropp.
+       "Finding structure with randomness: Probabilistic algorithms for
+       constructing approximate matrix decompositions."
+       SIAM review 53.2 (2011): 217-288. <0909.4061>`
+
+    .. [4] `Martinsson, Per-Gunnar, Vladimir Rokhlin, and Mark Tygert.
+       "A randomized algorithm for the decomposition of matrices."
+       Applied and Computational Harmonic Analysis 30.1 (2011): 47-68.
+       `_
 
     Examples
     --------
@@ -532,7 +540,10 @@ def inverse_transform(self, X):
 
         References
         ----------
-        "Learning to Find Pre-Images", G BakIr et al, 2004.
+        `Bakır, Gökhan H., Jason Weston, and Bernhard Schölkopf.
+        "Learning to find pre-images."
+        Advances in neural information processing systems 16 (2004): 449-456.
+        `_
         """
         if not self.fit_inverse_transform:
             raise NotFittedError(