From a0d84a58281074d323cbf7b87bd8178baa2d325c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 28 Apr 2021 23:24:26 +0200 Subject: [PATCH 01/40] DOC revamp documentation of GPR --- doc/modules/gaussian_process.rst | 4 +- examples/gaussian_process/plot_gpr_noisy.py | 231 ++++++++++++++------ 2 files changed, 161 insertions(+), 74 deletions(-) diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst index 6aa9cb417aa5d..089ad9672647c 100644 --- a/doc/modules/gaussian_process.rst +++ b/doc/modules/gaussian_process.rst @@ -47,7 +47,7 @@ prior mean is assumed to be constant and zero (for ``normalize_y=False``) or the training data's mean (for ``normalize_y=True``). The prior's covariance is specified by passing a :ref:`kernel ` object. The hyperparameters of the kernel are optimized during fitting of -GaussianProcessRegressor by maximizing the log-marginal-likelihood (LML) based +:class:`GaussianProcessRegressor` by maximizing the log-marginal-likelihood (LML) based on the passed ``optimizer``. As the LML may have multiple local optima, the optimizer can be started repeatedly by specifying ``n_restarts_optimizer``. The first run is always conducted starting from the initial hyperparameter values @@ -62,7 +62,7 @@ Note that a moderate noise level can also be helpful for dealing with numeric issues during fitting as it is effectively implemented as Tikhonov regularization, i.e., by adding it to the diagonal of the kernel matrix. An alternative to specifying the noise level explicitly is to include a -WhiteKernel component into the kernel, which can estimate the global noise +:class:`~sklearn.gaussian_process.kernels.WhiteKernel` component into the kernel, which can estimate the global noise level from the data (see example below). The implementation is based on Algorithm 2.1 of [RW2006]_. In addition to diff --git a/examples/gaussian_process/plot_gpr_noisy.py b/examples/gaussian_process/plot_gpr_noisy.py index 5f8ce2cd0fe96..2b15659beb3f1 100644 --- a/examples/gaussian_process/plot_gpr_noisy.py +++ b/examples/gaussian_process/plot_gpr_noisy.py @@ -3,95 +3,182 @@ Gaussian process regression (GPR) with noise-level estimation ============================================================= -This example illustrates that GPR with a sum-kernel including a WhiteKernel can -estimate the noise level of data. An illustration of the -log-marginal-likelihood (LML) landscape shows that there exist two local -maxima of LML. The first corresponds to a model with a high noise level and a -large length scale, which explains all variations in the data by noise. The -second one has a smaller noise level and shorter length scale, which explains -most of the variation by the noise-free functional relationship. The second -model has a higher likelihood; however, depending on the initial value for the -hyperparameters, the gradient-based optimization might also converge to the -high-noise solution. It is thus important to repeat the optimization several -times for different initializations. +This example shows the ability of the +:class:`~sklearn.gaussian_process.kernels.WhiteKernel` to estimate the noise +level in the data. However, we show the importance of kernel hyperparameters +initialization. """ print(__doc__) # Authors: Jan Hendrik Metzen -# +# Guillaume Lemaitre # License: BSD 3 clause +# %% +# Data generation +# --------------- +# +# We will work in a setting where `X` will be a single feature. We create a +# function that will generate a the target to be predicted. 
We will add an +# option to add some noise to the generated target. import numpy as np -from matplotlib import pyplot as plt -from matplotlib.colors import LogNorm +def target_generator(X, add_noise=False): + target = 0.5 + np.sin(3 * X) + if add_noise: + rng = np.random.RandomState(1) + target += rng.normal(0, 0.3, size=target.shape) + return target.squeeze() + + +# %% +# Let's have a look to the target generator where we will not add any noise to +# observe the signal that we would like to predict. +X = np.linspace(0, 5, num=30).reshape(-1, 1) +y = target_generator(X, add_noise=False) + +# %% +import matplotlib.pyplot as plt + +plt.plot(X, y, label="Perfect generator") +plt.legend() +plt.xlabel("X") +_ = plt.ylabel("y") + +# %% +# The target is transforming the input `X` using a sine function. Now, we will +# generate few noisy training samples. To illustrate the noise level, we will +# plot the true signal together with the noisy training samples. +rng = np.random.RandomState(0) +X_train = rng.uniform(0, 5, size=20).reshape(-1, 1) +y_train = target_generator(X_train, add_noise=True) + +# %% +plt.plot(X, y, label="Perfect generator") +plt.scatter( + x=X_train[:, 0], y=y_train, color="tab:orange", + label="Noisy training samples", +) +plt.legend() +plt.xlabel("X") +_ = plt.ylabel("y") + +# %% +# Optimisation of kernel hyperparameters in GPR +# --------------------------------------------- +# +# Now, we will create a GPR using an additive kernel using a +# :class:`~sklearn.gaussian_process.kernels.RBF` and +# :class:`~sklearn.gaussian_process.kernels.WhiteKernel` kernels. +# The :class:`~sklearn.gaussian_process.kernels.WhiteKernel` is a kernel that +# will able to estimate the amount of noise present in the data while the +# :class:`~sklearn.gaussian_process.kernels.RBF` will serve at fitting the +# non-linearity between the data and the target. +# +# However, we will show that the hyperparameter space contains several local +# minima. It will highlight the importance of initial hyperparameter values. +# +# We will create a model using a kernel with a high noise level and a large +# length scale, which explains all variations in the data by noise. 
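# %%
# Before fitting this model, here is a quick illustration (a small sketch
# added for clarity; the name `demo_kernel` below is only used for this
# illustration) of how the two kernel components combine: the
# :class:`~sklearn.gaussian_process.kernels.WhiteKernel` only contributes to
# the diagonal of the kernel matrix, i.e. it models noise that is independent
# from one sample to another, while the RBF term models the correlation
# between samples.
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

demo_kernel = 1.0 * RBF(length_scale=1.0) + WhiteKernel(noise_level=0.5)
# The noise level (0.5) is only added on the diagonal of the kernel matrix.
print(demo_kernel(X_train[:3]))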
from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.gaussian_process.kernels import RBF, WhiteKernel +kernel = ( + 1.0 * RBF(length_scale=1e1, length_scale_bounds=(1e-2, 1e3)) + + WhiteKernel(noise_level=1, noise_level_bounds=(1e-5, 1e1)) +) +gpr = GaussianProcessRegressor(kernel=kernel, alpha=0.0) +gpr.fit(X_train, y_train) +y_mean, y_std = gpr.predict(X, return_std=True) -rng = np.random.RandomState(0) -X = rng.uniform(0, 5, 20)[:, np.newaxis] -y = 0.5 * np.sin(3 * X[:, 0]) + rng.normal(0, 0.5, X.shape[0]) - -# First run -plt.figure() -kernel = 1.0 * RBF(length_scale=100.0, length_scale_bounds=(1e-2, 1e3)) \ - + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1)) -gp = GaussianProcessRegressor(kernel=kernel, - alpha=0.0).fit(X, y) -X_ = np.linspace(0, 5, 100) -y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True) -plt.plot(X_, y_mean, 'k', lw=3, zorder=9) -plt.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)), - y_mean + np.sqrt(np.diag(y_cov)), - alpha=0.5, color='k') -plt.plot(X_, 0.5*np.sin(3*X_), 'r', lw=3, zorder=9) -plt.scatter(X[:, 0], y, c='r', s=50, zorder=10, edgecolors=(0, 0, 0)) -plt.title("Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s" - % (kernel, gp.kernel_, - gp.log_marginal_likelihood(gp.kernel_.theta))) -plt.tight_layout() - -# Second run -plt.figure() -kernel = 1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3)) \ - + WhiteKernel(noise_level=1e-5, noise_level_bounds=(1e-10, 1e+1)) -gp = GaussianProcessRegressor(kernel=kernel, - alpha=0.0).fit(X, y) -X_ = np.linspace(0, 5, 100) -y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True) -plt.plot(X_, y_mean, 'k', lw=3, zorder=9) -plt.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)), - y_mean + np.sqrt(np.diag(y_cov)), - alpha=0.5, color='k') -plt.plot(X_, 0.5*np.sin(3*X_), 'r', lw=3, zorder=9) -plt.scatter(X[:, 0], y, c='r', s=50, zorder=10, edgecolors=(0, 0, 0)) -plt.title("Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s" - % (kernel, gp.kernel_, - gp.log_marginal_likelihood(gp.kernel_.theta))) -plt.tight_layout() - -# Plot LML landscape -plt.figure() -theta0 = np.logspace(-2, 3, 49) -theta1 = np.logspace(-2, 0, 50) -Theta0, Theta1 = np.meshgrid(theta0, theta1) -LML = [[gp.log_marginal_likelihood(np.log([0.36, Theta0[i, j], Theta1[i, j]])) - for i in range(Theta0.shape[0])] for j in range(Theta0.shape[1])] -LML = np.array(LML).T - -vmin, vmax = (-LML).min(), (-LML).max() -vmax = 50 -level = np.around(np.logspace(np.log10(vmin), np.log10(vmax), 50), decimals=1) -plt.contour(Theta0, Theta1, -LML, - levels=level, norm=LogNorm(vmin=vmin, vmax=vmax)) +# %% +plt.plot(X, y, label="Perfect generator") +plt.scatter( + x=X_train[:, 0], y=y_train, color="tab:orange", label="Noisy measurement" +) +plt.errorbar(X, y_mean, y_std) +plt.legend() +plt.xlabel("X") +plt.ylabel("y") +_ = plt.title( + f"Initial: {kernel}\nOptimum: {gpr.kernel_}\nLog-Marginal-Likelihood: " + f"{gpr.log_marginal_likelihood(gpr.kernel_.theta)}" +) +# %% +# We see that the optimum kernel found still have a high noise level and +# and an even larger length scale. However, we see that the model does not +# provide satisfactory predictions. +# +# Now, we will initialize the hyperparameter with a lower noise level and +# length scale. 
+kernel = ( + 1.0 * RBF(length_scale=1e-1, length_scale_bounds=(1e-2, 1e3)) + + WhiteKernel(noise_level=1e-2, noise_level_bounds=(1e-10, 1e1)) +) +gpr = GaussianProcessRegressor(kernel=kernel, alpha=0.0) +gpr.fit(X_train, y_train) +y_mean, y_std = gpr.predict(X, return_std=True) + +# %% +plt.plot(X, y, label="Perfect generator") +plt.scatter( + x=X_train[:, 0], y=y_train, color="tab:orange", label="Noisy measurement" +) +plt.errorbar(X, y_mean, y_std) +plt.legend() +plt.xlabel("X") +plt.ylabel("y") +_ = plt.title( + f"Initial: {kernel}\nOptimum: {gpr.kernel_}\nLog-Marginal-Likelihood: " + f"{gpr.log_marginal_likelihood(gpr.kernel_.theta)}" +) + +# %% +# First, we see that the model is more satisfactory than the previous one +# regarding the predictions. It is able to estimate the noise-free functional +# relationship. +# +# Looking at the kernel hyperparameters, we see that the best combination found +# as a smaller noise level and shorter length scale than the first model. +# +# We can have a look at the Log-Marginal-Likelihood (LML) of GPR for different +# hyperparameters to have an idea of the minima. +from matplotlib.colors import LogNorm + +length_scale = np.logspace(-2, 4, num=50) +noise_level = np.logspace(-2, 1, num=50) +length_scale_grid, noise_level_grid = np.meshgrid(length_scale, noise_level) + +log_marginal_likelihood = [ + gpr.log_marginal_likelihood(np.log([0.36, scale, noise])) + for scale, noise in zip(length_scale_grid.ravel(), + noise_level_grid.ravel()) +] +log_marginal_likelihood = np.reshape( + log_marginal_likelihood, newshape=noise_level_grid.shape +) + +# %% +vmin, vmax = (-log_marginal_likelihood).min(), 50 +level = np.around( + np.logspace(np.log10(vmin), np.log10(vmax), num=50), decimals=1 +) +plt.contour( + length_scale_grid, noise_level_grid, -log_marginal_likelihood, + levels=level, norm=LogNorm(vmin=vmin, vmax=vmax), +) plt.colorbar() plt.xscale("log") plt.yscale("log") plt.xlabel("Length-scale") plt.ylabel("Noise-level") plt.title("Log-marginal-likelihood") -plt.tight_layout() - plt.show() + +# %% +# We see that there are two local minima that correspond to the combination +# of hyperparameters previously found. Depending on the initial value for the +# hyperparameters, the gradient-based optimization might converge whether or +# not to the best model. It is thus important to repeat the optimization +# several times for different initializations. From 286408ef1d880c70914363e7ab4a69977bf3c7e9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 29 Apr 2021 00:07:51 +0200 Subject: [PATCH 02/40] iter --- doc/modules/gaussian_process.rst | 33 +++----------------------------- 1 file changed, 3 insertions(+), 30 deletions(-) diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst index 089ad9672647c..805ace85bf9f2 100644 --- a/doc/modules/gaussian_process.rst +++ b/doc/modules/gaussian_process.rst @@ -77,40 +77,13 @@ the API of standard scikit-learn estimators, GaussianProcessRegressor: externally for other ways of selecting hyperparameters, e.g., via Markov chain Monte Carlo. +.. topic:: Examples + + * :ref:`sphx_glr_auto_examples_gaussian_process_plot_gpr_noisy.py` GPR examples ============ -GPR with noise-level estimation -------------------------------- -This example illustrates that GPR with a sum-kernel including a WhiteKernel can -estimate the noise level of data. An illustration of the -log-marginal-likelihood (LML) landscape shows that there exist two local -maxima of LML. - -.. 
figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_001.png - :target: ../auto_examples/gaussian_process/plot_gpr_noisy.html - :align: center - -The first corresponds to a model with a high noise level and a -large length scale, which explains all variations in the data by noise. - -.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_002.png - :target: ../auto_examples/gaussian_process/plot_gpr_noisy.html - :align: center - -The second one has a smaller noise level and shorter length scale, which explains -most of the variation by the noise-free functional relationship. The second -model has a higher likelihood; however, depending on the initial value for the -hyperparameters, the gradient-based optimization might also converge to the -high-noise solution. It is thus important to repeat the optimization several -times for different initializations. - -.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_003.png - :target: ../auto_examples/gaussian_process/plot_gpr_noisy.html - :align: center - - Comparison of GPR and Kernel Ridge Regression --------------------------------------------- From 0383114ca15304686de94f3be0b58fb40470917c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 8 May 2021 19:04:13 +0200 Subject: [PATCH 03/40] iter --- examples/gaussian_process/plot_gpr_prior_posterior.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/gaussian_process/plot_gpr_prior_posterior.py b/examples/gaussian_process/plot_gpr_prior_posterior.py index 78eb22df48f6a..69df8af23b9f4 100644 --- a/examples/gaussian_process/plot_gpr_prior_posterior.py +++ b/examples/gaussian_process/plot_gpr_prior_posterior.py @@ -13,6 +13,7 @@ # # License: BSD 3 clause +# %% import numpy as np from matplotlib import pyplot as plt @@ -24,7 +25,8 @@ kernels = [1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-1, 10.0)), - 1.0 * RationalQuadratic(length_scale=1.0, alpha=0.1), + 1.0 * RationalQuadratic(length_scale=1.0, alpha=0.1, + alpha_bounds=(1e-5, 1e10)), 1.0 * ExpSineSquared(length_scale=1.0, periodicity=3.0, length_scale_bounds=(0.1, 10.0), periodicity_bounds=(1.0, 10.0)), @@ -76,3 +78,5 @@ plt.tight_layout() plt.show() + +# %% From 69a984eb47be1d13804f2c36d305329ca8a36c09 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 17 May 2021 16:33:55 +0200 Subject: [PATCH 04/40] iter --- doc/modules/gaussian_process.rst | 6 +- .../gaussian_process/plot_compare_gpr_krr.py | 470 ++++++++++++++---- 2 files changed, 371 insertions(+), 105 deletions(-) diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst index 805ace85bf9f2..c844a3cac048d 100644 --- a/doc/modules/gaussian_process.rst +++ b/doc/modules/gaussian_process.rst @@ -1,5 +1,3 @@ - - .. _gaussian_process: ================== @@ -27,8 +25,8 @@ The advantages of Gaussian processes are: The disadvantages of Gaussian processes include: - - They are not sparse, i.e., they use the whole samples/features information to - perform the prediction. + - They are not sparse, i.e., they use the whole samples/features + information to perform the prediction. - They lose efficiency in high dimensional spaces -- namely when the number of features exceeds a few dozens. 
diff --git a/examples/gaussian_process/plot_compare_gpr_krr.py b/examples/gaussian_process/plot_compare_gpr_krr.py index 1eb771673b0d6..33d30237294b2 100644 --- a/examples/gaussian_process/plot_compare_gpr_krr.py +++ b/examples/gaussian_process/plot_compare_gpr_krr.py @@ -3,119 +3,387 @@ Comparison of kernel ridge and Gaussian process regression ========================================================== -Both kernel ridge regression (KRR) and Gaussian process regression (GPR) learn -a target function by employing internally the "kernel trick". KRR learns a -linear function in the space induced by the respective kernel which corresponds -to a non-linear function in the original space. The linear function in the -kernel space is chosen based on the mean-squared error loss with -ridge regularization. GPR uses the kernel to define the covariance of -a prior distribution over the target functions and uses the observed training -data to define a likelihood function. Based on Bayes theorem, a (Gaussian) -posterior distribution over target functions is defined, whose mean is used -for prediction. - -A major difference is that GPR can choose the kernel's hyperparameters based -on gradient-ascent on the marginal likelihood function while KRR needs to -perform a grid search on a cross-validated loss function (mean-squared error -loss). A further difference is that GPR learns a generative, probabilistic -model of the target function and can thus provide meaningful confidence -intervals and posterior samples along with the predictions while KRR only -provides predictions. - -This example illustrates both methods on an artificial dataset, which -consists of a sinusoidal target function and strong noise. The figure compares -the learned model of KRR and GPR based on a ExpSineSquared kernel, which is -suited for learning periodic functions. The kernel's hyperparameters control -the smoothness (l) and periodicity of the kernel (p). Moreover, the noise level -of the data is learned explicitly by GPR by an additional WhiteKernel component -in the kernel and by the regularization parameter alpha of KRR. - -The figure shows that both methods learn reasonable models of the target -function. GPR correctly identifies the periodicity of the function to be -roughly 2*pi (6.28), while KRR chooses the doubled periodicity 4*pi. Besides -that, GPR provides reasonable confidence bounds on the prediction which are not -available for KRR. A major difference between the two methods is the time -required for fitting and predicting: while fitting KRR is fast in principle, -the grid-search for hyperparameter optimization scales exponentially with the -number of hyperparameters ("curse of dimensionality"). The gradient-based -optimization of the parameters in GPR does not suffer from this exponential -scaling and is thus considerable faster on this example with 3-dimensional -hyperparameter space. The time for predicting is similar; however, generating -the variance of the predictive distribution of GPR takes considerable longer -than just predicting the mean. +This example illustrates differences between a kernel ridge regression and a +Gaussian process regression. + +Both kernel ridge regression and Gaussian process regression are using a +so-called "kernel trick" to make their models expressive enough to not underfit +the training data. However, the machine learning problems solved by the two +methods are drastically different. 
+ +Kernel ridge regression will find the target function that minimizes a loss +function (the mean squared error). + +Instead of finding a single target function, the Gaussian process regression +employs a probabilistic approach. It gives prior probabilities to each possible +target functions and combines it with a likelihood function defined by the +observed training data. Thus, a Gaussian posterior distribution over target +functions is defined based on the Bayes' theorem. + +We will illustrate with an example these differences and we will also focus on +the tuning of the kernel hyperparameters. """ print(__doc__) # Authors: Jan Hendrik Metzen +# Guillaume Lemaitre # License: BSD 3 clause +# %% +# Generating a dataset +# -------------------- +# +# We create a synthetic dataset. The true generative process will take a 1-D +# vector and compute its sine. Note that the period of this sine is thus +# :math:`2 \pi`. We will reuse this information later in this example. +import numpy as np -import time +rng = np.random.RandomState(0) +data = np.linspace(0, 30, num=1_000).reshape(-1, 1) +target = np.sin(data).ravel() -import numpy as np +# %% +# Now, we can imagine a scenario where we get observations from this true +# process. However, we will add some challenges: +# +# - the measurements will be noisy; +# - only samples from the beginning of the signal will be available. +training_sample_indices = rng.choice(np.arange(0, 400), size=40, replace=False) +training_data = data[training_sample_indices] +training_noisy_target = target[training_sample_indices] + 0.5 * rng.randn( + len(training_sample_indices) +) +# %% +# Let's plot the true signal and the noisy measurements available for training. import matplotlib.pyplot as plt +plt.plot(data, target, label="True signal", linewidth=2) +plt.scatter( + training_data, + training_noisy_target, + color="black", + label="Noisy measurements", +) +plt.legend() +plt.xlabel("data") +plt.ylabel("target") +_ = plt.title("Illustration of the true generative process and \n" + "noisy measurements available during training") + +# %% +# Limitations of a simple linear model +# ------------------------------------ +# +# First, we would like to highlight the limitations of a linear model given +# our dataset. We fit a :class:`~sklearn.linear_model.Ridge` and check the +# predictions of this model on our dataset. +from sklearn.linear_model import Ridge + +ridge = Ridge().fit(training_data, training_noisy_target) + +plt.plot(data, target, label="True signal", linewidth=2) +plt.scatter( + training_data, + training_noisy_target, + color="black", + label="Noisy measurements", +) +plt.plot(data, ridge.predict(data), label="Ridge regression") +plt.legend() +plt.xlabel("data") +plt.ylabel("target") +_ = plt.title("Limitation of a linear model such as ridge") + +# %% +# Such a ridge regressor underfits since it is not enough expressive. +# +# Kernel methods: kernel ridge and Gaussian process +# ------------------------------------------------- +# +# Kernel ridge +# ............ +# +# We can make the previous linear model more expressive by using a so-called +# kernel. A kernel is used to make a model more expressive; for simplicity, it +# would be equivalent to project our original data into an more complex feature +# space. This new space is defined by the choice of kernel. +# +# In our case, we know that the true generative process is a periodic function. +# We can use a :class:`~sklearn.gaussian_process.kernels.ExpSineSquared`. This +# kernel allows to fit such periodicity. 
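# For reference (a note added here for clarity), the exponential sine squared
# kernel as implemented in scikit-learn is defined as
#
# .. math::
#     k(x_i, x_j) = \exp\left( -\frac{2 \sin^2(\pi d(x_i, x_j) / p)}{l^2} \right),
#
# where :math:`p` is the periodicity, :math:`l` the length-scale and
# :math:`d(\cdot, \cdot)` the Euclidean distance.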
The class +# :class:`~sklearn.kernel_ridge.KernelRidge` will accept such a kernel. +# +# Using this model together with a kernel is equivalent to project the data +# using the mapping function of the kernel and then apply a ridge regression. +# In practice, the data are not mapped explicitely; instead the dot product +# between samples in the higher dimensional feature space is computed using the +# "kernel trick". +# +# Thus, let's use such a :class:`~sklearn.kernel_ridge.KernelRidge`. +import time +from sklearn.gaussian_process.kernels import ExpSineSquared from sklearn.kernel_ridge import KernelRidge -from sklearn.model_selection import GridSearchCV + +kernel_ridge = KernelRidge(kernel=ExpSineSquared()) + +start_time = time.time() +kernel_ridge.fit(training_data, training_noisy_target) +print(f"Fitting KernelRidge with default kernel: " + f"{time.time() - start_time:.3f} seconds") + +# %% +plt.plot(data, target, label="True signal", linewidth=2, linestyle="dashed") +plt.scatter( + training_data, + training_noisy_target, + color="black", + label="Noisy measurements", +) +plt.plot( + data, + kernel_ridge.predict(data), + label="Kernel ridge", + linewidth=2, + linestyle="dashdot", +) +plt.legend(loc="lower right") +plt.xlabel("data") +plt.ylabel("target") +_ = plt.title("Kernel ridge regression with an exponential sine squared\n " + "kernel using default hyperparameters") + +# %% +# Our fitted model does not work. Indeed, we did not set the parameters of the +# kernel and instead used the default hyperparameters. We can have a look at +# them. +kernel_ridge.kernel + +# %% +# Our kernel has two parameters: the length-scale and the periodicity. For our +# dataset, we use `sin(x)` as the generative process. It means that we have +# a periodicity of :math:`2 \pi`. The default value of the parameter being 1, +# it explains the high frequency observed in the predictions of our model. +# Similar conclusions could be drawn with the length-scale parameter. Thus, it +# tell us that the kernel parameters need to be tuned. We will use a randomized +# search to tune the different parameters the kernel ridge model: the `alpha` +# parameter and the kernel parameters. + +# %% +from sklearn.model_selection import RandomizedSearchCV +from sklearn.utils.fixes import loguniform + +param_distributions = { + "alpha": loguniform(1e0, 1e3), + "kernel__length_scale": loguniform(1e-2, 1e2), + "kernel__periodicity": loguniform(1e0, 1e1), +} +kernel_ridge_tuned = RandomizedSearchCV( + kernel_ridge, + param_distributions=param_distributions, + n_iter=500, + random_state=0, +) +start_time = time.time() +kernel_ridge_tuned.fit(training_data, training_noisy_target) +print(f"Time for KernelRidge fitting: {time.time() - start_time:.3f} seconds") + +# %% +# Fitting the model is now more computationally expensive since we have to try +# several combinations of hyperparameters. We can have a look at the +# hyperparameters found to get some intuitions. +kernel_ridge_tuned.best_params_ + +# %% +# Looking at the best parameters, we see that they are different from the +# defaults. We also see that the periodicity is closer to the expected value: +# :math:`2 \pi`. We can now have a look at the predictions of our tuned kernel +# ridge. 
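# %%
# Before looking at the predictions, a quick numerical check (added here as an
# illustration; it is not needed for the rest of the example): we can compare
# the tuned periodicity with the theoretical value :math:`2 \pi`.
import numpy as np

print(f"Theoretical periodicity: {2 * np.pi:.3f}")
print(
    "Tuned periodicity: "
    f"{kernel_ridge_tuned.best_params_['kernel__periodicity']:.3f}"
)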
+start_time = time.time() +predictions_kr = kernel_ridge_tuned.predict(data) +print(f"Time for KernelRidge predict: {time.time() - start_time:.3f} seconds") + +# %% +plt.plot(data, target, label="True signal", linewidth=2, linestyle="dashed") +plt.scatter( + training_data, + training_noisy_target, + color="black", + label="Noisy measurements", +) +plt.plot( + data, + predictions_kr, + label="Kernel ridge", + linewidth=2, + linestyle="dashdot", +) +plt.legend(loc="lower right") +plt.xlabel("data") +plt.ylabel("target") +_ = plt.title("Kernel ridge regression with an exponential sine squared\n " + "kernel using tuned hyperparameters") + +# %% +# We get a much more accurate model. We still observe some errors mainly due to +# the noise added to the dataset. +# +# Gaussian process regression +# ........................... +# +# Now, we will use a +# :class:`~sklearn.gaussian_process.GaussianProcessRegressor` to fit the same +# dataset. When training a Gaussian process, the hyperparameters of the kernel +# are optimized during the fitting process. There is no need for an external +# hyperparameter search. Here, we create a slightly more complex kernel than +# for the kernel ridge regressor: we add a +# :class:`~sklearn.gaussian_process.kernels.WhiteKernel` that is used to +# estimate the noise in the dataset. from sklearn.gaussian_process import GaussianProcessRegressor -from sklearn.gaussian_process.kernels import WhiteKernel, ExpSineSquared +from sklearn.gaussian_process.kernels import WhiteKernel -rng = np.random.RandomState(0) +kernel = 1.0 * ExpSineSquared( + 1.0, 5.0, periodicity_bounds=(1e-2, 1e1) +) + WhiteKernel(1e-1) +gaussian_process = GaussianProcessRegressor(kernel=kernel) +start_time = time.time() +gaussian_process.fit(training_data, training_noisy_target) +print( + f"Time for GaussianProcessRegressor fitting: " + f"{time.time() - start_time:.3f} seconds" +) + +# %% +# The computation cost of training a Gaussian process is much less than the +# kernel ridge that uses a randomized search. We can check the parameters of +# the kernels that we computed. +gaussian_process.kernel_ + +# %% +# Indeed, we see that the parameters have been optimized. Looking at the +# `periodicity` parameter, we see that we found a period close to the +# theoretical value :math:`2 \pi`. We can have a look now at the predictions of +# our model. +start_time = time.time() +mean_predictions_gpr, std_predictions_gpr = gaussian_process.predict( + data, + return_std=True, +) +print( + f"Time for GaussianProcessRegressor predict: " + f"{time.time() - start_time:.3f} seconds" +) + +# %% +plt.plot(data, target, label="True signal", linewidth=2, linestyle="dashed") +plt.scatter( + training_data, + training_noisy_target, + color="black", + label="Noisy measurements", +) +# Plot the predictions of the kernel ridge +plt.plot( + data, + predictions_kr, + label="Kernel ridge", + linewidth=2, + linestyle="dashdot", +) +# Plot the predictions of the gaussian process regressor +plt.plot( + data, + mean_predictions_gpr, + label="Gaussian process regressor", + linewidth=2, + linestyle="dotted", +) +plt.fill_between( + data.ravel(), + mean_predictions_gpr - std_predictions_gpr, + mean_predictions_gpr + std_predictions_gpr, + color="tab:green", + alpha=0.2, +) +plt.legend(loc="lower right") +plt.xlabel("data") +plt.ylabel("target") +_ = plt.title("Comparison between kernel ridge and gaussian process regressor") + +# %% +# We observe that the results of the kernel ridge and the Gaussian process +# regressor are close. 
However, the Gaussian process regressor also provide +# an uncertainty information that is not available with a kernel ridge. +# Due to the probabilistic formulation regarding the target function, the +# Gaussian process can output the standard deviation (or the covariance) +# together with the mean predictions of the target functions. +# +# However, it comes at a cost: the time to compute the predictions is higher +# with a Gaussian process. +# +# Final conclusion +# ---------------- +# +# We can give a final word regarding the possibility of the two models to +# extrapolate. Indeed, we provided only the beginning of the signal as a +# training set. Using a periodic kernel forces our model to repeat the pattern +# found on the training set. Using this kernel information together with the +# capacity of the both models to extrapolate, we observe that the models will +# continue to predict the sine pattern. +# +# Gaussian process allows to combine kernels together. Thus, we could associate +# the exponential sine squared kernel together with a radial basis function +# kernel +from sklearn.gaussian_process.kernels import RBF + +kernel = 1.0 * ExpSineSquared( + 1.0, 5.0, periodicity_bounds=(1e-2, 1e1) +) * RBF(length_scale=15, length_scale_bounds="fixed") + WhiteKernel(1e-1) +gaussian_process = GaussianProcessRegressor(kernel=kernel) +gaussian_process.fit(training_data, training_noisy_target) +mean_predictions_gpr, std_predictions_gpr = gaussian_process.predict( + data, + return_std=True, +) + +# %% +plt.plot(data, target, label="True signal", linewidth=2, linestyle="dashed") +plt.scatter( + training_data, + training_noisy_target, + color="black", + label="Noisy measurements", +) +# Plot the predictions of the kernel ridge +plt.plot( + data, + predictions_kr, + label="Kernel ridge", + linewidth=2, + linestyle="dashdot", +) +# Plot the predictions of the gaussian process regressor +plt.plot( + data, + mean_predictions_gpr, + label="Gaussian process regressor", + linewidth=2, + linestyle="dotted", +) +plt.fill_between( + data.ravel(), + mean_predictions_gpr - std_predictions_gpr, + mean_predictions_gpr + std_predictions_gpr, + color="tab:green", + alpha=0.2, +) +plt.legend(loc="lower right") +plt.xlabel("data") +plt.ylabel("target") +_ = plt.title("Effect of using a radial basis function kernel") -# Generate sample data -X = 15 * rng.rand(100, 1) -y = np.sin(X).ravel() -y += 3 * (0.5 - rng.rand(X.shape[0])) # add noise - -# Fit KernelRidge with parameter selection based on 5-fold cross validation -param_grid = {"alpha": [1e0, 1e-1, 1e-2, 1e-3], - "kernel": [ExpSineSquared(l, p) - for l in np.logspace(-2, 2, 10) - for p in np.logspace(0, 2, 10)]} -kr = GridSearchCV(KernelRidge(), param_grid=param_grid) -stime = time.time() -kr.fit(X, y) -print("Time for KRR fitting: %.3f" % (time.time() - stime)) - -gp_kernel = ExpSineSquared(1.0, 5.0, periodicity_bounds=(1e-2, 1e1)) \ - + WhiteKernel(1e-1) -gpr = GaussianProcessRegressor(kernel=gp_kernel) -stime = time.time() -gpr.fit(X, y) -print("Time for GPR fitting: %.3f" % (time.time() - stime)) - -# Predict using kernel ridge -X_plot = np.linspace(0, 20, 10000)[:, None] -stime = time.time() -y_kr = kr.predict(X_plot) -print("Time for KRR prediction: %.3f" % (time.time() - stime)) - -# Predict using gaussian process regressor -stime = time.time() -y_gpr = gpr.predict(X_plot, return_std=False) -print("Time for GPR prediction: %.3f" % (time.time() - stime)) - -stime = time.time() -y_gpr, y_std = gpr.predict(X_plot, return_std=True) -print("Time for GPR 
prediction with standard-deviation: %.3f" - % (time.time() - stime)) - -# Plot results -plt.figure(figsize=(10, 5)) -lw = 2 -plt.scatter(X, y, c='k', label='data') -plt.plot(X_plot, np.sin(X_plot), color='navy', lw=lw, label='True') -plt.plot(X_plot, y_kr, color='turquoise', lw=lw, - label='KRR (%s)' % kr.best_params_) -plt.plot(X_plot, y_gpr, color='darkorange', lw=lw, - label='GPR (%s)' % gpr.kernel_) -plt.fill_between(X_plot[:, 0], y_gpr - y_std, y_gpr + y_std, color='darkorange', - alpha=0.2) -plt.xlabel('data') -plt.ylabel('target') -plt.xlim(0, 20) -plt.ylim(-4, 4) -plt.title('GPR versus Kernel Ridge') -plt.legend(loc="best", scatterpoints=1, prop={'size': 8}) -plt.show() +# %% +# The effect of using an radial basis function kernel will attenuate the +# periodicity effect once that no sample are available in the training. +# The predictions is converging towards the mean of the probability and the +# standard deviation increases. From e3f359de35d2b75506dbbdca9d9e66464c3e4576 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 17 May 2021 16:35:53 +0200 Subject: [PATCH 05/40] iter --- doc/modules/gaussian_process.rst | 51 +------------------------------- 1 file changed, 1 insertion(+), 50 deletions(-) diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst index c844a3cac048d..d50886a55ad9e 100644 --- a/doc/modules/gaussian_process.rst +++ b/doc/modules/gaussian_process.rst @@ -78,60 +78,11 @@ the API of standard scikit-learn estimators, GaussianProcessRegressor: .. topic:: Examples * :ref:`sphx_glr_auto_examples_gaussian_process_plot_gpr_noisy.py` + * :ref:`sphx_glr_auto_examples_gaussian_process_plot_compare_gpr_krr.py` GPR examples ============ -Comparison of GPR and Kernel Ridge Regression ---------------------------------------------- - -Both kernel ridge regression (KRR) and GPR learn -a target function by employing internally the "kernel trick". KRR learns a -linear function in the space induced by the respective kernel which corresponds -to a non-linear function in the original space. The linear function in the -kernel space is chosen based on the mean-squared error loss with -ridge regularization. GPR uses the kernel to define the covariance of -a prior distribution over the target functions and uses the observed training -data to define a likelihood function. Based on Bayes theorem, a (Gaussian) -posterior distribution over target functions is defined, whose mean is used -for prediction. - -A major difference is that GPR can choose the kernel's hyperparameters based -on gradient-ascent on the marginal likelihood function while KRR needs to -perform a grid search on a cross-validated loss function (mean-squared error -loss). A further difference is that GPR learns a generative, probabilistic -model of the target function and can thus provide meaningful confidence -intervals and posterior samples along with the predictions while KRR only -provides predictions. - -The following figure illustrates both methods on an artificial dataset, which -consists of a sinusoidal target function and strong noise. The figure compares -the learned model of KRR and GPR based on a ExpSineSquared kernel, which is -suited for learning periodic functions. The kernel's hyperparameters control -the smoothness (length_scale) and periodicity of the kernel (periodicity). -Moreover, the noise level -of the data is learned explicitly by GPR by an additional WhiteKernel component -in the kernel and by the regularization parameter alpha of KRR. - -.. 
figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_compare_gpr_krr_001.png - :target: ../auto_examples/gaussian_process/plot_compare_gpr_krr.html - :align: center - -The figure shows that both methods learn reasonable models of the target -function. GPR correctly identifies the periodicity of the function to be -roughly :math:`2*\pi` (6.28), while KRR chooses the doubled periodicity -:math:`4*\pi` . Besides -that, GPR provides reasonable confidence bounds on the prediction which are not -available for KRR. A major difference between the two methods is the time -required for fitting and predicting: while fitting KRR is fast in principle, -the grid-search for hyperparameter optimization scales exponentially with the -number of hyperparameters ("curse of dimensionality"). The gradient-based -optimization of the parameters in GPR does not suffer from this exponential -scaling and is thus considerably faster on this example with 3-dimensional -hyperparameter space. The time for predicting is similar; however, generating -the variance of the predictive distribution of GPR takes considerably longer -than just predicting the mean. - GPR on Mauna Loa CO2 data ------------------------- From bd281fa22747d46b9c67097e27623dfeff0e9a3c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 17 May 2021 17:37:56 +0200 Subject: [PATCH 06/40] remove exercise --- doc/modules/gaussian_process.rst | 67 +------------------------------- 1 file changed, 1 insertion(+), 66 deletions(-) diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst index d50886a55ad9e..99bef6020118b 100644 --- a/doc/modules/gaussian_process.rst +++ b/doc/modules/gaussian_process.rst @@ -79,72 +79,7 @@ the API of standard scikit-learn estimators, GaussianProcessRegressor: * :ref:`sphx_glr_auto_examples_gaussian_process_plot_gpr_noisy.py` * :ref:`sphx_glr_auto_examples_gaussian_process_plot_compare_gpr_krr.py` - -GPR examples -============ - -GPR on Mauna Loa CO2 data -------------------------- - -This example is based on Section 5.4.3 of [RW2006]_. -It illustrates an example of complex kernel engineering and -hyperparameter optimization using gradient ascent on the -log-marginal-likelihood. The data consists of the monthly average atmospheric -CO2 concentrations (in parts per million by volume (ppmv)) collected at the -Mauna Loa Observatory in Hawaii, between 1958 and 1997. The objective is to -model the CO2 concentration as a function of the time t. - -The kernel is composed of several terms that are responsible for explaining -different properties of the signal: - -- a long term, smooth rising trend is to be explained by an RBF kernel. The - RBF kernel with a large length-scale enforces this component to be smooth; - it is not enforced that the trend is rising which leaves this choice to the - GP. The specific length-scale and the amplitude are free hyperparameters. - -- a seasonal component, which is to be explained by the periodic - ExpSineSquared kernel with a fixed periodicity of 1 year. The length-scale - of this periodic component, controlling its smoothness, is a free parameter. - In order to allow decaying away from exact periodicity, the product with an - RBF kernel is taken. The length-scale of this RBF component controls the - decay time and is a further free parameter. 
- -- smaller, medium term irregularities are to be explained by a - RationalQuadratic kernel component, whose length-scale and alpha parameter, - which determines the diffuseness of the length-scales, are to be determined. - According to [RW2006]_, these irregularities can better be explained by - a RationalQuadratic than an RBF kernel component, probably because it can - accommodate several length-scales. - -- a "noise" term, consisting of an RBF kernel contribution, which shall - explain the correlated noise components such as local weather phenomena, - and a WhiteKernel contribution for the white noise. The relative amplitudes - and the RBF's length scale are further free parameters. - -Maximizing the log-marginal-likelihood after subtracting the target's mean -yields the following kernel with an LML of -83.214: - -:: - - 34.4**2 * RBF(length_scale=41.8) - + 3.27**2 * RBF(length_scale=180) * ExpSineSquared(length_scale=1.44, - periodicity=1) - + 0.446**2 * RationalQuadratic(alpha=17.7, length_scale=0.957) - + 0.197**2 * RBF(length_scale=0.138) + WhiteKernel(noise_level=0.0336) - -Thus, most of the target signal (34.4ppm) is explained by a long-term rising -trend (length-scale 41.8 years). The periodic component has an amplitude of -3.27ppm, a decay time of 180 years and a length-scale of 1.44. The long decay -time indicates that we have a locally very close to periodic seasonal -component. The correlated noise has an amplitude of 0.197ppm with a length -scale of 0.138 years and a white-noise contribution of 0.197ppm. Thus, the -overall noise level is very small, indicating that the data can be very well -explained by the model. The figure shows also that the model makes very -confident predictions until around 2015 - -.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_co2_001.png - :target: ../auto_examples/gaussian_process/plot_gpr_co2.html - :align: center + * :ref:`sphx_glr_auto_examples_gaussian_process_plot_fpr_co2.py` .. _gpc: From 7de916bce5695cb4259711333c49528645cc072b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 18 May 2021 12:31:06 +0200 Subject: [PATCH 07/40] iter --- examples/gaussian_process/plot_gpr_co2.py | 369 +++++++++++++--------- 1 file changed, 227 insertions(+), 142 deletions(-) diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py index 2cc751438cbd4..c4455fe4d2bf5 100644 --- a/examples/gaussian_process/plot_gpr_co2.py +++ b/examples/gaussian_process/plot_gpr_co2.py @@ -1,156 +1,241 @@ """ -======================================================== -Gaussian process regression (GPR) on Mauna Loa CO2 data. -======================================================== +======================================================= +Gaussian process regression (GPR) on Mauna Loa CO2 data +======================================================= This example is based on Section 5.4.3 of "Gaussian Processes for Machine -Learning" [RW2006]. It illustrates an example of complex kernel engineering and -hyperparameter optimization using gradient ascent on the +Learning" [RW2006]_. It illustrates an example of complex kernel engineering +and hyperparameter optimization using gradient ascent on the log-marginal-likelihood. The data consists of the monthly average atmospheric -CO2 concentrations (in parts per million by volume (ppmv)) collected at the -Mauna Loa Observatory in Hawaii, between 1958 and 2001. The objective is to -model the CO2 concentration as a function of the time t. 
- -The kernel is composed of several terms that are responsible for explaining -different properties of the signal: - -- a long term, smooth rising trend is to be explained by an RBF kernel. The - RBF kernel with a large length-scale enforces this component to be smooth; - it is not enforced that the trend is rising which leaves this choice to the - GP. The specific length-scale and the amplitude are free hyperparameters. - -- a seasonal component, which is to be explained by the periodic - ExpSineSquared kernel with a fixed periodicity of 1 year. The length-scale - of this periodic component, controlling its smoothness, is a free parameter. - In order to allow decaying away from exact periodicity, the product with an - RBF kernel is taken. The length-scale of this RBF component controls the - decay time and is a further free parameter. - -- smaller, medium term irregularities are to be explained by a - RationalQuadratic kernel component, whose length-scale and alpha parameter, - which determines the diffuseness of the length-scales, are to be determined. - According to [RW2006], these irregularities can better be explained by - a RationalQuadratic than an RBF kernel component, probably because it can - accommodate several length-scales. - -- a "noise" term, consisting of an RBF kernel contribution, which shall - explain the correlated noise components such as local weather phenomena, - and a WhiteKernel contribution for the white noise. The relative amplitudes - and the RBF's length scale are further free parameters. - -Maximizing the log-marginal-likelihood after subtracting the target's mean -yields the following kernel with an LML of -83.214:: - - 34.4**2 * RBF(length_scale=41.8) - + 3.27**2 * RBF(length_scale=180) * ExpSineSquared(length_scale=1.44, - periodicity=1) - + 0.446**2 * RationalQuadratic(alpha=17.7, length_scale=0.957) - + 0.197**2 * RBF(length_scale=0.138) + WhiteKernel(noise_level=0.0336) - -Thus, most of the target signal (34.4ppm) is explained by a long-term rising -trend (length-scale 41.8 years). The periodic component has an amplitude of -3.27ppm, a decay time of 180 years and a length-scale of 1.44. The long decay -time indicates that we have a locally very close to periodic seasonal -component. The correlated noise has an amplitude of 0.197ppm with a length -scale of 0.138 years and a white-noise contribution of 0.197ppm. Thus, the -overall noise level is very small, indicating that the data can be very well -explained by the model. The figure shows also that the model makes very -confident predictions until around 2015. +CO:math:`_{2}` concentrations (in parts per million by volume (ppm)) collected +at the Mauna Loa Observatory in Hawaii, between 1958 and 2001. The objective +is to model the CO:math:`_{2}` concentration as a function of the time +:math:`t` and extrapolate for years after 2001. + +.. topic: References + + .. [RW2006] `Rasmussen, Carl Edward. + "Gaussian processes in machine learning." + Summer school on machine learning. Springer, Berlin, Heidelberg, 2003 + `_. """ -# Authors: Jan Hendrik Metzen -# -# License: BSD 3 clause +print(__doc__) -import numpy as np +# Authors: Jan Hendrik Metzen +# Guillaume Lemaitre +# License: BSD 3 clause -from matplotlib import pyplot as plt +# %% +# Build the dataset +# ----------------- +# +# We will derive a dataset from the Mauna Loa Observatory that collected air +# samples. We are interested in estimating the concentration of CO:math:`_{2}` +# and extrapolate it for futher year. 
First, we load the original dataset +# available in OpenML. from sklearn.datasets import fetch_openml + +co2 = fetch_openml(data_id=41187, as_frame=True) +co2.frame.head() + +# %% +# First, we process the original dataframe to create a date index and select +# only the CO:math:`_{2}` column. +import pandas as pd + +co2_data = co2.frame +co2_data["date"] = pd.to_datetime(co2_data[["year", "month", "day"]]) +co2_data = co2_data[["date", "co2"]].set_index("date") +co2_data.head() + +# %% +co2_data.index.min(), co2_data.index.max() + +# %% +# We see that we get CO:math:`_{2}` concentration for some days from +# March, 1958 to December, 2001. We can plot these raw information to have a +# better understanding. +import matplotlib.pyplot as plt + +co2_data.plot() +plt.ylabel("CO$_2$ concentration (ppm)") +_ = plt.title("Raw air samples measurements from the Mauna Loa Observatory") + +# %% +# We will preprocess the dataset by taking a monthly average and drop month +# for which no measurements were collected. Such a processing will have an +# effect to smooth the data. +co2_data = co2_data.resample("M").mean().dropna(axis="index", how="any") +co2_data.plot() +plt.ylabel("Monthly average of CO$_2$ concentration (ppm)") +_ = plt.title( + "Monthly average of air samples measurements\n" + "from the Mauna Loa Observatory" +) + +# %% +# The idea in this example will be to predict the CO:math:`_{2}` concentration +# in function of the date. We are as well interesting in extrapolating for +# upcoming year after 2001. +# +# As a first step, we will divide the data and the target to estimate. The data +# being a date, we will convert it into a numeric. +X = (co2_data.index.year + co2_data.index.month / 12).to_numpy().reshape(-1, 1) +y = co2_data["co2"].to_numpy() + +# %% +plt.plot(X, y, label="Measurements") +plt.legend() +plt.xlabel("Year") +plt.ylabel("Monthly average of CO$_2$ concentration (ppm)") +_ = plt.title( + "Monthly average of air samples measurements\n" + "from the Mauna Loa Observatory" +) + +# %% +# Design the proper kernel +# ------------------------ +# +# To design the kernel to use with our Gaussian process, we can make some +# assumption regarding the data at hand. We observe that have several +# characteristics: we see a long term rising trend, a pronounced seasonal +# variation and some smaller irregularities. We can use different appropriate +# kernel that would capture these features. +# +# First, the long term rising trend could be fitted using a radial basis +# function (RBF) kernel with a large length-scale parameter. The RBF kernel +# with a large length-scale enforces this component to be smooth; it is not +# enforced that the trend is rising which leaves this choice to our model. The +# specific length-scale and the amplitude are free hyperparameters. +from sklearn.gaussian_process.kernels import RBF + +long_term_trend_kernel = 50.0 ** 2 * RBF(length_scale=50.0) + +# %% +# The seasonal variation is explained by the periodic exponential sine squared +# kernel with a fixed periodicity of 1 year. The length-scale of this periodic +# component, controlling its smoothness, is a free parameter. In order to allow +# decaying away from exact periodicity, the product with an RBF kernel is +# taken. The length-scale of this RBF component controls the decay time and is +# a further free parameter. This type of kernel is also known as locally +# periodic kernel. 
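# In its explicit form (a note added here for clarity), this locally periodic
# kernel is the product of an RBF term and a periodic term:
#
# .. math::
#     k(x_i, x_j) = \sigma^2
#     \exp\left( -\frac{d(x_i, x_j)^2}{2 l_{\mathrm{rbf}}^2} \right)
#     \exp\left( -\frac{2 \sin^2(\pi d(x_i, x_j) / p)}{l_{\mathrm{per}}^2} \right),
#
# where :math:`p` is fixed to one year and where the amplitude :math:`\sigma^2`
# and the two length-scales are free hyperparameters.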
+from sklearn.gaussian_process.kernels import ExpSineSquared + +seasonal_kernel = ( + 2.0 ** 2 + * RBF(length_scale=100.0) + * ExpSineSquared( + length_scale=1.0, periodicity=1.0, periodicity_bounds="fixed" + ) +) + +# %% +# The small irregularities are to be explained by a rational quadratic kernel +# component, whose length-scale and alpha parameter, which determines the +# diffuseness of the length-scales, are to be determined. A rational quadratic +# kernel is equivalent to an RBF kernel with several length-scale and will +# better accommodate the different irregularities. +from sklearn.gaussian_process.kernels import RationalQuadratic + +irregularities_kernel = 0.5 ** 2 * RationalQuadratic( + length_scale=1.0, alpha=1.0 +) + +# %% +# Finally, the noise in the dataset can be accounted with a kernel consisting +# of an RBF kernel contribution, which shall explain the correlated noise +# components such as local weather phenomena, and a white kernel contribution +# for the white noise. The relative amplitudes and the RBF's length scale are +# further free parameters. +from sklearn.gaussian_process.kernels import WhiteKernel + +noise_kernel = 0.1 ** 2 * RBF(length_scale=0.1) + WhiteKernel( + noise_level=0.1 ** 2, noise_level_bounds=(1e-5, 1e5) +) + +# %% +# Thus, our final kernel is an addition of all previous kernel. +co2_kernel = ( + long_term_trend_kernel + + seasonal_kernel + + irregularities_kernel + + noise_kernel +) +co2_kernel + +# %% +# Model fitting and extrapolation +# ------------------------------- +# +# Now, we are ready to use a Gaussian process regressor and fit the available +# data. To follow the example from the literature, we will subtract the mean +# from the target. We could have use `normalize_y=True`. However, doing so +# would have also scale the target (dividing `y` by its standard deviation). +# Thus, the hyperparameters of the different kernel would have different +# meaning since they would not be express in ppm units. from sklearn.gaussian_process import GaussianProcessRegressor -from sklearn.gaussian_process.kernels \ - import RBF, WhiteKernel, RationalQuadratic, ExpSineSquared -print(__doc__) +y_mean = y.mean() +gaussian_process = GaussianProcessRegressor( + kernel=co2_kernel, normalize_y=False +) +gaussian_process.fit(X, y - y_mean) +# %% +# Now, we will use the Gaussian process to predict on: +# +# - training data to inspect the goodness of fit; +# - future data to see the extrapolation done by the model. +# +# Thus, we create synthetic data from 1958 to the current month. In addition, +# we need to add the subtracted mean computed during training. 
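# (Note added for clarity: the model was fitted on `y - y_mean`, so its
# predictions are centered around zero and `y_mean` must be added back to
# recover concentrations expressed in ppm.)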
import datetime
import numpy as np

today = datetime.datetime.now()
current_month = today.year + today.month / 12
X_test = np.linspace(start=1958, stop=current_month, num=1_000).reshape(-1, 1)
mean_y_pred, std_y_pred = gaussian_process.predict(X_test, return_std=True)
mean_y_pred += y_mean

# %%
plt.plot(X, y, color="black", linestyle="dashed", label="Measurements")
plt.plot(
    X_test, mean_y_pred, color="tab:blue", alpha=0.4, label="Gaussian process"
)
plt.fill_between(
    X_test.ravel(),
    mean_y_pred - std_y_pred,
    mean_y_pred + std_y_pred,
    color="tab:blue",
    alpha=0.2,
)
plt.legend()
plt.xlabel("Year")
plt.ylabel("Monthly average of CO$_2$ concentration (ppm)")
_ = plt.title(
    "Monthly average of air samples measurements\n"
    "from the Mauna Loa Observatory"
)

# %%
# Our fitted model is able to fit the historical data properly and to
# extrapolate to future years with confidence.
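# %%
# As an additional check (a small sketch added here for illustration; it is
# not part of the original example), we can quantify the quality of the fit on
# the training period by comparing the mean prediction with the observed
# concentrations.
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y, gaussian_process.predict(X) + y_mean)
print(f"Mean absolute error on the training period: {mae:.3f} ppm")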
+# +# Interpretation of kernel hyperparameters +# ---------------------------------------- +# +# Now, we can have a look at the hyperparameters of the kernel. +gaussian_process.kernel_ + +# %% +# Thus, most of the target signal, with the mean substracted, is explained by a +# long-term rising trend for ~45 ppm and a length-scale of ~52 years. The +# periodic component has an amplitude of ~2.6ppm, a decay time of ~90 years and +# a length-scale of ~1.5. The long decay time indicates that we have a locally +# very close to periodic seasonal component. The correlated noise has an +# amplitude of ~0.2 ppm with a length scale of ~0.12 years and a white-noise +# contribution of ~0.04 ppm. Thus, the overall noise level is very small, +# indicating that the data can be very well explained by the model. From 8e907506a8ec09bb6026b23d8e3a1b73a7b15f25 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 18 May 2021 13:27:12 +0200 Subject: [PATCH 08/40] iter --- doc/modules/gaussian_process.rst | 2 +- examples/gaussian_process/plot_gpr_co2.py | 28 +++++++++++------------ 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst index 99bef6020118b..56236d52a568d 100644 --- a/doc/modules/gaussian_process.rst +++ b/doc/modules/gaussian_process.rst @@ -79,7 +79,7 @@ the API of standard scikit-learn estimators, GaussianProcessRegressor: * :ref:`sphx_glr_auto_examples_gaussian_process_plot_gpr_noisy.py` * :ref:`sphx_glr_auto_examples_gaussian_process_plot_compare_gpr_krr.py` - * :ref:`sphx_glr_auto_examples_gaussian_process_plot_fpr_co2.py` + * :ref:`sphx_glr_auto_examples_gaussian_process_plot_gpr_co2.py` .. _gpc: diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py index c4455fe4d2bf5..c4d9061f0f147 100644 --- a/examples/gaussian_process/plot_gpr_co2.py +++ b/examples/gaussian_process/plot_gpr_co2.py @@ -7,10 +7,10 @@ Learning" [RW2006]_. It illustrates an example of complex kernel engineering and hyperparameter optimization using gradient ascent on the log-marginal-likelihood. The data consists of the monthly average atmospheric -CO:math:`_{2}` concentrations (in parts per million by volume (ppm)) collected -at the Mauna Loa Observatory in Hawaii, between 1958 and 2001. The objective -is to model the CO:math:`_{2}` concentration as a function of the time -:math:`t` and extrapolate for years after 2001. +CO2 concentrations (in parts per million by volume (ppm)) collected at the +Mauna Loa Observatory in Hawaii, between 1958 and 2001. The objective is to +model the CO2 concentration as a function of the time :math:`t` and extrapolate +for years after 2001. .. topic: References @@ -31,9 +31,9 @@ # ----------------- # # We will derive a dataset from the Mauna Loa Observatory that collected air -# samples. We are interested in estimating the concentration of CO:math:`_{2}` -# and extrapolate it for futher year. First, we load the original dataset -# available in OpenML. +# samples. We are interested in estimating the concentration of CO2 and +# extrapolate it for futher year. First, we load the original dataset available +# in OpenML. from sklearn.datasets import fetch_openml co2 = fetch_openml(data_id=41187, as_frame=True) @@ -41,7 +41,7 @@ # %% # First, we process the original dataframe to create a date index and select -# only the CO:math:`_{2}` column. +# only the CO2 column. 
import pandas as pd co2_data = co2.frame @@ -53,9 +53,9 @@ co2_data.index.min(), co2_data.index.max() # %% -# We see that we get CO:math:`_{2}` concentration for some days from -# March, 1958 to December, 2001. We can plot these raw information to have a -# better understanding. +# We see that we get CO2 concentration for some days from March, 1958 to +# December, 2001. We can plot these raw information to have a better +# understanding. import matplotlib.pyplot as plt co2_data.plot() @@ -75,9 +75,9 @@ ) # %% -# The idea in this example will be to predict the CO:math:`_{2}` concentration -# in function of the date. We are as well interesting in extrapolating for -# upcoming year after 2001. +# The idea in this example will be to predict the CO2 concentration in function +# of the date. We are as well interesting in extrapolating for upcoming year +# after 2001. # # As a first step, we will divide the data and the target to estimate. The data # being a date, we will convert it into a numeric. From 37d1cefd4aca92b43281d1529ef5f868109ed4c0 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 18 May 2021 13:28:12 +0200 Subject: [PATCH 09/40] iter --- examples/gaussian_process/plot_gpr_co2.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py index c4d9061f0f147..8520b6f916487 100644 --- a/examples/gaussian_process/plot_gpr_co2.py +++ b/examples/gaussian_process/plot_gpr_co2.py @@ -84,16 +84,6 @@ X = (co2_data.index.year + co2_data.index.month / 12).to_numpy().reshape(-1, 1) y = co2_data["co2"].to_numpy() -# %% -plt.plot(X, y, label="Measurements") -plt.legend() -plt.xlabel("Year") -plt.ylabel("Monthly average of CO$_2$ concentration (ppm)") -_ = plt.title( - "Monthly average of air samples measurements\n" - "from the Mauna Loa Observatory" -) - # %% # Design the proper kernel # ------------------------ From d0739269e3a2886b2e5ff926a131cb80cb3fe420 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 18 May 2021 18:37:51 +0200 Subject: [PATCH 10/40] iter --- .../plot_gpr_noisy_targets.py | 222 +++++++++++------- 1 file changed, 133 insertions(+), 89 deletions(-) diff --git a/examples/gaussian_process/plot_gpr_noisy_targets.py b/examples/gaussian_process/plot_gpr_noisy_targets.py index 50992c19337b3..51b07b4bd3a26 100644 --- a/examples/gaussian_process/plot_gpr_noisy_targets.py +++ b/examples/gaussian_process/plot_gpr_noisy_targets.py @@ -11,103 +11,147 @@ In both cases, the kernel's parameters are estimated using the maximum likelihood principle. -The figures illustrate the interpolating property of the Gaussian Process -model as well as its probabilistic nature in the form of a pointwise 95% -confidence interval. +The figures illustrate the interpolating property of the Gaussian Process model +as well as its probabilistic nature in the form of a pointwise 95% confidence +interval. -Note that the parameter ``alpha`` is applied as a Tikhonov -regularization of the assumed covariance between the training points. +Note that the parameter `alpha` is applied as a Tikhonov regularization of the +assumed covariance between the training points. """ print(__doc__) # Author: Vincent Dubourg # Jake Vanderplas -# Jan Hendrik Metzen s +# Jan Hendrik Metzen +# Guillaume Lemaitre # License: BSD 3 clause +# %% +# Dataset generation +# ------------------ +# +# We will start by generating a synthetic dataset. The true generative process +# is defined as :math:`f(x) = x \sin(x)`. 
import numpy as np -from matplotlib import pyplot as plt +X = np.linspace(start=0, stop=10, num=1_000).reshape(-1, 1) +y = np.squeeze(X * np.sin(X)) + +# %% +import matplotlib.pyplot as plt + +plt.plot(X, y, label=r"$f(x) = x \sin(x)$", linestyle="dotted") +plt.legend() +plt.xlabel("$x$") +plt.ylabel("$f(x)$") +_ = plt.title("True generative process") + +# %% +# We will use this dataset in the next experiment to illustrate how Gaussian +# process regression is working. +# +# Example with noise-free target +# ------------------------------ +# +# In this first example, we will use the true generative process without +# adding any noise. For training the Gaussian process regression, we will only +# select few samples. +rng = np.random.RandomState(1) +training_indices = rng.choice(np.arange(y.size), size=6, replace=False) +X_train, y_train = X[training_indices], y[training_indices] + +# %% +# Now, we fit a Gaussian process on these few training data samples. We will +# use a radial basis function (RBF) kernel and a constant parameter to fit the +# amplitude. from sklearn.gaussian_process import GaussianProcessRegressor -from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C - -np.random.seed(1) - - -def f(x): - """The function to predict.""" - return x * np.sin(x) - -# ---------------------------------------------------------------------- -# First the noiseless case -X = np.atleast_2d([1., 3., 5., 6., 7., 8.]).T - -# Observations -y = f(X).ravel() - -# Mesh the input space for evaluations of the real function, the prediction and -# its MSE -x = np.atleast_2d(np.linspace(0, 10, 1000)).T - -# Instantiate a Gaussian Process model -kernel = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2)) -gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9) - -# Fit to data using Maximum Likelihood Estimation of the parameters -gp.fit(X, y) - -# Make the prediction on the meshed x-axis (ask for MSE as well) -y_pred, sigma = gp.predict(x, return_std=True) - -# Plot the function, the prediction and the 95% confidence interval based on -# the MSE -plt.figure() -plt.plot(x, f(x), 'r:', label=r'$f(x) = x\,\sin(x)$') -plt.plot(X, y, 'r.', markersize=10, label='Observations') -plt.plot(x, y_pred, 'b-', label='Prediction') -plt.fill(np.concatenate([x, x[::-1]]), - np.concatenate([y_pred - 1.9600 * sigma, - (y_pred + 1.9600 * sigma)[::-1]]), - alpha=.5, fc='b', ec='None', label='95% confidence interval') -plt.xlabel('$x$') -plt.ylabel('$f(x)$') -plt.ylim(-10, 20) -plt.legend(loc='upper left') - -# ---------------------------------------------------------------------- -# now the noisy case -X = np.linspace(0.1, 9.9, 20) -X = np.atleast_2d(X).T - -# Observations and noise -y = f(X).ravel() -dy = 0.5 + 1.0 * np.random.random(y.shape) -noise = np.random.normal(0, dy) -y += noise - -# Instantiate a Gaussian Process model -gp = GaussianProcessRegressor(kernel=kernel, alpha=dy ** 2, - n_restarts_optimizer=10) - -# Fit to data using Maximum Likelihood Estimation of the parameters -gp.fit(X, y) - -# Make the prediction on the meshed x-axis (ask for MSE as well) -y_pred, sigma = gp.predict(x, return_std=True) - -# Plot the function, the prediction and the 95% confidence interval based on -# the MSE -plt.figure() -plt.plot(x, f(x), 'r:', label=r'$f(x) = x\,\sin(x)$') -plt.errorbar(X.ravel(), y, dy, fmt='r.', markersize=10, label='Observations') -plt.plot(x, y_pred, 'b-', label='Prediction') -plt.fill(np.concatenate([x, x[::-1]]), - np.concatenate([y_pred - 1.9600 * sigma, - (y_pred + 1.9600 * sigma)[::-1]]), 
- alpha=.5, fc='b', ec='None', label='95% confidence interval') -plt.xlabel('$x$') -plt.ylabel('$f(x)$') -plt.ylim(-10, 20) -plt.legend(loc='upper left') - -plt.show() +from sklearn.gaussian_process.kernels import RBF + +kernel = 1 * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2)) +gaussian_process = GaussianProcessRegressor( + kernel=kernel, n_restarts_optimizer=9 +) +gaussian_process.fit(X_train, y_train) +gaussian_process.kernel_ + +# %% +# After fitting our model, we see that the hyperparemeters' kernel have been +# optimized. Now, we will use our kernel to compute the mean prediction of the +# full dataset and plot the 95% confidence interval. +mean_prediction, std_prediction = gaussian_process.predict(X, return_std=True) + +plt.plot(X, y, label=r"$f(x) = x \sin(x)$", linestyle="dotted") +plt.scatter(X_train, y_train, label="Observations") +plt.plot(X, mean_prediction, label="Mean prediction") +plt.fill_between( + X.ravel(), + mean_prediction - 1.96 * std_prediction, + mean_prediction + 1.96 * std_prediction, + alpha=0.5, + label=r"95% confidence interval", +) +plt.legend() +plt.xlabel("$x$") +plt.ylabel("$f(x)$") +_ = plt.title("Gaussian process regression on noise-free dataset") + +# %% +# We see that for a prediction close to a training sample, the 95% confidence +# interval is small. Whenever a prediction is done far from training data, our +# model become interval and the confidence interval is large. +# +# Example with noisy targets +# -------------------------- +# +# We can repeat a similar experiment but this time we will add an additional +# noise to the target. It will allow us to see the effect of the noise on the +# fitted model. +# +# We define the noise standard deviation and add it to the original training +# target +dy = 0.5 + 1.0 * rng.random(y_train.shape) +y_train_noisy = y_train + rng.normal(0, dy) + +# %% +# We create a similar Gaussian process model. In addition to the kernel, this +# time, we specify the parameter `alpha` that is related to the variance of the +# noise. +gaussian_process = GaussianProcessRegressor( + kernel=kernel, alpha=dy ** 2, n_restarts_optimizer=9 +) +gaussian_process.fit(X_train, y_train_noisy) +mean_prediction, std_prediction = gaussian_process.predict(X, return_std=True) + +# %% +# Let's plot the mean prediction and the confidence interval as before. +plt.plot(X, y, label=r"$f(x) = x \sin(x)$", linestyle="dotted") +plt.errorbar( + X_train, + y_train, + dy, + linestyle="None", + color="tab:blue", + marker=".", + markersize=10, + label="Observations", +) +plt.plot(X, mean_prediction, label="Mean prediction") +plt.fill_between( + X.ravel(), + mean_prediction - 1.96 * std_prediction, + mean_prediction + 1.96 * std_prediction, + color="tab:orange", + alpha=0.5, + label=r"95% confidence interval", +) +plt.legend() +plt.xlabel("$x$") +plt.ylabel("$f(x)$") +_ = plt.title("Gaussian process regression on a noisy dataset") + +# %% +# The noise affect the predictions close to the training samples: now, the +# confidence interval close to the training samples is larger because we +# indicated the amount of noise for each training samples. As a result, the +# confidence interval is larger on this noisy dataset than on the previous +# example. 
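An alternative to passing the noise through `alpha`, mentioned in the user
guide, is to add a :class:`~sklearn.gaussian_process.kernels.WhiteKernel`
component so that a single global noise level is estimated from the data. A
minimal sketch, reusing the `X_train` and `y_train_noisy` arrays defined in
this example::

    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import RBF, WhiteKernel

    kernel = 1 * RBF(length_scale=1.0) + WhiteKernel(noise_level=1.0)
    gpr = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9)
    gpr.fit(X_train, y_train_noisy)

    # The optimized `noise_level` of the WhiteKernel approximates the average
    # variance of the noise that was added to the targets.
    print(gpr.kernel_)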
From 1a674e5e4363a97e478a81316b3ce75d88ba21d4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 18 May 2021 18:51:12 +0200 Subject: [PATCH 11/40] iter --- doc/modules/gaussian_process.rst | 56 ++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst index 56236d52a568d..bb969e59c5925 100644 --- a/doc/modules/gaussian_process.rst +++ b/doc/modules/gaussian_process.rst @@ -40,28 +40,41 @@ Gaussian Process Regression (GPR) .. currentmodule:: sklearn.gaussian_process The :class:`GaussianProcessRegressor` implements Gaussian processes (GP) for -regression purposes. For this, the prior of the GP needs to be specified. The -prior mean is assumed to be constant and zero (for ``normalize_y=False``) or the -training data's mean (for ``normalize_y=True``). The prior's -covariance is specified by passing a :ref:`kernel ` object. The -hyperparameters of the kernel are optimized during fitting of -:class:`GaussianProcessRegressor` by maximizing the log-marginal-likelihood (LML) based -on the passed ``optimizer``. As the LML may have multiple local optima, the -optimizer can be started repeatedly by specifying ``n_restarts_optimizer``. The -first run is always conducted starting from the initial hyperparameter values -of the kernel; subsequent runs are conducted from hyperparameter values -that have been chosen randomly from the range of allowed values. -If the initial hyperparameters should be kept fixed, `None` can be passed as -optimizer. +regression purposes. For this, the prior of the GP needs to be specified. GP +will combine this prior and the likelihood function based on training samples. +It allows to give a probabilistic approach to prediction by giving the mean and +standard deviation as output when predicting. -The noise level in the targets can be specified by passing it via the -parameter ``alpha``, either globally as a scalar or per datapoint. -Note that a moderate noise level can also be helpful for dealing with numeric -issues during fitting as it is effectively implemented as Tikhonov -regularization, i.e., by adding it to the diagonal of the kernel matrix. An -alternative to specifying the noise level explicitly is to include a -:class:`~sklearn.gaussian_process.kernels.WhiteKernel` component into the kernel, which can estimate the global noise -level from the data (see example below). +.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_targets_001.png + :target: ../auto_examples/gaussian_process/plot_gpr_noisy_targets.html + :align: center + +The prior mean is assumed to be constant and zero (for `normalize_y=False`) or +the training data's mean (for `normalize_y=True`). The prior's covariance is +specified by passing a :ref:`kernel ` object. The hyperparameters +of the kernel are optimized during fitting of :class:`GaussianProcessRegressor` +by maximizing the log-marginal-likelihood (LML) based on the passed +`optimizer`. As the LML may have multiple local optima, the optimizer can be +started repeatedly by specifying `n_restarts_optimizer`. The first run is +always conducted starting from the initial hyperparameter values of the kernel; +subsequent runs are conducted from hyperparameter values that have been chosen +randomly from the range of allowed values. If the initial hyperparameters +should be kept fixed, `None` can be passed as optimizer. 
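As a rough illustration of these two options, one could write the following
sketch (the kernel here is arbitrary and not taken from the examples)::

    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import RBF

    kernel = 1.0 * RBF(length_scale=1.0)

    # Restart the LML optimization from 10 random initializations in addition
    # to the first run started at the initial hyperparameter values.
    gpr_restarted = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10)

    # Keep the initial hyperparameters fixed: no optimization is performed.
    gpr_fixed = GaussianProcessRegressor(kernel=kernel, optimizer=None)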
+ +The noise level in the targets can be specified by passing it via the parameter +`alpha`, either globally as a scalar or per datapoint. Note that a moderate +noise level can also be helpful for dealing with numeric issues during fitting +as it is effectively implemented as Tikhonov regularization, i.e., by adding it +to the diagonal of the kernel matrix. An alternative to specifying the noise +level explicitly is to include a +:class:`~sklearn.gaussian_process.kernels.WhiteKernel` component into the +kernel, which can estimate the global noise level from the data (see example +below). The figure below shows the effect of noisy target handled by setting +the parameter `alpha`. + +.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_targets_002.png + :target: ../auto_examples/gaussian_process/plot_gpr_noisy_targets.html + :align: center The implementation is based on Algorithm 2.1 of [RW2006]_. In addition to the API of standard scikit-learn estimators, GaussianProcessRegressor: @@ -77,6 +90,7 @@ the API of standard scikit-learn estimators, GaussianProcessRegressor: .. topic:: Examples + * :ref:`sphx_glr_auto_examples_gaussian_process_plot_gpr_noisy_target.py` * :ref:`sphx_glr_auto_examples_gaussian_process_plot_gpr_noisy.py` * :ref:`sphx_glr_auto_examples_gaussian_process_plot_compare_gpr_krr.py` * :ref:`sphx_glr_auto_examples_gaussian_process_plot_gpr_co2.py` From d08a09cba4c27f8cd30d1f303cf06bb7e659e182 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 14:23:00 +0200 Subject: [PATCH 12/40] Apply suggestions from code review Co-authored-by: Julien Jerphanion --- doc/modules/gaussian_process.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst index bb969e59c5925..a773d50a5a472 100644 --- a/doc/modules/gaussian_process.rst +++ b/doc/modules/gaussian_process.rst @@ -52,7 +52,7 @@ standard deviation as output when predicting. The prior mean is assumed to be constant and zero (for `normalize_y=False`) or the training data's mean (for `normalize_y=True`). The prior's covariance is specified by passing a :ref:`kernel ` object. The hyperparameters -of the kernel are optimized during fitting of :class:`GaussianProcessRegressor` +of the kernel are optimized when fitting the :class:`GaussianProcessRegressor` by maximizing the log-marginal-likelihood (LML) based on the passed `optimizer`. As the LML may have multiple local optima, the optimizer can be started repeatedly by specifying `n_restarts_optimizer`. The first run is @@ -77,7 +77,7 @@ the parameter `alpha`. :align: center The implementation is based on Algorithm 2.1 of [RW2006]_. 
In addition to -the API of standard scikit-learn estimators, GaussianProcessRegressor: +the API of standard scikit-learn estimators, :class:`GaussianProcessRegressor`: * allows prediction without prior fitting (based on the GP prior) From 7c2d3a6def3f85aba1acbabd4bc1eeae19058b9c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 14:29:14 +0200 Subject: [PATCH 13/40] Update examples/gaussian_process/plot_compare_gpr_krr.py Co-authored-by: Julien Jerphanion --- examples/gaussian_process/plot_compare_gpr_krr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gaussian_process/plot_compare_gpr_krr.py b/examples/gaussian_process/plot_compare_gpr_krr.py index 33d30237294b2..a0802c51a456f 100644 --- a/examples/gaussian_process/plot_compare_gpr_krr.py +++ b/examples/gaussian_process/plot_compare_gpr_krr.py @@ -7,7 +7,7 @@ Gaussian process regression. Both kernel ridge regression and Gaussian process regression are using a -so-called "kernel trick" to make their models expressive enough to not underfit +so-called "kernel trick" to make their models expressive enough to fit the training data. However, the machine learning problems solved by the two methods are drastically different. From 83cb2152082747b9caebccfbdcf38cb028a9286f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 14:29:22 +0200 Subject: [PATCH 14/40] Update examples/gaussian_process/plot_compare_gpr_krr.py Co-authored-by: Julien Jerphanion --- examples/gaussian_process/plot_compare_gpr_krr.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/gaussian_process/plot_compare_gpr_krr.py b/examples/gaussian_process/plot_compare_gpr_krr.py index a0802c51a456f..9c6b617d081fc 100644 --- a/examples/gaussian_process/plot_compare_gpr_krr.py +++ b/examples/gaussian_process/plot_compare_gpr_krr.py @@ -15,10 +15,10 @@ function (the mean squared error). Instead of finding a single target function, the Gaussian process regression -employs a probabilistic approach. It gives prior probabilities to each possible -target functions and combines it with a likelihood function defined by the -observed training data. Thus, a Gaussian posterior distribution over target -functions is defined based on the Bayes' theorem. +employs a probabilistic approach : a Gaussian posterior distribution over target +functions is defined based on the Bayes' theorem, Thus prior probabilities on +target functions are being combined with a likelihood function defined by the +observed training data to provide estimates of the posterior distributions. We will illustrate with an example these differences and we will also focus on the tuning of the kernel hyperparameters. From c965bdb69c034b529bf83387a20e140216de8242 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 14:29:49 +0200 Subject: [PATCH 15/40] Update examples/gaussian_process/plot_compare_gpr_krr.py Co-authored-by: Julien Jerphanion --- examples/gaussian_process/plot_compare_gpr_krr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/gaussian_process/plot_compare_gpr_krr.py b/examples/gaussian_process/plot_compare_gpr_krr.py index 9c6b617d081fc..885935348e5f2 100644 --- a/examples/gaussian_process/plot_compare_gpr_krr.py +++ b/examples/gaussian_process/plot_compare_gpr_krr.py @@ -20,8 +20,8 @@ target functions are being combined with a likelihood function defined by the observed training data to provide estimates of the posterior distributions. 
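In terms of the scikit-learn API, the contrast can be sketched on toy data.
This snippet is only illustrative and does not use the dataset built later in
the example::

    import numpy as np
    from sklearn.kernel_ridge import KernelRidge
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import ExpSineSquared, WhiteKernel

    rng = np.random.RandomState(0)
    X_toy = rng.uniform(0, 5, size=(40, 1))
    y_toy = np.sin(X_toy).ravel() + rng.normal(scale=0.3, size=X_toy.shape[0])

    # Kernel ridge returns a single target function and point predictions.
    y_krr = KernelRidge(kernel="rbf").fit(X_toy, y_toy).predict(X_toy)

    # The Gaussian process returns the mean of a posterior distribution over
    # target functions and can also report its standard deviation.
    gpr = GaussianProcessRegressor(kernel=ExpSineSquared() + WhiteKernel())
    y_gpr, y_std = gpr.fit(X_toy, y_toy).predict(X_toy, return_std=True)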
-We will illustrate with an example these differences and we will also focus on -the tuning of the kernel hyperparameters. +We will illustrate these differences with an example and we will also focus on +tuning the kernel hyperparameters. """ print(__doc__) From a20e370e3bea93d70e54ccf6bf7f7e6af373ab74 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 14:29:56 +0200 Subject: [PATCH 16/40] Update examples/gaussian_process/plot_gpr_co2.py Co-authored-by: Julien Jerphanion --- examples/gaussian_process/plot_gpr_co2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py index 8520b6f916487..9eab37fb85edb 100644 --- a/examples/gaussian_process/plot_gpr_co2.py +++ b/examples/gaussian_process/plot_gpr_co2.py @@ -89,7 +89,7 @@ # ------------------------ # # To design the kernel to use with our Gaussian process, we can make some -# assumption regarding the data at hand. We observe that have several +# assumption regarding the data at hand. We observe that they have several # characteristics: we see a long term rising trend, a pronounced seasonal # variation and some smaller irregularities. We can use different appropriate # kernel that would capture these features. From 9bedda040718f634ffbdef6b1f91a4c15c841c1c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 14:30:02 +0200 Subject: [PATCH 17/40] Update examples/gaussian_process/plot_gpr_co2.py Co-authored-by: Julien Jerphanion --- examples/gaussian_process/plot_gpr_co2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py index 9eab37fb85edb..95cadf5f0284e 100644 --- a/examples/gaussian_process/plot_gpr_co2.py +++ b/examples/gaussian_process/plot_gpr_co2.py @@ -96,8 +96,8 @@ # # First, the long term rising trend could be fitted using a radial basis # function (RBF) kernel with a large length-scale parameter. The RBF kernel -# with a large length-scale enforces this component to be smooth; it is not -# enforced that the trend is rising which leaves this choice to our model. The +# with a large length-scale enforces this component to be smooth. An trending +# increase is not enforced as to give a degree of freedom to our model. The # specific length-scale and the amplitude are free hyperparameters. from sklearn.gaussian_process.kernels import RBF From a2e40572a4a5fbb8ef9a5e613c274bdb4d7decea Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 14:30:07 +0200 Subject: [PATCH 18/40] Update examples/gaussian_process/plot_gpr_co2.py Co-authored-by: Julien Jerphanion --- examples/gaussian_process/plot_gpr_co2.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py index 95cadf5f0284e..d0874230176bb 100644 --- a/examples/gaussian_process/plot_gpr_co2.py +++ b/examples/gaussian_process/plot_gpr_co2.py @@ -161,10 +161,10 @@ # # Now, we are ready to use a Gaussian process regressor and fit the available # data. To follow the example from the literature, we will subtract the mean -# from the target. We could have use `normalize_y=True`. However, doing so -# would have also scale the target (dividing `y` by its standard deviation). -# Thus, the hyperparameters of the different kernel would have different -# meaning since they would not be express in ppm units. +# from the target. 
We could have used `normalize_y=True`. However, doing so +# would have also scaled the target (dividing `y` by its standard deviation). +# Thus, the hyperparameters of the different kernel would have had different +# meaning since they would not have been expressed in ppm. from sklearn.gaussian_process import GaussianProcessRegressor y_mean = y.mean() From 76a4a1144a4da0d1fc633cff535ff6cb0f77934c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 14:30:17 +0200 Subject: [PATCH 19/40] Update examples/gaussian_process/plot_gpr_co2.py Co-authored-by: Julien Jerphanion --- examples/gaussian_process/plot_gpr_co2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py index d0874230176bb..fafd0393a4d31 100644 --- a/examples/gaussian_process/plot_gpr_co2.py +++ b/examples/gaussian_process/plot_gpr_co2.py @@ -123,7 +123,7 @@ # %% # The small irregularities are to be explained by a rational quadratic kernel -# component, whose length-scale and alpha parameter, which determines the +# component, whose length-scale and alpha parameter, which quantifies the # diffuseness of the length-scales, are to be determined. A rational quadratic # kernel is equivalent to an RBF kernel with several length-scale and will # better accommodate the different irregularities. From d0a439621bffff0aa65b68d7c11a2650a36deb68 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 14:30:30 +0200 Subject: [PATCH 20/40] Update examples/gaussian_process/plot_compare_gpr_krr.py Co-authored-by: Julien Jerphanion --- examples/gaussian_process/plot_compare_gpr_krr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gaussian_process/plot_compare_gpr_krr.py b/examples/gaussian_process/plot_compare_gpr_krr.py index 885935348e5f2..202761502a039 100644 --- a/examples/gaussian_process/plot_compare_gpr_krr.py +++ b/examples/gaussian_process/plot_compare_gpr_krr.py @@ -96,7 +96,7 @@ _ = plt.title("Limitation of a linear model such as ridge") # %% -# Such a ridge regressor underfits since it is not enough expressive. +# Such a ridge regressor underfits data since it is not expressive enough. # # Kernel methods: kernel ridge and Gaussian process # ------------------------------------------------- From e1467af6ecc4c2c4654e79b0f7b65171c87fb61e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 14:31:15 +0200 Subject: [PATCH 21/40] Apply suggestions from code review Co-authored-by: Julien Jerphanion --- examples/gaussian_process/plot_compare_gpr_krr.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/gaussian_process/plot_compare_gpr_krr.py b/examples/gaussian_process/plot_compare_gpr_krr.py index 202761502a039..14bb166f82d77 100644 --- a/examples/gaussian_process/plot_compare_gpr_krr.py +++ b/examples/gaussian_process/plot_compare_gpr_krr.py @@ -105,18 +105,18 @@ # ............ # # We can make the previous linear model more expressive by using a so-called -# kernel. A kernel is used to make a model more expressive; for simplicity, it -# would be equivalent to project our original data into an more complex feature -# space. This new space is defined by the choice of kernel. +# kernel. A kernel is an embedding from the original feature space to another one. +# Simply put, it is used to map our original data into a newer and more complex +# feature space. This new space is explicitly defined by the choice of kernel. 
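Evaluating a kernel object directly makes this mapping concrete: calling the
kernel on the data returns the matrix of pairwise similarities, i.e. the dot
products in the implicit feature space, without ever materializing that space.
A short sketch with the :class:`~sklearn.gaussian_process.kernels.ExpSineSquared`
kernel discussed just below (hyperparameters chosen arbitrarily)::

    import numpy as np
    from sklearn.gaussian_process.kernels import ExpSineSquared

    kernel = ExpSineSquared(length_scale=1.0, periodicity=3.0)
    X_demo = np.linspace(0, 6, num=5).reshape(-1, 1)

    # Entry (i, j) is the similarity between samples i and j in the feature
    # space implicitly defined by the kernel.
    print(kernel(X_demo))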
# # In our case, we know that the true generative process is a periodic function. -# We can use a :class:`~sklearn.gaussian_process.kernels.ExpSineSquared`. This -# kernel allows to fit such periodicity. The class +# We can use a :class:`~sklearn.gaussian_process.kernels.ExpSineSquared` kernel +# which allows recovering the periodicity. The class # :class:`~sklearn.kernel_ridge.KernelRidge` will accept such a kernel. # -# Using this model together with a kernel is equivalent to project the data +# Using this model together with a kernel is equivalent to embed the data # using the mapping function of the kernel and then apply a ridge regression. -# In practice, the data are not mapped explicitely; instead the dot product +# In practice, the data are not mapped explicitly; instead the dot product # between samples in the higher dimensional feature space is computed using the # "kernel trick". # From e0b54223008b77b0d7af749a4340a625a856f3c7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 14:32:41 +0200 Subject: [PATCH 22/40] Update examples/gaussian_process/plot_compare_gpr_krr.py Co-authored-by: Julien Jerphanion --- examples/gaussian_process/plot_compare_gpr_krr.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/gaussian_process/plot_compare_gpr_krr.py b/examples/gaussian_process/plot_compare_gpr_krr.py index 14bb166f82d77..5087c6b23cbf7 100644 --- a/examples/gaussian_process/plot_compare_gpr_krr.py +++ b/examples/gaussian_process/plot_compare_gpr_krr.py @@ -155,8 +155,7 @@ # %% # Our fitted model does not work. Indeed, we did not set the parameters of the -# kernel and instead used the default hyperparameters. We can have a look at -# them. +# kernel and instead used the default ones. We can have a look at them. kernel_ridge.kernel # %% From 41a8b2e337351945bee80756c5c8076dc923f78b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 14:33:03 +0200 Subject: [PATCH 23/40] Update examples/gaussian_process/plot_compare_gpr_krr.py Co-authored-by: Julien Jerphanion --- examples/gaussian_process/plot_compare_gpr_krr.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/gaussian_process/plot_compare_gpr_krr.py b/examples/gaussian_process/plot_compare_gpr_krr.py index 5087c6b23cbf7..c61fee796a515 100644 --- a/examples/gaussian_process/plot_compare_gpr_krr.py +++ b/examples/gaussian_process/plot_compare_gpr_krr.py @@ -161,8 +161,9 @@ # %% # Our kernel has two parameters: the length-scale and the periodicity. For our # dataset, we use `sin(x)` as the generative process. It means that we have -# a periodicity of :math:`2 \pi`. The default value of the parameter being 1, -# it explains the high frequency observed in the predictions of our model. +# a periodicity of :math:`2 \pi`. The default value of the parameter being +# :math:`1`, it explains the high frequency observed in the predictions of +# our model. # Similar conclusions could be drawn with the length-scale parameter. Thus, it # tell us that the kernel parameters need to be tuned. 
We will use a randomized # search to tune the different parameters the kernel ridge model: the `alpha` From 7cfc324c51f52884c5440e1d7ae07aca18d2719d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 14:34:00 +0200 Subject: [PATCH 24/40] Apply suggestions from code review Co-authored-by: Julien Jerphanion --- examples/gaussian_process/plot_compare_gpr_krr.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/examples/gaussian_process/plot_compare_gpr_krr.py b/examples/gaussian_process/plot_compare_gpr_krr.py index c61fee796a515..01b2ffe437254 100644 --- a/examples/gaussian_process/plot_compare_gpr_krr.py +++ b/examples/gaussian_process/plot_compare_gpr_krr.py @@ -154,7 +154,7 @@ "kernel using default hyperparameters") # %% -# Our fitted model does not work. Indeed, we did not set the parameters of the +# This fitted model is not accurate. Indeed, we did not set the parameters of the # kernel and instead used the default ones. We can have a look at them. kernel_ridge.kernel @@ -314,7 +314,7 @@ # We observe that the results of the kernel ridge and the Gaussian process # regressor are close. However, the Gaussian process regressor also provide # an uncertainty information that is not available with a kernel ridge. -# Due to the probabilistic formulation regarding the target function, the +# Due to the probabilistic formulation of the target functions, the # Gaussian process can output the standard deviation (or the covariance) # together with the mean predictions of the target functions. # @@ -325,7 +325,7 @@ # ---------------- # # We can give a final word regarding the possibility of the two models to -# extrapolate. Indeed, we provided only the beginning of the signal as a +# extrapolate. Indeed, we only provided the beginning of the signal as a # training set. Using a periodic kernel forces our model to repeat the pattern # found on the training set. Using this kernel information together with the # capacity of the both models to extrapolate, we observe that the models will @@ -333,7 +333,7 @@ # # Gaussian process allows to combine kernels together. Thus, we could associate # the exponential sine squared kernel together with a radial basis function -# kernel +# kernel. from sklearn.gaussian_process.kernels import RBF kernel = 1.0 * ExpSineSquared( @@ -383,7 +383,8 @@ _ = plt.title("Effect of using a radial basis function kernel") # %% -# The effect of using an radial basis function kernel will attenuate the +# The effect of using a radial basis function kernel will attenuate the # periodicity effect once that no sample are available in the training. -# The predictions is converging towards the mean of the probability and the -# standard deviation increases. +# As testing samples get further away from the training ones, predictions +# are converging towards their mean and their standard deviation +# also increases. 
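Another way to see this behaviour is to draw a few target functions from the
posterior on a grid that extends past the training window. This is a sketch
rather than part of the example: the estimator name `gaussian_process` and the
grid limits are assumptions that may need to be adapted to the actual code::

    import numpy as np
    import matplotlib.pyplot as plt

    X_future = np.linspace(0, 10, num=200).reshape(-1, 1)
    posterior_samples = gaussian_process.sample_y(
        X_future, n_samples=5, random_state=0
    )

    # Far from the training data the sampled functions fan out around the
    # mean, which is another view of the increasing standard deviation.
    plt.plot(X_future.ravel(), posterior_samples, linewidth=0.8)
    plt.xlabel("data")
    _ = plt.ylabel("sampled target functions")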
From 8d5a3566616939188cf1aa4ecc37b642dae3ccde Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 14:37:12 +0200 Subject: [PATCH 25/40] Update examples/gaussian_process/plot_gpr_noisy_targets.py Co-authored-by: Julien Jerphanion --- examples/gaussian_process/plot_gpr_noisy_targets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/gaussian_process/plot_gpr_noisy_targets.py b/examples/gaussian_process/plot_gpr_noisy_targets.py index 51b07b4bd3a26..133ae883f7adb 100644 --- a/examples/gaussian_process/plot_gpr_noisy_targets.py +++ b/examples/gaussian_process/plot_gpr_noisy_targets.py @@ -15,8 +15,8 @@ as well as its probabilistic nature in the form of a pointwise 95% confidence interval. -Note that the parameter `alpha` is applied as a Tikhonov regularization of the -assumed covariance between the training points. +Note that the parameter `alpha` is applied as a Tikhonov regularization ridge +parameter on the assumed covariance between the training points. """ print(__doc__) From 232935400de65f3685b766d2a2500cc1a263c061 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 14:37:27 +0200 Subject: [PATCH 26/40] Update examples/gaussian_process/plot_gpr_noisy_targets.py Co-authored-by: Julien Jerphanion --- examples/gaussian_process/plot_gpr_noisy_targets.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/gaussian_process/plot_gpr_noisy_targets.py b/examples/gaussian_process/plot_gpr_noisy_targets.py index 133ae883f7adb..57bec5ab2e65d 100644 --- a/examples/gaussian_process/plot_gpr_noisy_targets.py +++ b/examples/gaussian_process/plot_gpr_noisy_targets.py @@ -75,9 +75,9 @@ gaussian_process.kernel_ # %% -# After fitting our model, we see that the hyperparemeters' kernel have been -# optimized. Now, we will use our kernel to compute the mean prediction of the -# full dataset and plot the 95% confidence interval. +# After fitting our model, we see that the hyperparameters of the kernel have +# been optimized. Now, we will use our kernel to compute the mean prediction +# of the full dataset and plot the 95% confidence interval. mean_prediction, std_prediction = gaussian_process.predict(X, return_std=True) plt.plot(X, y, label=r"$f(x) = x \sin(x)$", linestyle="dotted") From 49db719f7be03838a853d44a7e568ec5f626ee51 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 14:37:39 +0200 Subject: [PATCH 27/40] Update examples/gaussian_process/plot_gpr_noisy_targets.py Co-authored-by: Julien Jerphanion --- examples/gaussian_process/plot_gpr_noisy_targets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/gaussian_process/plot_gpr_noisy_targets.py b/examples/gaussian_process/plot_gpr_noisy_targets.py index 57bec5ab2e65d..26a3a62bd4ebd 100644 --- a/examples/gaussian_process/plot_gpr_noisy_targets.py +++ b/examples/gaussian_process/plot_gpr_noisy_targets.py @@ -97,8 +97,8 @@ # %% # We see that for a prediction close to a training sample, the 95% confidence -# interval is small. Whenever a prediction is done far from training data, our -# model become interval and the confidence interval is large. +# interval is small. Whenever a sample is falls far from training data, our +# model's prediction is less accurate and its confidence interval is larger. 
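This can also be verified numerically: the predicted standard deviation grows
with the distance to the closest training sample. A small sketch, assuming the
`X`, `X_train` and `std_prediction` arrays from the noise-free example above::

    import numpy as np

    # Distance from each test point to its nearest training sample.
    distance_to_train = np.min(np.abs(X - X_train.ravel()), axis=1)

    # Far from the observations the model falls back on the prior and reports
    # a larger uncertainty, so the two quantities increase together.
    correlation = np.corrcoef(distance_to_train, std_prediction)[0, 1]
    print(f"correlation between distance and predicted std: {correlation:.2f}")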
# # Example with noisy targets # -------------------------- From 9e8745545b6e6548f47d10c5738d4cf5ec0ac85b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 14:37:57 +0200 Subject: [PATCH 28/40] Update examples/gaussian_process/plot_gpr_noisy_targets.py Co-authored-by: Julien Jerphanion --- examples/gaussian_process/plot_gpr_noisy_targets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gaussian_process/plot_gpr_noisy_targets.py b/examples/gaussian_process/plot_gpr_noisy_targets.py index 26a3a62bd4ebd..8ec94b6863869 100644 --- a/examples/gaussian_process/plot_gpr_noisy_targets.py +++ b/examples/gaussian_process/plot_gpr_noisy_targets.py @@ -108,7 +108,7 @@ # fitted model. # # We define the noise standard deviation and add it to the original training -# target +# target. dy = 0.5 + 1.0 * rng.random(y_train.shape) y_train_noisy = y_train + rng.normal(0, dy) From 2089316ad5126abbe426e37cccf10ba02a844edc Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 14:38:10 +0200 Subject: [PATCH 29/40] Update examples/gaussian_process/plot_gpr_noisy_targets.py Co-authored-by: Julien Jerphanion --- examples/gaussian_process/plot_gpr_noisy_targets.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/gaussian_process/plot_gpr_noisy_targets.py b/examples/gaussian_process/plot_gpr_noisy_targets.py index 8ec94b6863869..a2433d11384af 100644 --- a/examples/gaussian_process/plot_gpr_noisy_targets.py +++ b/examples/gaussian_process/plot_gpr_noisy_targets.py @@ -152,6 +152,4 @@ # %% # The noise affect the predictions close to the training samples: now, the # confidence interval close to the training samples is larger because we -# indicated the amount of noise for each training samples. As a result, the -# confidence interval is larger on this noisy dataset than on the previous -# example. +# indicated the amount of noise for each training samples. From 88f427879b04cce15eaaf6495de10aafa51b346f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 14:38:25 +0200 Subject: [PATCH 30/40] Update examples/gaussian_process/plot_gpr_noisy.py Co-authored-by: Julien Jerphanion --- examples/gaussian_process/plot_gpr_noisy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gaussian_process/plot_gpr_noisy.py b/examples/gaussian_process/plot_gpr_noisy.py index 2b15659beb3f1..33e33a2be7e8a 100644 --- a/examples/gaussian_process/plot_gpr_noisy.py +++ b/examples/gaussian_process/plot_gpr_noisy.py @@ -5,7 +5,7 @@ This example shows the ability of the :class:`~sklearn.gaussian_process.kernels.WhiteKernel` to estimate the noise -level in the data. However, we show the importance of kernel hyperparameters +level in the data. Moreover, we show the importance of kernel hyperparameters initialization. 
""" print(__doc__) From 1d7bf95e33299bc44da0342e5f17c1686a0c9ec2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 14:39:35 +0200 Subject: [PATCH 31/40] Apply suggestions from code review Co-authored-by: Julien Jerphanion --- examples/gaussian_process/plot_gpr_noisy.py | 29 ++++++++++++--------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/examples/gaussian_process/plot_gpr_noisy.py b/examples/gaussian_process/plot_gpr_noisy.py index 33e33a2be7e8a..b5760908c64ca 100644 --- a/examples/gaussian_process/plot_gpr_noisy.py +++ b/examples/gaussian_process/plot_gpr_noisy.py @@ -19,7 +19,7 @@ # --------------- # # We will work in a setting where `X` will be a single feature. We create a -# function that will generate a the target to be predicted. We will add an +# function that will generate the target to be predicted. We will add an # option to add some noise to the generated target. import numpy as np @@ -103,15 +103,19 @@ def target_generator(X, add_noise=False): plt.ylabel("y") _ = plt.title( f"Initial: {kernel}\nOptimum: {gpr.kernel_}\nLog-Marginal-Likelihood: " - f"{gpr.log_marginal_likelihood(gpr.kernel_.theta)}" + f"{gpr.log_marginal_likelihood(gpr.kernel_.theta)}", + fontsize=8 ) # %% # We see that the optimum kernel found still have a high noise level and -# and an even larger length scale. However, we see that the model does not -# provide satisfactory predictions. +# an even larger length scale. Furthermore, we observe that the +# model does not provide faithful predictions. # -# Now, we will initialize the hyperparameter with a lower noise level and -# length scale. +# Now, we will initialize the +# :class:`~sklearn.gaussian_process.kernels.RBF` with a +# larger length_scale and the +# :class:`~sklearn.gaussian_process.kernels.WhiteKernel` +# with a smaller noise level lower bound. kernel = ( 1.0 * RBF(length_scale=1e-1, length_scale_bounds=(1e-2, 1e3)) + WhiteKernel(noise_level=1e-2, noise_level_bounds=(1e-10, 1e1)) @@ -131,19 +135,20 @@ def target_generator(X, add_noise=False): plt.ylabel("y") _ = plt.title( f"Initial: {kernel}\nOptimum: {gpr.kernel_}\nLog-Marginal-Likelihood: " - f"{gpr.log_marginal_likelihood(gpr.kernel_.theta)}" + f"{gpr.log_marginal_likelihood(gpr.kernel_.theta)}", + fontsize=8 ) # %% -# First, we see that the model is more satisfactory than the previous one -# regarding the predictions. It is able to estimate the noise-free functional +# First, we see that the model's predictions are precise than the previous +# model's. This new model is able to estimate the noise-free functional # relationship. # # Looking at the kernel hyperparameters, we see that the best combination found -# as a smaller noise level and shorter length scale than the first model. +# has a smaller noise level and shorter length scale than the first model. # # We can have a look at the Log-Marginal-Likelihood (LML) of GPR for different -# hyperparameters to have an idea of the minima. +# hyperparameters to get a sense of the local minima. from matplotlib.colors import LogNorm length_scale = np.logspace(-2, 4, num=50) @@ -178,7 +183,7 @@ def target_generator(X, add_noise=False): # %% # We see that there are two local minima that correspond to the combination -# of hyperparameters previously found. Depending on the initial value for the +# of hyperparameters previously found. Depending on the initial values for the # hyperparameters, the gradient-based optimization might converge whether or # not to the best model. 
It is thus important to repeat the optimization # several times for different initializations. From 39f62e0cb8286d98bc2ba1a102155ea919d6c96d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 14:41:25 +0200 Subject: [PATCH 32/40] Update examples/gaussian_process/plot_gpr_co2.py Co-authored-by: Julien Jerphanion --- examples/gaussian_process/plot_gpr_co2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py index fafd0393a4d31..6c38f71db6def 100644 --- a/examples/gaussian_process/plot_gpr_co2.py +++ b/examples/gaussian_process/plot_gpr_co2.py @@ -65,7 +65,7 @@ # %% # We will preprocess the dataset by taking a monthly average and drop month # for which no measurements were collected. Such a processing will have an -# effect to smooth the data. +# smoothing effect on the data. co2_data = co2_data.resample("M").mean().dropna(axis="index", how="any") co2_data.plot() plt.ylabel("Monthly average of CO$_2$ concentration (ppm)") From d3509e3b55dc179db8eff1eb05d025aa907d02b7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 14:41:44 +0200 Subject: [PATCH 33/40] Update examples/gaussian_process/plot_gpr_co2.py Co-authored-by: Julien Jerphanion --- examples/gaussian_process/plot_gpr_co2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py index 6c38f71db6def..7d5a9344cf726 100644 --- a/examples/gaussian_process/plot_gpr_co2.py +++ b/examples/gaussian_process/plot_gpr_co2.py @@ -76,7 +76,7 @@ # %% # The idea in this example will be to predict the CO2 concentration in function -# of the date. We are as well interesting in extrapolating for upcoming year +# of the date. We are as well interested in extrapolating for upcoming year # after 2001. # # As a first step, we will divide the data and the target to estimate. The data From 9ac4237dbf0c8cd43361c358a293cd9ee2cda23d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 11 Jun 2021 15:54:34 +0200 Subject: [PATCH 34/40] PEP8 --- .../gaussian_process/plot_compare_gpr_krr.py | 20 ++++++++++--------- examples/gaussian_process/plot_gpr_noisy.py | 2 +- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/examples/gaussian_process/plot_compare_gpr_krr.py b/examples/gaussian_process/plot_compare_gpr_krr.py index 01b2ffe437254..97ec9fba4be46 100644 --- a/examples/gaussian_process/plot_compare_gpr_krr.py +++ b/examples/gaussian_process/plot_compare_gpr_krr.py @@ -15,10 +15,11 @@ function (the mean squared error). Instead of finding a single target function, the Gaussian process regression -employs a probabilistic approach : a Gaussian posterior distribution over target -functions is defined based on the Bayes' theorem, Thus prior probabilities on -target functions are being combined with a likelihood function defined by the -observed training data to provide estimates of the posterior distributions. +employs a probabilistic approach : a Gaussian posterior distribution over +target functions is defined based on the Bayes' theorem, Thus prior +probabilities on target functions are being combined with a likelihood function +defined by the observed training data to provide estimates of the posterior +distributions. We will illustrate these differences with an example and we will also focus on tuning the kernel hyperparameters. @@ -105,9 +106,10 @@ # ............ 
# # We can make the previous linear model more expressive by using a so-called -# kernel. A kernel is an embedding from the original feature space to another one. -# Simply put, it is used to map our original data into a newer and more complex -# feature space. This new space is explicitly defined by the choice of kernel. +# kernel. A kernel is an embedding from the original feature space to another +# one. Simply put, it is used to map our original data into a newer and more +# complex feature space. This new space is explicitly defined by the choice of +# kernel. # # In our case, we know that the true generative process is a periodic function. # We can use a :class:`~sklearn.gaussian_process.kernels.ExpSineSquared` kernel @@ -154,8 +156,8 @@ "kernel using default hyperparameters") # %% -# This fitted model is not accurate. Indeed, we did not set the parameters of the -# kernel and instead used the default ones. We can have a look at them. +# This fitted model is not accurate. Indeed, we did not set the parameters of +# the kernel and instead used the default ones. We can have a look at them. kernel_ridge.kernel # %% diff --git a/examples/gaussian_process/plot_gpr_noisy.py b/examples/gaussian_process/plot_gpr_noisy.py index b5760908c64ca..3423743dba725 100644 --- a/examples/gaussian_process/plot_gpr_noisy.py +++ b/examples/gaussian_process/plot_gpr_noisy.py @@ -140,7 +140,7 @@ def target_generator(X, add_noise=False): ) # %% -# First, we see that the model's predictions are precise than the previous +# First, we see that the model's predictions are precise than the previous # model's. This new model is able to estimate the noise-free functional # relationship. # From 8a468d1668cd665e3fdda1aed751776ad47f0e0d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 11 Jun 2021 16:00:30 +0200 Subject: [PATCH 35/40] fix --- examples/gaussian_process/plot_gpr_co2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py index 7d5a9344cf726..2920788156b09 100644 --- a/examples/gaussian_process/plot_gpr_co2.py +++ b/examples/gaussian_process/plot_gpr_co2.py @@ -224,8 +224,8 @@ # Thus, most of the target signal, with the mean substracted, is explained by a # long-term rising trend for ~45 ppm and a length-scale of ~52 years. The # periodic component has an amplitude of ~2.6ppm, a decay time of ~90 years and -# a length-scale of ~1.5. The long decay time indicates that we have a locally -# very close to periodic seasonal component. The correlated noise has an +# a length-scale of ~1.5. The long decay time indicates that we have a +# component very close to a seasonal periodicity. The correlated noise has an # amplitude of ~0.2 ppm with a length scale of ~0.12 years and a white-noise # contribution of ~0.04 ppm. Thus, the overall noise level is very small, # indicating that the data can be very well explained by the model. From 45522756ecd7dd56f41abcfc0db030725046503b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 11 Jun 2021 16:44:00 +0200 Subject: [PATCH 36/40] iter --- doc/modules/gaussian_process.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst index a773d50a5a472..c12894c94b577 100644 --- a/doc/modules/gaussian_process.rst +++ b/doc/modules/gaussian_process.rst @@ -45,7 +45,7 @@ will combine this prior and the likelihood function based on training samples. 
It allows to give a probabilistic approach to prediction by giving the mean and standard deviation as output when predicting. -.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_targets_001.png +.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_targets_002.png :target: ../auto_examples/gaussian_process/plot_gpr_noisy_targets.html :align: center @@ -72,7 +72,7 @@ kernel, which can estimate the global noise level from the data (see example below). The figure below shows the effect of noisy target handled by setting the parameter `alpha`. -.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_targets_002.png +.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_targets_003.png :target: ../auto_examples/gaussian_process/plot_gpr_noisy_targets.html :align: center @@ -90,7 +90,7 @@ the API of standard scikit-learn estimators, :class:`GaussianProcessRegressor`: .. topic:: Examples - * :ref:`sphx_glr_auto_examples_gaussian_process_plot_gpr_noisy_target.py` + * :ref:`sphx_glr_auto_examples_gaussian_process_plot_gpr_noisy_targets.py` * :ref:`sphx_glr_auto_examples_gaussian_process_plot_gpr_noisy.py` * :ref:`sphx_glr_auto_examples_gaussian_process_plot_compare_gpr_krr.py` * :ref:`sphx_glr_auto_examples_gaussian_process_plot_gpr_co2.py` From 3e86ff76a3fc1fc4f81c1147833b7d9a8559238d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 11 Jun 2021 17:00:36 +0200 Subject: [PATCH 37/40] iter --- examples/gaussian_process/plot_gpr_noisy_targets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gaussian_process/plot_gpr_noisy_targets.py b/examples/gaussian_process/plot_gpr_noisy_targets.py index a2433d11384af..6aad484a5a131 100644 --- a/examples/gaussian_process/plot_gpr_noisy_targets.py +++ b/examples/gaussian_process/plot_gpr_noisy_targets.py @@ -109,7 +109,7 @@ # # We define the noise standard deviation and add it to the original training # target. -dy = 0.5 + 1.0 * rng.random(y_train.shape) +dy = 0.5 + 1.0 * rng.random_sample(y_train.shape) y_train_noisy = y_train + rng.normal(0, dy) # %% From bd8bb62aa61dc33eb0f1fc48b2f628d3e5716ab9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 2 Aug 2023 10:57:42 +0200 Subject: [PATCH 38/40] forgot to save --- doc/modules/gaussian_process.rst | 145 ------------------------------- 1 file changed, 145 deletions(-) diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst index bdc8c10b2fe49..46e50cb68ae5d 100644 --- a/doc/modules/gaussian_process.rst +++ b/doc/modules/gaussian_process.rst @@ -90,155 +90,10 @@ the API of standard scikit-learn estimators, :class:`GaussianProcessRegressor`: .. topic:: Examples -<<<<<<< HEAD * :ref:`sphx_glr_auto_examples_gaussian_process_plot_gpr_noisy_targets.py` * :ref:`sphx_glr_auto_examples_gaussian_process_plot_gpr_noisy.py` * :ref:`sphx_glr_auto_examples_gaussian_process_plot_compare_gpr_krr.py` * :ref:`sphx_glr_auto_examples_gaussian_process_plot_gpr_co2.py` -======= -GPR examples -============ - -GPR with noise-level estimation -------------------------------- -This example illustrates that GPR with a sum-kernel including a WhiteKernel can -estimate the noise level of data. An illustration of the -log-marginal-likelihood (LML) landscape shows that there exist two local -maxima of LML. - -.. 
figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_003.png - :target: ../auto_examples/gaussian_process/plot_gpr_noisy.html - :align: center - -The first corresponds to a model with a high noise level and a -large length scale, which explains all variations in the data by noise. - -.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_004.png - :target: ../auto_examples/gaussian_process/plot_gpr_noisy.html - :align: center - -The second one has a smaller noise level and shorter length scale, which explains -most of the variation by the noise-free functional relationship. The second -model has a higher likelihood; however, depending on the initial value for the -hyperparameters, the gradient-based optimization might also converge to the -high-noise solution. It is thus important to repeat the optimization several -times for different initializations. - -.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_005.png - :target: ../auto_examples/gaussian_process/plot_gpr_noisy.html - :align: center - - -Comparison of GPR and Kernel Ridge Regression ---------------------------------------------- - -Both kernel ridge regression (KRR) and GPR learn -a target function by employing internally the "kernel trick". KRR learns a -linear function in the space induced by the respective kernel which corresponds -to a non-linear function in the original space. The linear function in the -kernel space is chosen based on the mean-squared error loss with -ridge regularization. GPR uses the kernel to define the covariance of -a prior distribution over the target functions and uses the observed training -data to define a likelihood function. Based on Bayes theorem, a (Gaussian) -posterior distribution over target functions is defined, whose mean is used -for prediction. - -A major difference is that GPR can choose the kernel's hyperparameters based -on gradient-ascent on the marginal likelihood function while KRR needs to -perform a grid search on a cross-validated loss function (mean-squared error -loss). A further difference is that GPR learns a generative, probabilistic -model of the target function and can thus provide meaningful confidence -intervals and posterior samples along with the predictions while KRR only -provides predictions. - -The following figure illustrates both methods on an artificial dataset, which -consists of a sinusoidal target function and strong noise. The figure compares -the learned model of KRR and GPR based on a ExpSineSquared kernel, which is -suited for learning periodic functions. The kernel's hyperparameters control -the smoothness (length_scale) and periodicity of the kernel (periodicity). -Moreover, the noise level -of the data is learned explicitly by GPR by an additional WhiteKernel component -in the kernel and by the regularization parameter alpha of KRR. - -.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_compare_gpr_krr_005.png - :target: ../auto_examples/gaussian_process/plot_compare_gpr_krr.html - :align: center - -The figure shows that both methods learn reasonable models of the target -function. GPR provides reasonable confidence bounds on the prediction which are not -available for KRR. A major difference between the two methods is the time -required for fitting and predicting: while fitting KRR is fast in principle, -the grid-search for hyperparameter optimization scales exponentially with the -number of hyperparameters ("curse of dimensionality"). 
The gradient-based -optimization of the parameters in GPR does not suffer from this exponential -scaling and is thus considerably faster on this example with 3-dimensional -hyperparameter space. The time for predicting is similar; however, generating -the variance of the predictive distribution of GPR takes considerably longer -than just predicting the mean. - -GPR on Mauna Loa CO2 data -------------------------- - -This example is based on Section 5.4.3 of [RW2006]_. -It illustrates an example of complex kernel engineering and -hyperparameter optimization using gradient ascent on the -log-marginal-likelihood. The data consists of the monthly average atmospheric -CO2 concentrations (in parts per million by volume (ppmv)) collected at the -Mauna Loa Observatory in Hawaii, between 1958 and 1997. The objective is to -model the CO2 concentration as a function of the time t. - -The kernel is composed of several terms that are responsible for explaining -different properties of the signal: - -- a long term, smooth rising trend is to be explained by an RBF kernel. The - RBF kernel with a large length-scale enforces this component to be smooth; - it is not enforced that the trend is rising which leaves this choice to the - GP. The specific length-scale and the amplitude are free hyperparameters. - -- a seasonal component, which is to be explained by the periodic - ExpSineSquared kernel with a fixed periodicity of 1 year. The length-scale - of this periodic component, controlling its smoothness, is a free parameter. - In order to allow decaying away from exact periodicity, the product with an - RBF kernel is taken. The length-scale of this RBF component controls the - decay time and is a further free parameter. - -- smaller, medium term irregularities are to be explained by a - RationalQuadratic kernel component, whose length-scale and alpha parameter, - which determines the diffuseness of the length-scales, are to be determined. - According to [RW2006]_, these irregularities can better be explained by - a RationalQuadratic than an RBF kernel component, probably because it can - accommodate several length-scales. - -- a "noise" term, consisting of an RBF kernel contribution, which shall - explain the correlated noise components such as local weather phenomena, - and a WhiteKernel contribution for the white noise. The relative amplitudes - and the RBF's length scale are further free parameters. - -Maximizing the log-marginal-likelihood after subtracting the target's mean -yields the following kernel with an LML of -83.214: - -:: - - 34.4**2 * RBF(length_scale=41.8) - + 3.27**2 * RBF(length_scale=180) * ExpSineSquared(length_scale=1.44, - periodicity=1) - + 0.446**2 * RationalQuadratic(alpha=17.7, length_scale=0.957) - + 0.197**2 * RBF(length_scale=0.138) + WhiteKernel(noise_level=0.0336) - -Thus, most of the target signal (34.4ppm) is explained by a long-term rising -trend (length-scale 41.8 years). The periodic component has an amplitude of -3.27ppm, a decay time of 180 years and a length-scale of 1.44. The long decay -time indicates that we have a locally very close to periodic seasonal -component. The correlated noise has an amplitude of 0.197ppm with a length -scale of 0.138 years and a white-noise contribution of 0.197ppm. Thus, the -overall noise level is very small, indicating that the data can be very well -explained by the model. The figure shows also that the model makes very -confident predictions until around 2015 - -.. 
 
 .. _gpc:
 

From 87c9272ed1d9acaf8441b4fed1a08f7c214ac15c Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Sat, 12 Aug 2023 00:35:35 +0200
Subject: [PATCH 39/40] improve title

---
 examples/gaussian_process/plot_gpr_co2.py   | 6 +++---
 examples/gaussian_process/plot_gpr_noisy.py | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py
index a3acd1dbfcbd3..787574a65a9d3 100644
--- a/examples/gaussian_process/plot_gpr_co2.py
+++ b/examples/gaussian_process/plot_gpr_co2.py
@@ -1,7 +1,7 @@
 """
-=======================================================
-Gaussian process regression (GPR) on Mauna Loa CO2 data
-=======================================================
+=====================================================================================
+Forecasting of CO2 level on Mauna Loa dataset using Gaussian process regression (GPR)
+=====================================================================================
 
 This example is based on Section 5.4.3 of "Gaussian Processes for Machine
 Learning" [RW2006]_. It illustrates an example of complex kernel engineering
diff --git a/examples/gaussian_process/plot_gpr_noisy.py b/examples/gaussian_process/plot_gpr_noisy.py
index b76fc745e7df7..31d3b149aa47f 100644
--- a/examples/gaussian_process/plot_gpr_noisy.py
+++ b/examples/gaussian_process/plot_gpr_noisy.py
@@ -1,7 +1,7 @@
 """
-=============================================================
-Gaussian process regression (GPR) with noise-level estimation
-=============================================================
+=========================================================================
+Ability of Gaussian process regression (GPR) to estimate data noise-level
+=========================================================================
 
 This example shows the ability of the
 :class:`~sklearn.gaussian_process.kernels.WhiteKernel` to estimate the noise

From 53b2eda214e74d5bf2099bcffa7cb1d2d6f0930b Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Tue, 29 Aug 2023 17:09:26 +0200
Subject: [PATCH 40/40] add changes proposed by Noa

---
 doc/modules/gaussian_process.rst | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst
index 46e50cb68ae5d..75cbf95dae049 100644
--- a/doc/modules/gaussian_process.rst
+++ b/doc/modules/gaussian_process.rst
@@ -6,7 +6,7 @@ Gaussian Processes
 .. currentmodule:: sklearn.gaussian_process
 
-**Gaussian Processes (GP)** are a generic supervised learning method designed
+**Gaussian Processes (GP)** are a nonparametric supervised learning method used
 to solve *regression* and *probabilistic classification* problems.
 
 The advantages of Gaussian processes are:
@@ -25,7 +25,7 @@ The advantages of Gaussian processes are:
 
 The disadvantages of Gaussian processes include:
 
-  - They are not sparse, i.e., they use the whole samples/features
+  - Our implementation is not sparse, i.e., it uses the whole samples/features
     information to perform the prediction.
 
   - They lose efficiency in high dimensional spaces -- namely when the number
@@ -63,10 +63,10 @@ should be kept fixed, `None` can be passed as optimizer.
 The noise level in the targets can be specified by passing it via the parameter
 `alpha`, either globally as a scalar or per datapoint. Note that a moderate
-noise level can also be helpful for dealing with numeric issues during fitting
-as it is effectively implemented as Tikhonov regularization, i.e., by adding it
-to the diagonal of the kernel matrix. An alternative to specifying the noise
-level explicitly is to include a
+noise level can also be helpful for dealing with numeric instabilities during
+fitting as it is effectively implemented as Tikhonov regularization, i.e., by
+adding it to the diagonal of the kernel matrix. An alternative to specifying
+the noise level explicitly is to include a
 :class:`~sklearn.gaussian_process.kernels.WhiteKernel` component into the
 kernel, which can estimate the global noise level from the data (see example
 below). The figure below shows the effect of noisy target handled by setting