From a6187a801d2596c6e8c42baad1a9933c947677d6 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 17 Feb 2015 07:19:21 +0100
Subject: [PATCH 001/165] ADD New kernel module for GPs which supports kernel
 engineering

---
 sklearn/gaussian_process/kernels.py | 232 ++++++++++++++++++++++++++++
 1 file changed, 232 insertions(+)
 create mode 100644 sklearn/gaussian_process/kernels.py

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
new file mode 100644
index 0000000000000..001f0138ab354
--- /dev/null
+++ b/sklearn/gaussian_process/kernels.py
@@ -0,0 +1,232 @@
+
+# This is strongly based on george's kernel module
+# Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
+# Licence: BSD 3 clause
+
+import numpy as np
+from scipy.spatial.distance import pdist, cdist, squareform
+
+
+class Kernel(object):
+
+    def _parse_param_space(self, param_space):
+        if not hasattr(param_space, "__iter__"):  # fixed hyperparameter
+            self.params = np.array([float(param_space)])
+            self.has_bounds = False
+            return
+        param_space = np.atleast_2d(param_space)
+        if param_space.shape[1] == 1:  # fixed hyperparameter
+            self.params = param_space
+            self.has_bounds = False
+        elif param_space.shape[1] == 2:  # lower+upper bound for hyperparameter
+            self.bounds = param_space
+            self.has_bounds = True
+            # Use geometric mean of upper and lower boundary as initial
+            # hyperparameter value
+            assert not np.any(self.l_bound == None)  # XXX: enforce element-wise comparison to None
+            assert not np.any(self.u_bound == None)
+            self.params = np.array([np.sqrt(self.l_bound * self.u_bound)])
+        elif param_space.shape[1] == 3:  # lower bound, initial value, upper bound
+            self.params = param_space[:, 1]
+            self.bounds = param_space[:, [0, 2]]
+            self.has_bounds = True
+        else:
+            raise Exception()
+
+    @property
+    def n_params(self):
+        return self.params.shape[0]
+
+    @property
+    def bounds(self):
+        return np.vstack((self.l_bound, self.u_bound)).T
+
+    @bounds.setter
+    def bounds(self, bounds):
+        bounds = bounds.reshape(-1, 2)
+        self.l_bound = bounds[:, 0]
+        self.u_bound = bounds[:, 1]
+
+    def __add__(self, b):
+        if not isinstance(b, Kernel):
+            return Sum(self, ConstantKernel(b))
+        return Sum(self, b)
+
+    def __radd__(self, b):
+        if not isinstance(b, Kernel):
+            return Sum(ConstantKernel(b), self)
+        return Sum(b, self)
+
+    def __mul__(self, b):
+        if not isinstance(b, Kernel):
+            return Product(self, ConstantKernel(b))
+        return Product(self, b)
+
+    def __rmul__(self, b):
+        if not isinstance(b, Kernel):
+            return Product(ConstantKernel(b), self)
+        return Product(b, self)
+
+    def __repr__(self):
+        return "{0}({1})".format(self.__class__.__name__,
+                                 ", ".join(map("{0}".format,
+                                               self.params)))
+
+
+
+class KernelOperator(Kernel):
+
+    def __init__(self, k1, k2):
+        self.k1 = k1
+        self.k2 = k2
+        # XXX: Deal with situations in which only some of the hyperparameter
+        #      shall be optimized
+        self.has_bounds = k1.has_bounds and k2.has_bounds
+
+    @property
+    def params(self):
+        return np.append(self.k1.params, self.k2.params)
+
+    @params.setter
+    def params(self, theta):
+        i = self.k1.n_params
+        self.k1.params = theta[:i]
+        self.k2.params = theta[i:]
+
+    @property
+    def bounds(self):
+        assert self.has_bounds
+        return np.vstack((self.k1.bounds, self.k2.bounds))
+
+    @bounds.setter
+    def bounds(self, bounds):
+        i = self.k1.n_params
+        self.k1.bounds = bounds[:i]
+        self.k2.bounds = bounds[i:]
+
+
+class Sum(KernelOperator):
+
+    def auto_correlation(self, X, eval_gradient=False):
+        if eval_gradient:
+            K1, K1_gradient = self.k1.auto_correlation(X, eval_gradient=True)
+            K2, K2_gradient = self.k2.auto_correlation(X, eval_gradient=True)
+            return K1 + K2, np.dstack((K1_gradient, K2_gradient))
+        else:
+            return self.k1.auto_correlation(X) + self.k2.auto_correlation(X)
+
+    def cross_correlation(self, X1, X2):
+        return self.k1.cross_correlation(X1, X2) \
+            + self.k2.cross_correlation(X1, X2)
+
+    def __repr__(self):
+        return "{0} + {1}".format(self.k1, self.k2)
+
+
+class Product(KernelOperator):
+
+    def auto_correlation(self, X, eval_gradient=False):
+        if eval_gradient:
+            K1, K1_gradient = self.k1.auto_correlation(X, eval_gradient=True)
+            K2, K2_gradient = self.k2.auto_correlation(X, eval_gradient=True)
+            return K1 * K2, np.dstack((K1_gradient * K2[:, :, None],
+                                       K2_gradient * K1[:, :, None]))
+        else:
+            return self.k1.auto_correlation(X) * self.k2.auto_correlation(X)
+
+    def cross_correlation(self, X1, X2):
+        return self.k1.cross_correlation(X1, X2) \
+            * self.k2.cross_correlation(X1, X2)
+
+    def __repr__(self):
+        return "{0} * {1}".format(self.k1, self.k2)
+
+
+class ConstantKernel(Kernel):
+
+    def __init__(self, param_space=1.0):
+        self._parse_param_space(param_space)
+
+    @property
+    def params(self):
+        return np.array([self.value])
+
+    @params.setter
+    def params(self, theta):
+        assert len(theta) == 1
+        self.value = theta[0]
+
+    def auto_correlation(self, X, eval_gradient=False):
+        K = self.value * np.ones((X.shape[0], X.shape[0]))
+        if eval_gradient:
+            return K, np.ones((X.shape[0], X.shape[0], 1))
+        else:
+            return K
+
+    def cross_correlation(self, X1, X2):
+        return self.value * np.ones((X1.shape[0], X2.shape[0]))
+
+    def __repr__(self):
+        return "{0}".format(self.value)
+
+
+class RBF(Kernel):
+    def __init__(self, param_space=1.0):
+        self._parse_param_space(param_space)
+
+    @property
+    def params(self):
+        return np.asarray(self.l)
+
+    @params.setter
+    def params(self, theta):
+        self.l = theta
+
+    def auto_correlation(self, X, eval_gradient=False):
+        """Return RBF kernel matrix of X, plus dK/dl if eval_gradient."""
+        dists = pdist(X / self.l, metric='sqeuclidean')
+        # convert condensed (upper-triangular) distances to a square matrix
+        K = squareform(np.exp(-.5 * dists))
+        np.fill_diagonal(K, 1)
+        if eval_gradient:
+            if self.l.shape[0] == 1:
+                K_gradient = (K * squareform(dists) / self.l[0])[:, :, None]
+                return K, K_gradient
+            elif self.l.shape[0] == X.shape[1]:
+                # dK/dl_d = K * (x_d - x'_d)**2 / l_d**3 for each dimension d
+                D = (X[:, None, :] - X[None, :, :])**2 / (self.l ** 3)
+                K_gradient = K[..., None] * D
+                return K, K_gradient
+            else:
+                raise Exception("Anisotropic kernels require that the number "
+                                "of length scales and features match.")
+        else:
+            return K
+
+    def cross_correlation(self, X1, X2):
+        """Return the (n1, n2) RBF kernel matrix between rows of X1 and X2."""
+        # Scale inputs by l (as in auto_correlation) so anisotropic l works
+        return np.exp(-.5 * cdist(X1 / self.l, X2 / self.l, 'sqeuclidean'))
+
+
+class WhiteKernel(Kernel):
+    def __init__(self, param_space=1.0):
+        self._parse_param_space(param_space)
+
+    @property
+    def params(self):
+        return np.asarray([self.c])
+
+    @params.setter
+    def params(self, theta):
+        self.c = theta[0]
+
+    def auto_correlation(self, X, eval_gradient=False):
+        K = self.c * np.eye(X.shape[0])
+        if eval_gradient:
+            return K, np.eye(X.shape[0])[:, :, None]
+        else:
+            return K
+
+    def cross_correlation(self, X1, X2):
+        return np.zeros((X1.shape[0], X2.shape[0]))  # shape (n1, n2)

From ca82e7b7b9a1dcfa7bf2eb1e8cd3ceae58c4f08f Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 17 Feb 2015 07:20:00 +0100
Subject: [PATCH 002/165] ADD New implementation of GaussianProcessRegression

---
 sklearn/gaussian_process/gpr.py | 134 ++++++++++++++++++++++++++++++++
 1 file changed, 134 insertions(+)
 create mode 100644 sklearn/gaussian_process/gpr.py

diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
new file mode 100644
index 0000000000000..9d44565cfc58b
--- /dev/null
+++ b/sklearn/gaussian_process/gpr.py
@@ -0,0 +1,134 @@
+"""Gaussian processes regression. """
+
+# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
+#
+# License: BSD 3 clause
+
+import numpy as np
+from scipy.linalg import cholesky, cho_solve, solve
+from scipy.optimize import fmin_l_bfgs_b
+
+from sklearn.base import BaseEstimator
+
+
+class GaussianProcessRegression(BaseEstimator):
+    """ Gaussian process regression (GPR).
+
+    The implementation is based on Algorithm 2.1 of ``Gaussian Processes
+    for Machine Learning'' (GPML) by Rasmussen and Williams.
+
+    In addition to standard sklearn estimators, GaussianProcessRegression
+       * allows prediction without prior fitting (based on the GP prior)
+       * provides an additional method sample(X), which evaluates samples drawn
+         from the GPR (prior or posterior) at given inputs
+       * exposes a method log_marginal_likelihood(theta), which can be used
+         externally for other ways of selecting hyperparameters, e.g., via
+         Markov chain Monte Carlo.
+
+    Parameters
+    ----------
+    kernel : Kernel object
+        The kernel specifying the covariance function of the GP.
+
+    y_err : float, optional (default: 1e-10)
+        Value added to the diagonal of the kernel matrix during fitting.
+        Larger values correspond to increased noise level in the observations
+        and reduce potential numerical issues during fitting.
+
+    Attributes
+    ----------
+    X_fit_:
+
+    y_fit_:
+
+    theta_:
+
+    L_:
+
+    alpha_:
+    """
+
+    def __init__(self, kernel, y_err=1e-10):
+        self.kernel = kernel
+        self.y_err = y_err
+
+    def fit(self, X, y):
+        self.X_fit_ = np.asarray(X)
+        self.y_fit_ = np.asarray(y)
+
+        if self.kernel.has_bounds:
+            # Choose hyperparameters based on maximizing the log-marginal
+            # likelihood
+            def obj_func(theta):
+                lml, grad = self.log_marginal_likelihood(theta,
+                                                         eval_gradient=True)
+                return -lml, -grad
+            self.theta_, lml, _ = fmin_l_bfgs_b(obj_func, self.kernel.params,
+                                                bounds=self.kernel.bounds)
+            self.kernel.params = self.theta_
+        else:
+            self.theta_ = self.kernel.params
+
+        # Precompute quantities required for predictions which are independent
+        # of actual query points
+        K = self.kernel.auto_correlation(self.X_fit_)
+        K[np.diag_indices_from(K)] += self.y_err
+        self.L_ = cholesky(K, lower=True)  # Line 2
+        self.alpha_ = cho_solve((self.L_, True), self.y_fit_)  # Line 3
+
+        return self
+
+    def predict(self, X, return_cov=False):
+        X = np.asarray(X)
+
+        if not hasattr(self, "X_fit_"):  # Unfitted; predict based on GP prior
+            y_mean = np.zeros(X.shape[0])
+            if return_cov:
+                y_cov = self.kernel.auto_correlation(X)
+                return y_mean, y_cov
+            else:
+                return y_mean
+        else:  # Predict based on GP posterior
+            K_trans = self.kernel.cross_correlation(X, self.X_fit_)
+            y_mean = K_trans.dot(self.alpha_)  # Line 4 (y_mean = f_star)
+            if return_cov:
+                v = cho_solve((self.L_, True), K_trans.T)  # Line 5
+                y_cov = \
+                    self.kernel.auto_correlation(X) - K_trans.dot(v)  # Line 6
+                return y_mean, y_cov
+            else:
+                return y_mean
+
+    def sample(self, X, n_samples=1):
+        y_mean, y_cov = self.predict(X, return_cov=True)
+        y_samples = \
+            np.random.multivariate_normal(y_mean, y_cov, n_samples).T
+        return y_samples
+
+    def log_marginal_likelihood(self, theta, eval_gradient=False):
+        """Log-marginal likelihood of theta (plus gradient if requested)."""
+        import copy  # XXX: Avoid deepcopy
+        kernel = copy.deepcopy(self.kernel)
+        kernel.params = theta
+        if eval_gradient:
+            K, K_gradient = \
+                kernel.auto_correlation(self.X_fit_, eval_gradient=True)
+        else:
+            K = kernel.auto_correlation(self.X_fit_)
+
+        K[np.diag_indices_from(K)] += self.y_err
+        L = cholesky(K, lower=True)  # Line 2
+        alpha = cho_solve((L, True), self.y_fit_)  # Line 3
+
+        # Compute log-likelihood (compare line 7)
+        log_likelihood = -0.5*self.y_fit_.dot(alpha)
+        log_likelihood -= np.log(np.diag(L)).sum()
+        # Float divisor: n / 2 must not truncate for odd n under Python 2
+        log_likelihood -= K.shape[0] / 2. * np.log(2 * np.pi)
+        if eval_gradient:  # compare Equation 5.9 from GPML
+            tmp = np.outer(alpha, alpha)
+            tmp -= cho_solve((L, True), np.eye(K.shape[0]))
+            gradient = 0.5 * np.trace(tmp.dot(K_gradient))
+            return log_likelihood, gradient
+        else:
+            return log_likelihood

From 2de1d231e3c4d6addf0e980e15055cf98f502943 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 17 Feb 2015 07:20:27 +0100
Subject: [PATCH 003/165] ADD Gaussian Process classification based on Laplace
 approximation

---
 sklearn/gaussian_process/gpc.py | 180 ++++++++++++++++++++++++++++++++
 1 file changed, 180 insertions(+)
 create mode 100644 sklearn/gaussian_process/gpc.py

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
new file mode 100644
index 0000000000000..ae9db5538e22e
--- /dev/null
+++ b/sklearn/gaussian_process/gpc.py
@@ -0,0 +1,180 @@
+"""Gaussian processes classification based on the Laplace approximation. """
+
+# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
+#
+# License: BSD 3 clause
+
+import numpy as np
+from scipy.linalg import cholesky, cho_solve, solve
+from scipy.optimize import fmin_l_bfgs_b
+from scipy.special import erf
+
+from sklearn.base import BaseEstimator
+
+
+class GaussianProcessClassification(BaseEstimator):
+    """ Gaussian process classification (GPC).
+
+    The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
+    ``Gaussian Processes for Machine Learning'' (GPML) by Rasmussen and
+    Williams.
+
+    Internally, the Laplace approximation is used for approximating the
+    non-Gaussian posterior by a Gaussian.
+
+    Currently, the implementation is restricted to
+      * using the logistic link function
+      * binary classification
+    """
+
+    def __init__(self, kernel, jitter=0.0):
+        self.kernel = kernel
+        self.jitter = jitter
+
+        # Values required for approximating the logistic sigmoid by
+        # error functions. coefs are obtained via:
+        # x = np.array([0, 0.6, 2, 3.5, 4.5, np.inf])
+        # b = logistic(x)
+        # A = (erf(np.dot(x, self.lambdas)) + 1) / 2
+        # coefs = lstsq(A, b)[0]
+        self.lambdas = np.array([0.41, 0.4, 0.37, 0.44, 0.39])[:, None]
+        self.coefs = np.array([-1854.8214151, 3516.89893646, 221.29346712,
+                               128.12323805, -2010.49422654])[:, None]
+
+    def fit(self, X, y):
+        # XXX: Assert that y is binary and labels are {0, 1}
+        self.X_fit_ = np.asarray(X)
+        self.y_fit_ = np.asarray(y)
+
+        if self.kernel.has_bounds:
+            # Choose hyperparameters based on maximizing the log-marginal
+            # likelihood
+            def obj_func(theta):
+                lml, grad = self.log_marginal_likelihood(theta,
+                                                         eval_gradient=True)
+                return -lml, -grad
+            self.theta_, lml, _ = fmin_l_bfgs_b(obj_func, self.kernel.params,
+                                                bounds=self.kernel.bounds)
+            self.kernel.params = self.theta_
+        else:
+            self.theta_ = self.kernel.params
+
+        # Precompute quantities required for predictions which are independent
+        # of actual query points
+        self.K_ = self.kernel.auto_correlation(self.X_fit_)
+        self.K_[np.diag_indices_from(self.K_)] += self.jitter
+
+        self.f_, _, (self.pi, self.W_sr, self.L, _, _) = \
+            self._posterior_mode(self.K_, return_temporaries=True)
+
+        return self
+
+    def predict(self, X):
+        # As discussed in Section 3.4.2 of GPML, for making hard binary
+        # decisions, it is enough to compute the MAP of the posterior and
+        # pass it through the link function
+        K_star = \
+            self.kernel.cross_correlation(self.X_fit_, X)  # K_star =k(x_star)
+        f_star = K_star.T.dot(self.y_fit_ - self.pi)  # Line 4
+
+        return f_star > 0
+
+    def predict_proba(self, X):
+        # Based on Algorithm 3.2 of GPML
+        K_star = \
+            self.kernel.cross_correlation(self.X_fit_, X)  # K_star =k(x_star)
+        f_star = K_star.T.dot(self.y_fit_ - self.pi)  # Line 4
+        v = solve(self.L, self.W_sr.dot(K_star))  # Line 5
+        var_f_star = self.kernel.auto_correlation(X) - v.T.dot(v)  # Line 6
+
+        # Line 7:
+        # Approximate \int sigmoid(z) * N(z | f_star, var_f_star) dz
+        # Approximation is due to Williams & Barber, "Bayesian Classification
+        # with Gaussian Processes", Appendix A:
+        # Approximate the logistic sigmoid by a linear combination of
+        # 5 error functions
+        # See
+        # blitiri.blogspot.de/2012/11/gaussian-integral-of-error-function.html
+        # for information on how this integral can be computed
+        alpha = 1 / (2 * np.diag(var_f_star))
+        gamma = self.lambdas * f_star
+        integrals = np.sqrt(np.pi / alpha) \
+            * erf(gamma * np.sqrt(alpha / (alpha + self.lambdas**2))) \
+            / (2 * np.sqrt(np.diag(var_f_star) * 2 * np.pi))
+        pi_star = (self.coefs * integrals).sum(axis=0) + .5 * self.coefs.sum()
+
+        return pi_star
+
+    def log_marginal_likelihood(self, theta, eval_gradient=False):
+        import copy  # XXX
+        kernel = copy.deepcopy(self.kernel)
+        kernel.params = theta
+
+        if eval_gradient:
+            K, K_gradient = \
+                kernel.auto_correlation(self.X_fit_, eval_gradient=True)
+        else:
+            K = kernel.auto_correlation(self.X_fit_)
+
+        K[np.diag_indices_from(K)] += self.jitter
+
+        # Return temporaries
+        f, Z, (pi, W_sr, L, b, a) = \
+            self._posterior_mode(K, return_temporaries=True)
+
+        if not eval_gradient:
+            return Z
+
+        # Compute gradient based on Algorithm 5.1 of GPML
+        d_Z = np.empty(theta.shape[0])
+        R = W_sr.dot(cho_solve((L, True), W_sr))  # Line 7
+        C = solve(L, W_sr.dot(K))  # Line 8
+        # Line 9:
+        s_2 = -0.5*(np.diag(K) - np.diag(C.T.dot(C))) \
+            * (pi * (1 - pi) * (1 - 2*pi))  # third derivative
+        for j in range(d_Z.shape[0]):
+            C = K_gradient[..., j]   # Line 11
+            s_1 = .5 * a.T.dot(C).dot(a) - .5 * np.trace(R.dot(C))  # Line 12
+
+            b = C.dot(self.y_fit_ - pi)  # Line 13
+            s_3 = b - K.dot(R).dot(b)  # Line 14
+
+            d_Z[j] = s_1 + s_2.T.dot(s_3)  # Line 15
+
+        return Z, d_Z
+
+    def _posterior_mode(self, K, return_temporaries=False):
+        # Based on Algorithm 3.1 of GPML
+        f = np.zeros_like(self.y_fit_, dtype=np.float64)
+        log_marginal_likelihood = -np.inf
+        while True:
+            # Line 4
+            pi = 1 / (1 + np.exp(-f))
+            W = -np.diag(-pi*(1-pi))  # XXX: avoid creating square matrix?
+            # Line 5
+            W_sr = np.sqrt(W)
+            B = np.eye(W.shape[0]) + W_sr.dot(K).dot(W_sr)
+            L = cholesky(B, lower=True)
+            # Line 6
+            b = W.dot(f) + (self.y_fit_ - pi)
+            # Line 7
+            a = b - W_sr.dot(cho_solve((L, True), W_sr.dot(K).dot(b)))
+            # Line 8
+            f = K.dot(a)
+
+            # Line 10: Compute log marginal likelihood in loop and use as
+            #          convergence criterion
+            lml = -0.5*a.T.dot(f) \
+                - np.log(1 + np.exp(-(self.y_fit_*2 - 1)*f)).sum() \
+                - np.log(np.diag(L)).sum()
+            # Check if we have converged (log marginal likelihood does
+            # not decrease)
+            # XXX: more complex convergence criterion
+            if lml - log_marginal_likelihood < 1e-10:
+                break
+            log_marginal_likelihood = lml
+
+        if return_temporaries:
+            return f, log_marginal_likelihood, (pi, W_sr, L, b, a)
+        else:
+            return f, log_marginal_likelihood

From d8e34adbaf8747081b2b50355238abd25ff8453f Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 17 Feb 2015 07:20:46 +0100
Subject: [PATCH 004/165] TST Tests for GP regression and classification

---
 sklearn/gaussian_process/__init__.py       |   4 +
 sklearn/gaussian_process/tests/test_gpc.py |  78 ++++++++++++++
 sklearn/gaussian_process/tests/test_gpr.py | 113 +++++++++++++++++++++
 3 files changed, 195 insertions(+)
 create mode 100644 sklearn/gaussian_process/tests/test_gpc.py
 create mode 100644 sklearn/gaussian_process/tests/test_gpr.py

diff --git a/sklearn/gaussian_process/__init__.py b/sklearn/gaussian_process/__init__.py
index 1d903181be719..691b025872ca3 100644
--- a/sklearn/gaussian_process/__init__.py
+++ b/sklearn/gaussian_process/__init__.py
@@ -9,6 +9,10 @@
 based predictions.
 """
 
+from .gpr import GaussianProcessRegression
+from .gpc import GaussianProcessClassification
+from . import kernels
+
 from .gaussian_process import GaussianProcess
 from . import correlation_models
 from . import regression_models
diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py
new file mode 100644
index 0000000000000..1f828ccbe02b2
--- /dev/null
+++ b/sklearn/gaussian_process/tests/test_gpc.py
@@ -0,0 +1,78 @@
+"""Testing for Gaussian process classification """
+
+# Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
+# Licence: BSD 3 clause
+
+from copy import deepcopy
+
+import numpy as np
+
+from scipy.optimize import approx_fprime
+
+from sklearn.gaussian_process import GaussianProcessClassification
+from sklearn.gaussian_process.kernels import RBF
+
+from sklearn.utils.testing import (assert_true, assert_greater,
+                                   assert_almost_equal, assert_array_equal)
+
+
+f = lambda x: x * np.sin(x)
+X = np.atleast_2d(np.linspace(0, 10, 30)).T
+X2 = np.atleast_2d([2., 4., 5.5, 6.5, 7.5]).T
+y = np.array(f(X).ravel() > 0, dtype=int)
+
+
+kernels = [RBF(0.1), RBF((1e-3, 1.0, 1e3)),
+           (1e-2, 1.0, 1e2) * RBF((1e-3, 0.1, 1e3))]
+
+
+def test_predict_consistent():
+    """ Check binary predict decision has also predicted probability above 0.5.
+    """
+    for kernel in kernels:
+        kernel = deepcopy(kernel)
+        gpc = GaussianProcessClassification(kernel=kernel).fit(X, y)
+        assert_array_equal(gpc.predict(X),
+                           gpc.predict_proba(X) >=0.5)
+
+
+def test_lml_improving():
+    """ Test that hyperparameter-tuning improves log-marginal likelihood. """
+    for kernel in kernels:
+        if not kernel.has_bounds:
+            continue
+        kernel = deepcopy(kernel)
+        params_initial = kernel.params
+        gpc = GaussianProcessClassification(kernel=kernel).fit(X, y)
+        assert_greater(gpc.log_marginal_likelihood(kernel.params),
+                       gpc.log_marginal_likelihood(params_initial))
+
+
+def test_converged_to_local_maximum():
+    """ Test that we are in local maximum after hyperparameter-optimization. """
+    for kernel in kernels:
+        if not kernel.has_bounds:
+            continue
+        kernel = deepcopy(kernel)
+        gpc = GaussianProcessClassification(kernel=kernel).fit(X, y)
+
+        lml, lml_gradient = gpc.log_marginal_likelihood(kernel.params, True)
+
+        assert_almost_equal(lml_gradient, 0, 2)
+
+
+def test_lml_gradient():
+    """ Compare analytic and numeric gradient of log marginal likelihood. """
+    for kernel in kernels:
+        kernel = deepcopy(kernel)
+        params = kernel.params
+        gpc = GaussianProcessClassification(kernel=kernel).fit(X, y)
+
+        lml, lml_gradient = gpc.log_marginal_likelihood(params, True)
+        lml_gradient_approx = \
+            approx_fprime(params,
+                          lambda theta: gpc.log_marginal_likelihood(theta,
+                                                                    False),
+                          1e-10)
+
+        assert_almost_equal(lml_gradient, lml_gradient_approx, 3)
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
new file mode 100644
index 0000000000000..40efcae385398
--- /dev/null
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -0,0 +1,113 @@
+"""Testing for Gaussian process regression """
+
+# Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
+# Licence: BSD 3 clause
+
+from copy import deepcopy
+
+import numpy as np
+
+from scipy.optimize import approx_fprime
+
+from sklearn.gaussian_process import GaussianProcessRegression
+from sklearn.gaussian_process.kernels import RBF
+
+from sklearn.utils.testing import (assert_true, assert_greater,
+                                   assert_almost_equal)
+
+
+f = lambda x: x * np.sin(x)
+X = np.atleast_2d([1., 3., 5., 6., 7., 8.]).T
+X2 = np.atleast_2d([2., 4., 5.5, 6.5, 7.5]).T
+y = f(X).ravel()
+
+
+kernels = [RBF(0.1), RBF((1e-3, 1.0, 1e3)),
+           (1e-2, 1.0, 1e2) * RBF((1e-3, 0.1, 1e3)),
+           (1e-2, 1.0, 1e2) * RBF((1e-3, 0.1, 1e3)) + (None, 0.0, None),
+           (1e-2, 0.1, 1e2) * RBF((1e-3, 0.1, 1e3)) + (None, 0.0, None)]
+
+def test_gpr_interpolation():
+    """Test the interpolating property for different kernels."""
+    for kernel in kernels:
+        kernel = deepcopy(kernel)
+        gpr = GaussianProcessRegression(kernel=kernel).fit(X, y)
+        y_pred, y_cov = gpr.predict(X, return_cov=True)
+
+        assert_true(np.allclose(y_pred, y))
+        assert_true(np.allclose(np.diag(y_cov), 0.))
+
+
+def test_lml_improving():
+    """ Test that hyperparameter-tuning improves log-marginal likelihood. """
+    for kernel in kernels:
+        if not kernel.has_bounds:
+            continue
+        kernel = deepcopy(kernel)
+        params_initial = kernel.params
+        gpr = GaussianProcessRegression(kernel=kernel).fit(X, y)
+        assert_greater(gpr.log_marginal_likelihood(kernel.params),
+                       gpr.log_marginal_likelihood(params_initial))
+
+
+def test_converged_to_local_maximum():
+    """ Test that we are in local maximum after hyperparameter-optimization. """
+    for kernel in kernels:
+        if not kernel.has_bounds:
+            continue
+        kernel = deepcopy(kernel)
+        gpr = GaussianProcessRegression(kernel=kernel).fit(X, y)
+
+        lml, lml_gradient = gpr.log_marginal_likelihood(kernel.params, True)
+
+        assert_almost_equal(lml_gradient, 0, 5)
+
+
+def test_lml_gradient():
+    """ Compare analytic and numeric gradient of log marginal likelihood. """
+    for kernel in kernels:
+        kernel = deepcopy(kernel)
+        params = kernel.params
+        gpr = GaussianProcessRegression(kernel=kernel).fit(X, y)
+
+        lml, lml_gradient = gpr.log_marginal_likelihood(params, True)
+        lml_gradient_approx = \
+            approx_fprime(params,
+                          lambda theta: gpr.log_marginal_likelihood(theta,
+                                                                    False),
+                          1e-10)
+
+        assert_almost_equal(lml_gradient, lml_gradient_approx, 3)
+
+
+def test_prior():
+    """ Test that GP prior has mean 0 and identical variances."""
+    for kernel in kernels:
+        kernel = deepcopy(kernel)
+        gpr = GaussianProcessRegression(kernel=kernel)
+
+        y_mean, y_cov = gpr.predict(X, return_cov=True)
+
+        assert_almost_equal(y_mean, 0, 5)
+        if len(kernel.params) > 1:
+            # XXX: quite hacky, works only for current kernels
+            assert_almost_equal(np.diag(y_cov), kernel.params[0] , 5)
+        else:
+            assert_almost_equal(np.diag(y_cov), 1, 5)
+
+
+def test_sample_statistics():
+    """ Test that statistics of samples drawn from GP are correct."""
+    for kernel in kernels:
+        kernel = deepcopy(kernel)
+        gpr = GaussianProcessRegression(kernel=kernel).fit(
+            X, y)
+
+        y_mean, y_cov = gpr.predict(X2, return_cov=True)
+
+        samples = gpr.sample(X2, 1000000)
+
+        # More digits accuracy would require many more samples
+        assert_almost_equal(y_mean, np.mean(samples, 1), 2)
+        assert_almost_equal(np.diag(y_cov) / np.diag(y_cov).max(),
+                            np.var(samples, 1) / np.diag(y_cov).max(), 1)

From 4ac2f5b601be96ef7757509b580022a3f006e115 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 17 Feb 2015 07:21:06 +0100
Subject: [PATCH 005/165] ADD Examples for GP classification

---
 examples/gaussian_process/new/plot_gpc.py     | 48 +++++++++++++++++++
 examples/gaussian_process/new/plot_gpc_xor.py | 31 ++++++++++++
 2 files changed, 79 insertions(+)
 create mode 100644 examples/gaussian_process/new/plot_gpc.py
 create mode 100644 examples/gaussian_process/new/plot_gpc_xor.py

diff --git a/examples/gaussian_process/new/plot_gpc.py b/examples/gaussian_process/new/plot_gpc.py
new file mode 100644
index 0000000000000..5ee15248a55d5
--- /dev/null
+++ b/examples/gaussian_process/new/plot_gpc.py
@@ -0,0 +1,48 @@
+
+import numpy as np
+
+from sklearn.gaussian_process import GaussianProcessClassification
+from sklearn.gaussian_process.kernels import RBF
+
+np.random.seed(0)
+
+
+# Generate data
+X = np.random.uniform(0, 5, 50)[:, None]
+y = np.array(np.sin((X[:, 0] - 2.5) ** 2) > 0.0, dtype=int)
+
+# Specify Gaussian Processes with fixed and optimized hyperparameters
+kernel_fix = 4.0 * RBF(param_space=[1.0])
+gp_fix = GaussianProcessClassification(kernel=kernel_fix).fit(X, y)
+
+kernel_opt = (1e-10, 1.0, 100) * RBF(param_space=(1e-10, 1, 10))
+gp_opt = GaussianProcessClassification(kernel=kernel_opt).fit(X, y)
+
+print "Log Marginal Likelihood (initial): %.3f" % \
+    gp_fix.log_marginal_likelihood(gp_fix.theta_)
+print "Log Marginal Likelihood (optimized): %.3f" % \
+    gp_fix.log_marginal_likelihood(gp_opt.theta_)
+
+
+# Plot posteriors
+import pylab
+pylab.figure(0)
+pylab.scatter(X[:, 0], y)
+X_ = np.linspace(0, 5, 100)
+pylab.plot(X_, gp_fix.predict_proba(X_[:, None]), 'r', label="Fixed")
+pylab.plot(X_, gp_opt.predict_proba(X_[:, None]), 'b', label="Optimized")
+pylab.legend(loc="best")
+
+# Plot LML landscape
+pylab.figure(1)
+theta0 = np.logspace(0, 8, 50)
+theta1 = np.logspace(-1, 1, 49)
+Theta0, Theta1 = np.meshgrid(theta0, theta1)
+LML = [[gp_opt.log_marginal_likelihood([Theta0[i, j], Theta1[i, j]])
+        for i in range(Theta0.shape[0])] for j in range(Theta0.shape[1])]
+LML = np.array(LML).T
+pylab.pcolor(Theta0, Theta1, LML)
+pylab.xscale("log")
+pylab.yscale("log")
+pylab.colorbar()
+pylab.show()
diff --git a/examples/gaussian_process/new/plot_gpc_xor.py b/examples/gaussian_process/new/plot_gpc_xor.py
new file mode 100644
index 0000000000000..401e5c571e287
--- /dev/null
+++ b/examples/gaussian_process/new/plot_gpc_xor.py
@@ -0,0 +1,31 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+from sklearn.gaussian_process import GaussianProcessClassification
+from sklearn.gaussian_process.kernels import RBF
+
+xx, yy = np.meshgrid(np.linspace(-3, 3, 100),
+                     np.linspace(-3, 3, 100))
+np.random.seed(0)
+X = np.random.randn(200, 2)
+Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)
+
+# fit the model
+kernel = [1e-10, 1.0, 100] * RBF(param_space=(1e-10, 1.0, 10))
+clf = GaussianProcessClassification(kernel=kernel).fit(X, Y)
+
+# plot the decision function for each datapoint on the grid
+Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])
+Z = Z.reshape(xx.shape)
+
+image = plt.imshow(Z, interpolation='nearest',
+           extent=(xx.min(), xx.max(), yy.min(), yy.max()), aspect='auto',
+           origin='lower', cmap=plt.cm.PuOr_r)
+contours = plt.contour(xx, yy, Z, levels=[0], linewidths=2,
+                       linetypes='--')
+plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired)
+plt.xticks(())
+plt.yticks(())
+plt.axis([-3, 3, -3, 3])
+plt.colorbar(image)
+plt.show()
\ No newline at end of file

From a6be0aea9dc1a54a5c09f03f80e9bf7fbe108153 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 17 Feb 2015 07:21:21 +0100
Subject: [PATCH 006/165] ADD Examples for GP regression

---
 .../gaussian_process/new/plot_gpr_noisy.py    | 48 +++++++++++++++++
 .../new/plot_gpr_prior_posterior.py           | 51 +++++++++++++++++++
 2 files changed, 99 insertions(+)
 create mode 100644 examples/gaussian_process/new/plot_gpr_noisy.py
 create mode 100644 examples/gaussian_process/new/plot_gpr_prior_posterior.py

diff --git a/examples/gaussian_process/new/plot_gpr_noisy.py b/examples/gaussian_process/new/plot_gpr_noisy.py
new file mode 100644
index 0000000000000..65f67814a5498
--- /dev/null
+++ b/examples/gaussian_process/new/plot_gpr_noisy.py
@@ -0,0 +1,48 @@
+
+import numpy as np
+import pylab
+from matplotlib.colors import LogNorm
+
+from sklearn.gaussian_process import GaussianProcessRegression
+from sklearn.gaussian_process.kernels import RBF, WhiteKernel
+
+
+np.random.seed(0)
+X = np.random.uniform(0, 5, 20)[:, None]
+y = 0.5*np.sin(3*X[:, 0]) + np.random.normal(0, 0.5, X.shape[0])
+
+rbf_kernel = (1e-10, 1.0, 100) * RBF(param_space=(1e-10, 1.0, None))
+white_kernel = WhiteKernel(param_space=(1e-10, 1e-5, 1e+1))
+
+gp = GaussianProcessRegression(kernel=rbf_kernel + white_kernel,
+                               y_err=0.0).fit(X, y)
+
+pylab.figure(0)
+X_ = np.linspace(0, 5, 100)
+y_mean, y_cov = gp.predict(X_[:, None], return_cov=True)
+pylab.plot(X_, y_mean, 'k', lw=3, zorder=9)
+pylab.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)),
+                   y_mean + np.sqrt(np.diag(y_cov)),
+                   alpha=0.5, color='k')
+
+pylab.scatter(X[:, 0], y, c='r', s=50, zorder=10)
+
+# Plot LML landscape
+pylab.figure(1)
+theta0 = np.logspace(-2, 3, 49)
+theta1 = np.logspace(-2, 0, 50)
+Theta0, Theta1 = np.meshgrid(theta0, theta1)
+LML = [[gp.log_marginal_likelihood([0.36, Theta0[i, j], Theta1[i, j]])
+        for i in range(Theta0.shape[0])] for j in range(Theta0.shape[1])]
+LML = np.array(LML).T
+
+vmin, vmax = (-LML).min(), (-LML).max()
+vmax = 50
+pylab.contour(Theta0, Theta1, -LML,
+              levels=np.logspace(np.log10(vmin), np.log10(vmax), 50),
+              norm=LogNorm(vmin=vmin, vmax=vmax))
+pylab.xscale("log")
+pylab.yscale("log")
+pylab.colorbar()
+
+pylab.show()
\ No newline at end of file
diff --git a/examples/gaussian_process/new/plot_gpr_prior_posterior.py b/examples/gaussian_process/new/plot_gpr_prior_posterior.py
new file mode 100644
index 0000000000000..7e6d202bc3a7b
--- /dev/null
+++ b/examples/gaussian_process/new/plot_gpr_prior_posterior.py
@@ -0,0 +1,51 @@
+
+import numpy as np
+import pylab
+
+from sklearn.gaussian_process import GaussianProcessRegression
+from sklearn.gaussian_process.kernels import RBF
+
+np.random.seed(0)
+
+
+# Specify Gaussian Process
+kernel = (1e-10, 1.0, 100) * RBF(param_space=(1e-10, 1.0, None))
+gp = GaussianProcessRegression(kernel=kernel)
+
+# Plot prior
+pylab.figure(0, figsize=(8, 8))
+pylab.subplot(2, 1, 1)
+X_ = np.linspace(0, 5, 100)
+y_mean, y_cov = gp.predict(X_[:, None], return_cov=True)
+pylab.plot(X_, y_mean, 'k', lw=3, zorder=9)
+pylab.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)),
+                   y_mean + np.sqrt(np.diag(y_cov)),
+                   alpha=0.5, color='k')
+y_samples = gp.sample(X_[:, None], 10)
+pylab.plot(X_, y_samples, color='b', lw=1)
+pylab.xlim(0, 5)
+pylab.ylim(-3, 3)
+pylab.title("Prior")
+
+# Generate data and fit GP
+X = np.random.uniform(0, 5, 10)[:, None]
+y = np.sin((X[:, 0] - 2.5) ** 2)
+gp.fit(X, y)
+
+# Plot posterior
+pylab.subplot(2, 1, 2)
+X_ = np.linspace(0, 5, 100)
+y_mean, y_cov = gp.predict(X_[:, None], return_cov=True)
+pylab.plot(X_, y_mean, 'k', lw=3, zorder=9)
+pylab.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)),
+                   y_mean + np.sqrt(np.diag(y_cov)),
+                   alpha=0.5, color='k')
+
+y_samples = gp.sample(X_[:, None], 10)
+pylab.plot(X_, y_samples, color='b', lw=1)
+pylab.scatter(X[:, 0], y, c='r', s=50, zorder=10)
+pylab.xlim(0, 5)
+pylab.ylim(-3, 3)
+pylab.title("Posterior")
+pylab.tight_layout()
+pylab.show()

From d55ccaf93673064dc64d3e2162b99ab11df3ebc9 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 17 Feb 2015 11:57:43 +0100
Subject: [PATCH 007/165] FIX cross_correlation of anisotropic RBF kernel
 computed correctly

---
 sklearn/gaussian_process/kernels.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 001f0138ab354..3749d07fc5c37 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -204,8 +204,8 @@ def auto_correlation(self, X, eval_gradient=False):
             return K
 
     def cross_correlation(self, X1, X2):
-        dists = cdist(X1, X2, metric='sqeuclidean')
-        K = np.exp(-dists / (2 * self.l**2))
+        dists = cdist(X1 / self.l, X2 / self.l, metric='sqeuclidean')
+        K = np.exp(-.5 * dists)
         return K
 
 

From 0765d095ccfddd6ae64e985aa6de5d63988cb843 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 18 Feb 2015 14:30:47 +0100
Subject: [PATCH 008/165] FIX Fix some bugs in kernel module discovered during
 writing unit test

---
 sklearn/gaussian_process/kernels.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 3749d07fc5c37..6b979451af737 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -16,7 +16,7 @@ def _parse_param_space(self, param_space):
             return
         param_space = np.atleast_2d(param_space)
         if param_space.shape[1] == 1:  # fixed hyperparameter
-            self.params = param_space
+            self.params = param_space[:, 0]
             self.has_bounds = False
         elif param_space.shape[1] == 2:  # lower+upper bound for hyperparameter
             self.bounds = param_space
@@ -195,7 +195,7 @@ def auto_correlation(self, X, eval_gradient=False):
             elif self.l.shape[0] == X.shape[1]:
                 # We need to recompute the pairwise dimension-wise distances
                 D = (X[:, None, :] - X[None, :, :])**2 / (self.l ** 3)
-                K_gradient = .5 * K[..., None] * D
+                K_gradient = K[..., None] * D
                 return K, K_gradient
             else:
                 raise Exception("Anisotropic kernels require that the number "
@@ -229,4 +229,6 @@ def auto_correlation(self, X, eval_gradient=False):
             return K
 
     def cross_correlation(self, X1, X2):
-        return np.zeros((X1.shape[0], X2.shape[1]))
+        K = np.zeros((X1.shape[0], X2.shape[0]))
+        K[cdist(X1, X2) < 1e-10] = 1  # entries which are sufficiently similar
+        return K

From 0240f4e9c0bdf5e15c506f9b5727fe972cebcf8c Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 18 Feb 2015 14:31:03 +0100
Subject: [PATCH 009/165] TST Test for GP kernels

---
 .../gaussian_process/tests/test_kernels.py    | 79 +++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 sklearn/gaussian_process/tests/test_kernels.py

diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
new file mode 100644
index 0000000000000..e7a930c15e3d8
--- /dev/null
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -0,0 +1,79 @@
+"""Testing for kernels for Gaussian processes."""
+
+# Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
+# Licence: BSD 3 clause
+
+from copy import deepcopy
+
+import numpy as np
+
+from scipy.optimize import approx_fprime
+
+from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
+
+from sklearn.utils.testing import assert_equal, assert_almost_equal
+
+
+X = np.random.normal(0, 1, (10, 2))
+
+kernels = [RBF(2.0), RBF([[0.5], [2.0]]),
+           2.0*RBF(0.5), RBF(2.0) + WhiteKernel(1.0),
+           ConstantKernel(10.0)]
+
+
+def test_kernel_gradient():
+    """ Compare analytic and numeric gradient of kernels. """
+    for kernel in kernels:
+        K, K_gradient = kernel.auto_correlation(X, eval_gradient=True)
+
+        assert_equal(K_gradient.shape[0], X.shape[0])
+        assert_equal(K_gradient.shape[1], X.shape[0])
+        assert_equal(K_gradient.shape[2], kernel.params.shape[0])
+
+        K_gradient_approx = np.empty_like(K_gradient)
+        for i in range(K.shape[0]):
+            for j in range(K.shape[1]):
+                def eval_kernel_ij_for_theta(theta):
+                    kernel_copy = deepcopy(kernel)
+                    kernel_copy.params = theta
+                    K = kernel_copy.auto_correlation(X, eval_gradient=False)
+                    return K[i, j]
+                K_gradient_approx[i, j] = \
+                    approx_fprime(kernel.params, eval_kernel_ij_for_theta,
+                                  1e-10)
+
+        assert_almost_equal(K_gradient, K_gradient_approx, 5)
+
+
+def test_auto_vs_cross():
+    """ Auto-correlation and cross-correlation should be consistent. """
+    for kernel in kernels:
+        print kernel
+        K_auto = kernel.auto_correlation(X)
+        K_cross = kernel.cross_correlation(X, X)
+
+        assert_almost_equal(K_auto, K_cross, 5)
+
+def test_kernel_operator_commutative():
+    """ Adding kernels and multiplying kernels should be commutative. """
+    # Check addition
+    assert_almost_equal((RBF(2.0) + 1.0).auto_correlation(X),
+                        (1.0 + RBF(2.0)).auto_correlation(X))
+
+    # Check multiplication
+    assert_almost_equal((3.0 * RBF(2.0)).auto_correlation(X),
+                        (RBF(2.0) * 3.0).auto_correlation(X))
+
+
+def test_kernel_anisotropic():
+    """ Anisotropic kernel should be consistent with isotropic kernels."""
+    K = RBF([[0.5], [2.0]]).auto_correlation(X)
+    X1 = np.array(X)
+    X1[:, 0] *= 4
+    K1 = RBF(2.0).auto_correlation(X1)
+    assert_almost_equal(K, K1)
+
+    X2 = np.array(X)
+    X2[:, 1] /= 4
+    K2 = RBF(0.5).auto_correlation(X2)
+    assert_almost_equal(K, K2)

From ecb5f39b206fc002f595bc8017aabf7dc52f92a7 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 18 Feb 2015 14:34:16 +0100
Subject: [PATCH 010/165] REFACTOR auto_correlation() -> auto(),
 cross_correlation() -> cross()

---
 sklearn/gaussian_process/gpc.py               | 12 +++---
 sklearn/gaussian_process/gpr.py               | 12 +++---
 sklearn/gaussian_process/kernels.py           | 40 +++++++++----------
 .../gaussian_process/tests/test_kernels.py    | 22 +++++-----
 4 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index ae9db5538e22e..3a5c9a93f7b09 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -61,7 +61,7 @@ def obj_func(theta):
 
         # Precompute quantities required for predictions which are independent
         # of actual query points
-        self.K_ = self.kernel.auto_correlation(self.X_fit_)
+        self.K_ = self.kernel.auto(self.X_fit_)
         self.K_[np.diag_indices_from(self.K_)] += self.jitter
 
         self.f_, _, (self.pi, self.W_sr, self.L, _, _) = \
@@ -74,7 +74,7 @@ def predict(self, X):
         # decisions, it is enough to compute the MAP of the posterior and
         # pass it through the link function
         K_star = \
-            self.kernel.cross_correlation(self.X_fit_, X)  # K_star =k(x_star)
+            self.kernel.cross(self.X_fit_, X)  # K_star =k(x_star)
         f_star = K_star.T.dot(self.y_fit_ - self.pi)  # Line 4
 
         return f_star > 0
@@ -82,10 +82,10 @@ def predict(self, X):
     def predict_proba(self, X):
         # Based on Algorithm 3.2 of GPML
         K_star = \
-            self.kernel.cross_correlation(self.X_fit_, X)  # K_star =k(x_star)
+            self.kernel.cross(self.X_fit_, X)  # K_star =k(x_star)
         f_star = K_star.T.dot(self.y_fit_ - self.pi)  # Line 4
         v = solve(self.L, self.W_sr.dot(K_star))  # Line 5
-        var_f_star = self.kernel.auto_correlation(X) - v.T.dot(v)  # Line 6
+        var_f_star = self.kernel.auto(X) - v.T.dot(v)  # Line 6
 
         # Line 7:
         # Approximate \int log(z) * N(z | f_star, var_f_star)
@@ -112,9 +112,9 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
 
         if eval_gradient:
             K, K_gradient = \
-                kernel.auto_correlation(self.X_fit_, eval_gradient=True)
+                kernel.auto(self.X_fit_, eval_gradient=True)
         else:
-            K = kernel.auto_correlation(self.X_fit_)
+            K = kernel.auto(self.X_fit_)
 
         K[np.diag_indices_from(K)] += self.jitter
 
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 9d44565cfc58b..a3e28c9e9cd38 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -71,7 +71,7 @@ def obj_func(theta):
 
         # Precompute quantities required for predictions which are independent
         # of actual query points
-        K = self.kernel.auto_correlation(self.X_fit_)
+        K = self.kernel.auto(self.X_fit_)
         K[np.diag_indices_from(K)] += self.y_err
         self.L_ = cholesky(K, lower=True)  # Line 2
         self.alpha_ = cho_solve((self.L_, True), self.y_fit_)  # Line 3
@@ -84,17 +84,17 @@ def predict(self, X, return_cov=False):
         if not hasattr(self, "X_fit_"):  # Unfitted; predict based on GP prior
             y_mean = np.zeros(X.shape[0])
             if return_cov:
-                y_cov = self.kernel.auto_correlation(X)
+                y_cov = self.kernel.auto(X)
                 return y_mean, y_cov
             else:
                 return y_mean
         else:  # Predict based on GP posterior
-            K_trans = self.kernel.cross_correlation(X, self.X_fit_)
+            K_trans = self.kernel.cross(X, self.X_fit_)
             y_mean = K_trans.dot(self.alpha_)  # Line 4 (y_mean = f_star)
             if return_cov:
                 v = cho_solve((self.L_, True), K_trans.T)  # Line 5
                 y_cov = \
-                    self.kernel.auto_correlation(X) - K_trans.dot(v)  # Line 6
+                    self.kernel.auto(X) - K_trans.dot(v)  # Line 6
                 return y_mean, y_cov
             else:
                 return y_mean
@@ -112,9 +112,9 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
 
         if eval_gradient:
             K, K_gradient = \
-                kernel.auto_correlation(self.X_fit_, eval_gradient=True)
+                kernel.auto(self.X_fit_, eval_gradient=True)
         else:
-            K = kernel.auto_correlation(self.X_fit_)
+            K = kernel.auto(self.X_fit_)
 
         K[np.diag_indices_from(K)] += self.y_err
         L = cholesky(K, lower=True)  # Line 2
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 6b979451af737..b45bc21e7d9ba 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -107,17 +107,17 @@ def bounds(self, bounds):
 
 class Sum(KernelOperator):
 
-    def auto_correlation(self, X, eval_gradient=False):
+    def auto(self, X, eval_gradient=False):
         if eval_gradient:
-            K1, K1_gradient = self.k1.auto_correlation(X, eval_gradient=True)
-            K2, K2_gradient = self.k2.auto_correlation(X, eval_gradient=True)
+            K1, K1_gradient = self.k1.auto(X, eval_gradient=True)
+            K2, K2_gradient = self.k2.auto(X, eval_gradient=True)
             return K1 + K2, np.dstack((K1_gradient, K2_gradient))
         else:
-            return self.k1.auto_correlation(X) + self.k2.auto_correlation(X)
+            return self.k1.auto(X) + self.k2.auto(X)
 
-    def cross_correlation(self, X1, X2):
-        return self.k1.cross_correlation(X1, X2) \
-            + self.k2.cross_correlation(X1, X2)
+    def cross(self, X1, X2):
+        return self.k1.cross(X1, X2) \
+            + self.k2.cross(X1, X2)
 
     def __repr__(self):
         return "{0} + {1}".format(self.k1, self.k2)
@@ -125,18 +125,18 @@ def __repr__(self):
 
 class Product(KernelOperator):
 
-    def auto_correlation(self, X, eval_gradient=False):
+    def auto(self, X, eval_gradient=False):
         if eval_gradient:
-            K1, K1_gradient = self.k1.auto_correlation(X, eval_gradient=True)
-            K2, K2_gradient = self.k2.auto_correlation(X, eval_gradient=True)
+            K1, K1_gradient = self.k1.auto(X, eval_gradient=True)
+            K2, K2_gradient = self.k2.auto(X, eval_gradient=True)
             return K1 * K2, np.dstack((K1_gradient * K2[:, :, None],
                                        K2_gradient * K1[:, :, None]))
         else:
-            return self.k1.auto_correlation(X) * self.k2.auto_correlation(X)
+            return self.k1.auto(X) * self.k2.auto(X)
 
-    def cross_correlation(self, X1, X2):
-        return self.k1.cross_correlation(X1, X2) \
-            * self.k2.cross_correlation(X1, X2)
+    def cross(self, X1, X2):
+        return self.k1.cross(X1, X2) \
+            * self.k2.cross(X1, X2)
 
     def __repr__(self):
         return "{0} * {1}".format(self.k1, self.k2)
@@ -156,14 +156,14 @@ def params(self, theta):
         assert len(theta) == 1
         self.value = theta[0]
 
-    def auto_correlation(self, X, eval_gradient=False):
+    def auto(self, X, eval_gradient=False):
         K = self.value * np.ones((X.shape[0], X.shape[0]))
         if eval_gradient:
             return K, np.ones((X.shape[0], X.shape[0], 1))
         else:
             return K
 
-    def cross_correlation(self, X1, X2):
+    def cross(self, X1, X2):
         return self.value * np.ones((X1.shape[0], X2.shape[0]))
 
     def __repr__(self):
@@ -182,7 +182,7 @@ def params(self):
     def params(self, theta):
         self.l = theta
 
-    def auto_correlation(self, X, eval_gradient=False):
+    def auto(self, X, eval_gradient=False):
         dists = pdist(X / self.l, metric='sqeuclidean')
         K = np.exp(-.5 * dists)
         # convert from upper-triangular matrix to square matrix
@@ -203,7 +203,7 @@ def auto_correlation(self, X, eval_gradient=False):
         else:
             return K
 
-    def cross_correlation(self, X1, X2):
+    def cross(self, X1, X2):
         dists = cdist(X1 / self.l, X2 / self.l, metric='sqeuclidean')
         K = np.exp(-.5 * dists)
         return K
@@ -221,14 +221,14 @@ def params(self):
     def params(self, theta):
         self.c = theta[0]
 
-    def auto_correlation(self, X, eval_gradient=False):
+    def auto(self, X, eval_gradient=False):
         K = self.c * np.eye(X.shape[0])
         if eval_gradient:
             return K, np.eye(X.shape[0])[:, :, None]
         else:
             return K
 
-    def cross_correlation(self, X1, X2):
+    def cross(self, X1, X2):
         K = np.zeros((X1.shape[0], X2.shape[0]))
         K[cdist(X1, X2) < 1e-10] = 1  # entries which are sufficiently similar
         return K
diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index e7a930c15e3d8..13376408f5797 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -24,7 +24,7 @@
 def test_kernel_gradient():
     """ Compare analytic and numeric gradient of kernels. """
     for kernel in kernels:
-        K, K_gradient = kernel.auto_correlation(X, eval_gradient=True)
+        K, K_gradient = kernel.auto(X, eval_gradient=True)
 
         assert_equal(K_gradient.shape[0], X.shape[0])
         assert_equal(K_gradient.shape[1], X.shape[0])
@@ -36,7 +36,7 @@ def test_kernel_gradient():
                 def eval_kernel_ij_for_theta(theta):
                     kernel_copy = deepcopy(kernel)
                     kernel_copy.params = theta
-                    K = kernel_copy.auto_correlation(X, eval_gradient=False)
+                    K = kernel_copy.auto(X, eval_gradient=False)
                     return K[i, j]
                 K_gradient_approx[i, j] = \
                     approx_fprime(kernel.params, eval_kernel_ij_for_theta,
@@ -49,31 +49,31 @@ def test_auto_vs_cross():
     """ Auto-correlation and cross-correlation should be consistent. """
     for kernel in kernels:
         print kernel
-        K_auto = kernel.auto_correlation(X)
-        K_cross = kernel.cross_correlation(X, X)
+        K_auto = kernel.auto(X)
+        K_cross = kernel.cross(X, X)
 
         assert_almost_equal(K_auto, K_cross, 5)
 
 def test_kernel_operator_commutative():
     """ Adding kernels and multiplying kernels should be commutative. """
     # Check addition
-    assert_almost_equal((RBF(2.0) + 1.0).auto_correlation(X),
-                        (1.0 + RBF(2.0)).auto_correlation(X))
+    assert_almost_equal((RBF(2.0) + 1.0).auto(X),
+                        (1.0 + RBF(2.0)).auto(X))
 
     # Check multiplication
-    assert_almost_equal((3.0 * RBF(2.0)).auto_correlation(X),
-                        (RBF(2.0) * 3.0).auto_correlation(X))
+    assert_almost_equal((3.0 * RBF(2.0)).auto(X),
+                        (RBF(2.0) * 3.0).auto(X))
 
 
 def test_kernel_anisotropic():
     """ Anisotropic kernel should be consistent with isotropic kernels."""
-    K = RBF([[0.5], [2.0]]).auto_correlation(X)
+    K = RBF([[0.5], [2.0]]).auto(X)
     X1 = np.array(X)
     X1[:, 0] *= 4
-    K1 = RBF(2.0).auto_correlation(X1)
+    K1 = RBF(2.0).auto(X1)
     assert_almost_equal(K, K1)
 
     X2 = np.array(X)
     X2[:, 1] /= 4
-    K2 = RBF(0.5).auto_correlation(X2)
+    K2 = RBF(0.5).auto(X2)
     assert_almost_equal(K, K2)

From ebb95f57bef723a94066413997e5a8970a0ac7df Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 18 Feb 2015 16:56:52 +0100
Subject: [PATCH 011/165] DOC GP's kernel module documented

---
 sklearn/gaussian_process/kernels.py | 258 +++++++++++++++++++++++++++-
 1 file changed, 250 insertions(+), 8 deletions(-)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index b45bc21e7d9ba..5d71fff7d2a74 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -1,13 +1,31 @@
-
-# This is strongly based on george's kernel module
-# Author: Jan Hendrik Metzen <vincent.dubourg@gmail.com>
+""" Kernels for Gaussian process regression and classification.
+
+The kernels in this module allow kernel-engineering, i.e., they can be
+combined via the "+" and "*" operators. These expressions can also contain
+scalar values, which are automatically converted to a constant kernel.
+
+All kernels allow (analytic) gradient-based hyperparameter optimization.
+The space of hyperparameters can be specified by giving lower and upper
+boundaries for the value of each hyperparameter (the search space is thus
+rectangular). This can be achieved by using a pair or triple instead of a
+single float wherever a parameter value is specified. In case of a pair,
+the first value specifies the lower boundary and the second value the upper
+boundary. In case of a triple, the middle value specifies the initial value
+of the parameter during hyperparameter-optimization.
+"""
+
+# Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
 # Licence: BSD 3 clause
 
+# Note: this module is strongly inspired by the kernel module of the george
+#       package.
+
 import numpy as np
 from scipy.spatial.distance import pdist, cdist, squareform
 
 
 class Kernel(object):
+    """ Base class for all kernels."""
 
     def _parse_param_space(self, param_space):
         if not hasattr(param_space, "__iter__"):  # fixed hyperparameter
@@ -75,6 +93,7 @@ def __repr__(self):
 
 
 class KernelOperator(Kernel):
+    """ Base class for all kernel operators. """
 
     def __init__(self, k1, k2):
         self.k1 = k1
@@ -106,8 +125,42 @@ def bounds(self, bounds):
 
 
 class Sum(KernelOperator):
+    """ Sum-kernel k1+k2 of two kernels k1 and k2.
+
+    The resulting kernel is defined as
+    k_sum(X1, X2) = k1(X1, X2) + k2(X1, X2)
+
+    Parameters
+    ----------
+    k1 : Kernel object
+        The first base-kernel of the sum-kernel
+
+    k2 : Kernel object
+        The second base-kernel of the sum-kernel
+    """
 
     def auto(self, X, eval_gradient=False):
+        """ Return the auto-kernel k(X, X) and optionally its gradient.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples, n_features)
+            Data for which the kernel k(X, X) is computed
+
+        eval_gradient : bool (optional, default=False)
+            Determines whether the gradient with respect to the kernel
+            hyperparameter is determined.
+
+        Returns
+        -------
+        K : array, shape (n_samples, n_samples)
+            Kernel k(X, X)
+
+        K_gradient : array (optional), shape (n_samples, n_samples, n_params)
+            The gradient of the kernel k(X, X) with respect to the
+            hyperparameter of the kernel. Only returned when eval_gradient
+            is True.
+        """
         if eval_gradient:
             K1, K1_gradient = self.k1.auto(X, eval_gradient=True)
             K2, K2_gradient = self.k2.auto(X, eval_gradient=True)
@@ -116,16 +169,64 @@ def auto(self, X, eval_gradient=False):
             return self.k1.auto(X) + self.k2.auto(X)
 
     def cross(self, X1, X2):
-        return self.k1.cross(X1, X2) \
-            + self.k2.cross(X1, X2)
+        """ Return the cross-kernel k(X1, X2).
+
+        Parameters
+        ----------
+        X1 : array, shape (n_samples_1, n_features)
+            Left argument of the returned kernel k(X1, X2)
+
+        X2 : array, shape (n_samples_2, n_features)
+            Right argument of the returned kernel k(X1, X2)
+
+        Returns
+        -------
+        K : array, shape (n_samples_1, n_samples_2)
+            Kernel k(X1, X2)
+        """
+        return self.k1.cross(X1, X2) + self.k2.cross(X1, X2)
 
     def __repr__(self):
         return "{0} + {1}".format(self.k1, self.k2)
 
 
 class Product(KernelOperator):
+    """ Product-kernel k1*k2 of two kernels k1 and k2.
+
+    The resulting kernel is defined as
+    k_prod(X1, X2) = k1(X1, X2) * k2(X1, X2)
+
+    Parameters
+    ----------
+    k1 : Kernel object
+        The first base-kernel of the product-kernel
+
+    k2 : Kernel object
+        The second base-kernel of the product-kernel
+    """
 
     def auto(self, X, eval_gradient=False):
+        """ Return the auto-kernel k(X, X) and optionally its gradient.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples, n_features)
+            Data for which the kernel k(X, X) is computed
+
+        eval_gradient : bool (optional, default=False)
+            Determines whether the gradient with respect to the kernel
+            hyperparameter is determined.
+
+        Returns
+        -------
+        K : array, shape (n_samples, n_samples)
+            Kernel k(X, X)
+
+        K_gradient : array (optional), shape (n_samples, n_samples, n_params)
+            The gradient of the kernel k(X, X) with respect to the
+            hyperparameter of the kernel. Only returned when eval_gradient
+            is True.
+        """
         if eval_gradient:
             K1, K1_gradient = self.k1.auto(X, eval_gradient=True)
             K2, K2_gradient = self.k2.auto(X, eval_gradient=True)
@@ -135,14 +236,34 @@ def auto(self, X, eval_gradient=False):
             return self.k1.auto(X) * self.k2.auto(X)
 
     def cross(self, X1, X2):
-        return self.k1.cross(X1, X2) \
-            * self.k2.cross(X1, X2)
+        """ Return the cross-kernel k(X1, X2).
+
+        Parameters
+        ----------
+        X1 : array, shape (n_samples_1, n_features)
+            Left argument of the returned kernel k(X1, X2)
+
+        X2 : array, shape (n_samples_2, n_features)
+            Right argument of the returned kernel k(X1, X2)
+
+        Returns
+        -------
+        K : array, shape (n_samples_1, n_samples_2)
+            Kernel k(X1, X2)
+        """
+        return self.k1.cross(X1, X2) * self.k2.cross(X1, X2)
 
     def __repr__(self):
         return "{0} * {1}".format(self.k1, self.k2)
 
 
 class ConstantKernel(Kernel):
+    """ Constant kernel.
+
+    Can be used as part of a product-kernel where it scales the magnitude of
+    the other factor (kernel) or as part of a sum-kernel, where it modifies
+    the mean of the Gaussian process.
+    """
 
     def __init__(self, param_space=1.0):
         self._parse_param_space(param_space)
@@ -157,6 +278,27 @@ def params(self, theta):
         self.value = theta[0]
 
     def auto(self, X, eval_gradient=False):
+        """ Return the auto-kernel k(X, X) and optionally its gradient.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples, n_features)
+            Data for which the kernel k(X, X) is computed
+
+        eval_gradient : bool (optional, default=False)
+            Determines whether the gradient with respect to the kernel
+            hyperparameter is determined.
+
+        Returns
+        -------
+        K : array, shape (n_samples, n_samples)
+            Kernel k(X, X)
+
+        K_gradient : array (optional), shape (n_samples, n_samples, n_params)
+            The gradient of the kernel k(X, X) with respect to the
+            hyperparameter of the kernel. Only returned when eval_gradient
+            is True.
+        """
         K = self.value * np.ones((X.shape[0], X.shape[0]))
         if eval_gradient:
             return K, np.ones((X.shape[0], X.shape[0], 1))
@@ -164,6 +306,21 @@ def auto(self, X, eval_gradient=False):
             return K
 
     def cross(self, X1, X2):
+        """ Return the cross-kernel k(X1, X2).
+
+        Parameters
+        ----------
+        X1 : array, shape (n_samples_1, n_features)
+            Left argument of the returned kernel k(X1, X2)
+
+        X2 : array, shape (n_samples_2, n_features)
+            Right argument of the returned kernel k(X1, X2)
+
+        Returns
+        -------
+        K : array, shape (n_samples_1, n_samples_2)
+            Kernel k(X1, X2)
+        """
         return self.value * np.ones((X1.shape[0], X2.shape[0]))
 
     def __repr__(self):
@@ -171,6 +328,11 @@ def __repr__(self):
 
 
 class RBF(Kernel):
+    """ Radial-basis function kernel (aka squared-exponential kernel).
+
+    Both isotropic and anisotropic versions are supported.
+    """
+
     def __init__(self, param_space=1.0):
         self._parse_param_space(param_space)
 
@@ -183,6 +345,27 @@ def params(self, theta):
         self.l = theta
 
     def auto(self, X, eval_gradient=False):
+        """ Return the auto-kernel k(X, X) and optionally its gradient.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples, n_features)
+            Data for which the kernel k(X, X) is computed
+
+        eval_gradient : bool (optional, default=False)
+            Determines whether the gradient with respect to the kernel
+            hyperparameter is determined.
+
+        Returns
+        -------
+        K : array, shape (n_samples, n_samples)
+            Kernel k(X, X)
+
+        K_gradient : array (optional), shape (n_samples, n_samples, n_params)
+            The gradient of the kernel k(X, X) with respect to the
+            hyperparameter of the kernel. Only returned when eval_gradient
+            is True.
+        """
         dists = pdist(X / self.l, metric='sqeuclidean')
         K = np.exp(-.5 * dists)
         # convert from upper-triangular matrix to square matrix
@@ -204,12 +387,34 @@ def auto(self, X, eval_gradient=False):
             return K
 
     def cross(self, X1, X2):
+        """ Return the cross-kernel k(X1, X2).
+
+        Parameters
+        ----------
+        X1 : array, shape (n_samples_1, n_features)
+            Left argument of the returned kernel k(X1, X2)
+
+        X2 : array, shape (n_samples_2, n_features)
+            Right argument of the returned kernel k(X1, X2)
+
+        Returns
+        -------
+        K : array, shape (n_samples_1, n_samples_2)
+            Kernel k(X1, X2)
+        """
         dists = cdist(X1 / self.l, X2 / self.l, metric='sqeuclidean')
         K = np.exp(-.5 * dists)
         return K
 
 
 class WhiteKernel(Kernel):
+    """ White kernel.
+
+    The main use-case of this kernel is as part of a sum-kernel where it
+    explains the noise-component of the signal. Tuning its parameter
+    corresponds to estimating the noise-level.
+    """
+
     def __init__(self, param_space=1.0):
         self._parse_param_space(param_space)
 
@@ -222,6 +427,27 @@ def params(self, theta):
         self.c = theta[0]
 
     def auto(self, X, eval_gradient=False):
+        """ Return the auto-kernel k(X, X) and optionally its gradient.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples, n_features)
+            Data for which the kernel k(X, X) is computed
+
+        eval_gradient : bool (optional, default=False)
+            Determines whether the gradient with respect to the kernel
+            hyperparameter is determined.
+
+        Returns
+        -------
+        K : array, shape (n_samples, n_samples)
+            Kernel k(X, X)
+
+        K_gradient : array (optional), shape (n_samples, n_samples, n_params)
+            The gradient of the kernel k(X, X) with respect to the
+            hyperparameter of the kernel. Only returned when eval_gradient
+            is True.
+        """
         K = self.c * np.eye(X.shape[0])
         if eval_gradient:
             return K, np.eye(X.shape[0])[:, :, None]
@@ -229,6 +455,22 @@ def auto(self, X, eval_gradient=False):
             return K
 
     def cross(self, X1, X2):
+        """ Return the cross-kernel k(X1, X2).
+
+        Parameters
+        ----------
+        X1 : array, shape (n_samples_1, n_features)
+            Left argument of the returned kernel k(X1, X2)
+
+        X2 : array, shape (n_samples_2, n_features)
+            Right argument of the returned kernel k(X1, X2)
+
+        Returns
+        -------
+        K : array, shape (n_samples_1, n_samples_2)
+            Kernel k(X1, X2)
+        """
         K = np.zeros((X1.shape[0], X2.shape[0]))
-        K[cdist(X1, X2) < 1e-10] = 1  # entries which are sufficiently similar
+        # entries which are sufficiently similar to be considered identical
+        K[cdist(X1, X2) < 1e-10] = 1
         return K

From 7024332a20ad2e08bfaf9fe162ac3cfb958f485d Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 18 Feb 2015 19:45:11 +0100
Subject: [PATCH 012/165] DOC Revised GP examples (added doc etc.)

---
 examples/gaussian_process/new/plot_gpc.py     | 28 +++++++--
 examples/gaussian_process/new/plot_gpc_xor.py | 14 ++++-
 .../gaussian_process/new/plot_gpr_noisy.py    | 63 ++++++++++++++++---
 .../new/plot_gpr_prior_posterior.py           | 14 ++++-
 sklearn/gaussian_process/kernels.py           |  3 +-
 5 files changed, 103 insertions(+), 19 deletions(-)

diff --git a/examples/gaussian_process/new/plot_gpc.py b/examples/gaussian_process/new/plot_gpc.py
index 5ee15248a55d5..8ca781576e835 100644
--- a/examples/gaussian_process/new/plot_gpc.py
+++ b/examples/gaussian_process/new/plot_gpc.py
@@ -1,3 +1,15 @@
+"""Gaussian process classification (GPC)
+
+This example illustrates both prediction of the prior GPC and the posterior
+GPC. While the posterior model has a considerably larger
+log-marginal-likelihood, the generated predictions are not optimal. This
+is caused by the Laplace approximations used internally by GPC.
+"""
+print __doc__
+
+# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
+#
+# License: BSD 3 clause
 
 import numpy as np
 
@@ -29,14 +41,18 @@
 pylab.figure(0)
 pylab.scatter(X[:, 0], y)
 X_ = np.linspace(0, 5, 100)
-pylab.plot(X_, gp_fix.predict_proba(X_[:, None]), 'r', label="Fixed")
-pylab.plot(X_, gp_opt.predict_proba(X_[:, None]), 'b', label="Optimized")
+pylab.plot(X_, gp_fix.predict_proba(X_[:, None]), 'r',
+           label="Initial kernel: %s" % kernel_fix)
+pylab.plot(X_, gp_opt.predict_proba(X_[:, None]), 'b',
+           label="Optimized kernel: %s" % kernel_opt)
 pylab.legend(loc="best")
+pylab.xlabel("Feature")
+pylab.ylabel("Class")
 
 # Plot LML landscape
 pylab.figure(1)
-theta0 = np.logspace(0, 8, 50)
-theta1 = np.logspace(-1, 1, 49)
+theta0 = np.logspace(0, 8, 30)
+theta1 = np.logspace(-1, 1, 29)
 Theta0, Theta1 = np.meshgrid(theta0, theta1)
 LML = [[gp_opt.log_marginal_likelihood([Theta0[i, j], Theta1[i, j]])
         for i in range(Theta0.shape[0])] for j in range(Theta0.shape[1])]
@@ -45,4 +61,8 @@
 pylab.xscale("log")
 pylab.yscale("log")
 pylab.colorbar()
+pylab.xlabel("Magnitude")
+pylab.ylabel("Length-scale")
+pylab.title("Log-marginal-likelihood")
+
 pylab.show()
diff --git a/examples/gaussian_process/new/plot_gpc_xor.py b/examples/gaussian_process/new/plot_gpc_xor.py
index 401e5c571e287..d227d7c1a80b4 100644
--- a/examples/gaussian_process/new/plot_gpc_xor.py
+++ b/examples/gaussian_process/new/plot_gpc_xor.py
@@ -1,11 +1,21 @@
+"""Gaussian process classification (GPC)
+
+This example illustrates GPC on XOR data.
+"""
+print __doc__
+
+# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
+#
+# License: BSD 3 clause
+
 import numpy as np
 import matplotlib.pyplot as plt
 
 from sklearn.gaussian_process import GaussianProcessClassification
 from sklearn.gaussian_process.kernels import RBF
 
-xx, yy = np.meshgrid(np.linspace(-3, 3, 100),
-                     np.linspace(-3, 3, 100))
+xx, yy = np.meshgrid(np.linspace(-3, 3, 50),
+                     np.linspace(-3, 3, 50))
 np.random.seed(0)
 X = np.random.randn(200, 2)
 Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)
diff --git a/examples/gaussian_process/new/plot_gpr_noisy.py b/examples/gaussian_process/new/plot_gpr_noisy.py
index 65f67814a5498..2f214c6b29945 100644
--- a/examples/gaussian_process/new/plot_gpr_noisy.py
+++ b/examples/gaussian_process/new/plot_gpr_noisy.py
@@ -1,3 +1,22 @@
+"""Gaussian process regression (GPR) with noise-level estimation.
+
+This example illustrates that GPR with a sum-kernel including a WhiteKernel can
+estimate the noise level of data. An illustration of the
+log-marginal-likelihood (LML) landscape shows that there exist two local
+maxima of LML. The first corresponds to a model with a high noise level and a
+large length scale, which explains all variations in the data by noise. The
+second one has a smaller noise level and shorter length scale, which explains
+most of the variation by the noise-free functional relationship. The second
+model has a higher likelihood; however, depending on the initial value for the
+hyperparameters, the gradient-based optimization might also converge to the
+high-noise solution. It is thus important to repeat the optimization several
+times for different initializations.
+"""
+print __doc__
+
+# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
+#
+# License: BSD 3 clause
 
 import numpy as np
 import pylab
@@ -11,24 +30,46 @@
 X = np.random.uniform(0, 5, 20)[:, None]
 y = 0.5*np.sin(3*X[:, 0]) + np.random.normal(0, 0.5, X.shape[0])
 
-rbf_kernel = (1e-10, 1.0, 100) * RBF(param_space=(1e-10, 1.0, None))
-white_kernel = WhiteKernel(param_space=(1e-10, 1e-5, 1e+1))
-
-gp = GaussianProcessRegression(kernel=rbf_kernel + white_kernel,
-                               y_err=0.0).fit(X, y)
-
+# First run
 pylab.figure(0)
+kernel = (1e-10, 1.0, None) * RBF(param_space=(1e-10, 100.0, None)) \
+    + WhiteKernel(param_space=(1e-10, 1e-5, 1e+1))
+kernel_str = str(kernel)
+gp = GaussianProcessRegression(kernel=kernel,
+                               y_err=0.0).fit(X, y)
 X_ = np.linspace(0, 5, 100)
 y_mean, y_cov = gp.predict(X_[:, None], return_cov=True)
 pylab.plot(X_, y_mean, 'k', lw=3, zorder=9)
 pylab.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)),
                    y_mean + np.sqrt(np.diag(y_cov)),
                    alpha=0.5, color='k')
+pylab.plot(X_, 0.5*np.sin(3*X_), 'r', lw=3, zorder=9)
+pylab.scatter(X[:, 0], y, c='r', s=50, zorder=10)
+pylab.title("Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s"
+            % (kernel_str, kernel, gp.log_marginal_likelihood(kernel.params)))
+pylab.tight_layout()
 
+# First run
+pylab.figure(1)
+kernel = (1e-10, 1.0, None) * RBF(param_space=(1e-10, 1.0, None)) \
+    + WhiteKernel(param_space=(1e-10, 1e-5, 1e+1))
+kernel_str = str(kernel)
+gp = GaussianProcessRegression(kernel=kernel,
+                               y_err=0.0).fit(X, y)
+X_ = np.linspace(0, 5, 100)
+y_mean, y_cov = gp.predict(X_[:, None], return_cov=True)
+pylab.plot(X_, y_mean, 'k', lw=3, zorder=9)
+pylab.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)),
+                   y_mean + np.sqrt(np.diag(y_cov)),
+                   alpha=0.5, color='k')
+pylab.plot(X_, 0.5*np.sin(3*X_), 'r', lw=3, zorder=9)
 pylab.scatter(X[:, 0], y, c='r', s=50, zorder=10)
+pylab.title("Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s"
+            % (kernel_str, kernel, gp.log_marginal_likelihood(kernel.params)))
+pylab.tight_layout()
 
 # Plot LML landscape
-pylab.figure(1)
+pylab.figure(2)
 theta0 = np.logspace(-2, 3, 49)
 theta1 = np.logspace(-2, 0, 50)
 Theta0, Theta1 = np.meshgrid(theta0, theta1)
@@ -41,8 +82,12 @@
 pylab.contour(Theta0, Theta1, -LML,
               levels=np.logspace(np.log10(vmin), np.log10(vmax), 50),
               norm=LogNorm(vmin=vmin, vmax=vmax))
+pylab.colorbar()
 pylab.xscale("log")
 pylab.yscale("log")
-pylab.colorbar()
+pylab.xlabel("Length-scale")
+pylab.ylabel("Noise-level")
+pylab.title("Log-marginal-likelihood")
+pylab.tight_layout()
 
-pylab.show()
\ No newline at end of file
+pylab.show()
diff --git a/examples/gaussian_process/new/plot_gpr_prior_posterior.py b/examples/gaussian_process/new/plot_gpr_prior_posterior.py
index 7e6d202bc3a7b..8983043cfdb1b 100644
--- a/examples/gaussian_process/new/plot_gpr_prior_posterior.py
+++ b/examples/gaussian_process/new/plot_gpr_prior_posterior.py
@@ -1,3 +1,13 @@
+"""Gaussian process regression (GPR) prior and posterior
+
+This example illustrates the prior and posterior of a GPR. Mean, standard
+deviation, and 10 samples are shown for both prior and posterior.
+"""
+print __doc__
+
+# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
+#
+# License: BSD 3 clause
 
 import numpy as np
 import pylab
@@ -25,7 +35,7 @@
 pylab.plot(X_, y_samples, color='b', lw=1)
 pylab.xlim(0, 5)
 pylab.ylim(-3, 3)
-pylab.title("Prior")
+pylab.title("Prior, kernel:  %s" % kernel)
 
 # Generate data and fit GP
 X = np.random.uniform(0, 5, 10)[:, None]
@@ -46,6 +56,6 @@
 pylab.scatter(X[:, 0], y, c='r', s=50, zorder=10)
 pylab.xlim(0, 5)
 pylab.ylim(-3, 3)
-pylab.title("Posterior")
+pylab.title("Posterior, kernel: %s" % kernel)
 pylab.tight_layout()
 pylab.show()
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 5d71fff7d2a74..fc18ddb3aad3e 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -87,8 +87,7 @@ def __rmul__(self, b):
 
     def __repr__(self):
         return "{0}({1})".format(self.__class__.__name__,
-                                 ", ".join(map("{0}".format,
-                                               self.params)))
+                                 ", ".join(map("{0}".format, self.params)))
 
 
 

From c4b345b6036bcc5d60f745d78671734ae22c1352 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 19 Feb 2015 11:08:14 +0100
Subject: [PATCH 013/165] ENH GPR.predict allows returning standard-deviations
 of predictions

---
 sklearn/gaussian_process/gpr.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index a3e28c9e9cd38..25044cb7dd9f1 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -78,7 +78,10 @@ def obj_func(theta):
 
         return self
 
-    def predict(self, X, return_cov=False):
+    def predict(self, X, return_std=False, return_cov=False):
+        assert not (return_std and return_cov), \
+            "Not returning standard deviation of predictions when " \
+            "returning full covariance."
         X = np.asarray(X)
 
         if not hasattr(self, "X_fit_"):  # Unfitted; predict based on GP prior
@@ -86,6 +89,10 @@ def predict(self, X, return_cov=False):
             if return_cov:
                 y_cov = self.kernel.auto(X)
                 return y_mean, y_cov
+            elif return_std:
+                # XXX: Compute y_std more efficiently
+                y_std = np.sqrt(np.diag(self.kernel.auto(X)))
+                return y_mean, y_std
             else:
                 return y_mean
         else:  # Predict based on GP posterior
@@ -96,6 +103,13 @@ def predict(self, X, return_cov=False):
                 y_cov = \
                     self.kernel.auto(X) - K_trans.dot(v)  # Line 6
                 return y_mean, y_cov
+            elif return_std:
+                # XXX: Compute y_std more efficiently
+                v = cho_solve((self.L_, True), K_trans.T)  # Line 5
+                y_cov = \
+                    self.kernel.auto(X) - K_trans.dot(v)  # Line 6
+                y_std = np.sqrt(np.diag(y_cov))
+                return y_mean, y_std
             else:
                 return y_mean
 

From 9f8a90c59b500c28db83512fa780c1dd01e2018e Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Fri, 20 Feb 2015 19:59:37 +0100
Subject: [PATCH 014/165] DOC Revised examples based on @eickenberg's comments

---
 examples/gaussian_process/new/plot_gpc.py     | 44 +++++------
 examples/gaussian_process/new/plot_gpc_xor.py | 13 ++--
 .../gaussian_process/new/plot_gpr_noisy.py    | 77 ++++++++++---------
 .../new/plot_gpr_prior_posterior.py           | 62 +++++++--------
 4 files changed, 99 insertions(+), 97 deletions(-)

diff --git a/examples/gaussian_process/new/plot_gpc.py b/examples/gaussian_process/new/plot_gpc.py
index 8ca781576e835..74358af8ef269 100644
--- a/examples/gaussian_process/new/plot_gpc.py
+++ b/examples/gaussian_process/new/plot_gpc.py
@@ -13,14 +13,15 @@
 
 import numpy as np
 
+from matplotlib import pyplot as plt
+
 from sklearn.gaussian_process import GaussianProcessClassification
 from sklearn.gaussian_process.kernels import RBF
 
-np.random.seed(0)
-
 
 # Generate data
-X = np.random.uniform(0, 5, 50)[:, None]
+rng = np.random.RandomState(0)
+X = rng.uniform(0, 5, 50)[:, np.newaxis]
 y = np.array(np.sin((X[:, 0] - 2.5) ** 2) > 0.0, dtype=int)
 
 # Specify Gaussian Processes with fixed and optimized hyperparameters
@@ -37,32 +38,31 @@
 
 
 # Plot posteriors
-import pylab
-pylab.figure(0)
-pylab.scatter(X[:, 0], y)
+plt.figure(0)
+plt.scatter(X[:, 0], y)
 X_ = np.linspace(0, 5, 100)
-pylab.plot(X_, gp_fix.predict_proba(X_[:, None]), 'r',
-           label="Initial kernel: %s" % kernel_fix)
-pylab.plot(X_, gp_opt.predict_proba(X_[:, None]), 'b',
-           label="Optimized kernel: %s" % kernel_opt)
-pylab.legend(loc="best")
-pylab.xlabel("Feature")
-pylab.ylabel("Class")
+plt.plot(X_, gp_fix.predict_proba(X_[:, np.newaxis]), 'r',
+         label="Initial kernel: %s" % kernel_fix)
+plt.plot(X_, gp_opt.predict_proba(X_[:, np.newaxis]), 'b',
+         label="Optimized kernel: %s" % kernel_opt)
+plt.legend(loc="best")
+plt.xlabel("Feature")
+plt.ylabel("Class")
 
 # Plot LML landscape
-pylab.figure(1)
+plt.figure(1)
 theta0 = np.logspace(0, 8, 30)
 theta1 = np.logspace(-1, 1, 29)
 Theta0, Theta1 = np.meshgrid(theta0, theta1)
 LML = [[gp_opt.log_marginal_likelihood([Theta0[i, j], Theta1[i, j]])
         for i in range(Theta0.shape[0])] for j in range(Theta0.shape[1])]
 LML = np.array(LML).T
-pylab.pcolor(Theta0, Theta1, LML)
-pylab.xscale("log")
-pylab.yscale("log")
-pylab.colorbar()
-pylab.xlabel("Magnitude")
-pylab.ylabel("Length-scale")
-pylab.title("Log-marginal-likelihood")
+plt.pcolor(Theta0, Theta1, LML)
+plt.xscale("log")
+plt.yscale("log")
+plt.colorbar()
+plt.xlabel("Magnitude")
+plt.ylabel("Length-scale")
+plt.title("Log-marginal-likelihood")
 
-pylab.show()
+plt.show()
diff --git a/examples/gaussian_process/new/plot_gpc_xor.py b/examples/gaussian_process/new/plot_gpc_xor.py
index d227d7c1a80b4..9d26785bc3a25 100644
--- a/examples/gaussian_process/new/plot_gpc_xor.py
+++ b/examples/gaussian_process/new/plot_gpc_xor.py
@@ -14,10 +14,11 @@
 from sklearn.gaussian_process import GaussianProcessClassification
 from sklearn.gaussian_process.kernels import RBF
 
+
 xx, yy = np.meshgrid(np.linspace(-3, 3, 50),
                      np.linspace(-3, 3, 50))
-np.random.seed(0)
-X = np.random.randn(200, 2)
+rng = np.random.RandomState(0)
+X = rng.randn(200, 2)
 Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)
 
 # fit the model
@@ -25,12 +26,12 @@
 clf = GaussianProcessClassification(kernel=kernel).fit(X, Y)
 
 # plot the decision function for each datapoint on the grid
-Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])
+Z = clf.predict_proba(np.vstack((xx.ravel(), yy.ravel())).T)
 Z = Z.reshape(xx.shape)
 
 image = plt.imshow(Z, interpolation='nearest',
-           extent=(xx.min(), xx.max(), yy.min(), yy.max()), aspect='auto',
-           origin='lower', cmap=plt.cm.PuOr_r)
+                   extent=(xx.min(), xx.max(), yy.min(), yy.max()),
+                   aspect='auto', origin='lower', cmap=plt.cm.PuOr_r)
 contours = plt.contour(xx, yy, Z, levels=[0], linewidths=2,
                        linetypes='--')
 plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired)
@@ -38,4 +39,4 @@
 plt.yticks(())
 plt.axis([-3, 3, -3, 3])
 plt.colorbar(image)
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/examples/gaussian_process/new/plot_gpr_noisy.py b/examples/gaussian_process/new/plot_gpr_noisy.py
index 2f214c6b29945..2101e226ec18e 100644
--- a/examples/gaussian_process/new/plot_gpr_noisy.py
+++ b/examples/gaussian_process/new/plot_gpr_noisy.py
@@ -19,57 +19,58 @@
 # License: BSD 3 clause
 
 import numpy as np
-import pylab
+
+from matplotlib import pyplot as plt
 from matplotlib.colors import LogNorm
 
 from sklearn.gaussian_process import GaussianProcessRegression
 from sklearn.gaussian_process.kernels import RBF, WhiteKernel
 
 
-np.random.seed(0)
-X = np.random.uniform(0, 5, 20)[:, None]
-y = 0.5*np.sin(3*X[:, 0]) + np.random.normal(0, 0.5, X.shape[0])
+rng = np.random.RandomState(0)
+X = rng.uniform(0, 5, 20)[:, np.newaxis]
+y = 0.5*np.sin(3*X[:, 0]) + rng.normal(0, 0.5, X.shape[0])
 
 # First run
-pylab.figure(0)
+plt.figure(0)
 kernel = (1e-10, 1.0, None) * RBF(param_space=(1e-10, 100.0, None)) \
     + WhiteKernel(param_space=(1e-10, 1e-5, 1e+1))
 kernel_str = str(kernel)
 gp = GaussianProcessRegression(kernel=kernel,
                                y_err=0.0).fit(X, y)
 X_ = np.linspace(0, 5, 100)
-y_mean, y_cov = gp.predict(X_[:, None], return_cov=True)
-pylab.plot(X_, y_mean, 'k', lw=3, zorder=9)
-pylab.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)),
-                   y_mean + np.sqrt(np.diag(y_cov)),
-                   alpha=0.5, color='k')
-pylab.plot(X_, 0.5*np.sin(3*X_), 'r', lw=3, zorder=9)
-pylab.scatter(X[:, 0], y, c='r', s=50, zorder=10)
-pylab.title("Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s"
-            % (kernel_str, kernel, gp.log_marginal_likelihood(kernel.params)))
-pylab.tight_layout()
+y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
+plt.plot(X_, y_mean, 'k', lw=3, zorder=9)
+plt.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)),
+                 y_mean + np.sqrt(np.diag(y_cov)),
+                 alpha=0.5, color='k')
+plt.plot(X_, 0.5*np.sin(3*X_), 'r', lw=3, zorder=9)
+plt.scatter(X[:, 0], y, c='r', s=50, zorder=10)
+plt.title("Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s"
+          % (kernel_str, kernel, gp.log_marginal_likelihood(kernel.params)))
+plt.tight_layout()
 
 # First run
-pylab.figure(1)
+plt.figure(1)
 kernel = (1e-10, 1.0, None) * RBF(param_space=(1e-10, 1.0, None)) \
     + WhiteKernel(param_space=(1e-10, 1e-5, 1e+1))
 kernel_str = str(kernel)
 gp = GaussianProcessRegression(kernel=kernel,
                                y_err=0.0).fit(X, y)
 X_ = np.linspace(0, 5, 100)
-y_mean, y_cov = gp.predict(X_[:, None], return_cov=True)
-pylab.plot(X_, y_mean, 'k', lw=3, zorder=9)
-pylab.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)),
-                   y_mean + np.sqrt(np.diag(y_cov)),
-                   alpha=0.5, color='k')
-pylab.plot(X_, 0.5*np.sin(3*X_), 'r', lw=3, zorder=9)
-pylab.scatter(X[:, 0], y, c='r', s=50, zorder=10)
-pylab.title("Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s"
-            % (kernel_str, kernel, gp.log_marginal_likelihood(kernel.params)))
-pylab.tight_layout()
+y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
+plt.plot(X_, y_mean, 'k', lw=3, zorder=9)
+plt.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)),
+                 y_mean + np.sqrt(np.diag(y_cov)),
+                 alpha=0.5, color='k')
+plt.plot(X_, 0.5*np.sin(3*X_), 'r', lw=3, zorder=9)
+plt.scatter(X[:, 0], y, c='r', s=50, zorder=10)
+plt.title("Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s"
+          % (kernel_str, kernel, gp.log_marginal_likelihood(kernel.params)))
+plt.tight_layout()
 
 # Plot LML landscape
-pylab.figure(2)
+plt.figure(2)
 theta0 = np.logspace(-2, 3, 49)
 theta1 = np.logspace(-2, 0, 50)
 Theta0, Theta1 = np.meshgrid(theta0, theta1)
@@ -79,15 +80,15 @@
 
 vmin, vmax = (-LML).min(), (-LML).max()
 vmax = 50
-pylab.contour(Theta0, Theta1, -LML,
-              levels=np.logspace(np.log10(vmin), np.log10(vmax), 50),
-              norm=LogNorm(vmin=vmin, vmax=vmax))
-pylab.colorbar()
-pylab.xscale("log")
-pylab.yscale("log")
-pylab.xlabel("Length-scale")
-pylab.ylabel("Noise-level")
-pylab.title("Log-marginal-likelihood")
-pylab.tight_layout()
+plt.contour(Theta0, Theta1, -LML,
+            levels=np.logspace(np.log10(vmin), np.log10(vmax), 50),
+            norm=LogNorm(vmin=vmin, vmax=vmax))
+plt.colorbar()
+plt.xscale("log")
+plt.yscale("log")
+plt.xlabel("Length-scale")
+plt.ylabel("Noise-level")
+plt.title("Log-marginal-likelihood")
+plt.tight_layout()
 
-pylab.show()
+plt.show()
diff --git a/examples/gaussian_process/new/plot_gpr_prior_posterior.py b/examples/gaussian_process/new/plot_gpr_prior_posterior.py
index 8983043cfdb1b..955dda4593f8e 100644
--- a/examples/gaussian_process/new/plot_gpr_prior_posterior.py
+++ b/examples/gaussian_process/new/plot_gpr_prior_posterior.py
@@ -10,52 +10,52 @@
 # License: BSD 3 clause
 
 import numpy as np
-import pylab
+
+from matplotlib import pyplot as plt
 
 from sklearn.gaussian_process import GaussianProcessRegression
 from sklearn.gaussian_process.kernels import RBF
 
-np.random.seed(0)
-
 
 # Specify Gaussian Process
 kernel = (1e-10, 1.0, 100) * RBF(param_space=(1e-10, 1.0, None))
 gp = GaussianProcessRegression(kernel=kernel)
 
 # Plot prior
-pylab.figure(0, figsize=(8, 8))
-pylab.subplot(2, 1, 1)
+plt.figure(0, figsize=(8, 8))
+plt.subplot(2, 1, 1)
 X_ = np.linspace(0, 5, 100)
-y_mean, y_cov = gp.predict(X_[:, None], return_cov=True)
-pylab.plot(X_, y_mean, 'k', lw=3, zorder=9)
-pylab.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)),
-                   y_mean + np.sqrt(np.diag(y_cov)),
-                   alpha=0.5, color='k')
-y_samples = gp.sample(X_[:, None], 10)
-pylab.plot(X_, y_samples, color='b', lw=1)
-pylab.xlim(0, 5)
-pylab.ylim(-3, 3)
-pylab.title("Prior, kernel:  %s" % kernel)
+y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
+plt.plot(X_, y_mean, 'k', lw=3, zorder=9)
+plt.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)),
+                 y_mean + np.sqrt(np.diag(y_cov)),
+                 alpha=0.5, color='k')
+y_samples = gp.sample(X_[:, np.newaxis], 10)
+plt.plot(X_, y_samples, color='b', lw=1)
+plt.xlim(0, 5)
+plt.ylim(-3, 3)
+plt.title("Prior, kernel:  %s" % kernel)
 
 # Generate data and fit GP
-X = np.random.uniform(0, 5, 10)[:, None]
+rng = np.random.RandomState(4)
+X = rng.uniform(0, 5, 10)[:, np.newaxis]
 y = np.sin((X[:, 0] - 2.5) ** 2)
 gp.fit(X, y)
 
 # Plot posterior
-pylab.subplot(2, 1, 2)
+plt.subplot(2, 1, 2)
 X_ = np.linspace(0, 5, 100)
-y_mean, y_cov = gp.predict(X_[:, None], return_cov=True)
-pylab.plot(X_, y_mean, 'k', lw=3, zorder=9)
-pylab.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)),
-                   y_mean + np.sqrt(np.diag(y_cov)),
-                   alpha=0.5, color='k')
-
-y_samples = gp.sample(X_[:, None], 10)
-pylab.plot(X_, y_samples, color='b', lw=1)
-pylab.scatter(X[:, 0], y, c='r', s=50, zorder=10)
-pylab.xlim(0, 5)
-pylab.ylim(-3, 3)
-pylab.title("Posterior, kernel: %s" % kernel)
-pylab.tight_layout()
-pylab.show()
+y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
+plt.plot(X_, y_mean, 'k', lw=3, zorder=9)
+plt.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)),
+                 y_mean + np.sqrt(np.diag(y_cov)),
+                 alpha=0.5, color='k')
+
+y_samples = gp.sample(X_[:, np.newaxis], 10)
+plt.plot(X_, y_samples, color='b', lw=1)
+plt.scatter(X[:, 0], y, c='r', s=50, zorder=10)
+plt.xlim(0, 5)
+plt.ylim(-3, 3)
+plt.title("Posterior, kernel: %s" % kernel)
+plt.tight_layout()
+plt.show()

From 6681df04ec3989fb87c0cf990d86f151653b2ee9 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Fri, 20 Feb 2015 20:00:27 +0100
Subject: [PATCH 015/165] MISC Revised gaussian_process package based on
 @eickenberg's suggestions

---
 sklearn/gaussian_process/gpc.py            | 37 +++++++++++-----------
 sklearn/gaussian_process/gpr.py            | 16 ++++++----
 sklearn/gaussian_process/kernels.py        | 37 +++++++++++++---------
 sklearn/gaussian_process/tests/test_gpc.py |  3 +-
 sklearn/gaussian_process/tests/test_gpr.py |  3 +-
 5 files changed, 55 insertions(+), 41 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 3a5c9a93f7b09..cd28cdd691767 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -12,6 +12,17 @@
 from sklearn.base import BaseEstimator
 
 
+# Values required for approximating the logistic sigmoid by
+# error functions. coefs are obtained via:
+# x = np.array([0, 0.6, 2, 3.5, 4.5, np.inf])
+# b = logistic(x)
+# A = (erf(np.dot(x, LAMBDAS)) + 1) / 2
+# coefs = lstsq(A, b)[0]
+LAMBDAS = np.array([0.41, 0.4, 0.37, 0.44, 0.39])[:, np.newaxis]
+COEFS = np.array([-1854.8214151, 3516.89893646, 221.29346712,
+                  128.12323805, -2010.49422654])[:, np.newaxis]
+
+
 class GaussianProcessClassification(BaseEstimator):
     """ Gaussian process classification (GPC).
 
@@ -31,16 +42,6 @@ def __init__(self, kernel, jitter=0.0):
         self.kernel = kernel
         self.jitter = jitter
 
-        # Values required for approximating the logistic sigmoid by
-        # error functions. coefs are obtained via:
-        # x = np.array([0, 0.6, 2, 3.5, 4.5, np.inf])
-        # b = logistic(x)
-        # A = (erf(np.dot(x, self.lambdas)) + 1) / 2
-        # coefs = lstsq(A, b)[0]
-        self.lambdas = np.array([0.41, 0.4, 0.37, 0.44, 0.39])[:, None]
-        self.coefs = np.array([-1854.8214151, 3516.89893646, 221.29346712,
-                               128.12323805, -2010.49422654])[:, None]
-
     def fit(self, X, y):
         # XXX: Assert that y is binary and labels are {0, 1}
         self.X_fit_ = np.asarray(X)
@@ -53,8 +54,8 @@ def obj_func(theta):
                 lml, grad = self.log_marginal_likelihood(theta,
                                                          eval_gradient=True)
                 return -lml, -grad
-            self.theta_, lml, _ = fmin_l_bfgs_b(obj_func, self.kernel.params,
-                                                bounds=self.kernel.bounds)
+            self.theta_, _, _ = fmin_l_bfgs_b(obj_func, self.kernel.params,
+                                              bounds=self.kernel.bounds)
             self.kernel.params = self.theta_
         else:
             self.theta_ = self.kernel.params
@@ -97,11 +98,11 @@ def predict_proba(self, X):
         # blitiri.blogspot.de/2012/11/gaussian-integral-of-error-function.html
         # for information on how this integral can be computed
         alpha = 1 / (2 * np.diag(var_f_star))
-        gamma = self.lambdas * f_star
+        gamma = LAMBDAS * f_star
         integrals = np.sqrt(np.pi / alpha) \
-            * erf(gamma * np.sqrt(alpha / (alpha + self.lambdas**2))) \
+            * erf(gamma * np.sqrt(alpha / (alpha + LAMBDAS**2))) \
             / (2 * np.sqrt(np.diag(var_f_star) * 2 * np.pi))
-        pi_star = (self.coefs * integrals).sum(axis=0) + .5 * self.coefs.sum()
+        pi_star = (COEFS * integrals).sum(axis=0) + .5 * COEFS.sum()
 
         return pi_star
 
@@ -150,13 +151,13 @@ def _posterior_mode(self, K, return_temporaries=False):
         while True:
             # Line 4
             pi = 1 / (1 + np.exp(-f))
-            W = -np.diag(-pi*(1-pi))  # XXX: avoid creating square matrix?
+            W = pi * (1 - pi)
             # Line 5
-            W_sr = np.sqrt(W)
+            W_sr = np.diag(np.sqrt(W))  # XXX: avoid creating square matrix?
             B = np.eye(W.shape[0]) + W_sr.dot(K).dot(W_sr)
             L = cholesky(B, lower=True)
             # Line 6
-            b = W.dot(f) + (self.y_fit_ - pi)
+            b = W * f + (self.y_fit_ - pi)
             # Line 7
             a = b - W_sr.dot(cho_solve((L, True), W_sr.dot(K).dot(b)))
             # Line 8
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 25044cb7dd9f1..a13dd35c5f042 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -9,6 +9,7 @@
 from scipy.optimize import fmin_l_bfgs_b
 
 from sklearn.base import BaseEstimator
+from sklearn.utils import check_random_state
 
 
 class GaussianProcessRegression(BaseEstimator):
@@ -79,9 +80,11 @@ def obj_func(theta):
         return self
 
     def predict(self, X, return_std=False, return_cov=False):
-        assert not (return_std and return_cov), \
-            "Not returning standard deviation of predictions when " \
-            "returning full covariance."
+        if return_std and return_cov:
+            raise RuntimeError(
+                "Not returning standard deviation of predictions when "
+                "returning full covariance.")
+
         X = np.asarray(X)
 
         if not hasattr(self, "X_fit_"):  # Unfitted; predict based on GP prior
@@ -113,10 +116,11 @@ def predict(self, X, return_std=False, return_cov=False):
             else:
                 return y_mean
 
-    def sample(self, X, n_samples=1):
+    def sample(self, X, n_samples=1, random_state=0):
+        rng = check_random_state(random_state)
+
         y_mean, y_cov = self.predict(X, return_cov=True)
-        y_samples = \
-            np.random.multivariate_normal(y_mean, y_cov, n_samples).T
+        y_samples = rng.multivariate_normal(y_mean, y_cov, n_samples).T
         return y_samples
 
     def log_marginal_likelihood(self, theta, eval_gradient=False):
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index fc18ddb3aad3e..6a9603d1764ba 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -4,7 +4,7 @@
 combined via the "+" and "*" operators. These expressions can also contain
 scalar values, which are automatically converted to a constant kernel.
 
-All kernel allow (analytic) gradient-based hyperparameter optimization.
+All kernels allow (analytic) gradient-based hyperparameter optimization.
 The space of hyperparameters can be specified by giving lower und upper
 boundaries for the value of each hyperparameter (the search space is thus
 rectangular). This can be achieved by using a pair or triple instead of a
@@ -28,28 +28,34 @@ class Kernel(object):
     """ Base class for all kernels."""
 
     def _parse_param_space(self, param_space):
-        if not hasattr(param_space, "__iter__"):  # fixed hyperparameter
+        if not np.iterable(param_space):  # fixed hyperparameter
             self.params = np.array([float(param_space)])
             self.has_bounds = False
             return
         param_space = np.atleast_2d(param_space)
-        if param_space.shape[1] == 1:  # fixed hyperparameter
+        if param_space.shape[1] == 1:
+            # fixed hyperparameter
             self.params = param_space[:, 0]
             self.has_bounds = False
-        elif param_space.shape[1] == 2:  # lower+upper bound for hyperparameter
+        elif param_space.shape[1] == 2:
+            # lower+upper bound for hyperparameter
             self.bounds = param_space
             self.has_bounds = True
             # Use geometric mean of upper and lower boundary as initial
             # hyperparameter value
-            assert not np.any(self.l_bound == None)  # XXX: enforce element-wise comparison to None
-            assert not np.any(self.u_bound == None)
+            if np.any(np.equal(self.l_bound, None)) \
+               or np.any(np.equal(self.u_bound, None)):
+                raise ValueError("Lower or upper bound being None requires "
+                                 "explicitly specifying the initial value.")
             self.params = np.array([np.sqrt(self.l_bound * self.u_bound)])
-        elif param_space.shape[1] == 3:  # lower bound, initial value, upper bound
+        elif param_space.shape[1] == 3:
+            # lower bound, initial value, upper bound
             self.params = param_space[:, 1]
             self.bounds = param_space[:, [0, 2]]
             self.has_bounds = True
         else:
-            raise Exception()
+            raise ValueError("Invalid parameter space given. Must not have "
+                             "more than 3 entries per parameter.")
 
     @property
     def n_params(self):
@@ -90,7 +96,6 @@ def __repr__(self):
                                  ", ".join(map("{0}".format, self.params)))
 
 
-
 class KernelOperator(Kernel):
     """ Base class for all kernel operators. """
 
@@ -229,8 +234,8 @@ def auto(self, X, eval_gradient=False):
         if eval_gradient:
             K1, K1_gradient = self.k1.auto(X, eval_gradient=True)
             K2, K2_gradient = self.k2.auto(X, eval_gradient=True)
-            return K1 * K2, np.dstack((K1_gradient * K2[:, :, None],
-                                       K2_gradient * K1[:, :, None]))
+            return K1 * K2, np.dstack((K1_gradient * K2[:, :, np.newaxis],
+                                       K2_gradient * K1[:, :, np.newaxis]))
         else:
             return self.k1.auto(X) * self.k2.auto(X)
 
@@ -372,12 +377,14 @@ def auto(self, X, eval_gradient=False):
         np.fill_diagonal(K, 1)
         if eval_gradient:
             if self.l.shape[0] == 1:
-                K_gradient = (K * squareform(dists) / self.l[0])[:, :, None]
+                K_gradient = \
+                    (K * squareform(dists) / self.l[0])[:, :, np.newaxis]
                 return K, K_gradient
             elif self.l.shape[0] == X.shape[1]:
                 # We need to recompute the pairwise dimension-wise distances
-                D = (X[:, None, :] - X[None, :, :])**2 / (self.l ** 3)
-                K_gradient = K[..., None] * D
+                D = (X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2 \
+                    / (self.l ** 3)
+                K_gradient = K[..., np.newaxis] * D
                 return K, K_gradient
             else:
                 raise Exception("Anisotropic kernels require that the number "
@@ -449,7 +456,7 @@ def auto(self, X, eval_gradient=False):
         """
         K = self.c * np.eye(X.shape[0])
         if eval_gradient:
-            return K, np.eye(X.shape[0])[:, :, None]
+            return K, np.eye(X.shape[0])[:, :, np.newaxis]
         else:
             return K
 
diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py
index 1f828ccbe02b2..3dcc109cdc8fb 100644
--- a/sklearn/gaussian_process/tests/test_gpc.py
+++ b/sklearn/gaussian_process/tests/test_gpc.py
@@ -16,7 +16,8 @@
                                    assert_almost_equal, assert_array_equal)
 
 
-f = lambda x: x * np.sin(x)
+def f(x):
+    return x * np.sin(x)
 X = np.atleast_2d(np.linspace(0, 10, 30)).T
 X2 = np.atleast_2d([2., 4., 5.5, 6.5, 7.5]).T
 y = np.array(f(X).ravel() > 0, dtype=int)
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index 40efcae385398..0fb50f4272e00 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -16,7 +16,8 @@
                                    assert_almost_equal)
 
 
-f = lambda x: x * np.sin(x)
+def f(x):
+    return x * np.sin(x)
 X = np.atleast_2d([1., 3., 5., 6., 7., 8.]).T
 X2 = np.atleast_2d([2., 4., 5.5, 6.5, 7.5]).T
 y = f(X).ravel()

From 204ca41fe0f669fe7f9a72313f94ba7df4cf3e6e Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Fri, 20 Feb 2015 20:56:03 +0100
Subject: [PATCH 016/165] TST Dealing with failing tests from test_common

---
 sklearn/gaussian_process/gpc.py               | 20 ++++++++++++++-----
 sklearn/gaussian_process/gpr.py               | 12 +++++++----
 .../gaussian_process/tests/test_kernels.py    |  2 --
 3 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index cd28cdd691767..447a2c9e56456 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -10,6 +10,8 @@
 from scipy.special import erf
 
 from sklearn.base import BaseEstimator
+from sklearn.gaussian_process.kernels import RBF
+from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
 
 
 # Values required for approximating the logistic sigmoid by
@@ -38,14 +40,16 @@ class GaussianProcessClassification(BaseEstimator):
       * binary classification
     """
 
-    def __init__(self, kernel, jitter=0.0):
+    def __init__(self, kernel=RBF(), jitter=0.0):
         self.kernel = kernel
         self.jitter = jitter
 
     def fit(self, X, y):
+        X, y = check_X_y(X, y)
+
         # XXX: Assert that y is binary and labels are {0, 1}
-        self.X_fit_ = np.asarray(X)
-        self.y_fit_ = np.asarray(y)
+        self.X_fit_ = X
+        self.y_fit_ = y
 
         if self.kernel.has_bounds:
             # Choose hyperparameters based on maximizing the log-marginal
@@ -71,16 +75,22 @@ def obj_func(theta):
         return self
 
     def predict(self, X):
-        # As discussed on Sectio 3.4.2 of GPML, for making hard binary
+        check_is_fitted(self, ["X_fit_", "y_fit_", "K_", "f_"])
+        X = check_array(X)
+
+        # As discussed on Section 3.4.2 of GPML, for making hard binary
         # decisions, it is enough to compute the MAP of the posterior and
         # pass it through the link function
         K_star = \
             self.kernel.cross(self.X_fit_, X)  # K_star =k(x_star)
-        f_star = K_star.T.dot(self.y_fit_ - self.pi)  # Line 4
+        f_star = K_star.T.dot(self.y_fit_ - self.pi)  # Line 4 (Algorithm 3.2)
 
         return f_star > 0
 
     def predict_proba(self, X):
+        check_is_fitted(self, ["X_fit_", "y_fit_", "K_", "f_"])
+        X = check_array(X)
+
         # Based on Algorithm 3.2 of GPML
         K_star = \
             self.kernel.cross(self.X_fit_, X)  # K_star =k(x_star)
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index a13dd35c5f042..926056ef474f2 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -9,7 +9,9 @@
 from scipy.optimize import fmin_l_bfgs_b
 
 from sklearn.base import BaseEstimator
+from sklearn.gaussian_process.kernels import RBF
 from sklearn.utils import check_random_state
+from sklearn.utils.validation import check_X_y, check_array
 
 
 class GaussianProcessRegression(BaseEstimator):
@@ -49,13 +51,15 @@ class GaussianProcessRegression(BaseEstimator):
     alpha_:
     """
 
-    def __init__(self, kernel, y_err=1e-10):
+    def __init__(self, kernel=RBF(), y_err=1e-10):
         self.kernel = kernel
         self.y_err = y_err
 
     def fit(self, X, y):
-        self.X_fit_ = np.asarray(X)
-        self.y_fit_ = np.asarray(y)
+        X, y = check_X_y(X, y)
+
+        self.X_fit_ = X
+        self.y_fit_ = y
 
         if self.kernel.has_bounds:
             # Choose hyperparameters based on maximizing the log-marginal
@@ -85,7 +89,7 @@ def predict(self, X, return_std=False, return_cov=False):
                 "Not returning standard deviation of predictions when "
                 "returning full covariance.")
 
-        X = np.asarray(X)
+        X = check_array(X)
 
         if not hasattr(self, "X_fit_"):  # Unfitted; predict based on GP prior
             y_mean = np.zeros(X.shape[0])
diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index 13376408f5797..8ebbcdba088e1 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -48,10 +48,8 @@ def eval_kernel_ij_for_theta(theta):
 def test_auto_vs_cross():
     """ Auto-correlation and cross-correlation should be consistent. """
     for kernel in kernels:
-        print kernel
         K_auto = kernel.auto(X)
         K_cross = kernel.cross(X, X)
-
         assert_almost_equal(K_auto, K_cross, 5)
 
 def test_kernel_operator_commutative():

From dfdac51f1779935cd9a9eed6a944423871508dcb Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 22 Feb 2015 14:11:26 +0100
Subject: [PATCH 017/165] REFACTOR Merging kernel methods auto() and cross() in
 __call__()

---
 sklearn/gaussian_process/gpc.py               |  14 +-
 sklearn/gaussian_process/gpr.py               |  17 +-
 sklearn/gaussian_process/kernels.py           | 261 ++++++++----------
 .../gaussian_process/tests/test_kernels.py    |  26 +-
 4 files changed, 135 insertions(+), 183 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 447a2c9e56456..6032f0e8726d4 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -66,7 +66,7 @@ def obj_func(theta):
 
         # Precompute quantities required for predictions which are independent
         # of actual query points
-        self.K_ = self.kernel.auto(self.X_fit_)
+        self.K_ = self.kernel(self.X_fit_)
         self.K_[np.diag_indices_from(self.K_)] += self.jitter
 
         self.f_, _, (self.pi, self.W_sr, self.L, _, _) = \
@@ -82,7 +82,7 @@ def predict(self, X):
         # decisions, it is enough to compute the MAP of the posterior and
         # pass it through the link function
         K_star = \
-            self.kernel.cross(self.X_fit_, X)  # K_star =k(x_star)
+            self.kernel(self.X_fit_, X)  # K_star =k(x_star)
         f_star = K_star.T.dot(self.y_fit_ - self.pi)  # Line 4 (Algorithm 3.2)
 
         return f_star > 0
@@ -92,11 +92,10 @@ def predict_proba(self, X):
         X = check_array(X)
 
         # Based on Algorithm 3.2 of GPML
-        K_star = \
-            self.kernel.cross(self.X_fit_, X)  # K_star =k(x_star)
+        K_star = self.kernel(self.X_fit_, X)  # K_star =k(x_star)
         f_star = K_star.T.dot(self.y_fit_ - self.pi)  # Line 4
         v = solve(self.L, self.W_sr.dot(K_star))  # Line 5
-        var_f_star = self.kernel.auto(X) - v.T.dot(v)  # Line 6
+        var_f_star = self.kernel(X) - v.T.dot(v)  # Line 6
 
         # Line 7:
         # Approximate \int log(z) * N(z | f_star, var_f_star)
@@ -122,10 +121,9 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
         kernel.params = theta
 
         if eval_gradient:
-            K, K_gradient = \
-                kernel.auto(self.X_fit_, eval_gradient=True)
+            K, K_gradient = kernel(self.X_fit_, eval_gradient=True)
         else:
-            K = kernel.auto(self.X_fit_)
+            K = kernel(self.X_fit_)
 
         K[np.diag_indices_from(K)] += self.jitter
 
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 926056ef474f2..513dbd8da45fa 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -76,7 +76,7 @@ def obj_func(theta):
 
         # Precompute quantities required for predictions which are independent
         # of actual query points
-        K = self.kernel.auto(self.X_fit_)
+        K = self.kernel(self.X_fit_)
         K[np.diag_indices_from(K)] += self.y_err
         self.L_ = cholesky(K, lower=True)  # Line 2
         self.alpha_ = cho_solve((self.L_, True), self.y_fit_)  # Line 3
@@ -94,27 +94,27 @@ def predict(self, X, return_std=False, return_cov=False):
         if not hasattr(self, "X_fit_"):  # Unfitted; predict based on GP prior
             y_mean = np.zeros(X.shape[0])
             if return_cov:
-                y_cov = self.kernel.auto(X)
+                y_cov = self.kernel(X)
                 return y_mean, y_cov
             elif return_std:
                 # XXX: Compute y_std more efficiently
-                y_std = np.sqrt(np.diag(self.kernel.auto(X)))
+                y_std = np.sqrt(np.diag(self.kernel(X)))
                 return y_mean, y_std
             else:
                 return y_mean
         else:  # Predict based on GP posterior
-            K_trans = self.kernel.cross(X, self.X_fit_)
+            K_trans = self.kernel(X, self.X_fit_)
             y_mean = K_trans.dot(self.alpha_)  # Line 4 (y_mean = f_star)
             if return_cov:
                 v = cho_solve((self.L_, True), K_trans.T)  # Line 5
                 y_cov = \
-                    self.kernel.auto(X) - K_trans.dot(v)  # Line 6
+                    self.kernel(X) - K_trans.dot(v)  # Line 6
                 return y_mean, y_cov
             elif return_std:
                 # XXX: Compute y_std more efficiently
                 v = cho_solve((self.L_, True), K_trans.T)  # Line 5
                 y_cov = \
-                    self.kernel.auto(X) - K_trans.dot(v)  # Line 6
+                    self.kernel(X) - K_trans.dot(v)  # Line 6
                 y_std = np.sqrt(np.diag(y_cov))
                 return y_mean, y_std
             else:
@@ -133,10 +133,9 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
         kernel.params = theta
 
         if eval_gradient:
-            K, K_gradient = \
-                kernel.auto(self.X_fit_, eval_gradient=True)
+            K, K_gradient = kernel(self.X_fit_, eval_gradient=True)
         else:
-            K = kernel.auto(self.X_fit_)
+            K = kernel(self.X_fit_)
 
         K[np.diag_indices_from(K)] += self.y_err
         L = cholesky(K, lower=True)  # Line 2
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 6a9603d1764ba..2ae33b7bdb8cd 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -20,11 +20,15 @@
 # Note: this module is strongly inspired by the kernel module of the george
 #       package.
 
+from abc import ABCMeta, abstractmethod
+
 import numpy as np
 from scipy.spatial.distance import pdist, cdist, squareform
 
+from ..externals import six
+
 
-class Kernel(object):
+class Kernel(six.with_metaclass(ABCMeta)):
     """ Base class for all kernels."""
 
     def _parse_param_space(self, param_space):
@@ -95,6 +99,10 @@ def __repr__(self):
         return "{0}({1})".format(self.__class__.__name__,
                                  ", ".join(map("{0}".format, self.params)))
 
+    @abstractmethod
+    def __call__(self, X, Y=None, eval_gradient=False):
+        """Evaluate the kernel."""
+
 
 class KernelOperator(Kernel):
     """ Base class for all kernel operators. """
@@ -129,10 +137,10 @@ def bounds(self, bounds):
 
 
 class Sum(KernelOperator):
-    """ Sum-kernel k1+k2 of two kernels k1 and k2.
+    """ Sum-kernel k1 + k2 of two kernels k1 and k2.
 
     The resulting kernel is defined as
-    k_sum(X1, X2) = k1(X1, X2) + k2(X1, X2)
+    k_sum(X, Y) = k1(X, Y) + k2(X, Y)
 
     Parameters
     ----------
@@ -143,13 +151,17 @@ class Sum(KernelOperator):
         The second base-kernel of the sum-kernel
     """
 
-    def auto(self, X, eval_gradient=False):
-        """ Return the auto-kernel k(X, X) and optionally its gradient.
+    def __call__(self, X, Y=None, eval_gradient=False):
+        """ Return the kernel k(X, Y) and optionally its gradient.
 
         Parameters
         ----------
-        X : array, shape (n_samples, n_features)
-            Data for which the kernel k(X, X) is computed
+        X : array, shape (n_samples_X, n_features)
+            Left argument of the returned kernel k(X, Y)
+
+        Y : array, shape (n_samples_Y, n_features), (optional, default=None)
+            Right argument of the returned kernel k(X, Y). If None, k(X, X)
+            if evaluated instead.
 
         eval_gradient : bool (optional, default=False)
             Determines whether the gradient with respect to the kernel
@@ -157,48 +169,30 @@ def auto(self, X, eval_gradient=False):
 
         Returns
         -------
-        K : array, shape (n_samples, n_samples)
-            Kernel k(X, X)
+        K : array, shape (n_samples_X, n_samples_Y)
+            Kernel k(X, Y)
 
-        K_gradient : array (optional), shape (n_samples, n_samples, n_params)
+        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_params)
             The gradient of the kernel k(X, X) with repect to the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
         if eval_gradient:
-            K1, K1_gradient = self.k1.auto(X, eval_gradient=True)
-            K2, K2_gradient = self.k2.auto(X, eval_gradient=True)
+            K1, K1_gradient = self.k1(X, Y, eval_gradient=True)
+            K2, K2_gradient = self.k2(X, Y, eval_gradient=True)
             return K1 + K2, np.dstack((K1_gradient, K2_gradient))
         else:
-            return self.k1.auto(X) + self.k2.auto(X)
-
-    def cross(self, X1, X2):
-        """ Return the cross-kernel k(X1, X2).
-
-        Parameters
-        ----------
-        X1 : array, shape (n_samples_1, n_features)
-            Left argument of the returned kernel k(X1, X2)
-
-        X2 : array, shape (n_samples_2, n_features)
-            Right argument of the returned kernel k(X1, X2)
-
-        Returns
-        -------
-        K : array, shape (n_samples_1, n_samples_2)
-            Kernel k(X1, X2)
-        """
-        return self.k1.cross(X1, X2) + self.k2.cross(X1, X2)
+            return self.k1(X, Y) + self.k2(X, Y)
 
     def __repr__(self):
         return "{0} + {1}".format(self.k1, self.k2)
 
 
 class Product(KernelOperator):
-    """ Product-kernel k1*k2 of two kernels k1 and k2.
+    """ Product-kernel k1 * k2 of two kernels k1 and k2.
 
     The resulting kernel is defined as
-    k_prod(X1, X2) = k1(X1, X2) * k2(X1, X2)
+    k_prod(X, Y) = k1(X, Y) * k2(X, Y)
 
     Parameters
     ----------
@@ -209,13 +203,17 @@ class Product(KernelOperator):
         The second base-kernel of the product-kernel
     """
 
-    def auto(self, X, eval_gradient=False):
-        """ Return the auto-kernel k(X, X) and optionally its gradient.
+    def __call__(self, X, Y=None, eval_gradient=False):
+        """ Return the kernel k(X, Y) and optionally its gradient.
 
         Parameters
         ----------
-        X : array, shape (n_samples, n_features)
-            Data for which the kernel k(X, X) is computed
+        X : array, shape (n_samples_X, n_features)
+            Left argument of the returned kernel k(X, Y)
+
+        Y : array, shape (n_samples_Y, n_features), (optional, default=None)
+            Right argument of the returned kernel k(X, Y). If None, k(X, X)
+            if evaluated instead.
 
         eval_gradient : bool (optional, default=False)
             Determines whether the gradient with respect to the kernel
@@ -223,39 +221,21 @@ def auto(self, X, eval_gradient=False):
 
         Returns
         -------
-        K : array, shape (n_samples, n_samples)
-            Kernel k(X, X)
+        K : array, shape (n_samples_X, n_samples_Y)
+            Kernel k(X, Y)
 
-        K_gradient : array (optional), shape (n_samples, n_samples, n_params)
+        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_params)
             The gradient of the kernel k(X, X) with repect to the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
         if eval_gradient:
-            K1, K1_gradient = self.k1.auto(X, eval_gradient=True)
-            K2, K2_gradient = self.k2.auto(X, eval_gradient=True)
+            K1, K1_gradient = self.k1(X, Y, eval_gradient=True)
+            K2, K2_gradient = self.k2(X, Y, eval_gradient=True)
             return K1 * K2, np.dstack((K1_gradient * K2[:, :, np.newaxis],
                                        K2_gradient * K1[:, :, np.newaxis]))
         else:
-            return self.k1.auto(X) * self.k2.auto(X)
-
-    def cross(self, X1, X2):
-        """ Return the cross-kernel k(X1, X2).
-
-        Parameters
-        ----------
-        X1 : array, shape (n_samples_1, n_features)
-            Left argument of the returned kernel k(X1, X2)
-
-        X2 : array, shape (n_samples_2, n_features)
-            Right argument of the returned kernel k(X1, X2)
-
-        Returns
-        -------
-        K : array, shape (n_samples_1, n_samples_2)
-            Kernel k(X1, X2)
-        """
-        return self.k1.cross(X1, X2) * self.k2.cross(X1, X2)
+            return self.k1(X, Y) * self.k2(X, Y)
 
     def __repr__(self):
         return "{0} * {1}".format(self.k1, self.k2)
@@ -281,52 +261,43 @@ def params(self, theta):
         assert len(theta) == 1
         self.value = theta[0]
 
-    def auto(self, X, eval_gradient=False):
-        """ Return the auto-kernel k(X, X) and optionally its gradient.
+    def __call__(self, X, Y=None, eval_gradient=False):
+        """ Return the kernel k(X, Y) and optionally its gradient.
 
         Parameters
         ----------
-        X : array, shape (n_samples, n_features)
-            Data for which the kernel k(X, X) is computed
+        X : array, shape (n_samples_X, n_features)
+            Left argument of the returned kernel k(X, Y)
+
+        Y : array, shape (n_samples_Y, n_features), (optional, default=None)
+            Right argument of the returned kernel k(X, Y). If None, k(X, X)
+            if evaluated instead.
 
         eval_gradient : bool (optional, default=False)
             Determines whether the gradient with respect to the kernel
-            hyperparameter is determined.
+            hyperparameter is determined. Only supported when Y is None.
 
         Returns
         -------
-        K : array, shape (n_samples, n_samples)
-            Kernel k(X, X)
+        K : array, shape (n_samples_X, n_samples_Y)
+            Kernel k(X, Y)
 
-        K_gradient : array (optional), shape (n_samples, n_samples, n_params)
+        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_params)
             The gradient of the kernel k(X, X) with repect to the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
-        K = self.value * np.ones((X.shape[0], X.shape[0]))
+        if Y is None:
+            Y = X
+        elif eval_gradient:
+            raise ValueError("Gradient can only be evaluated when Y is None.")
+
+        K = self.value * np.ones((X.shape[0], Y.shape[0]))
         if eval_gradient:
             return K, np.ones((X.shape[0], X.shape[0], 1))
         else:
             return K
 
-    def cross(self, X1, X2):
-        """ Return the cross-kernel k(X1, X2).
-
-        Parameters
-        ----------
-        X1 : array, shape (n_samples_1, n_features)
-            Left argument of the returned kernel k(X1, X2)
-
-        X2 : array, shape (n_samples_2, n_features)
-            Right argument of the returned kernel k(X1, X2)
-
-        Returns
-        -------
-        K : array, shape (n_samples_1, n_samples_2)
-            Kernel k(X1, X2)
-        """
-        return self.value * np.ones((X1.shape[0], X2.shape[0]))
-
     def __repr__(self):
         return "{0}".format(self.value)
 
@@ -348,33 +319,45 @@ def params(self):
     def params(self, theta):
         self.l = theta
 
-    def auto(self, X, eval_gradient=False):
-        """ Return the auto-kernel k(X, X) and optionally its gradient.
+    def __call__(self, X, Y=None, eval_gradient=False):
+        """ Return the kernel k(X, Y) and optionally its gradient.
 
         Parameters
         ----------
-        X : array, shape (n_samples, n_features)
-            Data for which the kernel k(X, X) is computed
+        X : array, shape (n_samples_X, n_features)
+            Left argument of the returned kernel k(X, Y)
+
+        Y : array, shape (n_samples_Y, n_features), (optional, default=None)
+            Right argument of the returned kernel k(X, Y). If None, k(X, X)
+            if evaluated instead.
 
         eval_gradient : bool (optional, default=False)
             Determines whether the gradient with respect to the kernel
-            hyperparameter is determined.
+            hyperparameter is determined. Only supported when Y is None.
 
         Returns
         -------
-        K : array, shape (n_samples, n_samples)
-            Kernel k(X, X)
+        K : array, shape (n_samples_X, n_samples_Y)
+            Kernel k(X, Y)
 
-        K_gradient : array (optional), shape (n_samples, n_samples, n_params)
+        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_params)
             The gradient of the kernel k(X, X) with repect to the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
-        dists = pdist(X / self.l, metric='sqeuclidean')
-        K = np.exp(-.5 * dists)
-        # convert from upper-triangular matrix to square matrix
-        K = squareform(K)
-        np.fill_diagonal(K, 1)
+        if Y is None:
+            dists = pdist(X / self.l, metric='sqeuclidean')
+            K = np.exp(-.5 * dists)
+            # convert from upper-triangular matrix to square matrix
+            K = squareform(K)
+            np.fill_diagonal(K, 1)
+        else:
+            if eval_gradient:
+                raise ValueError(
+                    "Gradient can only be evaluated when Y is None.")
+            dists = cdist(X / self.l, Y / self.l, metric='sqeuclidean')
+            K = np.exp(-.5 * dists)
+
         if eval_gradient:
             if self.l.shape[0] == 1:
                 K_gradient = \
@@ -392,26 +375,6 @@ def auto(self, X, eval_gradient=False):
         else:
             return K
 
-    def cross(self, X1, X2):
-        """ Return the cross-kernel k(X1, X2).
-
-        Parameters
-        ----------
-        X1 : array, shape (n_samples_1, n_features)
-            Left argument of the returned kernel k(X1, X2)
-
-        X2 : array, shape (n_samples_2, n_features)
-            Right argument of the returned kernel k(X1, X2)
-
-        Returns
-        -------
-        K : array, shape (n_samples_1, n_samples_2)
-            Kernel k(X1, X2)
-        """
-        dists = cdist(X1 / self.l, X2 / self.l, metric='sqeuclidean')
-        K = np.exp(-.5 * dists)
-        return K
-
 
 class WhiteKernel(Kernel):
     """ White kernel.
@@ -432,51 +395,43 @@ def params(self):
     def params(self, theta):
         self.c = theta[0]
 
-    def auto(self, X, eval_gradient=False):
-        """ Return the auto-kernel k(X, X) and optionally its gradient.
+    def __call__(self, X, Y=None, eval_gradient=False):
+        """ Return the kernel k(X, Y) and optionally its gradient.
 
         Parameters
         ----------
-        X : array, shape (n_samples, n_features)
-            Data for which the kernel k(X, X) is computed
+        X : array, shape (n_samples_X, n_features)
+            Left argument of the returned kernel k(X, Y)
+
+        Y : array, shape (n_samples_Y, n_features), (optional, default=None)
+            Right argument of the returned kernel k(X, Y). If None, k(X, X)
+            if evaluated instead.
 
         eval_gradient : bool (optional, default=False)
             Determines whether the gradient with respect to the kernel
-            hyperparameter is determined.
+            hyperparameter is determined. Only supported when Y is None.
 
         Returns
         -------
-        K : array, shape (n_samples, n_samples)
-            Kernel k(X, X)
+        K : array, shape (n_samples_X, n_samples_Y)
+            Kernel k(X, Y)
 
-        K_gradient : array (optional), shape (n_samples, n_samples, n_params)
+        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_params)
             The gradient of the kernel k(X, X) with repect to the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
-        K = self.c * np.eye(X.shape[0])
-        if eval_gradient:
-            return K, np.eye(X.shape[0])[:, :, np.newaxis]
+        if Y is not None and eval_gradient:
+            raise ValueError("Gradient can only be evaluated when Y is None.")
+
+        if Y is None:
+            K = self.c * np.eye(X.shape[0])
+            if eval_gradient:
+                return K, np.eye(X.shape[0])[:, :, np.newaxis]
+            else:
+                return K
         else:
+            K = np.zeros((X.shape[0], Y.shape[0]))
+            # entries which are sufficiently similar to be considered identical
+            K[cdist(X, Y) < 1e-10] = self.c
             return K
-
-    def cross(self, X1, X2):
-        """ Return the cross-kernel k(X1, X2).
-
-        Parameters
-        ----------
-        X1 : array, shape (n_samples_1, n_features)
-            Left argument of the returned kernel k(X1, X2)
-
-        X2 : array, shape (n_samples_2, n_features)
-            Right argument of the returned kernel k(X1, X2)
-
-        Returns
-        -------
-        K : array, shape (n_samples_1, n_samples_2)
-            Kernel k(X1, X2)
-        """
-        K = np.zeros((X1.shape[0], X2.shape[0]))
-        # entries which are sufficiently similar to be considered identical
-        K[cdist(X1, X2) < 1e-10] = 1
-        return K
diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index 8ebbcdba088e1..a80cf46a50ac2 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -17,14 +17,14 @@
 X = np.random.normal(0, 1, (10, 2))
 
 kernels = [RBF(2.0), RBF([[0.5], [2.0]]),
-           2.0*RBF(0.5), RBF(2.0) + WhiteKernel(1.0),
-           ConstantKernel(10.0)]
+           ConstantKernel(10.0),
+           2.0 * RBF(0.5), RBF(2.0) + WhiteKernel(1.0)]
 
 
 def test_kernel_gradient():
     """ Compare analytic and numeric gradient of kernels. """
     for kernel in kernels:
-        K, K_gradient = kernel.auto(X, eval_gradient=True)
+        K, K_gradient = kernel(X, eval_gradient=True)
 
         assert_equal(K_gradient.shape[0], X.shape[0])
         assert_equal(K_gradient.shape[1], X.shape[0])
@@ -36,7 +36,7 @@ def test_kernel_gradient():
                 def eval_kernel_ij_for_theta(theta):
                     kernel_copy = deepcopy(kernel)
                     kernel_copy.params = theta
-                    K = kernel_copy.auto(X, eval_gradient=False)
+                    K = kernel_copy(X, eval_gradient=False)
                     return K[i, j]
                 K_gradient_approx[i, j] = \
                     approx_fprime(kernel.params, eval_kernel_ij_for_theta,
@@ -48,30 +48,30 @@ def eval_kernel_ij_for_theta(theta):
 def test_auto_vs_cross():
     """ Auto-correlation and cross-correlation should be consistent. """
     for kernel in kernels:
-        K_auto = kernel.auto(X)
-        K_cross = kernel.cross(X, X)
+        K_auto = kernel(X)
+        K_cross = kernel(X, X)
         assert_almost_equal(K_auto, K_cross, 5)
 
 def test_kernel_operator_commutative():
     """ Adding kernels and multiplying kernels should be commutative. """
     # Check addition
-    assert_almost_equal((RBF(2.0) + 1.0).auto(X),
-                        (1.0 + RBF(2.0)).auto(X))
+    assert_almost_equal((RBF(2.0) + 1.0)(X),
+                        (1.0 + RBF(2.0))(X))
 
     # Check multiplication
-    assert_almost_equal((3.0 * RBF(2.0)).auto(X),
-                        (RBF(2.0) * 3.0).auto(X))
+    assert_almost_equal((3.0 * RBF(2.0))(X),
+                        (RBF(2.0) * 3.0)(X))
 
 
 def test_kernel_anisotropic():
     """ Anisotropic kernel should be consistent with isotropic kernels."""
-    K = RBF([[0.5], [2.0]]).auto(X)
+    K = RBF([[0.5], [2.0]])(X)
     X1 = np.array(X)
     X1[:, 0] *= 4
-    K1 = RBF(2.0).auto(X1)
+    K1 = RBF(2.0)(X1)
     assert_almost_equal(K, K1)
 
     X2 = np.array(X)
     X2[:, 1] /= 4
-    K2 = RBF(0.5).auto(X2)
+    K2 = RBF(0.5)(X2)
     assert_almost_equal(K, K2)

From 292a0093e246ca498a4d124178a4109189e20695 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 22 Feb 2015 15:03:46 +0100
Subject: [PATCH 018/165] ENH Adding PairwiseKernel for GPs (wraps kernels from
 metrics.pairwise)

---
 sklearn/gaussian_process/kernels.py           | 95 +++++++++++++++++++
 .../gaussian_process/tests/test_kernels.py    |  8 +-
 2 files changed, 102 insertions(+), 1 deletion(-)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 2ae33b7bdb8cd..e7ab76b0965c0 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -21,10 +21,13 @@
 #       package.
 
 from abc import ABCMeta, abstractmethod
+from functools import partial
 
 import numpy as np
 from scipy.spatial.distance import pdist, cdist, squareform
+from scipy.optimize import approx_fprime
 
+from ..metrics.pairwise import pairwise_kernels
 from ..externals import six
 
 
@@ -435,3 +438,95 @@ def __call__(self, X, Y=None, eval_gradient=False):
             # entries which are sufficiently similar to be considered identical
             K[cdist(X, Y) < 1e-10] = self.c
             return K
+
+
+class PairwiseKernel(Kernel):
+    """ Wrapper for kernels in sklearn.metrics.pairwise.
+
+    A thin wrapper around the functionality of the kernels in
+    sklearn.metrics.pairwise.
+
+    Note: Evaluation of eval_gradient is not analytic but numeric and all
+          kernels support only isotropic distances. The parameter gamma is
+          specified via the param_space and may be optimized. The other
+          kernel parameters are set directly at initialization and are kept
+          fixed.
+
+    Parameters
+    ----------
+    metric : string, or callable
+        The metric to use when calculating kernel between instances in a
+        feature array. If metric is a string, it must be one of the metrics
+        in pairwise.PAIRWISE_KERNEL_FUNCTIONS.
+        If metric is "precomputed", X is assumed to be a kernel matrix.
+        Alternatively, if metric is a callable function, it is called on each
+        pair of instances (rows) and the resulting value recorded. The callable
+        should take two arrays from X as input and return a value indicating
+        the distance between them.
+
+    `**kwargs` : optional keyword parameters
+        Any further parameters are passed directly to the kernel function.
+    """
+
+    def __init__(self, param_space=1.0, metric="linear", **kwargs):
+        self._parse_param_space(param_space)
+        self.metric = metric
+        self.kwargs = kwargs
+        if "gamma" in kwargs:
+            raise ValueError(
+                "Gamma must not be set directly but via param_space.")
+
+    @property
+    def params(self):
+        return np.asarray([self.gamma])
+
+    @params.setter
+    def params(self, theta):
+        self.gamma = theta[0]
+
+    def __call__(self, X, Y=None, eval_gradient=False):
+        """ Return the kernel k(X, Y) and optionally its gradient.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples_X, n_features)
+            Left argument of the returned kernel k(X, Y)
+
+        Y : array, shape (n_samples_Y, n_features), (optional, default=None)
+            Right argument of the returned kernel k(X, Y). If None, k(X, X)
+            is evaluated instead.
+
+        eval_gradient : bool (optional, default=False)
+            Determines whether the gradient with respect to the kernel
+            hyperparameter is determined. Only supported when Y is None.
+
+        Returns
+        -------
+        K : array, shape (n_samples_X, n_samples_Y)
+            Kernel k(X, Y)
+
+        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_params)
+            The gradient of the kernel k(X, X) with respect to the
+            hyperparameter of the kernel. Only returned when eval_gradient
+            is True.
+        """
+        K = pairwise_kernels(X, Y, metric=self.metric, gamma=self.gamma,
+                             filter_params=True, **self.kwargs)
+        if eval_gradient:
+            # approximate gradient numerically
+            K_gradient = np.empty((K.shape[0], K.shape[1], 1))
+
+            def f(gamma, i, j):  # helper function
+                return pairwise_kernels(
+                    X, Y, metric=self.metric, gamma=gamma,
+                    filter_params=True, **self.kwargs)[i, j]
+            # XXX: avoid python for-loops
+            for i in range(K.shape[0]):
+                for j in range(K.shape[0]):
+                    K_gradient[i, j] = \
+                        approx_fprime(np.array([self.gamma]),
+                                      partial(f, i=i, j=j), 1e-10)
+
+            return K, K_gradient
+        else:
+            return K
diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index a80cf46a50ac2..a4c870e0ded31 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -9,7 +9,9 @@
 
 from scipy.optimize import approx_fprime
 
-from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
+from sklearn.metrics.pairwise import PAIRWISE_KERNEL_FUNCTIONS
+from sklearn.gaussian_process.kernels \
+    import RBF, ConstantKernel, WhiteKernel, PairwiseKernel
 
 from sklearn.utils.testing import assert_equal, assert_almost_equal
 
@@ -19,6 +21,10 @@
 kernels = [RBF(2.0), RBF([[0.5], [2.0]]),
            ConstantKernel(10.0),
            2.0 * RBF(0.5), RBF(2.0) + WhiteKernel(1.0)]
+for metric in PAIRWISE_KERNEL_FUNCTIONS:
+    if metric in ["additive_chi2", "chi2"]:
+        continue
+    kernels.append(PairwiseKernel(1.0, metric=metric))
 
 
 def test_kernel_gradient():

From 2be7733e69ce97c7ea47e9af08c4bea54aacea5e Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 24 Feb 2015 17:36:48 +0100
Subject: [PATCH 019/165] REFACTOR GaussianProcessClassification/Regression ->
 ...Classifier/Regressor

---
 examples/gaussian_process/new/plot_gpc.py          |  6 +++---
 examples/gaussian_process/new/plot_gpc_xor.py      |  4 ++--
 examples/gaussian_process/new/plot_gpr_noisy.py    | 10 +++++-----
 .../new/plot_gpr_prior_posterior.py                |  4 ++--
 sklearn/gaussian_process/__init__.py               |  4 ++--
 sklearn/gaussian_process/gpc.py                    |  2 +-
 sklearn/gaussian_process/gpr.py                    |  4 ++--
 sklearn/gaussian_process/tests/test_gpc.py         | 10 +++++-----
 sklearn/gaussian_process/tests/test_gpr.py         | 14 +++++++-------
 9 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/examples/gaussian_process/new/plot_gpc.py b/examples/gaussian_process/new/plot_gpc.py
index 74358af8ef269..8eb4f04d3b60e 100644
--- a/examples/gaussian_process/new/plot_gpc.py
+++ b/examples/gaussian_process/new/plot_gpc.py
@@ -15,7 +15,7 @@
 
 from matplotlib import pyplot as plt
 
-from sklearn.gaussian_process import GaussianProcessClassification
+from sklearn.gaussian_process import GaussianProcessClassifier
 from sklearn.gaussian_process.kernels import RBF
 
 
@@ -26,10 +26,10 @@
 
 # Specify Gaussian Processes with fixed and optimized hyperparameters
 kernel_fix = 4.0 * RBF(param_space=[1.0])
-gp_fix = GaussianProcessClassification(kernel=kernel_fix).fit(X, y)
+gp_fix = GaussianProcessClassifier(kernel=kernel_fix).fit(X, y)
 
 kernel_opt = (1e-10, 1.0, 100) * RBF(param_space=(1e-10, 1, 10))
-gp_opt = GaussianProcessClassification(kernel=kernel_opt).fit(X, y)
+gp_opt = GaussianProcessClassifier(kernel=kernel_opt).fit(X, y)
 
 print "Log Marginal Likelihood (initial): %.3f" % \
     gp_fix.log_marginal_likelihood(gp_fix.theta_)
diff --git a/examples/gaussian_process/new/plot_gpc_xor.py b/examples/gaussian_process/new/plot_gpc_xor.py
index 9d26785bc3a25..89488e3a948bb 100644
--- a/examples/gaussian_process/new/plot_gpc_xor.py
+++ b/examples/gaussian_process/new/plot_gpc_xor.py
@@ -11,7 +11,7 @@
 import numpy as np
 import matplotlib.pyplot as plt
 
-from sklearn.gaussian_process import GaussianProcessClassification
+from sklearn.gaussian_process import GaussianProcessClassifier
 from sklearn.gaussian_process.kernels import RBF
 
 
@@ -23,7 +23,7 @@
 
 # fit the model
 kernel = [1e-10, 1.0, 100] * RBF(param_space=(1e-10, 1.0, 10))
-clf = GaussianProcessClassification(kernel=kernel).fit(X, Y)
+clf = GaussianProcessClassifier(kernel=kernel).fit(X, Y)
 
 # plot the decision function for each datapoint on the grid
 Z = clf.predict_proba(np.vstack((xx.ravel(), yy.ravel())).T)
diff --git a/examples/gaussian_process/new/plot_gpr_noisy.py b/examples/gaussian_process/new/plot_gpr_noisy.py
index 2101e226ec18e..dcfab1c66d869 100644
--- a/examples/gaussian_process/new/plot_gpr_noisy.py
+++ b/examples/gaussian_process/new/plot_gpr_noisy.py
@@ -23,7 +23,7 @@
 from matplotlib import pyplot as plt
 from matplotlib.colors import LogNorm
 
-from sklearn.gaussian_process import GaussianProcessRegression
+from sklearn.gaussian_process import GaussianProcessRegressor
 from sklearn.gaussian_process.kernels import RBF, WhiteKernel
 
 
@@ -36,8 +36,8 @@
 kernel = (1e-10, 1.0, None) * RBF(param_space=(1e-10, 100.0, None)) \
     + WhiteKernel(param_space=(1e-10, 1e-5, 1e+1))
 kernel_str = str(kernel)
-gp = GaussianProcessRegression(kernel=kernel,
-                               y_err=0.0).fit(X, y)
+gp = GaussianProcessRegressor(kernel=kernel,
+                              y_err=0.0).fit(X, y)
 X_ = np.linspace(0, 5, 100)
 y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
 plt.plot(X_, y_mean, 'k', lw=3, zorder=9)
@@ -55,8 +55,8 @@
 kernel = (1e-10, 1.0, None) * RBF(param_space=(1e-10, 1.0, None)) \
     + WhiteKernel(param_space=(1e-10, 1e-5, 1e+1))
 kernel_str = str(kernel)
-gp = GaussianProcessRegression(kernel=kernel,
-                               y_err=0.0).fit(X, y)
+gp = GaussianProcessRegressor(kernel=kernel,
+                              y_err=0.0).fit(X, y)
 X_ = np.linspace(0, 5, 100)
 y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
 plt.plot(X_, y_mean, 'k', lw=3, zorder=9)
diff --git a/examples/gaussian_process/new/plot_gpr_prior_posterior.py b/examples/gaussian_process/new/plot_gpr_prior_posterior.py
index 955dda4593f8e..a2f353b86804e 100644
--- a/examples/gaussian_process/new/plot_gpr_prior_posterior.py
+++ b/examples/gaussian_process/new/plot_gpr_prior_posterior.py
@@ -13,13 +13,13 @@
 
 from matplotlib import pyplot as plt
 
-from sklearn.gaussian_process import GaussianProcessRegression
+from sklearn.gaussian_process import GaussianProcessRegressor
 from sklearn.gaussian_process.kernels import RBF
 
 
 # Specify Gaussian Process
 kernel = (1e-10, 1.0, 100) * RBF(param_space=(1e-10, 1.0, None))
-gp = GaussianProcessRegression(kernel=kernel)
+gp = GaussianProcessRegressor(kernel=kernel)
 
 # Plot prior
 plt.figure(0, figsize=(8, 8))
diff --git a/sklearn/gaussian_process/__init__.py b/sklearn/gaussian_process/__init__.py
index 691b025872ca3..79abc52e51d16 100644
--- a/sklearn/gaussian_process/__init__.py
+++ b/sklearn/gaussian_process/__init__.py
@@ -9,8 +9,8 @@
 based predictions.
 """
 
-from .gpr import GaussianProcessRegression
-from .gpc import GaussianProcessClassification
+from .gpr import GaussianProcessRegressor
+from .gpc import GaussianProcessClassifier
 from . import kernels
 
 from .gaussian_process import GaussianProcess
diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 6032f0e8726d4..6c225887e0c24 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -25,7 +25,7 @@
                   128.12323805, -2010.49422654])[:, np.newaxis]
 
 
-class GaussianProcessClassification(BaseEstimator):
+class GaussianProcessClassifier(BaseEstimator):
     """ Gaussian process classification (GPC).
 
     The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 513dbd8da45fa..2ea8b023a0826 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -14,13 +14,13 @@
 from sklearn.utils.validation import check_X_y, check_array
 
 
-class GaussianProcessRegression(BaseEstimator):
+class GaussianProcessRegressor(BaseEstimator):
     """ Gaussian process regression (GPR).
 
     The implementation is based on Algorithm 2.1 of ``Gaussian Processes
     for Machine Learning'' (GPML) by Rasmussen and Williams.
 
-    In addition to standard sklearn estimators, GaussianProcessRegression
+    In addition to standard sklearn estimators, GaussianProcessRegressor
        * allows prediction without prior fitting (based on the GP prior)
        * provides an additional method sample(X), which evaluates samples drawn
          from the GPR (prior or posterior) at given inputs
diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py
index 3dcc109cdc8fb..d33b348901b0c 100644
--- a/sklearn/gaussian_process/tests/test_gpc.py
+++ b/sklearn/gaussian_process/tests/test_gpc.py
@@ -9,7 +9,7 @@
 
 from scipy.optimize import approx_fprime
 
-from sklearn.gaussian_process import GaussianProcessClassification
+from sklearn.gaussian_process import GaussianProcessClassifier
 from sklearn.gaussian_process.kernels import RBF
 
 from sklearn.utils.testing import (assert_true, assert_greater,
@@ -32,7 +32,7 @@ def test_predict_consistent():
     """
     for kernel in kernels:
         kernel = deepcopy(kernel)
-        gpc = GaussianProcessClassification(kernel=kernel).fit(X, y)
+        gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
         assert_array_equal(gpc.predict(X),
                            gpc.predict_proba(X) >=0.5)
 
@@ -44,7 +44,7 @@ def test_lml_improving():
             continue
         kernel = deepcopy(kernel)
         params_initial = kernel.params
-        gpc = GaussianProcessClassification(kernel=kernel).fit(X, y)
+        gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
         assert_greater(gpc.log_marginal_likelihood(kernel.params),
                        gpc.log_marginal_likelihood(params_initial))
 
@@ -55,7 +55,7 @@ def test_converged_to_local_maximum():
         if not kernel.has_bounds:
             continue
         kernel = deepcopy(kernel)
-        gpc = GaussianProcessClassification(kernel=kernel).fit(X, y)
+        gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
 
         lml, lml_gradient = gpc.log_marginal_likelihood(kernel.params, True)
 
@@ -67,7 +67,7 @@ def test_lml_gradient():
     for kernel in kernels:
         kernel = deepcopy(kernel)
         params = kernel.params
-        gpc = GaussianProcessClassification(kernel=kernel).fit(X, y)
+        gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
 
         lml, lml_gradient = gpc.log_marginal_likelihood(params, True)
         lml_gradient_approx = \
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index 0fb50f4272e00..d7241b9402895 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -9,7 +9,7 @@
 
 from scipy.optimize import approx_fprime
 
-from sklearn.gaussian_process import GaussianProcessRegression
+from sklearn.gaussian_process import GaussianProcessRegressor
 from sklearn.gaussian_process.kernels import RBF
 
 from sklearn.utils.testing import (assert_true, assert_greater,
@@ -32,7 +32,7 @@ def test_gpr_interpolation():
     """Test the interpolating property for different kernels."""
     for kernel in kernels:
         kernel = deepcopy(kernel)
-        gpr = GaussianProcessRegression(kernel=kernel).fit(X, y)
+        gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
         y_pred, y_cov = gpr.predict(X, return_cov=True)
 
         assert_true(np.allclose(y_pred, y))
@@ -46,7 +46,7 @@ def test_lml_improving():
             continue
         kernel = deepcopy(kernel)
         params_initial = kernel.params
-        gpr = GaussianProcessRegression(kernel=kernel).fit(X, y)
+        gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
         assert_greater(gpr.log_marginal_likelihood(kernel.params),
                        gpr.log_marginal_likelihood(params_initial))
 
@@ -57,7 +57,7 @@ def test_converged_to_local_maximum():
         if not kernel.has_bounds:
             continue
         kernel = deepcopy(kernel)
-        gpr = GaussianProcessRegression(kernel=kernel).fit(X, y)
+        gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
 
         lml, lml_gradient = gpr.log_marginal_likelihood(kernel.params, True)
 
@@ -69,7 +69,7 @@ def test_lml_gradient():
     for kernel in kernels:
         kernel = deepcopy(kernel)
         params = kernel.params
-        gpr = GaussianProcessRegression(kernel=kernel).fit(X, y)
+        gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
 
         lml, lml_gradient = gpr.log_marginal_likelihood(params, True)
         lml_gradient_approx = \
@@ -85,7 +85,7 @@ def test_prior():
     """ Test that GP prior has mean 0 and identical variances."""
     for kernel in kernels:
         kernel = deepcopy(kernel)
-        gpr = GaussianProcessRegression(kernel=kernel)
+        gpr = GaussianProcessRegressor(kernel=kernel)
 
         y_mean, y_cov = gpr.predict(X, return_cov=True)
 
@@ -101,7 +101,7 @@ def test_sample_statistics():
     """ Test that statistics of samples drawn from GP are correct."""
     for kernel in kernels:
         kernel = deepcopy(kernel)
-        gpr = GaussianProcessRegression(kernel=kernel).fit(
+        gpr = GaussianProcessRegressor(kernel=kernel).fit(
             X, y)
 
         y_mean, y_cov = gpr.predict(X2, return_cov=True)

From 3a6afbc4eb2e4c37020fb26c1e9f4ddf0e30e16d Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 24 Feb 2015 18:00:05 +0100
Subject: [PATCH 020/165] ENH More efficient approximation of gradient in
 PairwiseKernel

---
 sklearn/gaussian_process/kernels.py | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index e7ab76b0965c0..7cc5d1901f435 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -25,7 +25,6 @@
 
 import numpy as np
 from scipy.spatial.distance import pdist, cdist, squareform
-from scipy.optimize import approx_fprime
 
 from ..metrics.pairwise import pairwise_kernels
 from ..externals import six
@@ -440,6 +439,19 @@ def __call__(self, X, Y=None, eval_gradient=False):
             return K
 
 
+# adapted from scipy/optimize/optimize.py for functions with 2d output
+def _approx_fprime(xk, f, epsilon, args=()):
+    f0 = f(*((xk,) + args))
+    grad = np.zeros((f0.shape[0], f0.shape[1], len(xk)), float)
+    ei = np.zeros((len(xk), ), float)
+    for k in range(len(xk)):
+        ei[k] = 1.0
+        d = epsilon * ei
+        grad[:, :, k] = (f(*((xk + d,) + args)) - f0) / d[k]
+        ei[k] = 0.0
+    return grad
+
+
 class PairwiseKernel(Kernel):
     """ Wrapper for kernels in sklearn.metrics.pairwise.
 
@@ -514,19 +526,10 @@ def __call__(self, X, Y=None, eval_gradient=False):
                              filter_params=True, **self.kwargs)
         if eval_gradient:
             # approximate gradient numerically
-            K_gradient = np.empty((K.shape[0], K.shape[1], 1))
-
-            def f(gamma, i, j):  # helper function
+            def f(gamma):  # helper function
                 return pairwise_kernels(
                     X, Y, metric=self.metric, gamma=gamma,
-                    filter_params=True, **self.kwargs)[i, j]
-            # XXX: avoid python for-loops
-            for i in range(K.shape[0]):
-                for j in range(K.shape[0]):
-                    K_gradient[i, j] = \
-                        approx_fprime(np.array([self.gamma]),
-                                      partial(f, i=i, j=j), 1e-10)
-
-            return K, K_gradient
+                    filter_params=True, **self.kwargs)
+            return K, _approx_fprime(np.array([self.gamma]), f, 1e-10)
         else:
             return K

From 38fe9cf1488c9e08b15a4f7cc12aa31739028dd4 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 24 Feb 2015 18:32:27 +0100
Subject: [PATCH 021/165] ENH Avoid to create square matrix W_sr in all but one
 place (TODO)

---
 sklearn/gaussian_process/gpc.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 6c225887e0c24..690a52232a7c0 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -94,7 +94,7 @@ def predict_proba(self, X):
         # Based on Algorithm 3.2 of GPML
         K_star = self.kernel(self.X_fit_, X)  # K_star =k(x_star)
         f_star = K_star.T.dot(self.y_fit_ - self.pi)  # Line 4
-        v = solve(self.L, self.W_sr.dot(K_star))  # Line 5
+        v = solve(self.L, self.W_sr[:, np.newaxis] * K_star)  # Line 5
         var_f_star = self.kernel(X) - v.T.dot(v)  # Line 6
 
         # Line 7:
@@ -136,8 +136,9 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
 
         # Compute gradient based on Algorithm 5.1 of GPML
         d_Z = np.empty(theta.shape[0])
-        R = W_sr.dot(cho_solve((L, True), W_sr))  # Line 7
-        C = solve(L, W_sr.dot(K))  # Line 8
+        # XXX: Get rid of the np.diag() in the next line
+        R = W_sr[:, np.newaxis] * cho_solve((L, True), np.diag(W_sr))  # Line 7
+        C = solve(L, W_sr[:, np.newaxis] * K)  # Line 8
         # Line 9:
         s_2 = -0.5*(np.diag(K) - np.diag(C.T.dot(C))) \
             * (pi * (1 - pi) * (1 - 2*pi))  # third derivative
@@ -161,13 +162,14 @@ def _posterior_mode(self, K, return_temporaries=False):
             pi = 1 / (1 + np.exp(-f))
             W = pi * (1 - pi)
             # Line 5
-            W_sr = np.diag(np.sqrt(W))  # XXX: avoid creating square matrix?
-            B = np.eye(W.shape[0]) + W_sr.dot(K).dot(W_sr)
+            W_sr = np.sqrt(W)
+            W_sr_K = W_sr[:, np.newaxis] * K
+            B = np.eye(W.shape[0]) + W_sr_K * W_sr
             L = cholesky(B, lower=True)
             # Line 6
             b = W * f + (self.y_fit_ - pi)
             # Line 7
-            a = b - W_sr.dot(cho_solve((L, True), W_sr.dot(K).dot(b)))
+            a = b - W_sr * cho_solve((L, True), W_sr_K.dot(b))
             # Line 8
             f = K.dot(a)
 

From 7899e5847985ff43f0c3ea77a7cd7352b7a88c8b Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 26 Feb 2015 07:03:23 +0100
Subject: [PATCH 022/165] ENH Adding RationalQuadratic and ExpSineSquared
 kernels

---
 sklearn/gaussian_process/kernels.py | 88 +++++++++++++++++++++++++++--
 1 file changed, 84 insertions(+), 4 deletions(-)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 7cc5d1901f435..01849e535c7c1 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -99,8 +99,7 @@ def __rmul__(self, b):
 
     def __repr__(self):
         return "{0}({1})".format(self.__class__.__name__,
-                                 ", ".join(map("{0}".format, self.params)))
-
+                                 ", ".join(map("{0:.3g}".format, self.params)))
     @abstractmethod
     def __call__(self, X, Y=None, eval_gradient=False):
         """Evaluate the kernel."""
@@ -301,7 +300,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             return K
 
     def __repr__(self):
-        return "{0}".format(self.value)
+        return "{0:.3g}".format(self.value)
 
 
 class RBF(Kernel):
@@ -530,6 +529,87 @@ def f(gamma):  # helper function
                 return pairwise_kernels(
                     X, Y, metric=self.metric, gamma=gamma,
                     filter_params=True, **self.kwargs)
-            return K, _approx_fprime(np.array([self.gamma]), f, 1e-10)
+            return K, _approx_fprime(self.params, f, 1e-10)
+        else:
+            return K
+
+
+class RationalQuadratic(Kernel):
+
+    def __init__(self, param_space=[(1.0,), (1.0,)]):
+        self._parse_param_space(param_space)
+
+
+    @property
+    def params(self):
+        return np.asarray([self.alpha, self.l])
+
+    @params.setter
+    def params(self, theta):
+        self.alpha = theta[0]
+        self.l = theta[1]
+
+    def __call__(self, X, Y=None, eval_gradient=False):
+        if Y is None:
+            dists = pdist(X, metric='sqeuclidean')
+            K = (1 + dists / (2 * self.alpha * self.l ** 2)) ** -self.alpha
+            # convert from upper-triangular matrix to square matrix
+            K = squareform(K)
+            np.fill_diagonal(K, 1)
+        else:
+            if eval_gradient:
+                raise ValueError(
+                    "Gradient can only be evaluated when Y is None.")
+            dists = cdist(X, Y, metric='sqeuclidean')
+            K = (1 + dists / (2 * self.alpha * self.l ** 2)) ** -self.alpha
+
+        if eval_gradient:
+            # approximate gradient numerically
+            def f(theta):  # helper function
+                theta_, self.params = self.params, theta
+                K = self(X, Y)
+                self.params = theta_
+                return K
+            return K, _approx_fprime(self.params, f, 1e-10)
+        else:
+            return K
+
+
+class ExpSineSquared(Kernel):
+
+    def __init__(self, param_space=[(1.0,), (1.0,)]):
+        self._parse_param_space(param_space)
+
+    @property
+    def params(self):
+        return np.asarray([self.l, self.c])
+
+    @params.setter
+    def params(self, theta):
+        self.l = theta[0]
+        self.c = theta[1]
+
+    def __call__(self, X, Y=None, eval_gradient=False):
+        if Y is None:
+            dists = pdist(X, metric='euclidean')
+            K = np.exp(- self.c * np.sin(np.pi * dists / self.l) ** 2)
+            # convert from upper-triangular matrix to square matrix
+            K = squareform(K)
+            np.fill_diagonal(K, 1)
+        else:
+            if eval_gradient:
+                raise ValueError(
+                    "Gradient can only be evaluated when Y is None.")
+            dists = cdist(X, Y, metric='euclidean')
+            K = np.exp(- self.c * np.sin(np.pi * dists / self.l) ** 2)
+
+        if eval_gradient:
+            # approximate gradient numerically
+            def f(theta):  # helper function
+                theta_, self.params = self.params, theta
+                K = self(X, Y)
+                self.params = theta_
+                return K
+            return K, _approx_fprime(self.params, f, 1e-10)
         else:
             return K

From 0f425724ce8c790a87ab358e966ea94652be6db9 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sat, 28 Feb 2015 11:07:25 +0100
Subject: [PATCH 023/165] FIX Dealing with singular kernel matrix (-inf
 log-marginal-likelihood)

---
 sklearn/gaussian_process/gpr.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 2ea8b023a0826..a8372d6356c61 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -138,7 +138,12 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
             K = kernel(self.X_fit_)
 
         K[np.diag_indices_from(K)] += self.y_err
-        L = cholesky(K, lower=True)  # Line 2
+        try:
+            L = cholesky(K, lower=True)  # Line 2
+        except np.linalg.LinAlgError:
+            return (-np.inf, np.zeros_like(theta))\
+                 if eval_gradient else -np.inf
+
         alpha = cho_solve((L, True), self.y_fit_)  # Line 3
 
         # Compute log-likelihood (compare line 7)

From 0fac1ad2bf230942eb637f75cc2deab7509a4826 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sat, 28 Feb 2015 11:10:03 +0100
Subject: [PATCH 024/165] ENH Adding DotProduct kernel and fixing ExpSine2
 kernel

---
 sklearn/gaussian_process/kernels.py | 44 ++++++++++++++++++++++++++---
 1 file changed, 40 insertions(+), 4 deletions(-)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 01849e535c7c1..27bce9c3ff439 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -38,6 +38,7 @@ def _parse_param_space(self, param_space):
             self.params = np.array([float(param_space)])
             self.has_bounds = False
             return
+
         param_space = np.atleast_2d(param_space)
         if param_space.shape[1] == 1:
             # fixed hyperparameter
@@ -582,17 +583,17 @@ def __init__(self, param_space=[(1.0,), (1.0,)]):
 
     @property
     def params(self):
-        return np.asarray([self.l, self.c])
+        return np.asarray([self.l, self.p])
 
     @params.setter
     def params(self, theta):
         self.l = theta[0]
-        self.c = theta[1]
+        self.p = theta[1]
 
     def __call__(self, X, Y=None, eval_gradient=False):
         if Y is None:
             dists = pdist(X, metric='euclidean')
-            K = np.exp(- self.c * np.sin(np.pi * dists / self.l) ** 2)
+            K = np.exp(- 2 * (np.sin(np.pi / self.p * dists) / self.l) ** 2)
             # convert from upper-triangular matrix to square matrix
             K = squareform(K)
             np.fill_diagonal(K, 1)
@@ -601,7 +602,42 @@ def __call__(self, X, Y=None, eval_gradient=False):
                 raise ValueError(
                     "Gradient can only be evaluated when Y is None.")
             dists = cdist(X, Y, metric='euclidean')
-            K = np.exp(- self.c * np.sin(np.pi * dists / self.l) ** 2)
+            K = np.exp(- 2 * (np.sin(np.pi / self.p * dists) / self.l) ** 2)
+
+        if eval_gradient:
+            # approximate gradient numerically
+            def f(theta):  # helper function
+                theta_, self.params = self.params, theta
+                K = self(X, Y)
+                self.params = theta_
+                return K
+            return K, _approx_fprime(self.params, f, 1e-5)
+        else:
+            return K
+
+
+class DotProduct(Kernel):
+
+    def __init__(self, param_space=1.0, degree=1):
+        self._parse_param_space(param_space)
+        self.degree = degree
+
+    @property
+    def params(self):
+        return np.asarray([self.sigma_0])
+
+    @params.setter
+    def params(self, theta):
+        self.sigma_0 = theta[0]
+
+    def __call__(self, X, Y=None, eval_gradient=False):
+        if Y is None:
+            K = (np.inner(X, X) + self.sigma_0 ** 2) ** self.degree
+        else:
+            if eval_gradient:
+                raise ValueError(
+                    "Gradient can only be evaluated when Y is None.")
+            K = (np.inner(X, Y) + self.sigma_0 ** 2) ** self.degree
 
         if eval_gradient:
             # approximate gradient numerically

From c7c673457d953e728adf2ee483a6c53283341ac0 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sat, 28 Feb 2015 11:15:40 +0100
Subject: [PATCH 025/165] DOC Initial version of Mauna Loa CO2 example

---
 examples/gaussian_process/new/plot_gpr_co2.py | 48 +++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 examples/gaussian_process/new/plot_gpr_co2.py

diff --git a/examples/gaussian_process/new/plot_gpr_co2.py b/examples/gaussian_process/new/plot_gpr_co2.py
new file mode 100644
index 0000000000000..aeadfa50da148
--- /dev/null
+++ b/examples/gaussian_process/new/plot_gpr_co2.py
@@ -0,0 +1,48 @@
+"""Gaussian process regression (GPR) on Mauna Loa CO2 data. """
+print __doc__
+
+# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
+#
+# License: BSD 3 clause
+
+import numpy as np
+import statsmodels.api as sm   # XXX: Upload data on mldata
+
+from matplotlib import pyplot as plt
+
+from sklearn.gaussian_process import GaussianProcessRegressor
+from sklearn.gaussian_process.kernels \
+    import RBF, Kernel, WhiteKernel, RationalQuadratic, ExpSineSquared
+
+data = sm.datasets.get_rdataset("co2").data
+X = np.array(data.time)[:, np.newaxis]
+y = np.array(data.co2)
+y_mean = y.mean()
+
+
+k1 = 66.0**2 * RBF(67.0)
+k2 = 2.4**2 * RBF(90) * ExpSineSquared(((1.3,), (1.0,)))
+k3 = 0.66**2 * RationalQuadratic(((0.78,), (1.2,)))
+k4 = 0.18**2 * RBF(0.134) + WhiteKernel((0, 0.19**2, None))
+kernel = k1 + k2 + k3 + k4
+
+gp = GaussianProcessRegressor(kernel=kernel, y_err=1e-2)
+gp.fit(X, y - y_mean)
+
+print "Learned kernel: %s" % kernel
+print "Log-marginal-likelihood: %.3f" % gp.log_marginal_likelihood(gp.theta_)
+
+X_ = np.linspace(X.min(), X.max() + 30, 1000)[:, np.newaxis]
+y_pred, y_std = gp.predict(X_, return_std=True)
+y_pred += y_mean
+
+plt.scatter(X, y, c='k')
+plt.plot(X_, y_pred)
+plt.fill_between(X_[:, 0], y_pred - y_std, y_pred + y_std,
+                 alpha=0.5, color='k')
+plt.xlim(X_.min(), X_.max())
+plt.xlabel("Year")
+plt.ylabel(r"CO$_2$ in ppm")
+plt.title(r"Atmospheric CO$_2$ concentration at Mauna Loa")
+plt.tight_layout()
+plt.show()

From e087a22f6ed6f76f83a59ac958a61eecb922bf64 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sat, 28 Feb 2015 18:53:37 +0100
Subject: [PATCH 026/165] REFACTOR Hyperparameter optimization in GPs enabled
 explicitly by optimizer parameter

---
 examples/gaussian_process/new/plot_gpc.py     |  6 +--
 examples/gaussian_process/new/plot_gpc_xor.py |  2 +-
 .../gaussian_process/new/plot_gpr_noisy.py    |  6 +--
 .../new/plot_gpr_prior_posterior.py           |  2 +-
 sklearn/gaussian_process/gpc.py               |  9 +++--
 sklearn/gaussian_process/gpr.py               | 21 +++++++---
 sklearn/gaussian_process/kernels.py           | 29 +++++++-------
 sklearn/gaussian_process/tests/test_gpc.py    |  4 --
 sklearn/gaussian_process/tests/test_gpr.py    | 39 ++++++++++++++-----
 9 files changed, 75 insertions(+), 43 deletions(-)

diff --git a/examples/gaussian_process/new/plot_gpc.py b/examples/gaussian_process/new/plot_gpc.py
index 8eb4f04d3b60e..200da2e29acc9 100644
--- a/examples/gaussian_process/new/plot_gpc.py
+++ b/examples/gaussian_process/new/plot_gpc.py
@@ -25,10 +25,10 @@
 y = np.array(np.sin((X[:, 0] - 2.5) ** 2) > 0.0, dtype=int)
 
 # Specify Gaussian Processes with fixed and optimized hyperparameters
-kernel_fix = 4.0 * RBF(param_space=[1.0])
-gp_fix = GaussianProcessClassifier(kernel=kernel_fix).fit(X, y)
+kernel_fix = 1.0 * RBF(param_space=1.0)
+gp_fix = GaussianProcessClassifier(kernel=kernel_fix, optimizer=None).fit(X, y)
 
-kernel_opt = (1e-10, 1.0, 100) * RBF(param_space=(1e-10, 1, 10))
+kernel_opt = 1.0 * RBF(1.0)
 gp_opt = GaussianProcessClassifier(kernel=kernel_opt).fit(X, y)
 
 print "Log Marginal Likelihood (initial): %.3f" % \
diff --git a/examples/gaussian_process/new/plot_gpc_xor.py b/examples/gaussian_process/new/plot_gpc_xor.py
index 89488e3a948bb..adfeb12d57244 100644
--- a/examples/gaussian_process/new/plot_gpc_xor.py
+++ b/examples/gaussian_process/new/plot_gpc_xor.py
@@ -22,7 +22,7 @@
 Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)
 
 # fit the model
-kernel = [1e-10, 1.0, 100] * RBF(param_space=(1e-10, 1.0, 10))
+kernel = 1.0 * RBF(1.0)
 clf = GaussianProcessClassifier(kernel=kernel).fit(X, Y)
 
 # plot the decision function for each datapoint on the grid
diff --git a/examples/gaussian_process/new/plot_gpr_noisy.py b/examples/gaussian_process/new/plot_gpr_noisy.py
index dcfab1c66d869..e732b65494da7 100644
--- a/examples/gaussian_process/new/plot_gpr_noisy.py
+++ b/examples/gaussian_process/new/plot_gpr_noisy.py
@@ -33,8 +33,7 @@
 
 # First run
 plt.figure(0)
-kernel = (1e-10, 1.0, None) * RBF(param_space=(1e-10, 100.0, None)) \
-    + WhiteKernel(param_space=(1e-10, 1e-5, 1e+1))
+kernel = 1.0 * RBF(100.0) + WhiteKernel((1e-10, 1e-5, 1e+1))
 kernel_str = str(kernel)
 gp = GaussianProcessRegressor(kernel=kernel,
                               y_err=0.0).fit(X, y)
@@ -52,8 +51,7 @@
 
 # First run
 plt.figure(1)
-kernel = (1e-10, 1.0, None) * RBF(param_space=(1e-10, 1.0, None)) \
-    + WhiteKernel(param_space=(1e-10, 1e-5, 1e+1))
+kernel = 1.0 * RBF(1.0) + WhiteKernel((1e-10, 1e-5, 1e+1))
 kernel_str = str(kernel)
 gp = GaussianProcessRegressor(kernel=kernel,
                               y_err=0.0).fit(X, y)
diff --git a/examples/gaussian_process/new/plot_gpr_prior_posterior.py b/examples/gaussian_process/new/plot_gpr_prior_posterior.py
index a2f353b86804e..9898ed1444c3b 100644
--- a/examples/gaussian_process/new/plot_gpr_prior_posterior.py
+++ b/examples/gaussian_process/new/plot_gpr_prior_posterior.py
@@ -18,7 +18,7 @@
 
 
 # Specify Gaussian Process
-kernel = (1e-10, 1.0, 100) * RBF(param_space=(1e-10, 1.0, None))
+kernel = 1.0 * RBF((1e-10, 1.0, None))
 gp = GaussianProcessRegressor(kernel=kernel)
 
 # Plot prior
diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 690a52232a7c0..f95ca22d9ac47 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -40,9 +40,10 @@ class GaussianProcessClassifier(BaseEstimator):
       * binary classification
     """
 
-    def __init__(self, kernel=RBF(), jitter=0.0):
+    def __init__(self, kernel=RBF(), jitter=0.0, optimizer="fmin_l_bfgs_b"):
         self.kernel = kernel
         self.jitter = jitter
+        self.optimizer = optimizer
 
     def fit(self, X, y):
         X, y = check_X_y(X, y)
@@ -51,7 +52,7 @@ def fit(self, X, y):
         self.X_fit_ = X
         self.y_fit_ = y
 
-        if self.kernel.has_bounds:
+        if self.optimizer == "fmin_l_bfgs_b":
             # Choose hyperparameters based on maximizing the log-marginal
             # likelihood
             def obj_func(theta):
@@ -61,8 +62,10 @@ def obj_func(theta):
             self.theta_, _, _ = fmin_l_bfgs_b(obj_func, self.kernel.params,
                                               bounds=self.kernel.bounds)
             self.kernel.params = self.theta_
-        else:
+        elif self.optimizer is None:
             self.theta_ = self.kernel.params
+        else:
+            raise ValueError("Unknown optimizer %s." % self.optimizer)
 
         # Precompute quantities required for predictions which are independent
         # of actual query points
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index a8372d6356c61..568c0ebe2613d 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -38,6 +38,14 @@ class GaussianProcessRegressor(BaseEstimator):
         Larger values correspond to increased noise level in the observations
         and reduce potential numerical issue during fitting.
 
+    optimizer : string, optional (default: "fmin_l_bfgs_b")
+        A string specifying the optimization algorithm used for optimizing the
+        kernel's parameters. Default uses 'fmin_l_bfgs_b' algorithm from
+        scipy.optimize. If None, the kernel's parameters are kept fixed.
+        Available optimizers are::
+
+            'fmin_l_bfgs_b'
+
     Attributes
     ----------
     X_fit_:
@@ -51,9 +59,10 @@ class GaussianProcessRegressor(BaseEstimator):
     alpha_:
     """
 
-    def __init__(self, kernel=RBF(), y_err=1e-10):
+    def __init__(self, kernel=RBF(), y_err=1e-10, optimizer="fmin_l_bfgs_b"):
         self.kernel = kernel
         self.y_err = y_err
+        self.optimizer = optimizer
 
     def fit(self, X, y):
         X, y = check_X_y(X, y)
@@ -61,18 +70,20 @@ def fit(self, X, y):
         self.X_fit_ = X
         self.y_fit_ = y
 
-        if self.kernel.has_bounds:
+        if self.optimizer == "fmin_l_bfgs_b":
             # Choose hyperparameters based on maximizing the log-marginal
-            # likelihood
+            # likelihood using fmin_l_bfgs_b
             def obj_func(theta):
                 lml, grad = self.log_marginal_likelihood(theta,
                                                          eval_gradient=True)
                 return -lml, -grad
-            self.theta_, lml, _ = fmin_l_bfgs_b(obj_func, self.kernel.params,
+            self.theta_, _, _ = fmin_l_bfgs_b(obj_func, self.kernel.params,
                                                 bounds=self.kernel.bounds)
             self.kernel.params = self.theta_
-        else:
+        elif self.optimizer is None:
             self.theta_ = self.kernel.params
+        else:
+            raise ValueError("Unknown optimizer %s." % self.optimizer)
 
         # Precompute quantities required for predictions which are independent
         # of actual query points
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 27bce9c3ff439..e886c7d434803 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -34,24 +34,32 @@ class Kernel(six.with_metaclass(ABCMeta)):
     """ Base class for all kernels."""
 
     def _parse_param_space(self, param_space):
-        if not np.iterable(param_space):  # fixed hyperparameter
+        if not np.iterable(param_space):
             self.params = np.array([float(param_space)])
-            self.has_bounds = False
+            # No custom bounds specified; use default bounds
+            default_bounds = np.empty((self.params.shape[0], 2),
+                                      dtype=self.params.dtype)
+            default_bounds[:, 0] = 1e-5
+            default_bounds[:, 1] = np.inf
+            self.bounds = default_bounds
             return
 
         param_space = np.atleast_2d(param_space)
         if param_space.shape[1] == 1:
-            # fixed hyperparameter
             self.params = param_space[:, 0]
-            self.has_bounds = False
+            # No custom bounds specified; use default bounds
+            default_bounds = np.empty((self.params.shape[0], 2),
+                                      dtype=self.params.dtype)
+            default_bounds[:, 0] = 1e-5
+            default_bounds[:, 1] = np.inf
+            self.bounds = default_bounds
         elif param_space.shape[1] == 2:
-            # lower+upper bound for hyperparameter
+            # lower + upper bound for hyperparameter
             self.bounds = param_space
-            self.has_bounds = True
             # Use geometric mean of upper and lower boundary as initial
             # hyperparameter value
-            if np.any(np.equal(self.l_bound, None)) \
-               or np.any(np.equal(self.u_bound, None)):
+            if np.any(np.equal(self.l_bound, np.inf)) \
+               or np.any(np.equal(self.u_bound, np.inf)):
                 raise ValueError("Lower or upper bound being None requires "
                                  "explicitly specifying the initial value.")
             self.params = np.array([np.sqrt(self.l_bound * self.u_bound)])
@@ -59,7 +67,6 @@ def _parse_param_space(self, param_space):
             # lower bound, initial value, upper bound
             self.params = param_space[:, 1]
             self.bounds = param_space[:, [0, 2]]
-            self.has_bounds = True
         else:
             raise ValueError("Invalid parameter space given. Must not have "
                              "more than 3 entries per parameter.")
@@ -112,9 +119,6 @@ class KernelOperator(Kernel):
     def __init__(self, k1, k2):
         self.k1 = k1
         self.k2 = k2
-        # XXX: Deal with situations in which only some of the hyperparameter
-        #      shall be optimized
-        self.has_bounds = k1.has_bounds and k2.has_bounds
 
     @property
     def params(self):
@@ -128,7 +132,6 @@ def params(self, theta):
 
     @property
     def bounds(self):
-        assert self.has_bounds
         return np.vstack((self.k1.bounds, self.k2.bounds))
 
     @bounds.setter
diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py
index d33b348901b0c..088166c25ae3d 100644
--- a/sklearn/gaussian_process/tests/test_gpc.py
+++ b/sklearn/gaussian_process/tests/test_gpc.py
@@ -40,8 +40,6 @@ def test_predict_consistent():
 def test_lml_improving():
     """ Test that hyperparameter-tuning improves log-marginal likelihood. """
     for kernel in kernels:
-        if not kernel.has_bounds:
-            continue
         kernel = deepcopy(kernel)
         params_initial = kernel.params
         gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
@@ -52,8 +50,6 @@ def test_lml_improving():
 def test_converged_to_local_maximum():
     """ Test that we are in local maximum after hyperparameter-optimization. """
     for kernel in kernels:
-        if not kernel.has_bounds:
-            continue
         kernel = deepcopy(kernel)
         gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
 
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index d7241b9402895..94a033b7a9496 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -12,8 +12,9 @@
 from sklearn.gaussian_process import GaussianProcessRegressor
 from sklearn.gaussian_process.kernels import RBF
 
-from sklearn.utils.testing import (assert_true, assert_greater,
-                                   assert_almost_equal)
+from sklearn.utils.testing \
+    import (assert_true, assert_greater, assert_array_less,
+            assert_almost_equal, assert_equal)
 
 
 def f(x):
@@ -23,10 +24,11 @@ def f(x):
 y = f(X).ravel()
 
 
-kernels = [RBF(0.1), RBF((1e-3, 1.0, 1e3)),
+kernels = [RBF(1.0), RBF((1e-3, 1.0, 1e3)),
            (1e-2, 1.0, 1e2) * RBF((1e-3, 0.1, 1e3)),
-           (1e-2, 1.0, 1e2) * RBF((1e-3, 0.1, 1e3)) + (None, 0.0, None),
-           (1e-2, 0.1, 1e2) * RBF((1e-3, 0.1, 1e3)) + (None, 0.0, None)]
+           (1e-2, 1.0, 1e2) * RBF((1e-3, 0.1, 1e3)) + (0.0, 0.0, 1e2),
+           (1e-2, 0.1, 1e2) * RBF((1e-3, 0.1, 1e3)) + (0.0, 0.0, 1e2)]
+
 
 def test_gpr_interpolation():
     """Test the interpolating property for different kernels."""
@@ -42,8 +44,6 @@ def test_gpr_interpolation():
 def test_lml_improving():
     """ Test that hyperparameter-tuning improves log-marginal likelihood. """
     for kernel in kernels:
-        if not kernel.has_bounds:
-            continue
         kernel = deepcopy(kernel)
         params_initial = kernel.params
         gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
@@ -54,8 +54,6 @@ def test_lml_improving():
 def test_converged_to_local_maximum():
     """ Test that we are in local maximum after hyperparameter-optimization. """
     for kernel in kernels:
-        if not kernel.has_bounds:
-            continue
         kernel = deepcopy(kernel)
         gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
 
@@ -64,6 +62,21 @@ def test_converged_to_local_maximum():
         assert_almost_equal(lml_gradient, 0, 5)
 
 
+def test_solution_inside_bounds():
+    """ Test that hyperparameter-optimization remains in bounds"""
+    for kernel in kernels:
+        kernel = deepcopy(kernel)
+        gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
+
+        bounds = kernel.bounds
+        max_ = np.finfo(bounds.dtype).max
+        tiny = np.finfo(bounds.dtype).tiny
+        bounds[~np.isfinite(bounds[:, 1]), 1] = max_
+
+        assert_array_less(bounds[:, 0], kernel.params + tiny)
+        assert_array_less(kernel.params, bounds[:, 1] + tiny)
+
+
 def test_lml_gradient():
     """ Compare analytic and numeric gradient of log marginal likelihood. """
     for kernel in kernels:
@@ -112,3 +125,11 @@ def test_sample_statistics():
         assert_almost_equal(y_mean, np.mean(samples, 1), 2)
         assert_almost_equal(np.diag(y_cov) / np.diag(y_cov).max(),
                             np.var(samples, 1) / np.diag(y_cov).max(), 1)
+
+
+def test_no_optimizer():
+    """ Test that kernel parameters are unmodified when optimizer is None."""
+    kernel = RBF(1.0)
+    gpr = GaussianProcessRegressor(kernel=kernel, optimizer=None).fit(X, y)
+    assert_equal(kernel.params, 1.0)
+    assert_equal(gpr.theta_, 1.0)

From 540a00915df1803c26a1cd74973fd3f40115cc1b Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sat, 28 Feb 2015 18:53:55 +0100
Subject: [PATCH 027/165] DOC Revising CO2 example

---
 examples/gaussian_process/new/plot_gpr_co2.py | 26 ++++++++++++++-----
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/examples/gaussian_process/new/plot_gpr_co2.py b/examples/gaussian_process/new/plot_gpr_co2.py
index aeadfa50da148..439941b2b8218 100644
--- a/examples/gaussian_process/new/plot_gpr_co2.py
+++ b/examples/gaussian_process/new/plot_gpr_co2.py
@@ -19,23 +19,37 @@
 y = np.array(data.co2)
 y_mean = y.mean()
 
+# Kernel with parameters given in GPML book
+k1 = 66.0**2 * RBF(67.0)  # long term smooth rising trend
+k2 = 2.4**2 * RBF(90) * ExpSineSquared(((1.3,), (1.0,)))  # seasonal component
+k3 = 0.66**2 * RationalQuadratic(((0.78,), (1.2,)))  # medium term irregularit.
+k4 = 0.18**2 * RBF(0.134) + WhiteKernel(0.19**2) # noise terms
+kernel_gpml = k1 + k2 + k3 + k4
+
+gp = GaussianProcessRegressor(kernel=kernel_gpml, y_err=0, optimizer=None)
+gp.fit(X, y - y_mean)
+
+print "GPML kernel: %s" % kernel_gpml
+print "Log-marginal-likelihood: %.3f" % gp.log_marginal_likelihood(gp.theta_)
 
-k1 = 66.0**2 * RBF(67.0)
-k2 = 2.4**2 * RBF(90) * ExpSineSquared(((1.3,), (1.0,)))
-k3 = 0.66**2 * RationalQuadratic(((0.78,), (1.2,)))
-k4 = 0.18**2 * RBF(0.134) + WhiteKernel((0, 0.19**2, None))
+# Kernel with optimized parameters
+k1 = 50.0**2 * RBF(50.0) # long term smooth rising trend
+k2 = 2.0**2 * RBF(100) * ExpSineSquared(((1.0,), (1.0,))) # seasonal component
+k3 = 0.5**2 * RationalQuadratic(((1.0,), (1.0,))) # medium term irregularities
+k4 = 0.1**2 * RBF(0.1) + WhiteKernel((1e-3, 0.1**2, np.inf))  # noise terms
 kernel = k1 + k2 + k3 + k4
 
-gp = GaussianProcessRegressor(kernel=kernel, y_err=1e-2)
+gp = GaussianProcessRegressor(kernel=kernel, y_err=0)
 gp.fit(X, y - y_mean)
 
-print "Learned kernel: %s" % kernel
+print "\nLearned kernel: %s" % kernel
 print "Log-marginal-likelihood: %.3f" % gp.log_marginal_likelihood(gp.theta_)
 
 X_ = np.linspace(X.min(), X.max() + 30, 1000)[:, np.newaxis]
 y_pred, y_std = gp.predict(X_, return_std=True)
 y_pred += y_mean
 
+# Illustration
 plt.scatter(X, y, c='k')
 plt.plot(X_, y_pred)
 plt.fill_between(X_[:, 0], y_pred - y_std, y_pred + y_std,

From 8c9902795de7c95b05df3901c1425c99b68fb2a0 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 1 Mar 2015 13:33:58 +0100
Subject: [PATCH 028/165] TST Cover more kernels in tests

---
 sklearn/gaussian_process/tests/test_kernels.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index a4c870e0ded31..52cb472fba17c 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -11,7 +11,8 @@
 
 from sklearn.metrics.pairwise import PAIRWISE_KERNEL_FUNCTIONS
 from sklearn.gaussian_process.kernels \
-    import RBF, ConstantKernel, WhiteKernel, PairwiseKernel
+    import (RBF, RationalQuadratic, ExpSineSquared, DotProduct,
+            ConstantKernel, WhiteKernel, PairwiseKernel)
 
 from sklearn.utils.testing import assert_equal, assert_almost_equal
 
@@ -20,7 +21,10 @@
 
 kernels = [RBF(2.0), RBF([[0.5], [2.0]]),
            ConstantKernel(10.0),
-           2.0 * RBF(0.5), RBF(2.0) + WhiteKernel(1.0)]
+           2.0 * RBF(0.5), RBF(2.0) + WhiteKernel(1.0),
+           RationalQuadratic([(1.0,), (1.0,)]),
+           ExpSineSquared([(1.0,), (1.0,)]),
+           DotProduct(1.0), DotProduct(1.0, degree=2)]
 for metric in PAIRWISE_KERNEL_FUNCTIONS:
     if metric in ["additive_chi2", "chi2"]:
         continue
@@ -48,7 +52,7 @@ def eval_kernel_ij_for_theta(theta):
                     approx_fprime(kernel.params, eval_kernel_ij_for_theta,
                                   1e-10)
 
-        assert_almost_equal(K_gradient, K_gradient_approx, 5)
+        assert_almost_equal(K_gradient, K_gradient_approx, 4)
 
 
 def test_auto_vs_cross():

From 264dd9e11912ac2b7c2380b3eb512c3f8c993607 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 1 Mar 2015 13:35:48 +0100
Subject: [PATCH 029/165] ENH Analytic gradients for ExpSine2,
 RationalQuadratic, DotProduct kernels

---
 sklearn/gaussian_process/kernels.py | 55 +++++++++++++----------------
 1 file changed, 24 insertions(+), 31 deletions(-)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index e886c7d434803..bbff12cf1cf8d 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -555,10 +555,10 @@ def params(self, theta):
 
     def __call__(self, X, Y=None, eval_gradient=False):
         if Y is None:
-            dists = pdist(X, metric='sqeuclidean')
-            K = (1 + dists / (2 * self.alpha * self.l ** 2)) ** -self.alpha
-            # convert from upper-triangular matrix to square matrix
-            K = squareform(K)
+            dists = squareform(pdist(X, metric='sqeuclidean'))
+            tmp = dists / (2 * self.alpha * self.l ** 2)
+            base = (1 + tmp)
+            K = base ** -self.alpha
             np.fill_diagonal(K, 1)
         else:
             if eval_gradient:
@@ -568,13 +568,10 @@ def __call__(self, X, Y=None, eval_gradient=False):
             K = (1 + dists / (2 * self.alpha * self.l ** 2)) ** -self.alpha
 
         if eval_gradient:
-            # approximate gradient numerically
-            def f(theta):  # helper function
-                theta_, self.params = self.params, theta
-                K = self(X, Y)
-                self.params = theta_
-                return K
-            return K, _approx_fprime(self.params, f, 1e-10)
+            K_gradient = np.empty((K.shape[0], K.shape[1], 2))
+            K_gradient[..., 0] = K * (-np.log(base) + tmp / base)
+            K_gradient[..., 1] = dists * K / (self.l ** 2 * base)
+            return K, K_gradient
         else:
             return K
 
@@ -595,11 +592,10 @@ def params(self, theta):
 
     def __call__(self, X, Y=None, eval_gradient=False):
         if Y is None:
-            dists = pdist(X, metric='euclidean')
-            K = np.exp(- 2 * (np.sin(np.pi / self.p * dists) / self.l) ** 2)
-            # convert from upper-triangular matrix to square matrix
-            K = squareform(K)
-            np.fill_diagonal(K, 1)
+            dists = squareform(pdist(X, metric='euclidean'))
+            arg = np.pi  * dists / self.p
+            sin_of_arg = np.sin(arg)
+            K = np.exp(- 2 * (sin_of_arg / self.l) ** 2)
         else:
             if eval_gradient:
                 raise ValueError(
@@ -608,13 +604,12 @@ def __call__(self, X, Y=None, eval_gradient=False):
             K = np.exp(- 2 * (np.sin(np.pi / self.p * dists) / self.l) ** 2)
 
         if eval_gradient:
-            # approximate gradient numerically
-            def f(theta):  # helper function
-                theta_, self.params = self.params, theta
-                K = self(X, Y)
-                self.params = theta_
-                return K
-            return K, _approx_fprime(self.params, f, 1e-5)
+            K_gradient = np.empty((K.shape[0], K.shape[1], 2))
+            cos_of_arg = np.cos(arg)
+            K_gradient[..., 0] = 4 / self.l**3 * sin_of_arg**2 * K
+            K_gradient[..., 1] = \
+                4 * arg / (self.l**2 * self.p) * cos_of_arg * sin_of_arg * K
+            return K, K_gradient
         else:
             return K
 
@@ -635,7 +630,8 @@ def params(self, theta):
 
     def __call__(self, X, Y=None, eval_gradient=False):
         if Y is None:
-            K = (np.inner(X, X) + self.sigma_0 ** 2) ** self.degree
+            dot_product = np.inner(X, X)
+            K = (dot_product + self.sigma_0 ** 2) ** self.degree
         else:
             if eval_gradient:
                 raise ValueError(
@@ -643,12 +639,9 @@ def __call__(self, X, Y=None, eval_gradient=False):
             K = (np.inner(X, Y) + self.sigma_0 ** 2) ** self.degree
 
         if eval_gradient:
-            # approximate gradient numerically
-            def f(theta):  # helper function
-                theta_, self.params = self.params, theta
-                K = self(X, Y)
-                self.params = theta_
-                return K
-            return K, _approx_fprime(self.params, f, 1e-10)
+            K_gradient = np.empty((K.shape[0], K.shape[1], 1))
+            K_gradient[..., 0] = 2 * self.sigma_0 * self.degree \
+                * (dot_product + self.sigma_0 ** 2) ** (self.degree - 1)
+            return K, K_gradient
         else:
             return K

From 7e8c4a7bff8a61b0bcf79bcc767df9465c9a788f Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 1 Mar 2015 14:08:28 +0100
Subject: [PATCH 030/165] DOC Extended documentation of GP kernels module

---
 sklearn/gaussian_process/kernels.py | 130 ++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index bbff12cf1cf8d..9ed0b3be7f0e5 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -252,6 +252,12 @@ class ConstantKernel(Kernel):
     Can be used as part of a product-kernel where it scales the magnitude of
     the other factor (kernel) or as part of a sum-kernel, where it modifies
     the mean of the Gaussian process.
+
+    Tunable kernel parameters
+    -------------------------
+    value : float
+        The constant value used for determining the magnitude (product-kernel)
+        or offset of mean (sum-kernel).
     """
 
     def __init__(self, param_space=1.0):
@@ -311,6 +317,13 @@ class RBF(Kernel):
     """ Radial-basis function kernel (aka squared-exponential kernel).
 
     Both isotropic and anisotropic version are supported.
+
+    Tunable kernel parameters
+    -------------------------
+    l : float or array with shape (n_features,), entries > 0
+        The length scale of the kernel. If a float, an isotropic kernel is
+        used. If an array, an anisotropic kernel is used where each dimension
+        of l defines the length-scale of the respective feature dimension.
     """
 
     def __init__(self, param_space=1.0):
@@ -387,6 +400,11 @@ class WhiteKernel(Kernel):
     The main use-case of this kernel is as part of a sum-kernel where it
     explains the noise-component of the signal. Tuning its parameter
     corresponds to estimating the noise-level.
+
+    Tunable kernel parameters
+    -------------------------
+    c : float
+        Parameter controlling the noise level.
     """
 
     def __init__(self, param_space=1.0):
@@ -539,6 +557,20 @@ def f(gamma):  # helper function
 
 
 class RationalQuadratic(Kernel):
+    """ Rational Quadratic kernel.
+
+    This kernel can be seen as a scale mixture (an infinite sum) of RBF kernels
+    with different characteristic length-scales.
+
+    Only isotropic variant is supported at the moment.
+
+    Tunable kernel parameters
+    -------------------------
+    alpha : float > 0
+        Scale mixture parameter
+    l : float > 0
+        The length scale of the kernel.
+    """
 
     def __init__(self, param_space=[(1.0,), (1.0,)]):
         self._parse_param_space(param_space)
@@ -554,6 +586,31 @@ def params(self, theta):
         self.l = theta[1]
 
     def __call__(self, X, Y=None, eval_gradient=False):
+        """ Return the kernel k(X, Y) and optionally its gradient.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples_X, n_features)
+            Left argument of the returned kernel k(X, Y)
+
+        Y : array, shape (n_samples_Y, n_features), (optional, default=None)
+            Right argument of the returned kernel k(X, Y). If None, k(X, X)
+            is evaluated instead.
+
+        eval_gradient : bool (optional, default=False)
+            Determines whether the gradient with respect to the kernel
+            hyperparameter is determined. Only supported when Y is None.
+
+        Returns
+        -------
+        K : array, shape (n_samples_X, n_samples_Y)
+            Kernel k(X, Y)
+
+        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_params)
+            The gradient of the kernel k(X, X) with respect to the
+            hyperparameter of the kernel. Only returned when eval_gradient
+            is True.
+        """
         if Y is None:
             dists = squareform(pdist(X, metric='sqeuclidean'))
             tmp = dists / (2 * self.alpha * self.l ** 2)
@@ -577,6 +634,19 @@ def __call__(self, X, Y=None, eval_gradient=False):
 
 
 class ExpSineSquared(Kernel):
+    """ Exp-Sine-Squared kernel.
+
+    This kernel allows modelling periodic functions.
+
+    Only isotropic variant is supported at the moment.
+
+    Tunable kernel parameters
+    -------------------------
+    l : float > 0
+        The length scale of the kernel.
+    p : float > 0
+        The periodicity of the kernel.
+    """
 
     def __init__(self, param_space=[(1.0,), (1.0,)]):
         self._parse_param_space(param_space)
@@ -591,6 +661,31 @@ def params(self, theta):
         self.p = theta[1]
 
     def __call__(self, X, Y=None, eval_gradient=False):
+        """ Return the kernel k(X, Y) and optionally its gradient.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples_X, n_features)
+            Left argument of the returned kernel k(X, Y)
+
+        Y : array, shape (n_samples_Y, n_features), (optional, default=None)
+            Right argument of the returned kernel k(X, Y). If None, k(X, X)
+            is evaluated instead.
+
+        eval_gradient : bool (optional, default=False)
+            Determines whether the gradient with respect to the kernel
+            hyperparameter is determined. Only supported when Y is None.
+
+        Returns
+        -------
+        K : array, shape (n_samples_X, n_samples_Y)
+            Kernel k(X, Y)
+
+        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_params)
+            The gradient of the kernel k(X, X) with respect to the
+            hyperparameter of the kernel. Only returned when eval_gradient
+            is True.
+        """
         if Y is None:
             dists = squareform(pdist(X, metric='euclidean'))
             arg = np.pi  * dists / self.p
@@ -615,6 +710,16 @@ def __call__(self, X, Y=None, eval_gradient=False):
 
 
 class DotProduct(Kernel):
+    """ Dot-Product kernel.
+
+    This kernel is non-stationary.
+
+    Tunable kernel parameters
+    -------------------------
+    sigma_0 : float >= 0
+        Parameter controlling the inhomogeneity of the kernel. If sigma_0=0,
+        the kernel is homogeneous.
+    """
 
     def __init__(self, param_space=1.0, degree=1):
         self._parse_param_space(param_space)
@@ -629,6 +734,31 @@ def params(self, theta):
         self.sigma_0 = theta[0]
 
     def __call__(self, X, Y=None, eval_gradient=False):
+        """ Return the kernel k(X, Y) and optionally its gradient.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples_X, n_features)
+            Left argument of the returned kernel k(X, Y)
+
+        Y : array, shape (n_samples_Y, n_features), (optional, default=None)
+            Right argument of the returned kernel k(X, Y). If None, k(X, X)
+            is evaluated instead.
+
+        eval_gradient : bool (optional, default=False)
+            Determines whether the gradient with respect to the kernel
+            hyperparameter is determined. Only supported when Y is None.
+
+        Returns
+        -------
+        K : array, shape (n_samples_X, n_samples_Y)
+            Kernel k(X, Y)
+
+        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_params)
+            The gradient of the kernel k(X, X) with respect to the
+            hyperparameter of the kernel. Only returned when eval_gradient
+            is True.
+        """
         if Y is None:
             dot_product = np.inner(X, X)
             K = (dot_product + self.sigma_0 ** 2) ** self.degree

From 63e92de85d4126eac77753b56185f2ee9aa92f45 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 1 Mar 2015 14:10:48 +0100
Subject: [PATCH 031/165] MISC Reorganizing kernel module

---
 sklearn/gaussian_process/gpr.py            |   2 +-
 sklearn/gaussian_process/kernels.py        | 268 ++++++++++-----------
 sklearn/gaussian_process/tests/test_gpr.py |   4 +-
 3 files changed, 137 insertions(+), 137 deletions(-)

diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 568c0ebe2613d..aa9548bad5ac5 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -78,7 +78,7 @@ def obj_func(theta):
                                                          eval_gradient=True)
                 return -lml, -grad
             self.theta_, _, _ = fmin_l_bfgs_b(obj_func, self.kernel.params,
-                                                bounds=self.kernel.bounds)
+                                              bounds=self.kernel.bounds)
             self.kernel.params = self.theta_
         elif self.optimizer is None:
             self.theta_ = self.kernel.params
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 9ed0b3be7f0e5..e3eaa64df1450 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -313,87 +313,6 @@ def __repr__(self):
         return "{0:.3g}".format(self.value)
 
 
-class RBF(Kernel):
-    """ Radial-basis function kernel (aka squared-exponential kernel).
-
-    Both isotropic and anisotropic version are supported.
-
-    Tunable kernel parameters
-    -------------------------
-    l : float or array with shape (n_features,), entries > 0
-        The length scale of the kernel. If a float, an isotropic kernel is
-        used. If an array, an anisotropic kernel is used where each dimension
-        of l defines the length-scale of the respective feature dimension.
-    """
-
-    def __init__(self, param_space=1.0):
-        self._parse_param_space(param_space)
-
-    @property
-    def params(self):
-        return np.asarray(self.l)
-
-    @params.setter
-    def params(self, theta):
-        self.l = theta
-
-    def __call__(self, X, Y=None, eval_gradient=False):
-        """ Return the kernel k(X, Y) and optionally its gradient.
-
-        Parameters
-        ----------
-        X : array, shape (n_samples_X, n_features)
-            Left argument of the returned kernel k(X, Y)
-
-        Y : array, shape (n_samples_Y, n_features), (optional, default=None)
-            Right argument of the returned kernel k(X, Y). If None, k(X, X)
-            if evaluated instead.
-
-        eval_gradient : bool (optional, default=False)
-            Determines whether the gradient with respect to the kernel
-            hyperparameter is determined. Only supported when Y is None.
-
-        Returns
-        -------
-        K : array, shape (n_samples_X, n_samples_Y)
-            Kernel k(X, Y)
-
-        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_params)
-            The gradient of the kernel k(X, X) with repect to the
-            hyperparameter of the kernel. Only returned when eval_gradient
-            is True.
-        """
-        if Y is None:
-            dists = pdist(X / self.l, metric='sqeuclidean')
-            K = np.exp(-.5 * dists)
-            # convert from upper-triangular matrix to square matrix
-            K = squareform(K)
-            np.fill_diagonal(K, 1)
-        else:
-            if eval_gradient:
-                raise ValueError(
-                    "Gradient can only be evaluated when Y is None.")
-            dists = cdist(X / self.l, Y / self.l, metric='sqeuclidean')
-            K = np.exp(-.5 * dists)
-
-        if eval_gradient:
-            if self.l.shape[0] == 1:
-                K_gradient = \
-                    (K * squareform(dists) / self.l[0])[:, :, np.newaxis]
-                return K, K_gradient
-            elif self.l.shape[0] == X.shape[1]:
-                # We need to recompute the pairwise dimension-wise distances
-                D = (X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2 \
-                    / (self.l ** 3)
-                K_gradient = K[..., np.newaxis] * D
-                return K, K_gradient
-            else:
-                raise Exception("Anisotropic kernels require that the number "
-                                "of length scales and features match.")
-        else:
-            return K
-
-
 class WhiteKernel(Kernel):
     """ White kernel.
 
@@ -460,62 +379,29 @@ def __call__(self, X, Y=None, eval_gradient=False):
             return K
 
 
-# adapted from scipy/optimize/optimize.py for functions with 2d output
-def _approx_fprime(xk, f, epsilon, args=()):
-    f0 = f(*((xk,) + args))
-    grad = np.zeros((f0.shape[0], f0.shape[1], len(xk)), float)
-    ei = np.zeros((len(xk), ), float)
-    for k in range(len(xk)):
-        ei[k] = 1.0
-        d = epsilon * ei
-        grad[:, :, k] = (f(*((xk + d,) + args)) - f0) / d[k]
-        ei[k] = 0.0
-    return grad
-
-
-class PairwiseKernel(Kernel):
-    """ Wrapper for kernels in sklearn.metrics.pairwise.
-
-    A thin wrapper around the functionality of the kernels in
-    sklearn.metrics.pairwise.
-
-    Note: Evaluation of eval_gradient is not analytic but numeric and all
-          kernels support only isotropic distances. The parameter gamma is
-          specified via the param_space and may be optimized. The other
-          kernel parameters are set directly  at initialization and are kept
-          fixed.
+class RBF(Kernel):
+    """ Radial-basis function kernel (aka squared-exponential kernel).
 
-    Parameters
-    ----------
-    metric : string, or callable
-        The metric to use when calculating kernel between instances in a
-        feature array. If metric is a string, it must be one of the metrics
-        in pairwise.PAIRWISE_KERNEL_FUNCTIONS.
-        If metric is "precomputed", X is assumed to be a kernel matrix.
-        Alternatively, if metric is a callable function, it is called on each
-        pair of instances (rows) and the resulting value recorded. The callable
-        should take two arrays from X as input and return a value indicating
-        the distance between them.
+    Both isotropic and anisotropic version are supported.
 
-    `**kwds` : optional keyword parameters
-        Any further parameters are passed directly to the kernel function.
+    Tunable kernel parameters
+    -------------------------
+    l : float or array with shape (n_features,), entries > 0
+        The length scale of the kernel. If a float, an isotropic kernel is
+        used. If an array, an anisotropic kernel is used where each dimension
+        of l defines the length-scale of the respective feature dimension.
     """
 
-    def __init__(self, param_space=1.0, metric="linear", **kwargs):
+    def __init__(self, param_space=1.0):
         self._parse_param_space(param_space)
-        self.metric = metric
-        self.kwargs = kwargs
-        if "gamma" in kwargs:
-            raise ValueError(
-                "Gamma must not be set directly but via param_space.")
 
     @property
     def params(self):
-        return np.asarray([self.gamma])
+        return np.asarray(self.l)
 
     @params.setter
     def params(self, theta):
-        self.gamma = theta[0]
+        self.l = theta
 
     def __call__(self, X, Y=None, eval_gradient=False):
         """ Return the kernel k(X, Y) and optionally its gradient.
@@ -543,15 +429,33 @@ def __call__(self, X, Y=None, eval_gradient=False):
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
-        K = pairwise_kernels(X, Y, metric=self.metric, gamma=self.gamma,
-                             filter_params=True, **self.kwargs)
+        if Y is None:
+            dists = pdist(X / self.l, metric='sqeuclidean')
+            K = np.exp(-.5 * dists)
+            # convert from upper-triangular matrix to square matrix
+            K = squareform(K)
+            np.fill_diagonal(K, 1)
+        else:
+            if eval_gradient:
+                raise ValueError(
+                    "Gradient can only be evaluated when Y is None.")
+            dists = cdist(X / self.l, Y / self.l, metric='sqeuclidean')
+            K = np.exp(-.5 * dists)
+
         if eval_gradient:
-            # approximate gradient numerically
-            def f(gamma):  # helper function
-                return pairwise_kernels(
-                    X, Y, metric=self.metric, gamma=gamma,
-                    filter_params=True, **self.kwargs)
-            return K, _approx_fprime(self.params, f, 1e-10)
+            if self.l.shape[0] == 1:
+                K_gradient = \
+                    (K * squareform(dists) / self.l[0])[:, :, np.newaxis]
+                return K, K_gradient
+            elif self.l.shape[0] == X.shape[1]:
+                # We need to recompute the pairwise dimension-wise distances
+                D = (X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2 \
+                    / (self.l ** 3)
+                K_gradient = K[..., np.newaxis] * D
+                return K, K_gradient
+            else:
+                raise Exception("Anisotropic kernels require that the number "
+                                "of length scales and features match.")
         else:
             return K
 
@@ -775,3 +679,99 @@ def __call__(self, X, Y=None, eval_gradient=False):
             return K, K_gradient
         else:
             return K
+
+
+# adapted from scipy/optimize/optimize.py for functions with 2d output
+def _approx_fprime(xk, f, epsilon, args=()):
+    f0 = f(*((xk,) + args))
+    grad = np.zeros((f0.shape[0], f0.shape[1], len(xk)), float)
+    ei = np.zeros((len(xk), ), float)
+    for k in range(len(xk)):
+        ei[k] = 1.0
+        d = epsilon * ei
+        grad[:, :, k] = (f(*((xk + d,) + args)) - f0) / d[k]
+        ei[k] = 0.0
+    return grad
+
+
+class PairwiseKernel(Kernel):
+    """ Wrapper for kernels in sklearn.metrics.pairwise.
+
+    A thin wrapper around the functionality of the kernels in
+    sklearn.metrics.pairwise.
+
+    Note: Evaluation of eval_gradient is not analytic but numeric and all
+          kernels support only isotropic distances. The parameter gamma is
+          specified via the param_space and may be optimized. The other
+          kernel parameters are set directly  at initialization and are kept
+          fixed.
+
+    Parameters
+    ----------
+    metric : string, or callable
+        The metric to use when calculating kernel between instances in a
+        feature array. If metric is a string, it must be one of the metrics
+        in pairwise.PAIRWISE_KERNEL_FUNCTIONS.
+        If metric is "precomputed", X is assumed to be a kernel matrix.
+        Alternatively, if metric is a callable function, it is called on each
+        pair of instances (rows) and the resulting value recorded. The callable
+        should take two arrays from X as input and return a value indicating
+        the distance between them.
+
+    `**kwds` : optional keyword parameters
+        Any further parameters are passed directly to the kernel function.
+    """
+
+    def __init__(self, param_space=1.0, metric="linear", **kwargs):
+        self._parse_param_space(param_space)
+        self.metric = metric
+        self.kwargs = kwargs
+        if "gamma" in kwargs:
+            raise ValueError(
+                "Gamma must not be set directly but via param_space.")
+
+    @property
+    def params(self):
+        return np.asarray([self.gamma])
+
+    @params.setter
+    def params(self, theta):
+        self.gamma = theta[0]
+
+    def __call__(self, X, Y=None, eval_gradient=False):
+        """ Return the kernel k(X, Y) and optionally its gradient.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples_X, n_features)
+            Left argument of the returned kernel k(X, Y)
+
+        Y : array, shape (n_samples_Y, n_features), (optional, default=None)
+            Right argument of the returned kernel k(X, Y). If None, k(X, X)
+            if evaluated instead.
+
+        eval_gradient : bool (optional, default=False)
+            Determines whether the gradient with respect to the kernel
+            hyperparameter is determined. Only supported when Y is None.
+
+        Returns
+        -------
+        K : array, shape (n_samples_X, n_samples_Y)
+            Kernel k(X, Y)
+
+        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_params)
+            The gradient of the kernel k(X, X) with repect to the
+            hyperparameter of the kernel. Only returned when eval_gradient
+            is True.
+        """
+        K = pairwise_kernels(X, Y, metric=self.metric, gamma=self.gamma,
+                             filter_params=True, **self.kwargs)
+        if eval_gradient:
+            # approximate gradient numerically
+            def f(gamma):  # helper function
+                return pairwise_kernels(
+                    X, Y, metric=self.metric, gamma=gamma,
+                    filter_params=True, **self.kwargs)
+            return K, _approx_fprime(self.params, f, 1e-10)
+        else:
+            return K
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index 94a033b7a9496..f9c1a818b4b27 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -52,14 +52,14 @@ def test_lml_improving():
 
 
 def test_converged_to_local_maximum():
-    """ Test that we are in local maximum after hyperparameter-optimization. """
+    """ Test that we are in local maximum after hyperparameter-optimization."""
     for kernel in kernels:
         kernel = deepcopy(kernel)
         gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
 
         lml, lml_gradient = gpr.log_marginal_likelihood(kernel.params, True)
 
-        assert_almost_equal(lml_gradient, 0, 5)
+        assert_almost_equal(lml_gradient, 0, 2)  # XXX: Check why only 2 digits
 
 
 def test_solution_inside_bounds():

From 6be29fa82181f39e1e877970250930c3bd39abe8 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 1 Mar 2015 14:26:32 +0100
Subject: [PATCH 032/165] ENH Adding method is_stationary to GP kernels

---
 sklearn/gaussian_process/kernels.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index e3eaa64df1450..16cbd1478af82 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -108,10 +108,15 @@ def __rmul__(self, b):
     def __repr__(self):
         return "{0}({1})".format(self.__class__.__name__,
                                  ", ".join(map("{0:.3g}".format, self.params)))
+
     @abstractmethod
     def __call__(self, X, Y=None, eval_gradient=False):
         """Evaluate the kernel."""
 
+    def is_stationary(self):
+        """ Returns whether the kernel is stationary. """
+        return True
+
 
 class KernelOperator(Kernel):
     """ Base class for all kernel operators. """
@@ -140,6 +145,10 @@ def bounds(self, bounds):
         self.k1.bounds = bounds[:i]
         self.k2.bounds = bounds[i:]
 
+    def is_stationary(self):
+        """ Returns whether the kernel is stationary. """
+        return self.k1.is_stationary() and self.k2.is_stationary()
+
 
 class Sum(KernelOperator):
     """ Sum-kernel k1 + k2 of two kernels k1 and k2.
@@ -680,6 +689,10 @@ def __call__(self, X, Y=None, eval_gradient=False):
         else:
             return K
 
+    def is_stationary(self):
+        """ Returns whether the kernel is stationary. """
+        return False
+
 
 # adapted from scipy/optimize/optimize.py for functions with 2d output
 def _approx_fprime(xk, f, epsilon, args=()):
@@ -775,3 +788,7 @@ def f(gamma):  # helper function
             return K, _approx_fprime(self.params, f, 1e-10)
         else:
             return K
+
+    def is_stationary(self):
+        """ Returns whether the kernel is stationary. """
+        return self.metric in ["rbf"]

From 57a7b9380fdbbe97edf33543934f6cfc731945cc Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 1 Mar 2015 14:27:00 +0100
Subject: [PATCH 033/165] TST Testing kernel stationarity

---
 sklearn/gaussian_process/tests/test_kernels.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index 52cb472fba17c..e8a19e78febea 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -85,3 +85,12 @@ def test_kernel_anisotropic():
     X2[:, 1] /= 4
     K2 = RBF(0.5)(X2)
     assert_almost_equal(K, K2)
+
+
+def test_kernel_stationary():
+    """ Test stationarity of kernels."""
+    for kernel in kernels:
+        if not kernel.is_stationary():
+            continue
+        K = kernel(X, X + 1)
+        assert_almost_equal(K[0, 0], np.diag(K))

From ac5f635cee600863b780670b7ac2d096612f025b Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 1 Mar 2015 14:44:41 +0100
Subject: [PATCH 034/165] DOC Illustrating different kernels in
 plot_gpr_prior_posterior.py

---
 .../new/plot_gpr_prior_posterior.py           | 102 ++++++++++--------
 1 file changed, 56 insertions(+), 46 deletions(-)

diff --git a/examples/gaussian_process/new/plot_gpr_prior_posterior.py b/examples/gaussian_process/new/plot_gpr_prior_posterior.py
index 9898ed1444c3b..3cd1ed79c99ee 100644
--- a/examples/gaussian_process/new/plot_gpr_prior_posterior.py
+++ b/examples/gaussian_process/new/plot_gpr_prior_posterior.py
@@ -1,7 +1,8 @@
 """Gaussian process regression (GPR) prior and posterior
 
-This example illustrates the prior and posterior of a GPR. Mean, standard
-deviation, and 10 samples are shown for both prior and posterior.
+This example illustrates the prior and posterior of a GPR with different
+kernels. Mean, standard deviation, and 10 samples are shown for both prior
+and posterior.
 """
 print __doc__
 
@@ -14,48 +15,57 @@
 from matplotlib import pyplot as plt
 
 from sklearn.gaussian_process import GaussianProcessRegressor
-from sklearn.gaussian_process.kernels import RBF
-
-
-# Specify Gaussian Process
-kernel = 1.0 * RBF((1e-10, 1.0, None))
-gp = GaussianProcessRegressor(kernel=kernel)
-
-# Plot prior
-plt.figure(0, figsize=(8, 8))
-plt.subplot(2, 1, 1)
-X_ = np.linspace(0, 5, 100)
-y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
-plt.plot(X_, y_mean, 'k', lw=3, zorder=9)
-plt.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)),
-                 y_mean + np.sqrt(np.diag(y_cov)),
-                 alpha=0.5, color='k')
-y_samples = gp.sample(X_[:, np.newaxis], 10)
-plt.plot(X_, y_samples, color='b', lw=1)
-plt.xlim(0, 5)
-plt.ylim(-3, 3)
-plt.title("Prior, kernel:  %s" % kernel)
-
-# Generate data and fit GP
-rng = np.random.RandomState(4)
-X = rng.uniform(0, 5, 10)[:, np.newaxis]
-y = np.sin((X[:, 0] - 2.5) ** 2)
-gp.fit(X, y)
-
-# Plot posterior
-plt.subplot(2, 1, 2)
-X_ = np.linspace(0, 5, 100)
-y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
-plt.plot(X_, y_mean, 'k', lw=3, zorder=9)
-plt.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)),
-                 y_mean + np.sqrt(np.diag(y_cov)),
-                 alpha=0.5, color='k')
-
-y_samples = gp.sample(X_[:, np.newaxis], 10)
-plt.plot(X_, y_samples, color='b', lw=1)
-plt.scatter(X[:, 0], y, c='r', s=50, zorder=10)
-plt.xlim(0, 5)
-plt.ylim(-3, 3)
-plt.title("Posterior, kernel: %s" % kernel)
-plt.tight_layout()
+from sklearn.gaussian_process.kernels \
+    import RBF, RationalQuadratic, ExpSineSquared, DotProduct
+
+
+kernels = [1.0 * RBF((1e-10, 1.0, None)),
+           1.0 * RationalQuadratic(((0.1,), (1.0,))),
+           1.0 * ExpSineSquared(((0.1, 1.0, 10.0), (1.0, 3.0, 10.0))),
+           (0.01, 0.1, 10.0) * DotProduct(((0.0, 1.0, 10.0)), degree=2)]
+
+for fig_index, kernel in enumerate(kernels):
+    if fig_index > 3: continue
+
+    # Specify Gaussian Process
+    gp = GaussianProcessRegressor(kernel=kernel)
+
+    # Plot prior
+    plt.figure(fig_index, figsize=(8, 8))
+    plt.subplot(2, 1, 1)
+    X_ = np.linspace(0, 5, 100)
+    y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
+    plt.plot(X_, y_mean, 'k', lw=3, zorder=9)
+    plt.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)),
+                     y_mean + np.sqrt(np.diag(y_cov)),
+                     alpha=0.5, color='k')
+    y_samples = gp.sample(X_[:, np.newaxis], 10)
+    plt.plot(X_, y_samples, color='b', lw=1)
+    plt.xlim(0, 5)
+    plt.ylim(-3, 3)
+    plt.title("Prior (kernel:  %s)" % kernel)
+
+    # Generate data and fit GP
+    rng = np.random.RandomState(4)
+    X = rng.uniform(0, 5, 10)[:, np.newaxis]
+    y = np.sin((X[:, 0] - 2.5) ** 2)
+    gp.fit(X, y)
+
+    # Plot posterior
+    plt.subplot(2, 1, 2)
+    X_ = np.linspace(0, 5, 100)
+    y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
+    plt.plot(X_, y_mean, 'k', lw=3, zorder=9)
+    plt.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)),
+                     y_mean + np.sqrt(np.diag(y_cov)),
+                     alpha=0.5, color='k')
+
+    y_samples = gp.sample(X_[:, np.newaxis], 10)
+    plt.plot(X_, y_samples, color='b', lw=1)
+    plt.scatter(X[:, 0], y, c='r', s=50, zorder=10)
+    plt.xlim(0, 5)
+    plt.ylim(-3, 3)
+    plt.title("Posterior (kernel: %s)" % kernel)
+    plt.tight_layout()
+
 plt.show()

From 0a98dfc4699f868075ef0a817b2322464fbe57b8 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 1 Mar 2015 14:57:24 +0100
Subject: [PATCH 035/165] FIX test_converged_to_local_maximum deals with
 convergence to boundary

---
 sklearn/gaussian_process/tests/test_gpr.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index f9c1a818b4b27..82ba4337277dd 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -59,7 +59,9 @@ def test_converged_to_local_maximum():
 
         lml, lml_gradient = gpr.log_marginal_likelihood(kernel.params, True)
 
-        assert_almost_equal(lml_gradient, 0, 2)  # XXX: Check why only 2 digits
+        assert_true(np.all(np.isclose(lml_gradient, 0, atol=1e-5)
+                           | (kernel.params == kernel.bounds[:, 0])
+                           | (kernel.params == kernel.bounds[:, 1])))
 
 
 def test_solution_inside_bounds():

From 508867cc1e1877f6a5e98c883daedc723ae87da3 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 1 Mar 2015 17:57:26 +0100
Subject: [PATCH 036/165] ENH More efficient computation of lml-gradient in GPR
 using einsum

---
 sklearn/gaussian_process/gpr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index aa9548bad5ac5..bccfd49d5cab0 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -165,7 +165,7 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
         if eval_gradient:  # compare Equation 5.9 from GPML
             tmp = np.outer(alpha, alpha)
             tmp -= cho_solve((L, True), np.eye(K.shape[0]))
-            gradient = 0.5 * np.trace(tmp.dot(K_gradient))
+            gradient = 0.5 * np.einsum("ij,ijk->k", tmp, K_gradient)
             return log_likelihood, gradient
         else:
             return log_likelihood

From da8bdc425a4106df24643764187057e86b97a412 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Mon, 2 Mar 2015 19:27:52 +0100
Subject: [PATCH 037/165] FIX GPs do not modify kernel attribute but store
 optimized kernels in kernel_

---
 examples/gaussian_process/new/plot_gpc.py     | 11 +++---
 examples/gaussian_process/new/plot_gpr_co2.py |  4 +-
 .../gaussian_process/new/plot_gpr_noisy.py    |  8 ++--
 sklearn/gaussian_process/gpc.py               | 26 ++++++++-----
 sklearn/gaussian_process/gpr.py               | 27 +++++++------
 sklearn/gaussian_process/tests/test_gpc.py    | 18 +++------
 sklearn/gaussian_process/tests/test_gpr.py    | 39 +++++++------------
 7 files changed, 62 insertions(+), 71 deletions(-)

diff --git a/examples/gaussian_process/new/plot_gpc.py b/examples/gaussian_process/new/plot_gpc.py
index 200da2e29acc9..fdce10096763a 100644
--- a/examples/gaussian_process/new/plot_gpc.py
+++ b/examples/gaussian_process/new/plot_gpc.py
@@ -25,11 +25,10 @@
 y = np.array(np.sin((X[:, 0] - 2.5) ** 2) > 0.0, dtype=int)
 
 # Specify Gaussian Processes with fixed and optimized hyperparameters
-kernel_fix = 1.0 * RBF(param_space=1.0)
-gp_fix = GaussianProcessClassifier(kernel=kernel_fix, optimizer=None).fit(X, y)
+gp_fix = GaussianProcessClassifier(kernel=1.0 * RBF(1.0),
+								   optimizer=None).fit(X, y)
 
-kernel_opt = 1.0 * RBF(1.0)
-gp_opt = GaussianProcessClassifier(kernel=kernel_opt).fit(X, y)
+gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(1.0)).fit(X, y)
 
 print "Log Marginal Likelihood (initial): %.3f" % \
     gp_fix.log_marginal_likelihood(gp_fix.theta_)
@@ -42,9 +41,9 @@
 plt.scatter(X[:, 0], y)
 X_ = np.linspace(0, 5, 100)
 plt.plot(X_, gp_fix.predict_proba(X_[:, np.newaxis]), 'r',
-         label="Initial kernel: %s" % kernel_fix)
+         label="Initial kernel: %s" % gp_fix.kernel_)
 plt.plot(X_, gp_opt.predict_proba(X_[:, np.newaxis]), 'b',
-         label="Optimized kernel: %s" % kernel_opt)
+         label="Optimized kernel: %s" % gp_opt.kernel_)
 plt.legend(loc="best")
 plt.xlabel("Feature")
 plt.ylabel("Class")
diff --git a/examples/gaussian_process/new/plot_gpr_co2.py b/examples/gaussian_process/new/plot_gpr_co2.py
index 439941b2b8218..12c5eb1636e72 100644
--- a/examples/gaussian_process/new/plot_gpr_co2.py
+++ b/examples/gaussian_process/new/plot_gpr_co2.py
@@ -29,7 +29,7 @@
 gp = GaussianProcessRegressor(kernel=kernel_gpml, y_err=0, optimizer=None)
 gp.fit(X, y - y_mean)
 
-print "GPML kernel: %s" % kernel_gpml
+print "GPML kernel: %s" % gp.kernel_
 print "Log-marginal-likelihood: %.3f" % gp.log_marginal_likelihood(gp.theta_)
 
 # Kernel with optimized parameters
@@ -42,7 +42,7 @@
 gp = GaussianProcessRegressor(kernel=kernel, y_err=0)
 gp.fit(X, y - y_mean)
 
-print "\nLearned kernel: %s" % kernel
+print "\nLearned kernel: %s" % gp.kernel_
 print "Log-marginal-likelihood: %.3f" % gp.log_marginal_likelihood(gp.theta_)
 
 X_ = np.linspace(X.min(), X.max() + 30, 1000)[:, np.newaxis]
diff --git a/examples/gaussian_process/new/plot_gpr_noisy.py b/examples/gaussian_process/new/plot_gpr_noisy.py
index e732b65494da7..da999115c321f 100644
--- a/examples/gaussian_process/new/plot_gpr_noisy.py
+++ b/examples/gaussian_process/new/plot_gpr_noisy.py
@@ -34,7 +34,6 @@
 # First run
 plt.figure(0)
 kernel = 1.0 * RBF(100.0) + WhiteKernel((1e-10, 1e-5, 1e+1))
-kernel_str = str(kernel)
 gp = GaussianProcessRegressor(kernel=kernel,
                               y_err=0.0).fit(X, y)
 X_ = np.linspace(0, 5, 100)
@@ -46,13 +45,13 @@
 plt.plot(X_, 0.5*np.sin(3*X_), 'r', lw=3, zorder=9)
 plt.scatter(X[:, 0], y, c='r', s=50, zorder=10)
 plt.title("Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s"
-          % (kernel_str, kernel, gp.log_marginal_likelihood(kernel.params)))
+          % (kernel, gp.kernel_,
+             gp.log_marginal_likelihood(gp.kernel_.params)))
 plt.tight_layout()
 
 # First run
 plt.figure(1)
 kernel = 1.0 * RBF(1.0) + WhiteKernel((1e-10, 1e-5, 1e+1))
-kernel_str = str(kernel)
 gp = GaussianProcessRegressor(kernel=kernel,
                               y_err=0.0).fit(X, y)
 X_ = np.linspace(0, 5, 100)
@@ -64,7 +63,8 @@
 plt.plot(X_, 0.5*np.sin(3*X_), 'r', lw=3, zorder=9)
 plt.scatter(X[:, 0], y, c='r', s=50, zorder=10)
 plt.title("Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s"
-          % (kernel_str, kernel, gp.log_marginal_likelihood(kernel.params)))
+          % (kernel, gp.kernel_,
+             gp.log_marginal_likelihood(gp.kernel_.params)))
 plt.tight_layout()
 
 # Plot LML landscape
diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index f95ca22d9ac47..6fcf21ff3b359 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -40,12 +40,18 @@ class GaussianProcessClassifier(BaseEstimator):
       * binary classification
     """
 
-    def __init__(self, kernel=RBF(), jitter=0.0, optimizer="fmin_l_bfgs_b"):
+    def __init__(self, kernel=None, jitter=0.0, optimizer="fmin_l_bfgs_b"):
         self.kernel = kernel
         self.jitter = jitter
         self.optimizer = optimizer
 
     def fit(self, X, y):
+        if self.kernel is None:  # Use an RBF kernel as default
+            self.kernel_ = RBF()
+        else:
+            import copy  # XXX
+            self.kernel_ = copy.deepcopy(self.kernel)
+
         X, y = check_X_y(X, y)
 
         # XXX: Assert that y is binary and labels are {0, 1}
@@ -59,17 +65,17 @@ def obj_func(theta):
                 lml, grad = self.log_marginal_likelihood(theta,
                                                          eval_gradient=True)
                 return -lml, -grad
-            self.theta_, _, _ = fmin_l_bfgs_b(obj_func, self.kernel.params,
-                                              bounds=self.kernel.bounds)
-            self.kernel.params = self.theta_
+            self.theta_, _, _ = fmin_l_bfgs_b(obj_func, self.kernel_.params,
+                                              bounds=self.kernel_.bounds)
+            self.kernel_.params = self.theta_
         elif self.optimizer is None:
-            self.theta_ = self.kernel.params
+            self.theta_ = self.kernel_.params
         else:
             raise ValueError("Unknown optimizer %s." % self.optimizer)
 
         # Precompute quantities required for predictions which are independent
         # of actual query points
-        self.K_ = self.kernel(self.X_fit_)
+        self.K_ = self.kernel_(self.X_fit_)
         self.K_[np.diag_indices_from(self.K_)] += self.jitter
 
         self.f_, _, (self.pi, self.W_sr, self.L, _, _) = \
@@ -85,7 +91,7 @@ def predict(self, X):
         # decisions, it is enough to compute the MAP of the posterior and
         # pass it through the link function
         K_star = \
-            self.kernel(self.X_fit_, X)  # K_star =k(x_star)
+            self.kernel_(self.X_fit_, X)  # K_star =k(x_star)
         f_star = K_star.T.dot(self.y_fit_ - self.pi)  # Line 4 (Algorithm 3.2)
 
         return f_star > 0
@@ -95,10 +101,10 @@ def predict_proba(self, X):
         X = check_array(X)
 
         # Based on Algorithm 3.2 of GPML
-        K_star = self.kernel(self.X_fit_, X)  # K_star =k(x_star)
+        K_star = self.kernel_(self.X_fit_, X)  # K_star =k(x_star)
         f_star = K_star.T.dot(self.y_fit_ - self.pi)  # Line 4
         v = solve(self.L, self.W_sr[:, np.newaxis] * K_star)  # Line 5
-        var_f_star = self.kernel(X) - v.T.dot(v)  # Line 6
+        var_f_star = self.kernel_(X) - v.T.dot(v)  # Line 6
 
         # Line 7:
         # Approximate \int log(z) * N(z | f_star, var_f_star)
@@ -120,7 +126,7 @@ def predict_proba(self, X):
 
     def log_marginal_likelihood(self, theta, eval_gradient=False):
         import copy  # XXX
-        kernel = copy.deepcopy(self.kernel)
+        kernel = copy.deepcopy(self.kernel_)
         kernel.params = theta
 
         if eval_gradient:
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index bccfd49d5cab0..e58d22d6ab764 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -59,12 +59,18 @@ class GaussianProcessRegressor(BaseEstimator):
     alpha_:
     """
 
-    def __init__(self, kernel=RBF(), y_err=1e-10, optimizer="fmin_l_bfgs_b"):
+    def __init__(self, kernel=None, y_err=1e-10, optimizer="fmin_l_bfgs_b"):
         self.kernel = kernel
         self.y_err = y_err
         self.optimizer = optimizer
 
     def fit(self, X, y):
+        if self.kernel is None:  # Use an RBF kernel as default
+            self.kernel_ = RBF()
+        else:
+            import copy  # XXX
+            self.kernel_ = copy.deepcopy(self.kernel)
+
         X, y = check_X_y(X, y)
 
         self.X_fit_ = X
@@ -77,17 +83,17 @@ def obj_func(theta):
                 lml, grad = self.log_marginal_likelihood(theta,
                                                          eval_gradient=True)
                 return -lml, -grad
-            self.theta_, _, _ = fmin_l_bfgs_b(obj_func, self.kernel.params,
-                                              bounds=self.kernel.bounds)
-            self.kernel.params = self.theta_
+            self.theta_, _, _ = fmin_l_bfgs_b(obj_func, self.kernel_.params,
+                                              bounds=self.kernel_.bounds)
+            self.kernel_.params = self.theta_
         elif self.optimizer is None:
-            self.theta_ = self.kernel.params
+            self.theta_ = self.kernel_.params
         else:
             raise ValueError("Unknown optimizer %s." % self.optimizer)
 
         # Precompute quantities required for predictions which are independent
         # of actual query points
-        K = self.kernel(self.X_fit_)
+        K = self.kernel_(self.X_fit_)
         K[np.diag_indices_from(K)] += self.y_err
         self.L_ = cholesky(K, lower=True)  # Line 2
         self.alpha_ = cho_solve((self.L_, True), self.y_fit_)  # Line 3
@@ -114,18 +120,17 @@ def predict(self, X, return_std=False, return_cov=False):
             else:
                 return y_mean
         else:  # Predict based on GP posterior
-            K_trans = self.kernel(X, self.X_fit_)
+            K_trans = self.kernel_(X, self.X_fit_)
             y_mean = K_trans.dot(self.alpha_)  # Line 4 (y_mean = f_star)
             if return_cov:
                 v = cho_solve((self.L_, True), K_trans.T)  # Line 5
                 y_cov = \
-                    self.kernel(X) - K_trans.dot(v)  # Line 6
+                    self.kernel_(X) - K_trans.dot(v)  # Line 6
                 return y_mean, y_cov
             elif return_std:
                 # XXX: Compute y_std more efficiently
                 v = cho_solve((self.L_, True), K_trans.T)  # Line 5
-                y_cov = \
-                    self.kernel(X) - K_trans.dot(v)  # Line 6
+                y_cov = self.kernel_(X) - K_trans.dot(v)  # Line 6
                 y_std = np.sqrt(np.diag(y_cov))
                 return y_mean, y_std
             else:
@@ -140,7 +145,7 @@ def sample(self, X, n_samples=1, random_state=0):
 
     def log_marginal_likelihood(self, theta, eval_gradient=False):
         import copy  # XXX: Avoid deepcopy
-        kernel = copy.deepcopy(self.kernel)
+        kernel = copy.deepcopy(self.kernel_)
         kernel.params = theta
 
         if eval_gradient:
diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py
index 088166c25ae3d..ccaa6679d0ee8 100644
--- a/sklearn/gaussian_process/tests/test_gpc.py
+++ b/sklearn/gaussian_process/tests/test_gpc.py
@@ -3,8 +3,6 @@
 # Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
 # Licence: BSD 3 clause
 
-from copy import deepcopy
-
 import numpy as np
 
 from scipy.optimize import approx_fprime
@@ -31,7 +29,6 @@ def test_predict_consistent():
     """ Check binary predict decision has also predicted probability above 0.5.
     """
     for kernel in kernels:
-        kernel = deepcopy(kernel)
         gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
         assert_array_equal(gpc.predict(X),
                            gpc.predict_proba(X) >=0.5)
@@ -40,20 +37,17 @@ def test_predict_consistent():
 def test_lml_improving():
     """ Test that hyperparameter-tuning improves log-marginal likelihood. """
     for kernel in kernels:
-        kernel = deepcopy(kernel)
-        params_initial = kernel.params
         gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
-        assert_greater(gpc.log_marginal_likelihood(kernel.params),
-                       gpc.log_marginal_likelihood(params_initial))
+        assert_greater(gpc.log_marginal_likelihood(gpc.kernel_.params),
+                       gpc.log_marginal_likelihood(kernel.params))
 
 
 def test_converged_to_local_maximum():
     """ Test that we are in local maximum after hyperparameter-optimization. """
     for kernel in kernels:
-        kernel = deepcopy(kernel)
         gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
 
-        lml, lml_gradient = gpc.log_marginal_likelihood(kernel.params, True)
+        lml, lml_gradient = gpc.log_marginal_likelihood(gpc.kernel_.params, True)
 
         assert_almost_equal(lml_gradient, 0, 2)
 
@@ -61,13 +55,11 @@ def test_converged_to_local_maximum():
 def test_lml_gradient():
     """ Compare analytic and numeric gradient of log marginal likelihood. """
     for kernel in kernels:
-        kernel = deepcopy(kernel)
-        params = kernel.params
         gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
 
-        lml, lml_gradient = gpc.log_marginal_likelihood(params, True)
+        lml, lml_gradient = gpc.log_marginal_likelihood(kernel.params, True)
         lml_gradient_approx = \
-            approx_fprime(params,
+            approx_fprime(kernel.params,
                           lambda theta: gpc.log_marginal_likelihood(theta,
                                                                     False),
                           1e-10)
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index 82ba4337277dd..7f38cf918eef3 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -3,8 +3,6 @@
 # Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
 # Licence: BSD 3 clause
 
-from copy import deepcopy
-
 import numpy as np
 
 from scipy.optimize import approx_fprime
@@ -33,7 +31,6 @@ def f(x):
 def test_gpr_interpolation():
     """Test the interpolating property for different kernels."""
     for kernel in kernels:
-        kernel = deepcopy(kernel)
         gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
         y_pred, y_cov = gpr.predict(X, return_cov=True)
 
@@ -44,51 +41,46 @@ def test_gpr_interpolation():
 def test_lml_improving():
     """ Test that hyperparameter-tuning improves log-marginal likelihood. """
     for kernel in kernels:
-        kernel = deepcopy(kernel)
-        params_initial = kernel.params
         gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
-        assert_greater(gpr.log_marginal_likelihood(kernel.params),
-                       gpr.log_marginal_likelihood(params_initial))
+        assert_greater(gpr.log_marginal_likelihood(gpr.kernel_.params),
+                       gpr.log_marginal_likelihood(kernel.params))
 
 
 def test_converged_to_local_maximum():
     """ Test that we are in local maximum after hyperparameter-optimization."""
     for kernel in kernels:
-        kernel = deepcopy(kernel)
         gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
 
-        lml, lml_gradient = gpr.log_marginal_likelihood(kernel.params, True)
+        lml, lml_gradient = \
+            gpr.log_marginal_likelihood(gpr.kernel_.params, True)
 
         assert_true(np.all(np.isclose(lml_gradient, 0, atol=1e-5)
-                           | (kernel.params == kernel.bounds[:, 0])
-                           | (kernel.params == kernel.bounds[:, 1])))
+                           | (gpr.kernel_.params == gpr.kernel_.bounds[:, 0])
+                           | (gpr.kernel_.params == gpr.kernel_.bounds[:, 1])))
 
 
 def test_solution_inside_bounds():
     """ Test that hyperparameter-optimization remains in bounds"""
     for kernel in kernels:
-        kernel = deepcopy(kernel)
         gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
 
-        bounds = kernel.bounds
+        bounds = gpr.kernel_.bounds
         max_ = np.finfo(bounds.dtype).max
         tiny = np.finfo(bounds.dtype).tiny
         bounds[~np.isfinite(bounds[:, 1]), 1] = max_
 
-        assert_array_less(bounds[:, 0], kernel.params + tiny)
-        assert_array_less(kernel.params, bounds[:, 1] + tiny)
+        assert_array_less(bounds[:, 0], gpr.kernel_.params + tiny)
+        assert_array_less(gpr.kernel_.params, bounds[:, 1] + tiny)
 
 
 def test_lml_gradient():
     """ Compare analytic and numeric gradient of log marginal likelihood. """
     for kernel in kernels:
-        kernel = deepcopy(kernel)
-        params = kernel.params
         gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
 
-        lml, lml_gradient = gpr.log_marginal_likelihood(params, True)
+        lml, lml_gradient = gpr.log_marginal_likelihood(kernel.params, True)
         lml_gradient_approx = \
-            approx_fprime(params,
+            approx_fprime(kernel.params,
                           lambda theta: gpr.log_marginal_likelihood(theta,
                                                                     False),
                           1e-10)
@@ -99,13 +91,12 @@ def test_lml_gradient():
 def test_prior():
     """ Test that GP prior has mean 0 and identical variances."""
     for kernel in kernels:
-        kernel = deepcopy(kernel)
         gpr = GaussianProcessRegressor(kernel=kernel)
 
         y_mean, y_cov = gpr.predict(X, return_cov=True)
 
         assert_almost_equal(y_mean, 0, 5)
-        if len(kernel.params) > 1:
+        if len(gpr.kernel.params) > 1:
             # XXX: quite hacky, works only for current kernels
             assert_almost_equal(np.diag(y_cov), kernel.params[0] , 5)
         else:
@@ -115,9 +106,7 @@ def test_prior():
 def test_sample_statistics():
     """ Test that statistics of samples drawn from GP are correct."""
     for kernel in kernels:
-        kernel = deepcopy(kernel)
-        gpr = GaussianProcessRegressor(kernel=kernel).fit(
-            X, y)
+        gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
 
         y_mean, y_cov = gpr.predict(X2, return_cov=True)
 
@@ -133,5 +122,5 @@ def test_no_optimizer():
     """ Test that kernel parameters are unmodified when optimizer is None."""
     kernel = RBF(1.0)
     gpr = GaussianProcessRegressor(kernel=kernel, optimizer=None).fit(X, y)
-    assert_equal(kernel.params, 1.0)
+    assert_equal(gpr.kernel_.params, 1.0)
     assert_equal(gpr.theta_, 1.0)

From e34fa32f579ea3e7b469c97b8028154bb15f96ec Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sat, 7 Mar 2015 20:00:27 +0100
Subject: [PATCH 038/165] REFACTOR GP kernels use separate specification of
 bounds and params (no joint param_space)

---
 examples/gaussian_process/new/plot_gpr_co2.py |  10 +-
 .../gaussian_process/new/plot_gpr_noisy.py    |   4 +-
 .../new/plot_gpr_prior_posterior.py           |  10 +-
 sklearn/gaussian_process/kernels.py           | 108 +++++++-----------
 sklearn/gaussian_process/tests/test_gpc.py    |   4 +-
 sklearn/gaussian_process/tests/test_gpr.py    |  12 +-
 .../gaussian_process/tests/test_kernels.py    |   8 +-
 7 files changed, 68 insertions(+), 88 deletions(-)

diff --git a/examples/gaussian_process/new/plot_gpr_co2.py b/examples/gaussian_process/new/plot_gpr_co2.py
index 12c5eb1636e72..ab6f40877185f 100644
--- a/examples/gaussian_process/new/plot_gpr_co2.py
+++ b/examples/gaussian_process/new/plot_gpr_co2.py
@@ -21,8 +21,8 @@
 
 # Kernel with parameters given in GPML book
 k1 = 66.0**2 * RBF(67.0)  # long term smooth rising trend
-k2 = 2.4**2 * RBF(90) * ExpSineSquared(((1.3,), (1.0,)))  # seasonal component
-k3 = 0.66**2 * RationalQuadratic(((0.78,), (1.2,)))  # medium term irregularit.
+k2 = 2.4**2 * RBF(90.0) * ExpSineSquared((1.3, 1.0))  # seasonal component
+k3 = 0.66**2 * RationalQuadratic((0.78, 1.2))  # medium term irregularit.
 k4 = 0.18**2 * RBF(0.134) + WhiteKernel(0.19**2) # noise terms
 kernel_gpml = k1 + k2 + k3 + k4
 
@@ -34,9 +34,9 @@
 
 # Kernel with optimized parameters
 k1 = 50.0**2 * RBF(50.0) # long term smooth rising trend
-k2 = 2.0**2 * RBF(100) * ExpSineSquared(((1.0,), (1.0,))) # seasonal component
-k3 = 0.5**2 * RationalQuadratic(((1.0,), (1.0,))) # medium term irregularities
-k4 = 0.1**2 * RBF(0.1) + WhiteKernel((1e-3, 0.1**2, np.inf))  # noise terms
+k2 = 2.0**2 * RBF(100.0) * ExpSineSquared((1.0, 1.0)) # seasonal component
+k3 = 0.5**2 * RationalQuadratic((1.0, 1.0)) # medium term irregularities
+k4 = 0.1**2 * RBF(0.1) + WhiteKernel(0.1**2, 1e-3, np.inf)  # noise terms
 kernel = k1 + k2 + k3 + k4
 
 gp = GaussianProcessRegressor(kernel=kernel, y_err=0)
diff --git a/examples/gaussian_process/new/plot_gpr_noisy.py b/examples/gaussian_process/new/plot_gpr_noisy.py
index da999115c321f..2c527ab511458 100644
--- a/examples/gaussian_process/new/plot_gpr_noisy.py
+++ b/examples/gaussian_process/new/plot_gpr_noisy.py
@@ -33,7 +33,7 @@
 
 # First run
 plt.figure(0)
-kernel = 1.0 * RBF(100.0) + WhiteKernel((1e-10, 1e-5, 1e+1))
+kernel = 1.0 * RBF(100.0) + WhiteKernel(1e-5, 1e-10, 1e+1)
 gp = GaussianProcessRegressor(kernel=kernel,
                               y_err=0.0).fit(X, y)
 X_ = np.linspace(0, 5, 100)
@@ -51,7 +51,7 @@
 
 # First run
 plt.figure(1)
-kernel = 1.0 * RBF(1.0) + WhiteKernel((1e-10, 1e-5, 1e+1))
+kernel = 1.0 * RBF(1.0) + WhiteKernel(1e-5, 1e-10, 1e+1)
 gp = GaussianProcessRegressor(kernel=kernel,
                               y_err=0.0).fit(X, y)
 X_ = np.linspace(0, 5, 100)
diff --git a/examples/gaussian_process/new/plot_gpr_prior_posterior.py b/examples/gaussian_process/new/plot_gpr_prior_posterior.py
index 3cd1ed79c99ee..577ec914f3993 100644
--- a/examples/gaussian_process/new/plot_gpr_prior_posterior.py
+++ b/examples/gaussian_process/new/plot_gpr_prior_posterior.py
@@ -19,10 +19,10 @@
     import RBF, RationalQuadratic, ExpSineSquared, DotProduct
 
 
-kernels = [1.0 * RBF((1e-10, 1.0, None)),
-           1.0 * RationalQuadratic(((0.1,), (1.0,))),
-           1.0 * ExpSineSquared(((0.1, 1.0, 10.0), (1.0, 3.0, 10.0))),
-           (0.01, 0.1, 10.0) * DotProduct(((0.0, 1.0, 10.0)), degree=2)]
+kernels = [1.0 * RBF(1.0, 1e-1, 10.0),
+           1.0 * RationalQuadratic((0.1, 1.0)),
+           1.0 * ExpSineSquared((1.0, 3.0), (0.1, 1.0), (10.0, 10.0)),
+           (0.01, 0.1, 10.0) * DotProduct(1.0, 0.0, 10.0, degree=2)]
 
 for fig_index, kernel in enumerate(kernels):
     if fig_index > 3: continue
@@ -65,7 +65,7 @@
     plt.scatter(X[:, 0], y, c='r', s=50, zorder=10)
     plt.xlim(0, 5)
     plt.ylim(-3, 3)
-    plt.title("Posterior (kernel: %s)" % kernel)
+    plt.title("Posterior (kernel: %s)" % gp.kernel_)
     plt.tight_layout()
 
 plt.show()
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 16cbd1478af82..bcf2329acbd63 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -33,43 +33,12 @@
 class Kernel(six.with_metaclass(ABCMeta)):
     """ Base class for all kernels."""
 
-    def _parse_param_space(self, param_space):
-        if not np.iterable(param_space):
-            self.params = np.array([float(param_space)])
-            # No custom bounds specified; use default bounds
-            default_bounds = np.empty((self.params.shape[0], 2),
-                                      dtype=self.params.dtype)
-            default_bounds[:, 0] = 1e-5
-            default_bounds[:, 1] = np.inf
-            self.bounds = default_bounds
-            return
-
-        param_space = np.atleast_2d(param_space)
-        if param_space.shape[1] == 1:
-            self.params = param_space[:, 0]
-            # No custom bounds specified; use default bounds
-            default_bounds = np.empty((self.params.shape[0], 2),
-                                      dtype=self.params.dtype)
-            default_bounds[:, 0] = 1e-5
-            default_bounds[:, 1] = np.inf
-            self.bounds = default_bounds
-        elif param_space.shape[1] == 2:
-            # lower + upper bound for hyperparameter
-            self.bounds = param_space
-            # Use geometric mean of upper and lower boundary as initial
-            # hyperparameter value
-            if np.any(np.equal(self.l_bound, np.inf)) \
-               or np.any(np.equal(self.u_bound, np.inf)):
-                raise ValueError("Lower or upper bound being None requires "
-                                 "explicitly specifying the initial value.")
-            self.params = np.array([np.sqrt(self.l_bound * self.u_bound)])
-        elif param_space.shape[1] == 3:
-            # lower bound, initial value, upper bound
-            self.params = param_space[:, 1]
-            self.bounds = param_space[:, [0, 2]]
-        else:
-            raise ValueError("Invalid parameter space given. Must not have "
-                             "more than 3 entries per parameter.")
+    def __init__(self, theta=1.0, thetaL=1e-5, thetaU=np.inf):
+        if not np.iterable(theta):
+            theta = np.array([theta])
+        self.params = np.asarray(theta, dtype=np.float)
+        self.bounds = (np.asarray(thetaL, dtype=np.float),
+                       np.asarray(thetaU, dtype=np.float))
 
     @property
     def n_params(self):
@@ -81,28 +50,30 @@ def bounds(self):
 
     @bounds.setter
     def bounds(self, bounds):
-        bounds = bounds.reshape(-1, 2)
-        self.l_bound = bounds[:, 0]
-        self.u_bound = bounds[:, 1]
+        self.l_bound, self.u_bound = bounds
+        if not np.iterable(self.l_bound):
+             self.l_bound = np.full_like(self.params,  self.l_bound)
+        if not np.iterable(self.u_bound):
+             self.u_bound = np.full_like(self.params,  self.u_bound)
 
     def __add__(self, b):
         if not isinstance(b, Kernel):
-            return Sum(self, ConstantKernel(b))
+            return Sum(self, ConstantKernel.from_literal(b))
         return Sum(self, b)
 
     def __radd__(self, b):
         if not isinstance(b, Kernel):
-            return Sum(ConstantKernel(b), self)
+            return Sum(ConstantKernel.from_literal(b), self)
         return Sum(b, self)
 
     def __mul__(self, b):
         if not isinstance(b, Kernel):
-            return Product(self, ConstantKernel(b))
+            return Product(self, ConstantKernel.from_literal(b))
         return Product(self, b)
 
     def __rmul__(self, b):
         if not isinstance(b, Kernel):
-            return Product(ConstantKernel(b), self)
+            return Product(ConstantKernel.from_literal(b), self)
         return Product(b, self)
 
     def __repr__(self):
@@ -266,11 +237,24 @@ class ConstantKernel(Kernel):
     -------------------------
     value : float
         The constant value used for determining the magnitude (product-kernel)
-        or offset of mean (sum-lernel).
+        or offset of mean (sum-kernel).
     """
 
-    def __init__(self, param_space=1.0):
-        self._parse_param_space(param_space)
+    @classmethod
+    def from_literal(cls, literal):
+        if np.iterable(literal):
+            if len(literal) == 1:
+                return cls(literal[0])
+            elif len(literal) == 2:
+                return cls((literal[0] + literal[1]) / 2, literal[0],
+                            literal[1])
+            elif len(literal) == 3:
+                return cls(literal[1], literal[0], literal[2])
+            else:
+                raise ValueError("Cannot interpret literal %s for "
+                                 "ConstantKernel." % literal)
+        else:
+            return cls(literal)
 
     @property
     def params(self):
@@ -278,7 +262,9 @@ def params(self):
 
     @params.setter
     def params(self, theta):
-        assert len(theta) == 1
+        if len(theta) != 1:
+            raise ValueError("theta has not the correct number of entries."
+                             " Should be 1; given are %d" % len(theta))
         self.value = theta[0]
 
     def __call__(self, X, Y=None, eval_gradient=False):
@@ -335,9 +321,6 @@ class WhiteKernel(Kernel):
         Parameter controlling the noise level.
     """
 
-    def __init__(self, param_space=1.0):
-        self._parse_param_space(param_space)
-
     @property
     def params(self):
         return np.asarray([self.c])
@@ -401,9 +384,6 @@ class RBF(Kernel):
         of l defines the length-scale of the respective feature dimension.
     """
 
-    def __init__(self, param_space=1.0):
-        self._parse_param_space(param_space)
-
     @property
     def params(self):
         return np.asarray(self.l)
@@ -454,7 +434,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
         if eval_gradient:
             if self.l.shape[0] == 1:
                 K_gradient = \
-                    (K * squareform(dists) / self.l[0])[:, :, np.newaxis]
+                    (K * squareform(dists) / self.l)[:, :, np.newaxis]
                 return K, K_gradient
             elif self.l.shape[0] == X.shape[1]:
                 # We need to recompute the pairwise dimension-wise distances
@@ -485,9 +465,8 @@ class RationalQuadratic(Kernel):
         The length scale of the kernel.
     """
 
-    def __init__(self, param_space=[(1.0,), (1.0,)]):
-        self._parse_param_space(param_space)
-
+    def __init__(self, theta=[1.0, 1.0], thetaL=1e-5, thetaU=np.inf):
+        super(RationalQuadratic, self).__init__(theta, thetaL, thetaU)
 
     @property
     def params(self):
@@ -561,8 +540,8 @@ class ExpSineSquared(Kernel):
         The periodicity of the kernel.
     """
 
-    def __init__(self, param_space=[(1.0,), (1.0,)]):
-        self._parse_param_space(param_space)
+    def __init__(self, theta=[1.0, 1.0], thetaL=1e-5, thetaU=np.inf):
+        super(ExpSineSquared, self).__init__(theta, thetaL, thetaU)
 
     @property
     def params(self):
@@ -634,8 +613,8 @@ class DotProduct(Kernel):
         the kernel is homogenous.
     """
 
-    def __init__(self, param_space=1.0, degree=1):
-        self._parse_param_space(param_space)
+    def __init__(self, theta=[1.0, 1.0], thetaL=1e-5, thetaU=np.inf, degree=1):
+        super(DotProduct, self).__init__(theta, thetaL, thetaU)
         self.degree = degree
 
     @property
@@ -735,8 +714,9 @@ class PairwiseKernel(Kernel):
         Any further parameters are passed directly to the kernel function.
     """
 
-    def __init__(self, param_space=1.0, metric="linear", **kwargs):
-        self._parse_param_space(param_space)
+    def __init__(self, theta=1.0, thetaL=1e-5, thetaU=np.inf, metric="linear",
+                 **kwargs):
+        super(PairwiseKernel, self).__init__(theta, thetaL, thetaU)
         self.metric = metric
         self.kwargs = kwargs
         if "gamma" in kwargs:
diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py
index ccaa6679d0ee8..26c6aed68b62b 100644
--- a/sklearn/gaussian_process/tests/test_gpc.py
+++ b/sklearn/gaussian_process/tests/test_gpc.py
@@ -21,8 +21,8 @@ def f(x):
 y = np.array(f(X).ravel() > 0, dtype=int)
 
 
-kernels = [RBF(0.1), RBF((1e-3, 1.0, 1e3)),
-           (1e-2, 1.0, 1e2) * RBF((1e-3, 0.1, 1e3))]
+kernels = [RBF(0.1), RBF(1.0, 1e-3, 1e3),
+           (1e-2, 1.0, 1e2) *  RBF(1.0, 1e-3, 1e3)]
 
 
 def test_predict_consistent():
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index 7f38cf918eef3..43032a44dc4ce 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -22,10 +22,10 @@ def f(x):
 y = f(X).ravel()
 
 
-kernels = [RBF(1.0), RBF((1e-3, 1.0, 1e3)),
-           (1e-2, 1.0, 1e2) * RBF((1e-3, 0.1, 1e3)),
-           (1e-2, 1.0, 1e2) * RBF((1e-3, 0.1, 1e3)) + (0.0, 0.0, 1e2),
-           (1e-2, 0.1, 1e2) * RBF((1e-3, 0.1, 1e3)) + (0.0, 0.0, 1e2)]
+kernels = [RBF(1.0), RBF(1.0, 1e-3, 1e3),
+           (1e-2, 1.0, 1e2) * RBF(1.0, 1e-3, 1e3),
+           (1e-2, 1.0, 1e2) * RBF(1.0, 1e-3, 1e3) + (0.0, 0.0, 1e2),
+           (1e-2, 0.1, 1e2) * RBF(1.0, 1e-3, 1e3) + (0.0, 0.0, 1e2)]
 
 
 def test_gpr_interpolation():
@@ -65,8 +65,8 @@ def test_solution_inside_bounds():
         gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
 
         bounds = gpr.kernel_.bounds
-        max_ = np.finfo(bounds.dtype).max
-        tiny = np.finfo(bounds.dtype).tiny
+        max_ = np.finfo(gpr.kernel_.params.dtype).max
+        tiny = 1e-10
         bounds[~np.isfinite(bounds[:, 1]), 1] = max_
 
         assert_array_less(bounds[:, 0], gpr.kernel_.params + tiny)
diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index e8a19e78febea..a4a509cbe6a39 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -19,11 +19,11 @@
 
 X = np.random.normal(0, 1, (10, 2))
 
-kernels = [RBF(2.0), RBF([[0.5], [2.0]]),
+kernels = [RBF(2.0), RBF([0.5, 2.0]),
            ConstantKernel(10.0),
            2.0 * RBF(0.5), RBF(2.0) + WhiteKernel(1.0),
-           RationalQuadratic([(1.0,), (1.0,)]),
-           ExpSineSquared([(1.0,), (1.0,)]),
+           RationalQuadratic([1.0, 1.0]),
+           ExpSineSquared([1.0, 1.0]),
            DotProduct(1.0), DotProduct(1.0, degree=2)]
 for metric in PAIRWISE_KERNEL_FUNCTIONS:
     if metric in ["additive_chi2", "chi2"]:
@@ -75,7 +75,7 @@ def test_kernel_operator_commutative():
 
 def test_kernel_anisotropic():
     """ Anisotropic kernel should be consistent with isotropic kernels."""
-    K = RBF([[0.5], [2.0]])(X)
+    K = RBF([0.5, 2.0])(X)
     X1 = np.array(X)
     X1[:, 0] *= 4
     K1 = RBF(2.0)(X1)

From 5c99aadca0a0d4e95972a7d08d0ff95bb3512917 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sat, 7 Mar 2015 20:10:07 +0100
Subject: [PATCH 039/165] REFACTOR Renamed kernel property params to theta

---
 .../gaussian_process/new/plot_gpr_noisy.py    |  4 +-
 sklearn/gaussian_process/gpc.py               |  8 +-
 sklearn/gaussian_process/gpr.py               |  8 +-
 sklearn/gaussian_process/kernels.py           | 90 +++++++++----------
 sklearn/gaussian_process/tests/test_gpc.py    | 10 +--
 sklearn/gaussian_process/tests/test_gpr.py    | 26 +++---
 .../gaussian_process/tests/test_kernels.py    |  6 +-
 7 files changed, 76 insertions(+), 76 deletions(-)

diff --git a/examples/gaussian_process/new/plot_gpr_noisy.py b/examples/gaussian_process/new/plot_gpr_noisy.py
index 2c527ab511458..a55879575efb1 100644
--- a/examples/gaussian_process/new/plot_gpr_noisy.py
+++ b/examples/gaussian_process/new/plot_gpr_noisy.py
@@ -46,7 +46,7 @@
 plt.scatter(X[:, 0], y, c='r', s=50, zorder=10)
 plt.title("Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s"
           % (kernel, gp.kernel_,
-             gp.log_marginal_likelihood(gp.kernel_.params)))
+             gp.log_marginal_likelihood(gp.kernel_.theta)))
 plt.tight_layout()
 
 # First run
@@ -64,7 +64,7 @@
 plt.scatter(X[:, 0], y, c='r', s=50, zorder=10)
 plt.title("Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s"
           % (kernel, gp.kernel_,
-             gp.log_marginal_likelihood(gp.kernel_.params)))
+             gp.log_marginal_likelihood(gp.kernel_.theta)))
 plt.tight_layout()
 
 # Plot LML landscape
diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 6fcf21ff3b359..c66af1b101c01 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -65,11 +65,11 @@ def obj_func(theta):
                 lml, grad = self.log_marginal_likelihood(theta,
                                                          eval_gradient=True)
                 return -lml, -grad
-            self.theta_, _, _ = fmin_l_bfgs_b(obj_func, self.kernel_.params,
+            self.theta_, _, _ = fmin_l_bfgs_b(obj_func, self.kernel_.theta,
                                               bounds=self.kernel_.bounds)
-            self.kernel_.params = self.theta_
+            self.kernel_.theta = self.theta_
         elif self.optimizer is None:
-            self.theta_ = self.kernel_.params
+            self.theta_ = self.kernel_.theta
         else:
             raise ValueError("Unknown optimizer %s." % self.optimizer)
 
@@ -127,7 +127,7 @@ def predict_proba(self, X):
     def log_marginal_likelihood(self, theta, eval_gradient=False):
         import copy  # XXX
         kernel = copy.deepcopy(self.kernel_)
-        kernel.params = theta
+        kernel.theta = theta
 
         if eval_gradient:
             K, K_gradient = kernel(self.X_fit_, eval_gradient=True)
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index e58d22d6ab764..33a6fd1cbec54 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -83,11 +83,11 @@ def obj_func(theta):
                 lml, grad = self.log_marginal_likelihood(theta,
                                                          eval_gradient=True)
                 return -lml, -grad
-            self.theta_, _, _ = fmin_l_bfgs_b(obj_func, self.kernel_.params,
+            self.theta_, _, _ = fmin_l_bfgs_b(obj_func, self.kernel_.theta,
                                               bounds=self.kernel_.bounds)
-            self.kernel_.params = self.theta_
+            self.kernel_.theta = self.theta_
         elif self.optimizer is None:
-            self.theta_ = self.kernel_.params
+            self.theta_ = self.kernel_.theta
         else:
             raise ValueError("Unknown optimizer %s." % self.optimizer)
 
@@ -146,7 +146,7 @@ def sample(self, X, n_samples=1, random_state=0):
     def log_marginal_likelihood(self, theta, eval_gradient=False):
         import copy  # XXX: Avoid deepcopy
         kernel = copy.deepcopy(self.kernel_)
-        kernel.params = theta
+        kernel.theta = theta
 
         if eval_gradient:
             K, K_gradient = kernel(self.X_fit_, eval_gradient=True)
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index bcf2329acbd63..e67ba35af1597 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -36,13 +36,13 @@ class Kernel(six.with_metaclass(ABCMeta)):
     def __init__(self, theta=1.0, thetaL=1e-5, thetaU=np.inf):
         if not np.iterable(theta):
             theta = np.array([theta])
-        self.params = np.asarray(theta, dtype=np.float)
+        self.theta = np.asarray(theta, dtype=np.float)
         self.bounds = (np.asarray(thetaL, dtype=np.float),
                        np.asarray(thetaU, dtype=np.float))
 
     @property
-    def n_params(self):
-        return self.params.shape[0]
+    def n_dims(self):
+        return self.theta.shape[0]
 
     @property
     def bounds(self):
@@ -52,9 +52,9 @@ def bounds(self):
     def bounds(self, bounds):
         self.l_bound, self.u_bound = bounds
         if not np.iterable(self.l_bound):
-             self.l_bound = np.full_like(self.params,  self.l_bound)
+             self.l_bound = np.full_like(self.theta, self.l_bound)
         if not np.iterable(self.u_bound):
-             self.u_bound = np.full_like(self.params,  self.u_bound)
+             self.u_bound = np.full_like(self.theta, self.u_bound)
 
     def __add__(self, b):
         if not isinstance(b, Kernel):
@@ -78,7 +78,7 @@ def __rmul__(self, b):
 
     def __repr__(self):
         return "{0}({1})".format(self.__class__.__name__,
-                                 ", ".join(map("{0:.3g}".format, self.params)))
+                                 ", ".join(map("{0:.3g}".format, self.theta)))
 
     @abstractmethod
     def __call__(self, X, Y=None, eval_gradient=False):
@@ -97,14 +97,14 @@ def __init__(self, k1, k2):
         self.k2 = k2
 
     @property
-    def params(self):
-        return np.append(self.k1.params, self.k2.params)
+    def theta(self):
+        return np.append(self.k1.theta, self.k2.theta)
 
-    @params.setter
-    def params(self, theta):
-        i = self.k1.n_params
-        self.k1.params = theta[:i]
-        self.k2.params = theta[i:]
+    @theta.setter
+    def theta(self, theta):
+        i = self.k1.n_dims
+        self.k1.theta = theta[:i]
+        self.k2.theta = theta[i:]
 
     @property
     def bounds(self):
@@ -112,7 +112,7 @@ def bounds(self):
 
     @bounds.setter
     def bounds(self, bounds):
-        i = self.k1.n_params
+        i = self.k1.n_dims
         self.k1.bounds = bounds[:i]
         self.k2.bounds = bounds[i:]
 
@@ -157,7 +157,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
         K : array, shape (n_samples_X, n_samples_Y)
             Kernel k(X, Y)
 
-        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_params)
+        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
             The gradient of the kernel k(X, X) with repect to the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
@@ -209,7 +209,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
         K : array, shape (n_samples_X, n_samples_Y)
             Kernel k(X, Y)
 
-        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_params)
+        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
             The gradient of the kernel k(X, X) with repect to the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
@@ -257,11 +257,11 @@ def from_literal(cls, literal):
             return cls(literal)
 
     @property
-    def params(self):
+    def theta(self):
         return np.array([self.value])
 
-    @params.setter
-    def params(self, theta):
+    @theta.setter
+    def theta(self, theta):
         if len(theta) != 1:
             raise ValueError("theta has not the correct number of entries."
                              " Should be 1; given are %d" % len(theta))
@@ -288,7 +288,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
         K : array, shape (n_samples_X, n_samples_Y)
             Kernel k(X, Y)
 
-        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_params)
+        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
             The gradient of the kernel k(X, X) with repect to the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
@@ -322,11 +322,11 @@ class WhiteKernel(Kernel):
     """
 
     @property
-    def params(self):
+    def theta(self):
         return np.asarray([self.c])
 
-    @params.setter
-    def params(self, theta):
+    @theta.setter
+    def theta(self, theta):
         self.c = theta[0]
 
     def __call__(self, X, Y=None, eval_gradient=False):
@@ -350,7 +350,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
         K : array, shape (n_samples_X, n_samples_Y)
             Kernel k(X, Y)
 
-        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_params)
+        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
             The gradient of the kernel k(X, X) with repect to the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
@@ -385,11 +385,11 @@ class RBF(Kernel):
     """
 
     @property
-    def params(self):
+    def theta(self):
         return np.asarray(self.l)
 
-    @params.setter
-    def params(self, theta):
+    @theta.setter
+    def theta(self, theta):
         self.l = theta
 
     def __call__(self, X, Y=None, eval_gradient=False):
@@ -413,7 +413,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
         K : array, shape (n_samples_X, n_samples_Y)
             Kernel k(X, Y)
 
-        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_params)
+        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
             The gradient of the kernel k(X, X) with repect to the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
@@ -469,11 +469,11 @@ def __init__(self, theta=[1.0, 1.0], thetaL=1e-5, thetaU=np.inf):
         super(RationalQuadratic, self).__init__(theta, thetaL, thetaU)
 
     @property
-    def params(self):
+    def theta(self):
         return np.asarray([self.alpha, self.l])
 
-    @params.setter
-    def params(self, theta):
+    @theta.setter
+    def theta(self, theta):
         self.alpha = theta[0]
         self.l = theta[1]
 
@@ -498,7 +498,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
         K : array, shape (n_samples_X, n_samples_Y)
             Kernel k(X, Y)
 
-        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_params)
+        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
             The gradient of the kernel k(X, X) with repect to the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
@@ -544,11 +544,11 @@ def __init__(self, theta=[1.0, 1.0], thetaL=1e-5, thetaU=np.inf):
         super(ExpSineSquared, self).__init__(theta, thetaL, thetaU)
 
     @property
-    def params(self):
+    def theta(self):
         return np.asarray([self.l, self.p])
 
-    @params.setter
-    def params(self, theta):
+    @theta.setter
+    def theta(self, theta):
         self.l = theta[0]
         self.p = theta[1]
 
@@ -573,7 +573,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
         K : array, shape (n_samples_X, n_samples_Y)
             Kernel k(X, Y)
 
-        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_params)
+        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
             The gradient of the kernel k(X, X) with repect to the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
@@ -618,11 +618,11 @@ def __init__(self, theta=[1.0, 1.0], thetaL=1e-5, thetaU=np.inf, degree=1):
         self.degree = degree
 
     @property
-    def params(self):
+    def theta(self):
         return np.asarray([self.sigma_0])
 
-    @params.setter
-    def params(self, theta):
+    @theta.setter
+    def theta(self, theta):
         self.sigma_0 = theta[0]
 
     def __call__(self, X, Y=None, eval_gradient=False):
@@ -646,7 +646,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
         K : array, shape (n_samples_X, n_samples_Y)
             Kernel k(X, Y)
 
-        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_params)
+        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
             The gradient of the kernel k(X, X) with repect to the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
@@ -724,11 +724,11 @@ def __init__(self, theta=1.0, thetaL=1e-5, thetaU=np.inf, metric="linear",
                 "Gamma must not be set directly but via param_space.")
 
     @property
-    def params(self):
+    def theta(self):
         return np.asarray([self.gamma])
 
-    @params.setter
-    def params(self, theta):
+    @theta.setter
+    def theta(self, theta):
         self.gamma = theta[0]
 
     def __call__(self, X, Y=None, eval_gradient=False):
@@ -752,7 +752,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
         K : array, shape (n_samples_X, n_samples_Y)
             Kernel k(X, Y)
 
-        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_params)
+        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
             The gradient of the kernel k(X, X) with repect to the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
@@ -765,7 +765,7 @@ def f(gamma):  # helper function
                 return pairwise_kernels(
                     X, Y, metric=self.metric, gamma=gamma,
                     filter_params=True, **self.kwargs)
-            return K, _approx_fprime(self.params, f, 1e-10)
+            return K, _approx_fprime(self.theta, f, 1e-10)
         else:
             return K
 
diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py
index 26c6aed68b62b..aa64f141a158a 100644
--- a/sklearn/gaussian_process/tests/test_gpc.py
+++ b/sklearn/gaussian_process/tests/test_gpc.py
@@ -38,8 +38,8 @@ def test_lml_improving():
     """ Test that hyperparameter-tuning improves log-marginal likelihood. """
     for kernel in kernels:
         gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
-        assert_greater(gpc.log_marginal_likelihood(gpc.kernel_.params),
-                       gpc.log_marginal_likelihood(kernel.params))
+        assert_greater(gpc.log_marginal_likelihood(gpc.kernel_.theta),
+                       gpc.log_marginal_likelihood(kernel.theta))
 
 
 def test_converged_to_local_maximum():
@@ -47,7 +47,7 @@ def test_converged_to_local_maximum():
     for kernel in kernels:
         gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
 
-        lml, lml_gradient = gpc.log_marginal_likelihood(gpc.kernel_.params, True)
+        lml, lml_gradient = gpc.log_marginal_likelihood(gpc.kernel_.theta, True)
 
         assert_almost_equal(lml_gradient, 0, 2)
 
@@ -57,9 +57,9 @@ def test_lml_gradient():
     for kernel in kernels:
         gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
 
-        lml, lml_gradient = gpc.log_marginal_likelihood(kernel.params, True)
+        lml, lml_gradient = gpc.log_marginal_likelihood(kernel.theta, True)
         lml_gradient_approx = \
-            approx_fprime(kernel.params,
+            approx_fprime(kernel.theta,
                           lambda theta: gpc.log_marginal_likelihood(theta,
                                                                     False),
                           1e-10)
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index 43032a44dc4ce..52996850de11a 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -42,8 +42,8 @@ def test_lml_improving():
     """ Test that hyperparameter-tuning improves log-marginal likelihood. """
     for kernel in kernels:
         gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
-        assert_greater(gpr.log_marginal_likelihood(gpr.kernel_.params),
-                       gpr.log_marginal_likelihood(kernel.params))
+        assert_greater(gpr.log_marginal_likelihood(gpr.kernel_.theta),
+                       gpr.log_marginal_likelihood(kernel.theta))
 
 
 def test_converged_to_local_maximum():
@@ -52,11 +52,11 @@ def test_converged_to_local_maximum():
         gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
 
         lml, lml_gradient = \
-            gpr.log_marginal_likelihood(gpr.kernel_.params, True)
+            gpr.log_marginal_likelihood(gpr.kernel_.theta, True)
 
         assert_true(np.all(np.isclose(lml_gradient, 0, atol=1e-5)
-                           | (gpr.kernel_.params == gpr.kernel_.bounds[:, 0])
-                           | (gpr.kernel_.params == gpr.kernel_.bounds[:, 1])))
+                           | (gpr.kernel_.theta == gpr.kernel_.bounds[:, 0])
+                           | (gpr.kernel_.theta == gpr.kernel_.bounds[:, 1])))
 
 
 def test_solution_inside_bounds():
@@ -65,12 +65,12 @@ def test_solution_inside_bounds():
         gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
 
         bounds = gpr.kernel_.bounds
-        max_ = np.finfo(gpr.kernel_.params.dtype).max
+        max_ = np.finfo(gpr.kernel_.theta.dtype).max
         tiny = 1e-10
         bounds[~np.isfinite(bounds[:, 1]), 1] = max_
 
-        assert_array_less(bounds[:, 0], gpr.kernel_.params + tiny)
-        assert_array_less(gpr.kernel_.params, bounds[:, 1] + tiny)
+        assert_array_less(bounds[:, 0], gpr.kernel_.theta + tiny)
+        assert_array_less(gpr.kernel_.theta, bounds[:, 1] + tiny)
 
 
 def test_lml_gradient():
@@ -78,9 +78,9 @@ def test_lml_gradient():
     for kernel in kernels:
         gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
 
-        lml, lml_gradient = gpr.log_marginal_likelihood(kernel.params, True)
+        lml, lml_gradient = gpr.log_marginal_likelihood(kernel.theta, True)
         lml_gradient_approx = \
-            approx_fprime(kernel.params,
+            approx_fprime(kernel.theta,
                           lambda theta: gpr.log_marginal_likelihood(theta,
                                                                     False),
                           1e-10)
@@ -96,9 +96,9 @@ def test_prior():
         y_mean, y_cov = gpr.predict(X, return_cov=True)
 
         assert_almost_equal(y_mean, 0, 5)
-        if len(gpr.kernel.params) > 1:
+        if len(gpr.kernel.theta) > 1:
             # XXX: quite hacky, works only for current kernels
-            assert_almost_equal(np.diag(y_cov), kernel.params[0] , 5)
+            assert_almost_equal(np.diag(y_cov), kernel.theta[0] , 5)
         else:
             assert_almost_equal(np.diag(y_cov), 1, 5)
 
@@ -122,5 +122,5 @@ def test_no_optimizer():
     """ Test that kernel parameters are unmodified when optimizer is None."""
     kernel = RBF(1.0)
     gpr = GaussianProcessRegressor(kernel=kernel, optimizer=None).fit(X, y)
-    assert_equal(gpr.kernel_.params, 1.0)
+    assert_equal(gpr.kernel_.theta, 1.0)
     assert_equal(gpr.theta_, 1.0)
diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index a4a509cbe6a39..f417fc72b0dbe 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -38,18 +38,18 @@ def test_kernel_gradient():
 
         assert_equal(K_gradient.shape[0], X.shape[0])
         assert_equal(K_gradient.shape[1], X.shape[0])
-        assert_equal(K_gradient.shape[2], kernel.params.shape[0])
+        assert_equal(K_gradient.shape[2], kernel.theta.shape[0])
 
         K_gradient_approx = np.empty_like(K_gradient)
         for i in range(K.shape[0]):
             for j in range(K.shape[1]):
                 def eval_kernel_ij_for_theta(theta):
                     kernel_copy = deepcopy(kernel)
-                    kernel_copy.params = theta
+                    kernel_copy.theta = theta
                     K = kernel_copy(X, eval_gradient=False)
                     return K[i, j]
                 K_gradient_approx[i, j] = \
-                    approx_fprime(kernel.params, eval_kernel_ij_for_theta,
+                    approx_fprime(kernel.theta, eval_kernel_ij_for_theta,
                                   1e-10)
 
         assert_almost_equal(K_gradient, K_gradient_approx, 4)

From cd375d745fbdad9aae0d8f646fe689de20bf3da5 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 8 Mar 2015 10:55:12 +0100
Subject: [PATCH 040/165] ENH Kernel implements equality check and get_params()
 method

---
 sklearn/gaussian_process/kernels.py | 73 +++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index e67ba35af1597..89aeaf4558e9c 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -22,6 +22,7 @@
 
 from abc import ABCMeta, abstractmethod
 from functools import partial
+import inspect
 
 import numpy as np
 from scipy.spatial.distance import pdist, cdist, squareform
@@ -40,8 +41,44 @@ def __init__(self, theta=1.0, thetaL=1e-5, thetaU=np.inf):
         self.bounds = (np.asarray(thetaL, dtype=np.float),
                        np.asarray(thetaU, dtype=np.float))
 
+    def get_params(self, deep=True):
+        """Get parameters of this kernel.
+
+        Parameters
+        ----------
+        deep : boolean, optional
+            Kept for compatibility with the estimator get_params()
+            interface; the value is currently ignored by this method.
+
+        Returns
+        -------
+        params : mapping of string to any
+            Parameter names mapped to their values.
+        """
+        params = dict(theta=self.theta, thetaL=self.bounds[:, 0],
+                      thetaU=self.bounds[:, 1])
+
+        # introspect the constructor arguments to find the model parameters
+        # to represent
+        cls = self.__class__
+        init = getattr(cls.__init__, 'deprecated_original', cls.__init__)
+        args, varargs, kw, default = inspect.getargspec(init)
+        if varargs is not None:
+            raise RuntimeError("scikit-learn estimators should always "
+                               "specify their parameters in the signature"
+                               " of their __init__ (no varargs)."
+                               " %s doesn't follow this convention."
+                               % (cls, ))
+        # Remove 'self', theta, thetaL, and thetaU, and store remaining
+        # arguments in params
+        args = args[4:]
+        for arg in args:
+            params[arg] = getattr(self, arg, None)
+        return params
+
     @property
     def n_dims(self):
+        """ Returns the number of hyperparameters of the kernel."""
         return self.theta.shape[0]
 
     @property
@@ -76,6 +113,16 @@ def __rmul__(self, b):
             return Product(ConstantKernel.from_literal(b), self)
         return Product(b, self)
 
+    def __eq__(self, b):
+        if type(self) != type(b):
+            return False
+        params_a = self.get_params()
+        params_b = b.get_params()
+        for key in set(params_a.keys() + params_b.keys()):
+            if np.any(params_a.get(key, None) != params_b.get(key, None)):
+                return False
+        return True
+
     def __repr__(self):
         return "{0}({1})".format(self.__class__.__name__,
                                  ", ".join(map("{0:.3g}".format, self.theta)))
@@ -96,6 +143,23 @@ def __init__(self, k1, k2):
         self.k1 = k1
         self.k2 = k2
 
+    def get_params(self, deep=True):
+        """Get parameters of this kernel.
+
+        Parameters
+        ----------
+        deep : boolean, optional
+            Kept for compatibility with the estimator get_params()
+            interface; the value is currently ignored by this method.
+
+        Returns
+        -------
+        params : mapping of string to any
+            Parameter names mapped to their values.
+        """
+        params = dict(k1=self.k1, k2=self.k2)
+        return params
+
     @property
     def theta(self):
         return np.append(self.k1.theta, self.k2.theta)
@@ -116,6 +180,10 @@ def bounds(self, bounds):
         self.k1.bounds = bounds[:i]
         self.k2.bounds = bounds[i:]
 
+    def __eq__(self, b):
+        return (self.k1 == b.k1 and self.k2 == b.k2) \
+            or (self.k1 == b.k2 and self.k2 == b.k1)
+
     def is_stationary(self):
         """ Retuuns whether the kernel is stationary. """
         return self.k1.is_stationary() and self.k2.is_stationary()
@@ -772,3 +840,8 @@ def f(gamma):  # helper function
     def is_stationary(self):
         """ Returns whether the kernel is stationary. """
         return self.metric in ["rbf"]
+
+    def __repr__(self):
+        return "{0}({1}, metric={2})".format(
+            self.__class__.__name__,
+            ", ".join(map("{0:.3g}".format, self.theta)), self.metric)

From 04ebe40c2775ffe4581892052092fd34f39043c4 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 8 Mar 2015 10:55:39 +0100
Subject: [PATCH 041/165] TST Tests for cloning kernels with sklearn.base's
 clone function

---
 .../gaussian_process/tests/test_kernels.py    | 24 ++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index f417fc72b0dbe..c2b6c03f01795 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -4,6 +4,7 @@
 # Licence: BSD 3 clause
 
 from copy import deepcopy
+from collections import Hashable
 
 import numpy as np
 
@@ -13,8 +14,10 @@
 from sklearn.gaussian_process.kernels \
     import (RBF, RationalQuadratic, ExpSineSquared, DotProduct,
             ConstantKernel, WhiteKernel, PairwiseKernel)
+from sklearn.base import clone
 
-from sklearn.utils.testing import assert_equal, assert_almost_equal
+from sklearn.utils.testing import (assert_equal, assert_almost_equal,
+    assert_not_equal, assert_array_equal)
 
 
 X = np.random.normal(0, 1, (10, 2))
@@ -94,3 +97,22 @@ def test_kernel_stationary():
             continue
         K = kernel(X, X + 1)
         assert_almost_equal(K[0, 0], np.diag(K))
+
+
+def test_kernel_clone():
+    """ Test that sklearn's clone works correctly on kernels. """
+    for kernel in kernels:
+        kernel_cloned = clone(kernel)
+
+        assert_equal(kernel, kernel_cloned)
+        assert_not_equal(id(kernel), id(kernel_cloned))
+        for attr in kernel.__dict__.keys():
+            attr_value = getattr(kernel, attr)
+            attr_value_cloned = getattr(kernel_cloned, attr)
+            if np.iterable(attr_value):
+                assert_array_equal(attr_value, attr_value_cloned)
+            else:
+                assert_equal(attr_value, attr_value_cloned)
+            if not isinstance(attr_value, Hashable):
+                # modifiable attributes must not be identical
+                assert_not_equal(id(attr_value), id(attr_value_cloned))

From ca6e6a24a8ed833a6abd50a07f73c33337489717 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 8 Mar 2015 11:06:53 +0100
Subject: [PATCH 042/165] REFACTOR Replacing all calls to deepcopy() by clone()

---
 sklearn/gaussian_process/gpc.py     | 9 +++------
 sklearn/gaussian_process/gpr.py     | 9 +++------
 sklearn/gaussian_process/kernels.py | 7 +++++++
 3 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index c66af1b101c01..9e8b3d2610728 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -9,7 +9,7 @@
 from scipy.optimize import fmin_l_bfgs_b
 from scipy.special import erf
 
-from sklearn.base import BaseEstimator
+from sklearn.base import BaseEstimator, clone
 from sklearn.gaussian_process.kernels import RBF
 from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
 
@@ -49,8 +49,7 @@ def fit(self, X, y):
         if self.kernel is None:  # Use an RBF kernel as default
             self.kernel_ = RBF()
         else:
-            import copy  # XXX
-            self.kernel_ = copy.deepcopy(self.kernel)
+            self.kernel_ = clone(self.kernel)
 
         X, y = check_X_y(X, y)
 
@@ -125,9 +124,7 @@ def predict_proba(self, X):
         return pi_star
 
     def log_marginal_likelihood(self, theta, eval_gradient=False):
-        import copy  # XXX
-        kernel = copy.deepcopy(self.kernel_)
-        kernel.theta = theta
+        kernel = self.kernel_.clone_with_theta(theta)
 
         if eval_gradient:
             K, K_gradient = kernel(self.X_fit_, eval_gradient=True)
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 33a6fd1cbec54..0382837c69f76 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -8,7 +8,7 @@
 from scipy.linalg import cholesky, cho_solve, solve
 from scipy.optimize import fmin_l_bfgs_b
 
-from sklearn.base import BaseEstimator
+from sklearn.base import BaseEstimator, clone
 from sklearn.gaussian_process.kernels import RBF
 from sklearn.utils import check_random_state
 from sklearn.utils.validation import check_X_y, check_array
@@ -68,8 +68,7 @@ def fit(self, X, y):
         if self.kernel is None:  # Use an RBF kernel as default
             self.kernel_ = RBF()
         else:
-            import copy  # XXX
-            self.kernel_ = copy.deepcopy(self.kernel)
+            self.kernel_ = clone(self.kernel)
 
         X, y = check_X_y(X, y)
 
@@ -144,9 +143,7 @@ def sample(self, X, n_samples=1, random_state=0):
         return y_samples
 
     def log_marginal_likelihood(self, theta, eval_gradient=False):
-        import copy  # XXX: Avoid deepcopy
-        kernel = copy.deepcopy(self.kernel_)
-        kernel.theta = theta
+        kernel = self.kernel_.clone_with_theta(theta)
 
         if eval_gradient:
             K, K_gradient = kernel(self.X_fit_, eval_gradient=True)
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 89aeaf4558e9c..2be7b8e2c0727 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -29,6 +29,7 @@
 
 from ..metrics.pairwise import pairwise_kernels
 from ..externals import six
+from ..base import clone
 
 
 class Kernel(six.with_metaclass(ABCMeta)):
@@ -76,6 +77,12 @@ def get_params(self, deep=True):
             params[arg] = getattr(self, arg, None)
         return params
 
+    def clone_with_theta(self, theta):
+        """ Returns a clone of self with given hyperparameters theta. """
+        cloned = clone(self)
+        cloned.theta = theta
+        return cloned
+
     @property
     def n_dims(self):
         """ Returns the number of hyperparameters of the kernel."""

From e7586b118adf014e914e38507b2e86425104a10c Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 8 Mar 2015 11:07:10 +0100
Subject: [PATCH 043/165] TST Replace call to deepcopy() by clone()

---
 sklearn/gaussian_process/tests/test_kernels.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index c2b6c03f01795..52fa6c80c2760 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -3,7 +3,6 @@
 # Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
 # Licence: BSD 3 clause
 
-from copy import deepcopy
 from collections import Hashable
 
 import numpy as np
@@ -47,9 +46,8 @@ def test_kernel_gradient():
         for i in range(K.shape[0]):
             for j in range(K.shape[1]):
                 def eval_kernel_ij_for_theta(theta):
-                    kernel_copy = deepcopy(kernel)
-                    kernel_copy.theta = theta
-                    K = kernel_copy(X, eval_gradient=False)
+                    kernel_clone = kernel.clone_with_theta(theta)
+                    K = kernel_clone(X, eval_gradient=False)
                     return K[i, j]
                 K_gradient_approx[i, j] = \
                     approx_fprime(kernel.theta, eval_kernel_ij_for_theta,

From a1186bbc11785ec1fd3ea58203d7dcd22b6f2b21 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 8 Mar 2015 15:11:34 +0100
Subject: [PATCH 044/165] ENH Added Exponentiation kernel (DotProduct no longer
 has internal degree)

---
 .../new/plot_gpr_prior_posterior.py           |   2 +-
 sklearn/gaussian_process/kernels.py           | 100 ++++++++++++++++--
 2 files changed, 94 insertions(+), 8 deletions(-)

diff --git a/examples/gaussian_process/new/plot_gpr_prior_posterior.py b/examples/gaussian_process/new/plot_gpr_prior_posterior.py
index 577ec914f3993..72b4529bf95d8 100644
--- a/examples/gaussian_process/new/plot_gpr_prior_posterior.py
+++ b/examples/gaussian_process/new/plot_gpr_prior_posterior.py
@@ -22,7 +22,7 @@
 kernels = [1.0 * RBF(1.0, 1e-1, 10.0),
            1.0 * RationalQuadratic((0.1, 1.0)),
            1.0 * ExpSineSquared((1.0, 3.0), (0.1, 1.0), (10.0, 10.0)),
-           (0.01, 0.1, 10.0) * DotProduct(1.0, 0.0, 10.0, degree=2)]
+           (0.01, 0.1, 10.0) * (DotProduct(1.0, 0.0, 10.0) ** 2)]
 
 for fig_index, kernel in enumerate(kernels):
     if fig_index > 3: continue
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 2be7b8e2c0727..df2bcbca69e80 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -120,6 +120,9 @@ def __rmul__(self, b):
             return Product(ConstantKernel.from_literal(b), self)
         return Product(b, self)
 
+    def __pow__(self, b):
+        return Exponentiation(self, b)
+
     def __eq__(self, b):
         if type(self) != type(b):
             return False
@@ -301,6 +304,92 @@ def __repr__(self):
         return "{0} * {1}".format(self.k1, self.k2)
 
 
+class Exponentiation(Kernel):
+    """ Exponentiate kernel by given exponent. """
+
+    def __init__(self, kernel, exponent):
+        self.kernel = kernel
+        self.exponent = exponent
+
+    def get_params(self, deep=True):
+        """Get parameters of this kernel.
+
+        Parameters
+        ----------
+        deep: boolean, optional
+            If True, will return the parameters for this estimator and
+            contained subobjects that are estimators.
+
+        Returns
+        -------
+        params : mapping of string to any
+            Parameter names mapped to their values.
+        """
+        params = dict(kernel=self.kernel, exponent=self.exponent)
+        return params
+
+    @property
+    def theta(self):
+        return self.kernel.theta
+
+    @theta.setter
+    def theta(self, theta):
+        self.kernel.theta = theta
+
+    @property
+    def bounds(self):
+        return self.kernel.bounds
+
+    @bounds.setter
+    def bounds(self, bounds):
+        self.kernel.bounds = bounds
+
+    def __eq__(self, b):
+        return (self.kernel == b.kernel and self.exponent == b.exponent)
+
+    def __call__(self, X, Y=None, eval_gradient=False):
+        """ Return the kernel k(X, Y) and optionally its gradient.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples_X, n_features)
+            Left argument of the returned kernel k(X, Y)
+
+        Y : array, shape (n_samples_Y, n_features), (optional, default=None)
+            Right argument of the returned kernel k(X, Y). If None, k(X, X)
+            is evaluated instead.
+
+        eval_gradient : bool (optional, default=False)
+            Determines whether the gradient with respect to the kernel
+            hyperparameter is determined.
+
+        Returns
+        -------
+        K : array, shape (n_samples_X, n_samples_Y)
+            Kernel k(X, Y)
+
+        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
+            The gradient of the kernel k(X, X) with respect to the
+            hyperparameter of the kernel. Only returned when eval_gradient
+            is True.
+        """
+        if eval_gradient:
+            K, K_gradient = self.kernel(X, Y, eval_gradient=True)
+            K_gradient *= \
+                self.exponent * K[:, :, np.newaxis] ** (self.exponent - 1)
+            return K ** self.exponent, K_gradient
+        else:
+            K = self.kernel(X, Y, eval_gradient=False)
+            return K ** self.exponent
+
+    def __repr__(self):
+        return "{0} ** {1}".format(self.kernel, self.exponent)
+
+    def is_stationary(self):
+        """ Returns whether the kernel is stationary. """
+        return self.kernel.is_stationary()
+
+
 class ConstantKernel(Kernel):
     """ Constant kernel.
 
@@ -688,9 +777,8 @@ class DotProduct(Kernel):
         the kernel is homogenous.
     """
 
-    def __init__(self, theta=[1.0, 1.0], thetaL=1e-5, thetaU=np.inf, degree=1):
+    def __init__(self, theta=[1.0, 1.0], thetaL=1e-5, thetaU=np.inf):
         super(DotProduct, self).__init__(theta, thetaL, thetaU)
-        self.degree = degree
 
     @property
     def theta(self):
@@ -727,18 +815,16 @@ def __call__(self, X, Y=None, eval_gradient=False):
             is True.
         """
         if Y is None:
-            dot_product = np.inner(X, X)
-            K = (dot_product + self.sigma_0 ** 2) ** self.degree
+            K = np.inner(X, X) + self.sigma_0 ** 2
         else:
             if eval_gradient:
                 raise ValueError(
                     "Gradient can only be evaluated when Y is None.")
-            K = (np.inner(X, Y) + self.sigma_0 ** 2) ** self.degree
+            K = np.inner(X, Y) + self.sigma_0 ** 2
 
         if eval_gradient:
             K_gradient = np.empty((K.shape[0], K.shape[1], 1))
-            K_gradient[..., 0] = 2 * self.sigma_0 * self.degree \
-                * (dot_product + self.sigma_0 ** 2) ** (self.degree - 1)
+            K_gradient[..., 0] = 2 * self.sigma_0
             return K, K_gradient
         else:
             return K

From f2ec6a2c9b0ddd00260ba1794d039d0a427b9b2b Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 8 Mar 2015 15:12:03 +0100
Subject: [PATCH 045/165] TST test_kernels tests Exponentiation kernel

---
 sklearn/gaussian_process/tests/test_kernels.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index 52fa6c80c2760..5e076177fcf81 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -26,7 +26,7 @@
            2.0 * RBF(0.5), RBF(2.0) + WhiteKernel(1.0),
            RationalQuadratic([1.0, 1.0]),
            ExpSineSquared([1.0, 1.0]),
-           DotProduct(1.0), DotProduct(1.0, degree=2)]
+           DotProduct(1.0), DotProduct(1.0) ** 2]
 for metric in PAIRWISE_KERNEL_FUNCTIONS:
     if metric in ["additive_chi2", "chi2"]:
         continue

From 29c6d3bc42f189284e027d87215b962f937179b4 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 8 Mar 2015 16:29:54 +0100
Subject: [PATCH 046/165] FIX Using numpy-downward compatible ones_like()
 instead of full_like()

---
 sklearn/gaussian_process/kernels.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index df2bcbca69e80..54aadf3b0e1f2 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -96,9 +96,9 @@ def bounds(self):
     def bounds(self, bounds):
         self.l_bound, self.u_bound = bounds
         if not np.iterable(self.l_bound):
-             self.l_bound = np.full_like(self.theta, self.l_bound)
+             self.l_bound = np.ones_like(self.theta) * self.l_bound
         if not np.iterable(self.u_bound):
-             self.u_bound = np.full_like(self.theta, self.u_bound)
+             self.u_bound = np.ones_like(self.theta) * self.u_bound
 
     def __add__(self, b):
         if not isinstance(b, Kernel):

From 6a0b32ee6a3915a9d0ce7a2b8976c1da0f544f5f Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 8 Mar 2015 18:29:45 +0100
Subject: [PATCH 047/165] DOC Reorganizing gaussian_process examples

---
 .../gaussian_process/gp_diabetes_dataset.py   | 51 ----------------
 .../gaussian_process/{new => }/plot_gpc.py    |  0
 ...gression.py => plot_gpc_isoprobability.py} | 60 ++++++++-----------
 .../{new => }/plot_gpc_xor.py                 |  0
 .../{new => }/plot_gpr_co2.py                 |  0
 .../{new => }/plot_gpr_noisy.py               |  0
 .../{new => }/plot_gpr_prior_posterior.py     |  0
 7 files changed, 25 insertions(+), 86 deletions(-)
 delete mode 100644 examples/gaussian_process/gp_diabetes_dataset.py
 rename examples/gaussian_process/{new => }/plot_gpc.py (100%)
 rename examples/gaussian_process/{plot_gp_probabilistic_classification_after_regression.py => plot_gpc_isoprobability.py} (56%)
 rename examples/gaussian_process/{new => }/plot_gpc_xor.py (100%)
 rename examples/gaussian_process/{new => }/plot_gpr_co2.py (100%)
 rename examples/gaussian_process/{new => }/plot_gpr_noisy.py (100%)
 rename examples/gaussian_process/{new => }/plot_gpr_prior_posterior.py (100%)

diff --git a/examples/gaussian_process/gp_diabetes_dataset.py b/examples/gaussian_process/gp_diabetes_dataset.py
deleted file mode 100644
index 9baaeccf17d6d..0000000000000
--- a/examples/gaussian_process/gp_diabetes_dataset.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-
-"""
-========================================================================
-Gaussian Processes regression: goodness-of-fit on the 'diabetes' dataset
-========================================================================
-
-In this example, we fit a Gaussian Process model onto the diabetes
-dataset.
-
-We determine the correlation parameters with maximum likelihood
-estimation (MLE). We use an anisotropic squared exponential
-correlation model with a constant regression model. We also use a
-nugget of 1e-2 to account for the (strong) noise in the targets.
-
-We compute a cross-validation estimate of the coefficient of
-determination (R2) without reperforming MLE, using the set of correlation
-parameters found on the whole dataset.
-"""
-print(__doc__)
-
-# Author: Vincent Dubourg <vincent.dubourg@gmail.com>
-# Licence: BSD 3 clause
-
-from sklearn import datasets
-from sklearn.gaussian_process import GaussianProcess
-from sklearn.cross_validation import cross_val_score, KFold
-
-# Load the dataset from scikit's data sets
-diabetes = datasets.load_diabetes()
-X, y = diabetes.data, diabetes.target
-
-# Instanciate a GP model
-gp = GaussianProcess(regr='constant', corr='absolute_exponential',
-                     theta0=[1e-4] * 10, thetaL=[1e-12] * 10,
-                     thetaU=[1e-2] * 10, nugget=1e-2, optimizer='Welch')
-
-# Fit the GP model to the data performing maximum likelihood estimation
-gp.fit(X, y)
-
-# Deactivate maximum likelihood estimation for the cross-validation loop
-gp.theta0 = gp.theta_  # Given correlation parameter = MLE
-gp.thetaL, gp.thetaU = None, None  # None bounds deactivate MLE
-
-# Perform a cross-validation estimate of the coefficient of determination using
-# the cross_validation module using all CPUs available on the machine
-K = 20  # folds
-R2 = cross_val_score(gp, X, y=y, cv=KFold(y.size, K), n_jobs=1).mean()
-print("The %d-Folds estimate of the coefficient of determination is R2 = %s"
-      % (K, R2))
diff --git a/examples/gaussian_process/new/plot_gpc.py b/examples/gaussian_process/plot_gpc.py
similarity index 100%
rename from examples/gaussian_process/new/plot_gpc.py
rename to examples/gaussian_process/plot_gpc.py
diff --git a/examples/gaussian_process/plot_gp_probabilistic_classification_after_regression.py b/examples/gaussian_process/plot_gpc_isoprobability.py
similarity index 56%
rename from examples/gaussian_process/plot_gp_probabilistic_classification_after_regression.py
rename to examples/gaussian_process/plot_gpc_isoprobability.py
index 3c9887aa66852..40441ee808511 100644
--- a/examples/gaussian_process/plot_gp_probabilistic_classification_after_regression.py
+++ b/examples/gaussian_process/plot_gpc_isoprobability.py
@@ -2,38 +2,31 @@
 # -*- coding: utf-8 -*-
 
 """
-==============================================================================
-Gaussian Processes classification example: exploiting the probabilistic output
-==============================================================================
+===================================================================
+Gaussian Processes classification example with probabilistic output
+===================================================================
 
-A two-dimensional regression exercise with a post-processing allowing for
-probabilistic classification thanks to the Gaussian property of the prediction.
-
-The figure illustrates the probability that the prediction is negative with
-respect to the remaining uncertainty in the prediction. The red and blue lines
-corresponds to the 95% confidence interval on the prediction of the zero level
-set.
+A two-dimensional classification example showing iso-probability lines for
+the predicted probabilities.
 """
 print(__doc__)
 
 # Author: Vincent Dubourg <vincent.dubourg@gmail.com>
+# Adapted to GaussianProcessClassifier:
+#         Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
 # Licence: BSD 3 clause
 
 import numpy as np
-from scipy import stats
-from sklearn.gaussian_process import GaussianProcess
+
 from matplotlib import pyplot as pl
 from matplotlib import cm
 
-# Standard normal distribution functions
-phi = stats.distributions.norm().pdf
-PHI = stats.distributions.norm().cdf
-PHIinv = stats.distributions.norm().ppf
+from sklearn.gaussian_process import GaussianProcessClassifier
+from sklearn.gaussian_process.kernels import DotProduct
 
 # A few constants
 lim = 8
 
-
 def g(x):
     """The function to predict (classification will then consist in predicting
     whether g(x) <= 0 or not)"""
@@ -50,32 +43,28 @@ def g(x):
               [5.21301203, 4.26386883]])
 
 # Observations
-y = g(X)
+y = np.array(g(X) > 0, dtype=int)
 
 # Instanciate and fit Gaussian Process Model
-gp = GaussianProcess(theta0=5e-1)
-
-# Don't perform MLE or you'll get a perfect prediction for this simple example!
+kernel = 0.1 * DotProduct(0.1) ** 2
+gp = GaussianProcessClassifier(kernel=kernel)
 gp.fit(X, y)
+print "Learned kernel: %s " % gp.kernel_
 
-# Evaluate real function, the prediction and its MSE on a grid
+# Evaluate real function and the predicted probability
 res = 50
 x1, x2 = np.meshgrid(np.linspace(- lim, lim, res),
                      np.linspace(- lim, lim, res))
 xx = np.vstack([x1.reshape(x1.size), x2.reshape(x2.size)]).T
 
 y_true = g(xx)
-y_pred, MSE = gp.predict(xx, eval_MSE=True)
-sigma = np.sqrt(MSE)
+y_prob = gp.predict_proba(xx)
 y_true = y_true.reshape((res, res))
-y_pred = y_pred.reshape((res, res))
-sigma = sigma.reshape((res, res))
-k = PHIinv(.975)
+y_prob = y_prob.reshape((res, res))
 
-# Plot the probabilistic classification iso-values using the Gaussian property
-# of the prediction
+# Plot the probabilistic classification iso-values
 fig = pl.figure(1)
-ax = fig.add_subplot(111)
+ax = fig.gca()
 ax.axes.set_aspect('equal')
 pl.xticks([])
 pl.yticks([])
@@ -84,11 +73,12 @@ def g(x):
 pl.xlabel('$x_1$')
 pl.ylabel('$x_2$')
 
-cax = pl.imshow(np.flipud(PHI(- y_pred / sigma)), cmap=cm.gray_r, alpha=0.8,
-                extent=(- lim, lim, - lim, lim))
+cax = pl.imshow(y_prob, cmap=cm.gray_r, alpha=0.8,
+                extent=(-lim, lim, -lim, lim))
 norm = pl.matplotlib.colors.Normalize(vmin=0., vmax=0.9)
 cb = pl.colorbar(cax, ticks=[0., 0.2, 0.4, 0.6, 0.8, 1.], norm=norm)
 cb.set_label('${\\rm \mathbb{P}}\left[\widehat{G}(\mathbf{x}) \leq 0\\right]$')
+pl.clim(0, 1)
 
 pl.plot(X[y <= 0, 0], X[y <= 0, 1], 'r.', markersize=12)
 
@@ -96,15 +86,15 @@ def g(x):
 
 cs = pl.contour(x1, x2, y_true, [0.], colors='k', linestyles='dashdot')
 
-cs = pl.contour(x1, x2, PHI(- y_pred / sigma), [0.025], colors='b',
+cs = pl.contour(x1, x2, y_prob, [0.666], colors='b',
                 linestyles='solid')
 pl.clabel(cs, fontsize=11)
 
-cs = pl.contour(x1, x2, PHI(- y_pred / sigma), [0.5], colors='k',
+cs = pl.contour(x1, x2, y_prob, [0.5], colors='k',
                 linestyles='dashed')
 pl.clabel(cs, fontsize=11)
 
-cs = pl.contour(x1, x2, PHI(- y_pred / sigma), [0.975], colors='r',
+cs = pl.contour(x1, x2, y_prob, [0.334], colors='r',
                 linestyles='solid')
 pl.clabel(cs, fontsize=11)
 
diff --git a/examples/gaussian_process/new/plot_gpc_xor.py b/examples/gaussian_process/plot_gpc_xor.py
similarity index 100%
rename from examples/gaussian_process/new/plot_gpc_xor.py
rename to examples/gaussian_process/plot_gpc_xor.py
diff --git a/examples/gaussian_process/new/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py
similarity index 100%
rename from examples/gaussian_process/new/plot_gpr_co2.py
rename to examples/gaussian_process/plot_gpr_co2.py
diff --git a/examples/gaussian_process/new/plot_gpr_noisy.py b/examples/gaussian_process/plot_gpr_noisy.py
similarity index 100%
rename from examples/gaussian_process/new/plot_gpr_noisy.py
rename to examples/gaussian_process/plot_gpr_noisy.py
diff --git a/examples/gaussian_process/new/plot_gpr_prior_posterior.py b/examples/gaussian_process/plot_gpr_prior_posterior.py
similarity index 100%
rename from examples/gaussian_process/new/plot_gpr_prior_posterior.py
rename to examples/gaussian_process/plot_gpr_prior_posterior.py

From e144924eff8eaeaf41694adfc54f6febee9f78e5 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 8 Mar 2015 18:43:28 +0100
Subject: [PATCH 048/165] FIX Minor bugfixes and downward-compatibility/python3
 issue

---
 sklearn/gaussian_process/gpc.py            | 2 +-
 sklearn/gaussian_process/kernels.py        | 2 +-
 sklearn/gaussian_process/tests/test_gpr.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 9e8b3d2610728..6161f0472994a 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -55,7 +55,7 @@ def fit(self, X, y):
 
         # XXX: Assert that y is binary and labels are {0, 1}
         self.X_fit_ = X
-        self.y_fit_ = y
+        self.y_fit_ = check_array(y, ensure_2d=False, dtype=float)
 
         if self.optimizer == "fmin_l_bfgs_b":
             # Choose hyperparameters based on maximizing the log-marginal
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 54aadf3b0e1f2..685e3b6765447 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -128,7 +128,7 @@ def __eq__(self, b):
             return False
         params_a = self.get_params()
         params_b = b.get_params()
-        for key in set(params_a.keys() + params_b.keys()):
+        for key in set(list(params_a.keys()) + list(params_b.keys())):
             if np.any(params_a.get(key, None) != params_b.get(key, None)):
                 return False
         return True
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index 52996850de11a..4560c3c781141 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -54,7 +54,7 @@ def test_converged_to_local_maximum():
         lml, lml_gradient = \
             gpr.log_marginal_likelihood(gpr.kernel_.theta, True)
 
-        assert_true(np.all(np.isclose(lml_gradient, 0, atol=1e-5)
+        assert_true(np.all((np.abs(lml_gradient) < 1e-5)
                            | (gpr.kernel_.theta == gpr.kernel_.bounds[:, 0])
                            | (gpr.kernel_.theta == gpr.kernel_.bounds[:, 1])))
 

From 6c45af45dd397683a8cdf256291ebec490bf7a8c Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 10 Mar 2015 19:32:33 +0100
Subject: [PATCH 049/165] REFACTOR Rename GPR method sample to sample_y

---
 examples/gaussian_process/plot_gpr_prior_posterior.py | 4 ++--
 sklearn/gaussian_process/gpr.py                       | 2 +-
 sklearn/gaussian_process/tests/test_gpr.py            | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/gaussian_process/plot_gpr_prior_posterior.py b/examples/gaussian_process/plot_gpr_prior_posterior.py
index 72b4529bf95d8..958530b54100a 100644
--- a/examples/gaussian_process/plot_gpr_prior_posterior.py
+++ b/examples/gaussian_process/plot_gpr_prior_posterior.py
@@ -39,7 +39,7 @@
     plt.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)),
                      y_mean + np.sqrt(np.diag(y_cov)),
                      alpha=0.5, color='k')
-    y_samples = gp.sample(X_[:, np.newaxis], 10)
+    y_samples = gp.sample_y(X_[:, np.newaxis], 10)
     plt.plot(X_, y_samples, color='b', lw=1)
     plt.xlim(0, 5)
     plt.ylim(-3, 3)
@@ -60,7 +60,7 @@
                      y_mean + np.sqrt(np.diag(y_cov)),
                      alpha=0.5, color='k')
 
-    y_samples = gp.sample(X_[:, np.newaxis], 10)
+    y_samples = gp.sample_y(X_[:, np.newaxis], 10)
     plt.plot(X_, y_samples, color='b', lw=1)
     plt.scatter(X[:, 0], y, c='r', s=50, zorder=10)
     plt.xlim(0, 5)
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 0382837c69f76..01137d188fc2c 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -135,7 +135,7 @@ def predict(self, X, return_std=False, return_cov=False):
             else:
                 return y_mean
 
-    def sample(self, X, n_samples=1, random_state=0):
+    def sample_y(self, X, n_samples=1, random_state=0):
         rng = check_random_state(random_state)
 
         y_mean, y_cov = self.predict(X, return_cov=True)
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index 4560c3c781141..77bc1cfe986ef 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -110,7 +110,7 @@ def test_sample_statistics():
 
         y_mean, y_cov = gpr.predict(X2, return_cov=True)
 
-        samples = gpr.sample(X2, 1000000)
+        samples = gpr.sample_y(X2, 1000000)
 
         # More digits accuracy would require many more samples
         assert_almost_equal(y_mean, np.mean(samples, 1), 2)

From b1531eb0bd99839e4ad11e2e27394ad927a2bba3 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 11 Mar 2015 20:14:13 +0100
Subject: [PATCH 050/165] REFACTOR Compute standard deviation of predictive
 distribution at n points in O(n)

---
 sklearn/gaussian_process/gpr.py            | 28 +++++++++++++---------
 sklearn/gaussian_process/kernels.py        |  7 ++++++
 sklearn/gaussian_process/tests/test_gpr.py | 10 ++++++++
 3 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 01137d188fc2c..0be08844bfd9c 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -5,7 +5,7 @@
 # License: BSD 3 clause
 
 import numpy as np
-from scipy.linalg import cholesky, cho_solve, solve
+from scipy.linalg import cholesky, cho_solve, solve, solve_triangular
 from scipy.optimize import fmin_l_bfgs_b
 
 from sklearn.base import BaseEstimator, clone
@@ -113,9 +113,8 @@ def predict(self, X, return_std=False, return_cov=False):
                 y_cov = self.kernel(X)
                 return y_mean, y_cov
             elif return_std:
-                # XXX: Compute y_std more efficiently
-                y_std = np.sqrt(np.diag(self.kernel(X)))
-                return y_mean, y_std
+                y_var = np.apply_along_axis(self.kernel, 1, X)[:, 0]
+                return y_mean, np.sqrt(y_var)
             else:
                 return y_mean
         else:  # Predict based on GP posterior
@@ -123,15 +122,19 @@ def predict(self, X, return_std=False, return_cov=False):
             y_mean = K_trans.dot(self.alpha_)  # Line 4 (y_mean = f_star)
             if return_cov:
                 v = cho_solve((self.L_, True), K_trans.T)  # Line 5
-                y_cov = \
-                    self.kernel_(X) - K_trans.dot(v)  # Line 6
+                y_cov = self.kernel_(X) - K_trans.dot(v)  # Line 6
                 return y_mean, y_cov
             elif return_std:
-                # XXX: Compute y_std more efficiently
-                v = cho_solve((self.L_, True), K_trans.T)  # Line 5
-                y_cov = self.kernel_(X) - K_trans.dot(v)  # Line 6
-                y_std = np.sqrt(np.diag(y_cov))
-                return y_mean, y_std
+                # compute inverse K_inv of K based on its cholesky
+                # decomposition L and its inverse L_inv
+                L_inv = solve_triangular(self.L_.T, np.eye(self.L_.shape[0]))
+                K_inv = L_inv.dot(L_inv.T)
+                # Compute variance of predictive distribution
+                y_var = np.apply_along_axis(self.kernel_, 1, X)[:, 0]
+                y_var -= np.sum(K_trans.T[:, np.newaxis] * K_trans.T
+                                * K_inv[:, :, np.newaxis],
+                                axis=(0, 1))
+                return y_mean, np.sqrt(y_var)
             else:
                 return y_mean
 
@@ -167,6 +170,9 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
         if eval_gradient:  # compare Equation 5.9 from GPML
             tmp = np.outer(alpha, alpha)
             tmp -= cho_solve((L, True), np.eye(K.shape[0]))
+            # Compute "0.5 * trace(tmp.dot(K_gradient))" without constructing
+            # the full matrix tmp.dot(K_gradient) since only its diagonal is
+            # required
             gradient = 0.5 * np.einsum("ij,ijk->k", tmp, K_gradient)
             return log_likelihood, gradient
         else:
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 685e3b6765447..5f9cefbb10eb4 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -457,6 +457,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
+        X = np.atleast_2d(X)
         if Y is None:
             Y = X
         elif eval_gradient:
@@ -519,6 +520,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
+        X = np.atleast_2d(X)
         if Y is not None and eval_gradient:
             raise ValueError("Gradient can only be evaluated when Y is None.")
 
@@ -582,6 +584,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
+        X = np.atleast_2d(X)
         if Y is None:
             dists = pdist(X / self.l, metric='sqeuclidean')
             K = np.exp(-.5 * dists)
@@ -667,6 +670,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
+        X = np.atleast_2d(X)
         if Y is None:
             dists = squareform(pdist(X, metric='sqeuclidean'))
             tmp = dists / (2 * self.alpha * self.l ** 2)
@@ -742,6 +746,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
+        X = np.atleast_2d(X)
         if Y is None:
             dists = squareform(pdist(X, metric='euclidean'))
             arg = np.pi  * dists / self.p
@@ -814,6 +819,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
+        X = np.atleast_2d(X)
         if Y is None:
             K = np.inner(X, X) + self.sigma_0 ** 2
         else:
@@ -918,6 +924,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
+        X = np.atleast_2d(X)
         K = pairwise_kernels(X, Y, metric=self.metric, gamma=self.gamma,
                              filter_params=True, **self.kwargs)
         if eval_gradient:
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index 77bc1cfe986ef..e3c443ca5c025 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -124,3 +124,13 @@ def test_no_optimizer():
     gpr = GaussianProcessRegressor(kernel=kernel, optimizer=None).fit(X, y)
     assert_equal(gpr.kernel_.theta, 1.0)
     assert_equal(gpr.theta_, 1.0)
+
+
+def test_predict_cov_vs_std():
+    """ Test that predicted std.-dev. is consistent with cov's diagonal."""
+    for kernel in kernels:
+        gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
+        print gpr.kernel_
+        y_mean, y_cov = gpr.predict(X2, return_cov=True)
+        y_mean, y_std = gpr.predict(X2, return_std=True)
+        assert_almost_equal(np.sqrt(np.diag(y_cov)), y_std)

From 5293bbc1a6d90cf33457385400b41ca6c5daf9ae Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Fri, 13 Mar 2015 18:58:15 +0100
Subject: [PATCH 051/165] FIX GPC uses LabelEncoder in fit() and returns
 (n_samples, n_classes) array in predict_proba()

---
 sklearn/gaussian_process/gpc.py | 49 ++++++++++++++++++++-------------
 1 file changed, 30 insertions(+), 19 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 6161f0472994a..3dd45b9f11bd1 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -4,15 +4,17 @@
 #
 # License: BSD 3 clause
 
+import warnings
+
 import numpy as np
 from scipy.linalg import cholesky, cho_solve, solve
 from scipy.optimize import fmin_l_bfgs_b
 from scipy.special import erf
 
-from sklearn.base import BaseEstimator, clone
+from sklearn.base import BaseEstimator, ClassifierMixin, clone
 from sklearn.gaussian_process.kernels import RBF
 from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
-
+from sklearn.preprocessing import LabelEncoder
 
 # Values required for approximating the logistic sigmoid by
 # error functions. coefs are obtained via:
@@ -25,7 +27,7 @@
                   128.12323805, -2010.49422654])[:, np.newaxis]
 
 
-class GaussianProcessClassifier(BaseEstimator):
+class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
     """ Gaussian process classification (GPC).
 
     The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
@@ -47,15 +49,27 @@ def __init__(self, kernel=None, jitter=0.0, optimizer="fmin_l_bfgs_b"):
 
     def fit(self, X, y):
         if self.kernel is None:  # Use an RBF kernel as default
-            self.kernel_ = RBF()
+            self.kernel_ = 1.0 * RBF(1.0)
         else:
             self.kernel_ = clone(self.kernel)
 
         X, y = check_X_y(X, y)
 
-        # XXX: Assert that y is binary and labels are {0, 1}
         self.X_fit_ = X
-        self.y_fit_ = check_array(y, ensure_2d=False, dtype=float)
+
+        # Encode class labels and check that it is a binary classification
+        # problem
+        label_encoder = LabelEncoder()
+        self.y_fit_ = label_encoder.fit_transform(y)
+        self.classes_ = label_encoder.classes_
+        if self.classes_.size > 2:
+            raise ValueError("GaussianProcessClassifier supports only binary "
+                             "classification. y contains classes %s"
+                             % self.classes_)
+        elif self.classes_.size == 1:
+            warnings.warn("Only one class label (%s) occurrs in training set."
+                          % self.classes_)
+            self.classes_ = np.array([self.classes_[0], self.classes_[0]])
 
         if self.optimizer == "fmin_l_bfgs_b":
             # Choose hyperparameters based on maximizing the log-marginal
@@ -77,7 +91,7 @@ def obj_func(theta):
         self.K_ = self.kernel_(self.X_fit_)
         self.K_[np.diag_indices_from(self.K_)] += self.jitter
 
-        self.f_, _, (self.pi, self.W_sr, self.L, _, _) = \
+        self.f_, _, (self.pi_, self.W_sr_, self.L_, _, _) = \
             self._posterior_mode(self.K_, return_temporaries=True)
 
         return self
@@ -89,11 +103,10 @@ def predict(self, X):
         # As discussed on Section 3.4.2 of GPML, for making hard binary
         # decisions, it is enough to compute the MAP of the posterior and
         # pass it through the link function
-        K_star = \
-            self.kernel_(self.X_fit_, X)  # K_star =k(x_star)
-        f_star = K_star.T.dot(self.y_fit_ - self.pi)  # Line 4 (Algorithm 3.2)
+        K_star = self.kernel_(self.X_fit_, X)  # K_star =k(x_star)
+        f_star = K_star.T.dot(self.y_fit_ - self.pi_)  # Line 4 (Algorithm 3.2)
 
-        return f_star > 0
+        return np.where(f_star > 0, self.classes_[1], self.classes_[0])
 
     def predict_proba(self, X):
         check_is_fitted(self, ["X_fit_", "y_fit_", "K_", "f_"])
@@ -101,19 +114,17 @@ def predict_proba(self, X):
 
         # Based on Algorithm 3.2 of GPML
         K_star = self.kernel_(self.X_fit_, X)  # K_star =k(x_star)
-        f_star = K_star.T.dot(self.y_fit_ - self.pi)  # Line 4
-        v = solve(self.L, self.W_sr[:, np.newaxis] * K_star)  # Line 5
+        f_star = K_star.T.dot(self.y_fit_ - self.pi_)  # Line 4
+        v = solve(self.L_, self.W_sr_[:, np.newaxis] * K_star)  # Line 5
         var_f_star = self.kernel_(X) - v.T.dot(v)  # Line 6
 
         # Line 7:
         # Approximate \int log(z) * N(z | f_star, var_f_star)
         # Approximation is due to Williams & Barber, "Bayesian Classification
-        # with Gaussian Processes", Appendix A:
-        # Approximate the logistic sigmoid by a linear combination of
-        # 5 error functions
-        # See
+        # with Gaussian Processes", Appendix A: Approximate the logistic
+        # sigmoid by a linear combination of 5 error functions.
+        # For information on how this integral can be computed see
         # blitiri.blogspot.de/2012/11/gaussian-integral-of-error-function.html
-        # for information on how this integral can be computed
         alpha = 1 / (2 * np.diag(var_f_star))
         gamma = LAMBDAS * f_star
         integrals = np.sqrt(np.pi / alpha) \
@@ -121,7 +132,7 @@ def predict_proba(self, X):
             / (2 * np.sqrt(np.diag(var_f_star) * 2 * np.pi))
         pi_star = (COEFS * integrals).sum(axis=0) + .5 * COEFS.sum()
 
-        return pi_star
+        return np.vstack((1 - pi_star, pi_star)).T
 
     def log_marginal_likelihood(self, theta, eval_gradient=False):
         kernel = self.kernel_.clone_with_theta(theta)

From d922f5d5038b896c789d8e339fe5acf59d33f4f3 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Fri, 13 Mar 2015 18:59:11 +0100
Subject: [PATCH 052/165] FIX Adapted examples and tests to the changed return
 shape of GPC.predict_proba()

---
 examples/gaussian_process/plot_gpc.py                | 6 +++---
 examples/gaussian_process/plot_gpc_isoprobability.py | 2 +-
 examples/gaussian_process/plot_gpc_xor.py            | 2 +-
 sklearn/gaussian_process/tests/test_gpc.py           | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/gaussian_process/plot_gpc.py b/examples/gaussian_process/plot_gpc.py
index fdce10096763a..863d68725d8d4 100644
--- a/examples/gaussian_process/plot_gpc.py
+++ b/examples/gaussian_process/plot_gpc.py
@@ -40,13 +40,13 @@
 plt.figure(0)
 plt.scatter(X[:, 0], y)
 X_ = np.linspace(0, 5, 100)
-plt.plot(X_, gp_fix.predict_proba(X_[:, np.newaxis]), 'r',
+plt.plot(X_, gp_fix.predict_proba(X_[:, np.newaxis])[:, 1], 'r',
          label="Initial kernel: %s" % gp_fix.kernel_)
-plt.plot(X_, gp_opt.predict_proba(X_[:, np.newaxis]), 'b',
+plt.plot(X_, gp_opt.predict_proba(X_[:, np.newaxis])[:, 1], 'b',
          label="Optimized kernel: %s" % gp_opt.kernel_)
 plt.legend(loc="best")
 plt.xlabel("Feature")
-plt.ylabel("Class")
+plt.ylabel("Class 1 probability")
 
 # Plot LML landscape
 plt.figure(1)
diff --git a/examples/gaussian_process/plot_gpc_isoprobability.py b/examples/gaussian_process/plot_gpc_isoprobability.py
index 40441ee808511..e0d55c0160a67 100644
--- a/examples/gaussian_process/plot_gpc_isoprobability.py
+++ b/examples/gaussian_process/plot_gpc_isoprobability.py
@@ -58,7 +58,7 @@ def g(x):
 xx = np.vstack([x1.reshape(x1.size), x2.reshape(x2.size)]).T
 
 y_true = g(xx)
-y_prob = gp.predict_proba(xx)
+y_prob = gp.predict_proba(xx)[:, 1]
 y_true = y_true.reshape((res, res))
 y_prob = y_prob.reshape((res, res))
 
diff --git a/examples/gaussian_process/plot_gpc_xor.py b/examples/gaussian_process/plot_gpc_xor.py
index adfeb12d57244..1fb8cb90e15a4 100644
--- a/examples/gaussian_process/plot_gpc_xor.py
+++ b/examples/gaussian_process/plot_gpc_xor.py
@@ -26,7 +26,7 @@
 clf = GaussianProcessClassifier(kernel=kernel).fit(X, Y)
 
 # plot the decision function for each datapoint on the grid
-Z = clf.predict_proba(np.vstack((xx.ravel(), yy.ravel())).T)
+Z = clf.predict_proba(np.vstack((xx.ravel(), yy.ravel())).T)[:, 1]
 Z = Z.reshape(xx.shape)
 
 image = plt.imshow(Z, interpolation='nearest',
diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py
index aa64f141a158a..1bfb56a132bae 100644
--- a/sklearn/gaussian_process/tests/test_gpc.py
+++ b/sklearn/gaussian_process/tests/test_gpc.py
@@ -31,7 +31,7 @@ def test_predict_consistent():
     for kernel in kernels:
         gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
         assert_array_equal(gpc.predict(X),
-                           gpc.predict_proba(X) >=0.5)
+                           gpc.predict_proba(X)[:, 1] >=0.5)
 
 
 def test_lml_improving():

From 948949cf35739ac06b36f56b3fd9e196b28f71bd Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Fri, 13 Mar 2015 18:59:42 +0100
Subject: [PATCH 053/165] FIX GaussianProcessRegressor inherits from
 RegressorMixin

---
 sklearn/gaussian_process/gpr.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 0be08844bfd9c..7902323a32941 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -8,13 +8,13 @@
 from scipy.linalg import cholesky, cho_solve, solve, solve_triangular
 from scipy.optimize import fmin_l_bfgs_b
 
-from sklearn.base import BaseEstimator, clone
+from sklearn.base import BaseEstimator, RegressorMixin, clone
 from sklearn.gaussian_process.kernels import RBF
 from sklearn.utils import check_random_state
 from sklearn.utils.validation import check_X_y, check_array
 
 
-class GaussianProcessRegressor(BaseEstimator):
+class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
     """ Gaussian process regression (GPR).
 
     The implementation is based on Algorithm 2.1 of ``Gaussian Processes

From 5b1189a2aee6bcb4acfa516302d7b102810d0b70 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Fri, 13 Mar 2015 20:01:13 +0100
Subject: [PATCH 054/165] ENH Reducing memory-consumption of
 GPC.predict_proba()

We compute the diagonal of var_f_star directly and not the whole matrix.
---
 sklearn/gaussian_process/gpc.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 3dd45b9f11bd1..a7874218950a1 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -116,7 +116,9 @@ def predict_proba(self, X):
         K_star = self.kernel_(self.X_fit_, X)  # K_star =k(x_star)
         f_star = K_star.T.dot(self.y_fit_ - self.pi_)  # Line 4
         v = solve(self.L_, self.W_sr_[:, np.newaxis] * K_star)  # Line 5
-        var_f_star = self.kernel_(X) - v.T.dot(v)  # Line 6
+        # Compute np.diag(v.T.dot(v)) via einsum
+        var_f_star = np.apply_along_axis(self.kernel_, 1, X)[:, 0] \
+            - np.einsum("ij,ij->j", v, v) # Line 6
 
         # Line 7:
         # Approximate \int log(z) * N(z | f_star, var_f_star)
@@ -125,11 +127,11 @@ def predict_proba(self, X):
         # sigmoid by a linear combination of 5 error functions.
         # For information on how this integral can be computed see
         # blitiri.blogspot.de/2012/11/gaussian-integral-of-error-function.html
-        alpha = 1 / (2 * np.diag(var_f_star))
+        alpha = 1 / (2 * var_f_star)
         gamma = LAMBDAS * f_star
         integrals = np.sqrt(np.pi / alpha) \
             * erf(gamma * np.sqrt(alpha / (alpha + LAMBDAS**2))) \
-            / (2 * np.sqrt(np.diag(var_f_star) * 2 * np.pi))
+            / (2 * np.sqrt(var_f_star * 2 * np.pi))
         pi_star = (COEFS * integrals).sum(axis=0) + .5 * COEFS.sum()
 
         return np.vstack((1 - pi_star, pi_star)).T

From 1e7713bec4d4a57d84c35559ad7e6757ab006fe0 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Fri, 13 Mar 2015 20:02:13 +0100
Subject: [PATCH 055/165] FIX Numpy downward-compatible way of computing sum
 over two axes

---
 sklearn/gaussian_process/gpr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 7902323a32941..16472d1db618b 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -133,7 +133,7 @@ def predict(self, X, return_std=False, return_cov=False):
                 y_var = np.apply_along_axis(self.kernel_, 1, X)[:, 0]
                 y_var -= np.sum(K_trans.T[:, np.newaxis] * K_trans.T
                                 * K_inv[:, :, np.newaxis],
-                                axis=(0, 1))
+                                axis=0).sum(axis=0)  # axis=(0, 1)
                 return y_mean, np.sqrt(y_var)
             else:
                 return y_mean

From 6496f928faf583f282146c30d4c24aab9f7e3b47 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sat, 14 Mar 2015 10:11:29 +0100
Subject: [PATCH 056/165] ENH GPC can use warm-starts in _posterior_mode()

---
 sklearn/gaussian_process/gpc.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index a7874218950a1..9a2c8d66cee74 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -42,10 +42,12 @@ class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
       * binary classification
     """
 
-    def __init__(self, kernel=None, jitter=0.0, optimizer="fmin_l_bfgs_b"):
+    def __init__(self, kernel=None, jitter=0.0, optimizer="fmin_l_bfgs_b",
+                 warm_start=False):
         self.kernel = kernel
         self.jitter = jitter
         self.optimizer = optimizer
+        self.warm_start = warm_start
 
     def fit(self, X, y):
         if self.kernel is None:  # Use an RBF kernel as default
@@ -174,7 +176,15 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
 
     def _posterior_mode(self, K, return_temporaries=False):
         # Based on Algorithm 3.1 of GPML
-        f = np.zeros_like(self.y_fit_, dtype=np.float64)
+
+        # If warm_start is enabled, we reuse the last solution for the
+        # posterior mode as initialization; otherwise, we initialize with 0
+        if self.warm_start and hasattr(self, "f_cached") \
+           and self.f_cached.shape == self.y_fit_.shape:
+            f = self.f_cached
+        else:
+            f = np.zeros_like(self.y_fit_, dtype=np.float64)
+
         log_marginal_likelihood = -np.inf
         while True:
             # Line 4
@@ -204,6 +214,7 @@ def _posterior_mode(self, K, return_temporaries=False):
                 break
             log_marginal_likelihood = lml
 
+        self.f_cached = f  # Remember solution for later warm-starts
         if return_temporaries:
             return f, log_marginal_likelihood, (pi, W_sr, L, b, a)
         else:

From eec5c5072763a2f95174ff802221d4a3f6cfbde3 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sat, 14 Mar 2015 15:08:53 +0100
Subject: [PATCH 057/165] ENH Adding diag(X) method to GP kernels.

Using this method rather than np.diag(kernel(X)) or np.apply_along_axis
reduces memory-consumption and speeds up computation considerably
---
 sklearn/gaussian_process/gpc.py               |   5 +-
 sklearn/gaussian_process/gpr.py               |   4 +-
 sklearn/gaussian_process/kernels.py           | 153 ++++++++++++++++++
 .../gaussian_process/tests/test_kernels.py    |   9 ++
 4 files changed, 166 insertions(+), 5 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 9a2c8d66cee74..a021b46801bf1 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -118,9 +118,8 @@ def predict_proba(self, X):
         K_star = self.kernel_(self.X_fit_, X)  # K_star =k(x_star)
         f_star = K_star.T.dot(self.y_fit_ - self.pi_)  # Line 4
         v = solve(self.L_, self.W_sr_[:, np.newaxis] * K_star)  # Line 5
-        # Compute np.diag(v.T.dot(v)) via einsum
-        var_f_star = np.apply_along_axis(self.kernel_, 1, X)[:, 0] \
-            - np.einsum("ij,ij->j", v, v) # Line 6
+        # Line 6 (compute np.diag(v.T.dot(v)) via einsum)
+        var_f_star = self.kernel_.diag(X) - np.einsum("ij,ij->j", v, v)
 
         # Line 7:
         # Approximate \int log(z) * N(z | f_star, var_f_star)
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 16472d1db618b..b311560086615 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -113,7 +113,7 @@ def predict(self, X, return_std=False, return_cov=False):
                 y_cov = self.kernel(X)
                 return y_mean, y_cov
             elif return_std:
-                y_var = np.apply_along_axis(self.kernel, 1, X)[:, 0]
+                y_var = self.kernel.diag(X)
                 return y_mean, np.sqrt(y_var)
             else:
                 return y_mean
@@ -130,7 +130,7 @@ def predict(self, X, return_std=False, return_cov=False):
                 L_inv = solve_triangular(self.L_.T, np.eye(self.L_.shape[0]))
                 K_inv = L_inv.dot(L_inv.T)
                 # Compute variance of predictive distribution
-                y_var = np.apply_along_axis(self.kernel_, 1, X)[:, 0]
+                y_var = self.kernel_.diag(X)
                 y_var -= np.sum(K_trans.T[:, np.newaxis] * K_trans.T
                                 * K_inv[:, :, np.newaxis],
                                 axis=0).sum(axis=0)  # axis=(0, 1)
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 5f9cefbb10eb4..68ac3d045edbd 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -141,6 +141,25 @@ def __repr__(self):
     def __call__(self, X, Y=None, eval_gradient=False):
         """Evaluate the kernel."""
 
+    def diag(self, X):
+        """Returns the diagonal of the kernel k(X, X).
+
+        The result of this method is identical to np.diag(self(X)); however,
+        it can be evaluated more efficiently since only the diagonal is
+        evaluated.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples_X, n_features)
+            Left argument of the returned kernel k(X, Y)
+
+        Returns
+        -------
+        K_diag : array, shape (n_samples_X,)
+            Diagonal of kernel k(X, X)
+        """
+        return np.ones(X.shape[0])
+
     def is_stationary(self):
         """ Returns whether the kernel is stationary. """
         return True
@@ -247,6 +266,25 @@ def __call__(self, X, Y=None, eval_gradient=False):
         else:
             return self.k1(X, Y) + self.k2(X, Y)
 
+    def diag(self, X):
+        """Returns the diagonal of the kernel k(X, X).
+
+        The result of this method is identical to np.diag(self(X)); however,
+        it can be evaluated more efficiently since only the diagonal is
+        evaluated.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples_X, n_features)
+            Left argument of the returned kernel k(X, Y)
+
+        Returns
+        -------
+        K_diag : array, shape (n_samples_X,)
+            Diagonal of kernel k(X, X)
+        """
+        return self.k1.diag(X) + self.k2.diag(X)
+
     def __repr__(self):
         return "{0} + {1}".format(self.k1, self.k2)
 
@@ -300,6 +338,25 @@ def __call__(self, X, Y=None, eval_gradient=False):
         else:
             return self.k1(X, Y) * self.k2(X, Y)
 
+    def diag(self, X):
+        """Returns the diagonal of the kernel k(X, X).
+
+        The result of this method is identical to np.diag(self(X)); however,
+        it can be evaluated more efficiently since only the diagonal is
+        evaluated.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples_X, n_features)
+            Left argument of the returned kernel k(X, Y)
+
+        Returns
+        -------
+        K_diag : array, shape (n_samples_X,)
+            Diagonal of kernel k(X, X)
+        """
+        return self.k1.diag(X) * self.k2.diag(X)
+
     def __repr__(self):
         return "{0} * {1}".format(self.k1, self.k2)
 
@@ -382,6 +439,25 @@ def __call__(self, X, Y=None, eval_gradient=False):
             K = self.kernel(X, Y, eval_gradient=False)
             return K ** self.exponent
 
+    def diag(self, X):
+        """Returns the diagonal of the kernel k(X, X).
+
+        The result of this method is identical to np.diag(self(X)); however,
+        it can be evaluated more efficiently since only the diagonal is
+        evaluated.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples_X, n_features)
+            Left argument of the returned kernel k(X, Y)
+
+        Returns
+        -------
+        K_diag : array, shape (n_samples_X,)
+            Diagonal of kernel k(X, X)
+        """
+        return self.kernel.diag(X) ** self.exponent
+
     def __repr__(self):
         return "{0} ** {1}".format(self.kernel, self.exponent)
 
@@ -469,6 +545,25 @@ def __call__(self, X, Y=None, eval_gradient=False):
         else:
             return K
 
+    def diag(self, X):
+        """Returns the diagonal of the kernel k(X, X).
+
+        The result of this method is identical to np.diag(self(X)); however,
+        it can be evaluated more efficiently since only the diagonal is
+        evaluated.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples_X, n_features)
+            Left argument of the returned kernel k(X, Y)
+
+        Returns
+        -------
+        K_diag : array, shape (n_samples_X,)
+            Diagonal of kernel k(X, X)
+        """
+        return self.value * np.ones(X.shape[0])
+
     def __repr__(self):
         return "{0:.3g}".format(self.value)
 
@@ -536,6 +631,25 @@ def __call__(self, X, Y=None, eval_gradient=False):
             K[cdist(X, Y) < 1e-10] = self.c
             return K
 
+    def diag(self, X):
+        """Returns the diagonal of the kernel k(X, X).
+
+        The result of this method is identical to np.diag(self(X)); however,
+        it can be evaluated more efficiently since only the diagonal is
+        evaluated.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples_X, n_features)
+            Left argument of the returned kernel k(X, Y)
+
+        Returns
+        -------
+        K_diag : array, shape (n_samples_X,)
+            Diagonal of kernel k(X, X)
+        """
+        return self.c * np.ones(X.shape[0])
+
 
 class RBF(Kernel):
     """ Radial-basis function kernel (aka squared-exponential kernel).
@@ -835,6 +949,25 @@ def __call__(self, X, Y=None, eval_gradient=False):
         else:
             return K
 
+    def diag(self, X):
+        """Returns the diagonal of the kernel k(X, X).
+
+        The result of this method is identical to np.diag(self(X)); however,
+        it can be evaluated more efficiently since only the diagonal is
+        evaluated.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples_X, n_features)
+            Left argument of the returned kernel k(X, Y)
+
+        Returns
+        -------
+        K_diag : array, shape (n_samples_X,)
+            Diagonal of kernel k(X, X)
+        """
+        return (X ** 2).sum(1) + self.sigma_0 ** 2
+
     def is_stationary(self):
         """ Returns whether the kernel is stationary. """
         return False
@@ -937,6 +1070,26 @@ def f(gamma):  # helper function
         else:
             return K
 
+    def diag(self, X):
+        """Returns the diagonal of the kernel k(X, X).
+
+        The result of this method is identical to np.diag(self(X)); however,
+        it can be evaluated more efficiently since only the diagonal is
+        evaluated.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples_X, n_features)
+            Left argument of the returned kernel k(X, Y)
+
+        Returns
+        -------
+        K_diag : array, shape (n_samples_X,)
+            Diagonal of kernel k(X, X)
+        """
+        # We have to fall back to slow way of computing diagonal
+        return np.apply_along_axis(self, 1, X)[:, 0]
+
     def is_stationary(self):
         """ Returns whether the kernel is stationary. """
         return self.metric in ["rbf"]
diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index 5e076177fcf81..811ff6e383c6c 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -63,6 +63,15 @@ def test_auto_vs_cross():
         K_cross = kernel(X, X)
         assert_almost_equal(K_auto, K_cross, 5)
 
+
+def test_kernel_diag():
+    """ Test that diag method of kernel returns consistent results. """
+    for kernel in kernels:
+        K_call_diag = np.diag(kernel(X))
+        K_diag = kernel.diag(X)
+        assert_almost_equal(K_call_diag, K_diag, 5)
+
+
 def test_kernel_operator_commutative():
     """ Adding kernels and multiplying kernels should be commutative. """
     # Check addition

From 08ead1a8962c460c4938db6f21be2e66f68855ea Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sat, 14 Mar 2015 15:09:52 +0100
Subject: [PATCH 058/165] DOC Adding GPC to plot_classifier_comparison.py
 script and enabling warm_start

---
 .../classification/plot_classifier_comparison.py    | 13 +++++++++----
 examples/gaussian_process/plot_gpc_xor.py           |  2 +-
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/examples/classification/plot_classifier_comparison.py b/examples/classification/plot_classifier_comparison.py
index 7be3c0cacad9f..7e2b953df8dc7 100644
--- a/examples/classification/plot_classifier_comparison.py
+++ b/examples/classification/plot_classifier_comparison.py
@@ -36,6 +36,8 @@
 from sklearn.datasets import make_moons, make_circles, make_classification
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.svm import SVC
+from sklearn.gaussian_process import GaussianProcessClassifier
+from sklearn.gaussian_process.kernels import RBF
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
 from sklearn.naive_bayes import GaussianNB
@@ -44,12 +46,14 @@
 
 h = .02  # step size in the mesh
 
-names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
-         "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
+names = ["3 Near. Neighb.", "Linear SVM", "RBF SVM", "RBF GPC",
+         "Decision Tree", "Random Forest", "AdaBoost", "Naive Bayes", "LDA",
+         "QDA"]
 classifiers = [
     KNeighborsClassifier(3),
     SVC(kernel="linear", C=0.025),
     SVC(gamma=2, C=1),
+    GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
     DecisionTreeClassifier(max_depth=5),
     RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
     AdaBoostClassifier(),
@@ -75,7 +79,8 @@
     # preprocess dataset, split into training and test part
     X, y = ds
     X = StandardScaler().fit_transform(X)
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
+    X_train, X_test, y_train, y_test = \
+        train_test_split(X, y, test_size=.4, random_state=42)
 
     x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
     y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
@@ -128,5 +133,5 @@
                 size=15, horizontalalignment='right')
         i += 1
 
-figure.subplots_adjust(left=.02, right=.98)
+plt.tight_layout()
 plt.show()
diff --git a/examples/gaussian_process/plot_gpc_xor.py b/examples/gaussian_process/plot_gpc_xor.py
index 1fb8cb90e15a4..8563b1e3a504b 100644
--- a/examples/gaussian_process/plot_gpc_xor.py
+++ b/examples/gaussian_process/plot_gpc_xor.py
@@ -23,7 +23,7 @@
 
 # fit the model
 kernel = 1.0 * RBF(1.0)
-clf = GaussianProcessClassifier(kernel=kernel).fit(X, Y)
+clf = GaussianProcessClassifier(kernel=kernel, warm_start=True).fit(X, Y)
 
 # plot the decision function for each datapoint on the grid
 Z = clf.predict_proba(np.vstack((xx.ravel(), yy.ravel())).T)[:, 1]

From 4ac2b12ea5b8d9a668600c98593ff11a33a1abf9 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 15 Mar 2015 11:46:41 +0100
Subject: [PATCH 059/165] DOC gpr module fully documented

---
 sklearn/gaussian_process/gpr.py | 126 ++++++++++++++++++++++++++++----
 1 file changed, 111 insertions(+), 15 deletions(-)

diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index b311560086615..8777bf5cd1e34 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -22,10 +22,10 @@ class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
 
     In addition to standard sklearn estimators, GaussianProcessRegressor
        * allows prediction without prior fitting (based on the GP prior)
-       * provides an additional method sample(X), which evaluates samples drawn
-         from the GPR (prior or posterior) at given inputs
+       * provides an additional method sample_y(X), which evaluates samples
+         drawn from the GPR (prior or posterior) at given inputs
        * exposes a method log_marginal_likelihood(theta), which can be used
-         externally for other ways of selecting hyperparamters, e.g., via
+         externally for other ways of selecting hyperparameters, e.g., via
          Markov chain Monte Carlo.
 
     Parameters
@@ -41,22 +41,27 @@ class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
     optimizer : string, optional (default: "fmin_l_bfgs_b")
         A string specifying the optimization algorithm used for optimizing the
         kernel's parameters. Default uses 'fmin_l_bfgs_b' algorithm from
-        scipy.optimize. If None, the kernel's paramters are kept fixed.
+        scipy.optimize. If None, the kernel's parameters are kept fixed.
         Available optimizers are::
 
             'fmin_l_bfgs_b'
 
     Attributes
     ----------
-    X_fit_:
+    X_fit_ : array-like, shape = (n_samples, n_features)
+        Feature values in training data (also required for prediction)
 
-    y_fit_:
+    y_fit_ : array-like, shape = (n_samples,)
+        Target values in training data (also required for prediction)
 
-    theta_:
+    theta_ : array-like, shape = (n_kernel_params,)
+        Selected kernel hyperparameters
 
-    L_:
+    L_ : array-like, shape = (n_samples, n_samples)
+        Lower-triangular Cholesky decomposition of the kernel in X_fit_
 
-    alpha_:
+    alpha_ : array-like, shape = (n_samples,)
+        Dual coefficients of training data points in kernel space
     """
 
     def __init__(self, kernel=None, y_err=1e-10, optimizer="fmin_l_bfgs_b"):
@@ -65,6 +70,20 @@ def __init__(self, kernel=None, y_err=1e-10, optimizer="fmin_l_bfgs_b"):
         self.optimizer = optimizer
 
     def fit(self, X, y):
+        """Fit Gaussian process regression model
+
+        Parameters
+        ----------
+        X : array-like, shape = (n_samples, n_features)
+            Training data
+
+        y : array-like, shape = (n_samples, )
+            Target values
+
+        Returns
+        -------
+        self : returns an instance of self.
+        """
         if self.kernel is None:  # Use an RBF kernel as default
             self.kernel_ = RBF()
         else:
@@ -86,6 +105,7 @@ def obj_func(theta):
                                               bounds=self.kernel_.bounds)
             self.kernel_.theta = self.theta_
         elif self.optimizer is None:
+            # Use initially provided hyperparameters
             self.theta_ = self.kernel_.theta
         else:
             raise ValueError("Unknown optimizer %s." % self.optimizer)
@@ -100,6 +120,39 @@ def obj_func(theta):
         return self
 
     def predict(self, X, return_std=False, return_cov=False):
+        """Predict using the Gaussian process regression model
+
+        We can also predict based on an unfitted model by using the GP prior.
+        In addition to the mean of the predictive distribution, its standard
+        deviation (return_std=True) or covariance (return_cov=True) can also
+        be returned. Note that at most one of the two can be requested.
+
+        Parameters
+        ----------
+        X : array-like, shape = (n_samples, n_features)
+            Query points where the GP is evaluated
+
+        return_std : bool, default: False
+            If True, the standard-deviation of the predictive distribution at
+            the query points is returned along with the mean.
+
+        return_cov : bool, default: False
+            If True, the covariance of the joint predictive distribution at
+            the query points is returned along with the mean
+
+        Returns
+        -------
+        y_mean : array, shape = (n_samples,)
+            Mean of predictive distribution at query points
+
+        y_std : array, shape = (n_samples,), optional
+            Standard deviation of predictive distribution at query points.
+            Only returned when return_std is True
+
+        y_cov : array, shape = (n_samples, n_samples), optional
+            Covariance of joint predictive distribution at query points.
+            Only returned when return_cov is True
+        """
         if return_std and return_cov:
             raise RuntimeError(
                 "Not returning standard deviation of predictions when "
@@ -125,7 +178,7 @@ def predict(self, X, return_std=False, return_cov=False):
                 y_cov = self.kernel_(X) - K_trans.dot(v)  # Line 6
                 return y_mean, y_cov
             elif return_std:
-                # compute inverse K_inv of K based on its cholesky
+                # compute inverse K_inv of K based on its Cholesky
                 # decomposition L and its inverse L_inv
                 L_inv = solve_triangular(self.L_.T, np.eye(self.L_.shape[0]))
                 K_inv = L_inv.dot(L_inv.T)
@@ -133,12 +186,31 @@ def predict(self, X, return_std=False, return_cov=False):
                 y_var = self.kernel_.diag(X)
                 y_var -= np.sum(K_trans.T[:, np.newaxis] * K_trans.T
                                 * K_inv[:, :, np.newaxis],
-                                axis=0).sum(axis=0)  # axis=(0, 1)
+                                axis=0).sum(axis=0)  # axis=(0, 1) in np >= 1.7
                 return y_mean, np.sqrt(y_var)
             else:
                 return y_mean
 
     def sample_y(self, X, n_samples=1, random_state=0):
+        """Draw samples from Gaussian process and evaluate at X.
+
+        Parameters
+        ----------
+        X : array-like, shape = (n_samples_X, n_features)
+            Query points where the GP samples are evaluated
+
+        n_samples : int, default: 1
+            The number of samples drawn from the Gaussian process
+
+        random_state : RandomState or an int seed (0 by default)
+            A random number generator instance
+
+        Returns
+        -------
+        y_samples : array, shape = (n_samples_X, n_samples)
+            Values of n_samples samples drawn from Gaussian process and
+            evaluated at query points.
+        """
         rng = check_random_state(random_state)
 
         y_mean, y_cov = self.predict(X, return_cov=True)
@@ -146,6 +218,29 @@ def sample_y(self, X, n_samples=1, random_state=0):
         return y_samples
 
     def log_marginal_likelihood(self, theta, eval_gradient=False):
+        """ Returns log-marginal likelihood of theta for training data.
+
+        Parameters
+        ----------
+        theta : array-like, shape = (n_kernel_params,)
+            Kernel hyperparameters for which the log-marginal likelihood is
+            evaluated
+
+        eval_gradient : bool, default: False
+            If True, the gradient of the log-marginal likelihood with respect
+            to the kernel hyperparameters at position theta is returned
+            additionally.
+
+        Returns
+        -------
+        log_likelihood : float
+            Log-marginal likelihood of theta for training data.
+
+        log_likelihood_gradient : array, shape = (n_kernel_params,), optional
+            Gradient of the log-marginal likelihood with respect to the kernel
+            hyperparameters at position theta.
+            Only returned when eval_gradient is True.
+        """
         kernel = self.kernel_.clone_with_theta(theta)
 
         if eval_gradient:
@@ -157,8 +252,8 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
         try:
             L = cholesky(K, lower=True)  # Line 2
         except np.linalg.LinAlgError:
-            return (-np.inf, np.zeros_like(theta))\
-                 if eval_gradient else -np.inf
+            return (-np.inf, np.zeros_like(theta)) \
+                if eval_gradient else -np.inf
 
         alpha = cho_solve((L, True), self.y_fit_)  # Line 3
 
@@ -173,7 +268,8 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
             # Compute "0.5 * trace(tmp.dot(K_gradient))" without constructing
             # the full matrix tmp.dot(K_gradient) since only its diagonal is
             # required
-            gradient = 0.5 * np.einsum("ij,ijk->k", tmp, K_gradient)
-            return log_likelihood, gradient
+            log_likelihood_gradient = \
+                0.5 * np.einsum("ij,ijk->k", tmp, K_gradient)
+            return log_likelihood, log_likelihood_gradient
         else:
             return log_likelihood

From f5bb2a7451fa1dfe672d874c861d1c62af7f601a Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 15 Mar 2015 17:00:54 +0100
Subject: [PATCH 060/165] DOC gpc module fully documented

---
 sklearn/gaussian_process/gpc.py | 154 +++++++++++++++++++++++++++++---
 sklearn/gaussian_process/gpr.py |  12 ++-
 2 files changed, 149 insertions(+), 17 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index a021b46801bf1..142e1674045a0 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -1,4 +1,4 @@
-"""Gaussian processes classification based on the Laplace approximation. """
+"""Gaussian processes classification."""
 
 # Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
 #
@@ -28,7 +28,7 @@
 
 
 class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
-    """ Gaussian process classification (GPC).
+    """ Gaussian process classification (GPC) based on Laplace approximation.
 
     The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
     ``Gaussian Processes for Machine Learning'' (GPML) by Rasmussen and
@@ -39,7 +39,64 @@ class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
 
     Currently, the implementation is restricted to
       * using the logistic link function
-      * binary classification
+      * and binary classification
+
+    Parameters
+    ----------
+    kernel : kernel object
+        The kernel specifying the covariance function of the GP. If None is
+        passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
+        the kernel's hyperparameters are optimized during fitting.
+
+    jitter : float, optional (default: 0.0)
+        Value added to the diagonal of the kernel matrix during fitting.
+        Larger values correspond to increased noise level in the observations
+        and reduce potential numerical issues during fitting.
+
+    optimizer : string, optional (default: "fmin_l_bfgs_b")
+        A string specifying the optimization algorithm used for optimizing the
+        kernel's parameters. Default uses 'fmin_l_bfgs_b' algorithm from
+        scipy.optimize. If None, the kernel's parameters are kept fixed.
+        Available optimizers are::
+
+            'fmin_l_bfgs_b'
+
+    warm_start : bool, optional (default: False)
+        If warm-starts are enabled, the solution of the last Newton iteration
+        on the Laplace approximation of the posterior mode is used as
+        initialization for the next call of _posterior_mode(). This can speed
+        up convergence when _posterior_mode is called several times on similar
+        problems as in hyperparameter optimization.
+
+
+    Attributes
+    ----------
+    X_fit_ : array-like, shape = (n_samples, n_features)
+        Feature values in training data (also required for prediction)
+
+    y_fit_: array-like, shape = (n_samples,)
+        Target values in training data (also required for prediction)
+
+    classes_ : array-like, shape = (n_classes,)
+        Unique class labels.
+
+    kernel_: kernel object
+        The kernel used for prediction. The structure of the kernel is the
+        same as the one passed as parameter but with optimized hyperparameters
+
+    theta_: array-like, shape = (n_kernel_params,)
+        Selected kernel hyperparameters
+
+    L_: array-like, shape = (n_samples, n_samples)
+        Lower-triangular Cholesky decomposition of the kernel in X_fit_
+
+    pi_: array-like, shape = (n_samples,)
+        The probabilities of the positive class for the training points X_fit_
+
+    W_sr_: array-like, shape = (n_samples,)
+        Square root of W, the negative Hessian of the log-likelihood of the
+        latent function values for the observed labels. Since W is diagonal,
+        only the diagonal of sqrt(W) is stored.
     """
 
     def __init__(self, kernel=None, jitter=0.0, optimizer="fmin_l_bfgs_b",
@@ -50,6 +107,20 @@ def __init__(self, kernel=None, jitter=0.0, optimizer="fmin_l_bfgs_b",
         self.warm_start = warm_start
 
     def fit(self, X, y):
+        """Fit Gaussian process classification model
+
+        Parameters
+        ----------
+        X : array-like, shape = (n_samples, n_features)
+            Training data
+
+        y : array-like, shape = (n_samples,)
+            Target values, must be binary
+
+        Returns
+        -------
+        self : returns an instance of self.
+        """
         if self.kernel is None:  # Use an RBF kernel as default
             self.kernel_ = 1.0 * RBF(1.0)
         else:
@@ -90,16 +161,27 @@ def obj_func(theta):
 
         # Precompute quantities required for predictions which are independent
         # of actual query points
-        self.K_ = self.kernel_(self.X_fit_)
-        self.K_[np.diag_indices_from(self.K_)] += self.jitter
+        K = self.kernel_(self.X_fit_)
+        K[np.diag_indices_from(K)] += self.jitter
 
-        self.f_, _, (self.pi_, self.W_sr_, self.L_, _, _) = \
-            self._posterior_mode(self.K_, return_temporaries=True)
+        _, (self.pi_, self.W_sr_, self.L_, _, _) = \
+            self._posterior_mode(K, return_temporaries=True)
 
         return self
 
     def predict(self, X):
-        check_is_fitted(self, ["X_fit_", "y_fit_", "K_", "f_"])
+        """Perform classification on an array of test vectors X.
+
+        Parameters
+        ----------
+        X : array-like, shape = (n_samples, n_features)
+
+        Returns
+        -------
+        C : array, shape = (n_samples,)
+            Predicted target values for X, values are from classes_
+        """
+        check_is_fitted(self, ["X_fit_", "y_fit_", "pi_", "W_sr_", "L_"])
         X = check_array(X)
 
         # As discussed on Section 3.4.2 of GPML, for making hard binary
@@ -111,7 +193,20 @@ def predict(self, X):
         return np.where(f_star > 0, self.classes_[1], self.classes_[0])
 
     def predict_proba(self, X):
-        check_is_fitted(self, ["X_fit_", "y_fit_", "K_", "f_"])
+        """Return probability estimates for the test vector X.
+
+        Parameters
+        ----------
+        X : array-like, shape = (n_samples, n_features)
+
+        Returns
+        -------
+        C : array-like, shape = (n_samples, n_classes)
+            Returns the probability of the samples for each class in
+            the model. The columns correspond to the classes in sorted
+            order, as they appear in the attribute `classes_`.
+        """
+        check_is_fitted(self, ["X_fit_", "y_fit_", "pi_", "W_sr_", "L_"])
         X = check_array(X)
 
         # Based on Algorithm 3.2 of GPML
@@ -138,6 +233,29 @@ def predict_proba(self, X):
         return np.vstack((1 - pi_star, pi_star)).T
 
     def log_marginal_likelihood(self, theta, eval_gradient=False):
+        """ Returns log-marginal likelihood of theta for training data.
+
+        Parameters
+        ----------
+        theta : array-like, shape = (n_kernel_params,)
+            Kernel hyperparameters for which the log-marginal likelihood is
+            evaluated
+
+        eval_gradient : bool, default: False
+            If True, the gradient of the log-marginal likelihood with respect
+            to the kernel hyperparameters at position theta is returned
+            additionally.
+
+        Returns
+        -------
+        log_likelihood : float
+            Log-marginal likelihood of theta for training data.
+
+        log_likelihood_gradient : array, shape = (n_kernel_params,), optional
+            Gradient of the log-marginal likelihood with respect to the kernel
+            hyperparameters at position theta.
+            Only returned when eval_gradient is True.
+        """
         kernel = self.kernel_.clone_with_theta(theta)
 
         if eval_gradient:
@@ -147,8 +265,9 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
 
         K[np.diag_indices_from(K)] += self.jitter
 
-        # Return temporaries
-        f, Z, (pi, W_sr, L, b, a) = \
+        # Compute log-marginal-likelihood Z and also store some temporaries
+        # which can be reused for computing Z's gradient
+        Z, (pi, W_sr, L, b, a) = \
             self._posterior_mode(K, return_temporaries=True)
 
         if not eval_gradient:
@@ -163,7 +282,7 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
         s_2 = -0.5*(np.diag(K) - np.diag(C.T.dot(C))) \
             * (pi * (1 - pi) * (1 - 2*pi))  # third derivative
         for j in range(d_Z.shape[0]):
-            C = K_gradient[..., j]   # Line 11
+            C = K_gradient[:, :, j]   # Line 11
             s_1 = .5 * a.T.dot(C).dot(a) - .5 * np.trace(R.dot(C))  # Line 12
 
             b = C.dot(self.y_fit_ - pi)  # Line 13
@@ -174,6 +293,12 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
         return Z, d_Z
 
     def _posterior_mode(self, K, return_temporaries=False):
+        """ Mode-finding for binary Laplace GPC and fixed kernel.
+
+        This approximates the posterior of the latent function values for given
+        inputs and target observations with a Gaussian approximation and uses
+        Newton's iteration to find the mode of this approximation.
+        """
         # Based on Algorithm 3.1 of GPML
 
         # If warm_start are enabled, we reuse the last solution for the
@@ -184,6 +309,7 @@ def _posterior_mode(self, K, return_temporaries=False):
         else:
             f = np.zeros_like(self.y_fit_, dtype=np.float64)
 
+        # Use Newton's iteration method to find mode of Laplace approximation
         log_marginal_likelihood = -np.inf
         while True:
             # Line 4
@@ -215,6 +341,6 @@ def _posterior_mode(self, K, return_temporaries=False):
 
         self.f_cached = f  # Remember solution for later warm-starts
         if return_temporaries:
-            return f, log_marginal_likelihood, (pi, W_sr, L, b, a)
+            return log_marginal_likelihood, (pi, W_sr, L, b, a)
         else:
-            return f, log_marginal_likelihood
+            return log_marginal_likelihood
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 8777bf5cd1e34..1e4cf09917d85 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -30,8 +30,10 @@ class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
 
     Parameters
     ----------
-    kernel : Kernel object
-        The kernel specifying the covariance function of the GP.
+    kernel : kernel object
+        The kernel specifying the covariance function of the GP. If None is
+        passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
+        the kernel's hyperparameters are optimized during fitting.
 
     y_err : float, optional (default: 1e-10)
         Value added to the diagonal of the kernel matrix during fitting.
@@ -54,6 +56,10 @@ class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
     y_fit_: array-like, shape = (n_samples,)
         Target values in training data (also required for prediction)
 
+    kernel_: kernel object
+        The kernel used for prediction. The structure of the kernel is the
+        same as the one passed as parameter but with optimized hyperparameters
+
     theta_: array-like, shape =(n_kernel_params,)
         Selected kernel hyperparameters
 
@@ -85,7 +91,7 @@ def fit(self, X, y):
         self : returns an instance of self.
         """
         if self.kernel is None:  # Use an RBF kernel as default
-            self.kernel_ = RBF()
+            self.kernel_ = 1.0 * RBF(1.0)
         else:
             self.kernel_ = clone(self.kernel)
 

From 64ec6ae5247aba436e9113186058ec6078f80359 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 29 Mar 2015 10:21:56 +0200
Subject: [PATCH 061/165] REFACTOR Kernel hyperparameters are now set via their
 name

The property theta, its gradient and bounds are used solely for optimization but not during kernel specification.
---
 sklearn/gaussian_process/kernels.py | 295 ++++++++++++++++------------
 1 file changed, 170 insertions(+), 125 deletions(-)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 68ac3d045edbd..0413f3bdbe0e9 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -35,13 +35,6 @@
 class Kernel(six.with_metaclass(ABCMeta)):
     """ Base class for all kernels."""
 
-    def __init__(self, theta=1.0, thetaL=1e-5, thetaU=np.inf):
-        if not np.iterable(theta):
-            theta = np.array([theta])
-        self.theta = np.asarray(theta, dtype=np.float)
-        self.bounds = (np.asarray(thetaL, dtype=np.float),
-                       np.asarray(thetaU, dtype=np.float))
-
     def get_params(self, deep=True):
         """Get parameters of this kernel.
 
@@ -56,8 +49,7 @@ def get_params(self, deep=True):
         params : mapping of string to any
             Parameter names mapped to their values.
         """
-        params = dict(theta=self.theta, thetaL=self.bounds[:, 0],
-                      thetaU=self.bounds[:, 1])
+        params = dict()
 
         # introspect the constructor arguments to find the model parameters
         # to represent
@@ -70,9 +62,8 @@ def get_params(self, deep=True):
                                " of their __init__ (no varargs)."
                                " %s doesn't follow this convention."
                                % (cls, ))
-        # Remove 'self', theta, thetaL, and thetaU, and store remaining
-        # arguments in params
-        args = args[4:]
+        # Remove 'self' and store remaining arguments in params
+        args = args[1:]
         for arg in args:
             params[arg] = getattr(self, arg, None)
         return params
@@ -88,17 +79,33 @@ def n_dims(self):
         """ Returns the number of hyperparameters of the kernel."""
         return self.theta.shape[0]
 
+    @property
+    def theta(self):
+        return np.array([getattr(self, var_name)
+                         for var_name in self.theta_vars])
+
+    @theta.setter
+    def theta(self, theta):
+        if len(theta) != len(self.theta_vars):
+            raise ValueError("theta has not the correct number of entries."
+                             " Should be %d; given are %d"
+                             % (len(self.theta_vars), len(theta)))
+        for i, var_name in enumerate(self.theta_vars):
+            setattr(self, var_name, theta[i])
+
     @property
     def bounds(self):
-        return np.vstack((self.l_bound, self.u_bound)).T
+        return np.array([getattr(self, var_name + "_bounds")
+                         for var_name in self.theta_vars])
 
     @bounds.setter
     def bounds(self, bounds):
-        self.l_bound, self.u_bound = bounds
-        if not np.iterable(self.l_bound):
-             self.l_bound = np.ones_like(self.theta) * self.l_bound
-        if not np.iterable(self.u_bound):
-             self.u_bound = np.ones_like(self.theta) * self.u_bound
+        if len(bounds) != len(self.theta_vars):
+            raise ValueError("theta has not the correct number of entries."
+                             " Should be %d; given are %d"
+                             % (len(self.theta_vars), len(bounds)))
+        for i, var_name in enumerate(self.theta_vars):
+            setattr(self, var_name + "_bounds", bounds[i])
 
     def __add__(self, b):
         if not isinstance(b, Kernel):
@@ -195,26 +202,30 @@ def theta(self):
 
     @theta.setter
     def theta(self, theta):
-        i = self.k1.n_dims
-        self.k1.theta = theta[:i]
-        self.k2.theta = theta[i:]
+        k1_dims = self.k1.n_dims
+        self.k1.theta = theta[:k1_dims]
+        self.k2.theta = theta[k1_dims:]
 
     @property
     def bounds(self):
+        if self.k1.bounds.size == 0:
+            return self.k2.bounds
+        if self.k2.bounds.size == 0:
+            return self.k1.bounds
         return np.vstack((self.k1.bounds, self.k2.bounds))
 
     @bounds.setter
     def bounds(self, bounds):
-        i = self.k1.n_dims
-        self.k1.bounds = bounds[:i]
-        self.k2.bounds = bounds[i:]
+        k1_dims = self.k1.n_dims
+        self.k1.bounds = bounds[:k1_dims]
+        self.k2.bounds = bounds[k1_dims:]
 
     def __eq__(self, b):
         return (self.k1 == b.k1 and self.k2 == b.k2) \
             or (self.k1 == b.k2 and self.k2 == b.k1)
 
     def is_stationary(self):
-        """ Retuuns whether the kernel is stationary. """
+        """ Returns whether the kernel is stationary. """
         return self.k1.is_stationary() and self.k2.is_stationary()
 
 
@@ -475,38 +486,32 @@ class ConstantKernel(Kernel):
 
     Tunable kernel parameters
     -------------------------
-    value : float
+    c : float
         The constant value used for determining the magnitude (product-kernel)
         or offset of mean (sum-kernel).
     """
+    def __init__(self, c=1.0, c_bounds=(0, np.inf)):
+        self.c = c
+        self.c_bounds = c_bounds
+
+        self.theta_vars = ["c"] if c_bounds is not "fixed" else []
 
     @classmethod
     def from_literal(cls, literal):
         if np.iterable(literal):
             if len(literal) == 1:
-                return cls(literal[0])
+                return cls(c=literal[0])
             elif len(literal) == 2:
-                return cls((literal[0] + literal[1]) / 2, literal[0],
-                            literal[1])
+                return cls(c=(literal[0] + literal[1]) / 2,
+                           c_bounds=(literal[0], literal[1]))
             elif len(literal) == 3:
-                return cls(literal[1], literal[0], literal[2])
+                return cls(c=literal[1], c_bounds=(literal[0], literal[2]))
             else:
                 raise ValueError("Cannot interpret literal %s for "
                                  "ConstantKernel." % literal)
         else:
             return cls(literal)
 
-    @property
-    def theta(self):
-        return np.array([self.value])
-
-    @theta.setter
-    def theta(self, theta):
-        if len(theta) != 1:
-            raise ValueError("theta has not the correct number of entries."
-                             " Should be 1; given are %d" % len(theta))
-        self.value = theta[0]
-
     def __call__(self, X, Y=None, eval_gradient=False):
         """ Return the kernel k(X, Y) and optionally its gradient.
 
@@ -539,9 +544,12 @@ def __call__(self, X, Y=None, eval_gradient=False):
         elif eval_gradient:
             raise ValueError("Gradient can only be evaluated when Y is None.")
 
-        K = self.value * np.ones((X.shape[0], Y.shape[0]))
+        K = self.c * np.ones((X.shape[0], Y.shape[0]))
         if eval_gradient:
-            return K, np.ones((X.shape[0], X.shape[0], 1))
+            if self.c_bounds is not "fixed":
+                return K, np.ones((X.shape[0], X.shape[0], 1))
+            else:
+                return K, np.empty((X.shape[0], X.shape[0], 0))
         else:
             return K
 
@@ -562,10 +570,10 @@ def diag(self, X):
         K_diag : array, shape (n_samples_X,)
             Diagonal of kernel k(X, X)
         """
-        return self.value * np.ones(X.shape[0])
+        return self.c * np.ones(X.shape[0])
 
     def __repr__(self):
-        return "{0:.3g}".format(self.value)
+        return "{0:.3g}".format(self.c)
 
 
 class WhiteKernel(Kernel):
@@ -580,14 +588,11 @@ class WhiteKernel(Kernel):
     c : float
         Parameter controlling the noise level.
     """
+    def __init__(self, c=1.0, c_bounds=(0.0, np.inf)):
+        self.c = c
+        self.c_bounds = c_bounds
 
-    @property
-    def theta(self):
-        return np.asarray([self.c])
-
-    @theta.setter
-    def theta(self, theta):
-        self.c = theta[0]
+        self.theta_vars = ["c"] if c_bounds is not "fixed" else []
 
     def __call__(self, X, Y=None, eval_gradient=False):
         """ Return the kernel k(X, Y) and optionally its gradient.
@@ -622,7 +627,10 @@ def __call__(self, X, Y=None, eval_gradient=False):
         if Y is None:
             K = self.c * np.eye(X.shape[0])
             if eval_gradient:
-                return K, np.eye(X.shape[0])[:, :, np.newaxis]
+                if self.c_bounds is not "fixed":
+                    return K, np.eye(X.shape[0])[:, :, np.newaxis]
+                else:
+                    return K, np.empty((X.shape[0], X.shape[0], 0))
             else:
                 return K
         else:
@@ -650,6 +658,9 @@ def diag(self, X):
         """
         return self.c * np.ones(X.shape[0])
 
+    def __repr__(self):
+        return "{0}(c={1:.3g})".format(self.__class__.__name__, self.c)
+
 
 class RBF(Kernel):
     """ Radial-basis function kernel (aka squared-exponential kernel).
@@ -663,14 +674,15 @@ class RBF(Kernel):
         used. If an array, an anisotropic kernel is used where each dimension
         of l defines the length-scale of the respective feature dimension.
     """
+    def __init__(self, l=1.0, l_bounds=(1e-5, np.inf)):
+        if np.iterable(l):
+            self.l = np.asarray(l, dtype=np.float)
+        else:
+            self.l = float(l)
+        self.l_bounds = l_bounds
 
-    @property
-    def theta(self):
-        return np.asarray(self.l)
+        self.theta_vars = ["l"] if l_bounds is not "fixed" else []
 
-    @theta.setter
-    def theta(self, theta):
-        self.l = theta
 
     def __call__(self, X, Y=None, eval_gradient=False):
         """ Return the kernel k(X, Y) and optionally its gradient.
@@ -713,7 +725,9 @@ def __call__(self, X, Y=None, eval_gradient=False):
             K = np.exp(-.5 * dists)
 
         if eval_gradient:
-            if self.l.shape[0] == 1:
+            if self.l_bounds is "fixed":  # Hyperparameter l kept fixed
+                return K, np.empty((X.shape[0], X.shape[0], 0))
+            elif not np.iterable(self.l) or self.l.shape[0] == 1:
                 K_gradient = \
                     (K * squareform(dists) / self.l)[:, :, np.newaxis]
                 return K, K_gradient
@@ -729,6 +743,15 @@ def __call__(self, X, Y=None, eval_gradient=False):
         else:
             return K
 
+    def __repr__(self):
+        if np.iterable(self.l):  # anisotropic
+            return "{0}(l=[{1}])".format(self.__class__.__name__,
+                                         ", ".join(map("{0:.3g}".format,
+                                                   self.l)))
+        else:  # isotropic
+            return "{0}(l={1:.3g})".format(self.__class__.__name__, self.l)
+
+
 
 class RationalQuadratic(Kernel):
     """ Rational Quadratic kernel.
@@ -745,18 +768,16 @@ class RationalQuadratic(Kernel):
     l : float > 0
         The length scale of the kernel.
     """
+    def __init__(self, alpha=1.0, l=1.0, alpha_bounds=(1e-5, np.inf),
+                 l_bounds=(1e-5, np.inf)):
+        self.alpha = alpha
+        self.l = l
+        self.alpha_bounds = alpha_bounds
+        self.l_bounds = l_bounds
 
-    def __init__(self, theta=[1.0, 1.0], thetaL=1e-5, thetaU=np.inf):
-        super(RationalQuadratic, self).__init__(theta, thetaL, thetaU)
-
-    @property
-    def theta(self):
-        return np.asarray([self.alpha, self.l])
-
-    @theta.setter
-    def theta(self, theta):
-        self.alpha = theta[0]
-        self.l = theta[1]
+        self.theta_vars = ["alpha"] if alpha_bounds is not "fixed" else []
+        if self.l_bounds is not "fixed":
+            self.theta_vars += ["l"]
 
     def __call__(self, X, Y=None, eval_gradient=False):
         """ Return the kernel k(X, Y) and optionally its gradient.
@@ -799,13 +820,27 @@ def __call__(self, X, Y=None, eval_gradient=False):
             K = (1 + dists / (2 * self.alpha * self.l ** 2)) ** -self.alpha
 
         if eval_gradient:
-            K_gradient = np.empty((K.shape[0], K.shape[1], 2))
-            K_gradient[..., 0] = K * (-np.log(base) + tmp / base)
-            K_gradient[..., 1] = dists * K / (self.l ** 2 * base)
-            return K, K_gradient
+            # gradient with respect to alpha
+            if "alpha" in self.theta_vars:
+                alpha_gradient = K * (-np.log(base) + tmp / base)
+                alpha_gradient = alpha_gradient[:, :, np.newaxis]
+            else:  # alpha is kept fixed
+                alpha_gradient = np.empty((K.shape[0], K.shape[1], 0))
+            # gradient with respect to l
+            if "l" in self.theta_vars:
+                l_gradient = dists * K / (self.l ** 2 * base)
+                l_gradient = l_gradient[:, :, np.newaxis]
+            else:  # l is kept fixed
+                l_gradient = np.empty((K.shape[0], K.shape[1], 0))
+
+            return K, np.dstack((alpha_gradient, l_gradient))
         else:
             return K
 
+    def __repr__(self):
+        return "{0}(alpha={1:.3g}, l={2:.3g})".format(
+            self.__class__.__name__, self.alpha, self.l)
+
 
 class ExpSineSquared(Kernel):
     """ Exp-Sine-Squared kernel.
@@ -822,17 +857,16 @@ class ExpSineSquared(Kernel):
         The periodicity of the kernel.
     """
 
-    def __init__(self, theta=[1.0, 1.0], thetaL=1e-5, thetaU=np.inf):
-        super(ExpSineSquared, self).__init__(theta, thetaL, thetaU)
+    def __init__(self, l=1.0, p=1.0, l_bounds=(1e-5, np.inf),
+                 p_bounds=(1e-5, np.inf)):
+        self.l = l
+        self.p = p
+        self.l_bounds = l_bounds
+        self.p_bounds = p_bounds
 
-    @property
-    def theta(self):
-        return np.asarray([self.l, self.p])
-
-    @theta.setter
-    def theta(self, theta):
-        self.l = theta[0]
-        self.p = theta[1]
+        self.theta_vars = ["l"] if l_bounds is not "fixed" else []
+        if self.p_bounds is not "fixed":
+            self.theta_vars += ["p"]
 
     def __call__(self, X, Y=None, eval_gradient=False):
         """ Return the kernel k(X, Y) and optionally its gradient.
@@ -874,15 +908,29 @@ def __call__(self, X, Y=None, eval_gradient=False):
             K = np.exp(- 2 * (np.sin(np.pi / self.p * dists) / self.l) ** 2)
 
         if eval_gradient:
-            K_gradient = np.empty((K.shape[0], K.shape[1], 2))
             cos_of_arg = np.cos(arg)
-            K_gradient[..., 0] = 4 / self.l**3 * sin_of_arg**2 * K
-            K_gradient[..., 1] = \
-                4 * arg / (self.l**2 * self.p) * cos_of_arg * sin_of_arg * K
-            return K, K_gradient
+            # gradient with respect to l
+            if "l" in self.theta_vars:
+                l_gradient = 4 / self.l**3 * sin_of_arg**2 * K
+                l_gradient = l_gradient[:, :, np.newaxis]
+            else:  # l is kept fixed
+                l_gradient = np.empty((K.shape[0], K.shape[1], 0))
+            # gradient with respect to p
+            if "p" in self.theta_vars:
+                p_gradient = \
+                   4 * arg / (self.l**2 * self.p) * cos_of_arg * sin_of_arg * K
+                p_gradient = p_gradient[:, :, np.newaxis]
+            else:  # p is kept fixed
+                p_gradient = np.empty((K.shape[0], K.shape[1], 0))
+
+            return K, np.dstack((l_gradient, p_gradient))
         else:
             return K
 
+    def __repr__(self):
+        return "{0}(l={1:.3g}, p={2:.3g})".format(
+            self.__class__.__name__, self.l, self.p)
+
 
 class DotProduct(Kernel):
     """ Dot-Product kernel.
@@ -896,16 +944,11 @@ class DotProduct(Kernel):
         the kernel is homogenous.
     """
 
-    def __init__(self, theta=[1.0, 1.0], thetaL=1e-5, thetaU=np.inf):
-        super(DotProduct, self).__init__(theta, thetaL, thetaU)
-
-    @property
-    def theta(self):
-        return np.asarray([self.sigma_0])
+    def __init__(self, sigma_0=1.0, sigma_0_bounds=(1e-5, np.inf)):
+        self.sigma_0 = sigma_0
+        self.sigma_0_bounds = sigma_0_bounds
 
-    @theta.setter
-    def theta(self, theta):
-        self.sigma_0 = theta[0]
+        self.theta_vars = ["sigma_0"] if sigma_0_bounds is not "fixed" else []
 
     def __call__(self, X, Y=None, eval_gradient=False):
         """ Return the kernel k(X, Y) and optionally its gradient.
@@ -943,9 +986,12 @@ def __call__(self, X, Y=None, eval_gradient=False):
             K = np.inner(X, Y) + self.sigma_0 ** 2
 
         if eval_gradient:
-            K_gradient = np.empty((K.shape[0], K.shape[1], 1))
-            K_gradient[..., 0] = 2 * self.sigma_0
-            return K, K_gradient
+            if self.sigma_0_bounds is not "fixed":
+                K_gradient = np.empty((K.shape[0], K.shape[1], 1))
+                K_gradient[..., 0] = 2 * self.sigma_0
+                return K, K_gradient
+            else:
+                return K, np.empty((X.shape[0], X.shape[0], 0))
         else:
             return K
 
@@ -972,6 +1018,10 @@ def is_stationary(self):
         """ Returns whether the kernel is stationary. """
         return False
 
+    def __repr__(self):
+        return "{0}(sigma_0={1:.3g})".format(
+            self.__class__.__name__, self.sigma_0)
+
 
 # adapted from scipy/optimize/optimize.py for functions with 2d output
 def _approx_fprime(xk, f, epsilon, args=()):
@@ -1014,22 +1064,15 @@ class PairwiseKernel(Kernel):
         Any further parameters are passed directly to the kernel function.
     """
 
-    def __init__(self, theta=1.0, thetaL=1e-5, thetaU=np.inf, metric="linear",
-                 **kwargs):
-        super(PairwiseKernel, self).__init__(theta, thetaL, thetaU)
-        self.metric = metric
-        self.kwargs = kwargs
-        if "gamma" in kwargs:
-            raise ValueError(
-                "Gamma must not be set directly but via param_space.")
+    def __init__(self, gamma=1.0, gamma_bounds=(1e-5, np.inf),
+                 metric="linear", **kwargs):
+        self.gamma = gamma
+        self.gamma_bounds = gamma_bounds
 
-    @property
-    def theta(self):
-        return np.asarray([self.gamma])
+        self.theta_vars = ["gamma"] if gamma_bounds is not "fixed" else []
 
-    @theta.setter
-    def theta(self, theta):
-        self.gamma = theta[0]
+        self.metric = metric
+        self.kwargs = kwargs
 
     def __call__(self, X, Y=None, eval_gradient=False):
         """ Return the kernel k(X, Y) and optionally its gradient.
@@ -1061,12 +1104,15 @@ def __call__(self, X, Y=None, eval_gradient=False):
         K = pairwise_kernels(X, Y, metric=self.metric, gamma=self.gamma,
                              filter_params=True, **self.kwargs)
         if eval_gradient:
-            # approximate gradient numerically
-            def f(gamma):  # helper function
-                return pairwise_kernels(
-                    X, Y, metric=self.metric, gamma=gamma,
-                    filter_params=True, **self.kwargs)
-            return K, _approx_fprime(self.theta, f, 1e-10)
+            if self.gamma_bounds is "fixed":
+                return K, np.empty((X.shape[0], X.shape[0], 0))
+            else:
+                # approximate gradient numerically
+                def f(gamma):  # helper function
+                    return pairwise_kernels(
+                        X, Y, metric=self.metric, gamma=gamma,
+                        filter_params=True, **self.kwargs)
+                return K, _approx_fprime(self.theta, f, 1e-10)
         else:
             return K
 
@@ -1095,6 +1141,5 @@ def is_stationary(self):
         return self.metric in ["rbf"]
 
     def __repr__(self):
-        return "{0}({1}, metric={2})".format(
-            self.__class__.__name__,
-            ", ".join(map("{0:.3g}".format, self.theta)), self.metric)
+        return "{0}(gamma={1}, metric={2})".format(
+            self.__class__.__name__, self.gamma, self.metric)

From 5ba8e7182dce2c40bb422a54168309b73589a26d Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 29 Mar 2015 10:22:48 +0200
Subject: [PATCH 062/165] TST Adapted tests to changes in kernels and added
 additional test for theta property

---
 sklearn/gaussian_process/tests/test_gpc.py    |  4 +-
 sklearn/gaussian_process/tests/test_gpr.py    |  8 +--
 .../gaussian_process/tests/test_kernels.py    | 70 ++++++++++++++++---
 3 files changed, 68 insertions(+), 14 deletions(-)

diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py
index 1bfb56a132bae..04d9f103e15e6 100644
--- a/sklearn/gaussian_process/tests/test_gpc.py
+++ b/sklearn/gaussian_process/tests/test_gpc.py
@@ -21,8 +21,8 @@ def f(x):
 y = np.array(f(X).ravel() > 0, dtype=int)
 
 
-kernels = [RBF(0.1), RBF(1.0, 1e-3, 1e3),
-           (1e-2, 1.0, 1e2) *  RBF(1.0, 1e-3, 1e3)]
+kernels = [RBF(l=0.1), RBF(l=1.0, l_bounds=(1e-3, 1e3)),
+           (1e-2, 1.0, 1e2) *  RBF(l=1.0, l_bounds=(1e-3, 1e3))]
 
 
 def test_predict_consistent():
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index e3c443ca5c025..f02ef7050d68c 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -22,10 +22,10 @@ def f(x):
 y = f(X).ravel()
 
 
-kernels = [RBF(1.0), RBF(1.0, 1e-3, 1e3),
-           (1e-2, 1.0, 1e2) * RBF(1.0, 1e-3, 1e3),
-           (1e-2, 1.0, 1e2) * RBF(1.0, 1e-3, 1e3) + (0.0, 0.0, 1e2),
-           (1e-2, 0.1, 1e2) * RBF(1.0, 1e-3, 1e3) + (0.0, 0.0, 1e2)]
+kernels = [RBF(l=1.0), RBF(l=1.0, l_bounds=(1e-3, 1e3)),
+           (1e-2, 1.0, 1e2)*RBF(l=1.0, l_bounds=(1e-3, 1e3)),
+           (1e-2, 1.0, 1e2)*RBF(l=1.0, l_bounds=(1e-3, 1e3)) + (0.0, 0.0, 1e2),
+           (1e-2, 0.1, 1e2)*RBF(l=1.0, l_bounds=(1e-3, 1e3)) + (0.0, 0.0, 1e2)]
 
 
 def test_gpr_interpolation():
diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index 811ff6e383c6c..78f8e4e3ff33c 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -4,6 +4,7 @@
 # Licence: BSD 3 clause
 
 from collections import Hashable
+import inspect
 
 import numpy as np
 
@@ -12,7 +13,8 @@
 from sklearn.metrics.pairwise import PAIRWISE_KERNEL_FUNCTIONS
 from sklearn.gaussian_process.kernels \
     import (RBF, RationalQuadratic, ExpSineSquared, DotProduct,
-            ConstantKernel, WhiteKernel, PairwiseKernel)
+            ConstantKernel, WhiteKernel, PairwiseKernel, KernelOperator,
+            Exponentiation)
 from sklearn.base import clone
 
 from sklearn.utils.testing import (assert_equal, assert_almost_equal,
@@ -21,16 +23,16 @@
 
 X = np.random.normal(0, 1, (10, 2))
 
-kernels = [RBF(2.0), RBF([0.5, 2.0]),
-           ConstantKernel(10.0),
-           2.0 * RBF(0.5), RBF(2.0) + WhiteKernel(1.0),
-           RationalQuadratic([1.0, 1.0]),
-           ExpSineSquared([1.0, 1.0]),
-           DotProduct(1.0), DotProduct(1.0) ** 2]
+kernels = [RBF(l=2.0), RBF(l_bounds=(0.5, 2.0)),
+           ConstantKernel(c=10.0),
+           2.0 * RBF(l=0.5), RBF(l=2.0) + WhiteKernel(c=1.0),
+           RationalQuadratic(alpha=1.0, l=1.0),
+           ExpSineSquared(l=1.0, p=1.0),
+           DotProduct(sigma_0=1.0), DotProduct(sigma_0=1.0) ** 2]
 for metric in PAIRWISE_KERNEL_FUNCTIONS:
     if metric in ["additive_chi2", "chi2"]:
         continue
-    kernels.append(PairwiseKernel(1.0, metric=metric))
+    kernels.append(PairwiseKernel(gamma=1.0, metric=metric))
 
 
 def test_kernel_gradient():
@@ -56,6 +58,58 @@ def eval_kernel_ij_for_theta(theta):
         assert_almost_equal(K_gradient, K_gradient_approx, 4)
 
 
+def test_kernel_theta():
+    """ Check that parameter vector theta of kernel is set correctly. """
+    for kernel in kernels:
+        if isinstance(kernel, KernelOperator) \
+            or isinstance(kernel, Exponentiation):  # skip non-basic kernels
+            continue
+        theta = kernel.theta
+        _, K_gradient = kernel(X, eval_gradient=True)
+
+        # Determine kernel parameters that contribute to theta
+        args, varargs, kw, default = \
+            inspect.getargspec(kernel.__class__.__init__)
+        theta_vars = map(lambda s: s.rstrip("_bounds"),
+                         filter(lambda s: s.endswith("_bounds"), args))
+        assert_equal(kernel.theta_vars, theta_vars)
+
+        # Check that values returned in theta are consistent with
+        # hyperparameter values
+        for i, theta_var in enumerate(theta_vars):
+            assert_equal(theta[i], getattr(kernel, theta_var))
+
+        # Fixed kernel parameters must be excluded from theta and gradient.
+        for i, theta_var in enumerate(theta_vars):
+            # create copy with certain hyperparameter fixed
+            params = kernel.get_params()
+            params[theta_var + "_bounds"] = "fixed"
+            kernel_class = kernel.__class__
+            new_kernel = kernel_class(**params)
+            # Check that theta and K_gradient are identical with the fixed
+            # dimension left out
+            _, K_gradient_new = new_kernel(X, eval_gradient=True)
+            assert_equal(theta.shape[0], new_kernel.theta.shape[0] + 1)
+            assert_equal(K_gradient.shape[2], K_gradient_new.shape[2] + 1)
+            if i > 0:
+                assert_equal(theta[:i], new_kernel.theta[:i])
+                assert_array_equal(K_gradient[..., :i],
+                                   K_gradient_new[..., :i])
+            if i + 1 < len(theta_vars):
+                assert_equal(theta[i+1:], new_kernel.theta[i:])
+                assert_array_equal(K_gradient[..., i+1:],
+                                   K_gradient_new[..., i:])
+
+        # Check that values of theta are modified correctly
+        for i, theta_var in enumerate(theta_vars):
+            theta[i] = 42
+            kernel.theta = theta
+            assert_equal(getattr(kernel, theta_var), 42)
+
+            setattr(kernel, theta_var, 43)
+            assert_equal(kernel.theta[i], 43)
+
+
 def test_auto_vs_cross():
     """ Auto-correlation and cross-correlation should be consistent. """
     for kernel in kernels:

From 57ebe4fd1791b34f3cdf78111e50a598a4519b5c Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 29 Mar 2015 10:23:33 +0200
Subject: [PATCH 063/165] DOC Adapted GP examples to changes in kernel module

---
 examples/gaussian_process/plot_gpc.py          |  4 ++--
 .../plot_gpc_isoprobability.py                 |  2 +-
 examples/gaussian_process/plot_gpc_xor.py      |  2 +-
 examples/gaussian_process/plot_gpr_co2.py      | 18 ++++++++++--------
 examples/gaussian_process/plot_gpr_noisy.py    |  6 +++---
 .../plot_gpr_prior_posterior.py                | 10 ++++++----
 6 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/examples/gaussian_process/plot_gpc.py b/examples/gaussian_process/plot_gpc.py
index 863d68725d8d4..45d1c3385d8f1 100644
--- a/examples/gaussian_process/plot_gpc.py
+++ b/examples/gaussian_process/plot_gpc.py
@@ -25,10 +25,10 @@
 y = np.array(np.sin((X[:, 0] - 2.5) ** 2) > 0.0, dtype=int)
 
 # Specify Gaussian Processes with fixed and optimized hyperparameters
-gp_fix = GaussianProcessClassifier(kernel=1.0 * RBF(1.0),
+gp_fix = GaussianProcessClassifier(kernel=1.0 * RBF(l=1.0),
 								   optimizer=None).fit(X, y)
 
-gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(1.0)).fit(X, y)
+gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(l=1.0)).fit(X, y)
 
 print "Log Marginal Likelihood (initial): %.3f" % \
     gp_fix.log_marginal_likelihood(gp_fix.theta_)
diff --git a/examples/gaussian_process/plot_gpc_isoprobability.py b/examples/gaussian_process/plot_gpc_isoprobability.py
index e0d55c0160a67..cef2a8160d9b5 100644
--- a/examples/gaussian_process/plot_gpc_isoprobability.py
+++ b/examples/gaussian_process/plot_gpc_isoprobability.py
@@ -46,7 +46,7 @@ def g(x):
 y = np.array(g(X) > 0, dtype=int)
 
 # Instanciate and fit Gaussian Process Model
-kernel = 0.1 * DotProduct(0.1) ** 2
+kernel = (1e-5, 0.1, np.inf) * DotProduct(sigma_0=0.1) ** 2
 gp = GaussianProcessClassifier(kernel=kernel)
 gp.fit(X, y)
 print "Learned kernel: %s " % gp.kernel_
diff --git a/examples/gaussian_process/plot_gpc_xor.py b/examples/gaussian_process/plot_gpc_xor.py
index 8563b1e3a504b..5f61f21ff6984 100644
--- a/examples/gaussian_process/plot_gpc_xor.py
+++ b/examples/gaussian_process/plot_gpc_xor.py
@@ -22,7 +22,7 @@
 Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)
 
 # fit the model
-kernel = 1.0 * RBF(1.0)
+kernel = 1.0 * RBF(l=1.0)
 clf = GaussianProcessClassifier(kernel=kernel, warm_start=True).fit(X, Y)
 
 # plot the decision function for each datapoint on the grid
diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py
index ab6f40877185f..eb8af85e018d9 100644
--- a/examples/gaussian_process/plot_gpr_co2.py
+++ b/examples/gaussian_process/plot_gpr_co2.py
@@ -20,10 +20,10 @@
 y_mean = y.mean()
 
 # Kernel with parameters given in GPML book
-k1 = 66.0**2 * RBF(67.0)  # long term smooth rising trend
-k2 = 2.4**2 * RBF(90.0) * ExpSineSquared((1.3, 1.0))  # seasonal component
-k3 = 0.66**2 * RationalQuadratic((0.78, 1.2))  # medium term irregularit.
-k4 = 0.18**2 * RBF(0.134) + WhiteKernel(0.19**2) # noise terms
+k1 = 66.0**2 * RBF(l=67.0)  # long term smooth rising trend
+k2 = 2.4**2 * RBF(l=90.0) * ExpSineSquared(l=1.3, p=1.0)  # seasonal component
+k3 = 0.66**2 * RationalQuadratic(alpha=0.78, l=1.2)  # medium term irregularit.
+k4 = 0.18**2 * RBF(l=0.134) + WhiteKernel(c=0.19**2) # noise terms
 kernel_gpml = k1 + k2 + k3 + k4
 
 gp = GaussianProcessRegressor(kernel=kernel_gpml, y_err=0, optimizer=None)
@@ -33,10 +33,12 @@
 print "Log-marginal-likelihood: %.3f" % gp.log_marginal_likelihood(gp.theta_)
 
 # Kernel with optimized parameters
-k1 = 50.0**2 * RBF(50.0) # long term smooth rising trend
-k2 = 2.0**2 * RBF(100.0) * ExpSineSquared((1.0, 1.0)) # seasonal component
-k3 = 0.5**2 * RationalQuadratic((1.0, 1.0)) # medium term irregularities
-k4 = 0.1**2 * RBF(0.1) + WhiteKernel(0.1**2, 1e-3, np.inf)  # noise terms
+k1 = 50.0**2 * RBF(l=50.0) # long term smooth rising trend
+k2 = 2.0**2 * RBF(l=100.0) \
+	* ExpSineSquared(l=1.0, p=1.0, p_bounds="fixed") # seasonal component
+k3 = 0.5**2 * RationalQuadratic(alpha=1.0, l=1.0) # medium term irregularities
+k4 = 0.1**2 * RBF(l=0.1) + WhiteKernel(c=0.1**2,
+									   c_bounds=(1e-3, np.inf))  # noise terms
 kernel = k1 + k2 + k3 + k4
 
 gp = GaussianProcessRegressor(kernel=kernel, y_err=0)
diff --git a/examples/gaussian_process/plot_gpr_noisy.py b/examples/gaussian_process/plot_gpr_noisy.py
index a55879575efb1..108ba936a6f15 100644
--- a/examples/gaussian_process/plot_gpr_noisy.py
+++ b/examples/gaussian_process/plot_gpr_noisy.py
@@ -29,11 +29,11 @@
 
 rng = np.random.RandomState(0)
 X = rng.uniform(0, 5, 20)[:, np.newaxis]
-y = 0.5*np.sin(3*X[:, 0]) + rng.normal(0, 0.5, X.shape[0])
+y = 0.5 * np.sin(3 * X[:, 0]) + rng.normal(0, 0.5, X.shape[0])
 
 # First run
 plt.figure(0)
-kernel = 1.0 * RBF(100.0) + WhiteKernel(1e-5, 1e-10, 1e+1)
+kernel = 1.0 * RBF(l=100.0) + WhiteKernel(c=1e-5, c_bounds=(1e-10, 1e+1))
 gp = GaussianProcessRegressor(kernel=kernel,
                               y_err=0.0).fit(X, y)
 X_ = np.linspace(0, 5, 100)
@@ -51,7 +51,7 @@
 
 # First run
 plt.figure(1)
-kernel = 1.0 * RBF(1.0) + WhiteKernel(1e-5, 1e-10, 1e+1)
+kernel = 1.0 * RBF(l=1.0) + WhiteKernel(c=1e-5, c_bounds=(1e-10, 1e+1))
 gp = GaussianProcessRegressor(kernel=kernel,
                               y_err=0.0).fit(X, y)
 X_ = np.linspace(0, 5, 100)
diff --git a/examples/gaussian_process/plot_gpr_prior_posterior.py b/examples/gaussian_process/plot_gpr_prior_posterior.py
index 958530b54100a..d6bc63ed788ec 100644
--- a/examples/gaussian_process/plot_gpr_prior_posterior.py
+++ b/examples/gaussian_process/plot_gpr_prior_posterior.py
@@ -19,10 +19,12 @@
     import RBF, RationalQuadratic, ExpSineSquared, DotProduct
 
 
-kernels = [1.0 * RBF(1.0, 1e-1, 10.0),
-           1.0 * RationalQuadratic((0.1, 1.0)),
-           1.0 * ExpSineSquared((1.0, 3.0), (0.1, 1.0), (10.0, 10.0)),
-           (0.01, 0.1, 10.0) * (DotProduct(1.0, 0.0, 10.0) ** 2)]
+kernels = [1.0 * RBF(l=1.0, l_bounds=(1e-1, 10.0)),
+           1.0 * RationalQuadratic(alpha=0.1, l=1.0),
+           1.0 * ExpSineSquared(l=1.0, p=3.0, l_bounds=(0.1, 10.0),
+                                p_bounds=(1.0, 10.0)),
+           (0.01, 0.1, 10.0) \
+                * (DotProduct(sigma_0=1.0, sigma_0_bounds=(0.0, 10.0)) ** 2)]
 
 for fig_index, kernel in enumerate(kernels):
     if fig_index > 3: continue

From 9ffab4c15d5b3b85c73ebfd962ca71bb893483b3 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 29 Mar 2015 18:06:49 +0200
Subject: [PATCH 064/165] FIX Kernel property theta can deal with anisotropic
 RBF kernel

---
 sklearn/gaussian_process/kernels.py           | 58 ++++++++++++++-----
 .../gaussian_process/tests/test_kernels.py    | 13 ++++-
 2 files changed, 53 insertions(+), 18 deletions(-)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 0413f3bdbe0e9..dc80390e603a0 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -81,31 +81,55 @@ def n_dims(self):
 
     @property
     def theta(self):
-        return np.array([getattr(self, var_name)
-                         for var_name in self.theta_vars])
+        theta = []
+        for var_name in self.theta_vars:
+            if not isinstance(var_name, basestring):  # vector-valued parameter
+                var_name, _ = var_name
+            theta.append(getattr(self, var_name))
+        return np.array(theta)
 
     @theta.setter
     def theta(self, theta):
-        if len(theta) != len(self.theta_vars):
+        i = 0
+        for var_name in self.theta_vars:
+            if not isinstance(var_name, basestring):  # vector-valued parameter
+                var_name, var_length = var_name
+                setattr(self, var_name, theta[i:i + var_length])
+                i += var_length
+            else:
+                setattr(self, var_name, theta[i])
+                i += 1
+
+        if i != len(theta):
             raise ValueError("theta has not the correct number of entries."
                              " Should be %d; given are %d"
-                             % (len(self.theta_vars), len(theta)))
-        for i, var_name in enumerate(self.theta_vars):
-            setattr(self, var_name, theta[i])
+                             % (i, len(theta)))
 
     @property
     def bounds(self):
-        return np.array([getattr(self, var_name + "_bounds")
-                         for var_name in self.theta_vars])
+        bounds = []
+        for var_name in self.theta_vars:
+            if not isinstance(var_name, basestring):  # vector-valued parameter
+                var_name, _ = var_name
+            bounds.append(getattr(self, var_name + "_bounds"))
+        return np.array(bounds)
 
     @bounds.setter
     def bounds(self, bounds):
-        if len(bounds) != len(self.theta_vars):
-            raise ValueError("theta has not the correct number of entries."
+        i = 0
+        for var_name in self.theta_vars:
+            if not isinstance(var_name, basestring):  # vector-valued parameter
+                var_name, var_length = var_name
+                setattr(self, var_name + "_bounds", bounds[i:i + var_length])
+                i += var_length
+            else:
+                setattr(self, var_name + "_bounds", bounds[i])
+                i += 1
+
+        if i != len(bounds):
+            raise ValueError("bounds has not the correct number of entries."
                              " Should be %d; given are %d"
-                             % (len(self.theta_vars), len(bounds)))
-        for i, var_name in enumerate(self.theta_vars):
-            setattr(self, var_name + "_bounds", bounds[i])
+                             % (i, len(bounds)))
 
     def __add__(self, b):
         if not isinstance(b, Kernel):
@@ -681,8 +705,12 @@ def __init__(self, l=1.0, l_bounds=(1e-5, np.inf)):
             self.l = float(l)
         self.l_bounds = l_bounds
 
-        self.theta_vars = ["l"] if l_bounds is not "fixed" else []
-
+        self.theta_vars = []
+        if l_bounds is not "fixed":
+            if np.iterable(l):  # anisotropic l needs special care
+                self.theta_vars.append(("l", len(l)))
+            else:
+                self.theta_vars.append("l")
 
     def __call__(self, X, Y=None, eval_gradient=False):
         """ Return the kernel k(X, Y) and optionally its gradient.
diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index 78f8e4e3ff33c..9679d929fa660 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -139,17 +139,24 @@ def test_kernel_operator_commutative():
 
 def test_kernel_anisotropic():
     """ Anisotropic kernel should be consistent with isotropic kernels."""
-    K = RBF([0.5, 2.0])(X)
+    kernel = 3.0 * RBF([0.5, 2.0])
+
+    K = kernel(X)
     X1 = np.array(X)
     X1[:, 0] *= 4
-    K1 = RBF(2.0)(X1)
+    K1 = 3.0 * RBF(2.0)(X1)
     assert_almost_equal(K, K1)
 
     X2 = np.array(X)
     X2[:, 1] /= 4
-    K2 = RBF(0.5)(X2)
+    K2 = 3.0 * RBF(0.5)(X2)
     assert_almost_equal(K, K2)
 
+    # Check getting and setting via theta
+    kernel.theta = kernel.theta * 2
+    assert_array_equal(kernel.theta, [6.0, 1.0, 4.0])
+    assert_array_equal(kernel.k2.l, [1.0, 4.0])
+
 
 def test_kernel_stationary():
     """ Test stationarity of kernels."""

From 584b7b1030941492855980b0e00a5ffe661cdb43 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 31 Mar 2015 13:08:04 +0200
Subject: [PATCH 065/165] FIX Dealing with anisotropic length-scales correctly

---
 sklearn/gaussian_process/kernels.py        | 18 ++++++++++++++----
 sklearn/gaussian_process/tests/test_gpr.py | 15 ++++++++++++++-
 2 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index dc80390e603a0..6dde221268489 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -86,7 +86,7 @@ def theta(self):
             if not isinstance(var_name, basestring):  # vector-valued parameter
                 var_name, _ = var_name
             theta.append(getattr(self, var_name))
-        return np.array(theta)
+        return np.array(theta).ravel()
 
     @theta.setter
     def theta(self, theta):
@@ -110,9 +110,19 @@ def bounds(self):
         bounds = []
         for var_name in self.theta_vars:
             if not isinstance(var_name, basestring):  # vector-valued parameter
-                var_name, _ = var_name
-            bounds.append(getattr(self, var_name + "_bounds"))
-        return np.array(bounds)
+                var_name, var_length = var_name
+                var_bounds = np.atleast_2d(getattr(self, var_name + "_bounds"))
+                if var_bounds.shape[0] == 1:
+                    var_bounds = np.repeat(var_bounds, var_length, 0)
+                elif var_bounds.shape[0] != var_length:
+                    raise ValueError("Bounds on %s should have either 1 or "
+                                     "%d dimensions. Given are %d"
+                                     % (var_name, var_length,
+                                        var_bounds.shape[0]))
+                bounds.append(var_bounds)
+            else:
+                bounds.append(getattr(self, var_name + "_bounds"))
+        return np.vstack(bounds)
 
     @bounds.setter
     def bounds(self, bounds):
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index f02ef7050d68c..856c88a9d0b8d 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -130,7 +130,20 @@ def test_predict_cov_vs_std():
     """ Test that predicted std.-dev. is consistent with cov's diagonal."""
     for kernel in kernels:
         gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
-        print gpr.kernel_
         y_mean, y_cov = gpr.predict(X2, return_cov=True)
         y_mean, y_std = gpr.predict(X2, return_std=True)
         assert_almost_equal(np.sqrt(np.diag(y_cov)), y_std)
+
+
+def test_anisotropic_kernel():
+    """ Test that GPR can identify meaningful anisotropic length-scales. """
+    # We learn a function which varies in one dimension ten-times slower
+    # than in the other. The corresponding length-scales should differ by at
+    # least a factor 5
+    rng = np.random.RandomState(0)
+    X = rng.uniform(-1, 1, (50, 2))
+    y = X[:, 0] +  0.1 * X[:, 1]
+
+    kernel = RBF([1.0, 1.0])
+    gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
+    assert_greater(gpr.kernel_.theta[1], gpr.kernel_.theta[0] * 5)

From 78ff16c7add4494acfa0d29a105b4d87570e32fe Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 31 Mar 2015 16:38:30 +0200
Subject: [PATCH 066/165] DOC Revised docstrings of kernels.py

---
 sklearn/gaussian_process/gpc.py     |   6 +-
 sklearn/gaussian_process/gpr.py     |   4 +-
 sklearn/gaussian_process/kernels.py | 250 +++++++++++++++++++++-------
 3 files changed, 195 insertions(+), 65 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 142e1674045a0..3220f447b5831 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -28,7 +28,7 @@
 
 
 class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
-    """ Gaussian process classification (GPC) based on Laplace approximation.
+    """Gaussian process classification (GPC) based on Laplace approximation.
 
     The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
     ``Gaussian Processes for Machine Learning'' (GPML) by Rasmussen and
@@ -233,7 +233,7 @@ def predict_proba(self, X):
         return np.vstack((1 - pi_star, pi_star)).T
 
     def log_marginal_likelihood(self, theta, eval_gradient=False):
-        """ Returns log-marginal likelihood of theta for training data.
+        """Returns log-marginal likelihood of theta for training data.
 
         Parameters
         ----------
@@ -293,7 +293,7 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
         return Z, d_Z
 
     def _posterior_mode(self, K, return_temporaries=False):
-        """ Mode-finding for binary Laplace GPC and fixed kernel.
+        """Mode-finding for binary Laplace GPC and fixed kernel.
 
         This approximates the posterior of the latent function values for given
         inputs and target observations with a Gaussian approximation and uses
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 1e4cf09917d85..134c8ec7c185d 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -15,7 +15,7 @@
 
 
 class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
-    """ Gaussian process regression (GPR).
+    """Gaussian process regression (GPR).
 
     The implementation is based on Algorithm 2.1 of ``Gaussian Processes
     for Machine Learning'' (GPML) by Rasmussen and Williams.
@@ -224,7 +224,7 @@ def sample_y(self, X, n_samples=1, random_state=0):
         return y_samples
 
     def log_marginal_likelihood(self, theta, eval_gradient=False):
-        """ Returns log-marginal likelihood of theta for training data.
+        """Returns log-marginal likelihood of theta for training data.
 
         Parameters
         ----------
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 6dde221268489..cfa7e1e839fb8 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -1,17 +1,16 @@
-""" Kernels for Gaussian process regression and classification.
+"""Kernels for Gaussian process regression and classification.
 
 The kernels in this module allow kernel-engineering, i.e., they can be
-combined via the "+" and "*" operators. These expressions can also contain
-scalar values, which are automatically converted to a constant kernel.
+combined via the "+" and "*" operators or be exponentiated with a scalar
+via "**". These sum and product expressions can also contain scalar values,
+which are automatically converted to a constant kernel.
 
 All kernels allow (analytic) gradient-based hyperparameter optimization.
 The space of hyperparameters can be specified by giving lower und upper
 boundaries for the value of each hyperparameter (the search space is thus
-rectangular). This can be achieved by using a pair or triple instead of a
-single float wherever a parameter value is specified. In case of a pair,
-the first value specifies the lower boundary and the second value the upper
-boundary. In case of a triple, the middle value specified the initial value
-of the parameter during hyperparameter-optimization.
+rectangular). Instead of specifying bounds, hyperparameters can also be
+declared to be "fixed", which causes these hyperparameters to be excluded from
+optimization.
 """
 
 # Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
@@ -33,7 +32,7 @@
 
 
 class Kernel(six.with_metaclass(ABCMeta)):
-    """ Base class for all kernels."""
+    """Base class for all kernels."""
 
     def get_params(self, deep=True):
         """Get parameters of this kernel.
@@ -69,18 +68,25 @@ def get_params(self, deep=True):
         return params
 
     def clone_with_theta(self, theta):
-        """ Returns a clone of self with given hyperparameters theta. """
+        """Returns a clone of self with given hyperparameters theta. """
         cloned = clone(self)
         cloned.theta = theta
         return cloned
 
     @property
     def n_dims(self):
-        """ Returns the number of hyperparameters of the kernel."""
+        """Returns the number of non-fixed hyperparameters of the kernel."""
         return self.theta.shape[0]
 
     @property
     def theta(self):
+        """Returns the (flattened) non-fixed hyperparameters of the kernel.
+
+        Returns
+        -------
+        theta : array, shape (n_dims,)
+            The non-fixed hyperparameters of the kernel
+        """
         theta = []
         for var_name in self.theta_vars:
             if not isinstance(var_name, basestring):  # vector-valued parameter
@@ -90,6 +96,13 @@ def theta(self):
 
     @theta.setter
     def theta(self, theta):
+        """Sets the (flattened) non-fixed hyperparameters of the kernel.
+
+        Parameters
+        ----------
+        theta : array, shape (n_dims,)
+            The non-fixed hyperparameters of the kernel
+        """
         i = 0
         for var_name in self.theta_vars:
             if not isinstance(var_name, basestring):  # vector-valued parameter
@@ -107,6 +120,13 @@ def theta(self, theta):
 
     @property
     def bounds(self):
+        """Returns the bounds on the kernel's hyperparameters.
+
+        Returns
+        -------
+        bounds : array, shape (n_dims, 2)
+            The bounds on the kernel's hyperparameters
+        """
         bounds = []
         for var_name in self.theta_vars:
             if not isinstance(var_name, basestring):  # vector-valued parameter
@@ -126,6 +146,13 @@ def bounds(self):
 
     @bounds.setter
     def bounds(self, bounds):
+        """Sets the bounds on the kernel's hyperparameters.
+
+        Parameters
+        ----------
+        bounds : array, shape (n_dims, 2)
+            The bounds on the kernel's hyperparameters
+        """
         i = 0
         for var_name in self.theta_vars:
             if not isinstance(var_name, basestring):  # vector-valued parameter
@@ -202,12 +229,12 @@ def diag(self, X):
         return np.ones(X.shape[0])
 
     def is_stationary(self):
-        """ Returns whether the kernel is stationary. """
+        """Returns whether the kernel is stationary. """
         return True
 
 
 class KernelOperator(Kernel):
-    """ Base class for all kernel operators. """
+    """Base class for all kernel operators. """
 
     def __init__(self, k1, k2):
         self.k1 = k1
@@ -232,16 +259,37 @@ def get_params(self, deep=True):
 
     @property
     def theta(self):
+        """Returns the (flattened) non-fixed hyperparameters of the kernel.
+
+        Returns
+        -------
+        theta : array, shape (n_dims,)
+            The non-fixed hyperparameters of the kernel
+        """
         return np.append(self.k1.theta, self.k2.theta)
 
     @theta.setter
     def theta(self, theta):
+        """Sets the (flattened) non-fixed hyperparameters of the kernel.
+
+        Parameters
+        ----------
+        theta : array, shape (n_dims,)
+            The non-fixed hyperparameters of the kernel
+        """
         k1_dims = self.k1.n_dims
         self.k1.theta = theta[:k1_dims]
         self.k2.theta = theta[k1_dims:]
 
     @property
     def bounds(self):
+        """Returns the bounds on the kernel's hyperparameters.
+
+        Returns
+        -------
+        bounds : array, shape (n_dims, 2)
+            The bounds on the kernel's hyperparameters
+        """
         if self.k1.bounds.size == 0:
             return self.k2.bounds
         if self.k2.bounds.size == 0:
@@ -250,6 +298,13 @@ def bounds(self):
 
     @bounds.setter
     def bounds(self, bounds):
+        """Sets the bounds on the kernel's hyperparameters.
+
+        Parameters
+        ----------
+        bounds : array, shape (n_dims, 2)
+            The bounds on the kernel's hyperparameters
+        """
         k1_dims = self.k1.n_dims
         self.k1.bounds = bounds[:k1_dims]
         self.k2.bounds = bounds[k1_dims:]
@@ -259,12 +314,12 @@ def __eq__(self, b):
             or (self.k1 == b.k2 and self.k2 == b.k1)
 
     def is_stationary(self):
-        """ Returns whether the kernel is stationary. """
+        """Returns whether the kernel is stationary. """
         return self.k1.is_stationary() and self.k2.is_stationary()
 
 
 class Sum(KernelOperator):
-    """ Sum-kernel k1 + k2 of two kernels k1 and k2.
+    """Sum-kernel k1 + k2 of two kernels k1 and k2.
 
     The resulting kernel is defined as
     k_sum(X, Y) = k1(X, Y) + k2(X, Y)
@@ -279,7 +334,7 @@ class Sum(KernelOperator):
     """
 
     def __call__(self, X, Y=None, eval_gradient=False):
-        """ Return the kernel k(X, Y) and optionally its gradient.
+        """Return the kernel k(X, Y) and optionally its gradient.
 
         Parameters
         ----------
@@ -335,7 +390,7 @@ def __repr__(self):
 
 
 class Product(KernelOperator):
-    """ Product-kernel k1 * k2 of two kernels k1 and k2.
+    """Product-kernel k1 * k2 of two kernels k1 and k2.
 
     The resulting kernel is defined as
     k_prod(X, Y) = k1(X, Y) * k2(X, Y)
@@ -350,7 +405,7 @@ class Product(KernelOperator):
     """
 
     def __call__(self, X, Y=None, eval_gradient=False):
-        """ Return the kernel k(X, Y) and optionally its gradient.
+        """Return the kernel k(X, Y) and optionally its gradient.
 
         Parameters
         ----------
@@ -407,8 +462,20 @@ def __repr__(self):
 
 
 class Exponentiation(Kernel):
-    """ Exponentiate kernel by given exponent. """
+    """Exponentiate kernel by given exponent.
+
+    The resulting kernel is defined as
+    k_exp(X, Y) = k(X, Y) ** exponent
+
+    Parameters
+    ----------
+    kernel : Kernel object
+        The base kernel
+
+    exponent : float
+        The exponent for the base kernel
 
+    """
     def __init__(self, kernel, exponent):
         self.kernel = kernel
         self.exponent = exponent
@@ -432,25 +499,53 @@ def get_params(self, deep=True):
 
     @property
     def theta(self):
+        """Returns the (flattened) non-fixed hyperparameters of the kernel.
+
+        Returns
+        -------
+        theta : array, shape (n_dims,)
+            The non-fixed hyperparameters of the kernel
+        """
         return self.kernel.theta
 
     @theta.setter
     def theta(self, theta):
+        """Sets the (flattened) non-fixed hyperparameters of the kernel.
+
+        Parameters
+        ----------
+        theta : array, shape (n_dims,)
+            The non-fixed hyperparameters of the kernel
+        """
         self.kernel.theta = theta
 
     @property
     def bounds(self):
+        """Returns the bounds on the kernel's hyperparameters.
+
+        Returns
+        -------
+        bounds : array, shape (n_dims, 2)
+            The bounds on the kernel's hyperparameters
+        """
         return self.kernel.bounds
 
     @bounds.setter
     def bounds(self, bounds):
+        """Sets the bounds on the kernel's hyperparameters.
+
+        Parameters
+        ----------
+        bounds : array, shape (n_dims, 2)
+            The bounds on the kernel's hyperparameters
+        """
         self.kernel.bounds = bounds
 
     def __eq__(self, b):
         return (self.kernel == b.kernel and self.exponent == b.exponent)
 
     def __call__(self, X, Y=None, eval_gradient=False):
-        """ Return the kernel k(X, Y) and optionally its gradient.
+        """Return the kernel k(X, Y) and optionally its gradient.
 
         Parameters
         ----------
@@ -507,22 +602,26 @@ def __repr__(self):
         return "{0} ** {1}".format(self.kernel, self.exponent)
 
     def is_stationary(self):
-        """ Returns whether the kernel is stationary. """
+        """Returns whether the kernel is stationary. """
         return self.kernel.is_stationary()
 
 
 class ConstantKernel(Kernel):
-    """ Constant kernel.
+    """Constant kernel.
 
     Can be used as part of a product-kernel where it scales the magnitude of
     the other factor (kernel) or as part of a sum-kernel, where it modifies
     the mean of the Gaussian process.
 
-    Tunable kernel parameters
-    -------------------------
-    c : float
-        The constant value used for determining the magnitude (product-kernel)
-        or offset of mean (sum-kernel).
+    k(x_1, x_2) = c for all x_1, x_2
+
+    Parameters
+    ----------
+    c : float, default: 1.0
+        The constant value which defines the covariance: k(x_1, x_2) = c
+
+    c_bounds : pair of floats >= 0, default: (0, np.inf)
+        The lower and upper bound on c
     """
     def __init__(self, c=1.0, c_bounds=(0, np.inf)):
         self.c = c
@@ -547,7 +646,7 @@ def from_literal(cls, literal):
             return cls(literal)
 
     def __call__(self, X, Y=None, eval_gradient=False):
-        """ Return the kernel k(X, Y) and optionally its gradient.
+        """Return the kernel k(X, Y) and optionally its gradient.
 
         Parameters
         ----------
@@ -611,16 +710,21 @@ def __repr__(self):
 
 
 class WhiteKernel(Kernel):
-    """ White kernel.
+    """White kernel.
 
     The main use-case of this kernel is as part of a sum-kernel where it
     explains the noise-component of the signal. Tuning its parameter
     corresponds to estimating the noise-level.
 
-    Tunable kernel parameters
-    -------------------------
-    c : float
-        Parameter controlling the noise level.
+    k(x_1, x_2) = c if x_1 == x_2 else 0
+
+    Parameters
+    ----------
+    c : float, default: 1.0
+        Parameter controlling the noise level
+
+    c_bounds : pair of floats >= 0, default: (0.0, np.inf)
+        The lower and upper bound on c
     """
     def __init__(self, c=1.0, c_bounds=(0.0, np.inf)):
         self.c = c
@@ -629,7 +733,7 @@ def __init__(self, c=1.0, c_bounds=(0.0, np.inf)):
         self.theta_vars = ["c"] if c_bounds is not "fixed" else []
 
     def __call__(self, X, Y=None, eval_gradient=False):
-        """ Return the kernel k(X, Y) and optionally its gradient.
+        """Return the kernel k(X, Y) and optionally its gradient.
 
         Parameters
         ----------
@@ -697,16 +801,19 @@ def __repr__(self):
 
 
 class RBF(Kernel):
-    """ Radial-basis function kernel (aka squared-exponential kernel).
+    """Radial-basis function kernel (aka squared-exponential kernel).
 
     Both isotropic and anisotropic version are supported.
 
-    Tunable kernel parameters
-    -------------------------
-    l : float or array with shape (n_features,), entries > 0
+    Parameters
+    -----------
+    l : float or array with shape (n_features,), entries > 0, default: 1.0
         The length scale of the kernel. If a float, an isotropic kernel is
         used. If an array, an anisotropic kernel is used where each dimension
         of l defines the length-scale of the respective feature dimension.
+
+    l_bounds : pair of floats >= 0, default: (1e-5, np.inf)
+        The lower and upper bound on l
     """
     def __init__(self, l=1.0, l_bounds=(1e-5, np.inf)):
         if np.iterable(l):
@@ -723,7 +830,7 @@ def __init__(self, l=1.0, l_bounds=(1e-5, np.inf)):
                 self.theta_vars.append("l")
 
     def __call__(self, X, Y=None, eval_gradient=False):
-        """ Return the kernel k(X, Y) and optionally its gradient.
+        """Return the kernel k(X, Y) and optionally its gradient.
 
         Parameters
         ----------
@@ -792,19 +899,26 @@ def __repr__(self):
 
 
 class RationalQuadratic(Kernel):
-    """ Rational Quadratic kernel.
+    """Rational Quadratic kernel.
 
     This kernel can be seen as a scale mixture (an infinite sum) of RBF kernels
     with different characteristic length-scales.
 
     Only isotropic variant is supported at the moment.
 
-    Tunable kernel parameters
-    -------------------------
-    alpha : float > 0
+    Parameters
+    ----------
+    alpha : float > 0, default: 1.0
         Scale mixture parameter
-    l : float > 0
+
+    l : float > 0, default: 1.0
         The length scale of the kernel.
+
+    alpha_bounds : pair of floats >= 0, default: (1e-5, np.inf)
+        The lower and upper bound on alpha
+
+    l_bounds : pair of floats >= 0, default: (1e-5, np.inf)
+        The lower and upper bound on l
     """
     def __init__(self, alpha=1.0, l=1.0, alpha_bounds=(1e-5, np.inf),
                  l_bounds=(1e-5, np.inf)):
@@ -818,7 +932,7 @@ def __init__(self, alpha=1.0, l=1.0, alpha_bounds=(1e-5, np.inf),
             self.theta_vars += ["l"]
 
     def __call__(self, X, Y=None, eval_gradient=False):
-        """ Return the kernel k(X, Y) and optionally its gradient.
+        """Return the kernel k(X, Y) and optionally its gradient.
 
         Parameters
         ----------
@@ -881,18 +995,25 @@ def __repr__(self):
 
 
 class ExpSineSquared(Kernel):
-    """ Exp-Sine-Squared kernel.
+    """Exp-Sine-Squared kernel.
 
     This kernel allows modelling periodic functions.
 
     Only isotropic variant is supported at the moment.
 
-    Tunable kernel parameters
-    -------------------------
-    l : float > 0
+    Parameters
+    ----------
+    l : float > 0, default: 1.0
         The length scale of the kernel.
-    p : float > 0
+
+    p : float > 0, default: 1.0
         The periodicity of the kernel.
+
+    l_bounds : pair of floats >= 0, default: (1e-5, np.inf)
+        The lower and upper bound on l
+
+    p_bounds : pair of floats >= 0, default: (1e-5, np.inf)
+        The lower and upper bound on p
     """
 
     def __init__(self, l=1.0, p=1.0, l_bounds=(1e-5, np.inf),
@@ -907,7 +1028,7 @@ def __init__(self, l=1.0, p=1.0, l_bounds=(1e-5, np.inf),
             self.theta_vars += ["p"]
 
     def __call__(self, X, Y=None, eval_gradient=False):
-        """ Return the kernel k(X, Y) and optionally its gradient.
+        """Return the kernel k(X, Y) and optionally its gradient.
 
         Parameters
         ----------
@@ -971,15 +1092,18 @@ def __repr__(self):
 
 
 class DotProduct(Kernel):
-    """ Dot-Product kernel.
+    """Dot-Product kernel.
 
     This kernel is non-stationary.
 
-    Tunable kernel parameters
-    -------------------------
-    sigma_0 : float >= 0
+    Parameters
+    ----------
+    sigma_0 : float >= 0, default: 1.0
         Parameter controlling the inhomogenity of the kernel. If sigma_0=0,
         the kernel is homogenous.
+
+    sigma_0_bounds : pair of floats >= 0, default: (1e-5, np.inf)
+        The lower and upper bound on sigma_0
     """
 
     def __init__(self, sigma_0=1.0, sigma_0_bounds=(1e-5, np.inf)):
@@ -989,7 +1113,7 @@ def __init__(self, sigma_0=1.0, sigma_0_bounds=(1e-5, np.inf)):
         self.theta_vars = ["sigma_0"] if sigma_0_bounds is not "fixed" else []
 
     def __call__(self, X, Y=None, eval_gradient=False):
-        """ Return the kernel k(X, Y) and optionally its gradient.
+        """Return the kernel k(X, Y) and optionally its gradient.
 
         Parameters
         ----------
@@ -1053,7 +1177,7 @@ def diag(self, X):
         return (X ** 2).sum(1) + self.sigma_0 ** 2
 
     def is_stationary(self):
-        """ Returns whether the kernel is stationary. """
+        """Returns whether the kernel is stationary. """
         return False
 
     def __repr__(self):
@@ -1075,7 +1199,7 @@ def _approx_fprime(xk, f, epsilon, args=()):
 
 
 class PairwiseKernel(Kernel):
-    """ Wrapper for kernels in sklearn.metrics.pairwise.
+    """Wrapper for kernels in sklearn.metrics.pairwise.
 
     A thin wrapper around the functionality of the kernels in
     sklearn.metrics.pairwise.
@@ -1088,6 +1212,12 @@ class PairwiseKernel(Kernel):
 
     Parameters
     ----------
+    gamma : float >= 0, default: 1.0
+        Parameter gamma of the pairwise kernel specified by metric
+
+    gamma_bounds : pair of floats >= 0, default: (1e-5, np.inf)
+        The lower and upper bound on gamma
+
     metric : string, or callable
         The metric to use when calculating kernel between instances in a
         feature array. If metric is a string, it must be one of the metrics
@@ -1113,7 +1243,7 @@ def __init__(self, gamma=1.0, gamma_bounds=(1e-5, np.inf),
         self.kwargs = kwargs
 
     def __call__(self, X, Y=None, eval_gradient=False):
-        """ Return the kernel k(X, Y) and optionally its gradient.
+        """Return the kernel k(X, Y) and optionally its gradient.
 
         Parameters
         ----------
@@ -1175,7 +1305,7 @@ def diag(self, X):
         return np.apply_along_axis(self, 1, X)[:, 0]
 
     def is_stationary(self):
-        """ Returns whether the kernel is stationary. """
+        """Returns whether the kernel is stationary. """
         return self.metric in ["rbf"]
 
     def __repr__(self):

From 07b08bb2339d9d24db76cdaf58ddf20e2bd36285 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 31 Mar 2015 16:48:26 +0200
Subject: [PATCH 067/165] REFACTOR Removed literal form of specifying bounds
 for ConstantKernel

---
 .../plot_gpc_isoprobability.py                |  4 ++--
 .../plot_gpr_prior_posterior.py               |  4 ++--
 sklearn/gaussian_process/kernels.py           | 24 ++++---------------
 sklearn/gaussian_process/tests/test_gpc.py    |  4 ++--
 sklearn/gaussian_process/tests/test_gpr.py    | 10 ++++----
 5 files changed, 16 insertions(+), 30 deletions(-)

diff --git a/examples/gaussian_process/plot_gpc_isoprobability.py b/examples/gaussian_process/plot_gpc_isoprobability.py
index cef2a8160d9b5..49df64b89b25a 100644
--- a/examples/gaussian_process/plot_gpc_isoprobability.py
+++ b/examples/gaussian_process/plot_gpc_isoprobability.py
@@ -22,7 +22,7 @@
 from matplotlib import cm
 
 from sklearn.gaussian_process import GaussianProcessClassifier
-from sklearn.gaussian_process.kernels import DotProduct
+from sklearn.gaussian_process.kernels import DotProduct, ConstantKernel as C
 
 # A few constants
 lim = 8
@@ -46,7 +46,7 @@ def g(x):
 y = np.array(g(X) > 0, dtype=int)
 
 # Instanciate and fit Gaussian Process Model
-kernel = (1e-5, 0.1, np.inf) * DotProduct(sigma_0=0.1) ** 2
+kernel = C(0.1, (1e-5, np.inf)) * DotProduct(sigma_0=0.1) ** 2
 gp = GaussianProcessClassifier(kernel=kernel)
 gp.fit(X, y)
 print "Learned kernel: %s " % gp.kernel_
diff --git a/examples/gaussian_process/plot_gpr_prior_posterior.py b/examples/gaussian_process/plot_gpr_prior_posterior.py
index d6bc63ed788ec..92aa57b123ac7 100644
--- a/examples/gaussian_process/plot_gpr_prior_posterior.py
+++ b/examples/gaussian_process/plot_gpr_prior_posterior.py
@@ -16,14 +16,14 @@
 
 from sklearn.gaussian_process import GaussianProcessRegressor
 from sklearn.gaussian_process.kernels \
-    import RBF, RationalQuadratic, ExpSineSquared, DotProduct
+    import RBF, RationalQuadratic, ExpSineSquared, DotProduct, ConstantKernel
 
 
 kernels = [1.0 * RBF(l=1.0, l_bounds=(1e-1, 10.0)),
            1.0 * RationalQuadratic(alpha=0.1, l=1.0),
            1.0 * ExpSineSquared(l=1.0, p=3.0, l_bounds=(0.1, 10.0),
                                 p_bounds=(1.0, 10.0)),
-           (0.01, 0.1, 10.0) \
+           ConstantKernel(0.1, (0.01, 10.0)) \
                 * (DotProduct(sigma_0=1.0, sigma_0_bounds=(0.0, 10.0)) ** 2)]
 
 for fig_index, kernel in enumerate(kernels):
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index cfa7e1e839fb8..fdcebd6de1259 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -170,22 +170,22 @@ def bounds(self, bounds):
 
     def __add__(self, b):
         if not isinstance(b, Kernel):
-            return Sum(self, ConstantKernel.from_literal(b))
+            return Sum(self, ConstantKernel(b))
         return Sum(self, b)
 
     def __radd__(self, b):
         if not isinstance(b, Kernel):
-            return Sum(ConstantKernel.from_literal(b), self)
+            return Sum(ConstantKernel(b), self)
         return Sum(b, self)
 
     def __mul__(self, b):
         if not isinstance(b, Kernel):
-            return Product(self, ConstantKernel.from_literal(b))
+            return Product(self, ConstantKernel(b))
         return Product(self, b)
 
     def __rmul__(self, b):
         if not isinstance(b, Kernel):
-            return Product(ConstantKernel.from_literal(b), self)
+            return Product(ConstantKernel(b), self)
         return Product(b, self)
 
     def __pow__(self, b):
@@ -629,22 +629,6 @@ def __init__(self, c=1.0, c_bounds=(0, np.inf)):
 
         self.theta_vars = ["c"] if c_bounds is not "fixed" else []
 
-    @classmethod
-    def from_literal(cls, literal):
-        if np.iterable(literal):
-            if len(literal) == 1:
-                return cls(c=literal[0])
-            elif len(literal) == 2:
-                return cls(c=(literal[0] + literal[1]) / 2,
-                           c_bounds=(literal[0], literal[1]))
-            elif len(literal) == 3:
-                return cls(c=literal[1], c_bounds=(literal[0], literal[2]))
-            else:
-                raise ValueError("Cannot interpret literal %s for "
-                                 "ConstantKernel." % literal)
-        else:
-            return cls(literal)
-
     def __call__(self, X, Y=None, eval_gradient=False):
         """Return the kernel k(X, Y) and optionally its gradient.
 
diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py
index 04d9f103e15e6..dd88fae268eef 100644
--- a/sklearn/gaussian_process/tests/test_gpc.py
+++ b/sklearn/gaussian_process/tests/test_gpc.py
@@ -8,7 +8,7 @@
 from scipy.optimize import approx_fprime
 
 from sklearn.gaussian_process import GaussianProcessClassifier
-from sklearn.gaussian_process.kernels import RBF
+from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
 
 from sklearn.utils.testing import (assert_true, assert_greater,
                                    assert_almost_equal, assert_array_equal)
@@ -22,7 +22,7 @@ def f(x):
 
 
 kernels = [RBF(l=0.1), RBF(l=1.0, l_bounds=(1e-3, 1e3)),
-           (1e-2, 1.0, 1e2) *  RBF(l=1.0, l_bounds=(1e-3, 1e3))]
+           C(1.0, (1e-2, 1e2)) *  RBF(l=1.0, l_bounds=(1e-3, 1e3))]
 
 
 def test_predict_consistent():
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index 856c88a9d0b8d..467994b6d891e 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -8,7 +8,7 @@
 from scipy.optimize import approx_fprime
 
 from sklearn.gaussian_process import GaussianProcessRegressor
-from sklearn.gaussian_process.kernels import RBF
+from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
 
 from sklearn.utils.testing \
     import (assert_true, assert_greater, assert_array_less,
@@ -23,9 +23,11 @@ def f(x):
 
 
 kernels = [RBF(l=1.0), RBF(l=1.0, l_bounds=(1e-3, 1e3)),
-           (1e-2, 1.0, 1e2)*RBF(l=1.0, l_bounds=(1e-3, 1e3)),
-           (1e-2, 1.0, 1e2)*RBF(l=1.0, l_bounds=(1e-3, 1e3)) + (0.0, 0.0, 1e2),
-           (1e-2, 0.1, 1e2)*RBF(l=1.0, l_bounds=(1e-3, 1e3)) + (0.0, 0.0, 1e2)]
+           C(1.0, (1e-2, 1e2)) * RBF(l=1.0, l_bounds=(1e-3, 1e3)),
+           C(1.0, (1e-2, 1e2)) * RBF(l=1.0, l_bounds=(1e-3, 1e3))
+               + C(0.0, (0.0, 1e2)),
+           C(0.1, (1e-2, 1e2)) * RBF(l=1.0, l_bounds=(1e-3, 1e3))
+               + C(0.0, (0.0, 1e2))]
 
 
 def test_gpr_interpolation():

From 5f4038db28b85e29263a667306781c4f6449cfeb Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 31 Mar 2015 17:42:00 +0200
Subject: [PATCH 068/165] REFACTOR Changed order of l and alpha in
 RationalQuadratic kernel

---
 examples/gaussian_process/plot_gpr_co2.py     |  4 +-
 .../plot_gpr_prior_posterior.py               |  2 +-
 sklearn/gaussian_process/kernels.py           | 46 +++++++++----------
 .../gaussian_process/tests/test_kernels.py    |  2 +-
 4 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py
index eb8af85e018d9..71236b48c3094 100644
--- a/examples/gaussian_process/plot_gpr_co2.py
+++ b/examples/gaussian_process/plot_gpr_co2.py
@@ -22,7 +22,7 @@
 # Kernel with parameters given in GPML book
 k1 = 66.0**2 * RBF(l=67.0)  # long term smooth rising trend
 k2 = 2.4**2 * RBF(l=90.0) * ExpSineSquared(l=1.3, p=1.0)  # seasonal component
-k3 = 0.66**2 * RationalQuadratic(alpha=0.78, l=1.2)  # medium term irregularit.
+k3 = 0.66**2 * RationalQuadratic(l=1.2, alpha=0.78)  # medium term irregularit.
 k4 = 0.18**2 * RBF(l=0.134) + WhiteKernel(c=0.19**2) # noise terms
 kernel_gpml = k1 + k2 + k3 + k4
 
@@ -36,7 +36,7 @@
 k1 = 50.0**2 * RBF(l=50.0) # long term smooth rising trend
 k2 = 2.0**2 * RBF(l=100.0) \
 	* ExpSineSquared(l=1.0, p=1.0, p_bounds="fixed") # seasonal component
-k3 = 0.5**2 * RationalQuadratic(alpha=1.0, l=1.0) # medium term irregularities
+k3 = 0.5**2 * RationalQuadratic(l=1.0, alpha=1.0) # medium term irregularities
 k4 = 0.1**2 * RBF(l=0.1) + WhiteKernel(c=0.1**2,
 									   c_bounds=(1e-3, np.inf))  # noise terms
 kernel = k1 + k2 + k3 + k4
diff --git a/examples/gaussian_process/plot_gpr_prior_posterior.py b/examples/gaussian_process/plot_gpr_prior_posterior.py
index 92aa57b123ac7..075178738f34a 100644
--- a/examples/gaussian_process/plot_gpr_prior_posterior.py
+++ b/examples/gaussian_process/plot_gpr_prior_posterior.py
@@ -20,7 +20,7 @@
 
 
 kernels = [1.0 * RBF(l=1.0, l_bounds=(1e-1, 10.0)),
-           1.0 * RationalQuadratic(alpha=0.1, l=1.0),
+           1.0 * RationalQuadratic(l=1.0, alpha=0.1),
            1.0 * ExpSineSquared(l=1.0, p=3.0, l_bounds=(0.1, 10.0),
                                 p_bounds=(1.0, 10.0)),
            ConstantKernel(0.1, (0.01, 10.0)) \
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index fdcebd6de1259..4eac08e486fe3 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -881,7 +881,6 @@ def __repr__(self):
             return "{0}(l={1:.3g})".format(self.__class__.__name__, self.l)
 
 
-
 class RationalQuadratic(Kernel):
     """Rational Quadratic kernel.
 
@@ -892,28 +891,28 @@ class RationalQuadratic(Kernel):
 
     Parameters
     ----------
-    alpha : float > 0, default: 1.0
-        Scale mixture parameter
-
     l : float > 0, default: 1.0
         The length scale of the kernel.
 
-    alpha_bounds : pair of floats >= 0, default: (1e-5, np.inf)
-        The lower and upper bound on alpha
+    alpha : float > 0, default: 1.0
+        Scale mixture parameter
 
     l_bounds : pair of floats >= 0, default: (1e-5, np.inf)
         The lower and upper bound on l
+
+    alpha_bounds : pair of floats >= 0, default: (1e-5, np.inf)
+        The lower and upper bound on alpha
     """
-    def __init__(self, alpha=1.0, l=1.0, alpha_bounds=(1e-5, np.inf),
-                 l_bounds=(1e-5, np.inf)):
-        self.alpha = alpha
+    def __init__(self, l=1.0, alpha=1.0, l_bounds=(1e-5, np.inf),
+                 alpha_bounds=(1e-5, np.inf)):
         self.l = l
-        self.alpha_bounds = alpha_bounds
+        self.alpha = alpha
         self.l_bounds = l_bounds
+        self.alpha_bounds = alpha_bounds
 
-        self.theta_vars = ["alpha"] if alpha_bounds is not "fixed" else []
-        if self.l_bounds is not "fixed":
-            self.theta_vars += ["l"]
+        self.theta_vars = ["l"] if self.l_bounds is not "fixed" else []
+        if self.alpha_bounds is not "fixed":
+            self.theta_vars += ["alpha"]
 
     def __call__(self, X, Y=None, eval_gradient=False):
         """Return the kernel k(X, Y) and optionally its gradient.
@@ -956,12 +955,6 @@ def __call__(self, X, Y=None, eval_gradient=False):
             K = (1 + dists / (2 * self.alpha * self.l ** 2)) ** -self.alpha
 
         if eval_gradient:
-            # gradient with respect to alpha
-            if "alpha" in self.theta_vars:
-                alpha_gradient = K * (-np.log(base) + tmp / base)
-                alpha_gradient = alpha_gradient[:, :, np.newaxis]
-            else:  # alpha is kept fixed
-                alpha_gradient = np.empty((K.shape[0], K.shape[1], 0))
             # gradient with respect to l
             if "l" in self.theta_vars:
                 l_gradient = dists * K / (self.l ** 2 * base)
@@ -969,7 +962,14 @@ def __call__(self, X, Y=None, eval_gradient=False):
             else:  # l is kept fixed
                 l_gradient = np.empty((K.shape[0], K.shape[1], 0))
 
-            return K, np.dstack((alpha_gradient, l_gradient))
+            # gradient with respect to alpha
+            if "alpha" in self.theta_vars:
+                alpha_gradient = K * (-np.log(base) + tmp / base)
+                alpha_gradient = alpha_gradient[:, :, np.newaxis]
+            else:  # alpha is kept fixed
+                alpha_gradient = np.empty((K.shape[0], K.shape[1], 0))
+
+            return K, np.dstack((l_gradient, alpha_gradient))
         else:
             return K
 
@@ -1040,7 +1040,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
         X = np.atleast_2d(X)
         if Y is None:
             dists = squareform(pdist(X, metric='euclidean'))
-            arg = np.pi  * dists / self.p
+            arg = np.pi * dists / self.p
             sin_of_arg = np.sin(arg)
             K = np.exp(- 2 * (sin_of_arg / self.l) ** 2)
         else:
@@ -1060,8 +1060,8 @@ def __call__(self, X, Y=None, eval_gradient=False):
                 l_gradient = np.empty((K.shape[0], K.shape[1], 0))
             # gradient with respect to p
             if "p" in self.theta_vars:
-                p_gradient = \
-                   4 * arg / (self.l**2 * self.p) * cos_of_arg * sin_of_arg * K
+                p_gradient = 4 * arg / (self.l**2 * self.p) * cos_of_arg \
+                    * sin_of_arg * K
                 p_gradient = p_gradient[:, :, np.newaxis]
             else:  # p is kept fixed
                 p_gradient = np.empty((K.shape[0], K.shape[1], 0))
diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index 9679d929fa660..9873739ab4ba0 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -26,7 +26,7 @@
 kernels = [RBF(l=2.0), RBF(l_bounds=(0.5, 2.0)),
            ConstantKernel(c=10.0),
            2.0 * RBF(l=0.5), RBF(l=2.0) + WhiteKernel(c=1.0),
-           RationalQuadratic(alpha=1.0, l=1.0),
+           RationalQuadratic(l=1.0, alpha=1.0),
            ExpSineSquared(l=1.0, p=1.0),
            DotProduct(sigma_0=1.0), DotProduct(sigma_0=1.0) ** 2]
 for metric in PAIRWISE_KERNEL_FUNCTIONS:

From be9d74980d2680fc43855db29ce7f5802fec1a0d Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 1 Apr 2015 15:27:24 +0200
Subject: [PATCH 069/165] ENH GPR can use individual noise-levels for each
 datapoint

---
 ...egression.py => plot_gpr_noisy_targets.py} |  0
 sklearn/gaussian_process/gpr.py               | 21 +++++++++++++------
 2 files changed, 15 insertions(+), 6 deletions(-)
 rename examples/gaussian_process/{plot_gp_regression.py => plot_gpr_noisy_targets.py} (100%)

diff --git a/examples/gaussian_process/plot_gp_regression.py b/examples/gaussian_process/plot_gpr_noisy_targets.py
similarity index 100%
rename from examples/gaussian_process/plot_gp_regression.py
rename to examples/gaussian_process/plot_gpr_noisy_targets.py
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 134c8ec7c185d..97df6d6fe86b8 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -35,10 +35,12 @@ class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
         passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
         the kernel's hyperparameters are optimized during fitting.
 
-    y_err : float, optional (default: 1e-10)
+    sigma_squared_n : float or array-like, optional (default: 1e-10)
         Value added to the diagonal of the kernel matrix during fitting.
         Larger values correspond to increased noise level in the observations
-        and reduce potential numerical issue during fitting.
+        and reduce potential numerical issue during fitting. If an array is
+        passed, it must have the same number of entries as the data used for
+        fitting and is used as datapoint-dependent noise level.
 
     optimizer : string, optional (default: "fmin_l_bfgs_b")
         A string specifying the optimization algorithm used for optimizing the
@@ -70,9 +72,10 @@ class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
         Dual coefficients of training data points in kernel space
     """
 
-    def __init__(self, kernel=None, y_err=1e-10, optimizer="fmin_l_bfgs_b"):
+    def __init__(self, kernel=None, sigma_squared_n=1e-10,
+                 optimizer="fmin_l_bfgs_b"):
         self.kernel = kernel
-        self.y_err = y_err
+        self.sigma_squared_n = sigma_squared_n
         self.optimizer = optimizer
 
     def fit(self, X, y):
@@ -97,6 +100,12 @@ def fit(self, X, y):
 
         X, y = check_X_y(X, y)
 
+        if np.iterable(self.sigma_squared_n) \
+           and self.sigma_squared_n.shape[0] != y.shape[0]:
+            raise ValueError("sigma_n_squared must be a scalar or an array "
+                             "with same number of entries as y. (%d != %d)"
+                              % (self.sigma_squared_n.shape[0], y.shape[0]))
+
         self.X_fit_ = X
         self.y_fit_ = y
 
@@ -119,7 +128,7 @@ def obj_func(theta):
         # Precompute quantities required for predictions which are independent
         # of actual query points
         K = self.kernel_(self.X_fit_)
-        K[np.diag_indices_from(K)] += self.y_err
+        K[np.diag_indices_from(K)] += self.sigma_squared_n
         self.L_ = cholesky(K, lower=True)  # Line 2
         self.alpha_ = cho_solve((self.L_, True), self.y_fit_)  # Line 3
 
@@ -254,7 +263,7 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
         else:
             K = kernel(self.X_fit_)
 
-        K[np.diag_indices_from(K)] += self.y_err
+        K[np.diag_indices_from(K)] += self.sigma_squared_n
         try:
             L = cholesky(K, lower=True)  # Line 2
         except np.linalg.LinAlgError:

From c7f9ff50a37732836928787fa94c38fb17d27079 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 2 Apr 2015 15:22:23 +0200
Subject: [PATCH 070/165] ENH GaussianProcessRegressor supports multiple
 restarts of the optimizer

---
 sklearn/gaussian_process/gpr.py | 55 ++++++++++++++++++++++++++++++---
 1 file changed, 50 insertions(+), 5 deletions(-)

diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 97df6d6fe86b8..fa3623de21e16 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -4,6 +4,8 @@
 #
 # License: BSD 3 clause
 
+from operator import itemgetter
+
 import numpy as np
 from scipy.linalg import cholesky, cho_solve, solve, solve_triangular
 from scipy.optimize import fmin_l_bfgs_b
@@ -50,6 +52,19 @@ class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
 
             'fmin_l_bfgs_b'
 
+    n_restarts_optimizer: int, optional (default: 1)
+        The number of restarts of the optimizer for finding the kernel's
+        parameters which maximize the log-marginal likelihood. The first run
+        of the optimizer is performed from the kernel's initial parameters,
+        the remaining ones (if any) from thetas sampled log-uniform randomly
+        from the space of allowed theta-values. If greater than 1, all bounds
+        must be finite.
+
+    random_state : integer or numpy.RandomState, optional
+        The generator used to sample the initial thetas of the optimizer
+        restarts. If an integer is given, it fixes the seed. Defaults to
+        the global numpy random number generator.
+
     Attributes
     ----------
     X_fit_ : array-like, shape = (n_samples, n_features)
@@ -73,10 +88,13 @@ class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
     """
 
     def __init__(self, kernel=None, sigma_squared_n=1e-10,
-                 optimizer="fmin_l_bfgs_b"):
+                 optimizer="fmin_l_bfgs_b", n_restarts_optimizer=1,
+                 random_state=None):
         self.kernel = kernel
         self.sigma_squared_n = sigma_squared_n
         self.optimizer = optimizer
+        self.n_restarts_optimizer = n_restarts_optimizer
+        self.rng = check_random_state(random_state)
 
     def fit(self, X, y):
         """Fit Gaussian process regression model
@@ -109,15 +127,36 @@ def fit(self, X, y):
         self.X_fit_ = X
         self.y_fit_ = y
 
-        if self.optimizer == "fmin_l_bfgs_b":
+        if self.optimizer in ["fmin_l_bfgs_b"]:
             # Choose hyperparameters based on maximizing the log-marginal
-            # likelihood using fmin_l_bfgs_b
+            # likelihood (potentially starting from several initial values)
             def obj_func(theta):
                 lml, grad = self.log_marginal_likelihood(theta,
                                                          eval_gradient=True)
                 return -lml, -grad
-            self.theta_, _, _ = fmin_l_bfgs_b(obj_func, self.kernel_.theta,
-                                              bounds=self.kernel_.bounds)
+
+            # First optimize starting from theta specified in kernel
+            optima = [(self._constrained_optimization(obj_func,
+                                                      self.kernel_.theta,
+                                                      self.kernel_.bounds))]
+
+            # Additional runs are performed from log-uniform chosen initial
+            # theta
+            if self.n_restarts_optimizer > 1:
+                if not np.isfinite(self.kernel_.bounds).all():
+                    raise ValueError(
+                        "Multiple optimizer restarts (n_restarts_optimizer>1) "
+                        "requires that all bounds are finite.")
+                log_bounds = np.log(self.kernel_.bounds)
+                for iteration in range(1, self.n_restarts_optimizer):
+                    theta_initial = np.exp(self.rng.uniform(log_bounds[:, 0],
+                                                            log_bounds[:, 1]))
+                    optima.append(
+                        self._constrained_optimization(obj_func, theta_initial,
+                                                       self.kernel_.bounds))
+            # Select result from run with minimal (negative) log-marginal
+            # likelihood
+            self.theta_ = optima[np.argmin(map(itemgetter(1), optima))][0]
             self.kernel_.theta = self.theta_
         elif self.optimizer is None:
             # Use initially provided hyperparameters
@@ -288,3 +327,9 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
             return log_likelihood, log_likelihood_gradient
         else:
             return log_likelihood
+
+    def _constrained_optimization(self, obj_func, initial_theta, bounds):
+        if self.optimizer in ["fmin_l_bfgs_b"]:
+            theta_opt, func_min, _ = \
+                fmin_l_bfgs_b(obj_func, initial_theta, bounds=bounds)
+        return theta_opt, func_min

From 90aceb82c4ecd757a03e9280856914b4adf09daf Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 2 Apr 2015 15:23:28 +0200
Subject: [PATCH 071/165] TST An increasing number of random-starts of GPR only
 increases the LML of the chosen theta.

---
 sklearn/gaussian_process/tests/test_gpr.py | 27 +++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index 467994b6d891e..b3eb7b63fdbc0 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -8,7 +8,8 @@
 from scipy.optimize import approx_fprime
 
 from sklearn.gaussian_process import GaussianProcessRegressor
-from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
+from sklearn.gaussian_process.kernels \
+    import RBF, ConstantKernel as C, WhiteKernel
 
 from sklearn.utils.testing \
     import (assert_true, assert_greater, assert_array_less,
@@ -149,3 +150,27 @@ def test_anisotropic_kernel():
     kernel = RBF([1.0, 1.0])
     gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
     assert_greater(gpr.kernel_.theta[1], gpr.kernel_.theta[0] * 5)
+
+def test_random_starts():
+    """
+    Test that an increasing number of random-starts of GP fitting only
+    increases the log marginal likelihood of the chosen theta.
+    """
+    n_samples, n_features = 25, 3
+    np.random.seed(0)
+    rng = np.random.RandomState(0)
+    X = rng.randn(n_samples, n_features) * 2 - 1
+    y = np.sin(X).sum(axis=1) + np.sin(3 * X).sum(axis=1) \
+        + rng.normal(scale=0.1, size=n_samples)
+
+    kernel = C(1.0, (1e-2, 1e2)) \
+        * RBF(l=[1.0] * n_features, l_bounds=[(1e-4, 1e+2)] * n_features) \
+        + WhiteKernel(c=1e-5, c_bounds=(1e-5, 1e1))
+    last_lml = -np.inf
+    for n_restarts_optimizer in range(1, 10):
+        gp = GaussianProcessRegressor(
+            kernel=kernel, n_restarts_optimizer=n_restarts_optimizer,
+            random_state=0,).fit(X, y)
+        lml = gp.log_marginal_likelihood(gp.theta_)
+        assert_greater(lml, last_lml - np.finfo(np.float32).eps)
+        last_lml = lml

From 3d18f95203cb2b92e262ee04f958786f1c4c4ec7 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 2 Apr 2015 15:23:56 +0200
Subject: [PATCH 072/165] DOC Adapted old example of fitting with noisy targets
 to new GPR class

---
 .../plot_gpr_noisy_targets.py                 | 41 +++++++------------
 1 file changed, 15 insertions(+), 26 deletions(-)

diff --git a/examples/gaussian_process/plot_gpr_noisy_targets.py b/examples/gaussian_process/plot_gpr_noisy_targets.py
index 33b78750d1fe4..b88f0e816b18c 100644
--- a/examples/gaussian_process/plot_gpr_noisy_targets.py
+++ b/examples/gaussian_process/plot_gpr_noisy_targets.py
@@ -8,35 +8,32 @@
 
 A simple one-dimensional regression exercise computed in two different ways:
 
-1. A noise-free case with a cubic correlation model
-2. A noisy case with a squared Euclidean correlation model
+1. A noise-free case
+2. A noisy case with known noise-level per datapoint
 
-In both cases, the model parameters are estimated using the maximum
+In both cases, the kernel's parameters are estimated using the maximum
 likelihood principle.
 
 The figures illustrate the interpolating property of the Gaussian Process
 model as well as its probabilistic nature in the form of a pointwise 95%
 confidence interval.
 
-Note that the parameter ``nugget`` is applied as a Tikhonov regularization
-of the assumed covariance between the training points.  In the special case
-of the squared euclidean correlation model, nugget is mathematically equivalent
-to a normalized variance:  That is
-
-.. math::
-   \mathrm{nugget}_i = \left[\frac{\sigma_i}{y_i}\right]^2
-
+Note that the parameter ``sigma_squared_n`` is applied as a Tikhonov
+regularization of the assumed covariance between the training points.
 """
 print(__doc__)
 
 # Author: Vincent Dubourg <vincent.dubourg@gmail.com>
 #         Jake Vanderplas <vanderplas@astro.washington.edu>
+#         Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
 # Licence: BSD 3 clause
 
 import numpy as np
-from sklearn.gaussian_process import GaussianProcess
 from matplotlib import pyplot as pl
 
+from sklearn.gaussian_process import GaussianProcessRegressor
+from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
+
 np.random.seed(1)
 
 
@@ -56,15 +53,14 @@ def f(x):
 x = np.atleast_2d(np.linspace(0, 10, 1000)).T
 
 # Instanciate a Gaussian Process model
-gp = GaussianProcess(corr='cubic', theta0=1e-2, thetaL=1e-4, thetaU=1e-1,
-                     random_start=100)
+kernel = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2))
+gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10)
 
 # Fit to data using Maximum Likelihood Estimation of the parameters
 gp.fit(X, y)
 
 # Make the prediction on the meshed x-axis (ask for MSE as well)
-y_pred, MSE = gp.predict(x, eval_MSE=True)
-sigma = np.sqrt(MSE)
+y_pred, sigma = gp.predict(x, return_std=True)
 
 # Plot the function, the prediction and the 95% confidence interval based on
 # the MSE
@@ -92,22 +88,15 @@ def f(x):
 noise = np.random.normal(0, dy)
 y += noise
 
-# Mesh the input space for evaluations of the real function, the prediction and
-# its MSE
-x = np.atleast_2d(np.linspace(0, 10, 1000)).T
-
 # Instanciate a Gaussian Process model
-gp = GaussianProcess(corr='squared_exponential', theta0=1e-1,
-                     thetaL=1e-3, thetaU=1,
-                     nugget=(dy / y) ** 2,
-                     random_start=100)
+gp = GaussianProcessRegressor(kernel=kernel, sigma_squared_n=(dy / y) ** 2,
+                              n_restarts_optimizer=10)
 
 # Fit to data using Maximum Likelihood Estimation of the parameters
 gp.fit(X, y)
 
 # Make the prediction on the meshed x-axis (ask for MSE as well)
-y_pred, MSE = gp.predict(x, eval_MSE=True)
-sigma = np.sqrt(MSE)
+y_pred, sigma = gp.predict(x, return_std=True)
 
 # Plot the function, the prediction and the 95% confidence interval based on
 # the MSE

From c29a2b2810ae941f0adb0b0d77f4f75152551ac9 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 2 Apr 2015 16:07:34 +0200
Subject: [PATCH 073/165] ENH GaussianProcessClassifier supports multiple
 restarts of the optimizer

---
 sklearn/gaussian_process/gpc.py | 53 +++++++++++++++++++++++++++++----
 1 file changed, 48 insertions(+), 5 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 3220f447b5831..ba770f0acd557 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -5,6 +5,7 @@
 # License: BSD 3 clause
 
 import warnings
+from operator import itemgetter
 
 import numpy as np
 from scipy.linalg import cholesky, cho_solve, solve
@@ -14,6 +15,7 @@
 from sklearn.base import BaseEstimator, ClassifierMixin, clone
 from sklearn.gaussian_process.kernels import RBF
 from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
+from sklearn.utils import check_random_state
 from sklearn.preprocessing import LabelEncoder
 
 # Values required for approximating the logistic sigmoid by
@@ -61,6 +63,14 @@ class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
 
             'fmin_l_bfgs_b'
 
+    n_restarts_optimizer: int, optional (default: 1)
+        The number of restarts of the optimizer for finding the kernel's
+        parameters which maximize the log-marginal likelihood. The first run
+        of the optimizer is performed from the kernel's initial parameters,
+        the remaining ones (if any) from thetas sampled log-uniform randomly
+        from the space of allowed theta-values. If greater than 1, all bounds
+        must be finite.
+
     warm_start : bool, optional (default: False)
         If warm-starts are enabled, the solution of the last Newton iteration
         on the Laplace approximation of the posterior mode is used as
@@ -68,6 +78,10 @@ class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
         up convergence when _posterior_mode is called several times on similar
         problems as in hyperparameter optimization.
 
+    random_state : integer or numpy.RandomState, optional
+        The generator used to sample the initial thetas of the optimizer
+        restarts. If an integer is given, it fixes the seed. Defaults to
+        the global numpy random number generator.
 
     Attributes
     ----------
@@ -100,11 +114,13 @@ class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
     """
 
     def __init__(self, kernel=None, jitter=0.0, optimizer="fmin_l_bfgs_b",
-                 warm_start=False):
+                 n_restarts_optimizer=1, warm_start=False, random_state=None):
         self.kernel = kernel
         self.jitter = jitter
         self.optimizer = optimizer
+        self.n_restarts_optimizer = n_restarts_optimizer
         self.warm_start = warm_start
+        self.rng = check_random_state(random_state)
 
     def fit(self, X, y):
         """Fit Gaussian process regression model
@@ -144,15 +160,36 @@ def fit(self, X, y):
                           % self.classes_)
             self.classes_ = np.array([self.classes_[0], self.classes_[0]])
 
-        if self.optimizer == "fmin_l_bfgs_b":
+        if self.optimizer in ["fmin_l_bfgs_b"]:
             # Choose hyperparameters based on maximizing the log-marginal
-            # likelihood
+            # likelihood (potentially starting from several initial values)
             def obj_func(theta):
                 lml, grad = self.log_marginal_likelihood(theta,
                                                          eval_gradient=True)
                 return -lml, -grad
-            self.theta_, _, _ = fmin_l_bfgs_b(obj_func, self.kernel_.theta,
-                                              bounds=self.kernel_.bounds)
+
+            # First optimize starting from theta specified in kernel
+            optima = [(self._constrained_optimization(obj_func,
+                                                      self.kernel_.theta,
+                                                      self.kernel_.bounds))]
+
+            # Additional runs are performed from log-uniform chosen initial
+            # theta
+            if self.n_restarts_optimizer > 1:
+                if not np.isfinite(self.kernel_.bounds).all():
+                    raise ValueError(
+                        "Multiple optimizer restarts (n_restarts_optimizer>1) "
+                        "requires that all bounds are finite.")
+                log_bounds = np.log(self.kernel_.bounds)
+                for iteration in range(1, self.n_restarts_optimizer):
+                    theta_initial = np.exp(self.rng.uniform(log_bounds[:, 0],
+                                                            log_bounds[:, 1]))
+                    optima.append(
+                        self._constrained_optimization(obj_func, theta_initial,
+                                                       self.kernel_.bounds))
+            # Select result from run with minimal (negative) log-marginal
+            # likelihood
+            self.theta_ = optima[np.argmin(map(itemgetter(1), optima))][0]
             self.kernel_.theta = self.theta_
         elif self.optimizer is None:
             self.theta_ = self.kernel_.theta
@@ -344,3 +381,9 @@ def _posterior_mode(self, K, return_temporaries=False):
             return log_marginal_likelihood, (pi, W_sr, L, b, a)
         else:
             return log_marginal_likelihood
+
+    def _constrained_optimization(self, obj_func, initial_theta, bounds):
+        if self.optimizer in ["fmin_l_bfgs_b"]:
+            theta_opt, func_min, _ = \
+                fmin_l_bfgs_b(obj_func, initial_theta, bounds=bounds)
+        return theta_opt, func_min

From b6681a355bc95080c7f4c663bb7a5d13e32a4e26 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 2 Apr 2015 16:07:48 +0200
Subject: [PATCH 074/165] TST An increasing number of random-starts of GPC only
 increases the LML of the chosen theta.

---
 sklearn/gaussian_process/tests/test_gpc.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py
index dd88fae268eef..ef54d15130877 100644
--- a/sklearn/gaussian_process/tests/test_gpc.py
+++ b/sklearn/gaussian_process/tests/test_gpc.py
@@ -65,3 +65,25 @@ def test_lml_gradient():
                           1e-10)
 
         assert_almost_equal(lml_gradient, lml_gradient_approx, 3)
+
+def test_random_starts():
+    """
+    Test that an increasing number of random-starts of GP fitting only
+    increases the log marginal likelihood of the chosen theta.
+    """
+    n_samples, n_features = 25, 3
+    np.random.seed(0)
+    rng = np.random.RandomState(0)
+    X = rng.randn(n_samples, n_features) * 2 - 1
+    y = (np.sin(X).sum(axis=1) + np.sin(3 * X).sum(axis=1)) > 0
+
+    kernel = C(1.0, (1e-2, 1e2)) \
+        * RBF(l=[1e-3] * n_features, l_bounds=[(1e-4, 1e+2)] * n_features)
+    last_lml = -np.inf
+    for n_restarts_optimizer in range(1, 10):
+        gp = GaussianProcessClassifier(
+            kernel=kernel, n_restarts_optimizer=n_restarts_optimizer,
+            random_state=0,).fit(X, y)
+        lml = gp.log_marginal_likelihood(gp.theta_)
+        assert_greater(lml, last_lml - np.finfo(np.float32).eps)
+        last_lml = lml

From 9947c259db907ba1dc8bd4c574a78c6fda4b3edc Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 2 Apr 2015 17:16:23 +0200
Subject: [PATCH 075/165] FIX Fetching CO2 data from mldata

---
 examples/gaussian_process/plot_gpr_co2.py   | 13 +++++++------
 examples/gaussian_process/plot_gpr_noisy.py |  4 ++--
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py
index 71236b48c3094..4361fc3912807 100644
--- a/examples/gaussian_process/plot_gpr_co2.py
+++ b/examples/gaussian_process/plot_gpr_co2.py
@@ -6,17 +6,17 @@
 # License: BSD 3 clause
 
 import numpy as np
-import statsmodels.api as sm   # XXX: Upload data on mldata
 
 from matplotlib import pyplot as plt
 
 from sklearn.gaussian_process import GaussianProcessRegressor
 from sklearn.gaussian_process.kernels \
     import RBF, Kernel, WhiteKernel, RationalQuadratic, ExpSineSquared
+from sklearn.datasets import fetch_mldata
 
-data = sm.datasets.get_rdataset("co2").data
-X = np.array(data.time)[:, np.newaxis]
-y = np.array(data.co2)
+data = fetch_mldata('mauna-loa-atmospheric-co2').data
+X = data[:, [1]]
+y = data[:, 0]
 y_mean = y.mean()
 
 # Kernel with parameters given in GPML book
@@ -26,7 +26,8 @@
 k4 = 0.18**2 * RBF(l=0.134) + WhiteKernel(c=0.19**2) # noise terms
 kernel_gpml = k1 + k2 + k3 + k4
 
-gp = GaussianProcessRegressor(kernel=kernel_gpml, y_err=0, optimizer=None)
+gp = GaussianProcessRegressor(kernel=kernel_gpml, sigma_squared_n=0,
+							  optimizer=None)
 gp.fit(X, y - y_mean)
 
 print "GPML kernel: %s" % gp.kernel_
@@ -41,7 +42,7 @@
 									   c_bounds=(1e-3, np.inf))  # noise terms
 kernel = k1 + k2 + k3 + k4
 
-gp = GaussianProcessRegressor(kernel=kernel, y_err=0)
+gp = GaussianProcessRegressor(kernel=kernel, sigma_squared_n=0)
 gp.fit(X, y - y_mean)
 
 print "\nLearned kernel: %s" % gp.kernel_
diff --git a/examples/gaussian_process/plot_gpr_noisy.py b/examples/gaussian_process/plot_gpr_noisy.py
index 108ba936a6f15..519ff838fe2e3 100644
--- a/examples/gaussian_process/plot_gpr_noisy.py
+++ b/examples/gaussian_process/plot_gpr_noisy.py
@@ -35,7 +35,7 @@
 plt.figure(0)
 kernel = 1.0 * RBF(l=100.0) + WhiteKernel(c=1e-5, c_bounds=(1e-10, 1e+1))
 gp = GaussianProcessRegressor(kernel=kernel,
-                              y_err=0.0).fit(X, y)
+                              sigma_squared_n=0.0).fit(X, y)
 X_ = np.linspace(0, 5, 100)
 y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
 plt.plot(X_, y_mean, 'k', lw=3, zorder=9)
@@ -53,7 +53,7 @@
 plt.figure(1)
 kernel = 1.0 * RBF(l=1.0) + WhiteKernel(c=1e-5, c_bounds=(1e-10, 1e+1))
 gp = GaussianProcessRegressor(kernel=kernel,
-                              y_err=0.0).fit(X, y)
+                              sigma_squared_n=0.0).fit(X, y)
 X_ = np.linspace(0, 5, 100)
 y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
 plt.plot(X_, y_mean, 'k', lw=3, zorder=9)

From eaba93c2113d87387597a106017877abd2fb5e06 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Fri, 3 Apr 2015 14:30:03 +0200
Subject: [PATCH 076/165] FIX GP kernels check for type equality before
 checking attribute equality

---
 sklearn/gaussian_process/kernels.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 4eac08e486fe3..3c9dfb9737665 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -310,6 +310,8 @@ def bounds(self, bounds):
         self.k2.bounds = bounds[k1_dims:]
 
     def __eq__(self, b):
+        if type(self) != type(b):
+            return False
         return (self.k1 == b.k1 and self.k2 == b.k2) \
             or (self.k1 == b.k2 and self.k2 == b.k1)
 
@@ -542,6 +544,8 @@ def bounds(self, bounds):
         self.kernel.bounds = bounds
 
     def __eq__(self, b):
+        if type(self) != type(b):
+            return False
         return (self.kernel == b.kernel and self.exponent == b.exponent)
 
     def __call__(self, X, Y=None, eval_gradient=False):

From 63c931aed2c36cfea276c3d8f67106e65827ba17 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Fri, 3 Apr 2015 14:30:44 +0200
Subject: [PATCH 077/165] FIX cross_validation and pairwise_kernels treat GP
 kernels as special case

---
 sklearn/cross_validation.py | 4 +++-
 sklearn/metrics/pairwise.py | 5 +++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index fa7c7f210bc05..baafcf0bba32d 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -31,6 +31,7 @@
 from .externals.six.moves import zip
 from .metrics.scorer import check_scoring
 from .utils.fixes import bincount
+from .gaussian_process.kernels import Kernel as GPKernel
 
 __all__ = ['KFold',
            'LeaveOneLabelOut',
@@ -1338,7 +1339,8 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
 
 def _safe_split(estimator, X, y, indices, train_indices=None):
     """Create subset of dataset and properly handle kernels."""
-    if hasattr(estimator, 'kernel') and callable(estimator.kernel):
+    if hasattr(estimator, 'kernel') and callable(estimator.kernel) \
+       and not isinstance(estimator.kernel, GPKernel):
         # cannot compute the kernel values with custom function
         raise ValueError("Cannot use a custom kernel function. "
                          "Precompute the kernel matrix instead.")
diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index a5120f2cd0cec..de696470ebcec 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -1270,6 +1270,9 @@ def pairwise_kernels(X, Y=None, metric="linear", filter_params=False,
     If metric is 'precomputed', Y is ignored and X is returned.
 
     """
+    # import GPKernel locally to prevent circular imports
+    from ..gaussian_process.kernels import Kernel as GPKernel
+
     if metric == "precomputed":
         return X
     elif metric in PAIRWISE_KERNEL_FUNCTIONS:
@@ -1277,6 +1280,8 @@ def pairwise_kernels(X, Y=None, metric="linear", filter_params=False,
             kwds = dict((k, kwds[k]) for k in kwds
                         if k in KERNEL_PARAMS[metric])
         func = PAIRWISE_KERNEL_FUNCTIONS[metric]
+    elif isinstance(metric, GPKernel):
+        func = metric.__call__
     elif callable(metric):
         func = partial(_pairwise_callable, metric=metric, **kwds)
     else:

From e0e04938265e859dbbe00bec62a7098f1ce941b0 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sat, 4 Apr 2015 11:49:20 +0200
Subject: [PATCH 078/165] DOC Adding example comparing kernel ridge with GPR.

---
 .../gaussian_process/plot_compare_gpr_krr.py  | 115 ++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100644 examples/gaussian_process/plot_compare_gpr_krr.py

diff --git a/examples/gaussian_process/plot_compare_gpr_krr.py b/examples/gaussian_process/plot_compare_gpr_krr.py
new file mode 100644
index 0000000000000..71e953882e237
--- /dev/null
+++ b/examples/gaussian_process/plot_compare_gpr_krr.py
@@ -0,0 +1,115 @@
+"""
+==========================================================
+Comparison of kernel ridge and Gaussian process regression
+==========================================================
+
+Both kernel ridge regression (KRR) and Gaussian process regression (GPR) learn
+a target function by employing internally the "kernel trick". KRR learns a
+linear function in the space induced by the respective kernel which corresponds
+to a non-linear function in the original space. The linear function in the
+kernel space is chosen based on the mean-squared error loss with
+ridge regularization. GPR uses the kernel to define the covariance of
+a prior distribution over the target functions and uses the observed training
+data to define a likelihood function. Based on Bayes theorem, a (Gaussian)
+posterior distribution over target functions is defined, whose mean is used
+for prediction.
+
+A major difference is that GPR can choose the kernel's hyperparameters based
+on gradient-ascent on the marginal likelihood function while KRR needs to
+perform a grid search on a cross-validated loss function (mean-squared error
+loss). A further difference is that GPR learns a generative, probabilistic
+model of the target function and can thus provide meaningful confidence
+intervals and posterior samples along with the predictions while KRR only
+provides predictions.
+
+This example illustrates both methods on an artificial dataset, which
+consists of a sinusoidal target function and strong noise. The figure compares
+the learned model of KRR and GPR based on an ExpSineSquared kernel, which is
+suited for learning periodic functions. The kernel's hyperparameters control
+the smoothness (l) and periodicity of the kernel (p). Moreover, the noise level
+of the data is learned explicitly by GPR by an additional WhiteKernel component
+in the kernel and by the regularization parameter alpha of KRR.
+
+The figure shows that both methods learn reasonable models of the target
+function. GPR correctly identifies the periodicity of the function to be
+roughly 2*pi (6.28), while KRR chooses the doubled periodicity 4*pi. Besides
+that, GPR provides reasonable confidence bounds on the prediction which are not
+available for KRR. A major difference between the two methods is the time
+required for fitting and predicting: while fitting KRR is fast in principle,
+the grid-search for hyperparameter optimization scales exponentially with the
+number of hyperparameters ("curse of dimensionality"). The gradient-based
+optimization of the parameters in GPR does not suffer from this exponential
+scaling and is thus considerably faster on this example with 3-dimensional
+hyperparameter space. The time for predicting is similar; however, generating
+the variance of the predictive distribution of GPR takes considerably longer
+than just predicting the mean.
+"""
+print(__doc__)
+
+# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
+# License: BSD 3 clause
+
+
+import time
+
+import numpy as np
+
+import matplotlib.pyplot as plt
+
+from sklearn.kernel_ridge import KernelRidge
+from sklearn.grid_search import GridSearchCV
+from sklearn.gaussian_process import GaussianProcessRegressor
+from sklearn.gaussian_process.kernels import WhiteKernel, ExpSineSquared
+
+rng = np.random.RandomState(0)
+
+# Generate sample data
+X = 15 * rng.rand(100, 1)
+y = np.sin(X).ravel()
+y += 3 * (0.5 - rng.rand(X.shape[0]))  # add noise
+
+# Fit KernelRidge with parameter selection based on 5-fold cross validation
+param_grid = {"alpha": [1e0, 1e-1, 1e-2, 1e-3],
+              "kernel": [ExpSineSquared(l, p)
+                         for l in np.logspace(-2, 2, 10)
+                         for p in np.logspace(0, 2, 10)]}
+kr = GridSearchCV(KernelRidge(), cv=5, param_grid=param_grid)
+stime = time.time()
+kr.fit(X, y)
+print "Time for KRR fitting: %.3f" % (time.time() - stime)
+
+gp_kernel = ExpSineSquared(1.0, 5.0) + WhiteKernel(1e-1)
+gpr = GaussianProcessRegressor(kernel=gp_kernel)
+stime = time.time()
+gpr.fit(X, y)
+print "Time for GPR fitting: %.3f" % (time.time() - stime)
+
+# Predict using kernel ridge
+X_plot = np.linspace(0, 20, 10000)[:, None]
+stime = time.time()
+y_kr = kr.predict(X_plot)
+print "Time for KRR prediction: %.3f" % (time.time() - stime)
+
+# Predict using Gaussian process regression
+stime = time.time()
+y_gpr = gpr.predict(X_plot, return_std=False)
+print "Time for GPR prediction: %.3f" % (time.time() - stime)
+
+stime = time.time()
+y_gpr, y_std = gpr.predict(X_plot, return_std=True)
+print "Time for GPR prediction with standard-deviation: %.3f" \
+    % (time.time() - stime)
+
+# Plot results
+plt.scatter(X, y, c='k', label='data')
+plt.plot(X_plot, np.sin(X_plot), c='k', label='True')
+plt.plot(X_plot, y_kr, c='g', label='KRR (%s)' % kr.best_params_)
+plt.plot(X_plot, y_gpr, c='r', label='GPR (%s)' % gpr.kernel_)
+plt.fill_between(X_plot[:, 0], y_gpr - y_std, y_gpr + y_std, color='r',
+                 alpha=0.2)
+plt.xlabel('data')
+plt.ylabel('target')
+plt.xlim(0, 20)
+plt.title('GPR versus Kernel Ridge')
+plt.legend(loc="best")
+plt.show()

From 7b47208f51528f73e68650d47b48916cd50ab47f Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sat, 4 Apr 2015 17:49:31 +0200
Subject: [PATCH 079/165] DOC Revised plot_gpc example

---
 examples/gaussian_process/plot_gpc.py | 52 ++++++++++++++++++++-------
 1 file changed, 40 insertions(+), 12 deletions(-)

diff --git a/examples/gaussian_process/plot_gpc.py b/examples/gaussian_process/plot_gpc.py
index 45d1c3385d8f1..1524ac19918c7 100644
--- a/examples/gaussian_process/plot_gpc.py
+++ b/examples/gaussian_process/plot_gpc.py
@@ -1,9 +1,21 @@
 """Gaussian process classification (GPC)
 
-This example illustrates both prediction of the prior GPC and the posterior
-GPC. While the posterior model has a considerably larger
-log-marginal-likelihood, the generated predictions are not optimal. This
-is caused by the Laplace approximations used internally by GPC.
+This example illustrates the predicted probability of GPC for an RBF kernel
+with different choices of the hyperparameters. The first figure shows the
+predicted probability of GPC with arbitrarily chosen hyperparameters and with
+the hyperparameters corresponding to the maximum log-marginal-likelihood (LML).
+
+While the hyperparameters chosen by optimizing LML have a considerably larger
+LML, they perform slightly worse according to the log-loss on test data. The
+figure shows that this is because they exhibit a steep change of the class
+probabilities at the class boundaries (which is good) but have predicted
+probabilities close to 0.5 far away from the class boundaries (which is bad).
+This undesirable effect is caused by the Laplace approximation used
+internally by GPC.
+
+The second figure shows the log-marginal-likelihood for different choices of
+the kernel's hyperparameters, highlighting the two choices of the
+hyperparameters used in the first figure by black dots.
 """
 print __doc__
 
@@ -15,38 +27,52 @@
 
 from matplotlib import pyplot as plt
 
+from sklearn.metrics.classification import accuracy_score, log_loss
 from sklearn.gaussian_process import GaussianProcessClassifier
 from sklearn.gaussian_process.kernels import RBF
 
 
 # Generate data
+train_size = 50
 rng = np.random.RandomState(0)
-X = rng.uniform(0, 5, 50)[:, np.newaxis]
-y = np.array(np.sin((X[:, 0] - 2.5) ** 2) > 0.0, dtype=int)
+X = rng.uniform(0, 5, 100)[:, np.newaxis]
+y = np.array(X[:, 0] > 2.5, dtype=int)
 
 # Specify Gaussian Processes with fixed and optimized hyperparameters
 gp_fix = GaussianProcessClassifier(kernel=1.0 * RBF(l=1.0),
-								   optimizer=None).fit(X, y)
+                                   optimizer=None)
+gp_fix.fit(X[:train_size], y[:train_size])
 
-gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(l=1.0)).fit(X, y)
+gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(l=1.0))
+gp_opt.fit(X[:train_size], y[:train_size])
 
 print "Log Marginal Likelihood (initial): %.3f" % \
     gp_fix.log_marginal_likelihood(gp_fix.theta_)
 print "Log Marginal Likelihood (optimized): %.3f" % \
-    gp_fix.log_marginal_likelihood(gp_opt.theta_)
+    gp_opt.log_marginal_likelihood(gp_opt.theta_)
+
+print "Accuracy: %.3f (initial) %.3f (optimized)" \
+    % (accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])),
+       accuracy_score(y[:train_size], gp_opt.predict(X[:train_size])))
+print "Log-loss: %.3f (initial) %.3f (optimized)" \
+    % (log_loss(y[:train_size], gp_fix.predict_proba(X[:train_size])[:, 1]),
+       log_loss(y[:train_size], gp_opt.predict_proba(X[:train_size])[:, 1]))
 
 
 # Plot posteriors
 plt.figure(0)
-plt.scatter(X[:, 0], y)
+plt.scatter(X[:train_size, 0], y[:train_size], c='k', label="Train data")
+plt.scatter(X[train_size:, 0], y[train_size:], c='g', label="Test data")
 X_ = np.linspace(0, 5, 100)
 plt.plot(X_, gp_fix.predict_proba(X_[:, np.newaxis])[:, 1], 'r',
          label="Initial kernel: %s" % gp_fix.kernel_)
 plt.plot(X_, gp_opt.predict_proba(X_[:, np.newaxis])[:, 1], 'b',
          label="Optimized kernel: %s" % gp_opt.kernel_)
-plt.legend(loc="best")
 plt.xlabel("Feature")
 plt.ylabel("Class 1 probability")
+plt.xlim(0, 5)
+plt.ylim(-0.25, 1.5)
+plt.legend(loc="best")
 
 # Plot LML landscape
 plt.figure(1)
@@ -56,10 +82,12 @@
 LML = [[gp_opt.log_marginal_likelihood([Theta0[i, j], Theta1[i, j]])
         for i in range(Theta0.shape[0])] for j in range(Theta0.shape[1])]
 LML = np.array(LML).T
+plt.plot(gp_fix.theta_[0], gp_fix.theta_[1], 'ko', zorder=10)
+plt.plot(gp_opt.theta_[0], gp_opt.theta_[1], 'ko', zorder=10)
 plt.pcolor(Theta0, Theta1, LML)
 plt.xscale("log")
 plt.yscale("log")
-plt.colorbar()
+plt.colorbar(label="Log-marginal Likelihood")
 plt.xlabel("Magnitude")
 plt.ylabel("Length-scale")
 plt.title("Log-marginal-likelihood")

From 57c4746a141a60bb6d872240a8e074838f559e29 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 8 Apr 2015 11:28:51 +0200
Subject: [PATCH 080/165] DOC Adding documentation for GP kernels

---
 doc/modules/classes.rst                       |  28 +-
 doc/modules/gaussian_process.rst              | 300 +++++++++++++++++-
 .../gaussian_process/plot_compare_gpr_krr.py  |  12 +-
 examples/gaussian_process/plot_gpc.py         |  29 +-
 .../plot_gpc_isoprobability.py                |   8 +-
 examples/gaussian_process/plot_gpc_xor.py     |   7 +-
 examples/gaussian_process/plot_gpr_co2.py     |  18 +-
 examples/gaussian_process/plot_gpr_noisy.py   |   7 +-
 .../plot_gpr_noisy_targets.py                 |   5 +-
 .../plot_gpr_prior_posterior.py               |  11 +-
 sklearn/gaussian_process/gpr.py               |   3 +-
 11 files changed, 361 insertions(+), 67 deletions(-)

diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index b310ba3ea3dbf..f69618cfd5078 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -490,21 +490,25 @@ From text
   :toctree: generated/
   :template: class.rst
 
-  gaussian_process.GaussianProcess
+  gaussian_process.GaussianProcessRegressor
+  gaussian_process.GaussianProcessClassifier
+
+Kernels:
 
 .. autosummary::
-   :toctree: generated
-   :template: function.rst
+  :toctree: generated/
+  :template: class.rst
 
-   gaussian_process.correlation_models.absolute_exponential
-   gaussian_process.correlation_models.squared_exponential
-   gaussian_process.correlation_models.generalized_exponential
-   gaussian_process.correlation_models.pure_nugget
-   gaussian_process.correlation_models.cubic
-   gaussian_process.correlation_models.linear
-   gaussian_process.regression_models.constant
-   gaussian_process.regression_models.linear
-   gaussian_process.regression_models.quadratic
+  gaussian_process.kernels.Sum
+  gaussian_process.kernels.Product
+  gaussian_process.kernels.Exponentiation
+  gaussian_process.kernels.ConstantKernel
+  gaussian_process.kernels.WhiteKernel
+  gaussian_process.kernels.RBF
+  gaussian_process.kernels.RationalQuadratic
+  gaussian_process.kernels.ExpSineSquared
+  gaussian_process.kernels.DotProduct
+  gaussian_process.kernels.PairwiseKernel
 
 
 .. _grid_search_ref:
diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst
index a272dd177fa1e..5042378575522 100644
--- a/doc/modules/gaussian_process.rst
+++ b/doc/modules/gaussian_process.rst
@@ -45,8 +45,283 @@ The disadvantages of Gaussian Processes for Machine Learning include:
 Thanks to the Gaussian property of the prediction, it has been given varied
 applications: e.g. for global optimization, probabilistic classification.
 
-Examples
-========
+
+Gaussian Process Regression
+===========================
+
+Noise-free and noisy targets
+----------------------------
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_noisy_targets_001.png
+   :target: ../auto_examples/gaussian_process/plot_gpr_noisy_targets.html
+   :align: center
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_noisy_targets_002.png
+   :target: ../auto_examples/gaussian_process/plot_gpr_noisy_targets.html
+   :align: center
+
+
+Gaussian process regression (GPR) with noise-level estimation
+-------------------------------------------------------------
+This example illustrates that GPR with a sum-kernel including a WhiteKernel can
+estimate the noise level of data. An illustration of the
+log-marginal-likelihood (LML) landscape shows that there exist two local
+maxima of LML.
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_noisy_000.png
+   :target: ../auto_examples/gaussian_process/plot_gpr_noisy.html
+   :align: center
+
+The first corresponds to a model with a high noise level and a
+large length scale, which explains all variations in the data by noise.
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_noisy_001.png
+   :target: ../auto_examples/gaussian_process/plot_gpr_noisy.html
+   :align: center
+
+The second one has a smaller noise level and shorter length scale, which explains
+most of the variation by the noise-free functional relationship. The second
+model has a higher likelihood; however, depending on the initial value for the
+hyperparameters, the gradient-based optimization might also converge to the
+high-noise solution. It is thus important to repeat the optimization several
+times for different initializations.
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_noisy_002.png
+   :target: ../auto_examples/gaussian_process/plot_gpr_noisy.html
+   :align: center
+
+
+Comparison of GPR and Kernel Ridge Regression
+---------------------------------------------
+
+Both kernel ridge regression (KRR) and GPR learn
+a target function by employing internally the "kernel trick". KRR learns a
+linear function in the space induced by the respective kernel which corresponds
+to a non-linear function in the original space. The linear function in the
+kernel space is chosen based on the mean-squared error loss with
+ridge regularization. GPR uses the kernel to define the covariance of
+a prior distribution over the target functions and uses the observed training
+data to define a likelihood function. Based on Bayes' theorem, a (Gaussian)
+posterior distribution over target functions is defined, whose mean is used
+for prediction.
+
+A major difference is that GPR can choose the kernel's hyperparameters based
+on gradient-ascent on the marginal likelihood function while KRR needs to
+perform a grid search on a cross-validated loss function (mean-squared error
+loss). A further difference is that GPR learns a generative, probabilistic
+model of the target function and can thus provide meaningful confidence
+intervals and posterior samples along with the predictions while KRR only
+provides predictions.
+
+The following figure illustrates both methods on an artificial dataset, which
+consists of a sinusoidal target function and strong noise. The figure compares
+the learned model of KRR and GPR based on an ExpSineSquared kernel, which is
+suited for learning periodic functions. The kernel's hyperparameters control
+the smoothness (l) and periodicity of the kernel (p). Moreover, the noise level
+of the data is learned explicitly by GPR by an additional WhiteKernel component
+in the kernel and by the regularization parameter alpha of KRR.
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_compare_gpr_krr_001.png
+   :target: ../auto_examples/gaussian_process/plot_compare_gpr_krr.html
+   :align: center
+
+The figure shows that both methods learn reasonable models of the target
+function. GPR correctly identifies the periodicity of the function to be
+roughly 2*pi (6.28), while KRR chooses the doubled periodicity 4*pi. Besides
+that, GPR provides reasonable confidence bounds on the prediction which are not
+available for KRR. A major difference between the two methods is the time
+required for fitting and predicting: while fitting KRR is fast in principle,
+the grid-search for hyperparameter optimization scales exponentially with the
+number of hyperparameters ("curse of dimensionality"). The gradient-based
+optimization of the parameters in GPR does not suffer from this exponential
+scaling and is thus considerably faster on this example with a 3-dimensional
+hyperparameter space. The time for predicting is similar; however, generating
+the variance of the predictive distribution of GPR takes considerably longer
+than just predicting the mean.
+
+Gaussian process regression (GPR) on Mauna Loa CO2 data
+-------------------------------------------------------
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_co2_001.png
+   :target: ../auto_examples/gaussian_process/plot_gpr_co2.html
+   :align: center
+
+
+Gaussian Process Classification
+===============================
+
+Probabilistic predictions with Gaussian process classification (GPC)
+--------------------------------------------------------------------
+
+This example illustrates the predicted probability of GPC for an RBF kernel
+with different choices of the hyperparameters. The first figure shows the
+predicted probability of GPC with arbitrarily chosen hyperparameters and with
+the hyperparameters corresponding to the maximum log-marginal-likelihood (LML).
+
+While the hyperparameters chosen by optimizing LML have a considerably larger
+LML, they perform slightly worse according to the log-loss on test data. The
+figure shows that this is because they exhibit a steep change of the class
+probabilities at the class boundaries (which is good) but have predicted
+probabilities close to 0.5 far away from the class boundaries (which is bad).
+This undesirable effect is caused by the Laplace approximation used
+internally by GPC.
+
+The second figure shows the log-marginal-likelihood for different choices of
+the kernel's hyperparameters, highlighting the two choices of the
+hyperparameters used in the first figure by black dots.
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpc_000.png
+   :target: ../auto_examples/gaussian_process/plot_gpc.html
+   :align: center
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpc_001.png
+   :target: ../auto_examples/gaussian_process/plot_gpc.html
+   :align: center
+
+
+Iso-probability lines for Gaussian Processes classification (GPC)
+-----------------------------------------------------------------
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpc_isoprobability_001.png
+   :target: ../auto_examples/gaussian_process/plot_gpc_isoprobability.html
+   :align: center
+
+
+Illustration of Gaussian process classification (GPC) on the XOR dataset
+------------------------------------------------------------------------
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpc_xor_001.png
+   :target: ../auto_examples/gaussian_process/plot_gpc_xor.html
+   :align: center
+
+
+Kernels for Gaussian Processes
+==============================
+.. currentmodule:: sklearn.gaussian_process.kernels
+
+Kernels (also called "covariance functions" in the context of GPs) are a crucial
+ingredient of GPs which determine the shape of prior and posterior of the GP.
+They encode the assumptions on the function being learned by defining the "similarity"
+of two datapoints combined with the assumption that similar datapoints should
+have similar target values. Two categories of kernels can be distinguished:
+stationary kernels depend only on the distance of two datapoints and not on their
+absolute values :math:`k(x_i, x_j)= k(d(x_i, x_j))` and are thus invariant to
+translations in the input space, while non-stationary kernels
+depend also on the specific values of the datapoints. Stationary kernels can further
+be subdivided into isotropic and anisotropic kernels, where isotropic kernels are
+also invariant to rotations in the input space. For more details, we refer to
+Chapter 4 of [RW2006]_.
+
+Basic kernels
+-------------
+The :class:`ConstantKernel` kernel can be used as part of a product-kernel
+where it scales the magnitude of the other factor (kernel) or as part of a
+sum-kernel, where it modifies the mean of the Gaussian process. It depends
+on a parameter :math:`c`. It is defined as:
+
+.. math::
+   k(x_i, x_j) = c \;\forall\; x_i, x_j
+
+The main use-case of the :class:`WhiteKernel` kernel is as part of a
+sum-kernel where it explains the noise-component of the signal. Tuning its
+parameter :math:`c` corresponds to estimating the noise-level.
+It is defined as:
+
+.. math::
+    k(x_i, x_j) = c \text{ if } x_i == x_j \text{ else } 0
+
+
+Kernel operators
+----------------
+Kernel operators take one or two base kernels and combine them into a new
+kernel. The :class:`Sum` kernel takes two kernels :math:`k1` and :math:`k2`
+and combines them via :math:`k_{sum}(X, Y) = k1(X, Y) + k2(X, Y)`.
+The  :class:`Product` kernel takes two kernels :math:`k1` and :math:`k2`
+and combines them via :math:`k_{product}(X, Y) = k1(X, Y) * k2(X, Y)`.
+The :class:`Exponentiation` kernel takes one base kernel and a scalar parameter
+:math:`exponent` and combines them via
+:math:`k_{exp}(X, Y) = k(X, Y)^\text{exponent}`.
+
+Radial-basis function (RBF) kernel
+----------------------------------
+The :class:`RBF` kernel is a stationary kernel. It is also known as the "squared
+exponential" kernel. It is parameterized by a length-scale parameter :math:`l>0`, which
+can either be a scalar (isotropic variant of the kernel) or a vector with the same
+number of dimensions as the inputs :math:`x` (anisotropic variant of the kernel).
+The kernel is given by:
+
+.. math::
+   k(x_i, x_j) = \text{exp}\left(-\frac{1}{2} d(x_i / l, x_j / l)^2\right)
+
+This kernel is infinitely differentiable, which implies that GPs with this
+kernel as covariance function have mean square derivatives of all orders, and are thus
+very smooth. The prior and posterior of a GP resulting from an RBF kernel is shown in
+the following figure:
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_prior_posterior_000.png
+   :target: ../auto_examples/gaussian_process/plot_gpr_prior_posterior.html
+   :align: center
+
+Rational quadratic kernel
+-------------------------
+
+The :class:`RationalQuadratic` kernel can be seen as a scale mixture (an infinite sum)
+of :class:`RBF` kernels with different characteristic length-scales. It is parameterized
+by a length-scale parameter :math:`l>0` and a scale mixture parameter :math:`\alpha>0`.
+Only the isotropic variant where :math:`l` is a scalar is supported at the moment.
+The kernel is given by:
+
+.. math::
+   k(x_i, x_j) = \left(1 + \frac{d(x_i, x_j)^2}{2\alpha l^2}\right)^{-\alpha}
+
+The prior and posterior of a GP resulting from a RationalQuadratic kernel is shown in
+the following figure:
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_prior_posterior_001.png
+   :target: ../auto_examples/gaussian_process/plot_gpr_prior_posterior.html
+   :align: center
+
+Exp-Sine-Squared kernel
+-----------------------
+
+The :class:`ExpSineSquared` kernel allows modeling periodic functions.
+It is parameterized by a length-scale parameter :math:`l>0` and a periodicity parameter
+:math:`p>0`. Only the isotropic variant where :math:`l` is a scalar is supported at the moment.
+The kernel is given by:
+
+.. math::
+   k(x_i, x_j) = \text{exp}\left(-2 \left(\text{sin}(\pi / p * d(x_i, x_j)) / l\right)^2\right)
+
+The prior and posterior of a GP resulting from an ExpSineSquared kernel is shown in
+the following figure:
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_prior_posterior_002.png
+   :target: ../auto_examples/gaussian_process/plot_gpr_prior_posterior.html
+   :align: center
+
+Dot-Product kernel
+------------------
+
+The :class:`DotProduct` kernel is non-stationary and can be obtained from linear regression
+by putting :math:`N(0, 1)` priors on the coefficients of :math:`x_d (d = 1, \ldots, D)` and
+a prior of :math:`N(0, \sigma_0^2)` on the bias. The :class:`DotProduct` kernel is invariant to a rotation
+of the coordinates about the origin, but not translations.
+It is parameterized by a parameter :math:`\sigma_0^2`. For :math:`\sigma_0^2 = 0`, the kernel
+is called the homogeneous linear kernel, otherwise it is inhomogeneous. The kernel is given by
+
+.. math::
+   k(x_i, x_j) = \sigma_0 ^ 2 + x_i \cdot x_j
+
+The :class:`DotProduct` kernel is commonly combined with exponentiation. An example with exponent 2 is
+shown in the following figure:
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_prior_posterior_003.png
+   :target: ../auto_examples/gaussian_process/plot_gpr_prior_posterior.html
+   :align: center
+
+
+Legacy
+======
 
 An introductory regression example
 ----------------------------------
@@ -88,10 +363,10 @@ Fitting Noisy Data
 
 When the data to be fit includes noise, the Gaussian process model can be
 used by specifying the variance of the noise for each point.
-:class:`GaussianProcess` takes a parameter ``nugget`` which 
+:class:`GaussianProcess` takes a parameter ``nugget`` which
 is added to the diagonal of the correlation matrix between training points:
 in general this is a type of Tikhonov regularization.  In the special case
-of a squared-exponential correlation function, this normalization is 
+of a squared-exponential correlation function, this normalization is
 equivalent to specifying a fractional variance in the input.  That is
 
 .. math::
@@ -287,23 +562,22 @@ Processes for Machine Learning, please refer to the references below:
 
 .. topic:: References:
 
-    * `DACE, A Matlab Kriging Toolbox
-      <http://www2.imm.dtu.dk/~hbn/dace/>`_ S Lophaven, HB Nielsen, J
-      Sondergaard 2002
+    .. `DACE, A Matlab Kriging Toolbox
+       <http://www2.imm.dtu.dk/~hbn/dace/>`_ S Lophaven, HB Nielsen, J
+       Sondergaard 2002
 
 
-    * `Screening, predicting, and computer experiments
+    .. `Screening, predicting, and computer experiments
       <http://www.jstor.org/pss/1269548>`_ WJ Welch, RJ Buck, J Sacks,
       HP Wynn, TJ Mitchell, and MD Morris Technometrics 34(1) 15--25,
       1992
 
 
-    * `Gaussian Processes for Machine Learning
-      <http://www.gaussianprocess.org/gpml/chapters/RW.pdf>`_ CE
-      Rasmussen, CKI Williams MIT Press, 2006 (Ed. T Diettrich)
-
+    .. [RW2006] `Gaussian Processes for Machine Learning
+      <http://www.gaussianprocess.org/gpml/chapters/>`_ CE
+      Rasmussen, CKI Williams, MIT Press, 2006 (Ed. T Diettrich)
 
-    * `The design and analysis of computer experiments
+    .. `The design and analysis of computer experiments
       <http://www.stat.osu.edu/~comp_exp/book.html>`_ TJ Santner, BJ
       Williams, W Notz Springer, 2003
 
diff --git a/examples/gaussian_process/plot_compare_gpr_krr.py b/examples/gaussian_process/plot_compare_gpr_krr.py
index 71e953882e237..0493fa122c3dc 100644
--- a/examples/gaussian_process/plot_compare_gpr_krr.py
+++ b/examples/gaussian_process/plot_compare_gpr_krr.py
@@ -76,29 +76,29 @@
 kr = GridSearchCV(KernelRidge(), cv=5, param_grid=param_grid)
 stime = time.time()
 kr.fit(X, y)
-print "Time for KRR fitting: %.3f" % (time.time() - stime)
+print("Time for KRR fitting: %.3f" % (time.time() - stime))
 
 gp_kernel = ExpSineSquared(1.0, 5.0) + WhiteKernel(1e-1)
 gpr = GaussianProcessRegressor(kernel=gp_kernel)
 stime = time.time()
 gpr.fit(X, y)
-print "Time for GPR fitting: %.3f" % (time.time() - stime)
+print("Time for GPR fitting: %.3f" % (time.time() - stime))
 
 # Predict using kernel ridge
 X_plot = np.linspace(0, 20, 10000)[:, None]
 stime = time.time()
 y_kr = kr.predict(X_plot)
-print "Time for KRR prediction: %.3f" % (time.time() - stime)
+print("Time for KRR prediction: %.3f" % (time.time() - stime))
 
 # Predict using kernel ridge
 stime = time.time()
 y_gpr = gpr.predict(X_plot, return_std=False)
-print "Time for GPR prediction: %.3f" % (time.time() - stime)
+print("Time for GPR prediction: %.3f" % (time.time() - stime))
 
 stime = time.time()
 y_gpr, y_std = gpr.predict(X_plot, return_std=True)
-print "Time for GPR prediction with standard-deviation: %.3f" \
-    % (time.time() - stime)
+print("Time for GPR prediction with standard-deviation: %.3f"
+      % (time.time() - stime))
 
 # Plot results
 plt.scatter(X, y, c='k', label='data')
diff --git a/examples/gaussian_process/plot_gpc.py b/examples/gaussian_process/plot_gpc.py
index 1524ac19918c7..87266c204e18b 100644
--- a/examples/gaussian_process/plot_gpc.py
+++ b/examples/gaussian_process/plot_gpc.py
@@ -1,4 +1,7 @@
-"""Gaussian process classification (GPC)
+"""
+====================================================================
+Probabilistic predictions with Gaussian process classification (GPC)
+====================================================================
 
 This example illustrates the predicted probability of GPC for an RBF kernel
 with different choices of the hyperparameters. The first figure shows the
@@ -17,7 +20,7 @@
 the kernel's hyperparameters, highlighting the two choices of the
 hyperparameters used in the first figure by black dots.
 """
-print __doc__
+print(__doc__)
 
 # Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
 #
@@ -46,17 +49,17 @@
 gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(l=1.0))
 gp_opt.fit(X[:train_size], y[:train_size])
 
-print "Log Marginal Likelihood (initial): %.3f" % \
-    gp_fix.log_marginal_likelihood(gp_fix.theta_)
-print "Log Marginal Likelihood (optimized): %.3f" % \
-    gp_opt.log_marginal_likelihood(gp_opt.theta_)
-
-print "Accuracy: %.3f (initial) %.3f (optimized)" \
-    % (accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])),
-       accuracy_score(y[:train_size], gp_opt.predict(X[:train_size])))
-print "Log-loss: %.3f (initial) %.3f (optimized)" \
-    % (log_loss(y[:train_size], gp_fix.predict_proba(X[:train_size])[:, 1]),
-       log_loss(y[:train_size], gp_opt.predict_proba(X[:train_size])[:, 1]))
+print("Log Marginal Likelihood (initial): %.3f"
+      % gp_fix.log_marginal_likelihood(gp_fix.theta_))
+print("Log Marginal Likelihood (optimized): %.3f"
+      % gp_opt.log_marginal_likelihood(gp_opt.theta_))
+
+print("Accuracy: %.3f (initial) %.3f (optimized)"
+      % (accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])),
+         accuracy_score(y[:train_size], gp_opt.predict(X[:train_size]))))
+print("Log-loss: %.3f (initial) %.3f (optimized)"
+      % (log_loss(y[:train_size], gp_fix.predict_proba(X[:train_size])[:, 1]),
+         log_loss(y[:train_size], gp_opt.predict_proba(X[:train_size])[:, 1])))
 
 
 # Plot posteriors
diff --git a/examples/gaussian_process/plot_gpc_isoprobability.py b/examples/gaussian_process/plot_gpc_isoprobability.py
index 49df64b89b25a..a3bcc083f2314 100644
--- a/examples/gaussian_process/plot_gpc_isoprobability.py
+++ b/examples/gaussian_process/plot_gpc_isoprobability.py
@@ -2,9 +2,9 @@
 # -*- coding: utf-8 -*-
 
 """
-===================================================================
-Gaussian Processes classification example with probabilistic output
-===================================================================
+=================================================================
+Iso-probability lines for Gaussian Processes classification (GPC)
+=================================================================
 
 A two-dimensional classification exampe showing iso-probability lines for
 the predicted probabilities.
@@ -49,7 +49,7 @@ def g(x):
 kernel = C(0.1, (1e-5, np.inf)) * DotProduct(sigma_0=0.1) ** 2
 gp = GaussianProcessClassifier(kernel=kernel)
 gp.fit(X, y)
-print "Learned kernel: %s " % gp.kernel_
+print("Learned kernel: %s " % gp.kernel_)
 
 # Evaluate real function and the predicted probability
 res = 50
diff --git a/examples/gaussian_process/plot_gpc_xor.py b/examples/gaussian_process/plot_gpc_xor.py
index 5f61f21ff6984..5e2dcb9662f63 100644
--- a/examples/gaussian_process/plot_gpc_xor.py
+++ b/examples/gaussian_process/plot_gpc_xor.py
@@ -1,8 +1,11 @@
-"""Gaussian process classification (GPC)
+"""
+========================================================================
+Illustration of Gaussian process classification (GPC) on the XOR dataset
+========================================================================
 
 This example illustrates GPC on XOR data.
 """
-print __doc__
+print(__doc__)
 
 # Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
 #
diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py
index 4361fc3912807..ac68423076db6 100644
--- a/examples/gaussian_process/plot_gpr_co2.py
+++ b/examples/gaussian_process/plot_gpr_co2.py
@@ -1,5 +1,11 @@
-"""Gaussian process regression (GPR) on Mauna Loa CO2 data. """
-print __doc__
+"""
+========================================================
+Gaussian process regression (GPR) on Mauna Loa CO2 data.
+========================================================
+
+Gaussian process regression (GPR) on Mauna Loa CO2 data.
+"""
+print(__doc__)
 
 # Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
 #
@@ -30,8 +36,8 @@
 							  optimizer=None)
 gp.fit(X, y - y_mean)
 
-print "GPML kernel: %s" % gp.kernel_
-print "Log-marginal-likelihood: %.3f" % gp.log_marginal_likelihood(gp.theta_)
+print("GPML kernel: %s" % gp.kernel_)
+print("Log-marginal-likelihood: %.3f" % gp.log_marginal_likelihood(gp.theta_))
 
 # Kernel with optimized parameters
 k1 = 50.0**2 * RBF(l=50.0) # long term smooth rising trend
@@ -45,8 +51,8 @@
 gp = GaussianProcessRegressor(kernel=kernel, sigma_squared_n=0)
 gp.fit(X, y - y_mean)
 
-print "\nLearned kernel: %s" % gp.kernel_
-print "Log-marginal-likelihood: %.3f" % gp.log_marginal_likelihood(gp.theta_)
+print("\nLearned kernel: %s" % gp.kernel_)
+print("Log-marginal-likelihood: %.3f" % gp.log_marginal_likelihood(gp.theta_))
 
 X_ = np.linspace(X.min(), X.max() + 30, 1000)[:, np.newaxis]
 y_pred, y_std = gp.predict(X_, return_std=True)
diff --git a/examples/gaussian_process/plot_gpr_noisy.py b/examples/gaussian_process/plot_gpr_noisy.py
index 519ff838fe2e3..a0d1b3447d034 100644
--- a/examples/gaussian_process/plot_gpr_noisy.py
+++ b/examples/gaussian_process/plot_gpr_noisy.py
@@ -1,4 +1,7 @@
-"""Gaussian process regression (GPR) with noise-level estimation.
+"""
+=============================================================
+Gaussian process regression (GPR) with noise-level estimation
+=============================================================
 
 This example illustrates that GPR with a sum-kernel including a WhiteKernel can
 estimate the noise level of data. An illustration of the
@@ -12,7 +15,7 @@
 high-noise solution. It is thus important to repeat the optimization several
 times for different initializations.
 """
-print __doc__
+print(__doc__)
 
 # Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
 #
diff --git a/examples/gaussian_process/plot_gpr_noisy_targets.py b/examples/gaussian_process/plot_gpr_noisy_targets.py
index b88f0e816b18c..59f3c76bdeba4 100644
--- a/examples/gaussian_process/plot_gpr_noisy_targets.py
+++ b/examples/gaussian_process/plot_gpr_noisy_targets.py
@@ -1,7 +1,4 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-
-r"""
+"""
 =========================================================
 Gaussian Processes regression: basic introductory example
 =========================================================
diff --git a/examples/gaussian_process/plot_gpr_prior_posterior.py b/examples/gaussian_process/plot_gpr_prior_posterior.py
index 075178738f34a..9c6042e3e15a6 100644
--- a/examples/gaussian_process/plot_gpr_prior_posterior.py
+++ b/examples/gaussian_process/plot_gpr_prior_posterior.py
@@ -1,10 +1,13 @@
-"""Gaussian process regression (GPR) prior and posterior
+"""
+==========================================================================
+Illustration of prior and posterior Gaussian process for different kernels
+==========================================================================
 
 This example illustrates the prior and posterior of a GPR with different
 kernels. Mean, standard deviation, and 10 samples are shown for both prior
 and posterior.
 """
-print __doc__
+print(__doc__)
 
 # Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
 #
@@ -42,7 +45,7 @@
                      y_mean + np.sqrt(np.diag(y_cov)),
                      alpha=0.5, color='k')
     y_samples = gp.sample_y(X_[:, np.newaxis], 10)
-    plt.plot(X_, y_samples, color='b', lw=1)
+    plt.plot(X_, y_samples, lw=1)
     plt.xlim(0, 5)
     plt.ylim(-3, 3)
     plt.title("Prior (kernel:  %s)" % kernel)
@@ -63,7 +66,7 @@
                      alpha=0.5, color='k')
 
     y_samples = gp.sample_y(X_[:, np.newaxis], 10)
-    plt.plot(X_, y_samples, color='b', lw=1)
+    plt.plot(X_, y_samples, lw=1)
     plt.scatter(X[:, 0], y, c='r', s=50, zorder=10)
     plt.xlim(0, 5)
     plt.ylim(-3, 3)
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index fa3623de21e16..eb8f76134257e 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -22,7 +22,8 @@ class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
     The implementation is based on Algorithm 2.1 of ``Gaussian Processes
     for Machine Learning'' (GPML) by Rasmussen and Williams.
 
-    In addition to standard sklearn estimators, GaussianProcessRegressor
+    In addition to standard sklearn estimators, GaussianProcessRegressor:
+
        * allows prediction without prior fitting (based on the GP prior)
        * provides an additional method sample_y(X), which evaluates samples
          drawn from the GPR (prior or posterior) at given inputs

From 813510641ae11c9f9a9a72c9a66c8039a30fd4f6 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Fri, 10 Apr 2015 11:26:14 +0200
Subject: [PATCH 081/165] DOC Documentation of the CO2 example

---
 doc/modules/gaussian_process.rst          | 52 +++++++++++++++++++++++
 examples/gaussian_process/plot_gpr_co2.py | 48 ++++++++++++++++++++-
 2 files changed, 99 insertions(+), 1 deletion(-)

diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst
index 5042378575522..9e4f0e6ea3e38 100644
--- a/doc/modules/gaussian_process.rst
+++ b/doc/modules/gaussian_process.rst
@@ -142,6 +142,58 @@ than just predicting the mean.
 Gaussian process regression (GPR) on Mauna Loa CO2 data
 -------------------------------------------------------
 
+This example is based on Section 5.4.3 of "Gaussian Processes for Machine
+Learning" [RW2006]_. It illustrates an example of complex kernel engineering and
+hyperparameter optimization using gradient ascent on the
+log-marginal-likelihood. The data consists of the monthly average atmospheric
+CO2 concentrations (in parts per million by volume (ppmv)) collected at the
+Mauna Loa Observatory in Hawaii, between 1958 and 1997. The objective is to
+model the CO2 concentration as a function of the time t.
+
+The kernel is composed of several terms that are responsible for explaining
+different properties of the signal:
+ - a long term, smooth rising trend is to be explained by an RBF kernel. The
+   RBF kernel with a large length-scale enforces this component to be smooth;
+   it is not enforced that the trend is rising which leaves this choice to the
+   GP. The specific length-scale and the amplitude are free hyperparameters.
+ - a seasonal component, which is to be explained by the periodic
+   ExpSineSquared kernel with a fixed periodicity of 1 year. The length-scale
+   of this periodic component, controlling its smoothness, is a free parameter.
+   In order to allow decaying away from exact periodicity, the product with an
+   RBF kernel is taken. The length-scale of this RBF component controls the
+   decay time and is a further free parameter.
+ - smaller, medium term irregularities are to be explained by a
+   RationalQuadratic kernel component, whose length-scale and alpha parameter,
+   which determines the diffuseness of the length-scales, are to be determined.
+   According to [RW2006]_, these irregularities can better be explained by
+   a RationalQuadratic than an RBF kernel component, probably because it can
+   accommodate several length-scales.
+ - a "noise" term, consisting of an RBF kernel contribution, which shall
+   explain the correlated noise components such as local weather phenomena,
+   and a WhiteKernel contribution for the white noise. The relative amplitudes
+   and the RBF's length scale are further free parameters.
+
+Maximizing the log-marginal-likelihood after subtracting the target's mean
+yields the following kernel with an LML of -84.483:
+
+::
+
+   2.5e+03 * RBF(l=49.8)
+   + 6.68 * RBF(l=100) * ExpSineSquared(l=1.37, p=1)
+   + 0.215 * RationalQuadratic(alpha=3.98, l=0.982)
+   + 0.0381 * RBF(l=0.136) + WhiteKernel(c=0.0335)
+
+Thus, most of the target signal (sqrt(2.5e+03)ppm = 50ppm) is explained by a
+long-term rising trend (length-scale 49.8 years). The periodic component has
+an amplitude of sqrt(6.68)ppm = 2.58ppm, a decay time of 100 years and a
+length-scale of 1.37. The long decay time indicates that we have a locally very
+close to periodic seasonal component. The correlated noise has an amplitude of
+sqrt(0.0381)ppm = 0.195ppm with a length scale of 0.136 years and a white-noise
+contribution of sqrt(0.0335)ppm = 0.183ppm. Thus, the overall noise level is
+very small, indicating that the data can be very well explained by the model.
+The following figure shows also that the model makes very confident predictions
+until around 2015.
+
 .. figure:: ../auto_examples/gaussian_process/images/plot_gpr_co2_001.png
    :target: ../auto_examples/gaussian_process/plot_gpr_co2.html
    :align: center
diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py
index ac68423076db6..58482495e5dda 100644
--- a/examples/gaussian_process/plot_gpr_co2.py
+++ b/examples/gaussian_process/plot_gpr_co2.py
@@ -3,7 +3,53 @@
 Gaussian process regression (GPR) on Mauna Loa CO2 data.
 ========================================================
 
-Gaussian process regression (GPR) on Mauna Loa CO2 data.
+This example is based on Section 5.4.3 of "Gaussian Processes for Machine
+Learning" [RW2006]. It illustrates an example of complex kernel engineering and
+hyperparameter optimization using gradient ascent on the
+log-marginal-likelihood. The data consists of the monthly average atmospheric
+CO2 concentrations (in parts per million by volume (ppmv)) collected at the
+Mauna Loa Observatory in Hawaii, between 1958 and 1997. The objective is to
+model the CO2 concentration as a function of the time t.
+
+The kernel is composed of several terms that are responsible for explaining
+different properties of the signal:
+ - a long term, smooth rising trend is to be explained by an RBF kernel. The
+   RBF kernel with a large length-scale enforces this component to be smooth;
+   it is not enforced that the trend is rising which leaves this choice to the
+   GP. The specific length-scale and the amplitude are free hyperparameters.
+ - a seasonal component, which is to be explained by the periodic
+   ExpSineSquared kernel with a fixed periodicity of 1 year. The length-scale
+   of this periodic component, controlling its smoothness, is a free parameter.
+   In order to allow decaying away from exact periodicity, the product with an
+   RBF kernel is taken. The length-scale of this RBF component controls the
+   decay time and is a further free parameter.
+ - smaller, medium term irregularities are to be explained by a
+   RationalQuadratic kernel component, whose length-scale and alpha parameter,
+   which determines the diffuseness of the length-scales, are to be determined.
+   According to [RW2006], these irregularities can better be explained by
+   a RationalQuadratic than an RBF kernel component, probably because it can
+   accommodate several length-scales.
+ - a "noise" term, consisting of an RBF kernel contribution, which shall
+   explain the correlated noise components such as local weather phenomena,
+   and a WhiteKernel contribution for the white noise. The relative amplitudes
+   and the RBF's length scale are further free parameters.
+
+Maximizing the log-marginal-likelihood after subtracting the target's mean
+yields the following kernel with an LML of -84.483:
+   2.5e+03 * RBF(l=49.8)
+   + 6.68 * RBF(l=100) * ExpSineSquared(l=1.37, p=1)
+   + 0.215 * RationalQuadratic(alpha=3.98, l=0.982)
+   + 0.0381 * RBF(l=0.136) + WhiteKernel(c=0.0335)
+Thus, most of the target signal (sqrt(2.5e+03)ppm = 50ppm) is explained by a
+long-term rising trend (length-scale 49.8 years). The periodic component has
+an amplitude of sqrt(6.68)ppm = 2.58ppm, a decay time of 100 years and a
+length-scale of 1.37. The long decay time indicates that we have a locally very
+close to periodic seasonal component. The correlated noise has an amplitude of
+sqrt(0.0381)ppm = 0.195ppm with a length scale of 0.136 years and a white-noise
+contribution of sqrt(0.0335)ppm = 0.183ppm. Thus, the overall noise level is
+very small, indicating that the data can be very well explained by the model.
+The figure shows also that the model makes very confident predictions until
+around 2015.
 """
 print(__doc__)
 

From 2515a25275dcb0d6744eb263f41cd4c2edbdf285 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Fri, 10 Apr 2015 11:43:14 +0200
Subject: [PATCH 082/165] DOC Backported documentation to kernels module

---
 sklearn/gaussian_process/kernels.py | 41 +++++++++++++++++++++++------
 1 file changed, 33 insertions(+), 8 deletions(-)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 3c9dfb9737665..f8c5f8257ceed 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -791,7 +791,17 @@ def __repr__(self):
 class RBF(Kernel):
     """Radial-basis function kernel (aka squared-exponential kernel).
 
-    Both isotropic and anisotropic version are supported.
+    The RBF kernel is a stationary kernel. It is also known as the
+    "squared exponential" kernel. It is parameterized by a length-scale
+    parameter l>0, which can either be a scalar (isotropic variant of
+    the kernel) or a vector with the same number of dimensions as the inputs
+    X (anisotropic variant of the kernel). The kernel is given by:
+
+    k(x_i, x_j) = exp(-1 / 2 d(x_i / l, x_j / l)^2)
+
+    This kernel is infinitely differentiable, which implies that GPs with this
+    kernel as covariance function have mean square derivatives of all orders,
+    and are thus very smooth.
 
     Parameters
     -----------
@@ -888,10 +898,13 @@ def __repr__(self):
 class RationalQuadratic(Kernel):
     """Rational Quadratic kernel.
 
-    This kernel can be seen as a scale mixture (an infinite sum) of RBF kernels
-    with different characteristic length-scales.
+    The RationalQuadratic kernel can be seen as a scale mixture (an infinite
+    sum) of RBF kernels with different characteristic length-scales. It is
+    parameterized by a length-scale parameter l>0 and a scale mixture parameter
+    alpha>0. Only the isotropic variant where l is a scalar is supported at
+    the moment. The kernel is given by:
 
-    Only isotropic variant is supported at the moment.
+    k(x_i, x_j) = (1 + d(x_i, x_j)^2 / (2*alpha l^2))^-alpha
 
     Parameters
     ----------
@@ -985,9 +998,12 @@ def __repr__(self):
 class ExpSineSquared(Kernel):
     """Exp-Sine-Squared kernel.
 
-    This kernel allows modelling periodic functions.
+    The ExpSineSquared kernel allows modeling periodic functions. It is
+    parameterized by a length-scale parameter l>0 and a periodicity parameter
+    p>0. Only the isotropic variant where l is a scalar is supported at the
+    moment. The kernel is given by:
 
-    Only isotropic variant is supported at the moment.
+    k(x_i, x_j) = exp(-2 (sin(\pi / p * d(x_i, x_j)) / l)^2)
 
     Parameters
     ----------
@@ -1003,7 +1019,6 @@ class ExpSineSquared(Kernel):
     p_bounds : pair of floats >= 0, default: (1e-5, np.inf)
         The lower and upper bound on p
     """
-
     def __init__(self, l=1.0, p=1.0, l_bounds=(1e-5, np.inf),
                  p_bounds=(1e-5, np.inf)):
         self.l = l
@@ -1082,7 +1097,17 @@ def __repr__(self):
 class DotProduct(Kernel):
     """Dot-Product kernel.
 
-    This kernel is non-stationary.
+    The DotProduct kernel is non-stationary and can be obtained from linear
+    regression by putting N(0, 1) priors on the coefficients of x_d
+    (d = 1, ..., D) and a prior of N(0, \sigma_0^2) on the bias. The DotProduct
+    kernel is invariant to a rotation of the coordinates about the origin, but
+    not translations. It is parameterized by a parameter sigma_0^2. For
+    sigma_0^2 = 0, the kernel is called the homogeneous linear kernel;
+    otherwise it is inhomogeneous. The kernel is given by
+
+    k(x_i, x_j) = sigma_0 ^ 2 + x_i \cdot x_j
+
+    The DotProduct kernel is commonly combined with exponentiation.
 
     Parameters
     ----------

From eaf560208b8a517a1152753cb98e0a4581d23c64 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 15 Apr 2015 15:21:47 +0200
Subject: [PATCH 083/165] DOC Extending further the GP documentation

---
 doc/modules/gaussian_process.rst              | 119 +++++++++++-------
 .../plot_gpr_noisy_targets.py                 |   4 +-
 sklearn/gaussian_process/gpr.py               |   3 +-
 3 files changed, 74 insertions(+), 52 deletions(-)

diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst
index 9e4f0e6ea3e38..fabc4b099b21b 100644
--- a/doc/modules/gaussian_process.rst
+++ b/doc/modules/gaussian_process.rst
@@ -8,61 +8,78 @@ Gaussian Processes
 
 .. currentmodule:: sklearn.gaussian_process
 
-**Gaussian Processes for Machine Learning (GPML)** is a generic supervised
-learning method primarily designed to solve *regression* problems. It has also
-been extended to *probabilistic classification*, but in the present
-implementation, this is only a post-processing of the *regression* exercise.
+**Gaussian Processes (GP)** are a generic supervised learning method designed
+to solve *regression* and *probabilistic classification* problems.
 
-The advantages of Gaussian Processes for Machine Learning are:
+The advantages of Gaussian processes are:
 
     - The prediction interpolates the observations (at least for regular
-      correlation models).
+      kernels).
 
     - The prediction is probabilistic (Gaussian) so that one can compute
       empirical confidence intervals and exceedance probabilities that might be
       used to refit (online fitting, adaptive fitting) the prediction in some
       region of interest.
 
-    - Versatile: different :ref:`linear regression models
-      <linear_model>` and :ref:`correlation models
-      <correlation_models>` can be specified. Common models are provided, but
-      it is also possible to specify custom models provided they are
-      stationary.
+    - Versatile: different :ref:`kernels
+      <gp_kernels>` can be specified. Common kernels are provided, but
+      it is also possible to specify custom kernels.
 
-The disadvantages of Gaussian Processes for Machine Learning include:
+The disadvantages of Gaussian processes include:
 
-    - It is not sparse. It uses the whole samples/features information to
+    - They are not sparse, i.e., they use the whole samples/features information to
       perform the prediction.
 
-    - It loses efficiency in high dimensional spaces -- namely when the number
-      of features exceeds a few dozens. It might indeed give poor performance
-      and it loses computational efficiency.
+    - They lose efficiency in high dimensional spaces -- namely when the number
+      of features exceeds a few dozens.
 
-    - Classification is only a post-processing, meaning that one first need
-      to solve a regression problem by providing the complete scalar float
-      precision output :math:`y` of the experiment one attempt to model.
 
-Thanks to the Gaussian property of the prediction, it has been given varied
-applications: e.g. for global optimization, probabilistic classification.
+.. _gpr:
 
+Gaussian Process Regression (GPR)
+=================================
 
-Gaussian Process Regression
-===========================
-
-Noise-free and noisy targets
-----------------------------
-
-.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_noisy_targets_001.png
-   :target: ../auto_examples/gaussian_process/plot_gpr_noisy_targets.html
-   :align: center
-
-.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_noisy_targets_002.png
-   :target: ../auto_examples/gaussian_process/plot_gpr_noisy_targets.html
-   :align: center
-
+.. currentmodule:: sklearn.gaussian_process
 
-Gaussian process regression (GPR) with noise-level estimation
--------------------------------------------------------------
+The :class:`GaussianProcessRegressor` implements Gaussian processes (GP) for
+regression purposes. For this, the prior of the GP needs to be specified. The
+prior mean is assumed to be zero (other constants can be achieved by
+subtracting the specific value externally from all target values). The prior's
+covariance is specified by passing a :ref:`kernel <gp_kernels>` object. The
+hyperparameters of the kernel are optimized during fitting of
+GaussianProcessRegressor by maximizing the log-marginal-likelihood (LML) based
+on the passed `optimizer`. As the LML may have multiple local optima, the
+optimizer can be started repeatedly by specifying `n_restarts_optimizer`. The
+first run is always conducted starting from the initial hyperparameter values
+of the kernel; subsequent runs are conducted from hyperparameter values
+that have been chosen randomly from the range of allowed values.
+If the initial hyperparameters should be kept fixed, `None` can be passed as
+optimizer.
+
+The noise level in the targets can be specified by passing it via the
+parameter `sigma_squared_n`, either globally as a scalar or per datapoint.
+Note that a moderate noise level can also be helpful for dealing with numeric
+issues during fitting as it is effectively implemented as Tikhonov
+regularization, i.e., by adding it to the diagonal of the kernel matrix. An
+alternative to specifying the noise level explicitly is to include a
+WhiteKernel component into the kernel, which can estimate the global noise
+level from the data (see example below).
+
+The implementation is based on Algorithm 2.1 of [RW2006]_. In addition to
+the API of standard sklearn estimators, GaussianProcessRegressor:
+     * allows prediction without prior fitting (based on the GP prior)
+     * provides an additional method `sample_y(X)`, which evaluates samples
+       drawn from the GPR (prior or posterior) at given inputs
+     * exposes a method `log_marginal_likelihood(theta)`, which can be used
+       externally for other ways of selecting hyperparameters, e.g., via
+       Markov chain Monte Carlo.
+
+
+GPR examples
+============
+
+GPR with noise-level estimation
+-------------------------------
 This example illustrates that GPR with a sum-kernel including a WhiteKernel can
 estimate the noise level of data. An illustration of the
 log-marginal-likelihood (LML) landscape shows that there exist two local
@@ -139,8 +156,8 @@ hyperparameter space. The time for predicting is similar; however, generating
 the variance of the predictive distribution of GPR takes considerable longer
 than just predicting the mean.
 
-Gaussian process regression (GPR) on Mauna Loa CO2 data
--------------------------------------------------------
+GPR on Mauna Loa CO2 data
+-------------------------
 
 This example is based on Section 5.4.3 of "Gaussian Processes for Machine
 Learning" [RW2006]_. It illustrates an example of complex kernel engineering and
@@ -198,12 +215,16 @@ until around 2015.
    :target: ../auto_examples/gaussian_process/plot_gpr_co2.html
    :align: center
 
+.. _gpc:
 
-Gaussian Process Classification
-===============================
+Gaussian Process Classification (GPC)
+=====================================
 
-Probabilistic predictions with Gaussian process classification (GPC)
---------------------------------------------------------------------
+GPC examples
+============
+
+Probabilistic predictions with GPC
+----------------------------------
 
 This example illustrates the predicted probability of GPC for an RBF kernel
 with different choices of the hyperparameters. The first figure shows the
@@ -231,22 +252,24 @@ hyperparameters used in the first figure by black dots.
    :align: center
 
 
-Iso-probability lines for Gaussian Processes classification (GPC)
------------------------------------------------------------------
+Iso-probability lines for GPC
+-----------------------------
 
 .. figure:: ../auto_examples/gaussian_process/images/plot_gpc_isoprobability_001.png
    :target: ../auto_examples/gaussian_process/plot_gpc_isoprobability.html
    :align: center
 
 
-Illustration of Gaussian process classification (GPC) on the XOR dataset
-------------------------------------------------------------------------
+Illustration of GPC on the XOR dataset
+--------------------------------------
 
 .. figure:: ../auto_examples/gaussian_process/images/plot_gpc_xor_001.png
    :target: ../auto_examples/gaussian_process/plot_gpc_xor.html
    :align: center
 
 
+.. _gp_kernels:
+
 Kernels for Gaussian Processes
 ==============================
 .. currentmodule:: sklearn.gaussian_process.kernels
@@ -303,7 +326,7 @@ number of dimensions as the inputs :math:`x` (anisotropic variant of the kernel)
 The kernel given by:
 
 .. math::
-   k(x_i, x_j) = \text{exp}\left(-\frac{1}{2} \vert d(x_i / l, x_j / l)^2\right)
+   k(x_i, x_j) = \text{exp}\left(-\frac{1}{2} d(x_i / l, x_j / l)^2\right)
 
 This kernel is infinitely differentiable, which implies that GPs with this
 kernel as covariance function have mean square derivatives of all orders, and are thus
diff --git a/examples/gaussian_process/plot_gpr_noisy_targets.py b/examples/gaussian_process/plot_gpr_noisy_targets.py
index 59f3c76bdeba4..da336558edaf7 100644
--- a/examples/gaussian_process/plot_gpr_noisy_targets.py
+++ b/examples/gaussian_process/plot_gpr_noisy_targets.py
@@ -3,10 +3,10 @@
 Gaussian Processes regression: basic introductory example
 =========================================================
 
-A simple one-dimensional regression exercise computed in two different ways:
+A simple one-dimensional regression example computed in two different ways:
 
 1. A noise-free case
-2. A noisy case with known noise-leve per datapoint
+2. A noisy case with known noise-level per datapoint
 
 In both cases, the kernel's parameters are estimated using the maximum
 likelihood principle.
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index eb8f76134257e..b9af8b0d1cc47 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -22,7 +22,7 @@ class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
     The implementation is based on Algorithm 2.1 of ``Gaussian Processes
     for Machine Learning'' (GPML) by Rasmussen and Williams.
 
-    In addition to standard sklearn estimators, GaussianProcessRegressor:
+    In addition to standard sklearn estimator API, GaussianProcessRegressor:
 
        * allows prediction without prior fitting (based on the GP prior)
        * provides an additional method sample_y(X), which evaluates samples
@@ -87,7 +87,6 @@ class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
     alpha_: array-like, shape = (n_samples,)
         Dual coefficients of training data points in kernel space
     """
-
     def __init__(self, kernel=None, sigma_squared_n=1e-10,
                  optimizer="fmin_l_bfgs_b", n_restarts_optimizer=1,
                  random_state=None):

From 1e8c719e23896858ced95c87b27a4d1a5e1029bf Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 20 May 2015 11:04:39 +0200
Subject: [PATCH 084/165] ENH GPR supports normalization of target values

---
 sklearn/gaussian_process/gpr.py | 39 ++++++++++++++++++++++++++++-----
 1 file changed, 33 insertions(+), 6 deletions(-)

diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index b9af8b0d1cc47..4d872893ff342 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -61,6 +61,16 @@ class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
         from the space of allowed theta-values. If greater than 1, all bounds
         must be finite.
 
+    normalize_y: boolean, optional (default: False)
+        Whether the target values y are normalized, i.e., mean and standard
+        deviation of observed target values become zero and one, respectively.
+        This parameter should be set to True if the target values' mean is
+        expected to differ considerably from zero or if the standard deviation
+        of the target values is very small or large. When enabled, the
+        normalization effectively modifies the GP's prior based on the data,
+        which contradicts the likelihood principle; normalization is thus
+        disabled per default.
+
     random_state : integer or numpy.RandomState, optional
         The generator used to initialize the centers. If an integer is
         given, it fixes the seed. Defaults to the global numpy random
@@ -89,11 +99,12 @@ class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
     """
     def __init__(self, kernel=None, sigma_squared_n=1e-10,
                  optimizer="fmin_l_bfgs_b", n_restarts_optimizer=1,
-                 random_state=None):
+                 normalize_y=False, random_state=None):
         self.kernel = kernel
         self.sigma_squared_n = sigma_squared_n
         self.optimizer = optimizer
         self.n_restarts_optimizer = n_restarts_optimizer
+        self.normalize_y = normalize_y
         self.rng = check_random_state(random_state)
 
     def fit(self, X, y):
@@ -118,11 +129,26 @@ def fit(self, X, y):
 
         X, y = check_X_y(X, y)
 
+        # Normalize target value
+        if self.normalize_y:
+            self.y_fit_mean = np.mean(y, axis=0)
+            self.y_fit_std = np.atleast_1d(np.std(y, axis=0))
+            self.y_fit_std[self.y_fit_std == 0.] = 1.
+            # center and scale y (and sigma_squared_n)
+            y = (y - self.y_fit_mean) / self.y_fit_std
+            self.sigma_squared_n /= self.y_fit_std**2
+        else:
+            self.y_fit_mean = np.zeros(1)
+            self.y_fit_std = np.ones(1)
+
         if np.iterable(self.sigma_squared_n) \
            and self.sigma_squared_n.shape[0] != y.shape[0]:
-            raise ValueError("sigma_n_squared must be a scalar or an array "
-                             "with same number of entries as y. (%d != %d)"
-                              % (self.sigma_squared_n.shape[0], y.shape[0]))
+            if self.sigma_squared_n.shape[0] == 1:
+                self.sigma_squared_n = self.sigma_squared_n[0]
+            else:
+                raise ValueError("sigma_n_squared must be a scalar or an array"
+                                 " with same number of entries as y.(%d != %d)"
+                                 % (self.sigma_squared_n.shape[0], y.shape[0]))
 
         self.X_fit_ = X
         self.y_fit_ = y
@@ -227,10 +253,11 @@ def predict(self, X, return_std=False, return_cov=False):
         else:  # Predict based on GP posterior
             K_trans = self.kernel_(X, self.X_fit_)
             y_mean = K_trans.dot(self.alpha_)  # Line 4 (y_mean = f_star)
+            y_mean = self.y_fit_mean + self.y_fit_std * y_mean  # undo normal.
             if return_cov:
                 v = cho_solve((self.L_, True), K_trans.T)  # Line 5
                 y_cov = self.kernel_(X) - K_trans.dot(v)  # Line 6
-                return y_mean, y_cov
+                return y_mean, y_cov * self.y_fit_std ** 2
             elif return_std:
                 # compute inverse K_inv of K based on its Cholesky
                 # decomposition L and its inverse L_inv
@@ -241,7 +268,7 @@ def predict(self, X, return_std=False, return_cov=False):
                 y_var -= np.sum(K_trans.T[:, np.newaxis] * K_trans.T
                                 * K_inv[:, :, np.newaxis],
                                 axis=0).sum(axis=0)  # axis=(0, 1) in np >= 1.7
-                return y_mean, np.sqrt(y_var)
+                return y_mean, np.sqrt(y_var) * self.y_fit_std
             else:
                 return y_mean
 

From c3387ad3fd81446d76d09f229f0bfbe0a24d8440 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 20 May 2015 11:04:53 +0200
Subject: [PATCH 085/165] TST Testing normalization of output values in GPR

---
 sklearn/gaussian_process/tests/test_gpr.py | 33 ++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index b3eb7b63fdbc0..a6d6028291ca1 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -174,3 +174,36 @@ def test_random_starts():
         lml = gp.log_marginal_likelihood(gp.theta_)
         assert_greater(lml, last_lml - np.finfo(np.float32).eps)
         last_lml = lml
+
+def test_y_normalization():
+    """ Test normalization of the target values in GP
+
+    Fitting non-normalizing GP on normalized y and fitting normalizing GP
+    on unnormalized y should yield identical results
+    """
+    y_mean, y_std = y.mean(0), y.std(0)
+    y_norm = (y - y_mean) / y_std
+    for kernel in kernels:
+        # Fit non-normalizing GP on normalized y
+        gpr = GaussianProcessRegressor(kernel=kernel,
+                                       sigma_squared_n=1e-10 / y_std**2)
+        gpr.fit(X, y_norm)
+        # Fit normalizing GP on unnormalized y
+        gpr_norm = GaussianProcessRegressor(kernel=kernel,
+                                            sigma_squared_n=1e-10,
+                                            normalize_y=True)
+        gpr_norm.fit(X, y)
+
+        # Compare predicted mean, std-devs and covariances
+        y_pred, y_pred_std = gpr.predict(X2, return_std=True)
+        y_pred = y_mean + y_pred * y_std
+        y_pred_std *= y_std
+        y_pred_norm, y_pred_std_norm = gpr_norm.predict(X2, return_std=True)
+
+        assert_almost_equal(y_pred, y_pred_norm)
+        assert_almost_equal(y_pred_std, y_pred_std_norm)
+
+        _, y_cov = gpr.predict(X2, return_cov=True)
+        y_cov *= y_std ** 2
+        _, y_cov_norm = gpr_norm.predict(X2, return_cov=True)
+        assert_almost_equal(y_cov, y_cov_norm)

From cf6ce0e7ab03359d6ec10e3d4eb490ef3eed8881 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 20 May 2015 14:15:18 +0200
Subject: [PATCH 086/165] ENH GPR supports target value with more than one
 dimension

---
 sklearn/gaussian_process/gpr.py | 77 +++++++++++++++++++++------------
 1 file changed, 50 insertions(+), 27 deletions(-)

diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 4d872893ff342..59314aa03bbc0 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -81,7 +81,7 @@ class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
     X_fit_ : array-like, shape = (n_samples, n_features)
         Feature values in training data (also required for prediction)
 
-    y_fit_: array-like, shape = (n_samples,)
+    y_fit_: array-like, shape = (n_samples, [n_output_dims])
         Target values in training data (also required for prediction)
 
     kernel_: kernel object
@@ -115,7 +115,7 @@ def fit(self, X, y):
         X : array-like, shape = (n_samples, n_features)
             Training data
 
-        y : array-like, shape = (n_samples, )
+        y : array-like, shape = (n_samples, [n_output_dims])
             Target values
 
         Returns
@@ -127,12 +127,12 @@ def fit(self, X, y):
         else:
             self.kernel_ = clone(self.kernel)
 
-        X, y = check_X_y(X, y)
+        X, y = check_X_y(X, y, multi_output=True)
 
         # Normalize target value
         if self.normalize_y:
             self.y_fit_mean = np.mean(y, axis=0)
-            self.y_fit_std = np.atleast_1d(np.std(y, axis=0))
+            self.y_fit_std = np.atleast_1d(np.std(y))  # XXX: std per dim?
             self.y_fit_std[self.y_fit_std == 0.] = 1.
             # center and scale y (and sigma_squared_n)
             y = (y - self.y_fit_mean) / self.y_fit_std
@@ -222,16 +222,16 @@ def predict(self, X, return_std=False, return_cov=False):
 
         Returns
         -------
-        y_mean : array, shape = (n_samples,)
+        y_mean : array, shape = (n_samples, [n_output_dims])
             Mean of predictive distribution a query points
 
         y_std : array, shape = (n_samples,), optional
-            Standard deviation of predictive distribution a query points.
-            Only returned when return_std is True
+            Standard deviation of predictive distribution at query points.
+            Only returned when return_std is True.
 
         y_cov : array, shape = (n_samples, n_samples), optional
             Covariance of joint predictive distribution a query points.
-            Only returned when return_cov is True
+            Only returned when return_cov is True.
         """
         if return_std and return_cov:
             raise RuntimeError(
@@ -257,7 +257,8 @@ def predict(self, X, return_std=False, return_cov=False):
             if return_cov:
                 v = cho_solve((self.L_, True), K_trans.T)  # Line 5
                 y_cov = self.kernel_(X) - K_trans.dot(v)  # Line 6
-                return y_mean, y_cov * self.y_fit_std ** 2
+                y_cov *= self.y_fit_std ** 2  # undo normalization
+                return y_mean, y_cov
             elif return_std:
                 # compute inverse K_inv of K based on its Cholesky
                 # decomposition L and its inverse L_inv
@@ -268,7 +269,8 @@ def predict(self, X, return_std=False, return_cov=False):
                 y_var -= np.sum(K_trans.T[:, np.newaxis] * K_trans.T
                                 * K_inv[:, :, np.newaxis],
                                 axis=0).sum(axis=0)  # axis=(0, 1) in np >= 1.7
-                return y_mean, np.sqrt(y_var) * self.y_fit_std
+                y_std = np.sqrt(y_var) * self.y_fit_std  # undo normalization
+                return y_mean, y_std
             else:
                 return y_mean
 
@@ -288,14 +290,21 @@ def sample_y(self, X, n_samples=1, random_state=0):
 
         Returns
         -------
-        y_samples : array, shape = (n_samples_X, n_samples)
+        y_samples : array, shape = (n_samples_X, [n_output_dims], n_samples)
             Values of n_samples samples drawn from Gaussian process and
             evaluated at query points.
         """
         rng = check_random_state(random_state)
 
         y_mean, y_cov = self.predict(X, return_cov=True)
-        y_samples = rng.multivariate_normal(y_mean, y_cov, n_samples).T
+        if y_mean.ndim == 1:
+            y_samples = rng.multivariate_normal(y_mean, y_cov, n_samples).T
+        else:
+            y_samples = \
+                [rng.multivariate_normal(y_mean[:, i], y_cov,
+                                         n_samples).T[:, np.newaxis]
+                 for i in range(y_mean.shape[1])]
+            y_samples = np.hstack(y_samples)
         return y_samples
 
     def log_marginal_likelihood(self, theta, eval_gradient=False):
@@ -336,21 +345,35 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
             return (-np.inf, np.zeros_like(theta)) \
                 if eval_gradient else -np.inf
 
-        alpha = cho_solve((L, True), self.y_fit_)  # Line 3
-
-        # Compute log-likelihood (compare line 7)
-        log_likelihood = -0.5*self.y_fit_.dot(alpha)
-        log_likelihood -= np.log(np.diag(L)).sum()
-        log_likelihood -= K.shape[0] / 2 * np.log(2 * np.pi)
-
-        if eval_gradient:  # compare Equation 5.9 from GPML
-            tmp = np.outer(alpha, alpha)
-            tmp -= cho_solve((L, True), np.eye(K.shape[0]))
-            # Compute "0.5 * trace(tmp.dot(K_gradient))" without constructing
-            # the full matrix tmp.dot(K_gradient) since only its diagonal is
-            # required
-            log_likelihood_gradient = \
-                0.5 * np.einsum("ij,ijk->k", tmp, K_gradient)
+        log_likelihood = 0
+        if eval_gradient:
+            log_likelihood_gradient = 0
+
+        # Iterate over output dimensions of self.y_fit_
+        y_fit = self.y_fit_
+        if y_fit.ndim == 1:
+            y_fit = y_fit[:, np.newaxis]
+        for i in range(y_fit.shape[1]):
+            alpha = cho_solve((L, True), y_fit[:, i])  # Line 3
+
+            # Compute log-likelihood of output dimension (compare line 7)
+            log_likelihood_dim = -0.5 * y_fit[:, i].dot(alpha)
+            log_likelihood_dim -= np.log(np.diag(L)).sum()
+            log_likelihood_dim -= K.shape[0] / 2 * np.log(2 * np.pi)
+
+            log_likelihood += log_likelihood_dim
+
+            if eval_gradient:  # compare Equation 5.9 from GPML
+                tmp = np.outer(alpha, alpha)
+                tmp -= cho_solve((L, True), np.eye(K.shape[0]))
+                # Compute "0.5 * trace(tmp.dot(K_gradient))" without
+                # constructing the full matrix tmp.dot(K_gradient) since only
+                # its diagonal is required
+                log_likelihood_gradient_dim = \
+                    0.5 * np.einsum("ij,ijk->k", tmp, K_gradient)
+                log_likelihood_gradient += log_likelihood_gradient_dim
+
+        if eval_gradient:
             return log_likelihood, log_likelihood_gradient
         else:
             return log_likelihood

From bbd575344d3d25aac0f5f969a60bef888045c2b2 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 20 May 2015 14:15:29 +0200
Subject: [PATCH 087/165] TST Testing multi-out GPR

---
 sklearn/gaussian_process/tests/test_gpr.py | 44 ++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index a6d6028291ca1..e08d0cb1899bd 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -207,3 +207,47 @@ def test_y_normalization():
         y_cov *= y_std ** 2
         _, y_cov_norm = gpr_norm.predict(X2, return_cov=True)
         assert_almost_equal(y_cov, y_cov_norm)
+
+
+def test_y_multioutput():
+    """
+    """
+    y_2d = np.vstack((y, y*2)).T
+
+    # Test for fixed kernel that first dimension of 2d GP equals the output
+    # of 1d GP and that second dimension is twice as large
+    kernel = RBF(l=1.0)
+
+    gpr = GaussianProcessRegressor(kernel=kernel, optimizer=None,
+                                   normalize_y=False)
+    gpr.fit(X, y)
+
+    gpr_2d = GaussianProcessRegressor(kernel=kernel, optimizer=None,
+                                      normalize_y=False)
+    gpr_2d.fit(X, y_2d)
+
+    y_pred_1d, y_std_1d = gpr.predict(X2, return_std=True)
+    y_pred_2d, y_std_2d = gpr_2d.predict(X2, return_std=True)
+    _, y_cov_1d = gpr.predict(X2, return_cov=True)
+    _, y_cov_2d = gpr_2d.predict(X2, return_cov=True)
+
+    assert_almost_equal(y_pred_1d, y_pred_2d[:, 0])
+    assert_almost_equal(y_pred_1d, y_pred_2d[:, 1] / 2)
+
+    # Standard deviation and covariance do not depend on output
+    assert_almost_equal(y_std_1d, y_std_2d)
+    assert_almost_equal(y_cov_1d, y_cov_2d)
+
+    y_sample_1d = gpr.sample_y(X2, n_samples=10)
+    y_sample_2d = gpr_2d.sample_y(X2, n_samples=10)
+    assert_almost_equal(y_sample_1d, y_sample_2d[:, 0])
+
+    # Test hyperparameter optimization
+    for kernel in kernels:
+        gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True)
+        gpr.fit(X, y)
+
+        gpr_2d = GaussianProcessRegressor(kernel=kernel, normalize_y=True)
+        gpr_2d.fit(X, np.vstack((y, y)).T)
+
+        assert_almost_equal(gpr.kernel_.theta, gpr_2d.kernel_.theta, 4)

From f779bf11f15ac5f6a40081ce32e0324322cfca4d Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 20 May 2015 14:31:20 +0200
Subject: [PATCH 088/165] FIX Not performing hyperparameter-tuning in GPR if
 there are no hyperparameters

---
 sklearn/gaussian_process/gpr.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 59314aa03bbc0..9d2de535f31e0 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -153,7 +153,9 @@ def fit(self, X, y):
         self.X_fit_ = X
         self.y_fit_ = y
 
-        if self.optimizer in ["fmin_l_bfgs_b"]:
+        if self.kernel_.n_dims == 0:  # no tunable hyperparameters
+            self.theta_ = self.kernel_.theta   # actually an empty sequence
+        elif self.optimizer in ["fmin_l_bfgs_b"]:
             # Choose hyperparameters based on maximizing the log-marginal
             # likelihood (potentially starting from several initial values)
             def obj_func(theta):

From 8ad4c51414566a7611b18a7a2a8c0f401a9d8c8d Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 20 May 2015 15:18:46 +0200
Subject: [PATCH 089/165] ENH GPR can use an externally defined optimizer for
 hyperparameter tuning

---
 sklearn/gaussian_process/gpr.py | 52 ++++++++++++++++++++++++---------
 1 file changed, 38 insertions(+), 14 deletions(-)

diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 9d2de535f31e0..3a7911b089389 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -45,11 +45,28 @@ class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
         passed, it must have the same number of entries as the data used for
         fitting and is used as datapoint-dependent noise level.
 
-    optimizer : string, optional (default: "fmin_l_bfgs_b")
-        A string specifying the optimization algorithm used for optimizing the
-        kernel's parameters. Default uses 'fmin_l_bfgs_b' algorithm from
-        scipy.optimize. If None, the kernel's parameters are kept fixed.
-        Available optimizers are::
+    optimizer : string or callable, optional (default: "fmin_l_bfgs_b")
+        Can either be one of the internally supported optimizers for optimizing
+        the kernel's parameters, specified by a string, or an externally
+        defined optimizer passed as a callable. If a callable is passed, it
+        must have the  signature::
+
+            def optimizer(obj_func, initial_theta, bounds):
+                # * 'obj_func' is the objective function to be maximized, which
+                #   takes the hyperparameters theta as parameter and an
+                #   optional flag eval_gradient, which determines if the
+                #   gradient is returned additional to the function value
+                # * 'initial_theta': the initial value for theta, which can be
+                #   used by local optimizers
+                # * 'bounds': the bounds on the values of theta
+                ....
+                # Returned are the best found hyperparameters theta and the
+                # the corresponding value of the target function.
+                return theta_opt, func_min
+
+        Per default, the 'fmin_l_bfgs_b' algorithm from scipy.optimize
+        is used. If None is passed, the kernel's parameters are kept fixed.
+        Available internal optimizers are::
 
             'fmin_l_bfgs_b'
 
@@ -155,13 +172,16 @@ def fit(self, X, y):
 
         if self.kernel_.n_dims == 0:  # no tunable hyperparameters
             self.theta_ = self.kernel_.theta   # actually an empty sequence
-        elif self.optimizer in ["fmin_l_bfgs_b"]:
+        elif self.optimizer is not None:
             # Choose hyperparameters based on maximizing the log-marginal
             # likelihood (potentially starting from several initial values)
-            def obj_func(theta):
-                lml, grad = self.log_marginal_likelihood(theta,
-                                                         eval_gradient=True)
-                return -lml, -grad
+            def obj_func(theta, eval_gradient=True):
+                if eval_gradient:
+                    lml, grad = self.log_marginal_likelihood(
+                        theta, eval_gradient=True)
+                    return -lml, -grad
+                else:
+                    return -self.log_marginal_likelihood(theta)
 
             # First optimize starting from theta specified in kernel
             optima = [(self._constrained_optimization(obj_func,
@@ -186,11 +206,9 @@ def obj_func(theta):
             # likelihood
             self.theta_ = optima[np.argmin(map(itemgetter(1), optima))][0]
             self.kernel_.theta = self.theta_
-        elif self.optimizer is None:
+        else:
             # Use initially provided hyperparameters
             self.theta_ = self.kernel_.theta
-        else:
-            raise ValueError("Unknown optimizer %s." % self.optimizer)
 
         # Precompute quantities required for predictions which are independent
         # of actual query points
@@ -381,7 +399,13 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
             return log_likelihood
 
     def _constrained_optimization(self, obj_func, initial_theta, bounds):
-        if self.optimizer in ["fmin_l_bfgs_b"]:
+        if self.optimizer == "fmin_l_bfgs_b":
             theta_opt, func_min, _ = \
                 fmin_l_bfgs_b(obj_func, initial_theta, bounds=bounds)
+        elif callable(self.optimizer):
+            theta_opt, func_min = \
+                self.optimizer(obj_func, initial_theta, bounds=bounds)
+        else:
+            raise ValueError("Unknown optimizer %s." % self.optimizer)
+
         return theta_opt, func_min

From 103672e9d3001f016b3626ed20ca8f7f22153faf Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 20 May 2015 15:19:06 +0200
Subject: [PATCH 090/165] TST Testing GPR with custom external optimizer

---
 sklearn/gaussian_process/tests/test_gpr.py | 26 ++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index e08d0cb1899bd..319e90f895a39 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -210,8 +210,7 @@ def test_y_normalization():
 
 
 def test_y_multioutput():
-    """
-    """
+    """ Test that GPR can deal with multi-dimensional target values"""
     y_2d = np.vstack((y, y*2)).T
 
     # Test for fixed kernel that first dimension of 2d GP equals the output
@@ -251,3 +250,26 @@ def test_y_multioutput():
         gpr_2d.fit(X, np.vstack((y, y)).T)
 
         assert_almost_equal(gpr.kernel_.theta, gpr_2d.kernel_.theta, 4)
+
+
+def test_custom_optimizer():
+    """ Test that GPR can use externally defined optimizers. """
+    # Define a dummy optimizer that simply tests 1000 random hyperparameters
+    def optimizer(obj_func, initial_theta, bounds):
+        rng = np.random.RandomState(0)
+        theta_opt, func_min = \
+            initial_theta, obj_func(initial_theta, eval_gradient=False)
+        for _ in range(1000):
+            theta = np.atleast_1d(rng.uniform(np.maximum(1e-2, bounds[:, 0]),
+                                              np.minimum(10, bounds[:, 1])))
+            f = obj_func(theta, eval_gradient=False)
+            if f < func_min:
+                theta_opt, func_min = theta, f
+        return theta_opt, func_min
+
+    for kernel in kernels:
+        gpr = GaussianProcessRegressor(kernel=kernel, optimizer=optimizer)
+        gpr.fit(X, y)
+        # Checks that optimizer improved marginal likelihood
+        assert_greater(gpr.log_marginal_likelihood(gpr.kernel_.theta),
+                       gpr.log_marginal_likelihood(gpr.kernel.theta))

From 7c13efe5244a1ea80cb56851ab1ff6902a9bf42f Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 21 May 2015 10:00:58 +0200
Subject: [PATCH 091/165] REFACTOR GP kernel hyperparameters theta and their
 bounds are log-transformed

This representation of the search space is more amenable to
hyperparameter search, as hyperparameters like
length-scales naturally live on a log-scale.
---
 sklearn/gaussian_process/gpc.py     |   8 +-
 sklearn/gaussian_process/gpr.py     |  12 +-
 sklearn/gaussian_process/kernels.py | 170 +++++++++++++++-------------
 3 files changed, 104 insertions(+), 86 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index ba770f0acd557..c46055a641b7c 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -180,13 +180,13 @@ def obj_func(theta):
                     raise ValueError(
                         "Multiple optimizer restarts (n_restarts_optimizer>1) "
                         "requires that all bounds are finite.")
-                log_bounds = np.log(self.kernel_.bounds)
+                bounds = self.kernel_.bounds
                 for iteration in range(1, self.n_restarts_optimizer):
-                    theta_initial = np.exp(self.rng.uniform(log_bounds[:, 0],
-                                                            log_bounds[:, 1]))
+                    theta_initial = np.exp(self.rng.uniform(bounds[:, 0],
+                                                            bounds[:, 1]))
                     optima.append(
                         self._constrained_optimization(obj_func, theta_initial,
-                                                       self.kernel_.bounds))
+                                                       bounds))
             # Select result from run with minimal (negative) log-marginal
             # likelihood
             self.theta_ = optima[np.argmin(map(itemgetter(1), optima))][0]
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 3a7911b089389..e24e30bbcf5aa 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -55,12 +55,12 @@ def optimizer(obj_func, initial_theta, bounds):
                 # * 'obj_func' is the objective function to be maximized, which
                 #   takes the hyperparameters theta as parameter and an
                 #   optional flag eval_gradient, which determines if the
-                #   gradient is returned additional to the function value
+                #   gradient is returned additionally to the function value
                 # * 'initial_theta': the initial value for theta, which can be
                 #   used by local optimizers
                 # * 'bounds': the bounds on the values of theta
                 ....
-                # Returned are the best found hyperparameters theta and the
+                # Returned are the best found hyperparameters theta and
                 # the corresponding value of the target function.
                 return theta_opt, func_min
 
@@ -195,13 +195,13 @@ def obj_func(theta, eval_gradient=True):
                     raise ValueError(
                         "Multiple optimizer restarts (n_restarts_optimizer>1) "
                         "requires that all bounds are finite.")
-                log_bounds = np.log(self.kernel_.bounds)
+                bounds = self.kernel_.bounds
                 for iteration in range(1, self.n_restarts_optimizer):
-                    theta_initial = np.exp(self.rng.uniform(log_bounds[:, 0],
-                                                            log_bounds[:, 1]))
+                    theta_initial = np.exp(self.rng.uniform(bounds[:, 0],
+                                                            bounds[:, 1]))
                     optima.append(
                         self._constrained_optimization(obj_func, theta_initial,
-                                                       self.kernel_.bounds))
+                                                       bounds))
             # Select result from run with minimal (negative) log-marginal
             # likelihood
             self.theta_ = optima[np.argmin(map(itemgetter(1), optima))][0]
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index f8c5f8257ceed..ccdf3aaea5ce3 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -80,37 +80,42 @@ def n_dims(self):
 
     @property
     def theta(self):
-        """Returns the (flattened) non-fixed hyperparameters of the kernel.
+        """Returns the (flattened, log-transformed) non-fixed hyperparameters.
+
+        Note that theta are typically the log-transformed values of the
+        kernel's hyperparameters as this representation of the search space
+        is more amenable for hyperparameter search, as hyperparameters like
+        length-scales naturally live on a log-scale.
 
         Returns
         -------
         theta : array, shape (n_dims,)
-            The non-fixed hyperparameters of the kernel
+            The non-fixed, log-transformed hyperparameters of the kernel
         """
         theta = []
         for var_name in self.theta_vars:
             if not isinstance(var_name, basestring):  # vector-valued parameter
                 var_name, _ = var_name
             theta.append(getattr(self, var_name))
-        return np.array(theta).ravel()
+        return np.log(theta).ravel()
 
     @theta.setter
     def theta(self, theta):
-        """Sets the (flattened) non-fixed hyperparameters of the kernel.
+        """Sets the (flattened, log-transformed) non-fixed hyperparameters.
 
         Parameters
         ----------
         theta : array, shape (n_dims,)
-            The non-fixed hyperparameters of the kernel
+            The non-fixed, log-transformed hyperparameters of the kernel
         """
         i = 0
         for var_name in self.theta_vars:
             if not isinstance(var_name, basestring):  # vector-valued parameter
                 var_name, var_length = var_name
-                setattr(self, var_name, theta[i:i + var_length])
+                setattr(self, var_name, np.exp(theta[i:i + var_length]))
                 i += var_length
             else:
-                setattr(self, var_name, theta[i])
+                setattr(self, var_name, np.exp(theta[i]))
                 i += 1
 
         if i != len(theta):
@@ -120,12 +125,12 @@ def theta(self, theta):
 
     @property
     def bounds(self):
-        """Returns the bounds on the kernel's hyperparameters.
+        """Returns the bounds on the kernel's hyperparameters theta.
 
         Returns
         -------
         bounds : array, shape (n_dims, 2)
-            The bounds on the kernel's hyperparameters
+            The bounds on the kernel's hyperparameters theta
         """
         bounds = []
         for var_name in self.theta_vars:
@@ -142,25 +147,26 @@ def bounds(self):
                 bounds.append(var_bounds)
             else:
                 bounds.append(getattr(self, var_name + "_bounds"))
-        return np.vstack(bounds)
+        return np.log(np.vstack(bounds))
 
     @bounds.setter
     def bounds(self, bounds):
-        """Sets the bounds on the kernel's hyperparameters.
+        """Sets the bounds on the kernel's hyperparameters theta.
 
         Parameters
         ----------
         bounds : array, shape (n_dims, 2)
-            The bounds on the kernel's hyperparameters
+            The bounds on the kernel's hyperparameters theta
         """
         i = 0
         for var_name in self.theta_vars:
             if not isinstance(var_name, basestring):  # vector-valued parameter
                 var_name, var_length = var_name
-                setattr(self, var_name + "_bounds", bounds[i:i + var_length])
+                setattr(self, var_name + "_bounds",
+                        np.exp(bounds[i:i + var_length]))
                 i += var_length
             else:
-                setattr(self, var_name + "_bounds", bounds[i])
+                setattr(self, var_name + "_bounds", np.exp(bounds[i]))
                 i += 1
 
         if i != len(bounds):
@@ -213,7 +219,7 @@ def diag(self, X):
         """Returns the diagonal of the kernel k(X, X).
 
         The result of this method is identical to np.diag(self(X)); however,
-        it can be evaluted more efficiently since only the diagonal is
+        it can be evaluated more efficiently since only the diagonal is
         evaluated.
 
         Parameters
@@ -259,23 +265,28 @@ def get_params(self, deep=True):
 
     @property
     def theta(self):
-        """Returns the (flattened) non-fixed hyperparameters of the kernel.
+        """Returns the (flattened, log-transformed) non-fixed hyperparameters.
+
+        Note that theta are typically the log-transformed values of the
+        kernel's hyperparameters as this representation of the search space
+        is more amenable for hyperparameter search, as hyperparameters like
+        length-scales naturally live on a log-scale.
 
         Returns
         -------
         theta : array, shape (n_dims,)
-            The non-fixed hyperparameters of the kernel
+            The non-fixed, log-transformed hyperparameters of the kernel
         """
         return np.append(self.k1.theta, self.k2.theta)
 
     @theta.setter
     def theta(self, theta):
-        """Sets the (flattened) non-fixed hyperparameters of the kernel.
+        """Sets the (flattened, log-transformed) non-fixed hyperparameters.
 
         Parameters
         ----------
         theta : array, shape (n_dims,)
-            The non-fixed hyperparameters of the kernel
+            The non-fixed, log-transformed hyperparameters of the kernel
         """
         k1_dims = self.k1.n_dims
         self.k1.theta = theta[:k1_dims]
@@ -283,12 +294,12 @@ def theta(self, theta):
 
     @property
     def bounds(self):
-        """Returns the bounds on the kernel's hyperparameters.
+        """Returns the bounds on the kernel's hyperparameters theta.
 
         Returns
         -------
         bounds : array, shape (n_dims, 2)
-            The bounds on the kernel's hyperparameters
+            The bounds on the kernel's hyperparameters theta
         """
         if self.k1.bounds.size == 0:
             return self.k2.bounds
@@ -298,12 +309,12 @@ def bounds(self):
 
     @bounds.setter
     def bounds(self, bounds):
-        """Sets the bounds on the kernel's hyperparameters.
+        """Sets the bounds on the kernel's hyperparameters theta.
 
         Parameters
         ----------
         bounds : array, shape (n_dims, 2)
-            The bounds on the kernel's hyperparameters
+            The bounds on the kernel's hyperparameters theta
         """
         k1_dims = self.k1.n_dims
         self.k1.bounds = bounds[:k1_dims]
@@ -357,7 +368,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             Kernel k(X, Y)
 
         K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
-            The gradient of the kernel k(X, X) with repect to the
+            The gradient of the kernel k(X, X) with respect to the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
@@ -372,7 +383,7 @@ def diag(self, X):
         """Returns the diagonal of the kernel k(X, X).
 
         The result of this method is identical to np.diag(self(X)); however,
-        it can be evaluted more efficiently since only the diagonal is
+        it can be evaluated more efficiently since only the diagonal is
         evaluated.
 
         Parameters
@@ -428,7 +439,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             Kernel k(X, Y)
 
         K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
-            The gradient of the kernel k(X, X) with repect to the
+            The gradient of the kernel k(X, X) with respect to the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
@@ -444,7 +455,7 @@ def diag(self, X):
         """Returns the diagonal of the kernel k(X, X).
 
         The result of this method is identical to np.diag(self(X)); however,
-        it can be evaluted more efficiently since only the diagonal is
+        it can be evaluated more efficiently since only the diagonal is
         evaluated.
 
         Parameters
@@ -501,45 +512,50 @@ def get_params(self, deep=True):
 
     @property
     def theta(self):
-        """Returns the (flattened) non-fixed hyperparameters of the kernel.
+        """Returns the (flattened, log-transformed) non-fixed hyperparameters.
+
+        Note that theta are typically the log-transformed values of the
+        kernel's hyperparameters as this representation of the search space
+        is more amenable for hyperparameter search, as hyperparameters like
+        length-scales naturally live on a log-scale.
 
         Returns
         -------
         theta : array, shape (n_dims,)
-            The non-fixed hyperparameters of the kernel
+            The non-fixed, log-transformed hyperparameters of the kernel
         """
         return self.kernel.theta
 
     @theta.setter
     def theta(self, theta):
-        """Sets the (flattened) non-fixed hyperparameters of the kernel.
+        """Sets the (flattened, log-transformed) non-fixed hyperparameters.
 
         Parameters
         ----------
         theta : array, shape (n_dims,)
-            The non-fixed hyperparameters of the kernel
+            The non-fixed, log-transformed hyperparameters of the kernel
         """
         self.kernel.theta = theta
 
     @property
     def bounds(self):
-        """Returns the bounds on the kernel's hyperparameters.
+        """Returns the bounds on the kernel's hyperparameters theta.
 
         Returns
         -------
         bounds : array, shape (n_dims, 2)
-            The bounds on the kernel's hyperparameters
+            The bounds on the kernel's hyperparameters theta
         """
         return self.kernel.bounds
 
     @bounds.setter
     def bounds(self, bounds):
-        """Sets the bounds on the kernel's hyperparameters.
+        """Sets the bounds on the kernel's hyperparameters theta.
 
         Parameters
         ----------
         bounds : array, shape (n_dims, 2)
-            The bounds on the kernel's hyperparameters
+            The bounds on the kernel's hyperparameters theta
         """
         self.kernel.bounds = bounds
 
@@ -570,7 +586,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             Kernel k(X, Y)
 
         K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
-            The gradient of the kernel k(X, X) with repect to the
+            The gradient of the kernel k(X, X) with respect to the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
@@ -587,7 +603,7 @@ def diag(self, X):
         """Returns the diagonal of the kernel k(X, X).
 
         The result of this method is identical to np.diag(self(X)); however,
-        it can be evaluted more efficiently since only the diagonal is
+        it can be evaluated more efficiently since only the diagonal is
         evaluated.
 
         Parameters
@@ -624,10 +640,10 @@ class ConstantKernel(Kernel):
     c : float, default: 1.0
         The constant value which defines the covariance: k(x_1, x_2) = c
 
-    c_bounds : pair of floats >= 0, default: (0, np.inf)
+    c_bounds : pair of floats >= 0, default: (1e-5, 1e5)
         The lower and upper bound on c
     """
-    def __init__(self, c=1.0, c_bounds=(0, np.inf)):
+    def __init__(self, c=1.0, c_bounds=(1e-5, 1e5)):
         self.c = c
         self.c_bounds = c_bounds
 
@@ -655,7 +671,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             Kernel k(X, Y)
 
         K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
-            The gradient of the kernel k(X, X) with repect to the
+            The gradient of the kernel k(X, X) with respect to the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
@@ -668,7 +684,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
         K = self.c * np.ones((X.shape[0], Y.shape[0]))
         if eval_gradient:
             if self.c_bounds is not "fixed":
-                return K, np.ones((X.shape[0], X.shape[0], 1))
+                return K, self.c * np.ones((X.shape[0], X.shape[0], 1))
             else:
                 return K, np.empty((X.shape[0], X.shape[0], 0))
         else:
@@ -678,7 +694,7 @@ def diag(self, X):
         """Returns the diagonal of the kernel k(X, X).
 
         The result of this method is identical to np.diag(self(X)); however,
-        it can be evaluted more efficiently since only the diagonal is
+        it can be evaluated more efficiently since only the diagonal is
         evaluated.
 
         Parameters
@@ -711,10 +727,10 @@ class WhiteKernel(Kernel):
     c : float, default: 1.0
         Parameter controlling the noise level
 
-    c_bounds : pair of floats >= 0, default: (0.0, np.inf)
+    c_bounds : pair of floats >= 0, default: (1e-5, 1e5)
         The lower and upper bound on c
     """
-    def __init__(self, c=1.0, c_bounds=(0.0, np.inf)):
+    def __init__(self, c=1.0, c_bounds=(1e-5, 1e5)):
         self.c = c
         self.c_bounds = c_bounds
 
@@ -742,7 +758,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             Kernel k(X, Y)
 
         K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
-            The gradient of the kernel k(X, X) with repect to the
+            The gradient of the kernel k(X, X) with respect to the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
@@ -754,7 +770,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             K = self.c * np.eye(X.shape[0])
             if eval_gradient:
                 if self.c_bounds is not "fixed":
-                    return K, np.eye(X.shape[0])[:, :, np.newaxis]
+                    return K, self.c * np.eye(X.shape[0])[:, :, np.newaxis]
                 else:
                     return K, np.empty((X.shape[0], X.shape[0], 0))
             else:
@@ -769,7 +785,7 @@ def diag(self, X):
         """Returns the diagonal of the kernel k(X, X).
 
         The result of this method is identical to np.diag(self(X)); however,
-        it can be evaluted more efficiently since only the diagonal is
+        it can be evaluated more efficiently since only the diagonal is
         evaluated.
 
         Parameters
@@ -810,10 +826,10 @@ class RBF(Kernel):
         used. If an array, an anisotropic kernel is used where each dimension
         of l defines the length-scale of the respective feature dimension.
 
-    l_bounds : pair of floats >= 0, default: (1e-5, np.inf)
+    l_bounds : pair of floats >= 0, default: (1e-5, 1e5)
         The lower and upper bound on l
     """
-    def __init__(self, l=1.0, l_bounds=(1e-5, np.inf)):
+    def __init__(self, l=1.0, l_bounds=(1e-5, 1e5)):
         if np.iterable(l):
             self.l = np.asarray(l, dtype=np.float)
         else:
@@ -849,7 +865,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             Kernel k(X, Y)
 
         K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
-            The gradient of the kernel k(X, X) with repect to the
+            The gradient of the kernel k(X, X) with respect to the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
@@ -872,12 +888,12 @@ def __call__(self, X, Y=None, eval_gradient=False):
                 return K, np.empty((X.shape[0], X.shape[0], 0))
             elif not np.iterable(self.l) or self.l.shape[0] == 1:
                 K_gradient = \
-                    (K * squareform(dists) / self.l)[:, :, np.newaxis]
+                    (K * squareform(dists))[:, :, np.newaxis]
                 return K, K_gradient
             elif self.l.shape[0] == X.shape[1]:
                 # We need to recompute the pairwise dimension-wise distances
                 D = (X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2 \
-                    / (self.l ** 3)
+                    / (self.l ** 2)
                 K_gradient = K[..., np.newaxis] * D
                 return K, K_gradient
             else:
@@ -904,7 +920,7 @@ class RationalQuadratic(Kernel):
     alpha>0 Only the isotropic variant where l is a scalar is supported at the
     moment. The kernel given by:
 
-    k(x_i, x_j) = (1 + d(x_i, x_j)^2 / (2*alpha l^2))^alpha
+    k(x_i, x_j) = (1 + d(x_i, x_j)^2 / (2*alpha l^2))^-alpha
 
     Parameters
     ----------
@@ -914,14 +930,14 @@ class RationalQuadratic(Kernel):
     alpha : float > 0, default: 1.0
         Scale mixture parameter
 
-    l_bounds : pair of floats >= 0, default: (1e-5, np.inf)
+    l_bounds : pair of floats >= 0, default: (1e-5, 1e5)
         The lower and upper bound on l
 
-    alpha_bounds : pair of floats >= 0, default: (1e-5, np.inf)
+    alpha_bounds : pair of floats >= 0, default: (1e-5, 1e5)
         The lower and upper bound on alpha
     """
-    def __init__(self, l=1.0, alpha=1.0, l_bounds=(1e-5, np.inf),
-                 alpha_bounds=(1e-5, np.inf)):
+    def __init__(self, l=1.0, alpha=1.0, l_bounds=(1e-5, 1e5),
+                 alpha_bounds=(1e-5, 1e5)):
         self.l = l
         self.alpha = alpha
         self.l_bounds = l_bounds
@@ -953,7 +969,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             Kernel k(X, Y)
 
         K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
-            The gradient of the kernel k(X, X) with repect to the
+            The gradient of the kernel k(X, X) with respect to the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
@@ -981,7 +997,9 @@ def __call__(self, X, Y=None, eval_gradient=False):
 
             # gradient with respect to alpha
             if "alpha" in self.theta_vars:
-                alpha_gradient = K * (-np.log(base) + tmp / base)
+                alpha_gradient = \
+                    K * (-self.alpha * np.log(base)
+                         + dists / (2 * self.l ** 2 * base))
                 alpha_gradient = alpha_gradient[:, :, np.newaxis]
             else:  # alpha is kept fixed
                 alpha_gradient = np.empty((K.shape[0], K.shape[1], 0))
@@ -1013,14 +1031,14 @@ class ExpSineSquared(Kernel):
     p : float > 0, default: 1.0
         The periodicity of the kernel.
 
-    l_bounds : pair of floats >= 0, default: (1e-5, np.inf)
+    l_bounds : pair of floats >= 0, default: (1e-5, 1e5)
         The lower and upper bound on l
 
-    p_bounds : pair of floats >= 0, default: (1e-5, np.inf)
+    p_bounds : pair of floats >= 0, default: (1e-5, 1e5)
         The lower and upper bound on p
     """
-    def __init__(self, l=1.0, p=1.0, l_bounds=(1e-5, np.inf),
-                 p_bounds=(1e-5, np.inf)):
+    def __init__(self, l=1.0, p=1.0, l_bounds=(1e-5, 1e5),
+                 p_bounds=(1e-5, 1e5)):
         self.l = l
         self.p = p
         self.l_bounds = l_bounds
@@ -1052,7 +1070,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             Kernel k(X, Y)
 
         K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
-            The gradient of the kernel k(X, X) with repect to the
+            The gradient of the kernel k(X, X) with respect to the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
@@ -1073,13 +1091,13 @@ def __call__(self, X, Y=None, eval_gradient=False):
             cos_of_arg = np.cos(arg)
             # gradient with respect to l
             if "l" in self.theta_vars:
-                l_gradient = 4 / self.l**3 * sin_of_arg**2 * K
+                l_gradient = 4 / self.l**2 * sin_of_arg**2 * K
                 l_gradient = l_gradient[:, :, np.newaxis]
             else:  # l is kept fixed
                 l_gradient = np.empty((K.shape[0], K.shape[1], 0))
             # gradient with respect to p
             if "p" in self.theta_vars:
-                p_gradient = 4 * arg / (self.l**2 * self.p) * cos_of_arg \
+                p_gradient = 4 * arg / self.l**2 * cos_of_arg \
                     * sin_of_arg * K
                 p_gradient = p_gradient[:, :, np.newaxis]
             else:  # p is kept fixed
@@ -1115,11 +1133,11 @@ class DotProduct(Kernel):
         Parameter controlling the inhomogenity of the kernel. If sigma_0=0,
         the kernel is homogenous.
 
-    sigma_0_bounds : pair of floats >= 0, default: (1e-5, np.inf)
+    sigma_0_bounds : pair of floats >= 0, default: (1e-5, 1e5)
         The lower and upper bound on l
     """
 
-    def __init__(self, sigma_0=1.0, sigma_0_bounds=(1e-5, np.inf)):
+    def __init__(self, sigma_0=1.0, sigma_0_bounds=(1e-5, 1e5)):
         self.sigma_0 = sigma_0
         self.sigma_0_bounds = sigma_0_bounds
 
@@ -1147,7 +1165,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             Kernel k(X, Y)
 
         K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
-            The gradient of the kernel k(X, X) with repect to the
+            The gradient of the kernel k(X, X) with respect to the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
@@ -1163,7 +1181,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
         if eval_gradient:
             if self.sigma_0_bounds is not "fixed":
                 K_gradient = np.empty((K.shape[0], K.shape[1], 1))
-                K_gradient[..., 0] = 2 * self.sigma_0
+                K_gradient[..., 0] = 2 * self.sigma_0 ** 2
                 return K, K_gradient
             else:
                 return K, np.empty((X.shape[0], X.shape[0], 0))
@@ -1174,7 +1192,7 @@ def diag(self, X):
         """Returns the diagonal of the kernel k(X, X).
 
         The result of this method is identical to np.diag(self(X)); however,
-        it can be evaluted more efficiently since only the diagonal is
+        it can be evaluated more efficiently since only the diagonal is
         evaluated.
 
         Parameters
@@ -1228,7 +1246,7 @@ class PairwiseKernel(Kernel):
     gam ma: float >= 0, default: 1.0
         Parameter gamma of the pairwise kernel specified by metric
 
-    gamma_bounds : pair of floats >= 0, default: (1e-5, np.inf)
+    gamma_bounds : pair of floats >= 0, default: (1e-5, 1e5)
         The lower and upper bound on gamma
 
     metric : string, or callable
@@ -1245,7 +1263,7 @@ class PairwiseKernel(Kernel):
         Any further parameters are passed directly to the kernel function.
     """
 
-    def __init__(self, gamma=1.0, gamma_bounds=(1e-5, np.inf),
+    def __init__(self, gamma=1.0, gamma_bounds=(1e-5, 1e5),
                  metric="linear", **kwargs):
         self.gamma = gamma
         self.gamma_bounds = gamma_bounds
@@ -1277,7 +1295,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             Kernel k(X, Y)
 
         K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
-            The gradient of the kernel k(X, X) with repect to the
+            The gradient of the kernel k(X, X) with respect to the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
@@ -1291,7 +1309,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
                 # approximate gradient numerically
                 def f(gamma):  # helper function
                     return pairwise_kernels(
-                        X, Y, metric=self.metric, gamma=gamma,
+                        X, Y, metric=self.metric, gamma=np.exp(gamma),
                         filter_params=True, **self.kwargs)
                 return K, _approx_fprime(self.theta, f, 1e-10)
         else:
@@ -1301,7 +1319,7 @@ def diag(self, X):
         """Returns the diagonal of the kernel k(X, X).
 
         The result of this method is identical to np.diag(self(X)); however,
-        it can be evaluted more efficiently since only the diagonal is
+        it can be evaluated more efficiently since only the diagonal is
         evaluated.
 
         Parameters

From 910188aa0c9899e3ecb9dc35fb1a1b521dd0536f Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 21 May 2015 10:01:42 +0200
Subject: [PATCH 092/165] TST Adapting GP tests to changed hyperparameter
 representation

---
 sklearn/gaussian_process/tests/test_gpc.py    |  7 ++++--
 sklearn/gaussian_process/tests/test_gpr.py    | 21 +++++++++--------
 .../gaussian_process/tests/test_kernels.py    | 23 ++++++++++---------
 3 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py
index ef54d15130877..3743fb12beccc 100644
--- a/sklearn/gaussian_process/tests/test_gpc.py
+++ b/sklearn/gaussian_process/tests/test_gpc.py
@@ -47,9 +47,12 @@ def test_converged_to_local_maximum():
     for kernel in kernels:
         gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
 
-        lml, lml_gradient = gpc.log_marginal_likelihood(gpc.kernel_.theta, True)
+        lml, lml_gradient = \
+            gpc.log_marginal_likelihood(gpc.kernel_.theta, True)
 
-        assert_almost_equal(lml_gradient, 0, 2)
+        assert_true(np.all((np.abs(lml_gradient) < 1e-4)
+                           | (gpc.kernel_.theta == gpc.kernel_.bounds[:, 0])
+                           | (gpc.kernel_.theta == gpc.kernel_.bounds[:, 1])))
 
 
 def test_lml_gradient():
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index 319e90f895a39..1db0ffc3a4740 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -26,9 +26,9 @@ def f(x):
 kernels = [RBF(l=1.0), RBF(l=1.0, l_bounds=(1e-3, 1e3)),
            C(1.0, (1e-2, 1e2)) * RBF(l=1.0, l_bounds=(1e-3, 1e3)),
            C(1.0, (1e-2, 1e2)) * RBF(l=1.0, l_bounds=(1e-3, 1e3))
-               + C(0.0, (0.0, 1e2)),
+               + C(1e-5, (1e-5, 1e2)),
            C(0.1, (1e-2, 1e2)) * RBF(l=1.0, l_bounds=(1e-3, 1e3))
-               + C(0.0, (0.0, 1e2))]
+               + C(1e-5, (1e-5, 1e2))]
 
 
 def test_gpr_interpolation():
@@ -57,7 +57,7 @@ def test_converged_to_local_maximum():
         lml, lml_gradient = \
             gpr.log_marginal_likelihood(gpr.kernel_.theta, True)
 
-        assert_true(np.all((np.abs(lml_gradient) < 1e-5)
+        assert_true(np.all((np.abs(lml_gradient) < 1e-4)
                            | (gpr.kernel_.theta == gpr.kernel_.bounds[:, 0])
                            | (gpr.kernel_.theta == gpr.kernel_.bounds[:, 1])))
 
@@ -101,7 +101,7 @@ def test_prior():
         assert_almost_equal(y_mean, 0, 5)
         if len(gpr.kernel.theta) > 1:
             # XXX: quite hacky, works only for current kernels
-            assert_almost_equal(np.diag(y_cov), kernel.theta[0] , 5)
+            assert_almost_equal(np.diag(y_cov), np.exp(kernel.theta[0]) , 5)
         else:
             assert_almost_equal(np.diag(y_cov), 1, 5)
 
@@ -125,8 +125,8 @@ def test_no_optimizer():
     """ Test that kernel parameters are unmodified when optimizer is None."""
     kernel = RBF(1.0)
     gpr = GaussianProcessRegressor(kernel=kernel, optimizer=None).fit(X, y)
-    assert_equal(gpr.kernel_.theta, 1.0)
-    assert_equal(gpr.theta_, 1.0)
+    assert_equal(np.exp(gpr.kernel_.theta), 1.0)
+    assert_equal(np.exp(gpr.theta_), 1.0)
 
 
 def test_predict_cov_vs_std():
@@ -149,7 +149,9 @@ def test_anisotropic_kernel():
 
     kernel = RBF([1.0, 1.0])
     gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
-    assert_greater(gpr.kernel_.theta[1], gpr.kernel_.theta[0] * 5)
+    assert_greater(np.exp(gpr.kernel_.theta[1]),
+                   np.exp(gpr.kernel_.theta[0]) * 5)
+
 
 def test_random_starts():
     """
@@ -175,6 +177,7 @@ def test_random_starts():
         assert_greater(lml, last_lml - np.finfo(np.float32).eps)
         last_lml = lml
 
+
 def test_y_normalization():
     """ Test normalization of the target values in GP
 
@@ -260,8 +263,8 @@ def optimizer(obj_func, initial_theta, bounds):
         theta_opt, func_min = \
             initial_theta, obj_func(initial_theta, eval_gradient=False)
         for _ in range(1000):
-            theta = np.atleast_1d(rng.uniform(np.maximum(1e-2, bounds[:, 0]),
-                                              np.minimum(10, bounds[:, 1])))
+            theta = np.atleast_1d(rng.uniform(np.maximum(-2, bounds[:, 0]),
+                                              np.minimum(1, bounds[:, 1])))
             f = obj_func(theta, eval_gradient=False)
             if f < func_min:
                 theta_opt, func_min = theta, f
diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index 9873739ab4ba0..394240b05b691 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -25,10 +25,11 @@
 
 kernels = [RBF(l=2.0), RBF(l_bounds=(0.5, 2.0)),
            ConstantKernel(c=10.0),
-           2.0 * RBF(l=0.5), RBF(l=2.0) + WhiteKernel(c=1.0),
-           RationalQuadratic(l=1.0, alpha=1.0),
-           ExpSineSquared(l=1.0, p=1.0),
-           DotProduct(sigma_0=1.0), DotProduct(sigma_0=1.0) ** 2]
+           2.0 * RBF(l=0.5), RBF(l=2.0) + WhiteKernel(c=3.0),
+           2.0 * RBF(l=[0.5, 2.0]),
+           RationalQuadratic(l=0.5, alpha=1.5),
+           ExpSineSquared(l=0.5, p=1.5),
+           DotProduct(sigma_0=2.0), DotProduct(sigma_0=2.0) ** 2]
 for metric in PAIRWISE_KERNEL_FUNCTIONS:
     if metric in ["additive_chi2", "chi2"]:
         continue
@@ -75,9 +76,9 @@ def test_kernel_theta():
         assert_equal(kernel.theta_vars, theta_vars)
 
         # Check that values returned in theta are consistent with
-        # hyperparameter values
+        # hyperparameter values (being their logarithms)
         for i, theta_var in enumerate(theta_vars):
-            assert_equal(theta[i], getattr(kernel, theta_var))
+            assert_equal(theta[i], np.log(getattr(kernel, theta_var)))
 
         # Fixed kernel parameters must be excluded from theta and gradient.
         for i, theta_var in enumerate(theta_vars):
@@ -102,12 +103,12 @@ def test_kernel_theta():
 
         # Check that values of theta are modified correctly
         for i, theta_var in enumerate(theta_vars):
-            theta[i] = 42
+            theta[i] = np.log(42)
             kernel.theta = theta
-            assert_equal(getattr(kernel, theta_var), 42)
+            assert_almost_equal(getattr(kernel, theta_var), 42)
 
             setattr(kernel, theta_var, 43)
-            assert_equal(kernel.theta[i], 43)
+            assert_almost_equal(kernel.theta[i], np.log(43))
 
 
 def test_auto_vs_cross():
@@ -153,8 +154,8 @@ def test_kernel_anisotropic():
     assert_almost_equal(K, K2)
 
     # Check getting and setting via theta
-    kernel.theta = kernel.theta * 2
-    assert_array_equal(kernel.theta, [6.0, 1.0, 4.0])
+    kernel.theta = kernel.theta + np.log(2)
+    assert_array_equal(kernel.theta, np.log([6.0, 1.0, 4.0]))
     assert_array_equal(kernel.k2.l, [1.0, 4.0])
 
 

From 947bc0daee5c2a4ebe9f23f37c0d057a81dffa8a Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 21 May 2015 10:01:56 +0200
Subject: [PATCH 093/165] DOC Adapting GP examples to changed hyperparameter
 representation

---
 .../gaussian_process/plot_compare_gpr_krr.py  |  2 +-
 examples/gaussian_process/plot_gpc.py         |  8 +++---
 examples/gaussian_process/plot_gpr_co2.py     | 26 +++++++++----------
 examples/gaussian_process/plot_gpr_noisy.py   | 10 ++++---
 4 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/examples/gaussian_process/plot_compare_gpr_krr.py b/examples/gaussian_process/plot_compare_gpr_krr.py
index 0493fa122c3dc..b23e587978d06 100644
--- a/examples/gaussian_process/plot_compare_gpr_krr.py
+++ b/examples/gaussian_process/plot_compare_gpr_krr.py
@@ -78,7 +78,7 @@
 kr.fit(X, y)
 print("Time for KRR fitting: %.3f" % (time.time() - stime))
 
-gp_kernel = ExpSineSquared(1.0, 5.0) + WhiteKernel(1e-1)
+gp_kernel = ExpSineSquared(1.0, 5.0, p_bounds=(1e-2, 1e1)) + WhiteKernel(1e-1)
 gpr = GaussianProcessRegressor(kernel=gp_kernel)
 stime = time.time()
 gpr.fit(X, y)
diff --git a/examples/gaussian_process/plot_gpc.py b/examples/gaussian_process/plot_gpc.py
index 87266c204e18b..d1f973def1b42 100644
--- a/examples/gaussian_process/plot_gpc.py
+++ b/examples/gaussian_process/plot_gpc.py
@@ -13,7 +13,7 @@
 figure shows that this is because they exhibit a steep change of the class
 probabilities at the class boundaries (which is good) but have predicted
 probabilities close to 0.5 far away from the class boundaries (which is bad)
-This undiesirable effect is caused by the Laplace approximation used
+This undesirable effect is caused by the Laplace approximation used
 internally by GPC.
 
 The second figure shows the log-marginal-likelihood for different choices of
@@ -82,11 +82,11 @@
 theta0 = np.logspace(0, 8, 30)
 theta1 = np.logspace(-1, 1, 29)
 Theta0, Theta1 = np.meshgrid(theta0, theta1)
-LML = [[gp_opt.log_marginal_likelihood([Theta0[i, j], Theta1[i, j]])
+LML = [[gp_opt.log_marginal_likelihood(np.log([Theta0[i, j], Theta1[i, j]]))
         for i in range(Theta0.shape[0])] for j in range(Theta0.shape[1])]
 LML = np.array(LML).T
-plt.plot(gp_fix.theta_[0], gp_fix.theta_[1], 'ko', zorder=10)
-plt.plot(gp_opt.theta_[0], gp_opt.theta_[1], 'ko', zorder=10)
+plt.plot(np.exp(gp_fix.theta_)[0], np.exp(gp_fix.theta_)[1], 'ko', zorder=10)
+plt.plot(np.exp(gp_opt.theta_)[0], np.exp(gp_opt.theta_)[1], 'ko', zorder=10)
 plt.pcolor(Theta0, Theta1, LML)
 plt.xscale("log")
 plt.yscale("log")
diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py
index 58482495e5dda..1b2fef12f20fb 100644
--- a/examples/gaussian_process/plot_gpr_co2.py
+++ b/examples/gaussian_process/plot_gpr_co2.py
@@ -35,18 +35,18 @@
    and the RBF's length scale are further free parameters.
 
 Maximizing the log-marginal-likelihood after subtracting the target's mean
-yields the following kernel with an LML of -84.483:
-   2.5e+03 * RBF(l=49.8)
-   + 6.68 * RBF(l=100) * ExpSineSquared(l=1.37, p=1)
-   + 0.215 * RationalQuadratic(alpha=3.98, l=0.982)
-   + 0.0381 * RBF(l=0.136) + WhiteKernel(c=0.0335)
-Thus, most of the target signal (sqrt(2.5e+03)ppm = 50ppm) is explained by a
-long-term rising trend (length-scale 49.8 years). The periodic component has
-an amplitude of sqrt(6.68)ppm = 2.58ppm, a decay time of 100 years and a
-length-scale of 1.37. The long decay time indicates that we have a locally very
+yields the following kernel with an LML of -83.214:
+   1.19e+03 * RBF(l=41.8)
+   + 10.7 * RBF(l=180) * ExpSineSquared(l=1.44, p=1)
+   + 0.199 * RationalQuadratic(alpha=17.7, l=0.957)
+   + 0.0389 * RBF(l=0.138) + WhiteKernel(c=0.0336)
+Thus, most of the target signal (sqrt(1.19e+03)ppm = 34.5ppm) is explained by a
+long-term rising trend (length-scale 41.8 years). The periodic component has
+an amplitude of sqrt(10.7)ppm = 3.27ppm, a decay time of 180 years and a
+length-scale of 1.44. The long decay time indicates that we have a locally very
 close to periodic seasonal component. The correlated noise has an amplitude of
-sqrt(0.0381)ppm = 0.195ppm with a length scale of 0.136 years and a white-noise
-contribution of sqrt(0.0335)ppm = 0.183pm. Thus, the overall noise level is
+sqrt(0.0389)ppm = 0.197ppm with a length scale of 0.138 years and a white-noise
+contribution of sqrt(0.0336)ppm = 0.183ppm. Thus, the overall noise level is
 very small, indicating that the data can be very well explained by the model.
 The figure shows also that the model makes very confident predictions until
 around 2015.
@@ -79,8 +79,8 @@
 kernel_gpml = k1 + k2 + k3 + k4
 
 gp = GaussianProcessRegressor(kernel=kernel_gpml, sigma_squared_n=0,
-							  optimizer=None)
-gp.fit(X, y - y_mean)
+							         optimizer=None)
+gp.fit(X, y)
 
 print("GPML kernel: %s" % gp.kernel_)
 print("Log-marginal-likelihood: %.3f" % gp.log_marginal_likelihood(gp.theta_))
diff --git a/examples/gaussian_process/plot_gpr_noisy.py b/examples/gaussian_process/plot_gpr_noisy.py
index a0d1b3447d034..3259fee864e83 100644
--- a/examples/gaussian_process/plot_gpr_noisy.py
+++ b/examples/gaussian_process/plot_gpr_noisy.py
@@ -36,7 +36,8 @@
 
 # First run
 plt.figure(0)
-kernel = 1.0 * RBF(l=100.0) + WhiteKernel(c=1e-5, c_bounds=(1e-10, 1e+1))
+kernel = 1.0 * RBF(l=100.0, l_bounds=(1e-2, 1e3)) \
+  + WhiteKernel(c=1, c_bounds=(1e-10, 1e+1))
 gp = GaussianProcessRegressor(kernel=kernel,
                               sigma_squared_n=0.0).fit(X, y)
 X_ = np.linspace(0, 5, 100)
@@ -52,9 +53,10 @@
              gp.log_marginal_likelihood(gp.kernel_.theta)))
 plt.tight_layout()
 
-# First run
+# Second run
 plt.figure(1)
-kernel = 1.0 * RBF(l=1.0) + WhiteKernel(c=1e-5, c_bounds=(1e-10, 1e+1))
+kernel = 1.0 * RBF(l=1.0, l_bounds=(1e-2, 1e3)) \
+  + WhiteKernel(c=1e-5, c_bounds=(1e-10, 1e+1))
 gp = GaussianProcessRegressor(kernel=kernel,
                               sigma_squared_n=0.0).fit(X, y)
 X_ = np.linspace(0, 5, 100)
@@ -75,7 +77,7 @@
 theta0 = np.logspace(-2, 3, 49)
 theta1 = np.logspace(-2, 0, 50)
 Theta0, Theta1 = np.meshgrid(theta0, theta1)
-LML = [[gp.log_marginal_likelihood([0.36, Theta0[i, j], Theta1[i, j]])
+LML = [[gp.log_marginal_likelihood(np.log([0.36, Theta0[i, j], Theta1[i, j]]))
         for i in range(Theta0.shape[0])] for j in range(Theta0.shape[1])]
 LML = np.array(LML).T
 

From 96075767c5aa162df698359ff60bf3860261d59d Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 21 May 2015 10:12:07 +0200
Subject: [PATCH 094/165] REFACTOR Remove theta_ attribute from GPR and GPC

It can be accessed instead via kernel_.theta
---
 examples/gaussian_process/plot_gpc.py      | 10 ++++++----
 examples/gaussian_process/plot_gpr_co2.py  |  6 ++++--
 sklearn/gaussian_process/gpc.py            |  8 ++------
 sklearn/gaussian_process/gpr.py            | 11 ++---------
 sklearn/gaussian_process/tests/test_gpc.py |  2 +-
 sklearn/gaussian_process/tests/test_gpr.py |  3 +--
 6 files changed, 16 insertions(+), 24 deletions(-)

diff --git a/examples/gaussian_process/plot_gpc.py b/examples/gaussian_process/plot_gpc.py
index d1f973def1b42..736e64c6da3f2 100644
--- a/examples/gaussian_process/plot_gpc.py
+++ b/examples/gaussian_process/plot_gpc.py
@@ -50,9 +50,9 @@
 gp_opt.fit(X[:train_size], y[:train_size])
 
 print("Log Marginal Likelihood (initial): %.3f"
-      % gp_fix.log_marginal_likelihood(gp_fix.theta_))
+      % gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta))
 print("Log Marginal Likelihood (optimized): %.3f"
-      % gp_opt.log_marginal_likelihood(gp_opt.theta_))
+      % gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta))
 
 print("Accuracy: %.3f (initial) %.3f (optimized)"
       % (accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])),
@@ -85,8 +85,10 @@
 LML = [[gp_opt.log_marginal_likelihood(np.log([Theta0[i, j], Theta1[i, j]]))
         for i in range(Theta0.shape[0])] for j in range(Theta0.shape[1])]
 LML = np.array(LML).T
-plt.plot(np.exp(gp_fix.theta_)[0], np.exp(gp_fix.theta_)[1], 'ko', zorder=10)
-plt.plot(np.exp(gp_opt.theta_)[0], np.exp(gp_opt.theta_)[1], 'ko', zorder=10)
+plt.plot(np.exp(gp_fix.kernel_.theta)[0], np.exp(gp_fix.kernel_.theta)[1],
+         'ko', zorder=10)
+plt.plot(np.exp(gp_opt.kernel_.theta)[0], np.exp(gp_opt.kernel_.theta)[1],
+         'ko', zorder=10)
 plt.pcolor(Theta0, Theta1, LML)
 plt.xscale("log")
 plt.yscale("log")
diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py
index 1b2fef12f20fb..a3e24d5aa9e16 100644
--- a/examples/gaussian_process/plot_gpr_co2.py
+++ b/examples/gaussian_process/plot_gpr_co2.py
@@ -83,7 +83,8 @@
 gp.fit(X, y)
 
 print("GPML kernel: %s" % gp.kernel_)
-print("Log-marginal-likelihood: %.3f" % gp.log_marginal_likelihood(gp.theta_))
+print("Log-marginal-likelihood: %.3f"
+      % gp.log_marginal_likelihood(gp.kernel_.theta))
 
 # Kernel with optimized parameters
 k1 = 50.0**2 * RBF(l=50.0) # long term smooth rising trend
@@ -98,7 +99,8 @@
 gp.fit(X, y - y_mean)
 
 print("\nLearned kernel: %s" % gp.kernel_)
-print("Log-marginal-likelihood: %.3f" % gp.log_marginal_likelihood(gp.theta_))
+print("Log-marginal-likelihood: %.3f"
+      % gp.log_marginal_likelihood(gp.kernel_.theta))
 
 X_ = np.linspace(X.min(), X.max() + 30, 1000)[:, np.newaxis]
 y_pred, y_std = gp.predict(X_, return_std=True)
diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index c46055a641b7c..148b0f7628ed0 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -98,9 +98,6 @@ class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
         The kernel used for prediction. The structure of the kernel is the
         same as the one passed as parameter but with optimized hyperparameters
 
-    theta_: array-like, shape = (n_kernel_params,)
-        Selected kernel hyperparameters
-
     L_: array-like, shape = (n_samples, n_samples)
         Lower-triangular Cholesky decomposition of the kernel in X_fit_
 
@@ -189,10 +186,9 @@ def obj_func(theta):
                                                        bounds))
             # Select result from run with minimal (negative) log-marginal
             # likelihood
-            self.theta_ = optima[np.argmin(map(itemgetter(1), optima))][0]
-            self.kernel_.theta = self.theta_
+            self.kernel_.theta = optima[np.argmin(map(itemgetter(1), optima))][0]
         elif self.optimizer is None:
-            self.theta_ = self.kernel_.theta
+            pass
         else:
             raise ValueError("Unknown optimizer %s." % self.optimizer)
 
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index e24e30bbcf5aa..7cb0c0c213de3 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -105,9 +105,6 @@ def optimizer(obj_func, initial_theta, bounds):
         The kernel used for prediction. The structure of the kernel is the
         same as the one passed as parameter but with optimized hyperparameters
 
-    theta_: array-like, shape =(n_kernel_params,)
-        Selected kernel hyperparameters
-
     L_: array-like, shape = (n_samples, n_samples)
         Lower-triangular Cholesky decomposition of the kernel in X_fit_
 
@@ -171,7 +168,7 @@ def fit(self, X, y):
         self.y_fit_ = y
 
         if self.kernel_.n_dims == 0:  # no tunable hyperparameters
-            self.theta_ = self.kernel_.theta   # actually an empty sequence
+            pass
         elif self.optimizer is not None:
             # Choose hyperparameters based on maximizing the log-marginal
             # likelihood (potentially starting from several initial values)
@@ -204,11 +201,7 @@ def obj_func(theta, eval_gradient=True):
                                                        bounds))
             # Select result from run with minimal (negative) log-marginal
             # likelihood
-            self.theta_ = optima[np.argmin(map(itemgetter(1), optima))][0]
-            self.kernel_.theta = self.theta_
-        else:
-            # Use initially provided hyperparameters
-            self.theta_ = self.kernel_.theta
+            self.kernel_.theta = optima[np.argmin(map(itemgetter(1), optima))][0]
 
         # Precompute quantities required for predictions which are independent
         # of actual query points
diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py
index 3743fb12beccc..8334ccc639cce 100644
--- a/sklearn/gaussian_process/tests/test_gpc.py
+++ b/sklearn/gaussian_process/tests/test_gpc.py
@@ -87,6 +87,6 @@ def test_random_starts():
         gp = GaussianProcessClassifier(
             kernel=kernel, n_restarts_optimizer=n_restarts_optimizer,
             random_state=0,).fit(X, y)
-        lml = gp.log_marginal_likelihood(gp.theta_)
+        lml = gp.log_marginal_likelihood(gp.kernel_.theta)
         assert_greater(lml, last_lml - np.finfo(np.float32).eps)
         last_lml = lml
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index 1db0ffc3a4740..6033b3bca4d3a 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -126,7 +126,6 @@ def test_no_optimizer():
     kernel = RBF(1.0)
     gpr = GaussianProcessRegressor(kernel=kernel, optimizer=None).fit(X, y)
     assert_equal(np.exp(gpr.kernel_.theta), 1.0)
-    assert_equal(np.exp(gpr.theta_), 1.0)
 
 
 def test_predict_cov_vs_std():
@@ -173,7 +172,7 @@ def test_random_starts():
         gp = GaussianProcessRegressor(
             kernel=kernel, n_restarts_optimizer=n_restarts_optimizer,
             random_state=0,).fit(X, y)
-        lml = gp.log_marginal_likelihood(gp.theta_)
+        lml = gp.log_marginal_likelihood(gp.kernel_.theta)
         assert_greater(lml, last_lml - np.finfo(np.float32).eps)
         last_lml = lml
 

From 0272c48fed36057bf3ee7385c3fae28f1b9f0860 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 21 May 2015 10:19:11 +0200
Subject: [PATCH 095/165] ENH GPC can use an externally defined optimizer for
 hyperparameter tuning

---
 sklearn/gaussian_process/gpc.py | 54 ++++++++++++++++++++++++---------
 1 file changed, 39 insertions(+), 15 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 148b0f7628ed0..a6989f9950d87 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -55,11 +55,28 @@ class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
         Larger values correspond to increased noise level in the observations
         and reduce potential numerical issue during fitting.
 
-    optimizer : string, optional (default: "fmin_l_bfgs_b")
-        A string specifying the optimization algorithm used for optimizing the
-        kernel's parameters. Default uses 'fmin_l_bfgs_b' algorithm from
-        scipy.optimize. If None, the kernel's parameters are kept fixed.
-        Available optimizers are::
+    optimizer : string or callable, optional (default: "fmin_l_bfgs_b")
+        Can either be one of the internally supported optimizers for optimizing
+        the kernel's parameters, specified by a string, or an externally
+        defined optimizer passed as a callable. If a callable is passed, it
+        must have the  signature::
+
+            def optimizer(obj_func, initial_theta, bounds):
+                # * 'obj_func' is the objective function to be maximized, which
+                #   takes the hyperparameters theta as parameter and an
+                #   optional flag eval_gradient, which determines if the
+                #   gradient is returned additionally to the function value
+                # * 'initial_theta': the initial value for theta, which can be
+                #   used by local optimizers
+                # * 'bounds': the bounds on the values of theta
+                ....
+                # Returned are the best found hyperparameters theta and
+                # the corresponding value of the target function.
+                return theta_opt, func_min
+
+        Per default, the 'fmin_l_bfgs_b' algorithm from scipy.optimize
+        is used. If None is passed, the kernel's parameters are kept fixed.
+        Available internal optimizers are::
 
             'fmin_l_bfgs_b'
 
@@ -157,13 +174,18 @@ def fit(self, X, y):
                           % self.classes_)
             self.classes_ = np.array([self.classes_[0], self.classes_[0]])
 
-        if self.optimizer in ["fmin_l_bfgs_b"]:
+        if self.kernel_.n_dims == 0:  # no tunable hyperparameters
+            pass
+        elif self.optimizer is not None:
             # Choose hyperparameters based on maximizing the log-marginal
             # likelihood (potentially starting from several initial values)
-            def obj_func(theta):
-                lml, grad = self.log_marginal_likelihood(theta,
-                                                         eval_gradient=True)
-                return -lml, -grad
+            def obj_func(theta, eval_gradient=True):
+                if eval_gradient:
+                    lml, grad = self.log_marginal_likelihood(
+                        theta, eval_gradient=True)
+                    return -lml, -grad
+                else:
+                    return -self.log_marginal_likelihood(theta)
 
             # First optimize starting from theta specified in kernel
             optima = [(self._constrained_optimization(obj_func,
@@ -187,10 +209,6 @@ def obj_func(theta):
             # Select result from run with minimal (negative) log-marginal
             # likelihood
             self.kernel_.theta = optima[np.argmin(map(itemgetter(1), optima))][0]
-        elif self.optimizer is None:
-            pass
-        else:
-            raise ValueError("Unknown optimizer %s." % self.optimizer)
 
         # Precompute quantities required for predictions which are independent
         # of actual query points
@@ -379,7 +397,13 @@ def _posterior_mode(self, K, return_temporaries=False):
             return log_marginal_likelihood
 
     def _constrained_optimization(self, obj_func, initial_theta, bounds):
-        if self.optimizer in ["fmin_l_bfgs_b"]:
+        if self.optimizer == "fmin_l_bfgs_b":
             theta_opt, func_min, _ = \
                 fmin_l_bfgs_b(obj_func, initial_theta, bounds=bounds)
+        elif callable(self.optimizer):
+            theta_opt, func_min = \
+                self.optimizer(obj_func, initial_theta, bounds=bounds)
+        else:
+            raise ValueError("Unknown optimizer %s." % self.optimizer)
+
         return theta_opt, func_min

From 63660e27280fc55d9840f0db3e7810137337c54e Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 21 May 2015 10:19:24 +0200
Subject: [PATCH 096/165] TST Testing GPC with custom external optimizer

---
 sklearn/gaussian_process/tests/test_gpc.py | 23 ++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py
index 8334ccc639cce..86a04ee386c61 100644
--- a/sklearn/gaussian_process/tests/test_gpc.py
+++ b/sklearn/gaussian_process/tests/test_gpc.py
@@ -90,3 +90,26 @@ def test_random_starts():
         lml = gp.log_marginal_likelihood(gp.kernel_.theta)
         assert_greater(lml, last_lml - np.finfo(np.float32).eps)
         last_lml = lml
+
+
+def test_custom_optimizer():
+    """ Test that GPC can use externally defined optimizers. """
+    # Define a dummy optimizer that simply tests 1000 random hyperparameters
+    def optimizer(obj_func, initial_theta, bounds):
+        rng = np.random.RandomState(0)
+        theta_opt, func_min = \
+            initial_theta, obj_func(initial_theta, eval_gradient=False)
+        for _ in range(1000):
+            theta = np.atleast_1d(rng.uniform(np.maximum(-2, bounds[:, 0]),
+                                              np.minimum(1, bounds[:, 1])))
+            f = obj_func(theta, eval_gradient=False)
+            if f < func_min:
+                theta_opt, func_min = theta, f
+        return theta_opt, func_min
+
+    for kernel in kernels:
+        gpc = GaussianProcessClassifier(kernel=kernel, optimizer=optimizer)
+        gpc.fit(X, y)
+        # Checks that optimizer improved marginal likelihood
+        assert_greater(gpc.log_marginal_likelihood(gpc.kernel_.theta),
+                       gpc.log_marginal_likelihood(gpc.kernel.theta))

From a3ae48b24294a20f8356f005302a3c0278b5d2c1 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 21 May 2015 11:47:04 +0200
Subject: [PATCH 097/165] REFACTOR normalize_y in GPR does not modify
 standard-deviation

---
 examples/gaussian_process/plot_gpr_co2.py  |  9 ++++---
 sklearn/gaussian_process/gpr.py            | 28 ++++++++--------------
 sklearn/gaussian_process/tests/test_gpr.py | 15 ++++--------
 3 files changed, 19 insertions(+), 33 deletions(-)

diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py
index a3e24d5aa9e16..d6ec25ab8f2a4 100644
--- a/examples/gaussian_process/plot_gpr_co2.py
+++ b/examples/gaussian_process/plot_gpr_co2.py
@@ -69,7 +69,6 @@
 data = fetch_mldata('mauna-loa-atmospheric-co2').data
 X = data[:, [1]]
 y = data[:, 0]
-y_mean = y.mean()
 
 # Kernel with parameters given in GPML book
 k1 = 66.0**2 * RBF(l=67.0)  # long term smooth rising trend
@@ -79,7 +78,7 @@
 kernel_gpml = k1 + k2 + k3 + k4
 
 gp = GaussianProcessRegressor(kernel=kernel_gpml, sigma_squared_n=0,
-							         optimizer=None)
+							         optimizer=None, normalize_y=True)
 gp.fit(X, y)
 
 print("GPML kernel: %s" % gp.kernel_)
@@ -95,8 +94,9 @@
 									   c_bounds=(1e-3, np.inf))  # noise terms
 kernel = k1 + k2 + k3 + k4
 
-gp = GaussianProcessRegressor(kernel=kernel, sigma_squared_n=0)
-gp.fit(X, y - y_mean)
+gp = GaussianProcessRegressor(kernel=kernel, sigma_squared_n=0,
+                              normalize_y=True)
+gp.fit(X, y)
 
 print("\nLearned kernel: %s" % gp.kernel_)
 print("Log-marginal-likelihood: %.3f"
@@ -104,7 +104,6 @@
 
 X_ = np.linspace(X.min(), X.max() + 30, 1000)[:, np.newaxis]
 y_pred, y_std = gp.predict(X_, return_std=True)
-y_pred += y_mean
 
 # Illustration
 plt.scatter(X, y, c='k')
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 7cb0c0c213de3..3ce02f5fb7610 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -79,14 +79,12 @@ def optimizer(obj_func, initial_theta, bounds):
         must be finite.
 
     normalize_y: boolean, optional (default: False)
-        Whether the target values y are normalized, i.e., mean and standard
-        deviation of observed target values become zero and one, respectively.
-        This parameter should be set to True if the target values' mean is
-        expected to differ considerable from zero or if the standard deviation
-        of the target values is very small or large. When enabled, the
-        normalization effectively modifies the GP's prior based on the data,
-        which contradicts the likelihood principle; normalization is thus
-        disabled per default.
+        Whether the target values y are normalized, i.e., the mean of the
+        observed target values become zero. This parameter should be set to
+        True if the target values' mean is expected to differ considerable from
+        zero. When enabled, the normalization effectively modifies the GP's
+        prior based on the data, which contradicts the likelihood principle;
+        normalization is thus disabled per default.
 
     random_state : integer or numpy.RandomState, optional
         The generator used to initialize the centers. If an integer is
@@ -146,14 +144,10 @@ def fit(self, X, y):
         # Normalize target value
         if self.normalize_y:
             self.y_fit_mean = np.mean(y, axis=0)
-            self.y_fit_std = np.atleast_1d(np.std(y))  # XXX: std per dim?
-            self.y_fit_std[self.y_fit_std == 0.] = 1.
-            # center and scale y (and sigma_squared_n)
-            y = (y - self.y_fit_mean) / self.y_fit_std
-            self.sigma_squared_n /= self.y_fit_std**2
+            # demean y
+            y = y - self.y_fit_mean
         else:
             self.y_fit_mean = np.zeros(1)
-            self.y_fit_std = np.ones(1)
 
         if np.iterable(self.sigma_squared_n) \
            and self.sigma_squared_n.shape[0] != y.shape[0]:
@@ -266,11 +260,10 @@ def predict(self, X, return_std=False, return_cov=False):
         else:  # Predict based on GP posterior
             K_trans = self.kernel_(X, self.X_fit_)
             y_mean = K_trans.dot(self.alpha_)  # Line 4 (y_mean = f_star)
-            y_mean = self.y_fit_mean + self.y_fit_std * y_mean  # undo normal.
+            y_mean = self.y_fit_mean + y_mean  # undo normal.
             if return_cov:
                 v = cho_solve((self.L_, True), K_trans.T)  # Line 5
                 y_cov = self.kernel_(X) - K_trans.dot(v)  # Line 6
-                y_cov *= self.y_fit_std ** 2  # undo normalization
                 return y_mean, y_cov
             elif return_std:
                 # compute inverse K_inv of K based on its Cholesky
@@ -282,8 +275,7 @@ def predict(self, X, return_std=False, return_cov=False):
                 y_var -= np.sum(K_trans.T[:, np.newaxis] * K_trans.T
                                 * K_inv[:, :, np.newaxis],
                                 axis=0).sum(axis=0)  # axis=(0, 1) in np >= 1.7
-                y_std = np.sqrt(y_var) * self.y_fit_std  # undo normalization
-                return y_mean, y_std
+                return y_mean, np.sqrt(y_var)
             else:
                 return y_mean
 
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index 6033b3bca4d3a..e36542979281e 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -183,30 +183,25 @@ def test_y_normalization():
     Fitting non-normalizing GP on normalized y and fitting normalizing GP
     on unnormalized y should yield identical results
     """
-    y_mean, y_std = y.mean(0), y.std(0)
-    y_norm = (y - y_mean) / y_std
+    y_mean = y.mean(0)
+    y_norm = y - y_mean
     for kernel in kernels:
         # Fit non-normalizing GP on normalized y
-        gpr = GaussianProcessRegressor(kernel=kernel,
-                                       sigma_squared_n=1e-10 / y_std**2)
+        gpr = GaussianProcessRegressor(kernel=kernel)
         gpr.fit(X, y_norm)
         # Fit normalizing GP on unnormalized y
-        gpr_norm = GaussianProcessRegressor(kernel=kernel,
-                                            sigma_squared_n=1e-10,
-                                            normalize_y=True)
+        gpr_norm = GaussianProcessRegressor(kernel=kernel, normalize_y=True)
         gpr_norm.fit(X, y)
 
         # Compare predicted mean, std-devs and covariances
         y_pred, y_pred_std = gpr.predict(X2, return_std=True)
-        y_pred = y_mean + y_pred * y_std
-        y_pred_std *= y_std
+        y_pred = y_mean + y_pred
         y_pred_norm, y_pred_std_norm = gpr_norm.predict(X2, return_std=True)
 
         assert_almost_equal(y_pred, y_pred_norm)
         assert_almost_equal(y_pred_std, y_pred_std_norm)
 
         _, y_cov = gpr.predict(X2, return_cov=True)
-        y_cov *= y_std ** 2
         _, y_cov_norm = gpr_norm.predict(X2, return_cov=True)
         assert_almost_equal(y_cov, y_cov_norm)
 

From 60da93327c8260302f28480a8b40cf7b69b32137 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 28 May 2015 16:18:40 +0200
Subject: [PATCH 098/165] FIX Sampling theta_initial uniformly from log-bounds
 in GPR

---
 sklearn/gaussian_process/gpr.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 3ce02f5fb7610..87fcc4215f121 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -188,14 +188,15 @@ def obj_func(theta, eval_gradient=True):
                         "requires that all bounds are finite.")
                 bounds = self.kernel_.bounds
                 for iteration in range(1, self.n_restarts_optimizer):
-                    theta_initial = np.exp(self.rng.uniform(bounds[:, 0],
-                                                            bounds[:, 1]))
+                    theta_initial = \
+                        self.rng.uniform(bounds[:, 0], bounds[:, 1])
                     optima.append(
                         self._constrained_optimization(obj_func, theta_initial,
                                                        bounds))
             # Select result from run with minimal (negative) log-marginal
             # likelihood
-            self.kernel_.theta = optima[np.argmin(map(itemgetter(1), optima))][0]
+            self.kernel_.theta = \
+                optima[np.argmin(map(itemgetter(1), optima))][0]
 
         # Precompute quantities required for predictions which are independent
         # of actual query points

From 3f7672be65d8074f1f9af820042f91a34ddda7be Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 28 May 2015 16:19:14 +0200
Subject: [PATCH 099/165] FIX Correctly composing theta vector in GP kernels

---
 sklearn/gaussian_process/kernels.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index ccdf3aaea5ce3..aedfd5c365b70 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -97,7 +97,7 @@ def theta(self):
             if not isinstance(var_name, basestring):  # vector-valued parameter
                 var_name, _ = var_name
             theta.append(getattr(self, var_name))
-        return np.log(theta).ravel()
+        return np.log(np.hstack(theta))
 
     @theta.setter
     def theta(self, theta):
@@ -710,7 +710,7 @@ def diag(self, X):
         return self.c * np.ones(X.shape[0])
 
     def __repr__(self):
-        return "{0:.3g}".format(self.c)
+        return "{0:.3g}**2".format(np.sqrt(self.c))
 
 
 class WhiteKernel(Kernel):
@@ -778,7 +778,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
         else:
             K = np.zeros((X.shape[0], Y.shape[0]))
             # entries which are sufficiently similar to be considered identical
-            K[cdist(X, Y) < 1e-10] = self.c
+            #K[cdist(X, Y) < 1e-10] = self.c
             return K
 
     def diag(self, X):

From 9af9c2429856ec25ee21b45d2a90abcea26a9bed Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 25 Jun 2015 08:35:10 +0200
Subject: [PATCH 100/165] FIX Ensuring that predicted variances are never
 negative

---
 sklearn/gaussian_process/gpr.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 87fcc4215f121..9cb543ce216b9 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -4,6 +4,7 @@
 #
 # License: BSD 3 clause
 
+import warnings
 from operator import itemgetter
 
 import numpy as np
@@ -276,6 +277,13 @@ def predict(self, X, return_std=False, return_cov=False):
                 y_var -= np.sum(K_trans.T[:, np.newaxis] * K_trans.T
                                 * K_inv[:, :, np.newaxis],
                                 axis=0).sum(axis=0)  # axis=(0, 1) in np >= 1.7
+                # Check if any of the variances is negative because of
+                # numerical issues. If yes: set the the variance to 0.
+                y_var_negative = y_var < 0
+                if np.any(y_var_negative):
+                    warnings.warn("Predicted variances smaller than 0. "
+                                  "Setting those variances to 0.")
+                    y_var[y_var_negative] = 0.0
                 return y_mean, np.sqrt(y_var)
             else:
                 return y_mean

From b68237000110be436306bb84806053920bc3c5d0 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 7 Jul 2015 08:21:09 +0200
Subject: [PATCH 101/165] ENH Adding Matern kernel for Gaussian Processes

---
 .../plot_gpr_prior_posterior.py               |  12 +-
 sklearn/gaussian_process/kernels.py           | 147 +++++++++++++++++-
 .../gaussian_process/tests/test_kernels.py    |  36 ++++-
 3 files changed, 183 insertions(+), 12 deletions(-)

diff --git a/examples/gaussian_process/plot_gpr_prior_posterior.py b/examples/gaussian_process/plot_gpr_prior_posterior.py
index 9c6042e3e15a6..27528bbdc9822 100644
--- a/examples/gaussian_process/plot_gpr_prior_posterior.py
+++ b/examples/gaussian_process/plot_gpr_prior_posterior.py
@@ -18,8 +18,8 @@
 from matplotlib import pyplot as plt
 
 from sklearn.gaussian_process import GaussianProcessRegressor
-from sklearn.gaussian_process.kernels \
-    import RBF, RationalQuadratic, ExpSineSquared, DotProduct, ConstantKernel
+from sklearn.gaussian_process.kernels import (RBF, Matern, RationalQuadratic,
+    ExpSineSquared, DotProduct, ConstantKernel)
 
 
 kernels = [1.0 * RBF(l=1.0, l_bounds=(1e-1, 10.0)),
@@ -27,11 +27,10 @@
            1.0 * ExpSineSquared(l=1.0, p=3.0, l_bounds=(0.1, 10.0),
                                 p_bounds=(1.0, 10.0)),
            ConstantKernel(0.1, (0.01, 10.0)) \
-                * (DotProduct(sigma_0=1.0, sigma_0_bounds=(0.0, 10.0)) ** 2)]
+                * (DotProduct(sigma_0=1.0, sigma_0_bounds=(0.0, 10.0)) ** 2),
+           1.0 * Matern(l=1.0, l_bounds=(1e-1, 10.0), nu=1.5)]
 
 for fig_index, kernel in enumerate(kernels):
-    if fig_index > 3: continue
-
     # Specify Gaussian Process
     gp = GaussianProcessRegressor(kernel=kernel)
 
@@ -70,7 +69,8 @@
     plt.scatter(X[:, 0], y, c='r', s=50, zorder=10)
     plt.xlim(0, 5)
     plt.ylim(-3, 3)
-    plt.title("Posterior (kernel: %s)" % gp.kernel_)
+    plt.title("Posterior (kernel: %s)\n Log-Likelihood: %.3f"
+              % (gp.kernel_, gp.log_marginal_likelihood(gp.kernel_.theta)))
     plt.tight_layout()
 
 plt.show()
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index aedfd5c365b70..c13956ad5474e 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -22,8 +22,10 @@
 from abc import ABCMeta, abstractmethod
 from functools import partial
 import inspect
+import math
 
 import numpy as np
+from scipy.special import kv, gamma
 from scipy.spatial.distance import pdist, cdist, squareform
 
 from ..metrics.pairwise import pairwise_kernels
@@ -97,7 +99,10 @@ def theta(self):
             if not isinstance(var_name, basestring):  # vector-valued parameter
                 var_name, _ = var_name
             theta.append(getattr(self, var_name))
-        return np.log(np.hstack(theta))
+        if len(theta) > 0:
+            return np.log(np.hstack(theta))
+        else:
+            return np.array([])
 
     @theta.setter
     def theta(self, theta):
@@ -911,6 +916,144 @@ def __repr__(self):
             return "{0}(l={1:.3g})".format(self.__class__.__name__, self.l)
 
 
+class Matern(RBF):
+    """ Matern kernel.
+
+    The class of Matern kernels is a generalization of the RBF and the
+    absolute exponential kernel parameterized by an additional parameter
+    nu. The smaller nu, the less smooth the approximated function is.
+    For nu=inf, the kernel becomes equivalent to the RBF kernel and for nu=0.5
+    to the absolute exponential kernel. Important intermediate values are
+    nu=1.5 (once differentiable functions) and nu=2.5 (twice differentiable
+    functions).
+
+    See Rasmussen and Williams 2006, pp84 for details regarding the
+    different variants of the Matern kernel.
+
+    Parameters
+    -----------
+    l : float or array with shape (n_features,), entries > 0, default: 1.0
+        The length scale of the kernel. If a float, an isotropic kernel is
+        used. If an array, an anisotropic kernel is used where each dimension
+        of l defines the length-scale of the respective feature dimension.
+
+    l_bounds : pair of floats >= 0, default: (1e-5, 1e5)
+        The lower and upper bound on l
+
+    nu: float, default: 1.5
+        The parameter nu controlling the smoothness of the learned function.
+        The smaller nu, the less smooth the approximated function is.
+        For nu=inf, the kernel becomes equivalent to the RBF kernel and for
+        nu=0.5 to the absolute exponential kernel. Important intermediate
+        values are nu=1.5 (once differentiable functions) and nu=2.5
+        (twice differentiable functions). Note that values of nu not in
+        [0.5, 1.5, 2.5, inf] incur a considerably higher computational cost
+        (appr. 10 times higher) since they require to evaluate the modified
+        Bessel function. Furthermore, in contrast to l, nu is kept fixed to
+        its initial value and not optimized.
+    """
+    def __init__(self, l=1.0, l_bounds=(1e-5, 1e5), nu=1.5):
+        super(Matern, self).__init__(l, l_bounds)
+        self.nu = nu
+
+    def __call__(self, X, Y=None, eval_gradient=False):
+        """Return the kernel k(X, Y) and optionally its gradient.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples_X, n_features)
+            Left argument of the returned kernel k(X, Y)
+
+        Y : array, shape (n_samples_Y, n_features), (optional, default=None)
+            Right argument of the returned kernel k(X, Y). If None, k(X, X)
+            if evaluated instead.
+
+        eval_gradient : bool (optional, default=False)
+            Determines whether the gradient with respect to the kernel
+            hyperparameter is determined. Only supported when Y is None.
+
+        Returns
+        -------
+        K : array, shape (n_samples_X, n_samples_Y)
+            Kernel k(X, Y)
+
+        K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
+            The gradient of the kernel k(X, X) with respect to the
+            hyperparameter of the kernel. Only returned when eval_gradient
+            is True.
+        """
+        X = np.atleast_2d(X)
+        if Y is None:
+            dists = pdist(X / self.l, metric='euclidean')
+        else:
+            if eval_gradient:
+                raise ValueError(
+                    "Gradient can only be evaluated when Y is None.")
+            dists = cdist(X / self.l, Y / self.l, metric='euclidean')
+
+        if self.nu == 0.5:
+            K = np.exp(-dists)
+        elif self.nu == 1.5:
+            K = dists * math.sqrt(3)
+            K = (1. + K) * np.exp(-K)
+        elif self.nu == 2.5:
+            K = dists * math.sqrt(5)
+            K = (1. + K + K ** 2 / 3.0) * np.exp(-K)
+        else:  # general case; expensive to evaluate
+            K = dists
+            K[K == 0.0] += np.finfo(float).eps  # strict zeros result in nan
+            tmp = (math.sqrt(2 * self.nu) * K)
+            K.fill((2 ** (1. - self.nu)) / gamma(self.nu))
+            K *= tmp ** self.nu
+            K *= kv(self.nu, tmp)
+
+        if Y is None:
+            # convert from upper-triangular matrix to square matrix
+            K = squareform(K)
+            np.fill_diagonal(K, 1)
+
+        if eval_gradient:
+            if self.l_bounds is "fixed":  # Hyperparameter l kept fixed
+                K_gradient = np.empty((X.shape[0], X.shape[0], 0))
+                return K, K_gradient
+
+            # We need to recompute the pairwise dimension-wise distances
+            D = (X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2 \
+                    / (self.l ** 2)
+            if self.nu == 0.5:
+                K_gradient = K[..., np.newaxis] * D \
+                    /  np.sqrt(D.sum(2))[:, :, np.newaxis]
+                K_gradient[~np.isfinite(K_gradient)] = 0
+            elif self.nu == 1.5:
+                K_gradient = \
+                    3 * D * np.exp(-np.sqrt(3 * D.sum(-1)))[..., np.newaxis]
+            elif self.nu == 2.5:
+                tmp = np.sqrt(5 * D.sum(-1))[..., np.newaxis]
+                K_gradient = 5.0/3.0 * D * (tmp + 1) * np.exp(-tmp)
+            else:
+                # approximate gradient numerically
+                def f(theta):  # helper function
+                    return self.clone_with_theta(theta)(X, Y)
+                return K, _approx_fprime(self.theta, f, 1e-10)
+
+            if not np.iterable(self.l) or self.l.shape[0] == 1:
+                return K, K_gradient[:, :].sum(-1)[:, :, np.newaxis]
+            else:
+                return K, K_gradient
+        else:
+            return K
+
+    def __repr__(self):
+        if np.iterable(self.l):  # anisotropic
+            return "{0}(l=[{1}], nu={2:.3g})".format(
+                self.__class__.__name__,
+                ", ".join(map("{0:.3g}".format, self.l)),
+                self.nu)
+        else:  # isotropic
+            return "{0}(l={1:.3g}, nu={2:.3g})".format(
+                self.__class__.__name__, self.l, self.nu)
+
+
 class RationalQuadratic(Kernel):
     """Rational Quadratic kernel.
 
@@ -1243,7 +1386,7 @@ class PairwiseKernel(Kernel):
 
     Parameters
     ----------
-    gam ma: float >= 0, default: 1.0
+    gamma: float >= 0, default: 1.0
         Parameter gamma of the pairwise kernel specified by metric
 
     gamma_bounds : pair of floats >= 0, default: (1e-5, 1e5)
diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index 394240b05b691..fe4f7e28ca138 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -10,23 +10,32 @@
 
 from scipy.optimize import approx_fprime
 
-from sklearn.metrics.pairwise import PAIRWISE_KERNEL_FUNCTIONS
+from sklearn.metrics.pairwise \
+    import PAIRWISE_KERNEL_FUNCTIONS, euclidean_distances
 from sklearn.gaussian_process.kernels \
-    import (RBF, RationalQuadratic, ExpSineSquared, DotProduct,
+    import (RBF, Matern, RationalQuadratic, ExpSineSquared, DotProduct,
             ConstantKernel, WhiteKernel, PairwiseKernel, KernelOperator,
             Exponentiation)
 from sklearn.base import clone
 
 from sklearn.utils.testing import (assert_equal, assert_almost_equal,
-    assert_not_equal, assert_array_equal)
+    assert_not_equal, assert_array_equal, assert_array_almost_equal)
 
 
-X = np.random.normal(0, 1, (10, 2))
+X = np.random.RandomState(0).normal(0, 1, (10, 2))
 
 kernels = [RBF(l=2.0), RBF(l_bounds=(0.5, 2.0)),
            ConstantKernel(c=10.0),
+           2.0 * RBF(l=0.33, l_bounds="fixed"),
            2.0 * RBF(l=0.5), RBF(l=2.0) + WhiteKernel(c=3.0),
            2.0 * RBF(l=[0.5, 2.0]),
+           2.0 * Matern(l=0.33, l_bounds="fixed"),
+           2.0 * Matern(l=0.5, nu=0.5),
+           2.0 * Matern(l=1.5, nu=1.5),
+           2.0 * Matern(l=2.5, nu=2.5),
+           2.0 * Matern(l=[0.5, 2.0], nu=0.5),
+           3.0 * Matern(l=[2.0, 0.5], nu=1.5),
+           4.0 * Matern(l=[0.5, 0.5], nu=2.5),
            RationalQuadratic(l=0.5, alpha=1.5),
            ExpSineSquared(l=0.5, p=1.5),
            DotProduct(sigma_0=2.0), DotProduct(sigma_0=2.0) ** 2]
@@ -185,3 +194,22 @@ def test_kernel_clone():
             if not isinstance(attr_value, Hashable):
                 # modifiable attributes must not be identical
                 assert_not_equal(id(attr_value), id(attr_value_cloned))
+
+
+def test_matern_kernel():
+    """ Test consistency of Matern kernel for special values of nu. """
+    K = Matern(nu=1.5, l=1.0)(X)
+    # the diagonal elements of a matern kernel are 1
+    assert_array_almost_equal(np.diag(K), np.ones(X.shape[0]))
+    # matern kernel for coef0==0.5 is equal to absolute exponential kernel
+    K_absexp = np.exp(-euclidean_distances(X, X, squared=False))
+    K = Matern(nu=0.5, l=1.0)(X)
+    assert_array_almost_equal(K, K_absexp)
+    # test that special cases of matern kernel (coef0 in [0.5, 1.5, 2.5])
+    # result in nearly identical results as the general case for coef0 in
+    # [0.5 + tiny, 1.5 + tiny, 2.5 + tiny]
+    tiny = 1e-10
+    for nu in [0.5, 1.5, 2.5]:
+        K1 = Matern(nu=nu, l=1.0)(X)
+        K2 = Matern(nu=nu + tiny, l=1.0)(X)
+        assert_array_almost_equal(K1, K2)

From 537790a6f18b5e62b0d4f664a9463f3757ca8a0c Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 7 Jul 2015 09:22:33 +0200
Subject: [PATCH 102/165] DOC Adding documentation on Matern kernel

---
 doc/modules/classes.rst          |  1 +
 doc/modules/gaussian_process.rst | 58 ++++++++++++++++++++++++++++----
 2 files changed, 52 insertions(+), 7 deletions(-)

diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index f69618cfd5078..ed8e937cbdcc9 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -505,6 +505,7 @@ Kernels:
   gaussian_process.kernels.ConstantKernel
   gaussian_process.kernels.WhiteKernel
   gaussian_process.kernels.RBF
+  gaussian_process.kernels.Matern
   gaussian_process.kernels.RationalQuadratic
   gaussian_process.kernels.ExpSineSquared
   gaussian_process.kernels.DotProduct
diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst
index fabc4b099b21b..0ee5b20f5db21 100644
--- a/doc/modules/gaussian_process.rst
+++ b/doc/modules/gaussian_process.rst
@@ -182,7 +182,7 @@ different properties of the signal:
  - smaller, medium term irregularities are to be explained by a
    RationalQuadratic kernel component, whose length-scale and alpha parameter,
    which determines the diffuseness of the length-scales, are to be determined.
-   According to [RW2006], these irregularities can better be explained by
+   According to [RW2006]_, these irregularities can better be explained by
    a RationalQuadratic than an RBF kernel component, probably because it can
    accommodate several length-scales.
  - a "noise" term, consisting of an RBF kernel contribution, which shall
@@ -323,20 +323,64 @@ The :class:`RBF` kernel is a stationary kernel. It is also known as the "squared
 exponential" kernel. It is parameterized by a length-scale parameter :math:`l>0`, which
 can either be a scalar (isotropic variant of the kernel) or a vector with the same
 number of dimensions as the inputs :math:`x` (anisotropic variant of the kernel).
-The kernel given by:
+The kernel is given by:
 
 .. math::
    k(x_i, x_j) = \text{exp}\left(-\frac{1}{2} d(x_i / l, x_j / l)^2\right)
 
 This kernel is infinitely differentiable, which implies that GPs with this
 kernel as covariance function have mean square derivatives of all orders, and are thus
-very smooth. The prior and posterior of a GP resulting from an RBF kernel is shown in
+very smooth. The prior and posterior of a GP resulting from an RBF kernel are shown in
 the following figure:
 
 .. figure:: ../auto_examples/gaussian_process/images/plot_gpr_prior_posterior_000.png
    :target: ../auto_examples/gaussian_process/plot_gpr_prior_posterior.html
    :align: center
 
+
+Matérn kernel
+-------------
+The :class:`Matern` kernel is a stationary kernel and a generalization of the
+:class:`RBF` kernel. It has an additional parameter :math:`\nu` which controls
+the smoothness of the resulting function. It is parameterized by a length-scale parameter :math:`l>0`, which can either be a scalar (isotropic variant of the kernel) or a vector with the same number of dimensions as the inputs :math:`x` (anisotropic variant of the kernel). The kernel is given by:
+
+.. math::
+
+    k(x_i, x_j) = \sigma^2\frac{1}{\Gamma(\nu)2^{\nu-1}}\Bigg(\gamma\sqrt{2\nu} d(x_i / l, x_j / l)\Bigg)^\nu K_\nu\Bigg(\gamma\sqrt{2\nu} d(x_i / l, x_j / l)\Bigg),
+
+As :math:`\nu\rightarrow\infty`, the Matérn kernel converges to the RBF kernel.
+When :math:`\nu = 1/2`, the Matérn kernel becomes identical to the absolute
+exponential kernel, i.e.,
+
+.. math::
+    k(x_i, x_j) = \sigma^2 \exp \Bigg(-\gamma d(x_i / l, x_j / l) \Bigg) \quad \quad \nu= \tfrac{1}{2}
+
+In particular, :math:`\nu = 3/2`:
+
+.. math::
+    k(x_i, x_j) = \sigma^2 \Bigg(1 + \gamma \sqrt{3} d(x_i / l, x_j / l)\Bigg) \exp \Bigg(-\gamma \sqrt{3}d(x_i / l, x_j / l) \Bigg) \quad \quad \nu= \tfrac{3}{2}
+
+and :math:`\nu = 5/2`:
+
+.. math::
+    k(x_i, x_j) = \sigma^2 \Bigg(1 + \gamma \sqrt{5}d(x_i / l, x_j / l) +\frac{5}{3} \gamma^2d(x_i / l, x_j / l)^2 \Bigg) \exp \Bigg(-\gamma \sqrt{5}d(x_i / l, x_j / l) \Bigg) \quad \quad \nu= \tfrac{5}{2}
+
+are popular choices for learning functions that are not infinitely
+differentiable (as assumed by the RBF kernel) but at least once (:math:`\nu =
+3/2`) or twice differentiable (:math:`\nu = 5/2`).
+
+The flexibility of controlling the smoothness of the learned function via :math:`\nu`
+allows adapting to the properties of the true underlying functional relation.
+The prior and posterior of a GP resulting from a Matérn kernel are shown in
+the following figure:
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_prior_posterior_004.png
+   :target: ../auto_examples/gaussian_process/plot_gpr_prior_posterior.html
+   :align: center
+
+See [RW2006]_, p. 84, for further details regarding the
+different variants of the Matérn kernel.
+
 Rational quadratic kernel
 -------------------------
 
@@ -344,12 +388,12 @@ The :class:`RationalQuadratic` kernel can be seen as a scale mixture (an infinit
 of :class:`RBF` kernels with different characteristic length-scales. It is parameterized
 by a length-scale parameter :math:`l>0` and a scale mixture parameter  :math:`\alpha>0`
 Only the isotropic variant where :math:`l` is a scalar is supported at the moment.
-The kernel given by:
+The kernel is given by:
 
 .. math::
    k(x_i, x_j) = \left(1 + \frac{d(x_i, x_j)^2}{2\alpha l^2}\right)^\alpha
 
-The prior and posterior of a GP resulting from an RBF kernel is shown in
+The prior and posterior of a GP resulting from a RationalQuadratic kernel are shown in
 the following figure:
 
 .. figure:: ../auto_examples/gaussian_process/images/plot_gpr_prior_posterior_001.png
@@ -362,12 +406,12 @@ Exp-Sine-Squared kernel
 The :class:`ExpSineSquared` kernel allows modeling periodic functions.
 It is parameterized by a length-scale parameter :math:`l>0` and a periodicity parameter
 :math:`p>0`. Only the isotropic variant where :math:`l` is a scalar is supported at the moment.
-The kernel given by:
+The kernel is given by:
 
 .. math::
    k(x_i, x_j) = \text{exp}\left(-2 \text{sin}(\pi / p * d(x_i, x_j)) / l\right)^2
 
-The prior and posterior of a GP resulting from an ExpSineSquared kernel is shown in
+The prior and posterior of a GP resulting from an ExpSineSquared kernel are shown in
 the following figure:
 
 .. figure:: ../auto_examples/gaussian_process/images/plot_gpr_prior_posterior_002.png

From 245b262c8e79b4d8e1d3869f11d6a3ce720bf3b5 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 14 Jul 2015 08:19:50 +0200
Subject: [PATCH 103/165] TST Added test checking that GPR can deal with
 different outputs for same input

---
 sklearn/gaussian_process/tests/test_gpr.py | 34 +++++++++++++++++++---
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index e36542979281e..ed1692cb6ca39 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -26,9 +26,9 @@ def f(x):
 kernels = [RBF(l=1.0), RBF(l=1.0, l_bounds=(1e-3, 1e3)),
            C(1.0, (1e-2, 1e2)) * RBF(l=1.0, l_bounds=(1e-3, 1e3)),
            C(1.0, (1e-2, 1e2)) * RBF(l=1.0, l_bounds=(1e-3, 1e3))
-               + C(1e-5, (1e-5, 1e2)),
+           + C(1e-5, (1e-5, 1e2)),
            C(0.1, (1e-2, 1e2)) * RBF(l=1.0, l_bounds=(1e-3, 1e3))
-               + C(1e-5, (1e-5, 1e2))]
+           + C(1e-5, (1e-5, 1e2))]
 
 
 def test_gpr_interpolation():
@@ -101,7 +101,7 @@ def test_prior():
         assert_almost_equal(y_mean, 0, 5)
         if len(gpr.kernel.theta) > 1:
             # XXX: quite hacky, works only for current kernels
-            assert_almost_equal(np.diag(y_cov), np.exp(kernel.theta[0]) , 5)
+            assert_almost_equal(np.diag(y_cov), np.exp(kernel.theta[0]), 5)
         else:
             assert_almost_equal(np.diag(y_cov), 1, 5)
 
@@ -144,7 +144,7 @@ def test_anisotropic_kernel():
     # least a factor 5
     rng = np.random.RandomState(0)
     X = rng.uniform(-1, 1, (50, 2))
-    y = X[:, 0] +  0.1 * X[:, 1]
+    y = X[:, 0] + 0.1 * X[:, 1]
 
     kernel = RBF([1.0, 1.0])
     gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
@@ -270,3 +270,29 @@ def optimizer(obj_func, initial_theta, bounds):
         # Checks that optimizer improved marginal likelihood
         assert_greater(gpr.log_marginal_likelihood(gpr.kernel_.theta),
                        gpr.log_marginal_likelihood(gpr.kernel.theta))
+
+
+def test_duplicate_input():
+    """Test GPR can handle two different output-values for the same input."""
+    for kernel in kernels:
+        gpr_equal_inputs = \
+            GaussianProcessRegressor(kernel=kernel, sigma_squared_n=1e-2)
+        gpr_similar_inputs = \
+            GaussianProcessRegressor(kernel=kernel, sigma_squared_n=1e-2)
+
+        X_ = np.vstack((X, X[0]))  # exact duplicate of the first input
+        y_ = np.hstack((y, y[0] + 1))  # ... paired with a different target
+        gpr_equal_inputs.fit(X_, y_)
+
+        X_ = np.vstack((X, X[0] + 1e-15))  # near-duplicate of first input
+        y_ = np.hstack((y, y[0] + 1))
+        gpr_similar_inputs.fit(X_, y_)
+
+        X_test = np.linspace(0, 10, 100)[:, None]
+        y_pred_equal, y_std_equal = \
+            gpr_equal_inputs.predict(X_test, return_std=True)
+        y_pred_similar, y_std_similar = \
+            gpr_similar_inputs.predict(X_test, return_std=True)
+
+        assert_almost_equal(y_pred_equal, y_pred_similar)  # both fits agree
+        assert_almost_equal(y_std_equal, y_std_similar)

From 4aeef9524dadbb7dd15bc14b167c909efb4b5462 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 30 Jul 2015 10:12:24 +0200
Subject: [PATCH 104/165] FIX Checking for correct dimensionality of
 anisotropic kernels

---
 sklearn/gaussian_process/kernels.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index c13956ad5474e..6d4a85edba703 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -875,6 +875,11 @@ def __call__(self, X, Y=None, eval_gradient=False):
             is True.
         """
         X = np.atleast_2d(X)
+        if np.iterable(self.l) and X.shape[1] != self.l.shape[0]:
+            raise Exception("Anisotropic kernel must have the same number of "
+                            "dimensions as data (%d!=%d)"
+                            % (self.l.shape[0], X.shape[1]))
+
         if Y is None:
             dists = pdist(X / self.l, metric='sqeuclidean')
             K = np.exp(-.5 * dists)
@@ -983,6 +988,11 @@ def __call__(self, X, Y=None, eval_gradient=False):
             is True.
         """
         X = np.atleast_2d(X)
+        if np.iterable(self.l) and X.shape[1] != self.l.shape[0]:
+            raise Exception("Anisotropic kernel must have the same number of "
+                            "dimensions as data (%d!=%d)"
+                            % (self.l.shape[0], X.shape[1]))
+
         if Y is None:
             dists = pdist(X / self.l, metric='euclidean')
         else:

From a4f55703f16ae4d562ea010a53b4f086b7350750 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Fri, 31 Jul 2015 11:24:51 +0200
Subject: [PATCH 105/165] FIX RBF and Matern kernel handle 1d length 1
 length-scales correctly

---
 sklearn/gaussian_process/kernels.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 6d4a85edba703..77de5aadd0796 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -836,7 +836,10 @@ class RBF(Kernel):
     """
     def __init__(self, l=1.0, l_bounds=(1e-5, 1e5)):
         if np.iterable(l):
-            self.l = np.asarray(l, dtype=np.float)
+            if len(l) > 1:
+                self.l = np.asarray(l, dtype=np.float)
+            else:
+                self.l = float(l[0])
         else:
             self.l = float(l)
         self.l_bounds = l_bounds

From 692ec6655471fc64424c8f804a057b9d36dc0487 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 19 Apr 2015 18:00:22 +0200
Subject: [PATCH 106/165] DOC Added further doc for GPC

---
 doc/modules/gaussian_process.rst | 34 ++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst
index 0ee5b20f5db21..66abe047e085c 100644
--- a/doc/modules/gaussian_process.rst
+++ b/doc/modules/gaussian_process.rst
@@ -220,6 +220,40 @@ until around 2015.
 Gaussian Process Classification (GPC)
 =====================================
 
+.. currentmodule:: sklearn.gaussian_process
+
+The :class:`GaussianProcessClassifier` implements Gaussian processes (GP) for
+classification purposes, more specifically for probabilistic classification,
+where test predictions take the form of class probabilities.
+GaussianProcessClassifier places a GP prior on a latent function :math:`f`,
+which is then squashed through a link function to obtain the probabilistic
+classification. The latent function :math:`f` is a so-called nuisance function,
+whose values are not observed and are not relevant by themselves.
+Its purpose is to allow a convenient formulation of the model, and :math:`f`
+is removed (integrated out) during prediction. GaussianProcessClassifier
+implements the logistic link function, for which the integral cannot be
+computed analytically but is easily approximated in the binary case.
+
+In contrast to the regression setting, the posterior of the latent function
+:math:`f` is not Gaussian even for a GP prior since a Gaussian likelihood is
+inappropriate for discrete class labels. Rather, a non-Gaussian likelihood
+corresponding to the logistic link function (logit) is used.
+GaussianProcessClassifier approximates the non-Gaussian posterior with a
+Gaussian based on the Laplace approximation. More details can be found in
+Chapter 3 of [RW2006]_.
+
+The GP prior mean is assumed to be zero. The prior's
+covariance is specified by passing a :ref:`kernel <gp_kernels>` object. The
+hyperparameters of the kernel are optimized during fitting of
+GaussianProcessClassifier by maximizing the log-marginal-likelihood (LML) based
+on the passed `optimizer`. As the LML may have multiple local optima, the
+optimizer can be started repeatedly by specifying `n_restarts_optimizer`. The
+first run is always conducted starting from the initial hyperparameter values
+of the kernel; subsequent runs are conducted from hyperparameter values
+that have been chosen randomly from the range of allowed values.
+If the initial hyperparameters should be kept fixed, `None` can be passed as
+optimizer.
+
 GPC examples
 ============
 

From 2396737a24a10b095f89ebada1b0af2733221e36 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 2 Aug 2015 10:25:58 +0200
Subject: [PATCH 107/165] ADD CompoundKernel for multi-class GP classification

---
 sklearn/gaussian_process/kernels.py | 127 ++++++++++++++++++++++++++++
 1 file changed, 127 insertions(+)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 77de5aadd0796..4aa45b4d8c97d 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -244,6 +244,133 @@ def is_stationary(self):
         return True
 
 
+class CompoundKernel(Kernel):
+    """Kernel which is composed of a set of other kernels."""
+
+    def __init__(self, kernels):
+        self.kernels = kernels
+
+    def get_params(self, deep=True):
+        """Get parameters of this kernel.
+
+        Parameters
+        ----------
+        deep: boolean, optional
+            If True, will return the parameters for this estimator and
+            contained subobjects that are estimators.
+
+        Returns
+        -------
+        params : mapping of string to any
+            Parameter names mapped to their values.
+        """
+        return dict(kernels=self.kernels)
+
+    @property
+    def theta(self):
+        """Returns the (flattened, log-transformed) non-fixed hyperparameters.
+
+        Note that theta are typically the log-transformed values of the
+        kernel's hyperparameters as this representation of the search space
+        is more amenable for hyperparameter search, as hyperparameters like
+        length-scales naturally live on a log-scale.
+
+        Returns
+        -------
+        theta : array, shape (n_dims,)
+            The non-fixed, log-transformed hyperparameters of the kernel
+        """
+        return np.hstack([kernel.theta for kernel in self.kernels])
+
+    @theta.setter
+    def theta(self, theta):
+        """Sets the (flattened, log-transformed) non-fixed hyperparameters.
+
+        Parameters
+        ----------
+        theta : array, shape (n_dims,)
+            The non-fixed, log-transformed hyperparameters of the kernel
+        """
+        k_dims = self.kernels[0].n_dims  # assumes equal n_dims per kernel
+        for i, kernel in enumerate(self.kernels):
+            kernel.theta = theta[i*k_dims:(i+1)*k_dims]
+
+    @property
+    def bounds(self):
+        """Returns the bounds on the kernel's hyperparameters theta.
+
+        Returns
+        -------
+        bounds : array, shape (n_dims, 2)
+            The bounds on the kernel's hyperparameters theta
+        """
+        return np.vstack([kernel.bounds for kernel in self.kernels])
+
+    @bounds.setter
+    def bounds(self, bounds):
+        """Sets the bounds on the kernel's hyperparameters theta.
+
+        Parameters
+        ----------
+        bounds : array, shape (n_dims, 2)
+            The bounds on the kernel's hyperparameters theta
+        """
+        k_dims = self.kernels[0].n_dims  # assumes equal n_dims per kernel
+        for i, kernel in enumerate(self.kernels):
+            kernel.bounds = bounds[i*k_dims:(i+1)*k_dims]
+
+    def __call__(self, X, Y=None, eval_gradient=False):
+        """Return the kernel k(X, Y) and optionally its gradient.
+
+        Note that this compound kernel returns the results of all simple kernel
+        stacked along an additional axis.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples_X, n_features)
+            Left argument of the returned kernel k(X, Y)
+
+        Y : array, shape (n_samples_Y, n_features), (optional, default=None)
+            Right argument of the returned kernel k(X, Y). If None, k(X, X)
+            is evaluated instead.
+
+        eval_gradient : bool (optional, default=False)
+            Determines whether the gradient with respect to the kernel
+            hyperparameter is determined.
+
+        Returns
+        -------
+        K : array, shape (n_samples_X, n_samples_Y, n_kernels)
+            Kernel k(X, Y)
+
+        K_gradient : array, shape (n_samples_X, n_samples_X, n_dims, n_kernels)
+            The gradient of the kernel k(X, X) with respect to the
+            hyperparameter of the kernel. Only returned when eval_gradient
+            is True.
+        """
+        if eval_gradient:
+            K = []
+            K_grad = []
+            for kernel in self.kernels:
+                K_single, K_grad_single = kernel(X, Y, eval_gradient)
+                K.append(K_single)
+                K_grad.append(K_grad_single[..., np.newaxis])
+            return np.dstack(K), np.concatenate(K_grad, 3)
+        else:
+            return np.dstack([kernel(X, Y, eval_gradient)
+                              for kernel in self.kernels])
+
+    def __eq__(self, b):
+        if type(self) != type(b) or len(self.kernels) != len(b.kernels):
+            return False
+        return np.all([self.kernels[i] == b.kernels[i]
+                       for i in range(len(self.kernels))])
+
+    def is_stationary(self):
+        """Returns whether the kernel is stationary. """
+        return np.all([kernel.is_stationary() for kernel in self.kernels])
+
+
 class KernelOperator(Kernel):
     """Base class for all kernel operators. """
 

From c86ed8e1511deb83105622195301963098a289cd Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 2 Aug 2015 10:26:26 +0200
Subject: [PATCH 108/165] FIX GPR's random_state is cloned correctly

---
 sklearn/gaussian_process/gpr.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 9cb543ce216b9..3384a2c7d1d82 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -118,7 +118,8 @@ def __init__(self, kernel=None, sigma_squared_n=1e-10,
         self.optimizer = optimizer
         self.n_restarts_optimizer = n_restarts_optimizer
         self.normalize_y = normalize_y
-        self.rng = check_random_state(random_state)
+        self.random_state = random_state
+        self.rng = check_random_state(self.random_state)
 
     def fit(self, X, y):
         """Fit Gaussian process regression model

From 3867429f9eb74a95b987fa22038ca5b1163bf549 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 2 Aug 2015 10:43:40 +0200
Subject: [PATCH 109/165] ENH Support for multi-class GPC added based on
 OneVsRestClassifier

---
 sklearn/gaussian_process/gpc.py | 172 ++++++++++++++++++++++++++++++--
 1 file changed, 164 insertions(+), 8 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index a6989f9950d87..a0539b61aa46f 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -13,10 +13,12 @@
 from scipy.special import erf
 
 from sklearn.base import BaseEstimator, ClassifierMixin, clone
-from sklearn.gaussian_process.kernels import RBF
+from sklearn.gaussian_process.kernels import RBF, CompoundKernel
 from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
 from sklearn.utils import check_random_state
 from sklearn.preprocessing import LabelEncoder
+from sklearn.multiclass import OneVsRestClassifier
+
 
 # Values required for approximating the logistic sigmoid by
 # error functions. coefs are obtained via:
@@ -29,8 +31,8 @@
                   128.12323805, -2010.49422654])[:, np.newaxis]
 
 
-class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
-    """Gaussian process classification (GPC) based on Laplace approximation.
+class BinaryGaussianProcessClassifierLaplace(BaseEstimator, ClassifierMixin):
+    """Binary Gaussian process classification (GPC) based on Laplace approximation.
 
     The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
     ``Gaussian Processes for Machine Learning'' (GPML) by Rasmussen and
@@ -39,9 +41,7 @@ class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
     Internally, the Laplace approximation is used for approximating the
     non-Gaussian posterior by a Gaussian.
 
-    Currently, the implementation is restricted to
-      * using the logistic link function
-      * and binary classification
+    Currently, the implementation is restricted to using the logistic link function
 
     Parameters
     ----------
@@ -126,7 +126,6 @@ def optimizer(obj_func, initial_theta, bounds):
         values for the observed labels. Since W is diagonal, only the diagonal
         of sqrt(W) is stored.
     """
-
     def __init__(self, kernel=None, jitter=0.0, optimizer="fmin_l_bfgs_b",
                  n_restarts_optimizer=1, warm_start=False, random_state=None):
         self.kernel = kernel
@@ -134,7 +133,8 @@ def __init__(self, kernel=None, jitter=0.0, optimizer="fmin_l_bfgs_b",
         self.optimizer = optimizer
         self.n_restarts_optimizer = n_restarts_optimizer
         self.warm_start = warm_start
-        self.rng = check_random_state(random_state)
+        self.random_state = random_state
+        self.rng = check_random_state(self.random_state)
 
     def fit(self, X, y):
         """Fit Gaussian process regression model
@@ -407,3 +407,159 @@ def _constrained_optimization(self, obj_func, initial_theta, bounds):
             raise ValueError("Unknown optimizer %s." % self.optimizer)
 
         return theta_opt, func_min
+
+
+class GaussianProcessClassifier(OneVsRestClassifier):
+    """Gaussian process classification (GPC) based on Laplace approximation.
+
+    The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
+    ``Gaussian Processes for Machine Learning'' (GPML) by Rasmussen and
+    Williams.
+
+    Internally, the Laplace approximation is used for approximating the
+    non-Gaussian posterior by a Gaussian.
+
+    Currently, the implementation is restricted to using the logistic link
+    function. For multi-class classification, several binary one-versus rest
+    classifiers are fitted. Note that this class thus does not implement
+    a true multi-class Laplace approximation.
+
+    Parameters
+    ----------
+    kernel : kernel object
+        The kernel specifying the covariance function of the GP. If None is
+        passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
+        the kernel's hyperparameters are optimized during fitting.
+
+    jitter : float, optional (default: 0.0)
+        Value added to the diagonal of the kernel matrix during fitting.
+        Larger values correspond to increased noise level in the observations
+        and reduce potential numerical issues during fitting.
+
+    optimizer : string or callable, optional (default: "fmin_l_bfgs_b")
+        Can either be one of the internally supported optimizers for optimizing
+        the kernel's parameters, specified by a string, or an externally
+        defined optimizer passed as a callable. If a callable is passed, it
+        must have the signature::
+
+            def optimizer(obj_func, initial_theta, bounds):
+                # * 'obj_func' is the objective function to be maximized, which
+                #   takes the hyperparameters theta as parameter and an
+                #   optional flag eval_gradient, which determines if the
+                #   gradient is returned additionally to the function value
+                # * 'initial_theta': the initial value for theta, which can be
+                #   used by local optimizers
+                # * 'bounds': the bounds on the values of theta
+                ....
+                # Returned are the best found hyperparameters theta and
+                # the corresponding value of the target function.
+                return theta_opt, func_min
+
+        Per default, the 'fmin_l_bfgs_b' algorithm from scipy.optimize
+        is used. If None is passed, the kernel's parameters are kept fixed.
+        Available internal optimizers are::
+
+            'fmin_l_bfgs_b'
+
+    n_restarts_optimizer: int, optional (default: 1)
+        The number of restarts of the optimizer for finding the kernel's
+        parameters which maximize the log-marginal likelihood. The first run
+        of the optimizer is performed from the kernel's initial parameters,
+        the remaining ones (if any) from thetas sampled log-uniform randomly
+        from the space of allowed theta-values. If greater than 1, all bounds
+        must be finite.
+
+    warm_start : bool, optional (default: False)
+        If warm-starts are enabled, the solution of the last Newton iteration
+        on the Laplace approximation of the posterior mode is used as
+        initialization for the next call of _posterior_mode(). This can speed
+        up convergence when _posterior_mode is called several times on similar
+        problems as in hyperparameter optimization.
+
+    random_state : integer or numpy.RandomState, optional
+        Seed or random number generator handed to the underlying binary
+        classifiers. If an integer is given, it fixes the seed. Defaults to
+        the global numpy random number generator.
+
+    n_jobs : int, optional, default: 1
+        The number of jobs to use for the computation. If -1 all CPUs are used.
+        If 1 is given, no parallel computing code is used at all, which is
+        useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
+        used. Thus for n_jobs = -2, all CPUs but one are used.
+
+    Attributes
+    ----------
+    kernel_: kernel object
+        The kernel used for prediction. In case of binary classification,
+        the structure of the kernel is the same as the one passed as parameter
+        but with optimized hyperparameters. In case of multi-class
+        classification, a CompoundKernel is returned which consists of the
+        different kernels used in the one-versus-rest classifiers.
+
+    classes_ : array-like, shape = (n_classes,)
+        Unique class labels.
+    """
+    def __init__(self, kernel=None, jitter=0.0, optimizer="fmin_l_bfgs_b",
+                 n_restarts_optimizer=1, warm_start=False, random_state=None,
+                 n_jobs=1):
+        self.base_estimator = BinaryGaussianProcessClassifierLaplace(
+            kernel, jitter, optimizer, n_restarts_optimizer, warm_start,
+            random_state)
+        super(GaussianProcessClassifier, self).__init__(
+            self.base_estimator, n_jobs)
+
+    @property
+    def kernel_(self):
+        if len(self.estimators_) == 1:
+            return self.estimators_[0].kernel_
+        else:
+            return CompoundKernel([estimator.kernel_
+                                   for estimator in self.estimators_])
+
+    def log_marginal_likelihood(self, theta, eval_gradient=False):
+        """Returns log-marginal likelihood of theta for training data.
+
+        In the case of multi-class classification, the mean log-marginal
+        likelihood of the one-versus-rest classifiers is returned.
+
+        Parameters
+        ----------
+        theta : array-like, shape = (n_kernel_params,)
+            Kernel hyperparameters for which the log-marginal likelihood is
+            evaluated. In the case of multi-class classification, theta must
+            be the hyperparameters of the compound kernel.
+
+        eval_gradient : bool, default: False
+            If True, the gradient of the log-marginal likelihood with respect
+            to the kernel hyperparameters at position theta is returned
+            additionally. Note that gradient computation is not supported
+            for non-binary classification.
+
+        Returns
+        -------
+        log_likelihood : float
+            Log-marginal likelihood of theta for training data.
+
+        log_likelihood_gradient : array, shape = (n_kernel_params,), optional
+            Gradient of the log-marginal likelihood with respect to the kernel
+            hyperparameters at position theta.
+            Only returned when eval_gradient is True.
+        """
+        if len(self.estimators_) == 1:
+            return self.estimators_[0].log_marginal_likelihood(
+                theta, eval_gradient)
+        else:
+            if eval_gradient:
+                raise NotImplementedError("Gradient of log-marginal-likelhood "
+                    "not implemented for multi-class GPC.")
+            n_dims = self.estimators_[0].kernel_.n_dims
+            return np.mean(
+                [estimator.log_marginal_likelihood(theta[n_dims*i:n_dims*(i+1)])
+                 for i, estimator in enumerate(self.estimators_)])
+
+    # OneVsRestClassifier defines decision_function, but it cannot work here
+    # because BinaryGaussianProcessClassifierLaplace lacks it. Raising
+    # AttributeError on access makes hasattr()-style checks report it absent.
+    @property
+    def decision_function(self):
+        raise AttributeError("decision_function is not available for GPC.")

From bbc6f2472f51e67259c749e6eb209157e1709c4a Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 2 Aug 2015 11:28:02 +0200
Subject: [PATCH 110/165] DOC Adding GPC to plot_classification_probability.py
 example

---
 .../classification/plot_classification_probability.py  | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/examples/classification/plot_classification_probability.py b/examples/classification/plot_classification_probability.py
index 1b768535e4391..3991a70cb4c0f 100644
--- a/examples/classification/plot_classification_probability.py
+++ b/examples/classification/plot_classification_probability.py
@@ -6,7 +6,7 @@
 Plot the classification probability for different classifiers. We use a 3
 class dataset, and we classify it with a Support Vector classifier, L1
 and L2 penalized logistic regression with either a One-Vs-Rest or multinomial
-setting.
+setting, and Gaussian process classification.
 
 The logistic regression is not a multiclass classifier out of the box. As
 a result it can identify only the first class.
@@ -21,6 +21,8 @@ class dataset, and we classify it with a Support Vector classifier, L1
 
 from sklearn.linear_model import LogisticRegression
 from sklearn.svm import SVC
+from sklearn.gaussian_process import GaussianProcessClassifier
+from sklearn.gaussian_process.kernels import RBF
 from sklearn import datasets
 
 iris = datasets.load_iris()
@@ -30,6 +32,7 @@ class dataset, and we classify it with a Support Vector classifier, L1
 n_features = X.shape[1]
 
 C = 1.0
+kernel = 1.0 * RBF([1.0, 1.0])  # for GPC
 
 # Create different classifiers. The logistic regression cannot do
 # multiclass out of the box.
@@ -38,8 +41,9 @@ class dataset, and we classify it with a Support Vector classifier, L1
                'Linear SVC': SVC(kernel='linear', C=C, probability=True,
                                  random_state=0),
                'L2 logistic (Multinomial)': LogisticRegression(
-                C=C, solver='lbfgs', multi_class='multinomial'
-                )}
+                C=C, solver='lbfgs', multi_class='multinomial'),
+               'GPC': GaussianProcessClassifier(kernel)
+               }
 
 n_classifiers = len(classifiers)
 

From 4a79bfe1660bedc9cb278aeb3aa211bafe0069ae Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 2 Aug 2015 12:05:47 +0200
Subject: [PATCH 111/165] TST Testing GPC for multi-class classification
 problems

---
 sklearn/gaussian_process/tests/test_gpc.py | 25 +++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py
index 86a04ee386c61..4d9771e0aca47 100644
--- a/sklearn/gaussian_process/tests/test_gpc.py
+++ b/sklearn/gaussian_process/tests/test_gpc.py
@@ -15,7 +15,7 @@
 
 
 def f(x):
-    return x * np.sin(x)
+    return np.sin(x)
 X = np.atleast_2d(np.linspace(0, 10, 30)).T
 X2 = np.atleast_2d([2., 4., 5.5, 6.5, 7.5]).T
 y = np.array(f(X).ravel() > 0, dtype=int)
@@ -86,7 +86,7 @@ def test_random_starts():
     for n_restarts_optimizer in range(1, 10):
         gp = GaussianProcessClassifier(
             kernel=kernel, n_restarts_optimizer=n_restarts_optimizer,
-            random_state=0,).fit(X, y)
+            random_state=0).fit(X, y)
         lml = gp.log_marginal_likelihood(gp.kernel_.theta)
         assert_greater(lml, last_lml - np.finfo(np.float32).eps)
         last_lml = lml
@@ -112,4 +112,23 @@ def optimizer(obj_func, initial_theta, bounds):
         gpc.fit(X, y)
         # Checks that optimizer improved marginal likelihood
         assert_greater(gpc.log_marginal_likelihood(gpc.kernel_.theta),
-                       gpc.log_marginal_likelihood(gpc.kernel.theta))
+                       gpc.log_marginal_likelihood(kernel.theta))
+
+
+def test_multi_class():
+    """ Test GPC for multi-class classification problems. """
+    fX = f(X).ravel()
+    y_mc = np.empty(y.shape, dtype=int)  # multi-class
+    y_mc[fX < -0.35] = 0
+    y_mc[(fX >= -0.35) & (fX < 0.35)] = 1
+    y_mc[fX > 0.35] = 2
+
+    for kernel in kernels:
+        gpc = GaussianProcessClassifier(kernel=kernel)
+        gpc.fit(X, y_mc)
+
+        y_prob = gpc.predict_proba(X2)
+        assert_almost_equal(y_prob.sum(1), 1)
+
+        y_pred = gpc.predict(X2)
+        assert_array_equal(np.argmax(y_prob, 1), y_pred)

From a5b8db469c9310fd9338f20f444fe37a90d738b8 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Mon, 3 Aug 2015 17:00:43 +0200
Subject: [PATCH 112/165] FIX Several minor changes/fixes in gaussian_process
 module

---
 sklearn/gaussian_process/__init__.py |  3 ++-
 sklearn/gaussian_process/gpc.py      | 32 ++++++++++++++++++++++------
 sklearn/gaussian_process/gpr.py      |  5 ++++-
 3 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/sklearn/gaussian_process/__init__.py b/sklearn/gaussian_process/__init__.py
index 79abc52e51d16..c2e142505007f 100644
--- a/sklearn/gaussian_process/__init__.py
+++ b/sklearn/gaussian_process/__init__.py
@@ -10,7 +10,8 @@
 """
 
 from .gpr import GaussianProcessRegressor
-from .gpc import GaussianProcessClassifier
+from .gpc import (GaussianProcessClassifier,
+    BinaryGaussianProcessClassifierLaplace)
 from . import kernels
 
 from .gaussian_process import GaussianProcess
diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index a0539b61aa46f..2ff2b7b6bc0b7 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -170,7 +170,7 @@ def fit(self, X, y):
                              "classification. y contains classes %s"
                              % self.classes_)
         elif self.classes_.size == 1:
-            warnings.warn("Only one class label (%s) occurrs in training set."
+            warnings.warn("Only one class label (%s) occurs in training set."
                           % self.classes_)
             self.classes_ = np.array([self.classes_[0], self.classes_[0]])
 
@@ -398,8 +398,11 @@ def _posterior_mode(self, K, return_temporaries=False):
 
     def _constrained_optimization(self, obj_func, initial_theta, bounds):
         if self.optimizer == "fmin_l_bfgs_b":
-            theta_opt, func_min, _ = \
+            theta_opt, func_min, convergence_dict = \
                 fmin_l_bfgs_b(obj_func, initial_theta, bounds=bounds)
+            if convergence_dict["warnflag"] != 0:
+                warnings.warn("fmin_l_bfgs_b terminated abnormally with the "
+                              " state: %s" % convergence_dict)
         elif callable(self.optimizer):
             theta_opt, func_min = \
                 self.optimizer(obj_func, initial_theta, bounds=bounds)
@@ -526,8 +529,10 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
         ----------
         theta : array-like, shape = (n_kernel_params,)
             Kernel hyperparameters for which the log-marginal likelihood is
-            evaluated. In the case of multi-class classification, theta must
-            be the  hyperparameters of the compound kernel.
+            evaluated. In the case of multi-class classification, theta may
+            be the  hyperparameters of the compound kernel or of an individual
+            kernel. In the latter case, all individual kernel get assigned the
+            same theta values.
 
         eval_gradient : bool, default: False
             If True, the gradient of the log-marginal likelihood with respect
@@ -545,6 +550,7 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
             hyperparameters at position theta.
             Only returned when eval_gradient is True.
         """
+        theta = np.asarray(theta)
         if len(self.estimators_) == 1:
             return self.estimators_[0].log_marginal_likelihood(
                 theta, eval_gradient)
@@ -553,9 +559,21 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
                 raise NotImplementedError("Gradient of log-marginal-likelhood "
                     "not implemented for multi-class GPC.")
             n_dims = self.estimators_[0].kernel_.n_dims
-            return np.mean(
-                [estimator.log_marginal_likelihood(theta[n_dims*i:n_dims*(i+1)])
-                 for i, estimator in enumerate(self.estimators_)])
+            if theta.shape[0] == n_dims:  # use same theta for all sub-kernels
+                return np.mean(
+                    [estimator.log_marginal_likelihood(theta)
+                    for i, estimator in enumerate(self.estimators_)])
+            elif theta.shape[0] == n_dims * self.classes_.shape[0]:
+                # theta for compound kernel
+                return np.mean(
+                    [estimator.log_marginal_likelihood(
+                        theta[n_dims*i:n_dims*(i+1)])
+                    for i, estimator in enumerate(self.estimators_)])
+            else:
+                raise ValueError("Shape of theta must be either %d or %d. "
+                                 "Obtained theta with shape %d."
+                                 % (n_dims, n_dims * self.classes_.shape[0],
+                                    theta.shape[0]))
 
     # Some code checks simply for the existence of the method decision_function
     # before calling it. However, OneVsRestClassifier implements the method
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 3384a2c7d1d82..5172991381357 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -395,8 +395,11 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
 
     def _constrained_optimization(self, obj_func, initial_theta, bounds):
         if self.optimizer == "fmin_l_bfgs_b":
-            theta_opt, func_min, _ = \
+            theta_opt, func_min, convergence_dict = \
                 fmin_l_bfgs_b(obj_func, initial_theta, bounds=bounds)
+            if convergence_dict["warnflag"] != 0:
+                warnings.warn("fmin_l_bfgs_b terminated abnormally with the "
+                              " state: %s" % convergence_dict)
         elif callable(self.optimizer):
             theta_opt, func_min = \
                 self.optimizer(obj_func, initial_theta, bounds=bounds)

From f943b5dfac9c04d4c944a04ddda4a4006ecc3fa6 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Mon, 3 Aug 2015 17:04:31 +0200
Subject: [PATCH 113/165] FIX Using six.string_types instead of basestring
 (python3 compatibility)

---
 sklearn/gaussian_process/kernels.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 4aa45b4d8c97d..fdb3c5ab685dc 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -96,7 +96,8 @@ def theta(self):
         """
         theta = []
         for var_name in self.theta_vars:
-            if not isinstance(var_name, basestring):  # vector-valued parameter
+            if not isinstance(var_name, six.string_types):
+                # vector-valued parameter
                 var_name, _ = var_name
             theta.append(getattr(self, var_name))
         if len(theta) > 0:
@@ -115,7 +116,8 @@ def theta(self, theta):
         """
         i = 0
         for var_name in self.theta_vars:
-            if not isinstance(var_name, basestring):  # vector-valued parameter
+            if not isinstance(var_name, six.string_types):
+                # vector-valued parameter
                 var_name, var_length = var_name
                 setattr(self, var_name, np.exp(theta[i:i + var_length]))
                 i += var_length
@@ -139,7 +141,8 @@ def bounds(self):
         """
         bounds = []
         for var_name in self.theta_vars:
-            if not isinstance(var_name, basestring):  # vector-valued parameter
+            if not isinstance(var_name, six.string_types):
+                # vector-valued parameter
                 var_name, var_length = var_name
                 var_bounds = np.atleast_2d(getattr(self, var_name + "_bounds"))
                 if var_bounds.shape[0] == 1:
@@ -165,7 +168,8 @@ def bounds(self, bounds):
         """
         i = 0
         for var_name in self.theta_vars:
-            if not isinstance(var_name, basestring):  # vector-valued parameter
+            if not isinstance(var_name, six.string_types):
+                # vector-valued parameter
                 var_name, var_length = var_name
                 setattr(self, var_name + "_bounds",
                         np.exp(bounds[i:i + var_length]))

From 6a931909acd0c62ad7c524e967be6dd65d286921 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Mon, 3 Aug 2015 17:05:10 +0200
Subject: [PATCH 114/165] FIX Excluding WhiteKernel from one unit-test

---
 sklearn/gaussian_process/tests/test_kernels.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index fe4f7e28ca138..560d2847418be 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -24,10 +24,11 @@
 
 X = np.random.RandomState(0).normal(0, 1, (10, 2))
 
+kernel_white = RBF(l=2.0) + WhiteKernel(c=3.0)
 kernels = [RBF(l=2.0), RBF(l_bounds=(0.5, 2.0)),
            ConstantKernel(c=10.0),
            2.0 * RBF(l=0.33, l_bounds="fixed"),
-           2.0 * RBF(l=0.5), RBF(l=2.0) + WhiteKernel(c=3.0),
+           2.0 * RBF(l=0.5), kernel_white,
            2.0 * RBF(l=[0.5, 2.0]),
            2.0 * Matern(l=0.33, l_bounds="fixed"),
            2.0 * Matern(l=0.5, nu=0.5),
@@ -123,6 +124,8 @@ def test_kernel_theta():
 def test_auto_vs_cross():
     """ Auto-correlation and cross-correlation should be consistent. """
     for kernel in kernels:
+        if kernel == kernel_white:
+            continue  # Identity does is not satisfied on diagonal
         K_auto = kernel(X)
         K_cross = kernel(X, X)
         assert_almost_equal(K_auto, K_cross, 5)

From 1a42e7619897254e0a79d0e24a958b7a4d5c6345 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Mon, 3 Aug 2015 17:06:04 +0200
Subject: [PATCH 115/165] TST Extending testing of multi-class GP

---
 sklearn/gaussian_process/tests/test_gpc.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py
index 4d9771e0aca47..025531a592a44 100644
--- a/sklearn/gaussian_process/tests/test_gpc.py
+++ b/sklearn/gaussian_process/tests/test_gpc.py
@@ -19,6 +19,11 @@ def f(x):
 X = np.atleast_2d(np.linspace(0, 10, 30)).T
 X2 = np.atleast_2d([2., 4., 5.5, 6.5, 7.5]).T
 y = np.array(f(X).ravel() > 0, dtype=int)
+fX = f(X).ravel()
+y_mc = np.empty(y.shape, dtype=int)  # multi-class
+y_mc[fX < -0.35] = 0
+y_mc[(fX >= -0.35) & (fX < 0.35)] = 1
+y_mc[fX > 0.35] = 2
 
 
 kernels = [RBF(l=0.1), RBF(l=1.0, l_bounds=(1e-3, 1e3)),
@@ -109,7 +114,7 @@ def optimizer(obj_func, initial_theta, bounds):
 
     for kernel in kernels:
         gpc = GaussianProcessClassifier(kernel=kernel, optimizer=optimizer)
-        gpc.fit(X, y)
+        gpc.fit(X, y_mc)
         # Checks that optimizer improved marginal likelihood
         assert_greater(gpc.log_marginal_likelihood(gpc.kernel_.theta),
                        gpc.log_marginal_likelihood(kernel.theta))
@@ -117,12 +122,6 @@ def optimizer(obj_func, initial_theta, bounds):
 
 def test_multi_class():
     """ Test GPC for multi-class classification problems. """
-    fX = f(X).ravel()
-    y_mc = np.empty(y.shape, dtype=int)  # multi-class
-    y_mc[fX < -0.35] = 0
-    y_mc[(fX >= -0.35) & (fX < 0.35)] = 1
-    y_mc[fX > 0.35] = 2
-
     for kernel in kernels:
         gpc = GaussianProcessClassifier(kernel=kernel)
         gpc.fit(X, y_mc)

From 089aab4e853f72c438f23abf0106b47ceea2fdfc Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Mon, 3 Aug 2015 17:07:56 +0200
Subject: [PATCH 116/165] DOC Updated documentation of Gaussian process
 classifier

---
 doc/modules/classes.rst          |  2 ++
 doc/modules/gaussian_process.rst | 20 ++++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index ed8e937cbdcc9..6ff3082317085 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -492,6 +492,7 @@ From text
 
   gaussian_process.GaussianProcessRegressor
   gaussian_process.GaussianProcessClassifier
+  gaussian_process.BinaryGaussianProcessClassifierLaplace
 
 Kernels:
 
@@ -510,6 +511,7 @@ Kernels:
   gaussian_process.kernels.ExpSineSquared
   gaussian_process.kernels.DotProduct
   gaussian_process.kernels.PairwiseKernel
+  gaussian_process.kernels.CompoundKernel
 
 
 .. _grid_search_ref:
diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst
index 66abe047e085c..6f4e26df59f28 100644
--- a/doc/modules/gaussian_process.rst
+++ b/doc/modules/gaussian_process.rst
@@ -254,6 +254,13 @@ that have been chosen randomly from the range of allowed values.
 If the initial hyperparameters should be kept fixed, `None` can be passed as
 optimizer.
 
+:class:`GaussianProcessClassifier` supports multi-class classification
+by performing one-versus-rest based training and prediction based on :class:`BinaryGaussianProcessClassifierLaplace`, which implements the Laplace
+approximation of the posterior discussed above. Note that
+:class:`GaussianProcessClassifier` thus does not implement
+a true multi-class Laplace approximation, but is based on several binary
+classification tasks.
+
 GPC examples
 ============
 
@@ -302,6 +309,19 @@ Illustration of GPC on the XOR dataset
    :align: center
 
 
+Gaussian process classification (GPC) on iris dataset
+-----------------------------------------------------
+
+This example illustrates the predicted probability of GPC for an isotropic
+and anisotropic RBF kernel on a two-dimensional version for the iris-dataset.
+The anisotropic RBF kernel obtains slightly higher log-marginal-likelihood by
+assigning different length-scales to the two feature dimensions.
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpc_xor_001.png
+   :target: ../auto_examples/gaussian_process/plot_gpc_xor.html
+   :align: center
+
+
 .. _gp_kernels:
 
 Kernels for Gaussian Processes

From b447daef1802de8141b89acf9e9c7cc3107bdbc7 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Mon, 3 Aug 2015 17:08:24 +0200
Subject: [PATCH 117/165] ADD Example illustrating Gaussian process classifier
 on iris dataset

---
 examples/gaussian_process/plot_gpc_iris.py    | 64 +++++++++++++++++++
 .../plot_gpc_isoprobability.py                |  2 +-
 2 files changed, 65 insertions(+), 1 deletion(-)
 create mode 100644 examples/gaussian_process/plot_gpc_iris.py

diff --git a/examples/gaussian_process/plot_gpc_iris.py b/examples/gaussian_process/plot_gpc_iris.py
new file mode 100644
index 0000000000000..15f292be4fbef
--- /dev/null
+++ b/examples/gaussian_process/plot_gpc_iris.py
@@ -0,0 +1,64 @@
+"""
+=====================================================
+Gaussian process classification (GPC) on iris dataset
+=====================================================
+
+This example illustrates the predicted probability of GPC for an isotropic
+and anisotropic RBF kernel on a two-dimensional version for the iris-dataset.
+The anisotropic RBF kernel obtains slightly higher log-marginal-likelihood by
+assigning different length-scales to the two feature dimensions.
+"""
+print(__doc__)
+
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn import datasets
+from sklearn.multiclass import OneVsRestClassifier
+from sklearn.gaussian_process import GaussianProcessClassifier
+from sklearn.gaussian_process.kernels import RBF
+
+# import some data to play with
+iris = datasets.load_iris()
+X = iris.data[:, :2]  # we only take the first two features. We could
+                      # avoid this ugly slicing by using a two-dim dataset
+y = np.array(iris.target, dtype=int)
+
+h = .02  # step size in the mesh
+
+kernel = 1.0 * RBF([1.0])
+gpc_rbf_isotropic = GaussianProcessClassifier(kernel=kernel).fit(X, y)
+kernel = 1.0 * RBF([1.0, 1.0])
+gpc_rbf_anisotropic = GaussianProcessClassifier(kernel=kernel).fit(X, y)
+
+# create a mesh to plot in
+x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
+y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
+xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
+                     np.arange(y_min, y_max, h))
+
+titles = ["Isotropic RBF", "Anisotropic RBF"]
+plt.figure(figsize=(10, 5))
+for i, clf in enumerate((gpc_rbf_isotropic, gpc_rbf_anisotropic)):
+    # Plot the predicted probabilities. For that, we will assign a color to
+    # each point in the mesh [x_min, m_max]x[y_min, y_max].
+    plt.subplot(1, 2, i + 1)
+
+    Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])
+
+    # Put the result into a color plot
+    Z = Z.reshape((xx.shape[0], xx.shape[1], 3))
+    plt.imshow(Z, extent=(x_min, x_max, y_min, y_max), origin="lower")
+
+    # Plot also the training points
+    plt.scatter(X[:, 0], X[:, 1], c=np.array(["r", "g", "b"])[y])
+    plt.xlabel('Sepal length')
+    plt.ylabel('Sepal width')
+    plt.xlim(xx.min(), xx.max())
+    plt.ylim(yy.min(), yy.max())
+    plt.xticks(())
+    plt.yticks(())
+    plt.title("%s, LML: %.3f" %
+        (titles[i], clf.log_marginal_likelihood(clf.kernel_.theta)))
+
+plt.tight_layout()
+plt.show()
diff --git a/examples/gaussian_process/plot_gpc_isoprobability.py b/examples/gaussian_process/plot_gpc_isoprobability.py
index a3bcc083f2314..44c8969f52792 100644
--- a/examples/gaussian_process/plot_gpc_isoprobability.py
+++ b/examples/gaussian_process/plot_gpc_isoprobability.py
@@ -6,7 +6,7 @@
 Iso-probability lines for Gaussian Processes classification (GPC)
 =================================================================
 
-A two-dimensional classification exampe showing iso-probability lines for
+A two-dimensional classification example showing iso-probability lines for
 the predicted probabilities.
 """
 print(__doc__)

From 9a9b1ce23b5fc054a6581704c6a84e1990c0ade7 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Mon, 3 Aug 2015 20:32:45 +0200
Subject: [PATCH 118/165] TST Fixing some GP-related issues in estimator
 checks

---
 sklearn/utils/estimator_checks.py | 5 +++--
 sklearn/utils/testing.py          | 3 ++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 3255d9c58790a..46df03ee4a04d 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -127,8 +127,9 @@ def _yield_regressor_checks(name, Regressor):
     if name != 'CCA':
         # check that the regressor handles int input
         yield check_regressors_int
-    # Test if NotFittedError is raised
-    yield check_estimators_unfitted
+    if name != "GaussianProcessRegressor":
+        # Test if NotFittedError is raised
+        yield check_estimators_unfitted
 
 
 def _yield_transformer_checks(name, Transformer):
diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py
index 65c362719e767..bbf83715b4657 100644
--- a/sklearn/utils/testing.py
+++ b/sklearn/utils/testing.py
@@ -546,7 +546,8 @@ def uninstall_mldata_mock():
              'ZeroEstimator', 'ScaledLogOddsEstimator',
              'QuantileEstimator', 'MeanEstimator',
              'LogOddsEstimator', 'PriorProbabilityEstimator',
-             '_SigmoidCalibration', 'VotingClassifier']
+             '_SigmoidCalibration', 'VotingClassifier',
+             'BinaryGaussianProcessClassifierLaplace']
 
 
 def all_estimators(include_meta_estimators=False,

From ff0df69a7b4e1e7fa11c6aa3f2bb028dbdb2b1e7 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Mon, 3 Aug 2015 20:33:16 +0200
Subject: [PATCH 119/165] TST Fix GPC and GPR such that all tests are ok

---
 sklearn/gaussian_process/gpc.py | 44 +++++++++++++++++++++++++++------
 sklearn/gaussian_process/gpr.py |  5 ++--
 2 files changed, 40 insertions(+), 9 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 2ff2b7b6bc0b7..79d0d3d36331e 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -137,7 +137,7 @@ def __init__(self, kernel=None, jitter=0.0, optimizer="fmin_l_bfgs_b",
         self.rng = check_random_state(self.random_state)
 
     def fit(self, X, y):
-        """Fit Gaussian process regression model
+        """Fit Gaussian process classification model
 
         Parameters
         ----------
@@ -156,7 +156,7 @@ def fit(self, X, y):
         else:
             self.kernel_ = clone(self.kernel)
 
-        X, y = check_X_y(X, y)
+        X, y = check_X_y(X, y, multi_output=False)
 
         self.X_fit_ = X
 
@@ -166,13 +166,12 @@ def fit(self, X, y):
         self.y_fit_ = label_encoder.fit_transform(y)
         self.classes_ = label_encoder.classes_
         if self.classes_.size > 2:
-            raise ValueError("GaussianProcessClassifier supports only binary "
-                             "classification. y contains classes %s"
+            raise ValueError("BinaryGaussianProcessClassifierLaplace supports "
+                             "only binary classification.y contains classes %s"
                              % self.classes_)
         elif self.classes_.size == 1:
-            warnings.warn("Only one class label (%s) occurs in training set."
-                          % self.classes_)
-            self.classes_ = np.array([self.classes_[0], self.classes_[0]])
+            raise ValueError("{0:s} requires 2 classes.".format(
+                self.__class__.__name__))
 
         if self.kernel_.n_dims == 0:  # no tunable hyperparameters
             pass
@@ -511,6 +510,37 @@ def __init__(self, kernel=None, jitter=0.0, optimizer="fmin_l_bfgs_b",
         super(GaussianProcessClassifier, self).__init__(
             self.base_estimator, n_jobs)
 
+        self.jitter = jitter
+        self.optimizer = optimizer
+        self.n_restarts_optimizer = n_restarts_optimizer
+        self.warm_start = warm_start
+        self.random_state = random_state
+        self.n_jobs = n_jobs
+
+    def fit(self, X, y):
+        """Fit Gaussian process classification model
+
+        Parameters
+        ----------
+        X : array-like, shape = (n_samples, n_features)
+            Training data
+
+        y : array-like, shape = (n_samples,)
+            Target values, must be binary
+
+        Returns
+        -------
+        self : returns an instance of self.
+        """
+        X, y = check_X_y(X, y, multi_output=False)
+
+        if np.unique(y).size == 1:
+            raise ValueError("GaussianProcessClassifier requires 2 or more "
+                "distinct classes. Only class %s present." % np.unique(y)[0])
+
+        return super(GaussianProcessClassifier, self).fit(X, y)
+
+
     @property
     def kernel_(self):
         if len(self.estimators_) == 1:
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 5172991381357..cab4917823f65 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -12,7 +12,7 @@
 from scipy.optimize import fmin_l_bfgs_b
 
 from sklearn.base import BaseEstimator, RegressorMixin, clone
-from sklearn.gaussian_process.kernels import RBF
+from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
 from sklearn.utils import check_random_state
 from sklearn.utils.validation import check_X_y, check_array
 
@@ -137,7 +137,8 @@ def fit(self, X, y):
         self : returns an instance of self.
         """
         if self.kernel is None:  # Use an RBF kernel as default
-            self.kernel_ = 1.0 * RBF(1.0)
+            self.kernel_ = \
+                C(1.0, c_bounds="fixed") * RBF(1.0, l_bounds="fixed")
         else:
             self.kernel_ = clone(self.kernel)
 

From 1085ff05d0fe3afe98cd45e10c7c3c2a1cf13289 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Mon, 3 Aug 2015 20:57:12 +0200
Subject: [PATCH 120/165] TST Further GP-related fixes

---
 sklearn/gaussian_process/gpc.py                | 2 +-
 sklearn/gaussian_process/tests/test_kernels.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 79d0d3d36331e..516129d137347 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -610,4 +610,4 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
     # but raises an Exception because BinaryGaussianProcessClassifierLaplace
     # does not implement it. We thus raise an AttributeError since calling the
     # method would always fail.
-    decision_function = property(lambda: AttributeError)
+    decision_function = property(lambda *args, **kwargs: AttributeError)
diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index 560d2847418be..0d74dd7818aab 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -83,7 +83,7 @@ def test_kernel_theta():
             inspect.getargspec(kernel.__class__.__init__)
         theta_vars = map(lambda s: s.rstrip("_bounds"),
                          filter(lambda s: s.endswith("_bounds"), args))
-        assert_equal(kernel.theta_vars, theta_vars)
+        assert_equal(kernel.theta_vars, list(theta_vars))
 
         # Check that values returned in theta are consistent with
         # hyperparameter values (being their logarithms)

From e27c23adb4b6f8bb33784021b1de4893cb0d6011 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 4 Aug 2015 07:45:45 +0200
Subject: [PATCH 121/165] FIX Hiding method decision_function in
 GaussianProcessClassifier

---
 sklearn/gaussian_process/gpc.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 516129d137347..5e28f19d829f8 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -606,8 +606,13 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
                                     theta.shape[0]))
 
     # Some code checks simply for the existence of the method decision_function
-    # before calling it. However, OneVsRestClassifier implements the method
-    # but raises an Exception because BinaryGaussianProcessClassifierLaplace
+    # before calling it. However, OneVsRestClassifier has the method but raises
+    # always an Exception because BinaryGaussianProcessClassifierLaplace
     # does not implement it. We thus raise an AttributeError since calling the
     # method would always fail.
-    decision_function = property(lambda *args, **kwargs: AttributeError)
+    def __getattribute__(self, name):
+        if name in ['decision_function']:
+            raise AttributeError("decision_function not available for "
+                "GaussianProcessClassifier")
+        return super(GaussianProcessClassifier, self).__getattribute__(name)
+

From 71ac5900ed96438c431d5645c177790fa57b97d1 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 4 Aug 2015 08:05:10 +0200
Subject: [PATCH 122/165] TST GaussianProcessRegressor added to MULTI_OUTPUT
 estimators

---
 sklearn/utils/estimator_checks.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 46df03ee4a04d..6dd7ce2ea0dc5 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -50,6 +50,7 @@
 CROSS_DECOMPOSITION = ['PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD']
 MULTI_OUTPUT = ['CCA', 'DecisionTreeRegressor', 'ElasticNet',
                 'ExtraTreeRegressor', 'ExtraTreesRegressor', 'GaussianProcess',
+                'GaussianProcessRegressor',
                 'KNeighborsRegressor', 'KernelRidge', 'Lars', 'Lasso',
                 'LassoLars', 'LinearRegression', 'MultiTaskElasticNet',
                 'MultiTaskElasticNetCV', 'MultiTaskLasso', 'MultiTaskLassoCV',

From fea47351b74a11f1b1c651f7cb9810db49f8d08b Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 4 Aug 2015 09:18:24 +0200
Subject: [PATCH 123/165] PEP8 Fixing PEP8 issues in gaussian_process package

---
 sklearn/gaussian_process/__init__.py          | 13 ++++++-----
 sklearn/gaussian_process/gpc.py               | 22 ++++++++++---------
 sklearn/gaussian_process/kernels.py           | 14 +++++-------
 sklearn/gaussian_process/tests/test_gpc.py    |  7 +++---
 .../gaussian_process/tests/test_kernels.py    |  5 +++--
 5 files changed, 32 insertions(+), 29 deletions(-)

diff --git a/sklearn/gaussian_process/__init__.py b/sklearn/gaussian_process/__init__.py
index c2e142505007f..01932219799c0 100644
--- a/sklearn/gaussian_process/__init__.py
+++ b/sklearn/gaussian_process/__init__.py
@@ -1,21 +1,24 @@
 # -*- coding: utf-8 -*-
 
-# Author: Vincent Dubourg <vincent.dubourg@gmail.com>
+# Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
+#         Vincent Dubourg <vincent.dubourg@gmail.com>
 #         (mostly translation, see implementation details)
 # Licence: BSD 3 clause
 
 """
-The :mod:`sklearn.gaussian_process` module implements scalar Gaussian Process
-based predictions.
+The :mod:`sklearn.gaussian_process` module implements Gaussian Process
+based regression and classification.
 """
 
 from .gpr import GaussianProcessRegressor
 from .gpc import (GaussianProcessClassifier,
-    BinaryGaussianProcessClassifierLaplace)
+                  BinaryGaussianProcessClassifierLaplace)
 from . import kernels
 
 from .gaussian_process import GaussianProcess
 from . import correlation_models
 from . import regression_models
 
-__all__ = ['GaussianProcess', 'correlation_models', 'regression_models']
+__all__ = ['GaussianProcess', 'correlation_models', 'regression_models',
+           'GaussianProcessRegressor', 'GaussianProcessClassifier',
+           'BinaryGaussianProcessClassifierLaplace', 'kernels']
diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 5e28f19d829f8..c84a8130db6e6 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -41,7 +41,8 @@ class BinaryGaussianProcessClassifierLaplace(BaseEstimator, ClassifierMixin):
     Internally, the Laplace approximation is used for approximating the
     non-Gaussian posterior by a Gaussian.
 
-    Currently, the implementation is restricted to using the logistic link function
+    Currently, the implementation is restricted to using the logistic link
+    function.
 
     Parameters
     ----------
@@ -207,7 +208,8 @@ def obj_func(theta, eval_gradient=True):
                                                        bounds))
             # Select result from run with minimal (negative) log-marginal
             # likelihood
-            self.kernel_.theta = optima[np.argmin(map(itemgetter(1), optima))][0]
+            self.kernel_.theta = \
+                optima[np.argmin(map(itemgetter(1), optima))][0]
 
         # Precompute quantities required for predictions which are independent
         # of actual query points
@@ -536,11 +538,11 @@ def fit(self, X, y):
 
         if np.unique(y).size == 1:
             raise ValueError("GaussianProcessClassifier requires 2 or more "
-                "distinct classes. Only class %s present." % np.unique(y)[0])
+                             "distinct classes. Only class %s present."
+                             % np.unique(y)[0])
 
         return super(GaussianProcessClassifier, self).fit(X, y)
 
-
     @property
     def kernel_(self):
         if len(self.estimators_) == 1:
@@ -586,19 +588,20 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
                 theta, eval_gradient)
         else:
             if eval_gradient:
-                raise NotImplementedError("Gradient of log-marginal-likelhood "
-                    "not implemented for multi-class GPC.")
+                raise NotImplementedError(
+                    "Gradient of log-marginal-likelhood not implemented for "
+                    "multi-class GPC.")
             n_dims = self.estimators_[0].kernel_.n_dims
             if theta.shape[0] == n_dims:  # use same theta for all sub-kernels
                 return np.mean(
                     [estimator.log_marginal_likelihood(theta)
-                    for i, estimator in enumerate(self.estimators_)])
+                     for i, estimator in enumerate(self.estimators_)])
             elif theta.shape[0] == n_dims * self.classes_.shape[0]:
                 # theta for compound kernel
                 return np.mean(
                     [estimator.log_marginal_likelihood(
                         theta[n_dims*i:n_dims*(i+1)])
-                    for i, estimator in enumerate(self.estimators_)])
+                     for i, estimator in enumerate(self.estimators_)])
             else:
                 raise ValueError("Shape of theta must be either %d or %d. "
                                  "Obtained theta with shape %d."
@@ -613,6 +616,5 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
     def __getattribute__(self, name):
         if name in ['decision_function']:
             raise AttributeError("decision_function not available for "
-                "GaussianProcessClassifier")
+                                 "GaussianProcessClassifier")
         return super(GaussianProcessClassifier, self).__getattribute__(name)
-
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index fdb3c5ab685dc..8f262031fe4da 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -912,10 +912,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             else:
                 return K
         else:
-            K = np.zeros((X.shape[0], Y.shape[0]))
-            # entries which are sufficiently similar to be considered identical
-            #K[cdist(X, Y) < 1e-10] = self.c
-            return K
+            return np.zeros((X.shape[0], Y.shape[0]))
 
     def diag(self, X):
         """Returns the diagonal of the kernel k(X, X).
@@ -1162,11 +1159,10 @@ def __call__(self, X, Y=None, eval_gradient=False):
                 return K, K_gradient
 
             # We need to recompute the pairwise dimension-wise distances
-            D = (X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2 \
-                    / (self.l ** 2)
+            D = (X[:, np.newaxis, :] - X[np.newaxis, :, :])**2 / (self.l ** 2)
             if self.nu == 0.5:
                 K_gradient = K[..., np.newaxis] * D \
-                    /  np.sqrt(D.sum(2))[:, :, np.newaxis]
+                    / np.sqrt(D.sum(2))[:, :, np.newaxis]
                 K_gradient[~np.isfinite(K_gradient)] = 0
             elif self.nu == 1.5:
                 K_gradient = \
@@ -1404,8 +1400,8 @@ class DotProduct(Kernel):
 
     The DotProduct kernel is non-stationary and can be obtained from linear
     regression by putting N(0, 1) priors on the coefficients of x_d (d = 1, . .
-    . , D) and a prior of N(0, \sigma_0^2) on the bias. The DotProduct kernel is
-    invariant to a rotation of the coordinates about the origin, but not
+    . , D) and a prior of N(0, \sigma_0^2) on the bias. The DotProduct kernel
+    is invariant to a rotation of the coordinates about the origin, but not
     translations. It is parameterized by a parameter sigma_0^2. For
     sigma_0^2 =0, the kernel is called the homogeneous linear kernel, otherwise
     it is inhomogeneous. The kernel is given by
diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py
index 025531a592a44..0f66f37dcb221 100644
--- a/sklearn/gaussian_process/tests/test_gpc.py
+++ b/sklearn/gaussian_process/tests/test_gpc.py
@@ -27,7 +27,7 @@ def f(x):
 
 
 kernels = [RBF(l=0.1), RBF(l=1.0, l_bounds=(1e-3, 1e3)),
-           C(1.0, (1e-2, 1e2)) *  RBF(l=1.0, l_bounds=(1e-3, 1e3))]
+           C(1.0, (1e-2, 1e2)) * RBF(l=1.0, l_bounds=(1e-3, 1e3))]
 
 
 def test_predict_consistent():
@@ -36,7 +36,7 @@ def test_predict_consistent():
     for kernel in kernels:
         gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
         assert_array_equal(gpc.predict(X),
-                           gpc.predict_proba(X)[:, 1] >=0.5)
+                           gpc.predict_proba(X)[:, 1] >= 0.5)
 
 
 def test_lml_improving():
@@ -48,7 +48,7 @@ def test_lml_improving():
 
 
 def test_converged_to_local_maximum():
-    """ Test that we are in local maximum after hyperparameter-optimization. """
+    """ Test that we are in local maximum after hyperparameter-optimization."""
     for kernel in kernels:
         gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
 
@@ -74,6 +74,7 @@ def test_lml_gradient():
 
         assert_almost_equal(lml_gradient, lml_gradient_approx, 3)
 
+
 def test_random_starts():
     """
     Test that an increasing number of random-starts of GP fitting only
diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index 0d74dd7818aab..21512f9c74161 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -19,7 +19,8 @@
 from sklearn.base import clone
 
 from sklearn.utils.testing import (assert_equal, assert_almost_equal,
-    assert_not_equal, assert_array_equal, assert_array_almost_equal)
+                                   assert_not_equal, assert_array_equal,
+                                   assert_array_almost_equal)
 
 
 X = np.random.RandomState(0).normal(0, 1, (10, 2))
@@ -73,7 +74,7 @@ def test_kernel_theta():
     """ Check that parameter vector theta of kernel is set correctly. """
     for kernel in kernels:
         if isinstance(kernel, KernelOperator) \
-            or isinstance(kernel, Exponentiation):  # skip non-basic kernels
+           or isinstance(kernel, Exponentiation):  # skip non-basic kernels
             continue
         theta = kernel.theta
         _, K_gradient = kernel(X, eval_gradient=True)

From 75f105319e07a1d29006ba2fe52ff9c84a22a3f3 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 4 Aug 2015 18:29:16 +0800
Subject: [PATCH 124/165] ENH X_train_ can be copied if requested

---
 sklearn/gaussian_process/gpc.py | 51 +++++++++++++++++++--------------
 sklearn/gaussian_process/gpr.py | 43 +++++++++++++++------------
 2 files changed, 55 insertions(+), 39 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index c84a8130db6e6..85e604a7948da 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -96,6 +96,12 @@ def optimizer(obj_func, initial_theta, bounds):
         up convergence when _posterior_mode is called several times on similar
         problems as in hyperparameter optimization.
 
+    copy_X_train : bool, optional (default: False)
+        If True, a persistent copy of the training data is stored in the
+        object. Otherwise, just a reference to the training data is stored,
+        which might cause predictions to change if the data is modified
+        externally.
+
     random_state : integer or numpy.RandomState, optional
         The generator used to initialize the centers. If an integer is
         given, it fixes the seed. Defaults to the global numpy random
@@ -103,10 +109,10 @@ def optimizer(obj_func, initial_theta, bounds):
 
     Attributes
     ----------
-    X_fit_ : array-like, shape = (n_samples, n_features)
+    X_train_ : array-like, shape = (n_samples, n_features)
         Feature values in training data (also required for prediction)
 
-    y_fit_: array-like, shape = (n_samples,)
+    y_train_: array-like, shape = (n_samples,)
         Target values in training data (also required for prediction)
 
     classes_ : array-like, shape = (n_classes,)
@@ -117,10 +123,11 @@ def optimizer(obj_func, initial_theta, bounds):
         same as the one passed as parameter but with optimized hyperparameters
 
     L_: array-like, shape = (n_samples, n_samples)
-        Lower-triangular Cholesky decomposition of the kernel in X_fit_
+        Lower-triangular Cholesky decomposition of the kernel in X_train_
 
     pi_: array-like, shape = (n_samples,)
-        The probabilities of the positive class for the training points X_fit_
+        The probabilities of the positive class for the training points
+        X_train_
 
     W_sr_: array-like, shape = (n_samples,)
         Square root of W, the Hessian of log-likelihood of the latent function
@@ -128,12 +135,14 @@ def optimizer(obj_func, initial_theta, bounds):
         of sqrt(W) is stored.
     """
     def __init__(self, kernel=None, jitter=0.0, optimizer="fmin_l_bfgs_b",
-                 n_restarts_optimizer=1, warm_start=False, random_state=None):
+                 n_restarts_optimizer=1, warm_start=False,
+                 copy_X_train=False, random_state=None):
         self.kernel = kernel
         self.jitter = jitter
         self.optimizer = optimizer
         self.n_restarts_optimizer = n_restarts_optimizer
         self.warm_start = warm_start
+        self.copy_X_train = copy_X_train
         self.random_state = random_state
         self.rng = check_random_state(self.random_state)
 
@@ -159,12 +168,12 @@ def fit(self, X, y):
 
         X, y = check_X_y(X, y, multi_output=False)
 
-        self.X_fit_ = X
+        self.X_train_ = np.copy(X) if self.copy_X_train else X
 
         # Encode class labels and check that it is a binary classification
         # problem
         label_encoder = LabelEncoder()
-        self.y_fit_ = label_encoder.fit_transform(y)
+        self.y_train_ = label_encoder.fit_transform(y)
         self.classes_ = label_encoder.classes_
         if self.classes_.size > 2:
             raise ValueError("BinaryGaussianProcessClassifierLaplace supports "
@@ -213,7 +222,7 @@ def obj_func(theta, eval_gradient=True):
 
         # Precompute quantities required for predictions which are independent
         # of actual query points
-        K = self.kernel_(self.X_fit_)
+        K = self.kernel_(self.X_train_)
         K[np.diag_indices_from(K)] += self.jitter
 
         _, (self.pi_, self.W_sr_, self.L_, _, _) = \
@@ -233,14 +242,14 @@ def predict(self, X):
         C : array, shape = (n_samples,)
             Predicted target values for X, values are from classes_
         """
-        check_is_fitted(self, ["X_fit_", "y_fit_", "pi_", "W_sr_", "L_"])
+        check_is_fitted(self, ["X_train_", "y_train_", "pi_", "W_sr_", "L_"])
         X = check_array(X)
 
         # As discussed on Section 3.4.2 of GPML, for making hard binary
         # decisions, it is enough to compute the MAP of the posterior and
         # pass it through the link function
-        K_star = self.kernel_(self.X_fit_, X)  # K_star =k(x_star)
-        f_star = K_star.T.dot(self.y_fit_ - self.pi_)  # Line 4 (Algorithm 3.2)
+        K_star = self.kernel_(self.X_train_, X)  # K_star =k(x_star)
+        f_star = K_star.T.dot(self.y_train_ - self.pi_)  # Algorithm 3.2,Line 4
 
         return np.where(f_star > 0, self.classes_[1], self.classes_[0])
 
@@ -258,12 +267,12 @@ def predict_proba(self, X):
             the model. The columns correspond to the classes in sorted
             order, as they appear in the attribute `classes_`.
         """
-        check_is_fitted(self, ["X_fit_", "y_fit_", "pi_", "W_sr_", "L_"])
+        check_is_fitted(self, ["X_train_", "y_train_", "pi_", "W_sr_", "L_"])
         X = check_array(X)
 
         # Based on Algorithm 3.2 of GPML
-        K_star = self.kernel_(self.X_fit_, X)  # K_star =k(x_star)
-        f_star = K_star.T.dot(self.y_fit_ - self.pi_)  # Line 4
+        K_star = self.kernel_(self.X_train_, X)  # K_star =k(x_star)
+        f_star = K_star.T.dot(self.y_train_ - self.pi_)  # Line 4
         v = solve(self.L_, self.W_sr_[:, np.newaxis] * K_star)  # Line 5
         # Line 6 (compute np.diag(v.T.dot(v)) via einsum)
         var_f_star = self.kernel_.diag(X) - np.einsum("ij,ij->j", v, v)
@@ -311,9 +320,9 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
         kernel = self.kernel_.clone_with_theta(theta)
 
         if eval_gradient:
-            K, K_gradient = kernel(self.X_fit_, eval_gradient=True)
+            K, K_gradient = kernel(self.X_train_, eval_gradient=True)
         else:
-            K = kernel(self.X_fit_)
+            K = kernel(self.X_train_)
 
         K[np.diag_indices_from(K)] += self.jitter
 
@@ -337,7 +346,7 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
             C = K_gradient[:, :, j]   # Line 11
             s_1 = .5 * a.T.dot(C).dot(a) - .5 * np.trace(R.dot(C))  # Line 12
 
-            b = C.dot(self.y_fit_ - pi)  # Line 13
+            b = C.dot(self.y_train_ - pi)  # Line 13
             s_3 = b - K.dot(R).dot(b)  # Line 14
 
             d_Z[j] = s_1 + s_2.T.dot(s_3)  # Line 15
@@ -356,10 +365,10 @@ def _posterior_mode(self, K, return_temporaries=False):
         # If warm_start are enabled, we reuse the last solution for the
         # posterior mode as initialization; otherwise, we initialize with 0
         if self.warm_start and hasattr(self, "f_cached") \
-           and self.f_cached.shape == self.y_fit_.shape:
+           and self.f_cached.shape == self.y_train_.shape:
             f = self.f_cached
         else:
-            f = np.zeros_like(self.y_fit_, dtype=np.float64)
+            f = np.zeros_like(self.y_train_, dtype=np.float64)
 
         # Use Newton's iteration method to find mode of Laplace approximation
         log_marginal_likelihood = -np.inf
@@ -373,7 +382,7 @@ def _posterior_mode(self, K, return_temporaries=False):
             B = np.eye(W.shape[0]) + W_sr_K * W_sr
             L = cholesky(B, lower=True)
             # Line 6
-            b = W * f + (self.y_fit_ - pi)
+            b = W * f + (self.y_train_ - pi)
             # Line 7
             a = b - W_sr * cho_solve((L, True), W_sr_K.dot(b))
             # Line 8
@@ -382,7 +391,7 @@ def _posterior_mode(self, K, return_temporaries=False):
             # Line 10: Compute log marginal likelihood in loop and use as
             #          convergence criterion
             lml = -0.5*a.T.dot(f) \
-                - np.log(1 + np.exp(-(self.y_fit_*2 - 1)*f)).sum() \
+                - np.log(1 + np.exp(-(self.y_train_*2 - 1)*f)).sum() \
                 - np.log(np.diag(L)).sum()
             # Check if we have converged (log marginal likelihood does
             # not decrease)
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index cab4917823f65..f1c84a509a3cd 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -87,6 +87,12 @@ def optimizer(obj_func, initial_theta, bounds):
         prior based on the data, which contradicts the likelihood principle;
         normalization is thus disabled per default.
 
+    copy_X_train : bool, optional (default: False)
+        If True, a persistent copy of the training data is stored in the
+        object. Otherwise, just a reference to the training data is stored,
+        which might cause predictions to change if the data is modified
+        externally.
+
     random_state : integer or numpy.RandomState, optional
         The generator used to initialize the centers. If an integer is
         given, it fixes the seed. Defaults to the global numpy random
@@ -94,10 +100,10 @@ def optimizer(obj_func, initial_theta, bounds):
 
     Attributes
     ----------
-    X_fit_ : array-like, shape = (n_samples, n_features)
+    X_train_ : array-like, shape = (n_samples, n_features)
         Feature values in training data (also required for prediction)
 
-    y_fit_: array-like, shape = (n_samples, [n_output_dims])
+    y_train_: array-like, shape = (n_samples, [n_output_dims])
         Target values in training data (also required for prediction)
 
     kernel_: kernel object
@@ -105,19 +111,20 @@ def optimizer(obj_func, initial_theta, bounds):
         same as the one passed as parameter but with optimized hyperparameters
 
     L_: array-like, shape = (n_samples, n_samples)
-        Lower-triangular Cholesky decomposition of the kernel in X_fit_
+        Lower-triangular Cholesky decomposition of the kernel in X_train_
 
     alpha_: array-like, shape = (n_samples,)
         Dual coefficients of training data points in kernel space
     """
     def __init__(self, kernel=None, sigma_squared_n=1e-10,
                  optimizer="fmin_l_bfgs_b", n_restarts_optimizer=1,
-                 normalize_y=False, random_state=None):
+                 normalize_y=False, copy_X_train=False, random_state=None):
         self.kernel = kernel
         self.sigma_squared_n = sigma_squared_n
         self.optimizer = optimizer
         self.n_restarts_optimizer = n_restarts_optimizer
         self.normalize_y = normalize_y
+        self.copy_X_train = copy_X_train
         self.random_state = random_state
         self.rng = check_random_state(self.random_state)
 
@@ -146,11 +153,11 @@ def fit(self, X, y):
 
         # Normalize target value
         if self.normalize_y:
-            self.y_fit_mean = np.mean(y, axis=0)
+            self.y_train_mean = np.mean(y, axis=0)
             # demean y
-            y = y - self.y_fit_mean
+            y = y - self.y_train_mean
         else:
-            self.y_fit_mean = np.zeros(1)
+            self.y_train_mean = np.zeros(1)
 
         if np.iterable(self.sigma_squared_n) \
            and self.sigma_squared_n.shape[0] != y.shape[0]:
@@ -161,8 +168,8 @@ def fit(self, X, y):
                                  " with same number of entries as y.(%d != %d)"
                                  % (self.sigma_squared_n.shape[0], y.shape[0]))
 
-        self.X_fit_ = X
-        self.y_fit_ = y
+        self.X_train_ = np.copy(X) if self.copy_X_train else X
+        self.y_train_ = np.copy(y) if self.copy_X_train else y
 
         if self.kernel_.n_dims == 0:  # no tunable hyperparameters
             pass
@@ -203,10 +210,10 @@ def obj_func(theta, eval_gradient=True):
 
         # Precompute quantities required for predictions which are independent
         # of actual query points
-        K = self.kernel_(self.X_fit_)
+        K = self.kernel_(self.X_train_)
         K[np.diag_indices_from(K)] += self.sigma_squared_n
         self.L_ = cholesky(K, lower=True)  # Line 2
-        self.alpha_ = cho_solve((self.L_, True), self.y_fit_)  # Line 3
+        self.alpha_ = cho_solve((self.L_, True), self.y_train_)  # Line 3
 
         return self
 
@@ -251,7 +258,7 @@ def predict(self, X, return_std=False, return_cov=False):
 
         X = check_array(X)
 
-        if not hasattr(self, "X_fit_"):  # Unfitted; predict based on GP prior
+        if not hasattr(self, "X_train_"):  # Unfitted;predict based on GP prior
             y_mean = np.zeros(X.shape[0])
             if return_cov:
                 y_cov = self.kernel(X)
@@ -262,9 +269,9 @@ def predict(self, X, return_std=False, return_cov=False):
             else:
                 return y_mean
         else:  # Predict based on GP posterior
-            K_trans = self.kernel_(X, self.X_fit_)
+            K_trans = self.kernel_(X, self.X_train_)
             y_mean = K_trans.dot(self.alpha_)  # Line 4 (y_mean = f_star)
-            y_mean = self.y_fit_mean + y_mean  # undo normal.
+            y_mean = self.y_train_mean + y_mean  # undo normal.
             if return_cov:
                 v = cho_solve((self.L_, True), K_trans.T)  # Line 5
                 y_cov = self.kernel_(X) - K_trans.dot(v)  # Line 6
@@ -350,9 +357,9 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
         kernel = self.kernel_.clone_with_theta(theta)
 
         if eval_gradient:
-            K, K_gradient = kernel(self.X_fit_, eval_gradient=True)
+            K, K_gradient = kernel(self.X_train_, eval_gradient=True)
         else:
-            K = kernel(self.X_fit_)
+            K = kernel(self.X_train_)
 
         K[np.diag_indices_from(K)] += self.sigma_squared_n
         try:
@@ -365,8 +372,8 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
         if eval_gradient:
             log_likelihood_gradient = 0
 
-        # Iterate over output dimensions of self.y_fit_
-        y_fit = self.y_fit_
+        # Iterate over output dimensions of self.y_train_
+        y_fit = self.y_train_
         if y_fit.ndim == 1:
             y_fit = y_fit[:, np.newaxis]
         for i in range(y_fit.shape[1]):

From 65a1c6569e3fa26faf58e589d039d2b0461a3cc3 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 4 Aug 2015 18:44:30 +0800
Subject: [PATCH 125/165] ENH Adding max_iter parameter for Newton iteration in
 GPC

---
 examples/gaussian_process/plot_gpr_co2.py |  2 +-
 sklearn/gaussian_process/gpc.py           | 34 ++++++++++++++++++-----
 2 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py
index d6ec25ab8f2a4..722fd42ebc5b0 100644
--- a/examples/gaussian_process/plot_gpr_co2.py
+++ b/examples/gaussian_process/plot_gpr_co2.py
@@ -73,7 +73,7 @@
 # Kernel with parameters given in GPML book
 k1 = 66.0**2 * RBF(l=67.0)  # long term smooth rising trend
 k2 = 2.4**2 * RBF(l=90.0) * ExpSineSquared(l=1.3, p=1.0)  # seasonal component
-k3 = 0.66**2 * RationalQuadratic(l=1.2, alpha=0.78)  # medium term irregularit.
+k3 = 0.66**2 * RationalQuadratic(l=1.2, alpha=0.78)  # medium term irregularity
 k4 = 0.18**2 * RBF(l=0.134) + WhiteKernel(c=0.19**2) # noise terms
 kernel_gpml = k1 + k2 + k3 + k4
 
diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 85e604a7948da..8f71ddc57ab9e 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -89,6 +89,11 @@ def optimizer(obj_func, initial_theta, bounds):
         from the space of allowed theta-values. If greater than 1, all bounds
         must be finite.
 
+    max_iter: int, optional (default: 100)
+        The maximum number of iterations in Newton's method for approximating
+        the posterior during predict. Smaller values will reduce computation
+        time at the cost of worse results.
+
     warm_start : bool, optional (default: False)
         If warm-starts are enabled, the solution of the last Newton iteration
         on the Laplace approximation of the posterior mode is used as
@@ -135,12 +140,13 @@ def optimizer(obj_func, initial_theta, bounds):
         of sqrt(W) is stored.
     """
     def __init__(self, kernel=None, jitter=0.0, optimizer="fmin_l_bfgs_b",
-                 n_restarts_optimizer=1, warm_start=False,
+                 n_restarts_optimizer=1, max_iter=100, warm_start=False,
                  copy_X_train=False, random_state=None):
         self.kernel = kernel
         self.jitter = jitter
         self.optimizer = optimizer
         self.n_restarts_optimizer = n_restarts_optimizer
+        self.max_iter = max_iter
         self.warm_start = warm_start
         self.copy_X_train = copy_X_train
         self.random_state = random_state
@@ -162,7 +168,8 @@ def fit(self, X, y):
         self : returns an instance of self.
         """
         if self.kernel is None:  # Use an RBF kernel as default
-            self.kernel_ = 1.0 * RBF(1.0)
+            self.kernel_ = \
+                C(1.0, c_bounds="fixed") * RBF(1.0, l_bounds="fixed")
         else:
             self.kernel_ = clone(self.kernel)
 
@@ -372,7 +379,7 @@ def _posterior_mode(self, K, return_temporaries=False):
 
         # Use Newton's iteration method to find mode of Laplace approximation
         log_marginal_likelihood = -np.inf
-        while True:
+        for _ in range(self.max_iter):
             # Line 4
             pi = 1 / (1 + np.exp(-f))
             W = pi * (1 - pi)
@@ -482,6 +489,11 @@ def optimizer(obj_func, initial_theta, bounds):
         from the space of allowed theta-values. If greater than 1, all bounds
         must be finite.
 
+    max_iter: int, optional (default: 100)
+        The maximum number of iterations in Newton's method for approximating
+        the posterior during predict. Smaller values will reduce computation
+        time at the cost of worse results.
+
     warm_start : bool, optional (default: False)
         If warm-starts are enabled, the solution of the last Newton iteration
         on the Laplace approximation of the posterior mode is used as
@@ -489,6 +501,12 @@ def optimizer(obj_func, initial_theta, bounds):
         up convergence when _posterior_mode is called several times on similar
         problems as in hyperparameter optimization.
 
+    copy_X_train : bool, optional (default: False)
+        If True, a persistent copy of the training data is stored in the
+        object. Otherwise, just a reference to the training data is stored,
+        which might cause predictions to change if the data is modified
+        externally.
+
     random_state : integer or numpy.RandomState, optional
         The generator used to initialize the centers. If an integer is
         given, it fixes the seed. Defaults to the global numpy random
@@ -513,18 +531,20 @@ def optimizer(obj_func, initial_theta, bounds):
         Unique class labels.
     """
     def __init__(self, kernel=None, jitter=0.0, optimizer="fmin_l_bfgs_b",
-                 n_restarts_optimizer=1, warm_start=False, random_state=None,
-                 n_jobs=1):
+                 n_restarts_optimizer=1, max_iter=100, warm_start=False,
+                 copy_X_train=False, random_state=None, n_jobs=1):
         self.base_estimator = BinaryGaussianProcessClassifierLaplace(
-            kernel, jitter, optimizer, n_restarts_optimizer, warm_start,
-            random_state)
+            kernel, jitter, optimizer, n_restarts_optimizer, max_iter,
+            warm_start, copy_X_train, random_state)
         super(GaussianProcessClassifier, self).__init__(
             self.base_estimator, n_jobs)
 
         self.jitter = jitter
         self.optimizer = optimizer
         self.n_restarts_optimizer = n_restarts_optimizer
+        self.max_iter = max_iter
         self.warm_start = warm_start
+        self.copy_X_train = copy_X_train
         self.random_state = random_state
         self.n_jobs = n_jobs
 

From db579023ba2e1e8fe0877f24073c69cdb507775c Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 4 Aug 2015 18:49:29 +0800
Subject: [PATCH 126/165] COSMIT Removing keyword magic from PairwiseKernel

---
 sklearn/gaussian_process/kernels.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 8f262031fe4da..0e06b99bb67cd 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -1532,7 +1532,7 @@ class PairwiseKernel(Kernel):
     gamma_bounds : pair of floats >= 0, default: (1e-5, 1e5)
         The lower and upper bound on gamma
 
-    metric : string, or callable
+    metric : string, or callable, default: "linear"
         The metric to use when calculating kernel between instances in a
         feature array. If metric is a string, it must be one of the metrics
         in pairwise.PAIRWISE_KERNEL_FUNCTIONS.
@@ -1542,19 +1542,23 @@ class PairwiseKernel(Kernel):
         should take two arrays from X as input and return a value indicating
         the distance between them.
 
-    `**kwds` : optional keyword parameters
-        Any further parameters are passed directly to the kernel function.
+    pairwise_kernels_kwargs : dict, default: None
+        All entries of this dict (if any) are passed as keyword arguments to
+        the pairwise kernel function.
     """
 
-    def __init__(self, gamma=1.0, gamma_bounds=(1e-5, 1e5),
-                 metric="linear", **kwargs):
+    def __init__(self, gamma=1.0, gamma_bounds=(1e-5, 1e5), metric="linear",
+                 pairwise_kernels_kwargs=None):
         self.gamma = gamma
         self.gamma_bounds = gamma_bounds
 
         self.theta_vars = ["gamma"] if gamma_bounds is not "fixed" else []
 
         self.metric = metric
-        self.kwargs = kwargs
+        if pairwise_kernels_kwargs is not None:
+            self.pairwise_kernels_kwargs = pairwise_kernels_kwargs
+        else:
+            self.pairwise_kernels_kwargs = {}
 
     def __call__(self, X, Y=None, eval_gradient=False):
         """Return the kernel k(X, Y) and optionally its gradient.
@@ -1584,7 +1588,8 @@ def __call__(self, X, Y=None, eval_gradient=False):
         """
         X = np.atleast_2d(X)
         K = pairwise_kernels(X, Y, metric=self.metric, gamma=self.gamma,
-                             filter_params=True, **self.kwargs)
+                             filter_params=True,
+                             **self.pairwise_kernels_kwargs)
         if eval_gradient:
             if self.gamma_bounds is "fixed":
                 return K, np.empty((X.shape[0], X.shape[0], 0))
@@ -1593,7 +1598,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
                 def f(gamma):  # helper function
                     return pairwise_kernels(
                         X, Y, metric=self.metric, gamma=np.exp(gamma),
-                        filter_params=True, **self.kwargs)
+                        filter_params=True, **self.pairwise_kernels_kwargs)
                 return K, _approx_fprime(self.theta, f, 1e-10)
         else:
             return K

From 81580fd2d3bfa83026aef54c720695d1f91e0dc0 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 4 Aug 2015 19:15:59 +0800
Subject: [PATCH 127/165] FIX Added missing import of ConstantKernel in gpc.py

---
 sklearn/gaussian_process/gpc.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 8f71ddc57ab9e..7daf5bbd022ff 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -13,7 +13,8 @@
 from scipy.special import erf
 
 from sklearn.base import BaseEstimator, ClassifierMixin, clone
-from sklearn.gaussian_process.kernels import RBF, CompoundKernel
+from sklearn.gaussian_process.kernels \
+    import RBF, CompoundKernel, ConstantKernel as C
 from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
 from sklearn.utils import check_random_state
 from sklearn.preprocessing import LabelEncoder

From 798ec3dd0b7b16c53ed30b879598f9c56eba05e2 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 4 Aug 2015 19:16:26 +0800
Subject: [PATCH 128/165] TST Excluded GPC from
 test_non_transformer_estimators_n_iter()

---
 sklearn/tests/test_common.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index c129f10c885e8..e92e07e7aafa5 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -163,6 +163,10 @@ def test_non_transformer_estimators_n_iter():
                              'LogisticRegressionCV']):
                     continue
 
+                # These models perform iterations for predict
+                if name in (['GaussianProcessClassifier']):
+                    continue
+
                 # Tested in test_transformer_n_iter below
                 elif (name in CROSS_DECOMPOSITION or
                       name in ['LinearSVC', 'LogisticRegression']):

From 1ec07f737f0318d852395cd71e132eb0928f8af1 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 5 Aug 2015 18:28:13 +0200
Subject: [PATCH 129/165] MISC Deprecating legacy GP implementation

---
 sklearn/gaussian_process/correlation_models.py | 9 +++++++++
 sklearn/gaussian_process/gaussian_process.py   | 7 +++++++
 sklearn/gaussian_process/regression_models.py  | 5 +++++
 3 files changed, 21 insertions(+)

diff --git a/sklearn/gaussian_process/correlation_models.py b/sklearn/gaussian_process/correlation_models.py
index cec09af733dba..d29e13cdc4f71 100644
--- a/sklearn/gaussian_process/correlation_models.py
+++ b/sklearn/gaussian_process/correlation_models.py
@@ -11,7 +11,10 @@
 
 import numpy as np
 
+from ..utils import deprecated
 
+
+@deprecated("absolute_exponential is deprecated and will be removed in 0.18.")
 def absolute_exponential(theta, d):
     """
     Absolute exponential autocorrelation model.
@@ -54,6 +57,7 @@ def absolute_exponential(theta, d):
         return np.exp(- np.sum(theta.reshape(1, n_features) * d, axis=1))
 
 
+@deprecated("squared_exponential is deprecated and will be removed in 0.18.")
 def squared_exponential(theta, d):
     """
     Squared exponential correlation model (Radial Basis Function).
@@ -97,6 +101,8 @@ def squared_exponential(theta, d):
         return np.exp(-np.sum(theta.reshape(1, n_features) * d ** 2, axis=1))
 
 
+@deprecated("generalized_exponential is deprecated and will be removed in "
+            "0.18.")
 def generalized_exponential(theta, d):
     """
     Generalized exponential correlation model.
@@ -147,6 +153,7 @@ def generalized_exponential(theta, d):
     return r
 
 
+@deprecated("pure_nugget is deprecated and will be removed in 0.18.")
 def pure_nugget(theta, d):
     """
     Spatial independence correlation model (pure nugget).
@@ -184,6 +191,7 @@ def pure_nugget(theta, d):
     return r
 
 
+@deprecated("cubic is deprecated and will be removed in 0.18.")
 def cubic(theta, d):
     """
     Cubic correlation model::
@@ -234,6 +242,7 @@ def cubic(theta, d):
     return r
 
 
+@deprecated("linear is deprecated and will be removed in 0.18.")
 def linear(theta, d):
     """
     Linear correlation model::
diff --git a/sklearn/gaussian_process/gaussian_process.py b/sklearn/gaussian_process/gaussian_process.py
index 07f688bc57f61..b66e672f2b6c0 100644
--- a/sklearn/gaussian_process/gaussian_process.py
+++ b/sklearn/gaussian_process/gaussian_process.py
@@ -15,10 +15,12 @@
 from ..utils.validation import check_is_fitted
 from . import regression_models as regression
 from . import correlation_models as correlation
+from ..utils import deprecated
 
 MACHINE_EPSILON = np.finfo(np.double).eps
 
 
+@deprecated("l1_cross_distances is deprecated and will be removed in 0.18.")
 def l1_cross_distances(X):
     """
     Computes the nonzero componentwise L1 cross-distances between the vectors
@@ -56,9 +58,14 @@ def l1_cross_distances(X):
     return D, ij
 
 
+@deprecated("GaussianProcess is deprecated and will be removed in 0.18."
+            "Use the GaussianProcessRegressor instead.")
 class GaussianProcess(BaseEstimator, RegressorMixin):
     """The Gaussian Process model class.
 
+    Note that this class is deprecated and will be removed in 0.18.
+    Use the GaussianProcessRegressor instead.
+
     Read more in the :ref:`User Guide <gaussian_process>`.
 
     Parameters
diff --git a/sklearn/gaussian_process/regression_models.py b/sklearn/gaussian_process/regression_models.py
index ad240d8ee948d..41fb1b2ec3bc9 100644
--- a/sklearn/gaussian_process/regression_models.py
+++ b/sklearn/gaussian_process/regression_models.py
@@ -11,7 +11,10 @@
 
 import numpy as np
 
+from ..utils import deprecated
 
+
+@deprecated("constant is deprecated and will be removed in 0.18.")
 def constant(x):
     """
     Zero order polynomial (constant, p = 1) regression model.
@@ -36,6 +39,7 @@ def constant(x):
     return f
 
 
+@deprecated("linear is deprecated and will be removed in 0.18.")
 def linear(x):
     """
     First order polynomial (linear, p = n+1) regression model.
@@ -60,6 +64,7 @@ def linear(x):
     return f
 
 
+@deprecated("quadratic is deprecated and will be removed in 0.18.")
 def quadratic(x):
     """
     Second order polynomial (quadratic, p = n*(n-1)/2+n+1) regression model.

From ee3e67095bff03da55e4d0402637f31e86291ece Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 5 Aug 2015 18:29:05 +0200
Subject: [PATCH 130/165] COS Making BinaryGaussianProcessClassifierLaplace
 private

---
 sklearn/gaussian_process/__init__.py |  3 +--
 sklearn/gaussian_process/gpc.py      | 14 +++++++-------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/sklearn/gaussian_process/__init__.py b/sklearn/gaussian_process/__init__.py
index 01932219799c0..2f88a005c3ae1 100644
--- a/sklearn/gaussian_process/__init__.py
+++ b/sklearn/gaussian_process/__init__.py
@@ -11,8 +11,7 @@
 """
 
 from .gpr import GaussianProcessRegressor
-from .gpc import (GaussianProcessClassifier,
-                  BinaryGaussianProcessClassifierLaplace)
+from .gpc import GaussianProcessClassifier
 from . import kernels
 
 from .gaussian_process import GaussianProcess
diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 7daf5bbd022ff..6c0f3ea7119e4 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -12,7 +12,7 @@
 from scipy.optimize import fmin_l_bfgs_b
 from scipy.special import erf
 
-from sklearn.base import BaseEstimator, ClassifierMixin, clone
+from sklearn.base import BaseEstimator, clone
 from sklearn.gaussian_process.kernels \
     import RBF, CompoundKernel, ConstantKernel as C
 from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
@@ -32,7 +32,7 @@
                   128.12323805, -2010.49422654])[:, np.newaxis]
 
 
-class BinaryGaussianProcessClassifierLaplace(BaseEstimator, ClassifierMixin):
+class _BinaryGaussianProcessClassifierLaplace(BaseEstimator):
     """Binary Gaussian process classification (GPC) based on Laplace approximation.
 
     The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
@@ -184,9 +184,9 @@ def fit(self, X, y):
         self.y_train_ = label_encoder.fit_transform(y)
         self.classes_ = label_encoder.classes_
         if self.classes_.size > 2:
-            raise ValueError("BinaryGaussianProcessClassifierLaplace supports "
-                             "only binary classification.y contains classes %s"
-                             % self.classes_)
+            raise ValueError("%s supports only binary classification. "
+                             "y contains classes %s"
+                             % (self.__class__.__name__, self.classes_))
         elif self.classes_.size == 1:
             raise ValueError("{0:s} requires 2 classes.".format(
                 self.__class__.__name__))
@@ -534,7 +534,7 @@ def optimizer(obj_func, initial_theta, bounds):
     def __init__(self, kernel=None, jitter=0.0, optimizer="fmin_l_bfgs_b",
                  n_restarts_optimizer=1, max_iter=100, warm_start=False,
                  copy_X_train=False, random_state=None, n_jobs=1):
-        self.base_estimator = BinaryGaussianProcessClassifierLaplace(
+        self.base_estimator = _BinaryGaussianProcessClassifierLaplace(
             kernel, jitter, optimizer, n_restarts_optimizer, max_iter,
             warm_start, copy_X_train, random_state)
         super(GaussianProcessClassifier, self).__init__(
@@ -640,7 +640,7 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
 
     # Some code checks simply for the existence of the method decision_function
     # before calling it. However, OneVsRestClassifier has the method but raises
-    # always an Exception because BinaryGaussianProcessClassifierLaplace
+    # always an Exception because _BinaryGaussianProcessClassifierLaplace
     # does not implement it. We thus raise an AttributeError since calling the
     # method would always fail.
     def __getattribute__(self, name):

From 2b43f628f2ad45193d3ee4c5300260230fa2f395 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 5 Aug 2015 18:31:46 +0200
Subject: [PATCH 131/165] DOC Cleaning up documentation of legacy GPs

---
 doc/modules/classes.rst          |  2 +-
 doc/modules/gaussian_process.rst | 93 ++++++++++++--------------------
 2 files changed, 34 insertions(+), 61 deletions(-)

diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 6ff3082317085..c1c909356886a 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -492,7 +492,7 @@ From text
 
   gaussian_process.GaussianProcessRegressor
   gaussian_process.GaussianProcessClassifier
-  gaussian_process.BinaryGaussianProcessClassifierLaplace
+  gaussian_process.GaussianProcess
 
 Kernels:
 
diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst
index 6f4e26df59f28..885e84e723335 100644
--- a/doc/modules/gaussian_process.rst
+++ b/doc/modules/gaussian_process.rst
@@ -255,8 +255,9 @@ If the initial hyperparameters should be kept fixed, `None` can be passed as
 optimizer.
 
 :class:`GaussianProcessClassifier` supports multi-class classification
-by performing one-versus-rest based training and prediction based on :class:`BinaryGaussianProcessClassifierLaplace`, which implements the Laplace
-approximation of the posterior discussed above. Note that
+by performing one-versus-rest based training and prediction based on
+a binary Gaussian process classifier using the Laplace approximation of the
+posterior for binary classification discussed above. Note that
 :class:`GaussianProcessClassifier` thus does not implement
 a true multi-class Laplace approximation, but is based on several binary
 classification tasks.
@@ -293,32 +294,35 @@ hyperparameters used in the first figure by black dots.
    :align: center
 
 
-Iso-probability lines for GPC
------------------------------
-
-.. figure:: ../auto_examples/gaussian_process/images/plot_gpc_isoprobability_001.png
-   :target: ../auto_examples/gaussian_process/plot_gpc_isoprobability.html
-   :align: center
-
-
 Illustration of GPC on the XOR dataset
 --------------------------------------
 
+.. currentmodule:: sklearn.gaussian_process.kernels
+
+This example illustrates GPC on XOR data. Compared are a stationary, isotropic
+kernel (`RBF`) and a non-stationary kernel (`DotProduct`). On this particular
+dataset, the `DotProduct` kernel obtains considerably better results because the
+class-boundaries are linear and coincide with the coordinate axes. In general,
+however, stationary kernels often obtain better results.
+
 .. figure:: ../auto_examples/gaussian_process/images/plot_gpc_xor_001.png
    :target: ../auto_examples/gaussian_process/plot_gpc_xor.html
    :align: center
 
+.. currentmodule:: sklearn.gaussian_process
+
 
 Gaussian process classification (GPC) on iris dataset
 -----------------------------------------------------
 
 This example illustrates the predicted probability of GPC for an isotropic
 and anisotropic RBF kernel on a two-dimensional version for the iris-dataset.
+It thus illustrates the applicability of GPC to non-binary classification.
 The anisotropic RBF kernel obtains slightly higher log-marginal-likelihood by
 assigning different length-scales to the two feature dimensions.
 
-.. figure:: ../auto_examples/gaussian_process/images/plot_gpc_xor_001.png
-   :target: ../auto_examples/gaussian_process/plot_gpc_xor.html
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpc_iris_001.png
+   :target: ../auto_examples/gaussian_process/plot_gpc_iris.html
    :align: center
 
 
@@ -493,8 +497,14 @@ shown in the following figure:
    :align: center
 
 
-Legacy
-======
+.. currentmodule:: sklearn.gaussian_process
+
+Legacy Gaussian Processes
+=========================
+
+In this section, the implementation of Gaussian processes used in sklearn until
+release 0.16.1 is described. Note that this implementation is deprecated and
+will be removed in version 0.18.
 
 An introductory regression example
 ----------------------------------
@@ -507,9 +517,6 @@ data. Depending on the number of parameters provided at instantiation, the
 fitting procedure may recourse to maximum likelihood estimation for the
 parameters or alternatively it uses the given parameters.
 
-.. figure:: ../auto_examples/gaussian_process/images/plot_gp_regression_001.png
-   :target: ../auto_examples/gaussian_process/plot_gp_regression.html
-   :align: center
 
 ::
 
@@ -546,24 +553,16 @@ equivalent to specifying a fractional variance in the input.  That is
    \mathrm{nugget}_i = \left[\frac{\sigma_i}{y_i}\right]^2
 
 With ``nugget`` and ``corr`` properly set, Gaussian Processes can be
-used to robustly recover an underlying function from noisy data:
-
-.. figure:: ../auto_examples/gaussian_process/images/plot_gp_regression_002.png
-   :target: ../auto_examples/gaussian_process/plot_gp_regression.html
-   :align: center
-
-.. topic:: Other examples
-
-  * :ref:`example_gaussian_process_plot_gp_probabilistic_classification_after_regression.py`
+used to robustly recover an underlying function from noisy data.
 
 
 
 Mathematical formulation
-========================
+------------------------
 
 
 The initial assumption
-----------------------
+^^^^^^^^^^^^^^^^^^^^^^
 
 Suppose one wants to model the output of a computer experiment, say a
 mathematical function:
@@ -607,7 +606,7 @@ and zero otherwise : a *dirac* correlation model -- sometimes referred to as a
 
 
 The best linear unbiased prediction (BLUP)
-------------------------------------------
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 We now derive the *best linear unbiased prediction* of the sample path
 :math:`g` conditioned on the observations:
@@ -712,7 +711,7 @@ decomposition algorithm.
 
 
 The empirical best linear unbiased predictor (EBLUP)
-----------------------------------------------------
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Until now, both the autocorrelation and regression models were assumed given.
 In practice however they are never known in advance so that one has to make
@@ -730,36 +729,10 @@ fmin_cobyla optimization function from scipy.optimize. In the case of
 anisotropy however, we provide an implementation of Welch's componentwise
 optimization algorithm -- see references.
 
-For a more comprehensive description of the theoretical aspects of Gaussian
-Processes for Machine Learning, please refer to the references below:
-
-.. topic:: References:
-
-    .. `DACE, A Matlab Kriging Toolbox
-       <http://www2.imm.dtu.dk/~hbn/dace/>`_ S Lophaven, HB Nielsen, J
-       Sondergaard 2002
-
-
-    .. `Screening, predicting, and computer experiments
-      <http://www.jstor.org/pss/1269548>`_ WJ Welch, RJ Buck, J Sacks,
-      HP Wynn, TJ Mitchell, and MD Morris Technometrics 34(1) 15--25,
-      1992
-
-
-    .. [RW2006] `Gaussian Processes for Machine Learning
-      <http://www.gaussianprocess.org/gpml/chapters/>`_ CE
-      Rasmussen, CKI Williams, MIT Press, 2006 (Ed. T Diettrich)
-
-    .. `The design and analysis of computer experiments
-      <http://www.stat.osu.edu/~comp_exp/book.html>`_ TJ Santner, BJ
-      Williams, W Notz Springer, 2003
-
-
-
 .. _correlation_models:
 
 Correlation Models
-==================
+------------------
 
 Common correlation models matches some famous SVM's kernels because they are
 mostly built on equivalent assumptions. They must fulfill Mercer's conditions
@@ -781,7 +754,7 @@ models, see the book by Rasmussen & Williams in references.
 
 
 Regression Models
-=================
+-----------------
 
 Common linear regression models involve zero- (constant), first- and
 second-order polynomials. But one may specify its own in the form of a Python
@@ -792,9 +765,9 @@ that the underlying regression problem is not *underdetermined*.
 
 
 Implementation details
-======================
+----------------------
 
-The present implementation is based on a translation of the DACE Matlab
+The implementation is based on a translation of the DACE Matlab
 toolbox.
 
 .. topic:: References:

From 2a0792b8496b9e0b19bfc025859e6124dd79e228 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 5 Aug 2015 18:32:19 +0200
Subject: [PATCH 132/165] DOC Updating GPC XOR example (showing also results
 for DotProduct kernel)

---
 examples/gaussian_process/plot_gpc_xor.py | 49 ++++++++++++++---------
 1 file changed, 30 insertions(+), 19 deletions(-)

diff --git a/examples/gaussian_process/plot_gpc_xor.py b/examples/gaussian_process/plot_gpc_xor.py
index 5e2dcb9662f63..5e8678d2bb0ba 100644
--- a/examples/gaussian_process/plot_gpc_xor.py
+++ b/examples/gaussian_process/plot_gpc_xor.py
@@ -3,7 +3,11 @@
 Illustration of Gaussian process classification (GPC) on the XOR dataset
 ========================================================================
 
-This example illustrates GPC on XOR data.
+This example illustrates GPC on XOR data. Compared are a stationary, isotropic
+kernel (RBF) and a non-stationary kernel (DotProduct). On this particular
+dataset, the DotProduct kernel obtains considerably better results because the
+class-boundaries are linear and coincide with the coordinate axes. In general,
+stationary kernels often obtain better results.
 """
 print(__doc__)
 
@@ -15,7 +19,7 @@
 import matplotlib.pyplot as plt
 
 from sklearn.gaussian_process import GaussianProcessClassifier
-from sklearn.gaussian_process.kernels import RBF
+from sklearn.gaussian_process.kernels import RBF, DotProduct
 
 
 xx, yy = np.meshgrid(np.linspace(-3, 3, 50),
@@ -25,21 +29,28 @@
 Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)
 
 # fit the model
-kernel = 1.0 * RBF(l=1.0)
-clf = GaussianProcessClassifier(kernel=kernel, warm_start=True).fit(X, Y)
-
-# plot the decision function for each datapoint on the grid
-Z = clf.predict_proba(np.vstack((xx.ravel(), yy.ravel())).T)[:, 1]
-Z = Z.reshape(xx.shape)
-
-image = plt.imshow(Z, interpolation='nearest',
-                   extent=(xx.min(), xx.max(), yy.min(), yy.max()),
-                   aspect='auto', origin='lower', cmap=plt.cm.PuOr_r)
-contours = plt.contour(xx, yy, Z, levels=[0], linewidths=2,
-                       linetypes='--')
-plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired)
-plt.xticks(())
-plt.yticks(())
-plt.axis([-3, 3, -3, 3])
-plt.colorbar(image)
+plt.figure(figsize=(10, 5))
+kernels = [1.0 * RBF(l=1.0), 1.0 * DotProduct(sigma_0=1.0)**2]
+for i, kernel in enumerate(kernels):
+    clf = GaussianProcessClassifier(kernel=kernel, warm_start=True).fit(X, Y)
+
+    # plot the decision function for each datapoint on the grid
+    Z = clf.predict_proba(np.vstack((xx.ravel(), yy.ravel())).T)[:, 1]
+    Z = Z.reshape(xx.shape)
+
+    plt.subplot(1, 2, i + 1)
+    image = plt.imshow(Z, interpolation='nearest',
+                       extent=(xx.min(), xx.max(), yy.min(), yy.max()),
+                       aspect='auto', origin='lower', cmap=plt.cm.PuOr_r)
+    contours = plt.contour(xx, yy, Z, levels=[0], linewidths=2,
+                           linetypes='--')
+    plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired)
+    plt.xticks(())
+    plt.yticks(())
+    plt.axis([-3, 3, -3, 3])
+    plt.colorbar(image)
+    plt.title("%s\n Log-Marginal-Likelihood:%.3f"
+              % (clf.kernel_, clf.log_marginal_likelihood(clf.kernel_.theta)))
+
+plt.tight_layout()
 plt.show()

From 47cc6a23cc3a94ab656a460629a5b909245a8c8b Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 5 Aug 2015 20:56:33 +0200
Subject: [PATCH 133/165] FIX Remove BinaryGaussianProcessClassifierLaplace
 from packages __all__ list

---
 sklearn/gaussian_process/__init__.py | 2 +-
 sklearn/utils/testing.py             | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/sklearn/gaussian_process/__init__.py b/sklearn/gaussian_process/__init__.py
index 2f88a005c3ae1..0d6fa6bd1e765 100644
--- a/sklearn/gaussian_process/__init__.py
+++ b/sklearn/gaussian_process/__init__.py
@@ -20,4 +20,4 @@
 
 __all__ = ['GaussianProcess', 'correlation_models', 'regression_models',
            'GaussianProcessRegressor', 'GaussianProcessClassifier',
-           'BinaryGaussianProcessClassifierLaplace', 'kernels']
+           'kernels']
diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py
index bbf83715b4657..65c362719e767 100644
--- a/sklearn/utils/testing.py
+++ b/sklearn/utils/testing.py
@@ -546,8 +546,7 @@ def uninstall_mldata_mock():
              'ZeroEstimator', 'ScaledLogOddsEstimator',
              'QuantileEstimator', 'MeanEstimator',
              'LogOddsEstimator', 'PriorProbabilityEstimator',
-             '_SigmoidCalibration', 'VotingClassifier',
-             'BinaryGaussianProcessClassifierLaplace']
+             '_SigmoidCalibration', 'VotingClassifier']
 
 
 def all_estimators(include_meta_estimators=False,

From 26b7ed74d5443f683687fbed6a824d47ba4c7eed Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 5 Aug 2015 20:57:43 +0200
Subject: [PATCH 134/165] FIX Undoing regression_model and correlation_model 
 deprecation

Using the deprecation decorator caused issues when pickling the GaussianProcess estimator.
---
 sklearn/gaussian_process/correlation_models.py | 9 ---------
 sklearn/gaussian_process/regression_models.py  | 5 -----
 2 files changed, 14 deletions(-)

diff --git a/sklearn/gaussian_process/correlation_models.py b/sklearn/gaussian_process/correlation_models.py
index d29e13cdc4f71..cec09af733dba 100644
--- a/sklearn/gaussian_process/correlation_models.py
+++ b/sklearn/gaussian_process/correlation_models.py
@@ -11,10 +11,7 @@
 
 import numpy as np
 
-from ..utils import deprecated
 
-
-@deprecated("absolute_exponential is deprecated and will be removed in 0.18.")
 def absolute_exponential(theta, d):
     """
     Absolute exponential autocorrelation model.
@@ -57,7 +54,6 @@ def absolute_exponential(theta, d):
         return np.exp(- np.sum(theta.reshape(1, n_features) * d, axis=1))
 
 
-@deprecated("squared_exponential is deprecated and will be removed in 0.18.")
 def squared_exponential(theta, d):
     """
     Squared exponential correlation model (Radial Basis Function).
@@ -101,8 +97,6 @@ def squared_exponential(theta, d):
         return np.exp(-np.sum(theta.reshape(1, n_features) * d ** 2, axis=1))
 
 
-@deprecated("generalized_exponential is deprecated and will be removed in "
-            "0.18.")
 def generalized_exponential(theta, d):
     """
     Generalized exponential correlation model.
@@ -153,7 +147,6 @@ def generalized_exponential(theta, d):
     return r
 
 
-@deprecated("pure_nugget is deprecated and will be removed in 0.18.")
 def pure_nugget(theta, d):
     """
     Spatial independence correlation model (pure nugget).
@@ -191,7 +184,6 @@ def pure_nugget(theta, d):
     return r
 
 
-@deprecated("cubic is deprecated and will be removed in 0.18.")
 def cubic(theta, d):
     """
     Cubic correlation model::
@@ -242,7 +234,6 @@ def cubic(theta, d):
     return r
 
 
-@deprecated("linear is deprecated and will be removed in 0.18.")
 def linear(theta, d):
     """
     Linear correlation model::
diff --git a/sklearn/gaussian_process/regression_models.py b/sklearn/gaussian_process/regression_models.py
index 41fb1b2ec3bc9..ad240d8ee948d 100644
--- a/sklearn/gaussian_process/regression_models.py
+++ b/sklearn/gaussian_process/regression_models.py
@@ -11,10 +11,7 @@
 
 import numpy as np
 
-from ..utils import deprecated
 
-
-@deprecated("constant is deprecated and will be removed in 0.18.")
 def constant(x):
     """
     Zero order polynomial (constant, p = 1) regression model.
@@ -39,7 +36,6 @@ def constant(x):
     return f
 
 
-@deprecated("linear is deprecated and will be removed in 0.18.")
 def linear(x):
     """
     First order polynomial (linear, p = n+1) regression model.
@@ -64,7 +60,6 @@ def linear(x):
     return f
 
 
-@deprecated("quadratic is deprecated and will be removed in 0.18.")
 def quadratic(x):
     """
     Second order polynomial (quadratic, p = n*(n-1)/2+n+1) regression model.

From c2546888b538dd79faced42fd5efc81e57fee54f Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 6 Aug 2015 19:32:02 +0200
Subject: [PATCH 135/165] TST Testing n_jobs option of
 GaussianProcessClassifier

---
 sklearn/gaussian_process/tests/test_gpc.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py
index 0f66f37dcb221..34831e966aa3a 100644
--- a/sklearn/gaussian_process/tests/test_gpc.py
+++ b/sklearn/gaussian_process/tests/test_gpc.py
@@ -132,3 +132,17 @@ def test_multi_class():
 
         y_pred = gpc.predict(X2)
         assert_array_equal(np.argmax(y_prob, 1), y_pred)
+
+
+def test_multi_class_n_jobs():
+    """ Test that multi-class GPC produces identical results with n_jobs>1. """
+    for kernel in kernels:
+        gpc = GaussianProcessClassifier(kernel=kernel)
+        gpc.fit(X, y_mc)
+
+        gpc_2 = GaussianProcessClassifier(kernel=kernel, n_jobs=2)
+        gpc_2.fit(X, y_mc)
+
+        y_prob = gpc.predict_proba(X2)
+        y_prob_2 = gpc_2.predict_proba(X2)
+        assert_almost_equal(y_prob, y_prob_2)

From c25532683a64b52514fa4bb2b9e2013b9089e643 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 6 Aug 2015 19:34:19 +0200
Subject: [PATCH 136/165] REF Realizing multi-class GPC by delegating to
 OneVsRest or OneVsOne

Delegation is more flexible than subclassing.
---
 sklearn/gaussian_process/gpc.py | 135 +++++++++++++++++++++++---------
 1 file changed, 96 insertions(+), 39 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 6c0f3ea7119e4..54b42f09add0a 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -12,13 +12,13 @@
 from scipy.optimize import fmin_l_bfgs_b
 from scipy.special import erf
 
-from sklearn.base import BaseEstimator, clone
+from sklearn.base import BaseEstimator, ClassifierMixin, clone
 from sklearn.gaussian_process.kernels \
     import RBF, CompoundKernel, ConstantKernel as C
 from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
 from sklearn.utils import check_random_state
 from sklearn.preprocessing import LabelEncoder
-from sklearn.multiclass import OneVsRestClassifier
+from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
 
 
 # Values required for approximating the logistic sigmoid by
@@ -33,7 +33,7 @@
 
 
 class _BinaryGaussianProcessClassifierLaplace(BaseEstimator):
-    """Binary Gaussian process classification (GPC) based on Laplace approximation.
+    """Binary Gaussian process classification based on Laplace approximation.
 
     The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
     ``Gaussian Processes for Machine Learning'' (GPML) by Rasmussen and
@@ -174,8 +174,6 @@ def fit(self, X, y):
         else:
             self.kernel_ = clone(self.kernel)
 
-        X, y = check_X_y(X, y, multi_output=False)
-
         self.X_train_ = np.copy(X) if self.copy_X_train else X
 
         # Encode class labels and check that it is a binary classification
@@ -251,7 +249,6 @@ def predict(self, X):
             Predicted target values for X, values are from classes_
         """
         check_is_fitted(self, ["X_train_", "y_train_", "pi_", "W_sr_", "L_"])
-        X = check_array(X)
 
         # As discussed on Section 3.4.2 of GPML, for making hard binary
         # decisions, it is enough to compute the MAP of the posterior and
@@ -276,7 +273,6 @@ def predict_proba(self, X):
             order, as they appear in the attribute `classes_`.
         """
         check_is_fitted(self, ["X_train_", "y_train_", "pi_", "W_sr_", "L_"])
-        X = check_array(X)
 
         # Based on Algorithm 3.2 of GPML
         K_star = self.kernel_(self.X_train_, X)  # K_star =k(x_star)
@@ -430,7 +426,7 @@ def _constrained_optimization(self, obj_func, initial_theta, bounds):
         return theta_opt, func_min
 
 
-class GaussianProcessClassifier(OneVsRestClassifier):
+class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
     """Gaussian process classification (GPC) based on Laplace approximation.
 
     The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
@@ -513,6 +509,17 @@ def optimizer(obj_func, initial_theta, bounds):
         given, it fixes the seed. Defaults to the global numpy random
         number generator.
 
+    multi_class: string, default: "one_vs_rest"
+        Specifies how multi-class classification problems are handled.
+        Supported are "one_vs_rest" and "one_vs_one". In "one_vs_rest",
+        one binary Gaussian process classifier is fitted for each class, which
+        is trained to separate this class from the rest. In "one_vs_one", one
+        binary Gaussian process classifier is fitted for each pair of classes,
+        which is trained to separate these two classes. The predictions of
+        these binary predictors are combined into multi-class predictions.
+        Note that "one_vs_one" does not support predicting probability
+        estimates.
+
     n_jobs : int, optional, default: 1
         The number of jobs to use for the computation. If -1 all CPUs are used.
         If 1 is given, no parallel computing code is used at all, which is
@@ -521,7 +528,7 @@ def optimizer(obj_func, initial_theta, bounds):
 
     Attributes
     ----------
-    kernel_: kernel object
+    kernel_ : kernel object
         The kernel used for prediction. In case of binary classification,
         the structure of the kernel is the same as the one passed as parameter
         but with optimized hyperparameters. In case of multi-class
@@ -530,16 +537,15 @@ def optimizer(obj_func, initial_theta, bounds):
 
     classes_ : array-like, shape = (n_classes,)
         Unique class labels.
+
+    n_classes_ : int
+        The number of classes in the training data
     """
     def __init__(self, kernel=None, jitter=0.0, optimizer="fmin_l_bfgs_b",
                  n_restarts_optimizer=1, max_iter=100, warm_start=False,
-                 copy_X_train=False, random_state=None, n_jobs=1):
-        self.base_estimator = _BinaryGaussianProcessClassifierLaplace(
-            kernel, jitter, optimizer, n_restarts_optimizer, max_iter,
-            warm_start, copy_X_train, random_state)
-        super(GaussianProcessClassifier, self).__init__(
-            self.base_estimator, n_jobs)
-
+                 copy_X_train=False, random_state=None,
+                 multi_class="one_vs_rest", n_jobs=1):
+        self.kernel = kernel
         self.jitter = jitter
         self.optimizer = optimizer
         self.n_restarts_optimizer = n_restarts_optimizer
@@ -547,8 +553,13 @@ def __init__(self, kernel=None, jitter=0.0, optimizer="fmin_l_bfgs_b",
         self.warm_start = warm_start
         self.copy_X_train = copy_X_train
         self.random_state = random_state
+        self.multi_class = multi_class
         self.n_jobs = n_jobs
 
+        self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
+            kernel, jitter, optimizer, n_restarts_optimizer, max_iter,
+            warm_start, copy_X_train, random_state)
+
     def fit(self, X, y):
         """Fit Gaussian process classification model
 
@@ -566,20 +577,74 @@ def fit(self, X, y):
         """
         X, y = check_X_y(X, y, multi_output=False)
 
-        if np.unique(y).size == 1:
+        self.classes_ = np.unique(y)
+        self.n_classes_ = self.classes_.size
+        if self.n_classes_ == 1:
             raise ValueError("GaussianProcessClassifier requires 2 or more "
                              "distinct classes. Only class %s present."
-                             % np.unique(y)[0])
+                             % self.classes_[0])
+        if self.n_classes_ > 2:
+            if self.multi_class == "one_vs_rest":
+                self.base_estimator_ = \
+                    OneVsRestClassifier(self.base_estimator_,
+                                        n_jobs=self.n_jobs)
+            elif self.multi_class == "one_vs_one":
+                self.base_estimator_ = \
+                    OneVsOneClassifier(self.base_estimator_,
+                                       n_jobs=self.n_jobs)
+            else:
+                raise ValueError("Unknown multi-class mode %s"
+                                 % self.multi_class)
+
+        self.base_estimator_.fit(X, y)
+        return self
+
+    def predict(self, X):
+        """Perform classification on an array of test vectors X.
+
+        Parameters
+        ----------
+        X : array-like, shape = (n_samples, n_features)
 
-        return super(GaussianProcessClassifier, self).fit(X, y)
+        Returns
+        -------
+        C : array, shape = (n_samples,)
+            Predicted target values for X, values are from classes_
+        """
+        check_is_fitted(self, ["classes_", "n_classes_"])
+        X = check_array(X)
+        return self.base_estimator_.predict(X)
+
+    def predict_proba(self, X):
+        """Return probability estimates for the test vector X.
+
+        Parameters
+        ----------
+        X : array-like, shape = (n_samples, n_features)
+
+        Returns
+        -------
+        C : array-like, shape = (n_samples, n_classes)
+            Returns the probability of the samples for each class in
+            the model. The columns correspond to the classes in sorted
+            order, as they appear in the attribute `classes_`.
+        """
+        check_is_fitted(self, ["classes_", "n_classes_"])
+        if self.n_classes_ > 2 and self.multi_class == "one_vs_one":
+            raise ValueError("one_vs_one multi-class mode does not support "
+                             "predicting probability estimates. Use "
+                             "one_vs_rest mode instead.")
+        X = check_array(X)
+        return self.base_estimator_.predict_proba(X)
 
     @property
     def kernel_(self):
-        if len(self.estimators_) == 1:
-            return self.estimators_[0].kernel_
+        if self.n_classes_ == 2:
+            return self.base_estimator_.kernel_
         else:
-            return CompoundKernel([estimator.kernel_
-                                   for estimator in self.estimators_])
+            return CompoundKernel(
+                [estimator.kernel_
+                 for estimator in self.base_estimator_.estimators_])
 
     def log_marginal_likelihood(self, theta, eval_gradient=False):
         """Returns log-marginal likelihood of theta for training data.
@@ -612,39 +677,31 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
             hyperparameters at position theta.
             Only returned when eval_gradient is True.
         """
+        check_is_fitted(self, ["classes_", "n_classes_"])
+
         theta = np.asarray(theta)
-        if len(self.estimators_) == 1:
-            return self.estimators_[0].log_marginal_likelihood(
+        if self.n_classes_ == 2:
+            return self.base_estimator_.log_marginal_likelihood(
                 theta, eval_gradient)
         else:
             if eval_gradient:
                 raise NotImplementedError(
                     "Gradient of log-marginal-likelhood not implemented for "
                     "multi-class GPC.")
-            n_dims = self.estimators_[0].kernel_.n_dims
+            estimators = self.base_estimator_.estimators_
+            n_dims = estimators[0].kernel_.n_dims
             if theta.shape[0] == n_dims:  # use same theta for all sub-kernels
                 return np.mean(
                     [estimator.log_marginal_likelihood(theta)
-                     for i, estimator in enumerate(self.estimators_)])
+                     for i, estimator in enumerate(estimators)])
             elif theta.shape[0] == n_dims * self.classes_.shape[0]:
                 # theta for compound kernel
                 return np.mean(
                     [estimator.log_marginal_likelihood(
                         theta[n_dims*i:n_dims*(i+1)])
-                     for i, estimator in enumerate(self.estimators_)])
+                     for i, estimator in enumerate(estimators)])
             else:
                 raise ValueError("Shape of theta must be either %d or %d. "
                                  "Obtained theta with shape %d."
                                  % (n_dims, n_dims * self.classes_.shape[0],
                                     theta.shape[0]))
-
-    # Some code checks simply for the existence of the method decision_function
-    # before calling it. However, OneVsRestClassifier has the method but raises
-    # always an Exception because _BinaryGaussianProcessClassifierLaplace
-    # does not implement it. We thus raise an AttributeError since calling the
-    # method would always fail.
-    def __getattribute__(self, name):
-        if name in ['decision_function']:
-            raise AttributeError("decision_function not available for "
-                                 "GaussianProcessClassifier")
-        return super(GaussianProcessClassifier, self).__getattribute__(name)

From f9ff45eebad623fbc17986f2d658901c04486332 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 6 Aug 2015 19:34:34 +0200
Subject: [PATCH 137/165] DOC Updating documentation of multi-class GPC

---
 doc/modules/gaussian_process.rst | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst
index 885e84e723335..1720dab9d5514 100644
--- a/doc/modules/gaussian_process.rst
+++ b/doc/modules/gaussian_process.rst
@@ -255,12 +255,23 @@ If the initial hyperparameters should be kept fixed, `None` can be passed as
 optimizer.
 
 :class:`GaussianProcessClassifier` supports multi-class classification
-by performing one-versus-rest based training and prediction based on
-a binary Gaussian process classifier using the Laplace approximation of the
-posterior for binary classification discussed above. Note that
-:class:`GaussianProcessClassifier` thus does not implement
-a true multi-class Laplace approximation, but is based on several binary
-classification tasks.
+by performing either one-versus-rest or one-versus-one based training and
+prediction.  In one-versus-rest, one binary Gaussian process classifier is
+fitted for each class, which is trained to separate this class from the rest.
+In "one_vs_one", one binary Gaussian process classifier is fitted for each pair
+of classes, which is trained to separate these two classes. The predictions of
+these binary predictors are combined into multi-class predictions. See the
+section on :ref:`multi-class classification <multiclass>` for more details.
+
+In the case of Gaussian process classification, "one_vs_one" might be
+computationally cheaper since it has to solve many problems involving only a
+subset of the whole training set rather than fewer problems on the whole
+dataset. Since Gaussian process classification scales cubically with the size
+of the dataset, this might be considerably faster. However, note that
+"one_vs_one" does not support predicting probability estimates but only plain
+predictions. Moreover, note that :class:`GaussianProcessClassifier` does not
+(yet) implement a true multi-class Laplace approximation, but is based on
+several binary classification tasks.
 
 GPC examples
 ============

From 7f429cf7b26fe95c9910aa489ded06daf41b54c5 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Mon, 10 Aug 2015 17:59:50 +0200
Subject: [PATCH 138/165] REF Several small changes based on @eickenberg's
 suggestions

---
 .../plot_gpr_noisy_targets.py                 |  2 +-
 .../plot_gpr_prior_posterior.py               | 10 +--
 sklearn/gaussian_process/gpc.py               | 76 +++++++++----------
 sklearn/gaussian_process/gpr.py               | 74 +++++++++---------
 sklearn/gaussian_process/kernels.py           |  2 +-
 sklearn/gaussian_process/tests/test_gpc.py    |  2 +-
 sklearn/gaussian_process/tests/test_gpr.py    |  2 +-
 7 files changed, 77 insertions(+), 91 deletions(-)

diff --git a/examples/gaussian_process/plot_gpr_noisy_targets.py b/examples/gaussian_process/plot_gpr_noisy_targets.py
index da336558edaf7..c90c4be229d3b 100644
--- a/examples/gaussian_process/plot_gpr_noisy_targets.py
+++ b/examples/gaussian_process/plot_gpr_noisy_targets.py
@@ -51,7 +51,7 @@ def f(x):
 
 # Instanciate a Gaussian Process model
 kernel = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2))
-gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10)
+gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9)
 
 # Fit to data using Maximum Likelihood Estimation of the parameters
 gp.fit(X, y)
diff --git a/examples/gaussian_process/plot_gpr_prior_posterior.py b/examples/gaussian_process/plot_gpr_prior_posterior.py
index 27528bbdc9822..47f253cdf3ba8 100644
--- a/examples/gaussian_process/plot_gpr_prior_posterior.py
+++ b/examples/gaussian_process/plot_gpr_prior_posterior.py
@@ -38,10 +38,9 @@
     plt.figure(fig_index, figsize=(8, 8))
     plt.subplot(2, 1, 1)
     X_ = np.linspace(0, 5, 100)
-    y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
+    y_mean, y_std = gp.predict(X_[:, np.newaxis], return_std=True)
     plt.plot(X_, y_mean, 'k', lw=3, zorder=9)
-    plt.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)),
-                     y_mean + np.sqrt(np.diag(y_cov)),
+    plt.fill_between(X_, y_mean - y_std, y_mean + y_std,
                      alpha=0.5, color='k')
     y_samples = gp.sample_y(X_[:, np.newaxis], 10)
     plt.plot(X_, y_samples, lw=1)
@@ -58,10 +57,9 @@
     # Plot posterior
     plt.subplot(2, 1, 2)
     X_ = np.linspace(0, 5, 100)
-    y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
+    y_mean, y_std = gp.predict(X_[:, np.newaxis], return_std=True)
     plt.plot(X_, y_mean, 'k', lw=3, zorder=9)
-    plt.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)),
-                     y_mean + np.sqrt(np.diag(y_cov)),
+    plt.fill_between(X_, y_mean - y_std, y_mean + y_std,
                      alpha=0.5, color='k')
 
     y_samples = gp.sample_y(X_[:, np.newaxis], 10)
diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 54b42f09add0a..0efc758e797d7 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -52,11 +52,6 @@ class _BinaryGaussianProcessClassifierLaplace(BaseEstimator):
         passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
         the kernel's hyperparameters are optimized during fitting.
 
-    jitter : float, optional (default: 0.0)
-        Value added to the diagonal of the kernel matrix during fitting.
-        Larger values correspond to increased noise level in the observations
-        and reduce potential numerical issue during fitting.
-
     optimizer : string or callable, optional (default: "fmin_l_bfgs_b")
         Can either be one of the internally supported optimizers for optimizing
         the kernel's parameters, specified by a string, or an externally
@@ -82,13 +77,14 @@ def optimizer(obj_func, initial_theta, bounds):
 
             'fmin_l_bfgs_b'
 
-    n_restarts_optimizer: int, optional (default: 1)
+    n_restarts_optimizer: int, optional (default: 0)
         The number of restarts of the optimizer for finding the kernel's
         parameters which maximize the log-marginal likelihood. The first run
         of the optimizer is performed from the kernel's initial parameters,
         the remaining ones (if any) from thetas sampled log-uniform randomly
-        from the space of allowed theta-values. If greater than 1, all bounds
-        must be finite.
+        from the space of allowed theta-values. If greater than 0, all bounds
+        must be finite. Note that n_restarts_optimizer=0 implies that one
+        run is performed.
 
     max_iter: int, optional (default: 100)
         The maximum number of iterations in Newton's method for approximating
@@ -140,18 +136,16 @@ def optimizer(obj_func, initial_theta, bounds):
         values for the observed labels. Since W is diagonal, only the diagonal
         of sqrt(W) is stored.
     """
-    def __init__(self, kernel=None, jitter=0.0, optimizer="fmin_l_bfgs_b",
-                 n_restarts_optimizer=1, max_iter=100, warm_start=False,
+    def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b",
+                 n_restarts_optimizer=0, max_iter=100, warm_start=False,
                  copy_X_train=False, random_state=None):
         self.kernel = kernel
-        self.jitter = jitter
         self.optimizer = optimizer
         self.n_restarts_optimizer = n_restarts_optimizer
         self.max_iter = max_iter
         self.warm_start = warm_start
         self.copy_X_train = copy_X_train
         self.random_state = random_state
-        self.rng = check_random_state(self.random_state)
 
     def fit(self, X, y):
         """Fit Gaussian process classification model
@@ -174,6 +168,8 @@ def fit(self, X, y):
         else:
             self.kernel_ = clone(self.kernel)
 
+        self.rng = check_random_state(self.random_state)
+
         self.X_train_ = np.copy(X) if self.copy_X_train else X
 
         # Encode class labels and check that it is a binary classification
@@ -203,19 +199,19 @@ def obj_func(theta, eval_gradient=True):
                     return -self.log_marginal_likelihood(theta)
 
             # First optimize starting from theta specified in kernel
-            optima = [(self._constrained_optimization(obj_func,
-                                                      self.kernel_.theta,
-                                                      self.kernel_.bounds))]
+            optima = [self._constrained_optimization(obj_func,
+                                                     self.kernel_.theta,
+                                                     self.kernel_.bounds)]
 
             # Additional runs are performed from log-uniform chosen initial
             # theta
-            if self.n_restarts_optimizer > 1:
+            if self.n_restarts_optimizer > 0:
                 if not np.isfinite(self.kernel_.bounds).all():
                     raise ValueError(
-                        "Multiple optimizer restarts (n_restarts_optimizer>1) "
+                        "Multiple optimizer restarts (n_restarts_optimizer>0) "
                         "requires that all bounds are finite.")
                 bounds = self.kernel_.bounds
-                for iteration in range(1, self.n_restarts_optimizer):
+                for iteration in range(self.n_restarts_optimizer):
                     theta_initial = np.exp(self.rng.uniform(bounds[:, 0],
                                                             bounds[:, 1]))
                     optima.append(
@@ -223,13 +219,12 @@ def obj_func(theta, eval_gradient=True):
                                                        bounds))
             # Select result from run with minimal (negative) log-marginal
             # likelihood
-            self.kernel_.theta = \
-                optima[np.argmin(map(itemgetter(1), optima))][0]
+            lml_values = map(itemgetter(1), optima)
+            self.kernel_.theta = optima[np.argmin(lml_values)][0]
 
         # Precompute quantities required for predictions which are independent
         # of actual query points
         K = self.kernel_(self.X_train_)
-        K[np.diag_indices_from(K)] += self.jitter
 
         _, (self.pi_, self.W_sr_, self.L_, _, _) = \
             self._posterior_mode(K, return_temporaries=True)
@@ -328,8 +323,6 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
         else:
             K = kernel(self.X_train_)
 
-        K[np.diag_indices_from(K)] += self.jitter
-
         # Compute log-marginal-likelihood Z and also store some temporaries
         # which can be reused for computing Z's gradient
         Z, (pi, W_sr, L, b, a) = \
@@ -343,15 +336,17 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
         # XXX: Get rid of the np.diag() in the next line
         R = W_sr[:, np.newaxis] * cho_solve((L, True), np.diag(W_sr))  # Line 7
         C = solve(L, W_sr[:, np.newaxis] * K)  # Line 8
-        # Line 9:
-        s_2 = -0.5*(np.diag(K) - np.diag(C.T.dot(C))) \
+        # Line 9: (use einsum to compute np.diag(C.T.dot(C)))
+        s_2 = -0.5*(np.diag(K) - np.einsum('ij, ij -> j', C, C)) \
             * (pi * (1 - pi) * (1 - 2*pi))  # third derivative
+
         for j in range(d_Z.shape[0]):
             C = K_gradient[:, :, j]   # Line 11
-            s_1 = .5 * a.T.dot(C).dot(a) - .5 * np.trace(R.dot(C))  # Line 12
+            # Line 12: (R.T.ravel().dot(C.ravel()) = np.trace(R.dot(C)))
+            s_1 = .5 * a.T.dot(C).dot(a) - .5 * R.T.ravel().dot(C.ravel())
 
             b = C.dot(self.y_train_ - pi)  # Line 13
-            s_3 = b - K.dot(R).dot(b)  # Line 14
+            s_3 = b - K.dot(R.dot(b))  # Line 14
 
             d_Z[j] = s_1 + s_2.T.dot(s_3)  # Line 15
 
@@ -448,11 +443,6 @@ class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
         passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
         the kernel's hyperparameters are optimized during fitting.
 
-    jitter : float, optional (default: 0.0)
-        Value added to the diagonal of the kernel matrix during fitting.
-        Larger values correspond to increased noise level in the observations
-        and reduce potential numerical issue during fitting.
-
     optimizer : string or callable, optional (default: "fmin_l_bfgs_b")
         Can either be one of the internally supported optimizers for optimizing
         the kernel's parameters, specified by a string, or an externally
@@ -478,13 +468,14 @@ def optimizer(obj_func, initial_theta, bounds):
 
             'fmin_l_bfgs_b'
 
-    n_restarts_optimizer: int, optional (default: 1)
+    n_restarts_optimizer: int, optional (default: 0)
         The number of restarts of the optimizer for finding the kernel's
         parameters which maximize the log-marginal likelihood. The first run
         of the optimizer is performed from the kernel's initial parameters,
         the remaining ones (if any) from thetas sampled log-uniform randomly
-        from the space of allowed theta-values. If greater than 1, all bounds
-        must be finite.
+        from the space of allowed theta-values. If greater than 0, all bounds
+        must be finite. Note that n_restarts_optimizer=0 implies that one
+        run is performed.
 
     max_iter: int, optional (default: 100)
         The maximum number of iterations in Newton's method for approximating
@@ -541,12 +532,11 @@ def optimizer(obj_func, initial_theta, bounds):
     n_classes_ : int
         The number of classes in the training data
     """
-    def __init__(self, kernel=None, jitter=0.0, optimizer="fmin_l_bfgs_b",
-                 n_restarts_optimizer=1, max_iter=100, warm_start=False,
+    def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b",
+                 n_restarts_optimizer=0, max_iter=100, warm_start=False,
                  copy_X_train=False, random_state=None,
                  multi_class="one_vs_rest", n_jobs=1):
         self.kernel = kernel
-        self.jitter = jitter
         self.optimizer = optimizer
         self.n_restarts_optimizer = n_restarts_optimizer
         self.max_iter = max_iter
@@ -556,10 +546,6 @@ def __init__(self, kernel=None, jitter=0.0, optimizer="fmin_l_bfgs_b",
         self.multi_class = multi_class
         self.n_jobs = n_jobs
 
-        self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
-            kernel, jitter, optimizer, n_restarts_optimizer, max_iter,
-            warm_start, copy_X_train, random_state)
-
     def fit(self, X, y):
         """Fit Gaussian process classification model
 
@@ -577,6 +563,11 @@ def fit(self, X, y):
         """
         X, y = check_X_y(X, y, multi_output=False)
 
+        self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
+            self.kernel, self.optimizer, self.n_restarts_optimizer,
+            self.max_iter, self.warm_start, self.copy_X_train,
+            self.random_state)
+
         self.classes_ = np.unique(y)
         self.n_classes_ = self.classes_.size
         if self.n_classes_ == 1:
@@ -597,6 +588,7 @@ def fit(self, X, y):
                                  % self.multi_class)
 
         self.base_estimator_.fit(X, y)
+
         return self
 
     def predict(self, X):
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index f1c84a509a3cd..093a1b23c5ba6 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -50,7 +50,7 @@ class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
         Can either be one of the internally supported optimizers for optimizing
         the kernel's parameters, specified by a string, or an externally
         defined optimizer passed as a callable. If a callable is passed, it
-        must have the  signature::
+        must have the signature::
 
             def optimizer(obj_func, initial_theta, bounds):
                 # * 'obj_func' is the objective function to be maximized, which
@@ -71,13 +71,14 @@ def optimizer(obj_func, initial_theta, bounds):
 
             'fmin_l_bfgs_b'
 
-    n_restarts_optimizer: int, optional (default: 1)
+    n_restarts_optimizer: int, optional (default: 0)
         The number of restarts of the optimizer for finding the kernel's
         parameters which maximize the log-marginal likelihood. The first run
         of the optimizer is performed from the kernel's initial parameters,
         the remaining ones (if any) from thetas sampled log-uniform randomly
-        from the space of allowed theta-values. If greater than 1, all bounds
-        must be finite.
+        from the space of allowed theta-values. If greater than 0, all bounds
+        must be finite. Note that n_restarts_optimizer == 0 implies that one
+        run is performed.
 
     normalize_y: boolean, optional (default: False)
         Whether the target values y are normalized, i.e., the mean of the
@@ -117,7 +118,7 @@ def optimizer(obj_func, initial_theta, bounds):
         Dual coefficients of training data points in kernel space
     """
     def __init__(self, kernel=None, sigma_squared_n=1e-10,
-                 optimizer="fmin_l_bfgs_b", n_restarts_optimizer=1,
+                 optimizer="fmin_l_bfgs_b", n_restarts_optimizer=0,
                  normalize_y=False, copy_X_train=False, random_state=None):
         self.kernel = kernel
         self.sigma_squared_n = sigma_squared_n
@@ -126,7 +127,6 @@ def __init__(self, kernel=None, sigma_squared_n=1e-10,
         self.normalize_y = normalize_y
         self.copy_X_train = copy_X_train
         self.random_state = random_state
-        self.rng = check_random_state(self.random_state)
 
     def fit(self, X, y):
         """Fit Gaussian process regression model
@@ -149,6 +149,8 @@ def fit(self, X, y):
         else:
             self.kernel_ = clone(self.kernel)
 
+        self.rng = check_random_state(self.random_state)
+
         X, y = check_X_y(X, y, multi_output=True)
 
         # Normalize target value
@@ -191,13 +193,13 @@ def obj_func(theta, eval_gradient=True):
 
             # Additional runs are performed from log-uniform chosen initial
             # theta
-            if self.n_restarts_optimizer > 1:
+            if self.n_restarts_optimizer > 0:
                 if not np.isfinite(self.kernel_.bounds).all():
                     raise ValueError(
-                        "Multiple optimizer restarts (n_restarts_optimizer>1) "
+                        "Multiple optimizer restarts (n_restarts_optimizer>0) "
                         "requires that all bounds are finite.")
                 bounds = self.kernel_.bounds
-                for iteration in range(1, self.n_restarts_optimizer):
+                for iteration in range(self.n_restarts_optimizer):
                     theta_initial = \
                         self.rng.uniform(bounds[:, 0], bounds[:, 1])
                     optima.append(
@@ -205,8 +207,8 @@ def obj_func(theta, eval_gradient=True):
                                                        bounds))
             # Select result from run with minimal (negative) log-marginal
             # likelihood
-            self.kernel_.theta = \
-                optima[np.argmin(map(itemgetter(1), optima))][0]
+            lml_values = map(itemgetter(1), optima)
+            self.kernel_.theta = optima[np.argmin(lml_values)][0]
 
         # Precompute quantities required for predictions which are independent
         # of actual query points
@@ -283,11 +285,10 @@ def predict(self, X, return_std=False, return_cov=False):
                 K_inv = L_inv.dot(L_inv.T)
                 # Compute variance of predictive distribution
                 y_var = self.kernel_.diag(X)
-                y_var -= np.sum(K_trans.T[:, np.newaxis] * K_trans.T
-                                * K_inv[:, :, np.newaxis],
-                                axis=0).sum(axis=0)  # axis=(0, 1) in np >= 1.7
+                y_var -= np.einsum("ki,kj,ij->k", K_trans, K_trans, K_inv)
+
                 # Check if any of the variances is negative because of
-                # numerical issues. If yes: set the the variance to 0.
+                # numerical issues. If yes: set the variance to 0.
                 y_var_negative = y_var < 0
                 if np.any(y_var_negative):
                     warnings.warn("Predicted variances smaller than 0. "
@@ -368,33 +369,28 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
             return (-np.inf, np.zeros_like(theta)) \
                 if eval_gradient else -np.inf
 
-        log_likelihood = 0
-        if eval_gradient:
-            log_likelihood_gradient = 0
-
-        # Iterate over output dimensions of self.y_train_
+        # Support multi-dimensional output of self.y_train_
         y_fit = self.y_train_
         if y_fit.ndim == 1:
             y_fit = y_fit[:, np.newaxis]
-        for i in range(y_fit.shape[1]):
-            alpha = cho_solve((L, True), y_fit[:, i])  # Line 3
-
-            # Compute log-likelihood of output dimension (compare line 7)
-            log_likelihood_dim = -0.5 * y_fit[:, i].dot(alpha)
-            log_likelihood_dim -= np.log(np.diag(L)).sum()
-            log_likelihood_dim -= K.shape[0] / 2 * np.log(2 * np.pi)
-
-            log_likelihood += log_likelihood_dim
-
-            if eval_gradient:  # compare Equation 5.9 from GPML
-                tmp = np.outer(alpha, alpha)
-                tmp -= cho_solve((L, True), np.eye(K.shape[0]))
-                # Compute "0.5 * trace(tmp.dot(K_gradient))" without
-                # constructing the full matrix tmp.dot(K_gradient) since only
-                # its diagonal is required
-                log_likelihood_gradient_dim = \
-                    0.5 * np.einsum("ij,ijk->k", tmp, K_gradient)
-                log_likelihood_gradient += log_likelihood_gradient_dim
+
+        alpha = cho_solve((L, True), y_fit)  # Line 3
+
+        # Compute log-likelihood (compare line 7)
+        log_likelihood_dims = -0.5 * np.einsum("ik,ik->k", y_fit, alpha)
+        log_likelihood_dims -= np.log(np.diag(L)).sum()
+        log_likelihood_dims -= K.shape[0] / 2 * np.log(2 * np.pi)
+        log_likelihood = log_likelihood_dims.sum(-1)  # sum over dimensions
+
+        if eval_gradient:  # compare Equation 5.9 from GPML
+            tmp = np.einsum("ik,jk->ijk", alpha, alpha)  # k: output-dimension
+            tmp -= cho_solve((L, True), np.eye(K.shape[0]))[:, :, np.newaxis]
+            # Compute "0.5 * trace(tmp.dot(K_gradient))" without
+            # constructing the full matrix tmp.dot(K_gradient) since only
+            # its diagonal is required
+            log_likelihood_gradient_dims = \
+                0.5 * np.einsum("ijl,ijk->kl", tmp, K_gradient)
+            log_likelihood_gradient = log_likelihood_gradient_dims.sum(-1)
 
         if eval_gradient:
             return log_likelihood, log_likelihood_gradient
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 0e06b99bb67cd..c867304891ca4 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -1488,7 +1488,7 @@ def diag(self, X):
         K_diag : array, shape (n_samples_X,)
             Diagonal of kernel k(X, X)
         """
-        return (X ** 2).sum(1) + self.sigma_0 ** 2
+        return np.einsum('ij,ij->i', X, X) + self.sigma_0 ** 2
 
     def is_stationary(self):
         """Returns whether the kernel is stationary. """
diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py
index 34831e966aa3a..83ceb6ce8199d 100644
--- a/sklearn/gaussian_process/tests/test_gpc.py
+++ b/sklearn/gaussian_process/tests/test_gpc.py
@@ -89,7 +89,7 @@ def test_random_starts():
     kernel = C(1.0, (1e-2, 1e2)) \
         * RBF(l=[1e-3] * n_features, l_bounds=[(1e-4, 1e+2)] * n_features)
     last_lml = -np.inf
-    for n_restarts_optimizer in range(1, 10):
+    for n_restarts_optimizer in range(9):
         gp = GaussianProcessClassifier(
             kernel=kernel, n_restarts_optimizer=n_restarts_optimizer,
             random_state=0).fit(X, y)
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index ed1692cb6ca39..6941937b07b64 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -168,7 +168,7 @@ def test_random_starts():
         * RBF(l=[1.0] * n_features, l_bounds=[(1e-4, 1e+2)] * n_features) \
         + WhiteKernel(c=1e-5, c_bounds=(1e-5, 1e1))
     last_lml = -np.inf
-    for n_restarts_optimizer in range(1, 10):
+    for n_restarts_optimizer in range(9):
         gp = GaussianProcessRegressor(
             kernel=kernel, n_restarts_optimizer=n_restarts_optimizer,
             random_state=0,).fit(X, y)

From 434531f9bd25e9689689674be484afc8e1be3ad8 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 11 Aug 2015 20:09:22 +0200
Subject: [PATCH 139/165] ENH More efficient computation of K_gradient for
 anisotropic Matern kernel

---
 sklearn/gaussian_process/kernels.py | 30 ++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index c867304891ca4..506847ce6b932 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -965,16 +965,19 @@ class RBF(Kernel):
     def __init__(self, l=1.0, l_bounds=(1e-5, 1e5)):
         if np.iterable(l):
             if len(l) > 1:
+                self.anisotropic = True
                 self.l = np.asarray(l, dtype=np.float)
             else:
+                self.anisotropic = False
                 self.l = float(l[0])
         else:
+            self.anisotropic = False
             self.l = float(l)
         self.l_bounds = l_bounds
 
         self.theta_vars = []
         if l_bounds is not "fixed":
-            if np.iterable(l):  # anisotropic l needs special care
+            if self.anisotropic:  # anisotropic l needs special care
                 self.theta_vars.append(("l", len(l)))
             else:
                 self.theta_vars.append("l")
@@ -1006,7 +1009,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             is True.
         """
         X = np.atleast_2d(X)
-        if np.iterable(self.l) and X.shape[1] != self.l.shape[0]:
+        if self.anisotropic and X.shape[1] != self.l.shape[0]:
             raise Exception("Anisotropic kernel must have the same number of "
                             "dimensions as data (%d!=%d)"
                             % (self.l.shape[0], X.shape[1]))
@@ -1027,15 +1030,15 @@ def __call__(self, X, Y=None, eval_gradient=False):
         if eval_gradient:
             if self.l_bounds is "fixed":  # Hyperparameter l kept fixed
                 return K, np.empty((X.shape[0], X.shape[0], 0))
-            elif not np.iterable(self.l) or self.l.shape[0] == 1:
+            elif not self.anisotropic or self.l.shape[0] == 1:
                 K_gradient = \
                     (K * squareform(dists))[:, :, np.newaxis]
                 return K, K_gradient
-            elif self.l.shape[0] == X.shape[1]:
+            elif self.anisotropic:
                 # We need to recompute the pairwise dimension-wise distances
-                D = (X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2 \
+                K_gradient = (X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2 \
                     / (self.l ** 2)
-                K_gradient = K[..., np.newaxis] * D
+                K_gradient *= K[..., np.newaxis]
                 return K, K_gradient
             else:
                 raise Exception("Anisotropic kernels require that the number "
@@ -1044,7 +1047,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             return K
 
     def __repr__(self):
-        if np.iterable(self.l):  # anisotropic
+        if self.anisotropic:
             return "{0}(l=[{1}])".format(self.__class__.__name__,
                                          ", ".join(map("{0:.3g}".format,
                                                    self.l)))
@@ -1119,7 +1122,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             is True.
         """
         X = np.atleast_2d(X)
-        if np.iterable(self.l) and X.shape[1] != self.l.shape[0]:
+        if self.anisotropic and X.shape[1] != self.l.shape[0]:
             raise Exception("Anisotropic kernel must have the same number of "
                             "dimensions as data (%d!=%d)"
                             % (self.l.shape[0], X.shape[1]))
@@ -1159,7 +1162,12 @@ def __call__(self, X, Y=None, eval_gradient=False):
                 return K, K_gradient
 
             # We need to recompute the pairwise dimension-wise distances
-            D = (X[:, np.newaxis, :] - X[np.newaxis, :, :])**2 / (self.l ** 2)
+            if self.anisotropic:
+                D = (X[:, np.newaxis, :] - X[np.newaxis, :, :])**2 \
+                    / (self.l ** 2)
+            else:
+                D = squareform(dists**2)[:, :, np.newaxis]
+
             if self.nu == 0.5:
                 K_gradient = K[..., np.newaxis] * D \
                     / np.sqrt(D.sum(2))[:, :, np.newaxis]
@@ -1176,7 +1184,7 @@ def f(theta):  # helper function
                     return self.clone_with_theta(theta)(X, Y)
                 return K, _approx_fprime(self.theta, f, 1e-10)
 
-            if not np.iterable(self.l) or self.l.shape[0] == 1:
+            if not self.anisotropic:
                 return K, K_gradient[:, :].sum(-1)[:, :, np.newaxis]
             else:
                 return K, K_gradient
@@ -1184,7 +1192,7 @@ def f(theta):  # helper function
             return K
 
     def __repr__(self):
-        if np.iterable(self.l):  # anisotropic
+        if self.anisotropic:
             return "{0}(l=[{1}], nu={2:.3g})".format(
                 self.__class__.__name__,
                 ", ".join(map("{0:.3g}".format, self.l)),

From 628aeb5982a27bbbc08f01b44c9f3724db5b76c0 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Tue, 11 Aug 2015 20:26:37 +0200
Subject: [PATCH 140/165] REF Renaming sigma_squared_n to alpha (consistency
 with Ridge)

---
 examples/gaussian_process/plot_gpr_co2.py     |  4 +--
 examples/gaussian_process/plot_gpr_noisy.py   |  4 +--
 .../plot_gpr_noisy_targets.py                 |  4 +--
 sklearn/gaussian_process/gpr.py               | 25 +++++++++++--------
 sklearn/gaussian_process/tests/test_gpr.py    |  4 +--
 5 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py
index 722fd42ebc5b0..ad2007ac105fc 100644
--- a/examples/gaussian_process/plot_gpr_co2.py
+++ b/examples/gaussian_process/plot_gpr_co2.py
@@ -77,7 +77,7 @@
 k4 = 0.18**2 * RBF(l=0.134) + WhiteKernel(c=0.19**2) # noise terms
 kernel_gpml = k1 + k2 + k3 + k4
 
-gp = GaussianProcessRegressor(kernel=kernel_gpml, sigma_squared_n=0,
+gp = GaussianProcessRegressor(kernel=kernel_gpml, alpha=0,
 							         optimizer=None, normalize_y=True)
 gp.fit(X, y)
 
@@ -94,7 +94,7 @@
 									   c_bounds=(1e-3, np.inf))  # noise terms
 kernel = k1 + k2 + k3 + k4
 
-gp = GaussianProcessRegressor(kernel=kernel, sigma_squared_n=0,
+gp = GaussianProcessRegressor(kernel=kernel, alpha=0,
                               normalize_y=True)
 gp.fit(X, y)
 
diff --git a/examples/gaussian_process/plot_gpr_noisy.py b/examples/gaussian_process/plot_gpr_noisy.py
index 3259fee864e83..31fc7f2527bbb 100644
--- a/examples/gaussian_process/plot_gpr_noisy.py
+++ b/examples/gaussian_process/plot_gpr_noisy.py
@@ -39,7 +39,7 @@
 kernel = 1.0 * RBF(l=100.0, l_bounds=(1e-2, 1e3)) \
   + WhiteKernel(c=1, c_bounds=(1e-10, 1e+1))
 gp = GaussianProcessRegressor(kernel=kernel,
-                              sigma_squared_n=0.0).fit(X, y)
+                              alpha=0.0).fit(X, y)
 X_ = np.linspace(0, 5, 100)
 y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
 plt.plot(X_, y_mean, 'k', lw=3, zorder=9)
@@ -58,7 +58,7 @@
 kernel = 1.0 * RBF(l=1.0, l_bounds=(1e-2, 1e3)) \
   + WhiteKernel(c=1e-5, c_bounds=(1e-10, 1e+1))
 gp = GaussianProcessRegressor(kernel=kernel,
-                              sigma_squared_n=0.0).fit(X, y)
+                              alpha=0.0).fit(X, y)
 X_ = np.linspace(0, 5, 100)
 y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
 plt.plot(X_, y_mean, 'k', lw=3, zorder=9)
diff --git a/examples/gaussian_process/plot_gpr_noisy_targets.py b/examples/gaussian_process/plot_gpr_noisy_targets.py
index c90c4be229d3b..e74ac1253d23d 100644
--- a/examples/gaussian_process/plot_gpr_noisy_targets.py
+++ b/examples/gaussian_process/plot_gpr_noisy_targets.py
@@ -15,7 +15,7 @@
 model as well as its probabilistic nature in the form of a pointwise 95%
 confidence interval.
 
-Note that the parameter ``sigma_squared_n`` is applied as a Tikhonov
+Note that the parameter ``alpha`` is applied as a Tikhonov
 regularization of the assumed covariance between the training points.
 """
 print(__doc__)
@@ -86,7 +86,7 @@ def f(x):
 y += noise
 
 # Instanciate a Gaussian Process model
-gp = GaussianProcessRegressor(kernel=kernel, sigma_squared_n=(dy / y) ** 2,
+gp = GaussianProcessRegressor(kernel=kernel, alpha=(dy / y) ** 2,
                               n_restarts_optimizer=10)
 
 # Fit to data using Maximum Likelihood Estimation of the parameters
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 093a1b23c5ba6..56d93c0b2c65e 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -39,12 +39,15 @@ class GaussianProcessRegressor(BaseEstimator, RegressorMixin):
         passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
         the kernel's hyperparameters are optimized during fitting.
 
-    sigma_squared_n : float or array-like, optional (default: 1e-10)
+    alpha : float or array-like, optional (default: 1e-10)
         Value added to the diagonal of the kernel matrix during fitting.
         Larger values correspond to increased noise level in the observations
         and reduce potential numerical issue during fitting. If an array is
         passed, it must have the same number of entries as the data used for
-        fitting and is used as datapoint-dependent noise level.
+        fitting and is used as datapoint-dependent noise level. Note that this
+        is equivalent to adding a WhiteKernel with c=alpha. Specifying the
+        noise level directly as a parameter is mainly for convenience and
+        for consistency with Ridge.
 
     optimizer : string or callable, optional (default: "fmin_l_bfgs_b")
         Can either be one of the internally supported optimizers for optimizing
@@ -117,11 +120,11 @@ def optimizer(obj_func, initial_theta, bounds):
     alpha_: array-like, shape = (n_samples,)
         Dual coefficients of training data points in kernel space
     """
-    def __init__(self, kernel=None, sigma_squared_n=1e-10,
+    def __init__(self, kernel=None, alpha=1e-10,
                  optimizer="fmin_l_bfgs_b", n_restarts_optimizer=0,
                  normalize_y=False, copy_X_train=False, random_state=None):
         self.kernel = kernel
-        self.sigma_squared_n = sigma_squared_n
+        self.alpha = alpha
         self.optimizer = optimizer
         self.n_restarts_optimizer = n_restarts_optimizer
         self.normalize_y = normalize_y
@@ -161,14 +164,14 @@ def fit(self, X, y):
         else:
             self.y_train_mean = np.zeros(1)
 
-        if np.iterable(self.sigma_squared_n) \
-           and self.sigma_squared_n.shape[0] != y.shape[0]:
-            if self.sigma_squared_n.shape[0] == 1:
-                self.sigma_squared_n = self.sigma_squared_n[0]
+        if np.iterable(self.alpha) \
+           and self.alpha.shape[0] != y.shape[0]:
+            if self.alpha.shape[0] == 1:
+                self.alpha = self.alpha[0]
             else:
                 raise ValueError("sigma_n_squared must be a scalar or an array"
                                  " with same number of entries as y.(%d != %d)"
-                                 % (self.sigma_squared_n.shape[0], y.shape[0]))
+                                 % (self.alpha.shape[0], y.shape[0]))
 
         self.X_train_ = np.copy(X) if self.copy_X_train else X
         self.y_train_ = np.copy(y) if self.copy_X_train else y
@@ -213,7 +216,7 @@ def obj_func(theta, eval_gradient=True):
         # Precompute quantities required for predictions which are independent
         # of actual query points
         K = self.kernel_(self.X_train_)
-        K[np.diag_indices_from(K)] += self.sigma_squared_n
+        K[np.diag_indices_from(K)] += self.alpha
         self.L_ = cholesky(K, lower=True)  # Line 2
         self.alpha_ = cho_solve((self.L_, True), self.y_train_)  # Line 3
 
@@ -362,7 +365,7 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
         else:
             K = kernel(self.X_train_)
 
-        K[np.diag_indices_from(K)] += self.sigma_squared_n
+        K[np.diag_indices_from(K)] += self.alpha
         try:
             L = cholesky(K, lower=True)  # Line 2
         except np.linalg.LinAlgError:
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index 6941937b07b64..a7c9049b89407 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -276,9 +276,9 @@ def test_duplicate_input():
     """ Test GPR can handle two different output-values for the same input. """
     for kernel in kernels:
         gpr_equal_inputs = \
-            GaussianProcessRegressor(kernel=kernel, sigma_squared_n=1e-2)
+            GaussianProcessRegressor(kernel=kernel, alpha=1e-2)
         gpr_similar_inputs = \
-            GaussianProcessRegressor(kernel=kernel, sigma_squared_n=1e-2)
+            GaussianProcessRegressor(kernel=kernel, alpha=1e-2)
 
         X_ = np.vstack((X, X[0]))
         y_ = np.hstack((y, y[0] + 1))

From 05da11e1609bfa6ae99ab5ea0309f84fb0647a1e Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 12 Aug 2015 19:46:22 +0200
Subject: [PATCH 141/165] ENH Full support of (deep) get_params() and
 set_params() in GP kernels

---
 sklearn/gaussian_process/kernels.py | 50 +++++++++++++++++++++++++++--
 1 file changed, 48 insertions(+), 2 deletions(-)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 506847ce6b932..5390f65442a2b 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -58,7 +58,7 @@ def get_params(self, deep=True):
         init = getattr(cls.__init__, 'deprecated_original', cls.__init__)
         args, varargs, kw, default = inspect.getargspec(init)
         if varargs is not None:
-            raise RuntimeError("scikit-learn estimators should always "
+            raise RuntimeError("scikit-learn kernels should always "
                                "specify their parameters in the signature"
                                " of their __init__ (no varargs)."
                                " %s doesn't follow this convention."
@@ -69,6 +69,43 @@ def get_params(self, deep=True):
             params[arg] = getattr(self, arg, None)
         return params
 
+    def set_params(self, **params):
+        """Set the parameters of this kernel.
+
+        The method works on simple kernels as well as on nested kernels.
+        The latter have parameters of the form ``<component>__<parameter>``
+        so that it's possible to update each component of a nested object.
+
+        Returns
+        -------
+        self
+        """
+        if not params:
+            # Simple optimisation to gain speed (inspect is slow)
+            return self
+        valid_params = self.get_params(deep=True)
+        for key, value in six.iteritems(params):
+            split = key.split('__', 1)
+            if len(split) > 1:
+                # nested objects case
+                name, sub_name = split
+                if name not in valid_params:
+                    raise ValueError('Invalid parameter %s for kernel %s. '
+                                     'Check the list of available parameters '
+                                     'with `kernel.get_params().keys()`.' %
+                                     (name, self))
+                sub_object = valid_params[name]
+                sub_object.set_params(**{sub_name: value})
+            else:
+                # simple objects case
+                if key not in valid_params:
+                    raise ValueError('Invalid parameter %s for kernel %s. '
+                                     'Check the list of available parameters '
+                                     'with `kernel.get_params().keys()`.' %
+                                     (key, self.__class__.__name__))
+                setattr(self, key, value)
+        return self
+
     def clone_with_theta(self, theta):
         """Returns a clone of self with given hyperparameters theta. """
         cloned = clone(self)
@@ -155,7 +192,10 @@ def bounds(self):
                 bounds.append(var_bounds)
             else:
                 bounds.append(getattr(self, var_name + "_bounds"))
-        return np.log(np.vstack(bounds))
+        if len(bounds) > 0:
+            return np.log(np.vstack(bounds))
+        else:
+            return np.array([])
 
     @bounds.setter
     def bounds(self, bounds):
@@ -397,6 +437,12 @@ def get_params(self, deep=True):
             Parameter names mapped to their values.
         """
         params = dict(k1=self.k1, k2=self.k2)
+        if deep:
+            deep_items = self.k1.get_params().items()
+            params.update(('k1__' + k, val) for k, val in deep_items)
+            deep_items = self.k2.get_params().items()
+            params.update(('k2__' + k, val) for k, val in deep_items)
+
         return params
 
     @property

From 963e968700ce5707878ed00f4c8d207f572ba5a0 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sat, 15 Aug 2015 12:40:17 +0200
Subject: [PATCH 142/165] DOC Revising GP documentation based on @kastnerkyle
 comments

---
 doc/modules/gaussian_process.rst | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst
index 1720dab9d5514..92fe45705a0a1 100644
--- a/doc/modules/gaussian_process.rst
+++ b/doc/modules/gaussian_process.rst
@@ -17,8 +17,8 @@ The advantages of Gaussian processes are:
       kernels).
 
     - The prediction is probabilistic (Gaussian) so that one can compute
-      empirical confidence intervals and exceedance probabilities that might be
-      used to refit (online fitting, adaptive fitting) the prediction in some
+      empirical confidence intervals and decide based on those if one should
+      refit (online fitting, adaptive fitting) the prediction in some
       region of interest.
 
     - Versatile: different :ref:`kernels
@@ -57,7 +57,7 @@ If the initial hyperparameters should be kept fixed, `None` can be passed as
 optimizer.
 
 The noise level in the targets can be specified by passing it via the
-parameter `sigma_squared_n`, either globally as a scalar or per datapoint.
+parameter `alpha`, either globally as a scalar or per datapoint.
 Note that a moderate noise level can also be helpful for dealing with numeric
 issues during fitting as it is effectively implemented as Tikhonov
 regularization, i.e., by adding it to the diagonal of the kernel matrix. An
@@ -144,7 +144,8 @@ in the kernel and by the regularization parameter alpha of KRR.
 
 The figure shows that both methods learn reasonable models of the target
 function. GPR correctly identifies the periodicity of the function to be
-roughly 2*pi (6.28), while KRR chooses the doubled periodicity 4*pi. Besides
+roughly :math:`2*\pi` (6.28), while KRR chooses the doubled periodicity
+:math:`4*\pi`. Besides
 that, GPR provides reasonable confidence bounds on the prediction which are not
 available for KRR. A major difference between the two methods is the time
 required for fitting and predicting: while fitting KRR is fast in principle,
@@ -270,8 +271,9 @@ dataset. Since Gaussian process classification scales cubically with the size
 of the dataset, this might be considerably faster. However, note that
 "one_vs_one" does not support predicting probability estimates but only plain
 predictions. Moreover, note that :class:`GaussianProcessClassifier` does not
-(yet) implement a true multi-class Laplace approximation, but is based on
-several binary classification tasks.
+(yet) implement a true multi-class Laplace approximation internally, but,
+as discussed above, is based on solving several binary classification tasks
+which are combined using one-versus-rest or one-versus-one.
 
 GPC examples
 ============
@@ -313,8 +315,8 @@ Illustration of GPC on the XOR dataset
 This example illustrates GPC on XOR data. Compared are a stationary, isotropic
 kernel (`RBF`) and a non-stationary kernel (`DotProduct`). On this particular
 dataset, the `DotProduct` kernel obtains considerably better results because the
-class-boundaries are linear and coincide with the coordinate axes. In general,
-stationary kernels often obtain better results.
+class-boundaries are linear and coincide with the coordinate axes. In practice,
+however, stationary kernels such as `RBF` often obtain better results.
 
 .. figure:: ../auto_examples/gaussian_process/images/plot_gpc_xor_001.png
    :target: ../auto_examples/gaussian_process/plot_gpc_xor.html
@@ -328,7 +330,7 @@ Gaussian process classification (GPC) on iris dataset
 
 This example illustrates the predicted probability of GPC for an isotropic
 and anisotropic RBF kernel on a two-dimensional version for the iris-dataset.
-It thus illustrated the applicability of GPC to non-binary classification.
+This illustrates the applicability of GPC to non-binary classification.
 The anisotropic RBF kernel obtains slightly higher log-marginal-likelihood by
 assigning different length-scales to the two feature dimensions.
 

From 8ced3a818731009c4c662a5e92b364fdf4b20fbf Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Mon, 17 Aug 2015 12:53:18 +0200
Subject: [PATCH 143/165] FIX theta_vars correctly supported in KernelOperators

---
 sklearn/gaussian_process/kernels.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 5390f65442a2b..9a59c8c6ccf4d 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -422,6 +422,18 @@ def __init__(self, k1, k2):
         self.k1 = k1
         self.k2 = k2
 
+        self.theta_vars = []
+        for theta_var in self.k1.theta_vars:
+            if isinstance(theta_var, tuple):
+                self.theta_vars.append(("k1__" + theta_var[0], theta_var[1]))
+            else:
+                self.theta_vars.append("k1__" + theta_var)
+        for theta_var in self.k2.theta_vars:
+            if isinstance(theta_var, tuple):
+                self.theta_vars.append(("k2__" + theta_var[0], theta_var[1]))
+            else:
+                self.theta_vars.append("k2__" + theta_var)
+
     def get_params(self, deep=True):
         """Get parameters of this kernel.
 
@@ -675,6 +687,13 @@ def __init__(self, kernel, exponent):
         self.kernel = kernel
         self.exponent = exponent
 
+        self.theta_vars = []
+        for theta_var in self.kernel.theta_vars:
+            if isinstance(theta_var, tuple):
+                self.theta_vars.append(("kernel__" + theta_var[0], theta_var[1]))
+            else:
+                self.theta_vars.append("kernel__" + theta_var)
+
     def get_params(self, deep=True):
         """Get parameters of this kernel.
 
@@ -690,6 +709,9 @@ def get_params(self, deep=True):
             Parameter names mapped to their values.
         """
         params = dict(kernel=self.kernel, exponent=self.exponent)
+        if deep:
+            deep_items = self.kernel.get_params().items()
+            params.update(('kernel__' + k, val) for k, val in deep_items)
         return params
 
     @property

From fec159afaf0770ab7b17b809c9494600e3959f40 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Mon, 17 Aug 2015 12:55:44 +0200
Subject: [PATCH 144/165] TST Adding two unittests for testing GP kernels

test_kernel_versus_pairwise() checks that GP kernels can also be used as pairwise kernels.
test_set_get_params() checks that set_params()/get_params() is consistent with kernel.theta.
---
 .../gaussian_process/tests/test_kernels.py    | 57 ++++++++++++++++++-
 1 file changed, 54 insertions(+), 3 deletions(-)

diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index 21512f9c74161..43f6332449431 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -11,7 +11,7 @@
 from scipy.optimize import approx_fprime
 
 from sklearn.metrics.pairwise \
-    import PAIRWISE_KERNEL_FUNCTIONS, euclidean_distances
+    import PAIRWISE_KERNEL_FUNCTIONS, euclidean_distances, pairwise_kernels
 from sklearn.gaussian_process.kernels \
     import (RBF, Matern, RationalQuadratic, ExpSineSquared, DotProduct,
             ConstantKernel, WhiteKernel, PairwiseKernel, KernelOperator,
@@ -24,6 +24,7 @@
 
 
 X = np.random.RandomState(0).normal(0, 1, (10, 2))
+Y = np.random.RandomState(0).normal(0, 1, (11, 2))
 
 kernel_white = RBF(l=2.0) + WhiteKernel(c=3.0)
 kernels = [RBF(l=2.0), RBF(l_bounds=(0.5, 2.0)),
@@ -126,7 +127,7 @@ def test_auto_vs_cross():
     """ Auto-correlation and cross-correlation should be consistent. """
     for kernel in kernels:
         if kernel == kernel_white:
-            continue  # Identity does is not satisfied on diagonal
+            continue  # Identity is not satisfied on diagonal
         K_auto = kernel(X)
         K_cross = kernel(X, X)
         assert_almost_equal(K_auto, K_cross, 5)
@@ -192,7 +193,8 @@ def test_kernel_clone():
             attr_value = getattr(kernel, attr)
             attr_value_cloned = getattr(kernel_cloned, attr)
             if np.iterable(attr_value):
-                assert_array_equal(attr_value, attr_value_cloned)
+                for i in range(len(attr_value)):
+                    assert_array_equal(attr_value[i], attr_value_cloned[i])
             else:
                 assert_equal(attr_value, attr_value_cloned)
             if not isinstance(attr_value, Hashable):
@@ -217,3 +219,52 @@ def test_matern_kernel():
         K1 = Matern(nu=nu, l=1.0)(X)
         K2 = Matern(nu=nu + tiny, l=1.0)(X)
         assert_array_almost_equal(K1, K2)
+
+
+def test_kernel_versus_pairwise():
+    """Check that GP kernels can also be used as pairwise kernels."""
+    for kernel in kernels:
+        # Test auto-kernel
+        if kernel != kernel_white:
+            # For WhiteKernel: k(X) != k(X,X). This is assumed by
+            # pairwise_kernels
+            K1 = kernel(X)
+            K2 = pairwise_kernels(X, metric=kernel)
+            assert_array_almost_equal(K1, K2)
+
+        # Test cross-kernel
+        K1 = kernel(X, Y)
+        K2 = pairwise_kernels(X, Y, metric=kernel)
+        assert_array_almost_equal(K1, K2)
+
+
+def test_set_get_params():
+    """Check that set_params()/get_params() is consistent with kernel.theta."""
+    for kernel in kernels:
+        # Test get_params()
+        index = 0
+        params = kernel.get_params()
+        for theta_var in kernel.theta_vars:
+            if isinstance(theta_var, tuple):  # anisotropic kernels
+                theta_var, size = theta_var
+                assert_almost_equal(np.exp(kernel.theta[index:index+size]),
+                                    params[theta_var])
+                index += size
+            else:
+                assert_almost_equal(np.exp(kernel.theta[index]),
+                                    params[theta_var])
+                index += 1
+        # Test set_params()
+        index = 0
+        value = 10  # arbitrary value
+        for theta_var in kernel.theta_vars:
+            if isinstance(theta_var, tuple):  # anisotropic kernels
+                theta_var, size = theta_var
+                kernel.set_params(**{theta_var: [value]*size})
+                assert_almost_equal(np.exp(kernel.theta[index:index+size]),
+                                    [value]*size)
+                index += size
+            else:
+                kernel.set_params(**{theta_var: value})
+                assert_almost_equal(np.exp(kernel.theta[index]), value)
+                index += 1

From a7dfc3dcfad82a378b46c3893494f6ad74612a79 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Mon, 17 Aug 2015 20:30:37 +0200
Subject: [PATCH 145/165] REF Removing theta_vars attribute from kernels and
 using Hyperparameter instances instead

---
 sklearn/gaussian_process/kernels.py           | 203 +++++++++++-------
 .../gaussian_process/tests/test_kernels.py    |  44 ++--
 2 files changed, 156 insertions(+), 91 deletions(-)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 9a59c8c6ccf4d..f87307a20d350 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -21,6 +21,7 @@
 
 from abc import ABCMeta, abstractmethod
 from functools import partial
+from collections import namedtuple
 import inspect
 import math
 
@@ -33,6 +34,36 @@
 from ..base import clone
 
 
+class Hyperparameter(namedtuple(
+     'Hyperparameter',
+     ('name', 'value_type', 'bounds', 'n_elements', 'fixed'))):
+    # A raw namedtuple is very memory efficient as it packs the attributes
+    # in a struct to get rid of the __dict__ of attributes; in particular, it
+    # does not copy the string for the keys on each instance.
+    # By deriving a namedtuple class just to introduce the __new__ method we
+    # would also reintroduce the __dict__ on the instance. We avoid this by
+    # telling the Python interpreter that this subclass uses static __slots__
+    # instead of dynamic attributes. Furthermore, we don't need any additional
+    # slot in the subclass, so we set __slots__ to the empty tuple.
+    __slots__ = ()
+
+    def __new__(cls, name, value_type, bounds, n_elements=1, fixed=None):
+        if bounds is not "fixed":
+            bounds = np.atleast_2d(bounds)
+            if n_elements > 1:  # vector-valued parameter
+                if bounds.shape[0] == 1:
+                    bounds = np.repeat(bounds, n_elements, 0)
+                elif bounds.shape[0] != n_elements:
+                    raise ValueError("Bounds on %s should have either 1 or "
+                                     "%d dimensions. Given are %d"
+                                     % (name, n_elements, bounds.shape[0]))
+
+        if fixed is None:
+             fixed = bounds is "fixed"
+        return super(Hyperparameter, cls).__new__(
+            cls, name, value_type, bounds, n_elements, fixed)
+
+
 class Kernel(six.with_metaclass(ABCMeta)):
     """Base class for all kernels."""
 
@@ -117,6 +148,15 @@ def n_dims(self):
         """Returns the number of non-fixed hyperparameters of the kernel."""
         return self.theta.shape[0]
 
+    @property
+    def hyperparameters(self):
+        """Returns a list of all hyperparameter."""
+        r = []
+        for attr, value in self.__dict__.items():
+            if attr.startswith("hyperparameter_"):
+                r.append(value)
+        return r
+
     @property
     def theta(self):
         """Returns the (flattened, log-transformed) non-fixed hyperparameters.
@@ -132,11 +172,9 @@ def theta(self):
             The non-fixed, log-transformed hyperparameters of the kernel
         """
         theta = []
-        for var_name in self.theta_vars:
-            if not isinstance(var_name, six.string_types):
-                # vector-valued parameter
-                var_name, _ = var_name
-            theta.append(getattr(self, var_name))
+        for hyperparameter in self.hyperparameters:
+            if not hyperparameter.fixed:
+                theta.append(getattr(self, hyperparameter.name))
         if len(theta) > 0:
             return np.log(np.hstack(theta))
         else:
@@ -152,14 +190,16 @@ def theta(self, theta):
             The non-fixed, log-transformed hyperparameters of the kernel
         """
         i = 0
-        for var_name in self.theta_vars:
-            if not isinstance(var_name, six.string_types):
+        for hyperparameter in self.hyperparameters:
+            if hyperparameter.fixed:
+                continue
+            if hyperparameter.n_elements > 1:
                 # vector-valued parameter
-                var_name, var_length = var_name
-                setattr(self, var_name, np.exp(theta[i:i + var_length]))
-                i += var_length
+                setattr(self, hyperparameter.name,
+                        np.exp(theta[i:i + hyperparameter.n_elements]))
+                i += hyperparameter.n_elements
             else:
-                setattr(self, var_name, np.exp(theta[i]))
+                setattr(self, hyperparameter.name, np.exp(theta[i]))
                 i += 1
 
         if i != len(theta):
@@ -177,21 +217,9 @@ def bounds(self):
             The bounds on the kernel's hyperparameters theta
         """
         bounds = []
-        for var_name in self.theta_vars:
-            if not isinstance(var_name, six.string_types):
-                # vector-valued parameter
-                var_name, var_length = var_name
-                var_bounds = np.atleast_2d(getattr(self, var_name + "_bounds"))
-                if var_bounds.shape[0] == 1:
-                    var_bounds = np.repeat(var_bounds, var_length, 0)
-                elif var_bounds.shape[0] != var_length:
-                    raise ValueError("Bounds on %s should have either 1 or "
-                                     "%d dimensions. Given are %d"
-                                     % (var_name, var_length,
-                                        var_bounds.shape[0]))
-                bounds.append(var_bounds)
-            else:
-                bounds.append(getattr(self, var_name + "_bounds"))
+        for hyperparameter in self.hyperparameters:
+            if not hyperparameter.fixed:
+                bounds.append(hyperparameter.bounds)
         if len(bounds) > 0:
             return np.log(np.vstack(bounds))
         else:
@@ -206,16 +234,24 @@ def bounds(self, bounds):
         bounds : array, shape (n_dims, 2)
             The bounds on the kernel's hyperparameters theta
         """
+        bounds_exp = np.exp(bounds)
         i = 0
-        for var_name in self.theta_vars:
-            if not isinstance(var_name, six.string_types):
-                # vector-valued parameter
-                var_name, var_length = var_name
-                setattr(self, var_name + "_bounds",
-                        np.exp(bounds[i:i + var_length]))
+        for hyperparameter in self.hyperparameters:
+            if hyperparameter.n_elements > 1:  # vector-valued parameter
+                setattr(self, "hyperparameter_" + hyperparameter.name,
+                        Hyperparameter(
+                            hyperparameter.name, hyperparameter.value_type,
+                            bounds_exp[i:i + hyperparameter.n_elements],
+                            hyperparameter.n_elements))
+                setattr(self, hyperparameter.name + "_bounds",
+                        bounds_exp[i:i + hyperparameter.n_elements])
                 i += var_length
             else:
-                setattr(self, var_name + "_bounds", np.exp(bounds[i]))
+                setattr(self, "hyperparameter_" + hyperparameter.name,
+                        Hyperparameter(hyperparameter.name,
+                                       hyperparameter.value_type,
+                                       bounds_exp[i]))
+                setattr(self, hyperparameter.name + "_bounds", bounds_exp[i])
                 i += 1
 
         if i != len(bounds):
@@ -422,18 +458,6 @@ def __init__(self, k1, k2):
         self.k1 = k1
         self.k2 = k2
 
-        self.theta_vars = []
-        for theta_var in self.k1.theta_vars:
-            if isinstance(theta_var, tuple):
-                self.theta_vars.append(("k1__" + theta_var[0], theta_var[1]))
-            else:
-                self.theta_vars.append("k1__" + theta_var)
-        for theta_var in self.k2.theta_vars:
-            if isinstance(theta_var, tuple):
-                self.theta_vars.append(("k2__" + theta_var[0], theta_var[1]))
-            else:
-                self.theta_vars.append("k2__" + theta_var)
-
     def get_params(self, deep=True):
         """Get parameters of this kernel.
 
@@ -457,6 +481,22 @@ def get_params(self, deep=True):
 
         return params
 
+    @property
+    def hyperparameters(self):
+        """Returns a list of all hyperparameter."""
+        r = []
+        for hyperparameter in self.k1.hyperparameters:
+            r.append(Hyperparameter("k1__" + hyperparameter.name,
+                                    hyperparameter.value_type,
+                                    hyperparameter.bounds,
+                                    hyperparameter.n_elements))
+        for hyperparameter in self.k2.hyperparameters:
+            r.append(Hyperparameter("k2__" + hyperparameter.name,
+                                    hyperparameter.value_type,
+                                    hyperparameter.bounds,
+                                    hyperparameter.n_elements))
+        return r
+
     @property
     def theta(self):
         """Returns the (flattened, log-transformed) non-fixed hyperparameters.
@@ -687,13 +727,6 @@ def __init__(self, kernel, exponent):
         self.kernel = kernel
         self.exponent = exponent
 
-        self.theta_vars = []
-        for theta_var in self.kernel.theta_vars:
-            if isinstance(theta_var, tuple):
-                self.theta_vars.append(("kernel__" + theta_var[0], theta_var[1]))
-            else:
-                self.theta_vars.append("kernel__" + theta_var)
-
     def get_params(self, deep=True):
         """Get parameters of this kernel.
 
@@ -714,6 +747,17 @@ def get_params(self, deep=True):
             params.update(('kernel__' + k, val) for k, val in deep_items)
         return params
 
+    @property
+    def hyperparameters(self):
+        """Returns a list of all hyperparameter."""
+        r = []
+        for hyperparameter in self.kernel.hyperparameters:
+            r.append(Hyperparameter("kernel__" + hyperparameter.name,
+                                    hyperparameter.value_type,
+                                    hyperparameter.bounds,
+                                    hyperparameter.n_elements))
+        return r
+
     @property
     def theta(self):
         """Returns the (flattened, log-transformed) non-fixed hyperparameters.
@@ -851,7 +895,7 @@ def __init__(self, c=1.0, c_bounds=(1e-5, 1e5)):
         self.c = c
         self.c_bounds = c_bounds
 
-        self.theta_vars = ["c"] if c_bounds is not "fixed" else []
+        self.hyperparameter_c = Hyperparameter("c", "numeric", c_bounds)
 
     def __call__(self, X, Y=None, eval_gradient=False):
         """Return the kernel k(X, Y) and optionally its gradient.
@@ -887,7 +931,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
 
         K = self.c * np.ones((X.shape[0], Y.shape[0]))
         if eval_gradient:
-            if self.c_bounds is not "fixed":
+            if not self.hyperparameter_c.fixed:
                 return K, self.c * np.ones((X.shape[0], X.shape[0], 1))
             else:
                 return K, np.empty((X.shape[0], X.shape[0], 0))
@@ -938,7 +982,7 @@ def __init__(self, c=1.0, c_bounds=(1e-5, 1e5)):
         self.c = c
         self.c_bounds = c_bounds
 
-        self.theta_vars = ["c"] if c_bounds is not "fixed" else []
+        self.hyperparameter_c = Hyperparameter("c", "numeric", c_bounds)
 
     def __call__(self, X, Y=None, eval_gradient=False):
         """Return the kernel k(X, Y) and optionally its gradient.
@@ -973,7 +1017,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
         if Y is None:
             K = self.c * np.eye(X.shape[0])
             if eval_gradient:
-                if self.c_bounds is not "fixed":
+                if not self.hyperparameter_c.fixed:
                     return K, self.c * np.eye(X.shape[0])[:, :, np.newaxis]
                 else:
                     return K, np.empty((X.shape[0], X.shape[0], 0))
@@ -1043,12 +1087,12 @@ def __init__(self, l=1.0, l_bounds=(1e-5, 1e5)):
             self.l = float(l)
         self.l_bounds = l_bounds
 
-        self.theta_vars = []
-        if l_bounds is not "fixed":
-            if self.anisotropic:  # anisotropic l needs special care
-                self.theta_vars.append(("l", len(l)))
-            else:
-                self.theta_vars.append("l")
+        if self.anisotropic:  # anisotropic l
+            self.hyperparameter_l = \
+                Hyperparameter("l", "numeric", l_bounds, len(l))
+        else:
+            self.hyperparameter_l = \
+                Hyperparameter("l", "numeric", l_bounds)
 
     def __call__(self, X, Y=None, eval_gradient=False):
         """Return the kernel k(X, Y) and optionally its gradient.
@@ -1096,7 +1140,8 @@ def __call__(self, X, Y=None, eval_gradient=False):
             K = np.exp(-.5 * dists)
 
         if eval_gradient:
-            if self.l_bounds is "fixed":  # Hyperparameter l kept fixed
+            if self.hyperparameter_l.fixed:
+                # Hyperparameter l kept fixed
                 return K, np.empty((X.shape[0], X.shape[0], 0))
             elif not self.anisotropic or self.l.shape[0] == 1:
                 K_gradient = \
@@ -1225,7 +1270,8 @@ def __call__(self, X, Y=None, eval_gradient=False):
             np.fill_diagonal(K, 1)
 
         if eval_gradient:
-            if self.l_bounds is "fixed":  # Hyperparameter l kept fixed
+            if self.hyperparameter_l.fixed:
+                # Hyperparameter l kept fixed
                 K_gradient = np.empty((X.shape[0], X.shape[0], 0))
                 return K, K_gradient
 
@@ -1302,9 +1348,9 @@ def __init__(self, l=1.0, alpha=1.0, l_bounds=(1e-5, 1e5),
         self.l_bounds = l_bounds
         self.alpha_bounds = alpha_bounds
 
-        self.theta_vars = ["l"] if self.l_bounds is not "fixed" else []
-        if self.alpha_bounds is not "fixed":
-            self.theta_vars += ["alpha"]
+        self.hyperparameter_l = Hyperparameter("l", "numeric", l_bounds)
+        self.hyperparameter_alpha = \
+             Hyperparameter("alpha", "numeric", alpha_bounds)
 
     def __call__(self, X, Y=None, eval_gradient=False):
         """Return the kernel k(X, Y) and optionally its gradient.
@@ -1348,14 +1394,14 @@ def __call__(self, X, Y=None, eval_gradient=False):
 
         if eval_gradient:
             # gradient with respect to l
-            if "l" in self.theta_vars:
+            if not self.hyperparameter_l.fixed:
                 l_gradient = dists * K / (self.l ** 2 * base)
                 l_gradient = l_gradient[:, :, np.newaxis]
             else:  # l is kept fixed
                 l_gradient = np.empty((K.shape[0], K.shape[1], 0))
 
             # gradient with respect to alpha
-            if "alpha" in self.theta_vars:
+            if not self.hyperparameter_alpha.fixed:
                 alpha_gradient = \
                     K * (-self.alpha * np.log(base)
                          + dists / (2 * self.l ** 2 * base))
@@ -1403,9 +1449,8 @@ def __init__(self, l=1.0, p=1.0, l_bounds=(1e-5, 1e5),
         self.l_bounds = l_bounds
         self.p_bounds = p_bounds
 
-        self.theta_vars = ["l"] if l_bounds is not "fixed" else []
-        if self.p_bounds is not "fixed":
-            self.theta_vars += ["p"]
+        self.hyperparameter_l = Hyperparameter("l", "numeric", l_bounds)
+        self.hyperparameter_p = Hyperparameter("p", "numeric", p_bounds)
 
     def __call__(self, X, Y=None, eval_gradient=False):
         """Return the kernel k(X, Y) and optionally its gradient.
@@ -1449,13 +1494,13 @@ def __call__(self, X, Y=None, eval_gradient=False):
         if eval_gradient:
             cos_of_arg = np.cos(arg)
             # gradient with respect to l
-            if "l" in self.theta_vars:
+            if not self.hyperparameter_l.fixed:
                 l_gradient = 4 / self.l**2 * sin_of_arg**2 * K
                 l_gradient = l_gradient[:, :, np.newaxis]
             else:  # l is kept fixed
                 l_gradient = np.empty((K.shape[0], K.shape[1], 0))
             # gradient with respect to p
-            if "p" in self.theta_vars:
+            if not self.hyperparameter_p.fixed:
                 p_gradient = 4 * arg / self.l**2 * cos_of_arg \
                     * sin_of_arg * K
                 p_gradient = p_gradient[:, :, np.newaxis]
@@ -1500,7 +1545,8 @@ def __init__(self, sigma_0=1.0, sigma_0_bounds=(1e-5, 1e5)):
         self.sigma_0 = sigma_0
         self.sigma_0_bounds = sigma_0_bounds
 
-        self.theta_vars = ["sigma_0"] if sigma_0_bounds is not "fixed" else []
+        self.hyperparameter_sigma_0 = \
+            Hyperparameter("sigma_0", "numeric", sigma_0_bounds)
 
     def __call__(self, X, Y=None, eval_gradient=False):
         """Return the kernel k(X, Y) and optionally its gradient.
@@ -1538,7 +1584,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             K = np.inner(X, Y) + self.sigma_0 ** 2
 
         if eval_gradient:
-            if self.sigma_0_bounds is not "fixed":
+            if not self.hyperparameter_sigma_0.fixed:
                 K_gradient = np.empty((K.shape[0], K.shape[1], 1))
                 K_gradient[..., 0] = 2 * self.sigma_0 ** 2
                 return K, K_gradient
@@ -1628,7 +1674,8 @@ def __init__(self, gamma=1.0, gamma_bounds=(1e-5, 1e5), metric="linear",
         self.gamma = gamma
         self.gamma_bounds = gamma_bounds
 
-        self.theta_vars = ["gamma"] if gamma_bounds is not "fixed" else []
+        self.hyperparameter_gamma = \
+            Hyperparameter("gamma", "numeric", gamma_bounds)
 
         self.metric = metric
         if pairwise_kernels_kwargs is not None:
@@ -1667,7 +1714,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
                              filter_params=True,
                              **self.pairwise_kernels_kwargs)
         if eval_gradient:
-            if self.gamma_bounds is "fixed":
+            if self.hyperparameter_gamma.fixed:
                 return K, np.empty((X.shape[0], X.shape[0], 0))
             else:
                 # approximate gradient numerically
diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index 43f6332449431..ea637a544596d 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -85,7 +85,9 @@ def test_kernel_theta():
             inspect.getargspec(kernel.__class__.__init__)
         theta_vars = map(lambda s: s.rstrip("_bounds"),
                          filter(lambda s: s.endswith("_bounds"), args))
-        assert_equal(kernel.theta_vars, list(theta_vars))
+        assert_equal(
+            [hyperparameter.name for hyperparameter in kernel.hyperparameters],
+            list(theta_vars))
 
         # Check that values returned in theta are consistent with
         # hyperparameter values (being their logarithms)
@@ -192,9 +194,21 @@ def test_kernel_clone():
         for attr in kernel.__dict__.keys():
             attr_value = getattr(kernel, attr)
             attr_value_cloned = getattr(kernel_cloned, attr)
-            if np.iterable(attr_value):
+            if attr.startswith("hyperparameter_"):
+                assert_equal(attr_value.name, attr_value_cloned.name)
+                assert_equal(attr_value.value_type,
+                             attr_value_cloned.value_type)
+                assert_array_equal(attr_value.bounds,
+                                   attr_value_cloned.bounds)
+                assert_equal(attr_value.n_elements,
+                             attr_value_cloned.n_elements)
+            elif np.iterable(attr_value):
                 for i in range(len(attr_value)):
-                    assert_array_equal(attr_value[i], attr_value_cloned[i])
+                    if np.iterable(attr_value[i]):
+                        assert_array_equal(attr_value[i],
+                                           attr_value_cloned[i])
+                    else:
+                        assert_equal(attr_value[i], attr_value_cloned[i])
             else:
                 assert_equal(attr_value, attr_value_cloned)
             if not isinstance(attr_value, Hashable):
@@ -244,27 +258,31 @@ def test_set_get_params():
         # Test get_params()
         index = 0
         params = kernel.get_params()
-        for theta_var in kernel.theta_vars:
-            if isinstance(theta_var, tuple):  # anisotropic kernels
-                theta_var, size = theta_var
+        for hyperparameter in kernel.hyperparameters:
+            if hyperparameter.bounds is "fixed":
+                continue
+            size = hyperparameter.n_elements
+            if size > 1:  # anisotropic kernels
                 assert_almost_equal(np.exp(kernel.theta[index:index+size]),
-                                    params[theta_var])
+                                    params[hyperparameter.name])
                 index += size
             else:
                 assert_almost_equal(np.exp(kernel.theta[index]),
-                                    params[theta_var])
+                                    params[hyperparameter.name])
                 index += 1
         # Test set_params()
         index = 0
         value = 10  # arbitrary value
-        for theta_var in kernel.theta_vars:
-            if isinstance(theta_var, tuple):  # anisotropic kernels
-                theta_var, size = theta_var
-                kernel.set_params(**{theta_var: [value]*size})
+        for hyperparameter in kernel.hyperparameters:
+            if hyperparameter.bounds is "fixed":
+                continue
+            size = hyperparameter.n_elements
+            if size > 1:  # anisotropic kernels
+                kernel.set_params(**{hyperparameter.name: [value]*size})
                 assert_almost_equal(np.exp(kernel.theta[index:index+size]),
                                     [value]*size)
                 index += size
             else:
-                kernel.set_params(**{theta_var: value})
+                kernel.set_params(**{hyperparameter.name: value})
                 assert_almost_equal(np.exp(kernel.theta[index]), value)
                 index += 1

From 96ad112379a614264df878cef12ae0cec2913f41 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 19 Aug 2015 10:39:14 +0200
Subject: [PATCH 146/165] DEL Removing bounds.setter in kernels.py

---
 sklearn/gaussian_process/kernels.py | 71 -----------------------------
 1 file changed, 71 deletions(-)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index f87307a20d350..13608c66e4039 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -225,40 +225,6 @@ def bounds(self):
         else:
             return np.array([])
 
-    @bounds.setter
-    def bounds(self, bounds):
-        """Sets the bounds on the kernel's hyperparameters theta.
-
-        Parameters
-        ----------
-        bounds : array, shape (n_dims, 2)
-            The bounds on the kernel's hyperparameters theta
-        """
-        bounds_exp = np.exp(bounds)
-        i = 0
-        for hyperparameter in self.hyperparameters:
-            if hyperparameter.n_elements > 1:  # vector-valued parameter
-                setattr(self, "hyperparameter_" + hyperparameter.name,
-                        Hyperparameter(
-                            hyperparameter.name, hyperparameter.value_type,
-                            bounds_exp[i:i + hyperparameter.n_elements],
-                            hyperparameter.n_elements))
-                setattr(self, hyperparameter.name + "_bounds",
-                        bounds_exp[i:i + hyperparameter.n_elements])
-                i += var_length
-            else:
-                setattr(self, "hyperparameter_" + hyperparameter.name,
-                        Hyperparameter(hyperparameter.name,
-                                       hyperparameter.value_type,
-                                       bounds_exp[i]))
-                setattr(self, hyperparameter.name + "_bounds", bounds_exp[i])
-                i += 1
-
-        if i != len(bounds):
-            raise ValueError("bounds has not the correct number of entries."
-                             " Should be %d; given are %d"
-                             % (i, len(bounds)))
-
     def __add__(self, b):
         if not isinstance(b, Kernel):
             return Sum(self, ConstantKernel(b))
@@ -386,19 +352,6 @@ def bounds(self):
         """
         return np.vstack([kernel.bounds for kernel in self.kernels])
 
-    @bounds.setter
-    def bounds(self, bounds):
-        """Sets the bounds on the kernel's hyperparameters theta.
-
-        Parameters
-        ----------
-        bounds : array, shape (n_dims, 2)
-            The bounds on the kernel's hyperparameters theta
-        """
-        k1_dims = self.k1.n_dims
-        for i, kernel in enumerate(self.kernels):
-            kernel.bounds = bounds[i*k_dims:(i+1)*k_dims]
-
     def __call__(self, X, Y=None, eval_gradient=False):
         """Return the kernel k(X, Y) and optionally its gradient.
 
@@ -541,19 +494,6 @@ def bounds(self):
             return self.k1.bounds
         return np.vstack((self.k1.bounds, self.k2.bounds))
 
-    @bounds.setter
-    def bounds(self, bounds):
-        """Sets the bounds on the kernel's hyperparameters theta.
-
-        Parameters
-        ----------
-        bounds : array, shape (n_dims, 2)
-            The bounds on the kernel's hyperparameters theta
-        """
-        k1_dims = self.k1.n_dims
-        self.k1.bounds = bounds[:k1_dims]
-        self.k2.bounds = bounds[k1_dims:]
-
     def __eq__(self, b):
         if type(self) != type(b):
             return False
@@ -796,17 +736,6 @@ def bounds(self):
         """
         return self.kernel.bounds
 
-    @bounds.setter
-    def bounds(self, bounds):
-        """Sets the bounds on the kernel's hyperparameters theta.
-
-        Parameters
-        ----------
-        bounds : array, shape (n_dims, 2)
-            The bounds on the kernel's hyperparameters theta
-        """
-        self.kernel.bounds = bounds
-
     def __eq__(self, b):
         if type(self) != type(b):
             return False

From 0581e369cdd3682b4690fa98094ce4a6e380a2f3 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 19 Aug 2015 11:02:49 +0200
Subject: [PATCH 147/165] REF Renaming max_iter to max_iter_predict in
 GaussianProcessClassifier

---
 sklearn/gaussian_process/gpc.py | 18 +++++++++---------
 sklearn/tests/test_common.py    |  4 ----
 2 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 0efc758e797d7..da1d0c613ee70 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -86,7 +86,7 @@ def optimizer(obj_func, initial_theta, bounds):
         must be finite. Note that n_restarts_optimizer=0 implies that one
         run is performed.
 
-    max_iter: int, optional (default: 100)
+    max_iter_predict: int, optional (default: 100)
         The maximum number of iterations in Newton's method for approximating
         the posterior during predict. Smaller values will reduce computation
         time at the cost of worse results.
@@ -137,12 +137,12 @@ def optimizer(obj_func, initial_theta, bounds):
         of sqrt(W) is stored.
     """
     def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b",
-                 n_restarts_optimizer=0, max_iter=100, warm_start=False,
+                 n_restarts_optimizer=0, max_iter_predict=100, warm_start=False,
                  copy_X_train=False, random_state=None):
         self.kernel = kernel
         self.optimizer = optimizer
         self.n_restarts_optimizer = n_restarts_optimizer
-        self.max_iter = max_iter
+        self.max_iter_predict = max_iter_predict
         self.warm_start = warm_start
         self.copy_X_train = copy_X_train
         self.random_state = random_state
@@ -371,7 +371,7 @@ def _posterior_mode(self, K, return_temporaries=False):
 
         # Use Newton's iteration method to find mode of Laplace approximation
         log_marginal_likelihood = -np.inf
-        for _ in range(self.max_iter):
+        for _ in range(self.max_iter_predict):
             # Line 4
             pi = 1 / (1 + np.exp(-f))
             W = pi * (1 - pi)
@@ -477,7 +477,7 @@ def optimizer(obj_func, initial_theta, bounds):
         must be finite. Note that n_restarts_optimizer=0 implies that one
         run is performed.
 
-    max_iter: int, optional (default: 100)
+    max_iter_predict: int, optional (default: 100)
         The maximum number of iterations in Newton's method for approximating
         the posterior during predict. Smaller values will reduce computation
         time at the cost of worse results.
@@ -533,13 +533,13 @@ def optimizer(obj_func, initial_theta, bounds):
         The number of classes in the training data
     """
     def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b",
-                 n_restarts_optimizer=0, max_iter=100, warm_start=False,
-                 copy_X_train=False, random_state=None,
+                 n_restarts_optimizer=0, max_iter_predict=100,
+                 warm_start=False, copy_X_train=False, random_state=None,
                  multi_class="one_vs_rest", n_jobs=1):
         self.kernel = kernel
         self.optimizer = optimizer
         self.n_restarts_optimizer = n_restarts_optimizer
-        self.max_iter = max_iter
+        self.max_iter_predict = max_iter_predict
         self.warm_start = warm_start
         self.copy_X_train = copy_X_train
         self.random_state = random_state
@@ -565,7 +565,7 @@ def fit(self, X, y):
 
         self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
             self.kernel, self.optimizer, self.n_restarts_optimizer,
-            self.max_iter, self.warm_start, self.copy_X_train,
+            self.max_iter_predict, self.warm_start, self.copy_X_train,
             self.random_state)
 
         self.classes_ = np.unique(y)
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index e92e07e7aafa5..c129f10c885e8 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -163,10 +163,6 @@ def test_non_transformer_estimators_n_iter():
                              'LogisticRegressionCV']):
                     continue
 
-                # These models perform iterations for predict
-                if name in (['GaussianProcessClassifier']):
-                    continue
-
                 # Tested in test_transformer_n_iter below
                 elif (name in CROSS_DECOMPOSITION or
                       name in ['LinearSVC', 'LogisticRegression']):

From a4d81c500aa870849c86158960853f3f1699672e Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 19 Aug 2015 16:37:58 +0200
Subject: [PATCH 148/165] DOC Documenting GP kernel API

---
 doc/modules/classes.rst                      |  3 +-
 doc/modules/gaussian_process.rst             | 82 +++++++++++++++++++-
 sklearn/gaussian_process/gaussian_process.py |  4 +-
 sklearn/gaussian_process/kernels.py          | 52 ++++++++++---
 4 files changed, 123 insertions(+), 18 deletions(-)

diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index c1c909356886a..59a51fd01fa03 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -500,6 +500,7 @@ Kernels:
   :toctree: generated/
   :template: class.rst
 
+  gaussian_process.kernels.Kernel
   gaussian_process.kernels.Sum
   gaussian_process.kernels.Product
   gaussian_process.kernels.Exponentiation
@@ -512,7 +513,7 @@ Kernels:
   gaussian_process.kernels.DotProduct
   gaussian_process.kernels.PairwiseKernel
   gaussian_process.kernels.CompoundKernel
-
+  gaussian_process.kernels.Hyperparameter
 
 .. _grid_search_ref:
 
diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst
index 92fe45705a0a1..2aa78b1668118 100644
--- a/doc/modules/gaussian_process.rst
+++ b/doc/modules/gaussian_process.rst
@@ -358,12 +358,86 @@ be subdivided into isotropic and anisotropic kernels, where isotropic kernels ar
 also invariant to rotations in the input space. For more details, we refer to
 Chapter 4 of [RW2006]_.
 
+Gaussian Process Kernel API
+---------------------------
+The main usage of a :class:`Kernel` is to compute the GP's covariance between
+datapoints. For this, the method ``__call__`` of the kernel can be called. This
+method can either be used to compute the "auto-covariance" of all pairs of
+datapoints in a 2d array X, or the "cross-covariance" of all combinations
+of datapoints of a 2d array X with datapoints in a 2d array Y. The following
+identity holds true for all kernels k (except for the :class:`WhiteKernel`):
+``k(X) == k(X, Y=X)``
+
+If only the diagonal of the auto-covariance is being used, the method ``diag()``
+of a kernel can be called, which is more computationally efficient than the
+equivalent call to ``__call__``: ``np.diag(k(X, X)) == k.diag(X)``
+
+Kernels are parameterized by a vector :math:`\theta` of hyperparameters. These
+hyperparameters can for instance control length-scales or periodicity of a
+kernel (see below). All kernels support computing analytic gradients
+of the kernel's auto-covariance with respect to :math:`\theta` via setting
+``eval_gradient=True`` in the ``__call__`` method. This gradient is used by the
+Gaussian process (both regressor and classifier) in computing the gradient
+of the log-marginal-likelihood, which in turn is used to determine the
+value of :math:`\theta`, which maximizes the log-marginal-likelihood,  via
+gradient ascent. For each hyperparameter, the initial value and the
+bounds need to be specified when creating an instance of the kernel. The
+current value of :math:`\theta` can be read and set via the property
+``theta`` of the kernel object. Moreover, the bounds of the hyperparameters can be
+accessed by the property ``bounds`` of the kernel. Note that both properties
+(theta and bounds) return log-transformed values of the internally used values
+since those are typically more amenable to gradient-based optimization.
+The specification of each hyperparameter is stored in the form of an instance of
+:class:`Hyperparameter` in the respective kernel. Note that a kernel using a
+hyperparameter with name "x" must have the attributes self.x and self.x_bounds.
+
+The abstract base class for all kernels is :class:`Kernel`. Kernel implements a
+similar interface as :class:`Estimator`, providing the methods ``get_params()``,
+``set_params()``, and ``clone()``. This allows setting kernel values also via
+meta-estimators such as :class:`Pipeline` or :class:`GridSearchCV`. Note that due to the nested
+structure of kernels (by applying kernel operators, see below), the names of
+kernel parameters might become relatively complicated. In general, for a
+binary kernel operator, parameters of the left operand are prefixed with ``k1__``
+and parameters of the right operand with ``k2__``. An additional convenience
+method is ``clone_with_theta(theta)``, which returns a cloned version of the
+kernel but with the hyperparameters set to ``theta``. An illustrative example:
+
+    >>> from sklearn.gaussian_process.kernels import ConstantKernel, RBF
+    >>> kernel = ConstantKernel(c=1.0) * RBF(l=0.5) + RBF(l=2.0)
+    >>> for hyperparameter in kernel.hyperparameters: print hyperparameter
+    Hyperparameter(name='k1__k1__c', value_type='numeric', bounds=array([[  1.00000000e-05,   1.00000000e+05]]), n_elements=1, fixed=False)
+    Hyperparameter(name='k1__k2__l', value_type='numeric', bounds=array([[  1.00000000e-05,   1.00000000e+05]]), n_elements=1, fixed=False)
+    Hyperparameter(name='k2__l', value_type='numeric', bounds=array([[  1.00000000e-05,   1.00000000e+05]]), n_elements=1, fixed=False
+    >>> print kernel.get_params()
+    {'k1__k1': 1**2, 'k1__k1__c': 1.0, 'k1__k2': RBF(l=0.5),
+     'k1__k2__l_bounds': (1e-05, 100000.0), 'k1__k2__l': 0.5,
+     'k1__k1__c_bounds': (1e-05, 100000.0), 'k2__l': 2.0, 'k2': RBF(l=2),
+     'k1': 1**2 * RBF(l=0.5), 'k2__l_bounds': (1e-05, 100000.0)}
+    >>> print kernel.theta  # Note: log-transformed
+     [ 0.         -0.69314718  0.69314718]
+    >>> print kernel.bounds  # Note: log-transformed
+     [[-11.51292546  11.51292546]
+      [-11.51292546  11.51292546]
+      [-11.51292546  11.51292546]]
+
+
+All Gaussian process kernels are interoperable with :mod:`sklearn.metrics.pairwise`
+and vice versa: instances of subclasses of :class:`Kernel` can be passed as
+``metric`` to ``pairwise_kernels`` from :mod:`sklearn.metrics.pairwise`. Moreover,
+kernel functions from pairwise can be used as GP kernels by using the wrapper
+class :class:`PairwiseKernel`. The only caveat is that the gradient of
+the hyperparameters is not analytic but numeric and all those kernels support
+only isotropic distances. The parameter ``gamma`` is considered to be a
+hyperparameter and may be optimized. The other kernel parameters are set
+directly at initialization and are kept fixed.
+
+
 Basic kernels
 -------------
-The :class:`ConstantKernel` kernel can be used as part of a product-kernel
-where it scales the magnitude of the other factor (kernel) or as part of a
-sum-kernel, where it modifies the mean of the Gaussian process. It depends
-on a parameter :math:`c`. It is defined as:
+The :class:`ConstantKernel` kernel can be used as part of a :class:`Product`
+kernel where it scales the magnitude of the other factor (kernel) or as part
+of a :class:`Sum` kernel, where it modifies the mean of the Gaussian process.
+It depends on a parameter :math:`c`. It is defined as:
 
 .. math::
    k(x_i, x_j) = c \;\forall\; x_1, x_2
diff --git a/sklearn/gaussian_process/gaussian_process.py b/sklearn/gaussian_process/gaussian_process.py
index b66e672f2b6c0..ab8225ea5bd2a 100644
--- a/sklearn/gaussian_process/gaussian_process.py
+++ b/sklearn/gaussian_process/gaussian_process.py
@@ -58,10 +58,10 @@ def l1_cross_distances(X):
     return D, ij
 
 
-@deprecated("GaussianProcess is deprecated and will be removed in 0.18."
+@deprecated("GaussianProcess is deprecated and will be removed in 0.18. "
             "Use the GaussianProcessRegressor instead.")
 class GaussianProcess(BaseEstimator, RegressorMixin):
-    """The Gaussian Process model class.
+    """The legacy Gaussian Process model class.
 
     Note that this class is deprecated and will be removed in 0.18.
     Use the GaussianProcessRegressor instead.
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 13608c66e4039..af74e2d7cd4e7 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -37,6 +37,36 @@
 class Hyperparameter(namedtuple(
      'Hyperparameter',
      ('name', 'value_type', 'bounds', 'n_elements', 'fixed'))):
+    """A kernel hyperparameter's specification in form of a namedtuple.
+
+    Entries
+    -------
+    name : string
+        The name of the hyperparameter. Note that a kernel using a
+        hyperparameter with name "x" must have the attributes self.x and
+        self.x_bounds
+
+    value_type : string
+        The type of the hyperparameter. Currently, only "numeric"
+        hyperparameters are supported.
+
+    bounds : pair of floats >= 0 or "fixed"
+        The lower and upper bound on the parameter. If n_elements>1, a pair
+        of 1d array with n_elements each may be given alternatively. If
+        the string "fixed" is passed as bounds, the hyperparameter's value
+        cannot be changed.
+
+    n_elements : int, default=1
+        The number of elements of the hyperparameter value. Defaults to 1,
+        which corresponds to a scalar hyperparameter. n_elements > 1
+        corresponds to a hyperparameter which is vector-valued,
+        such as, e.g., anisotropic length-scales.
+
+    fixed : bool, default: None
+        Whether the value of this hyperparameter is fixed, i.e., cannot be
+        changed during hyperparameter tuning. If None is passed, the "fixed" is
+        derived based on the given bounds.
+    """
     # A raw namedtuple is very memory efficient as it packs the attributes
     # in a struct to get rid of the __dict__ of attributes in particular it
     # does not copy the string for the keys on each instance.
@@ -150,7 +180,7 @@ def n_dims(self):
 
     @property
     def hyperparameters(self):
-        """Returns a list of all hyperparameter."""
+        """Returns a list of all hyperparameter specifications."""
         r = []
         for attr, value in self.__dict__.items():
             if attr.startswith("hyperparameter_"):
@@ -209,12 +239,12 @@ def theta(self, theta):
 
     @property
     def bounds(self):
-        """Returns the bounds on the kernel's hyperparameters theta.
+        """Returns the log-transformed bounds on the theta.
 
         Returns
         -------
         bounds : array, shape (n_dims, 2)
-            The bounds on the kernel's hyperparameters theta
+            The log-transformed bounds on the kernel's hyperparameters theta
         """
         bounds = []
         for hyperparameter in self.hyperparameters:
@@ -343,12 +373,12 @@ def theta(self, theta):
 
     @property
     def bounds(self):
-        """Returns the bounds on the kernel's hyperparameters theta.
+        """Returns the log-transformed bounds on the theta.
 
         Returns
         -------
         bounds : array, shape (n_dims, 2)
-            The bounds on the kernel's hyperparameters theta
+            The log-transformed bounds on the kernel's hyperparameters theta
         """
         return np.vstack([kernel.bounds for kernel in self.kernels])
 
@@ -481,12 +511,12 @@ def theta(self, theta):
 
     @property
     def bounds(self):
-        """Returns the bounds on the kernel's hyperparameters theta.
+        """Returns the log-transformed bounds on the theta.
 
         Returns
         -------
         bounds : array, shape (n_dims, 2)
-            The bounds on the kernel's hyperparameters theta
+            The log-transformed bounds on the kernel's hyperparameters theta
         """
         if self.k1.bounds.size == 0:
             return self.k2.bounds
@@ -727,12 +757,12 @@ def theta(self, theta):
 
     @property
     def bounds(self):
-        """Returns the bounds on the kernel's hyperparameters theta.
+        """Returns the log-transformed bounds on the theta.
 
         Returns
         -------
         bounds : array, shape (n_dims, 2)
-            The bounds on the kernel's hyperparameters theta
+            The log-transformed bounds on the kernel's hyperparameters theta
         """
         return self.kernel.bounds
 
@@ -1571,8 +1601,8 @@ class PairwiseKernel(Kernel):
 
     Note: Evaluation of eval_gradient is not analytic but numeric and all
           kernels support only isotropic distances. The parameter gamma is
-          specified via the param_space and may be optimized. The other
-          kernel parameters are set directly  at initialization and are kept
+          considered to be a hyperparameter and may be optimized. The other
+          kernel parameters are set directly at initialization and are kept
           fixed.
 
     Parameters

From 7ff0f71c1a23c9de18e8503345b6cc3ebef552a8 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 19 Aug 2015 16:54:46 +0200
Subject: [PATCH 149/165] DOC Fixing some minor issues in narrative doc of GPs

---
 doc/modules/gaussian_process.rst | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst
index 2aa78b1668118..60aa05a62fef3 100644
--- a/doc/modules/gaussian_process.rst
+++ b/doc/modules/gaussian_process.rst
@@ -43,13 +43,13 @@ Gaussian Process Regression (GPR)
 
 The :class:`GaussianProcessRegressor` implements Gaussian processes (GP) for
 regression purposes. For this, the prior of the GP needs to be specified. The
-prior mean is assumed to be zero (other constants can be achieved by
-subtracting the specific value externally from all target values). The prior's
+prior mean is assumed to be constant and zero (for ``normalize_y=False``) or the
+training data's mean (for ``normalize_y=True``). The prior's
 covariance is specified by a passing a :ref:`kernel <gp_kernels>` object. The
 hyperparameters of the kernel are optimized during fitting of
 GaussianProcessRegressor by maximizing the log-marginal-likelihood (LML) based
-on the passed `optimizer`. As the LML may have multiple local optima, the
-optimizer can be started repeatedly by specifying `n_restarts_optimizer`. The
+on the passed ``optimizer``. As the LML may have multiple local optima, the
+optimizer can be started repeatedly by specifying ``n_restarts_optimizer``. The
 first run is always conducted starting from the initial hyperparameter values
 of the kernel; subsequent runs are conducted from hyperparameter values
 that have been chosen randomly from the range of allowed values.
@@ -57,7 +57,7 @@ If the initial hyperparameters should be kept fixed, `None` can be passed as
 optimizer.
 
 The noise level in the targets can be specified by passing it via the
-parameter `alpha`, either globally as a scalar or per datapoint.
+parameter ``alpha``, either globally as a scalar or per datapoint.
 Note that a moderate noise level can also be helpful for dealing with numeric
 issues during fitting as it is effectively implemented as Tikhonov
 regularization, i.e., by adding it to the diagonal of the kernel matrix. An
@@ -68,9 +68,9 @@ level from the data (see example below).
 The implementation is based on Algorithm 2.1 of [RW2006]_. In addition to
 the API of standard sklearn estimators, GaussianProcessRegressor:
      * allows prediction without prior fitting (based on the GP prior)
-     * provides an additional method `sample_y(X)`, which evaluates samples
+     * provides an additional method ``sample_y(X)``, which evaluates samples
        drawn from the GPR (prior or posterior) at given inputs
-     * exposes a method `log_marginal_likelihood(theta)`, which can be used
+     * exposes a method ``log_marginal_likelihood(theta)``, which can be used
        externally for other ways of selecting hyperparameters, e.g., via
        Markov chain Monte Carlo.
 
@@ -247,8 +247,8 @@ The GP prior mean is assumed to be zero. The prior's
 covariance is specified by a passing a :ref:`kernel <gp_kernels>` object. The
 hyperparameters of the kernel are optimized during fitting of
 GaussianProcessRegressor by maximizing the log-marginal-likelihood (LML) based
-on the passed `optimizer`. As the LML may have multiple local optima, the
-optimizer can be started repeatedly by specifying `n_restarts_optimizer`. The
+on the passed ``optimizer``. As the LML may have multiple local optima, the
+optimizer can be started repeatedly by specifying ``n_restarts_optimizer``. The
 first run is always conducted starting from the initial hyperparameter values
 of the kernel; subsequent runs are conducted from hyperparameter values
 that have been chosen randomly from the range of allowed values.
@@ -291,7 +291,7 @@ LML, they perform slightly worse according to the log-loss on test data. The
 figure shows that this is because they exhibit a steep change of the class
 probabilities at the class boundaries (which is good) but have predicted
 probabilities close to 0.5 far away from the class boundaries (which is bad)
-This undiesirable effect is caused by the Laplace approximation used
+This undesirable effect is caused by the Laplace approximation used
 internally by GPC.
 
 The second figure shows the log-marginal-likelihood for different choices of
@@ -313,10 +313,10 @@ Illustration of GPC on the XOR dataset
 .. currentmodule:: sklearn.gaussian_process.kernels
 
 This example illustrates GPC on XOR data. Compared are a stationary, isotropic
-kernel (`RBF`) and a non-stationary kernel (`DotProduct`). On this particular
+kernel (:class:`RBF`) and a non-stationary kernel (:class:`DotProduct`). On this particular
 dataset, the `DotProduct` kernel obtains considerably better results because the
 class-boundaries are linear and coincide with the coordinate axes. In practice,
-however, stationary kernels such as `RBF` often obtain better results
+however, stationary kernels such as :class:`RBF` often obtain better results.
 
 .. figure:: ../auto_examples/gaussian_process/images/plot_gpc_xor_001.png
    :target: ../auto_examples/gaussian_process/plot_gpc_xor.html

From 63ca51cf0625e643814cbe661a2bfe1870553a03 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 19 Aug 2015 21:11:57 +0200
Subject: [PATCH 150/165] FIX Fix failing tests (doctest, order of
 hyperparameters,  pairwise_kernels)

---
 doc/modules/gaussian_process.rst               | 15 ++++++---------
 sklearn/gaussian_process/kernels.py            |  4 ++--
 sklearn/gaussian_process/tests/test_kernels.py |  4 ++--
 sklearn/metrics/pairwise.py                    |  4 ++--
 4 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst
index 60aa05a62fef3..f2e45b0c66e0b 100644
--- a/doc/modules/gaussian_process.rst
+++ b/doc/modules/gaussian_process.rst
@@ -407,18 +407,15 @@ kernel but with the hyperparameters set to ``theta``. An illustrative example:
     >>> for hyperparameter in kernel.hyperparameters: print hyperparameter
     Hyperparameter(name='k1__k1__c', value_type='numeric', bounds=array([[  1.00000000e-05,   1.00000000e+05]]), n_elements=1, fixed=False)
     Hyperparameter(name='k1__k2__l', value_type='numeric', bounds=array([[  1.00000000e-05,   1.00000000e+05]]), n_elements=1, fixed=False)
-    Hyperparameter(name='k2__l', value_type='numeric', bounds=array([[  1.00000000e-05,   1.00000000e+05]]), n_elements=1, fixed=False
+    Hyperparameter(name='k2__l', value_type='numeric', bounds=array([[  1.00000000e-05,   1.00000000e+05]]), n_elements=1, fixed=False)
     >>> print kernel.get_params()
-    {'k1__k1': 1**2, 'k1__k1__c': 1.0, 'k1__k2': RBF(l=0.5),
-     'k1__k2__l_bounds': (1e-05, 100000.0), 'k1__k2__l': 0.5,
-     'k1__k1__c_bounds': (1e-05, 100000.0), 'k2__l': 2.0, 'k2': RBF(l=2),
-     'k1': 1**2 * RBF(l=0.5), 'k2__l_bounds': (1e-05, 100000.0)}
+    {'k1__k1': 1**2, 'k1__k1__c': 1.0, 'k1__k2': RBF(l=0.5), 'k1__k2__l_bounds': (1e-05, 100000.0), 'k1__k2__l': 0.5, 'k1__k1__c_bounds': (1e-05, 100000.0), 'k2__l': 2.0, 'k2': RBF(l=2), 'k1': 1**2 * RBF(l=0.5), 'k2__l_bounds': (1e-05, 100000.0)}
     >>> print kernel.theta  # Note: log-transformed
-     [ 0.         -0.69314718  0.69314718]
+    [ 0.         -0.69314718  0.69314718]
     >>> print kernel.bounds  # Note: log-transformed
-     [[-11.51292546  11.51292546]
-      [-11.51292546  11.51292546]
-      [-11.51292546  11.51292546]]
+    [[-11.51292546  11.51292546]
+     [-11.51292546  11.51292546]
+     [-11.51292546  11.51292546]]
 
 
 All Gaussian process kernels are interoperable with :mod:`sklearn.metrics.pairwise`
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index af74e2d7cd4e7..0517a9de7de68 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -182,7 +182,7 @@ def n_dims(self):
     def hyperparameters(self):
         """Returns a list of all hyperparameter specifications."""
         r = []
-        for attr, value in self.__dict__.items():
+        for attr, value in sorted(self.__dict__.items()):
             if attr.startswith("hyperparameter_"):
                 r.append(value)
         return r
@@ -1368,7 +1368,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             else:  # alpha is kept fixed
                 alpha_gradient = np.empty((K.shape[0], K.shape[1], 0))
 
-            return K, np.dstack((l_gradient, alpha_gradient))
+            return K, np.dstack((alpha_gradient, l_gradient))
         else:
             return K
 
diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index ea637a544596d..318ac3c343c4a 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -86,8 +86,8 @@ def test_kernel_theta():
         theta_vars = map(lambda s: s.rstrip("_bounds"),
                          filter(lambda s: s.endswith("_bounds"), args))
         assert_equal(
-            [hyperparameter.name for hyperparameter in kernel.hyperparameters],
-            list(theta_vars))
+            set(hyperparameter.name for hyperparameter in kernel.hyperparameters),
+            set(theta_vars))
 
         # Check that values returned in theta are consistent with
         # hyperparameter values (being their logarithms)
diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index de696470ebcec..e3290dfe343c5 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -1275,13 +1275,13 @@ def pairwise_kernels(X, Y=None, metric="linear", filter_params=False,
 
     if metric == "precomputed":
         return X
+    elif isinstance(metric, GPKernel):
+        func = metric.__call__
     elif metric in PAIRWISE_KERNEL_FUNCTIONS:
         if filter_params:
             kwds = dict((k, kwds[k]) for k in kwds
                         if k in KERNEL_PARAMS[metric])
         func = PAIRWISE_KERNEL_FUNCTIONS[metric]
-    elif isinstance(metric, GPKernel):
-        func = metric.__call__
     elif callable(metric):
         func = partial(_pairwise_callable, metric=metric, **kwds)
     else:

From 523b439f716170723366db0549207b06c82afc33 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 19 Aug 2015 21:35:52 +0200
Subject: [PATCH 151/165] FIX python3 compatibility in doctest

---
 doc/modules/gaussian_process.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst
index f2e45b0c66e0b..37c569a8ead00 100644
--- a/doc/modules/gaussian_process.rst
+++ b/doc/modules/gaussian_process.rst
@@ -404,15 +404,15 @@ kernel but with the hyperparameters set to ``theta``. An illustrative example:
 
     >>> from sklearn.gaussian_process.kernels import ConstantKernel, RBF
     >>> kernel = ConstantKernel(c=1.0) * RBF(l=0.5) + RBF(l=2.0)
-    >>> for hyperparameter in kernel.hyperparameters: print hyperparameter
+    >>> for hyperparameter in kernel.hyperparameters: print(hyperparameter)
     Hyperparameter(name='k1__k1__c', value_type='numeric', bounds=array([[  1.00000000e-05,   1.00000000e+05]]), n_elements=1, fixed=False)
     Hyperparameter(name='k1__k2__l', value_type='numeric', bounds=array([[  1.00000000e-05,   1.00000000e+05]]), n_elements=1, fixed=False)
     Hyperparameter(name='k2__l', value_type='numeric', bounds=array([[  1.00000000e-05,   1.00000000e+05]]), n_elements=1, fixed=False)
-    >>> print kernel.get_params()
+    >>> print(kernel.get_params())
     {'k1__k1': 1**2, 'k1__k1__c': 1.0, 'k1__k2': RBF(l=0.5), 'k1__k2__l_bounds': (1e-05, 100000.0), 'k1__k2__l': 0.5, 'k1__k1__c_bounds': (1e-05, 100000.0), 'k2__l': 2.0, 'k2': RBF(l=2), 'k1': 1**2 * RBF(l=0.5), 'k2__l_bounds': (1e-05, 100000.0)}
-    >>> print kernel.theta  # Note: log-transformed
+    >>> print(kernel.theta)  # Note: log-transformed
     [ 0.         -0.69314718  0.69314718]
-    >>> print kernel.bounds  # Note: log-transformed
+    >>> print(kernel.bounds)  # Note: log-transformed
     [[-11.51292546  11.51292546]
      [-11.51292546  11.51292546]
      [-11.51292546  11.51292546]]

From d2a3851d7fe7651236c0b27885e1757cbf983a33 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 19 Aug 2015 21:36:15 +0200
Subject: [PATCH 152/165] FIX Fixing test_kernel_theta

---
 sklearn/gaussian_process/tests/test_kernels.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index 318ac3c343c4a..066cc5deaaa18 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -91,14 +91,14 @@ def test_kernel_theta():
 
         # Check that values returned in theta are consistent with
         # hyperparameter values (being their logarithms)
-        for i, theta_var in enumerate(theta_vars):
-            assert_equal(theta[i], np.log(getattr(kernel, theta_var)))
+        for i, hyperparameter in enumerate(kernel.hyperparameters):
+            assert_equal(theta[i], np.log(getattr(kernel, hyperparameter.name)))
 
         # Fixed kernel parameters must be excluded from theta and gradient.
-        for i, theta_var in enumerate(theta_vars):
+        for i, hyperparameter in enumerate(kernel.hyperparameters):
             # create copy with certain hyperparameter fixed
             params = kernel.get_params()
-            params[theta_var + "_bounds"] = "fixed"
+            params[hyperparameter.name + "_bounds"] = "fixed"
             kernel_class = kernel.__class__
             new_kernel = kernel_class(**params)
             # Check that theta and K_gradient are identical with the fixed
@@ -116,12 +116,12 @@ def test_kernel_theta():
                                    K_gradient_new[..., i:])
 
         # Check that values of theta are modified correctly
-        for i, theta_var in enumerate(theta_vars):
+        for i, hyperparameter in enumerate(kernel.hyperparameters):
             theta[i] = np.log(42)
             kernel.theta = theta
-            assert_almost_equal(getattr(kernel, theta_var), 42)
+            assert_almost_equal(getattr(kernel, hyperparameter.name), 42)
 
-            setattr(kernel, theta_var, 43)
+            setattr(kernel, hyperparameter.name, 43)
             assert_almost_equal(kernel.theta[i], np.log(43))
 
 

From a0326408954722c54f46e51a3188ae5c39139474 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 19 Aug 2015 21:50:04 +0200
Subject: [PATCH 153/165] FIX Further python3 related fixes

---
 doc/modules/gaussian_process.rst               | 14 ++++++++++++--
 sklearn/gaussian_process/tests/test_kernels.py |  2 +-
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst
index 37c569a8ead00..ecba51efd895b 100644
--- a/doc/modules/gaussian_process.rst
+++ b/doc/modules/gaussian_process.rst
@@ -408,8 +408,18 @@ kernel but with the hyperparameters set to ``theta``. An illustrative example:
     Hyperparameter(name='k1__k1__c', value_type='numeric', bounds=array([[  1.00000000e-05,   1.00000000e+05]]), n_elements=1, fixed=False)
     Hyperparameter(name='k1__k2__l', value_type='numeric', bounds=array([[  1.00000000e-05,   1.00000000e+05]]), n_elements=1, fixed=False)
     Hyperparameter(name='k2__l', value_type='numeric', bounds=array([[  1.00000000e-05,   1.00000000e+05]]), n_elements=1, fixed=False)
-    >>> print(kernel.get_params())
-    {'k1__k1': 1**2, 'k1__k1__c': 1.0, 'k1__k2': RBF(l=0.5), 'k1__k2__l_bounds': (1e-05, 100000.0), 'k1__k2__l': 0.5, 'k1__k1__c_bounds': (1e-05, 100000.0), 'k2__l': 2.0, 'k2': RBF(l=2), 'k1': 1**2 * RBF(l=0.5), 'k2__l_bounds': (1e-05, 100000.0)}
+        >>> params = kernel.get_params()
+    >>> for key in sorted(params): print("%s : %s" % (key, params[key]))
+    k1 : 1**2 * RBF(l=0.5)
+    k1__k1 : 1**2
+    k1__k1__c : 1.0
+    k1__k1__c_bounds : (1e-05, 100000.0)
+    k1__k2 : RBF(l=0.5)
+    k1__k2__l : 0.5
+    k1__k2__l_bounds : (1e-05, 100000.0)
+    k2 : RBF(l=2)
+    k2__l : 2.0
+    k2__l_bounds : (1e-05, 100000.0)
     >>> print(kernel.theta)  # Note: log-transformed
     [ 0.         -0.69314718  0.69314718]
     >>> print(kernel.bounds)  # Note: log-transformed
diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index 066cc5deaaa18..b03e9bb5a2371 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -110,7 +110,7 @@ def test_kernel_theta():
                 assert_equal(theta[:i], new_kernel.theta[:i])
                 assert_array_equal(K_gradient[..., :i],
                                    K_gradient_new[..., :i])
-            if i + 1 < len(theta_vars):
+            if i + 1 < len(kernel.hyperparameters):
                 assert_equal(theta[i+1:], new_kernel.theta[i:])
                 assert_array_equal(K_gradient[..., i+1:],
                                    K_gradient_new[..., i:])

From bbce7d2c852aa353acfdcad7409b173495f35b85 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Wed, 19 Aug 2015 22:09:12 +0200
Subject: [PATCH 154/165] FIX Fixing a python2.6-related issue

---
 doc/modules/gaussian_process.rst | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst
index ecba51efd895b..7eeb812ce1023 100644
--- a/doc/modules/gaussian_process.rst
+++ b/doc/modules/gaussian_process.rst
@@ -403,29 +403,29 @@ method is ``clone_with_theta(theta)``, which returns a cloned version of the
 kernel but with the hyperparameters set to ``theta``. An illustrative example:
 
     >>> from sklearn.gaussian_process.kernels import ConstantKernel, RBF
-    >>> kernel = ConstantKernel(c=1.0) * RBF(l=0.5) + RBF(l=2.0)
+    >>> kernel = ConstantKernel(c=1.0, c_bounds=(0.1, 10.0)) * RBF(l=0.5, l_bounds=(0.1, 10.0)) + RBF(l=2.0, l_bounds=(0.1, 10.0))
     >>> for hyperparameter in kernel.hyperparameters: print(hyperparameter)
-    Hyperparameter(name='k1__k1__c', value_type='numeric', bounds=array([[  1.00000000e-05,   1.00000000e+05]]), n_elements=1, fixed=False)
-    Hyperparameter(name='k1__k2__l', value_type='numeric', bounds=array([[  1.00000000e-05,   1.00000000e+05]]), n_elements=1, fixed=False)
-    Hyperparameter(name='k2__l', value_type='numeric', bounds=array([[  1.00000000e-05,   1.00000000e+05]]), n_elements=1, fixed=False)
-        >>> params = kernel.get_params()
+    Hyperparameter(name='k1__k1__c', value_type='numeric', bounds=array([[  0.1,  10. ]]), n_elements=1, fixed=False)
+    Hyperparameter(name='k1__k2__l', value_type='numeric', bounds=array([[  0.1,  10. ]]), n_elements=1, fixed=False)
+    Hyperparameter(name='k2__l', value_type='numeric', bounds=array([[  0.1,  10. ]]), n_elements=1, fixed=False)
+    >>> params = kernel.get_params()
     >>> for key in sorted(params): print("%s : %s" % (key, params[key]))
     k1 : 1**2 * RBF(l=0.5)
     k1__k1 : 1**2
     k1__k1__c : 1.0
-    k1__k1__c_bounds : (1e-05, 100000.0)
+    k1__k1__c_bounds : (0.1, 10.0)
     k1__k2 : RBF(l=0.5)
     k1__k2__l : 0.5
-    k1__k2__l_bounds : (1e-05, 100000.0)
+    k1__k2__l_bounds : (0.1, 10.0)
     k2 : RBF(l=2)
     k2__l : 2.0
-    k2__l_bounds : (1e-05, 100000.0)
+    k2__l_bounds : (0.1, 10.0)
     >>> print(kernel.theta)  # Note: log-transformed
     [ 0.         -0.69314718  0.69314718]
     >>> print(kernel.bounds)  # Note: log-transformed
-    [[-11.51292546  11.51292546]
-     [-11.51292546  11.51292546]
-     [-11.51292546  11.51292546]]
+    [[-2.30258509  2.30258509]
+     [-2.30258509  2.30258509]
+     [-2.30258509  2.30258509]]
 
 
 All Gaussian process kernels are interoperable with :mod:`sklearn.metrics.pairwise`

From e075585754e8975141213a941538e0ffee6fe59d Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 20 Aug 2015 09:17:47 +0200
Subject: [PATCH 155/165] FIX Changing bounds in GP doctest

---
 doc/modules/gaussian_process.rst | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst
index 7eeb812ce1023..05bcdb5294709 100644
--- a/doc/modules/gaussian_process.rst
+++ b/doc/modules/gaussian_process.rst
@@ -403,29 +403,29 @@ method is ``clone_with_theta(theta)``, which returns a cloned version of the
 kernel but with the hyperparameters set to ``theta``. An illustrative example:
 
     >>> from sklearn.gaussian_process.kernels import ConstantKernel, RBF
-    >>> kernel = ConstantKernel(c=1.0, c_bounds=(0.1, 10.0)) * RBF(l=0.5, l_bounds=(0.1, 10.0)) + RBF(l=2.0, l_bounds=(0.1, 10.0))
+    >>> kernel = ConstantKernel(c=1.0, c_bounds=(0.0, 10.0)) * RBF(l=0.5, l_bounds=(0.0, 10.0)) + RBF(l=2.0, l_bounds=(0.0, 10.0))
     >>> for hyperparameter in kernel.hyperparameters: print(hyperparameter)
-    Hyperparameter(name='k1__k1__c', value_type='numeric', bounds=array([[  0.1,  10. ]]), n_elements=1, fixed=False)
-    Hyperparameter(name='k1__k2__l', value_type='numeric', bounds=array([[  0.1,  10. ]]), n_elements=1, fixed=False)
-    Hyperparameter(name='k2__l', value_type='numeric', bounds=array([[  0.1,  10. ]]), n_elements=1, fixed=False)
+    Hyperparameter(name='k1__k1__c', value_type='numeric', bounds=array([[  0.,  10.]]), n_elements=1, fixed=False)
+    Hyperparameter(name='k1__k2__l', value_type='numeric', bounds=array([[  0.,  10.]]), n_elements=1, fixed=False)
+    Hyperparameter(name='k2__l', value_type='numeric', bounds=array([[  0.,  10.]]), n_elements=1, fixed=False)
     >>> params = kernel.get_params()
     >>> for key in sorted(params): print("%s : %s" % (key, params[key]))
     k1 : 1**2 * RBF(l=0.5)
     k1__k1 : 1**2
     k1__k1__c : 1.0
-    k1__k1__c_bounds : (0.1, 10.0)
+    k1__k1__c_bounds : (0.0, 10.0)
     k1__k2 : RBF(l=0.5)
     k1__k2__l : 0.5
-    k1__k2__l_bounds : (0.1, 10.0)
+    k1__k2__l_bounds : (0.0, 10.0)
     k2 : RBF(l=2)
     k2__l : 2.0
-    k2__l_bounds : (0.1, 10.0)
+    k2__l_bounds : (0.0, 10.0)
     >>> print(kernel.theta)  # Note: log-transformed
     [ 0.         -0.69314718  0.69314718]
     >>> print(kernel.bounds)  # Note: log-transformed
-    [[-2.30258509  2.30258509]
-     [-2.30258509  2.30258509]
-     [-2.30258509  2.30258509]]
+    [[       -inf  2.30258509]
+     [       -inf  2.30258509]
+     [       -inf  2.30258509]]
 
 
 All Gaussian process kernels are interoperable with :mod:`sklearn.metrics.pairwise`

From 52060d056ad2073a0a53401b0f31c2ef51865e8d Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 20 Aug 2015 09:20:42 +0200
Subject: [PATCH 156/165] MISC copy_X_train defaults to True in GPR and GPC

---
 sklearn/gaussian_process/gpc.py | 4 ++--
 sklearn/gaussian_process/gpr.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index da1d0c613ee70..6f83e475ab918 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -98,7 +98,7 @@ def optimizer(obj_func, initial_theta, bounds):
         up convergence when _posterior_mode is called several times on similar
         problems as in hyperparameter optimization.
 
-    copy_X_train : bool, optional (default: False)
+    copy_X_train : bool, optional (default: True)
         If True, a persistent copy of the training data is stored in the
         object. Otherwise, just a reference to the training data is stored,
         which might cause predictions to change if the data is modified
@@ -138,7 +138,7 @@ def optimizer(obj_func, initial_theta, bounds):
     """
     def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b",
                  n_restarts_optimizer=0, max_iter_predict=100, warm_start=False,
-                 copy_X_train=False, random_state=None):
+                 copy_X_train=True, random_state=None):
         self.kernel = kernel
         self.optimizer = optimizer
         self.n_restarts_optimizer = n_restarts_optimizer
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 56d93c0b2c65e..00e0a0389762f 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -91,7 +91,7 @@ def optimizer(obj_func, initial_theta, bounds):
         prior based on the data, which contradicts the likelihood principle;
         normalization is thus disabled per default.
 
-    copy_X_train : bool, optional (default: False)
+    copy_X_train : bool, optional (default: True)
         If True, a persistent copy of the training data is stored in the
         object. Otherwise, just a reference to the training data is stored,
         which might cause predictions to change if the data is modified
@@ -122,7 +122,7 @@ def optimizer(obj_func, initial_theta, bounds):
     """
     def __init__(self, kernel=None, alpha=1e-10,
                  optimizer="fmin_l_bfgs_b", n_restarts_optimizer=0,
-                 normalize_y=False, copy_X_train=False, random_state=None):
+                 normalize_y=False, copy_X_train=True, random_state=None):
         self.kernel = kernel
         self.alpha = alpha
         self.optimizer = optimizer

From 21768398edef8e5c81ad3d046fd8d614e615dd15 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Thu, 20 Aug 2015 13:36:26 +0200
Subject: [PATCH 157/165] FIX Fix message of ValueError for wrong shape of
 alpha in GPR

---
 sklearn/gaussian_process/gpr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 00e0a0389762f..fe4d06b18ce8e 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -169,7 +169,7 @@ def fit(self, X, y):
             if self.alpha.shape[0] == 1:
                 self.alpha = self.alpha[0]
             else:
-                raise ValueError("sigma_n_squared must be a scalar or an array"
+                raise ValueError("alpha must be a scalar or an array"
                                  " with same number of entries as y.(%d != %d)"
                                  % (self.alpha.shape[0], y.shape[0]))
 

From 5164af33a274dbe26f665f441fdf3e649b443bf7 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Sun, 30 Aug 2015 20:32:32 +0200
Subject: [PATCH 158/165] PEP8 Making pep8 and pyflakes happy

---
 examples/gaussian_process/plot_gpc_iris.py         |  6 ++----
 .../gaussian_process/plot_gpc_isoprobability.py    |  1 +
 examples/gaussian_process/plot_gpr_co2.py          | 14 +++++++-------
 examples/gaussian_process/plot_gpr_noisy.py        |  4 ++--
 .../gaussian_process/plot_gpr_noisy_targets.py     |  4 ++--
 .../gaussian_process/plot_gpr_prior_posterior.py   |  7 ++++---
 sklearn/gaussian_process/gpc.py                    | 12 ++++++------
 sklearn/gaussian_process/gpr.py                    |  2 +-
 sklearn/gaussian_process/kernels.py                | 13 ++++++-------
 sklearn/gaussian_process/tests/test_kernels.py     |  6 ++++--
 10 files changed, 35 insertions(+), 34 deletions(-)

diff --git a/examples/gaussian_process/plot_gpc_iris.py b/examples/gaussian_process/plot_gpc_iris.py
index 15f292be4fbef..ab9c1c6810b75 100644
--- a/examples/gaussian_process/plot_gpc_iris.py
+++ b/examples/gaussian_process/plot_gpc_iris.py
@@ -13,14 +13,12 @@
 import numpy as np
 import matplotlib.pyplot as plt
 from sklearn import datasets
-from sklearn.multiclass import OneVsRestClassifier
 from sklearn.gaussian_process import GaussianProcessClassifier
 from sklearn.gaussian_process.kernels import RBF
 
 # import some data to play with
 iris = datasets.load_iris()
-X = iris.data[:, :2]  # we only take the first two features. We could
-                      # avoid this ugly slicing by using a two-dim dataset
+X = iris.data[:, :2]  # we only take the first two features.
 y = np.array(iris.target, dtype=int)
 
 h = .02  # step size in the mesh
@@ -58,7 +56,7 @@
     plt.xticks(())
     plt.yticks(())
     plt.title("%s, LML: %.3f" %
-        (titles[i], clf.log_marginal_likelihood(clf.kernel_.theta)))
+              (titles[i], clf.log_marginal_likelihood(clf.kernel_.theta)))
 
 plt.tight_layout()
 plt.show()
diff --git a/examples/gaussian_process/plot_gpc_isoprobability.py b/examples/gaussian_process/plot_gpc_isoprobability.py
index 44c8969f52792..21b3a010bf2f3 100644
--- a/examples/gaussian_process/plot_gpc_isoprobability.py
+++ b/examples/gaussian_process/plot_gpc_isoprobability.py
@@ -27,6 +27,7 @@
 # A few constants
 lim = 8
 
+
 def g(x):
     """The function to predict (classification will then consist in predicting
     whether g(x) <= 0 or not)"""
diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py
index ad2007ac105fc..d5ce794656963 100644
--- a/examples/gaussian_process/plot_gpr_co2.py
+++ b/examples/gaussian_process/plot_gpr_co2.py
@@ -63,7 +63,7 @@
 
 from sklearn.gaussian_process import GaussianProcessRegressor
 from sklearn.gaussian_process.kernels \
-    import RBF, Kernel, WhiteKernel, RationalQuadratic, ExpSineSquared
+    import RBF, WhiteKernel, RationalQuadratic, ExpSineSquared
 from sklearn.datasets import fetch_mldata
 
 data = fetch_mldata('mauna-loa-atmospheric-co2').data
@@ -74,11 +74,11 @@
 k1 = 66.0**2 * RBF(l=67.0)  # long term smooth rising trend
 k2 = 2.4**2 * RBF(l=90.0) * ExpSineSquared(l=1.3, p=1.0)  # seasonal component
 k3 = 0.66**2 * RationalQuadratic(l=1.2, alpha=0.78)  # medium term irregularity
-k4 = 0.18**2 * RBF(l=0.134) + WhiteKernel(c=0.19**2) # noise terms
+k4 = 0.18**2 * RBF(l=0.134) + WhiteKernel(c=0.19**2)  # noise terms
 kernel_gpml = k1 + k2 + k3 + k4
 
 gp = GaussianProcessRegressor(kernel=kernel_gpml, alpha=0,
-							         optimizer=None, normalize_y=True)
+                              optimizer=None, normalize_y=True)
 gp.fit(X, y)
 
 print("GPML kernel: %s" % gp.kernel_)
@@ -86,12 +86,12 @@
       % gp.log_marginal_likelihood(gp.kernel_.theta))
 
 # Kernel with optimized parameters
-k1 = 50.0**2 * RBF(l=50.0) # long term smooth rising trend
+k1 = 50.0**2 * RBF(l=50.0)  # long term smooth rising trend
 k2 = 2.0**2 * RBF(l=100.0) \
-	* ExpSineSquared(l=1.0, p=1.0, p_bounds="fixed") # seasonal component
-k3 = 0.5**2 * RationalQuadratic(l=1.0, alpha=1.0) # medium term irregularities
+    * ExpSineSquared(l=1.0, p=1.0, p_bounds="fixed")  # seasonal component
+k3 = 0.5**2 * RationalQuadratic(l=1.0, alpha=1.0)  # medium term irregularities
 k4 = 0.1**2 * RBF(l=0.1) + WhiteKernel(c=0.1**2,
-									   c_bounds=(1e-3, np.inf))  # noise terms
+                                       c_bounds=(1e-3, np.inf))  # noise terms
 kernel = k1 + k2 + k3 + k4
 
 gp = GaussianProcessRegressor(kernel=kernel, alpha=0,
diff --git a/examples/gaussian_process/plot_gpr_noisy.py b/examples/gaussian_process/plot_gpr_noisy.py
index 31fc7f2527bbb..93dc276fa4e07 100644
--- a/examples/gaussian_process/plot_gpr_noisy.py
+++ b/examples/gaussian_process/plot_gpr_noisy.py
@@ -37,7 +37,7 @@
 # First run
 plt.figure(0)
 kernel = 1.0 * RBF(l=100.0, l_bounds=(1e-2, 1e3)) \
-  + WhiteKernel(c=1, c_bounds=(1e-10, 1e+1))
+    + WhiteKernel(c=1, c_bounds=(1e-10, 1e+1))
 gp = GaussianProcessRegressor(kernel=kernel,
                               alpha=0.0).fit(X, y)
 X_ = np.linspace(0, 5, 100)
@@ -56,7 +56,7 @@
 # Second run
 plt.figure(1)
 kernel = 1.0 * RBF(l=1.0, l_bounds=(1e-2, 1e3)) \
-  + WhiteKernel(c=1e-5, c_bounds=(1e-10, 1e+1))
+    + WhiteKernel(c=1e-5, c_bounds=(1e-10, 1e+1))
 gp = GaussianProcessRegressor(kernel=kernel,
                               alpha=0.0).fit(X, y)
 X_ = np.linspace(0, 5, 100)
diff --git a/examples/gaussian_process/plot_gpr_noisy_targets.py b/examples/gaussian_process/plot_gpr_noisy_targets.py
index e74ac1253d23d..329d0384b40ea 100644
--- a/examples/gaussian_process/plot_gpr_noisy_targets.py
+++ b/examples/gaussian_process/plot_gpr_noisy_targets.py
@@ -38,7 +38,7 @@ def f(x):
     """The function to predict."""
     return x * np.sin(x)
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 #  First the noiseless case
 X = np.atleast_2d([1., 3., 5., 6., 7., 8.]).T
 
@@ -74,7 +74,7 @@ def f(x):
 pl.ylim(-10, 20)
 pl.legend(loc='upper left')
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # now the noisy case
 X = np.linspace(0.1, 9.9, 20)
 X = np.atleast_2d(X).T
diff --git a/examples/gaussian_process/plot_gpr_prior_posterior.py b/examples/gaussian_process/plot_gpr_prior_posterior.py
index 47f253cdf3ba8..c9c32fbeecce4 100644
--- a/examples/gaussian_process/plot_gpr_prior_posterior.py
+++ b/examples/gaussian_process/plot_gpr_prior_posterior.py
@@ -19,15 +19,16 @@
 
 from sklearn.gaussian_process import GaussianProcessRegressor
 from sklearn.gaussian_process.kernels import (RBF, Matern, RationalQuadratic,
-    ExpSineSquared, DotProduct, ConstantKernel)
+                                              ExpSineSquared, DotProduct,
+                                              ConstantKernel)
 
 
 kernels = [1.0 * RBF(l=1.0, l_bounds=(1e-1, 10.0)),
            1.0 * RationalQuadratic(l=1.0, alpha=0.1),
            1.0 * ExpSineSquared(l=1.0, p=3.0, l_bounds=(0.1, 10.0),
                                 p_bounds=(1.0, 10.0)),
-           ConstantKernel(0.1, (0.01, 10.0)) \
-                * (DotProduct(sigma_0=1.0, sigma_0_bounds=(0.0, 10.0)) ** 2),
+           ConstantKernel(0.1, (0.01, 10.0))
+               * (DotProduct(sigma_0=1.0, sigma_0_bounds=(0.0, 10.0)) ** 2),
            1.0 * Matern(l=1.0, l_bounds=(1e-1, 10.0), nu=1.5)]
 
 for fig_index, kernel in enumerate(kernels):
diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 6f83e475ab918..01ad8e0f89f2d 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -137,8 +137,8 @@ def optimizer(obj_func, initial_theta, bounds):
         of sqrt(W) is stored.
     """
     def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b",
-                 n_restarts_optimizer=0, max_iter_predict=100, warm_start=False,
-                 copy_X_train=True, random_state=None):
+                 n_restarts_optimizer=0, max_iter_predict=100,
+                 warm_start=False, copy_X_train=True, random_state=None):
         self.kernel = kernel
         self.optimizer = optimizer
         self.n_restarts_optimizer = n_restarts_optimizer
@@ -337,8 +337,8 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
         R = W_sr[:, np.newaxis] * cho_solve((L, True), np.diag(W_sr))  # Line 7
         C = solve(L, W_sr[:, np.newaxis] * K)  # Line 8
         # Line 9: (use einsum to compute np.diag(C.T.dot(C))))
-        s_2 = -0.5*(np.diag(K) - np.einsum('ij, ij -> j', C, C)) \
-            * (pi * (1 - pi) * (1 - 2*pi))  # third derivative
+        s_2 = -0.5 * (np.diag(K) - np.einsum('ij, ij -> j', C, C)) \
+            * (pi * (1 - pi) * (1 - 2 * pi))  # third derivative
 
         for j in range(d_Z.shape[0]):
             C = K_gradient[:, :, j]   # Line 11
@@ -389,8 +389,8 @@ def _posterior_mode(self, K, return_temporaries=False):
 
             # Line 10: Compute log marginal likelihood in loop and use as
             #          convergence criterion
-            lml = -0.5*a.T.dot(f) \
-                - np.log(1 + np.exp(-(self.y_train_*2 - 1)*f)).sum() \
+            lml = -0.5 * a.T.dot(f) \
+                - np.log(1 + np.exp(-(self.y_train_ * 2 - 1) * f)).sum() \
                 - np.log(np.diag(L)).sum()
             # Check if we have converged (log marginal likelihood does
             # not decrease)
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index fe4d06b18ce8e..9c5c1626b161e 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -8,7 +8,7 @@
 from operator import itemgetter
 
 import numpy as np
-from scipy.linalg import cholesky, cho_solve, solve, solve_triangular
+from scipy.linalg import cholesky, cho_solve, solve_triangular
 from scipy.optimize import fmin_l_bfgs_b
 
 from sklearn.base import BaseEstimator, RegressorMixin, clone
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 0517a9de7de68..cfcf9bc2eaa87 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -20,7 +20,6 @@
 #       package.
 
 from abc import ABCMeta, abstractmethod
-from functools import partial
 from collections import namedtuple
 import inspect
 import math
@@ -34,9 +33,9 @@
 from ..base import clone
 
 
-class Hyperparameter(namedtuple(
-     'Hyperparameter',
-     ('name', 'value_type', 'bounds', 'n_elements', 'fixed'))):
+class Hyperparameter(namedtuple('Hyperparameter',
+                                ('name', 'value_type', 'bounds',
+                                 'n_elements', 'fixed'))):
     """A kernel hyperparameter's specification in form of a namedtuple.
 
     Entries
@@ -89,7 +88,7 @@ def __new__(cls, name, value_type, bounds, n_elements=1, fixed=None):
                                      % (name, n_elements, bounds.shape[0]))
 
         if fixed is None:
-             fixed = bounds is "fixed"
+            fixed = bounds is "fixed"
         return super(Hyperparameter, cls).__new__(
             cls, name, value_type, bounds, n_elements, fixed)
 
@@ -340,7 +339,7 @@ def get_params(self, deep=True):
         params : mapping of string to any
             Parameter names mapped to their values.
         """
-        return dict(kernels=kernels)
+        return dict(kernels=self.kernels)
 
     @property
     def theta(self):
@@ -1309,7 +1308,7 @@ def __init__(self, l=1.0, alpha=1.0, l_bounds=(1e-5, 1e5),
 
         self.hyperparameter_l = Hyperparameter("l", "numeric", l_bounds)
         self.hyperparameter_alpha = \
-             Hyperparameter("alpha", "numeric", alpha_bounds)
+            Hyperparameter("alpha", "numeric", alpha_bounds)
 
     def __call__(self, X, Y=None, eval_gradient=False):
         """Return the kernel k(X, Y) and optionally its gradient.
diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index b03e9bb5a2371..59f7a0b5ab7f6 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -86,13 +86,15 @@ def test_kernel_theta():
         theta_vars = map(lambda s: s.rstrip("_bounds"),
                          filter(lambda s: s.endswith("_bounds"), args))
         assert_equal(
-            set(hyperparameter.name for hyperparameter in kernel.hyperparameters),
+            set(hyperparameter.name
+                for hyperparameter in kernel.hyperparameters),
             set(theta_vars))
 
         # Check that values returned in theta are consistent with
         # hyperparameter values (being their logarithms)
         for i, hyperparameter in enumerate(kernel.hyperparameters):
-            assert_equal(theta[i], np.log(getattr(kernel, hyperparameter.name)))
+            assert_equal(theta[i],
+                         np.log(getattr(kernel, hyperparameter.name)))
 
         # Fixed kernel parameters must be excluded from theta and gradient.
         for i, hyperparameter in enumerate(kernel.hyperparameters):

From 0b6116a9d7d71e8a4b6e26f494ba8a106c639ed7 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Mon, 31 Aug 2015 16:16:46 +0200
Subject: [PATCH 159/165] REF More meaningful names for hyperparameters of GP
 kernels

---
 doc/modules/gaussian_process.rst              |  71 ++---
 .../gaussian_process/plot_compare_gpr_krr.py  |   5 +-
 examples/gaussian_process/plot_gpc.py         |   4 +-
 examples/gaussian_process/plot_gpc_xor.py     |   5 +-
 examples/gaussian_process/plot_gpr_co2.py     |  55 ++--
 examples/gaussian_process/plot_gpr_noisy.py   |   8 +-
 .../plot_gpr_prior_posterior.py               |  17 +-
 sklearn/gaussian_process/gpc.py               |   4 +-
 sklearn/gaussian_process/gpr.py               |   4 +-
 sklearn/gaussian_process/kernels.py           | 280 ++++++++++--------
 sklearn/gaussian_process/tests/test_gpc.py    |   9 +-
 sklearn/gaussian_process/tests/test_gpr.py    |  19 +-
 .../gaussian_process/tests/test_kernels.py    |  40 +--
 13 files changed, 281 insertions(+), 240 deletions(-)

diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst
index 05bcdb5294709..52049efc16153 100644
--- a/doc/modules/gaussian_process.rst
+++ b/doc/modules/gaussian_process.rst
@@ -134,7 +134,8 @@ The following figure illustrates both methods on an artificial dataset, which
 consists of a sinusoidal target function and strong noise. The figure compares
 the learned model of KRR and GPR based on a ExpSineSquared kernel, which is
 suited for learning periodic functions. The kernel's hyperparameters control
-the smoothness (l) and periodicity of the kernel (p). Moreover, the noise level
+the smoothness (length_scale) and periodicity of the kernel (periodicity).
+Moreover, the noise level
 of the data is learned explicitly by GPR by an additional WhiteKernel component
 in the kernel and by the regularization parameter alpha of KRR.
 
@@ -192,25 +193,25 @@ different properties of the signal:
    and the RBF's length scale are further free parameters.
 
 Maximizing the log-marginal-likelihood after subtracting the target's mean
-yields the following kernel with an LML of -84.483:
+yields the following kernel with an LML of -83.214:
 
 ::
 
-   2.5e+03 * RBF(l=49.8)
-   + 6.68 * RBF(l=100) * ExpSineSquared(l=1.37, p=1)
-   + 0.215 * RationalQuadratic(alpha=3.98, l=0.982)
-   + 0.0381 * RBF(l=0.136) + WhiteKernel(c=0.0335)
-
-Thus, most of the target signal (sqrt(2.5e+03)ppm = 50ppm) is explained by a
-long-term rising trend (length-scale 49.8 years). The periodic component has
-an amplitude of sqrt(6.68)ppm = 2.58ppm, a decay time of 100 years and a
-length-scale of 1.37. The long decay time indicates that we have a locally very
-close to periodic seasonal component. The correlated noise has an amplitude of
-sqrt(0.0381)ppm = 0.195ppm with a length scale of 0.136 years and a white-noise
-contribution of sqrt(0.0335)ppm = 0.183pm. Thus, the overall noise level is
-very small, indicating that the data can be very well explained by the model.
-The following figure shows also that the model makes very confident predictions
-until around 2015.
+   34.4**2 * RBF(length_scale=41.8)
+   + 3.27**2 * RBF(length_scale=180) * ExpSineSquared(length_scale=1.44,
+                                                      periodicity=1)
+   + 0.446**2 * RationalQuadratic(alpha=17.7, length_scale=0.957)
+   + 0.197**2 * RBF(length_scale=0.138) + WhiteKernel(noise_level=0.0336)
+
+Thus, most of the target signal (34.4ppm) is explained by a long-term rising
+trend (length-scale 41.8 years). The periodic component has an amplitude of
+3.27ppm, a decay time of 180 years and a length-scale of 1.44. The long decay
+time indicates that we have a locally very close to periodic seasonal
+component. The correlated noise has an amplitude of 0.197ppm with a length
+scale of 0.138 years and a white-noise contribution of 0.183ppm. Thus, the
+overall noise level is very small, indicating that the data can be very well
+explained by the model. The figure shows also that the model makes very
+confident predictions until around 2015.
 
 .. figure:: ../auto_examples/gaussian_process/images/plot_gpr_co2_001.png
    :target: ../auto_examples/gaussian_process/plot_gpr_co2.html
@@ -403,23 +404,23 @@ method is ``clone_with_theta(theta)``, which returns a cloned version of the
 kernel but with the hyperparameters set to ``theta``. An illustrative example:
 
     >>> from sklearn.gaussian_process.kernels import ConstantKernel, RBF
-    >>> kernel = ConstantKernel(c=1.0, c_bounds=(0.0, 10.0)) * RBF(l=0.5, l_bounds=(0.0, 10.0)) + RBF(l=2.0, l_bounds=(0.0, 10.0))
+    >>> kernel = ConstantKernel(constant_value=1.0, constant_value_bounds=(0.0, 10.0)) * RBF(length_scale=0.5, length_scale_bounds=(0.0, 10.0)) + RBF(length_scale=2.0, length_scale_bounds=(0.0, 10.0))
     >>> for hyperparameter in kernel.hyperparameters: print(hyperparameter)
-    Hyperparameter(name='k1__k1__c', value_type='numeric', bounds=array([[  0.,  10.]]), n_elements=1, fixed=False)
-    Hyperparameter(name='k1__k2__l', value_type='numeric', bounds=array([[  0.,  10.]]), n_elements=1, fixed=False)
-    Hyperparameter(name='k2__l', value_type='numeric', bounds=array([[  0.,  10.]]), n_elements=1, fixed=False)
+    Hyperparameter(name='k1__k1__constant_value', value_type='numeric', bounds=array([[  0.,  10.]]), n_elements=1, fixed=False)
+    Hyperparameter(name='k1__k2__length_scale', value_type='numeric', bounds=array([[  0.,  10.]]), n_elements=1, fixed=False)
+    Hyperparameter(name='k2__length_scale', value_type='numeric', bounds=array([[  0.,  10.]]), n_elements=1, fixed=False)
     >>> params = kernel.get_params()
     >>> for key in sorted(params): print("%s : %s" % (key, params[key]))
-    k1 : 1**2 * RBF(l=0.5)
+    k1 : 1**2 * RBF(length_scale=0.5)
     k1__k1 : 1**2
-    k1__k1__c : 1.0
-    k1__k1__c_bounds : (0.0, 10.0)
-    k1__k2 : RBF(l=0.5)
-    k1__k2__l : 0.5
-    k1__k2__l_bounds : (0.0, 10.0)
-    k2 : RBF(l=2)
-    k2__l : 2.0
-    k2__l_bounds : (0.0, 10.0)
+    k1__k1__constant_value : 1.0
+    k1__k1__constant_value_bounds : (0.0, 10.0)
+    k1__k2 : RBF(length_scale=0.5)
+    k1__k2__length_scale : 0.5
+    k1__k2__length_scale_bounds : (0.0, 10.0)
+    k2 : RBF(length_scale=2)
+    k2__length_scale : 2.0
+    k2__length_scale_bounds : (0.0, 10.0)
     >>> print(kernel.theta)  # Note: log-transformed
     [ 0.         -0.69314718  0.69314718]
     >>> print(kernel.bounds)  # Note: log-transformed
@@ -444,18 +445,18 @@ Basic kernels
 The :class:`ConstantKernel` kernel can be used as part of a :class:`Product`
 kernel where it scales the magnitude of the other factor (kernel) or as part
 of a :class:`Sum` kernel, where it modifies the mean of the Gaussian process.
-It depends on a parameter :math:`c`. It is defined as:
+It depends on a parameter :math:`constant\_value`. It is defined as:
 
 .. math::
-   k(x_i, x_j) = c \;\forall\; x_1, x_2
+   k(x_i, x_j) = constant\_value \;\forall\; x_i, x_j
 
 The main use-case of the :class:`WhiteKernel` kernel is as part of a
 sum-kernel where it explains the noise-component of the signal. Tuning its
-parameter :math:`c` corresponds to estimating the noise-level.
-It is defined as:
+parameter :math:`noise\_level` corresponds to estimating the noise-level.
+It is defined as:
 
 .. math::
-    k(x_i, x_j) = c \text{ if } x_i == x_j \text{ else } 0
+    k(x_i, x_j) = noise\_level \text{ if } x_i == x_j \text{ else } 0
 
 
 Kernel operators
diff --git a/examples/gaussian_process/plot_compare_gpr_krr.py b/examples/gaussian_process/plot_compare_gpr_krr.py
index b23e587978d06..6c728144e44ff 100644
--- a/examples/gaussian_process/plot_compare_gpr_krr.py
+++ b/examples/gaussian_process/plot_compare_gpr_krr.py
@@ -78,7 +78,8 @@
 kr.fit(X, y)
 print("Time for KRR fitting: %.3f" % (time.time() - stime))
 
-gp_kernel = ExpSineSquared(1.0, 5.0, p_bounds=(1e-2, 1e1)) + WhiteKernel(1e-1)
+gp_kernel = ExpSineSquared(1.0, 5.0, periodicity_bounds=(1e-2, 1e1)) \
+    + WhiteKernel(1e-1)
 gpr = GaussianProcessRegressor(kernel=gp_kernel)
 stime = time.time()
 gpr.fit(X, y)
@@ -111,5 +112,5 @@
 plt.ylabel('target')
 plt.xlim(0, 20)
 plt.title('GPR versus Kernel Ridge')
-plt.legend(loc="best")
+plt.legend(loc="best", prop={'size': 10})
 plt.show()
diff --git a/examples/gaussian_process/plot_gpc.py b/examples/gaussian_process/plot_gpc.py
index 736e64c6da3f2..26b7d6d0decfe 100644
--- a/examples/gaussian_process/plot_gpc.py
+++ b/examples/gaussian_process/plot_gpc.py
@@ -42,11 +42,11 @@
 y = np.array(X[:, 0] > 2.5, dtype=int)
 
 # Specify Gaussian Processes with fixed and optimized hyperparameters
-gp_fix = GaussianProcessClassifier(kernel=1.0 * RBF(l=1.0),
+gp_fix = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0),
                                    optimizer=None)
 gp_fix.fit(X[:train_size], y[:train_size])
 
-gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(l=1.0))
+gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0))
 gp_opt.fit(X[:train_size], y[:train_size])
 
 print("Log Marginal Likelihood (initial): %.3f"
diff --git a/examples/gaussian_process/plot_gpc_xor.py b/examples/gaussian_process/plot_gpc_xor.py
index 5e8678d2bb0ba..084f6b4b7cace 100644
--- a/examples/gaussian_process/plot_gpc_xor.py
+++ b/examples/gaussian_process/plot_gpc_xor.py
@@ -30,7 +30,7 @@
 
 # fit the model
 plt.figure(figsize=(10, 5))
-kernels = [1.0 * RBF(l=1.0), 1.0 * DotProduct(sigma_0=1.0)**2]
+kernels = [1.0 * RBF(length_scale=1.0), 1.0 * DotProduct(sigma_0=1.0)**2]
 for i, kernel in enumerate(kernels):
     clf = GaussianProcessClassifier(kernel=kernel, warm_start=True).fit(X, Y)
 
@@ -50,7 +50,8 @@
     plt.axis([-3, 3, -3, 3])
     plt.colorbar(image)
     plt.title("%s\n Log-Marginal-Likelihood:%.3f"
-              % (clf.kernel_, clf.log_marginal_likelihood(clf.kernel_.theta)))
+              % (clf.kernel_, clf.log_marginal_likelihood(clf.kernel_.theta)),
+              fontsize=12)
 
 plt.tight_layout()
 plt.show()
diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py
index d5ce794656963..07f1d8214df91 100644
--- a/examples/gaussian_process/plot_gpr_co2.py
+++ b/examples/gaussian_process/plot_gpr_co2.py
@@ -36,20 +36,20 @@
 
 Maximizing the log-marginal-likelihood after subtracting the target's mean
 yields the following kernel with an LML of -83.214:
-   1.19e+03 * RBF(l=41.8)
-   + 10.7 * RBF(l=180) * ExpSineSquared(l=1.44, p=1)
-   + 0.199 * RationalQuadratic(alpha=17.7, l=0.957)
-   + 0.0389 * RBF(l=0.138) + WhiteKernel(c=0.0336)
-Thus, most of the target signal (sqrt(1.19e+03)ppm = 34.5ppm) is explained by a
-long-term rising trend (length-scale 41.8 years). The periodic component has
-an amplitude of sqrt(10.7)ppm = 3.27ppm, a decay time of 180 years and a
-length-scale of 1.44. The long decay time indicates that we have a locally very
-close to periodic seasonal component. The correlated noise has an amplitude of
-sqrt(0.0389)ppm = 0.197ppm with a length scale of 0.138 years and a white-noise
-contribution of sqrt(0.0336)ppm = 0.183pm. Thus, the overall noise level is
-very small, indicating that the data can be very well explained by the model.
-The figure shows also that the model makes very confident predictions until
-around 2015.
+   34.4**2 * RBF(length_scale=41.8)
+   + 3.27**2 * RBF(length_scale=180) * ExpSineSquared(length_scale=1.44,
+                                                      periodicity=1)
+   + 0.446**2 * RationalQuadratic(alpha=17.7, length_scale=0.957)
+   + 0.197**2 * RBF(length_scale=0.138) + WhiteKernel(noise_level=0.0336)
+Thus, most of the target signal (34.4ppm) is explained by a long-term rising
+trend (length-scale 41.8 years). The periodic component has an amplitude of
+3.27ppm, a decay time of 180 years and a length-scale of 1.44. The long decay
+time indicates that we have a locally very close to periodic seasonal
+component. The correlated noise has an amplitude of 0.197ppm with a length
+scale of 0.138 years and a white-noise contribution of 0.183ppm. Thus, the
+overall noise level is very small, indicating that the data can be very well
+explained by the model. The figure shows also that the model makes very
+confident predictions until around 2015.
 """
 print(__doc__)
 
@@ -71,10 +71,14 @@
 y = data[:, 0]
 
 # Kernel with parameters given in GPML book
-k1 = 66.0**2 * RBF(l=67.0)  # long term smooth rising trend
-k2 = 2.4**2 * RBF(l=90.0) * ExpSineSquared(l=1.3, p=1.0)  # seasonal component
-k3 = 0.66**2 * RationalQuadratic(l=1.2, alpha=0.78)  # medium term irregularity
-k4 = 0.18**2 * RBF(l=0.134) + WhiteKernel(c=0.19**2)  # noise terms
+k1 = 66.0**2 * RBF(length_scale=67.0)  # long term smooth rising trend
+k2 = 2.4**2 * RBF(length_scale=90.0) \
+    * ExpSineSquared(length_scale=1.3, periodicity=1.0)  # seasonal component
+# medium term irregularity
+k3 = 0.66**2 \
+    * RationalQuadratic(length_scale=1.2, alpha=0.78)
+k4 = 0.18**2 * RBF(length_scale=0.134) \
+    + WhiteKernel(noise_level=0.19**2)  # noise terms
 kernel_gpml = k1 + k2 + k3 + k4
 
 gp = GaussianProcessRegressor(kernel=kernel_gpml, alpha=0,
@@ -86,12 +90,15 @@
       % gp.log_marginal_likelihood(gp.kernel_.theta))
 
 # Kernel with optimized parameters
-k1 = 50.0**2 * RBF(l=50.0)  # long term smooth rising trend
-k2 = 2.0**2 * RBF(l=100.0) \
-    * ExpSineSquared(l=1.0, p=1.0, p_bounds="fixed")  # seasonal component
-k3 = 0.5**2 * RationalQuadratic(l=1.0, alpha=1.0)  # medium term irregularities
-k4 = 0.1**2 * RBF(l=0.1) + WhiteKernel(c=0.1**2,
-                                       c_bounds=(1e-3, np.inf))  # noise terms
+k1 = 50.0**2 * RBF(length_scale=50.0)  # long term smooth rising trend
+k2 = 2.0**2 * RBF(length_scale=100.0) \
+    * ExpSineSquared(length_scale=1.0, periodicity=1.0,
+                     periodicity_bounds="fixed")  # seasonal component
+# medium term irregularities
+k3 = 0.5**2 * RationalQuadratic(length_scale=1.0, alpha=1.0)
+k4 = 0.1**2 * RBF(length_scale=0.1) \
+    + WhiteKernel(noise_level=0.1**2,
+                  noise_level_bounds=(1e-3, np.inf))  # noise terms
 kernel = k1 + k2 + k3 + k4
 
 gp = GaussianProcessRegressor(kernel=kernel, alpha=0,
diff --git a/examples/gaussian_process/plot_gpr_noisy.py b/examples/gaussian_process/plot_gpr_noisy.py
index 93dc276fa4e07..55af701f50d36 100644
--- a/examples/gaussian_process/plot_gpr_noisy.py
+++ b/examples/gaussian_process/plot_gpr_noisy.py
@@ -36,8 +36,8 @@
 
 # First run
 plt.figure(0)
-kernel = 1.0 * RBF(l=100.0, l_bounds=(1e-2, 1e3)) \
-    + WhiteKernel(c=1, c_bounds=(1e-10, 1e+1))
+kernel = 1.0 * RBF(length_scale=100.0, length_scale_bounds=(1e-2, 1e3)) \
+    + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))
 gp = GaussianProcessRegressor(kernel=kernel,
                               alpha=0.0).fit(X, y)
 X_ = np.linspace(0, 5, 100)
@@ -55,8 +55,8 @@
 
 # Second run
 plt.figure(1)
-kernel = 1.0 * RBF(l=1.0, l_bounds=(1e-2, 1e3)) \
-    + WhiteKernel(c=1e-5, c_bounds=(1e-10, 1e+1))
+kernel = 1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3)) \
+    + WhiteKernel(noise_level=1e-5, noise_level_bounds=(1e-10, 1e+1))
 gp = GaussianProcessRegressor(kernel=kernel,
                               alpha=0.0).fit(X, y)
 X_ = np.linspace(0, 5, 100)
diff --git a/examples/gaussian_process/plot_gpr_prior_posterior.py b/examples/gaussian_process/plot_gpr_prior_posterior.py
index c9c32fbeecce4..6c2bfe525f730 100644
--- a/examples/gaussian_process/plot_gpr_prior_posterior.py
+++ b/examples/gaussian_process/plot_gpr_prior_posterior.py
@@ -23,13 +23,15 @@
                                               ConstantKernel)
 
 
-kernels = [1.0 * RBF(l=1.0, l_bounds=(1e-1, 10.0)),
-           1.0 * RationalQuadratic(l=1.0, alpha=0.1),
-           1.0 * ExpSineSquared(l=1.0, p=3.0, l_bounds=(0.1, 10.0),
-                                p_bounds=(1.0, 10.0)),
+kernels = [1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-1, 10.0)),
+           1.0 * RationalQuadratic(length_scale=1.0, alpha=0.1),
+           1.0 * ExpSineSquared(length_scale=1.0, periodicity=3.0,
+                                length_scale_bounds=(0.1, 10.0),
+                                periodicity_bounds=(1.0, 10.0)),
            ConstantKernel(0.1, (0.01, 10.0))
                * (DotProduct(sigma_0=1.0, sigma_0_bounds=(0.0, 10.0)) ** 2),
-           1.0 * Matern(l=1.0, l_bounds=(1e-1, 10.0), nu=1.5)]
+           1.0 * Matern(length_scale=1.0, length_scale_bounds=(1e-1, 10.0),
+                        nu=1.5)]
 
 for fig_index, kernel in enumerate(kernels):
     # Specify Gaussian Process
@@ -47,7 +49,7 @@
     plt.plot(X_, y_samples, lw=1)
     plt.xlim(0, 5)
     plt.ylim(-3, 3)
-    plt.title("Prior (kernel:  %s)" % kernel)
+    plt.title("Prior (kernel:  %s)" % kernel, fontsize=12)
 
     # Generate data and fit GP
     rng = np.random.RandomState(4)
@@ -69,7 +71,8 @@
     plt.xlim(0, 5)
     plt.ylim(-3, 3)
     plt.title("Posterior (kernel: %s)\n Log-Likelihood: %.3f"
-              % (gp.kernel_, gp.log_marginal_likelihood(gp.kernel_.theta)))
+              % (gp.kernel_, gp.log_marginal_likelihood(gp.kernel_.theta)),
+              fontsize=12)
     plt.tight_layout()
 
 plt.show()
diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 01ad8e0f89f2d..d52c6a3a80457 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -163,8 +163,8 @@ def fit(self, X, y):
         self : returns an instance of self.
         """
         if self.kernel is None:  # Use an RBF kernel as default
-            self.kernel_ = \
-                C(1.0, c_bounds="fixed") * RBF(1.0, l_bounds="fixed")
+            self.kernel_ = C(1.0, constant_value_bounds="fixed") \
+                * RBF(1.0, length_scale_bounds="fixed")
         else:
             self.kernel_ = clone(self.kernel)
 
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 9c5c1626b161e..b53e12865ddfb 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -147,8 +147,8 @@ def fit(self, X, y):
         self : returns an instance of self.
         """
         if self.kernel is None:  # Use an RBF kernel as default
-            self.kernel_ = \
-                C(1.0, c_bounds="fixed") * RBF(1.0, l_bounds="fixed")
+            self.kernel_ = C(1.0, constant_value_bounds="fixed") \
+                * RBF(1.0, length_scale_bounds="fixed")
         else:
             self.kernel_ = clone(self.kernel)
 
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index cfcf9bc2eaa87..fb764828483fc 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -839,21 +839,23 @@ class ConstantKernel(Kernel):
     the other factor (kernel) or as part of a sum-kernel, where it modifies
     the mean of the Gaussian process.
 
-    k(x_1, x_2) = c for all x_1, x_2
+    k(x_1, x_2) = constant_value for all x_1, x_2
 
     Parameters
     ----------
-    c : float, default: 1.0
-        The constant value which defines the covariance: k(x_1, x_2) = c
+    constant_value : float, default: 1.0
+        The constant value which defines the covariance:
+        k(x_1, x_2) = constant_value
 
-    c_bounds : pair of floats >= 0, default: (1e-5, 1e5)
-        The lower and upper bound on c
+    constant_value_bounds : pair of floats >= 0, default: (1e-5, 1e5)
+        The lower and upper bound on constant_value
     """
-    def __init__(self, c=1.0, c_bounds=(1e-5, 1e5)):
-        self.c = c
-        self.c_bounds = c_bounds
+    def __init__(self, constant_value=1.0, constant_value_bounds=(1e-5, 1e5)):
+        self.constant_value = constant_value
+        self.constant_value_bounds = constant_value_bounds
 
-        self.hyperparameter_c = Hyperparameter("c", "numeric", c_bounds)
+        self.hyperparameter_constant_value = \
+            Hyperparameter("constant_value", "numeric", constant_value_bounds)
 
     def __call__(self, X, Y=None, eval_gradient=False):
         """Return the kernel k(X, Y) and optionally its gradient.
@@ -887,10 +889,11 @@ def __call__(self, X, Y=None, eval_gradient=False):
         elif eval_gradient:
             raise ValueError("Gradient can only be evaluated when Y is None.")
 
-        K = self.c * np.ones((X.shape[0], Y.shape[0]))
+        K = self.constant_value * np.ones((X.shape[0], Y.shape[0]))
         if eval_gradient:
-            if not self.hyperparameter_c.fixed:
-                return K, self.c * np.ones((X.shape[0], X.shape[0], 1))
+            if not self.hyperparameter_constant_value.fixed:
+                return (K, self.constant_value
+                        * np.ones((X.shape[0], X.shape[0], 1)))
             else:
                 return K, np.empty((X.shape[0], X.shape[0], 0))
         else:
@@ -913,10 +916,10 @@ def diag(self, X):
         K_diag : array, shape (n_samples_X,)
             Diagonal of kernel k(X, X)
         """
-        return self.c * np.ones(X.shape[0])
+        return self.constant_value * np.ones(X.shape[0])
 
     def __repr__(self):
-        return "{0:.3g}**2".format(np.sqrt(self.c))
+        return "{0:.3g}**2".format(np.sqrt(self.constant_value))
 
 
 class WhiteKernel(Kernel):
@@ -926,21 +929,22 @@ class WhiteKernel(Kernel):
     explains the noise-component of the signal. Tuning its parameter
     corresponds to estimating the noise-level.
 
-    k(x_1, x_2) = c if x_1 == x_2 else 0
+    k(x_1, x_2) = noise_level if x_1 == x_2 else 0
 
     Parameters
     ----------
-    c : float, default: 1.0
+    noise_level : float, default: 1.0
         Parameter controlling the noise level
 
-    c_bounds : pair of floats >= 0, default: (1e-5, 1e5)
-        The lower and upper bound on c
+    noise_level_bounds : pair of floats >= 0, default: (1e-5, 1e5)
+        The lower and upper bound on noise_level
     """
-    def __init__(self, c=1.0, c_bounds=(1e-5, 1e5)):
-        self.c = c
-        self.c_bounds = c_bounds
+    def __init__(self, noise_level=1.0, noise_level_bounds=(1e-5, 1e5)):
+        self.noise_level = noise_level
+        self.noise_level_bounds = noise_level_bounds
 
-        self.hyperparameter_c = Hyperparameter("c", "numeric", c_bounds)
+        self.hyperparameter_noise_level = \
+            Hyperparameter("noise_level", "numeric", noise_level_bounds)
 
     def __call__(self, X, Y=None, eval_gradient=False):
         """Return the kernel k(X, Y) and optionally its gradient.
@@ -973,10 +977,11 @@ def __call__(self, X, Y=None, eval_gradient=False):
             raise ValueError("Gradient can only be evaluated when Y is None.")
 
         if Y is None:
-            K = self.c * np.eye(X.shape[0])
+            K = self.noise_level * np.eye(X.shape[0])
             if eval_gradient:
-                if not self.hyperparameter_c.fixed:
-                    return K, self.c * np.eye(X.shape[0])[:, :, np.newaxis]
+                if not self.hyperparameter_noise_level.fixed:
+                    return (K, self.noise_level
+                            * np.eye(X.shape[0])[:, :, np.newaxis])
                 else:
                     return K, np.empty((X.shape[0], X.shape[0], 0))
             else:
@@ -1001,10 +1006,11 @@ def diag(self, X):
         K_diag : array, shape (n_samples_X,)
             Diagonal of kernel k(X, X)
         """
-        return self.c * np.ones(X.shape[0])
+        return self.noise_level * np.ones(X.shape[0])
 
     def __repr__(self):
-        return "{0}(c={1:.3g})".format(self.__class__.__name__, self.c)
+        return "{0}(noise_level={1:.3g})".format(self.__class__.__name__,
+                                                 self.noise_level)
 
 
 class RBF(Kernel):
@@ -1012,11 +1018,11 @@ class RBF(Kernel):
 
     The RBF kernel is a stationary kernel. It is also known as the
     "squared exponential" kernel. It is parameterized by a length-scale
-    parameter l>0, which can either be a scalar (isotropic variant of
-    the kernel) or a vector with the same number of dimensions as the inputs
-    X (anisotropic variant of the kernel). The kernel given by:
+    parameter length_scale>0, which can either be a scalar (isotropic variant
+    of the kernel) or a vector with the same number of dimensions as the inputs
+    X (anisotropic variant of the kernel). The kernel is given by:
 
-    k(x_i, x_j) = exp(-1 / 2 d(x_i / l, x_j / l)^2)
+    k(x_i, x_j) = exp(-1 / 2 d(x_i / length_scale, x_j / length_scale)^2)
 
     This kernel is infinitely differentiable, which implies that GPs with this
     kernel as covariance function have mean square derivatives of all orders,
@@ -1024,33 +1030,34 @@ class RBF(Kernel):
 
     Parameters
     -----------
-    l : float or array with shape (n_features,), entries > 0, default: 1.0
+    length_scale : float or array with shape (n_features,), default: 1.0
         The length scale of the kernel. If a float, an isotropic kernel is
         used. If an array, an anisotropic kernel is used where each dimension
         of l defines the length-scale of the respective feature dimension.
 
-    l_bounds : pair of floats >= 0, default: (1e-5, 1e5)
-        The lower and upper bound on l
+    length_scale_bounds : pair of floats >= 0, default: (1e-5, 1e5)
+        The lower and upper bound on length_scale
     """
-    def __init__(self, l=1.0, l_bounds=(1e-5, 1e5)):
-        if np.iterable(l):
-            if len(l) > 1:
+    def __init__(self, length_scale=1.0, length_scale_bounds=(1e-5, 1e5)):
+        if np.iterable(length_scale):
+            if len(length_scale) > 1:
                 self.anisotropic = True
-                self.l = np.asarray(l, dtype=np.float)
+                self.length_scale = np.asarray(length_scale, dtype=np.float)
             else:
                 self.anisotropic = False
-                self.l = float(l[0])
+                self.length_scale = float(length_scale[0])
         else:
             self.anisotropic = False
-            self.l = float(l)
-        self.l_bounds = l_bounds
+            self.length_scale = float(length_scale)
+        self.length_scale_bounds = length_scale_bounds
 
-        if self.anisotropic:  # anisotropic l
-            self.hyperparameter_l = \
-                Hyperparameter("l", "numeric", l_bounds, len(l))
+        if self.anisotropic:  # anisotropic length_scale
+            self.hyperparameter_length_scale = \
+                Hyperparameter("length_scale", "numeric", length_scale_bounds,
+                               len(length_scale))
         else:
-            self.hyperparameter_l = \
-                Hyperparameter("l", "numeric", l_bounds)
+            self.hyperparameter_length_scale = \
+                Hyperparameter("length_scale", "numeric", length_scale_bounds)
 
     def __call__(self, X, Y=None, eval_gradient=False):
         """Return the kernel k(X, Y) and optionally its gradient.
@@ -1079,13 +1086,13 @@ def __call__(self, X, Y=None, eval_gradient=False):
             is True.
         """
         X = np.atleast_2d(X)
-        if self.anisotropic and X.shape[1] != self.l.shape[0]:
+        if self.anisotropic and X.shape[1] != self.length_scale.shape[0]:
             raise Exception("Anisotropic kernel must have the same number of "
                             "dimensions as data (%d!=%d)"
-                            % (self.l.shape[0], X.shape[1]))
+                            % (self.length_scale.shape[0], X.shape[1]))
 
         if Y is None:
-            dists = pdist(X / self.l, metric='sqeuclidean')
+            dists = pdist(X / self.length_scale, metric='sqeuclidean')
             K = np.exp(-.5 * dists)
             # convert from upper-triangular matrix to square matrix
             K = squareform(K)
@@ -1094,21 +1101,22 @@ def __call__(self, X, Y=None, eval_gradient=False):
             if eval_gradient:
                 raise ValueError(
                     "Gradient can only be evaluated when Y is None.")
-            dists = cdist(X / self.l, Y / self.l, metric='sqeuclidean')
+            dists = cdist(X / self.length_scale, Y / self.length_scale,
+                          metric='sqeuclidean')
             K = np.exp(-.5 * dists)
 
         if eval_gradient:
-            if self.hyperparameter_l.fixed:
+            if self.hyperparameter_length_scale.fixed:
                 # Hyperparameter l kept fixed
                 return K, np.empty((X.shape[0], X.shape[0], 0))
-            elif not self.anisotropic or self.l.shape[0] == 1:
+            elif not self.anisotropic or self.length_scale.shape[0] == 1:
                 K_gradient = \
                     (K * squareform(dists))[:, :, np.newaxis]
                 return K, K_gradient
             elif self.anisotropic:
                 # We need to recompute the pairwise dimension-wise distances
                 K_gradient = (X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2 \
-                    / (self.l ** 2)
+                    / (self.length_scale ** 2)
                 K_gradient *= K[..., np.newaxis]
                 return K, K_gradient
             else:
@@ -1119,11 +1127,12 @@ def __call__(self, X, Y=None, eval_gradient=False):
 
     def __repr__(self):
         if self.anisotropic:
-            return "{0}(l=[{1}])".format(self.__class__.__name__,
-                                         ", ".join(map("{0:.3g}".format,
-                                                   self.l)))
+            return "{0}(length_scale=[{1}])".format(
+                self.__class__.__name__, ", ".join(map("{0:.3g}".format,
+                                                   self.length_scale)))
         else:  # isotropic
-            return "{0}(l={1:.3g})".format(self.__class__.__name__, self.l)
+            return "{0}(length_scale={1:.3g})".format(
+                self.__class__.__name__, self.length_scale)
 
 
 class Matern(RBF):
@@ -1142,13 +1151,13 @@ class Matern(RBF):
 
     Parameters
     -----------
-    l : float or array with shape (n_features,), entries > 0, default: 1.0
+    length_scale : float or array with shape (n_features,), default: 1.0
         The length scale of the kernel. If a float, an isotropic kernel is
         used. If an array, an anisotropic kernel is used where each dimension
         of l defines the length-scale of the respective feature dimension.
 
-    l_bounds : pair of floats >= 0, default: (1e-5, 1e5)
-        The lower and upper bound on l
+    length_scale_bounds : pair of floats >= 0, default: (1e-5, 1e5)
+        The lower and upper bound on length_scale
 
     nu: float, default: 1.5
         The parameter nu controlling the smoothness of the learned function.
@@ -1162,8 +1171,9 @@ class Matern(RBF):
         Bessel function. Furthermore, in contrast to l, nu is kept fixed to
         its initial value and not optimized.
     """
-    def __init__(self, l=1.0, l_bounds=(1e-5, 1e5), nu=1.5):
-        super(Matern, self).__init__(l, l_bounds)
+    def __init__(self, length_scale=1.0, length_scale_bounds=(1e-5, 1e5),
+                 nu=1.5):
+        super(Matern, self).__init__(length_scale, length_scale_bounds)
         self.nu = nu
 
     def __call__(self, X, Y=None, eval_gradient=False):
@@ -1193,18 +1203,19 @@ def __call__(self, X, Y=None, eval_gradient=False):
             is True.
         """
         X = np.atleast_2d(X)
-        if self.anisotropic and X.shape[1] != self.l.shape[0]:
+        if self.anisotropic and X.shape[1] != self.length_scale.shape[0]:
             raise Exception("Anisotropic kernel must have the same number of "
                             "dimensions as data (%d!=%d)"
-                            % (self.l.shape[0], X.shape[1]))
+                            % (self.length_scale.shape[0], X.shape[1]))
 
         if Y is None:
-            dists = pdist(X / self.l, metric='euclidean')
+            dists = pdist(X / self.length_scale, metric='euclidean')
         else:
             if eval_gradient:
                 raise ValueError(
                     "Gradient can only be evaluated when Y is None.")
-            dists = cdist(X / self.l, Y / self.l, metric='euclidean')
+            dists = cdist(X / self.length_scale, Y / self.length_scale,
+                          metric='euclidean')
 
         if self.nu == 0.5:
             K = np.exp(-dists)
@@ -1228,7 +1239,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             np.fill_diagonal(K, 1)
 
         if eval_gradient:
-            if self.hyperparameter_l.fixed:
+            if self.hyperparameter_length_scale.fixed:
                 # Hyperparameter l kept fixed
                 K_gradient = np.empty((X.shape[0], X.shape[0], 0))
                 return K, K_gradient
@@ -1236,7 +1247,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
             # We need to recompute the pairwise dimension-wise distances
             if self.anisotropic:
                 D = (X[:, np.newaxis, :] - X[np.newaxis, :, :])**2 \
-                    / (self.l ** 2)
+                    / (self.length_scale ** 2)
             else:
                 D = squareform(dists**2)[:, :, np.newaxis]
 
@@ -1265,13 +1276,13 @@ def f(theta):  # helper function
 
     def __repr__(self):
         if self.anisotropic:
-            return "{0}(l=[{1}], nu={2:.3g})".format(
+            return "{0}(length_scale=[{1}], nu={2:.3g})".format(
                 self.__class__.__name__,
-                ", ".join(map("{0:.3g}".format, self.l)),
+                ", ".join(map("{0:.3g}".format, self.length_scale)),
                 self.nu)
         else:  # isotropic
-            return "{0}(l={1:.3g}, nu={2:.3g})".format(
-                self.__class__.__name__, self.l, self.nu)
+            return "{0}(length_scale={1:.3g}, nu={2:.3g})".format(
+                self.__class__.__name__, self.length_scale, self.nu)
 
 
 class RationalQuadratic(Kernel):
@@ -1279,34 +1290,35 @@ class RationalQuadratic(Kernel):
 
     The RationalQuadratic kernel can be seen as a scale mixture (an infinite
     sum) of RBF kernels with different characteristic length-scales. It is
-    parameterized by a length-scale parameter l>0 and a scale mixture parameter
-    alpha>0 Only the isotropic variant where l is a scalar is supported at the
-    moment. The kernel given by:
+    parameterized by a length-scale parameter length_scale>0 and a scale
+    mixture parameter alpha>0. Only the isotropic variant where length_scale is
+    a scalar is supported at the moment. The kernel is given by:
 
-    k(x_i, x_j) = (1 + d(x_i, x_j)^2 / (2*alpha l^2))^-alpha
+    k(x_i, x_j) = (1 + d(x_i, x_j)^2 / (2*alpha * length_scale^2))^-alpha
 
     Parameters
     ----------
-    l : float > 0, default: 1.0
+    length_scale : float > 0, default: 1.0
         The length scale of the kernel.
 
     alpha : float > 0, default: 1.0
         Scale mixture parameter
 
-    l_bounds : pair of floats >= 0, default: (1e-5, 1e5)
-        The lower and upper bound on l
+    length_scale_bounds : pair of floats >= 0, default: (1e-5, 1e5)
+        The lower and upper bound on length_scale
 
     alpha_bounds : pair of floats >= 0, default: (1e-5, 1e5)
         The lower and upper bound on alpha
     """
-    def __init__(self, l=1.0, alpha=1.0, l_bounds=(1e-5, 1e5),
-                 alpha_bounds=(1e-5, 1e5)):
-        self.l = l
+    def __init__(self, length_scale=1.0, alpha=1.0,
+                 length_scale_bounds=(1e-5, 1e5), alpha_bounds=(1e-5, 1e5)):
+        self.length_scale = length_scale
         self.alpha = alpha
-        self.l_bounds = l_bounds
+        self.length_scale_bounds = length_scale_bounds
         self.alpha_bounds = alpha_bounds
 
-        self.hyperparameter_l = Hyperparameter("l", "numeric", l_bounds)
+        self.hyperparameter_length_scale = \
+            Hyperparameter("length_scale", "numeric", length_scale_bounds)
         self.hyperparameter_alpha = \
             Hyperparameter("alpha", "numeric", alpha_bounds)
 
@@ -1339,7 +1351,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
         X = np.atleast_2d(X)
         if Y is None:
             dists = squareform(pdist(X, metric='sqeuclidean'))
-            tmp = dists / (2 * self.alpha * self.l ** 2)
+            tmp = dists / (2 * self.alpha * self.length_scale ** 2)
             base = (1 + tmp)
             K = base ** -self.alpha
             np.fill_diagonal(K, 1)
@@ -1348,67 +1360,72 @@ def __call__(self, X, Y=None, eval_gradient=False):
                 raise ValueError(
                     "Gradient can only be evaluated when Y is None.")
             dists = cdist(X, Y, metric='sqeuclidean')
-            K = (1 + dists / (2 * self.alpha * self.l ** 2)) ** -self.alpha
+            K = (1 + dists / (2 * self.alpha * self.length_scale ** 2)) \
+                ** -self.alpha
 
         if eval_gradient:
-            # gradient with respect to l
-            if not self.hyperparameter_l.fixed:
-                l_gradient = dists * K / (self.l ** 2 * base)
-                l_gradient = l_gradient[:, :, np.newaxis]
+            # gradient with respect to length_scale
+            if not self.hyperparameter_length_scale.fixed:
+                length_scale_gradient = \
+                    dists * K / (self.length_scale ** 2 * base)
+                length_scale_gradient = length_scale_gradient[:, :, np.newaxis]
             else:  # l is kept fixed
-                l_gradient = np.empty((K.shape[0], K.shape[1], 0))
+                length_scale_gradient = np.empty((K.shape[0], K.shape[1], 0))
 
             # gradient with respect to alpha
             if not self.hyperparameter_alpha.fixed:
                 alpha_gradient = \
                     K * (-self.alpha * np.log(base)
-                         + dists / (2 * self.l ** 2 * base))
+                         + dists / (2 * self.length_scale ** 2 * base))
                 alpha_gradient = alpha_gradient[:, :, np.newaxis]
             else:  # alpha is kept fixed
                 alpha_gradient = np.empty((K.shape[0], K.shape[1], 0))
 
-            return K, np.dstack((alpha_gradient, l_gradient))
+            return K, np.dstack((alpha_gradient, length_scale_gradient))
         else:
             return K
 
     def __repr__(self):
-        return "{0}(alpha={1:.3g}, l={2:.3g})".format(
-            self.__class__.__name__, self.alpha, self.l)
+        return "{0}(alpha={1:.3g}, length_scale={2:.3g})".format(
+            self.__class__.__name__, self.alpha, self.length_scale)
 
 
 class ExpSineSquared(Kernel):
     """Exp-Sine-Squared kernel.
 
     The ExpSineSquared kernel allows modeling periodic functions. It is
-    parameterized by a length-scale parameter l>0 and a periodicity parameter
-    p>0. Only the isotropic variant where l is a scalar is supported at the
-    moment. The kernel given by:
+    parameterized by a length-scale parameter length_scale>0 and a periodicity
+    parameter periodicity>0. Only the isotropic variant where length_scale is
+    a scalar is supported at the moment. The kernel is given by:
 
-    k(x_i, x_j) =  exp(-2 sin(\pi / p * d(x_i, x_j)) / l)^2
+    k(x_i, x_j) = exp(-2 (sin(\pi/periodicity * d(x_i, x_j)) / length_scale)^2)
 
     Parameters
     ----------
-    l : float > 0, default: 1.0
+    length_scale : float > 0, default: 1.0
         The length scale of the kernel.
 
-    p : float > 0, default: 1.0
+    periodicity : float > 0, default: 1.0
         The periodicity of the kernel.
 
-    l_bounds : pair of floats >= 0, default: (1e-5, 1e5)
-        The lower and upper bound on l
+    length_scale_bounds : pair of floats >= 0, default: (1e-5, 1e5)
+        The lower and upper bound on length_scale
 
-    p_bounds : pair of floats >= 0, default: (1e-5, 1e5)
-        The lower and upper bound on p
+    periodicity_bounds : pair of floats >= 0, default: (1e-5, 1e5)
+        The lower and upper bound on periodicity
     """
-    def __init__(self, l=1.0, p=1.0, l_bounds=(1e-5, 1e5),
-                 p_bounds=(1e-5, 1e5)):
-        self.l = l
-        self.p = p
-        self.l_bounds = l_bounds
-        self.p_bounds = p_bounds
-
-        self.hyperparameter_l = Hyperparameter("l", "numeric", l_bounds)
-        self.hyperparameter_p = Hyperparameter("p", "numeric", p_bounds)
+    def __init__(self, length_scale=1.0, periodicity=1.0,
+                 length_scale_bounds=(1e-5, 1e5),
+                 periodicity_bounds=(1e-5, 1e5)):
+        self.length_scale = length_scale
+        self.periodicity = periodicity
+        self.length_scale_bounds = length_scale_bounds
+        self.periodicity_bounds = periodicity_bounds
+
+        self.hyperparameter_length_scale = \
+            Hyperparameter("length_scale", "numeric", length_scale_bounds)
+        self.hyperparameter_periodicity = \
+            Hyperparameter("periodicity", "numeric", periodicity_bounds)
 
     def __call__(self, X, Y=None, eval_gradient=False):
         """Return the kernel k(X, Y) and optionally its gradient.
@@ -1439,39 +1456,42 @@ def __call__(self, X, Y=None, eval_gradient=False):
         X = np.atleast_2d(X)
         if Y is None:
             dists = squareform(pdist(X, metric='euclidean'))
-            arg = np.pi * dists / self.p
+            arg = np.pi * dists / self.periodicity
             sin_of_arg = np.sin(arg)
-            K = np.exp(- 2 * (sin_of_arg / self.l) ** 2)
+            K = np.exp(- 2 * (sin_of_arg / self.length_scale) ** 2)
         else:
             if eval_gradient:
                 raise ValueError(
                     "Gradient can only be evaluated when Y is None.")
             dists = cdist(X, Y, metric='euclidean')
-            K = np.exp(- 2 * (np.sin(np.pi / self.p * dists) / self.l) ** 2)
+            K = np.exp(- 2 * (np.sin(np.pi / self.periodicity * dists)
+                              / self.length_scale) ** 2)
 
         if eval_gradient:
             cos_of_arg = np.cos(arg)
-            # gradient with respect to l
-            if not self.hyperparameter_l.fixed:
-                l_gradient = 4 / self.l**2 * sin_of_arg**2 * K
-                l_gradient = l_gradient[:, :, np.newaxis]
-            else:  # l is kept fixed
-                l_gradient = np.empty((K.shape[0], K.shape[1], 0))
+            # gradient with respect to length_scale
+            if not self.hyperparameter_length_scale.fixed:
+                length_scale_gradient = \
+                    4 / self.length_scale**2 * sin_of_arg**2 * K
+                length_scale_gradient = length_scale_gradient[:, :, np.newaxis]
+            else:  # length_scale is kept fixed
+                length_scale_gradient = np.empty((K.shape[0], K.shape[1], 0))
             # gradient with respect to p
-            if not self.hyperparameter_p.fixed:
-                p_gradient = 4 * arg / self.l**2 * cos_of_arg \
+            if not self.hyperparameter_periodicity.fixed:
+                periodicity_gradient = \
+                    4 * arg / self.length_scale**2 * cos_of_arg \
                     * sin_of_arg * K
-                p_gradient = p_gradient[:, :, np.newaxis]
+                periodicity_gradient = periodicity_gradient[:, :, np.newaxis]
             else:  # p is kept fixed
-                p_gradient = np.empty((K.shape[0], K.shape[1], 0))
+                periodicity_gradient = np.empty((K.shape[0], K.shape[1], 0))
 
-            return K, np.dstack((l_gradient, p_gradient))
+            return K, np.dstack((length_scale_gradient, periodicity_gradient))
         else:
             return K
 
     def __repr__(self):
-        return "{0}(l={1:.3g}, p={2:.3g})".format(
-            self.__class__.__name__, self.l, self.p)
+        return "{0}(length_scale={1:.3g}, periodicity={2:.3g})".format(
+            self.__class__.__name__, self.length_scale, self.periodicity)
 
 
 class DotProduct(Kernel):
diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py
index 83ceb6ce8199d..31131ed7800de 100644
--- a/sklearn/gaussian_process/tests/test_gpc.py
+++ b/sklearn/gaussian_process/tests/test_gpc.py
@@ -26,8 +26,10 @@ def f(x):
 y_mc[fX > 0.35] = 2
 
 
-kernels = [RBF(l=0.1), RBF(l=1.0, l_bounds=(1e-3, 1e3)),
-           C(1.0, (1e-2, 1e2)) * RBF(l=1.0, l_bounds=(1e-3, 1e3))]
+kernels = [RBF(length_scale=0.1),
+           RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)),
+           C(1.0, (1e-2, 1e2))
+           * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3))]
 
 
 def test_predict_consistent():
@@ -87,7 +89,8 @@ def test_random_starts():
     y = (np.sin(X).sum(axis=1) + np.sin(3 * X).sum(axis=1)) > 0
 
     kernel = C(1.0, (1e-2, 1e2)) \
-        * RBF(l=[1e-3] * n_features, l_bounds=[(1e-4, 1e+2)] * n_features)
+        * RBF(length_scale=[1e-3] * n_features,
+              length_scale_bounds=[(1e-4, 1e+2)] * n_features)
     last_lml = -np.inf
     for n_restarts_optimizer in range(9):
         gp = GaussianProcessClassifier(
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index a7c9049b89407..8adf3ffca9b16 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -23,11 +23,15 @@ def f(x):
 y = f(X).ravel()
 
 
-kernels = [RBF(l=1.0), RBF(l=1.0, l_bounds=(1e-3, 1e3)),
-           C(1.0, (1e-2, 1e2)) * RBF(l=1.0, l_bounds=(1e-3, 1e3)),
-           C(1.0, (1e-2, 1e2)) * RBF(l=1.0, l_bounds=(1e-3, 1e3))
+kernels = [RBF(length_scale=1.0),
+           RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)),
+           C(1.0, (1e-2, 1e2))
+           * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)),
+           C(1.0, (1e-2, 1e2))
+           * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3))
            + C(1e-5, (1e-5, 1e2)),
-           C(0.1, (1e-2, 1e2)) * RBF(l=1.0, l_bounds=(1e-3, 1e3))
+           C(0.1, (1e-2, 1e2))
+           * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3))
            + C(1e-5, (1e-5, 1e2))]
 
 
@@ -165,8 +169,9 @@ def test_random_starts():
         + rng.normal(scale=0.1, size=n_samples)
 
     kernel = C(1.0, (1e-2, 1e2)) \
-        * RBF(l=[1.0] * n_features, l_bounds=[(1e-4, 1e+2)] * n_features) \
-        + WhiteKernel(c=1e-5, c_bounds=(1e-5, 1e1))
+        * RBF(length_scale=[1.0] * n_features,
+              length_scale_bounds=[(1e-4, 1e+2)] * n_features) \
+        + WhiteKernel(noise_level=1e-5, noise_level_bounds=(1e-5, 1e1))
     last_lml = -np.inf
     for n_restarts_optimizer in range(9):
         gp = GaussianProcessRegressor(
@@ -212,7 +217,7 @@ def test_y_multioutput():
 
     # Test for fixed kernel that first dimension of 2d GP equals the output
     # of 1d GP and that second dimension is twice as large
-    kernel = RBF(l=1.0)
+    kernel = RBF(length_scale=1.0)
 
     gpr = GaussianProcessRegressor(kernel=kernel, optimizer=None,
                                    normalize_y=False)
diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py
index 59f7a0b5ab7f6..260fba97fedec 100644
--- a/sklearn/gaussian_process/tests/test_kernels.py
+++ b/sklearn/gaussian_process/tests/test_kernels.py
@@ -26,21 +26,21 @@
 X = np.random.RandomState(0).normal(0, 1, (10, 2))
 Y = np.random.RandomState(0).normal(0, 1, (11, 2))
 
-kernel_white = RBF(l=2.0) + WhiteKernel(c=3.0)
-kernels = [RBF(l=2.0), RBF(l_bounds=(0.5, 2.0)),
-           ConstantKernel(c=10.0),
-           2.0 * RBF(l=0.33, l_bounds="fixed"),
-           2.0 * RBF(l=0.5), kernel_white,
-           2.0 * RBF(l=[0.5, 2.0]),
-           2.0 * Matern(l=0.33, l_bounds="fixed"),
-           2.0 * Matern(l=0.5, nu=0.5),
-           2.0 * Matern(l=1.5, nu=1.5),
-           2.0 * Matern(l=2.5, nu=2.5),
-           2.0 * Matern(l=[0.5, 2.0], nu=0.5),
-           3.0 * Matern(l=[2.0, 0.5], nu=1.5),
-           4.0 * Matern(l=[0.5, 0.5], nu=2.5),
-           RationalQuadratic(l=0.5, alpha=1.5),
-           ExpSineSquared(l=0.5, p=1.5),
+kernel_white = RBF(length_scale=2.0) + WhiteKernel(noise_level=3.0)
+kernels = [RBF(length_scale=2.0), RBF(length_scale_bounds=(0.5, 2.0)),
+           ConstantKernel(constant_value=10.0),
+           2.0 * RBF(length_scale=0.33, length_scale_bounds="fixed"),
+           2.0 * RBF(length_scale=0.5), kernel_white,
+           2.0 * RBF(length_scale=[0.5, 2.0]),
+           2.0 * Matern(length_scale=0.33, length_scale_bounds="fixed"),
+           2.0 * Matern(length_scale=0.5, nu=0.5),
+           2.0 * Matern(length_scale=1.5, nu=1.5),
+           2.0 * Matern(length_scale=2.5, nu=2.5),
+           2.0 * Matern(length_scale=[0.5, 2.0], nu=0.5),
+           3.0 * Matern(length_scale=[2.0, 0.5], nu=1.5),
+           4.0 * Matern(length_scale=[0.5, 0.5], nu=2.5),
+           RationalQuadratic(length_scale=0.5, alpha=1.5),
+           ExpSineSquared(length_scale=0.5, periodicity=1.5),
            DotProduct(sigma_0=2.0), DotProduct(sigma_0=2.0) ** 2]
 for metric in PAIRWISE_KERNEL_FUNCTIONS:
     if metric in ["additive_chi2", "chi2"]:
@@ -174,7 +174,7 @@ def test_kernel_anisotropic():
     # Check getting and setting via theta
     kernel.theta = kernel.theta + np.log(2)
     assert_array_equal(kernel.theta, np.log([6.0, 1.0, 4.0]))
-    assert_array_equal(kernel.k2.l, [1.0, 4.0])
+    assert_array_equal(kernel.k2.length_scale, [1.0, 4.0])
 
 
 def test_kernel_stationary():
@@ -220,20 +220,20 @@ def test_kernel_clone():
 
 def test_matern_kernel():
     """ Test consistency of Matern kernel for special values of nu. """
-    K = Matern(nu=1.5, l=1.0)(X)
+    K = Matern(nu=1.5, length_scale=1.0)(X)
     # the diagonal elements of a matern kernel are 1
     assert_array_almost_equal(np.diag(K), np.ones(X.shape[0]))
     # matern kernel for coef0==0.5 is equal to absolute exponential kernel
     K_absexp = np.exp(-euclidean_distances(X, X, squared=False))
-    K = Matern(nu=0.5, l=1.0)(X)
+    K = Matern(nu=0.5, length_scale=1.0)(X)
     assert_array_almost_equal(K, K_absexp)
     # test that special cases of matern kernel (coef0 in [0.5, 1.5, 2.5])
     # result in nearly identical results as the general case for coef0 in
     # [0.5 + tiny, 1.5 + tiny, 2.5 + tiny]
     tiny = 1e-10
     for nu in [0.5, 1.5, 2.5]:
-        K1 = Matern(nu=nu, l=1.0)(X)
-        K2 = Matern(nu=nu + tiny, l=1.0)(X)
+        K1 = Matern(nu=nu, length_scale=1.0)(X)
+        K2 = Matern(nu=nu + tiny, length_scale=1.0)(X)
         assert_array_almost_equal(K1, K2)
 
 

From 5778a0a21993442704e7122a695a2f944dda05e1 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Mon, 31 Aug 2015 17:27:54 +0200
Subject: [PATCH 160/165] MISC Changing default for copy_X_train to True in
 GaussianProcessClassifier

---
 sklearn/gaussian_process/gpc.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index d52c6a3a80457..94e472de45c96 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -489,7 +489,7 @@ def optimizer(obj_func, initial_theta, bounds):
         up convergence when _posterior_mode is called several times on similar
         problems as in hyperparameter optimization.
 
-    copy_X_train : bool, optional (default: False)
+    copy_X_train : bool, optional (default: True)
         If True, a persistent copy of the training data is stored in the
         object. Otherwise, just a reference to the training data is stored,
         which might cause predictions to change if the data is modified
@@ -534,7 +534,7 @@ def optimizer(obj_func, initial_theta, bounds):
     """
     def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b",
                  n_restarts_optimizer=0, max_iter_predict=100,
-                 warm_start=False, copy_X_train=False, random_state=None,
+                 warm_start=False, copy_X_train=True, random_state=None,
                  multi_class="one_vs_rest", n_jobs=1):
         self.kernel = kernel
         self.optimizer = optimizer

From 42aa29e189979fd02731f4452049008baac7ef4c Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Mon, 7 Sep 2015 12:15:57 +0200
Subject: [PATCH 161/165] FIX String comparison via equality and not identity
 in Hyperparameter.__init__

---
 sklearn/gaussian_process/kernels.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index fb764828483fc..52e5bf6b18ed2 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -77,7 +77,7 @@ class Hyperparameter(namedtuple('Hyperparameter',
     __slots__ = ()
 
     def __new__(cls, name, value_type, bounds, n_elements=1, fixed=None):
-        if bounds is not "fixed":
+        if bounds != "fixed":
             bounds = np.atleast_2d(bounds)
             if n_elements > 1:  # vector-valued parameter
                 if bounds.shape[0] == 1:
@@ -88,7 +88,7 @@ def __new__(cls, name, value_type, bounds, n_elements=1, fixed=None):
                                      % (name, n_elements, bounds.shape[0]))
 
         if fixed is None:
-            fixed = bounds is "fixed"
+            fixed = (bounds == "fixed")
         return super(Hyperparameter, cls).__new__(
             cls, name, value_type, bounds, n_elements, fixed)
 

From 0942e1f26a29723a3592f4348af63914c216aa65 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Mon, 7 Sep 2015 12:16:57 +0200
Subject: [PATCH 162/165] ENH GPR.log_marginal_likelihood() returns the current
 log-likelihood if no theta vector is provided

---
 sklearn/gaussian_process/gpc.py            | 52 +++++++++++++++++-----
 sklearn/gaussian_process/gpr.py            | 26 ++++++++---
 sklearn/gaussian_process/tests/test_gpc.py | 16 ++++++-
 sklearn/gaussian_process/tests/test_gpr.py | 16 ++++++-
 4 files changed, 88 insertions(+), 22 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 94e472de45c96..e74f415b2d5d4 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -135,6 +135,9 @@ def optimizer(obj_func, initial_theta, bounds):
         Square root of W, the Hessian of log-likelihood of the latent function
         values for the observed labels. Since W is diagonal, only the diagonal
         of sqrt(W) is stored.
+
+    log_marginal_likelihood_value_: float
+        The log-marginal-likelihood of self.kernel_.theta
     """
     def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b",
                  n_restarts_optimizer=0, max_iter_predict=100,
@@ -185,9 +188,7 @@ def fit(self, X, y):
             raise ValueError("{0:s} requires 2 classes.".format(
                 self.__class__.__name__))
 
-        if self.kernel_.n_dims == 0:  # no tunable hyperparameters
-            pass
-        elif self.optimizer is not None:
+        if self.optimizer is not None and self.kernel_.n_dims > 0:
             # Choose hyperparameters based on maximizing the log-marginal
             # likelihood (potentially starting from several initial values)
             def obj_func(theta, eval_gradient=True):
@@ -221,6 +222,10 @@ def obj_func(theta, eval_gradient=True):
             # likelihood
             lml_values = map(itemgetter(1), optima)
             self.kernel_.theta = optima[np.argmin(lml_values)][0]
+            self.log_marginal_likelihood_value_ = -np.min(lml_values)
+        else:
+            self.log_marginal_likelihood_value_ = \
+                self.log_marginal_likelihood(self.kernel_.theta)
 
         # Precompute quantities required for predictions which are independent
         # of actual query points
@@ -292,19 +297,20 @@ def predict_proba(self, X):
 
         return np.vstack((1 - pi_star, pi_star)).T
 
-    def log_marginal_likelihood(self, theta, eval_gradient=False):
+    def log_marginal_likelihood(self, theta=None, eval_gradient=False):
         """Returns log-marginal likelihood of theta for training data.
 
         Parameters
         ----------
-        theta : array-like, shape = (n_kernel_params,)
+        theta : array-like, shape = (n_kernel_params,) or None
             Kernel hyperparameters for which the log-marginal likelihood is
-            evaluated
+            evaluated. If None, the precomputed log_marginal_likelihood
+            of self.kernel_.theta is returned.
 
         eval_gradient : bool, default: False
             If True, the gradient of the log-marginal likelihood with respect
             to the kernel hyperparameters at position theta is returned
-            additionally.
+            additionally. If True, theta must not be None.
 
         Returns
         -------
@@ -316,6 +322,12 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
             hyperparameters at position theta.
             Only returned when eval_gradient is True.
         """
+        if theta is None:
+            if eval_gradient:
+                raise ValueError(
+                    "Gradient can only be evaluated for theta!=None")
+            return self.log_marginal_likelihood_value_
+
         kernel = self.kernel_.clone_with_theta(theta)
 
         if eval_gradient:
@@ -526,6 +538,9 @@ def optimizer(obj_func, initial_theta, bounds):
         classification, a CompoundKernel is returned which consists of the
         different kernels used in the one-versus-rest classifiers.
 
+    log_marginal_likelihood_value_ : float
+        The log-marginal-likelihood of self.kernel_.theta
+
     classes_ : array-like, shape = (n_classes,)
         Unique class labels.
 
@@ -589,6 +604,14 @@ def fit(self, X, y):
 
         self.base_estimator_.fit(X, y)
 
+        if self.n_classes_ > 2:
+            self.log_marginal_likelihood_value_ = np.mean(
+                [estimator.log_marginal_likelihood()
+                 for estimator in self.base_estimator_.estimators_])
+        else:
+            self.log_marginal_likelihood_value_ = \
+                self.base_estimator_.log_marginal_likelihood()
+
         return self
 
     def predict(self, X):
@@ -638,7 +661,7 @@ def kernel_(self):
                 [estimator.kernel_
                  for estimator in self.base_estimator_.estimators_])
 
-    def log_marginal_likelihood(self, theta, eval_gradient=False):
+    def log_marginal_likelihood(self, theta=None, eval_gradient=False):
         """Returns log-marginal likelihood of theta for training data.
 
         In the case of multi-class classification, the mean log-marginal
@@ -646,18 +669,19 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
 
         Parameters
         ----------
-        theta : array-like, shape = (n_kernel_params,)
+        theta : array-like, shape = (n_kernel_params,) or None
             Kernel hyperparameters for which the log-marginal likelihood is
             evaluated. In the case of multi-class classification, theta may
             be the  hyperparameters of the compound kernel or of an individual
             kernel. In the latter case, all individual kernel get assigned the
-            same theta values.
+            same theta values. If None, the precomputed log_marginal_likelihood
+            of self.kernel_.theta is returned.
 
         eval_gradient : bool, default: False
             If True, the gradient of the log-marginal likelihood with respect
             to the kernel hyperparameters at position theta is returned
             additionally. Note that gradient computation is not supported
-            for non-binary classification.
+            for non-binary classification. If True, theta must not be None.
 
         Returns
         -------
@@ -671,6 +695,12 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
         """
         check_is_fitted(self, ["classes_", "n_classes_"])
 
+        if theta is None:
+            if eval_gradient:
+                raise ValueError(
+                    "Gradient can only be evaluated for theta!=None")
+            return self.log_marginal_likelihood_value_
+
         theta = np.asarray(theta)
         if self.n_classes_ == 2:
             return self.base_estimator_.log_marginal_likelihood(
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index b53e12865ddfb..3307080c3ddd6 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -119,6 +119,9 @@ def optimizer(obj_func, initial_theta, bounds):
 
     alpha_: array-like, shape = (n_samples,)
         Dual coefficients of training data points in kernel space
+
+    log_marginal_likelihood_value_: float
+        The log-marginal-likelihood of self.kernel_.theta
     """
     def __init__(self, kernel=None, alpha=1e-10,
                  optimizer="fmin_l_bfgs_b", n_restarts_optimizer=0,
@@ -176,9 +179,7 @@ def fit(self, X, y):
         self.X_train_ = np.copy(X) if self.copy_X_train else X
         self.y_train_ = np.copy(y) if self.copy_X_train else y
 
-        if self.kernel_.n_dims == 0:  # no tunable hyperparameters
-            pass
-        elif self.optimizer is not None:
+        if self.optimizer is not None and self.kernel_.n_dims > 0:
             # Choose hyperparameters based on maximizing the log-marginal
             # likelihood (potentially starting from several initial values)
             def obj_func(theta, eval_gradient=True):
@@ -212,6 +213,10 @@ def obj_func(theta, eval_gradient=True):
             # likelihood
             lml_values = map(itemgetter(1), optima)
             self.kernel_.theta = optima[np.argmin(lml_values)][0]
+            self.log_marginal_likelihood_value_ = -np.min(lml_values)
+        else:
+            self.log_marginal_likelihood_value_ = \
+                self.log_marginal_likelihood(self.kernel_.theta)
 
         # Precompute quantities required for predictions which are independent
         # of actual query points
@@ -334,19 +339,20 @@ def sample_y(self, X, n_samples=1, random_state=0):
             y_samples = np.hstack(y_samples)
         return y_samples
 
-    def log_marginal_likelihood(self, theta, eval_gradient=False):
+    def log_marginal_likelihood(self, theta=None, eval_gradient=False):
         """Returns log-marginal likelihood of theta for training data.
 
         Parameters
         ----------
-        theta : array-like, shape = (n_kernel_params,)
+        theta : array-like, shape = (n_kernel_params,) or None
             Kernel hyperparameters for which the log-marginal likelihood is
-            evaluated
+            evaluated. If None, the precomputed log_marginal_likelihood
+            of self.kernel_.theta is returned.
 
         eval_gradient : bool, default: False
             If True, the gradient of the log-marginal likelihood with respect
             to the kernel hyperparameters at position theta is returned
-            additionally.
+            additionally. If True, theta must not be None.
 
         Returns
         -------
@@ -358,6 +364,12 @@ def log_marginal_likelihood(self, theta, eval_gradient=False):
             hyperparameters at position theta.
             Only returned when eval_gradient is True.
         """
+        if theta is None:
+            if eval_gradient:
+                raise ValueError(
+                    "Gradient can only be evaluated for theta!=None")
+            return self.log_marginal_likelihood_value_
+
         kernel = self.kernel_.clone_with_theta(theta)
 
         if eval_gradient:
diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py
index 31131ed7800de..424b9aeffe8eb 100644
--- a/sklearn/gaussian_process/tests/test_gpc.py
+++ b/sklearn/gaussian_process/tests/test_gpc.py
@@ -10,7 +10,7 @@
 from sklearn.gaussian_process import GaussianProcessClassifier
 from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
 
-from sklearn.utils.testing import (assert_true, assert_greater,
+from sklearn.utils.testing import (assert_true, assert_greater, assert_equal,
                                    assert_almost_equal, assert_array_equal)
 
 
@@ -26,7 +26,8 @@ def f(x):
 y_mc[fX > 0.35] = 2
 
 
-kernels = [RBF(length_scale=0.1),
+fixed_kernel = RBF(length_scale=1.0, length_scale_bounds="fixed")
+kernels = [RBF(length_scale=0.1), fixed_kernel,
            RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)),
            C(1.0, (1e-2, 1e2))
            * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3))]
@@ -44,14 +45,24 @@ def test_predict_consistent():
 def test_lml_improving():
     """ Test that hyperparameter-tuning improves log-marginal likelihood. """
     for kernel in kernels:
+        if kernel == fixed_kernel: continue
         gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
         assert_greater(gpc.log_marginal_likelihood(gpc.kernel_.theta),
                        gpc.log_marginal_likelihood(kernel.theta))
 
 
+def test_lml_precomputed():
+    """ Test that lml of optimized kernel is stored correctly. """
+    for kernel in kernels:
+        gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
+        assert_equal(gpc.log_marginal_likelihood(gpc.kernel_.theta),
+                     gpc.log_marginal_likelihood())
+
+
 def test_converged_to_local_maximum():
     """ Test that we are in local maximum after hyperparameter-optimization."""
     for kernel in kernels:
+        if kernel == fixed_kernel: continue
         gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
 
         lml, lml_gradient = \
@@ -117,6 +128,7 @@ def optimizer(obj_func, initial_theta, bounds):
         return theta_opt, func_min
 
     for kernel in kernels:
+        if kernel == fixed_kernel: continue
         gpc = GaussianProcessClassifier(kernel=kernel, optimizer=optimizer)
         gpc.fit(X, y_mc)
         # Checks that optimizer improved marginal likelihood
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index 8adf3ffca9b16..218054dbbb52f 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -22,8 +22,8 @@ def f(x):
 X2 = np.atleast_2d([2., 4., 5.5, 6.5, 7.5]).T
 y = f(X).ravel()
 
-
-kernels = [RBF(length_scale=1.0),
+fixed_kernel = RBF(length_scale=1.0, length_scale_bounds="fixed")
+kernels = [RBF(length_scale=1.0), fixed_kernel,
            RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)),
            C(1.0, (1e-2, 1e2))
            * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)),
@@ -48,14 +48,24 @@ def test_gpr_interpolation():
 def test_lml_improving():
     """ Test that hyperparameter-tuning improves log-marginal likelihood. """
     for kernel in kernels:
+        if kernel == fixed_kernel: continue
         gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
         assert_greater(gpr.log_marginal_likelihood(gpr.kernel_.theta),
                        gpr.log_marginal_likelihood(kernel.theta))
 
 
+def test_lml_precomputed():
+    """ Test that lml of optimized kernel is stored correctly. """
+    for kernel in kernels:
+        gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
+        assert_equal(gpr.log_marginal_likelihood(gpr.kernel_.theta),
+                     gpr.log_marginal_likelihood())
+
+
 def test_converged_to_local_maximum():
     """ Test that we are in local maximum after hyperparameter-optimization."""
     for kernel in kernels:
+        if kernel == fixed_kernel: continue
         gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
 
         lml, lml_gradient = \
@@ -69,6 +79,7 @@ def test_converged_to_local_maximum():
 def test_solution_inside_bounds():
     """ Test that hyperparameter-optimization remains in bounds"""
     for kernel in kernels:
+        if kernel == fixed_kernel: continue
         gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
 
         bounds = gpr.kernel_.bounds
@@ -270,6 +281,7 @@ def optimizer(obj_func, initial_theta, bounds):
         return theta_opt, func_min
 
     for kernel in kernels:
+        if kernel == fixed_kernel: continue
         gpr = GaussianProcessRegressor(kernel=kernel, optimizer=optimizer)
         gpr.fit(X, y)
         # Checks that optimizer improved marginal likelihood

From 68b654d42ab33dec87cd931daf3abd30b585b024 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Mon, 7 Sep 2015 14:00:35 +0200
Subject: [PATCH 163/165] FIX Enforcing y to be numeric in GPR and fixing
 python3 issue with maps

---
 sklearn/gaussian_process/gpc.py |  2 +-
 sklearn/gaussian_process/gpr.py | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index e74f415b2d5d4..f9c00a98be203 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -220,7 +220,7 @@ def obj_func(theta, eval_gradient=True):
                                                        bounds))
             # Select result from run with minimal (negative) log-marginal
             # likelihood
-            lml_values = map(itemgetter(1), optima)
+            lml_values = list(map(itemgetter(1), optima))
             self.kernel_.theta = optima[np.argmin(lml_values)][0]
             self.log_marginal_likelihood_value_ = -np.min(lml_values)
         else:
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 3307080c3ddd6..85ff65b8a84b2 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -157,7 +157,7 @@ def fit(self, X, y):
 
         self.rng = check_random_state(self.random_state)
 
-        X, y = check_X_y(X, y, multi_output=True)
+        X, y = check_X_y(X, y, multi_output=True, y_numeric=True)
 
         # Normalize target value
         if self.normalize_y:
@@ -211,7 +211,7 @@ def obj_func(theta, eval_gradient=True):
                                                        bounds))
             # Select result from run with minimal (negative) log-marginal
             # likelihood
-            lml_values = map(itemgetter(1), optima)
+            lml_values = list(map(itemgetter(1), optima))
             self.kernel_.theta = optima[np.argmin(lml_values)][0]
             self.log_marginal_likelihood_value_ = -np.min(lml_values)
         else:
@@ -385,14 +385,14 @@ def log_marginal_likelihood(self, theta=None, eval_gradient=False):
                 if eval_gradient else -np.inf
 
         # Support multi-dimensional output of self.y_train_
-        y_fit = self.y_train_
-        if y_fit.ndim == 1:
-            y_fit = y_fit[:, np.newaxis]
+        y_train = self.y_train_
+        if y_train.ndim == 1:
+            y_train = y_train[:, np.newaxis]
 
-        alpha = cho_solve((L, True), y_fit)  # Line 3
+        alpha = cho_solve((L, True), y_train)  # Line 3
 
         # Compute log-likelihood (compare line 7)
-        log_likelihood_dims = -0.5 * np.einsum("ik,ik->k", y_fit, alpha)
+        log_likelihood_dims = -0.5 * np.einsum("ik,ik->k", y_train, alpha)
         log_likelihood_dims -= np.log(np.diag(L)).sum()
         log_likelihood_dims -= K.shape[0] / 2 * np.log(2 * np.pi)
         log_likelihood = log_likelihood_dims.sum(-1)  # sum over dimensions

From bd69c0b6575e83171f9aa420b4f2c909fc475680 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Mon, 7 Sep 2015 14:32:25 +0200
Subject: [PATCH 164/165] ADD Mixins for normalized and stationary kernels

---
 sklearn/gaussian_process/kernels.py | 60 ++++++++++++++++++++++++++---
 1 file changed, 55 insertions(+), 5 deletions(-)

diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index 52e5bf6b18ed2..d3583fe19f944 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -295,6 +295,33 @@ def __repr__(self):
     def __call__(self, X, Y=None, eval_gradient=False):
         """Evaluate the kernel."""
 
+    @abstractmethod
+    def diag(self, X):
+        """Returns the diagonal of the kernel k(X, X).
+
+        The result of this method is identical to np.diag(self(X)); however,
+        it can be evaluated more efficiently since only the diagonal is
+        evaluated.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples_X, n_features)
+            Left argument of the returned kernel k(X, Y)
+
+        Returns
+        -------
+        K_diag : array, shape (n_samples_X,)
+            Diagonal of kernel k(X, X)
+        """
+
+    @abstractmethod
+    def is_stationary(self):
+        """Returns whether the kernel is stationary. """
+
+
+class NormalizedKernelMixin(object):
+    """Mixin for kernels which are normalized: k(X, X)=1."""
+
     def diag(self, X):
         """Returns the diagonal of the kernel k(X, X).
 
@@ -314,6 +341,10 @@ def diag(self, X):
         """
         return np.ones(X.shape[0])
 
+
+class StationaryKernelMixin(object):
+    """Mixin for kernels which are stationary: k(X, Y)= f(X-Y)."""
+
     def is_stationary(self):
         """Returns whether the kernel is stationary. """
         return True
@@ -432,6 +463,25 @@ def is_stationary(self):
         """Returns whether the kernel is stationary. """
         return np.all([kernel.is_stationary() for kernel in self.kernels])
 
+    def diag(self, X):
+        """Returns the diagonal of the kernel k(X, X).
+
+        The result of this method is identical to np.diag(self(X)); however,
+        it can be evaluated more efficiently since only the diagonal is
+        evaluated.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples_X, n_features)
+            Left argument of the returned kernel k(X, Y)
+
+        Returns
+        -------
+        K_diag : array, shape (n_samples_X, n_kernels)
+            Diagonal of kernel k(X, X)
+        """
+        return np.vstack([kernel.diag(X) for kernel in self.kernels]).T
+
 
 class KernelOperator(Kernel):
     """Base class for all kernel operators. """
@@ -832,7 +882,7 @@ def is_stationary(self):
         return self.kernel.is_stationary()
 
 
-class ConstantKernel(Kernel):
+class ConstantKernel(StationaryKernelMixin, Kernel):
     """Constant kernel.
 
     Can be used as part of a product-kernel where it scales the magnitude of
@@ -922,7 +972,7 @@ def __repr__(self):
         return "{0:.3g}**2".format(np.sqrt(self.constant_value))
 
 
-class WhiteKernel(Kernel):
+class WhiteKernel(StationaryKernelMixin, Kernel):
     """White kernel.
 
     The main use-case of this kernel is as part of a sum-kernel where it
@@ -1013,7 +1063,7 @@ def __repr__(self):
                                                  self.noise_level)
 
 
-class RBF(Kernel):
+class RBF(StationaryKernelMixin, NormalizedKernelMixin, Kernel):
     """Radial-basis function kernel (aka squared-exponential kernel).
 
     The RBF kernel is a stationary kernel. It is also known as the
@@ -1285,7 +1335,7 @@ def __repr__(self):
                 self.__class__.__name__, self.length_scale, self.nu)
 
 
-class RationalQuadratic(Kernel):
+class RationalQuadratic(StationaryKernelMixin, NormalizedKernelMixin, Kernel):
     """Rational Quadratic kernel.
 
     The RationalQuadratic kernel can be seen as a scale mixture (an infinite
@@ -1390,7 +1440,7 @@ def __repr__(self):
             self.__class__.__name__, self.alpha, self.length_scale)
 
 
-class ExpSineSquared(Kernel):
+class ExpSineSquared(StationaryKernelMixin, NormalizedKernelMixin, Kernel):
     """Exp-Sine-Squared kernel.
 
     The ExpSineSquared kernel allows modeling periodic functions. It is

From 709d43c32626463ec54daa22b2577a8d721751d6 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
Date: Mon, 14 Sep 2015 10:39:31 +0200
Subject: [PATCH 165/165] TST test_lml_precomputed() checks only for equality
 in first 7 digits

---
 sklearn/gaussian_process/tests/test_gpc.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py
index 424b9aeffe8eb..3cc1a2ea429e3 100644
--- a/sklearn/gaussian_process/tests/test_gpc.py
+++ b/sklearn/gaussian_process/tests/test_gpc.py
@@ -55,8 +55,8 @@ def test_lml_precomputed():
     """ Test that lml of optimized kernel is stored correctly. """
     for kernel in kernels:
         gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
-        assert_equal(gpc.log_marginal_likelihood(gpc.kernel_.theta),
-                     gpc.log_marginal_likelihood())
+        assert_almost_equal(gpc.log_marginal_likelihood(gpc.kernel_.theta),
+                            gpc.log_marginal_likelihood(), 7)
 
 
 def test_converged_to_local_maximum():