From 8ad7472b3ca7affec1cbd867121625722efc2fb5 Mon Sep 17 00:00:00 2001 From: Jonas Seiler Date: Mon, 19 May 2014 14:36:51 +0200 Subject: [PATCH 1/6] Allow multiple input feature with the same value for gaussian processes --- sklearn/gaussian_process/gaussian_process.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/sklearn/gaussian_process/gaussian_process.py b/sklearn/gaussian_process/gaussian_process.py index 2c630ef20e747..3d08a61798e73 100644 --- a/sklearn/gaussian_process/gaussian_process.py +++ b/sklearn/gaussian_process/gaussian_process.py @@ -296,10 +296,6 @@ def fit(self, X, y): # Calculate matrix of distances D between samples D, ij = l1_cross_distances(X) - if (np.min(np.sum(D, axis=1)) == 0. - and self.corr != correlation.pure_nugget): - raise Exception("Multiple input features cannot have the same" - " value.") # Regression matrix and parameters F = self.regr(X) @@ -586,9 +582,6 @@ def reduced_likelihood_function(self, theta=None): if D is None: # Light storage mode (need to recompute D, ij and F) D, ij = l1_cross_distances(self.X) - if (np.min(np.sum(D, axis=1)) == 0. - and self.corr != correlation.pure_nugget): - raise Exception("Multiple X are not allowed") F = self.regr(self.X) # Set up R @@ -715,8 +708,8 @@ def minus_reduced_likelihood_function(log10t): # Generate a random starting point log10-uniformly # distributed between bounds log10theta0 = np.log10(self.thetaL) \ - + rand(self.theta0.size).reshape(self.theta0.shape) \ - * np.log10(self.thetaU / self.thetaL) + + rand(self.theta0.size).reshape(self.theta0.shape) \ + * np.log10(self.thetaU / self.thetaL) theta0 = 10. ** log10theta0 # Run Cobyla @@ -788,7 +781,7 @@ def corr_cut(t, d): return corr(array2d(np.hstack([optimal_theta[0][0:i], t[0], optimal_theta[0][(i + 1)::]] - )), d) + )), d) self.corr = corr_cut optimal_theta[0, i], optimal_rlf_value, optimal_par = \ @@ -873,7 +866,7 @@ def _check_params(self, n_samples=None): if np.any(self.nugget) < 0.: raise ValueError("nugget must be positive or zero.") if (n_samples is not None - and self.nugget.shape not in [(), (n_samples,)]): + and self.nugget.shape not in [(), (n_samples,)]): raise ValueError("nugget must be either a scalar " "or array of length n_samples.") From 5d6bd5ad5c2c76b2478fca91197ec2baf332432a Mon Sep 17 00:00:00 2001 From: Jonas Seiler Date: Wed, 21 May 2014 14:00:11 +0200 Subject: [PATCH 2/6] do not allow multiple feature for non-noisy data. Unittest for noisy data #3162 --- sklearn/gaussian_process/gaussian_process.py | 17 ++- .../tests/test_gaussian_process.py | 139 +++++++++++++++--- 2 files changed, 131 insertions(+), 25 deletions(-) diff --git a/sklearn/gaussian_process/gaussian_process.py b/sklearn/gaussian_process/gaussian_process.py index 3d08a61798e73..e63998cc712d2 100644 --- a/sklearn/gaussian_process/gaussian_process.py +++ b/sklearn/gaussian_process/gaussian_process.py @@ -16,6 +16,7 @@ from . import correlation_models as correlation MACHINE_EPSILON = np.finfo(np.double).eps +DEFAULT_NUGGET = 10. * MACHINE_EPSILON def l1_cross_distances(X): @@ -221,7 +222,7 @@ def __init__(self, regr='constant', corr='squared_exponential', beta0=None, storage_mode='full', verbose=False, theta0=1e-1, thetaL=None, thetaU=None, optimizer='fmin_cobyla', random_start=1, normalize=True, - nugget=10. * MACHINE_EPSILON, random_state=None): + nugget=DEFAULT_NUGGET, random_state=None): self.regr = regr self.corr = corr @@ -296,6 +297,9 @@ def fit(self, X, y): # Calculate matrix of distances D between samples D, ij = l1_cross_distances(X) + if self._prohibitMultipleInputFeature(D): + raise Exception("Multiple input features cannot have the same" + " value for non-noisy data.") # Regression matrix and parameters F = self.regr(X) @@ -521,6 +525,15 @@ def predict(self, X, eval_MSE=False, batch_size=None): return y + def _prohibitMultipleInputFeature(self, D): + """ + Multiple input features are only allowed for noisy data + """ + return self.nugget <= DEFAULT_NUGGET and \ + np.min(np.sum(D, axis=1)) == 0. and \ + self.corr != correlation.pure_nugget + + def reduced_likelihood_function(self, theta=None): """ This function determines the BLUP parameters and evaluates the reduced @@ -582,6 +595,8 @@ def reduced_likelihood_function(self, theta=None): if D is None: # Light storage mode (need to recompute D, ij and F) D, ij = l1_cross_distances(self.X) + if self._prohibitMultipleInputFeature(D): + raise Exception("Multiple X only make sense for noisy data") F = self.regr(self.X) # Set up R diff --git a/sklearn/gaussian_process/tests/test_gaussian_process.py b/sklearn/gaussian_process/tests/test_gaussian_process.py index 61d617690edb9..efc3f1511bc1d 100644 --- a/sklearn/gaussian_process/tests/test_gaussian_process.py +++ b/sklearn/gaussian_process/tests/test_gaussian_process.py @@ -9,6 +9,9 @@ from nose.tools import assert_true import numpy as np +import scipy.stats as ss + +import pylab as pl from sklearn.gaussian_process import GaussianProcess from sklearn.gaussian_process import regression_models as regression @@ -39,7 +42,7 @@ def test_1d(regr=regression.constant, corr=correlation.squared_exponential, and np.allclose(MSE2, 0., atol=10)) -def test_2d(regr=regression.constant, corr=correlation.squared_exponential, +def check2d(X, regr=regression.constant, corr=correlation.squared_exponential, random_start=10, beta0=None): """ MLE estimation of a two-dimensional Gaussian Process model accounting for @@ -49,14 +52,7 @@ def test_2d(regr=regression.constant, corr=correlation.squared_exponential, """ b, kappa, e = 5., .5, .1 g = lambda x: b - x[:, 1] - kappa * (x[:, 0] - e) ** 2. - X = np.array([[-4.61611719, -6.00099547], - [4.10469096, 5.32782448], - [0.00000000, -0.50000000], - [-6.17289014, -4.6984743], - [1.3109306, -6.93271427], - [-5.03823144, 3.10584743], - [-2.87600388, 6.74310541], - [5.21301203, 4.26386883]]) + y = g(X).ravel() gp = GaussianProcess(regr=regr, corr=corr, beta0=beta0, theta0=[1e-2] * 2, thetaL=[1e-4] * 2, @@ -68,7 +64,7 @@ def test_2d(regr=regression.constant, corr=correlation.squared_exponential, assert_true(np.allclose(y_pred, y) and np.allclose(MSE, 0.)) -def test_2d_2d(regr=regression.constant, corr=correlation.squared_exponential, +def check2d_2d(X, regr=regression.constant, corr=correlation.squared_exponential, random_start=10, beta0=None): """ MLE estimation of a two-dimensional Gaussian Process model accounting for @@ -79,14 +75,7 @@ def test_2d_2d(regr=regression.constant, corr=correlation.squared_exponential, b, kappa, e = 5., .5, .1 g = lambda x: b - x[:, 1] - kappa * (x[:, 0] - e) ** 2. f = lambda x: np.vstack((g(x), g(x))).T - X = np.array([[-4.61611719, -6.00099547], - [4.10469096, 5.32782448], - [0.00000000, -0.50000000], - [-6.17289014, -4.6984743], - [1.3109306, -6.93271427], - [-5.03823144, 3.10584743], - [-2.87600388, 6.74310541], - [5.21301203, 4.26386883]]) + y = f(X) gp = GaussianProcess(regr=regr, corr=corr, beta0=beta0, theta0=[1e-2] * 2, thetaL=[1e-4] * 2, @@ -112,10 +101,19 @@ def test_more_builtin_correlation_models(random_start=1): all_corr = ['absolute_exponential', 'squared_exponential', 'cubic', 'linear'] + X = np.array([[-4.61611719, -6.00099547], + [4.10469096, 5.32782448], + [0.00000000, -0.50000000], + [-6.17289014, -4.6984743], + [1.3109306, -6.93271427], + [-5.03823144, 3.10584743], + [-2.87600388, 6.74310541], + [5.21301203, 4.26386883]]) + for corr in all_corr: test_1d(regr='constant', corr=corr, random_start=random_start) - test_2d(regr='constant', corr=corr, random_start=random_start) - test_2d_2d(regr='constant', corr=corr, random_start=random_start) + check2d(X, regr='constant', corr=corr, random_start=random_start) + check2d_2d(X, regr='constant', corr=corr, random_start=random_start) def test_ordinary_kriging(): @@ -123,15 +121,108 @@ def test_ordinary_kriging(): Repeat test_1d and test_2d with given regression weights (beta0) for different regression models (Ordinary Kriging). """ + + X = np.array([[-4.61611719, -6.00099547], + [4.10469096, 5.32782448], + [0.00000000, -0.50000000], + [-6.17289014, -4.6984743], + [1.3109306, -6.93271427], + [-5.03823144, 3.10584743], + [-2.87600388, 6.74310541], + [5.21301203, 4.26386883]]) + test_1d(regr='linear', beta0=[0., 0.5]) test_1d(regr='quadratic', beta0=[0., 0.5, 0.5]) - test_2d(regr='linear', beta0=[0., 0.5, 0.5]) - test_2d(regr='quadratic', beta0=[0., 0.5, 0.5, 0.5, 0.5, 0.5]) - test_2d_2d(regr='linear', beta0=[0., 0.5, 0.5]) - test_2d_2d(regr='quadratic', beta0=[0., 0.5, 0.5, 0.5, 0.5, 0.5]) + check2d(X, regr='linear', beta0=[0., 0.5, 0.5]) + check2d(X, regr='quadratic', beta0=[0., 0.5, 0.5, 0.5, 0.5, 0.5]) + check2d_2d(X, regr='linear', beta0=[0., 0.5, 0.5]) + check2d_2d(X, regr='quadratic', beta0=[0., 0.5, 0.5, 0.5, 0.5, 0.5]) def test_no_normalize(): gp = GaussianProcess(normalize=False).fit(X, y) y_pred = gp.predict(X) assert_true(np.allclose(y_pred, y)) + + +@raises(Exception) +def test_no_multiple_feature(): + ''' + check that multiple features are not allowed for non-noisy data + ''' + X = np.array([[-4.61611719, -6.00099547], + [4.10469096, 5.32782448], + [0.00000000, -0.50000000], + [-6.17289014, -4.6984743], + [-6.17289014, -4.6984743], + [1.3109306, -6.93271427], + [-5.03823144, 3.10584743], + [-2.87600388, 6.74310541], + [5.21301203, 4.26386883]]) + + check2d(X) + + +def check2dNoisy(X, regr=regression.constant, corr=correlation.squared_exponential, + random_start=10, beta0=None): + """ + MLE estimation of a two-dimensional Gaussian Process model accounting for + anisotropy. Check random start optimization. + + Test the interpolating property. + """ + b, kappa, e = 5., .5, .1 + g = lambda x: b - x[:, 1] - kappa * (x[:, 0] - e) ** 2. + + y = g(X).ravel() + np.random.normal(0, 0.1, X.shape[0]) + gp = GaussianProcess(regr=regr, corr=corr, beta0=beta0, + theta0=[1e-2] * 2, thetaL=[1e-4] * 2, + thetaU=[1.] * 2, + random_start=random_start, verbose=False, nugget=0.1) + gp.fit(X, y) + y_pred, MSE = gp.predict(X, eval_MSE=True) + + fig = pl.figure() + pl.plot(X, f(X), 'r:', label=u'$f(x) = x\,\sin(x)$') + pl.plot(X, y, 'r.', markersize=10, label=u'Observations') + pl.plot(X, y_pred, 'b-', label=u'Prediction') + pl.fill(np.concatenate([X, X[::-1]]), + np.concatenate([y_pred - 1.9600 * MSE, + (y_pred + 1.9600 * MSE)[::-1]]), + alpha=.5, fc='b', ec='None', label='95% confidence interval') + pl.xlabel('$x$') + pl.ylabel('$f(x)$') + pl.ylim(-10, 10) + pl.legend(loc='upper left') + pl.show() + + print((np.abs(y_pred - y) <= ss.norm.ppf(0.999, y_pred, np.sqrt(MSE)))) + + assert_true((np.abs(y_pred - y) <= ss.norm.ppf(0.999, y_pred, np.sqrt( + MSE))).all()) #check that difference between prediction and truth is smaller than 99.9% conf int. + + +def test_1d_noisy(regr=regression.constant, corr=correlation.absolute_exponential, + random_start=10, beta0=None): + """ + MLE estimation of a one-dimensional Gaussian Process model. + Check random start optimization with noisy / duplicate inputs. + + Test the interpolating property. + """ + + X = np.atleast_2d([1., 3., 5., 6., 7., 8., 9., 10.] * 2).T + x = np.atleast_2d(np.linspace(0, 10, 50)).T + + y = f(X).ravel() + np.random.normal(0, 0.1, len(X)) + + gp = GaussianProcess(regr=regr, corr=corr, beta0=beta0, + theta0=1e-2, thetaL=1e-4, thetaU=1e-1, + random_start=random_start, verbose=False, nugget=0.01).fit(X, y) + y_pred, MSE = gp.predict(x, eval_MSE=True) + + y = f(x).ravel() + assert_true((np.abs(y_pred - y) <= np.abs( + ss.norm.ppf(0.025, y_pred, np.sqrt(MSE))) ).all()) #check that true value is within 95% conf. int. + + From d183e210adafa34f4dc48647fb512bcce336e96b Mon Sep 17 00:00:00 2001 From: Jonas Seiler Date: Wed, 21 May 2014 14:05:23 +0200 Subject: [PATCH 3/6] remove unused code --- .../tests/test_gaussian_process.py | 39 ------------------- 1 file changed, 39 deletions(-) diff --git a/sklearn/gaussian_process/tests/test_gaussian_process.py b/sklearn/gaussian_process/tests/test_gaussian_process.py index efc3f1511bc1d..86f7ea438cd2d 100644 --- a/sklearn/gaussian_process/tests/test_gaussian_process.py +++ b/sklearn/gaussian_process/tests/test_gaussian_process.py @@ -163,45 +163,6 @@ def test_no_multiple_feature(): check2d(X) -def check2dNoisy(X, regr=regression.constant, corr=correlation.squared_exponential, - random_start=10, beta0=None): - """ - MLE estimation of a two-dimensional Gaussian Process model accounting for - anisotropy. Check random start optimization. - - Test the interpolating property. - """ - b, kappa, e = 5., .5, .1 - g = lambda x: b - x[:, 1] - kappa * (x[:, 0] - e) ** 2. - - y = g(X).ravel() + np.random.normal(0, 0.1, X.shape[0]) - gp = GaussianProcess(regr=regr, corr=corr, beta0=beta0, - theta0=[1e-2] * 2, thetaL=[1e-4] * 2, - thetaU=[1.] * 2, - random_start=random_start, verbose=False, nugget=0.1) - gp.fit(X, y) - y_pred, MSE = gp.predict(X, eval_MSE=True) - - fig = pl.figure() - pl.plot(X, f(X), 'r:', label=u'$f(x) = x\,\sin(x)$') - pl.plot(X, y, 'r.', markersize=10, label=u'Observations') - pl.plot(X, y_pred, 'b-', label=u'Prediction') - pl.fill(np.concatenate([X, X[::-1]]), - np.concatenate([y_pred - 1.9600 * MSE, - (y_pred + 1.9600 * MSE)[::-1]]), - alpha=.5, fc='b', ec='None', label='95% confidence interval') - pl.xlabel('$x$') - pl.ylabel('$f(x)$') - pl.ylim(-10, 10) - pl.legend(loc='upper left') - pl.show() - - print((np.abs(y_pred - y) <= ss.norm.ppf(0.999, y_pred, np.sqrt(MSE)))) - - assert_true((np.abs(y_pred - y) <= ss.norm.ppf(0.999, y_pred, np.sqrt( - MSE))).all()) #check that difference between prediction and truth is smaller than 99.9% conf int. - - def test_1d_noisy(regr=regression.constant, corr=correlation.absolute_exponential, random_start=10, beta0=None): """ From 3240ae1dce0116d2f9f43bd6968f23bb389647b1 Mon Sep 17 00:00:00 2001 From: Jonas Seiler Date: Thu, 22 May 2014 09:52:37 +0200 Subject: [PATCH 4/6] remvoe unused import --- .../gaussian_process/tests/test_gaussian_process.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/sklearn/gaussian_process/tests/test_gaussian_process.py b/sklearn/gaussian_process/tests/test_gaussian_process.py index 86f7ea438cd2d..c81e8df70932c 100644 --- a/sklearn/gaussian_process/tests/test_gaussian_process.py +++ b/sklearn/gaussian_process/tests/test_gaussian_process.py @@ -4,6 +4,7 @@ # Author: Vincent Dubourg # Licence: BSD 3 clause +from macpath import norm_error from nose.tools import raises from nose.tools import assert_true @@ -11,8 +12,6 @@ import numpy as np import scipy.stats as ss -import pylab as pl - from sklearn.gaussian_process import GaussianProcess from sklearn.gaussian_process import regression_models as regression from sklearn.gaussian_process import correlation_models as correlation @@ -65,7 +64,7 @@ def check2d(X, regr=regression.constant, corr=correlation.squared_exponential, def check2d_2d(X, regr=regression.constant, corr=correlation.squared_exponential, - random_start=10, beta0=None): + random_start=10, beta0=None, theta0=[1e-2] * 2, thetaL=[1e-4] * 2, thetaU=[1e-1] * 2): """ MLE estimation of a two-dimensional Gaussian Process model accounting for anisotropy. Check random start optimization. @@ -78,8 +77,8 @@ def check2d_2d(X, regr=regression.constant, corr=correlation.squared_exponential y = f(X) gp = GaussianProcess(regr=regr, corr=corr, beta0=beta0, - theta0=[1e-2] * 2, thetaL=[1e-4] * 2, - thetaU=[1e-1] * 2, + theta0=theta0, thetaL=thetaL, + thetaU=thetaU, random_start=random_start, verbose=False) gp.fit(X, y) y_pred, MSE = gp.predict(X, eval_MSE=True) @@ -185,5 +184,3 @@ def test_1d_noisy(regr=regression.constant, corr=correlation.absolute_exponentia y = f(x).ravel() assert_true((np.abs(y_pred - y) <= np.abs( ss.norm.ppf(0.025, y_pred, np.sqrt(MSE))) ).all()) #check that true value is within 95% conf. int. - - From 3fb6e7366b5363dc5e6c6f7e2e175281645f6049 Mon Sep 17 00:00:00 2001 From: Jonas Seiler Date: Thu, 22 May 2014 10:36:17 +0200 Subject: [PATCH 5/6] simpler assertion in gp test --- sklearn/gaussian_process/tests/test_gaussian_process.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/gaussian_process/tests/test_gaussian_process.py b/sklearn/gaussian_process/tests/test_gaussian_process.py index c81e8df70932c..b5460718192d7 100644 --- a/sklearn/gaussian_process/tests/test_gaussian_process.py +++ b/sklearn/gaussian_process/tests/test_gaussian_process.py @@ -4,8 +4,6 @@ # Author: Vincent Dubourg # Licence: BSD 3 clause -from macpath import norm_error - from nose.tools import raises from nose.tools import assert_true @@ -182,5 +180,5 @@ def test_1d_noisy(regr=regression.constant, corr=correlation.absolute_exponentia y_pred, MSE = gp.predict(x, eval_MSE=True) y = f(x).ravel() - assert_true((np.abs(y_pred - y) <= np.abs( - ss.norm.ppf(0.025, y_pred, np.sqrt(MSE))) ).all()) #check that true value is within 95% conf. int. + assert_true((np.abs(y_pred - y) <= (1.96 * MSE) ).all()) #check that true value is within 95% conf. int. + From b3e93744f4fa8d261b3fbd930401cc9ae212e903 Mon Sep 17 00:00:00 2001 From: Jonas Seiler Date: Fri, 20 Jun 2014 13:32:21 +0200 Subject: [PATCH 6/6] small cosmetic fixes --- sklearn/gaussian_process/gaussian_process.py | 16 +++++++--------- .../tests/test_gaussian_process.py | 7 +++---- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/sklearn/gaussian_process/gaussian_process.py b/sklearn/gaussian_process/gaussian_process.py index e63998cc712d2..25acc81df27b2 100644 --- a/sklearn/gaussian_process/gaussian_process.py +++ b/sklearn/gaussian_process/gaussian_process.py @@ -297,9 +297,8 @@ def fit(self, X, y): # Calculate matrix of distances D between samples D, ij = l1_cross_distances(X) - if self._prohibitMultipleInputFeature(D): - raise Exception("Multiple input features cannot have the same" - " value for non-noisy data.") + if self._repeated_samples_prohibited(D): + raise ValueError("Repeated samples (rows in X) are not allowed for noise-free models") # Regression matrix and parameters F = self.regr(X) @@ -525,13 +524,12 @@ def predict(self, X, eval_MSE=False, batch_size=None): return y - def _prohibitMultipleInputFeature(self, D): + def _repeated_samples_prohibited(self, D): """ Multiple input features are only allowed for noisy data """ - return self.nugget <= DEFAULT_NUGGET and \ - np.min(np.sum(D, axis=1)) == 0. and \ - self.corr != correlation.pure_nugget + return self.nugget <= DEFAULT_NUGGET and np.min( + np.sum(D, axis=1)) == 0. and self.corr != correlation.pure_nugget def reduced_likelihood_function(self, theta=None): @@ -595,8 +593,8 @@ def reduced_likelihood_function(self, theta=None): if D is None: # Light storage mode (need to recompute D, ij and F) D, ij = l1_cross_distances(self.X) - if self._prohibitMultipleInputFeature(D): - raise Exception("Multiple X only make sense for noisy data") + if self._repeated_samples_prohibited(D): + raise ValueError("Repeated samples (rows in X) are not allowed for noise-free models") F = self.regr(self.X) # Set up R diff --git a/sklearn/gaussian_process/tests/test_gaussian_process.py b/sklearn/gaussian_process/tests/test_gaussian_process.py index b5460718192d7..96aab3a4f9b0e 100644 --- a/sklearn/gaussian_process/tests/test_gaussian_process.py +++ b/sklearn/gaussian_process/tests/test_gaussian_process.py @@ -8,7 +8,6 @@ from nose.tools import assert_true import numpy as np -import scipy.stats as ss from sklearn.gaussian_process import GaussianProcess from sklearn.gaussian_process import regression_models as regression @@ -142,7 +141,7 @@ def test_no_normalize(): assert_true(np.allclose(y_pred, y)) -@raises(Exception) +@raises(ValueError) def test_no_multiple_feature(): ''' check that multiple features are not allowed for non-noisy data @@ -172,7 +171,8 @@ def test_1d_noisy(regr=regression.constant, corr=correlation.absolute_exponentia X = np.atleast_2d([1., 3., 5., 6., 7., 8., 9., 10.] * 2).T x = np.atleast_2d(np.linspace(0, 10, 50)).T - y = f(X).ravel() + np.random.normal(0, 0.1, len(X)) + rs = np.random.RandomState(0) + y = f(X).ravel() + rs.normal(0, 0.1, len(X)) gp = GaussianProcess(regr=regr, corr=corr, beta0=beta0, theta0=1e-2, thetaL=1e-4, thetaU=1e-1, @@ -181,4 +181,3 @@ def test_1d_noisy(regr=regression.constant, corr=correlation.absolute_exponentia y = f(x).ravel() assert_true((np.abs(y_pred - y) <= (1.96 * MSE) ).all()) #check that true value is within 95% conf. int. -