scikit-learn · filthysocks · May 19, 2014 · May 21, 2014 · May 21, 2014 · May 22, 2014
diff --git a/sklearn/gaussian_process/gaussian_process.py b/sklearn/gaussian_process/gaussian_process.py
@@ -16,6 +16,7 @@
 from . import correlation_models as correlation
 
 MACHINE_EPSILON = np.finfo(np.double).eps
+DEFAULT_NUGGET = 10. * MACHINE_EPSILON
 
 
 def l1_cross_distances(X):
@@ -165,11 +166,11 @@ class GaussianProcess(BaseEstimator, RegressorMixin):
 
     Attributes
     ----------
-    theta_ : array
+    `theta_`: array
         Specified theta OR the best set of autocorrelation parameters (the \
         sought maximizer of the reduced likelihood function).
 
-    reduced_likelihood_function_value_ : array
+    `reduced_likelihood_function_value_`: array
         The optimal reduced likelihood function value.
 
     Examples
@@ -221,7 +222,7 @@ def __init__(self, regr='constant', corr='squared_exponential', beta0=None,
                  storage_mode='full', verbose=False, theta0=1e-1,
                  thetaL=None, thetaU=None, optimizer='fmin_cobyla',
                  random_start=1, normalize=True,
-                 nugget=10. * MACHINE_EPSILON, random_state=None):
+                 nugget=DEFAULT_NUGGET, random_state=None):
 
         self.regr = regr
         self.corr = corr
@@ -263,12 +264,12 @@ def fit(self, X, y):
         self.random_state = check_random_state(self.random_state)
 
         # Force data to 2D numpy.array
-        X = check_array(X)
+        X = array2d(X)
         y = np.asarray(y)
         self.y_ndim_ = y.ndim
         if y.ndim == 1:
             y = y[:, np.newaxis]
-        check_consistent_length(X, y)
+        X, y = check_arrays(X, y)
 
         # Check shapes of DOE & observations
         n_samples, n_features = X.shape
@@ -296,10 +297,8 @@ def fit(self, X, y):
 
         # Calculate matrix of distances D between samples
         D, ij = l1_cross_distances(X)
-        if (np.min(np.sum(D, axis=1)) == 0.
-                and self.corr != correlation.pure_nugget):
-            raise Exception("Multiple input features cannot have the same"
-                            " target value.")
+        if self._repeated_samples_prohibited(D):
+            raise ValueError("Repeated samples (rows in X) are not allowed for noise-free models")
 
         # Regression matrix and parameters
         F = self.regr(X)
@@ -525,6 +524,14 @@ def predict(self, X, eval_MSE=False, batch_size=None):
 
                 return y
 
+    def _repeated_samples_prohibited(self, D):
+        """
+        Multiple input features are only allowed for noisy data
+        """
+        return self.nugget <= DEFAULT_NUGGET and np.min(
+            np.sum(D, axis=1)) == 0. and self.corr != correlation.pure_nugget
+
+
     def reduced_likelihood_function(self, theta=None):
         """
         This function determines the BLUP parameters and evaluates the reduced
@@ -586,9 +593,8 @@ def reduced_likelihood_function(self, theta=None):
         if D is None:
             # Light storage mode (need to recompute D, ij and F)
             D, ij = l1_cross_distances(self.X)
-            if (np.min(np.sum(D, axis=1)) == 0.
-                    and self.corr != correlation.pure_nugget):
-                raise Exception("Multiple X are not allowed")
+            if self._repeated_samples_prohibited(D):
+                raise ValueError("Repeated samples (rows in X) are not allowed for noise-free models")
             F = self.regr(self.X)
 
         # Set up R
@@ -874,7 +880,7 @@ def _check_params(self, n_samples=None):
         if np.any(self.nugget) < 0.:
             raise ValueError("nugget must be positive or zero.")
         if (n_samples is not None
-                and self.nugget.shape not in [(), (n_samples,)]):
+            and self.nugget.shape not in [(), (n_samples,)]):
             raise ValueError("nugget must be either a scalar "
                              "or array of length n_samples.")
 

diff --git a/sklearn/gaussian_process/tests/test_gaussian_process.py b/sklearn/gaussian_process/tests/test_gaussian_process.py
@@ -40,7 +40,7 @@ def test_1d(regr=regression.constant, corr=correlation.squared_exponential,
                 and np.allclose(MSE2, 0., atol=10))
 
 
-def test_2d(regr=regression.constant, corr=correlation.squared_exponential,
+def check2d(X, regr=regression.constant, corr=correlation.squared_exponential,
             random_start=10, beta0=None):
     """
     MLE estimation of a two-dimensional Gaussian Process model accounting for
@@ -50,14 +50,7 @@ def test_2d(regr=regression.constant, corr=correlation.squared_exponential,
     """
     b, kappa, e = 5., .5, .1
     g = lambda x: b - x[:, 1] - kappa * (x[:, 0] - e) ** 2.
-    X = np.array([[-4.61611719, -6.00099547],
-                  [4.10469096, 5.32782448],
-                  [0.00000000, -0.50000000],
-                  [-6.17289014, -4.6984743],
-                  [1.3109306, -6.93271427],
-                  [-5.03823144, 3.10584743],
-                  [-2.87600388, 6.74310541],
-                  [5.21301203, 4.26386883]])
+
     y = g(X).ravel()
 
     thetaL = [1e-4] * 2
@@ -75,8 +68,8 @@ def test_2d(regr=regression.constant, corr=correlation.squared_exponential,
     assert_true(np.all(gp.theta_ <= thetaU)) # Upper bounds of hyperparameters
 
 
-def test_2d_2d(regr=regression.constant, corr=correlation.squared_exponential,
-               random_start=10, beta0=None):
+def check2d_2d(X, regr=regression.constant, corr=correlation.squared_exponential,
+               random_start=10, beta0=None, theta0=[1e-2] * 2, thetaL=[1e-4] * 2, thetaU=[1e-1] * 2):
     """
     MLE estimation of a two-dimensional Gaussian Process model accounting for
     anisotropy. Check random start optimization.
@@ -86,18 +79,11 @@ def test_2d_2d(regr=regression.constant, corr=correlation.squared_exponential,
     b, kappa, e = 5., .5, .1
     g = lambda x: b - x[:, 1] - kappa * (x[:, 0] - e) ** 2.
     f = lambda x: np.vstack((g(x), g(x))).T
-    X = np.array([[-4.61611719, -6.00099547],
-                  [4.10469096, 5.32782448],
-                  [0.00000000, -0.50000000],
-                  [-6.17289014, -4.6984743],
-                  [1.3109306, -6.93271427],
-                  [-5.03823144, 3.10584743],
-                  [-2.87600388, 6.74310541],
-                  [5.21301203, 4.26386883]])
+
     y = f(X)
     gp = GaussianProcess(regr=regr, corr=corr, beta0=beta0,
-                         theta0=[1e-2] * 2, thetaL=[1e-4] * 2,
-                         thetaU=[1e-1] * 2,
+                         theta0=theta0, thetaL=thetaL,
+                         thetaU=thetaU,
                          random_start=random_start, verbose=False)
     gp.fit(X, y)
     y_pred, MSE = gp.predict(X, eval_MSE=True)
@@ -119,23 +105,42 @@ def test_more_builtin_correlation_models(random_start=1):
     all_corr = ['absolute_exponential', 'squared_exponential', 'cubic',
                 'linear']
 
+    X = np.array([[-4.61611719, -6.00099547],
+                  [4.10469096, 5.32782448],
+                  [0.00000000, -0.50000000],
+                  [-6.17289014, -4.6984743],
+                  [1.3109306, -6.93271427],
+                  [-5.03823144, 3.10584743],
+                  [-2.87600388, 6.74310541],
+                  [5.21301203, 4.26386883]])
+
     for corr in all_corr:
         test_1d(regr='constant', corr=corr, random_start=random_start)
-        test_2d(regr='constant', corr=corr, random_start=random_start)
-        test_2d_2d(regr='constant', corr=corr, random_start=random_start)
+        check2d(X, regr='constant', corr=corr, random_start=random_start)
+        check2d_2d(X, regr='constant', corr=corr, random_start=random_start)
 
 
 def test_ordinary_kriging():
     """
     Repeat test_1d and test_2d with given regression weights (beta0) for
     different regression models (Ordinary Kriging).
     """
+
+    X = np.array([[-4.61611719, -6.00099547],
+                  [4.10469096, 5.32782448],
+                  [0.00000000, -0.50000000],
+                  [-6.17289014, -4.6984743],
+                  [1.3109306, -6.93271427],
+                  [-5.03823144, 3.10584743],
+                  [-2.87600388, 6.74310541],
+                  [5.21301203, 4.26386883]])
+
     test_1d(regr='linear', beta0=[0., 0.5])
     test_1d(regr='quadratic', beta0=[0., 0.5, 0.5])
-    test_2d(regr='linear', beta0=[0., 0.5, 0.5])
-    test_2d(regr='quadratic', beta0=[0., 0.5, 0.5, 0.5, 0.5, 0.5])
-    test_2d_2d(regr='linear', beta0=[0., 0.5, 0.5])
-    test_2d_2d(regr='quadratic', beta0=[0., 0.5, 0.5, 0.5, 0.5, 0.5])
+    check2d(X, regr='linear', beta0=[0., 0.5, 0.5])
+    check2d(X, regr='quadratic', beta0=[0., 0.5, 0.5, 0.5, 0.5, 0.5])
+    check2d_2d(X, regr='linear', beta0=[0., 0.5, 0.5])
+    check2d_2d(X, regr='quadratic', beta0=[0., 0.5, 0.5, 0.5, 0.5, 0.5])
 
 
 def test_no_normalize():
@@ -165,3 +170,45 @@ def test_random_starts():
         rlf = gp.reduced_likelihood_function()[0]
         assert_greater(rlf, best_likelihood - np.finfo(np.float32).eps)
         best_likelihood = rlf
+
+
+@raises(ValueError)
+def test_no_multiple_feature():
+    '''
+    check that multiple features are not allowed for non-noisy data
+    '''
+    X = np.array([[-4.61611719, -6.00099547],
+                  [4.10469096, 5.32782448],
+                  [0.00000000, -0.50000000],
+                  [-6.17289014, -4.6984743],
+                  [-6.17289014, -4.6984743],
+                  [1.3109306, -6.93271427],
+                  [-5.03823144, 3.10584743],
+                  [-2.87600388, 6.74310541],
+                  [5.21301203, 4.26386883]])
+
+    check2d(X)
+
+
+def test_1d_noisy(regr=regression.constant, corr=correlation.absolute_exponential,
+                  random_start=10, beta0=None):
+    """
+    MLE estimation of a one-dimensional Gaussian Process model.
+    Check random start optimization with noisy / duplicate inputs.
+
+    Test the interpolating property.
+    """
+
+    X = np.atleast_2d([1., 3., 5., 6., 7., 8., 9., 10.] * 2).T
+    x = np.atleast_2d(np.linspace(0, 10, 50)).T
+
+    rs = np.random.RandomState(0)
+    y = f(X).ravel() + rs.normal(0, 0.1, len(X))
+
+    gp = GaussianProcess(regr=regr, corr=corr, beta0=beta0,
+                         theta0=1e-2, thetaL=1e-4, thetaU=1e-1,
+                         random_start=random_start, verbose=False, nugget=0.01).fit(X, y)
+    y_pred, MSE = gp.predict(x, eval_MSE=True)
+
+    y = f(x).ravel()
+    assert_true((np.abs(y_pred - y) <= (1.96 * MSE)  ).all())  #check that true value is within 95% conf. int.