
Commit ae89b25

Fixes #13936. Allow defining kernels that operate on structured data in addition to fixed-length feature vectors. Made the Gaussian process regressor and classifier compatible with either structure- or vector-based kernels.
1 parent e747376 commit ae89b25

File tree

4 files changed: +269 −41 lines
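In short, with this change a Gaussian process can be fit directly on variable-length sequences. A minimal usage sketch (`SequenceKernel` is the structured-data kernel defined in the new example file below):

from sklearn.gaussian_process import GaussianProcessRegressor

# SequenceKernel (defined in the example below) compares raw strings,
# so no fixed-length feature vectors are needed.
gp = GaussianProcessRegressor(kernel=SequenceKernel())
gp.fit(['AGCT', 'AGC', 'TAA'], [1.0, 1.0, 2.0])
print(gp.predict(['AACT']))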
@@ -0,0 +1,137 @@
"""
==========================================================================
Gaussian process regression and classification on discrete data structures
==========================================================================

This example illustrates the use of Gaussian processes to carry out
regression and classification tasks on data that are not in fixed-length
feature vector form. This is enabled through the use of kernel functions
that can directly operate on discrete structures such as variable-length
sequences, trees, and graphs.
"""
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from sklearn.gaussian_process.kernels import Kernel, Hyperparameter
from sklearn.gaussian_process.kernels import StructuredDataKernelMixin
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.base import clone


class SequenceKernel(StructuredDataKernelMixin, Kernel):
    '''
    A minimal (but valid) convolutional kernel for sequences of variable
    length.
    '''
    def __init__(self,
                 baseline_similarity=0.5,
                 baseline_similarity_bounds=(1e-5, 1)):
        self.baseline_similarity = baseline_similarity
        self.baseline_similarity_bounds = baseline_similarity_bounds

    @property
    def hyperparameter_baseline_similarity(self):
        return Hyperparameter("baseline_similarity",
                              "numeric",
                              self.baseline_similarity_bounds)

    def _f(self, s1, s2):
        '''
        Kernel value between a pair of sequences: matching character
        pairs score 1.0; non-matching pairs score ``baseline_similarity``.
        '''
        return sum([1.0 if c1 == c2 else self.baseline_similarity
                    for c1 in s1
                    for c2 in s2])

    def _g(self, s1, s2):
        '''
        Kernel derivative with respect to ``baseline_similarity``:
        the number of non-matching character pairs.
        '''
        return sum([0.0 if c1 == c2 else 1.0 for c1 in s1 for c2 in s2])

    def __call__(self, X, Y=None, eval_gradient=False):
        if Y is None:
            Y = X

        if eval_gradient:
            # The gradient has shape (len(X), len(Y), 1): one trailing
            # entry per hyperparameter (here only baseline_similarity).
            return (np.array([[self._f(x, y) for y in Y] for x in X]),
                    np.array([[[self._g(x, y)] for y in Y] for x in X]))
        else:
            return np.array([[self._f(x, y) for y in Y] for x in X])

    def diag(self, X):
        return np.array([self._f(x, x) for x in X])

    def is_stationary(self):
        return False

    def clone_with_theta(self, theta):
        cloned = clone(self)
        cloned.theta = theta
        return cloned
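A quick sanity check of the kernel (a sketch, not part of the committed file; the values follow from `_f` with the default baseline_similarity of 0.5):

k = SequenceKernel()
# 'AG' vs 'AG': the A-A and G-G pairs each score 1.0; the A-G and G-A
# pairs each score the 0.5 baseline, giving a kernel value of 3.0.
assert k(['AG'], ['AG'])[0, 0] == 3.0
assert k.diag(['AG'])[0] == 3.0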
kernel = SequenceKernel()

seqs = np.array(['AGCT', 'AGC', 'AACT', 'TAA', 'AAA', 'GAACA'])
vals = np.array([1.0, 1.0, 2.0, 2.0, 3.0, 3.0])

'''
Visualize sequence similarity matrix under the kernel
'''

K = kernel(seqs)
D = kernel.diag(seqs)

plt.figure(figsize=(8, 5))
# Normalize so that entry (i, j) shows K_ij / sqrt(K_ii * K_jj).
plt.imshow(np.diag(D**-0.5).dot(K).dot(np.diag(D**-0.5)))
plt.gca().set_xticks(np.arange(len(seqs)))
plt.gca().set_xticklabels(seqs)
plt.gca().set_yticks(np.arange(len(seqs)))
plt.gca().set_yticklabels(seqs)
plt.title('Sequence similarity under the kernel')

'''
Regression
'''

training_idx = [0, 1, 3, 4]
gp = GaussianProcessRegressor(kernel)
gp.fit(seqs[training_idx], vals[training_idx])

plt.figure(figsize=(8, 5))
plt.bar(np.arange(len(seqs)), gp.predict(seqs), color='b', label='prediction')
plt.bar(training_idx, vals[training_idx], width=0.2, color='r',
        alpha=0.5, label='training')
plt.gca().set_xticks(np.arange(len(seqs)))
plt.gca().set_xticklabels(seqs)
plt.title('Regression on sequences')
plt.legend()

'''
Classification
'''

seqs = np.array(['AGCT', 'CGA', 'TAAC', 'TCG', 'CTTT', 'TGCT'])
# label: whether the sequence contains at least one 'A'
clss = np.array([True, True, True, False, False, False])

gp = GaussianProcessClassifier(kernel)
gp.fit(seqs, clss)

seqs_test = ['AAA', 'ATAG', 'CTC', 'CT', 'C']
clss_test = [True, True, False, False, False]

plt.figure(figsize=(8, 5))
# Map boolean classes to +/-1 for plotting; plain truthiness is used
# because numpy bools are not the Python ``True`` singleton.
plt.scatter(np.arange(len(seqs)), [1.0 if c else -1.0 for c in clss],
            s=100, marker='o', edgecolor='none', facecolor=(1, 0.75, 0),
            label='training')
plt.scatter(len(seqs) + np.arange(len(seqs_test)),
            [1.0 if c else -1.0 for c in clss_test],
            s=100, marker='o', edgecolor='none', facecolor='r', label='truth')
plt.scatter(len(seqs) + np.arange(len(seqs_test)),
            [1.0 if c else -1.0 for c in gp.predict(seqs_test)],
            s=100, marker='x', edgecolor=(0, 1.0, 0.3), linewidth=2,
            label='prediction')
plt.gca().set_xticks(np.arange(len(seqs) + len(seqs_test)))
plt.gca().set_xticklabels(np.concatenate((seqs, seqs_test)))
plt.gca().set_yticks([-1, 1])
plt.gca().set_yticklabels([False, True])
plt.title('Classification on sequences')
plt.legend()

plt.show()

sklearn/gaussian_process/gpc.py

+36 −11
@@ -116,8 +116,11 @@ def optimizer(obj_func, initial_theta, bounds):
     Attributes
     ----------
-    X_train_ : array-like, shape = (n_samples, n_features)
-        Feature values in training data (also required for prediction)
+    X_train_ : possible values:
+        - array-like, shape = (n_samples, n_features)
+        - object list of length n_samples
+        Feature vectors or other representations of training data
+        (also required for prediction)

     y_train_ : array-like, shape = (n_samples,)
         Target values in training data (also required for prediction)
@@ -161,7 +164,9 @@ def fit(self, X, y):
         Parameters
         ----------
-        X : array-like, shape = (n_samples, n_features)
+        X : possible values:
+            - array-like, shape = (n_samples, n_features)
+            - object list of length n_samples
             Training data

         y : array-like, shape = (n_samples,)
@@ -248,7 +253,9 @@ def predict(self, X):
         Parameters
         ----------
-        X : array-like, shape = (n_samples, n_features)
+        X : possible values:
+            - array-like, shape = (n_samples, n_features)
+            - object list of length n_samples

         Returns
         -------
@@ -270,7 +277,9 @@ def predict_proba(self, X):
         Parameters
         ----------
-        X : array-like, shape = (n_samples, n_features)
+        X : possible values:
+            - array-like, shape = (n_samples, n_features)
+            - object list of length n_samples

         Returns
         -------
@@ -588,13 +597,21 @@ def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b",
         self.random_state = random_state
         self.multi_class = multi_class
         self.n_jobs = n_jobs
+        if kernel is None:
+            self.ensure_2d = True
+            self.x_dtype = 'numeric'
+        else:
+            self.ensure_2d = kernel.on_vector()
+            self.x_dtype = 'numeric' if kernel.on_vector() else None

     def fit(self, X, y):
         """Fit Gaussian process classification model

         Parameters
         ----------
-        X : array-like, shape = (n_samples, n_features)
+        X : possible values:
+            - array-like, shape = (n_samples, n_features)
+            - object list of length n_samples
             Training data

         y : array-like, shape = (n_samples,)
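The branches above assume every kernel exposes an `on_vector()` method reporting whether it expects fixed-length feature vectors. A hedged sketch of that contract, inferred from its use in this diff (the actual `kernels.py` change is among the 4 changed files but is not shown in this view):

class StructuredDataKernelMixin:
    # Sketch of the assumed mixin behavior, not the verbatim kernels.py
    # code: structured-data kernels accept arbitrary objects (sequences,
    # trees, graphs), so estimators must relax their input validation.
    def on_vector(self):
        return False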
@@ -604,7 +621,9 @@ def fit(self, X, y):
         -------
         self : returns an instance of self.
         """
-        X, y = check_X_y(X, y, multi_output=False)
+        X, y = check_X_y(X, y, multi_output=False,
+                         ensure_2d=self.ensure_2d,
+                         dtype=self.x_dtype)

         self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
             self.kernel, self.optimizer, self.n_restarts_optimizer,
@@ -648,23 +667,29 @@ def predict(self, X):
         Parameters
         ----------
-        X : array-like, shape = (n_samples, n_features)
+        X : possible values:
+            - array-like, shape = (n_samples, n_features)
+            - object list of length n_samples
+            Query samples

         Returns
         -------
         C : array, shape = (n_samples,)
             Predicted target values for X, values are from ``classes_``
         """
         check_is_fitted(self, ["classes_", "n_classes_"])
-        X = check_array(X)
+        X = check_array(X, ensure_2d=self.ensure_2d, dtype=self.x_dtype)
         return self.base_estimator_.predict(X)

     def predict_proba(self, X):
         """Return probability estimates for the test vector X.

         Parameters
         ----------
-        X : array-like, shape = (n_samples, n_features)
+        X : possible values:
+            - array-like, shape = (n_samples, n_features)
+            - object list of length n_samples
+            Query points

         Returns
         -------
@@ -678,7 +703,7 @@ def predict_proba(self, X):
             raise ValueError("one_vs_one multi-class mode does not support "
                              "predicting probability estimates. Use "
                              "one_vs_rest mode instead.")
-        X = check_array(X)
+        X = check_array(X, ensure_2d=self.ensure_2d, dtype=self.x_dtype)
         return self.base_estimator_.predict_proba(X)

     @property

sklearn/gaussian_process/gpr.py

+22 −6
@@ -114,8 +114,11 @@ def optimizer(obj_func, initial_theta, bounds):
     Attributes
     ----------
-    X_train_ : array-like, shape = (n_samples, n_features)
-        Feature values in training data (also required for prediction)
+    X_train_ : possible values:
+        - array-like, shape = (n_samples, n_features)
+        - object list of length n_samples
+        Feature vectors or other representations of training data
+        (also required for prediction)

     y_train_ : array-like, shape = (n_samples, [n_output_dims])
         Target values in training data (also required for prediction)
@@ -158,13 +161,21 @@ def __init__(self, kernel=None, alpha=1e-10,
         self.normalize_y = normalize_y
         self.copy_X_train = copy_X_train
         self.random_state = random_state
+        if kernel is None:
+            self.ensure_2d = True
+            self.x_dtype = 'numeric'
+        else:
+            self.ensure_2d = kernel.on_vector()
+            self.x_dtype = 'numeric' if kernel.on_vector() else None

     def fit(self, X, y):
         """Fit Gaussian process regression model.

         Parameters
         ----------
-        X : array-like, shape = (n_samples, n_features)
+        X : possible values:
+            - array-like, shape = (n_samples, n_features)
+            - object list of length n_samples
             Training data

         y : array-like, shape = (n_samples, [n_output_dims])
@@ -182,7 +193,10 @@ def fit(self, X, y):
         self._rng = check_random_state(self.random_state)

-        X, y = check_X_y(X, y, multi_output=True, y_numeric=True)
+        X, y = check_X_y(X, y, multi_output=True,
+                         y_numeric=True,
+                         ensure_2d=self.ensure_2d,
+                         dtype=self.x_dtype)

         # Normalize target value
         if self.normalize_y:
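For context, these relaxed flags are what let a 1-D array of strings through validation. A small illustration of `check_array` behavior (a sketch, not part of this commit):

import numpy as np
from sklearn.utils import check_array

seqs = np.array(['AGCT', 'AGC', 'TAA'])
# The defaults (ensure_2d=True, dtype='numeric') would reject this input;
# the flags used in the diff above let it pass through unchanged.
checked = check_array(seqs, ensure_2d=False, dtype=None)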
@@ -271,7 +285,9 @@ def predict(self, X, return_std=False, return_cov=False):
         Parameters
         ----------
-        X : array-like, shape = (n_samples, n_features)
+        X : possible values:
+            - array-like, shape = (n_samples, n_features)
+            - object list of length n_samples
             Query points where the GP is evaluated

         return_std : bool, default: False
@@ -300,7 +316,7 @@ def predict(self, X, return_std=False, return_cov=False):
                 "Not returning standard deviation of predictions when "
                 "returning full covariance.")

-        X = check_array(X)
+        X = check_array(X, ensure_2d=self.ensure_2d, dtype=self.x_dtype)

         if not hasattr(self, "X_train_"):  # Unfitted; predict based on GP prior
             if self.kernel is None:
