
Commit ae89b25

Fixes #13936. Allow defining kernels that operate on structured data in addition to fixed-length feature vectors. Made the Gaussian process regressor and classifier compatible with either structure- or vector-based kernels.
1 parent e747376 commit ae89b25

File tree

4 files changed: +269 −41 lines
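In short, with this change a Gaussian process can be fit directly on variable-length sequences. A minimal usage sketch (`SequenceKernel` is the structured-data kernel defined in the new example file below):

from sklearn.gaussian_process import GaussianProcessRegressor

# SequenceKernel (defined in the example below) compares raw strings,
# so no fixed-length feature vectors are needed.
gp = GaussianProcessRegressor(kernel=SequenceKernel())
gp.fit(['AGCT', 'AGC', 'TAA'], [1.0, 1.0, 2.0])
print(gp.predict(['AACT']))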
@@ -0,0 +1,137 @@
"""
==========================================================================
Gaussian process regression and classification on discrete data structures
==========================================================================

This example illustrates the use of Gaussian processes to carry out
regression and classification tasks on data that are not in fixed-length
feature vector form. This is enabled through the use of kernel functions
that can directly operate on discrete structures such as variable-length
sequences, trees, and graphs.
"""
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from sklearn.gaussian_process.kernels import Kernel, Hyperparameter
from sklearn.gaussian_process.kernels import StructuredDataKernelMixin
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.base import clone


class SequenceKernel(StructuredDataKernelMixin, Kernel):
    '''
    A minimal (but valid) convolutional kernel for sequences of variable
    length.
    '''
    def __init__(self,
                 baseline_similarity=0.5,
                 baseline_similarity_bounds=(1e-5, 1)):
        self.baseline_similarity = baseline_similarity
        self.baseline_similarity_bounds = baseline_similarity_bounds

    @property
    def hyperparameter_baseline_similarity(self):
        return Hyperparameter("baseline_similarity",
                              "numeric",
                              self.baseline_similarity_bounds)

    def _f(self, s1, s2):
        '''
        Kernel value between a pair of sequences: matching character
        pairs score 1.0; non-matching pairs score ``baseline_similarity``.
        '''
        return sum([1.0 if c1 == c2 else self.baseline_similarity
                    for c1 in s1
                    for c2 in s2])

    def _g(self, s1, s2):
        '''
        Kernel derivative with respect to ``baseline_similarity``:
        the number of non-matching character pairs.
        '''
        return sum([0.0 if c1 == c2 else 1.0 for c1 in s1 for c2 in s2])

    def __call__(self, X, Y=None, eval_gradient=False):
        if Y is None:
            Y = X

        if eval_gradient:
            # The gradient has shape (len(X), len(Y), 1): one trailing
            # entry per hyperparameter (here only baseline_similarity).
            return (np.array([[self._f(x, y) for y in Y] for x in X]),
                    np.array([[[self._g(x, y)] for y in Y] for x in X]))
        else:
            return np.array([[self._f(x, y) for y in Y] for x in X])

    def diag(self, X):
        return np.array([self._f(x, x) for x in X])

    def is_stationary(self):
        return False

    def clone_with_theta(self, theta):
        cloned = clone(self)
        cloned.theta = theta
        return cloned
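A quick sanity check of the kernel (a sketch, not part of the committed file; the values follow from `_f` with the default baseline_similarity of 0.5):

k = SequenceKernel()
# 'AG' vs 'AG': the A-A and G-G pairs each score 1.0; the A-G and G-A
# pairs each score the 0.5 baseline, giving a kernel value of 3.0.
assert k(['AG'], ['AG'])[0, 0] == 3.0
assert k.diag(['AG'])[0] == 3.0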
kernel = SequenceKernel()

seqs = np.array(['AGCT', 'AGC', 'AACT', 'TAA', 'AAA', 'GAACA'])
vals = np.array([1.0, 1.0, 2.0, 2.0, 3.0, 3.0])

'''
Visualize sequence similarity matrix under the kernel
'''

K = kernel(seqs)
D = kernel.diag(seqs)

plt.figure(figsize=(8, 5))
# Normalize so that entry (i, j) shows K_ij / sqrt(K_ii * K_jj).
plt.imshow(np.diag(D**-0.5).dot(K).dot(np.diag(D**-0.5)))
plt.gca().set_xticks(np.arange(len(seqs)))
plt.gca().set_xticklabels(seqs)
plt.gca().set_yticks(np.arange(len(seqs)))
plt.gca().set_yticklabels(seqs)
plt.title('Sequence similarity under the kernel')

'''
Regression
'''

training_idx = [0, 1, 3, 4]
gp = GaussianProcessRegressor(kernel)
gp.fit(seqs[training_idx], vals[training_idx])

plt.figure(figsize=(8, 5))
plt.bar(np.arange(len(seqs)), gp.predict(seqs), color='b', label='prediction')
plt.bar(training_idx, vals[training_idx], width=0.2, color='r',
        alpha=0.5, label='training')
plt.gca().set_xticks(np.arange(len(seqs)))
plt.gca().set_xticklabels(seqs)
plt.title('Regression on sequences')
plt.legend()

'''
Classification
'''

seqs = np.array(['AGCT', 'CGA', 'TAAC', 'TCG', 'CTTT', 'TGCT'])
# label: whether the sequence contains at least one 'A'
clss = np.array([True, True, True, False, False, False])

gp = GaussianProcessClassifier(kernel)
gp.fit(seqs, clss)

seqs_test = ['AAA', 'ATAG', 'CTC', 'CT', 'C']
clss_test = [True, True, False, False, False]

plt.figure(figsize=(8, 5))
# Map boolean classes to +/-1 for plotting; plain truthiness is used
# because numpy bools are not the Python ``True`` singleton.
plt.scatter(np.arange(len(seqs)), [1.0 if c else -1.0 for c in clss],
            s=100, marker='o', edgecolor='none', facecolor=(1, 0.75, 0),
            label='training')
plt.scatter(len(seqs) + np.arange(len(seqs_test)),
            [1.0 if c else -1.0 for c in clss_test],
            s=100, marker='o', edgecolor='none', facecolor='r', label='truth')
plt.scatter(len(seqs) + np.arange(len(seqs_test)),
            [1.0 if c else -1.0 for c in gp.predict(seqs_test)],
            s=100, marker='x', edgecolor=(0, 1.0, 0.3), linewidth=2,
            label='prediction')
plt.gca().set_xticks(np.arange(len(seqs) + len(seqs_test)))
plt.gca().set_xticklabels(np.concatenate((seqs, seqs_test)))
plt.gca().set_yticks([-1, 1])
plt.gca().set_yticklabels([False, True])
plt.title('Classification on sequences')
plt.legend()

plt.show()

sklearn/gaussian_process/gpc.py

+36 −11
@@ -116,8 +116,11 @@ def optimizer(obj_func, initial_theta, bounds):
     Attributes
     ----------
-    X_train_ : array-like, shape = (n_samples, n_features)
-        Feature values in training data (also required for prediction)
+    X_train_ : possible values:
+        - array-like, shape = (n_samples, n_features)
+        - object list of length n_samples
+        Feature vectors or other representations of training data
+        (also required for prediction)

     y_train_ : array-like, shape = (n_samples,)
         Target values in training data (also required for prediction)
@@ -161,7 +164,9 @@ def fit(self, X, y):
         Parameters
         ----------
-        X : array-like, shape = (n_samples, n_features)
+        X : possible values:
+            - array-like, shape = (n_samples, n_features)
+            - object list of length n_samples
             Training data

         y : array-like, shape = (n_samples,)
@@ -248,7 +253,9 @@ def predict(self, X):
         Parameters
         ----------
-        X : array-like, shape = (n_samples, n_features)
+        X : possible values:
+            - array-like, shape = (n_samples, n_features)
+            - object list of length n_samples

         Returns
         -------
@@ -270,7 +277,9 @@ def predict_proba(self, X):
         Parameters
         ----------
-        X : array-like, shape = (n_samples, n_features)
+        X : possible values:
+            - array-like, shape = (n_samples, n_features)
+            - object list of length n_samples

         Returns
         -------
@@ -588,13 +597,21 @@ def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b",
         self.random_state = random_state
         self.multi_class = multi_class
         self.n_jobs = n_jobs
+        if kernel is None:
+            self.ensure_2d = True
+            self.x_dtype = 'numeric'
+        else:
+            self.ensure_2d = kernel.on_vector()
+            self.x_dtype = 'numeric' if kernel.on_vector() else None

     def fit(self, X, y):
         """Fit Gaussian process classification model

         Parameters
         ----------
-        X : array-like, shape = (n_samples, n_features)
+        X : possible values:
+            - array-like, shape = (n_samples, n_features)
+            - object list of length n_samples
             Training data

         y : array-like, shape = (n_samples,)
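The branches above assume every kernel exposes an `on_vector()` method reporting whether it expects fixed-length feature vectors. A hedged sketch of that contract, inferred from its use in this diff (the actual `kernels.py` change is among the 4 changed files but is not shown in this view):

class StructuredDataKernelMixin:
    # Sketch of the assumed mixin behavior, not the verbatim kernels.py
    # code: structured-data kernels accept arbitrary objects (sequences,
    # trees, graphs), so estimators must relax their input validation.
    def on_vector(self):
        return False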
@@ -604,7 +621,9 @@ def fit(self, X, y):
         -------
         self : returns an instance of self.
         """
-        X, y = check_X_y(X, y, multi_output=False)
+        X, y = check_X_y(X, y, multi_output=False,
+                         ensure_2d=self.ensure_2d,
+                         dtype=self.x_dtype)

         self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
             self.kernel, self.optimizer, self.n_restarts_optimizer,
@@ -648,23 +667,29 @@ def predict(self, X):
         Parameters
         ----------
-        X : array-like, shape = (n_samples, n_features)
+        X : possible values:
+            - array-like, shape = (n_samples, n_features)
+            - object list of length n_samples
+            Query samples

         Returns
         -------
         C : array, shape = (n_samples,)
             Predicted target values for X, values are from ``classes_``
         """
         check_is_fitted(self, ["classes_", "n_classes_"])
-        X = check_array(X)
+        X = check_array(X, ensure_2d=self.ensure_2d, dtype=self.x_dtype)
         return self.base_estimator_.predict(X)

     def predict_proba(self, X):
         """Return probability estimates for the test vector X.

         Parameters
         ----------
-        X : array-like, shape = (n_samples, n_features)
+        X : possible values:
+            - array-like, shape = (n_samples, n_features)
+            - object list of length n_samples
+            Query points

         Returns
         -------
@@ -678,7 +703,7 @@ def predict_proba(self, X):
             raise ValueError("one_vs_one multi-class mode does not support "
                              "predicting probability estimates. Use "
                              "one_vs_rest mode instead.")
-        X = check_array(X)
+        X = check_array(X, ensure_2d=self.ensure_2d, dtype=self.x_dtype)
         return self.base_estimator_.predict_proba(X)

     @property

sklearn/gaussian_process/gpr.py

+22 −6
@@ -114,8 +114,11 @@ def optimizer(obj_func, initial_theta, bounds):
     Attributes
     ----------
-    X_train_ : array-like, shape = (n_samples, n_features)
-        Feature values in training data (also required for prediction)
+    X_train_ : possible values:
+        - array-like, shape = (n_samples, n_features)
+        - object list of length n_samples
+        Feature vectors or other representations of training data
+        (also required for prediction)

     y_train_ : array-like, shape = (n_samples, [n_output_dims])
         Target values in training data (also required for prediction)
@@ -158,13 +161,21 @@ def __init__(self, kernel=None, alpha=1e-10,
         self.normalize_y = normalize_y
         self.copy_X_train = copy_X_train
         self.random_state = random_state
+        if kernel is None:
+            self.ensure_2d = True
+            self.x_dtype = 'numeric'
+        else:
+            self.ensure_2d = kernel.on_vector()
+            self.x_dtype = 'numeric' if kernel.on_vector() else None

     def fit(self, X, y):
         """Fit Gaussian process regression model.

         Parameters
         ----------
-        X : array-like, shape = (n_samples, n_features)
+        X : possible values:
+            - array-like, shape = (n_samples, n_features)
+            - object list of length n_samples
             Training data

         y : array-like, shape = (n_samples, [n_output_dims])
@@ -182,7 +193,10 @@ def fit(self, X, y):
         self._rng = check_random_state(self.random_state)

-        X, y = check_X_y(X, y, multi_output=True, y_numeric=True)
+        X, y = check_X_y(X, y, multi_output=True,
+                         y_numeric=True,
+                         ensure_2d=self.ensure_2d,
+                         dtype=self.x_dtype)

         # Normalize target value
         if self.normalize_y:
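For context, these relaxed flags are what let a 1-D array of strings through validation. A small illustration of `check_array` behavior (a sketch, not part of this commit):

import numpy as np
from sklearn.utils import check_array

seqs = np.array(['AGCT', 'AGC', 'TAA'])
# The defaults (ensure_2d=True, dtype='numeric') would reject this input;
# the flags used in the diff above let it pass through unchanged.
checked = check_array(seqs, ensure_2d=False, dtype=None)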
@@ -271,7 +285,9 @@ def predict(self, X, return_std=False, return_cov=False):
         Parameters
         ----------
-        X : array-like, shape = (n_samples, n_features)
+        X : possible values:
+            - array-like, shape = (n_samples, n_features)
+            - object list of length n_samples
             Query points where the GP is evaluated

         return_std : bool, default: False
@@ -300,7 +316,7 @@ def predict(self, X, return_std=False, return_cov=False):
                 "Not returning standard deviation of predictions when "
                 "returning full covariance.")

-        X = check_array(X)
+        X = check_array(X, ensure_2d=self.ensure_2d, dtype=self.x_dtype)

         if not hasattr(self, "X_train_"):  # Unfitted; predict based on GP prior
             if self.kernel is None:
