Commit e36cad4

Merge pull request lisa-lab#32 from mesnilgr/master

adding RNN with Word Embeddings tutorial

2 parents: 31226dd + 599e458

7 files changed (+806, -5 lines)

code/rnnslu.py

Lines changed: 375 additions & 0 deletions
@@ -0,0 +1,375 @@
from collections import OrderedDict
import copy
import cPickle
import gzip
import os
import urllib
import random
import stat
import subprocess
import sys
import time

import numpy

import theano
from theano import tensor as T

PREFIX = os.getenv('ATISDATA', 'data')


# utils functions
def shuffle(lol, seed):
    '''
    lol :: list of lists as input
    seed :: seed for the shuffling

    shuffles each list in place, all in the same order
    '''
    for l in lol:
        # re-seeding before each list gives every list the same permutation
        random.seed(seed)
        random.shuffle(l)


# start-snippet-1
def contextwin(l, win):
    '''
    win :: int corresponding to the size of the window
    l :: array containing the word indexes of a sentence

    returns a list of lists of indexes corresponding
    to the context window surrounding each word in the sentence
    '''
    assert (win % 2) == 1
    assert win >= 1
    l = list(l)

    # pad both sides with -1, which indexes the extra embedding row
    lpadded = win // 2 * [-1] + l + win // 2 * [-1]
    out = [lpadded[i:(i + win)] for i in range(len(l))]

    assert len(out) == len(l)
    return out
# end-snippet-1
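
# Worked example (added for illustration, not part of the commit): with a
# window of size 3, each word index is flanked by its left and right
# neighbours, and -1 pads positions falling outside the sentence:
#   contextwin([0, 1, 2, 3], 3)
#   => [[-1, 0, 1], [0, 1, 2], [1, 2, 3], [2, 3, -1]]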


# data loading functions
def atisfold(fold):
    assert fold in range(5)
    filename = os.path.join(PREFIX, 'atis.fold' + str(fold) + '.pkl.gz')
    f = gzip.open(filename, 'rb')
    train_set, valid_set, test_set, dicts = cPickle.load(f)
    return train_set, valid_set, test_set, dicts


# metrics function using conlleval.pl
def conlleval(p, g, w, filename):
    '''
    INPUT:
    p :: predictions
    g :: groundtruth
    w :: corresponding words

    OUTPUT:
    filename :: name of the file where the predictions
    are written. it will be the input of the conlleval.pl script
    for computing the performance in terms of precision,
    recall and f1 score
    '''
    out = ''
    for sl, sp, sw in zip(g, p, w):
        out += 'BOS O O\n'
        for wl, wp, word in zip(sl, sp, sw):
            out += word + ' ' + wl + ' ' + wp + '\n'
        out += 'EOS O O\n\n'

    f = open(filename, 'w')
    f.writelines(out)
    f.close()

    return get_perf(filename)
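
# The file handed to conlleval.pl holds one "word gold-label predicted-label"
# triple per line, with each sentence wrapped in BOS/EOS markers, e.g.
# (illustrative ATIS-style labels, not taken from the code):
#   BOS O O
#   flights O O
#   from O O
#   boston B-fromloc.city_name B-fromloc.city_name
#   EOS O O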


def download(origin):
    '''
    download the corresponding atis file
    from http://www-etud.iro.umontreal.ca/~mesnilgr/atis/
    '''
    print 'Downloading data from %s' % origin
    name = origin.split('/')[-1]
    urllib.urlretrieve(origin, name)


def get_perf(filename):
    ''' runs the conlleval.pl perl script to obtain
    precision/recall and F1 score '''
    _conlleval = 'conlleval.pl'
    if not os.path.isfile(_conlleval):
        url = 'http://www-etud.iro.umontreal.ca/~mesnilgr/atis/conlleval.pl'
        download(url)
        os.chmod('conlleval.pl', stat.S_IRWXU)  # give the user execute permission

    proc = subprocess.Popen(["perl", _conlleval],
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE)

    stdout, _ = proc.communicate(open(filename).read())
    for line in stdout.split('\n'):
        if 'accuracy' in line:
            out = line.split()
            break

    precision = float(out[6][:-2])
    recall = float(out[8][:-2])
    f1score = float(out[10])

    return {'p': precision, 'r': recall, 'f1': f1score}
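
# Note: the out[6], out[8] and out[10] token positions assume the exact
# summary-line format printed by the conlleval.pl copy downloaded above;
# other versions of the script may shift these indices.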


# start-snippet-2
class RNNSLU(object):
    ''' Elman neural net model '''
    def __init__(self, nh, nc, ne, de, cs):
        '''
        nh :: dimension of the hidden layer
        nc :: number of classes
        ne :: number of word embeddings in the vocabulary
        de :: dimension of the word embeddings
        cs :: word window context size
        '''
        # parameters of the model
        self.emb = theano.shared(name='embeddings',
                                 value=0.2 * numpy.random.uniform(-1.0, 1.0,
                                 (ne+1, de))
                                 # add one for padding at the end
                                 .astype(theano.config.floatX))
        self.wx = theano.shared(name='wx',
                                value=0.2 * numpy.random.uniform(-1.0, 1.0,
                                (de * cs, nh))
                                .astype(theano.config.floatX))
        self.wh = theano.shared(name='wh',
                                value=0.2 * numpy.random.uniform(-1.0, 1.0,
                                (nh, nh))
                                .astype(theano.config.floatX))
        self.w = theano.shared(name='w',
                               value=0.2 * numpy.random.uniform(-1.0, 1.0,
                               (nh, nc))
                               .astype(theano.config.floatX))
        self.bh = theano.shared(name='bh',
                                value=numpy.zeros(nh,
                                dtype=theano.config.floatX))
        self.b = theano.shared(name='b',
                               value=numpy.zeros(nc,
                               dtype=theano.config.floatX))
        self.h0 = theano.shared(name='h0',
                                value=numpy.zeros(nh,
                                dtype=theano.config.floatX))

        # bundle
        self.params = [self.emb, self.wx, self.wh, self.w,
                       self.bh, self.b, self.h0]
        # end-snippet-2
        # as many columns as context window size
        # as many lines as words in the sentence
        # start-snippet-3
        idxs = T.imatrix()
        x = self.emb[idxs].reshape((idxs.shape[0], de*cs))
        y_sentence = T.ivector('y_sentence')  # labels
        # end-snippet-3 start-snippet-4
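        # Shape sketch (illustrative numbers, not in the original code):
        # with the tutorial defaults de=50 and cs=7, a 5-word sentence gives
        # an idxs matrix of shape (5, 7); indexing self.emb and reshaping
        # yields x of shape (5, 350), one concatenated context window per word.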

        def recurrence(x_t, h_tm1):
            h_t = T.nnet.sigmoid(T.dot(x_t, self.wx)
                                 + T.dot(h_tm1, self.wh) + self.bh)
            s_t = T.nnet.softmax(T.dot(h_t, self.w) + self.b)
            return [h_t, s_t]

        [h, s], _ = theano.scan(fn=recurrence,
                                sequences=x,
                                outputs_info=[self.h0, None],
                                n_steps=x.shape[0])

        p_y_given_x_sentence = s[:, 0, :]
        y_pred = T.argmax(p_y_given_x_sentence, axis=1)
        # end-snippet-4
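        # Note: T.nnet.softmax returns a (1, nc) row for each step, so scan
        # stacks s into a (n_steps, 1, nc) tensor; s[:, 0, :] above drops that
        # broadcast dimension, leaving one probability row per word.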

        # cost and gradients and learning rate
        # start-snippet-5
        lr = T.scalar('lr')

        sentence_nll = -T.mean(T.log(p_y_given_x_sentence)
                               [T.arange(x.shape[0]), y_sentence])
        sentence_gradients = T.grad(sentence_nll, self.params)
        sentence_updates = OrderedDict((p, p - lr*g)
                                       # end-snippet-5
                                       for p, g in
                                       zip(self.params, sentence_gradients))

        # theano functions to compile
        # start-snippet-6
        self.classify = theano.function(inputs=[idxs], outputs=y_pred)
        self.sentence_train = theano.function(inputs=[idxs, y_sentence, lr],
                                              outputs=sentence_nll,
                                              updates=sentence_updates)
        # end-snippet-6 start-snippet-7
        self.normalize = theano.function(inputs=[],
                                         updates={self.emb:
                                                  self.emb /
                                                  T.sqrt((self.emb**2)
                                                         .sum(axis=1))
                                                  .dimshuffle(0, 'x')})
        # end-snippet-7
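        # self.normalize rescales each embedding row to unit L2 norm, keeping
        # the word embeddings on the unit sphere; train() below calls it
        # right after every gradient step.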

    def train(self, x, y, window_size, learning_rate):

        cwords = contextwin(x, window_size)
        # cast to int32: the compiled functions take idxs as a T.imatrix
        words = map(lambda cw: numpy.asarray(cw).astype('int32'), cwords)
        labels = y

        self.sentence_train(words, labels, learning_rate)
        self.normalize()

    def save(self, folder):
        for param in self.params:
            numpy.save(os.path.join(folder, param.name + '.npy'),
                       param.get_value())

    def load(self, folder):
        for param in self.params:
            param.set_value(numpy.load(os.path.join(folder,
                                                    param.name + '.npy')))


def main(param=None):
    if not param:
        param = {
            'fold': 3,  # 5 folds 0,1,2,3,4
            'data': 'atis',
            'lr': 0.0970806646812754,
            'verbose': 1,
            'decay': True,  # decay on the learning rate if improvement stops
            'win': 7,  # number of words in the context window
            'nhidden': 200,  # number of hidden units
            'seed': 345,
            'emb_dimension': 50,  # dimension of word embedding
            'nepochs': 60,  # 60 is recommended
            'savemodel': False}
    print param

    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder):
        os.mkdir(folder)

    # load the dataset
    train_set, valid_set, test_set, dic = atisfold(param['fold'])

    idx2label = dict((k, v) for v, k in dic['labels2idx'].iteritems())
    idx2word = dict((k, v) for v, k in dic['words2idx'].iteritems())

    train_lex, train_ne, train_y = train_set
    valid_lex, valid_ne, valid_y = valid_set
    test_lex, test_ne, test_y = test_set

    vocsize = len(set(reduce(lambda x, y: list(x) + list(y),
                             train_lex + valid_lex + test_lex)))
    nclasses = len(set(reduce(lambda x, y: list(x) + list(y),
                              train_y + test_y + valid_y)))
    nsentences = len(train_lex)

    groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
    words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]
    groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
    words_test = [map(lambda x: idx2word[x], w) for w in test_lex]

    # instantiate the model
    numpy.random.seed(param['seed'])
    random.seed(param['seed'])

    rnn = RNNSLU(nh=param['nhidden'],
                 nc=nclasses,
                 ne=vocsize,
                 de=param['emb_dimension'],
                 cs=param['win'])

    # train with early stopping on the validation set
    best_f1 = -numpy.inf
    param['clr'] = param['lr']
    for e in xrange(param['nepochs']):

        # shuffle
        shuffle([train_lex, train_ne, train_y], param['seed'])

        param['ce'] = e
        tic = time.time()

        for i, (x, y) in enumerate(zip(train_lex, train_y)):
            rnn.train(x, y, param['win'], param['clr'])
            print '[learning] epoch %i >> %2.2f%%' % (
                e, (i + 1) * 100. / nsentences),
            print 'completed in %.2f (sec) <<\r' % (time.time() - tic),
            sys.stdout.flush()

        # evaluation // back into the real world: idx -> words
        predictions_test = [map(lambda x: idx2label[x],
                                rnn.classify(numpy.asarray(
                                contextwin(x, param['win'])).astype('int32')))
                            for x in test_lex]
        predictions_valid = [map(lambda x: idx2label[x],
                                 rnn.classify(numpy.asarray(
                                 contextwin(x, param['win'])).astype('int32')))
                             for x in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test,
                             groundtruth_test,
                             words_test,
                             folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid,
                              groundtruth_valid,
                              words_valid,
                              folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:

            if param['savemodel']:
                rnn.save(folder)

            best_rnn = copy.deepcopy(rnn)
            best_f1 = res_valid['f1']

            if param['verbose']:
                print('NEW BEST: epoch', e,
                      'valid F1', res_valid['f1'],
                      'best test F1', res_test['f1'])

            param['vf1'], param['tf1'] = res_valid['f1'], res_test['f1']
            param['vp'], param['tp'] = res_valid['p'], res_test['p']
            param['vr'], param['tr'] = res_valid['r'], res_test['r']
            param['be'] = e

            subprocess.call(['mv', folder + '/current.test.txt',
                             folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt',
                             folder + '/best.valid.txt'])
        else:
            if param['verbose']:
                print ''

        # learning rate decay if no improvement in 10 epochs
        if param['decay'] and abs(param['be'] - param['ce']) >= 10:
            param['clr'] *= 0.5
            rnn = best_rnn

        if param['clr'] < 1e-5:
            break

    print('BEST RESULT: epoch', param['be'],
          'valid F1', param['vf1'],
          'best test F1', param['tf1'],
          'with the model', folder)


if __name__ == '__main__':
    main()
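
# Usage sketch (added for illustration, not part of the commit): with the
# ATIS fold pickles (atis.fold<N>.pkl.gz) available under ./data, or a
# directory pointed to by $ATISDATA,
#
#   python rnnslu.py
#
# trains for up to 60 epochs with early stopping and writes the best
# conlleval outputs to ./rnnslu/best.valid.txt and ./rnnslu/best.test.txt.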
