Commit 5b2239c

Merge pull request lisa-lab#71 from nouiz/lstm2
Update and speed up the LSTM code
2 parents: c4e42a2 + 6e38783

3 files changed: +84 −52

.travis.yml

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ language: c
 # command to install dependencies
 before_install:
 #zlib1g-dev is needed to allow PIL to uncompress the dataset.
+- sudo apt-get update
 - sudo apt-get install -qq libatlas3gf-base libatlas-dev zlib1g-dev zip unzip zlibc libzip-dev libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev python-numpy python-scipy python-pip python-nose python-yaml pyflakes python-imaging

 install:

code/imdb.py

Lines changed: 27 additions & 4 deletions
@@ -3,7 +3,6 @@
 import os

 import numpy
-
 import theano

@@ -16,6 +15,7 @@ def prepare_data(seqs, labels, maxlen=None):
     if maxlen is set, we will cut all sequences to this maximum
     length.

+    This swaps the axes!
     """
     # x: a list of sentences
     lengths = [len(s) for s in seqs]
@@ -40,7 +40,7 @@ def prepare_data(seqs, labels, maxlen=None):
         maxlen = numpy.max(lengths)

     x = numpy.zeros((maxlen, n_samples)).astype('int64')
-    x_mask = numpy.zeros((maxlen, n_samples)).astype('float32')
+    x_mask = numpy.zeros((maxlen, n_samples)).astype(theano.config.floatX)
     for idx, s in enumerate(seqs):
         x[:lengths[idx], idx] = s
         x_mask[:lengths[idx], idx] = 1.
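
For orientation, here is a minimal, self-contained sketch of what this padding loop produces, using two made-up toy sequences ('float32' stands in for theano.config.floatX):

    import numpy

    seqs = [[3, 7, 2], [5, 1]]        # two toy sentences as word indices
    maxlen, n_samples = 3, 2
    x = numpy.zeros((maxlen, n_samples)).astype('int64')
    x_mask = numpy.zeros((maxlen, n_samples)).astype('float32')
    for idx, s in enumerate(seqs):
        x[:len(s), idx] = s           # one column per sample; time is axis 0
        x_mask[:len(s), idx] = 1.     # 1. marks real timesteps, 0. marks padding
    # x[:, 1] == [5, 1, 0] and x_mask[:, 1] == [1., 1., 0.]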
@@ -74,8 +74,9 @@ def get_dataset_file(dataset, default_dataset, origin):
     return dataset


-def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None):
-    ''' Loads the dataset
+def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None,
+              sort_by_len=True):
+    '''Loads the dataset

     :type path: String
     :param path: The path to the dataset (here IMDB)
@@ -87,6 +88,12 @@ def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None):
         the validation set.
     :type maxlen: None or positive int
     :param maxlen: the max sequence length we use in the train/valid set.
+    :type sort_by_len: bool
+    :param sort_by_len: Sort by the sequence length for the train,
+        valid and test set. This allows faster execution as it causes
+        less padding per minibatch. Another mechanism must be used to
+        shuffle the train set at each epoch.
+
     '''

    #############
@@ -140,6 +147,22 @@ def remove_unk(x):
     valid_set_x = remove_unk(valid_set_x)
     test_set_x = remove_unk(test_set_x)

+    def len_argsort(seq):
+        return sorted(range(len(seq)), key=lambda x: len(seq[x]))
+
+    if sort_by_len:
+        sorted_index = len_argsort(test_set_x)
+        test_set_x = [test_set_x[i] for i in sorted_index]
+        test_set_y = [test_set_y[i] for i in sorted_index]
+
+        sorted_index = len_argsort(valid_set_x)
+        valid_set_x = [valid_set_x[i] for i in sorted_index]
+        valid_set_y = [valid_set_y[i] for i in sorted_index]
+
+        sorted_index = len_argsort(train_set_x)
+        train_set_x = [train_set_x[i] for i in sorted_index]
+        train_set_y = [train_set_y[i] for i in sorted_index]
+
     train = (train_set_x, train_set_y)
     valid = (valid_set_x, valid_set_y)
     test = (test_set_x, test_set_y)
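
Why the new sort_by_len path speeds things up: minibatches drawn from a length-sorted list contain sequences of similar length, so each batch pads to a smaller maximum. A toy illustration of the effect (the lengths and batch size of 2 are made up):

    lengths = [2, 9, 3, 8]   # toy sequence lengths

    def padded_cells(lens, batch_size=2):
        # total number of cells in the padded (maxlen, n_samples) batches
        batches = [lens[i:i + batch_size] for i in range(0, len(lens), batch_size)]
        return sum(max(b) * len(b) for b in batches)

    print(padded_cells(lengths))          # 34 cells unsorted
    print(padded_cells(sorted(lengths)))  # 24 cells sorted

As the docstring notes, the train set then needs a separate shuffling mechanism each epoch; in this code the shuffled train minibatch indices serve that role.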

code/lstm.py

Lines changed: 56 additions & 48 deletions
@@ -2,14 +2,14 @@
 Build a tweet sentiment analyzer
 '''
 from collections import OrderedDict
-import copy
 import cPickle as pkl
 import random
 import sys
 import time

 import numpy
 import theano
+from theano import config
 import theano.tensor as tensor
 from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

@@ -18,6 +18,10 @@
 datasets = {'imdb': (imdb.load_data, imdb.prepare_data)}


+def numpy_floatX(data):
+    return numpy.asarray(data, dtype=config.floatX)
+
+
 def get_minibatches_idx(n, minibatch_size, shuffle=False):
     """
     Used to shuffle the dataset at each iteration.
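
numpy_floatX centralizes the cast that used to be hardcoded as 'float32' throughout the file: it returns its argument as an ndarray in whatever dtype Theano is configured with. A minimal sketch of the idea, with a plain variable standing in for theano.config.floatX:

    import numpy

    floatX = 'float32'  # stand-in here for theano.config.floatX

    def numpy_floatX(data):
        return numpy.asarray(data, dtype=floatX)

    w = numpy.random.randn(3, 4).astype(floatX)
    # the optimizers below use p.get_value() * numpy_floatX(0.) to build a
    # zero array with the same shape as the parameter and the configured dtype:
    z = w * numpy_floatX(0.)
    print(z.shape, z.dtype)   # (3, 4) float32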
@@ -86,14 +90,14 @@ def init_params(options):
     # embedding
     randn = numpy.random.rand(options['n_words'],
                               options['dim_proj'])
-    params['Wemb'] = (0.01 * randn).astype('float32')
+    params['Wemb'] = (0.01 * randn).astype(config.floatX)
     params = get_layer(options['encoder'])[0](options,
                                               params,
                                               prefix=options['encoder'])
     # classifier
     params['U'] = 0.01 * numpy.random.randn(options['dim_proj'],
-                                            options['ydim']).astype('float32')
-    params['b'] = numpy.zeros((options['ydim'],)).astype('float32')
+                                            options['ydim']).astype(config.floatX)
+    params['b'] = numpy.zeros((options['ydim'],)).astype(config.floatX)

     return params

@@ -123,7 +127,7 @@ def get_layer(name):
 def ortho_weight(ndim):
     W = numpy.random.randn(ndim, ndim)
     u, s, v = numpy.linalg.svd(W)
-    return u.astype('float32')
+    return u.astype(config.floatX)


 def param_init_lstm(options, params, prefix='lstm'):
@@ -143,7 +147,7 @@ def param_init_lstm(options, params, prefix='lstm'):
                            ortho_weight(options['dim_proj'])], axis=1)
     params[_p(prefix, 'U')] = U
     b = numpy.zeros((4 * options['dim_proj'],))
-    params[_p(prefix, 'b')] = b.astype('float32')
+    params[_p(prefix, 'b')] = b.astype(config.floatX)

     return params

@@ -159,8 +163,8 @@ def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):

     def _slice(_x, n, dim):
         if _x.ndim == 3:
-            return _x[:, :, n*dim:(n+1)*dim]
-        return _x[:, n*dim:(n+1)*dim]
+            return _x[:, :, n * dim:(n + 1) * dim]
+        return _x[:, n * dim:(n + 1) * dim]

     def _step(m_, x_, h_, c_):
         preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
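
Context for _slice: param_init_lstm concatenates the weight matrices of the four LSTM gates along axis 1 (the 4 * dim_proj bias above matches this), so a single dot product computes every gate's pre-activation at once and _slice carves out each gate's block. A small self-contained illustration with a toy dim_proj:

    import numpy

    dim_proj = 2
    # four (dim_proj, dim_proj) gate blocks packed side by side -> shape (2, 8)
    U = numpy.concatenate([numpy.full((dim_proj, dim_proj), float(g))
                           for g in range(4)], axis=1)

    def _slice(_x, n, dim):
        return _x[:, n * dim:(n + 1) * dim]

    preact = numpy.ones((3, dim_proj)).dot(U)     # one big dot product
    gate0 = _slice(preact, 0, dim_proj)           # first gate's pre-activation
    gate3 = _slice(preact, 3, dim_proj)           # last gate's pre-activation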
@@ -186,9 +190,11 @@ def _step(m_, x_, h_, c_):
     dim_proj = options['dim_proj']
     rval, updates = theano.scan(_step,
                                 sequences=[mask, state_below],
-                                outputs_info=[tensor.alloc(0., n_samples,
+                                outputs_info=[tensor.alloc(numpy_floatX(0.),
+                                                           n_samples,
                                                            dim_proj),
-                                              tensor.alloc(0., n_samples,
+                                              tensor.alloc(numpy_floatX(0.),
+                                                           n_samples,
                                                            dim_proj)],
                                 name=_p(prefix, '_layers'),
                                 n_steps=nsteps)
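
For readers new to theano.scan: outputs_info allocates the initial hidden state h and memory cell c (now in floatX rather than a hardcoded float32 zero), and scan threads them through _step once per timestep. A rough pure-numpy sketch of the equivalent loop, with a dummy _step standing in for the real LSTM update:

    import numpy

    nsteps, n_samples, dim_proj = 5, 4, 3
    mask = numpy.ones((nsteps, n_samples))
    state_below = numpy.random.randn(nsteps, n_samples, dim_proj)

    def _step(m_, x_, h_, c_):
        # dummy update; only the data flow matters here
        c = numpy.tanh(x_ + c_)
        h = m_[:, None] * c + (1. - m_)[:, None] * h_  # mask freezes finished samples
        return h, c

    h = numpy.zeros((n_samples, dim_proj))  # what outputs_info seeds for h
    c = numpy.zeros((n_samples, dim_proj))  # what outputs_info seeds for c
    hs = []
    for t in range(nsteps):
        h, c = _step(mask[t], state_below[t], h, c)
        hs.append(h)
    # rval[0] from theano.scan corresponds to numpy.stack(hs)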
@@ -229,21 +235,21 @@ def sgd(lr, tparams, grads, x, mask, y, cost):


 def adadelta(lr, tparams, grads, x, mask, y, cost):
-    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
+    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                   name='%s_grad' % k)
                     for k, p in tparams.iteritems()]
-    running_up2 = [theano.shared(p.get_value() * numpy.float32(0.),
+    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_rup2' % k)
                    for k, p in tparams.iteritems()]
-    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
+    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                     name='%s_rgrad2' % k)
                       for k, p in tparams.iteritems()]

     zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
     rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
              for rg2, g in zip(running_grads2, grads)]

-    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup+rg2up,
+    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up,
                                     name='adadelta_f_grad_shared')

     updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
@@ -254,21 +260,21 @@ def adadelta(lr, tparams, grads, x, mask, y, cost):
              for ru2, ud in zip(running_up2, updir)]
     param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

-    f_update = theano.function([lr], [], updates=ru2up+param_up,
+    f_update = theano.function([lr], [], updates=ru2up + param_up,
                                on_unused_input='ignore',
                                name='adadelta_f_update')

     return f_grad_shared, f_update


 def rmsprop(lr, tparams, grads, x, mask, y, cost):
-    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
+    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                   name='%s_grad' % k)
                     for k, p in tparams.iteritems()]
-    running_grads = [theano.shared(p.get_value() * numpy.float32(0.),
+    running_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad' % k)
                      for k, p in tparams.iteritems()]
-    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
+    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                     name='%s_rgrad2' % k)
                       for k, p in tparams.iteritems()]

@@ -281,15 +287,15 @@ def rmsprop(lr, tparams, grads, x, mask, y, cost):
                                     updates=zgup + rgup + rg2up,
                                     name='rmsprop_f_grad_shared')

-    updir = [theano.shared(p.get_value() * numpy.float32(0.),
+    updir = [theano.shared(p.get_value() * numpy_floatX(0.),
                            name='%s_updir' % k)
              for k, p in tparams.iteritems()]
     updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
                  for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
                                             running_grads2)]
     param_up = [(p, p + udn[1])
                 for p, udn in zip(tparams.values(), updir_new)]
-    f_update = theano.function([lr], [], updates=updir_new+param_up,
+    f_update = theano.function([lr], [], updates=updir_new + param_up,
                                on_unused_input='ignore',
                                name='rmsprop_f_update')

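The optimizer hunks above only swap numpy.float32(0.) for numpy_floatX(0.) when allocating the accumulator variables; the update rules themselves are untouched. For orientation, a minimal numpy sketch of the adadelta step those accumulators implement (rho=0.95 and eps=1e-6, matching the constants above):

    import numpy

    def adadelta_step(param, grad, rg2, ru2, rho=0.95, eps=1e-6):
        rg2 = rho * rg2 + (1. - rho) * grad ** 2      # running avg of squared grads
        step = -numpy.sqrt(ru2 + eps) / numpy.sqrt(rg2 + eps) * grad
        ru2 = rho * ru2 + (1. - rho) * step ** 2      # running avg of squared steps
        return param + step, rg2, ru2

    p = numpy.zeros(3)
    rg2 = numpy.zeros(3)
    ru2 = numpy.zeros(3)
    p, rg2, ru2 = adadelta_step(p, numpy.array([0.1, -0.2, 0.3]), rg2, ru2)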
@@ -300,10 +306,10 @@ def build_model(tparams, options):
     trng = RandomStreams(1234)

     # Used for dropout.
-    use_noise = theano.shared(numpy.float32(0.))
+    use_noise = theano.shared(numpy_floatX(0.))

     x = tensor.matrix('x', dtype='int64')
-    mask = tensor.matrix('mask', dtype='float32')
+    mask = tensor.matrix('mask', dtype=config.floatX)
     y = tensor.vector('y', dtype='int64')

     n_timesteps = x.shape[0]
@@ -321,7 +327,7 @@ def build_model(tparams, options):
     if options['use_dropout']:
         proj = dropout_layer(proj, use_noise, trng)

-    pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U'])+tparams['b'])
+    pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b'])

     f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
     f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred')
@@ -336,7 +342,7 @@ def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False):
     the probabilities of new examples.
     """
     n_samples = len(data[0])
-    probs = numpy.zeros((n_samples, 2)).astype('float32')
+    probs = numpy.zeros((n_samples, 2)).astype(config.floatX)

     n_done = 0

@@ -368,7 +374,7 @@ def pred_error(f_pred, prepare_data, data, iterator, verbose=False):
         preds = f_pred(x, mask)
         targets = numpy.array(data[1])[valid_index]
         valid_err += (preds == targets).sum()
-    valid_err = 1. - numpy.float32(valid_err) / len(data[0])
+    valid_err = 1. - numpy_floatX(valid_err) / len(data[0])

     return valid_err

@@ -396,6 +402,7 @@ def train_lstm(
     use_dropout=True,  # if False slightly faster, but worse test error
                        # This frequently needs a bigger model.
     reload_model="",  # Path to a saved model we want to start from.
+    test_size=-1,  # If >0, we keep only this number of test examples.
 ):

     # Model options
@@ -407,8 +414,16 @@ def train_lstm(
     print 'Loading data'
     train, valid, test = load_data(n_words=n_words, valid_portion=0.05,
                                    maxlen=maxlen)
+    if test_size > 0:
+        # The test set is sorted by length, but we want to keep a
+        # random mix of lengths, so we select a random subset of the
+        # examples.
+        idx = numpy.arange(len(test[0]))
+        random.shuffle(idx)
+        idx = idx[:test_size]
+        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])

-    ydim = numpy.max(train[1])+1
+    ydim = numpy.max(train[1]) + 1

     model_options['ydim'] = ydim

@@ -430,9 +445,9 @@ def train_lstm(
      y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)

     if decay_c > 0.:
-        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
+        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
         weight_decay = 0.
-        weight_decay += (tparams['U']**2).sum()
+        weight_decay += (tparams['U'] ** 2).sum()
         weight_decay *= decay_c
         cost += weight_decay

@@ -447,10 +462,8 @@ def train_lstm(

     print 'Optimization'

-    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size,
-                                   shuffle=True)
-    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size,
-                                  shuffle=True)
+    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
+    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

     print "%d train examples" % len(train[0])
     print "%d valid examples" % len(valid[0])
@@ -460,9 +473,9 @@ def train_lstm(
     bad_count = 0

     if validFreq == -1:
-        validFreq = len(train[0])/batch_size
+        validFreq = len(train[0]) / batch_size
     if saveFreq == -1:
-        saveFreq = len(train[0])/batch_size
+        saveFreq = len(train[0]) / batch_size

     uidx = 0  # the number of updates done
     estop = False  # early stop
@@ -482,12 +495,10 @@ def train_lstm(
             y = [train[1][t] for t in train_index]
             x = [train[0][t] for t in train_index]

-            # Get the data in numpy.ndarray format.
-            # It returns something of the shape (minibatch maxlen, n samples)
-            x, mask, y = prepare_data(x, y, maxlen=maxlen)
-            if x is None:
-                print 'Minibatch with zero sample under length ', maxlen
-                continue
+            # Get the data in numpy.ndarray format.
+            # This swaps the axes!
+            # Returns something of shape (minibatch maxlen, n samples)
+            x, mask, y = prepare_data(x, y)
             n_samples += x.shape[1]

             cost = f_grad_shared(x, mask, y)
@@ -514,7 +525,8 @@ def train_lstm(
             if numpy.mod(uidx, validFreq) == 0:
                 use_noise.set_value(0.)
                 train_err = pred_error(f_pred, prepare_data, train, kf)
-                valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
+                valid_err = pred_error(f_pred, prepare_data, valid,
+                                       kf_valid)
                 test_err = pred_error(f_pred, prepare_data, test, kf_test)

                 history_errs.append([valid_err, test_err])
@@ -553,7 +565,8 @@ def train_lstm(
         best_p = unzip(tparams)

     use_noise.set_value(0.)
-    train_err = pred_error(f_pred, prepare_data, train, kf)
+    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
+    train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted)
     valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
     test_err = pred_error(f_pred, prepare_data, test, kf_test)

@@ -570,14 +583,9 @@


 if __name__ == '__main__':
-
-    # We must have floatX=float32 for this tutorial to work correctly.
-    theano.config.floatX = "float32"
-    # The next line is the new Theano default. This is a speed up.
-    theano.config.scan.allow_gc = False
-
     # See the train_lstm function for all possible parameters and their definitions.
     train_lstm(
         #reload_model="lstm_model.npz",
         max_epochs=100,
+        test_size=500,
     )
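
One consequence of the removed __main__ lines: the script no longer forces floatX=float32 (or scan.allow_gc=False, which the deleted comment notes is now the Theano default); precision now follows the user's Theano configuration. To reproduce the old float32 behaviour one would presumably set it externally, e.g.:

    THEANO_FLAGS='floatX=float32' python lstm.py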
