Commit 5b2239c

Merge pull request lisa-lab#71 from nouiz/lstm2
Update and speed up the LSTM code
2 parents: c4e42a2 + 6e38783

3 files changed: +84 −52

.travis.yml

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ language: c
 # command to install dependencies
 before_install:
 #zlib1g-dev is needed to allow PIL to uncompress the dataset.
+- sudo apt-get update
 - sudo apt-get install -qq libatlas3gf-base libatlas-dev zlib1g-dev zip unzip zlibc libzip-dev libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev python-numpy python-scipy python-pip python-nose python-yaml pyflakes python-imaging

 install:

code/imdb.py

Lines changed: 27 additions & 4 deletions
@@ -3,7 +3,6 @@
 import os

 import numpy
-
 import theano

@@ -16,6 +15,7 @@ def prepare_data(seqs, labels, maxlen=None):
     if maxlen is set, we will cut all sequences to this maximum
     length.

+    This swaps the axes!
     """
     # x: a list of sentences
     lengths = [len(s) for s in seqs]
@@ -40,7 +40,7 @@ def prepare_data(seqs, labels, maxlen=None):
         maxlen = numpy.max(lengths)

     x = numpy.zeros((maxlen, n_samples)).astype('int64')
-    x_mask = numpy.zeros((maxlen, n_samples)).astype('float32')
+    x_mask = numpy.zeros((maxlen, n_samples)).astype(theano.config.floatX)
     for idx, s in enumerate(seqs):
         x[:lengths[idx], idx] = s
         x_mask[:lengths[idx], idx] = 1.
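
For orientation, here is a minimal, self-contained sketch of what this padding loop produces, using two made-up toy sequences ('float32' stands in for theano.config.floatX):

    import numpy

    seqs = [[3, 7, 2], [5, 1]]        # two toy sentences as word indices
    maxlen, n_samples = 3, 2
    x = numpy.zeros((maxlen, n_samples)).astype('int64')
    x_mask = numpy.zeros((maxlen, n_samples)).astype('float32')
    for idx, s in enumerate(seqs):
        x[:len(s), idx] = s           # one column per sample; time is axis 0
        x_mask[:len(s), idx] = 1.     # 1. marks real timesteps, 0. marks padding
    # x[:, 1] == [5, 1, 0] and x_mask[:, 1] == [1., 1., 0.]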
@@ -74,8 +74,9 @@ def get_dataset_file(dataset, default_dataset, origin):
     return dataset


-def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None):
-    ''' Loads the dataset
+def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None,
+              sort_by_len=True):
+    '''Loads the dataset

     :type path: String
     :param path: The path to the dataset (here IMDB)
@@ -87,6 +88,12 @@ def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None):
         the validation set.
     :type maxlen: None or positive int
     :param maxlen: the max sequence length we use in the train/valid set.
+    :type sort_by_len: bool
+    :param sort_by_len: Sort by the sequence length for the train,
+        valid and test set. This allows faster execution as it causes
+        less padding per minibatch. Another mechanism must be used to
+        shuffle the train set at each epoch.
+
     '''

    #############
@@ -140,6 +147,22 @@ def remove_unk(x):
     valid_set_x = remove_unk(valid_set_x)
     test_set_x = remove_unk(test_set_x)

+    def len_argsort(seq):
+        return sorted(range(len(seq)), key=lambda x: len(seq[x]))
+
+    if sort_by_len:
+        sorted_index = len_argsort(test_set_x)
+        test_set_x = [test_set_x[i] for i in sorted_index]
+        test_set_y = [test_set_y[i] for i in sorted_index]
+
+        sorted_index = len_argsort(valid_set_x)
+        valid_set_x = [valid_set_x[i] for i in sorted_index]
+        valid_set_y = [valid_set_y[i] for i in sorted_index]
+
+        sorted_index = len_argsort(train_set_x)
+        train_set_x = [train_set_x[i] for i in sorted_index]
+        train_set_y = [train_set_y[i] for i in sorted_index]
+
     train = (train_set_x, train_set_y)
     valid = (valid_set_x, valid_set_y)
     test = (test_set_x, test_set_y)
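
Why the new sort_by_len path speeds things up: minibatches drawn from a length-sorted list contain sequences of similar length, so each batch pads to a smaller maximum. A toy illustration of the effect (the lengths and batch size of 2 are made up):

    lengths = [2, 9, 3, 8]   # toy sequence lengths

    def padded_cells(lens, batch_size=2):
        # total number of cells in the padded (maxlen, n_samples) batches
        batches = [lens[i:i + batch_size] for i in range(0, len(lens), batch_size)]
        return sum(max(b) * len(b) for b in batches)

    print(padded_cells(lengths))          # 34 cells unsorted
    print(padded_cells(sorted(lengths)))  # 24 cells sorted

As the docstring notes, the train set then needs a separate shuffling mechanism each epoch; in this code the shuffled train minibatch indices serve that role.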

code/lstm.py

Lines changed: 56 additions & 48 deletions
@@ -2,14 +2,14 @@
 Build a tweet sentiment analyzer
 '''
 from collections import OrderedDict
-import copy
 import cPickle as pkl
 import random
 import sys
 import time

 import numpy
 import theano
+from theano import config
 import theano.tensor as tensor
 from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

@@ -18,6 +18,10 @@
 datasets = {'imdb': (imdb.load_data, imdb.prepare_data)}


+def numpy_floatX(data):
+    return numpy.asarray(data, dtype=config.floatX)
+
+
 def get_minibatches_idx(n, minibatch_size, shuffle=False):
     """
     Used to shuffle the dataset at each iteration.
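
numpy_floatX centralizes the cast that used to be hardcoded as 'float32' throughout the file: it returns its argument as an ndarray in whatever dtype Theano is configured with. A minimal sketch of the idea, with a plain variable standing in for theano.config.floatX:

    import numpy

    floatX = 'float32'  # stand-in here for theano.config.floatX

    def numpy_floatX(data):
        return numpy.asarray(data, dtype=floatX)

    w = numpy.random.randn(3, 4).astype(floatX)
    # the optimizers below use p.get_value() * numpy_floatX(0.) to build a
    # zero array with the same shape as the parameter and the configured dtype:
    z = w * numpy_floatX(0.)
    print(z.shape, z.dtype)   # (3, 4) float32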
@@ -86,14 +90,14 @@ def init_params(options):
     # embedding
     randn = numpy.random.rand(options['n_words'],
                               options['dim_proj'])
-    params['Wemb'] = (0.01 * randn).astype('float32')
+    params['Wemb'] = (0.01 * randn).astype(config.floatX)
     params = get_layer(options['encoder'])[0](options,
                                               params,
                                               prefix=options['encoder'])
     # classifier
     params['U'] = 0.01 * numpy.random.randn(options['dim_proj'],
-                                            options['ydim']).astype('float32')
-    params['b'] = numpy.zeros((options['ydim'],)).astype('float32')
+                                            options['ydim']).astype(config.floatX)
+    params['b'] = numpy.zeros((options['ydim'],)).astype(config.floatX)

     return params

@@ -123,7 +127,7 @@ def get_layer(name):
 def ortho_weight(ndim):
     W = numpy.random.randn(ndim, ndim)
     u, s, v = numpy.linalg.svd(W)
-    return u.astype('float32')
+    return u.astype(config.floatX)


 def param_init_lstm(options, params, prefix='lstm'):
@@ -143,7 +147,7 @@ def param_init_lstm(options, params, prefix='lstm'):
                            ortho_weight(options['dim_proj'])], axis=1)
     params[_p(prefix, 'U')] = U
     b = numpy.zeros((4 * options['dim_proj'],))
-    params[_p(prefix, 'b')] = b.astype('float32')
+    params[_p(prefix, 'b')] = b.astype(config.floatX)

     return params

@@ -159,8 +163,8 @@ def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):

     def _slice(_x, n, dim):
         if _x.ndim == 3:
-            return _x[:, :, n*dim:(n+1)*dim]
-        return _x[:, n*dim:(n+1)*dim]
+            return _x[:, :, n * dim:(n + 1) * dim]
+        return _x[:, n * dim:(n + 1) * dim]

     def _step(m_, x_, h_, c_):
         preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
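
Context for _slice: param_init_lstm concatenates the weight matrices of the four LSTM gates along axis 1 (the 4 * dim_proj bias above matches this), so a single dot product computes every gate's pre-activation at once and _slice carves out each gate's block. A small self-contained illustration with a toy dim_proj:

    import numpy

    dim_proj = 2
    # four (dim_proj, dim_proj) gate blocks packed side by side -> shape (2, 8)
    U = numpy.concatenate([numpy.full((dim_proj, dim_proj), float(g))
                           for g in range(4)], axis=1)

    def _slice(_x, n, dim):
        return _x[:, n * dim:(n + 1) * dim]

    preact = numpy.ones((3, dim_proj)).dot(U)     # one big dot product
    gate0 = _slice(preact, 0, dim_proj)           # first gate's pre-activation
    gate3 = _slice(preact, 3, dim_proj)           # last gate's pre-activation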
@@ -186,9 +190,11 @@ def _step(m_, x_, h_, c_):
     dim_proj = options['dim_proj']
     rval, updates = theano.scan(_step,
                                 sequences=[mask, state_below],
-                                outputs_info=[tensor.alloc(0., n_samples,
+                                outputs_info=[tensor.alloc(numpy_floatX(0.),
+                                                           n_samples,
                                                            dim_proj),
-                                              tensor.alloc(0., n_samples,
+                                              tensor.alloc(numpy_floatX(0.),
+                                                           n_samples,
                                                            dim_proj)],
                                 name=_p(prefix, '_layers'),
                                 n_steps=nsteps)
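
For readers new to theano.scan: outputs_info allocates the initial hidden state h and memory cell c (now in floatX rather than a hardcoded float32 zero), and scan threads them through _step once per timestep. A rough pure-numpy sketch of the equivalent loop, with a dummy _step standing in for the real LSTM update:

    import numpy

    nsteps, n_samples, dim_proj = 5, 4, 3
    mask = numpy.ones((nsteps, n_samples))
    state_below = numpy.random.randn(nsteps, n_samples, dim_proj)

    def _step(m_, x_, h_, c_):
        # dummy update; only the data flow matters here
        c = numpy.tanh(x_ + c_)
        h = m_[:, None] * c + (1. - m_)[:, None] * h_  # mask freezes finished samples
        return h, c

    h = numpy.zeros((n_samples, dim_proj))  # what outputs_info seeds for h
    c = numpy.zeros((n_samples, dim_proj))  # what outputs_info seeds for c
    hs = []
    for t in range(nsteps):
        h, c = _step(mask[t], state_below[t], h, c)
        hs.append(h)
    # rval[0] from theano.scan corresponds to numpy.stack(hs)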
@@ -229,21 +235,21 @@ def sgd(lr, tparams, grads, x, mask, y, cost):


 def adadelta(lr, tparams, grads, x, mask, y, cost):
-    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
+    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                   name='%s_grad' % k)
                     for k, p in tparams.iteritems()]
-    running_up2 = [theano.shared(p.get_value() * numpy.float32(0.),
+    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_rup2' % k)
                    for k, p in tparams.iteritems()]
-    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
+    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                     name='%s_rgrad2' % k)
                       for k, p in tparams.iteritems()]

     zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
     rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
              for rg2, g in zip(running_grads2, grads)]

-    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup+rg2up,
+    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up,
                                     name='adadelta_f_grad_shared')

     updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
@@ -254,21 +260,21 @@ def adadelta(lr, tparams, grads, x, mask, y, cost):
              for ru2, ud in zip(running_up2, updir)]
     param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

-    f_update = theano.function([lr], [], updates=ru2up+param_up,
+    f_update = theano.function([lr], [], updates=ru2up + param_up,
                                on_unused_input='ignore',
                                name='adadelta_f_update')

     return f_grad_shared, f_update


 def rmsprop(lr, tparams, grads, x, mask, y, cost):
-    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
+    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                   name='%s_grad' % k)
                     for k, p in tparams.iteritems()]
-    running_grads = [theano.shared(p.get_value() * numpy.float32(0.),
+    running_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad' % k)
                      for k, p in tparams.iteritems()]
-    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
+    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                     name='%s_rgrad2' % k)
                       for k, p in tparams.iteritems()]

@@ -281,15 +287,15 @@ def rmsprop(lr, tparams, grads, x, mask, y, cost):
                                     updates=zgup + rgup + rg2up,
                                     name='rmsprop_f_grad_shared')

-    updir = [theano.shared(p.get_value() * numpy.float32(0.),
+    updir = [theano.shared(p.get_value() * numpy_floatX(0.),
                            name='%s_updir' % k)
              for k, p in tparams.iteritems()]
     updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
                  for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
                                             running_grads2)]
     param_up = [(p, p + udn[1])
                 for p, udn in zip(tparams.values(), updir_new)]
-    f_update = theano.function([lr], [], updates=updir_new+param_up,
+    f_update = theano.function([lr], [], updates=updir_new + param_up,
                                on_unused_input='ignore',
                                name='rmsprop_f_update')

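The optimizer hunks above only swap numpy.float32(0.) for numpy_floatX(0.) when allocating the accumulator variables; the update rules themselves are untouched. For orientation, a minimal numpy sketch of the adadelta step those accumulators implement (rho=0.95 and eps=1e-6, matching the constants above):

    import numpy

    def adadelta_step(param, grad, rg2, ru2, rho=0.95, eps=1e-6):
        rg2 = rho * rg2 + (1. - rho) * grad ** 2      # running avg of squared grads
        step = -numpy.sqrt(ru2 + eps) / numpy.sqrt(rg2 + eps) * grad
        ru2 = rho * ru2 + (1. - rho) * step ** 2      # running avg of squared steps
        return param + step, rg2, ru2

    p = numpy.zeros(3)
    rg2 = numpy.zeros(3)
    ru2 = numpy.zeros(3)
    p, rg2, ru2 = adadelta_step(p, numpy.array([0.1, -0.2, 0.3]), rg2, ru2)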
@@ -300,10 +306,10 @@ def build_model(tparams, options):
     trng = RandomStreams(1234)

     # Used for dropout.
-    use_noise = theano.shared(numpy.float32(0.))
+    use_noise = theano.shared(numpy_floatX(0.))

     x = tensor.matrix('x', dtype='int64')
-    mask = tensor.matrix('mask', dtype='float32')
+    mask = tensor.matrix('mask', dtype=config.floatX)
     y = tensor.vector('y', dtype='int64')

     n_timesteps = x.shape[0]
@@ -321,7 +327,7 @@ def build_model(tparams, options):
     if options['use_dropout']:
         proj = dropout_layer(proj, use_noise, trng)

-    pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U'])+tparams['b'])
+    pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b'])

     f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
     f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred')
@@ -336,7 +342,7 @@ def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False):
     the probabilities of new examples.
     """
     n_samples = len(data[0])
-    probs = numpy.zeros((n_samples, 2)).astype('float32')
+    probs = numpy.zeros((n_samples, 2)).astype(config.floatX)

     n_done = 0

@@ -368,7 +374,7 @@ def pred_error(f_pred, prepare_data, data, iterator, verbose=False):
         preds = f_pred(x, mask)
         targets = numpy.array(data[1])[valid_index]
         valid_err += (preds == targets).sum()
-    valid_err = 1. - numpy.float32(valid_err) / len(data[0])
+    valid_err = 1. - numpy_floatX(valid_err) / len(data[0])

     return valid_err

@@ -396,6 +402,7 @@ def train_lstm(
     use_dropout=True,  # if False slightly faster, but worse test error
                        # This frequently needs a bigger model.
     reload_model="",  # Path to a saved model we want to start from.
+    test_size=-1,  # If >0, we keep only this number of test examples.
 ):

     # Model options
@@ -407,8 +414,16 @@ def train_lstm(
     print 'Loading data'
     train, valid, test = load_data(n_words=n_words, valid_portion=0.05,
                                    maxlen=maxlen)
+    if test_size > 0:
+        # The test set is sorted by length, but we want to keep a
+        # random mix of lengths, so we select a random subset of the
+        # examples.
+        idx = numpy.arange(len(test[0]))
+        random.shuffle(idx)
+        idx = idx[:test_size]
+        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])

-    ydim = numpy.max(train[1])+1
+    ydim = numpy.max(train[1]) + 1

     model_options['ydim'] = ydim

@@ -430,9 +445,9 @@ def train_lstm(
      y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)

     if decay_c > 0.:
-        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
+        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
         weight_decay = 0.
-        weight_decay += (tparams['U']**2).sum()
+        weight_decay += (tparams['U'] ** 2).sum()
         weight_decay *= decay_c
         cost += weight_decay

@@ -447,10 +462,8 @@ def train_lstm(

     print 'Optimization'

-    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size,
-                                   shuffle=True)
-    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size,
-                                  shuffle=True)
+    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
+    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

     print "%d train examples" % len(train[0])
     print "%d valid examples" % len(valid[0])
@@ -460,9 +473,9 @@ def train_lstm(
     bad_count = 0

     if validFreq == -1:
-        validFreq = len(train[0])/batch_size
+        validFreq = len(train[0]) / batch_size
     if saveFreq == -1:
-        saveFreq = len(train[0])/batch_size
+        saveFreq = len(train[0]) / batch_size

     uidx = 0  # the number of updates done
     estop = False  # early stop
@@ -482,12 +495,10 @@ def train_lstm(
             y = [train[1][t] for t in train_index]
             x = [train[0][t] for t in train_index]

-            # Get the data in numpy.ndarray format.
-            # It returns something of the shape (minibatch maxlen, n samples)
-            x, mask, y = prepare_data(x, y, maxlen=maxlen)
-            if x is None:
-                print 'Minibatch with zero sample under length ', maxlen
-                continue
+            # Get the data in numpy.ndarray format.
+            # This swaps the axes!
+            # Returns something of shape (minibatch maxlen, n samples)
+            x, mask, y = prepare_data(x, y)
             n_samples += x.shape[1]

             cost = f_grad_shared(x, mask, y)
@@ -514,7 +525,8 @@ def train_lstm(
             if numpy.mod(uidx, validFreq) == 0:
                 use_noise.set_value(0.)
                 train_err = pred_error(f_pred, prepare_data, train, kf)
-                valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
+                valid_err = pred_error(f_pred, prepare_data, valid,
+                                       kf_valid)
                 test_err = pred_error(f_pred, prepare_data, test, kf_test)

                 history_errs.append([valid_err, test_err])
@@ -553,7 +565,8 @@ def train_lstm(
         best_p = unzip(tparams)

     use_noise.set_value(0.)
-    train_err = pred_error(f_pred, prepare_data, train, kf)
+    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
+    train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted)
     valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
     test_err = pred_error(f_pred, prepare_data, test, kf_test)

@@ -570,14 +583,9 @@


 if __name__ == '__main__':
-
-    # We must have floatX=float32 for this tutorial to work correctly.
-    theano.config.floatX = "float32"
-    # The next line is the new Theano default. This is a speed up.
-    theano.config.scan.allow_gc = False
-
     # See the train_lstm function for all possible parameters and their definitions.
     train_lstm(
         #reload_model="lstm_model.npz",
         max_epochs=100,
+        test_size=500,
     )
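
One consequence of the removed __main__ lines: the script no longer forces floatX=float32 (or scan.allow_gc=False, which the deleted comment notes is now the Theano default); precision now follows the user's Theano configuration. To reproduce the old float32 behaviour one would presumably set it externally, e.g.:

    THEANO_FLAGS='floatX=float32' python lstm.py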
