@@ -2,14 +2,14 @@
 Build a tweet sentiment analyzer
 '''
 from collections import OrderedDict
-import copy
 import cPickle as pkl
 import random
 import sys
 import time
 
 import numpy
 import theano
+from theano import config
 import theano.tensor as tensor
 from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
 
@@ -18,6 +18,10 @@
 datasets = {'imdb': (imdb.load_data, imdb.prepare_data)}
 
 
+def numpy_floatX(data):
+    return numpy.asarray(data, dtype=config.floatX)
+
+
 def get_minibatches_idx(n, minibatch_size, shuffle=False):
     """
     Used to shuffle the dataset at each iteration.
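The numpy_floatX helper added above is the heart of this change: it casts Python scalars to whatever precision Theano is configured with, so the script now works under both floatX=float32 and floatX=float64 instead of assuming float32 everywhere. A minimal sketch of its behavior:

    import numpy
    from theano import config

    def numpy_floatX(data):
        return numpy.asarray(data, dtype=config.floatX)

    print numpy_floatX(0.).dtype  # matches config.floatX, e.g. float32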
@@ -86,14 +90,14 @@ def init_params(options):
     # embedding
     randn = numpy.random.rand(options['n_words'],
                               options['dim_proj'])
-    params['Wemb'] = (0.01 * randn).astype('float32')
+    params['Wemb'] = (0.01 * randn).astype(config.floatX)
     params = get_layer(options['encoder'])[0](options,
                                               params,
                                               prefix=options['encoder'])
     # classifier
     params['U'] = 0.01 * numpy.random.randn(options['dim_proj'],
-                                            options['ydim']).astype('float32')
-    params['b'] = numpy.zeros((options['ydim'],)).astype('float32')
+                                            options['ydim']).astype(config.floatX)
+    params['b'] = numpy.zeros((options['ydim'],)).astype(config.floatX)
 
     return params
 
@@ -123,7 +127,7 @@ def get_layer(name):
 def ortho_weight(ndim):
     W = numpy.random.randn(ndim, ndim)
     u, s, v = numpy.linalg.svd(W)
-    return u.astype('float32')
+    return u.astype(config.floatX)
 
 
 def param_init_lstm(options, params, prefix='lstm'):
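For context, ortho_weight (unchanged apart from the cast) draws a random square matrix and keeps the orthonormal factor u of its SVD; orthogonal initialization is a standard way to keep recurrent weights well-conditioned. A quick numpy check, independent of the patch:

    import numpy

    W = numpy.random.randn(5, 5)
    u, s, v = numpy.linalg.svd(W)
    # u is orthonormal, so u.T dot u is (numerically) the identity
    print numpy.allclose(numpy.dot(u.T, u), numpy.eye(5))  # True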
@@ -143,7 +147,7 @@ def param_init_lstm(options, params, prefix='lstm'):
                            ortho_weight(options['dim_proj'])], axis=1)
     params[_p(prefix, 'U')] = U
     b = numpy.zeros((4 * options['dim_proj'],))
-    params[_p(prefix, 'b')] = b.astype('float32')
+    params[_p(prefix, 'b')] = b.astype(config.floatX)
 
     return params
 
@@ -159,8 +163,8 @@ def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
 
     def _slice(_x, n, dim):
         if _x.ndim == 3:
-            return _x[:, :, n*dim:(n+1)*dim]
-        return _x[:, n*dim:(n+1)*dim]
+            return _x[:, :, n * dim:(n + 1) * dim]
+        return _x[:, n * dim:(n + 1) * dim]
 
     def _step(m_, x_, h_, c_):
         preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
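The _slice helper exists because the LSTM keeps the weights of its four gates (input, forget, output, cell candidate) concatenated along the last axis; slice n picks out gate n's pre-activation. The same indexing on a plain numpy vector, as a sketch:

    import numpy

    dim = 3
    preact = numpy.arange(4 * dim)  # stand-in for the 4 concatenated gates
    for n in range(4):
        print preact[n * dim:(n + 1) * dim]  # one gate's block at a time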
@@ -186,9 +190,11 @@ def _step(m_, x_, h_, c_):
     dim_proj = options['dim_proj']
     rval, updates = theano.scan(_step,
                                 sequences=[mask, state_below],
-                                outputs_info=[tensor.alloc(0., n_samples,
+                                outputs_info=[tensor.alloc(numpy_floatX(0.),
+                                                           n_samples,
                                                            dim_proj),
-                                              tensor.alloc(0., n_samples,
+                                              tensor.alloc(numpy_floatX(0.),
+                                                           n_samples,
                                                            dim_proj)],
                                 name=_p(prefix, '_layers'),
                                 n_steps=nsteps)
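Replacing tensor.alloc(0., ...) with tensor.alloc(numpy_floatX(0.), ...) is not cosmetic: a bare Python 0. is treated as float64, so under floatX=float32 the initial hidden and cell states of the scan would be upcast and clash with the rest of the float32 graph. For example:

    import numpy
    import theano.tensor as tensor

    print tensor.alloc(0., 2, 3).dtype                 # float64
    print tensor.alloc(numpy.float32(0.), 2, 3).dtype  # float32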
@@ -229,21 +235,21 @@ def sgd(lr, tparams, grads, x, mask, y, cost):
 
 
 def adadelta(lr, tparams, grads, x, mask, y, cost):
-    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
+    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                   name='%s_grad' % k)
                     for k, p in tparams.iteritems()]
-    running_up2 = [theano.shared(p.get_value() * numpy.float32(0.),
+    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_rup2' % k)
                    for k, p in tparams.iteritems()]
-    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
+    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                     name='%s_rgrad2' % k)
                       for k, p in tparams.iteritems()]
 
     zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
     rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
              for rg2, g in zip(running_grads2, grads)]
 
-    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup+rg2up,
+    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up,
                                     name='adadelta_f_grad_shared')
 
     updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
@@ -254,21 +260,21 @@ def adadelta(lr, tparams, grads, x, mask, y, cost):
              for ru2, ud in zip(running_up2, updir)]
     param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]
 
-    f_update = theano.function([lr], [], updates=ru2up+param_up,
+    f_update = theano.function([lr], [], updates=ru2up + param_up,
                                on_unused_input='ignore',
                                name='adadelta_f_update')
 
     return f_grad_shared, f_update
 
 
 def rmsprop(lr, tparams, grads, x, mask, y, cost):
-    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
+    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                   name='%s_grad' % k)
                     for k, p in tparams.iteritems()]
-    running_grads = [theano.shared(p.get_value() * numpy.float32(0.),
+    running_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad' % k)
                      for k, p in tparams.iteritems()]
-    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
+    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                     name='%s_rgrad2' % k)
                       for k, p in tparams.iteritems()]
 
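In both optimizers, p.get_value() * numpy_floatX(0.) is an idiom for "a zero array with the same shape and dtype as p", used to seed the accumulator shared variables; with numpy.float32(0.) it only produced the right dtype when floatX happened to be float32. In plain numpy:

    import numpy
    from theano import config

    p = numpy.ones((2, 4), dtype=config.floatX)
    zeros_like_p = p * numpy.asarray(0., dtype=config.floatX)
    print zeros_like_p.shape, zeros_like_p.dtype  # (2, 4), config.floatX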
@@ -281,15 +287,15 @@ def rmsprop(lr, tparams, grads, x, mask, y, cost):
                                     updates=zgup + rgup + rg2up,
                                     name='rmsprop_f_grad_shared')
 
-    updir = [theano.shared(p.get_value() * numpy.float32(0.),
+    updir = [theano.shared(p.get_value() * numpy_floatX(0.),
                            name='%s_updir' % k)
             for k, p in tparams.iteritems()]
     updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
                  for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
                                             running_grads2)]
     param_up = [(p, p + udn[1])
                 for p, udn in zip(tparams.values(), updir_new)]
-    f_update = theano.function([lr], [], updates=updir_new+param_up,
+    f_update = theano.function([lr], [], updates=updir_new + param_up,
                                on_unused_input='ignore',
                                name='rmsprop_f_update')
 
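As an aside on the untouched update rule: rg and rg2 track running means of the gradient and its square, so rg2 - rg ** 2 is a running estimate of the gradient's variance, and the 1e-4 terms bound the step size and keep the denominator away from zero. One scalar step, with made-up values:

    g, rg, rg2, ud = 0.5, 0.4, 0.3, 0.0
    rg = 0.95 * rg + 0.05 * g           # running mean of g
    rg2 = 0.95 * rg2 + 0.05 * g ** 2    # running mean of g ** 2
    ud = 0.9 * ud - 1e-4 * g / (rg2 - rg ** 2 + 1e-4) ** 0.5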
@@ -300,10 +306,10 @@ def build_model(tparams, options):
     trng = RandomStreams(1234)
 
     # Used for dropout.
-    use_noise = theano.shared(numpy.float32(0.))
+    use_noise = theano.shared(numpy_floatX(0.))
 
     x = tensor.matrix('x', dtype='int64')
-    mask = tensor.matrix('mask', dtype='float32')
+    mask = tensor.matrix('mask', dtype=config.floatX)
     y = tensor.vector('y', dtype='int64')
 
     n_timesteps = x.shape[0]
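The mask matrix has to be declared with dtype config.floatX because it is multiplied into the floating-point hidden states, and the arrays fed to a compiled function must match the declared dtype exactly. Building a compatible all-ones mask on the numpy side would look like:

    import numpy
    from theano import config

    n_timesteps, n_samples = 10, 16
    mask = numpy.ones((n_timesteps, n_samples)).astype(config.floatX)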
@@ -321,7 +327,7 @@ def build_model(tparams, options):
     if options['use_dropout']:
         proj = dropout_layer(proj, use_noise, trng)
 
-    pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U'])+tparams['b'])
+    pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b'])
 
     f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
     f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred')
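Once compiled, f_pred_prob returns the per-class softmax probabilities and f_pred the argmax label. A hypothetical call, assuming x_data and mask_data came out of prepare_data:

    probs = f_pred_prob(x_data, mask_data)   # shape (n_samples, ydim)
    labels = f_pred(x_data, mask_data)       # shape (n_samples,)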
@@ -336,7 +342,7 @@ def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False):
     the probabilities of new examples.
     """
     n_samples = len(data[0])
-    probs = numpy.zeros((n_samples, 2)).astype('float32')
+    probs = numpy.zeros((n_samples, 2)).astype(config.floatX)
 
     n_done = 0
 
@@ -368,7 +374,7 @@ def pred_error(f_pred, prepare_data, data, iterator, verbose=False):
         preds = f_pred(x, mask)
         targets = numpy.array(data[1])[valid_index]
         valid_err += (preds == targets).sum()
-    valid_err = 1. - numpy.float32(valid_err) / len(data[0])
+    valid_err = 1. - numpy_floatX(valid_err) / len(data[0])
 
     return valid_err
 
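The cast here (now numpy_floatX instead of numpy.float32) is load-bearing under Python 2: valid_err is an integer count, and integer / integer truncates, so without it the reported error rate would collapse to 0 or 1. Compare:

    valid_err, n = 7, 10
    print 1. - valid_err / n         # 1.0 under Python 2, since 7 / 10 == 0
    print 1. - float(valid_err) / n  # 0.3, the intended error rate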
@@ -396,6 +402,7 @@ def train_lstm(
     use_dropout=True,  # if False slightly faster, but worse test error
                        # This frequently needs a bigger model.
     reload_model="",  # Path to a saved model we want to start from.
+    test_size=-1,  # If >0, we keep only this number of test examples.
 ):
 
     # Model options
@@ -407,8 +414,16 @@ def train_lstm(
     print 'Loading data'
     train, valid, test = load_data(n_words=n_words, valid_portion=0.05,
                                    maxlen=maxlen)
+    if test_size > 0:
+        # The test set is sorted by sequence length, but we want to
+        # keep a random mix of lengths, so we take a random subset of
+        # the examples.
+        idx = numpy.arange(len(test[0]))
+        random.shuffle(idx)
+        idx = idx[:test_size]
+        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])
 
-    ydim = numpy.max(train[1])+1
+    ydim = numpy.max(train[1]) + 1
 
     model_options['ydim'] = ydim
 
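The new block shuffles index positions rather than slicing directly because load_data returns the test set sorted by sequence length, so a plain test[:test_size] would keep only the shortest reviews. An equivalent, slightly more compact numpy idiom (a sketch, not what the patch uses):

    import numpy

    idx = numpy.random.permutation(len(test[0]))[:test_size]
    test = ([test[0][n] for n in idx], [test[1][n] for n in idx])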
@@ -430,9 +445,9 @@
      y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)
 
     if decay_c > 0.:
-        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
+        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
         weight_decay = 0.
-        weight_decay += (tparams['U']**2).sum()
+        weight_decay += (tparams['U'] ** 2).sum()
         weight_decay *= decay_c
         cost += weight_decay
 
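The decay_c branch adds a standard L2 penalty over the classifier weights U only (the embeddings and LSTM weights are left unregularized), i.e. the extra cost term is decay_c * sum of U_ij ** 2. The numpy equivalent of that term, for illustration:

    # with U = tparams['U'].get_value()
    weight_decay = decay_c * (U ** 2).sum()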
@@ -447,10 +462,8 @@
 
     print 'Optimization'
 
-    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size,
-                                   shuffle=True)
-    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size,
-                                  shuffle=True)
+    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
+    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)
 
     print "%d train examples" % len(train[0])
     print "%d valid examples" % len(valid[0])
@@ -460,9 +473,9 @@
     bad_count = 0
 
     if validFreq == -1:
-        validFreq = len(train[0])/batch_size
+        validFreq = len(train[0]) / batch_size
     if saveFreq == -1:
-        saveFreq = len(train[0])/batch_size
+        saveFreq = len(train[0]) / batch_size
 
     uidx = 0  # the number of updates done
     estop = False  # early stop
@@ -482,12 +495,10 @@
             y = [train[1][t] for t in train_index]
             x = [train[0][t] for t in train_index]
 
-            # Get the data in numpy.ndarray formet.
-            # It return something of the shape (minibatch maxlen, n samples)
-            x, mask, y = prepare_data(x, y, maxlen=maxlen)
-            if x is None:
-                print 'Minibatch with zero sample under length ', maxlen
-                continue
+            # Get the data in numpy.ndarray format.
+            # This swaps the axes!
+            # Returns something of shape (minibatch maxlen, n samples)
+            x, mask, y = prepare_data(x, y)
             n_samples += x.shape[1]
 
             cost = f_grad_shared(x, mask, y)
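prepare_data comes from imdb.py and is not shown in this diff; the comments above describe its contract: it pads the minibatch's variable-length sequences into one (maxlen, n_samples) int64 matrix plus a floatX mask marking real tokens, which is why n_samples is read from x.shape[1]. A rough sketch of that behavior, under those assumptions:

    import numpy
    from theano import config

    seqs = [[3, 1, 4], [1, 5]]  # two token sequences of different lengths
    lengths = [len(s) for s in seqs]
    x = numpy.zeros((max(lengths), len(seqs))).astype('int64')
    mask = numpy.zeros((max(lengths), len(seqs))).astype(config.floatX)
    for i, s in enumerate(seqs):
        x[:lengths[i], i] = s
        mask[:lengths[i], i] = 1.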
@@ -514,7 +525,8 @@
             if numpy.mod(uidx, validFreq) == 0:
                 use_noise.set_value(0.)
                 train_err = pred_error(f_pred, prepare_data, train, kf)
-                valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
+                valid_err = pred_error(f_pred, prepare_data, valid,
+                                       kf_valid)
                 test_err = pred_error(f_pred, prepare_data, test, kf_test)
 
                 history_errs.append([valid_err, test_err])
@@ -553,7 +565,8 @@
         best_p = unzip(tparams)
 
     use_noise.set_value(0.)
-    train_err = pred_error(f_pred, prepare_data, train, kf)
+    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
+    train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted)
     valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
     test_err = pred_error(f_pred, prepare_data, test, kf_test)
 
@@ -570,14 +583,9 @@
 
 
 if __name__ == '__main__':
-
-    # We must have floatX=float32 for this tutorial to work correctly.
-    theano.config.floatX = "float32"
-    # The next line is the new Theano default. This is a speed up.
-    theano.config.scan.allow_gc = False
-
     # See the train_lstm function for all possible parameters and their definitions.
     train_lstm(
         #reload_model="lstm_model.npz",
         max_epochs=100,
+        test_size=500,
     )
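With the hard-coded theano.config.floatX = "float32" assignment gone, precision is now chosen from the environment at launch, e.g. (assuming the script is saved as lstm.py):

    THEANO_FLAGS="floatX=float32" python lstm.py
    THEANO_FLAGS="floatX=float64" python lstm.py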