from keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional
from keras.layers import TimeDistributed
from keras.optimizers import Adam
-import cPickle as pickle
+from spacy.compat import pickle
+
+import thinc.extra.datasets

import spacy
@@ -70,24 +72,28 @@ def get_features(docs, max_length):
    for i, doc in enumerate(docs):
        j = 0
        for token in doc:
-            if token.has_vector and not token.is_punct and not token.is_space:
-                Xs[i, j] = token.rank + 1
-                j += 1
-                if j >= max_length:
-                    break
+            vector_id = token.vocab.vectors.find(key=token.orth)
+            if vector_id >= 0:
+                Xs[i, j] = vector_id
+            else:
+                Xs[i, j] = 0
+            j += 1
+            if j >= max_length:
+                break
    return Xs


def train(train_texts, train_labels, dev_texts, dev_labels,
          lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5,
          by_sentence=True):
    print("Loading spaCy")
-    nlp = spacy.load('en', entity=False)
+    nlp = spacy.load('en_vectors_web_lg')
+    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    embeddings = get_embeddings(nlp.vocab)
    model = compile_lstm(embeddings, lstm_shape, lstm_settings)
    print("Parsing texts...")
-    train_docs = list(nlp.pipe(train_texts, batch_size=5000, n_threads=3))
-    dev_docs = list(nlp.pipe(dev_texts, batch_size=5000, n_threads=3))
+    train_docs = list(nlp.pipe(train_texts))
+    dev_docs = list(nlp.pipe(dev_texts))
    if by_sentence:
        train_docs, train_labels = get_labelled_sentences(train_docs, train_labels)
        dev_docs, dev_labels = get_labelled_sentences(dev_docs, dev_labels)
@@ -111,22 +117,18 @@ def compile_lstm(embeddings, shape, settings):
            mask_zero=True
        )
    )
-    model.add(TimeDistributed(Dense(shape['nr_hidden'], bias=False)))
-    model.add(Bidirectional(LSTM(shape['nr_hidden'], dropout_U=settings['dropout'],
-                                 dropout_W=settings['dropout'])))
+    model.add(TimeDistributed(Dense(shape['nr_hidden'], use_bias=False)))
+    model.add(Bidirectional(LSTM(shape['nr_hidden'],
+                                 recurrent_dropout=settings['dropout'],
+                                 dropout=settings['dropout'])))
    model.add(Dense(shape['nr_class'], activation='sigmoid'))
    model.compile(optimizer=Adam(lr=settings['lr']), loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


def get_embeddings(vocab):
-    max_rank = max(lex.rank + 1 for lex in vocab if lex.has_vector)
-    vectors = numpy.ndarray((max_rank + 1, vocab.vectors_length), dtype='float32')
-    for lex in vocab:
-        if lex.has_vector:
-            vectors[lex.rank + 1] = lex.vector
-    return vectors
+    return vocab.vectors.data


def evaluate(model_dir, texts, labels, max_length=100):
@@ -174,22 +176,32 @@ def read_data(data_dir, limit=0):
    batch_size=("Size of minibatches for training LSTM", "option", "b", int),
    nr_examples=("Limit to N examples", "option", "n", int)
)
-def main(model_dir, train_dir, dev_dir,
+def main(model_dir=None, train_dir=None, dev_dir=None,
         is_runtime=False,
         nr_hidden=64, max_length=100,  # Shape
         dropout=0.5, learn_rate=0.001,  # General NN config
         nb_epoch=5, batch_size=100, nr_examples=-1):  # Training params
-    model_dir = pathlib.Path(model_dir)
-    train_dir = pathlib.Path(train_dir)
-    dev_dir = pathlib.Path(dev_dir)
+    if model_dir is not None:
+        model_dir = pathlib.Path(model_dir)
+    if train_dir is None or dev_dir is None:
+        imdb_data = thinc.extra.datasets.imdb()
    if is_runtime:
-        dev_texts, dev_labels = read_data(dev_dir)
+        if dev_dir is None:
+            dev_texts, dev_labels = zip(*imdb_data[1])
+        else:
+            dev_texts, dev_labels = read_data(dev_dir)
        acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
        print(acc)
    else:
-        print("Read data")
-        train_texts, train_labels = read_data(train_dir, limit=nr_examples)
-        dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples)
+        if train_dir is None:
+            train_texts, train_labels = zip(*imdb_data[0])
+        else:
+            print("Read data")
+            train_texts, train_labels = read_data(train_dir, limit=nr_examples)
+        if dev_dir is None:
+            dev_texts, dev_labels = zip(*imdb_data[1])
+        else:
+            dev_texts, dev_labels = read_data(dev_dir, imdb_data, limit=nr_examples)
        train_labels = numpy.asarray(train_labels, dtype='int32')
        dev_labels = numpy.asarray(dev_labels, dtype='int32')
        lstm = train(train_texts, train_labels, dev_texts, dev_labels,
@@ -198,10 +210,11 @@ def main(model_dir, train_dir, dev_dir,
                     {},
                     nb_epoch=nb_epoch, batch_size=batch_size)
        weights = lstm.get_weights()
-        with (model_dir / 'model').open('wb') as file_:
-            pickle.dump(weights[1:], file_)
-        with (model_dir / 'config.json').open('wb') as file_:
-            file_.write(lstm.to_json())
+        if model_dir is not None:
+            with (model_dir / 'model').open('wb') as file_:
+                pickle.dump(weights[1:], file_)
+            with (model_dir / 'config.json').open('wb') as file_:
+                file_.write(lstm.to_json())


if __name__ == '__main__':
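
For context, a minimal sketch of the spaCy 2.x vectors lookup that the updated get_features and get_embeddings rely on. It assumes the en_vectors_web_lg package is installed and only uses the calls that appear in the diff above:

import spacy

# Assumes spaCy 2.x with the en_vectors_web_lg vectors package installed.
nlp = spacy.load('en_vectors_web_lg')

doc = nlp("This movie was surprisingly good.")
for token in doc:
    # find() returns the row of the token's vector in the vectors table,
    # or -1 if the token has no pretrained vector.
    vector_id = token.vocab.vectors.find(key=token.orth)
    print(token.text, vector_id if vector_id >= 0 else 0)

# get_embeddings() now simply returns the raw vectors table; its rows line up
# with the ids produced by find(), so it can be handed to the Keras Embedding
# layer as a frozen weight matrix.
embeddings = nlp.vocab.vectors.data
print(embeddings.shape)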