Commit 46e62ad

Merge branch 'develop' of https://github.com/explosion/spaCy into develop
2 parents: bb25cb0 + 86d6bd7

27 files changed: +7252 −7019 lines

examples/deep_learning_keras.py

Lines changed: 43 additions & 30 deletions
@@ -9,7 +9,9 @@
 from keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional
 from keras.layers import TimeDistributed
 from keras.optimizers import Adam
-import cPickle as pickle
+from spacy.compat import pickle
+
+import thinc.extra.datasets
 
 import spacy
 
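Note on the import change: cPickle exists only on Python 2, so importing it directly breaks on Python 3. spacy.compat.pickle resolves to the right module for the running interpreter. A minimal sketch of that fallback pattern (the exact contents of spacy/compat.py may differ):

    # Sketch of the compat shim, assuming the usual try/except fallback:
    try:
        import cPickle as pickle   # fast C implementation, Python 2 only
    except ImportError:
        import pickle              # Python 3: the stdlib module is already C-backed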
@@ -70,24 +72,28 @@ def get_features(docs, max_length):
     for i, doc in enumerate(docs):
         j = 0
         for token in doc:
-            if token.has_vector and not token.is_punct and not token.is_space:
-                Xs[i, j] = token.rank + 1
-                j += 1
-                if j >= max_length:
-                    break
+            vector_id = token.vocab.vectors.find(key=token.orth)
+            if vector_id >= 0:
+                Xs[i, j] = vector_id
+            else:
+                Xs[i, j] = 0
+            j += 1
+            if j >= max_length:
+                break
     return Xs
 
 
 def train(train_texts, train_labels, dev_texts, dev_labels,
           lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5,
           by_sentence=True):
     print("Loading spaCy")
-    nlp = spacy.load('en', entity=False)
+    nlp = spacy.load('en_vectors_web_lg')
+    nlp.add_pipe(nlp.create_pipe('sentencizer'))
     embeddings = get_embeddings(nlp.vocab)
     model = compile_lstm(embeddings, lstm_shape, lstm_settings)
     print("Parsing texts...")
-    train_docs = list(nlp.pipe(train_texts, batch_size=5000, n_threads=3))
-    dev_docs = list(nlp.pipe(dev_texts, batch_size=5000, n_threads=3))
+    train_docs = list(nlp.pipe(train_texts))
+    dev_docs = list(nlp.pipe(dev_texts))
     if by_sentence:
         train_docs, train_labels = get_labelled_sentences(train_docs, train_labels)
         dev_docs, dev_labels = get_labelled_sentences(dev_docs, dev_labels)
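Note on get_features and train: in spaCy v2 the word vectors live in one shared Vectors table rather than on each lexeme, so the row index comes from vectors.find(key=token.orth) instead of token.rank + 1; as the new code implies, find returns a negative value when no vector is stored, and those tokens fall back to index 0, which the Embedding layer's mask_zero=True treats as padding. The en_vectors_web_lg model ships vectors only, with no parser, so the sentencizer pipe is added to supply the sentence boundaries that get_labelled_sentences needs; the batch_size/n_threads arguments to nlp.pipe are simply dropped in favour of the v2 defaults. A small sketch of the lookup (assumes the en_vectors_web_lg package is installed):

    # Resolve a token to its row in the shared vectors table.
    import spacy

    nlp = spacy.load('en_vectors_web_lg')
    token = nlp(u'apple')[0]
    row = token.vocab.vectors.find(key=token.orth)  # negative if no vector is stored
    if row >= 0:
        vector = nlp.vocab.vectors.data[row]        # the float32 vector itself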
@@ -111,22 +117,18 @@ def compile_lstm(embeddings, shape, settings):
             mask_zero=True
         )
     )
-    model.add(TimeDistributed(Dense(shape['nr_hidden'], bias=False)))
-    model.add(Bidirectional(LSTM(shape['nr_hidden'], dropout_U=settings['dropout'],
-                                 dropout_W=settings['dropout'])))
+    model.add(TimeDistributed(Dense(shape['nr_hidden'], use_bias=False)))
+    model.add(Bidirectional(LSTM(shape['nr_hidden'],
+                                 recurrent_dropout=settings['dropout'],
+                                 dropout=settings['dropout'])))
     model.add(Dense(shape['nr_class'], activation='sigmoid'))
     model.compile(optimizer=Adam(lr=settings['lr']), loss='binary_crossentropy',
                   metrics=['accuracy'])
     return model
 
 
 def get_embeddings(vocab):
-    max_rank = max(lex.rank+1 for lex in vocab if lex.has_vector)
-    vectors = numpy.ndarray((max_rank+1, vocab.vectors_length), dtype='float32')
-    for lex in vocab:
-        if lex.has_vector:
-            vectors[lex.rank + 1] = lex.vector
-    return vectors
+    return vocab.vectors.data
 
 
 def evaluate(model_dir, texts, labels, max_length=100):
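Note on compile_lstm and get_embeddings: the argument changes track the Keras 2 API renames (bias became use_bias on Dense; on LSTM, dropout_W, dropout on the input weights, became dropout, and dropout_U, dropout on the recurrent weights, became recurrent_dropout). And since the v2 vocab keeps every vector in one contiguous numpy array, the old per-lexeme copy loop reduces to returning that array directly. A quick sketch of what the new function hands to the Embedding layer (loading as in train() above):

    # The embeddings matrix is the vocab's shared vector table, one row per key.
    import spacy

    nlp = spacy.load('en_vectors_web_lg')
    embeddings = nlp.vocab.vectors.data             # shape (n_vectors, width), float32
    assert embeddings.shape[1] == nlp.vocab.vectors_length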
@@ -174,22 +176,32 @@ def read_data(data_dir, limit=0):
     batch_size=("Size of minibatches for training LSTM", "option", "b", int),
     nr_examples=("Limit to N examples", "option", "n", int)
 )
-def main(model_dir, train_dir, dev_dir,
+def main(model_dir=None, train_dir=None, dev_dir=None,
          is_runtime=False,
          nr_hidden=64, max_length=100,  # Shape
          dropout=0.5, learn_rate=0.001,  # General NN config
          nb_epoch=5, batch_size=100, nr_examples=-1):  # Training params
-    model_dir = pathlib.Path(model_dir)
-    train_dir = pathlib.Path(train_dir)
-    dev_dir = pathlib.Path(dev_dir)
+    if model_dir is not None:
+        model_dir = pathlib.Path(model_dir)
+    if train_dir is None or dev_dir is None:
+        imdb_data = thinc.extra.datasets.imdb()
     if is_runtime:
-        dev_texts, dev_labels = read_data(dev_dir)
+        if dev_dir is None:
+            dev_texts, dev_labels = zip(*imdb_data[1])
+        else:
+            dev_texts, dev_labels = read_data(dev_dir)
         acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
         print(acc)
     else:
-        print("Read data")
-        train_texts, train_labels = read_data(train_dir, limit=nr_examples)
-        dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples)
+        if train_dir is None:
+            train_texts, train_labels = zip(*imdb_data[0])
+        else:
+            print("Read data")
+            train_texts, train_labels = read_data(train_dir, limit=nr_examples)
+        if dev_dir is None:
+            dev_texts, dev_labels = zip(*imdb_data[1])
+        else:
+            dev_texts, dev_labels = read_data(dev_dir, imdb_data, limit=nr_examples)
         train_labels = numpy.asarray(train_labels, dtype='int32')
         dev_labels = numpy.asarray(dev_labels, dtype='int32')
         lstm = train(train_texts, train_labels, dev_texts, dev_labels,
lstm = train(train_texts, train_labels, dev_texts, dev_labels,
@@ -198,10 +210,11 @@ def main(model_dir, train_dir, dev_dir,
                      {},
                      nb_epoch=nb_epoch, batch_size=batch_size)
         weights = lstm.get_weights()
-        with (model_dir / 'model').open('wb') as file_:
-            pickle.dump(weights[1:], file_)
-        with (model_dir / 'config.json').open('wb') as file_:
-            file_.write(lstm.to_json())
+        if model_dir is not None:
+            with (model_dir / 'model').open('wb') as file_:
+                pickle.dump(weights[1:], file_)
+            with (model_dir / 'config.json').open('wb') as file_:
+                file_.write(lstm.to_json())
 
 
 if __name__ == '__main__':
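Note on saving: weights[1:] deliberately drops the first weight array, the embedding matrix, since it is just the static spaCy vector table and can be rebuilt from get_embeddings(nlp.vocab) at load time; the model_dir guard lets the script now run end to end without writing anything. A hypothetical sketch of the matching load side (it mirrors how evaluate() restores the model; model_dir, pickle, get_embeddings and nlp are the script's own names):

    # Rebuild the net from config.json, then prepend the embedding matrix from
    # the live vocab so the pickled weights[1:] line up again.
    from keras.models import model_from_json

    with (model_dir / 'config.json').open() as file_:
        model = model_from_json(file_.read())
    with (model_dir / 'model').open('rb') as file_:
        lstm_weights = pickle.load(file_)
    model.set_weights([get_embeddings(nlp.vocab)] + lstm_weights)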
