
Commit 98e981d

Merge branch 'master' of https://github.com/lisa-lab/DeepLearningTutorials into sda-edits

2 parents 0b6d1ee + e64f050

18 files changed (+1992, -21 lines)

.gitignore

Lines changed: 3 additions & 0 deletions

@@ -1,6 +1,9 @@
 code/*.pyc
+code/*_plots
 code/tmp*
 code/midi
+code/rnnslu
+data/atis.*
 data/mnist.pkl.gz
 data/mnist_py3k.pkl.gz
 data/Nottingham.zip
.travis.yml

Lines changed: 1 addition & 0 deletions

@@ -12,6 +12,7 @@ language: c
 # command to install dependencies
 before_install:
 #zlib1g-dev is needed to allow PIL to uncompress the dataset.
+- sudo apt-get update
 - sudo apt-get install -qq libatlas3gf-base libatlas-dev zlib1g-dev zip unzip zlibc libzip-dev libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev python-numpy python-scipy python-pip python-nose python-yaml pyflakes python-imaging

 install:

code/convolutional_mlp.py

Lines changed: 1 addition & 1 deletion

@@ -179,7 +179,7 @@ def evaluate_lenet5(learning_rate=0.1, n_epochs=200,
     # Construct the second convolutional pooling layer
     # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
     # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
-    # 4D output tensor is thus of shape (nkerns[0], nkerns[1], 4, 4)
+    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
     layer1 = LeNetConvPoolLayer(
         rng,
         input=layer0.output,
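
The corrected comment says the leading dimension of layer1's 4-D output is the minibatch size, not the number of first-layer kernels. A minimal sketch, not part of the commit, of the shape arithmetic in those comments; batch_size=500 and nkerns[1]=50 below are purely illustrative values:

# Minimal sketch (not from the repository): output shape of a conv + max-pool layer
# on a 12x12 input with 5x5 filters and 2x2 pooling, as described in the comments above.
def conv_pool_output_shape(batch_size, n_filters, img_size, filter_size=5, pool_size=2):
    conv_size = img_size - filter_size + 1      # "valid" convolution: 12 - 5 + 1 = 8
    pooled_size = conv_size // pool_size        # non-overlapping max-pooling: 8 / 2 = 4
    return (batch_size, n_filters, pooled_size, pooled_size)

assert conv_pool_output_shape(500, 50, 12) == (500, 50, 4, 4)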

code/imdb.py

Lines changed: 170 additions & 0 deletions

New file:

import cPickle
import gzip
import os

import numpy
import theano


def prepare_data(seqs, labels, maxlen=None):
    """Create the matrices from the datasets.

    This pads each sequence to the same length: the length of the
    longest sequence or maxlen.

    If maxlen is set, sequences longer than this maximum length are
    dropped.

    This swaps the axes!
    """
    # x: a list of sentences
    lengths = [len(s) for s in seqs]

    if maxlen is not None:
        new_seqs = []
        new_labels = []
        new_lengths = []
        for l, s, y in zip(lengths, seqs, labels):
            if l < maxlen:
                new_seqs.append(s)
                new_labels.append(y)
                new_lengths.append(l)
        lengths = new_lengths
        labels = new_labels
        seqs = new_seqs

        if len(lengths) < 1:
            return None, None, None

    n_samples = len(seqs)
    maxlen = numpy.max(lengths)

    x = numpy.zeros((maxlen, n_samples)).astype('int64')
    x_mask = numpy.zeros((maxlen, n_samples)).astype(theano.config.floatX)
    for idx, s in enumerate(seqs):
        x[:lengths[idx], idx] = s
        x_mask[:lengths[idx], idx] = 1.

    return x, x_mask, labels


def get_dataset_file(dataset, default_dataset, origin):
    '''Look for it as if it was a full path; if not, try a local file;
    if not, try in the data directory.

    Download the dataset if it is not present.

    '''
    data_dir, data_file = os.path.split(dataset)
    if data_dir == "" and not os.path.isfile(dataset):
        # Check if dataset is in the data directory.
        new_path = os.path.join(
            os.path.split(__file__)[0],
            "..",
            "data",
            dataset
        )
        if os.path.isfile(new_path) or data_file == default_dataset:
            dataset = new_path

    if (not os.path.isfile(dataset)) and data_file == default_dataset:
        import urllib
        print 'Downloading data from %s' % origin
        urllib.urlretrieve(origin, dataset)
    return dataset


def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None,
              sort_by_len=True):
    '''Loads the dataset

    :type path: String
    :param path: The path to the dataset (here IMDB)
    :type n_words: int
    :param n_words: The number of words to keep in the vocabulary.
        All extra words are set to unknown (1).
    :type valid_portion: float
    :param valid_portion: The proportion of the full train set used for
        the validation set.
    :type maxlen: None or positive int
    :param maxlen: the max sequence length we use in the train/valid set.
    :type sort_by_len: bool
    :param sort_by_len: Sort by the sequence length for the train,
        valid and test set. This allows faster execution as it causes
        less padding per minibatch. Another mechanism must be used to
        shuffle the train set at each epoch.

    '''

    #############
    # LOAD DATA #
    #############

    # Load the dataset
    path = get_dataset_file(
        path, "imdb.pkl",
        "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")

    if path.endswith(".gz"):
        f = gzip.open(path, 'rb')
    else:
        f = open(path, 'rb')

    train_set = cPickle.load(f)
    test_set = cPickle.load(f)
    f.close()
    if maxlen:
        new_train_set_x = []
        new_train_set_y = []
        for x, y in zip(train_set[0], train_set[1]):
            if len(x) < maxlen:
                new_train_set_x.append(x)
                new_train_set_y.append(y)
        train_set = (new_train_set_x, new_train_set_y)
        del new_train_set_x, new_train_set_y

    # split training set into validation set
    train_set_x, train_set_y = train_set
    n_samples = len(train_set_x)
    sidx = numpy.random.permutation(n_samples)
    n_train = int(numpy.round(n_samples * (1. - valid_portion)))
    valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
    valid_set_y = [train_set_y[s] for s in sidx[n_train:]]
    train_set_x = [train_set_x[s] for s in sidx[:n_train]]
    train_set_y = [train_set_y[s] for s in sidx[:n_train]]

    train_set = (train_set_x, train_set_y)
    valid_set = (valid_set_x, valid_set_y)

    def remove_unk(x):
        return [[1 if w >= n_words else w for w in sen] for sen in x]

    test_set_x, test_set_y = test_set
    valid_set_x, valid_set_y = valid_set
    train_set_x, train_set_y = train_set

    train_set_x = remove_unk(train_set_x)
    valid_set_x = remove_unk(valid_set_x)
    test_set_x = remove_unk(test_set_x)

    def len_argsort(seq):
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))

    if sort_by_len:
        sorted_index = len_argsort(test_set_x)
        test_set_x = [test_set_x[i] for i in sorted_index]
        test_set_y = [test_set_y[i] for i in sorted_index]

        sorted_index = len_argsort(valid_set_x)
        valid_set_x = [valid_set_x[i] for i in sorted_index]
        valid_set_y = [valid_set_y[i] for i in sorted_index]

        sorted_index = len_argsort(train_set_x)
        train_set_x = [train_set_x[i] for i in sorted_index]
        train_set_y = [train_set_y[i] for i in sorted_index]

    train = (train_set_x, train_set_y)
    valid = (valid_set_x, valid_set_y)
    test = (test_set_x, test_set_y)

    return train, valid, test
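
As a quick orientation for readers of this diff, a minimal usage sketch, not part of the commit, showing how the two helpers are meant to be combined. The n_words, valid_portion, maxlen and batch-slice values are illustrative, and it assumes it is run from the code/ directory with imdb.pkl available locally or downloadable, as handled by get_dataset_file:

# Minimal usage sketch (not part of the commit); parameter values are illustrative.
import imdb

train, valid, test = imdb.load_data(n_words=10000, valid_portion=0.05, maxlen=100)
train_x, train_y = train

# Pad one minibatch: x is (maxlen, n_samples) int64, x_mask flags the real tokens.
x, x_mask, y = imdb.prepare_data(train_x[:16], train_y[:16])
print x.shape, x_mask.shape, len(y)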

code/imdb_preprocess.py

Lines changed: 123 additions & 0 deletions

New file:

"""
This is the script that created the pickled dataset.

1) You need to download this file and put it in the same directory as this file.
https://github.com/moses-smt/mosesdecoder/raw/master/scripts/tokenizer/tokenizer.perl . Give it execution permission.

2) Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/ and extract it in the current directory.

3) Then run this script.
"""

dataset_path='/Tmp/bastienf/aclImdb/'

import numpy
import cPickle as pkl

from collections import OrderedDict

import glob
import os

from subprocess import Popen, PIPE

# tokenizer.perl is from Moses: https://github.com/moses-smt/mosesdecoder/tree/master/scripts/tokenizer
tokenizer_cmd = ['./tokenizer.perl', '-l', 'en', '-q', '-']


def tokenize(sentences):

    print 'Tokenizing..',
    text = "\n".join(sentences)
    tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
    tok_text, _ = tokenizer.communicate(text)
    toks = tok_text.split('\n')[:-1]
    print 'Done'

    return toks


def build_dict(path):
    sentences = []
    currdir = os.getcwd()
    os.chdir('%s/pos/' % path)
    for ff in glob.glob("*.txt"):
        with open(ff, 'r') as f:
            sentences.append(f.readline().strip())
    os.chdir('%s/neg/' % path)
    for ff in glob.glob("*.txt"):
        with open(ff, 'r') as f:
            sentences.append(f.readline().strip())
    os.chdir(currdir)

    sentences = tokenize(sentences)

    print 'Building dictionary..',
    wordcount = dict()
    for ss in sentences:
        words = ss.strip().lower().split()
        for w in words:
            if w not in wordcount:
                wordcount[w] = 1
            else:
                wordcount[w] += 1

    counts = wordcount.values()
    keys = wordcount.keys()

    sorted_idx = numpy.argsort(counts)[::-1]

    worddict = dict()

    for idx, ss in enumerate(sorted_idx):
        worddict[keys[ss]] = idx + 2  # leave 0 and 1 (UNK)

    print numpy.sum(counts), ' total words ', len(keys), ' unique words'

    return worddict


def grab_data(path, dictionary):
    sentences = []
    currdir = os.getcwd()
    os.chdir(path)
    for ff in glob.glob("*.txt"):
        with open(ff, 'r') as f:
            sentences.append(f.readline().strip())
    os.chdir(currdir)
    sentences = tokenize(sentences)

    seqs = [None] * len(sentences)
    for idx, ss in enumerate(sentences):
        words = ss.strip().lower().split()
        seqs[idx] = [dictionary[w] if w in dictionary else 1 for w in words]

    return seqs


def main():
    # Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/
    path = dataset_path
    dictionary = build_dict(os.path.join(path, 'train'))

    train_x_pos = grab_data(path+'train/pos', dictionary)
    train_x_neg = grab_data(path+'train/neg', dictionary)
    train_x = train_x_pos + train_x_neg
    train_y = [1] * len(train_x_pos) + [0] * len(train_x_neg)

    test_x_pos = grab_data(path+'test/pos', dictionary)
    test_x_neg = grab_data(path+'test/neg', dictionary)
    test_x = test_x_pos + test_x_neg
    test_y = [1] * len(test_x_pos) + [0] * len(test_x_neg)

    f = open('imdb.pkl', 'wb')
    pkl.dump((train_x, train_y), f, -1)
    pkl.dump((test_x, test_y), f, -1)
    f.close()

    f = open('imdb.dict.pkl', 'wb')
    pkl.dump(dictionary, f, -1)
    f.close()

if __name__ == '__main__':
    main()
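
A small sketch, not part of the commit, of the word-index convention this script establishes and that imdb.py relies on: index 1 stands for unknown words, real words get indices from 2 upward in order of decreasing frequency, and 0 is left free for padding. The tiny dictionary below is made up purely for illustration:

# Toy illustration of the index convention (made-up dictionary, not real data).
worddict = {'the': 2, 'movie': 3, 'great': 4}   # most frequent word gets index 2

def encode(sentence, dictionary):
    # unseen words map to 1 (UNK), mirroring grab_data above
    return [dictionary.get(w, 1) for w in sentence.lower().split()]

print encode("The movie was great", worddict)   # -> [2, 3, 1, 4]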
