
Commit 194adad

Add the script that created the preprocessed imdb dataset
1 parent 6b7b7a6 commit 194adad

File tree

1 file changed: +123 −0 lines


code/imdb_preprocess.py

Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
"""
This is the script that created the pickled dataset.

1) Download the tokenizer script below, put it in the same directory as this
file, and give it execution permission:
https://github.com/moses-smt/mosesdecoder/raw/master/scripts/tokenizer/tokenizer.perl

2) Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/ and
extract it into the current directory.

3) Then run this script.
"""

dataset_path = '/Tmp/bastienf/aclImdb/'

import numpy
import cPickle as pkl

from collections import OrderedDict

import glob
import os

from subprocess import Popen, PIPE

# tokenizer.perl is from Moses:
# https://github.com/moses-smt/mosesdecoder/tree/master/scripts/tokenizer
tokenizer_cmd = ['./tokenizer.perl', '-l', 'en', '-q', '-']


def tokenize(sentences):
    # Pipe the sentences (one per line) through the Moses tokenizer.
    print 'Tokenizing..',
    text = "\n".join(sentences)
    tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
    tok_text, _ = tokenizer.communicate(text)
    toks = tok_text.split('\n')[:-1]
    print 'Done'

    return toks


def build_dict(path):
    # Read the first line of every positive and negative training review.
    sentences = []
    currdir = os.getcwd()
    os.chdir('%s/pos/' % path)
    for ff in glob.glob("*.txt"):
        with open(ff, 'r') as f:
            sentences.append(f.readline().strip())
    os.chdir('%s/neg/' % path)
    for ff in glob.glob("*.txt"):
        with open(ff, 'r') as f:
            sentences.append(f.readline().strip())
    os.chdir(currdir)

    sentences = tokenize(sentences)

    print 'Building dictionary..',
    wordcount = dict()
    for ss in sentences:
        words = ss.strip().lower().split()
        for w in words:
            if w not in wordcount:
                wordcount[w] = 1
            else:
                wordcount[w] += 1

    # Assign indices by descending frequency; the most frequent word gets index 2.
    counts = wordcount.values()
    keys = wordcount.keys()

    sorted_idx = numpy.argsort(counts)[::-1]

    worddict = dict()

    for idx, ss in enumerate(sorted_idx):
        worddict[keys[ss]] = idx + 2  # leave 0 and 1 (UNK)

    print numpy.sum(counts), ' total words ', len(keys), ' unique words'

    return worddict


def grab_data(path, dictionary):
    sentences = []
    currdir = os.getcwd()
    os.chdir(path)
    for ff in glob.glob("*.txt"):
        with open(ff, 'r') as f:
            sentences.append(f.readline().strip())
    os.chdir(currdir)
    sentences = tokenize(sentences)

    # Convert each review into a list of word indices; unseen words map to 1 (UNK).
    seqs = [None] * len(sentences)
    for idx, ss in enumerate(sentences):
        words = ss.strip().lower().split()
        seqs[idx] = [dictionary[w] if w in dictionary else 1 for w in words]

    return seqs


def main():
    # Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/
    path = dataset_path
    dictionary = build_dict(os.path.join(path, 'train'))

    train_x_pos = grab_data(path + 'train/pos', dictionary)
    train_x_neg = grab_data(path + 'train/neg', dictionary)
    train_x = train_x_pos + train_x_neg
    train_y = [1] * len(train_x_pos) + [0] * len(train_x_neg)

    test_x_pos = grab_data(path + 'test/pos', dictionary)
    test_x_neg = grab_data(path + 'test/neg', dictionary)
    test_x = test_x_pos + test_x_neg
    test_y = [1] * len(test_x_pos) + [0] * len(test_x_neg)

    # imdb.pkl holds two consecutive pickle dumps: (train_x, train_y), then (test_x, test_y).
    f = open('imdb.pkl', 'wb')
    pkl.dump((train_x, train_y), f, -1)
    pkl.dump((test_x, test_y), f, -1)
    f.close()

    # imdb.dict.pkl holds the word -> index dictionary built from the training set.
    f = open('imdb.dict.pkl', 'wb')
    pkl.dump(dictionary, f, -1)
    f.close()


if __name__ == '__main__':
    main()
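For reference, a minimal sketch (not part of this commit) of how the two pickle files written by main() could be read back. It relies only on the file names and index conventions visible in the script above (0 is reserved, 1 marks unknown words, real words start at 2); the '<unk>' placeholder string is just an illustrative choice.

import cPickle as pkl

# imdb.pkl holds two consecutive pickle dumps: the training data, then the test data.
f = open('imdb.pkl', 'rb')
train_x, train_y = pkl.load(f)
test_x, test_y = pkl.load(f)
f.close()

# imdb.dict.pkl holds the word -> index dictionary built from the training set.
f = open('imdb.dict.pkl', 'rb')
dictionary = pkl.load(f)
f.close()

# Invert the dictionary to decode a review; index 1 means "unknown word".
inv_dict = dict((idx, word) for word, idx in dictionary.iteritems())
print ' '.join(inv_dict.get(idx, '<unk>') for idx in train_x[0])
print 'label:', train_y[0]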

0 commit comments
