
Commit 194adad

Add the script that created the preprocessed imdb dataset
1 parent 6b7b7a6 commit 194adad

File tree

1 file changed: +123 −0 lines


code/imdb_preprocess.py

Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
"""
This is the script that created the pickled dataset.

1) Download the tokenizer script below, put it in the same directory as this
file, and give it execution permission:
https://github.com/moses-smt/mosesdecoder/raw/master/scripts/tokenizer/tokenizer.perl

2) Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/ and
extract it into the current directory.

3) Then run this script.
"""

dataset_path = '/Tmp/bastienf/aclImdb/'

import numpy
import cPickle as pkl

from collections import OrderedDict

import glob
import os

from subprocess import Popen, PIPE

# tokenizer.perl is from Moses:
# https://github.com/moses-smt/mosesdecoder/tree/master/scripts/tokenizer
tokenizer_cmd = ['./tokenizer.perl', '-l', 'en', '-q', '-']


def tokenize(sentences):
    # Pipe the sentences (one per line) through the Moses tokenizer.
    print 'Tokenizing..',
    text = "\n".join(sentences)
    tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
    tok_text, _ = tokenizer.communicate(text)
    toks = tok_text.split('\n')[:-1]
    print 'Done'

    return toks


def build_dict(path):
    # Read the first line of every positive and negative training review.
    sentences = []
    currdir = os.getcwd()
    os.chdir('%s/pos/' % path)
    for ff in glob.glob("*.txt"):
        with open(ff, 'r') as f:
            sentences.append(f.readline().strip())
    os.chdir('%s/neg/' % path)
    for ff in glob.glob("*.txt"):
        with open(ff, 'r') as f:
            sentences.append(f.readline().strip())
    os.chdir(currdir)

    sentences = tokenize(sentences)

    print 'Building dictionary..',
    wordcount = dict()
    for ss in sentences:
        words = ss.strip().lower().split()
        for w in words:
            if w not in wordcount:
                wordcount[w] = 1
            else:
                wordcount[w] += 1

    # Assign indices by descending frequency; the most frequent word gets index 2.
    counts = wordcount.values()
    keys = wordcount.keys()

    sorted_idx = numpy.argsort(counts)[::-1]

    worddict = dict()

    for idx, ss in enumerate(sorted_idx):
        worddict[keys[ss]] = idx + 2  # leave 0 and 1 (UNK)

    print numpy.sum(counts), ' total words ', len(keys), ' unique words'

    return worddict


def grab_data(path, dictionary):
    sentences = []
    currdir = os.getcwd()
    os.chdir(path)
    for ff in glob.glob("*.txt"):
        with open(ff, 'r') as f:
            sentences.append(f.readline().strip())
    os.chdir(currdir)
    sentences = tokenize(sentences)

    # Convert each review into a list of word indices; unseen words map to 1 (UNK).
    seqs = [None] * len(sentences)
    for idx, ss in enumerate(sentences):
        words = ss.strip().lower().split()
        seqs[idx] = [dictionary[w] if w in dictionary else 1 for w in words]

    return seqs


def main():
    # Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/
    path = dataset_path
    dictionary = build_dict(os.path.join(path, 'train'))

    train_x_pos = grab_data(path + 'train/pos', dictionary)
    train_x_neg = grab_data(path + 'train/neg', dictionary)
    train_x = train_x_pos + train_x_neg
    train_y = [1] * len(train_x_pos) + [0] * len(train_x_neg)

    test_x_pos = grab_data(path + 'test/pos', dictionary)
    test_x_neg = grab_data(path + 'test/neg', dictionary)
    test_x = test_x_pos + test_x_neg
    test_y = [1] * len(test_x_pos) + [0] * len(test_x_neg)

    # imdb.pkl holds two consecutive pickle dumps: (train_x, train_y), then (test_x, test_y).
    f = open('imdb.pkl', 'wb')
    pkl.dump((train_x, train_y), f, -1)
    pkl.dump((test_x, test_y), f, -1)
    f.close()

    # imdb.dict.pkl holds the word -> index dictionary built from the training set.
    f = open('imdb.dict.pkl', 'wb')
    pkl.dump(dictionary, f, -1)
    f.close()


if __name__ == '__main__':
    main()
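For reference, a minimal sketch (not part of this commit) of how the two pickle files written by main() could be read back. It relies only on the file names and index conventions visible in the script above (0 is reserved, 1 marks unknown words, real words start at 2); the '<unk>' placeholder string is just an illustrative choice.

import cPickle as pkl

# imdb.pkl holds two consecutive pickle dumps: the training data, then the test data.
f = open('imdb.pkl', 'rb')
train_x, train_y = pkl.load(f)
test_x, test_y = pkl.load(f)
f.close()

# imdb.dict.pkl holds the word -> index dictionary built from the training set.
f = open('imdb.dict.pkl', 'rb')
dictionary = pkl.load(f)
f.close()

# Invert the dictionary to decode a review; index 1 means "unknown word".
inv_dict = dict((idx, word) for word, idx in dictionary.iteritems())
print ' '.join(inv_dict.get(idx, '<unk>') for idx in train_x[0])
print 'label:', train_y[0]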

0 commit comments
