"""
This is the script that created the pickled version of the dataset.

1) Download the Moses tokenizer script below, put it in the same directory as
   this file, and give it execution permission:
   https://github.com/moses-smt/mosesdecoder/raw/master/scripts/tokenizer/tokenizer.perl

2) Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/ and
   extract it in the current directory.

3) Then run this script.
"""

# Path where the aclImdb archive was extracted (step 2 above).
dataset_path = './aclImdb/'
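
# Expected layout after extracting the archive (these are the paths the
# functions below read from):
#   aclImdb/train/pos/*.txt, aclImdb/train/neg/*.txt
#   aclImdb/test/pos/*.txt,  aclImdb/test/neg/*.txt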

import numpy
import cPickle as pkl

import glob
import os

from subprocess import Popen, PIPE

# tokenizer.perl is from Moses: https://github.com/moses-smt/mosesdecoder/tree/master/scripts/tokenizer
tokenizer_cmd = ['./tokenizer.perl', '-l', 'en', '-q', '-']


def tokenize(sentences):
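    """Tokenize a list of sentences with one call to the Moses tokenizer."""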

    print 'Tokenizing..',
    # Send all sentences through the tokenizer in a single subprocess call,
    # one sentence per line.
    text = "\n".join(sentences)
    tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
    tok_text, _ = tokenizer.communicate(text)
    # Drop the empty string left after the trailing newline.
    toks = tok_text.split('\n')[:-1]
    print 'Done'

    return toks


def build_dict(path):
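    """Build a word->index dictionary (indices ordered by word frequency)."""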
    sentences = []
    currdir = os.getcwd()
    # Collect the raw text of every positive and negative training review.
    os.chdir('%s/pos/' % path)
    for ff in glob.glob("*.txt"):
        with open(ff, 'r') as f:
            sentences.append(f.readline().strip())
    os.chdir('%s/neg/' % path)
    for ff in glob.glob("*.txt"):
        with open(ff, 'r') as f:
            sentences.append(f.readline().strip())
    os.chdir(currdir)

    sentences = tokenize(sentences)

    print 'Building dictionary..',
    # Count word frequencies over the tokenized, lower-cased training set.
    wordcount = dict()
    for ss in sentences:
        words = ss.strip().lower().split()
        for w in words:
            if w not in wordcount:
                wordcount[w] = 1
            else:
                wordcount[w] += 1

    counts = wordcount.values()
    keys = wordcount.keys()

    # Sort words by frequency, most frequent first.
    sorted_idx = numpy.argsort(counts)[::-1]

    worddict = dict()

    for idx, ss in enumerate(sorted_idx):
        worddict[keys[ss]] = idx+2  # leave 0 and 1 reserved (1 is UNK)

    print numpy.sum(counts), ' total words ', len(keys), ' unique words'

    return worddict


def grab_data(path, dictionary):
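    """Load every review under `path` and encode it as a list of word indices."""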
    sentences = []
    currdir = os.getcwd()
    os.chdir(path)
    for ff in glob.glob("*.txt"):
        with open(ff, 'r') as f:
            sentences.append(f.readline().strip())
    os.chdir(currdir)
    sentences = tokenize(sentences)

    seqs = [None] * len(sentences)
    for idx, ss in enumerate(sentences):
        words = ss.strip().lower().split()
        # Out-of-vocabulary words map to index 1 (UNK).
        seqs[idx] = [dictionary[w] if w in dictionary else 1 for w in words]

    return seqs


def main():
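    """Build the dictionary from the training set, encode both splits, pickle them."""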
    # Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/
    path = dataset_path
    dictionary = build_dict(os.path.join(path, 'train'))

    # Label 1 = positive review, 0 = negative review.
    train_x_pos = grab_data(path+'train/pos', dictionary)
    train_x_neg = grab_data(path+'train/neg', dictionary)
    train_x = train_x_pos + train_x_neg
    train_y = [1] * len(train_x_pos) + [0] * len(train_x_neg)

    test_x_pos = grab_data(path+'test/pos', dictionary)
    test_x_neg = grab_data(path+'test/neg', dictionary)
    test_x = test_x_pos + test_x_neg
    test_y = [1] * len(test_x_pos) + [0] * len(test_x_neg)

    # Protocol -1 = highest pickle protocol available.
    with open('imdb.pkl', 'wb') as f:
        pkl.dump((train_x, train_y), f, -1)
        pkl.dump((test_x, test_y), f, -1)

    with open('imdb.dict.pkl', 'wb') as f:
        pkl.dump(dictionary, f, -1)

if __name__ == '__main__':
    main()
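
# A minimal usage sketch of how the pickled files produced above can be loaded
# back, e.g. from a training script. `train_set`/`test_set` are illustrative
# names, not defined anywhere in this script.
#
#   import cPickle as pkl
#   with open('imdb.pkl', 'rb') as f:
#       train_set = pkl.load(f)   # (train_x, train_y)
#       test_set = pkl.load(f)    # (test_x, test_y)
#   with open('imdb.dict.pkl', 'rb') as f:
#       dictionary = pkl.load(f)  # word -> index mapping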