newcoder
diff --git a/‎.gitignore
Lines changed: 3 additions & 0 deletions b/‎.gitignore
Lines changed: 3 additions & 0 deletions
diff --git a/‎.travis.yml
Lines changed: 1 addition & 0 deletions b/‎.travis.yml
Lines changed: 1 addition & 0 deletions
diff --git a/‎code/convolutional_mlp.py
Lines changed: 1 addition & 1 deletion b/‎code/convolutional_mlp.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎code/imdb.py
Lines changed: 170 additions & 0 deletions b/‎code/imdb.py
Lines changed: 170 additions & 0 deletions
diff --git a/‎code/imdb_preprocess.py
Lines changed: 123 additions & 0 deletions b/‎code/imdb_preprocess.py
Lines changed: 123 additions & 0 deletions
@@ -1,6 +1,9 @@
 code/*.pyc
+code/*_plots
 code/tmp*
 code/midi
+code/rnnslu
+data/atis.*
 data/mnist.pkl.gz
 data/mnist_py3k.pkl.gz
 data/Nottingham.zip
 
@@ -12,6 +12,7 @@ language: c
 # command to install dependencies
 before_install:
 #zlib1g-dev is needed to allow PIL to uncompress the dataset.
+  - sudo apt-get update
   - sudo apt-get install -qq libatlas3gf-base libatlas-dev zlib1g-dev zip unzip zlibc libzip-dev libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev python-numpy python-scipy python-pip python-nose python-yaml pyflakes python-imaging
 
 install:
 
@@ -179,7 +179,7 @@ def evaluate_lenet5(learning_rate=0.1, n_epochs=200,
     # Construct the second convolutional pooling layer
     # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
     # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
-    # 4D output tensor is thus of shape (nkerns[0], nkerns[1], 4, 4)
+    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
     layer1 = LeNetConvPoolLayer(
         rng,
         input=layer0.output,
 
@@ -0,0 +1,170 @@
+import cPickle
+import gzip
+import os
+
+import numpy
+import theano
+
+
+def prepare_data(seqs, labels, maxlen=None):
+    """Create the padded matrices for one batch of sequences.
+
+    Every sequence is padded with zeros up to the length of the longest
+    sequence that is kept.
+
+    If maxlen is set, sequences whose length is not strictly below
+    maxlen are dropped (not truncated), together with their labels.
+
+    This swaps the axes: the returned matrices have shape
+    (longest kept length, number of kept sequences), one sequence
+    per column.
+
+    :param seqs: sequence of sequences of integer word indices.
+    :param labels: one label per entry of seqs.
+    :param maxlen: None, or the exclusive upper bound on sequence length.
+
+    :returns: (x, x_mask, labels). x holds the indices as int64,
+        x_mask is 1. where x holds a real token and 0. on padding
+        (dtype theano.config.floatX). labels is the input object when
+        maxlen is None and a new filtered list otherwise.
+        (None, None, None) is returned when no sequence is left.
+    """
+    if maxlen is not None:
+        # Strict comparison: a sequence of exactly maxlen items is dropped.
+        kept = [(s, y) for s, y in zip(seqs, labels) if len(s) < maxlen]
+        seqs = [s for s, _ in kept]
+        labels = [y for _, y in kept]
+
+    lengths = [len(s) for s in seqs]
+
+    # numpy.max() raises on an empty list, so report "nothing to prepare"
+    # the same way whether or not a maxlen filter was applied.
+    if len(lengths) < 1:
+        return None, None, None
+
+    n_samples = len(seqs)
+    longest = numpy.max(lengths)
+
+    # Allocate with the final dtype directly instead of zeros + astype copy.
+    x = numpy.zeros((longest, n_samples), dtype='int64')
+    x_mask = numpy.zeros((longest, n_samples), dtype=theano.config.floatX)
+    for idx, (s, length) in enumerate(zip(seqs, lengths)):
+        x[:length, idx] = s
+        x_mask[:length, idx] = 1.
+
+    return x, x_mask, labels
+
+
+def get_dataset_file(dataset, default_dataset, origin):
+    '''Resolve dataset to a usable path, downloading it when allowed.
+
+    Lookup order: the path exactly as given, then (for a bare file name
+    only) the "data" directory that sits next to this module's directory.
+
+    The file is downloaded from origin only when it is still missing
+    and its file name equals default_dataset.
+
+    :returns: the resolved path. It is returned even when no file
+        exists there and nothing was downloaded.
+
+    '''
+    data_dir, data_file = os.path.split(dataset)
+    is_default = data_file == default_dataset
+
+    if data_dir == "" and not os.path.isfile(dataset):
+        # Bare file name that is not in the working directory:
+        # fall back to <module dir>/../data/<name>.
+        module_dir = os.path.split(__file__)[0]
+        candidate = os.path.join(module_dir, "..", "data", dataset)
+        if is_default or os.path.isfile(candidate):
+            dataset = candidate
+
+    if is_default and not os.path.isfile(dataset):
+        import urllib
+        print('Downloading data from %s' % origin)
+        urllib.urlretrieve(origin, dataset)
+    return dataset
+
+
+def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None,
+              sort_by_len=True):
+    '''Load the dataset and split it into train, valid and test sets.
+
+    :type path: String
+    :param path: The path to the dataset (here IMDB)
+    :type n_words: int
+    :param n_words: The number of words to keep in the vocabulary.
+        Every word index >= n_words is replaced by unknown (1).
+    :type valid_portion: float
+    :param valid_portion: The proportion of the full train set used for
+        the validation set.
+    :type maxlen: None or positive int
+    :param maxlen: If set, train examples whose length is not strictly
+        below maxlen are dropped before the train/valid split.
+        The test set is not filtered.
+    :type sort_by_len: bool
+    :param sort_by_len: Sort by the sequence length for the train,
+        valid and test set. This allows faster execution as it causes
+        less padding per minibatch. Another mechanism must be used to
+        shuffle the train set at each epoch.
+
+    :returns: (train, valid, test), each an (x, y) tuple of lists.
+
+    The train/valid split is drawn with numpy.random.permutation, so it
+    changes between calls unless numpy.random is seeded by the caller.
+
+    '''
+
+    #############
+    # LOAD DATA #
+    #############
+
+    # Load the dataset
+    path = get_dataset_file(
+        path, "imdb.pkl",
+        "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")
+
+    if path.endswith(".gz"):
+        f = gzip.open(path, 'rb')
+    else:
+        f = open(path, 'rb')
+
+    # NOTE(review): unpickling executes arbitrary code, and this file may
+    # have just been fetched over plain HTTP. Only load trusted files.
+    try:
+        # The file holds two consecutive pickles: train first, then test.
+        train_set_x, train_set_y = cPickle.load(f)
+        test_set_x, test_set_y = cPickle.load(f)
+    finally:
+        # Close the handle even when unpickling fails.
+        f.close()
+
+    if maxlen:
+        kept = [(x, y) for x, y in zip(train_set_x, train_set_y)
+                if len(x) < maxlen]
+        train_set_x = [x for x, _ in kept]
+        train_set_y = [y for _, y in kept]
+        del kept
+
+    # split training set into validation set
+    n_samples = len(train_set_x)
+    sidx = numpy.random.permutation(n_samples)
+    n_train = int(numpy.round(n_samples * (1. - valid_portion)))
+    valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
+    valid_set_y = [train_set_y[s] for s in sidx[n_train:]]
+    train_set_x = [train_set_x[s] for s in sidx[:n_train]]
+    train_set_y = [train_set_y[s] for s in sidx[:n_train]]
+
+    def remove_unk(x):
+        # Map every index outside the kept vocabulary to 1 (unknown).
+        return [[1 if w >= n_words else w for w in sen] for sen in x]
+
+    train_set_x = remove_unk(train_set_x)
+    valid_set_x = remove_unk(valid_set_x)
+    test_set_x = remove_unk(test_set_x)
+
+    def len_argsort(seq):
+        return sorted(range(len(seq)), key=lambda x: len(seq[x]))
+
+    def sort_pair(xs, ys):
+        # Reorder examples and labels together, shortest sequence first.
+        order = len_argsort(xs)
+        return [xs[i] for i in order], [ys[i] for i in order]
+
+    if sort_by_len:
+        test_set_x, test_set_y = sort_pair(test_set_x, test_set_y)
+        valid_set_x, valid_set_y = sort_pair(valid_set_x, valid_set_y)
+        train_set_x, train_set_y = sort_pair(train_set_x, train_set_y)
+
+    train = (train_set_x, train_set_y)
+    valid = (valid_set_x, valid_set_y)
+    test = (test_set_x, test_set_y)
+
+    return train, valid, test
@@ -0,0 +1,123 @@
+"""
+This script created the pickled dataset (imdb.pkl and imdb.dict.pkl).
+
+1) You need to download this file and put it in the same directory as this file.
+https://github.com/moses-smt/mosesdecoder/raw/master/scripts/tokenizer/tokenizer.perl . Give it execution permission.
+
+2) Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/ and extract it, then set `dataset_path` below to the extracted directory (the one containing train/ and test/).
+
+3) Then run this script.
+"""
+
+# Root directory of the extracted dataset; main() reads the train/ and test/
+# sub-directories under it.
+# NOTE(review): hard-coded to a machine-specific location -- edit before running.
+dataset_path='/Tmp/bastienf/aclImdb/'
+
+import numpy
+import cPickle as pkl
+
+from collections import OrderedDict
+
+import glob
+import os
+
+from subprocess import Popen, PIPE
+
+# tokenizer.perl is from Moses: https://github.com/moses-smt/mosesdecoder/tree/master/scripts/tokenizer
+# The command uses the relative path ./tokenizer.perl, so the script must be
+# present (and executable) in the working directory the process runs from.
+tokenizer_cmd = ['./tokenizer.perl', '-l', 'en', '-q', '-']
+
+
+def tokenize(sentences):
+    """Run all sentences through the external tokenizer in one call.
+
+    The sentences are joined with newlines, written to the stdin of
+    tokenizer_cmd, and its stdout is split back on newlines.
+
+    :param sentences: list of strings; each is sent as one input line.
+    :returns: list of output lines, without the last split element
+        (the piece after the final newline).
+    """
+    print 'Tokenizing..',
+    proc = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
+    # communicate() feeds stdin, waits for exit and returns (stdout, stderr);
+    # stderr is not piped, so the second item is unused.
+    stdout_data = proc.communicate("\n".join(sentences))[0]
+    lines = stdout_data.split('\n')
+    del lines[-1]
+    print 'Done'
+
+    return lines
+
+
+def build_dict(path):
+    """Build the word -> index dictionary from a pos/ + neg/ directory.
+
+    The first line of every *.txt file in <path>/pos and <path>/neg is
+    read, tokenized, lower-cased and split on whitespace. Words are then
+    ranked by decreasing count and numbered from 2 upwards; indices 0
+    and 1 are left unassigned (1 stands for unknown words).
+
+    :param path: directory containing the pos/ and neg/ sub-directories.
+        It may be absolute or relative.
+    :returns: dict mapping each word to its integer index (>= 2).
+    """
+    # Glob with joined paths instead of os.chdir(): the working directory
+    # is never changed, so a relative path works for both sub-directories
+    # and nothing has to be restored if reading a file fails.
+    sentences = []
+    for polarity in ('pos', 'neg'):
+        for ff in glob.glob(os.path.join(path, polarity, '*.txt')):
+            with open(ff, 'r') as f:
+                sentences.append(f.readline().strip())
+
+    sentences = tokenize(sentences)
+
+    print 'Building dictionary..',
+    wordcount = dict()
+    for ss in sentences:
+        for w in ss.strip().lower().split():
+            wordcount[w] = wordcount.get(w, 0) + 1
+
+    # values() and keys() of an unmodified dict are in matching order.
+    counts = wordcount.values()
+    keys = wordcount.keys()
+
+    # Positions of the words, most frequent first.
+    sorted_idx = numpy.argsort(counts)[::-1]
+
+    worddict = dict()
+
+    for idx, ss in enumerate(sorted_idx):
+        worddict[keys[ss]] = idx+2  # leave 0 and 1 (UNK)
+
+    print numpy.sum(counts), ' total words ', len(keys), ' unique words'
+
+    return worddict
+
+
+def grab_data(path, dictionary):
+    """Convert the reviews in one directory to lists of word indices.
+
+    The first line of every *.txt file directly inside path is read,
+    tokenized, lower-cased and split on whitespace. Each word is replaced
+    by its index in dictionary, or by 1 when the word is not in it.
+
+    :param path: directory holding the *.txt files.
+    :param dictionary: dict mapping word -> integer index.
+    :returns: list with one list of integer indices per file.
+    """
+    # Glob with a joined path instead of os.chdir(): the working directory
+    # is never changed, so nothing has to be restored if a read fails.
+    sentences = []
+    for ff in glob.glob(os.path.join(path, '*.txt')):
+        with open(ff, 'r') as f:
+            sentences.append(f.readline().strip())
+    sentences = tokenize(sentences)
+
+    # 1 is the index reserved for unknown words.
+    return [[dictionary.get(w, 1) for w in ss.strip().lower().split()]
+            for ss in sentences]
+
+
+def main():
+    """Build the dictionary and write imdb.pkl and imdb.dict.pkl.
+
+    The dictionary is built from the train split only. Positive reviews
+    are labelled 1 and negative reviews 0, positives first.
+
+    imdb.pkl receives two consecutive pickles, (train_x, train_y) then
+    (test_x, test_y); imdb.dict.pkl receives the dictionary. Both files
+    are written to the current working directory.
+    """
+    # Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/
+    path = dataset_path
+    dictionary = build_dict(os.path.join(path, 'train'))
+
+    def load_split(split):
+        # os.path.join works whether or not dataset_path ends with a slash.
+        pos = grab_data(os.path.join(path, split, 'pos'), dictionary)
+        neg = grab_data(os.path.join(path, split, 'neg'), dictionary)
+        return pos + neg, [1] * len(pos) + [0] * len(neg)
+
+    train_x, train_y = load_split('train')
+    test_x, test_y = load_split('test')
+
+    # "with" closes the files even if pickling fails part-way through.
+    with open('imdb.pkl', 'wb') as f:
+        pkl.dump((train_x, train_y), f, -1)
+        pkl.dump((test_x, test_y), f, -1)
+
+    with open('imdb.dict.pkl', 'wb') as f:
+        pkl.dump(dictionary, f, -1)
+
+# Run the preprocessing only when executed as a script, not on import.
+if __name__ == '__main__':
+    main()