Commit 01e3776

Merge commit with 2 parents: b28ff54 + 1ae02e9

File tree

- README.md
- neural_ner/config.py
- neural_ner/crf.py
- neural_ner/model.py
- neural_ner/process_training.py

5 files changed: +36 -21 lines

README.md (+9, -4)

````diff
@@ -5,17 +5,20 @@
 - - [x] conll2003
 - - [ ] atis
 ### Neural NER
-- - [ ] CharLSTM+WordLSTM+CRF: [Lample .etc, NAACL16](http://www.aclweb.org/anthology/N/N16/N16-1030.pdf)
+- - [x] CharLSTM+WordLSTM+CRF: [Lample .etc, NAACL16](http://www.aclweb.org/anthology/N/N16/N16-1030.pdf)
 - - [x] Make a CoNLL-2003 batcher
 - - [x] Implement trainer
 - - [x] Implement WordLSTM + softmax
 - - [x] Implement CharLSTM + WordLSTM + softmax
-- - [ ] Implement WordLSTM + CRF
-- - [ ] Implement CharLSTM + WordLSTM + CRF
+- - [x] Implement CharLSTM + WordLSTM + CRF
 
 ### Slot Filling + intent prediciton
 - - [ ] [Attention-Based Recurrent Neural Network Models for Joint Intent Detection and Slot Filling](https://arxiv.org/abs/1609.01454)
-
+- - [ ] Make a ATIS batcher
+- - [ ] Implement trainer
+- - [ ] Implement slot filler
+- - [ ] Implement intent
+
 ### Tree VAE
 - - [ ] [STRUCTVAE: Tree-structured Latent Variable Models for Semi-supervised Semantic Parsing](https://arxiv.org/abs/1806.07832)
 
@@ -25,3 +28,5 @@ conda install pytorch -c pytorch
 
 ```
 CoNLL-2003 can be downloaded from https://www.clips.uantwerpen.be/conll2003/ner/
+
+ATIS dataset can be downloaded from [split 0](http://lisaweb.iro.umontreal.ca/transfert/lisa/users/mesnilgr/atis/atis.fold0.pkl.gz) [split 1](http://lisaweb.iro.umontreal.ca/transfert/lisa/users/mesnilgr/atis/atis.fold1.pkl.gz) [split 2](http://lisaweb.iro.umontreal.ca/transfert/lisa/users/mesnilgr/atis/atis.fold2.pkl.gz) [split 3](http://lisaweb.iro.umontreal.ca/transfert/lisa/users/mesnilgr/atis/atis.fold3.pkl.gz) [split 4](http://lisaweb.iro.umontreal.ca/transfert/lisa/users/mesnilgr/atis/atis.fold4.pkl.gz)
````
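The ATIS links added above point to gzip-compressed pickle archives. A minimal loading sketch for one fold is below; the file name and the latin1 encoding (needed for pickles written under Python 2) are assumptions for illustration, not part of this commit:

```python
import gzip
import pickle

# Load one downloaded ATIS fold. The archive is a gzip-compressed pickle;
# encoding='latin1' lets Python 3 read a pickle written by Python 2.
with gzip.open('atis.fold0.pkl.gz', 'rb') as f:
    data = pickle.load(f, encoding='latin1')

# Inspect the structure before wiring it into a batcher.
print(type(data))
```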

neural_ner/config.py (+3, -3)

```diff
@@ -4,7 +4,7 @@ class Config(object):
     pass
 
 config = Config()
-root_dir = os.path.join(os.path.expanduser('~'), 'Downloads/sequence_prediction')
+root_dir = os.path.join(os.path.expanduser('~'), 'sequence_prediction')
 config.data_dir = os.path.join(root_dir, 'CoNLL-2003')
 config.log_root = os.path.join(root_dir, 'log')
 
@@ -40,6 +40,6 @@ class Config(object):
 
 config.vocab_size = int(4e5)
 
-config.is_cuda = False
+config.is_cuda = True
 
-config.is_l2_loss = False
+config.is_l2_loss = False
```
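Besides moving root_dir out of Downloads, this commit flips config.is_cuda to True; the CRF code below consults that flag before moving tensors to the GPU. A minimal sketch of the pattern, using a hypothetical to_device helper and stand-in config that are not part of the repository:

```python
import torch

def to_device(tensor, config):
    # Tensors go to the GPU only when config.is_cuda is set, mirroring the
    # hand-written checks in crf.py; CPU-only machines keep working.
    return tensor.cuda() if config.is_cuda else tensor

class DummyConfig:
    # Stand-in for the repository's Config object; is_cuda is the only field used here.
    is_cuda = torch.cuda.is_available()

padding_tensor = to_device(torch.tensor(0).long(), DummyConfig)
```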

neural_ner/crf.py (+20, -10)

```diff
@@ -5,16 +5,9 @@
 from data_utils.sentence_utils import Constants
 import numpy as np
 
-def get_mask(lengths):
-    seq_lens = lengths.view(-1, 1)
-    max_len = torch.max(seq_lens)
-    range_tensor = torch.arange(max_len).unsqueeze(0)
-    range_tensor = range_tensor.expand(seq_lens.size(0), range_tensor.size(1))
-    mask = (range_tensor < seq_lens).float()
-    return mask
 
 class CRF_Loss(nn.Module):
-    def __init__(self, tagset_size):
+    def __init__(self, tagset_size, config):
         super(CRF_Loss, self).__init__()
         self.start_tag = tagset_size
         self.end_tag = tagset_size + 1
@@ -25,6 +18,7 @@ def __init__(self, tagset_size):
 
         self.transitions.data[self.end_tag, :] = -10000
         self.transitions.data[:, self.start_tag] = -10000
+        self.config = config
 
     def get_log_p_z(self, emissions, mask, seq_len):
         log_alpha = emissions[:, 0].clone()
@@ -76,7 +70,7 @@ def forward(self, emissions, tags):
         return self.log_likelihood(emissions, tags)
 
     def viterbi_decode(self, emissions, lengths):
-        mask = get_mask(lengths)
+        mask = self.get_mask(lengths)
         seq_len = emissions.shape[1]
 
         log_prob = emissions[:, 0].clone()
@@ -87,7 +81,10 @@ def viterbi_decode(self, emissions, lengths):
         best_scores_list = []
         best_scores_list.append(end_scores.unsqueeze(1))
 
-        best_paths_list = [torch.Tensor().long()]
+        best_paths_0 = torch.Tensor().long()
+        if self.config.is_cuda:
+            best_paths_0 = best_paths_0.cuda()
+        best_paths_list = [best_paths_0]
 
         for idx in range(1, seq_len):
             broadcast_emissions = emissions[:, idx].unsqueeze(1)
@@ -108,6 +105,10 @@ def viterbi_decode(self, emissions, lengths):
 
         valid_index_tensor = torch.tensor(0).long()
         padding_tensor = torch.tensor(Constants.TAG_PAD_ID).long()
+
+        if self.config.is_cuda:
+            valid_index_tensor = valid_index_tensor.cuda()
+            padding_tensor = padding_tensor.cuda()
 
         labels = max_indices_from_scores[:, seq_len - 1]
         labels = torch.where(mask[:, seq_len - 1] != 1.0, padding_tensor, labels)
@@ -133,3 +134,12 @@ def viterbi_decode(self, emissions, lengths):
 
         return best_scores, torch.flip(all_labels, [1])
 
+    def get_mask(self, lengths):
+        seq_lens = lengths.view(-1, 1)
+        max_len = torch.max(seq_lens)
+        range_tensor = torch.arange(max_len).unsqueeze(0)
+        range_tensor = range_tensor.expand(seq_lens.size(0), range_tensor.size(1))
+        if self.config.is_cuda:
+            range_tensor = range_tensor.cuda()
+        mask = (range_tensor < seq_lens).float()
+        return mask
```
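get_mask, now a method so it can see config.is_cuda, builds a padding mask by broadcasting an arange over the batch and comparing it with each sequence length. A CPU-only sketch of the same computation (using .item() rather than passing a 0-dim tensor to torch.arange):

```python
import torch

# Two sequences of length 3 and 1, as get_mask would receive them.
lengths = torch.tensor([3, 1])
seq_lens = lengths.view(-1, 1)                      # shape (batch, 1)
max_len = torch.max(seq_lens).item()                # longest sequence: 3
range_tensor = torch.arange(max_len).unsqueeze(0)   # [[0, 1, 2]]
range_tensor = range_tensor.expand(seq_lens.size(0), range_tensor.size(1))
mask = (range_tensor < seq_lens).float()
# tensor([[1., 1., 1.],
#         [1., 0., 0.]])
```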

neural_ner/model.py (+2, -2)

```diff
@@ -187,7 +187,7 @@ def __init__(self, vocab, config):
         super(NER_SOFTMAX_CHAR_CRF, self).__init__()
 
         self.featurizer = NER_SOFTMAX_CHAR(vocab, config)
-        self.crf = CRF_Loss(len(vocab.id_to_tag))
+        self.crf = CRF_Loss(len(vocab.id_to_tag), config)
         self.config = config
 
     def forward(self, batch):
@@ -208,4 +208,4 @@ def get_loss(self, logits, y, s_lens):
 
     def predict(self, emissions, lengths):
         best_scores, pred = self.crf.viterbi_decode(emissions, lengths)
-        return pred
+        return pred
```

neural_ner/process_training.py (+2, -2)

```diff
@@ -44,7 +44,7 @@ def train(self):
         train_dir, summary_writer = setup_train_dir(self.config)
 
         params = list(filter(lambda p: p.requires_grad, self.model.parameters()))
-        optimizer = Adam(params, lr=0.001, amsgrad=True)
+        optimizer = Adam(params, amsgrad=True)
 
         num_params = sum(p.numel() for p in params)
         logging.info("Number of params: %d" % num_params)
@@ -73,7 +73,7 @@ def train(self):
                 logging.info(
                     'epoch %d, iter %d, loss %.5f, smoothed loss %.5f, grad norm %.5f, param norm %.5f, batch time %.3f' %
                     (epoch, global_step, train_loss, exp_loss, grad_norm, param_norm, iter_time))
-
+
             if pre_epoch < epoch:
                 epoch_toc = time.time()
                 logging.info("End of epoch %i. Time for epoch: %f" % (epoch, epoch_toc - epoch_tic))
```
