 import torch
 import torch.nn as nn
 from data_utils.sentence_utils import Constants
-
+import numpy as np

 def get_mask(lengths):
     seq_lens = lengths.view(-1, 1)
@@ -16,40 +16,35 @@ def get_mask(lengths):
 class CRF_Loss(nn.Module):
     def __init__(self, tagset_size):
         super(CRF_Loss, self).__init__()
-        self.start_tag_idx = tagset_size
-        self.stop_tag_idx = tagset_size + 1
+        self.start_tag = tagset_size
+        self.end_tag = tagset_size + 1
         self.num_tags = tagset_size + 2

-        # transition from y_i-1 to y_i, T[y_i, y_j] = y_i <= y_j
-        # +2 added for start and end indices
         self.transitions = nn.Parameter(torch.Tensor(self.num_tags, self.num_tags))
-        nn.init.uniform_(self.transitions, -0.1, 0.1)
+        # initialize all transitions to log(1 / num_tags), i.e. uniform in log space
+        nn.init.constant_(self.transitions, -np.log(self.num_tags))

-        # no transition to start_tag, no transition from end tag
-        self.transitions.data[self.start_tag_idx, :] = -10000
-        self.transitions.data[:, self.stop_tag_idx] = -10000
+        # forbid transitions out of the end tag and into the start tag
+        self.transitions.data[self.end_tag, :] = -10000
+        self.transitions.data[:, self.start_tag] = -10000

     def get_log_p_z(self, emissions, mask, seq_len):
         log_alpha = emissions[:, 0].clone()
-        log_alpha += self.transitions[self.start_tag_idx, : self.start_tag_idx].unsqueeze(0)
+        log_alpha += self.transitions[self.start_tag, : self.start_tag].unsqueeze(0)

         for idx in range(1, seq_len):
             broadcast_emissions = emissions[:, idx].unsqueeze(1)
-            broadcast_transitions = self.transitions[
-                : self.start_tag, : self.start_tag
-            ].unsqueeze(0)
+            broadcast_transitions = self.transitions[: self.start_tag, : self.start_tag].unsqueeze(0)
             broadcast_logprob = log_alpha.unsqueeze(2)
             score = broadcast_logprob + broadcast_emissions + broadcast_transitions

             score = torch.logsumexp(score, 1)
-            log_alpha = score * mask[:, idx].unsqueeze(1) + log_alpha.squeeze(1) * (
-                1.0 - mask[:, idx].unsqueeze(1)
-            )
+            log_alpha = score * mask[:, idx].unsqueeze(1) + log_alpha.squeeze(1) * (1.0 - mask[:, idx].unsqueeze(1))

         log_alpha += self.transitions[: self.start_tag, self.end_tag].unsqueeze(0)
         return torch.logsumexp(log_alpha.squeeze(1), 1)

     def get_log_p_Y_X(self, emissions, mask, seq_len, tags):
+        tags = tags.clone()  # clone first so the caller's tags are not mutated in place
+        tags[tags < 0] = 0   # padded positions get a valid index; the mask zeroes them out
+
         llh = self.transitions[self.start_tag, tags[:, 0]].unsqueeze(1)
         llh += emissions[:, 0, :].gather(1, tags[:, 0].view(-1, 1)) * mask[:, 0].unsqueeze(1)
@@ -79,20 +74,14 @@ def log_likelihood(self, emissions, tags):
     def forward(self, emissions, tags):
         return self.log_likelihood(emissions, tags)

-    def inference(self, emissions, lengths):
-        return self.viterbi_decode(emissions, lengths)
-
     def viterbi_decode(self, emissions, lengths):
         mask = get_mask(lengths)
         seq_len = emissions.shape[1]

         log_prob = emissions[:, 0].clone()
         log_prob += self.transitions[self.start_tag, : self.start_tag].unsqueeze(0)

-
-        end_scores = log_prob + self.transitions[
-            : self.start_tag, self.end_tag
-        ].unsqueeze(0)
+        end_scores = log_prob + self.transitions[: self.start_tag, self.end_tag].unsqueeze(0)

         best_scores_list = []
         best_scores_list.append(end_scores.unsqueeze(1))
@@ -101,20 +90,12 @@ def viterbi_decode(self, emissions, lengths):
         for idx in range(1, seq_len):
             broadcast_emissions = emissions[:, idx].unsqueeze(1)
-            broadcast_transmissions = self.transitions[
-                : self.start_tag, : self.start_tag
-            ].unsqueeze(0)
+            broadcast_transmissions = self.transitions[: self.start_tag, : self.start_tag].unsqueeze(0)
             broadcast_log_prob = log_prob.unsqueeze(2)
-
             score = broadcast_emissions + broadcast_transmissions + broadcast_log_prob
-
             max_scores, max_score_indices = torch.max(score, 1)
-
             best_paths_list.append(max_score_indices.unsqueeze(1))
-
-            end_scores = max_scores + self.transitions[
-                : self.start_tag, self.end_tag
-            ].unsqueeze(0)
+            end_scores = max_scores + self.transitions[: self.start_tag, self.end_tag].unsqueeze(0)

             best_scores_list.append(end_scores.unsqueeze(1))
             log_prob = max_scores
@@ -128,48 +109,25 @@ def viterbi_decode(self, emissions, lengths):
         padding_tensor = torch.tensor(Constants.PAD_ID).long()

         labels = max_indices_from_scores[:, seq_len - 1]
-        labels = self._mask_tensor(labels, 1.0 - mask[:, seq_len - 1], padding_tensor)
-
+        # torch.where needs a bool condition, so the 0/1 mask is compared explicitly
+        labels = torch.where(mask[:, seq_len - 1] == 0, padding_tensor, labels)
         all_labels = labels.unsqueeze(1).long()

         for idx in range(seq_len - 2, -1, -1):
             indices_for_lookup = all_labels[:, -1].clone()
-            indices_for_lookup = torch.where(
-                indices_for_lookup == self.ignore_index,
-                valid_index_tensor,
-                indices_for_lookup
-            )
+            indices_for_lookup = torch.where(indices_for_lookup == self.ignore_index,
+                                             valid_index_tensor, indices_for_lookup)

-            indices_from_prev_pos = (
-                best_paths[:, idx, :]
-                .gather(1, indices_for_lookup.view(-1, 1).long())
-                .squeeze(1)
-            )
-            indices_from_prev_pos = torch.where(
-                (1.0 - mask[:, idx + 1]),
-                padding_tensor,
-                indices_from_prev_pos
-            )
+            indices_from_prev_pos = best_paths[:, idx, :].gather(1, indices_for_lookup.view(-1, 1).long()).squeeze(1)
+            indices_from_prev_pos = torch.where(mask[:, idx + 1] == 0, padding_tensor, indices_from_prev_pos)

             indices_from_max_scores = max_indices_from_scores[:, idx]
-            indices_from_max_scores = torch.where(
-                mask[:, idx + 1],
-                padding_tensor,
-                indices_from_max_scores
-            )
+            # if the next step is valid, force the backpointer result to win below
+            indices_from_max_scores = torch.where(mask[:, idx + 1] == 1, padding_tensor, indices_from_max_scores)

-            labels = torch.where(
-                indices_from_max_scores == self.ignore_index,
-                indices_from_prev_pos,
-                indices_from_max_scores,
-            )
+            labels = torch.where(indices_from_max_scores == self.ignore_index,
+                                 indices_from_prev_pos, indices_from_max_scores)

             # Set to ignore_index if present state is not valid.
-            labels = torch.where(
-                (1 - mask[:, idx]),
-                padding_tensor,
-                labels
-            )
+            labels = torch.where(mask[:, idx] == 0, padding_tensor, labels)

             all_labels = torch.cat((all_labels, labels.view(-1, 1).long()), 1)

         return best_scores, torch.flip(all_labels, [1])
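
For orientation, here is a minimal usage sketch of CRF_Loss after this change. The shapes and values are assumptions read off the diff, not the repository's own tests: emissions are taken to score only the real tags, (batch, seq_len, tagset_size), since the start/end columns exist only in the transition matrix; padded tag positions use -1; and lengths is a per-sequence length tensor as get_mask expects. The elided body of log_likelihood is assumed to take exactly (emissions, tags).

    import torch

    tagset_size = 5
    crf = CRF_Loss(tagset_size)

    batch, seq_len = 2, 7
    emissions = torch.randn(batch, seq_len, tagset_size)    # scores over real tags only
    tags = torch.randint(0, tagset_size, (batch, seq_len))
    tags[1, 4:] = -1                                        # second sequence has length 4

    loss = crf(emissions, tags)                             # forward() delegates to log_likelihood()

    lengths = torch.tensor([7, 4])
    best_scores, best_paths = crf.viterbi_decode(emissions, lengths)  # best_paths: (batch, seq_len)

On the initialization change itself: -np.log(self.num_tags) gives every transition the log-probability of a uniform next-tag distribution, which tends to be a gentler starting point than small random values from nn.init.uniform_.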