Commit f60d059

perceptron loss
1 parent 01e3776 commit f60d059

4 files changed, +94 -78 lines changed


neural_ner/crf.py

Lines changed: 22 additions & 22 deletions

@@ -5,7 +5,6 @@
 from data_utils.sentence_utils import Constants
 import numpy as np

-
 class CRF_Loss(nn.Module):
     def __init__(self, tagset_size, config):
         super(CRF_Loss, self).__init__()
@@ -20,7 +19,8 @@ def __init__(self, tagset_size, config):
         self.transitions.data[:, self.start_tag] = -10000
         self.config = config

-    def get_log_p_z(self, emissions, mask, seq_len):
+    def get_log_p_z(self, emissions, mask):
+        seq_len = emissions.shape[1]
         log_alpha = emissions[:, 0].clone()
         log_alpha += self.transitions[self.start_tag, : self.start_tag].unsqueeze(0)

@@ -36,7 +36,8 @@ def get_log_p_z(self, emissions, mask, seq_len):
         log_alpha += self.transitions[: self.start_tag, self.end_tag].unsqueeze(0)
         return torch.logsumexp(log_alpha.squeeze(1), 1)

-    def get_log_p_Y_X(self, emissions, mask, seq_len, orig_tags):
+    def get_log_p_Y_X(self, emissions, mask, orig_tags):
+        seq_len = emissions.shape[1]
         tags = orig_tags.clone()
         tags[tags < 0] = 0

@@ -61,16 +62,11 @@ def get_log_p_Y_X(self, emissions, mask, seq_len, orig_tags):

     def log_likelihood(self, emissions, tags):
         mask = tags.ne(Constants.TAG_PAD_ID).float()
-        seq_len = emissions.shape[1]
-        log_z = self.get_log_p_z(emissions, mask, seq_len)
-        log_p_y_x = self.get_log_p_Y_X(emissions, mask, seq_len, tags)
+        log_z = self.get_log_p_z(emissions, mask)
+        log_p_y_x = self.get_log_p_Y_X(emissions, mask, tags)
         return log_p_y_x - log_z

-    def forward(self, emissions, tags):
-        return self.log_likelihood(emissions, tags)
-
-    def viterbi_decode(self, emissions, lengths):
-        mask = self.get_mask(lengths)
+    def viterbi_decode(self, emissions, mask):
         seq_len = emissions.shape[1]

         log_prob = emissions[:, 0].clone()
@@ -101,7 +97,7 @@ def viterbi_decode(self, emissions, lengths):
         best_scores = torch.cat(best_scores_list, 1).float()
         best_paths = torch.cat(best_paths_list, 1)

-        _, max_indices_from_scores = torch.max(best_scores, 2)
+        max_scores, max_indices_from_scores = torch.max(best_scores, 2)

         valid_index_tensor = torch.tensor(0).long()
         padding_tensor = torch.tensor(Constants.TAG_PAD_ID).long()
@@ -132,14 +128,18 @@ def viterbi_decode(self, emissions, lengths):
             labels = torch.where(mask[:, idx] != 1.0, padding_tensor, labels)
             all_labels = torch.cat((all_labels, labels.view(-1, 1).long()), 1)

-        return best_scores, torch.flip(all_labels, [1])
+        last_tag_indices = mask.sum(1, dtype=torch.long) - 1
+        sentence_score = max_scores.gather(1, last_tag_indices.view(-1, 1)).squeeze(1)
+
+        return sentence_score, torch.flip(all_labels, [1])

-    def get_mask(self, lengths):
-        seq_lens = lengths.view(-1, 1)
-        max_len = torch.max(seq_lens)
-        range_tensor = torch.arange(max_len).unsqueeze(0)
-        range_tensor = range_tensor.expand(seq_lens.size(0), range_tensor.size(1))
-        if self.config.is_cuda:
-            range_tensor = range_tensor.cuda()
-        mask = (range_tensor < seq_lens).float()
-        return mask
+    def structural_perceptron_loss(self, emissions, tags):
+        mask = tags.ne(Constants.TAG_PAD_ID).float()
+
+        best_scores, pred = self.viterbi_decode(emissions, mask)
+        log_p_y_x = self.get_log_p_Y_X(emissions, mask, tags)
+
+        delta = torch.sum(tags.ne(pred).float()*mask, 1)
+
+        margin_loss = torch.clamp(best_scores + delta - log_p_y_x, min=0.0)
+        return margin_loss
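
Note: the new structural_perceptron_loss is a structured hinge (max-margin) objective. The Viterbi score of the decoded path, plus a Hamming cost counting mislabeled non-padding tokens, is compared against the score of the gold sequence, and only the positive part contributes to the loss. A minimal standalone sketch of that margin computation; the tensors and names below are invented for illustration, not taken from the repo:

import torch

# Toy batch of two sentences (values are made up).
pred_score = torch.tensor([12.3, 8.1])   # score of the best Viterbi path, per sentence
gold_score = torch.tensor([11.9, 9.0])   # score of the gold tag sequence, per sentence
hamming = torch.tensor([2.0, 0.0])       # count of mislabeled, non-padding tokens

# Structured hinge: max(0, score(y_hat) + Delta(y, y_hat) - score(y))
margin_loss = torch.clamp(pred_score + hamming - gold_score, min=0.0)
print(margin_loss)  # tensor([2.4000, 0.0000]); the second sentence already satisfies the margin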

neural_ner/model.py

Lines changed: 8 additions & 54 deletions

@@ -9,6 +9,7 @@
 import numpy as np

 from crf import CRF_Loss
+from model_utils import get_mask, init_lstm_wt, init_linear_wt, get_word_embd

 print('pytorch version', torch.__version__)

@@ -19,53 +20,8 @@
 if torch.cuda.is_available():
     torch.cuda.manual_seed_all(123)

-def init_lstm_wt(lstm):
-    for names in lstm._all_weights:
-        for name in names:
-            if name.startswith('weight_'):
-                wt = getattr(lstm, name)
-                drange = np.sqrt(6. / (np.sum(wt.size())))
-                wt.data.uniform_(-drange, drange)
-
-            elif name.startswith('bias_'):
-                # set forget bias to 1
-                bias = getattr(lstm, name)
-                n = bias.size(0)
-                start, end = n // 4, n // 2
-                bias.data.fill_(0.)
-                bias.data[start:end].fill_(1.)
-
-def init_linear_wt(linear):
-    drange = np.sqrt(6. / (np.sum(linear.weight.size())))
-    linear.weight.data.uniform_(-drange, drange)
-
-    if linear.bias is not None:
-        linear.bias.data.fill_(0.)
-
-def get_word_embd(vocab, config):
-    word_to_vector = vocab.glove_vectors
-    word_emb_matrix = np.random.uniform(low=-1.0, high=1.0,
-                                        size=(len(vocab.word_to_id), config.word_emdb_dim))
-    pretrained_init = 0
-    for w, wid in vocab.word_to_id.items():
-        if w in word_to_vector:
-            word_emb_matrix[wid, :] = word_to_vector[w]
-            pretrained_init += 1
-
-    "Total words %i (%i pretrained initialization)" % (
-        len(vocab.word_to_id), pretrained_init
-    )
-    return word_emb_matrix
-
-def test_one_batch(batch, model):
-    model.eval()
-    logits = model(batch)
-    lengths = batch['words_lens']
-    pred = model.predict(logits, lengths)
-    return logits, pred
-
 def get_model(vocab, config, model_file_path, is_eval=False):
-    model = NER_SOFTMAX_CHAR_CRF(vocab, config)
+    model = NER_SOFTMAX_CHAR_CRF(vocab, config)

     if is_eval:
         model = model.eval()
@@ -194,18 +150,16 @@ def forward(self, batch):
         emissions = self.featurizer(batch)
         return emissions

-    def crf_loss(self, emissions, target):
-        a = self.crf(emissions, target)
-        loss = -1 * a
-        loss = loss.mean()
-        return loss
-
     def get_loss(self, logits, y, s_lens):
-        loss = self.crf_loss(logits, y)
+        #loss = -1 * self.crf.log_likelihood(logits, y)
+        loss = self.crf.structural_perceptron_loss(logits, y)
+        loss = loss / s_lens.float()
+        loss = loss.mean()
         if self.config.is_l2_loss:
             loss += self.get_l2_loss()
         return loss

     def predict(self, emissions, lengths):
-        best_scores, pred = self.crf.viterbi_decode(emissions, lengths)
+        mask = get_mask(lengths, self.config)
+        best_scores, pred = self.crf.viterbi_decode(emissions, mask)
         return pred
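
Note: get_loss now normalizes the per-sentence loss by sentence length before taking the batch mean. A quick sketch of that normalization with invented numbers:

import torch

per_sentence_loss = torch.tensor([4.0, 0.0, 3.0])  # e.g. output of structural_perceptron_loss
s_lens = torch.tensor([8, 5, 10])                   # tokens per sentence

loss = (per_sentence_loss / s_lens.float()).mean()  # length-normalize, then average over the batch
print(loss)  # tensor(0.2667)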

neural_ner/model_utils.py

Lines changed: 55 additions & 0 deletions

@@ -0,0 +1,55 @@
+from __future__ import unicode_literals, print_function, division
+
+import torch
+import numpy as np
+
+def get_mask(lengths, config):
+    seq_lens = lengths.view(-1, 1)
+    max_len = torch.max(seq_lens)
+    range_tensor = torch.arange(max_len).unsqueeze(0)
+    range_tensor = range_tensor.expand(seq_lens.size(0), range_tensor.size(1))
+    if config.is_cuda:
+        range_tensor = range_tensor.cuda()
+    mask = (range_tensor < seq_lens).float()
+    return mask
+
+def init_lstm_wt(lstm):
+    for names in lstm._all_weights:
+        for name in names:
+            if name.startswith('weight_'):
+                wt = getattr(lstm, name)
+                drange = np.sqrt(6. / (np.sum(wt.size())))
+                wt.data.uniform_(-drange, drange)
+
+            elif name.startswith('bias_'):
+                # set forget bias to 1
+                bias = getattr(lstm, name)
+                n = bias.size(0)
+                start, end = n // 4, n // 2
+                bias.data.fill_(0.)
+                bias.data[start:end].fill_(1.)
+
+
+def init_linear_wt(linear):
+    drange = np.sqrt(6. / (np.sum(linear.weight.size())))
+    linear.weight.data.uniform_(-drange, drange)
+
+    if linear.bias is not None:
+        linear.bias.data.fill_(0.)
+
+
+def get_word_embd(vocab, config):
+    word_to_vector = vocab.glove_vectors
+    word_emb_matrix = np.random.uniform(low=-1.0, high=1.0,
+                                        size=(len(vocab.word_to_id), config.word_emdb_dim))
+    pretrained_init = 0
+    for w, wid in vocab.word_to_id.items():
+        if w in word_to_vector:
+            word_emb_matrix[wid, :] = word_to_vector[w]
+            pretrained_init += 1
+
+    "Total words %i (%i pretrained initialization)" % (
+        len(vocab.word_to_id), pretrained_init
+    )
+    return word_emb_matrix
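
Note: get_mask builds a float padding mask by comparing a position index against each sentence length. A small usage sketch; the config stub with only is_cuda is a hypothetical stand-in for the repo's real config object:

import torch
from model_utils import get_mask

class _Cfg:
    is_cuda = False  # hypothetical minimal config; the real one carries more fields

lengths = torch.tensor([3, 1, 2])
print(get_mask(lengths, _Cfg()))
# tensor([[1., 1., 1.],
#         [1., 0., 0.],
#         [1., 1., 0.]])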

neural_ner/process_training.py

Lines changed: 9 additions & 2 deletions

@@ -9,7 +9,7 @@

 from data_utils.batcher import DatasetConll2003
 from data_utils.vocab import Vocab
-from model import get_model, test_one_batch
+from model import get_model
 from train_utils import setup_train_dir, save_model, write_summary, \
     get_param_norm, get_grad_norm, Evaluter

@@ -22,6 +22,13 @@ def __init__(self, config, model_file_path):
         self.vocab = Vocab(config)
         self.model = get_model(self.vocab, config, model_file_path)

+    def test_one_batch(self, batch):
+        self.model.eval()
+        logits = self.model(batch)
+        lengths = batch['words_lens']
+        pred = self.model.predict(logits, lengths)
+        return logits, pred
+
     def train_one_batch(self, batch, optimizer, params):
         self.model.train()
         optimizer.zero_grad()
@@ -126,7 +133,7 @@ def evaluate(self, data_type, num_samples=None):
             s_lengths = batch['words_lens']
             y = batch['tags']

-            logits, pred = test_one_batch(batch, self.model)
+            logits, pred = self.test_one_batch(batch)
             loss = self.model.get_loss(logits, y, s_lengths)

             curr_batch_size = len(batch['raw_sentence'])
