
Commit 40fe60a

Author: zysite
Message: Trivial modifications
Parent: 66d650f

3 files changed: 32 additions & 45 deletions


parser/metric.py

Lines changed: 15 additions & 28 deletions
@@ -3,39 +3,17 @@
 
 class Metric(object):
 
-    def __lt__(self, other):
-        return self.score < other
-
-    def __le__(self, other):
-        return self.score <= other
-
-    def __eq__(self, other):
-        return self.score == other
-
-    def __ge__(self, other):
-        return self.score >= other
-
-    def __gt__(self, other):
-        return self.score > other
-
-    def __ne__(self, other):
-        return self.score != other
-
-    @property
-    def score(self):
-        raise AttributeError
-
-
-class AttachmentMethod(Metric):
-
     def __init__(self, eps=1e-5):
-        super(AttachmentMethod, self).__init__()
+        super(Metric, self).__init__()
 
         self.eps = eps
         self.total = 0.0
         self.correct_arcs = 0.0
         self.correct_rels = 0.0
 
+    def __repr__(self):
+        return f"UAS: {self.uas:.2%} LAS: {self.las:.2%}"
+
     def __call__(self, pred_arcs, pred_rels, gold_arcs, gold_rels):
         arc_mask = pred_arcs.eq(gold_arcs)
         rel_mask = pred_rels.eq(gold_rels) & arc_mask
@@ -44,8 +22,17 @@ def __call__(self, pred_arcs, pred_rels, gold_arcs, gold_rels):
         self.correct_arcs += arc_mask.sum().item()
         self.correct_rels += rel_mask.sum().item()
 
-    def __repr__(self):
-        return f"UAS: {self.uas:.2%} LAS: {self.las:.2%}"
+    def __lt__(self, other):
+        return self.score < other
+
+    def __le__(self, other):
+        return self.score <= other
+
+    def __ge__(self, other):
+        return self.score >= other
+
+    def __gt__(self, other):
+        return self.score > other
 
     @property
     def score(self):
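
Since the rich-comparison operators now live on Metric itself, a metric object can be compared directly against a plain number when tracking the best epoch. A minimal, self-contained sketch of the same pattern (TinyMetric and its fields are illustrative, not the repository's API):

import torch


class TinyMetric(object):
    """Toy stand-in for the refactored Metric class (not the repo's code)."""

    def __init__(self, eps=1e-5):
        self.eps = eps
        self.total = 0.0
        self.correct_arcs = 0.0

    def __call__(self, pred_arcs, gold_arcs):
        mask = pred_arcs.eq(gold_arcs)
        self.total += len(mask)
        self.correct_arcs += mask.sum().item()

    def __gt__(self, other):
        # comparisons delegate to the scalar score,
        # so `metric > best` reads naturally in a training loop
        return self.score > other

    @property
    def score(self):
        # eps guards against division by zero before any batch is seen
        return self.correct_arcs / (self.total + self.eps)


metric = TinyMetric()
metric(torch.tensor([2, 0, 2, 5, 3]), torch.tensor([2, 0, 2, 5, 5]))

best = 0.0
if metric > best:  # True here: 4 of 5 arcs match
    best = metric.score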

parser/modules/bilstm.py

Lines changed: 2 additions & 3 deletions
@@ -46,8 +46,8 @@ def layer_forward(self, x, hx, cell, batch_sizes, reverse=False):
         hid_mask = SharedDropout.get_mask(h, self.dropout)
 
         for t in steps:
-            batch_size = batch_sizes[t]
-            if len(h) < batch_size:
+            last_batch_size, batch_size = len(h), batch_sizes[t]
+            if last_batch_size < batch_size:
                 h = torch.cat((h, init_h[last_batch_size:batch_size]))
                 c = torch.cat((c, init_c[last_batch_size:batch_size]))
             else:
@@ -57,7 +57,6 @@ def layer_forward(self, x, hx, cell, batch_sizes, reverse=False):
             output.append(h)
             if self.training:
                 h = h * hid_mask[:batch_size]
-            last_batch_size = batch_size
         if reverse:
             output.reverse()
         output = torch.cat(output)
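
These two hunks fold the trailing `last_batch_size = batch_size` bookkeeping into the loop header: `h` always carries the hidden states of the previous timestep, so `len(h)` is that step's batch size and can simply be read back. For context, a small sketch (not from the repository) of what `batch_sizes` contains for a packed batch:

import torch
from torch.nn.utils.rnn import pack_sequence

# Three sequences of lengths 4, 2 and 1; batch_sizes[t] is the number
# of sequences still alive at timestep t, which is what the loop above
# reads as batch_sizes[t].
packed = pack_sequence([torch.ones(4), torch.ones(2), torch.ones(1)])
print(packed.batch_sizes)  # tensor([3, 2, 1, 1])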

parser/utils/vocab.py

Lines changed: 15 additions & 14 deletions
@@ -32,11 +32,10 @@ def __init__(self, words, tags, rels):
         self.n_train_words = self.n_words
 
     def __repr__(self):
-        info = f"{self.__class__.__name__}(\n"
-        info += f" num of words: {self.n_words}\n"
-        info += f" num of tags: {self.n_tags}\n"
-        info += f" num of rels: {self.n_rels}\n"
-        info += f")"
+        info = f"{self.__class__.__name__}: "
+        info += f"{self.n_words} words, "
+        info += f"{self.n_tags} tags, "
+        info += f"{self.n_rels} rels"
 
         return info

@@ -55,20 +54,20 @@ def rel2id(self, sequence):
     def id2rel(self, ids):
         return [self.rels[i] for i in ids]
 
-    def read_embeddings(self, embed, unk=None):
-        words = embed.words
-        # if the UNK token has existed in pretrained vocab,
-        # then replace it with a self-defined one
-        if unk in embed:
-            words[words.index(unk)] = self.UNK
+    def read_embeddings(self, embed, smooth=True):
+        # if the UNK token has existed in the pretrained,
+        # then use it to replace the one in the vocab
+        if embed.unk:
+            self.UNK = embed.unk
 
-        self.extend(words)
+        self.extend(embed.tokens)
         self.embeddings = torch.zeros(self.n_words, embed.dim)
 
         for i, word in enumerate(self.words):
             if word in embed:
                 self.embeddings[i] = embed[word]
-        self.embeddings /= torch.std(self.embeddings)
+        if smooth:
+            self.embeddings /= torch.std(self.embeddings)
 
     def extend(self, words):
         self.words.extend(sorted(set(words).difference(self.word_dict)))
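
The new `smooth` flag makes the variance rescaling optional rather than unconditional. Dividing the pretrained matrix by its own standard deviation brings it to roughly unit variance, which keeps pretrained vectors on a scale comparable to freshly initialized embeddings. A quick illustrative check (the 10000×100 matrix is made up):

import torch

embed = torch.randn(10000, 100) * 0.3  # stand-in for small-norm pretrained vectors
print(torch.std(embed).item())         # roughly 0.3
smoothed = embed / torch.std(embed)
print(torch.std(smoothed).item())      # roughly 1.0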
@@ -77,9 +76,11 @@ def extend(self, words):
                             if regex.match(r'\p{P}+$', word))
         self.n_words = len(self.words)
 
-    def numericalize(self, corpus):
+    def numericalize(self, corpus, training=True):
         words = [self.word2id(seq) for seq in corpus.words]
         tags = [self.tag2id(seq) for seq in corpus.tags]
+        if not training:
+            return words, tags
         arcs = [torch.tensor(seq) for seq in corpus.heads]
         rels = [self.rel2id(seq) for seq in corpus.rels]
 
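
The `training` flag lets numericalize handle raw input that carries no gold annotations: it returns only word and tag ids and skips building the arc and rel tensors. Hypothetical call sites (`vocab` and the corpora are assumed, not shown in this commit):

# training data has gold heads and relations, so all four views are built
words, tags, arcs, rels = vocab.numericalize(train_corpus)

# raw text at prediction time has no gold heads/rels to numericalize
words, tags = vocab.numericalize(test_corpus, training=False)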
