Commit 66d650f

Author: zysite
Message: Replace LSTM with BiLSTM
1 parent: e8542fe

File tree: 3 files changed (+23, -32 lines)


parser/modules/__init__.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -1,8 +1,9 @@
 # -*- coding: utf-8 -*-
 
 from .biaffine import Biaffine
-from .lstm import LSTM
+from .bilstm import BiLSTM
+from .dropout import IndependentDropout, SharedDropout
 from .mlp import MLP
 
 
-__all__ = ['LSTM', 'MLP', 'Biaffine']
+__all__ = ['MLP', 'Biaffine', 'BiLSTM', 'IndependentDropout', 'SharedDropout']
```
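With BiLSTM and the dropout modules re-exported from the package root, downstream code can pull everything from parser.modules in one statement, which is the form parser/parser.py switches to below. A brief usage note, not part of the diff:

```python
# Everything listed in the updated __all__ is importable from the package root.
from parser.modules import MLP, Biaffine, BiLSTM, IndependentDropout, SharedDropout
```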

parser/modules/lstm.py renamed to parser/modules/bilstm.py

Lines changed: 12 additions & 21 deletions
```diff
@@ -7,28 +7,24 @@
 from torch.nn.utils.rnn import PackedSequence
 
 
-class LSTM(nn.Module):
+class BiLSTM(nn.Module):
 
-    def __init__(self, input_size, hidden_size, num_layers=1,
-                 dropout=0, bidirectional=False):
-        super(LSTM, self).__init__()
+    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0):
+        super(BiLSTM, self).__init__()
 
         self.input_size = input_size
         self.hidden_size = hidden_size
         self.num_layers = num_layers
         self.dropout = dropout
-        self.bidirectional = bidirectional
-        self.num_directions = 2 if bidirectional else 1
 
         self.f_cells = nn.ModuleList()
         self.b_cells = nn.ModuleList()
         for layer in range(self.num_layers):
             self.f_cells.append(nn.LSTMCell(input_size=input_size,
                                             hidden_size=hidden_size))
-            if bidirectional:
-                self.b_cells.append(nn.LSTMCell(input_size=input_size,
-                                                hidden_size=hidden_size))
-            input_size = hidden_size * self.num_directions
+            self.b_cells.append(nn.LSTMCell(input_size=input_size,
+                                            hidden_size=hidden_size))
+            input_size = hidden_size * 2
 
         self.reset_parameters()
 
@@ -88,17 +84,12 @@ def forward(self, x, hx=None):
                                           cell=self.f_cells[layer],
                                           batch_sizes=batch_sizes,
                                           reverse=False)
-
-            if self.bidirectional:
-                b_output = self.layer_forward(x=x,
-                                              hx=hx,
-                                              cell=self.b_cells[layer],
-                                              batch_sizes=batch_sizes,
-                                              reverse=True)
-            if self.bidirectional:
-                x = torch.cat([f_output, b_output], -1)
-            else:
-                x = f_output
+            b_output = self.layer_forward(x=x,
+                                          hx=hx,
+                                          cell=self.b_cells[layer],
+                                          batch_sizes=batch_sizes,
+                                          reverse=True)
+            x = torch.cat([f_output, b_output], -1)
             x = PackedSequence(x, batch_sizes)
 
         return x
```
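The net effect of the rename is that bidirectionality is no longer optional: every layer always owns a forward and a backward LSTMCell, their outputs are concatenated, and the next layer therefore consumes hidden_size * 2 features. Below is a standalone sketch of that per-layer scheme on a plain padded batch; the sizes are arbitrary and it omits the repository's PackedSequence handling, dropout, and initial-state logic in layer_forward:

```python
import torch
import torch.nn as nn

# One layer of the always-bidirectional scheme: two independent LSTMCells,
# one scanning left-to-right, one right-to-left, outputs concatenated.
input_size, hidden_size = 100, 400
f_cell = nn.LSTMCell(input_size, hidden_size)
b_cell = nn.LSTMCell(input_size, hidden_size)

x = torch.randn(8, 5, input_size)                    # [batch, seq_len, input_size]
f_h = f_c = b_h = b_c = torch.zeros(8, hidden_size)  # zero initial states

f_out, b_out = [], []
for t in range(x.size(1)):                            # forward direction
    f_h, f_c = f_cell(x[:, t], (f_h, f_c))
    f_out.append(f_h)
for t in reversed(range(x.size(1))):                  # backward direction
    b_h, b_c = b_cell(x[:, t], (b_h, b_c))
    b_out.insert(0, b_h)

out = torch.cat([torch.stack(f_out, 1), torch.stack(b_out, 1)], -1)
print(out.shape)  # torch.Size([8, 5, 800]): the next layer sees hidden_size * 2
```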

parser/parser.py

Lines changed: 8 additions & 9 deletions
```diff
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
-from parser.modules import LSTM, MLP, Biaffine
-from parser.modules.dropout import IndependentDropout, SharedDropout
+from parser.modules import (MLP, Biaffine, BiLSTM, IndependentDropout,
+                            SharedDropout)
 
 import torch
 import torch.nn as nn
@@ -23,11 +23,10 @@ def __init__(self, params, embeddings):
         self.embed_dropout = IndependentDropout(p=params['embed_dropout'])
 
         # the word-lstm layer
-        self.lstm = LSTM(input_size=params['n_embed']+params['n_tag_embed'],
-                         hidden_size=params['n_lstm_hidden'],
-                         num_layers=params['n_lstm_layers'],
-                         dropout=params['lstm_dropout'],
-                         bidirectional=True)
+        self.lstm = BiLSTM(input_size=params['n_embed']+params['n_tag_embed'],
+                           hidden_size=params['n_lstm_hidden'],
+                           num_layers=params['n_lstm_layers'],
+                           dropout=params['lstm_dropout'])
         self.lstm_dropout = SharedDropout(p=params['lstm_dropout'])
 
         # the MLP layers
@@ -82,7 +81,7 @@ def forward(self, words, tags):
         x, _ = pad_packed_sequence(x, True)
         x = self.lstm_dropout(x)[inverse_indices]
 
-        # apply MLPs to the LSTM output states
+        # apply MLPs to the BiLSTM output states
         arc_h = self.mlp_arc_h(x)
         arc_d = self.mlp_arc_d(x)
         rel_h = self.mlp_rel_h(x)
@@ -94,7 +93,7 @@ def forward(self, words, tags):
         # [batch_size, seq_len, seq_len, n_rels]
         s_rel = self.rel_attn(rel_d, rel_h).permute(0, 2, 3, 1)
         # set the scores that exceed the length of each sentence to -inf
-        s_arc.masked_fill_((1 - mask).unsqueeze(1), float('-inf'))
+        s_arc.masked_fill_(~mask.unsqueeze(1), float('-inf'))
 
         return s_arc, s_rel
 
```
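The last change swaps integer mask arithmetic for a boolean complement. Assuming mask is a torch.bool tensor marking valid token positions, (1 - mask) is rejected by recent PyTorch versions, while ~mask selects the positions to fill directly. A small sketch of the same masking pattern, with illustrative shapes and names rather than the repository's:

```python
import torch

mask = torch.tensor([[True, True, False]])    # [batch, seq_len]: True = real token
s_arc = torch.zeros(1, 3, 3)                  # [batch, seq_len, seq_len] arc scores

# ~mask flips the booleans and unsqueeze(1) broadcasts it over the score rows,
# so every column past the sentence length becomes -inf.
s_arc.masked_fill_(~mask.unsqueeze(1), float('-inf'))
print(s_arc[0, 0])  # tensor([0., 0., -inf])
```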
