
Commit 6f06f95

Reorganize & Rename modules
1 parent 9e081c2 commit 6f06f95

12 files changed (+713, -459 lines)


supar/modules/__init__.py

Lines changed: 4 additions & 10 deletions
@@ -1,17 +1,11 @@
 # -*- coding: utf-8 -*-
 
 from .affine import Biaffine, Triaffine
-from .bert import BertEmbedding
-from .char_lstm import CharLSTM
 from .dropout import IndependentDropout, SharedDropout
-from .lstm import LSTM
+from .lstm import CharLSTM, VariationalLSTM
 from .mlp import MLP
 from .scalar_mix import ScalarMix
-from .treecrf import (CRF2oDependency, CRFConstituency, CRFDependency,
-                      MatrixTree)
-from .variational_inference import (LBPSemanticDependency,
-                                    MFVISemanticDependency)
+from .transformer import TransformerEmbedding
 
-__all__ = ['LSTM', 'MLP', 'BertEmbedding', 'Biaffine', 'CharLSTM', 'CRF2oDependency', 'CRFConstituency', 'CRFDependency',
-           'IndependentDropout', 'LBPSemanticDependency', 'MatrixTree',
-           'MFVISemanticDependency', 'ScalarMix', 'SharedDropout', 'Triaffine']
+__all__ = ['MLP', 'TransformerEmbedding', 'Biaffine', 'CharLSTM',
+           'IndependentDropout', 'ScalarMix', 'SharedDropout', 'Triaffine', 'VariationalLSTM']
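The net effect of this hunk is that CharLSTM now lives in the consolidated lstm module (next to the renamed VariationalLSTM), and the slot formerly held by BertEmbedding is taken by TransformerEmbedding from the new transformer module. A minimal sketch of how downstream imports change, assuming the package at this commit is installed (the old lines are shown only for contrast):

# before this commit
# from supar.modules import LSTM, BertEmbedding
# from supar.modules.char_lstm import CharLSTM

# after this commit: package-level re-exports
from supar.modules import CharLSTM, TransformerEmbedding, VariationalLSTM
# or the reorganized module path directly
from supar.modules.lstm import CharLSTM, VariationalLSTM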

supar/modules/affine.py

Lines changed: 37 additions & 33 deletions
@@ -6,43 +6,44 @@
 
 class Biaffine(nn.Module):
     r"""
-    Biaffine layer for first-order scoring.
+    Biaffine layer for first-order scoring :cite:`dozat-etal-2017-biaffine`.
 
     This function has a tensor of weights :math:`W` and bias terms if needed.
-    The score :math:`s(x, y)` of the vector pair :math:`(x, y)` is computed as :math:`x^T W y`,
-    in which :math:`x` and :math:`y` can be concatenated with bias terms.
-
-    References:
-        - Timothy Dozat and Christopher D. Manning. 2017.
-          `Deep Biaffine Attention for Neural Dependency Parsing`_.
+    The score :math:`s(x, y)` of the vector pair :math:`(x, y)` is computed as :math:`x^T W y / d^s`,
+    where `d` and `s` are vector dimension and scaling factor respectively.
+    :math:`x` and :math:`y` can be concatenated with bias terms.
 
     Args:
         n_in (int):
             The size of the input feature.
         n_out (int):
             The number of output channels.
+        scale (float):
+            Factor to scale the scores. Default: 0.
         bias_x (bool):
             If ``True``, adds a bias term for tensor :math:`x`. Default: ``True``.
         bias_y (bool):
             If ``True``, adds a bias term for tensor :math:`y`. Default: ``True``.
-
-    .. _Deep Biaffine Attention for Neural Dependency Parsing:
-        https://openreview.net/forum?id=Hk95PK9le
     """
 
-    def __init__(self, n_in, n_out=1, bias_x=True, bias_y=True):
+    def __init__(self, n_in, n_out=1, scale=0, bias_x=True, bias_y=True):
         super().__init__()
 
         self.n_in = n_in
         self.n_out = n_out
+        self.scale = scale
         self.bias_x = bias_x
         self.bias_y = bias_y
         self.weight = nn.Parameter(torch.Tensor(n_out, n_in+bias_x, n_in+bias_y))
 
         self.reset_parameters()
 
     def __repr__(self):
-        s = f"n_in={self.n_in}, n_out={self.n_out}"
+        s = f"n_in={self.n_in}"
+        if self.n_out > 1:
+            s += f", n_out={self.n_out}"
+        if self.scale != 0:
+            s += f", scale={self.scale}"
         if self.bias_x:
             s += f", bias_x={self.bias_x}"
         if self.bias_y:
@@ -70,7 +71,7 @@ def forward(self, x, y):
         if self.bias_y:
             y = torch.cat((y, torch.ones_like(y[..., :1])), -1)
         # [batch_size, n_out, seq_len, seq_len]
-        s = torch.einsum('bxi,oij,byj->boxy', x, self.weight, y)
+        s = torch.einsum('bxi,oij,byj->boxy', x, self.weight, y) / self.n_in ** self.scale
         # remove dim 1 if n_out == 1
         s = s.squeeze(1)
 
@@ -79,44 +80,44 @@ def forward(self, x, y):
 
 
 class Triaffine(nn.Module):
     r"""
-    Triaffine layer for second-order scoring.
+    Triaffine layer for second-order scoring (:cite:`zhang-etal-2020-efficient`, :cite:`wang-etal-2019-second`).
 
     This function has a tensor of weights :math:`W` and bias terms if needed.
-    The score :math:`s(x, y, z)` of the vector triple :math:`(x, y, z)` is computed as :math:`x^T z^T W y`.
-    Usually, :math:`x` and :math:`y` can be concatenated with bias terms.
-
-    References:
-        - Yu Zhang, Zhenghua Li and Min Zhang. 2020.
-          `Efficient Second-Order TreeCRF for Neural Dependency Parsing`_.
-        - Xinyu Wang, Jingxian Huang, and Kewei Tu. 2019.
-          `Second-Order Semantic Dependency Parsing with End-to-End Neural Networks`_.
+    The score :math:`s(x, y, z)` of the vector triple :math:`(x, y, z)` is computed as :math:`x^T z^T W y / d^s`,
+    where `d` and `s` are vector dimension and scaling factor respectively.
+    :math:`x` and :math:`y` can be concatenated with bias terms.
 
     Args:
         n_in (int):
             The size of the input feature.
+        n_out (int):
+            The number of output channels.
+        scale (float):
+            Factor to scale the scores. Default: 0.
         bias_x (bool):
             If ``True``, adds a bias term for tensor :math:`x`. Default: ``False``.
         bias_y (bool):
            If ``True``, adds a bias term for tensor :math:`y`. Default: ``False``.
-
-    .. _Efficient Second-Order TreeCRF for Neural Dependency Parsing:
-        https://www.aclweb.org/anthology/2020.acl-main.302/
-    .. _Second-Order Semantic Dependency Parsing with End-to-End Neural Networks:
-        https://www.aclweb.org/anthology/P19-1454/
     """
 
-    def __init__(self, n_in, bias_x=False, bias_y=False):
+    def __init__(self, n_in, n_out=1, scale=0, bias_x=False, bias_y=False):
         super().__init__()
 
         self.n_in = n_in
+        self.n_out = n_out
+        self.scale = scale
         self.bias_x = bias_x
         self.bias_y = bias_y
-        self.weight = nn.Parameter(torch.Tensor(n_in+bias_x, n_in, n_in+bias_y))
+        self.weight = nn.Parameter(torch.Tensor(n_out, n_in+bias_x, n_in, n_in+bias_y))
 
         self.reset_parameters()
 
     def __repr__(self):
         s = f"n_in={self.n_in}"
+        if self.n_out > 1:
+            s += f", n_out={self.n_out}"
+        if self.scale != 0:
+            s += f", scale={self.scale}"
         if self.bias_x:
             s += f", bias_x={self.bias_x}"
         if self.bias_y:
@@ -136,15 +137,18 @@ def forward(self, x, y, z):
 
         Returns:
             ~torch.Tensor:
-                A scoring tensor of shape ``[batch_size, seq_len, seq_len, seq_len]``.
+                A scoring tensor of shape ``[batch_size, n_out, seq_len, seq_len, seq_len]``.
+                If ``n_out=1``, the dimension for ``n_out`` will be squeezed automatically.
         """
 
         if self.bias_x:
             x = torch.cat((x, torch.ones_like(x[..., :1])), -1)
         if self.bias_y:
             y = torch.cat((y, torch.ones_like(y[..., :1])), -1)
-        w = torch.einsum('bzk,ikj->bzij', z, self.weight)
-        # [batch_size, seq_len, seq_len, seq_len]
-        s = torch.einsum('bxi,bzij,byj->bzxy', x, w, y)
+        w = torch.einsum('bzk,oikj->bozij', z, self.weight)
+        # [batch_size, n_out, seq_len, seq_len, seq_len]
+        s = torch.einsum('bxi,bozij,byj->bozxy', x, w, y) / self.n_in ** self.scale
+        # remove dim 1 if n_out == 1
+        s = s.squeeze(1)
 
         return s
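Functionally, the affine.py changes add an optional scale exponent (scores are divided by n_in ** scale) and give Triaffine the same n_out channel dimension that Biaffine already had. A shape-check sketch, assuming the package at this commit is importable; the tensor sizes and variable names below are arbitrary:

import torch
from supar.modules import Biaffine, Triaffine

batch_size, seq_len, n_in = 2, 5, 100
x = torch.randn(batch_size, seq_len, n_in)
y = torch.randn(batch_size, seq_len, n_in)
z = torch.randn(batch_size, seq_len, n_in)

# scale=0.5 divides the scores by n_in ** 0.5, i.e. sqrt(100) = 10
biaffine = Biaffine(n_in, n_out=2, scale=0.5)
print(biaffine(x, y).shape)      # torch.Size([2, 2, 5, 5]); n_out dim kept since n_out > 1

triaffine = Triaffine(n_in, n_out=1, scale=0.5, bias_x=True, bias_y=True)
print(triaffine(x, y, z).shape)  # torch.Size([2, 5, 5, 5]); the n_out dim is squeezed when n_out=1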

supar/modules/char_lstm.py

Lines changed: 0 additions & 68 deletions
This file was deleted; its CharLSTM class now lives in supar/modules/lstm.py (see the lstm.py diff below).

supar/modules/dropout.py

Lines changed: 1 addition & 2 deletions
@@ -6,8 +6,7 @@
 
 class SharedDropout(nn.Module):
     r"""
-    SharedDropout differs from the vanilla dropout strategy in that
-    the dropout mask is shared across one dimension.
+    SharedDropout differs from the vanilla dropout strategy in that the dropout mask is shared across one dimension.
 
     Args:
         p (float):
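The SharedDropout change above is only a docstring reflow. To make "the dropout mask is shared across one dimension" concrete, here is an illustration-only sketch (not supar's exact code) that samples one Bernoulli mask per batch element and hidden unit and reuses it at every position along the sequence, which is the behaviour the docstring describes for batch-first inputs:

import torch

def shared_dropout(x, p=0.5, training=True):
    # x: [batch_size, seq_len, hidden_size]
    if not training or p == 0:
        return x
    # one rescaled mask per (batch, hidden) pair ...
    mask = x.new_empty(x.size(0), 1, x.size(2)).bernoulli_(1 - p) / (1 - p)
    # ... broadcast over seq_len, so every timestep sees the same mask
    return x * mask

out = shared_dropout(torch.randn(2, 5, 8), p=0.5)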

supar/modules/lstm.py

Lines changed: 81 additions & 24 deletions
@@ -4,23 +4,93 @@
 import torch.nn as nn
 from supar.modules.dropout import SharedDropout
 from torch.nn.modules.rnn import apply_permutation
-from torch.nn.utils.rnn import PackedSequence
+from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence
 
 
-class LSTM(nn.Module):
+class CharLSTM(nn.Module):
     r"""
-    LSTM is an variant of the vanilla bidirectional LSTM adopted by Biaffine Parser
-    with the only difference of the dropout strategy.
+    CharLSTM aims to generate character-level embeddings for tokens.
+    It summarizes the information of characters in each token to an embedding using a LSTM layer.
+
+    Args:
+        n_char (int):
+            The number of characters.
+        n_embed (int):
+            The size of each embedding vector as input to LSTM.
+        n_hidden (int):
+            The size of each LSTM hidden state.
+        n_out (int):
+            The size of each output vector. Default: 0.
+            If 0, equals to the size of hidden states.
+        pad_index (int):
+            The index of the padding token in the vocabulary. Default: 0.
+        dropout (float):
+            The dropout ratio of CharLSTM hidden states. Default: 0.
+    """
+
+    def __init__(self, n_chars, n_embed, n_hidden, n_out=0, pad_index=0, dropout=0):
+        super().__init__()
+
+        self.n_chars = n_chars
+        self.n_embed = n_embed
+        self.n_hidden = n_hidden
+        self.n_out = n_out or n_hidden
+        self.pad_index = pad_index
+
+        self.embed = nn.Embedding(num_embeddings=n_chars, embedding_dim=n_embed)
+        self.lstm = nn.LSTM(input_size=n_embed, hidden_size=n_hidden//2, batch_first=True, bidirectional=True)
+        self.dropout = nn.Dropout(p=dropout)
+        self.projection = nn.Linear(in_features=n_hidden, out_features=self.n_out) if n_hidden != self.n_out else nn.Identity()
+
+    def __repr__(self):
+        s = f"{self.n_chars}, {self.n_embed}"
+        if self.n_hidden != self.n_out:
+            s += f", n_hidden={self.n_hidden}"
+        s += f", n_out={self.n_out}, pad_index={self.pad_index}"
+        if self.dropout.p != 0:
+            s += f", dropout={self.dropout.p}"
+
+        return f"{self.__class__.__name__}({s})"
+
+    def forward(self, x):
+        r"""
+        Args:
+            x (~torch.Tensor): ``[batch_size, seq_len, fix_len]``.
+                Characters of all tokens.
+                Each token holds no more than `fix_len` characters, and the excess is cut off directly.
+        Returns:
+            ~torch.Tensor:
+                The embeddings of shape ``[batch_size, seq_len, n_out]`` derived from the characters.
+        """
+
+        # [batch_size, seq_len, fix_len]
+        mask = x.ne(self.pad_index)
+        # [batch_size, seq_len]
+        lens = mask.sum(-1)
+        char_mask = lens.gt(0)
+
+        # [n, fix_len, n_embed]
+        x = self.embed(x[char_mask])
+        x = pack_padded_sequence(x, lens[char_mask].tolist(), True, False)
+        x, (h, _) = self.lstm(x)
+        # [n, fix_len, n_hidden]
+        h = self.dropout(torch.cat(torch.unbind(h), -1))
+        # [batch_size, seq_len, n_out]
+        embed = h.new_zeros(*lens.shape, self.n_out).masked_scatter_(char_mask.unsqueeze(-1), self.projection(h))
+
+        return embed
+
+
+class VariationalLSTM(nn.Module):
+    r"""
+    VariationalLSTM :cite:`yarin-etal-2016-dropout` is an variant of the vanilla bidirectional LSTM
+    adopted by Biaffine Parser with the only difference of the dropout strategy.
     It drops nodes in the LSTM layers (input and recurrent connections)
     and applies the same dropout mask at every recurrent timesteps.
 
     APIs are roughly the same as :class:`~torch.nn.LSTM` except that we only allows
     :class:`~torch.nn.utils.rnn.PackedSequence` as input.
 
-    References:
-        - Timothy Dozat and Christopher D. Manning. 2017.
-          `Deep Biaffine Attention for Neural Dependency Parsing`_.
-
     Args:
         input_size (int):
             The number of expected features in the input.
@@ -33,9 +103,6 @@ class LSTM(nn.Module):
         dropout (float):
             If non-zero, introduces a :class:`SharedDropout` layer on the outputs of each LSTM layer except the last layer.
             Default: 0.
-
-    .. _Deep Biaffine Attention for Neural Dependency Parsing:
-        https://openreview.net/forum?id=Hk95PK9le
     """
 
     def __init__(self, input_size, hidden_size, num_layers=1, bidirectional=False, dropout=0):
@@ -155,27 +222,17 @@ def forward(self, sequence, hx=None):
             if self.training:
                 mask = SharedDropout.get_mask(x[0], self.dropout)
                 x = [i * mask[:len(i)] for i in x]
-            x_i, (h_i, c_i) = self.layer_forward(x=x,
-                                                 hx=(h[i, 0], c[i, 0]),
-                                                 cell=self.f_cells[i],
-                                                 batch_sizes=batch_sizes)
+            x_i, (h_i, c_i) = self.layer_forward(x, (h[i, 0], c[i, 0]), self.f_cells[i], batch_sizes)
             if self.bidirectional:
-                x_b, (h_b, c_b) = self.layer_forward(x=x,
-                                                     hx=(h[i, 1], c[i, 1]),
-                                                     cell=self.b_cells[i],
-                                                     batch_sizes=batch_sizes,
-                                                     reverse=True)
+                x_b, (h_b, c_b) = self.layer_forward(x, (h[i, 1], c[i, 1]), self.b_cells[i], batch_sizes, True)
                 x_i = torch.cat((x_i, x_b), -1)
                 h_i = torch.stack((h_i, h_b))
                 c_i = torch.stack((c_i, c_b))
             x = x_i
             h_n.append(h_i)
             c_n.append(h_i)
 
-        x = PackedSequence(x,
-                           sequence.batch_sizes,
-                           sequence.sorted_indices,
-                           sequence.unsorted_indices)
+        x = PackedSequence(x, sequence.batch_sizes, sequence.sorted_indices, sequence.unsorted_indices)
         hx = torch.cat(h_n, 0), torch.cat(c_n, 0)
         hx = self.permute_hidden(hx, sequence.unsorted_indices)
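To make the reorganization concrete, a usage sketch of the two classes that now live in lstm.py. It assumes the full implementations at this commit (only parts of VariationalLSTM appear in the hunks above, and its forward is assumed to return the packed outputs together with the final states); all sizes below are made up:

import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from supar.modules import CharLSTM, VariationalLSTM

# CharLSTM: character ids in, one vector per token out
chars = torch.randint(1, 100, (2, 4, 6))  # [batch_size, seq_len, fix_len]
chars[:, :, 4:] = 0                       # trailing character padding (pad_index=0)
chars[1, 3] = 0                           # a fully padded token position
char_lstm = CharLSTM(n_chars=100, n_embed=50, n_hidden=100, dropout=0.33)
print(char_lstm(chars).shape)             # torch.Size([2, 4, 100]); all-pad tokens get zero vectors

# VariationalLSTM: roughly the nn.LSTM API, but it only accepts a PackedSequence
words = torch.randn(2, 4, 100)            # e.g. word embeddings
packed = pack_padded_sequence(words, [4, 2], batch_first=True, enforce_sorted=False)
lstm = VariationalLSTM(input_size=100, hidden_size=300, bidirectional=True, dropout=0.33)
out, (h_n, c_n) = lstm(packed)
out, _ = pad_packed_sequence(out, batch_first=True)
print(out.shape)                          # torch.Size([2, 4, 600]) for a bidirectional model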
