
Commit 6f06f95

Reorganize & Rename modules
1 parent 9e081c2 commit 6f06f95

12 files changed (+713, -459 lines)


supar/modules/__init__.py

Lines changed: 4 additions & 10 deletions
@@ -1,17 +1,11 @@
 # -*- coding: utf-8 -*-
 
 from .affine import Biaffine, Triaffine
-from .bert import BertEmbedding
-from .char_lstm import CharLSTM
 from .dropout import IndependentDropout, SharedDropout
-from .lstm import LSTM
+from .lstm import CharLSTM, VariationalLSTM
 from .mlp import MLP
 from .scalar_mix import ScalarMix
-from .treecrf import (CRF2oDependency, CRFConstituency, CRFDependency,
-                      MatrixTree)
-from .variational_inference import (LBPSemanticDependency,
-                                    MFVISemanticDependency)
+from .transformer import TransformerEmbedding
 
-__all__ = ['LSTM', 'MLP', 'BertEmbedding', 'Biaffine', 'CharLSTM', 'CRF2oDependency', 'CRFConstituency', 'CRFDependency',
-           'IndependentDropout', 'LBPSemanticDependency', 'MatrixTree',
-           'MFVISemanticDependency', 'ScalarMix', 'SharedDropout', 'Triaffine']
+__all__ = ['MLP', 'TransformerEmbedding', 'Biaffine', 'CharLSTM',
+           'IndependentDropout', 'ScalarMix', 'SharedDropout', 'Triaffine', 'VariationalLSTM']
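The net effect of this hunk is that CharLSTM now lives in the consolidated lstm module (next to the renamed VariationalLSTM), and the slot formerly held by BertEmbedding is taken by TransformerEmbedding from the new transformer module. A minimal sketch of how downstream imports change, assuming the package at this commit is installed (the old lines are shown only for contrast):

# before this commit
# from supar.modules import LSTM, BertEmbedding
# from supar.modules.char_lstm import CharLSTM

# after this commit: package-level re-exports
from supar.modules import CharLSTM, TransformerEmbedding, VariationalLSTM
# or the reorganized module path directly
from supar.modules.lstm import CharLSTM, VariationalLSTM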

supar/modules/affine.py

Lines changed: 37 additions & 33 deletions
@@ -6,43 +6,44 @@
 
 class Biaffine(nn.Module):
     r"""
-    Biaffine layer for first-order scoring.
+    Biaffine layer for first-order scoring :cite:`dozat-etal-2017-biaffine`.
 
     This function has a tensor of weights :math:`W` and bias terms if needed.
-    The score :math:`s(x, y)` of the vector pair :math:`(x, y)` is computed as :math:`x^T W y`,
-    in which :math:`x` and :math:`y` can be concatenated with bias terms.
-
-    References:
-        - Timothy Dozat and Christopher D. Manning. 2017.
-          `Deep Biaffine Attention for Neural Dependency Parsing`_.
+    The score :math:`s(x, y)` of the vector pair :math:`(x, y)` is computed as :math:`x^T W y / d^s`,
+    where `d` and `s` are vector dimension and scaling factor respectively.
+    :math:`x` and :math:`y` can be concatenated with bias terms.
 
     Args:
         n_in (int):
             The size of the input feature.
         n_out (int):
             The number of output channels.
+        scale (float):
+            Factor to scale the scores. Default: 0.
         bias_x (bool):
             If ``True``, adds a bias term for tensor :math:`x`. Default: ``True``.
         bias_y (bool):
             If ``True``, adds a bias term for tensor :math:`y`. Default: ``True``.
-
-    .. _Deep Biaffine Attention for Neural Dependency Parsing:
-        https://openreview.net/forum?id=Hk95PK9le
     """
 
-    def __init__(self, n_in, n_out=1, bias_x=True, bias_y=True):
+    def __init__(self, n_in, n_out=1, scale=0, bias_x=True, bias_y=True):
         super().__init__()
 
         self.n_in = n_in
         self.n_out = n_out
+        self.scale = scale
         self.bias_x = bias_x
         self.bias_y = bias_y
         self.weight = nn.Parameter(torch.Tensor(n_out, n_in+bias_x, n_in+bias_y))
 
         self.reset_parameters()
 
     def __repr__(self):
-        s = f"n_in={self.n_in}, n_out={self.n_out}"
+        s = f"n_in={self.n_in}"
+        if self.n_out > 1:
+            s += f", n_out={self.n_out}"
+        if self.scale != 0:
+            s += f", scale={self.scale}"
         if self.bias_x:
             s += f", bias_x={self.bias_x}"
         if self.bias_y:
@@ -70,7 +71,7 @@ def forward(self, x, y):
         if self.bias_y:
             y = torch.cat((y, torch.ones_like(y[..., :1])), -1)
         # [batch_size, n_out, seq_len, seq_len]
-        s = torch.einsum('bxi,oij,byj->boxy', x, self.weight, y)
+        s = torch.einsum('bxi,oij,byj->boxy', x, self.weight, y) / self.n_in ** self.scale
         # remove dim 1 if n_out == 1
         s = s.squeeze(1)
 
@@ -79,44 +80,44 @@ def forward(self, x, y):
 
 
 class Triaffine(nn.Module):
     r"""
-    Triaffine layer for second-order scoring.
+    Triaffine layer for second-order scoring (:cite:`zhang-etal-2020-efficient`, :cite:`wang-etal-2019-second`).
 
     This function has a tensor of weights :math:`W` and bias terms if needed.
-    The score :math:`s(x, y, z)` of the vector triple :math:`(x, y, z)` is computed as :math:`x^T z^T W y`.
-    Usually, :math:`x` and :math:`y` can be concatenated with bias terms.
-
-    References:
-        - Yu Zhang, Zhenghua Li and Min Zhang. 2020.
-          `Efficient Second-Order TreeCRF for Neural Dependency Parsing`_.
-        - Xinyu Wang, Jingxian Huang, and Kewei Tu. 2019.
-          `Second-Order Semantic Dependency Parsing with End-to-End Neural Networks`_.
+    The score :math:`s(x, y, z)` of the vector triple :math:`(x, y, z)` is computed as :math:`x^T z^T W y / d^s`,
+    where `d` and `s` are vector dimension and scaling factor respectively.
+    :math:`x` and :math:`y` can be concatenated with bias terms.
 
     Args:
         n_in (int):
             The size of the input feature.
+        n_out (int):
+            The number of output channels.
+        scale (float):
+            Factor to scale the scores. Default: 0.
         bias_x (bool):
             If ``True``, adds a bias term for tensor :math:`x`. Default: ``False``.
         bias_y (bool):
            If ``True``, adds a bias term for tensor :math:`y`. Default: ``False``.
-
-    .. _Efficient Second-Order TreeCRF for Neural Dependency Parsing:
-        https://www.aclweb.org/anthology/2020.acl-main.302/
-    .. _Second-Order Semantic Dependency Parsing with End-to-End Neural Networks:
-        https://www.aclweb.org/anthology/P19-1454/
     """
 
-    def __init__(self, n_in, bias_x=False, bias_y=False):
+    def __init__(self, n_in, n_out=1, scale=0, bias_x=False, bias_y=False):
         super().__init__()
 
         self.n_in = n_in
+        self.n_out = n_out
+        self.scale = scale
         self.bias_x = bias_x
         self.bias_y = bias_y
-        self.weight = nn.Parameter(torch.Tensor(n_in+bias_x, n_in, n_in+bias_y))
+        self.weight = nn.Parameter(torch.Tensor(n_out, n_in+bias_x, n_in, n_in+bias_y))
 
         self.reset_parameters()
 
     def __repr__(self):
         s = f"n_in={self.n_in}"
+        if self.n_out > 1:
+            s += f", n_out={self.n_out}"
+        if self.scale != 0:
+            s += f", scale={self.scale}"
         if self.bias_x:
             s += f", bias_x={self.bias_x}"
         if self.bias_y:
@@ -136,15 +137,18 @@ def forward(self, x, y, z):
 
         Returns:
             ~torch.Tensor:
-                A scoring tensor of shape ``[batch_size, seq_len, seq_len, seq_len]``.
+                A scoring tensor of shape ``[batch_size, n_out, seq_len, seq_len, seq_len]``.
+                If ``n_out=1``, the dimension for ``n_out`` will be squeezed automatically.
         """
 
         if self.bias_x:
             x = torch.cat((x, torch.ones_like(x[..., :1])), -1)
         if self.bias_y:
             y = torch.cat((y, torch.ones_like(y[..., :1])), -1)
-        w = torch.einsum('bzk,ikj->bzij', z, self.weight)
-        # [batch_size, seq_len, seq_len, seq_len]
-        s = torch.einsum('bxi,bzij,byj->bzxy', x, w, y)
+        w = torch.einsum('bzk,oikj->bozij', z, self.weight)
+        # [batch_size, n_out, seq_len, seq_len, seq_len]
+        s = torch.einsum('bxi,bozij,byj->bozxy', x, w, y) / self.n_in ** self.scale
+        # remove dim 1 if n_out == 1
+        s = s.squeeze(1)
 
         return s
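Functionally, the affine.py changes add an optional scale exponent (scores are divided by n_in ** scale) and give Triaffine the same n_out channel dimension that Biaffine already had. A shape-check sketch, assuming the package at this commit is importable; the tensor sizes and variable names below are arbitrary:

import torch
from supar.modules import Biaffine, Triaffine

batch_size, seq_len, n_in = 2, 5, 100
x = torch.randn(batch_size, seq_len, n_in)
y = torch.randn(batch_size, seq_len, n_in)
z = torch.randn(batch_size, seq_len, n_in)

# scale=0.5 divides the scores by n_in ** 0.5, i.e. sqrt(100) = 10
biaffine = Biaffine(n_in, n_out=2, scale=0.5)
print(biaffine(x, y).shape)      # torch.Size([2, 2, 5, 5]); n_out dim kept since n_out > 1

triaffine = Triaffine(n_in, n_out=1, scale=0.5, bias_x=True, bias_y=True)
print(triaffine(x, y, z).shape)  # torch.Size([2, 5, 5, 5]); the n_out dim is squeezed when n_out=1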

supar/modules/char_lstm.py

Lines changed: 0 additions & 68 deletions
This file was deleted; its CharLSTM class now lives in supar/modules/lstm.py (see the lstm.py diff below).

supar/modules/dropout.py

Lines changed: 1 addition & 2 deletions
@@ -6,8 +6,7 @@
 
 class SharedDropout(nn.Module):
     r"""
-    SharedDropout differs from the vanilla dropout strategy in that
-    the dropout mask is shared across one dimension.
+    SharedDropout differs from the vanilla dropout strategy in that the dropout mask is shared across one dimension.
 
     Args:
         p (float):
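The SharedDropout change above is only a docstring reflow. To make "the dropout mask is shared across one dimension" concrete, here is an illustration-only sketch (not supar's exact code) that samples one Bernoulli mask per batch element and hidden unit and reuses it at every position along the sequence, which is the behaviour the docstring describes for batch-first inputs:

import torch

def shared_dropout(x, p=0.5, training=True):
    # x: [batch_size, seq_len, hidden_size]
    if not training or p == 0:
        return x
    # one rescaled mask per (batch, hidden) pair ...
    mask = x.new_empty(x.size(0), 1, x.size(2)).bernoulli_(1 - p) / (1 - p)
    # ... broadcast over seq_len, so every timestep sees the same mask
    return x * mask

out = shared_dropout(torch.randn(2, 5, 8), p=0.5)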

supar/modules/lstm.py

Lines changed: 81 additions & 24 deletions
@@ -4,23 +4,93 @@
 import torch.nn as nn
 from supar.modules.dropout import SharedDropout
 from torch.nn.modules.rnn import apply_permutation
-from torch.nn.utils.rnn import PackedSequence
+from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence
 
 
-class LSTM(nn.Module):
+class CharLSTM(nn.Module):
     r"""
-    LSTM is an variant of the vanilla bidirectional LSTM adopted by Biaffine Parser
-    with the only difference of the dropout strategy.
+    CharLSTM aims to generate character-level embeddings for tokens.
+    It summarizes the information of characters in each token to an embedding using a LSTM layer.
+
+    Args:
+        n_char (int):
+            The number of characters.
+        n_embed (int):
+            The size of each embedding vector as input to LSTM.
+        n_hidden (int):
+            The size of each LSTM hidden state.
+        n_out (int):
+            The size of each output vector. Default: 0.
+            If 0, equals to the size of hidden states.
+        pad_index (int):
+            The index of the padding token in the vocabulary. Default: 0.
+        dropout (float):
+            The dropout ratio of CharLSTM hidden states. Default: 0.
+    """
+
+    def __init__(self, n_chars, n_embed, n_hidden, n_out=0, pad_index=0, dropout=0):
+        super().__init__()
+
+        self.n_chars = n_chars
+        self.n_embed = n_embed
+        self.n_hidden = n_hidden
+        self.n_out = n_out or n_hidden
+        self.pad_index = pad_index
+
+        self.embed = nn.Embedding(num_embeddings=n_chars, embedding_dim=n_embed)
+        self.lstm = nn.LSTM(input_size=n_embed, hidden_size=n_hidden//2, batch_first=True, bidirectional=True)
+        self.dropout = nn.Dropout(p=dropout)
+        self.projection = nn.Linear(in_features=n_hidden, out_features=self.n_out) if n_hidden != self.n_out else nn.Identity()
+
+    def __repr__(self):
+        s = f"{self.n_chars}, {self.n_embed}"
+        if self.n_hidden != self.n_out:
+            s += f", n_hidden={self.n_hidden}"
+        s += f", n_out={self.n_out}, pad_index={self.pad_index}"
+        if self.dropout.p != 0:
+            s += f", dropout={self.dropout.p}"
+
+        return f"{self.__class__.__name__}({s})"
+
+    def forward(self, x):
+        r"""
+        Args:
+            x (~torch.Tensor): ``[batch_size, seq_len, fix_len]``.
+                Characters of all tokens.
+                Each token holds no more than `fix_len` characters, and the excess is cut off directly.
+        Returns:
+            ~torch.Tensor:
+                The embeddings of shape ``[batch_size, seq_len, n_out]`` derived from the characters.
+        """
+
+        # [batch_size, seq_len, fix_len]
+        mask = x.ne(self.pad_index)
+        # [batch_size, seq_len]
+        lens = mask.sum(-1)
+        char_mask = lens.gt(0)
+
+        # [n, fix_len, n_embed]
+        x = self.embed(x[char_mask])
+        x = pack_padded_sequence(x, lens[char_mask].tolist(), True, False)
+        x, (h, _) = self.lstm(x)
+        # [n, fix_len, n_hidden]
+        h = self.dropout(torch.cat(torch.unbind(h), -1))
+        # [batch_size, seq_len, n_out]
+        embed = h.new_zeros(*lens.shape, self.n_out).masked_scatter_(char_mask.unsqueeze(-1), self.projection(h))
+
+        return embed
+
+
+class VariationalLSTM(nn.Module):
+    r"""
+    VariationalLSTM :cite:`yarin-etal-2016-dropout` is an variant of the vanilla bidirectional LSTM
+    adopted by Biaffine Parser with the only difference of the dropout strategy.
     It drops nodes in the LSTM layers (input and recurrent connections)
     and applies the same dropout mask at every recurrent timesteps.
 
     APIs are roughly the same as :class:`~torch.nn.LSTM` except that we only allows
     :class:`~torch.nn.utils.rnn.PackedSequence` as input.
 
-    References:
-        - Timothy Dozat and Christopher D. Manning. 2017.
-          `Deep Biaffine Attention for Neural Dependency Parsing`_.
-
     Args:
         input_size (int):
             The number of expected features in the input.
@@ -33,9 +103,6 @@ class LSTM(nn.Module):
         dropout (float):
             If non-zero, introduces a :class:`SharedDropout` layer on the outputs of each LSTM layer except the last layer.
             Default: 0.
-
-    .. _Deep Biaffine Attention for Neural Dependency Parsing:
-        https://openreview.net/forum?id=Hk95PK9le
     """
 
     def __init__(self, input_size, hidden_size, num_layers=1, bidirectional=False, dropout=0):
@@ -155,27 +222,17 @@ def forward(self, sequence, hx=None):
             if self.training:
                 mask = SharedDropout.get_mask(x[0], self.dropout)
                 x = [i * mask[:len(i)] for i in x]
-            x_i, (h_i, c_i) = self.layer_forward(x=x,
-                                                 hx=(h[i, 0], c[i, 0]),
-                                                 cell=self.f_cells[i],
-                                                 batch_sizes=batch_sizes)
+            x_i, (h_i, c_i) = self.layer_forward(x, (h[i, 0], c[i, 0]), self.f_cells[i], batch_sizes)
             if self.bidirectional:
-                x_b, (h_b, c_b) = self.layer_forward(x=x,
-                                                     hx=(h[i, 1], c[i, 1]),
-                                                     cell=self.b_cells[i],
-                                                     batch_sizes=batch_sizes,
-                                                     reverse=True)
+                x_b, (h_b, c_b) = self.layer_forward(x, (h[i, 1], c[i, 1]), self.b_cells[i], batch_sizes, True)
                 x_i = torch.cat((x_i, x_b), -1)
                 h_i = torch.stack((h_i, h_b))
                 c_i = torch.stack((c_i, c_b))
             x = x_i
             h_n.append(h_i)
             c_n.append(h_i)
 
-        x = PackedSequence(x,
-                           sequence.batch_sizes,
-                           sequence.sorted_indices,
-                           sequence.unsorted_indices)
+        x = PackedSequence(x, sequence.batch_sizes, sequence.sorted_indices, sequence.unsorted_indices)
         hx = torch.cat(h_n, 0), torch.cat(c_n, 0)
         hx = self.permute_hidden(hx, sequence.unsorted_indices)
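To make the reorganization concrete, a usage sketch of the two classes that now live in lstm.py. It assumes the full implementations at this commit (only parts of VariationalLSTM appear in the hunks above, and its forward is assumed to return the packed outputs together with the final states); all sizes below are made up:

import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from supar.modules import CharLSTM, VariationalLSTM

# CharLSTM: character ids in, one vector per token out
chars = torch.randint(1, 100, (2, 4, 6))  # [batch_size, seq_len, fix_len]
chars[:, :, 4:] = 0                       # trailing character padding (pad_index=0)
chars[1, 3] = 0                           # a fully padded token position
char_lstm = CharLSTM(n_chars=100, n_embed=50, n_hidden=100, dropout=0.33)
print(char_lstm(chars).shape)             # torch.Size([2, 4, 100]); all-pad tokens get zero vectors

# VariationalLSTM: roughly the nn.LSTM API, but it only accepts a PackedSequence
words = torch.randn(2, 4, 100)            # e.g. word embeddings
packed = pack_padded_sequence(words, [4, 2], batch_first=True, enforce_sorted=False)
lstm = VariationalLSTM(input_size=100, hidden_size=300, bidirectional=True, dropout=0.33)
out, (h_n, c_n) = lstm(packed)
out, _ = pad_packed_sequence(out, batch_first=True)
print(out.shape)                          # torch.Size([2, 4, 600]) for a bidirectional model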
