# -*- coding: utf-8 -*-

import torch
import torch.nn as nn
from supar.modules import LSTM, MLP, BertEmbedding, Biaffine, CharLSTM
from supar.modules.dropout import IndependentDropout, SharedDropout
from supar.utils import Config
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class BiaffineSemanticDependencyModel(nn.Module):
    r"""
    The implementation of Biaffine Semantic Dependency Parser.

    References:
        - Timothy Dozat and Christopher D. Manning. 2018.
          `Simpler but More Accurate Semantic Dependency Parsing`_.

    Args:
        n_words (int):
            The size of the word vocabulary.
        n_feats (int):
            The size of the feat vocabulary.
        n_labels (int):
            The number of labels in the treebank.
        feat (str):
            Specifies which type of additional feature to use: ``'char'`` | ``'bert'`` | ``'tag'``.
            ``'char'``: Character-level representations extracted by CharLSTM.
            ``'bert'``: BERT representations, other pretrained language models like XLNet are also feasible.
            ``'tag'``: POS tag embeddings.
            Default: ``'char'``.
        n_embed (int):
            The size of word embeddings. Default: 100.
        n_embed_proj (int):
            The size of linearly transformed word embeddings. Default: 125.
        n_feat_embed (int):
            The size of feature representations. Default: 100.
        n_char_embed (int):
            The size of character embeddings serving as inputs of CharLSTM, required if ``feat='char'``. Default: 50.
        bert (str):
            Specifies which kind of language model to use, e.g., ``'bert-base-cased'`` and ``'xlnet-base-cased'``.
            This is required if ``feat='bert'``. The full list can be found in `transformers`_.
            Default: ``None``.
        n_bert_layers (int):
            Specifies how many last layers to use. Required if ``feat='bert'``.
            The final outputs would be the weighted sum of the hidden states of these layers.
            Default: 4.
        mix_dropout (float):
            The dropout ratio of BERT layers. Required if ``feat='bert'``. Default: .0.
        embed_dropout (float):
            The dropout ratio of input embeddings. Default: .2.
        n_lstm_hidden (int):
            The size of LSTM hidden states. Default: 600.
        n_lstm_layers (int):
            The number of LSTM layers. Default: 3.
        lstm_dropout (float):
            The dropout ratio of LSTM. Default: .33.
        n_mlp_edge (int):
            Edge MLP size. Default: 600.
        n_mlp_label (int):
            Label MLP size. Default: 600.
        edge_mlp_dropout (float):
            The dropout ratio of edge MLP layers. Default: .25.
        label_mlp_dropout (float):
            The dropout ratio of label MLP layers. Default: .33.
        feat_pad_index (int):
            The index of the padding token in the feat vocabulary. Default: 0.
        pad_index (int):
            The index of the padding token in the word vocabulary. Default: 0.
        unk_index (int):
            The index of the unknown token in the word vocabulary. Default: 1.
        interpolation (float):
            Constant balancing the label loss against the edge loss. Default: .1.

    .. _Simpler but More Accurate Semantic Dependency Parsing:
        https://www.aclweb.org/anthology/P18-2077/
    .. _transformers:
        https://github.com/huggingface/transformers
    """

    def __init__(self,
                 n_words,
                 n_feats,
                 n_labels,
                 feat='char',
                 n_embed=100,
                 n_embed_proj=125,
                 n_feat_embed=100,
                 n_char_embed=50,
                 bert=None,
                 n_bert_layers=4,
                 mix_dropout=.0,
                 embed_dropout=.2,
                 n_lstm_hidden=600,
                 n_lstm_layers=3,
                 lstm_dropout=.33,
                 n_mlp_edge=600,
                 n_mlp_label=600,
                 edge_mlp_dropout=.25,
                 label_mlp_dropout=.33,
                 feat_pad_index=0,
                 pad_index=0,
                 unk_index=1,
                 interpolation=0.1,
                 **kwargs):
        super().__init__()

        self.args = Config().update(locals())
        # the embedding layer
        self.word_embed = nn.Embedding(num_embeddings=n_words,
                                       embedding_dim=n_embed)
        self.embed_proj = nn.Linear(n_embed, n_embed_proj)

        if feat == 'char':
            self.feat_embed = CharLSTM(n_chars=n_feats,
                                       n_embed=n_char_embed,
                                       n_out=n_feat_embed,
                                       pad_index=feat_pad_index)
        elif feat == 'bert':
            self.feat_embed = BertEmbedding(model=bert,
                                            n_layers=n_bert_layers,
                                            n_out=n_feat_embed,
                                            pad_index=feat_pad_index,
                                            dropout=mix_dropout)
            self.n_feat_embed = self.feat_embed.n_out
        elif feat == 'tag':
            self.feat_embed = nn.Embedding(num_embeddings=n_feats,
                                           embedding_dim=n_feat_embed)
        else:
            raise RuntimeError("The feat type should be in ['char', 'bert', 'tag'].")
        self.embed_dropout = IndependentDropout(p=embed_dropout)

        # the lstm layer
        self.lstm = LSTM(input_size=n_embed+n_feat_embed+n_embed_proj,
                         hidden_size=n_lstm_hidden,
                         num_layers=n_lstm_layers,
                         bidirectional=True,
                         dropout=lstm_dropout)
        self.lstm_dropout = SharedDropout(p=lstm_dropout)

        # the MLP layers
        self.mlp_edge_d = MLP(n_in=n_lstm_hidden*2, n_out=n_mlp_edge, dropout=edge_mlp_dropout, activation=False)
        self.mlp_edge_h = MLP(n_in=n_lstm_hidden*2, n_out=n_mlp_edge, dropout=edge_mlp_dropout, activation=False)
        self.mlp_label_d = MLP(n_in=n_lstm_hidden*2, n_out=n_mlp_label, dropout=label_mlp_dropout, activation=False)
        self.mlp_label_h = MLP(n_in=n_lstm_hidden*2, n_out=n_mlp_label, dropout=label_mlp_dropout, activation=False)

        # the Biaffine layers
        self.edge_attn = Biaffine(n_in=n_mlp_edge, n_out=2, bias_x=True, bias_y=True)
        self.label_attn = Biaffine(n_in=n_mlp_label, n_out=n_labels, bias_x=True, bias_y=True)
        self.criterion = nn.CrossEntropyLoss()
        self.pad_index = pad_index
        self.unk_index = unk_index
        self.interpolation = interpolation

    def load_pretrained(self, embed=None):
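        # NOTE (editorial comment): `embed` is expected to have `n_embed` columns,
        # since the pretrained vectors are projected by `embed_proj`
        # (Linear(n_embed, n_embed_proj)) and concatenated with the trainable word
        # embeddings in `forward`.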
        if embed is not None:
            self.pretrained = nn.Embedding.from_pretrained(embed)
        return self

    def forward(self, words, feats):
        r"""
        Args:
            words (~torch.LongTensor): ``[batch_size, seq_len]``.
                Word indices.
            feats (~torch.LongTensor):
                Feat indices.
                If feat is ``'char'`` or ``'bert'``, the size of feats should be ``[batch_size, seq_len, fix_len]``.
                If ``'tag'``, the size is ``[batch_size, seq_len]``.

        Returns:
            ~torch.Tensor, ~torch.Tensor:
                The first tensor of shape ``[batch_size, seq_len, seq_len, 2]`` holds scores of all possible edges.
                The second of shape ``[batch_size, seq_len, seq_len, n_labels]`` holds
                scores of all possible labels on each edge.
        """

        batch_size, seq_len = words.shape
        # get the mask and lengths of given batch
        mask = words.ne(self.pad_index)
        ext_words = words
        # set the indices larger than num_embeddings to unk_index
        if hasattr(self, 'pretrained'):
            ext_mask = words.ge(self.word_embed.num_embeddings)
            ext_words = words.masked_fill(ext_mask, self.unk_index)

        # get outputs from embedding layers
        word_embed = self.word_embed(ext_words)
        if hasattr(self, 'pretrained'):
            word_embed = torch.cat((word_embed, self.embed_proj(self.pretrained(words))), -1)

        feat_embed = self.feat_embed(feats)
        word_embed, feat_embed = self.embed_dropout(word_embed, feat_embed)
        # concatenate the word and feat representations
        embed = torch.cat((word_embed, feat_embed), -1)

        # lengths must live on the CPU (here a plain list) for pack_padded_sequence
        x = pack_padded_sequence(embed, mask.sum(1).tolist(), True, False)
        x, _ = self.lstm(x)
        x, _ = pad_packed_sequence(x, True, total_length=seq_len)
        x = self.lstm_dropout(x)

        # apply MLPs to the BiLSTM output states
        edge_d = self.mlp_edge_d(x)
        edge_h = self.mlp_edge_h(x)
        label_d = self.mlp_label_d(x)
        label_h = self.mlp_label_h(x)

        # [batch_size, seq_len, seq_len, 2]
        s_edge = self.edge_attn(edge_d, edge_h).permute(0, 2, 3, 1)
        # [batch_size, seq_len, seq_len, n_labels]
        s_label = self.label_attn(label_d, label_h).permute(0, 2, 3, 1)

        return s_edge, s_label

    def loss(self, s_edge, s_label, edges, labels, mask):
        r"""
        Args:
            s_edge (~torch.Tensor): ``[batch_size, seq_len, seq_len, 2]``.
                Scores of all possible edges.
            s_label (~torch.Tensor): ``[batch_size, seq_len, seq_len, n_labels]``.
                Scores of all possible labels on each edge.
            edges (~torch.LongTensor): ``[batch_size, seq_len, seq_len]``.
                The tensor of gold-standard edges.
            labels (~torch.LongTensor): ``[batch_size, seq_len, seq_len]``.
                The tensor of gold-standard labels.
            mask (~torch.BoolTensor): ``[batch_size, seq_len, seq_len]``.
                The mask covering the unpadded token pairs.

        Returns:
            ~torch.Tensor:
                The training loss.
        """

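        # `mask` is expected to already be the pairwise token mask; in the accompanying
        # parser it is typically built from the token-level mask as (illustrative sketch):
        #   token_mask = words.ne(pad_index)
        #   mask = token_mask.unsqueeze(1) & token_mask.unsqueeze(2)
        # `edge_mask` below additionally keeps only gold edges, so the label loss is
        # computed over gold edges alone.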
        edge_mask = edges.gt(0) & mask
        edge_loss = self.criterion(s_edge[mask], edges[mask])
        label_loss = self.criterion(s_label[edge_mask], labels[edge_mask])
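        # with the default interpolation of .1, the total loss is
        # .1 * label_loss + .9 * edge_loss, i.e. edge detection is weighted more heavily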
        return self.interpolation * label_loss + (1 - self.interpolation) * edge_loss

    def decode(self, s_edge, s_label):
        r"""
        Args:
            s_edge (~torch.Tensor): ``[batch_size, seq_len, seq_len, 2]``.
                Scores of all possible edges.
            s_label (~torch.Tensor): ``[batch_size, seq_len, seq_len, n_labels]``.
                Scores of all possible labels on each edge.

        Returns:
            ~torch.Tensor, ~torch.Tensor:
                Predicted edges and labels of shape ``[batch_size, seq_len, seq_len]``.
        """

        return s_edge.argmax(-1), s_label.argmax(-1)
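

# ---------------------------------------------------------------------------
# Minimal smoke-test sketch (editorial addition, not part of the original module).
# It assumes `supar` and its dependencies are installed; the vocabulary sizes,
# the 'tag' feature type, and all random tensors below are arbitrary illustrative
# values, not defaults prescribed by the library.
if __name__ == '__main__':
    model = BiaffineSemanticDependencyModel(n_words=100, n_feats=50, n_labels=16, feat='tag')
    # the LSTM input size assumes pretrained embeddings of width n_embed are attached
    model = model.load_pretrained(torch.randn(100, 100))

    words = torch.randint(1, 100, (2, 5))  # [batch_size, seq_len], no padding tokens
    feats = torch.randint(1, 50, (2, 5))   # POS tag indices, same shape as words
    s_edge, s_label = model(words, feats)  # [2, 5, 5, 2] and [2, 5, 5, 16]

    # pairwise mask over unpadded token pairs, as required by `loss`
    token_mask = words.ne(model.pad_index)
    mask = token_mask.unsqueeze(1) & token_mask.unsqueeze(2)
    edges = torch.randint(0, 2, (2, 5, 5))    # random stand-in for gold edges
    labels = torch.randint(0, 16, (2, 5, 5))  # random stand-in for gold labels
    print(model.loss(s_edge, s_label, edges, labels, mask))

    edge_preds, label_preds = model.decode(s_edge, s_label)
    print(edge_preds.shape, label_preds.shape)  # both [2, 5, 5]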