ELMoEmbedding

yzhangcs · yzhangcs · commit 1b0b07cc70d6 · 2021-07-17T17:49:15.000+08:00
diff --git a/docs/source/modules/pretrained.rst b/docs/source/modules/pretrained.rst
@@ -1,13 +1,18 @@
 Transformer Layers
 ================================================================
 
-.. currentmodule:: supar.modules.transformer
+.. currentmodule:: supar.modules.pretrained
 
 TransformerEmbedding
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: TransformerEmbedding
     :members:
 
+ELMoEmbedding
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. autoclass:: ELMoEmbedding
+    :members:
+
 ScalarMix
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: ScalarMix
diff --git a/supar/cmds/biaffine_dep.py b/supar/cmds/biaffine_dep.py
@@ -15,7 +15,7 @@ def main():
     subparsers = parser.add_subparsers(title='Commands', dest='mode')
     # train
     subparser = subparsers.add_parser('train', help='Train a parser.')
-    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'bert'], nargs='+', help='features to use')
+    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'elmo', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
     subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')
diff --git a/supar/cmds/biaffine_sdp.py b/supar/cmds/biaffine_sdp.py
@@ -12,7 +12,7 @@ def main():
     subparsers = parser.add_subparsers(title='Commands', dest='mode')
     # train
     subparser = subparsers.add_parser('train', help='Train a parser.')
-    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'lemma', 'bert'], nargs='+', help='features to use')
+    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'lemma', 'elmo', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
     subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')
diff --git a/supar/cmds/crf2o_dep.py b/supar/cmds/crf2o_dep.py
@@ -16,7 +16,7 @@ def main():
     subparsers = parser.add_subparsers(title='Commands', dest='mode')
     # train
     subparser = subparsers.add_parser('train', help='Train a parser.')
-    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'bert'], nargs='+', help='features to use')
+    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'elmo', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
     subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')
diff --git a/supar/cmds/crf_con.py b/supar/cmds/crf_con.py
@@ -13,7 +13,7 @@ def main():
     subparsers = parser.add_subparsers(title='Commands', dest='mode')
     # train
     subparser = subparsers.add_parser('train', help='Train a parser.')
-    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'bert'], nargs='+', help='features to use')
+    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'elmo', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
     subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')
diff --git a/supar/cmds/crf_dep.py b/supar/cmds/crf_dep.py
@@ -16,7 +16,7 @@ def main():
     subparsers = parser.add_subparsers(title='Commands', dest='mode')
     # train
     subparser = subparsers.add_parser('train', help='Train a parser.')
-    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'bert'], nargs='+', help='features to use')
+    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'elmo', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
     subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')
diff --git a/supar/cmds/vi_con.py b/supar/cmds/vi_con.py
@@ -12,7 +12,7 @@ def main():
     subparsers = parser.add_subparsers(title='Commands', dest='mode')
     # train
     subparser = subparsers.add_parser('train', help='Train a parser.')
-    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'bert'], nargs='+', help='features to use')
+    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'elmo', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
     subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')
diff --git a/supar/cmds/vi_dep.py b/supar/cmds/vi_dep.py
@@ -15,7 +15,7 @@ def main():
     subparsers = parser.add_subparsers(title='Commands', dest='mode')
     # train
     subparser = subparsers.add_parser('train', help='Train a parser.')
-    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'bert'], nargs='+', help='features to use')
+    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'elmo', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
     subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')
diff --git a/supar/cmds/vi_sdp.py b/supar/cmds/vi_sdp.py
@@ -12,7 +12,7 @@ def main():
     subparsers = parser.add_subparsers(title='Commands', dest='mode')
     # train
     subparser = subparsers.add_parser('train', help='Train a parser.')
-    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'lemma', 'bert'], nargs='+', help='features to use')
+    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'lemma', 'elmo', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
     subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')
diff --git a/supar/models/con.py b/supar/models/con.py
@@ -46,6 +46,11 @@ class CRFConstituencyModel(Model):
             The size of hidden states of CharLSTM, required if using CharLSTM. Default: 100.
         char_pad_index (int):
             The index of the padding token in the character vocabulary, required if using CharLSTM. Default: 0.
+        elmo (str):
+            Name of the pretrained ELMo registered in `ELMoEmbedding.OPTION`. Default: ``'original_5b'``.
+        elmo_bos_eos (tuple[bool]):
+            A tuple of two boolean values indicating whether to keep start/end boundaries of elmo outputs.
+            Default: ``(True, True)``.
         bert (str):
             Specifies which kind of language model to use, e.g., ``'bert-base-cased'``.
             This is required if ``encoder='bert'`` or using BERT features. The full list can be found in `transformers`_.
@@ -101,6 +106,8 @@ def __init__(self,
                  n_char_embed=50,
                  n_char_hidden=100,
                  char_pad_index=0,
+                 elmo='original_5b',
+                 elmo_bos_eos=(True, True),
                  bert=None,
                  n_bert_layers=4,
                  mix_dropout=.0,
@@ -247,6 +254,11 @@ class VIConstituencyModel(CRFConstituencyModel):
             The size of hidden states of CharLSTM, required if using CharLSTM. Default: 100.
         char_pad_index (int):
             The index of the padding token in the character vocabulary, required if using CharLSTM. Default: 0.
+        elmo (str):
+            Name of the pretrained ELMo registered in `ELMoEmbedding.OPTION`. Default: ``'original_5b'``.
+        elmo_bos_eos (tuple[bool]):
+            A tuple of two boolean values indicating whether to keep start/end boundaries of elmo outputs.
+            Default: ``(True, True)``.
         bert (str):
             Specifies which kind of language model to use, e.g., ``'bert-base-cased'``.
             This is required if ``encoder='bert'`` or using BERT features. The full list can be found in `transformers`_.
@@ -310,6 +322,8 @@ def __init__(self,
                  n_char_embed=50,
                  n_char_hidden=100,
                  char_pad_index=0,
+                 elmo='original_5b',
+                 elmo_bos_eos=(True, True),
                  bert=None,
                  n_bert_layers=4,
                  mix_dropout=.0,
diff --git a/supar/models/dep.py b/supar/models/dep.py
@@ -47,6 +47,11 @@ class BiaffineDependencyModel(Model):
             The size of hidden states of CharLSTM, required if using CharLSTM. Default: 100.
         char_pad_index (int):
             The index of the padding token in the character vocabulary, required if using CharLSTM. Default: 0.
+        elmo (str):
+            Name of the pretrained ELMo registered in `ELMoEmbedding.OPTION`. Default: ``'original_5b'``.
+        elmo_bos_eos (tuple[bool]):
+            A tuple of two boolean values indicating whether to keep start/end boundaries of elmo outputs.
+            Default: ``(True, False)``.
         bert (str):
             Specifies which kind of language model to use, e.g., ``'bert-base-cased'``.
             This is required if ``encoder='bert'`` or using BERT features. The full list can be found in `transformers`_.
@@ -104,6 +109,8 @@ def __init__(self,
                  n_char_embed=50,
                  n_char_hidden=100,
                  char_pad_index=0,
+                 elmo='original_5b',
+                 elmo_bos_eos=(True, False),
                  bert=None,
                  n_bert_layers=4,
                  mix_dropout=.0,
@@ -263,6 +270,11 @@ class CRFDependencyModel(BiaffineDependencyModel):
             The size of hidden states of CharLSTM, required if using CharLSTM. Default: 100.
         char_pad_index (int):
             The index of the padding token in the character vocabulary, required if using CharLSTM. Default: 0.
+        elmo (str):
+            Name of the pretrained ELMo registered in `ELMoEmbedding.OPTION`. Default: ``'original_5b'``.
+        elmo_bos_eos (tuple[bool]):
+            A tuple of two boolean values indicating whether to keep start/end boundaries of elmo outputs.
+            Default: ``(True, False)``.
         bert (str):
             Specifies which kind of language model to use, e.g., ``'bert-base-cased'``.
             This is required if ``encoder='bert'`` or using BERT features. The full list can be found in `transformers`_.
@@ -383,6 +395,11 @@ class CRF2oDependencyModel(BiaffineDependencyModel):
             The size of hidden states of CharLSTM, required if using CharLSTM. Default: 100.
         char_pad_index (int):
             The index of the padding token in the character vocabulary, required if using CharLSTM. Default: 0.
+        elmo (str):
+            Name of the pretrained ELMo registered in `ELMoEmbedding.OPTION`. Default: ``'original_5b'``.
+        elmo_bos_eos (tuple[bool]):
+            A tuple of two boolean values indicating whether to keep start/end boundaries of elmo outputs.
+            Default: ``(True, False)``.
         bert (str):
             Specifies which kind of language model to use, e.g., ``'bert-base-cased'``.
             This is required if ``encoder='bert'`` or using BERT features. The full list can be found in `transformers`_.
@@ -439,6 +456,8 @@ def __init__(self,
                  n_char_embed=50,
                  n_char_hidden=100,
                  char_pad_index=0,
+                 elmo='original_5b',
+                 elmo_bos_eos=(True, False),
                  bert=None,
                  n_bert_layers=4,
                  mix_dropout=.0,
@@ -622,6 +641,11 @@ class VIDependencyModel(BiaffineDependencyModel):
             The size of hidden states of CharLSTM, required if using CharLSTM. Default: 100.
         char_pad_index (int):
             The index of the padding token in the character vocabulary, required if using CharLSTM. Default: 0.
+        elmo (str):
+            Name of the pretrained ELMo registered in `ELMoEmbedding.OPTION`. Default: ``'original_5b'``.
+        elmo_bos_eos (tuple[bool]):
+            A tuple of two boolean values indicating whether to keep start/end boundaries of elmo outputs.
+            Default: ``(True, False)``.
         bert (str):
             Specifies which kind of language model to use, e.g., ``'bert-base-cased'``.
             This is required if ``encoder='bert'`` or using BERT features. The full list can be found in `transformers`_.
@@ -687,6 +711,8 @@ def __init__(self,
                  n_char_embed=50,
                  n_char_hidden=100,
                  char_pad_index=0,
+                 elmo='original_5b',
+                 elmo_bos_eos=(True, False),
                  bert=None,
                  n_bert_layers=4,
                  mix_dropout=.0,
diff --git a/supar/models/model.py b/supar/models/model.py
@@ -2,8 +2,9 @@
 
 import torch
 import torch.nn as nn
-from supar.modules import (CharLSTM, IndependentDropout, SharedDropout,
-                           TransformerEmbedding, VariationalLSTM)
+from supar.modules import (CharLSTM, ELMoEmbedding, IndependentDropout,
+                           SharedDropout, TransformerEmbedding,
+                           VariationalLSTM)
 from supar.utils import Config
 from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
 
@@ -24,6 +25,8 @@ def __init__(self,
                  n_char_hidden=100,
                  char_pad_index=0,
                  char_dropout=0,
+                 elmo_bos_eos=(True, True),
+                 elmo_dropout=0.5,
                  bert=None,
                  n_bert_layers=4,
                  mix_dropout=.0,
@@ -63,6 +66,12 @@ def __init__(self,
                 self.lemma_embed = nn.Embedding(num_embeddings=n_lemmas,
                                                 embedding_dim=n_feat_embed)
                 n_input += n_feat_embed
+            if 'elmo' in feat:
+                self.elmo_embed = ELMoEmbedding(n_out=n_feat_embed,
+                                                bos_eos=elmo_bos_eos,
+                                                dropout=elmo_dropout,
+                                                requires_grad=(not freeze))
+                n_input += self.elmo_embed.n_out
             if 'bert' in feat:
                 self.bert_embed = TransformerEmbedding(model=bert,
                                                        n_layers=n_bert_layers,
@@ -126,6 +135,8 @@ def embed(self, words, feats):
             feat_embeds.append(self.tag_embed(feats.pop()))
         if 'char' in self.args.feat:
             feat_embeds.append(self.char_embed(feats.pop(0)))
+        if 'elmo' in self.args.feat:
+            feat_embeds.append(self.elmo_embed(feats.pop(0)))
         if 'bert' in self.args.feat:
             feat_embeds.append(self.bert_embed(feats.pop(0)))
         if 'lemma' in self.args.feat:
diff --git a/supar/models/sdp.py b/supar/models/sdp.py
@@ -46,6 +46,11 @@ class BiaffineSemanticDependencyModel(Model):
             The size of hidden states of CharLSTM, required if using CharLSTM. Default: 100.
         char_pad_index (int):
             The index of the padding token in the character vocabulary, required if using CharLSTM. Default: 0.
+        elmo (str):
+            Name of the pretrained ELMo registered in `ELMoEmbedding.OPTION`. Default: ``'original_5b'``.
+        elmo_bos_eos (tuple[bool]):
+            A tuple of two boolean values indicating whether to keep start/end boundaries of elmo outputs.
+            Default: ``(True, False)``.
         bert (str):
             Specifies which kind of language model to use, e.g., ``'bert-base-cased'``.
             This is required if ``encoder='bert'`` or using BERT features. The full list can be found in `transformers`_.
@@ -107,6 +112,8 @@ def __init__(self,
                  n_char_hidden=400,
                  char_pad_index=0,
                  char_dropout=0.33,
+                 elmo='original_5b',
+                 elmo_bos_eos=(True, False),
                  bert=None,
                  n_bert_layers=4,
                  mix_dropout=.0,
@@ -206,7 +213,7 @@ def decode(self, s_edge, s_label):
                 Scores of all possible labels on each edge.
 
         Returns:
-            ~torch.BoolTensor:
+            ~torch.LongTensor:
                 Predicted labels of shape ``[batch_size, seq_len, seq_len]``.
         """
 
@@ -252,6 +259,11 @@ class VISemanticDependencyModel(BiaffineSemanticDependencyModel):
             The size of hidden states of CharLSTM, required if using CharLSTM. Default: 100.
         char_pad_index (int):
             The index of the padding token in the character vocabulary, required if using CharLSTM. Default: 0.
+        elmo (str):
+            Name of the pretrained ELMo registered in `ELMoEmbedding.OPTION`. Default: ``'original_5b'``.
+        elmo_bos_eos (tuple[bool]):
+            A tuple of two boolean values indicating whether to keep start/end boundaries of elmo outputs.
+            Default: ``(True, False)``.
         bert (str):
             Specifies which kind of language model to use, e.g., ``'bert-base-cased'``.
             This is required if ``encoder='bert'`` or using BERT features. The full list can be found in `transformers`_.
@@ -321,6 +333,8 @@ def __init__(self,
                  n_char_hidden=100,
                  char_pad_index=0,
                  char_dropout=0,
+                 elmo='original_5b',
+                 elmo_bos_eos=(True, False),
                  bert=None,
                  n_bert_layers=4,
                  mix_dropout=.0,
diff --git a/supar/modules/__init__.py b/supar/modules/__init__.py
@@ -4,8 +4,8 @@
 from .dropout import IndependentDropout, SharedDropout
 from .lstm import CharLSTM, VariationalLSTM
 from .mlp import MLP
+from .pretrained import ELMoEmbedding, TransformerEmbedding
 from .scalar_mix import ScalarMix
-from .transformer import TransformerEmbedding
 
-__all__ = ['MLP', 'TransformerEmbedding', 'Biaffine', 'CharLSTM',
+__all__ = ['MLP', 'TransformerEmbedding', 'Biaffine', 'CharLSTM', 'ELMoEmbedding',
            'IndependentDropout', 'ScalarMix', 'SharedDropout', 'Triaffine', 'VariationalLSTM']
diff --git a/supar/modules/pretrained.py b/supar/modules/pretrained.py
diff --git a/supar/parsers/con.py b/supar/parsers/con.py
diff --git a/supar/parsers/dep.py b/supar/parsers/dep.py
diff --git a/supar/parsers/sdp.py b/supar/parsers/sdp.py
diff --git a/supar/utils/data.py b/supar/utils/data.py