Commit a4befaa

ELMo support

1 parent 62f660c · commit a4befaa

File tree

20 files changed: +203 -30 lines changed

supar/cmds/biaffine_dep.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ def main():
     subparsers = parser.add_subparsers(title='Commands', dest='mode')
     # train
     subparser = subparsers.add_parser('train', help='Train a parser.')
-    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'bert'], nargs='+', help='features to use')
+    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'elmo', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
     subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')

supar/cmds/biaffine_sdp.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ def main():
     subparsers = parser.add_subparsers(title='Commands', dest='mode')
     # train
     subparser = subparsers.add_parser('train', help='Train a parser.')
-    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'lemma', 'bert'], nargs='+', help='features to use')
+    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'lemma', 'elmo', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
     subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')

supar/cmds/crf2o_dep.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ def main():
     subparsers = parser.add_subparsers(title='Commands', dest='mode')
     # train
     subparser = subparsers.add_parser('train', help='Train a parser.')
-    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'bert'], nargs='+', help='features to use')
+    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'elmo', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
     subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')

supar/cmds/crf_con.py

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ def main():
     subparsers = parser.add_subparsers(title='Commands', dest='mode')
     # train
     subparser = subparsers.add_parser('train', help='Train a parser.')
-    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'bert'], nargs='+', help='features to use')
+    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'elmo', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
     subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')

supar/cmds/crf_dep.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ def main():
     subparsers = parser.add_subparsers(title='Commands', dest='mode')
     # train
     subparser = subparsers.add_parser('train', help='Train a parser.')
-    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'bert'], nargs='+', help='features to use')
+    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'elmo', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
     subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')

supar/cmds/vi_con.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ def main():
     subparsers = parser.add_subparsers(title='Commands', dest='mode')
     # train
     subparser = subparsers.add_parser('train', help='Train a parser.')
-    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'bert'], nargs='+', help='features to use')
+    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'elmo', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
     subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')

supar/cmds/vi_dep.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ def main():
     subparsers = parser.add_subparsers(title='Commands', dest='mode')
     # train
     subparser = subparsers.add_parser('train', help='Train a parser.')
-    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'bert'], nargs='+', help='features to use')
+    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'elmo', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
     subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')

supar/cmds/vi_sdp.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ def main():
     subparsers = parser.add_subparsers(title='Commands', dest='mode')
     # train
     subparser = subparsers.add_parser('train', help='Train a parser.')
-    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'lemma', 'bert'], nargs='+', help='features to use')
+    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'lemma', 'elmo', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
     subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')

supar/cmds/vi_srl.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ def main():
     subparsers = parser.add_subparsers(title='Commands', dest='mode')
     # train
     subparser = subparsers.add_parser('train', help='Train a parser.')
-    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'lemma', 'bert'], nargs='+', help='features to use')
+    subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'lemma', 'elmo', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
     subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')
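
With this change, all nine training entry points above (biaffine_dep, biaffine_sdp, crf2o_dep, crf_con, crf_dep, vi_con, vi_dep, vi_sdp and vi_srl) accept elmo as an additional value for --feat/-f, next to the existing tag, char, lemma and bert choices; since the option is declared with nargs='+', ELMo can be combined with other features in the same run.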

supar/models/con.py

Lines changed: 14 additions & 0 deletions
@@ -45,6 +45,11 @@ class CRFConstituencyModel(Model):
             The size of hidden states of CharLSTM, required if using CharLSTM. Default: 100.
         char_pad_index (int):
             The index of the padding token in the character vocabulary, required if using CharLSTM. Default: 0.
+        elmo (str):
+            Name of the pretrained ELMo registered in `ELMoEmbedding.OPTION`. Default: ``'original_5b'``.
+        elmo_bos_eos (tuple[bool]):
+            A tuple of two boolean values indicating whether to keep start/end boundaries of elmo outputs.
+            Default: ``(True, False)``.
         bert (str):
             Specifies which kind of language model to use, e.g., ``'bert-base-cased'``.
             This is required if ``encoder='bert'`` or using BERT features. The full list can be found in `transformers`_.
@@ -100,6 +105,8 @@ def __init__(self,
                  n_char_embed=50,
                  n_char_hidden=100,
                  char_pad_index=0,
+                 elmo='original_5b',
+                 elmo_bos_eos=(True, True),
                  bert=None,
                  n_bert_layers=4,
                  mix_dropout=.0,
@@ -247,6 +254,11 @@ class VIConstituencyModel(CRFConstituencyModel):
             The size of hidden states of CharLSTM, required if using CharLSTM. Default: 100.
         char_pad_index (int):
             The index of the padding token in the character vocabulary, required if using CharLSTM. Default: 0.
+        elmo (str):
+            Name of the pretrained ELMo registered in `ELMoEmbedding.OPTION`. Default: ``'original_5b'``.
+        elmo_bos_eos (tuple[bool]):
+            A tuple of two boolean values indicating whether to keep start/end boundaries of elmo outputs.
+            Default: ``(True, False)``.
         bert (str):
             Specifies which kind of language model to use, e.g., ``'bert-base-cased'``.
             This is required if ``encoder='bert'`` or using BERT features. The full list can be found in `transformers`_.
@@ -310,6 +322,8 @@ def __init__(self,
                  n_char_embed=50,
                  n_char_hidden=100,
                  char_pad_index=0,
+                 elmo='original_5b',
+                 elmo_bos_eos=(True, True),
                  bert=None,
                  n_bert_layers=4,
                  mix_dropout=.0,

supar/models/dep.py

Lines changed: 26 additions & 0 deletions
@@ -47,6 +47,11 @@ class BiaffineDependencyModel(Model):
             The size of hidden states of CharLSTM, required if using CharLSTM. Default: 100.
         char_pad_index (int):
             The index of the padding token in the character vocabulary, required if using CharLSTM. Default: 0.
+        elmo (str):
+            Name of the pretrained ELMo registered in `ELMoEmbedding.OPTION`. Default: ``'original_5b'``.
+        elmo_bos_eos (tuple[bool]):
+            A tuple of two boolean values indicating whether to keep start/end boundaries of elmo outputs.
+            Default: ``(True, False)``.
         bert (str):
             Specifies which kind of language model to use, e.g., ``'bert-base-cased'``.
             This is required if ``encoder='bert'`` or using BERT features. The full list can be found in `transformers`_.
@@ -104,6 +109,8 @@ def __init__(self,
                  n_char_embed=50,
                  n_char_hidden=100,
                  char_pad_index=0,
+                 elmo='original_5b',
+                 elmo_bos_eos=(True, False),
                  bert=None,
                  n_bert_layers=4,
                  mix_dropout=.0,
@@ -262,6 +269,11 @@ class CRFDependencyModel(BiaffineDependencyModel):
             The size of hidden states of CharLSTM, required if using CharLSTM. Default: 100.
         char_pad_index (int):
             The index of the padding token in the character vocabulary, required if using CharLSTM. Default: 0.
+        elmo (str):
+            Name of the pretrained ELMo registered in `ELMoEmbedding.OPTION`. Default: ``'original_5b'``.
+        elmo_bos_eos (tuple[bool]):
+            A tuple of two boolean values indicating whether to keep start/end boundaries of elmo outputs.
+            Default: ``(True, False)``.
         bert (str):
             Specifies which kind of language model to use, e.g., ``'bert-base-cased'``.
             This is required if ``encoder='bert'`` or using BERT features. The full list can be found in `transformers`_.
@@ -379,6 +391,11 @@ class CRF2oDependencyModel(BiaffineDependencyModel):
             The size of hidden states of CharLSTM, required if using CharLSTM. Default: 100.
         char_pad_index (int):
             The index of the padding token in the character vocabulary, required if using CharLSTM. Default: 0.
+        elmo (str):
+            Name of the pretrained ELMo registered in `ELMoEmbedding.OPTION`. Default: ``'original_5b'``.
+        elmo_bos_eos (tuple[bool]):
+            A tuple of two boolean values indicating whether to keep start/end boundaries of elmo outputs.
+            Default: ``(True, False)``.
         bert (str):
             Specifies which kind of language model to use, e.g., ``'bert-base-cased'``.
             This is required if ``encoder='bert'`` or using BERT features. The full list can be found in `transformers`_.
@@ -435,6 +452,8 @@ def __init__(self,
                  n_char_embed=50,
                  n_char_hidden=100,
                  char_pad_index=0,
+                 elmo='original_5b',
+                 elmo_bos_eos=(True, False),
                  bert=None,
                  n_bert_layers=4,
                  mix_dropout=.0,
@@ -618,6 +637,11 @@ class VIDependencyModel(BiaffineDependencyModel):
             The size of hidden states of CharLSTM, required if using CharLSTM. Default: 100.
         char_pad_index (int):
             The index of the padding token in the character vocabulary, required if using CharLSTM. Default: 0.
+        elmo (str):
+            Name of the pretrained ELMo registered in `ELMoEmbedding.OPTION`. Default: ``'original_5b'``.
+        elmo_bos_eos (tuple[bool]):
+            A tuple of two boolean values indicating whether to keep start/end boundaries of elmo outputs.
+            Default: ``(True, False)``.
         bert (str):
             Specifies which kind of language model to use, e.g., ``'bert-base-cased'``.
             This is required if ``encoder='bert'`` or using BERT features. The full list can be found in `transformers`_.
@@ -683,6 +707,8 @@ def __init__(self,
                  n_char_embed=50,
                  n_char_hidden=100,
                  char_pad_index=0,
+                 elmo='original_5b',
+                 elmo_bos_eos=(True, False),
                  bert=None,
                  n_bert_layers=4,
                  mix_dropout=.0,

supar/models/model.py

Lines changed: 14 additions & 2 deletions
@@ -2,8 +2,9 @@
 
 import torch
 import torch.nn as nn
-from supar.modules import (CharLSTM, IndependentDropout, SharedDropout,
-                           TransformerEmbedding, VariationalLSTM)
+from supar.modules import (CharLSTM, ELMoEmbedding, IndependentDropout,
+                           SharedDropout, TransformerEmbedding,
+                           VariationalLSTM)
 from supar.utils import Config
 from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
 
@@ -24,6 +25,8 @@ def __init__(self,
                  n_char_hidden=100,
                  char_pad_index=0,
                  char_dropout=0,
+                 elmo_bos_eos=(True, True),
+                 elmo_dropout=0.5,
                  bert=None,
                  n_bert_layers=4,
                  mix_dropout=.0,
@@ -63,6 +66,13 @@ def __init__(self,
             self.lemma_embed = nn.Embedding(num_embeddings=n_lemmas,
                                             embedding_dim=n_feat_embed)
             n_input += n_feat_embed
+        if 'elmo' in feat:
+            self.elmo_embed = ELMoEmbedding(n_out=n_feat_embed,
+                                            bos_eos=elmo_bos_eos,
+                                            dropout=elmo_dropout,
+                                            requires_grad=(not freeze))
+            n_input += self.elmo_embed.n_out
+
         if 'bert' in feat:
             self.bert_embed = TransformerEmbedding(model=bert,
                                                    n_layers=n_bert_layers,
@@ -126,6 +136,8 @@ def embed(self, words, feats):
             feat_embeds.append(self.tag_embed(feats.pop()))
         if 'char' in self.args.feat:
             feat_embeds.append(self.char_embed(feats.pop(0)))
+        if 'elmo' in self.args.feat:
+            feat_embeds.append(self.elmo_embed(feats.pop(0)))
         if 'bert' in self.args.feat:
             feat_embeds.append(self.bert_embed(feats.pop(0)))
         if 'lemma' in self.args.feat:
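
As a reading aid for the hunks above, here is a minimal, self-contained sketch of the feature-concatenation pattern the ELMo branch plugs into: an optional per-token feature embedding is built in the constructor, its size is added to n_input, and its output is concatenated with the word embeddings in embed(). TinyModel and the linear stand-in are illustrative assumptions, not supar's actual Model or ELMoEmbedding.

import torch
import torch.nn as nn


class TinyModel(nn.Module):
    # Illustrative sketch only: mirrors the feature-concatenation pattern of supar's
    # Model, with a linear projection standing in for the real ELMoEmbedding.
    def __init__(self, n_words, n_embed=100, n_feat_embed=100, n_elmo=1024,
                 feat=('elmo',), elmo_dropout=0.5):
        super().__init__()
        self.feat = feat
        self.word_embed = nn.Embedding(n_words, n_embed)
        n_input = n_embed
        if 'elmo' in feat:
            # stand-in for ELMoEmbedding(n_out=n_feat_embed, bos_eos=..., dropout=...,
            # requires_grad=...): any module yielding one n_feat_embed-dim vector per token
            self.elmo_embed = nn.Sequential(nn.Linear(n_elmo, n_feat_embed),
                                            nn.Dropout(elmo_dropout))
            n_input += n_feat_embed
        self.n_input = n_input

    def embed(self, words, feats):
        # feats holds one tensor per requested feature, consumed in a fixed order
        word_embed = self.word_embed(words)                    # [batch, seq, n_embed]
        feat_embeds = []
        if 'elmo' in self.feat:
            feat_embeds.append(self.elmo_embed(feats.pop(0)))  # [batch, seq, n_feat_embed]
        return torch.cat([word_embed] + feat_embeds, -1)       # [batch, seq, n_input]


# toy usage with random stand-in "ELMo" vectors
model = TinyModel(n_words=100)
words = torch.randint(0, 100, (2, 5))
feats = [torch.randn(2, 5, 1024)]
print(model.embed(words, feats).shape)  # torch.Size([2, 5, 200])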

supar/models/sdp.py

Lines changed: 14 additions & 0 deletions
@@ -47,6 +47,11 @@ class BiaffineSemanticDependencyModel(Model):
             The size of hidden states of CharLSTM, required if using CharLSTM. Default: 100.
         char_pad_index (int):
             The index of the padding token in the character vocabulary, required if using CharLSTM. Default: 0.
+        elmo (str):
+            Name of the pretrained ELMo registered in `ELMoEmbedding.OPTION`. Default: ``'original_5b'``.
+        elmo_bos_eos (tuple[bool]):
+            A tuple of two boolean values indicating whether to keep start/end boundaries of elmo outputs.
+            Default: ``(True, False)``.
         bert (str):
             Specifies which kind of language model to use, e.g., ``'bert-base-cased'``.
             This is required if ``encoder='bert'`` or using BERT features. The full list can be found in `transformers`_.
@@ -108,6 +113,8 @@ def __init__(self,
                  n_char_hidden=400,
                  char_pad_index=0,
                  char_dropout=0.33,
+                 elmo='original_5b',
+                 elmo_bos_eos=(True, False),
                  bert=None,
                  n_bert_layers=4,
                  mix_dropout=.0,
@@ -253,6 +260,11 @@ class VISemanticDependencyModel(BiaffineSemanticDependencyModel):
             The size of hidden states of CharLSTM, required if using CharLSTM. Default: 100.
         char_pad_index (int):
             The index of the padding token in the character vocabulary, required if using CharLSTM. Default: 0.
+        elmo (str):
+            Name of the pretrained ELMo registered in `ELMoEmbedding.OPTION`. Default: ``'original_5b'``.
+        elmo_bos_eos (tuple[bool]):
+            A tuple of two boolean values indicating whether to keep start/end boundaries of elmo outputs.
+            Default: ``(True, False)``.
         bert (str):
             Specifies which kind of language model to use, e.g., ``'bert-base-cased'``.
             This is required if ``encoder='bert'`` or using BERT features. The full list can be found in `transformers`_.
@@ -322,6 +334,8 @@ def __init__(self,
                  n_char_hidden=100,
                  char_pad_index=0,
                  char_dropout=0,
+                 elmo='original_5b',
+                 elmo_bos_eos=(True, False),
                  bert=None,
                  n_bert_layers=4,
                  mix_dropout=.0,

supar/models/srl.py

Lines changed: 7 additions & 0 deletions
@@ -47,6 +47,11 @@ class VISemanticRoleLabelingModel(Model):
             The size of hidden states of CharLSTM, required if using CharLSTM. Default: 100.
         char_pad_index (int):
             The index of the padding token in the character vocabulary, required if using CharLSTM. Default: 0.
+        elmo (str):
+            Name of the pretrained ELMo registered in `ELMoEmbedding.OPTION`. Default: ``'original_5b'``.
+        elmo_bos_eos (tuple[bool]):
+            A tuple of two boolean values indicating whether to keep start/end boundaries of elmo outputs.
+            Default: ``(True, False)``.
         bert (str):
             Specifies which kind of language model to use, e.g., ``'bert-base-cased'``.
             This is required if ``encoder='bert'`` or using BERT features. The full list can be found in `transformers`_.
@@ -116,6 +121,8 @@ def __init__(self,
                  n_char_hidden=100,
                  char_pad_index=0,
                  char_dropout=0,
+                 elmo='original_5b',
+                 elmo_bos_eos=(True, False),
                  bert=None,
                  n_bert_layers=4,
                  mix_dropout=.0,

supar/modules/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -4,8 +4,8 @@
 from .dropout import IndependentDropout, SharedDropout
 from .lstm import CharLSTM, VariationalLSTM
 from .mlp import MLP
+from .pretrained import ELMoEmbedding, TransformerEmbedding
 from .scalar_mix import ScalarMix
-from .transformer import TransformerEmbedding
 
-__all__ = ['MLP', 'TransformerEmbedding', 'Biaffine', 'CharLSTM',
+__all__ = ['MLP', 'TransformerEmbedding', 'Biaffine', 'CharLSTM', 'ELMoEmbedding',
            'IndependentDropout', 'ScalarMix', 'SharedDropout', 'Triaffine', 'VariationalLSTM']
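
With the re-export above, ELMoEmbedding becomes importable from supar.modules alongside TransformerEmbedding. The sketch below is hedged: the keyword names mirror the constructor call added to supar/models/model.py in this commit, while the inline comments and everything about input preparation are assumptions rather than a confirmed API.

# a usage sketch; assumes a supar checkout at this commit with its ELMo dependencies installed
from supar.modules import ELMoEmbedding

elmo_embed = ELMoEmbedding(n_out=100,              # per-token output size (n_feat_embed in Model)
                           bos_eos=(True, False),  # keep the start boundary, drop the end one
                           dropout=0.5,
                           requires_grad=False)    # keep the pretrained ELMo weights frozen

# inside Model.embed the module is applied to the batched ELMo feature tensor:
#     feat_embeds.append(self.elmo_embed(feats.pop(0)))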
