
Commit 97a646d: Checkpoint support

1 parent: bc6a155

File tree: 12 files changed, 83 additions & 34 deletions
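In brief: every `train` subcommand gains a `--checkpoint` flag; `Parser` gains a `save_checkpoint` method and restore logic covering the optimizer, scheduler, RNG states, and progress counters (epoch, patience, best metric, elapsed time); the bucket sampler now seeds its shuffling by a 1-based epoch so a resumed run replays the same batch order; and `get_rng_state`/`set_rng_state` helpers land in `supar/utils/fn.py`.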

supar/cmds/biaffine_dep.py

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@ def main():
     subparser = subparsers.add_parser('train', help='Train a parser.')
     subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
+    subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')
     subparser.add_argument('--punct', action='store_true', help='whether to include punctuation')
     subparser.add_argument('--max-len', type=int, help='max length of the sentences')
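
The same flag is added to every other `train` subcommand below. A hypothetical resume session (device id, model path, and data files are placeholders; only `--checkpoint` is introduced by this commit):

    $ python -m supar.cmds.biaffine_dep train -b -d 0 -p exp/model \
          --train train.conllx --dev dev.conllx --test test.conllx
    # interrupted; rerun the same command with --checkpoint to pick up where it left off:
    $ python -m supar.cmds.biaffine_dep train -d 0 -p exp/model \
          --train train.conllx --dev dev.conllx --test test.conllx --checkpoint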

supar/cmds/biaffine_sdp.py

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ def main():
     subparser = subparsers.add_parser('train', help='Train a parser.')
     subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'lemma', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
+    subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')
     subparser.add_argument('--max-len', type=int, help='max length of the sentences')
     subparser.add_argument('--buckets', default=32, type=int, help='max num of buckets to use')

supar/cmds/cmd.py

Lines changed: 4 additions & 4 deletions
@@ -21,15 +21,15 @@ def parse(parser):
     torch.set_num_threads(args.threads)
     torch.manual_seed(args.seed)
     init_device(args.device, args.local_rank)
-    init_logger(logger, f"{args.path}.{args.mode}.log")
+    init_logger(logger, f"{args.path}.{args.mode}.log", 'a' if args.get('checkpoint') else 'w')
     logger.info('\n' + str(args))

     if args.mode == 'train':
-        parser = Parser.build(**args)
+        parser = Parser.load(**args) if args.checkpoint else Parser.build(**args)
         parser.train(**args)
     elif args.mode == 'evaluate':
-        parser = Parser.load(args.path)
+        parser = Parser.load(**args)
         parser.evaluate(**args)
     elif args.mode == 'predict':
-        parser = Parser.load(args.path)
+        parser = Parser.load(**args)
         parser.predict(**args)
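
Two effects on a checkpointed run: the log file is opened in append mode ('a') so earlier epochs' logs survive, and the train branch calls `Parser.load` instead of `Parser.build`, which restores the model weights and stashes the checkpoint payload on the parser for `train()` to consume. `args.get('checkpoint')` guards the logger call because the evaluate and predict subcommands never define that flag.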

supar/cmds/crf2o_dep.py

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ def main():
     subparser = subparsers.add_parser('train', help='Train a parser.')
     subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
+    subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')
     subparser.add_argument('--punct', action='store_true', help='whether to include punctuation')
     subparser.add_argument('--max-len', type=int, help='max length of the sentences')

supar/cmds/crf_con.py

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ def main():
     subparser = subparsers.add_parser('train', help='Train a parser.')
     subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
+    subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')
     subparser.add_argument('--max-len', type=int, help='max length of the sentences')
     subparser.add_argument('--buckets', default=32, type=int, help='max num of buckets to use')

supar/cmds/crf_dep.py

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ def main():
     subparser = subparsers.add_parser('train', help='Train a parser.')
     subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
+    subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')
     subparser.add_argument('--punct', action='store_true', help='whether to include punctuation')
     subparser.add_argument('--max-len', type=int, help='max length of the sentences')

supar/cmds/vi_con.py

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ def main():
     subparser = subparsers.add_parser('train', help='Train a parser.')
     subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
+    subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')
     subparser.add_argument('--max-len', type=int, help='max length of the sentences')
     subparser.add_argument('--buckets', default=32, type=int, help='max num of buckets to use')

supar/cmds/vi_dep.py

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@ def main():
     subparser = subparsers.add_parser('train', help='Train a parser.')
     subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
+    subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')
     subparser.add_argument('--punct', action='store_true', help='whether to include punctuation')
     subparser.add_argument('--max-len', type=int, help='max length of the sentences')

supar/cmds/vi_sdp.py

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ def main():
     subparser = subparsers.add_parser('train', help='Train a parser.')
     subparser.add_argument('--feat', '-f', choices=['tag', 'char', 'lemma', 'bert'], nargs='+', help='features to use')
     subparser.add_argument('--build', '-b', action='store_true', help='whether to build the model first')
+    subparser.add_argument('--checkpoint', action='store_true', help='whether to load a checkpoint to restore training')
     subparser.add_argument('--encoder', choices=['lstm', 'bert'], default='lstm', help='encoder to use')
     subparser.add_argument('--max-len', type=int, help='max length of the sentences')
     subparser.add_argument('--buckets', default=32, type=int, help='max num of buckets to use')

supar/parsers/parser.py

Lines changed: 54 additions & 23 deletions
@@ -9,7 +9,7 @@
 import torch.distributed as dist
 from supar.utils import Config, Dataset
 from supar.utils.field import Field
-from supar.utils.fn import download
+from supar.utils.fn import download, get_rng_state, set_rng_state
 from supar.utils.logging import init_logger, logger
 from supar.utils.metric import Metric
 from supar.utils.parallel import DistributedDataParallel as DDP
@@ -34,15 +34,13 @@ def train(self, train, dev, test, buckets=32, batch_size=5000, update_steps=1,
         init_logger(logger, verbose=args.verbose)

         self.transform.train()
+        batch_size = batch_size // update_steps
         if dist.is_initialized():
-            args.batch_size = args.batch_size // dist.get_world_size()
+            batch_size = batch_size // dist.get_world_size()
         logger.info("Loading the data")
-        train = Dataset(self.transform, args.train, **args)
-        dev = Dataset(self.transform, args.dev)
-        test = Dataset(self.transform, args.test)
-        train.build(args.batch_size//args.update_steps, args.buckets, True, dist.is_initialized())
-        dev.build(args.batch_size, args.buckets)
-        test.build(args.batch_size, args.buckets)
+        train = Dataset(self.transform, args.train, **args).build(batch_size, buckets, True, dist.is_initialized())
+        dev = Dataset(self.transform, args.dev).build(batch_size, buckets)
+        test = Dataset(self.transform, args.test).build(batch_size, buckets)
         logger.info(f"\n{'train:':6} {train}\n{'dev:':6} {dev}\n{'test:':6} {test}\n")

         if args.encoder == 'lstm':
@@ -60,10 +58,16 @@ def train(self, train, dev, test, buckets=32, batch_size=5000, update_steps=1,
         if dist.is_initialized():
             self.model = DDP(self.model, device_ids=[args.local_rank], find_unused_parameters=True)

-        elapsed = timedelta()
-        best_e, best_metric = 1, Metric()
+        self.epoch, self.best_e, self.patience, self.best_metric, self.elapsed = 1, 1, patience, Metric(), timedelta()
+        if self.args.checkpoint:
+            self.optimizer.load_state_dict(self.checkpoint_state_dict.pop('optimizer_state_dict'))
+            self.scheduler.load_state_dict(self.checkpoint_state_dict.pop('scheduler_state_dict'))
+            set_rng_state(self.checkpoint_state_dict.pop('rng_state'))
+            for k, v in self.checkpoint_state_dict.items():
+                setattr(self, k, v)
+            train.loader.batch_sampler.epoch = self.epoch

-        for epoch in range(1, args.epochs + 1):
+        for epoch in range(self.epoch, args.epochs + 1):
             start = datetime.now()

             logger.info(f"Epoch {epoch} / {args.epochs}:")
@@ -74,22 +78,26 @@ def train(self, train, dev, test, buckets=32, batch_size=5000, update_steps=1,
             logger.info(f"{'test:':5} loss: {loss:.4f} - {test_metric}")

             t = datetime.now() - start
-            if dev_metric > best_metric:
-                best_e, best_metric = epoch, dev_metric
+            self.epoch += 1
+            self.patience -= 1
+            self.elapsed += t
+
+            if dev_metric > self.best_metric:
+                self.best_e, self.patience, self.best_metric = epoch, patience, dev_metric
                 if is_master():
-                    self.save(args.path)
+                    self.save_checkpoint(args.path)
                 logger.info(f"{t}s elapsed (saved)\n")
             else:
                 logger.info(f"{t}s elapsed\n")
-            elapsed += t
-            if epoch - best_e >= args.patience:
+            if self.patience < 1:
                 break
         loss, metric = self.load(**args)._evaluate(test.loader)
+        self.save(args.path)

-        logger.info(f"Epoch {best_e} saved")
-        logger.info(f"{'dev:':5} {best_metric}")
+        logger.info(f"Epoch {self.best_e} saved")
+        logger.info(f"{'dev:':5} {self.best_metric}")
         logger.info(f"{'test:':5} {metric}")
-        logger.info(f"{elapsed}s elapsed, {elapsed / epoch}s/epoch")
+        logger.info(f"{self.elapsed}s elapsed, {self.elapsed / epoch}s/epoch")

     def evaluate(self, data, buckets=8, batch_size=5000, **kwargs):
         args = self.args.update(locals())
@@ -98,7 +106,7 @@ def evaluate(self, data, buckets=8, batch_size=5000, **kwargs):
         self.transform.train()
         logger.info("Loading the data")
         dataset = Dataset(self.transform, data)
-        dataset.build(args.batch_size, args.buckets)
+        dataset.build(batch_size, buckets)
         logger.info(f"\n{dataset}")

         logger.info("Evaluating the dataset")
@@ -120,7 +128,7 @@ def predict(self, data, pred=None, lang=None, buckets=8, batch_size=5000, prob=F

         logger.info("Loading the data")
         dataset = Dataset(self.transform, data, lang=lang)
-        dataset.build(args.batch_size, args.buckets)
+        dataset.build(batch_size, buckets)
         logger.info(f"\n{dataset}")

         logger.info("Making predictions on the dataset")
@@ -153,7 +161,7 @@ def build(cls, path, **kwargs):
         raise NotImplementedError

     @classmethod
-    def load(cls, path, reload=False, src=None, **kwargs):
+    def load(cls, path, reload=False, src=None, checkpoint=False, **kwargs):
         r"""
         Loads a parser with data fields and pretrained model parameters.

@@ -169,6 +177,8 @@ def load(cls, path, reload=False, src=None, **kwargs):
                 ``'github'``: github release page.
                 ``'hlt'``: hlt homepage, only accessible from 9:00 to 18:00 (UTC+8).
                 Default: None.
+            checkpoint (bool):
+                If ``True``, loads all checkpoint states to restore the training process. Default: ``False``.
             kwargs (dict):
                 A dict holding unconsumed arguments for updating training configs and initializing the model.

@@ -192,7 +202,9 @@ def load(cls, path, reload=False, src=None, **kwargs):
         model.load_state_dict(state['state_dict'], False)
         model.to(args.device)
         transform = state['transform']
-        return cls(args, model, transform)
+        parser = cls(args, model, transform)
+        parser.checkpoint_state_dict = state['checkpoint_state_dict'] if args.checkpoint else None
+        return parser

     def save(self, path):
         model = self.model
@@ -207,3 +219,22 @@ def save(self, path):
                  'pretrained': pretrained,
                  'transform': self.transform}
         torch.save(state, path, pickle_module=dill)
+
+    def save_checkpoint(self, path):
+        model = self.model
+        if hasattr(model, 'module'):
+            model = self.model.module
+        args = model.args
+        checkpoint_state_dict = {k: getattr(self, k) for k in ['epoch', 'best_e', 'patience', 'best_metric', 'elapsed']}
+        checkpoint_state_dict.update({'optimizer_state_dict': self.optimizer.state_dict(),
+                                      'scheduler_state_dict': self.scheduler.state_dict(),
+                                      'rng_state': get_rng_state()})
+        state_dict = {k: v.cpu() for k, v in model.state_dict().items()}
+        pretrained = state_dict.pop('pretrained.weight', None)
+        state = {'name': self.NAME,
+                 'args': args,
+                 'state_dict': state_dict,
+                 'pretrained': pretrained,
+                 'checkpoint_state_dict': checkpoint_state_dict,
+                 'transform': self.transform}
+        torch.save(state, path, pickle_module=dill)
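
`save_checkpoint` writes a superset of the regular `save` payload, so the best checkpoint doubles as a loadable model; on resume, `train()` pops the optimizer, scheduler, and RNG states back out and fast-forwards the sampler to `self.epoch`. A minimal sketch of inspecting a checkpoint, assuming `dill` is installed and `'exp/model'` is a placeholder path written by `save_checkpoint`:

    import dill
    import torch

    # dill is required because supar saves with pickle_module=dill
    state = torch.load('exp/model', pickle_module=dill, map_location='cpu')

    # the extra payload that plain save() does not write
    ckpt = state['checkpoint_state_dict']
    print(ckpt['epoch'], ckpt['best_e'], ckpt['patience'], ckpt['elapsed'])
    print(sorted(ckpt))  # also: best_metric, optimizer_state_dict, scheduler_state_dict, rng_state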

supar/utils/data.py

Lines changed: 3 additions & 6 deletions
@@ -131,18 +131,15 @@ def __init__(self, buckets, batch_size, shuffle=False, distributed=False):
         self.rank = dist.get_rank() if distributed else 0
         self.replicas = dist.get_world_size() if distributed else 1
         self.samples = sum(self.chunks) // self.replicas
-        self.epoch = 0
+        self.epoch = 1

     def __iter__(self):
         g = torch.Generator()
         g.manual_seed(self.epoch)
-        range_fn = torch.arange
+        total, count = 0, 0
         # if `shuffle=True`, shuffle both the buckets and samples in each bucket
         # for distributed training, make sure each process generates the same random sequence at each epoch
-        if self.shuffle:
-            def range_fn(x):
-                return torch.randperm(x, generator=g)
-        total, count = 0, 0
+        range_fn = torch.arange if not self.shuffle else lambda x: torch.randperm(x, generator=g)
         # TODO: more elegant way to deal with uneven data, which we directly discard right now
         for i in range_fn(len(self.buckets)).tolist():
             split_sizes = [(len(self.buckets[i]) - j - 1) // self.chunks[i] + 1 for j in range(self.chunks[i])]
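
Starting `epoch` at 1 matches the training loop's 1-based count, and seeding the generator with the epoch is what makes the sampler restartable: a resumed run that sets `sampler.epoch` back to the saved value regenerates exactly the same bucket and sample order every process saw before the interruption. A small sketch of the idea:

    import torch

    def epoch_perm(n, epoch):
        # same scheme as Sampler.__iter__: a fresh generator seeded by the epoch
        g = torch.Generator()
        g.manual_seed(epoch)
        return torch.randperm(n, generator=g)

    # identical epochs yield identical orders, across processes and across restarts
    assert epoch_perm(8, epoch=3).tolist() == epoch_perm(8, epoch=3).tolist()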

supar/utils/fn.py

Lines changed: 14 additions & 1 deletion
@@ -99,7 +99,20 @@ def download(url, reload=False):
             members = f.infolist()
             path = os.path.join(os.path.dirname(path), members[0].filename)
             if len(members) != 1:
-                raise RuntimeError('Only one file(not dir) is allowed in the zipfile.')
+                raise RuntimeError('Only one file (not dir) is allowed in the zipfile.')
             if reload or not os.path.exists(path):
                 f.extractall(os.path.dirname(path))
     return path
+
+
+def get_rng_state():
+    state = {'rng_state': torch.get_rng_state()}
+    if torch.cuda.is_available():
+        state['cuda_rng_state'] = torch.cuda.get_rng_state()
+    return state
+
+
+def set_rng_state(state):
+    torch.set_rng_state(state['rng_state'])
+    if torch.cuda.is_available():
+        torch.cuda.set_rng_state(state['cuda_rng_state'])
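
A quick illustration of why capturing and restoring the RNG state makes a resumed run replay the exact same random draws (CPU shown; the CUDA branch behaves the same way):

    import torch
    from supar.utils.fn import get_rng_state, set_rng_state

    state = get_rng_state()   # snapshot the CPU (and, if available, CUDA) RNG state
    a = torch.rand(3)
    set_rng_state(state)      # rewind, as done when restoring a checkpoint
    b = torch.rand(3)
    assert torch.equal(a, b)  # post-restore draws match the originals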
