Commit 9bcd651

Add exponential moving average for model weights + a few other additions and cleanup
* ModelEma class added to track an EMA set of weights for the model being trained
* EMA handling added to the train, validation and clean_checkpoint scripts
* Add multi-checkpoint / multi-model validation support to validate.py
* Add sync-bn option (APEX) to the train script for experimentation
* Clean up the CheckpointSaver interface while adding EMA functionality
1 parent ff99625 commit 9bcd651
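
The ModelEma implementation itself is not in the hunks shown below; as a rough illustration of the technique (a sketch, not the class this commit adds): an EMA tracker keeps a detached copy of the model and, after each optimizer step, blends every copied weight toward the live weight as ema = decay * ema + (1 - decay) * weight.

import copy
import torch

class SimpleEma:
    """Illustrative EMA weight tracker; a sketch, not the ModelEma added by this commit."""

    def __init__(self, model, decay=0.9998):
        self.decay = decay
        # frozen, detached copy of the model used for evaluation / export
        self.ema = copy.deepcopy(model).eval()
        for p in self.ema.parameters():
            p.requires_grad_(False)

    @torch.no_grad()
    def update(self, model):
        # assumes `model` is not wrapped in DataParallel/DDP; the real class also handles that
        ema_state = self.ema.state_dict()
        for k, v in model.state_dict().items():
            if v.is_floating_point():
                # ema = decay * ema + (1 - decay) * current
                ema_state[k].mul_(self.decay).add_(v.detach(), alpha=1. - self.decay)
            else:
                ema_state[k].copy_(v)  # e.g. BatchNorm's num_batches_tracked

Evaluating the smoothed copy typically gives a small but consistent accuracy gain over the raw weights, which is why the train script below validates both and checkpoints carry the EMA weights under a separate state_dict_ema key.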

6 files changed, +258 -84 lines

clean_checkpoint.py

Lines changed: 9 additions & 2 deletions
@@ -9,6 +9,8 @@
                     help='path to latest checkpoint (default: none)')
 parser.add_argument('--output', default='./cleaned.pth', type=str, metavar='PATH',
                     help='output path')
+parser.add_argument('--use-ema', dest='use_ema', action='store_true',
+                    help='use ema version of weights if present')


 def main():
@@ -24,8 +26,13 @@ def main():
     checkpoint = torch.load(args.checkpoint, map_location='cpu')

     new_state_dict = OrderedDict()
-    if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
-        state_dict = checkpoint['state_dict']
+    if isinstance(checkpoint, dict):
+        state_dict_key = 'state_dict_ema' if args.use_ema else 'state_dict'
+        if state_dict_key in checkpoint:
+            state_dict = checkpoint[state_dict_key]
+        else:
+            print("Error: No state_dict found in checkpoint {}.".format(args.checkpoint))
+            exit(1)
     else:
         state_dict = checkpoint
     for k, v in state_dict.items():
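
Condensed into a self-contained toy, the flag's effect is just choosing which key to read before re-keying and saving (the nn.Linear checkpoint, the prefix-stripping and the save call here are illustrative; the hunk above ends before the loop body):

from collections import OrderedDict
import torch
import torch.nn as nn

# toy stand-in for a training checkpoint that carries both raw and EMA weights
net = nn.Linear(4, 2)
checkpoint = {'epoch': 1, 'state_dict': net.state_dict(), 'state_dict_ema': net.state_dict()}

use_ema = True  # mirrors the new --use-ema flag
key = 'state_dict_ema' if use_ema else 'state_dict'
cleaned = OrderedDict(
    (k[7:] if k.startswith('module') else k, v)  # drop any `module.` prefix
    for k, v in checkpoint[key].items())
torch.save(cleaned, './cleaned.pth')  # analogous to the script's --output default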

models/helpers.py

Lines changed: 15 additions & 16 deletions
@@ -4,22 +4,24 @@
 from collections import OrderedDict


-def load_checkpoint(model, checkpoint_path):
+def load_checkpoint(model, checkpoint_path, use_ema=False):
     if checkpoint_path and os.path.isfile(checkpoint_path):
-        print("=> Loading checkpoint '{}'".format(checkpoint_path))
         checkpoint = torch.load(checkpoint_path)
-        if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
+        state_dict_key = ''
+        if isinstance(checkpoint, dict):
+            state_dict_key = 'state_dict'
+            if use_ema and 'state_dict_ema' in checkpoint:
+                state_dict_key = 'state_dict_ema'
+        if state_dict_key and state_dict_key in checkpoint:
             new_state_dict = OrderedDict()
-            for k, v in checkpoint['state_dict'].items():
-                if k.startswith('module'):
-                    name = k[7:]  # remove `module.`
-                else:
-                    name = k
+            for k, v in checkpoint[state_dict_key].items():
+                # strip `module.` prefix
+                name = k[7:] if k.startswith('module') else k
                 new_state_dict[name] = v
             model.load_state_dict(new_state_dict)
         else:
             model.load_state_dict(checkpoint)
-        print("=> Loaded checkpoint '{}'".format(checkpoint_path))
+        print("=> Loaded {} from checkpoint '{}'".format(state_dict_key or 'weights', checkpoint_path))
     else:
         print("=> Error: No checkpoint found at '{}'".format(checkpoint_path))
         raise FileNotFoundError()
@@ -28,27 +30,24 @@ def load_checkpoint(model, checkpoint_path):
 def resume_checkpoint(model, checkpoint_path, start_epoch=None):
     optimizer_state = None
     if os.path.isfile(checkpoint_path):
-        print("=> loading checkpoint '{}'".format(checkpoint_path))
         checkpoint = torch.load(checkpoint_path)
         if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
             new_state_dict = OrderedDict()
             for k, v in checkpoint['state_dict'].items():
-                if k.startswith('module'):
-                    name = k[7:]  # remove `module.`
-                else:
-                    name = k
+                name = k[7:] if k.startswith('module') else k
                 new_state_dict[name] = v
             model.load_state_dict(new_state_dict)
             if 'optimizer' in checkpoint:
                 optimizer_state = checkpoint['optimizer']
-            print("=> loaded checkpoint '{}' (epoch {})".format(checkpoint_path, checkpoint['epoch']))
             start_epoch = checkpoint['epoch'] if start_epoch is None else start_epoch
+            print("=> Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, checkpoint['epoch']))
         else:
             model.load_state_dict(checkpoint)
             start_epoch = 0 if start_epoch is None else start_epoch
+            print("=> Loaded checkpoint '{}'".format(checkpoint_path))
         return optimizer_state, start_epoch
     else:
-        print("=> No checkpoint found at '{}'".format(checkpoint_path))
+        print("=> Error: No checkpoint found at '{}'".format(checkpoint_path))
         raise FileNotFoundError()
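
The `module.` handling that both functions above condense into a one-liner exists because saving a model wrapped in nn.DataParallel (or DistributedDataParallel) prefixes every state_dict key with `module.`; a quick CPU-only illustration:

import torch.nn as nn

net = nn.Linear(4, 2)
wrapped = nn.DataParallel(net)  # DistributedDataParallel prefixes keys the same way

print(list(net.state_dict()))      # ['weight', 'bias']
print(list(wrapped.state_dict()))  # ['module.weight', 'module.bias']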
optim/rmsprop_tf.py

Lines changed: 2 additions & 2 deletions
@@ -89,7 +89,7 @@ def step(self, closure=None):
                 state['step'] += 1

                 if group['weight_decay'] != 0:
-                    if group['decoupled_decay']:
+                    if 'decoupled_decay' in group and group['decoupled_decay']:
                         p.data.add_(-group['weight_decay'], p.data)
                     else:
                         grad = grad.add(group['weight_decay'], p.data)
@@ -109,7 +109,7 @@ def step(self, closure=None):
                 if group['momentum'] > 0:
                     buf = state['momentum_buffer']
                     # Tensorflow accumulates the LR scaling in the momentum buffer
-                    if group['lr_in_momentum']:
+                    if 'lr_in_momentum' in group and group['lr_in_momentum']:
                         buf.mul_(group['momentum']).addcdiv_(group['lr'], grad, avg)
                         p.data.add_(-buf)
                     else:
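
Both one-line changes above make the optional flags safe to omit from an optimizer param group: a group that never set the key no longer raises a KeyError and simply behaves as if the option were off. The guard is equivalent to a dict.get with a False default:

group = {'lr': 0.01, 'weight_decay': 1e-5}  # param group without the optional flag

decoupled = 'decoupled_decay' in group and group['decoupled_decay']  # guard used above
decoupled = group.get('decoupled_decay', False)                      # equivalent spelling
assert decoupled is False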

train.py

Lines changed: 58 additions & 46 deletions
@@ -6,12 +6,13 @@
 try:
     from apex import amp
     from apex.parallel import DistributedDataParallel as DDP
+    from apex.parallel import convert_syncbn_model
     has_apex = True
 except ImportError:
     has_apex = False

 from data import Dataset, create_loader, resolve_data_config, FastCollateMixup, mixup_target
-from models import create_model, resume_checkpoint
+from models import create_model, resume_checkpoint, load_checkpoint
 from utils import *
 from loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy
 from optim import create_optimizer
@@ -91,11 +92,17 @@
                     help='BatchNorm momentum override (if not None)')
 parser.add_argument('--bn-eps', type=float, default=None,
                     help='BatchNorm epsilon override (if not None)')
+parser.add_argument('--model-ema', action='store_true', default=False,
+                    help='Enable tracking moving average of model weights')
+parser.add_argument('--model-ema-force-cpu', action='store_true', default=False,
+                    help='Force ema to be tracked on CPU, rank=0 node only. Disables EMA validation.')
+parser.add_argument('--model-ema-decay', type=float, default=0.9998,
+                    help='decay factor for model weights moving average (default: 0.9998)')
 parser.add_argument('--seed', type=int, default=42, metavar='S',
                     help='random seed (default: 42)')
 parser.add_argument('--log-interval', type=int, default=50, metavar='N',
                     help='how many batches to wait before logging training status')
-parser.add_argument('--recovery-interval', type=int, default=1000, metavar='N',
+parser.add_argument('--recovery-interval', type=int, default=0, metavar='N',
                     help='how many batches to wait before writing recovery checkpoint')
 parser.add_argument('-j', '--workers', type=int, default=4, metavar='N',
                     help='how many training processes to use (default: 1)')
@@ -109,6 +116,8 @@
                     help='save images of input bathes every log interval for debugging')
 parser.add_argument('--amp', action='store_true', default=False,
                     help='use NVIDIA amp for mixed precision training')
+parser.add_argument('--sync-bn', action='store_true',
+                    help='enabling apex sync BN.')
 parser.add_argument('--no-prefetcher', action='store_true', default=False,
                     help='disable fast prefetcher')
 parser.add_argument('--output', default='', type=str, metavar='PATH',
@@ -131,31 +140,28 @@ def main():

     args.device = 'cuda:0'
     args.world_size = 1
-    r = -1
+    args.rank = 0  # global rank
     if args.distributed:
         args.num_gpu = 1
         args.device = 'cuda:%d' % args.local_rank
         torch.cuda.set_device(args.local_rank)
-        torch.distributed.init_process_group(backend='nccl',
-                                             init_method='env://')
+        torch.distributed.init_process_group(
+            backend='nccl', init_method='env://')
         args.world_size = torch.distributed.get_world_size()
-        r = torch.distributed.get_rank()
+        args.rank = torch.distributed.get_rank()
+        assert args.rank >= 0

     if args.distributed:
         print('Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.'
-              % (r, args.world_size))
+              % (args.rank, args.world_size))
     else:
         print('Training with a single process on %d GPUs.' % args.num_gpu)

-    # FIXME seed handling for multi-process distributed?
-    torch.manual_seed(args.seed)
+    torch.manual_seed(args.seed + args.rank)

     output_dir = ''
     if args.local_rank == 0:
-        if args.output:
-            output_base = args.output
-        else:
-            output_base = './output'
+        output_base = args.output if args.output else './output'
         exp_name = '-'.join([
            datetime.now().strftime("%Y%m%d-%H%M%S"),
            args.model,
@@ -191,6 +197,8 @@ def main():
             args.amp = False
         model = nn.DataParallel(model, device_ids=list(range(args.num_gpu))).cuda()
     else:
+        if args.distributed and args.sync_bn and has_apex:
+            model = convert_syncbn_model(model)
         model.cuda()

     optimizer = create_optimizer(args, model)
@@ -205,8 +213,20 @@ def main():
         use_amp = False
         print('AMP disabled')

+    model_ema = None
+    if args.model_ema:
+        model_ema = ModelEma(
+            model,
+            decay=args.model_ema_decay,
+            device='cpu' if args.model_ema_force_cpu else '',
+            resume=args.resume)
+
     if args.distributed:
         model = DDP(model, delay_allreduce=True)
+        if model_ema is not None and not args.model_ema_force_cpu:
+            # must also distribute EMA model to allow validation
+            model_ema.ema = DDP(model_ema.ema, delay_allreduce=True)
+            model_ema.ema_has_module = True

     lr_scheduler, num_epochs = create_scheduler(args, optimizer)
     if start_epoch > 0:
273293
eval_metric = args.eval_metric
274294
saver = None
275295
if output_dir:
296+
# only set if process is rank 0
276297
decreasing = True if eval_metric == 'loss' else False
277298
saver = CheckpointSaver(checkpoint_dir=output_dir, decreasing=decreasing)
278299
best_metric = None
@@ -284,10 +305,15 @@ def main():
284305

285306
train_metrics = train_epoch(
286307
epoch, model, loader_train, optimizer, train_loss_fn, args,
287-
lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir, use_amp=use_amp)
308+
lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir,
309+
use_amp=use_amp, model_ema=model_ema)
310+
311+
eval_metrics = validate(model, loader_eval, validate_loss_fn, args)
288312

289-
eval_metrics = validate(
290-
model, loader_eval, validate_loss_fn, args)
313+
if model_ema is not None and not args.model_ema_force_cpu:
314+
ema_eval_metrics = validate(
315+
model_ema.ema, loader_eval, validate_loss_fn, args, log_suffix=' (EMA)')
316+
eval_metrics = ema_eval_metrics
291317

292318
if lr_scheduler is not None:
293319
lr_scheduler.step(epoch, eval_metrics[eval_metric])
@@ -298,15 +324,12 @@ def main():
298324

299325
if saver is not None:
300326
# save proper checkpoint with eval metric
301-
best_metric, best_epoch = saver.save_checkpoint({
302-
'epoch': epoch + 1,
303-
'arch': args.model,
304-
'state_dict': model.state_dict(),
305-
'optimizer': optimizer.state_dict(),
306-
'args': args,
307-
},
327+
save_metric = eval_metrics[eval_metric]
328+
best_metric, best_epoch = saver.save_checkpoint(
329+
model, optimizer, args,
308330
epoch=epoch + 1,
309-
metric=eval_metrics[eval_metric])
331+
model_ema=model_ema,
332+
metric=save_metric)
310333

311334
except KeyboardInterrupt:
312335
pass
@@ -316,7 +339,7 @@ def main():
316339

317340
def train_epoch(
318341
epoch, model, loader, optimizer, loss_fn, args,
319-
lr_scheduler=None, saver=None, output_dir='', use_amp=False):
342+
lr_scheduler=None, saver=None, output_dir='', use_amp=False, model_ema=None):
320343

321344
if args.prefetcher and args.mixup > 0 and loader.mixup_enabled:
322345
if args.mixup_off_epoch and epoch >= args.mixup_off_epoch:
@@ -359,6 +382,8 @@ def train_epoch(
359382
optimizer.step()
360383

361384
torch.cuda.synchronize()
385+
if model_ema is not None:
386+
model_ema.update(model)
362387
num_updates += 1
363388

364389
batch_time_m.update(time.time() - end)
@@ -394,18 +419,11 @@ def train_epoch(
394419
padding=0,
395420
normalize=True)
396421

397-
if args.local_rank == 0 and (
398-
saver is not None and last_batch or (batch_idx + 1) % args.recovery_interval == 0):
422+
if saver is not None and args.recovery_interval and (
423+
last_batch or (batch_idx + 1) % args.recovery_interval == 0):
399424
save_epoch = epoch + 1 if last_batch else epoch
400-
saver.save_recovery({
401-
'epoch': save_epoch,
402-
'arch': args.model,
403-
'state_dict': model.state_dict(),
404-
'optimizer': optimizer.state_dict(),
405-
'args': args,
406-
},
407-
epoch=save_epoch,
408-
batch_idx=batch_idx)
425+
saver.save_recovery(
426+
model, optimizer, args, save_epoch, model_ema=model_ema, batch_idx=batch_idx)
409427

410428
if lr_scheduler is not None:
411429
lr_scheduler.step_update(num_updates=num_updates, metric=losses_m.avg)
@@ -415,7 +433,7 @@ def train_epoch(
415433
return OrderedDict([('loss', losses_m.avg)])
416434

417435

418-
def validate(model, loader, loss_fn, args):
436+
def validate(model, loader, loss_fn, args, log_suffix=''):
419437
batch_time_m = AverageMeter()
420438
losses_m = AverageMeter()
421439
prec1_m = AverageMeter()
@@ -461,12 +479,13 @@ def validate(model, loader, loss_fn, args):
461479
batch_time_m.update(time.time() - end)
462480
end = time.time()
463481
if args.local_rank == 0 and (last_batch or batch_idx % args.log_interval == 0):
464-
print('Test: [{0}/{1}]\t'
482+
log_name = 'Test' + log_suffix
483+
print('{0}: [{1}/{2}]\t'
465484
'Time {batch_time.val:.3f} ({batch_time.avg:.3f}) '
466485
'Loss {loss.val:.4f} ({loss.avg:.4f}) '
467486
'Prec@1 {top1.val:.4f} ({top1.avg:.4f}) '
468487
'Prec@5 {top5.val:.4f} ({top5.avg:.4f})'.format(
469-
batch_idx, last_idx,
488+
log_name, batch_idx, last_idx,
470489
batch_time=batch_time_m, loss=losses_m,
471490
top1=prec1_m, top5=prec5_m))
472491

@@ -475,12 +494,5 @@ def validate(model, loader, loss_fn, args):
475494
return metrics
476495

477496

478-
def reduce_tensor(tensor, n):
479-
rt = tensor.clone()
480-
dist.all_reduce(rt, op=dist.ReduceOp.SUM)
481-
rt /= n
482-
return rt
483-
484-
485497
if __name__ == '__main__':
486498
main()
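
A closing note on the new default: --model-ema-decay of 0.9998 gives the moving average an effective memory of roughly 1 / (1 - decay) = 5000 optimizer updates, so the EMA weights trail the live weights by a few thousand steps; a quick back-of-the-envelope check:

import math

decay = 0.9998
window = 1.0 / (1.0 - decay)                 # ~5000 updates contribute appreciably
half_life = math.log(0.5) / math.log(decay)  # ~3465 updates for an old update's weight to halve
print(round(window), round(half_life))       # 5000 3465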
