
Commit f61c1f5

gl3lan authored and facebook-github-bot committed
remove unnecessary sync point in AveragedModel update (#158017)
Summary: The check `bool(self.n_averaged == 0)` is a CPU/GPU synchronization point that runs on every update. Its only purpose is to tell whether the AveragedModel copy has been initialized. This diff introduces a CPU-side boolean flag for that purpose; when loading from a checkpoint, the flag is refreshed from `n_averaged`. After this fix, each `update_parameters` call drops from 333 ms to 6 ms (a 98% reduction).

Test Plan: contbuild & OSS CI

Test plan from GitHub: CI

Rollback Plan:

Differential Revision: D78074709
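For context, a minimal sketch (not from the commit, assuming a CUDA device is available) of why the old check synchronizes while the new flag does not: converting a GPU tensor comparison to a Python bool forces the host to wait for the device, whereas reading a plain Python attribute never touches the GPU.

import torch

n_averaged = torch.tensor(0, dtype=torch.long, device="cuda")

# Old pattern: the comparison runs on the GPU and bool() must copy the
# result back to the host, blocking until queued GPU work completes.
copy_param = bool(n_averaged == 0)

# New pattern: a plain Python bool lives on the host, so the check is free.
is_copy_initialized = False
if not is_copy_initialized:
    pass  # first update: copy the model parameters into the averaged copy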
1 parent 0f21fa8 commit f61c1f5

File tree

1 file changed: +11 -4 lines changed


torch/optim/swa_utils.py

Lines changed: 11 additions & 4 deletions
@@ -4,7 +4,7 @@
 import itertools
 import math
 import warnings
-from collections.abc import Iterable
+from collections.abc import Iterable, Mapping
 from copy import deepcopy
 from typing import Any, Callable, cast, Literal, Optional, Union
 
@@ -237,6 +237,7 @@ def __init__(
         self.register_buffer(
             "n_averaged", torch.tensor(0, dtype=torch.long, device=device)
         )
+        self.is_copy_initialized = False
         self.avg_fn = avg_fn
         self.multi_avg_fn = multi_avg_fn
         self.use_buffers = use_buffers
@@ -259,15 +260,14 @@ def update_parameters(self, model: Module):
         )
         self_param_detached: list[Optional[Tensor]] = []
         model_param_detached: list[Optional[Tensor]] = []
-        copy_param = bool(self.n_averaged == 0)
         for p_averaged, p_model in zip(self_param, model_param):
             p_model_ = p_model.detach().to(p_averaged.device)
             self_param_detached.append(p_averaged.detach())
             model_param_detached.append(p_model_)
-            if copy_param:
+            if not self.is_copy_initialized:
                 p_averaged.detach().copy_(p_model_)
 
-        if self.n_averaged > 0:
+        if self.is_copy_initialized:
             if self.multi_avg_fn is not None or self.avg_fn is None:
                 grouped_tensors = _group_tensors_by_device_and_dtype(
                     [self_param_detached, model_param_detached]
@@ -310,6 +310,13 @@ def update_parameters(self, model: Module):
         for b_swa, b_model in zip(self.module.buffers(), model.buffers()):
             b_swa.detach().copy_(b_model.detach().to(b_swa.device))
         self.n_averaged += 1
+        self.is_copy_initialized = True
+
+    def load_state_dict(
+        self, state_dict: Mapping[str, Any], strict: bool = True, assign: bool = False
+    ):
+        super().load_state_dict(state_dict, strict, assign)
+        self.is_copy_initialized = bool(self.n_averaged > 0)
 
 
 @torch.no_grad()
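As a usage note (a sketch, not part of the commit, assuming a build of torch.optim.swa_utils that includes this patch): a checkpoint round trip still behaves correctly, because the overridden load_state_dict re-derives the flag from the restored n_averaged buffer.

import torch
from torch.optim.swa_utils import AveragedModel

model = torch.nn.Linear(4, 2)
swa_model = AveragedModel(model)

swa_model.update_parameters(model)  # first call: copies params, no GPU sync
swa_model.update_parameters(model)  # later calls: running average

# Round-trip through a checkpoint; load_state_dict restores n_averaged and
# then recomputes is_copy_initialized = bool(n_averaged > 0) on the CPU.
state = swa_model.state_dict()
restored = AveragedModel(torch.nn.Linear(4, 2))
restored.load_state_dict(state)
assert restored.is_copy_initialized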
