
Commit e63c2b2

H-Huang authored and pytorchmergebot committed
[PP] Initialize P2P communicators on first step (#160210)
We were hitting hangs in multi-node settings; initializing the NCCL communicators needed for batched P2P ops ahead of time fixes this. The change adds extra communication, since a dummy tensor is exchanged with the next and previous stage ranks, but that cost is paid only on the first step, so it is negligible.

Debug history: https://docs.google.com/document/d/1EKVJYmW2hj_VsvDvnSggXhZzJyvMu9dA0iDJWOZAtjY/edit?tab=t.0

Pull Request resolved: #160210
Approved by: https://github.com/wconstab
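The idea in isolation: before the first real step, each rank posts a batched send/recv of a one-element dummy tensor with its pipeline neighbors, which forces NCCL to create the point-to-point communicators up front. Below is a minimal standalone sketch of that warm-up; it is not the PR's code, and the linear rank-to-stage layout, tensor shapes, and helper name are assumptions made for illustration.

import torch
import torch.distributed as dist


def warm_up_p2p_communicators() -> None:
    """Exchange a dummy tensor with the previous/next rank so NCCL builds the
    P2P communicators before the first real batched P2P call of the schedule."""
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    device = torch.device("cuda", rank % torch.cuda.device_count())
    torch.cuda.set_device(device)

    send_buf = torch.tensor([float(rank)], device=device)
    recv_prev = torch.zeros(1, device=device)
    recv_next = torch.zeros(1, device=device)

    ops: list[dist.P2POp] = []
    if rank > 0:  # not the first stage: exchange with the previous rank
        ops.append(dist.P2POp(dist.irecv, recv_prev, rank - 1))
        ops.append(dist.P2POp(dist.isend, send_buf, rank - 1))
    if rank < world_size - 1:  # not the last stage: exchange with the next rank
        ops.append(dist.P2POp(dist.isend, send_buf, rank + 1))
        ops.append(dist.P2POp(dist.irecv, recv_next, rank + 1))

    # One coalesced batch covers every neighbor, mirroring how the schedule
    # batches its real sends/recvs during execution.
    if ops:
        for work in dist.batch_isend_irecv(ops):
            work.wait()


if __name__ == "__main__":
    # Launch with torchrun so RANK / WORLD_SIZE / MASTER_ADDR are set.
    dist.init_process_group(backend="nccl")
    warm_up_p2p_communicators()  # communicator-setup cost paid once, at step 0
    dist.destroy_process_group()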
1 parent 3626ba7 commit e63c2b2

2 files changed: +69 −0 lines changed

torch/distributed/pipelining/schedules.py
Lines changed: 15 additions & 0 deletions

@@ -554,6 +554,13 @@ def __init__(
         )
 
     def _initialize_stage(self, args, kwargs):
+        # Prepare the communication needed for the pipeline schedule execution
+        # This is needed because during execution we always perform a series of batch P2P ops
+        # The first call of the batched P2P needs to involve the global group
+        all_ops: list[dist.P2POp] = []
+        all_ops.extend(self._stage._get_init_p2p_neighbors_ops())
+        _wait_batch_p2p(_batch_p2p(all_ops))
+
         self._stage._prepare_forward_infra(self._n_microbatches, args, kwargs)
         if self._has_backward:
             self._stage._prepare_backward_infra(self._n_microbatches)
@@ -1428,6 +1435,14 @@ def __init__(
         )
 
     def _initialize_stages(self, args: tuple[Any, ...], kwargs):
+        # Prepare the communication needed for the pipeline schedule execution
+        # This is needed because during execution we always perform a series of batch P2P ops
+        # The first call of the batched P2P needs to involve the global group
+        all_ops: list[dist.P2POp] = []
+        for stage in self._stages:
+            all_ops.extend(stage._get_init_p2p_neighbors_ops())
+        _wait_batch_p2p(_batch_p2p(all_ops))
+
         # may be 'none' value (if this stage sends its output shapes to the next stage via P2P)
         # or real value (if this stage and next stage are on the same device)
         next_stage_args: tuple[Any, ...] = tuple()
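The added code relies on the schedule-internal helpers _batch_p2p and _wait_batch_p2p. As a rough stand-in, assuming they essentially wrap the public dist.batch_isend_irecv API and wait on the returned work handles (an assumption for illustration, with hypothetical names, not a copy of the PyTorch internals), they behave roughly like:

import torch.distributed as dist


def batch_p2p(ops: list[dist.P2POp]):
    # Post every send/recv in one coalesced NCCL group; returns async work handles.
    return dist.batch_isend_irecv(ops) if ops else []


def wait_batch_p2p(works) -> None:
    # Block until every op in the batch has finished.
    for work in works:
        work.wait()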

torch/distributed/pipelining/stage.py
Lines changed: 54 additions & 0 deletions

@@ -935,6 +935,60 @@ def _validate_fwd_outputs(self, outputs: tuple[torch.Tensor, ...]):
             f"Stage {self.stage_index} forward outputs", expected_tensors_meta, outputs
         )
 
+    def _get_init_p2p_neighbors_ops(self) -> list[dist.P2POp]:
+        """
+        Get the operations to initialize the p2p communicators between previous and next stages.
+        This is done by creating a dummy tensor and sending it to the next stage and receiving
+        from the previous stage.
+        """
+        ops: list[dist.P2POp] = []
+        next_stage_peer_rank = self.stage_index_to_group_rank.get(self.stage_index + 1)
+        prev_stage_peer_rank = self.stage_index_to_group_rank.get(self.stage_index - 1)
+
+        recv_tensor = torch.zeros(1, device=self.device)
+        send_tensor = torch.tensor(self.stage_index, device=self.device)
+        # forward
+        if not self.is_first:
+            ops.append(
+                dist.P2POp(
+                    dist.irecv,
+                    recv_tensor,
+                    group_peer=prev_stage_peer_rank,
+                    group=self.group,
+                )
+            )
+        if not self.is_last:
+            ops.append(
+                dist.P2POp(
+                    dist.isend,
+                    send_tensor,
+                    group_peer=next_stage_peer_rank,
+                    group=self.group,
+                )
+            )
+
+        # backward
+        if not self.is_first:
+            ops.append(
+                dist.P2POp(
+                    dist.isend,
+                    send_tensor,
+                    group_peer=prev_stage_peer_rank,
+                    group=self.group,
+                )
+            )
+        if not self.is_last:
+            ops.append(
+                dist.P2POp(
+                    dist.irecv,
+                    recv_tensor,
+                    group_peer=next_stage_peer_rank,
+                    group=self.group,
+                )
+            )
+
+        return ops
+
 
 class _PipelineStage(_PipelineStageBase):
     def __init__(
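For reference, stage_index_to_group_rank maps a stage index to a rank in the pipeline group, and .get() is used so that out-of-range neighbor indices simply come back as None; the is_first / is_last guards keep those None peers from ever becoming P2P ops. A tiny illustration with a hypothetical round-robin layout (8 stages over 4 ranks, an assumed example rather than the layout of any particular schedule class):

# Hypothetical stage -> rank mapping for an interleaved setup: 8 stages on 4 ranks.
num_stages, group_size = 8, 4
stage_index_to_group_rank = {i: i % group_size for i in range(num_stages)}

stage_index = 0  # the first stage
prev_peer = stage_index_to_group_rank.get(stage_index - 1)  # None: no previous stage
next_peer = stage_index_to_group_rank.get(stage_index + 1)  # 1

# .get() quietly returns None at the pipeline boundaries; the is_first / is_last
# checks in _get_init_p2p_neighbors_ops ensure those None peers are never used.
assert prev_peer is None and next_peer == 1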
