Skip to content

Commit 111f6de

Browse files
committed
turn off reorder_for_peak_memory in case of collectives
ghstack-source-id: 23bbb05 Pull Request resolved: #155271
1 parent 6c05f2f commit 111f6de

File tree

3 files changed

+15
-1
lines changed

3 files changed

+15
-1
lines changed

torch/_inductor/config.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,10 @@ def prologue_fusion_enabled() -> bool:
 # enable operator reordering for peak memory optimization
 reorder_for_peak_memory = True
 
+# reorder_for_peak_memory has performance regression for models with collectives
+# so we by default disable it for models with collectives
+disable_peak_mem_reorder_with_collectives = True
+
 # runtime estimation function for ops
 # for built-in estimation function, pass in "default"; for user-defined estimation function, pass in the function handle
 estimate_op_runtime = "default"

torch/_inductor/memory.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,9 @@
 from torch._utils_internal import signpost_event
 from torch.utils._ordered_set import OrderedSet
 
+from . import config
 from .ir import MultiOutputLayout, NoneLayout
-from .utils import get_dtype_size, is_wait
+from .utils import contains_collective_or_wait, get_dtype_size, is_wait
 from .virtualized import V
 

@@ -648,6 +649,11 @@ def reorder_for_peak_memory(
 
     torch_log.info("Reordering for peak memory -- %d nodes", len(nodes))
 
+    if config.disable_peak_mem_reorder_with_collectives and contains_collective_or_wait(
+        nodes
+    ):
+        return nodes
+
     estimated_peak_memory, name_to_freeable_input_buf = prepare_planning_info(
         nodes,
         name_to_buf,

torch/_inductor/utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2341,6 +2341,10 @@ def contains_wait(snode: BaseSchedulerNode) -> bool:
23412341
return is_wait(snode.node)
23422342

23432343

2344+
def contains_collective_or_wait(snodes: list[BaseSchedulerNode]) -> bool:
2345+
return any(contains_collective(snode) or contains_wait(snode) for snode in snodes)
2346+
2347+
23442348
def is_fallback_op(
23452349
node: Optional[Operation],
23462350
op: Union[torch._ops.OpOverload, Collection[torch._ops.OpOverload]],

0 commit comments

Comments
 (0)