Commit 86670b3

xuanzhang816 authored and pytorchmergebot committed
[PT2][memory] mutation size correctness (#157562)
Pull Request resolved: #157562
Approved by: https://github.com/yf225
1 parent c78bbdf commit 86670b3

File tree

3 files changed: +89, -18 lines


test/distributed/test_compute_comm_reordering.py

Lines changed: 1 addition & 1 deletion
@@ -179,9 +179,9 @@ def func(a):
             .check("extern_kernels.mm")
             .check("triton_poi_fused_relu")
             .check("torch.ops._c10d_functional.all_reduce_.default")
-            .check("extern_kernels.mm")
             .check("torch.ops._c10d_functional.wait_tensor.default")
             .check("extern_kernels.mm")
+            .check("extern_kernels.mm")
             .run(code)
         )
         out = compiled(inputs)

test/inductor/test_memory.py

Lines changed: 76 additions & 0 deletions
@@ -203,6 +203,79 @@ def reorder_with_only_dfs(
         outp = compiled_model(self.inputs)
         self.assertTrue(same(outp, outp_corr))
 
+    @mock.patch.object(config, "allow_buffer_reuse", False)
+    def test_mutation_size_propogation(self):
+        """
+        This tests correct size propogation in the case of mutations.
+        In this example, buf1 is a mutation of buf0; we should have:
+        * buf0: has size_alloc 2048 and size_free 0;
+        * buf1: has size_alloc 0 and size_free 2048.
+        This is because
+        - when buf1 is created, no additional memory is used; and
+        - the 2048 bytes of memory can only be released when buf1 is freed.
+        Similar arguments for buf2 and buf3, buf4 and buf5, etc.
+        """
+
+        # using triton custom kernel to creat small example with mutations
+        @triton.jit
+        def convert_to_bf16_kernel(
+            input_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr
+        ):
+            pid = tl.program_id(axis=0)
+            block_start = pid * BLOCK_SIZE
+            offsets = block_start + tl.arange(0, BLOCK_SIZE)
+            mask = offsets < n_elements
+            x = tl.load(input_ptr + offsets, mask=mask)
+            x_bf16 = x.to(tl.bfloat16)
+            tl.store(output_ptr + offsets, x_bf16, mask=mask)
+
+        def convert_to_bf16(x):
+            output = torch.empty_like(x, dtype=torch.bfloat16)
+            n_elements = x.numel()
+            BLOCK_SIZE = 1024
+            grid = (triton.cdiv(n_elements, BLOCK_SIZE),)
+            convert_to_bf16_kernel[grid](
+                x.flatten(), output.flatten(), n_elements, BLOCK_SIZE
+            )
+            return output.view(x.shape)
+
+        # create a custom function to record the buffer size information
+        buffer_info = {}
+        og_method = memory.assign_memory_planning_info_for_scheduler_buffers
+
+        def assign_memory_planning_info_for_scheduler_buffers_with_records(
+            nodes, name_to_buf
+        ):
+            og_method(nodes, name_to_buf)
+            for buf_name, buf in name_to_buf.items():
+                buffer_info[buf_name] = (
+                    buf.mpi_buffer.size_alloc,
+                    buf.mpi_buffer.size_free,
+                )
+
+        # test example and checks
+        def f(a, p):
+            for e in a:
+                e = convert_to_bf16(e)
+                p = p @ e
+            return p
+
+        a = [torch.randn(32, 32, device=GPU_TYPE) for _ in range(4)]
+        p = torch.ones(a[0].size(), dtype=torch.bfloat16, device=GPU_TYPE)
+
+        with mock.patch.object(
+            memory,
+            "assign_memory_planning_info_for_scheduler_buffers",
+            assign_memory_planning_info_for_scheduler_buffers_with_records,
+        ):
+            f_compiled = torch.compile(f)
+            f_compiled(a, p)
+            for buf_name in ["buf0", "buf2", "buf4", "buf6"]:
+                self.assertEqual(buffer_info[buf_name], (2048, 0))
+
+            for buf_name in ["buf1", "buf3", "buf5", "buf7"]:
+                self.assertEqual(buffer_info[buf_name], (0, 2048))
+
     @unittest.skipIf(
         not torch.cuda.is_available()
         or torch.cuda.get_device_properties().total_memory < int(1e10),
@@ -228,4 +301,7 @@ def f(a, b, c):
     from torch._inductor.test_case import run_tests
 
     if HAS_GPU:
+        import triton
+        from triton import language as tl
+
         run_tests()
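The docstring in the new test pins down the intended bookkeeping: the buffer that owns the storage carries the allocation, and the buffer that mutates it carries the free (2048 bytes here because each converted tensor is 32 x 32 bfloat16 elements, i.e. 32 * 32 * 2 bytes). A rough standalone sketch of that rule, independent of Inductor's scheduler types and using hypothetical buffer names and a hypothetical helper:

# Minimal sketch of the (size_alloc, size_free) rule the test asserts.
# `sizes` maps a buffer to its byte size; `mutations` maps a mutating
# buffer (e.g. "buf1") to the buffer it mutates (e.g. "buf0").
# Names, sizes, and the helper are illustrative only.
def propagate_mutation_sizes(sizes, mutations):
    # By default a buffer both allocates and frees its own size.
    info = {name: (size, size) for name, size in sizes.items()}
    for mutating, mutated in mutations.items():
        alloc, free = info.get(mutated, (0, 0))
        # The mutation allocates nothing but becomes the point where
        # the memory can finally be released ...
        info[mutating] = (0, free)
        # ... while the mutated buffer keeps its allocation and no longer
        # frees anything, since its storage lives on through the mutation.
        info[mutated] = (alloc, 0)
    return info

info = propagate_mutation_sizes({"buf0": 2048}, {"buf1": "buf0"})
assert info["buf0"] == (2048, 0)
assert info["buf1"] == (0, 2048)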

torch/_inductor/memory.py

Lines changed: 12 additions & 17 deletions
@@ -10,7 +10,7 @@
 from torch.utils._ordered_set import OrderedSet
 
 from .ir import MultiOutputLayout, NoneLayout
-from .utils import get_dtype_size, is_wait
+from .utils import get_dtype_size
 from .virtualized import V
 
 
@@ -147,23 +147,18 @@ def _compute_and_update_buf_size(
         sched_buf: SchedulerBuffer, user_of_MultiOutputLayout: bool = False
     ) -> int:
         if isinstance(sched_buf.node.layout, NoneLayout):
-            _size = 0
-            # for a wait tensor op, its schedulerBuffer NoneLayout layout. However,
-            # the schedulerBuffer is treated as a mutation of the collective output
-            # so it needs to inherit the size of the collectives
-            if (
-                sched_buf.defining_op
-                and is_wait(sched_buf.defining_op.node)
-                and sched_buf.get_mutations()
-            ):
+            # mutations should inherit the size of the mutated buffer
+            if sched_buf.get_mutations():
                 mutated_buf_name = sched_buf.get_mutations()[0]
-                _size = (
-                    sched_buf_to_size[mutated_buf_name][1]
-                    if mutated_buf_name in sched_buf_to_size
-                    else 0
-                )
-            sched_buf_to_size[sched_buf.get_name()] = (_size, _size)
-            return _size
+                if mutated_buf_name in sched_buf_to_size:
+                    (_size_alloc, _size_free) = sched_buf_to_size[mutated_buf_name]
+                else:
+                    (_size_alloc, _size_free) = (0, 0)
+                sched_buf_to_size[sched_buf.get_name()] = (0, _size_free)
+                sched_buf_to_size[mutated_buf_name] = (_size_alloc, 0)
+            else:
+                sched_buf_to_size[sched_buf.get_name()] = (0, 0)
+            return 0
         elif isinstance(sched_buf.node.layout, MultiOutputLayout):
             size_alloc = 0
             for user in sched_buf.users:
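The rewritten branch drops the is_wait special case: any NoneLayout buffer that mutates another buffer now inherits that buffer's size, and the pair is split so that the allocation stays attributed to the mutated buffer while the free moves to the mutating one. A rough worked example of how that update plays out over a chain of mutations (hypothetical names and sizes, plain dicts instead of SchedulerBuffer objects):

# (size_alloc, size_free) bookkeeping, seeded with a 2048-byte buf0.
sched_buf_to_size = {"buf0": (2048, 2048)}

def apply_mutation(mutating: str, mutated: str) -> None:
    size_alloc, size_free = sched_buf_to_size.get(mutated, (0, 0))
    # The mutating buffer allocates nothing and inherits the right to free.
    sched_buf_to_size[mutating] = (0, size_free)
    # The mutated buffer keeps its allocation but can no longer free it.
    sched_buf_to_size[mutated] = (size_alloc, 0)

apply_mutation("buf1", "buf0")  # buf1 mutates buf0
apply_mutation("buf2", "buf1")  # buf2 mutates buf1
# Result: buf0 -> (2048, 0), buf1 -> (0, 0), buf2 -> (0, 2048):
# the 2048 bytes are charged once at buf0 and released once at buf2,
# which the old (_size, _size) assignment double-counted.
print(sched_buf_to_size)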
