Labels
module: c10d (Issues/PRs related to collective communications and process groups), module: flaky-tests (Problem is a flaky test in CI), skipped (Denotes a (flaky) test currently skipped in CI), triaged (This issue has been looked at by a team member, and triaged and prioritized into an appropriate module)
Description
Platforms: linux, rocm, inductor
This test was disabled because it is failing in CI. See recent examples and the most recent trunk workflow logs.
Over the past 3 hours, it has been determined flaky in 3 workflow(s) with 3 failures and 3 successes.
Debugging instructions (after clicking on the recent samples link):
DO NOT ASSUME THINGS ARE OKAY IF THE CI IS GREEN. We now shield flaky tests from developers, so CI will stay green, but the relevant failures will be harder to find in the logs.
To find relevant log snippets:
- Click on the workflow logs linked above
- Click on the Test step of the job so that it is expanded. Otherwise, the grepping will not work.
- Grep for `test_dtensor_seq_par_shard_dim_1`
- There should be several runs of the test (flaky tests are rerun in CI) whose logs you can study; a minimal script for scanning a downloaded log is sketched after this list.
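If you prefer to grep a downloaded copy of the raw log rather than the in-browser viewer, a minimal sketch is below. The log path is a placeholder, not a real CI artifact name; point it at whatever raw log you saved.

```python
# Sketch: scan a downloaded CI log for lines mentioning the flaky test.
# "logs/test-step.txt" is a placeholder path; substitute the raw log you downloaded.
from pathlib import Path

TEST_NAME = "test_dtensor_seq_par_shard_dim_1"

def find_test_lines(log_path: str) -> list[str]:
    text = Path(log_path).read_text(errors="replace")
    return [line for line in text.splitlines() if TEST_NAME in line]

if __name__ == "__main__":
    for line in find_test_lines("logs/test-step.txt"):
        print(line)
```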
Sample error message
Traceback (most recent call last):
File "/var/lib/jenkins/pytorch/test/distributed/tensor/parallel/test_micro_pipeline_tp.py", line 492, in test_dtensor_seq_par
self.assertIn("fused_all_gather_matmul", code)
File "/opt/conda/envs/py_3.10/lib/python3.10/unittest/case.py", line 1112, in assertIn
self.fail(self._formatMessage(msg, standardMsg))
File "/opt/conda/envs/py_3.10/lib/python3.10/unittest/case.py", line 675, in fail
raise self.failureException(msg)
AssertionError: 'fused_all_gather_matmul' not found in '# AOT ID: [\'1_forward\']\nfrom ctypes import c_void_p, c_long, c_int\nimport torch\nimport math\nimport random\nimport os\nimport tempfile\nfrom math import inf, nan\nfrom cmath import nanj\nfrom torch._inductor.hooks import run_intermediate_hooks\nfrom torch._inductor.utils import maybe_profile\nfrom torch._inductor.codegen.memory_planning import _align as align\nfrom torch import device, empty_strided\nfrom torch._inductor.async_compile import AsyncCompile\nfrom torch._inductor.select_algorithm import extern_kernels\nfrom torch._inductor.codegen.multi_kernel import MultiKernelCall\nimport triton\nimport triton.language as tl\nfrom torch._inductor.runtime.triton_heuristics import start_graph, end_graph\nfrom torch._C import _cuda_getCurrentRawStream as get_raw_stream\nfrom torch._C import _cuda_getCurrentRawStream as get_raw_stream\n\naten = torch.ops.aten\ninductor_ops = torch.ops.inductor\n_quantized = torch.ops._quantized\nassert_size_stride = torch._C._dynamo.guards.assert_size_stride\nassert_alignment = torch._C._dynamo.guards.assert_alignment\nempty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu\nempty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda\nempty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu\nreinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor\nalloc_from_pool = torch.ops.inductor._alloc_from_pool\nasync_compile = AsyncCompile()\nempty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p\n\n\n# kernel path: /tmp/tmppqzca6u9/sc/cscgghc5r2wllpuuf5v3vakz2lwsyb44yxesm7ia6gvftk3a44m7.py\n# Topologically Sorted Source Nodes: [input_tensor_1], Original ATen: [aten.cat]\n# Source node to ATen node mapping:\n# input_tensor_1 => cat\n# Graph fragment:\n# %cat : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%getitem, %getitem_1], 1), kwargs = {})\ntriton_poi_fused_cat_0 = async_compile.triton(\'triton_poi_fused_cat_0\', \'\'\'\nimport triton\nimport triton.language as tl\n\nfrom torch._inductor.runtime import triton_helpers, triton_heuristics\nfrom torch._inductor.runtime.triton_helpers import libdevice, math as tl_math\nfrom torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties\ntriton_helpers.set_driver_to_gpu()\n\n@triton_heuristics.pointwise(\n size_hints={\'x\': 512}, \n filename=__file__,\n triton_meta={\'signature\': {\'in_ptr0\': \'*fp32\', \'out_ptr0\': \'*fp32\', \'xnumel\': \'i32\', \'XBLOCK\': \'constexpr\'}, \'device\': DeviceProperties(type=\'hip\', index=0, multi_processor_count=304, cc=\'gfx942\', major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=64), \'constants\': {}, \'configs\': [{(0,): [[\'tt.divisibility\', 16]], (1,): [[\'tt.divisibility\', 16]], (2,): [[\'tt.divisibility\', 16]]}]},\n inductor_meta={\'grid_type\': \'Grid1D\', \'autotune_hints\': set(), \'kernel_name\': \'triton_poi_fused_cat_0\', \'mutated_arg_names\': [], \'optimize_mem\': False, \'no_x_dim\': False, \'num_load\': 2, \'num_reduction\': 0, \'backend_hash\': \'DF44DA125E980BD742A7B1BF8C2CBB766B780F0FD145B3F1B1A03EAEBF71542D\', \'are_deterministic_algorithms_enabled\': False, \'assert_indirect_indexing\': True, \'autotune_local_cache\': True, \'autotune_pointwise\': True, \'autotune_remote_cache\': None, \'force_disable_caches\': False, \'dynamic_scale_rblock\': True, \'max_autotune\': False, \'max_autotune_pointwise\': False, \'min_split_scan_rblock\': 256, 
\'spill_threshold\': 16, \'store_cubin\': False, \'is_hip\': True},\n min_elem_per_thread=0\n)\n@triton.jit\ndef triton_poi_fused_cat_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):\n xnumel = 320\n xoffset = tl.program_id(0) * XBLOCK\n xindex = xoffset + tl.arange(0, XBLOCK)[:]\n xmask = xindex < xnumel\n x1 = ((xindex // 10) % 16)\n x0 = (xindex % 10)\n x2 = xindex // 160\n x3 = xindex\n tmp0 = x1\n tmp1 = tl.full([1], 0, tl.int64)\n tmp2 = tmp0 >= tmp1\n tmp3 = tl.full([1], 8, tl.int64)\n tmp4 = tmp0 < tmp3\n tmp5 = tmp4.to(tl.int1)\n tmp6 = tl.load(in_ptr0 + (x0 + 10*(x1) + 80*x2), xmask & tmp5, other=0.0)\n tmp7 = tmp0 >= tmp3\n tmp8 = tl.full([1], 16, tl.int64)\n tmp9 = tmp0 < tmp8\n tmp10 = tmp7.to(tl.int1)\n tmp11 = tl.load(in_ptr0 + (160 + x0 + 10*((-8) + x1) + 80*x2), xmask & tmp10, other=0.0)\n tmp12 = tl.where(tmp4, tmp6, tmp11)\n tl.store(out_ptr0 + (x3), tmp12, xmask)\n\'\'\', device_str=\'cuda\')\n\n\n# kernel path: /tmp/tmppqzca6u9/lt/cltzieel7e24hr436hzvsxqislupv7o3tazsatunzdo6f7nkcfwm.py\n# Topologically Sorted Source Nodes: [linear], Original ATen: [aten.mm]\n# Source node to ATen node mapping:\n# linear => constant_pad_nd_default\n# Graph fragment:\n# %constant_pad_nd_default : [num_users=1] = call_function[target=torch.ops.aten.constant_pad_nd.default](args = (%view_1, [0, 2, 0, 0]), kwargs = {})\ntriton_poi_fused_mm_1 = async_compile.triton(\'triton_poi_fused_mm_1\', \'\'\'\nimport triton\nimport triton.language as tl\n\nfrom torch._inductor.runtime import triton_helpers, triton_heuristics\nfrom torch._inductor.runtime.triton_helpers import libdevice, math as tl_math\nfrom torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties\ntriton_helpers.set_driver_to_gpu()\n\n@triton_heuristics.pointwise(\n size_hints={\'x\': 512}, \n filename=__file__,\n triton_meta={\'signature\': {\'in_ptr0\': \'*fp32\', \'out_ptr0\': \'*fp32\', \'xnumel\': \'i32\', \'XBLOCK\': \'constexpr\'}, \'device\': DeviceProperties(type=\'hip\', index=0, multi_processor_count=304, cc=\'gfx942\', major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=64), \'constants\': {}, \'configs\': [{(0,): [[\'tt.divisibility\', 16]], (1,): [[\'tt.divisibility\', 16]], (2,): [[\'tt.divisibility\', 16]]}]},\n inductor_meta={\'grid_type\': \'Grid1D\', \'autotune_hints\': set(), \'kernel_name\': \'triton_poi_fused_mm_1\', \'mutated_arg_names\': [], \'optimize_mem\': False, \'no_x_dim\': False, \'num_load\': 1, \'num_reduction\': 0, \'backend_hash\': \'DF44DA125E980BD742A7B1BF8C2CBB766B780F0FD145B3F1B1A03EAEBF71542D\', \'are_deterministic_algorithms_enabled\': False, \'assert_indirect_indexing\': True, \'autotune_local_cache\': True, \'autotune_pointwise\': True, \'autotune_remote_cache\': None, \'force_disable_caches\': False, \'dynamic_scale_rblock\': True, \'max_autotune\': False, \'max_autotune_pointwise\': False, \'min_split_scan_rblock\': 256, \'spill_threshold\': 16, \'store_cubin\': False, \'is_hip\': True},\n min_elem_per_thread=0\n)\n@triton.jit\ndef triton_poi_fused_mm_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):\n xnumel = 384\n xoffset = tl.program_id(0) * XBLOCK\n xindex = xoffset + tl.arange(0, XBLOCK)[:]\n xmask = xindex < xnumel\n x0 = (xindex % 12)\n x1 = xindex // 12\n x2 = xindex\n tmp0 = x0\n tmp1 = tl.full([1], 10, tl.int64)\n tmp2 = tmp0 < tmp1\n tmp3 = tmp2.to(tl.int1)\n tmp4 = tl.load(in_ptr0 + (x0 + 10*x1), xmask & tmp3, other=0.0)\n tl.store(out_ptr0 + (x2), tmp4, xmask)\n\'\'\', device_str=\'cuda\')\n\n\n# 
kernel path: /tmp/tmppqzca6u9/ld/cldzvmepu7pgtkep65r3nzexcphh4ftfwaz7nfzqmlfgs3vhb4xn.py\n# Topologically Sorted Source Nodes: [linear], Original ATen: [aten.mm]\n# Source node to ATen node mapping:\n# linear => constant_pad_nd_default_1\n# Graph fragment:\n# %constant_pad_nd_default_1 : [num_users=1] = call_function[target=torch.ops.aten.constant_pad_nd.default](args = (%permute, [0, 0, 0, 2]), kwargs = {})\ntriton_poi_fused_mm_2 = async_compile.triton(\'triton_poi_fused_mm_2\', \'\'\'\nimport triton\nimport triton.language as tl\n\nfrom torch._inductor.runtime import triton_helpers, triton_heuristics\nfrom torch._inductor.runtime.triton_helpers import libdevice, math as tl_math\nfrom torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties\ntriton_helpers.set_driver_to_gpu()\n\n@triton_heuristics.pointwise(\n size_hints={\'x\': 128}, \n filename=__file__,\n triton_meta={\'signature\': {\'in_ptr0\': \'*fp32\', \'out_ptr0\': \'*fp32\', \'xnumel\': \'i32\', \'XBLOCK\': \'constexpr\'}, \'device\': DeviceProperties(type=\'hip\', index=0, multi_processor_count=304, cc=\'gfx942\', major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=64), \'constants\': {}, \'configs\': [{(0,): [[\'tt.divisibility\', 16]], (1,): [[\'tt.divisibility\', 16]], (2,): [[\'tt.divisibility\', 16]]}]},\n inductor_meta={\'grid_type\': \'Grid1D\', \'autotune_hints\': set(), \'kernel_name\': \'triton_poi_fused_mm_2\', \'mutated_arg_names\': [], \'optimize_mem\': False, \'no_x_dim\': False, \'num_load\': 1, \'num_reduction\': 0, \'backend_hash\': \'DF44DA125E980BD742A7B1BF8C2CBB766B780F0FD145B3F1B1A03EAEBF71542D\', \'are_deterministic_algorithms_enabled\': False, \'assert_indirect_indexing\': True, \'autotune_local_cache\': True, \'autotune_pointwise\': True, \'autotune_remote_cache\': None, \'force_disable_caches\': False, \'dynamic_scale_rblock\': True, \'max_autotune\': False, \'max_autotune_pointwise\': False, \'min_split_scan_rblock\': 256, \'spill_threshold\': 16, \'store_cubin\': False, \'is_hip\': True},\n min_elem_per_thread=0\n)\n@triton.jit\ndef triton_poi_fused_mm_2(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):\n xnumel = 96\n xoffset = tl.program_id(0) * XBLOCK\n xindex = xoffset + tl.arange(0, XBLOCK)[:]\n xmask = xindex < xnumel\n x0 = (xindex % 12)\n x1 = xindex // 12\n x2 = xindex\n tmp0 = x0\n tmp1 = tl.full([1], 10, tl.int64)\n tmp2 = tmp0 < tmp1\n tmp3 = tmp2.to(tl.int1)\n tmp4 = tl.load(in_ptr0 + (x0 + 10*x1), xmask & tmp3, other=0.0)\n tl.store(out_ptr0 + (x2), tmp4, xmask)\n\'\'\', device_str=\'cuda\')\n\n\n# kernel path: /tmp/tmppqzca6u9/ah/cahun247dgy66snar3eudwvxepliwexbhlsvqxegcothcwyc6td5.py\n# Topologically Sorted Source Nodes: [input_tensor_2], Original ATen: [aten.relu, aten.threshold_backward]\n# Source node to ATen node mapping:\n# input_tensor_2 => relu\n# Graph fragment:\n# %relu : [num_users=3] = call_function[target=torch.ops.aten.relu.default](args = (%view_2,), kwargs = {})\n# %inductor_force_stride_order_default : [num_users=1] = call_function[target=torch.ops.prims.inductor_force_stride_order.default](args = (%relu, (8, 16, 1)), kwargs = {})\n# %le : [num_users=1] = call_function[target=torch.ops.aten.le.Scalar](args = (%relu, 0), kwargs = {})\ntriton_poi_fused_relu_threshold_backward_3 = async_compile.triton(\'triton_poi_fused_relu_threshold_backward_3\', \'\'\'\nimport triton\nimport triton.language as tl\n\nfrom torch._inductor.runtime import triton_helpers, triton_heuristics\nfrom 
torch._inductor.runtime.triton_helpers import libdevice, math as tl_math\nfrom torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties\ntriton_helpers.set_driver_to_gpu()\n\n@triton_heuristics.pointwise(\n size_hints={\'x\': 256}, \n filename=__file__,\n triton_meta={\'signature\': {\'in_out_ptr0\': \'*fp32\', \'out_ptr0\': \'*fp32\', \'out_ptr1\': \'*i1\', \'xnumel\': \'i32\', \'XBLOCK\': \'constexpr\'}, \'device\': DeviceProperties(type=\'hip\', index=0, multi_processor_count=304, cc=\'gfx942\', major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=64), \'constants\': {}, \'configs\': [{(0,): [[\'tt.divisibility\', 16]], (1,): [[\'tt.divisibility\', 16]], (2,): [[\'tt.divisibility\', 16]], (3,): [[\'tt.divisibility\', 16]]}]},\n inductor_meta={\'grid_type\': \'Grid1D\', \'autotune_hints\': set(), \'kernel_name\': \'triton_poi_fused_relu_threshold_backward_3\', \'mutated_arg_names\': [\'in_out_ptr0\'], \'optimize_mem\': False, \'no_x_dim\': False, \'num_load\': 1, \'num_reduction\': 0, \'backend_hash\': \'DF44DA125E980BD742A7B1BF8C2CBB766B780F0FD145B3F1B1A03EAEBF71542D\', \'are_deterministic_algorithms_enabled\': False, \'assert_indirect_indexing\': True, \'autotune_local_cache\': True, \'autotune_pointwise\': True, \'autotune_remote_cache\': None, \'force_disable_caches\': False, \'dynamic_scale_rblock\': True, \'max_autotune\': False, \'max_autotune_pointwise\': False, \'min_split_scan_rblock\': 256, \'spill_threshold\': 16, \'store_cubin\': False, \'is_hip\': True},\n min_elem_per_thread=0\n)\n@triton.jit\ndef triton_poi_fused_relu_threshold_backward_3(in_out_ptr0, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):\n xnumel = 256\n xoffset = tl.program_id(0) * XBLOCK\n xindex = xoffset + tl.arange(0, XBLOCK)[:]\n xmask = xindex < xnumel\n x0 = xindex\n x1 = (xindex % 8)\n x2 = ((xindex // 8) % 16)\n x3 = xindex // 128\n tmp0 = tl.load(in_out_ptr0 + (x0), xmask)\n tmp1 = tl.full([1], 0, tl.int32)\n tmp2 = triton_helpers.maximum(tmp1, tmp0)\n tmp3 = 0.0\n tmp4 = tmp2 <= tmp3\n tl.store(in_out_ptr0 + (x0), tmp2, xmask)\n tl.store(out_ptr0 + (x1 + 8*x3 + 16*x2), tmp2, xmask)\n tl.store(out_ptr1 + (x0), tmp4, xmask)\n\'\'\', device_str=\'cuda\')\n\n\nasync_compile.wait(globals())\ndel async_compile\n\ndef call(args):\n primals_1, primals_2, primals_3 = args\n args.clear()\n assert_size_stride(primals_1, (2, 8, 10), (80, 10, 1))\n assert_size_stride(primals_2, (8, 10), (10, 1))\n assert_size_stride(primals_3, (10, 8), (8, 1))\n with torch.cuda._DeviceGuard(0):\n torch.cuda.set_device(0)\n # Topologically Sorted Source Nodes: [input_tensor_1], Original ATen: [_c10d_functional.all_gather_into_tensor]\n buf0 = torch.ops._c10d_functional.all_gather_into_tensor.default(primals_1, 2, \'0\')\n assert_size_stride(buf0, (4, 8, 10), (80, 10, 1))\n assert_alignment(buf0, 16)\n # Topologically Sorted Source Nodes: [input_tensor_1], Original ATen: [_c10d_functional.wait_tensor]\n torch.ops._c10d_functional.wait_tensor.default(buf0)\n del primals_1\n buf3 = empty_strided_cuda((2, 16, 10), (160, 10, 1), torch.float32)\n # Topologically Sorted Source Nodes: [input_tensor_1], Original ATen: [aten.cat]\n stream0 = get_raw_stream(0)\n triton_poi_fused_cat_0.run(buf0, buf3, 320, stream=stream0)\n del buf0\n buf4 = empty_strided_cuda((32, 12), (12, 1), torch.float32)\n # Topologically Sorted Source Nodes: [linear], Original ATen: [aten.mm]\n stream0 = get_raw_stream(0)\n triton_poi_fused_mm_1.run(buf3, buf4, 384, stream=stream0)\n buf5 = 
empty_strided_cuda((12, 8), (1, 12), torch.float32)\n # Topologically Sorted Source Nodes: [linear], Original ATen: [aten.mm]\n stream0 = get_raw_stream(0)\n triton_poi_fused_mm_2.run(primals_2, buf5, 96, stream=stream0)\n del primals_2\n buf6 = empty_strided_cuda((32, 8), (8, 1), torch.float32)\n # Topologically Sorted Source Nodes: [linear], Original ATen: [aten.mm]\n extern_kernels.mm(buf4, buf5, out=buf6)\n del buf4\n del buf5\n buf7 = reinterpret_tensor(buf6, (2, 16, 8), (128, 8, 1), 0); del buf6 # reuse\n buf8 = empty_strided_cuda((2, 16, 8), (8, 16, 1), torch.float32)\n buf11 = empty_strided_cuda((2, 16, 8), (128, 8, 1), torch.bool)\n # Topologically Sorted Source Nodes: [input_tensor_2], Original ATen: [aten.relu, aten.threshold_backward]\n stream0 = get_raw_stream(0)\n triton_poi_fused_relu_threshold_backward_3.run(buf7, buf8, buf11, 256, stream=stream0)\n # Topologically Sorted Source Nodes: [], Original ATen: []\n buf9 = torch.ops.symm_mem.fused_matmul_reduce_scatter.default(buf8, reinterpret_tensor(primals_3, (8, 10), (1, 8), 0), \'sum\', 1, \'0\')\n del buf8\n buf10 = buf9\n assert_size_stride(buf10, (2, 8, 10), (80, 10, 1))\n assert_alignment(buf10, 16)\n del buf9\n return (buf10, primals_3, reinterpret_tensor(buf3, (32, 10), (10, 1), 0), reinterpret_tensor(buf7, (32, 8), (8, 1), 0), buf11, )\n\n\ndef benchmark_compiled_module(times=10, repeat=10):\n from torch._dynamo.testing import rand_strided\n from torch._inductor.utils import print_performance\n primals_1 = rand_strided((2, 8, 10), (80, 10, 1), device=\'cuda:0\', dtype=torch.float32)\n primals_2 = rand_strided((8, 10), (10, 1), device=\'cuda:0\', dtype=torch.float32)\n primals_3 = rand_strided((10, 8), (8, 1), device=\'cuda:0\', dtype=torch.float32)\n fn = lambda: call([primals_1, primals_2, primals_3])\n return print_performance(fn, times=times, repeat=repeat)\n\n\nif __name__ == "__main__":\n from torch._inductor.wrapper_benchmark import compiled_module_main\n compiled_module_main(\'None\', benchmark_compiled_module)\n'
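Note that the generated code above does call `torch.ops.symm_mem.fused_matmul_reduce_scatter`, but the `fused_all_gather_matmul` kernel the test expects never appears, which is exactly what the `assertIn` catches. As a rough, single-process sketch of that style of check (the real test runs under a multi-rank DTensor/tensor-parallel setup; `run_and_get_code` from `torch._inductor.utils` is assumed to behave as it does in other Inductor tests, returning the result plus the generated source strings):

```python
# Sketch only, not the actual test: compile a module and assert that a
# particular kernel name shows up in the Inductor-generated source.
import torch
from torch._inductor.utils import run_and_get_code  # assumed: returns (result, [source strings])

def assert_kernel_in_generated_code(model, inputs, expected="fused_all_gather_matmul"):
    compiled = torch.compile(model)
    _, sources = run_and_get_code(compiled, *inputs)
    code = "\n".join(sources)
    # The flaky failure above is this membership check coming back False.
    assert expected in code, f"{expected!r} not found in generated code"
```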
To execute this test, run the following from the base repo dir:
PYTORCH_TEST_WITH_ROCM=1 python test/distributed/tensor/parallel/test_micro_pipeline_tp.py MicroPipelineTPTest.test_dtensor_seq_par_shard_dim_1
This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
Test file path: distributed/tensor/parallel/test_micro_pipeline_tp.py
cc @clee2000