
Commit 40f242b

Update
[ghstack-poisoned]
2 parents: 0b51047 + 06b03bf


43 files changed: +902 -660 lines

aten/src/ATen/autocast_mode.cpp

Lines changed: 1 addition & 0 deletions
@@ -239,6 +239,7 @@ TORCH_LIBRARY_IMPL(aten, AutocastMPS, m) {
   KERNEL_MPS(scaled_dot_product_attention, lower_precision_fp)

   // fp32
+  KERNEL_MPS(conv_transpose3d, input, fp32)
   KERNEL_MPS(acos, fp32)
   KERNEL_MPS(asin, fp32)
   KERNEL_MPS(cosh, fp32)
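
Note: a minimal sketch of the behavior this registration targets; it assumes an MPS-enabled build, and everything outside the op name is illustrative.

import torch

# Illustrative only: with conv_transpose3d registered under the fp32 autocast
# policy, MPS autocast up-casts its tensor arguments to float32 instead of
# running the op in the lower-precision autocast dtype.
if torch.backends.mps.is_available():
    conv = torch.nn.ConvTranspose3d(4, 4, kernel_size=3).to(device="mps", dtype=torch.float16)
    x = torch.randn(1, 4, 8, 8, 8, device="mps", dtype=torch.float16)
    with torch.autocast(device_type="mps", dtype=torch.float16):
        out = conv(x)
    print(out.dtype)  # expected torch.float32 under the fp32 policy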

aten/src/ATen/detail/MTIAHooksInterface.cpp

Lines changed: 4 additions & 0 deletions
@@ -21,6 +21,10 @@ bool isMTIAHooksBuilt() {

 } // namespace detail

+bool MTIAHooksInterface::isAvailable() const {
+  return detail::isMTIAHooksBuilt() && detail::getMTIAHooks().deviceCount() > 0;
+}
+
 C10_DEFINE_REGISTRY(MTIAHooksRegistry, MTIAHooksInterface, MTIAHooksArgs)

 } // namespace at

aten/src/ATen/detail/MTIAHooksInterface.h

Lines changed: 2 additions & 0 deletions
@@ -149,6 +149,8 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface {
     FAIL_MTIAHOOKS_FUNC(__func__);
     return;
   }
+
+  virtual bool isAvailable() const override;
 };

 struct TORCH_API MTIAHooksArgs {};
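
Note: a hedged sketch of how availability is typically queried from Python. Whether torch.mtia.is_available() routes through the new MTIAHooksInterface::isAvailable is an assumption here; on builds without MTIA support it simply reports False.

import torch

# Illustrative only: the hooks-level check requires both a compiled-in MTIA
# backend and at least one visible device, mirroring the C++ logic above.
if hasattr(torch, "mtia"):
    print(torch.mtia.is_available())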

aten/src/ATen/native/mps/operations/BinaryKernel.mm

Lines changed: 1 addition & 0 deletions
@@ -53,6 +53,7 @@ void binary_op_kernel(const std::string func_name,
                   .add_input(input)
                   .add_input(other)
                   .check_all_same_dtype(false)
+                  .promote_inputs_to_common_dtype(true)
                   .build();

   lib.exec_binary_kernel(iter, func_name, alpha);
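
Note: a minimal sketch of the kind of case input promotion addresses. Which specific ops dispatch through this Metal binary kernel is an assumption here, and the example needs an MPS device.

import torch

# Illustrative only: with promote_inputs_to_common_dtype enabled, mixed-dtype
# operands are cast to a common dtype before the kernel runs, so the result
# follows the usual type-promotion rules.
if torch.backends.mps.is_available():
    a = torch.randn(4, device="mps", dtype=torch.float32)
    b = torch.randn(4, device="mps", dtype=torch.float16)
    print(torch.add(a, b, alpha=2).dtype)  # expected torch.float32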

cmake/public/LoadHIP.cmake

Lines changed: 2 additions & 2 deletions
@@ -6,7 +6,7 @@ set(PYTORCH_FOUND_HIP FALSE)
 # In the latter case, if /opt/rocm does not exist emit status
 # message and return.
 if(DEFINED ENV{ROCM_PATH})
-  set(ROCM_PATH $ENV{ROCM_PATH})
+  file(TO_CMAKE_PATH "$ENV{ROCM_PATH}" ROCM_PATH)
   if(NOT EXISTS ${ROCM_PATH})
     message(FATAL_ERROR
       "ROCM_PATH environment variable is set to ${ROCM_PATH} but does not exist.\n"
@@ -31,7 +31,7 @@ if(NOT DEFINED ENV{MAGMA_HOME})
   set(MAGMA_HOME ${ROCM_PATH}/magma)
   set(ENV{MAGMA_HOME} ${ROCM_PATH}/magma)
 else()
-  set(MAGMA_HOME $ENV{MAGMA_HOME})
+  file(TO_CMAKE_PATH "$ENV{MAGMA_HOME}" MAGMA_HOME)
 endif()

 # MIOpen isn't a part of HIP-SDK for Windows and hence, may have a different

codex_setup.sh

Lines changed: 0 additions & 4 deletions
@@ -9,10 +9,6 @@ COMMIT=$(grep -oE '[0-9a-f]{40}' <<< "$NIGHTLY_PATCH" | head -1)
 COMMIT_DATE=$(echo "$NIGHTLY_PATCH" | grep '^Date:' | sed -E 's/Date: .*, ([0-9]+) ([A-Za-z]+) ([0-9]+) .*/\3 \2 \1/' | awk 'BEGIN{split("Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec", months, " "); for(i=1;i<=12;i++) month[months[i]]=sprintf("%02d",i)} {print $1 month[$2] sprintf("%02d",$3)}')
 VERSION_STRING="2.9.0.dev${COMMIT_DATE}+cpu"
 git rev-parse HEAD > /tmp/orig_work.txt
-cp AGENTS.md /tmp
 git reset --hard $COMMIT
-cp /tmp/AGENTS.md .
-curl https://patch-diff.githubusercontent.com/raw/pytorch/pytorch/pull/159965.diff | patch -p1
 USE_NIGHTLY=$VERSION_STRING python setup.py develop
-git commit -asm "Agents patch"
 echo "source $PWD/.venv/bin/activate" >> ~/.bashrc

test/dynamo/test_fx_graph_runnable.py

Lines changed: 88 additions & 0 deletions
@@ -11,12 +11,65 @@
 from torch._inductor.codecache import WritableTempFile
 from torch._inductor.test_case import TestCase
 from torch.testing._internal.common_utils import IS_FBCODE, IS_SANDCASTLE
+from torch.utils._triton import has_triton


 if torch.distributed.is_available():
     from torch.distributed._tensor import DeviceMesh, DTensor, Replicate, Shard
     from torch.testing._internal.distributed.fake_pg import FakeStore

+if has_triton():
+    import triton
+    import triton.language as tl
+
+    def init_to_zero(name):
+        return lambda nargs: nargs[name].zero_()
+
+    @triton.jit
+    def add_kernel(x_ptr, y_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
+        pid = tl.program_id(axis=0)
+
+        block_start = pid * BLOCK_SIZE
+        offsets = block_start + tl.arange(0, BLOCK_SIZE)
+        mask = offsets < n_elements
+
+        x = tl.load(x_ptr + offsets, mask=mask)
+        y = tl.load(y_ptr + offsets, mask=mask)
+        output = x + y
+        tl.atomic_add(output_ptr + offsets, output, mask=mask)
+
+    @triton.autotune(
+        configs=[
+            triton.Config(
+                {"BLOCK_SIZE": 1024},
+                num_warps=4,
+                num_stages=2,
+                pre_hook=init_to_zero("output_ptr"),
+            )
+        ],
+        pre_hook=init_to_zero("output_ptr"),
+        post_hook=init_to_zero("output_ptr"),
+        key=["n_elements"],
+    )
+    @triton.jit
+    def add_kernel_autotune(
+        x_ptr, y_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr
+    ):
+        pid = tl.program_id(axis=0)
+
+        block_start = pid * BLOCK_SIZE
+        offsets = block_start + tl.arange(0, BLOCK_SIZE)
+        mask = offsets < n_elements
+
+        x = tl.load(x_ptr + offsets, mask=mask)
+        y = tl.load(y_ptr + offsets, mask=mask)
+        output = x + y
+        tl.atomic_add(output_ptr + offsets, output, mask=mask)
+
+
+from torch.testing._internal.inductor_utils import GPU_TYPE
+from torch.testing._internal.triton_utils import requires_gpu
+

 class FxGraphRunnableArtifactFilter(logging.Filter):
     def filter(self, record):
@@ -100,6 +153,41 @@ def f(x):
         torch.compile(f)(torch.randn(4))
         self._exec_and_verify_payload()

+    @unittest.skipUnless(has_triton(), "Triton not available")
+    def test_user_defined_triton_kernel_autotune(self):
+        def add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+            output = torch.ones(x.shape, device=x.device, dtype=x.dtype)
+            n_elements = output.numel()
+
+            def grid(
+                meta,
+            ):
+                return (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
+
+            add_kernel_autotune[grid](x, y, output, n_elements)
+            return output
+
+        x = torch.ones((4096,), device=GPU_TYPE, dtype=torch.float16)
+        y = torch.ones((4096,), device=GPU_TYPE, dtype=torch.float16)
+
+        torch.compile(add)(x, y)
+        self._exec_and_verify_payload()
+
+    @unittest.skipUnless(has_triton(), "Triton not available")
+    @requires_gpu
+    def test_user_defined_triton_kernel(self):
+        def add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+            output = torch.ones(x.shape, device=x.device, dtype=x.dtype)
+            n_elements = x.numel()
+            add_kernel[n_elements,](x, y, output, n_elements, BLOCK_SIZE=4)
+            return output
+
+        x = torch.ones((4096,), device=GPU_TYPE, dtype=torch.float16)
+        y = torch.ones((4096,), device=GPU_TYPE, dtype=torch.float16)
+
+        torch.compile(add)(x, y)
+        self._exec_and_verify_payload()
+
     def test_two_inputs_matmul(self):
         def f(a, b):
             return (a @ b).relu()
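
Note: for orientation, a minimal eager launch of the plain add_kernel defined in this diff (no torch.compile involved). The grid and BLOCK_SIZE values are illustrative, and a Triton-capable GPU plus the kernel definition above are assumed to be in scope.

import torch
import triton  # for triton.cdiv; add_kernel itself comes from the test module above

x = torch.ones(4096, device="cuda", dtype=torch.float16)
y = torch.ones(4096, device="cuda", dtype=torch.float16)
out = torch.zeros_like(x)  # zero-initialized so tl.atomic_add accumulates exactly x + y
n = out.numel()
add_kernel[(triton.cdiv(n, 1024),)](x, y, out, n, BLOCK_SIZE=1024)
print(out[:4])  # expected: all 2.0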

test/dynamo/test_pgo.py

Lines changed: 13 additions & 7 deletions
@@ -56,6 +56,10 @@ def f(x):
         f(torch.randn(2, 6))
         self.assertEqual(cnts.frame_count, 1)

+    @torch._dynamo.config.patch(
+        force_parameter_static_shapes=False,
+        force_nn_module_property_static_shapes=False,
+    )
     def test_whitelist_suggestion(self):
         cnts = CompileCounter()

@@ -195,14 +199,16 @@ def run():
         self.assertEqual(cnts.frame_count, 3)

         # parameter static shapes are forced static, so we recompile once
-        run()
-        self.assertEqual(cnts.frame_count, 2)
+        with torch._dynamo.config.patch(
+            force_parameter_static_shapes=False,
+            force_nn_module_property_static_shapes=False,
+        ):
+            run()
+            self.assertEqual(cnts.frame_count, 2)

-        # flags are flipped, PGO records dynamism, so params are dynamically compiled to start
-        torch._dynamo.config.force_parameter_static_shapes = False
-        torch._dynamo.config.force_nn_module_property_static_shapes = False
-        run()
-        self.assertEqual(cnts.frame_count, 1)
+        # because flags were flipped, params were included in PGO
+        run()
+        self.assertEqual(cnts.frame_count, 1)

     def test_njt(self):
         cnts = CompileCounter()
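
Note: the change above moves from mutating torch._dynamo.config globally to scoped patching. A brief sketch of the two forms used in the diff; the wrapped bodies are illustrative.

import torch

# Decorator form: the flags apply only for the decorated function.
@torch._dynamo.config.patch(
    force_parameter_static_shapes=False,
    force_nn_module_property_static_shapes=False,
)
def run_with_dynamic_param_shapes():
    pass  # compile/run code that should see parameter shapes as dynamic

# Context-manager form: the flags revert automatically when the block exits,
# so no config state leaks into later tests.
with torch._dynamo.config.patch(force_parameter_static_shapes=False):
    pass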

test/dynamo/test_repros.py

Lines changed: 25 additions & 0 deletions
@@ -7673,6 +7673,31 @@ def forward(self, x):
         out2 = torch.compile(model, backend="eager")(input.clone())
         self.assertEqual(out1, out2)

+    @requires_cuda
+    def test_zero_dim_param_mixed_device_grad(self):
+        # cpu 0-dim params with cuda grads
+        # https://github.com/pytorch/pytorch/issues/160084
+        class RegressionModel(torch.nn.Module):
+            def __init__(self, a=0, b=0):
+                super().__init__()
+                self.a = torch.nn.Parameter(torch.tensor(a).float())
+                self.b = torch.nn.Parameter(torch.tensor(b).float())
+
+            def forward(self, x):
+                return x * self.a + self.b
+
+        model = RegressionModel()
+        model.forward = torch.compile(
+            model.forward, backend="aot_eager", fullgraph=True
+        )
+        inputs = torch.randn(4, 10).to("cuda")
+        out = model(inputs)
+        out.sum().backward()
+        self.assertIsNotNone(model.a.grad)
+        self.assertIsNotNone(model.b.grad)
+        self.assertEqual(model.a.grad.device, torch.device("cpu"))
+        self.assertEqual(model.b.grad.device, torch.device("cpu"))
+
     def test_filter_warnings(self):
         x = torch.ones(2, 2, requires_grad=True)
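
Note: an eager-mode reference for the mixed-device setup this test exercises (CUDA required; illustrative only). A 0-dim CPU parameter can be combined with CUDA tensors, and its gradient lands on CPU, which is what the compiled path is expected to reproduce.

import torch

# Illustrative eager baseline: a 0-dim CPU parameter broadcast against a CUDA
# input still receives its gradient on CPU.
a = torch.nn.Parameter(torch.tensor(0.0))  # 0-dim, lives on CPU
x = torch.randn(4, 10, device="cuda")
(x * a).sum().backward()
print(a.grad.device)  # expected: cpu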
