
Commit 0e2013a

oulgen authored and pytorchmergebot committed
Add helion x pt2 test (#155513)
This kinda just worked out of the box, shocking. PT2 traced into helion and emitted it as a user-defined triton kernel: P1836496774. In the long run we do not actually want this; instead we want to create a helion HOP so we can do fusions etc.

Pull Request resolved: #155513
Approved by: https://github.com/zou3519, https://github.com/jansel
1 parent 5b9db43 commit 0e2013a
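
The pattern the new test exercises can be condensed as follows. This is an illustrative sketch only (the shapes, block sizes, and the local names `add` and `f` are arbitrary); the authoritative version is the test file in the diff below.

import torch
import helion
import helion.language as hl

# A trivial elementwise Helion kernel; the block sizes are illustrative.
@helion.kernel(config=helion.Config(block_sizes=[1, 2]))
def add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    out = torch.empty_like(x)
    for tile in hl.tile(out.size()):
        out[tile] = x[tile] + y[tile]
    return out

def f(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    return add(x, y)

# PT2 traces into the Helion kernel and, for now, emits it as a
# user-defined Triton kernel rather than a dedicated Helion HOP.
compiled = torch.compile(f, fullgraph=True, backend="inductor")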

File tree

4 files changed: +73 -0 lines changed

.ci/docker/common/install_triton.sh

Lines changed: 3 additions & 0 deletions
@@ -98,3 +98,6 @@ fi
 if [ -n "${NUMPY_VERSION}" ]; then
   pip_install "numpy==${NUMPY_VERSION}"
 fi
+if [[ "$ANACONDA_PYTHON_VERSION" != 3.9* ]]; then
+  pip_install helion
+fi
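
The 3.9 gate suggests helion needs a newer interpreter; that is an inference from the CI script, not something the commit states. A hypothetical Python-side mirror of the same guard:

import sys

# Hypothetical mirror of the shell gate above: the CI script skips
# installing helion on Python 3.9, so guard the import the same way.
HELION_AVAILABLE = False
if sys.version_info >= (3, 10):
    try:
        import helion  # noqa: F401
        HELION_AVAILABLE = True
    except ImportError:
        pass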

test/inductor/test_helion_kernels.py

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
+# Owner(s): ["module: inductor"]
+import torch
+from torch._inductor.test_case import run_tests, TestCase
+from torch.testing._internal.common_utils import instantiate_parametrized_tests
+from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_HELION, requires_helion
+
+
+if HAS_HELION:
+    import helion
+    import helion.language as hl
+
+
+class HelionTests(TestCase):
+    @requires_helion()
+    def test_add_kernel(self):
+        @helion.kernel(config=helion.Config(block_sizes=[1, 2]))
+        def add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+            # match pytorch broadcasting rules
+            x, y = torch.broadcast_tensors(x, y)
+            out = torch.empty(
+                x.shape,
+                # match type promotion of torch.add
+                dtype=torch.promote_types(x.dtype, y.dtype),
+                device=x.device,
+            )
+            # tile will be a tuple of blocks
+            for tile in hl.tile(out.size()):
+                out[tile] = x[tile] + y[tile]
+            return out
+
+        def f(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+            return add(x, y)
+
+        x = torch.randn(4, 8, device=GPU_TYPE, dtype=torch.float16)
+        y = torch.randn(4, 8, device=GPU_TYPE, dtype=torch.float16)
+
+        out = add(x, y)
+        compiled_add = torch.compile(f, fullgraph=True, backend="inductor")
+        compiled_out = compiled_add(x, y)
+
+        self.assertEqual(out, x + y)
+        self.assertEqual(compiled_out, x + y)
+
+
+instantiate_parametrized_tests(HelionTests)
+
+
+if __name__ == "__main__":
+    run_tests()
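
As a debugging aid beyond the equality asserts (not part of the commit), torch._dynamo.explain can confirm that tracing produced a single graph. Illustrative only, reusing the names f, x, and y from the test body above:

# Hypothetical check: with fullgraph=True a graph break would raise,
# so explain() is expected to report one graph and zero breaks.
explanation = torch._dynamo.explain(f)(x, y)
print(explanation.graph_count, explanation.graph_break_count)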

torch/testing/_internal/inductor_utils.py

Lines changed: 4 additions & 0 deletions
@@ -16,6 +16,7 @@
 from torch._inductor.codecache import CppCodeCache
 from torch._inductor.utils import get_gpu_shared_memory, is_big_gpu
 from torch._inductor.utils import GPU_TYPES, get_gpu_type, is_gpu
+from torch.utils._helion import has_helion
 from torch.utils._triton import has_triton
 from torch.testing._internal.common_device_type import (
     get_desired_device_type_test_bases,
@@ -48,6 +49,8 @@ def test_cpu():
 
 HAS_TRITON = has_triton()
 
+HAS_HELION = has_helion()
+
 if HAS_TRITON:
     import triton
     TRITON_HAS_CPU = "cpu" in triton.backends.backends
@@ -133,6 +136,7 @@ def skip_windows_ci(name: str, file: str) -> None:
 # TODO: Remove HAS_MPS condition when `HAS_GPU` includes HAS_MPS
 requires_gpu = functools.partial(unittest.skipIf, not (HAS_GPU or HAS_MPS), "requires gpu")
 requires_triton = functools.partial(unittest.skipIf, not HAS_TRITON, "requires triton")
+requires_helion = functools.partial(unittest.skipIf, not HAS_HELION, "requires helion")
 
 def requires_cuda_with_enough_memory(min_mem_required):
     def inner(fn):
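
requires_helion follows the existing requires_triton pattern: a partially applied unittest.skipIf, so @requires_helion() yields a decorator that skips the test when helion is absent. A self-contained sketch of that mechanism, with HAS_HELION hard-coded for illustration:

import functools
import unittest

HAS_HELION = False  # stand-in; the real value comes from has_helion()

# Partially apply skipIf: calling requires_helion() produces a decorator
# that skips the test whenever helion is unavailable.
requires_helion = functools.partial(unittest.skipIf, not HAS_HELION, "requires helion")

class Demo(unittest.TestCase):
    @requires_helion()
    def test_uses_helion(self):
        import helion  # noqa: F401  # only reached when helion is installed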

torch/utils/_helion.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+import functools
+
+from torch.utils._triton import has_triton
+
+
+@functools.lru_cache(None)
+def has_helion_package() -> bool:
+    try:
+        import helion  # type: ignore[import-untyped, import-not-found]  # noqa: F401
+    except ImportError:
+        return False
+    return True
+
+
+@functools.lru_cache(None)
+def has_helion() -> bool:
+    return has_helion_package() and has_triton()
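
Both helpers are cached with functools.lru_cache, so repeated capability checks are cheap. A minimal usage sketch mirroring the inductor_utils change above:

from torch.utils._helion import has_helion

# Evaluated once and cached; has_helion() also requires a working triton.
HAS_HELION = has_helion()

if HAS_HELION:
    import helion
    import helion.language as hl  # noqa: F401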
