Skip to content

Commit 83b4126

Browse files
committed
[kernacle] add support for addmm and bmm
Title says it all. Differential Revision: [D79940195](https://our.internmc.facebook.com/intern/diff/D79940195/). ghstack-source-id: 301861447. Pull Request resolved: #160239
1 parent 24257f5 commit 83b4126

File tree

4 files changed

+64
-31
lines changed

4 files changed

+64
-31
lines changed

torch/_inductor/kernel/bmm.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import torch
55
from torch._dynamo.utils import counters
66
from torch._inductor.codegen.rocm.ck_universal_gemm_template import CKGemmTemplate
7+
from torch._inductor.remote_gemm_autotune_cache import gen_best_config
78

89
from .. import ir, lowering as L
910
from ..kernel_inputs import MMKernelInputs
@@ -240,7 +241,17 @@ def may_require_contiguous(t, meta_t):
240241
if use_ck_gemm_template(layout, m, n, k):
241242
CKGemmTemplate.add_ck_gemm_choices(choices, layout, kernel_inputs.nodes())
242243

243-
return autotune_select_algorithm(name, choices, kernel_inputs.nodes(), layout)
244+
best_config_future = None
245+
if torch._inductor.config.remote_gemm_autotune_cache:
246+
best_config_future = gen_best_config("bmm", [mat1, mat2])
247+
248+
return autotune_select_algorithm(
249+
name,
250+
choices,
251+
kernel_inputs.nodes(),
252+
layout,
253+
best_config_future=best_config_future,
254+
)
244255

245256

246257
@L.register_lowering(aten.baddbmm)

torch/_inductor/kernel/mm.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -841,7 +841,7 @@ def tuned_mm(mat1, mat2, *, layout=None):
841841
# Purposely not awaiting the future here - this kicks off the best config lookup at lowering time
842842
# The future will be awaited at scheduling time in select_algorithm.py
843843
if torch._inductor.config.remote_gemm_autotune_cache:
844-
best_config_future = gen_best_config(mat1, mat2)
844+
best_config_future = gen_best_config("mm", [mat1, mat2])
845845

846846
return autotune_select_algorithm(
847847
name,
@@ -946,13 +946,19 @@ def tuned_addmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None):
946946
if use_aten_gemm_kernels()
947947
else []
948948
)
949+
950+
best_config_future = None
951+
if torch._inductor.config.remote_gemm_autotune_cache:
952+
best_config_future = gen_best_config("addmm", [mat1, mat2, inp])
953+
954+
# TODO(coconutruben): replace with kernel_inputs.nodes()
955+
# once that supports the unexpanded nodes as well
949956
return autotune_select_algorithm(
950-
# TODO(coconutruben): replace with kernel_inputs.nodes()
951-
# once that supports the unexpanded nodes as well
952957
"addmm",
953958
choices,
954959
[inp, mat1, mat2],
955960
layout,
961+
best_config_future=best_config_future,
956962
)
957963

958964
choices = (
@@ -1055,7 +1061,19 @@ def tuned_addmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None):
10551061
has_bias=True,
10561062
)
10571063

1058-
return autotune_select_algorithm("addmm", choices, kernel_inputs.nodes(), layout)
1064+
best_config_future = None
1065+
if torch._inductor.config.remote_gemm_autotune_cache:
1066+
best_config_future = gen_best_config(
1067+
"addmm", [mat1, mat2, inp], alpha=alpha, beta=beta
1068+
)
1069+
1070+
return autotune_select_algorithm(
1071+
"addmm",
1072+
choices,
1073+
kernel_inputs.nodes(),
1074+
layout,
1075+
best_config_future=best_config_future,
1076+
)
10591077

10601078

10611079
@register_lowering(aten._sparse_semi_structured_mm, type_promotion_kind=None)
Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import asyncio
2-
from typing import TypeVar
2+
from typing import Any, TypeVar
33

44
import torch._inductor.config as config
55
from torch._inductor import ir
@@ -8,13 +8,15 @@
88
_T = TypeVar("_T")
99

1010

11-
def gen_best_config(mat1: ir.StorageBox, mat2: ir.StorageBox) -> asyncio.Task[_T]:
11+
def gen_best_config(
    mm_type: str, mats: list[ir.StorageBox], **kwargs: Any
) -> asyncio.Task[_T]:
    """
    Generate the best GEMM autotune config for the given matrices.

    Args:
        mm_type: kind of GEMM being tuned (e.g. "mm", "addmm", "bmm").
        mats: input matrices for the GEMM, as inductor StorageBox IR nodes.
        **kwargs: extra per-op parameters forwarded to the internal
            implementation (e.g. alpha/beta for addmm).

    Returns:
        An asyncio task resolving to the best autotune config.

    Raises:
        NotImplementedError: outside fbcode, where no remote cache
            implementation is available.
    """
    # Guard clause: only the internal (fbcode) build ships an implementation.
    if not config.is_fbcode():
        raise NotImplementedError("Function gen_best_config is not yet implemented")

    # Deferred import: the fb-internal module does not exist in OSS builds.
    from torch._inductor.fb.remote_gemm_autotune_cache import gen_best_config

    return gen_best_config(mm_type, mats, **kwargs)

torch/_inductor/select_algorithm.py

Lines changed: 25 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -2392,31 +2392,33 @@ def do_autotuning(choices, precompile_fn, hint_override: Optional[int] = None):
23922392

23932393
if best_config_future is not None:
23942394
best_config = await_sync(best_config_future)
2395-
2396-
important_keys = [
2397-
"ACC_TYPE",
2398-
"ALLOW_TF32",
2399-
"BLOCK_K",
2400-
"BLOCK_M",
2401-
"BLOCK_N",
2402-
"EVEN_K",
2403-
"GROUP_M",
2404-
"USE_FAST_ACCUM",
2405-
"num_stages",
2406-
"num_warps",
2407-
"num_consumer_groups",
2408-
"num_buffers_warp_spec",
2409-
]
2410-
choices = [
2411-
choice
2412-
for choice in choices
2413-
if all(
2414-
f"{k}={best_config[k]}" in choice.description
2395+
if best_config:
2396+
important_keys = [
2397+
"ACC_TYPE",
2398+
"ALLOW_TF32",
2399+
"BLOCK_K",
2400+
"BLOCK_M",
2401+
"BLOCK_N",
2402+
"EVEN_K",
2403+
"GROUP_M",
2404+
"USE_FAST_ACCUM",
2405+
"num_stages",
2406+
"num_warps",
2407+
"num_consumer_groups",
2408+
"num_buffers_warp_spec",
2409+
]
2410+
choices = [
2411+
choice
2412+
for choice in choices
2413+
if all(
2414+
f"{k}={best_config[k]}" in choice.description
2415+
for k in important_keys
2416+
)
24152417
for k in important_keys
2418+
]
2419+
log.info(
2420+
"Filtered to %d choices based on best_config", len(choices)
24162421
)
2417-
for k in important_keys
2418-
]
2419-
log.info("Filtered to %d choices based on best_config", len(choices))
24202422

24212423
timings = self.lookup(
24222424
choices,

0 commit comments

Comments (0)