
Commit 64de1a9

pytorchmergebot authored and yangw-dev committed
Revert "[inductor] consolidate common GEMM triton param retrieval (#158015)"
This reverts commit 9faef3d. Reverted #158015 on behalf of https://github.com/henrylhtsang due to breaking tests ([comment](#158015 (comment)))
1 parent 9c917a9 · commit 64de1a9

File tree

10 files changed: +454 -1228 lines

test/inductor/test_max_autotune.py

Lines changed: 3 additions & 6 deletions
@@ -35,10 +35,7 @@
     TritonTemplate,
     TritonTemplateCaller,
 )
-from torch._inductor.template_heuristics import (
-    CUDAMMTemplateConfigHeuristic,
-    GemmConfig,
-)
+from torch._inductor.template_heuristics import CUDAConfigHeuristic, GemmConfig
 from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FP8
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
@@ -1564,9 +1561,9 @@ def f(a, b):
         b = torch.randn(K, N, dtype=torch.float16, device="cuda", requires_grad=True)

         with mock.patch(
-            "torch._inductor.template_registry.get_template_heuristic"
+            "torch._inductor.kernel.mm.V.choices.get_config_heuristics"
         ) as config_mock:
-            config_heuristics = CUDAMMTemplateConfigHeuristic()
+            config_heuristics = CUDAConfigHeuristic()

             # Traditionally, this would be set of all possible configs
             # We mock out the code path for the sake of the unit test

torch/_inductor/choices.py

Lines changed: 52 additions & 34 deletions
@@ -9,7 +9,6 @@

 from . import config
 from .codecache import write_text
-from .kernel_inputs import KernelInputs  # noqa: TC001
 from .metrics import get_metric_table, is_metric_table_enabled
 from .runtime.hints import DeviceProperties, ReductionHint
 from .scheduler import BaseSchedulerNode, Scheduler, WhyNoFuse
@@ -20,7 +19,6 @@
     ROCmConfigHeuristic,
     XPUConfigHeuristic,
 )
-from .template_registry import get_template_heuristic
 from .virtualized import V


@@ -70,6 +68,58 @@ def get_config_heuristics(
         else:
             return BaseConfigHeuristic()

+    # GEMM configs
+    def get_base_mm_configs(
+        self, device_type: Optional[str] = "cuda"
+    ) -> partial[Generator[TritonConfig, None, None]]:
+        mm_heuristics = self.get_config_heuristics(device_type)
+        if config.max_autotune_gemm_search_space != "EXHAUSTIVE":
+            return mm_heuristics.get_mm_configs()
+        else:
+            return mm_heuristics.get_exhaustive_mm_configs()
+
+    def get_extra_mm_configs(
+        self, device_type: Optional[str] = "cuda"
+    ) -> partial[Generator[TritonConfig, None, None]]:
+        mm_heuristics = self.get_config_heuristics(device_type)
+        return mm_heuristics.get_extra_mm_configs()
+
+    def get_int8_mm_configs(
+        self, device_type: Optional[str] = "cuda"
+    ) -> partial[Generator[TritonConfig, None, None]]:
+        mm_heuristics = self.get_config_heuristics(device_type)
+        return mm_heuristics.get_int8_mm_configs()
+
+    def get_mixed_mm_configs(
+        self, device_type: Optional[str] = "cuda"
+    ) -> partial[Generator[TritonConfig, None, None]]:
+        mm_heuristics = self.get_config_heuristics(device_type)
+        return mm_heuristics.get_mixed_mm_configs()
+
+    def get_persistent_mm_configs(
+        self, device_type: Optional[str] = "cuda"
+    ) -> partial[Generator[TritonConfig, None, None]]:
+        mm_heuristics = self.get_config_heuristics(device_type)
+        return mm_heuristics.get_persistent_mm_configs()
+
+    def get_scaled_mm_configs(
+        self, device_type: Optional[str] = "cuda"
+    ) -> partial[Generator[TritonConfig, None, None]]:
+        mm_heuristics = self.get_config_heuristics(device_type)
+        return mm_heuristics.get_scaled_mm_configs()
+
+    def get_scaled_persistent_mm_configs(
+        self, device_type: Optional[str] = "cuda"
+    ) -> partial[Generator[TritonConfig, None, None]]:
+        mm_heuristics = self.get_config_heuristics(device_type)
+        return mm_heuristics.get_scaled_persistent_mm_configs()
+
+    def get_mm_plus_mm_configs(
+        self, device_type: Optional[str] = "cuda"
+    ) -> partial[Generator[TritonConfig, None, None]]:
+        mm_heuristics = self.get_config_heuristics(device_type)
+        return mm_heuristics.get_mm_plus_mm_configs()
+
     # Conv configs
     def get_conv_configs(
         self, device_type: Optional[str] = "cuda"
@@ -78,7 +128,6 @@ def get_conv_configs(
         return conv_heuristics.get_conv_configs()

     # Flex attention configs
-    # TODO(coconutruben): break out flexattention/decode configs into the new retrieval mechanism
     def get_flex_attention_fwd_configs(
         self, head_dim: int, dtype: torch.dtype, device_type: Optional[str] = "cuda"
     ) -> list[Any]:
@@ -97,37 +146,6 @@ def get_flex_decode_configs(
         flex_heuristics = self.get_config_heuristics(device_type)
         return flex_heuristics.get_flex_decode_configs(head_dim, dtype)

-    def get_mm_configs(
-        self,
-        kernel_inputs: KernelInputs,
-        layout: Any,
-        template_name: str,
-        op_name: str,
-    ) -> Generator[dict[str, Any], None, None]:
-        """
-        Get generator of template parameters for MM templates using template-specific heuristics.
-
-        Args:
-            kernel_inputs: MMKernelInputs containing input tensor nodes and matrix indices
-            layout: Output layout
-            template_name: Template name (e.g., "bmm", "mm", "mm_persistent_tma")
-            op_name: Operation name (e.g., "bmm", "baddbmm", "addmm", "mm_plus_mm")
-
-        Yields:
-            Template parameter dictionaries ready for maybe_append_choice
-        """
-        input_tensors = kernel_inputs.nodes()
-        if len(input_tensors) < 2:
-            raise ValueError(f"Need at least 2 input tensors, got {len(input_tensors)}")
-
-        # Extract device_type from kernel_inputs
-        device_type = kernel_inputs.device_type
-        assert device_type is not None, "get_mm_configs requires a valid device type"
-        # Get the appropriate template-specific heuristic
-        heuristic = get_template_heuristic(template_name, device_type)
-
-        yield from heuristic.get_template_configs(kernel_inputs, layout, op_name)
-
     def triton_kernel_kwargs(
         self,
         kernel_cls: type[TritonKernel],
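
For contrast, here is a side-by-side sketch of the two call patterns this revert swaps. It assumes the surrounding kernel variables (choices, m, n, k, layout, mat1, mat2, kernel_inputs, device_type) are in scope as they are in torch/_inductor/kernel/bmm.py, and it only illustrates the shape of the loop; it is not code from the commit.

# Old path (removed in torch/_inductor/choices.py above): a template-registry
# heuristic yields finished kwarg dicts for maybe_append_choice.
for kwargs in V.choices.get_mm_configs(kernel_inputs, layout, bmm_template.name, "bmm"):
    bmm_template.maybe_append_choice(
        choices, input_nodes=kernel_inputs.nodes(), layout=layout, **kwargs
    )

# Restored path (re-added above): a per-op getter returns raw Triton configs,
# and the call site converts each one into template kwargs via mm_options.
bmm_configs = V.choices.get_base_mm_configs(device_type)
for config in bmm_configs(m, n, k, **mm_config_kwargs(device_type, _is_large_block_for_cpu)):
    bmm_template.maybe_append_choice(
        choices, input_nodes=(mat1, mat2), layout=layout, **mm_options(config, m, n, k, layout)
    )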

torch/_inductor/kernel/bmm.py

Lines changed: 37 additions & 33 deletions
@@ -6,7 +6,6 @@
 from torch._inductor.codegen.rocm.ck_universal_gemm_template import CKGemmTemplate

 from .. import ir, lowering as L
-from ..kernel_inputs import MMKernelInputs
 from ..select_algorithm import (
     autotune_select_algorithm,
     ExternKernelChoice,
@@ -27,6 +26,8 @@
     addmm_epilogue,
     is_batch_stride_largest,
     mm_args,
+    mm_config_kwargs,
+    mm_options,
 )


@@ -39,6 +40,13 @@ def bmm_grid(b, m, n, meta, *, cdiv):
     return (cdiv(m, meta["BLOCK_M"]) * cdiv(n, meta["BLOCK_N"]), b, 1)


+def _is_large_block_for_cpu(m, n, k):
+    # Thresholds are experimentally determined to reduce Triton CPU compile times
+    if m > 128 or n > 128 or k > 128:
+        return True
+    return m * n > 2**12
+
+
 bmm_template = TritonTemplate(
     name="bmm",
     grid=bmm_grid,
@@ -167,14 +175,9 @@ def may_require_contiguous(t, meta_t):
         meta_mat2 = V.graph.current_node.args[1]
         mat2 = may_require_contiguous(mat2, meta_mat2)

-    # TODO(coconutruben): integrate into MMKernelInputs when all callsites use that
     m, n, k, layout, mat1, mat2 = mm_args(
         mat1, mat2, layout=layout, out_dtype=out_dtype
     )
-    name = "bmm"
-
-    # Create MMKernelInputs for BMM at the top
-    kernel_inputs = MMKernelInputs([mat1, mat2])

     # below is for getting an overview logging info of inductor mms
     batch_size = mat1.get_size()[0]  # Extract batch dimension
@@ -192,65 +195,63 @@ def may_require_contiguous(t, meta_t):

     if out_dtype:
         assert mat1.get_device().type == "cuda", "out_dtype is only supported for CUDA"
-        aten_func = aten_bmm_dtype.bind(
-            kernel_inputs.nodes(), layout, out_dtype=out_dtype
-        )
+        aten_func = aten_bmm_dtype.bind((mat1, mat2), layout, out_dtype=out_dtype)
     else:
-        aten_func = aten_bmm.bind(kernel_inputs.nodes(), layout)
+        aten_func = aten_bmm.bind((mat1, mat2), layout)

     # options to tune from
     choices = [aten_func] if use_aten_gemm_kernels() else []

+    device_type = ir.get_device_type(mat1)
+    bmm_configs = V.choices.get_base_mm_configs(device_type)
+
+    dtype = mat1.get_dtype()
     if use_triton_template(layout):
         # TODO: add out_dtype support for Triton Template
         assert out_dtype is None, "out_dtype is not supported for Triton"
-
-        for kwargs in V.choices.get_mm_configs(
-            kernel_inputs, layout, bmm_template.name, name
+        for config in bmm_configs(
+            m,
+            n,
+            k,
+            **mm_config_kwargs(device_type, _is_large_block_for_cpu, dtype.itemsize),
         ):
             bmm_template.maybe_append_choice(
                 choices,
-                input_nodes=kernel_inputs.nodes(),
+                input_nodes=(mat1, mat2),
                 layout=layout,
-                **kwargs,
+                **mm_options(config, m, n, k, layout),
             )
     _, is_nonzero = _is_static_problem(layout)
     batch_stride_largest = is_batch_stride_largest(mat1, mat2, layout)
     if (
         batch_stride_largest
         and is_nonzero
         and use_cutlass_template(layout, m, n, k)
-        and _use_cutlass_for_op(name)
+        and _use_cutlass_for_op("bmm")
     ):
         from ..codegen.cuda.gemm_template import CUTLASS3xGemmTemplate

-        CUTLASS3xGemmTemplate.add_cutlass_gemm_choices(
-            choices, layout, kernel_inputs.nodes()
-        )  # type: ignore[arg-type]
+        CUTLASS3xGemmTemplate.add_cutlass_gemm_choices(choices, layout, [mat1, mat2])  # type: ignore[arg-type]

     if use_cpp_bmm_template(layout, mat1, mat2):
         from ..codegen.cpp_bmm_template import CppBmmTemplate

         CppBmmTemplate.add_choices(
             choices,
             layout,
-            kernel_inputs.nodes(),
+            [mat1, mat2],
         )

     if use_ck_gemm_template(layout, m, n, k):
-        CKGemmTemplate.add_ck_gemm_choices(choices, layout, kernel_inputs.nodes())
+        CKGemmTemplate.add_ck_gemm_choices(choices, layout, [mat1, mat2])

-    return autotune_select_algorithm(name, choices, kernel_inputs.nodes(), layout)
+    return autotune_select_algorithm("bmm", choices, [mat1, mat2], layout)


 @L.register_lowering(aten.baddbmm)
 def tuned_baddbmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None):
-    # TODO(coconutruben): integrate into MMKernelInputs when all callsites use that
     m, n, k, layout, mat1, mat2, inp = mm_args(mat1, mat2, inp, layout=layout)

-    # Create MMKernelInputs for BadDBMM at the top
-    kernel_inputs = MMKernelInputs([inp, mat1, mat2])
-
     # below is for getting an overview logging info of inductor mms
     batch_size = mat1.get_size()[0]
     counters["aten_mm_info"][f"aten.baddbmm_{batch_size}_{m}_{n}_{k}"] += 1
@@ -265,26 +266,29 @@ def tuned_baddbmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None):
         inp.get_dtype(),
         layout,
     )
-    name = "baddbmm"
+
     # options to tune from
     choices = (
-        [aten_baddbmm.bind(kernel_inputs.nodes(), layout, alpha=alpha, beta=beta)]
+        [aten_baddbmm.bind((inp, mat1, mat2), layout, alpha=alpha, beta=beta)]
         if use_aten_gemm_kernels()
         else []
     )

+    device_type = ir.get_device_type(mat1)
+    bmm_configs = V.choices.get_base_mm_configs(device_type)
+
     if use_triton_template(layout):
-        for kwargs in V.choices.get_mm_configs(
-            kernel_inputs, layout, bmm_template.name, name
+        for config in bmm_configs(
+            m, n, k, **mm_config_kwargs(device_type, _is_large_block_for_cpu)
         ):
             bmm_template.maybe_append_choice(
                 choices,
-                input_nodes=kernel_inputs.nodes(),
+                input_nodes=(inp, mat1, mat2),
                 layout=layout,
-                **kwargs,
+                **mm_options(config, m, n, k, layout),
                 prefix_args=1,
                 epilogue_fn=addmm_epilogue(layout.dtype, alpha, beta),
                 epilogue_fn_hash=str(["addmm_epilogue", layout.dtype, alpha, beta]),
             )

-    return autotune_select_algorithm(name, choices, kernel_inputs.nodes(), layout)
+    return autotune_select_algorithm("baddbmm", choices, [inp, mat1, mat2], layout)

torch/_inductor/kernel/conv.py

Lines changed: 9 additions & 0 deletions
@@ -29,6 +29,7 @@
     use_triton_template,
 )
 from ..virtualized import V
+from .mm_common import mm_config_kwargs


 if TYPE_CHECKING:
@@ -60,6 +61,13 @@ def conv3d_grid(n, c, d, h, w, meta, *, cdiv):
     )


+def _is_large_block_for_cpu(m, n, k):
+    # Thresholds are experimentally determined to reduce Triton CPU compile times
+    if m > 256 or n > 256 or k > 256:
+        return True
+    return m * n * k > 2**17
+
+
 LOOP_BODY_2D = """
     idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H
     idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W
@@ -595,6 +603,7 @@ def channels_last_conv():
             sympy_product([x.get_size()[0], *x.get_size()[2:]]),
             out_chan,
             in_chan,
+            **mm_config_kwargs(device_type, _is_large_block_for_cpu),
         ):
             if ndim == 2:
                 conv2d_template.maybe_append_choice(
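
The conv variant of the predicate uses larger thresholds (any dimension above 256, or a full m * n * k volume above 2**17). The same kind of standalone, illustrative check applies:

def _is_large_block_for_cpu(m, n, k):
    # conv.py variant of the predicate from the hunk above
    if m > 256 or n > 256 or k > 256:
        return True
    return m * n * k > 2**17

# Boundary checks (illustrative only, not part of the diff):
assert _is_large_block_for_cpu(512, 8, 8)        # a single dimension over 256 trips the guard
assert _is_large_block_for_cpu(64, 64, 33)       # 64 * 64 * 33 = 135168 > 131072
assert not _is_large_block_for_cpu(64, 64, 32)   # 131072 is not > 131072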
