WIP add support for dynamic shapes

eellison · eellison · commit 2d58dd405b58 · 2025-08-07T11:42:54.000-07:00
ghstack-source-id: 6e66931 Pull Request resolved: #155557
diff --git a/test/inductor/test_loop_ordering.py b/test/inductor/test_loop_ordering.py
@@ -896,7 +896,9 @@ def forward(permute):
             arg0_1 = torch.randn([XDIM, YDIM], device=GPU_TYPE, dtype=torch.bfloat16)
             permute = torch.ops.aten.permute.default(arg0_1, [1, 0])
 
-            out, code = run_and_get_code(torch.compile(forward), (permute))
+            out, code = run_and_get_code(
+                torch.compile(forward, dynamic=True), (permute)
+            )
 
             self.assertEqual(out, forward(permute))
             FileCheck().check("YBLOCK").check("XBLOCK").run(code[0])
@@ -937,12 +939,13 @@ def T(self, layout: str):
 
     @parametrize("a", layouts)
     @parametrize("b", layouts)
-    def test_pointwise(self, a, b):
+    @parametrize("dynamic", (False, True))
+    def test_pointwise(self, a, b, dynamic):
         def foo(x, y):
             return x + y
 
         x, y = self.T(a), self.T(b)
-        res, code = run_and_get_code(torch.compile(foo), x, y)
+        res, code = run_and_get_code(torch.compile(foo, dynamic=dynamic), x, y)
 
         if a != b:
             FileCheck().check("ynumel").run(code[0])
@@ -968,13 +971,14 @@ def f(a, b):
         ).run(code[0])
         self.assertEqual(out, f(*inps), atol=0.001, rtol=0.04)
 
-    def test_3d_pointwise(self):
+    @parametrize("dynamic", (False, True))
+    def test_3d_pointwise(self, dynamic):
         inps = (self.T("cont"), self.T("T"), self.T("NHWC"))
 
         def f(x, y, z):
             return x + y + z
 
-        f_c = torch.compile(f)
+        f_c = torch.compile(f, dynamic=dynamic)
         out, code = run_and_get_code(f_c, *inps)
 
         FileCheck().check_dag("znumel").check_dag("ynumel").check_dag("xnumel").run(
diff --git a/torch/_inductor/codegen/simd.py b/torch/_inductor/codegen/simd.py
@@ -2179,12 +2179,15 @@ def compute_tiling_strategy(
         pw_ranges = [ranges[v] for v in all_iter_vars]
         red_ranges = [ranges[v] for v in all_red_vars]
 
+        def check_eq(a, b):
+            return V.graph.sizevars.atomically_apply_size_hint(a - b, fallback=32) == 0
+
         torch._check(
-            sympy_product(pw_ranges) == pointwise_numel,
+            check_eq(sympy_product(pw_ranges), pointwise_numel),
             lambda: f"{pw_ranges}, {pointwise_numel}, {node_schedule}",
         )
         torch._check(
-            sympy_product(red_ranges) == reduction_numel,
+            check_eq(sympy_product(red_ranges), reduction_numel),
             lambda: f"{red_ranges}, {reduction_numel}, {node_schedule}",
         )
 
@@ -2331,7 +2334,7 @@ def process_node_vars(
         def score_mod(t):
             score_factor = 1.0
             for tile_size in t[0].tiling.values():
-                if not CandidateTiling.is_good_size(tile_size):
+                if not CandidateTiling.is_good_size(tile_size, size_hint=False):
                     score_factor = score_factor / bad_size_additional_tiling_penalty
                 else:
                     score_factor = score_factor / good_size_tiling_penalty
@@ -2588,8 +2591,12 @@ class CandidateTiling:
     name: Optional[str] = None
 
     @staticmethod
-    def is_good_size(s):
+    def is_good_size(s, size_hint=True):
         """Somewhat arbitrary heuristic used to boost scores for some sizes"""
+        sv = V.graph.sizevars
+        if not size_hint:
+            return sv.statically_known_multiple_of(s, 32) and sv.statically_known_geq(s, 32)
+
         s = V.graph.sizevars.size_hint(s)
         return s >= 32 and (s % 32 == 0)
 
diff --git a/torch/_inductor/sizevars.py b/torch/_inductor/sizevars.py
@@ -380,13 +380,6 @@ def statically_known_multiple_of(
         """
         Return a bool indicating if it is sound to optimize for the numerator being a multiple of the denominator.
         """
-        # The reason we skip compute here is to avoid the cost of trying to eval this symbolically.
-        # see https://github.com/sympy/sympy/issues/28200
-        if has_free_unbacked_symbols(numerator) or has_free_unbacked_symbols(
-            denominator
-        ):
-            return False
-
         if len(free_symbols(numerator)) > 20:
             return False
 
diff --git a/torch/_inductor/tiling_utils.py b/torch/_inductor/tiling_utils.py
@@ -221,13 +221,17 @@ def get_pw_red_splits(
     red_numel: sympy.Expr,
     none_if_not_divisible: bool = False,
 ) -> Optional[tuple[VarsAndRanges, VarsAndRanges]]:
-    if n.is_reduction() or sympy_product(n._body.sizes[0]) == pointwise_numel:
+    n_pointwise_numel = V.graph.sizevars.simplify(sympy_product(n._body.sizes[0]))
+    if n.is_reduction() or n_pointwise_numel == pointwise_numel:
         return (
             (n._body.iter_vars, n._body.sizes[0]),
             (n._body.reduce_vars, n._body.sizes[1]),
         )  # type: ignore[return-value]
 
-    assert sympy_product(n._body.sizes[0]) == pointwise_numel * red_numel  # type: ignore[operator]
+    assert V.graph.sizevars.atomically_apply_size_hint(
+        n_pointwise_numel - (pointwise_numel * red_numel), fallback=config.unbacked_symint_fallback
+    ) == 0
+
     i = len(n._body.sizes[0]) - 1
     prod = 1
     while i >= 0:
@@ -319,6 +323,7 @@ def get_node_splits(self) -> tuple[Split, Split]:
 
         if len(self.all_node_sizes) == 1:
             return next(iter(self.all_node_sizes))
+        # TODO: return the default (pointwise, reduction) split
 
         max_pw_split = max(self.pw_split_options.keys())
         for pw_split_len in range(max_pw_split, 0, -1):
@@ -478,13 +483,6 @@ def extract_normalized_read_writes(
     pointwise_numel: sympy.Expr = node.group[1][0]
     red_numel: sympy.Expr = node.group[1][1]
 
-    # TODO - a few dynamic shapes issues to resolve
-    if any(
-        (isinstance(var, sympy.Expr) and not var.is_constant())
-        for var in (pointwise_numel, red_numel)
-    ):
-        return None
-
     pw_splits, red_splits = NodeSplitGetter(node).get_node_splits()
 
     # lets use different prefix (`n`) to distinguish
@@ -663,13 +661,8 @@ def analyze_memory_coalescing(
         ((True, item) for item in reads.items()),
         ((False, item) for item in writes.items()),
     ):
-        # skip memory deps with indirect vars - todo: better handling
-        indirect_expr = bool(
-            memory_expr.free_symbols - norm_read_writes.var_ranges.keys()
-        )
-
-        if indirect_expr:
-            continue
+        # TODO: skip memory deps with indirect vars here; this is currently
+        # handled in extract_normalized_read_writes.
 
         size = get_score(memory_expr, var_ranges)
         if size == 0:
@@ -699,8 +692,8 @@ def analyze_memory_coalescing(
     tiling_scores: dict[sympy.Expr, dict[int, int]] = defaultdict(Counter)
 
     for uncoalesced_expr, addr_score in uncoalesced_addrs.items():
-        expr_subs = dict.fromkeys(uncoalesced_expr.free_symbols, 0)
-        for v in uncoalesced_expr.free_symbols:
+        expr_subs = dict.fromkeys(var_ranges.keys(), 0)
+        for v in uncoalesced_expr.free_symbols & var_ranges.keys():
             # skip non iter/reduce var variables
             if v not in var_ranges:
                 continue
@@ -710,7 +703,13 @@ def analyze_memory_coalescing(
             del expr_subs[v]
             single_var_expr = sympy_subs(uncoalesced_expr, expr_subs)
             expr_subs[v] = 0
+
+            # TODO: dynamic shapes; for now require exactly one free symbol.
+            if len(single_var_expr.free_symbols) != 1:
+                continue
+
             tiling_factor = solve_for_tiling(single_var_expr)
+
             if (
                 tiling_factor is None
                 or not tiling_factor.is_constant()
diff --git a/torch/utils/_sympy/functions.py b/torch/utils/_sympy/functions.py
@@ -237,6 +237,9 @@ def eval(
             return base
         if base.is_integer and equal_valued(divisor, -1):
             return sympy.Mul(base, -1)
+        if base is divisor:
+            return sympy.S.One
+
         if (
             isinstance(base, sympy.Number)
             and isinstance(divisor, sympy.Number)
@@ -324,11 +327,15 @@ def eval(
             if divisor != 1:
                 gcd = sympy.gcd(base, divisor)
                 if gcd != 1:
-                    return ModularIndexing(
-                        sympy.simplify(base / gcd),
-                        sympy.simplify(divisor / gcd),
-                        modulus,
-                    )
+                    try:
+                        return ModularIndexing(
+                            sympy.simplify(base / gcd),
+                            sympy.simplify(divisor / gcd),
+                            modulus,
+                        )
+                    except Exception:
+                        breakpoint()
+                        raise
         except sympy.PolynomialError:
             pass  # https://github.com/pytorch/pytorch/issues/108276