
Commit fd553b9

eellison authored and pytorchmergebot committed
Add remaining method and tests for dtype propagation (#140057)
Adds the remaining unimplemented ops, as well as an assertion failure if someone adds a new op without a dtype rule. We test all unique pointwise operators registered as lowerings that have an OpInfo. There will be some follow-ups for this to work well with `codegen_upcast_to_fp32` as both True and False.

Pull Request resolved: #140057
Approved by: https://github.com/arui-meta, https://github.com/blaine-rister, https://github.com/ezyang
ghstack dependencies: #139945
1 parent 566ceb3 commit fd553b9
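Below is a minimal, hedged sketch of how the new assertion mode can be exercised end to end (it assumes a CUDA device with a working Triton backend; `pointwise_fn` and the tensor shapes are illustrative, not part of this PR). With the test config enabled, Inductor injects `tl.static_assert` dtype checks into the generated Triton kernels, so any disagreement between the propagated dtype and the actual value fails at kernel compile time rather than silently changing precision.

```python
import torch
from torch._inductor import config


def pointwise_fn(x, y):
    # a simple pointwise composition; every traced pointwise op gets dtype-checked
    return torch.mul(x, y) + torch.abs(x)


x = torch.randn(1024, device="cuda", dtype=torch.float32)
y = torch.randn(1024, device="cuda", dtype=torch.float32)

# Enable the runtime dtype assertions added by this PR for the compiled run only.
with config.patch("test_configs.runtime_triton_dtype_assert", True):
    compiled = torch.compile(pointwise_fn)
    torch.testing.assert_close(pointwise_fn(x, y), compiled(x, y))
```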

File tree

7 files changed: +322, -44 lines changed


benchmarks/dynamo/pr_time_benchmarks/expected_results.csv

Lines changed: 6 additions & 6 deletions
@@ -6,27 +6,27 @@ add_loop_eager_dynamic,compile_time_instruction_count,5703000000,0.025
-add_loop_inductor,compile_time_instruction_count,26750000000,0.015
+add_loop_inductor,compile_time_instruction_count,29490000000,0.015
-add_loop_inductor_dynamic_gpu,compile_time_instruction_count,42430000000,0.025
+add_loop_inductor_dynamic_gpu,compile_time_instruction_count,43310000000,0.025
-add_loop_inductor_gpu,compile_time_instruction_count,24790000000,0.015
+add_loop_inductor_gpu,compile_time_instruction_count,25660000000,0.015
 basic_modules_ListOfLinears_eager,compile_time_instruction_count,1033000000,0.015
-basic_modules_ListOfLinears_inductor,compile_time_instruction_count,19970000000,0.015
+basic_modules_ListOfLinears_inductor,compile_time_instruction_count,20790000000,0.015
-basic_modules_ListOfLinears_inductor_gpu_force_shape_pad,compile_time_instruction_count,16450000000,0.015
+basic_modules_ListOfLinears_inductor_gpu_force_shape_pad,compile_time_instruction_count,17020000000,0.015
@@ -62,4 +62,4 @@ aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3863000000,
-aotdispatcher_training_subclass_cpu,compile_time_instruction_count,10360000000,0.015
+aotdispatcher_training_subclass_cpu,compile_time_instruction_count,10410000000,0.015

test/inductor/test_op_dtype_prop.py

Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
# Owner(s): ["module: inductor"]
import importlib
import os
import sys

import torch
from torch._dynamo.utils import disable_cache_limit
from torch._inductor import config
from torch._inductor.test_case import TestCase as InductorTestCase
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_methods_invocations import op_db


# Make the helper files in test/ importable
pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
sys.path.append(pytorch_test_dir)


importlib.import_module("functorch")
importlib.import_module("filelock")


from torch._inductor.lowering import lowerings
from torch.testing._internal.common_device_type import ops
from torch.testing._internal.inductor_utils import HAS_GPU


unique_pointwise_op_names = set()

for op in lowerings:
    if not isinstance(op, torch._ops.OpOverload):
        continue

    if torch.Tag.pointwise not in op.tags:
        continue

    if op._schema.is_mutable:
        continue

    op_name = (op.name().split("::")[-1]).split(".")[0]
    unique_pointwise_op_names.add(op_name)

pointwise_ops = [
    op
    for op in op_db
    if op.name in unique_pointwise_op_names and "reduction" not in op.variant_test_name
]


class TestCase(InductorTestCase):
    @ops(
        pointwise_ops,
        allowed_dtypes=(
            torch.float32,
            torch.float64,
            torch.int32,
            # torch.int64,  # fixed in follow up
            torch.bool,
        ),
    )
    # @config.patch("triton.codegen_upcast_to_fp32", False)  # TODO enable
    @config.patch("test_configs.runtime_triton_dtype_assert", True)
    @disable_cache_limit()
    def test_op_dtype_propagation(self, op, dtype):
        def run(op, args, kwargs):
            return op(*args, **kwargs)

        if op.name == "add":
            self.skipTest("Fixed in follow ups")

        sample_inputs_itr = op.sample_inputs("cuda", dtype, requires_grad=False)
        for sample_input in sample_inputs_itr:
            args = (sample_input.input,) + sample_input.args
            kwargs = sample_input.kwargs
            out = run(op.get_op(), args, kwargs)
            out_c = torch.compile(run)(op.get_op(), args, kwargs)
            self.assertEqual(out, out_c)


instantiate_device_type_tests(TestCase, globals(), only_for=("cuda",))

if __name__ == "__main__":
    from torch._inductor.test_case import run_tests

    if HAS_GPU:
        run_tests(needs="filelock")
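As a companion to the collection logic in the new test, here is a small standalone sketch (the filtering mirrors the test above; only the set arithmetic and printing are added here) that reports how many pointwise lowerings are exercised through an OpInfo and how many are not:

```python
import torch
from torch._inductor.lowering import lowerings
from torch.testing._internal.common_methods_invocations import op_db

pointwise_names = set()
for lowered_op in lowerings:
    if not isinstance(lowered_op, torch._ops.OpOverload):
        continue
    # same filter as the test: non-mutating ops tagged pointwise
    if torch.Tag.pointwise not in lowered_op.tags or lowered_op._schema.is_mutable:
        continue
    pointwise_names.add(lowered_op.name().split("::")[-1].split(".")[0])

opinfo_names = {opinfo.name for opinfo in op_db}
covered = sorted(pointwise_names & opinfo_names)
uncovered = sorted(pointwise_names - opinfo_names)
print(f"{len(covered)} pointwise lowerings have OpInfo coverage; {len(uncovered)} do not")
```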

torch/_inductor/codegen/common.py

Lines changed: 21 additions & 3 deletions
@@ -1791,21 +1791,39 @@ def inner(*args, **kwargs):
             value = getattr(parent_handler, name)(*args, **kwargs)  # type: ignore[has-type]
             dtype_handler = DtypePropagationOpsHandler()

+            output_idx = 0
+
             def do_cse(v):
-                # TODO - throw on default
                 output_dtype = getattr(
                     dtype_handler,
                     name,
-                    dtype_handler.default_handler,
-                )(*args)
+                )(*args, **kwargs)

                 csevar = V.kernel.cse.generate(
                     V.kernel.compute,
                     v,
                     bounds=bounds,
                     dtype=output_dtype,
                 )
+
+                nonlocal output_idx
+                if config.test_configs.runtime_triton_dtype_assert and not (
+                    V.graph.get_current_device_or_throw().type == "cpu"
+                    and config.cpu_backend != "triton"
+                ):
+                    from torch._inductor.codegen.triton import triton_type
+
+                    # we tree_map over the output, so we need to fetch corresponding dtype
+                    if isinstance(output_dtype, (list, tuple)):
+                        output_dtype = output_dtype[output_idx]
+
+                    V.kernel.compute.writeline(
+                        f"tl.static_assert({csevar}.dtype == {triton_type(output_dtype)})"
+                    )
+                    output_idx += 1
+
                 csevar.update_on_args(name, args, kwargs)
+
                 return csevar

             return pytree.tree_map(do_cse, value)
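To make the effect of the `writeline` call above concrete, here is a hand-written, hypothetical Triton kernel (not actual Inductor output; the kernel name, variable names, and shapes are illustrative) showing the kind of line that gets injected when `test_configs.runtime_triton_dtype_assert` is enabled:

```python
import triton
import triton.language as tl


@triton.jit
def add_one_kernel(in_ptr, out_ptr, xnumel, XBLOCK: tl.constexpr):
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)
    xmask = xindex < xnumel
    tmp0 = tl.load(in_ptr + xindex, mask=xmask)
    tmp1 = tmp0 + 1.0
    # Injected dtype check: tl.static_assert is evaluated when the kernel is
    # compiled, so a wrong propagated dtype surfaces immediately instead of
    # silently producing a differently-typed intermediate.
    tl.static_assert(tmp1.dtype == tl.float32)
    tl.store(out_ptr + xindex, tmp1, mask=xmask)
```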

torch/_inductor/codegen/triton.py

Lines changed: 3 additions & 1 deletion
@@ -1092,7 +1092,9 @@ def sigmoid(x):
     @staticmethod
     def signbit(x):
         # XX: This is wrong for the value -0.0 in floating point
-        return f"libdevice.signbit({x}) if ({x}).dtype is tl.float32 else {x} < 0"
+        return (
+            f"(libdevice.signbit({x}) != 0) if ({x}).dtype is tl.float32 else {x} < 0"
+        )

     @staticmethod
     def fmod(a, b):

torch/_inductor/config.py

Lines changed: 2 additions & 0 deletions
@@ -1353,6 +1353,8 @@ class trace:
 class test_configs:
     force_extern_kernel_in_multi_template = False

+    runtime_triton_dtype_assert = False
+

 if TYPE_CHECKING:
     from torch.utils._config_typing import *  # noqa: F401, F403
