llvm · adam-smnk · Aug 29, 2025 · Aug 26, 2025 · Aug 29, 2025 · Aug 29, 2025
@@ -142,14 +142,17 @@ class AMX_Op<string mnemonic, list<Trait> traits = []> :
 // Tile reset.
 //
 
-def TileZeroOp : AMX_Op<"tile_zero", [Pure,
-    AMXIntrinsicOpInterface
+def TileZeroOp : AMX_Op<"tile_zero", [
+    AMXIntrinsicOpInterface,
+    MemoryEffects<[MemWrite]>
   ]> {
   let summary = "tile zero operation";
   let description = [{
     Zeroes the destination tile, with the shape defined by the 2-dim
     vector type of the result. This is eventually lowered into the
     "tilezero" instruction with the corresponding tile configuration.
+    With memory-effects, each "tilezero" operation serves as a compilation 
+    hint to use a separate tile register.
 
     Example:
 
@@ -179,15 +182,17 @@ def TileZeroOp : AMX_Op<"tile_zero", [Pure,
 // Tile memory operations.
 //
 
-def TileLoadOp : AMX_Op<"tile_load", [Pure,
-    AMXIntrinsicOpInterface
+def TileLoadOp : AMX_Op<"tile_load", [
+    AMXIntrinsicOpInterface,
+    MemoryEffects<[MemWrite]>
   ]> {
   let summary = "tile load operation";
   let description = [{
     Loads a tile from memory defined by a base and indices, with the
     shape defined by the 2-dim vector type of the result. This is
     eventually lowered into the "tileloadd" instruction with the
-    corresponding tile configuration.
+    corresponding tile configuration. With memory-effects, each "tileload" 
+    operation serves as a compilation hint to use a separate tile register.
 
     Example:
 

@@ -0,0 +1,32 @@
+// RUN: mlir-opt %s -cse -convert-vector-to-llvm="enable-amx" | FileCheck %s
+
+// With inclusion of memory side-effects, it is expected CSE not to fold multiple 
+// "tileload" and "tilezero".
+// CHECK-LABEL: do_not_fold_tiles(
+// CHECK: llvm.call_intrinsic "llvm.x86.tilezero.internal"
+// CHECK: llvm.call_intrinsic "llvm.x86.tilezero.internal"
+// CHECK: llvm.call_intrinsic "llvm.x86.tileloadd64.internal"
+// CHECK: llvm.call_intrinsic "llvm.x86.tileloadd64.internal"
+// CHECK: llvm.call_intrinsic "llvm.x86.tileloadd64.internal"
+// CHECK: llvm.call_intrinsic "llvm.x86.tileloadd64.internal"
+func.func @do_not_fold_tiles(%arg0: memref<2x32x32xbf16>, %arg1: memref<2x16x32xbf16>) -> memref<16x32xf32> {
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c16 = arith.constant 16 : index
+  %alloca = memref.alloca() : memref<16x32xf32>
+  %0 = amx.tile_zero : !amx.tile<16x16xf32>
+  %1 = amx.tile_zero : !amx.tile<16x16xf32>
+  %2:2 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %0, %arg4 = %1) -> (!amx.tile<16x16xf32>, !amx.tile<16x16xf32>) {
+    %3 = amx.tile_load %arg0[%arg2, %c0, %c0] : memref<2x32x32xbf16> into !amx.tile<16x32xbf16>
+    %4 = amx.tile_load %arg0[%arg2, %c16, %c0] : memref<2x32x32xbf16> into !amx.tile<16x32xbf16>
+    %5 = amx.tile_load %arg1[%arg2, %c0, %c0] : memref<2x16x32xbf16> into !amx.tile<16x32xbf16>
+    %6 = amx.tile_load %arg1[%arg2, %c0, %c0] : memref<2x16x32xbf16> into !amx.tile<16x32xbf16>
+    %7 = amx.tile_mulf %3, %5, %arg3 : !amx.tile<16x32xbf16>, !amx.tile<16x32xbf16>, !amx.tile<16x16xf32>
+    %8 = amx.tile_mulf %4, %6, %arg4 : !amx.tile<16x32xbf16>, !amx.tile<16x32xbf16>, !amx.tile<16x16xf32>
+    scf.yield %7, %8 : !amx.tile<16x16xf32>, !amx.tile<16x16xf32>
+  }
+  amx.tile_store %alloca[%c0, %c0], %2#0 : memref<16x32xf32>, !amx.tile<16x16xf32>
+  amx.tile_store %alloca[%c0, %c16], %2#1 : memref<16x32xf32>, !amx.tile<16x16xf32>
+  return %alloca : memref<16x32xf32>
+}