Conversation

Wolfram70
Contributor

@Wolfram70 Wolfram70 commented Aug 12, 2025

This PR combines the `prefetch` and `prefetch.tensormap` NVVM Ops
into a single `prefetch` Op. The `tensormap` variant is lowered
through the newly added intrinsics.

The lowering of the NVGPU `tma.prefetch.descriptor` Op is changed
to target the unified `prefetch` Op instead of `prefetch.tensormap`.

PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu
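
To illustrate the intended lowering (a sketch assembled from the test updates in this PR's diff; the exact final syntax may differ between revisions), the tensormap prefetch accepts generic, `const`, and `param` address-space pointers, each lowering to the matching overload of the new intrinsic:

```mlir
// Sketch based on the tests in this PR: each address space lowers
// to the corresponding overload of the new intrinsic.
nvvm.prefetch.tensormap %gen_ptr : !llvm.ptr        // -> @llvm.nvvm.prefetch.tensormap.p0
nvvm.prefetch.tensormap %const_ptr : !llvm.ptr<4>   // -> @llvm.nvvm.prefetch.tensormap.p4
nvvm.prefetch.tensormap %param_ptr : !llvm.ptr<101> // -> @llvm.nvvm.prefetch.tensormap.p101
```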

@llvmbot
Member

llvmbot commented Aug 12, 2025

@llvm/pr-subscribers-mlir-gpu

@llvm/pr-subscribers-mlir

Author: Srinivasa Ravi (Wolfram70)

Changes

This PR updates the prefetch.tensormap NVVM Op to lower through the llvm.nvvm.prefetch.tensormap intrinsics.

PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu


Full diff: https://github.com/llvm/llvm-project/pull/153134.diff

4 Files Affected:

  • (modified) mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td (+22-5)
  • (modified) mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir (+1-1)
  • (modified) mlir/test/Dialect/LLVMIR/nvvm.mlir (+11)
  • (modified) mlir/test/Target/LLVMIR/nvvm/prefetch.mlir (+13)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 8d507268a3a15..3112fbd68c54a 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -25,9 +25,11 @@ include "mlir/Dialect/LLVMIR/LLVMTypes.td"
 def LLVM_PointerGeneric : LLVM_PointerInAddressSpace<0>;
 def LLVM_PointerGlobal : LLVM_PointerInAddressSpace<1>;
 def LLVM_PointerShared : LLVM_PointerInAddressSpace<3>;
+def LLVM_PointerConst : LLVM_PointerInAddressSpace<4>;
 def LLVM_PointerLocal : LLVM_PointerInAddressSpace<5>;
 def LLVM_PointerTensor : LLVM_PointerInAddressSpace<6>;
 def LLVM_PointerSharedCluster : LLVM_PointerInAddressSpace<7>;
+def LLVM_PointerParam : LLVM_PointerInAddressSpace<101>;
 
 //===----------------------------------------------------------------------===//
 // NVVM dialect definitions
@@ -2464,15 +2466,30 @@ def NVVM_PrefetchOp : NVVM_Op<"prefetch"> {
   }];
 }
 
-def NVVM_PrefetchTensorMapOp : NVVM_Op<"prefetch.tensormap",
-                    [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>]>,
-  Arguments<(ins LLVM_AnyPointer:$tmaDescriptor, PtxPredicate:$predicate)> {
-  let assemblyFormat = "$tmaDescriptor (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)";
+def NVVM_PrefetchTensorMapOp : NVVM_Op<"prefetch.tensormap", 
+    [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>, NVVMRequiresSM<90>]> {
+  let summary = "Brings the cache line containing an address from `const` or `param` state space for subsequent use by the `cp.async.bulk.tensor` instruction";
+  let description = [{
+    Operand `addr` can be a const, param or generic address pointer. If it is a 
+    generic address pointer, it must map to a const or param memory location.
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu)
+  }];
+  let arguments = (ins AnyTypeOf<[LLVM_PointerGeneric,
+                                  LLVM_PointerConst,
+                                  LLVM_PointerParam]>:$addr,
+                       PtxPredicate:$predicate);
+  let assemblyFormat = "$addr (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)";
+
+  let extraClassDeclaration = "bool hasIntrinsic() { return !getPredicate(); }";
   let extraClassDefinition = [{
-    std::string $cppClass::getPtx() { 
+    std::string $cppClass::getPtx() {
       return std::string("prefetch.tensormap [%0];");
     }
   }];
+  let llvmBuilder = [{
+    createIntrinsicCall(builder, llvm::Intrinsic::nvvm_prefetch_tensormap, {$addr}, {$addr->getType()});
+  }];
 }
 
 def NVVM_CpAsyncBulkPrefetchOp : NVVM_Op<"cp.async.bulk.prefetch"> {
diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
index e50576722e38c..956ae113ba020 100644
--- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
+++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
@@ -582,7 +582,7 @@ func.func @elect_one_leader_sync() {
 
 // CHECK-LABEL: @init_mbarrier_arrive_expect_tx
 llvm.func @init_mbarrier_arrive_expect_tx(%desc : !llvm.ptr, %pred : i1) {
-  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "prefetch.tensormap [$0];", "l"
+  //CHECK: nvvm.prefetch.tensormap %{{.*}}
   nvvm.prefetch.tensormap %desc : !llvm.ptr
   //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$1 prefetch.tensormap [$0];", "l,b"
   nvvm.prefetch.tensormap %desc, predicate = %pred : !llvm.ptr, i1
diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir
index c7fa41c98ac92..892bb2b13165a 100644
--- a/mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -619,6 +619,17 @@ func.func @prefetch(%gen_ptr: !llvm.ptr, %local_ptr: !llvm.ptr<5>, %global_ptr:
   return
 }
 
+// CHECK-LABEL: @prefetch_tensormap
+func.func @prefetch_tensormap(%gen_ptr: !llvm.ptr, %const_ptr: !llvm.ptr<4>, %param_ptr: !llvm.ptr<101>) {
+  // CHECK:   nvvm.prefetch.tensormap %{{.*}}
+  nvvm.prefetch.tensormap %gen_ptr : !llvm.ptr
+  // CHECK:   nvvm.prefetch.tensormap %{{.*}}
+  nvvm.prefetch.tensormap %const_ptr : !llvm.ptr<4>
+  // CHECK:   nvvm.prefetch.tensormap %{{.*}}
+  nvvm.prefetch.tensormap %param_ptr : !llvm.ptr<101>
+  return
+}
+
 // -----
 
 // Just check these don't emit errors.
diff --git a/mlir/test/Target/LLVMIR/nvvm/prefetch.mlir b/mlir/test/Target/LLVMIR/nvvm/prefetch.mlir
index f38b7529a7233..c29f27915946c 100644
--- a/mlir/test/Target/LLVMIR/nvvm/prefetch.mlir
+++ b/mlir/test/Target/LLVMIR/nvvm/prefetch.mlir
@@ -45,3 +45,16 @@ llvm.func @prefetch_L1_uniform(%gen_ptr: !llvm.ptr) {
   nvvm.prefetch level = L1 uniform, %gen_ptr : !llvm.ptr
   llvm.return
 }
+
+llvm.func @prefetch_tensormap(%gen_ptr: !llvm.ptr, %const_ptr: !llvm.ptr<4>, %param_ptr: !llvm.ptr<101>) {
+  // CHECK-LABEL: define void @prefetch_tensormap(ptr %0, ptr addrspace(4) %1, ptr addrspace(101) %2) {
+  // CHECK-NEXT: call void @llvm.nvvm.prefetch.tensormap.p0(ptr %0)
+  // CHECK-NEXT: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %1)
+  // CHECK-NEXT: call void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %2)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  nvvm.prefetch.tensormap %gen_ptr : !llvm.ptr
+  nvvm.prefetch.tensormap %const_ptr: !llvm.ptr<4>
+  nvvm.prefetch.tensormap %param_ptr: !llvm.ptr<101>
+  llvm.return
+}

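As the TableGen changes above show, the `hasIntrinsic()` hook gates the two lowering paths: unpredicated ops take the new intrinsic route, while predicated ops still go through the `BasicPtxBuilder` inline-PTX path (a sketch based on the conversion test in this diff):

```mlir
// Unpredicated: hasIntrinsic() is true (no predicate operand), so
// the op survives NVVMToLLVM conversion and is lowered via the
// llvm.nvvm.prefetch.tensormap intrinsic.
nvvm.prefetch.tensormap %desc : !llvm.ptr

// Predicated: still lowered through inline PTX, since the
// intrinsic has no predicated form:
//   llvm.inline_asm "@$1 prefetch.tensormap [$0];", "l,b"
nvvm.prefetch.tensormap %desc, predicate = %pred : !llvm.ptr, i1
```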
@llvmbot
Member

llvmbot commented Aug 12, 2025

@llvm/pr-subscribers-mlir-llvm

(The remainder of this comment repeats the changes summary and full diff shown above verbatim.)
@durga4github
Contributor

Except for a small ask in the naming, LGTM

@Wolfram70 Wolfram70 force-pushed the dev/Wolfram70/mlir-prefetch-tensormap branch 2 times, most recently from a80b9d2 to 593fab2 on August 12, 2025 08:50
@Wolfram70
Contributor Author

@grypp Could you take another look at this PR? Thanks!

@Wolfram70 Wolfram70 force-pushed the dev/Wolfram70/mlir-prefetch-tensormap branch from 593fab2 to 6b69dfb on August 14, 2025 16:19
@Wolfram70 Wolfram70 requested a review from grypp on August 14, 2025 16:22
@Wolfram70 Wolfram70 force-pushed the dev/Wolfram70/mlir-prefetch-tensormap branch 2 times, most recently from 80b0665 to b68e9c3 on August 18, 2025 05:07
@Wolfram70 Wolfram70 force-pushed the dev/Wolfram70/mlir-prefetch-tensormap branch from b68e9c3 to 9c76c93 on August 21, 2025 07:12
@Wolfram70 Wolfram70 changed the title from "[MLIR][NVVM] Update prefetch.tensormap Op" to "[MLIR][NVVM][NVGPU] Combine prefetch and prefetch.tensormap" on Aug 21, 2025

github-actions bot commented Aug 21, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@Wolfram70
Contributor Author

@grypp @durga4github I have combined prefetch and prefetch.tensormap, please take a look, thanks!

@Wolfram70 Wolfram70 force-pushed the dev/Wolfram70/mlir-prefetch-tensormap branch from 9c76c93 to b6473b2 on August 21, 2025 08:08
@Wolfram70 Wolfram70 force-pushed the dev/Wolfram70/mlir-prefetch-tensormap branch 2 times, most recently from c3042bf to 7f31020 on August 22, 2025 06:10
@Wolfram70 Wolfram70 force-pushed the dev/Wolfram70/mlir-prefetch-tensormap branch from 7f31020 to 7890702 on August 22, 2025 10:31
This change combines the `prefetch` and `prefetch.tensormap` NVVM Ops
into a single `prefetch` Op. The `tensormap` variant is lowered through the
newly added intrinsics.

The lowering of the NVGPU `tma.prefetch.descriptor` Op is changed
to target the unified `prefetch` Op instead of `prefetch.tensormap`.
@Wolfram70 Wolfram70 force-pushed the dev/Wolfram70/mlir-prefetch-tensormap branch from 7890702 to 3a12ed4 on August 25, 2025 05:18
@Wolfram70
Contributor Author

@grypp Is this PR okay to land?


4 participants