-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[MLIR][NVVM][NVGPU] Combine prefetch and prefetch.tensormap #153134
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[MLIR][NVVM][NVGPU] Combine prefetch and prefetch.tensormap #153134
Conversation
@llvm/pr-subscribers-mlir-gpu @llvm/pr-subscribers-mlir Author: Srinivasa Ravi (Wolfram70) — Changes: This PR combines the `prefetch` and `prefetch.tensormap` NVVM Ops into one `prefetch` Op; the `tensormap` variant is lowered through the newly added intrinsics. PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu Full diff: https://github.com/llvm/llvm-project/pull/153134.diff — 4 Files Affected:
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 8d507268a3a15..3112fbd68c54a 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -25,9 +25,11 @@ include "mlir/Dialect/LLVMIR/LLVMTypes.td"
def LLVM_PointerGeneric : LLVM_PointerInAddressSpace<0>;
def LLVM_PointerGlobal : LLVM_PointerInAddressSpace<1>;
def LLVM_PointerShared : LLVM_PointerInAddressSpace<3>;
+def LLVM_PointerConst : LLVM_PointerInAddressSpace<4>;
def LLVM_PointerLocal : LLVM_PointerInAddressSpace<5>;
def LLVM_PointerTensor : LLVM_PointerInAddressSpace<6>;
def LLVM_PointerSharedCluster : LLVM_PointerInAddressSpace<7>;
+def LLVM_PointerParam : LLVM_PointerInAddressSpace<101>;
//===----------------------------------------------------------------------===//
// NVVM dialect definitions
@@ -2464,15 +2466,30 @@ def NVVM_PrefetchOp : NVVM_Op<"prefetch"> {
}];
}
-def NVVM_PrefetchTensorMapOp : NVVM_Op<"prefetch.tensormap",
- [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>]>,
- Arguments<(ins LLVM_AnyPointer:$tmaDescriptor, PtxPredicate:$predicate)> {
- let assemblyFormat = "$tmaDescriptor (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)";
+def NVVM_PrefetchTensorMapOp : NVVM_Op<"prefetch.tensormap",
+ [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>, NVVMRequiresSM<90>]> {
+ let summary = "Brings the cache line containing an address from `const` or `param` state space for subsequent use by the `cp.async.bulk.tensor` instruction";
+ let description = [{
+ Operand `addr` can be a const, param or generic address pointer. If it is a
+ generic address pointer, it must map to a const or param memory location.
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu)
+ }];
+ let arguments = (ins AnyTypeOf<[LLVM_PointerGeneric,
+ LLVM_PointerConst,
+ LLVM_PointerParam]>:$addr,
+ PtxPredicate:$predicate);
+ let assemblyFormat = "$addr (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)";
+
+ let extraClassDeclaration = "bool hasIntrinsic() { return !getPredicate(); }";
let extraClassDefinition = [{
- std::string $cppClass::getPtx() {
+ std::string $cppClass::getPtx() {
return std::string("prefetch.tensormap [%0];");
}
}];
+ let llvmBuilder = [{
+ createIntrinsicCall(builder, llvm::Intrinsic::nvvm_prefetch_tensormap, {$addr}, {$addr->getType()});
+ }];
}
def NVVM_CpAsyncBulkPrefetchOp : NVVM_Op<"cp.async.bulk.prefetch"> {
diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
index e50576722e38c..956ae113ba020 100644
--- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
+++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
@@ -582,7 +582,7 @@ func.func @elect_one_leader_sync() {
// CHECK-LABEL: @init_mbarrier_arrive_expect_tx
llvm.func @init_mbarrier_arrive_expect_tx(%desc : !llvm.ptr, %pred : i1) {
- //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "prefetch.tensormap [$0];", "l"
+ //CHECK: nvvm.prefetch.tensormap %{{.*}}
nvvm.prefetch.tensormap %desc : !llvm.ptr
//CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$1 prefetch.tensormap [$0];", "l,b"
nvvm.prefetch.tensormap %desc, predicate = %pred : !llvm.ptr, i1
diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir
index c7fa41c98ac92..892bb2b13165a 100644
--- a/mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -619,6 +619,17 @@ func.func @prefetch(%gen_ptr: !llvm.ptr, %local_ptr: !llvm.ptr<5>, %global_ptr:
return
}
+// CHECK-LABEL: @prefetch_tensormap
+func.func @prefetch_tensormap(%gen_ptr: !llvm.ptr, %const_ptr: !llvm.ptr<4>, %param_ptr: !llvm.ptr<101>) {
+ // CHECK: nvvm.prefetch.tensormap %{{.*}}
+ nvvm.prefetch.tensormap %gen_ptr : !llvm.ptr
+ // CHECK: nvvm.prefetch.tensormap %{{.*}}
+ nvvm.prefetch.tensormap %const_ptr : !llvm.ptr<4>
+ // CHECK: nvvm.prefetch.tensormap %{{.*}}
+ nvvm.prefetch.tensormap %param_ptr : !llvm.ptr<101>
+ return
+}
+
// -----
// Just check these don't emit errors.
diff --git a/mlir/test/Target/LLVMIR/nvvm/prefetch.mlir b/mlir/test/Target/LLVMIR/nvvm/prefetch.mlir
index f38b7529a7233..c29f27915946c 100644
--- a/mlir/test/Target/LLVMIR/nvvm/prefetch.mlir
+++ b/mlir/test/Target/LLVMIR/nvvm/prefetch.mlir
@@ -45,3 +45,16 @@ llvm.func @prefetch_L1_uniform(%gen_ptr: !llvm.ptr) {
nvvm.prefetch level = L1 uniform, %gen_ptr : !llvm.ptr
llvm.return
}
+
+llvm.func @prefetch_tensormap(%gen_ptr: !llvm.ptr, %const_ptr: !llvm.ptr<4>, %param_ptr: !llvm.ptr<101>) {
+ // CHECK-LABEL: define void @prefetch_tensormap(ptr %0, ptr addrspace(4) %1, ptr addrspace(101) %2) {
+ // CHECK-NEXT: call void @llvm.nvvm.prefetch.tensormap.p0(ptr %0)
+ // CHECK-NEXT: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %1)
+ // CHECK-NEXT: call void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %2)
+ // CHECK-NEXT: ret void
+ // CHECK-NEXT: }
+ nvvm.prefetch.tensormap %gen_ptr : !llvm.ptr
+ nvvm.prefetch.tensormap %const_ptr: !llvm.ptr<4>
+ nvvm.prefetch.tensormap %param_ptr: !llvm.ptr<101>
+ llvm.return
+}
|
@llvm/pr-subscribers-mlir-llvm Author: Srinivasa Ravi (Wolfram70) — Changes: This PR combines the `prefetch` and `prefetch.tensormap` NVVM Ops into one `prefetch` Op; the `tensormap` variant is lowered through the newly added intrinsics. PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu Full diff: https://github.com/llvm/llvm-project/pull/153134.diff — 4 Files Affected:
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 8d507268a3a15..3112fbd68c54a 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -25,9 +25,11 @@ include "mlir/Dialect/LLVMIR/LLVMTypes.td"
def LLVM_PointerGeneric : LLVM_PointerInAddressSpace<0>;
def LLVM_PointerGlobal : LLVM_PointerInAddressSpace<1>;
def LLVM_PointerShared : LLVM_PointerInAddressSpace<3>;
+def LLVM_PointerConst : LLVM_PointerInAddressSpace<4>;
def LLVM_PointerLocal : LLVM_PointerInAddressSpace<5>;
def LLVM_PointerTensor : LLVM_PointerInAddressSpace<6>;
def LLVM_PointerSharedCluster : LLVM_PointerInAddressSpace<7>;
+def LLVM_PointerParam : LLVM_PointerInAddressSpace<101>;
//===----------------------------------------------------------------------===//
// NVVM dialect definitions
@@ -2464,15 +2466,30 @@ def NVVM_PrefetchOp : NVVM_Op<"prefetch"> {
}];
}
-def NVVM_PrefetchTensorMapOp : NVVM_Op<"prefetch.tensormap",
- [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>]>,
- Arguments<(ins LLVM_AnyPointer:$tmaDescriptor, PtxPredicate:$predicate)> {
- let assemblyFormat = "$tmaDescriptor (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)";
+def NVVM_PrefetchTensorMapOp : NVVM_Op<"prefetch.tensormap",
+ [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>, NVVMRequiresSM<90>]> {
+ let summary = "Brings the cache line containing an address from `const` or `param` state space for subsequent use by the `cp.async.bulk.tensor` instruction";
+ let description = [{
+ Operand `addr` can be a const, param or generic address pointer. If it is a
+ generic address pointer, it must map to a const or param memory location.
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu)
+ }];
+ let arguments = (ins AnyTypeOf<[LLVM_PointerGeneric,
+ LLVM_PointerConst,
+ LLVM_PointerParam]>:$addr,
+ PtxPredicate:$predicate);
+ let assemblyFormat = "$addr (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)";
+
+ let extraClassDeclaration = "bool hasIntrinsic() { return !getPredicate(); }";
let extraClassDefinition = [{
- std::string $cppClass::getPtx() {
+ std::string $cppClass::getPtx() {
return std::string("prefetch.tensormap [%0];");
}
}];
+ let llvmBuilder = [{
+ createIntrinsicCall(builder, llvm::Intrinsic::nvvm_prefetch_tensormap, {$addr}, {$addr->getType()});
+ }];
}
def NVVM_CpAsyncBulkPrefetchOp : NVVM_Op<"cp.async.bulk.prefetch"> {
diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
index e50576722e38c..956ae113ba020 100644
--- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
+++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
@@ -582,7 +582,7 @@ func.func @elect_one_leader_sync() {
// CHECK-LABEL: @init_mbarrier_arrive_expect_tx
llvm.func @init_mbarrier_arrive_expect_tx(%desc : !llvm.ptr, %pred : i1) {
- //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "prefetch.tensormap [$0];", "l"
+ //CHECK: nvvm.prefetch.tensormap %{{.*}}
nvvm.prefetch.tensormap %desc : !llvm.ptr
//CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$1 prefetch.tensormap [$0];", "l,b"
nvvm.prefetch.tensormap %desc, predicate = %pred : !llvm.ptr, i1
diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir
index c7fa41c98ac92..892bb2b13165a 100644
--- a/mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -619,6 +619,17 @@ func.func @prefetch(%gen_ptr: !llvm.ptr, %local_ptr: !llvm.ptr<5>, %global_ptr:
return
}
+// CHECK-LABEL: @prefetch_tensormap
+func.func @prefetch_tensormap(%gen_ptr: !llvm.ptr, %const_ptr: !llvm.ptr<4>, %param_ptr: !llvm.ptr<101>) {
+ // CHECK: nvvm.prefetch.tensormap %{{.*}}
+ nvvm.prefetch.tensormap %gen_ptr : !llvm.ptr
+ // CHECK: nvvm.prefetch.tensormap %{{.*}}
+ nvvm.prefetch.tensormap %const_ptr : !llvm.ptr<4>
+ // CHECK: nvvm.prefetch.tensormap %{{.*}}
+ nvvm.prefetch.tensormap %param_ptr : !llvm.ptr<101>
+ return
+}
+
// -----
// Just check these don't emit errors.
diff --git a/mlir/test/Target/LLVMIR/nvvm/prefetch.mlir b/mlir/test/Target/LLVMIR/nvvm/prefetch.mlir
index f38b7529a7233..c29f27915946c 100644
--- a/mlir/test/Target/LLVMIR/nvvm/prefetch.mlir
+++ b/mlir/test/Target/LLVMIR/nvvm/prefetch.mlir
@@ -45,3 +45,16 @@ llvm.func @prefetch_L1_uniform(%gen_ptr: !llvm.ptr) {
nvvm.prefetch level = L1 uniform, %gen_ptr : !llvm.ptr
llvm.return
}
+
+llvm.func @prefetch_tensormap(%gen_ptr: !llvm.ptr, %const_ptr: !llvm.ptr<4>, %param_ptr: !llvm.ptr<101>) {
+ // CHECK-LABEL: define void @prefetch_tensormap(ptr %0, ptr addrspace(4) %1, ptr addrspace(101) %2) {
+ // CHECK-NEXT: call void @llvm.nvvm.prefetch.tensormap.p0(ptr %0)
+ // CHECK-NEXT: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %1)
+ // CHECK-NEXT: call void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %2)
+ // CHECK-NEXT: ret void
+ // CHECK-NEXT: }
+ nvvm.prefetch.tensormap %gen_ptr : !llvm.ptr
+ nvvm.prefetch.tensormap %const_ptr: !llvm.ptr<4>
+ nvvm.prefetch.tensormap %param_ptr: !llvm.ptr<101>
+ llvm.return
+}
|
Except for a small ask in the naming, LGTM |
a80b9d2
to
593fab2
Compare
@grypp Could you take another look at this PR? Thanks! |
593fab2
to
6b69dfb
Compare
80b0665
to
b68e9c3
Compare
b68e9c3
to
9c76c93
Compare
✅ With the latest revision this PR passed the C/C++ code formatter. |
@grypp @durga4github I have combined the `prefetch` and `prefetch.tensormap` NVVM Ops into a single `prefetch` Op as discussed. |
9c76c93
to
b6473b2
Compare
c3042bf
to
7f31020
Compare
7f31020
to
7890702
Compare
This change combines the `prefetch` and `prefetch.tensormap` NVVM Ops to one `prefetch` Op. The `tensormap` variant is lowered through the newly added intrinsics. The lowering of the NVGPU `tma.prefetch.descriptor` Op is changed from lowering to the `prefetch.tensormap` Op to `prefetch`.
7890702
to
3a12ed4
Compare
@grypp Is this PR okay to land? |
This PR combines the `prefetch` and `prefetch.tensormap` NVVM Ops into one `prefetch` Op. The `tensormap` variant is lowered through the newly added intrinsics.

The lowering of the NVGPU `tma.prefetch.descriptor` Op is changed from lowering to the `prefetch.tensormap` Op to `prefetch`.

PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu