-
Notifications
You must be signed in to change notification settings - Fork 14.9k
clang/AMDGPU: Add __builtin_amdgcn_inverse_ballot_w{32,64} #155724
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-clang-codegen @llvm/pr-subscribers-backend-amdgpu Author: Nicolai Hähnle (nhaehnle) ChangesAdd builtins that expose the underlying llvm.amdgcn.inverse.ballot intrinsic that we've had for a while. This allows more explicitly writing code that selects or branches in terms of lane masks, which can lead to better code quality. Full diff: https://github.com/llvm/llvm-project/pull/155724.diff 5 Files Affected:
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index a13e0a5952fe4..2ce60de05fff2 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -5162,6 +5162,23 @@ If no address spaces names are provided, all address spaces are fenced.
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local")
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local", "global")
+__builtin_amdgcn_ballot_w{32,64}
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``__builtin_amdgcn_ballot_w{32,64}`` returns a bitmask that contains its
+boolean argument as a bit for every lane of the current wave that is currently
+active (i.e., that is converged with the executing thread), and a 0 bit for
+every lane that is not active.
+
+The result is uniform, i.e. it is the same in every active thread of the wave.
+
+__builtin_amdgcn_inverse_ballot_w{32,64}
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Given a wave-uniform bitmask, ``__builtin_amdgcn_inverse_ballot_w{32,64}(mask)``
+returns the bit at the position of the current lane. It is almost equivalent to
+``(mask & (1 << lane_id)) != 0``, except that its behavior is only defined if
+the given mask has the same value for all active lanes of the current wave.
ARM/AArch64 Language Extensions
-------------------------------
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index f8f55772db8fe..6f5d1e024b91d 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -183,6 +183,9 @@ TARGET_BUILTIN(__builtin_amdgcn_struct_ptr_buffer_load_lds, "vQbv*3IUiiiiIiIi",
TARGET_BUILTIN(__builtin_amdgcn_ballot_w32, "ZUib", "nc", "wavefrontsize32")
BUILTIN(__builtin_amdgcn_ballot_w64, "WUib", "nc")
+TARGET_BUILTIN(__builtin_amdgcn_inverse_ballot_w32, "bZUi", "nc", "wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_inverse_ballot_w64, "bWUi", "nc", "wavefrontsize64")
+
// Deprecated intrinsics in favor of __builtin_amdgn_ballot_{w32|w64}
BUILTIN(__builtin_amdgcn_uicmp, "WUiUiUiIi", "nc")
BUILTIN(__builtin_amdgcn_uicmpl, "WUiWUiWUiIi", "nc")
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index dad1f95ac710d..ac674f3cd59e8 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -504,6 +504,12 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType });
return Builder.CreateCall(F, { Src });
}
+ case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w32:
+ case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w64: {
+ llvm::Value *Src = EmitScalarExpr(E->getArg(0));
+ Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_inverse_ballot, { Src->getType() });
+ return Builder.CreateCall(F, { Src });
+ }
case AMDGPU::BI__builtin_amdgcn_tanhf:
case AMDGPU::BI__builtin_amdgcn_tanhh:
case AMDGPU::BI__builtin_amdgcn_tanh_bf16:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
index 5e587cb87e073..d390418523694 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
@@ -24,6 +24,13 @@ void test_ballot_wave32_target_attr(global uint* out, int a, int b)
*out = __builtin_amdgcn_ballot_w32(a == b);
}
+// CHECK-LABEL: @test_inverse_ballot_wave32(
+// CHECK: call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %{{.+}})
+void test_inverse_ballot_wave32(global bool* out, int a)
+{
+ *out = __builtin_amdgcn_inverse_ballot_w32(a);
+}
+
// CHECK-LABEL: @test_read_exec(
// CHECK: call i32 @llvm.amdgcn.ballot.i32(i1 true)
void test_read_exec(global uint* out) {
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
index 1fc2ac0d3141e..d851ec7e6734f 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
@@ -23,6 +23,13 @@ void test_ballot_wave64_target_attr(global ulong* out, int a, int b)
*out = __builtin_amdgcn_ballot_w64(a == b);
}
+// CHECK-LABEL: @test_inverse_ballot_wave64(
+// CHECK: call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %{{.+}})
+void test_inverse_ballot_wave64(global bool* out, ulong a)
+{
+ *out = __builtin_amdgcn_inverse_ballot_w64(a);
+}
+
// CHECK-LABEL: @test_read_exec(
// CHECK: call i64 @llvm.amdgcn.ballot.i64(i1 true)
void test_read_exec(global ulong* out) {
|
@llvm/pr-subscribers-clang Author: Nicolai Hähnle (nhaehnle) ChangesAdd builtins that expose the underlying llvm.amdgcn.inverse.ballot intrinsic that we've had for a while. This allows more explicitly writing code that selects or branches in terms of lane masks, which can lead to better code quality. Full diff: https://github.com/llvm/llvm-project/pull/155724.diff 5 Files Affected:
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index a13e0a5952fe4..2ce60de05fff2 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -5162,6 +5162,23 @@ If no address spaces names are provided, all address spaces are fenced.
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local")
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local", "global")
+__builtin_amdgcn_ballot_w{32,64}
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``__builtin_amdgcn_ballot_w{32,64}`` returns a bitmask that contains its
+boolean argument as a bit for every lane of the current wave that is currently
+active (i.e., that is converged with the executing thread), and a 0 bit for
+every lane that is not active.
+
+The result is uniform, i.e. it is the same in every active thread of the wave.
+
+__builtin_amdgcn_inverse_ballot_w{32,64}
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Given a wave-uniform bitmask, ``__builtin_amdgcn_inverse_ballot_w{32,64}(mask)``
+returns the bit at the position of the current lane. It is almost equivalent to
+``(mask & (1 << lane_id)) != 0``, except that its behavior is only defined if
+the given mask has the same value for all active lanes of the current wave.
ARM/AArch64 Language Extensions
-------------------------------
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index f8f55772db8fe..6f5d1e024b91d 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -183,6 +183,9 @@ TARGET_BUILTIN(__builtin_amdgcn_struct_ptr_buffer_load_lds, "vQbv*3IUiiiiIiIi",
TARGET_BUILTIN(__builtin_amdgcn_ballot_w32, "ZUib", "nc", "wavefrontsize32")
BUILTIN(__builtin_amdgcn_ballot_w64, "WUib", "nc")
+TARGET_BUILTIN(__builtin_amdgcn_inverse_ballot_w32, "bZUi", "nc", "wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_inverse_ballot_w64, "bWUi", "nc", "wavefrontsize64")
+
// Deprecated intrinsics in favor of __builtin_amdgn_ballot_{w32|w64}
BUILTIN(__builtin_amdgcn_uicmp, "WUiUiUiIi", "nc")
BUILTIN(__builtin_amdgcn_uicmpl, "WUiWUiWUiIi", "nc")
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index dad1f95ac710d..ac674f3cd59e8 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -504,6 +504,12 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType });
return Builder.CreateCall(F, { Src });
}
+ case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w32:
+ case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w64: {
+ llvm::Value *Src = EmitScalarExpr(E->getArg(0));
+ Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_inverse_ballot, { Src->getType() });
+ return Builder.CreateCall(F, { Src });
+ }
case AMDGPU::BI__builtin_amdgcn_tanhf:
case AMDGPU::BI__builtin_amdgcn_tanhh:
case AMDGPU::BI__builtin_amdgcn_tanh_bf16:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
index 5e587cb87e073..d390418523694 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
@@ -24,6 +24,13 @@ void test_ballot_wave32_target_attr(global uint* out, int a, int b)
*out = __builtin_amdgcn_ballot_w32(a == b);
}
+// CHECK-LABEL: @test_inverse_ballot_wave32(
+// CHECK: call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %{{.+}})
+void test_inverse_ballot_wave32(global bool* out, int a)
+{
+ *out = __builtin_amdgcn_inverse_ballot_w32(a);
+}
+
// CHECK-LABEL: @test_read_exec(
// CHECK: call i32 @llvm.amdgcn.ballot.i32(i1 true)
void test_read_exec(global uint* out) {
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
index 1fc2ac0d3141e..d851ec7e6734f 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
@@ -23,6 +23,13 @@ void test_ballot_wave64_target_attr(global ulong* out, int a, int b)
*out = __builtin_amdgcn_ballot_w64(a == b);
}
+// CHECK-LABEL: @test_inverse_ballot_wave64(
+// CHECK: call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %{{.+}})
+void test_inverse_ballot_wave64(global bool* out, ulong a)
+{
+ *out = __builtin_amdgcn_inverse_ballot_w64(a);
+}
+
// CHECK-LABEL: @test_read_exec(
// CHECK: call i64 @llvm.amdgcn.ballot.i64(i1 true)
void test_read_exec(global ulong* out) {
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
Add builtins that expose the underlying llvm.amdgcn.inverse.ballot intrinsic that we've had for a while. This allows more explicitly writing code that selects or branches in terms of lane masks, which can lead to better code quality.
efd32dd
to
cbebbcc
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should have introduced a wavesize_t and have one unsuffixed builtin, but this is what we are doing in the other cases already
Add builtins that expose the underlying llvm.amdgcn.inverse.ballot intrinsic that we've had for a while.
This allows more explicitly writing code that selects or branches in terms of lane masks, which can lead to better code quality.