-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[AMDGCN] Add missing gfx1250 clang tests. NFC. #155478
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGCN] Add missing gfx1250 clang tests. NFC. #155478
Conversation
@llvm/pr-subscribers-clang-driver Author: Stanislav Mekhanoshin (rampitec) ChangesPatch is 21.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/155478.diff 5 Files Affected:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl
index f300b05fe798a..cdfe9fcd89091 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl
@@ -1,6 +1,7 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s
typedef float v2f __attribute__((ext_vector_type(2)));
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 4ff0571239e71..23af19d8ad950 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -58,6 +58,58 @@ void test_s_wait_tensorcnt() {
__builtin_amdgcn_s_wait_tensorcnt(0);
}
+// CHECK-LABEL: @test_bitop3_b32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.bitop3.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 1)
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
+// CHECK-NEXT: ret void
+//
+void test_bitop3_b32(global uint* out, uint a, uint b, uint c) {
+ *out = __builtin_amdgcn_bitop3_b32(a, b, c, 1);
+}
+
+// CHECK-LABEL: @test_bitop3_b16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i16 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: store i16 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 2
+// CHECK-NEXT: store i16 [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[B_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[C_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.bitop3.i16(i16 [[TMP0]], i16 [[TMP1]], i16 [[TMP2]], i32 1)
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i16 [[TMP3]], ptr addrspace(1) [[TMP4]], align 2
+// CHECK-NEXT: ret void
+//
+void test_bitop3_b16(global ushort* out, ushort a, ushort b, ushort c) {
+ *out = __builtin_amdgcn_bitop3_b16(a, b, c, 1);
+}
+
// CHECK-LABEL: @test_prng_b32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
@@ -1258,6 +1310,145 @@ void test_prefetch(generic void *fptr, global void *gptr) {
__builtin_amdgcn_global_prefetch(gptr, 8);
}
+// CHECK-LABEL: @test_global_add_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[X_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT: [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr
+// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store float [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], float [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4:![0-9]+]], !amdgpu.ignore.denormal.mode [[META4]]
+// CHECK-NEXT: ret float [[TMP2]]
+//
+float test_global_add_f32(global float *addr, float x) {
+ return __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
+}
+
+// CHECK-LABEL: @test_global_add_half2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[X_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT: [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr
+// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x half> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// CHECK-NEXT: ret <2 x half> [[TMP2]]
+//
+half2 test_global_add_half2(global half2 *addr, half2 x) {
+ return __builtin_amdgcn_global_atomic_fadd_v2f16(addr, x);
+}
+
+// CHECK-LABEL: @test_flat_add_2f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[X_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT: [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr
+// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x half> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// CHECK-NEXT: ret <2 x half> [[TMP2]]
+//
+half2 test_flat_add_2f16(generic half2 *addr, half2 x) {
+ return __builtin_amdgcn_flat_atomic_fadd_v2f16(addr, x);
+}
+
+// CHECK-LABEL: @test_flat_add_2bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
+// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[X_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT: [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr
+// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i16> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[TMP1]] to <2 x bfloat>
+// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[TMP3]] to <2 x i16>
+// CHECK-NEXT: ret <2 x i16> [[TMP4]]
+//
+short2 test_flat_add_2bf16(generic short2 *addr, short2 x) {
+ return __builtin_amdgcn_flat_atomic_fadd_v2bf16(addr, x);
+}
+
+// CHECK-LABEL: @test_global_add_2bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
+// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[X_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT: [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr
+// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i16> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[TMP1]] to <2 x bfloat>
+// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[TMP3]] to <2 x i16>
+// CHECK-NEXT: ret <2 x i16> [[TMP4]]
+//
+short2 test_global_add_2bf16(global short2 *addr, short2 x) {
+ return __builtin_amdgcn_global_atomic_fadd_v2bf16(addr, x);
+}
+
+// CHECK-LABEL: @test_local_add_2f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
+// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: [[X_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT: [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr
+// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(3) [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store <2 x i16> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(3), ptr [[ADDR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[TMP1]] to <2 x bfloat>
+// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[TMP3]] to <2 x i16>
+// CHECK-NEXT: ret <2 x i16> [[TMP4]]
+//
+short2 test_local_add_2f16(local short2 *addr, short2 x) {
+ return __builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x);
+}
+
+// CHECK-LABEL: @test_local_add_2bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: [[X_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT: [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr
+// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(3) [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store <2 x half> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(3), ptr [[ADDR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4
+// CHECK-NEXT: ret <2 x half> [[TMP2]]
+//
+half2 test_local_add_2bf16(local half2 *addr, half2 x) {
+ return __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x);
+}
+
// CHECK-LABEL: @test_cvt_pk_fp8_f32_e5m3(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
diff --git a/clang/test/Driver/cuda-bad-arch.cu b/clang/test/Driver/cuda-bad-arch.cu
index 85231a5b9705a..6ac72296049bc 100644
--- a/clang/test/Driver/cuda-bad-arch.cu
+++ b/clang/test/Driver/cuda-bad-arch.cu
@@ -25,6 +25,8 @@
// RUN: | FileCheck -check-prefix OK %s
// RUN: %clang -### -x hip --target=x86_64-linux-gnu -nogpulib -nogpuinc --cuda-gpu-arch=gfx942 -c %s 2>&1 \
// RUN: | FileCheck -check-prefix OK %s
+// RUN: %clang -### -x hip --target=x86_64-linux-gnu -nogpulib -nogpuinc --cuda-gpu-arch=gfx1250 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix OK %s
// We don't allow using NVPTX/AMDGCN for host compilation.
// RUN: not %clang -### --no-offload-new-driver --cuda-host-only --target=nvptx-nvidia-cuda -nogpulib -nogpuinc -c %s 2>&1 \
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
index 8f34cccaecb7a..4a28f9acdecf7 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
@@ -23,6 +23,11 @@ void test_setprio_inc_wg(short a) {
__builtin_amdgcn_s_setprio_inc_wg(a); // expected-error {{'__builtin_amdgcn_s_setprio_inc_wg' must be a constant integer}}
}
+void test_bitop3_args(global uint* out, uint a, uint b, uint c) {
+ *out = __builtin_amdgcn_bitop3_b32(a, b, c, a); // expected-error {{argument to '__builtin_amdgcn_bitop3_b32' must be a constant integer}}
+ *out = __builtin_amdgcn_bitop3_b16((ushort)a, (ushort)b, (ushort)c, a); // expected-error {{argument to '__builtin_amdgcn_bitop3_b16' must be a constant integer}}
+}
+
void test_s_monitor_sleep(short a) {
__builtin_amdgcn_s_monitor_sleep(a); // expected-error {{'__builtin_amdgcn_s_monitor_sleep' must be a constant integer}}
}
@@ -43,6 +48,12 @@ void test__builtin_amdgcn_cvt_f16_bf8(int a, int b) {
__builtin_amdgcn_cvt_f16_bf8(a, b); // expected-error {{'__builtin_amdgcn_cvt_f16_bf8' must be a constant integer}}
}
+void test_cvt_sr_f8_f16(global int* out, uint sr, int old, int sel)
+{
+ *out = __builtin_amdgcn_cvt_sr_bf8_f16(1.0, sr, old, sel); // expected-error {{'__builtin_amdgcn_cvt_sr_bf8_f16' must be a constant integer}}
+ *out = __builtin_amdgcn_cvt_sr_fp8_f16(1.0, sr, old, sel); // expected-error {{'__builtin_amdgcn_cvt_sr_fp8_f16' must be a constant integer}}
+}
+
void test_cvt_scale_pk(global half8 *outh8, global bfloat8 *outy8, uint2 src2,
global float32 *outf32, global half16 *outh16, global bfloat16 *outy16,
global float16 *outf16, uint3 src3,
@@ -92,6 +103,34 @@ void test_amdgcn_load_monitor(global int* b32gaddr, global v2i* b64gaddr, global
*b128out = __builtin_amdgcn_flat_load_monitor_b128(b128faddr, cpol); // expected-error {{'__builtin_amdgcn_flat_load_monitor_b128' must be a constant integer}}
}
+void test_amdgcn_async_load_store_lds_offset(global char* gaddr8, global int *gaddr32, global v2i* gaddr64, global v4i* gaddr128, local char* laddr8,
+ local int *laddr32, local v2i* laddr64, local v4i* laddr128, int offset, int mask)
+{
+ __builtin_amdgcn_global_store_async_from_lds_b8(gaddr8, laddr8, offset, 0); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b8' must be a constant integer}}
+ __builtin_amdgcn_global_store_async_from_lds_b32(gaddr32, laddr32, offset, 0); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b32' must be a constant integer}}
+ __builtin_amdgcn_global_store_async_from_lds_b64(gaddr64, laddr64, offset, 0); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b64' must be a constant integer}}
+ __builtin_amdgcn_global_store_async_from_lds_b128(gaddr128, laddr128, offset, 0); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b128' must be a constant integer}}
+
+ __builtin_amdgcn_global_store_async_from_lds_b8(gaddr8, laddr8, offset, 0); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b8' must be a constant integer}}
+ __builtin_amdgcn_global_store_async_from_lds_b32(gaddr32, laddr32, offset, 0); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b32' must be a constant integer}}
+ __builtin_amdgcn_global_store_async_from_lds_b64(gaddr64, laddr64, offset, 0); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b64' must be a constant integer}}
+ __builtin_amdgcn_global_store_async_from_lds_b128(gaddr128, laddr128, offset, 0); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b128' must be a constant integer}}
+}
+
+void test_amdgcn_async_load_store_lds_cpol(global char* gaddr8, global int *gaddr32, global v2i* gaddr64, global v4i* gaddr128, local char* laddr8,
+ local int *laddr32, local v2i* laddr64, local v4i* laddr128, int cpol, int mask)
+{
+ __builtin_amdgcn_global_store_async_from_lds_b8(gaddr8, laddr8, 16, cpol); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b8' must be a constant integer}}
+ __builtin_amdgcn_global_store_async_from_lds_b32(gaddr32, laddr32, 16, cpol); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b32' must be a constant integer}}
+ __builtin_amdgcn_global_store_async_from_lds_b64(gaddr64, laddr64, 16, cpol); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b64' must be a constant integer}}
+ __builtin_amdgcn_global_store_async_from_lds_b128(gaddr128, laddr128, 16, cpol); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b128' must be a constant integer}}
+
+ __builtin_amdgcn_global_store_async_from_lds_b8(gaddr8, laddr8, 16, cpol); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b8' must be a constant integer}}
+ __builtin_amdgcn_global_store_async_from_lds_b32(gaddr32, laddr32, 16, cpol); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b32' must be a constant integer}}
+ __builtin_amdgcn_global_store_async_from_lds_b64(gaddr64, laddr64, 16, cpol); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b64' must be a constant integer}}
+ __builtin_amdgcn_global_store_async_from_lds_b128(gaddr128, laddr128, 16, cpol); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b128' must be a constant integer}}
+}
+
void test_amdgcn_tensor_load_store(v4i sg0, v...
[truncated]
|
@llvm/pr-subscribers-clang Author: Stanislav Mekhanoshin (rampitec) ChangesPatch is 21.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/155478.diff 5 Files Affected:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl
index f300b05fe798a..cdfe9fcd89091 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl
@@ -1,6 +1,7 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s
typedef float v2f __attribute__((ext_vector_type(2)));
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 4ff0571239e71..23af19d8ad950 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -58,6 +58,58 @@ void test_s_wait_tensorcnt() {
__builtin_amdgcn_s_wait_tensorcnt(0);
}
+// CHECK-LABEL: @test_bitop3_b32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.bitop3.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 1)
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
+// CHECK-NEXT: ret void
+//
+void test_bitop3_b32(global uint* out, uint a, uint b, uint c) {
+ *out = __builtin_amdgcn_bitop3_b32(a, b, c, 1);
+}
+
+// CHECK-LABEL: @test_bitop3_b16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i16 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: store i16 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 2
+// CHECK-NEXT: store i16 [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[B_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[C_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.bitop3.i16(i16 [[TMP0]], i16 [[TMP1]], i16 [[TMP2]], i32 1)
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i16 [[TMP3]], ptr addrspace(1) [[TMP4]], align 2
+// CHECK-NEXT: ret void
+//
+void test_bitop3_b16(global ushort* out, ushort a, ushort b, ushort c) {
+ *out = __builtin_amdgcn_bitop3_b16(a, b, c, 1);
+}
+
// CHECK-LABEL: @test_prng_b32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
@@ -1258,6 +1310,145 @@ void test_prefetch(generic void *fptr, global void *gptr) {
__builtin_amdgcn_global_prefetch(gptr, 8);
}
+// CHECK-LABEL: @test_global_add_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[X_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT: [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr
+// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store float [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], float [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4:![0-9]+]], !amdgpu.ignore.denormal.mode [[META4]]
+// CHECK-NEXT: ret float [[TMP2]]
+//
+float test_global_add_f32(global float *addr, float x) {
+ return __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
+}
+
+// CHECK-LABEL: @test_global_add_half2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[X_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT: [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr
+// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x half> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// CHECK-NEXT: ret <2 x half> [[TMP2]]
+//
+half2 test_global_add_half2(global half2 *addr, half2 x) {
+ return __builtin_amdgcn_global_atomic_fadd_v2f16(addr, x);
+}
+
+// CHECK-LABEL: @test_flat_add_2f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[X_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT: [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr
+// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x half> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// CHECK-NEXT: ret <2 x half> [[TMP2]]
+//
+half2 test_flat_add_2f16(generic half2 *addr, half2 x) {
+ return __builtin_amdgcn_flat_atomic_fadd_v2f16(addr, x);
+}
+
+// CHECK-LABEL: @test_flat_add_2bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
+// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[X_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT: [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr
+// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i16> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[TMP1]] to <2 x bfloat>
+// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[TMP3]] to <2 x i16>
+// CHECK-NEXT: ret <2 x i16> [[TMP4]]
+//
+short2 test_flat_add_2bf16(generic short2 *addr, short2 x) {
+ return __builtin_amdgcn_flat_atomic_fadd_v2bf16(addr, x);
+}
+
+// CHECK-LABEL: @test_global_add_2bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
+// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[X_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT: [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr
+// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i16> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[TMP1]] to <2 x bfloat>
+// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[TMP3]] to <2 x i16>
+// CHECK-NEXT: ret <2 x i16> [[TMP4]]
+//
+short2 test_global_add_2bf16(global short2 *addr, short2 x) {
+ return __builtin_amdgcn_global_atomic_fadd_v2bf16(addr, x);
+}
+
+// CHECK-LABEL: @test_local_add_2f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
+// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: [[X_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT: [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr
+// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(3) [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store <2 x i16> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(3), ptr [[ADDR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[TMP1]] to <2 x bfloat>
+// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[TMP3]] to <2 x i16>
+// CHECK-NEXT: ret <2 x i16> [[TMP4]]
+//
+short2 test_local_add_2f16(local short2 *addr, short2 x) {
+ return __builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x);
+}
+
+// CHECK-LABEL: @test_local_add_2bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: [[X_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT: [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR_ADDR]] to ptr
+// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(3) [[ADDR:%.*]], ptr [[ADDR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store <2 x half> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(3), ptr [[ADDR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[X_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4
+// CHECK-NEXT: ret <2 x half> [[TMP2]]
+//
+half2 test_local_add_2bf16(local half2 *addr, half2 x) {
+ return __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x);
+}
+
// CHECK-LABEL: @test_cvt_pk_fp8_f32_e5m3(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
diff --git a/clang/test/Driver/cuda-bad-arch.cu b/clang/test/Driver/cuda-bad-arch.cu
index 85231a5b9705a..6ac72296049bc 100644
--- a/clang/test/Driver/cuda-bad-arch.cu
+++ b/clang/test/Driver/cuda-bad-arch.cu
@@ -25,6 +25,8 @@
// RUN: | FileCheck -check-prefix OK %s
// RUN: %clang -### -x hip --target=x86_64-linux-gnu -nogpulib -nogpuinc --cuda-gpu-arch=gfx942 -c %s 2>&1 \
// RUN: | FileCheck -check-prefix OK %s
+// RUN: %clang -### -x hip --target=x86_64-linux-gnu -nogpulib -nogpuinc --cuda-gpu-arch=gfx1250 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix OK %s
// We don't allow using NVPTX/AMDGCN for host compilation.
// RUN: not %clang -### --no-offload-new-driver --cuda-host-only --target=nvptx-nvidia-cuda -nogpulib -nogpuinc -c %s 2>&1 \
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
index 8f34cccaecb7a..4a28f9acdecf7 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
@@ -23,6 +23,11 @@ void test_setprio_inc_wg(short a) {
__builtin_amdgcn_s_setprio_inc_wg(a); // expected-error {{'__builtin_amdgcn_s_setprio_inc_wg' must be a constant integer}}
}
+void test_bitop3_args(global uint* out, uint a, uint b, uint c) {
+ *out = __builtin_amdgcn_bitop3_b32(a, b, c, a); // expected-error {{argument to '__builtin_amdgcn_bitop3_b32' must be a constant integer}}
+ *out = __builtin_amdgcn_bitop3_b16((ushort)a, (ushort)b, (ushort)c, a); // expected-error {{argument to '__builtin_amdgcn_bitop3_b16' must be a constant integer}}
+}
+
void test_s_monitor_sleep(short a) {
__builtin_amdgcn_s_monitor_sleep(a); // expected-error {{'__builtin_amdgcn_s_monitor_sleep' must be a constant integer}}
}
@@ -43,6 +48,12 @@ void test__builtin_amdgcn_cvt_f16_bf8(int a, int b) {
__builtin_amdgcn_cvt_f16_bf8(a, b); // expected-error {{'__builtin_amdgcn_cvt_f16_bf8' must be a constant integer}}
}
+void test_cvt_sr_f8_f16(global int* out, uint sr, int old, int sel)
+{
+ *out = __builtin_amdgcn_cvt_sr_bf8_f16(1.0, sr, old, sel); // expected-error {{'__builtin_amdgcn_cvt_sr_bf8_f16' must be a constant integer}}
+ *out = __builtin_amdgcn_cvt_sr_fp8_f16(1.0, sr, old, sel); // expected-error {{'__builtin_amdgcn_cvt_sr_fp8_f16' must be a constant integer}}
+}
+
void test_cvt_scale_pk(global half8 *outh8, global bfloat8 *outy8, uint2 src2,
global float32 *outf32, global half16 *outh16, global bfloat16 *outy16,
global float16 *outf16, uint3 src3,
@@ -92,6 +103,34 @@ void test_amdgcn_load_monitor(global int* b32gaddr, global v2i* b64gaddr, global
*b128out = __builtin_amdgcn_flat_load_monitor_b128(b128faddr, cpol); // expected-error {{'__builtin_amdgcn_flat_load_monitor_b128' must be a constant integer}}
}
+void test_amdgcn_async_load_store_lds_offset(global char* gaddr8, global int *gaddr32, global v2i* gaddr64, global v4i* gaddr128, local char* laddr8,
+ local int *laddr32, local v2i* laddr64, local v4i* laddr128, int offset, int mask)
+{
+ __builtin_amdgcn_global_store_async_from_lds_b8(gaddr8, laddr8, offset, 0); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b8' must be a constant integer}}
+ __builtin_amdgcn_global_store_async_from_lds_b32(gaddr32, laddr32, offset, 0); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b32' must be a constant integer}}
+ __builtin_amdgcn_global_store_async_from_lds_b64(gaddr64, laddr64, offset, 0); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b64' must be a constant integer}}
+ __builtin_amdgcn_global_store_async_from_lds_b128(gaddr128, laddr128, offset, 0); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b128' must be a constant integer}}
+
+ __builtin_amdgcn_global_store_async_from_lds_b8(gaddr8, laddr8, offset, 0); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b8' must be a constant integer}}
+ __builtin_amdgcn_global_store_async_from_lds_b32(gaddr32, laddr32, offset, 0); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b32' must be a constant integer}}
+ __builtin_amdgcn_global_store_async_from_lds_b64(gaddr64, laddr64, offset, 0); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b64' must be a constant integer}}
+ __builtin_amdgcn_global_store_async_from_lds_b128(gaddr128, laddr128, offset, 0); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b128' must be a constant integer}}
+}
+
+void test_amdgcn_async_load_store_lds_cpol(global char* gaddr8, global int *gaddr32, global v2i* gaddr64, global v4i* gaddr128, local char* laddr8,
+ local int *laddr32, local v2i* laddr64, local v4i* laddr128, int cpol, int mask)
+{
+ __builtin_amdgcn_global_store_async_from_lds_b8(gaddr8, laddr8, 16, cpol); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b8' must be a constant integer}}
+ __builtin_amdgcn_global_store_async_from_lds_b32(gaddr32, laddr32, 16, cpol); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b32' must be a constant integer}}
+ __builtin_amdgcn_global_store_async_from_lds_b64(gaddr64, laddr64, 16, cpol); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b64' must be a constant integer}}
+ __builtin_amdgcn_global_store_async_from_lds_b128(gaddr128, laddr128, 16, cpol); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b128' must be a constant integer}}
+
+ __builtin_amdgcn_global_store_async_from_lds_b8(gaddr8, laddr8, 16, cpol); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b8' must be a constant integer}}
+ __builtin_amdgcn_global_store_async_from_lds_b32(gaddr32, laddr32, 16, cpol); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b32' must be a constant integer}}
+ __builtin_amdgcn_global_store_async_from_lds_b64(gaddr64, laddr64, 16, cpol); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b64' must be a constant integer}}
+ __builtin_amdgcn_global_store_async_from_lds_b128(gaddr128, laddr128, 16, cpol); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b128' must be a constant integer}}
+}
+
void test_amdgcn_tensor_load_store(v4i sg0, v...
[truncated]
|
No description provided.