diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index 47a552a7bf495..5dd65103fbbb9 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -743,20 +743,21 @@ void CodeGenModule::handleAMDGPUWavesPerEUAttr( llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) { unsigned Min = Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue(); - unsigned Max = - Attr->getMax() - ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue() - : 0; - if (Min != 0) { - assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max"); - - std::string AttrVal = llvm::utostr(Min); - if (Max != 0) - AttrVal = AttrVal + "," + llvm::utostr(Max); - F->addFnAttr("amdgpu-waves-per-eu", AttrVal); - } else - assert(Max == 0 && "Max must be zero"); + if (Attr->getMax()) { + unsigned Max = + Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue(); + assert(Min == 0 || (Min != 0 && Max != 0) && + "Min must be non-zero when Max is non-zero"); + assert(Min <= Max && "Min must be less than or equal to Max"); + // Do not add the attribute if min,max=0,0. + if (Max != 0) { + std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max); + F->addFnAttr("amdgpu-waves-per-eu", AttrVal); + } + } else if (Min != 0) { + F->addFnAttr("amdgpu-waves-per-eu", llvm::utostr(Min)); + } } std::unique_ptr diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index c23c98aa3aaeb..76bd11d70fa51 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -245,11 +245,6 @@ static bool checkAMDGPUWavesPerEUArguments(Sema &S, Expr *MinExpr, if (MaxExpr && !S.checkUInt32Argument(Attr, MaxExpr, Max, 1)) return true; - if (Min == 0 && Max != 0) { - S.Diag(Attr.getLocation(), diag::err_attribute_argument_invalid) - << &Attr << 0; - return true; - } if (Max != 0 && Min > Max) { S.Diag(Attr.getLocation(), diag::err_attribute_argument_invalid) << &Attr << 1; diff --git a/clang/test/CodeGenHIP/amdgpu-waves-per-eu.hip b/clang/test/CodeGenHIP/amdgpu-waves-per-eu.hip new file mode 100644 index 0000000000000..1fb7fd1501f0a --- /dev/null +++ b/clang/test/CodeGenHIP/amdgpu-waves-per-eu.hip @@ -0,0 +1,53 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -fno-ident -fcuda-is-device -emit-llvm -o - %s | FileCheck %s + +// COM: Most tests are in the OpenCL semastics, this is just a verification for HIP + +#define __global__ __attribute__((global)) + +//. +// CHECK: @__hip_cuid_ = addrspace(1) global i8 0 +// CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr)], section "llvm.metadata" +//. +// CHECK: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@_Z21kernel_waves_per_eu_0v +// CHECK-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// +__global__ __attribute__((amdgpu_waves_per_eu(0))) void kernel_waves_per_eu_0() {} + +// Equivalent to kernel_waves_per_eu_0. +// CHECK: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@_Z23kernel_waves_per_eu_0_0v +// CHECK-SAME: () #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// +__global__ __attribute__((amdgpu_waves_per_eu(0, 0))) void kernel_waves_per_eu_0_0() {} + +// CHECK: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@_Z23kernel_waves_per_eu_0_4v +// CHECK-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// +__global__ __attribute__((amdgpu_waves_per_eu(0, 4))) void kernel_waves_per_eu_0_4() {} + +// CHECK: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@_Z23kernel_waves_per_eu_1_4v +// CHECK-SAME: () #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// +__global__ __attribute__((amdgpu_waves_per_eu(1, 4))) void kernel_waves_per_eu_1_4() {} +//. +// CHECK: attributes #[[ATTR0]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +// CHECK: attributes #[[ATTR1]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "amdgpu-waves-per-eu"="0,4" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +// CHECK: attributes #[[ATTR2]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "amdgpu-waves-per-eu"="1,4" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +//. +// CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// CHECK: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} +// CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +//. diff --git a/clang/test/SemaOpenCL/amdgpu-attrs.cl b/clang/test/SemaOpenCL/amdgpu-attrs.cl index 89ba3f86803c5..50497d68f5991 100644 --- a/clang/test/SemaOpenCL/amdgpu-attrs.cl +++ b/clang/test/SemaOpenCL/amdgpu-attrs.cl @@ -46,7 +46,6 @@ __attribute__((amdgpu_num_sgpr(4294967296))) kernel void kernel_num_sgpr_L() {} __attribute__((amdgpu_num_vgpr(4294967296))) kernel void kernel_num_vgpr_L() {} // expected-error {{integer constant expression evaluates to value 4294967296 that cannot be represented in a 32-bit unsigned integer type}} __attribute__((amdgpu_flat_work_group_size(0, 64))) kernel void kernel_flat_work_group_size_0_64() {} // expected-error {{'amdgpu_flat_work_group_size' attribute argument is invalid: max must be 0 since min is 0}} -__attribute__((amdgpu_waves_per_eu(0, 4))) kernel void kernel_waves_per_eu_0_4() {} // expected-error {{'amdgpu_waves_per_eu' attribute argument is invalid: max must be 0 since min is 0}} __attribute__((amdgpu_flat_work_group_size(64, 32))) kernel void kernel_flat_work_group_size_64_32() {} // expected-error {{'amdgpu_flat_work_group_size' attribute argument is invalid: min must not be greater than max}} __attribute__((amdgpu_waves_per_eu(4, 2))) kernel void kernel_waves_per_eu_4_2() {} // expected-error {{'amdgpu_waves_per_eu' attribute argument is invalid: min must not be greater than max}} @@ -61,6 +60,7 @@ __attribute__((amdgpu_num_vgpr(0))) kernel void kernel_num_vgpr_0() {} kernel __attribute__((amdgpu_flat_work_group_size(32, 64))) void kernel_flat_work_group_size_32_64() {} kernel __attribute__((amdgpu_waves_per_eu(2))) void kernel_waves_per_eu_2() {} +kernel __attribute__((amdgpu_waves_per_eu(0, 4))) kernel void kernel_waves_per_eu_0_4() {} kernel __attribute__((amdgpu_waves_per_eu(2, 4))) void kernel_waves_per_eu_2_4() {} kernel __attribute__((amdgpu_num_sgpr(32))) void kernel_num_sgpr_32() {} kernel __attribute__((amdgpu_num_vgpr(64))) void kernel_num_vgpr_64() {} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index d095fc6cf9549..1fcd0654fba31 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -156,15 +156,15 @@ AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const { } } -std::pair AMDGPUSubtarget::getFlatWorkGroupSizes( - const Function &F) const { +std::pair +AMDGPUSubtarget::getFlatWorkGroupSizes(const Function &F) const { // Default minimum/maximum flat work group sizes. std::pair Default = - getDefaultFlatWorkGroupSize(F.getCallingConv()); + getDefaultFlatWorkGroupSize(F.getCallingConv()); // Requested minimum/maximum flat work group sizes. std::pair Requested = AMDGPU::getIntegerPairAttribute( - F, "amdgpu-flat-work-group-size", Default); + F, "amdgpu-flat-work-group-size", Default); // Make sure requested minimum is less than requested maximum. if (Requested.first > Requested.second) @@ -186,23 +186,29 @@ std::pair AMDGPUSubtarget::getEffectiveWavesPerEU( // sizes limits the achievable maximum, and we aim to support enough waves per // EU so that we can concurrently execute all waves of a single workgroup of // maximum size on a CU. - std::pair Default = { + std::pair WavesPerEU = { getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second), getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second}; - Default.first = std::min(Default.first, Default.second); - - // Make sure requested minimum is within the default range and lower than the - // requested maximum. The latter must not violate target specification. - if (RequestedWavesPerEU.first < Default.first || - RequestedWavesPerEU.first > Default.second || - RequestedWavesPerEU.first > RequestedWavesPerEU.second || - RequestedWavesPerEU.second > getMaxWavesPerEU()) - return Default; - - // We cannot exceed maximum occupancy implied by flat workgroup size and LDS. - RequestedWavesPerEU.second = - std::min(RequestedWavesPerEU.second, Default.second); - return RequestedWavesPerEU; + WavesPerEU.first = std::min(WavesPerEU.first, WavesPerEU.second); + + // Requested minimum must not violate subtarget's specifications and be no + // greater than maximum. + if (RequestedWavesPerEU.first && + (RequestedWavesPerEU.first < getMinWavesPerEU() || + RequestedWavesPerEU.first > RequestedWavesPerEU.second)) + return WavesPerEU; + // Requested maximum must not violate subtarget's specifications. + if (RequestedWavesPerEU.second > getMaxWavesPerEU()) + return WavesPerEU; + + // A requested maximum may limit both the final minimum and maximum, but + // not increase them. A requested minimum can either decrease or increase the + // default minimum as long as it doesn't exceed the maximum. + WavesPerEU.second = std::min(WavesPerEU.second, RequestedWavesPerEU.second); + if (RequestedWavesPerEU.first) + WavesPerEU.first = RequestedWavesPerEU.first; + WavesPerEU.first = std::min(WavesPerEU.first, WavesPerEU.second); + return WavesPerEU; } std::pair @@ -229,7 +235,7 @@ std::pair AMDGPUSubtarget::getWavesPerEU(std::pair FlatWorkGroupSizes, unsigned LDSBytes, const Function &F) const { // Default minimum/maximum number of waves per execution unit. - std::pair Default(1, getMaxWavesPerEU()); + std::pair Default(0, getMaxWavesPerEU()); // Requested minimum/maximum number of waves per execution unit. std::pair Requested = diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 1e44be8e47201..9e98d9b6dbaca 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -128,7 +128,9 @@ class AMDGPUSubtarget { /// Returns the target minimum/maximum number of waves per EU. This is based /// on the minimum/maximum number of \p RequestedWavesPerEU and further /// limited by the maximum achievable occupancy derived from the range of \p - /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup. + /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup. A + /// minimum requested waves/EU value of 0 indicates an intent to not restrict + /// the minimum target occupancy. std::pair getEffectiveWavesPerEU(std::pair RequestedWavesPerEU, std::pair FlatWorkGroupSizes, diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll index e9fe4f3c618c7..2ab38a9f12a52 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll @@ -225,3 +225,15 @@ entry: ret void } attributes #12 = {"amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2,10" "amdgpu-lds-size"="16384"} + +; At most 2 waves per execution unit. +; CHECK-LABEL: {{^}}empty_at_most_2: +; CHECK: SGPRBlocks: 12 +; CHECK: VGPRBlocks: 21 +; CHECK: NumSGPRsForWavesPerEU: 102 +; CHECK: NumVGPRsForWavesPerEU: 85 +define amdgpu_kernel void @empty_at_most_2() #13 { +entry: + ret void +} +attributes #13 = {"amdgpu-waves-per-eu"="0,2"} diff --git a/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll index 67061bcb2a785..f8c7b593e6eba 100644 --- a/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll @@ -57,5 +57,5 @@ entry: ret void } -attributes #0 = { "amdgpu-waves-per-eu"="1,1" } -attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1024" } +attributes #0 = { "amdgpu-waves-per-eu"="1" } +attributes #1 = { "amdgpu-waves-per-eu"="1" "amdgpu-flat-work-group-size"="1,1024" } diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll index b87d266cc2514..5e1cae0760c36 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals --version 2 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s -; Check propagation of amdgpu-flat-work-group-size attribute. +; Check propagation of amdgpu-waves-per-eu attribute. ; Called from a single kernel with 1,8 define internal void @default_to_1_8_a() { @@ -216,41 +216,30 @@ define internal i32 @bitcasted_function() { ret i32 0 } -define internal void @called_from_invalid_bounds_0() { -; CHECK-LABEL: define internal void @called_from_invalid_bounds_0 -; CHECK-SAME: () #[[ATTR1]] { -; CHECK-NEXT: ret void -; +define internal void @called_without_min_waves() { ret void } -define internal void @called_from_invalid_bounds_1() { -; CHECK-LABEL: define internal void @called_from_invalid_bounds_1 +define internal void @called_from_invalid_bounds() { +; CHECK-LABEL: define internal void @called_from_invalid_bounds ; CHECK-SAME: () #[[ATTR10:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void } -; Invalid range for amdgpu-waves-per-eu -define amdgpu_kernel void @kernel_invalid_bounds_0_8() #9 { -; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_0_8 -; CHECK-SAME: () #[[ATTR1]] { -; CHECK-NEXT: call void @called_from_invalid_bounds_0() -; CHECK-NEXT: ret void -; - call void @called_from_invalid_bounds_0() +define internal void @called_from_invalid_bounds_1() { + call void @called_without_min_waves() ret void } - ; Invalid range for amdgpu-waves-per-eu define amdgpu_kernel void @kernel_invalid_bounds_1_123() #10 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_1_123 ; CHECK-SAME: () #[[ATTR11:[0-9]+]] { -; CHECK-NEXT: call void @called_from_invalid_bounds_1() +; CHECK-NEXT: call void @called_from_invalid_bounds() ; CHECK-NEXT: ret void ; - call void @called_from_invalid_bounds_1() + call void @called_from_invalid_bounds() ret void } @@ -279,7 +268,7 @@ define amdgpu_kernel void @kernel_3_6() #12 { ; 3,6 -> 6,9 define internal void @refine_upper_func_3_6() #13 { ; CHECK-LABEL: define internal void @refine_upper_func_3_6 -; CHECK-SAME: () #[[ATTR9]] { +; CHECK-SAME: () #[[ATTR14:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void @@ -288,7 +277,7 @@ define internal void @refine_upper_func_3_6() #13 { ; 4,8 -> 6,8 define internal void @refine_lower_func_4_8() #14 { ; CHECK-LABEL: define internal void @refine_lower_func_4_8 -; CHECK-SAME: () #[[ATTR14:[0-9]+]] { +; CHECK-SAME: () #[[ATTR15:[0-9]+]] { ; CHECK-NEXT: call void @refine_upper_func_3_6() ; CHECK-NEXT: ret void ; @@ -298,7 +287,7 @@ define internal void @refine_lower_func_4_8() #14 { define amdgpu_kernel void @kernel_foo_6_8() #15 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_foo_6_8 -; CHECK-SAME: () #[[ATTR15:[0-9]+]] { +; CHECK-SAME: () #[[ATTR16:[0-9]+]] { ; CHECK-NEXT: call void @refine_upper_func_3_6() ; CHECK-NEXT: call void @refine_lower_func_4_8() ; CHECK-NEXT: call void @func_9_10_a() @@ -313,7 +302,7 @@ define amdgpu_kernel void @kernel_foo_6_8() #15 { ; 5,5 -> 5,5 define internal void @func_5_5() #16 { ; CHECK-LABEL: define internal void @func_5_5 -; CHECK-SAME: () #[[ATTR16:[0-9]+]] { +; CHECK-SAME: () #[[ATTR17:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void @@ -322,7 +311,7 @@ define internal void @func_5_5() #16 { ; 5,8 -> 8,8 define internal void @func_5_8() #17 { ; CHECK-LABEL: define internal void @func_5_8 -; CHECK-SAME: () #[[ATTR17:[0-9]+]] { +; CHECK-SAME: () #[[ATTR18:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void @@ -331,7 +320,7 @@ define internal void @func_5_8() #17 { ; 9,10 -> 9,10 define internal void @func_9_10_a() #18 { ; CHECK-LABEL: define internal void @func_9_10_a -; CHECK-SAME: () #[[ATTR18:[0-9]+]] { +; CHECK-SAME: () #[[ATTR19:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void @@ -340,7 +329,7 @@ define internal void @func_9_10_a() #18 { ; 9,10 -> 9,9 define internal void @func_9_10_b() #18 { ; CHECK-LABEL: define internal void @func_9_10_b -; CHECK-SAME: () #[[ATTR18]] { +; CHECK-SAME: () #[[ATTR19]] { ; CHECK-NEXT: ret void ; ret void @@ -348,7 +337,7 @@ define internal void @func_9_10_b() #18 { define amdgpu_kernel void @kernel_bar_8_9() #19 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_bar_8_9 -; CHECK-SAME: () #[[ATTR19:[0-9]+]] { +; CHECK-SAME: () #[[ATTR20:[0-9]+]] { ; CHECK-NEXT: call void @refine_upper_func_3_6() ; CHECK-NEXT: call void @func_5_5() ; CHECK-NEXT: call void @func_9_10_b() @@ -413,10 +402,11 @@ attributes #19 = { "amdgpu-waves-per-eu"="8,9" } ; CHECK: attributes #[[ATTR11]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR12]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR13]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR14]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR15]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR16]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR17]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR18]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR19]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR14]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR15]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR16]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR17]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR18]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR19]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR20]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" } ;.