Skip to content

Commit 776ec74

Browse files
committed
[AMDGPU][Attributor] Remove final update of waves-per-eu after the attributor run
We do not need this in the attributor, because `ST.getWavesPerEU` accounts for both the waves-per-eu and flat-workgroup-size attributes. If the waves-per-eu values are not valid, it drops them. In the attributor, we only need to propagate the values without using intermediate flat workgroup size values.
1 parent 6bd8448 commit 776ec74

32 files changed

+216
-288
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

Lines changed: 1 addition & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -1296,74 +1296,6 @@ struct AAAMDGPUNoAGPR
12961296

12971297
const char AAAMDGPUNoAGPR::ID = 0;
12981298

1299-
/// Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
1300-
/// based on the finalized 'amdgpu-flat-work-group-size' attribute.
1301-
/// Both attributes start with narrow ranges that expand during iteration.
1302-
/// However, a narrower flat-workgroup-size leads to a wider waves-per-eu range,
1303-
/// preventing optimal updates later. Therefore, waves-per-eu can't be updated
1304-
/// with intermediate values during the attributor run. We defer the
1305-
/// finalization of waves-per-eu until after the flat-workgroup-size is
1306-
/// finalized.
1307-
/// TODO: Remove this and move similar logic back into the attributor run once
1308-
/// we have a better representation for waves-per-eu.
1309-
static bool updateWavesPerEU(Module &M, TargetMachine &TM) {
1310-
bool Changed = false;
1311-
1312-
LLVMContext &Ctx = M.getContext();
1313-
1314-
for (Function &F : M) {
1315-
if (F.isDeclaration())
1316-
continue;
1317-
1318-
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
1319-
1320-
std::optional<std::pair<unsigned, std::optional<unsigned>>>
1321-
FlatWgrpSizeAttr =
1322-
AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size");
1323-
1324-
unsigned MinWavesPerEU = ST.getMinWavesPerEU();
1325-
unsigned MaxWavesPerEU = ST.getMaxWavesPerEU();
1326-
1327-
unsigned MinFlatWgrpSize = ST.getMinFlatWorkGroupSize();
1328-
unsigned MaxFlatWgrpSize = ST.getMaxFlatWorkGroupSize();
1329-
if (FlatWgrpSizeAttr.has_value()) {
1330-
MinFlatWgrpSize = FlatWgrpSizeAttr->first;
1331-
MaxFlatWgrpSize = *(FlatWgrpSizeAttr->second);
1332-
}
1333-
1334-
// Start with the "best" range.
1335-
unsigned Min = MinWavesPerEU;
1336-
unsigned Max = MinWavesPerEU;
1337-
1338-
// Compute the range from flat workgroup size. `getWavesPerEU` will also
1339-
// account for the 'amdgpu-waves-er-eu' attribute.
1340-
auto [MinFromFlatWgrpSize, MaxFromFlatWgrpSize] =
1341-
ST.getWavesPerEU(F, {MinFlatWgrpSize, MaxFlatWgrpSize});
1342-
1343-
// For the lower bound, we have to "tighten" it.
1344-
Min = std::max(Min, MinFromFlatWgrpSize);
1345-
// For the upper bound, we have to "extend" it.
1346-
Max = std::max(Max, MaxFromFlatWgrpSize);
1347-
1348-
// Clamp the range to the max range.
1349-
Min = std::max(Min, MinWavesPerEU);
1350-
Max = std::min(Max, MaxWavesPerEU);
1351-
1352-
// Update the attribute if it is not the max.
1353-
if (Min != MinWavesPerEU || Max != MaxWavesPerEU) {
1354-
SmallString<10> Buffer;
1355-
raw_svector_ostream OS(Buffer);
1356-
OS << Min << ',' << Max;
1357-
Attribute OldAttr = F.getFnAttribute("amdgpu-waves-per-eu");
1358-
Attribute NewAttr = Attribute::get(Ctx, "amdgpu-waves-per-eu", OS.str());
1359-
F.addFnAttr(NewAttr);
1360-
Changed |= OldAttr == NewAttr;
1361-
}
1362-
}
1363-
1364-
return Changed;
1365-
}
1366-
13671299
static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
13681300
AMDGPUAttributorOptions Options,
13691301
ThinOrFullLTOPhase LTOPhase) {
@@ -1438,11 +1370,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
14381370
}
14391371
}
14401372

1441-
bool Changed = A.run() == ChangeStatus::CHANGED;
1442-
1443-
Changed |= updateWavesPerEU(M, TM);
1444-
1445-
return Changed;
1373+
return A.run() == ChangeStatus::CHANGED;
14461374
}
14471375
} // namespace
14481376

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -209,11 +209,6 @@ std::pair<unsigned, unsigned>
209209
AMDGPUSubtarget::getWavesPerEU(const Function &F) const {
210210
// Default/requested minimum/maximum flat work group sizes.
211211
std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
212-
return getWavesPerEU(F, FlatWorkGroupSizes);
213-
}
214-
215-
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
216-
const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
217212
// Minimum number of bytes allocated in the LDS.
218213
unsigned LDSBytes =
219214
AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size", {0, UINT32_MAX},

llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,6 @@ attributes #1 = { nounwind }
169169

170170
;.
171171
; HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
172-
; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
173-
; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
172+
; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
173+
; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
174174
;.

llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ declare void @unknown()
105105

106106
define amdgpu_kernel void @kernel_calls_extern() {
107107
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern(
108-
; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
108+
; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
109109
; CHECK-NEXT: call void @unknown()
110110
; CHECK-NEXT: ret void
111111
;
@@ -115,8 +115,8 @@ define amdgpu_kernel void @kernel_calls_extern() {
115115

116116
define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
117117
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern_marked_callsite(
118-
; CHECK-SAME: ) #[[ATTR3]] {
119-
; CHECK-NEXT: call void @unknown() #[[ATTR7:[0-9]+]]
118+
; CHECK-SAME: ) #[[ATTR2]] {
119+
; CHECK-NEXT: call void @unknown() #[[ATTR6:[0-9]+]]
120120
; CHECK-NEXT: ret void
121121
;
122122
call void @unknown() #0
@@ -125,7 +125,7 @@ define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
125125

126126
define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
127127
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect(
128-
; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR3]] {
128+
; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR2]] {
129129
; CHECK-NEXT: call void [[INDIRECT]]()
130130
; CHECK-NEXT: ret void
131131
;
@@ -135,8 +135,8 @@ define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
135135

136136
define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(ptr %indirect) {
137137
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(
138-
; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR3]] {
139-
; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR7]]
138+
; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR2]] {
139+
; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR6]]
140140
; CHECK-NEXT: ret void
141141
;
142142
call void %indirect() #0
@@ -254,12 +254,11 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
254254

255255
attributes #0 = { "amdgpu-agpr-alloc"="0" }
256256
;.
257-
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
258-
; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
259-
; CHECK: attributes #[[ATTR2:[0-9]+]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
260-
; CHECK: attributes #[[ATTR3]] = { "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
261-
; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
262-
; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
263-
; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
264-
; CHECK: attributes #[[ATTR7]] = { "amdgpu-agpr-alloc"="0" }
257+
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
258+
; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
259+
; CHECK: attributes #[[ATTR2]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
260+
; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
261+
; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
262+
; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
263+
; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" }
265264
;.

llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -117,14 +117,14 @@ define void @call_no_dispatch_id() {
117117
ret void
118118
}
119119
;.
120-
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-workitem-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
121-
; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
122-
; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
123-
; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-workgroup-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
124-
; CHECK: attributes #[[ATTR4]] = { "amdgpu-no-workgroup-id-y" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
125-
; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-workgroup-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
126-
; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-dispatch-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
127-
; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-queue-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
128-
; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
129-
; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-dispatch-id" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
120+
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
121+
; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
122+
; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
123+
; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-workgroup-id-x" "uniform-work-group-size"="false" }
124+
; CHECK: attributes #[[ATTR4]] = { "amdgpu-no-workgroup-id-y" "uniform-work-group-size"="false" }
125+
; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-workgroup-id-z" "uniform-work-group-size"="false" }
126+
; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-dispatch-ptr" "uniform-work-group-size"="false" }
127+
; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-queue-ptr" "uniform-work-group-size"="false" }
128+
; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-implicitarg-ptr" "uniform-work-group-size"="false" }
129+
; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" }
130130
;.

0 commit comments

Comments
 (0)