
Commit 555d8f4

[AMDGPU] Bundle loads before post-RA scheduler
We have been relying on artificial DAG edges inserted by MemOpClusterMutation to keep loads and stores together in the post-RA scheduler. This does not always work, since it still allows a completely independent instruction to be scheduled into the middle of a cluster. Removed the DAG mutation and added a pass to bundle the already clustered instructions. These bundles are unpacked before the memory legalizer, both because it does not handle bundles and because unbundling allows waitcnts to be inserted in the middle of a store cluster. Removing the artificial edges also allows a more relaxed scheduling.

Differential Revision: https://reviews.llvm.org/D72737
1 parent b35b7da commit 555d8f4


50 files changed, +627 -421 lines
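
For readers unfamiliar with the machinery, below is a minimal sketch of the two LLVM bundle utilities this patch builds on: finalizeBundle glues a run of adjacent memory instructions into a single BUNDLE so the post-RA scheduler cannot split them, and unbundleFromPred undoes that later so SIInsertWaitcnts can place counters between the former members. The helper names bundleClusteredRun and unbundleMemoryClause are illustrative, not part of the commit, and the logic is simplified (for example, the real unbundling code also clears the internal-read flag on register operands).

// Illustrative sketch only -- helper names are hypothetical and the logic is
// simplified relative to SIPostRABundler.cpp / SIMemoryLegalizer.cpp below.
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include <iterator>

using namespace llvm;

// Glue the half-open run [First, Last) of already clustered memory
// instructions into one BUNDLE so later passes treat it as a single unit.
static void bundleClusteredRun(MachineBasicBlock &MBB,
                               MachineBasicBlock::instr_iterator First,
                               MachineBasicBlock::instr_iterator Last) {
  // Only bundle runs of two or more instructions.
  if (First != Last && std::next(First) != Last)
    finalizeBundle(MBB, First, Last);
}

// Inverse step before the memory legalizer: detach every bundled member from
// its predecessor and drop the BUNDLE header instruction, so waitcnts can be
// inserted between the former cluster members again.
static void unbundleMemoryClause(MachineInstr &BundleMI) {
  MachineBasicBlock::instr_iterator I = std::next(BundleMI.getIterator());
  MachineBasicBlock::instr_iterator E = BundleMI.getParent()->instr_end();
  while (I != E && I->isBundledWithPred())
    (I++)->unbundleFromPred();
  BundleMI.eraseFromParent();
}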

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 4 additions & 0 deletions
@@ -59,6 +59,7 @@ FunctionPass *createSIMemoryLegalizerPass();
 FunctionPass *createSIInsertWaitcntsPass();
 FunctionPass *createSIPreAllocateWWMRegsPass();
 FunctionPass *createSIFormMemoryClausesPass();
+FunctionPass *createSIPostRABundlerPass();
 FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &,
                                                const TargetMachine *);
 FunctionPass *createAMDGPUUseNativeCallsPass();
@@ -229,6 +230,9 @@ extern char &SIInsertWaitcntsID;
 void initializeSIFormMemoryClausesPass(PassRegistry&);
 extern char &SIFormMemoryClausesID;
 
+void initializeSIPostRABundlerPass(PassRegistry&);
+extern char &SIPostRABundlerID;
+
 void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&);
 extern char &AMDGPUUnifyDivergentExitNodesID;

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 0 additions & 48 deletions
@@ -754,53 +754,6 @@ void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
 }
 
 namespace {
-struct MemOpClusterMutation : ScheduleDAGMutation {
-  const SIInstrInfo *TII;
-
-  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}
-
-  void apply(ScheduleDAGInstrs *DAG) override {
-    SUnit *SUa = nullptr;
-    // Search for two consequent memory operations and link them
-    // to prevent scheduler from moving them apart.
-    // In DAG pre-process SUnits are in the original order of
-    // the instructions before scheduling.
-    for (SUnit &SU : DAG->SUnits) {
-      MachineInstr &MI2 = *SU.getInstr();
-      if (!MI2.mayLoad() && !MI2.mayStore()) {
-        SUa = nullptr;
-        continue;
-      }
-      if (!SUa) {
-        SUa = &SU;
-        continue;
-      }
-
-      MachineInstr &MI1 = *SUa->getInstr();
-      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
-          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
-          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
-          (TII->isDS(MI1) && TII->isDS(MI2))) {
-        SU.addPredBarrier(SUa);
-
-        for (const SDep &SI : SU.Preds) {
-          if (SI.getSUnit() != SUa)
-            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
-        }
-
-        if (&SU != &DAG->ExitSU) {
-          for (const SDep &SI : SUa->Succs) {
-            if (SI.getSUnit() != &SU)
-              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
-          }
-        }
-      }
-
-      SUa = &SU;
-    }
-  }
-};
-
 struct FillMFMAShadowMutation : ScheduleDAGMutation {
   const SIInstrInfo *TII;
 
@@ -927,7 +880,6 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation {
 
 void GCNSubtarget::getPostRAMutations(
     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
-  Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo));
   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
 }

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 2 additions & 0 deletions
@@ -235,6 +235,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSIOptimizeExecMaskingPass(*PR);
   initializeSIPreAllocateWWMRegsPass(*PR);
   initializeSIFormMemoryClausesPass(*PR);
+  initializeSIPostRABundlerPass(*PR);
   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
   initializeAMDGPUAAWrapperPassPass(*PR);
   initializeAMDGPUExternalAAWrapperPass(*PR);
@@ -980,6 +981,7 @@ void GCNPassConfig::addPostRegAlloc() {
 }
 
 void GCNPassConfig::addPreSched2() {
+  addPass(&SIPostRABundlerID);
 }
 
 void GCNPassConfig::addPreEmitPass() {

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -118,6 +118,7 @@ add_llvm_target(AMDGPUCodeGen
   SIOptimizeExecMasking.cpp
   SIOptimizeExecMaskingPreRA.cpp
   SIPeepholeSDWA.cpp
+  SIPostRABundler.cpp
   SIRegisterInfo.cpp
   SIRemoveShortExecBranches.cpp
   SIShrinkInstructions.cpp

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 0 additions & 16 deletions
@@ -1663,22 +1663,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
     break;
   }
-  case TargetOpcode::BUNDLE: {
-    if (!MI.mayLoad() || MI.hasUnmodeledSideEffects())
-      return false;
-
-    // If it is a load it must be a memory clause
-    for (MachineBasicBlock::instr_iterator I = MI.getIterator();
-         I->isBundledWithSucc(); ++I) {
-      I->unbundleFromSucc();
-      for (MachineOperand &MO : I->operands())
-        if (MO.isReg())
-          MO.setIsInternalRead(false);
-    }
-
-    MI.eraseFromParent();
-    break;
-  }
   }
   return true;
 }

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 15 additions & 0 deletions
@@ -1289,6 +1289,21 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
 
   for (auto &MBB : MF) {
     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
+
+      if (MI->getOpcode() == TargetOpcode::BUNDLE && MI->mayLoadOrStore()) {
+        MachineBasicBlock::instr_iterator II(MI->getIterator());
+        for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
+             I != E && I->isBundledWithPred(); ++I) {
+          I->unbundleFromPred();
+          for (MachineOperand &MO : I->operands())
+            if (MO.isReg())
+              MO.setIsInternalRead(false);
+        }
+
+        MI->eraseFromParent();
+        MI = II->getIterator();
+      }
+
       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
         continue;

llvm/lib/Target/AMDGPU/SIPostRABundler.cpp

Lines changed: 138 additions & 0 deletions (new file)
@@ -0,0 +1,138 @@
//===-- SIPostRABundler.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass creates bundles of memory instructions to protect adjacent loads
/// and stores from being rescheduled apart from each other post-RA.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-post-ra-bundler"

namespace {

class SIPostRABundler : public MachineFunctionPass {
public:
  static char ID;

public:
  SIPostRABundler() : MachineFunctionPass(ID) {
    initializeSIPostRABundlerPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI post-RA bundler";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  const SIRegisterInfo *TRI;

  SmallSet<Register, 16> Defs;

  bool isDependentLoad(const MachineInstr &MI) const;

};

} // End anonymous namespace.

INITIALIZE_PASS(SIPostRABundler, DEBUG_TYPE, "SI post-RA bundler", false, false)

char SIPostRABundler::ID = 0;

char &llvm::SIPostRABundlerID = SIPostRABundler::ID;

FunctionPass *llvm::createSIPostRABundlerPass() {
  return new SIPostRABundler();
}

bool SIPostRABundler::isDependentLoad(const MachineInstr &MI) const {
  if (!MI.mayLoad())
    return false;

  for (const MachineOperand &Op : MI.explicit_operands()) {
    if (!Op.isReg())
      continue;
    Register Reg = Op.getReg();
    for (const Register Def : Defs)
      if (TRI->regsOverlap(Reg, Def))
        return true;
  }

  return false;
}

bool SIPostRABundler::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
  bool Changed = false;
  const unsigned MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF |
                            SIInstrFlags::SMRD | SIInstrFlags::DS |
                            SIInstrFlags::FLAT | SIInstrFlags::MIMG;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::instr_iterator Next;
    MachineBasicBlock::instr_iterator B = MBB.instr_begin();
    MachineBasicBlock::instr_iterator E = MBB.instr_end();
    for (auto I = B; I != E; I = Next) {
      Next = std::next(I);

      if (I->isBundled() || !I->mayLoadOrStore() ||
          B->mayLoad() != I->mayLoad() || B->mayStore() != I->mayStore() ||
          (B->getDesc().TSFlags & MemFlags) !=
              (I->getDesc().TSFlags & MemFlags) ||
          isDependentLoad(*I)) {

        if (B != I) {
          if (std::next(B) != I) {
            finalizeBundle(MBB, B, I);
            Changed = true;
          }
          Next = I;
        }

        B = Next;
        Defs.clear();
        continue;
      }

      if (I->getNumExplicitDefs() == 0)
        continue;

      Defs.insert(I->defs().begin()->getReg());
    }

    if (B != E && std::next(B) != E) {
      finalizeBundle(MBB, B, E);
      Changed = true;
    }

    Defs.clear();
  }

  return Changed;
}

llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll

Lines changed: 5 additions & 5 deletions
@@ -807,9 +807,9 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
 ; MOVREL-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; MOVREL-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill
 ; MOVREL-NEXT: v_mov_b32_e32 v34, s19
-; MOVREL-NEXT: v_mov_b32_e32 v33, s18
 ; MOVREL-NEXT: v_mov_b32_e32 v32, s17
 ; MOVREL-NEXT: v_mov_b32_e32 v31, s16
+; MOVREL-NEXT: v_mov_b32_e32 v33, s18
 ; MOVREL-NEXT: v_mov_b32_e32 v30, s15
 ; MOVREL-NEXT: v_mov_b32_e32 v29, s14
 ; MOVREL-NEXT: v_mov_b32_e32 v28, s13
@@ -1113,10 +1113,10 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, do
 ; MOVREL-NEXT: v_mov_b32_e32 v3, s1
 ; MOVREL-NEXT: v_movreld_b32_e32 v2, v0
 ; MOVREL-NEXT: v_movreld_b32_e32 v3, v1
+; MOVREL-NEXT: ; implicit-def: $vcc_hi
 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off
 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off
-; MOVREL-NEXT: ; implicit-def: $vcc_hi
 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[14:17], off
 ; MOVREL-NEXT: s_endpgm
 entry:
@@ -2053,24 +2053,24 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_s_add_1(<8 x double> inreg %v
 ; MOVREL-NEXT: s_movreld_b64 s[2:3], s[18:19]
 ; MOVREL-NEXT: v_mov_b32_e32 v0, s0
 ; MOVREL-NEXT: v_mov_b32_e32 v4, s4
-; MOVREL-NEXT: v_mov_b32_e32 v8, s8
-; MOVREL-NEXT: v_mov_b32_e32 v12, s12
 ; MOVREL-NEXT: v_mov_b32_e32 v1, s1
 ; MOVREL-NEXT: v_mov_b32_e32 v2, s2
 ; MOVREL-NEXT: v_mov_b32_e32 v3, s3
+; MOVREL-NEXT: v_mov_b32_e32 v8, s8
 ; MOVREL-NEXT: v_mov_b32_e32 v5, s5
 ; MOVREL-NEXT: v_mov_b32_e32 v6, s6
 ; MOVREL-NEXT: v_mov_b32_e32 v7, s7
+; MOVREL-NEXT: v_mov_b32_e32 v12, s12
 ; MOVREL-NEXT: v_mov_b32_e32 v9, s9
 ; MOVREL-NEXT: v_mov_b32_e32 v10, s10
 ; MOVREL-NEXT: v_mov_b32_e32 v11, s11
 ; MOVREL-NEXT: v_mov_b32_e32 v13, s13
 ; MOVREL-NEXT: v_mov_b32_e32 v14, s14
 ; MOVREL-NEXT: v_mov_b32_e32 v15, s15
+; MOVREL-NEXT: ; implicit-def: $vcc_hi
 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
-; MOVREL-NEXT: ; implicit-def: $vcc_hi
 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
 ; MOVREL-NEXT: s_endpgm
 entry:

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll

Lines changed: 8 additions & 8 deletions
@@ -374,10 +374,10 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace
 ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v1
 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4
-; CI-NEXT: v_mov_b32_e32 v5, s1
 ; CI-NEXT: v_mov_b32_e32 v4, 42
-; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; CI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc
+; CI-NEXT: v_mov_b32_e32 v5, s1
+; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT: flat_store_dword v[2:3], v0
 ; CI-NEXT: s_endpgm
@@ -399,10 +399,10 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace
 ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v1
 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
-; VI-NEXT: v_mov_b32_e32 v5, s1
 ; VI-NEXT: v_mov_b32_e32 v4, 42
-; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; VI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT: flat_store_dword v[2:3], v0
 ; VI-NEXT: s_endpgm
@@ -695,10 +695,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32* %out, i32*
 ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v1
 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4
-; CI-NEXT: v_mov_b32_e32 v5, s1
 ; CI-NEXT: v_mov_b32_e32 v4, 42
-; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; CI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc
+; CI-NEXT: v_mov_b32_e32 v5, s1
+; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT: flat_store_dword v[2:3], v0
 ; CI-NEXT: s_endpgm
@@ -720,10 +720,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32* %out, i32*
 ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v1
 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
-; VI-NEXT: v_mov_b32_e32 v5, s1
 ; VI-NEXT: v_mov_b32_e32 v4, 42
-; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; VI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT: flat_store_dword v[2:3], v0
 ; VI-NEXT: s_endpgm
