Skip to content

Commit e53a9d9

Browse files
cdevadas authored and perlfu committed
Resubmit: [AMDGPU] Invert the handling of skip insertion.
The current implementation of skip insertion (SIInsertSkip) makes it a mandatory pass required for correctness. Initially, the idea was to have an optional pass. This patch inserts the s_cbranch_execz upfront during SILowerControlFlow to skip over the sections of code when no lanes are active. Later, SIRemoveShortExecBranches removes the skips for short branches, unless there is a sideeffect and the skip branch is really necessary. This new pass will replace the handling of skip insertion in the existing SIInsertSkip Pass. Differential revision: https://reviews.llvm.org/D68092
1 parent e0a6093 commit e53a9d9

40 files changed

+390
-380
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,9 @@ extern char &SIWholeQuadModeID;
156156
void initializeSILowerControlFlowPass(PassRegistry &);
157157
extern char &SILowerControlFlowID;
158158

159+
void initializeSIRemoveShortExecBranchesPass(PassRegistry &);
160+
extern char &SIRemoveShortExecBranchesID;
161+
159162
void initializeSIInsertSkipsPass(PassRegistry &);
160163
extern char &SIInsertSkipsPassID;
161164

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
228228
initializeSIModeRegisterPass(*PR);
229229
initializeSIWholeQuadModePass(*PR);
230230
initializeSILowerControlFlowPass(*PR);
231+
initializeSIRemoveShortExecBranchesPass(*PR);
231232
initializeSIInsertSkipsPass(*PR);
232233
initializeSIMemoryLegalizerPass(*PR);
233234
initializeSIOptimizeExecMaskingPass(*PR);
@@ -993,6 +994,7 @@ void GCNPassConfig::addPreEmitPass() {
993994
// be better for it to emit S_NOP <N> when possible.
994995
addPass(&PostRAHazardRecognizerID);
995996

997+
addPass(&SIRemoveShortExecBranchesID);
996998
addPass(&SIInsertSkipsPassID);
997999
addPass(&BranchRelaxationPassID);
9981000
}

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ add_llvm_target(AMDGPUCodeGen
116116
SIOptimizeExecMaskingPreRA.cpp
117117
SIPeepholeSDWA.cpp
118118
SIRegisterInfo.cpp
119+
SIRemoveShortExecBranches.cpp
119120
SIShrinkInstructions.cpp
120121
SIWholeQuadMode.cpp
121122
GCNILPSched.cpp

llvm/lib/Target/AMDGPU/SIInsertSkips.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ using namespace llvm;
4141
#define DEBUG_TYPE "si-insert-skips"
4242

4343
static cl::opt<unsigned> SkipThresholdFlag(
44-
"amdgpu-skip-threshold",
44+
"amdgpu-skip-threshold-legacy",
4545
cl::desc("Number of instructions before jumping over divergent control flow"),
4646
cl::init(12), cl::Hidden);
4747

@@ -466,6 +466,9 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
466466
MachineInstr &MI = *I;
467467

468468
switch (MI.getOpcode()) {
469+
case AMDGPU::S_CBRANCH_EXECZ:
470+
ExecBranchStack.push_back(MI.getOperand(0).getMBB());
471+
break;
469472
case AMDGPU::SI_MASK_BRANCH:
470473
ExecBranchStack.push_back(MI.getOperand(0).getMBB());
471474
MadeChange |= skipMaskBranch(MI, MBB);

llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -244,9 +244,9 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
244244
BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec)
245245
.addReg(Tmp, RegState::Kill);
246246

247-
// Insert a pseudo terminator to help keep the verifier happy. This will also
248-
// be used later when inserting skips.
249-
MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
247+
// Insert the S_CBRANCH_EXECZ instruction which will be optimized later
248+
// during SIRemoveShortExecBranches.
249+
MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
250250
.add(MI.getOperand(2));
251251

252252
if (!LIS) {
@@ -323,8 +323,8 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
323323
.addReg(DstReg);
324324

325325
MachineInstr *Branch =
326-
BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
327-
.addMBB(DestBB);
326+
BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
327+
.addMBB(DestBB);
328328

329329
if (!LIS) {
330330
MI.eraseFromParent();
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
//===-- SIRemoveShortExecBranches.cpp ------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
/// \file
10+
/// This pass optimizes the s_cbranch_execz instructions.
11+
/// The pass removes this skip instruction for short branches,
12+
/// if there is no unwanted side effect in the fallthrough code sequence.
13+
///
14+
//===----------------------------------------------------------------------===//
15+
16+
#include "AMDGPU.h"
17+
#include "AMDGPUSubtarget.h"
18+
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19+
#include "SIInstrInfo.h"
20+
#include "llvm/CodeGen/MachineFunctionPass.h"
21+
#include "llvm/Support/CommandLine.h"
22+
23+
using namespace llvm;
24+
25+
#define DEBUG_TYPE "si-remove-short-exec-branches"
26+
27+
// Maximum number of instructions a skip may jump over before the
// s_cbranch_execz is considered worth keeping.
static unsigned SkipThreshold;

// Exposed as a cl::opt so the threshold can be tuned from the command line;
// cl::location stores the parsed value directly into SkipThreshold.
static cl::opt<unsigned, true> SkipThresholdFlag(
    "amdgpu-skip-threshold", cl::Hidden,
    cl::desc(
        "Number of instructions before jumping over divergent control flow"),
    cl::location(SkipThreshold), cl::init(12));
34+
35+
namespace {

/// Machine pass that deletes s_cbranch_execz skip branches when the skipped
/// region is short and free of side effects, letting the wavefront simply
/// fall through with EXEC = 0.
class SIRemoveShortExecBranches : public MachineFunctionPass {
private:
  const SIInstrInfo *TII = nullptr;

  // Wrapper around analyzeBranch: fills in the taken (TrueMBB) and
  // fallthrough (FalseMBB) destinations of SrcMBB. Returns false if the
  // block's terminators cannot be analyzed.
  bool getBlockDestinations(MachineBasicBlock &SrcMBB,
                            MachineBasicBlock *&TrueMBB,
                            MachineBasicBlock *&FalseMBB,
                            SmallVectorImpl<MachineOperand> &Cond);

  // Returns true if the code between From and To must keep its execz skip
  // (side effects, VCC branches, expensive memory ops, or too many
  // instructions).
  bool mustRetainExeczBranch(const MachineBasicBlock &From,
                             const MachineBasicBlock &To) const;

  // Attempts to erase the execz branch MI at the end of SrcMBB; returns true
  // on removal.
  bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);

public:
  static char ID;

  SIRemoveShortExecBranches() : MachineFunctionPass(ID) {
    initializeSIRemoveShortExecBranchesPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // End anonymous namespace.
59+
60+
// Register the pass with the LLVM pass registry under DEBUG_TYPE
// ("si-remove-short-exec-branches").
INITIALIZE_PASS(SIRemoveShortExecBranches, DEBUG_TYPE,
                "SI remove short exec branches", false, false)

char SIRemoveShortExecBranches::ID = 0;

// External handle used by AMDGPUTargetMachine to schedule this pass.
char &llvm::SIRemoveShortExecBranchesID = SIRemoveShortExecBranches::ID;
66+
67+
/// Resolve the taken (TrueMBB) and fallthrough (FalseMBB) successors of
/// \p SrcMBB via the target's branch analysis.
/// \returns false when the block's terminators cannot be analyzed.
bool SIRemoveShortExecBranches::getBlockDestinations(
    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
  // analyzeBranch follows the TargetInstrInfo convention of returning true
  // on failure.
  bool CannotAnalyze = TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond);
  if (CannotAnalyze)
    return false;

  // A null false-destination means the block simply falls through to the
  // next block in layout order.
  if (FalseMBB == nullptr)
    FalseMBB = SrcMBB.getNextNode();

  return true;
}
78+
79+
bool SIRemoveShortExecBranches::mustRetainExeczBranch(
80+
const MachineBasicBlock &From, const MachineBasicBlock &To) const {
81+
unsigned NumInstr = 0;
82+
const MachineFunction *MF = From.getParent();
83+
84+
for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
85+
MBBI != End && MBBI != ToI; ++MBBI) {
86+
const MachineBasicBlock &MBB = *MBBI;
87+
88+
for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
89+
I != E; ++I) {
90+
// When a uniform loop is inside non-uniform control flow, the branch
91+
// leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
92+
// when EXEC = 0. We should skip the loop lest it becomes infinite.
93+
if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
94+
I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
95+
return true;
96+
97+
if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
98+
return true;
99+
100+
// These instructions are potentially expensive even if EXEC = 0.
101+
if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
102+
I->getOpcode() == AMDGPU::S_WAITCNT)
103+
return true;
104+
105+
++NumInstr;
106+
if (NumInstr >= SkipThreshold)
107+
return true;
108+
}
109+
}
110+
111+
return false;
112+
}
113+
114+
// Returns true if the skip branch instruction is removed.
115+
bool SIRemoveShortExecBranches::removeExeczBranch(MachineInstr &MI,
116+
MachineBasicBlock &SrcMBB) {
117+
MachineBasicBlock *TrueMBB = nullptr;
118+
MachineBasicBlock *FalseMBB = nullptr;
119+
SmallVector<MachineOperand, 1> Cond;
120+
121+
if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
122+
return false;
123+
124+
// Consider only the forward branches.
125+
if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
126+
mustRetainExeczBranch(*FalseMBB, *TrueMBB))
127+
return false;
128+
129+
LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
130+
MI.eraseFromParent();
131+
SrcMBB.removeSuccessor(TrueMBB);
132+
133+
return true;
134+
}
135+
136+
/// Pass entry point: scan every block's terminator and try to delete
/// removable s_cbranch_execz skips.
/// \returns true if any instruction was removed.
bool SIRemoveShortExecBranches::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  // Block numbers must match layout order for the forward-branch check in
  // removeExeczBranch.
  MF.RenumberBlocks();
  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
    if (MBBI == MBB.end())
      continue;

    MachineInstr &MI = *MBBI;
    switch (MI.getOpcode()) {
    case AMDGPU::S_CBRANCH_EXECZ:
      // Accumulate with |=; plain assignment would let a later kept branch
      // erase the record of an earlier successful removal, making the pass
      // wrongly report the function as unmodified.
      Changed |= removeExeczBranch(MI, MBB);
      break;
    default:
      break;
    }
  }

  return Changed;
}

llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,8 @@ define i32 @divergent_if_swap_brtarget_order0(i32 %value) {
1010
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
1111
; CHECK-NEXT: ; implicit-def: $vgpr0
1212
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
13-
; CHECK-NEXT: ; mask branch BB0_2
1413
; CHECK-NEXT: s_cbranch_execz BB0_2
15-
; CHECK-NEXT: BB0_1: ; %if.true
14+
; CHECK-NEXT: ; %bb.1: ; %if.true
1615
; CHECK-NEXT: global_load_dword v0, v[0:1], off
1716
; CHECK-NEXT: BB0_2: ; %endif
1817
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -38,12 +37,10 @@ define i32 @divergent_if_swap_brtarget_order1(i32 %value) {
3837
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3938
; CHECK-NEXT: ; implicit-def: $vgpr0
4039
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
41-
; CHECK-NEXT: ; mask branch BB1_2
42-
; CHECK-NEXT: BB1_1: ; %endif
43-
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
44-
; CHECK-NEXT: s_setpc_b64 s[30:31]
45-
; CHECK-NEXT: BB1_2: ; %if.true
40+
; CHECK-NEXT: s_cbranch_execnz BB1_2
41+
; CHECK-NEXT: ; %bb.1: ; %if.true
4642
; CHECK-NEXT: global_load_dword v0, v[0:1], off
43+
; CHECK-NEXT: BB1_2: ; %endif
4744
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
4845
; CHECK-NEXT: s_waitcnt vmcnt(0)
4946
; CHECK-NEXT: s_setpc_b64 s[30:31]

0 commit comments

Comments
 (0)