Skip to content

Commit 4a331be

Browse files
committed
[AMDGPU] Fix vccz after v_readlane/v_readfirstlane to vcc_lo/hi
Summary:
Up to gfx9, writes to vcc_lo and vcc_hi by instructions like v_readlane and v_readfirstlane do not update vccz to reflect the new value of vcc. Fix it by reusing part of the existing vccz bug handling code, which inserts an "s_mov_b64 vcc, vcc" instruction to restore vccz just before an instruction that needs the correct value.

Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D69661
1 parent 00efeae commit 4a331be

File tree

3 files changed

+118
-7
lines changed

3 files changed

+118
-7
lines changed

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -587,6 +587,11 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
587587
return getGeneration() <= SEA_ISLANDS;
588588
}
589589

590+
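/// \returns true if writes to VCC_LO/VCC_HI update the VCCZ flag. This holds
/// from GFX10 onwards; up to GFX9 only a write to the full VCC updates VCCZ.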
591+
bool partialVCCWritesUpdateVCCZ() const {
592+
return getGeneration() >= GFX10;
593+
}
594+
590595
/// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
591596
/// was written by a VALU instruction.
592597
bool hasSMRDReadVALUDefHazard() const {

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1383,6 +1383,10 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
13831383
ScoreBrackets.dump();
13841384
});
13851385

1386+
// Assume VCCZ is correct at basic block boundaries, unless and until we need
1387+
// to handle cases where that is not true.
1388+
bool VCCZCorrect = true;
1389+
13861390
// Walk over the instructions.
13871391
MachineInstr *OldWaitcntInstr = nullptr;
13881392

@@ -1402,13 +1406,26 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
14021406
continue;
14031407
}
14041408

1405-
bool VCCZBugWorkAround = false;
1409+
// We might need to restore vccz to its correct value for either of two
1410+
// different reasons; see ST->hasReadVCCZBug() and
1411+
// ST->partialVCCWritesUpdateVCCZ().
1412+
bool RestoreVCCZ = false;
14061413
if (readsVCCZ(Inst)) {
1407-
if (ScoreBrackets.getScoreLB(LGKM_CNT) <
1408-
ScoreBrackets.getScoreUB(LGKM_CNT) &&
1409-
ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1410-
if (ST->hasReadVCCZBug())
1411-
VCCZBugWorkAround = true;
1414+
if (!VCCZCorrect)
1415+
RestoreVCCZ = true;
1416+
else if (ST->hasReadVCCZBug()) {
1417+
// There is a hardware bug on CI/SI where an SMRD instruction may corrupt
1418+
// the vccz bit, so when we detect that an instruction may read from a
1419+
// corrupt vccz bit, we need to:
1420+
// 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
1421+
// operations to complete.
1422+
// 2. Restore the correct value of vccz by writing the current value
1423+
// of vcc back to vcc.
1424+
if (ScoreBrackets.getScoreLB(LGKM_CNT) <
1425+
ScoreBrackets.getScoreUB(LGKM_CNT) &&
1426+
ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1427+
RestoreVCCZ = true;
1428+
}
14121429
}
14131430
}
14141431

@@ -1419,6 +1436,16 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
14191436
}
14201437
}
14211438

1439+
if (!ST->partialVCCWritesUpdateVCCZ()) {
1440+
// Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
1441+
// Writes to vcc will fix it.
1442+
if (Inst.definesRegister(AMDGPU::VCC_LO) ||
1443+
Inst.definesRegister(AMDGPU::VCC_HI))
1444+
VCCZCorrect = false;
1445+
else if (Inst.definesRegister(AMDGPU::VCC))
1446+
VCCZCorrect = true;
1447+
}
1448+
14221449
// Generate an s_waitcnt instruction to be placed before
14231450
// cur_Inst, if needed.
14241451
Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
@@ -1444,14 +1471,15 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
14441471

14451472
// TODO: Remove this work-around after fixing the scheduler and enable the
14461473
// assert above.
1447-
if (VCCZBugWorkAround) {
1474+
if (RestoreVCCZ) {
14481475
// Restore the vccz bit. Any time a value is written to vcc, the vccz
14491476
// bit is updated, so we can restore the bit by reading the value of
14501477
// vcc and then writing it back to the register.
14511478
BuildMI(Block, Inst, Inst.getDebugLoc(),
14521479
TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
14531480
TRI->getVCC())
14541481
.addReg(TRI->getVCC());
1482+
VCCZCorrect = true;
14551483
Modified = true;
14561484
}
14571485

llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,3 +85,81 @@ body: |
8585
S_ENDPGM 0
8686
8787
...
88+
---
89+
# Test that after reloading vcc spilled to a vgpr, we insert any necessary
90+
# instructions to fix vccz.
91+
92+
# CHECK-LABEL: name: reload_vcc_from_vgpr
93+
# CHECK: $vcc_lo = V_READLANE_B32_vi $vgpr0, 8, implicit-def $vcc
94+
# CHECK: $vcc_hi = V_READLANE_B32_vi $vgpr0, 9
95+
# SI: $vcc = S_MOV_B64 $vcc
96+
# GFX9: $vcc = S_MOV_B64 $vcc
97+
# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
98+
99+
name: reload_vcc_from_vgpr
100+
body: |
101+
bb.0:
102+
$vcc_lo = V_READLANE_B32_vi $vgpr0, 8, implicit-def $vcc
103+
$vcc_hi = V_READLANE_B32_vi $vgpr0, 9
104+
S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
105+
bb.1:
106+
107+
...
108+
---
109+
# Test that after reloading vcc spilled to memory, we insert any necessary
110+
# instructions to fix vccz.
111+
112+
# CHECK-LABEL: name: reload_vcc_from_mem
113+
# CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec
114+
# CHECK: $vcc_lo = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
115+
# CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec
116+
# CHECK: $vcc_hi = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
117+
# SI: $vcc = S_MOV_B64 $vcc
118+
# GFX9: $vcc = S_MOV_B64 $vcc
119+
# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
120+
121+
name: reload_vcc_from_mem
122+
body: |
123+
bb.0:
124+
$vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec
125+
$vcc_lo = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
126+
$vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec
127+
$vcc_hi = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
128+
S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
129+
bb.1:
130+
131+
...
132+
---
133+
# Test that after inline asm that defines vcc_lo, we insert any necessary
134+
# instructions to fix vccz.
135+
136+
# CHECK-LABEL: name: inlineasm_def_vcc_lo
137+
# CHECK: INLINEASM &"; def vcc_lo", 1, 10, implicit-def $vcc_lo
138+
# SI: $vcc = S_MOV_B64 $vcc
139+
# GFX9: $vcc = S_MOV_B64 $vcc
140+
# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
141+
142+
name: inlineasm_def_vcc_lo
143+
body: |
144+
bb.0:
145+
INLINEASM &"; def vcc_lo", 1, 10, implicit-def $vcc_lo
146+
S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
147+
bb.1:
148+
149+
...
150+
---
151+
# Test that after inline asm that defines vcc, no unnecessary instructions are
152+
# inserted to fix vccz.
153+
154+
# CHECK-LABEL: name: inlineasm_def_vcc
155+
# CHECK: INLINEASM &"; def vcc", 1, 10, implicit-def $vcc
156+
# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
157+
158+
name: inlineasm_def_vcc
159+
body: |
160+
bb.0:
161+
INLINEASM &"; def vcc", 1, 10, implicit-def $vcc
162+
S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
163+
bb.1:
164+
165+
...

0 commit comments

Comments
 (0)