Commit 52ec737
AMDGPU/GlobalISel: Fold add of constant into G_INSERT_VECTOR_ELT
Move the subregister base like in the extract case.
1 parent 349f6bb commit 52ec737
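
The fold rests on a simple reassociation: for a divergent index of the form base + K (K constant, hence uniform across lanes), readfirstlane(base + K) == readfirstlane(base) + K. The waterfall loop can therefore scalarize only base and re-add K with a cheap scalar add inside the loop, leaving the constant free to fold elsewhere. A minimal stand-alone sketch of that equivalence, with readFirstLane as a hypothetical stand-in for v_readfirstlane_b32 (not an LLVM API):

    // Sketch only: a divergent register modeled as per-lane values.
    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Stand-in for v_readfirstlane_b32: take lane 0 of a divergent value.
    static uint32_t readFirstLane(const std::vector<uint32_t> &Lanes) {
      return Lanes.front();
    }

    int main() {
      const uint32_t K = 2;                       // constant part of the index
      std::vector<uint32_t> Base = {5, 7, 9, 11}; // per-lane dynamic base

      // Without the fold: scalarize (base + K) directly.
      std::vector<uint32_t> BasePlusK = Base;
      for (uint32_t &Lane : BasePlusK)
        Lane += K;

      // With the fold: scalarize base alone, then re-add the uniform K.
      // Both agree because K is identical in every lane.
      assert(readFirstLane(BasePlusK) == readFirstLane(Base) + K);
      return 0;
    }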

File tree

4 files changed, +260 -267 lines changed
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 4 additions & 1 deletion
@@ -1930,12 +1930,15 @@ bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
   if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
     return false;
 
+  unsigned SubReg;
+  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
+                                                     ValSize / 8);
+
   const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
                          STI.useVGPRIndexMode();
 
   MachineBasicBlock *BB = MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
-  unsigned SubReg = ValSize == 64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
 
   if (IndexMode) {
     BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
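
This hunk replaces the hard-coded starting subregister (the deleted sub0_sub1/sub0 line) with computeIndirectRegIndex, which can also fold a constant offset in the index into the chosen subregister, mirroring what the extract path already does. The address math behind that fold, as a hedged stand-alone check (the arithmetic is illustrative, not the LLVM helper):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t EltBytes = 4; // 32-bit elements, i.e. ValSize / 8
      uint32_t Base = 3;           // dynamic part, stays in the index register
      uint32_t C = 1;              // constant part, folded into the subregister

      // Indexing the full register at (Base + C) reaches the same bytes as
      // indexing the subregister that starts C elements in at just Base.
      uint32_t FullRegOffset = (Base + C) * EltBytes;
      uint32_t SubRegStart = C * EltBytes;
      assert(FullRegOffset == SubRegStart + Base * EltBytes);
      return 0;
    }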

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 63 additions & 25 deletions
@@ -1511,6 +1511,25 @@ bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
   constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
 }
 
+/// Utility function for pushing dynamic vector indexes with a constant offset
+/// into waterfall loops.
+static void reinsertVectorIndexAdd(MachineIRBuilder &B,
+                                   MachineInstr &IdxUseInstr,
+                                   unsigned OpIdx,
+                                   unsigned ConstOffset) {
+  MachineRegisterInfo &MRI = *B.getMRI();
+  const LLT S32 = LLT::scalar(32);
+  Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
+  B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
+
+  auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
+
+  auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
+  MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
+  MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
+  IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
+}
+
 void AMDGPURegisterBankInfo::applyMappingImpl(
     const OperandsMapper &OpdMapper) const {
   MachineInstr &MI = OpdMapper.getMI();
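
The helper above is the extract path's ReinsertIndexAdd lambda hoisted to file scope so the G_INSERT_VECTOR_ELT path can share it. Its effect, modeled on plain values (Instr and rewriteIndexOperand are stand-ins, not LLVM types): build (%old + ConstOffset) immediately before the user, so the add executes inside the waterfall loop, and repoint the operand at the sum.

    #include <cassert>
    #include <cstdint>
    #include <list>
    #include <vector>

    struct Instr {
      std::vector<uint32_t> Ops; // operand "registers", modeled as values
    };

    // Mirrors reinsertVectorIndexAdd: insert the add right before the user
    // (B.setInsertPt + buildAdd) and make operand OpIdx read it (setReg).
    static void rewriteIndexOperand(std::list<Instr> &Block,
                                    std::list<Instr>::iterator Use,
                                    unsigned OpIdx, uint32_t ConstOffset) {
      uint32_t Old = Use->Ops[OpIdx];
      auto Add = Block.insert(Use, Instr{{Old + ConstOffset}});
      Use->Ops[OpIdx] = Add->Ops[0];
    }

    int main() {
      std::list<Instr> Block{Instr{{7}}}; // one user, reading index 7
      rewriteIndexOperand(Block, Block.begin(), 0, 2);
      assert(Block.size() == 2 && Block.back().Ops[0] == 9);
      return 0;
    }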
@@ -2011,20 +2030,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
       ConstOffset > 0 &&
       ConstOffset < SrcTy.getNumElements();
 
-    // Re-insert the constant offset add inside the waterfall loop.
-    auto ReinsertIndexAdd = [=, &B, &MRI](MachineInstr &IdxUseInstr,
-                                          unsigned OpIdx) {
-      Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
-      B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
-
-      auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
-
-      auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
-      MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
-      MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
-      IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
-    };
-
     // Move the base register. We'll re-insert the add later.
     if (ShouldMoveIndexIntoLoop)
       MI.getOperand(2).setReg(BaseIdxReg);
@@ -2051,8 +2056,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
       buildVCopy(B, DstReg, TmpReg);
     }
 
+    // Re-insert the constant offset add inside the waterfall loop.
     if (ShouldMoveIndexIntoLoop)
-      ReinsertIndexAdd(MI, 2);
+      reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
 
     return;
   }
@@ -2113,7 +2119,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     }
 
     if (ShouldMoveIndexIntoLoop)
-      ReinsertIndexAdd(*IdxLo, 1);
+      reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
 
     return;
   }
@@ -2126,26 +2132,53 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     assert(OpdMapper.getVRegs(0).empty());
     assert(OpdMapper.getVRegs(3).empty());
 
+    const RegisterBank *IdxBank =
+      OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
+
     if (substituteSimpleCopyRegs(OpdMapper, 1))
       MRI.setType(MI.getOperand(1).getReg(), VecTy);
 
+    Register SrcReg = MI.getOperand(1).getReg();
+    Register InsReg = MI.getOperand(2).getReg();
+    LLT InsTy = MRI.getType(InsReg);
+    (void)InsTy;
+
+    Register BaseIdxReg;
+    unsigned ConstOffset;
+    MachineInstr *OffsetDef;
+    std::tie(BaseIdxReg, ConstOffset, OffsetDef) =
+      AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
+
+    // See if the index is an add of a constant which will be foldable by moving
+    // the base register of the index later if this is going to be executed in a
+    // waterfall loop. This is essentially to reassociate the add of a constant
+    // with the readfirstlane.
+    bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
+      ConstOffset > 0 &&
+      ConstOffset < VecTy.getNumElements();
+
+    // Move the base register. We'll re-insert the add later.
+    if (ShouldMoveIndexIntoLoop)
+      MI.getOperand(3).setReg(BaseIdxReg);
+
+
     if (InsRegs.empty()) {
-      applyDefaultMapping(OpdMapper);
       executeInWaterfallLoop(MI, MRI, { 3 });
+
+      // Re-insert the constant offset add inside the waterfall loop.
+      if (ShouldMoveIndexIntoLoop) {
+        MachineIRBuilder B(MI);
+        reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
+      }
+
       return;
     }
 
-    Register SrcReg = MI.getOperand(1).getReg();
-    Register InsReg = MI.getOperand(2).getReg();
-    Register IdxReg = MI.getOperand(3).getReg();
-    LLT SrcTy = MRI.getType(SrcReg);
-    LLT InsTy = MRI.getType(InsReg);
-    (void)InsTy;
 
     assert(InsTy.getSizeInBits() == 64);
 
     const LLT S32 = LLT::scalar(32);
-    LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
+    LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32);
 
     MachineIRBuilder B(MI);
     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
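
The ShouldMoveIndexIntoLoop guard in this hunk only fires when the fold can pay off: the index must not already be in the SGPR bank (a uniform index needs no waterfall loop), and the constant must be a positive, in-range element offset. Restated as a self-contained predicate, a sketch rather than the LLVM code:

    #include <cassert>
    #include <cstdint>

    // Fold only for a divergent index whose constant part is a positive
    // offset smaller than the vector's element count.
    static bool shouldMoveIndexIntoLoop(bool IdxIsSGPRBank, int64_t ConstOffset,
                                        uint64_t NumElts) {
      return !IdxIsSGPRBank && ConstOffset > 0 &&
             static_cast<uint64_t>(ConstOffset) < NumElts;
    }

    int main() {
      assert(shouldMoveIndexIntoLoop(false, 1, 4));  // divergent, in range
      assert(!shouldMoveIndexIntoLoop(true, 1, 4));  // uniform: no loop needed
      assert(!shouldMoveIndexIntoLoop(false, 0, 4)); // nothing to fold
      return 0;
    }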
@@ -2158,7 +2191,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
 
     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
-    auto IdxLo = B.buildShl(S32, IdxReg, One);
+    auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
     auto IdxHi = B.buildAdd(S32, IdxLo, One);
 
     auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
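
The shift/add pair now keys off BaseIdxReg, so the folded constant is restored by the re-inserted add rather than baked into the lane split. The lane math itself is unchanged: after the bitcast to Vec32, a 64-bit element at index i occupies 32-bit lanes 2*i and 2*i + 1. A quick check:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t Idx = 0; Idx < 8; ++Idx) {
        uint32_t IdxLo = Idx << 1;  // B.buildShl(S32, BaseIdxReg, One)
        uint32_t IdxHi = IdxLo + 1; // B.buildAdd(S32, IdxLo, One)
        assert(IdxLo == 2 * Idx && IdxHi == 2 * Idx + 1);
      }
      return 0;
    }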
@@ -2192,6 +2225,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
 
     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                            OpsToWaterfall, MRI);
+
+    // Re-insert the constant offset add inside the waterfall loop.
+    if (ShouldMoveIndexIntoLoop)
+      reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
+
     return;
   }
   case AMDGPU::G_INTRINSIC: {
