@@ -1511,6 +1511,25 @@ bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
   constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
 }
 
+/// Utility function for pushing dynamic vector indexes with a constant offset
+/// into waterfall loops.
+static void reinsertVectorIndexAdd(MachineIRBuilder &B,
+                                   MachineInstr &IdxUseInstr,
+                                   unsigned OpIdx,
+                                   unsigned ConstOffset) {
+  MachineRegisterInfo &MRI = *B.getMRI();
+  const LLT S32 = LLT::scalar(32);
+  Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
+  B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
+
+  auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
+
+  auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
+  MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
+  MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
+  IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
+}
+
 void AMDGPURegisterBankInfo::applyMappingImpl(
     const OperandsMapper &OpdMapper) const {
   MachineInstr &MI = OpdMapper.getMI();
@@ -2011,20 +2030,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
                                    ConstOffset > 0 &&
                                    ConstOffset < SrcTy.getNumElements();
 
-    // Re-insert the constant offset add inside the waterfall loop.
-    auto ReinsertIndexAdd = [=, &B, &MRI](MachineInstr &IdxUseInstr,
-                                          unsigned OpIdx) {
-      Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
-      B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
-
-      auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
-
-      auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
-      MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
-      MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
-      IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
-    };
-
     // Move the base register. We'll re-insert the add later.
     if (ShouldMoveIndexIntoLoop)
       MI.getOperand(2).setReg(BaseIdxReg);
@@ -2051,8 +2056,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
       buildVCopy(B, DstReg, TmpReg);
     }
 
+    // Re-insert the constant offset add inside the waterfall loop.
     if (ShouldMoveIndexIntoLoop)
-      ReinsertIndexAdd(MI, 2);
+      reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
 
     return;
   }
@@ -2113,7 +2119,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     }
 
     if (ShouldMoveIndexIntoLoop)
-      ReinsertIndexAdd(*IdxLo, 1);
+      reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
 
     return;
   }
@@ -2126,26 +2132,53 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     assert(OpdMapper.getVRegs(0).empty());
     assert(OpdMapper.getVRegs(3).empty());
 
+    const RegisterBank *IdxBank =
+        OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
+
     if (substituteSimpleCopyRegs(OpdMapper, 1))
       MRI.setType(MI.getOperand(1).getReg(), VecTy);
 
+    Register SrcReg = MI.getOperand(1).getReg();
+    Register InsReg = MI.getOperand(2).getReg();
+    LLT InsTy = MRI.getType(InsReg);
+    (void)InsTy;
+
+    Register BaseIdxReg;
+    unsigned ConstOffset;
+    MachineInstr *OffsetDef;
+    std::tie(BaseIdxReg, ConstOffset, OffsetDef) =
+        AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
+
+    // See if the index is an add of a constant which will be foldable by moving
+    // the base register of the index later if this is going to be executed in a
+    // waterfall loop. This is essentially to reassociate the add of a constant
+    // with the readfirstlane.
+    bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
+                                   ConstOffset > 0 &&
+                                   ConstOffset < VecTy.getNumElements();
+
+    // Move the base register. We'll re-insert the add later.
+    if (ShouldMoveIndexIntoLoop)
+      MI.getOperand(3).setReg(BaseIdxReg);
+
+
     if (InsRegs.empty()) {
-      applyDefaultMapping(OpdMapper);
       executeInWaterfallLoop(MI, MRI, { 3 });
+
+      // Re-insert the constant offset add inside the waterfall loop.
+      if (ShouldMoveIndexIntoLoop) {
+        MachineIRBuilder B(MI);
+        reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
+      }
+
       return;
     }
 
-    Register SrcReg = MI.getOperand(1).getReg();
-    Register InsReg = MI.getOperand(2).getReg();
-    Register IdxReg = MI.getOperand(3).getReg();
-    LLT SrcTy = MRI.getType(SrcReg);
-    LLT InsTy = MRI.getType(InsReg);
-    (void)InsTy;
 
     assert(InsTy.getSizeInBits() == 64);
 
     const LLT S32 = LLT::scalar(32);
-    LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
+    LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32);
 
     MachineIRBuilder B(MI);
     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
@@ -2158,7 +2191,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
 
     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
-    auto IdxLo = B.buildShl(S32, IdxReg, One);
+    auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
     auto IdxHi = B.buildAdd(S32, IdxLo, One);
 
     auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
@@ -2192,6 +2225,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
 
     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                            OpsToWaterfall, MRI);
+
+    // Re-insert the constant offset add inside the waterfall loop.
+    if (ShouldMoveIndexIntoLoop)
+      reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
+
     return;
   }
   case AMDGPU::G_INTRINSIC: {
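
For context on the refactor: the deleted ReinsertIndexAdd lambda captured B, MRI, S32, and ConstOffset, so only the G_EXTRACT_VECTOR_ELT case could call it; the new static reinsertVectorIndexAdd takes that state as explicit parameters, letting the G_INSERT_VECTOR_ELT paths reuse it. A minimal self-contained sketch of that lambda-to-helper pattern, with toy types rather than the LLVM API:

#include <cstdio>

// Toy stand-in for the builder state the lambda captured by reference.
struct Builder {
  unsigned add(unsigned A, unsigned B) { return A + B; }
};

// Before: a local lambda, usable only inside the case that defines it.
static unsigned beforeRefactor(Builder &B, unsigned Idx) {
  unsigned ConstOffset = 4; // captured implicitly by the lambda
  auto ReinsertIndexAdd = [&](unsigned I) { return B.add(I, ConstOffset); };
  return ReinsertIndexAdd(Idx);
}

// After: the captures become parameters, so any caller with a Builder and
// an offset in scope can reuse the helper.
static unsigned reinsertIndexAdd(Builder &B, unsigned Idx,
                                 unsigned ConstOffset) {
  return B.add(Idx, ConstOffset);
}

int main() {
  Builder B;
  // Both forms compute the same result; only the scoping differs.
  std::printf("%u %u\n", beforeRefactor(B, 3), reinsertIndexAdd(B, 3, 4));
  return 0;
}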
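As for why the add is re-inserted inside the waterfall loop at all: for a divergent index of the form Base + C, readfirstlane(Base) + C equals readfirstlane(Base + C) for the lanes each loop iteration handles, so the patch strips the constant before building the loop and re-adds it next to the readfirstlane. A toy model of that reassociation in plain C++ (lanes as a vector, no real GPU semantics):

#include <cstddef>
#include <cstdio>
#include <vector>

// Toy readfirstlane: return the value of the first still-active lane.
static unsigned readFirstLane(const std::vector<unsigned> &Lanes,
                              const std::vector<bool> &Active) {
  for (std::size_t I = 0; I < Lanes.size(); ++I)
    if (Active[I])
      return Lanes[I];
  return 0;
}

int main() {
  const unsigned C = 1; // the folded constant offset
  std::vector<unsigned> Base = {2, 2, 5, 7}; // divergent base indexes
  std::vector<bool> Active(Base.size(), true);

  auto AnyActive = [&] {
    for (bool A : Active)
      if (A)
        return true;
    return false;
  };

  // Waterfall loop: scalarize one index per iteration, retire matching lanes.
  while (AnyActive()) {
    // readfirstlane(Base) + C instead of readfirstlane(Base + C):
    // the same scalar, but the add now sits inside the loop.
    unsigned Scalar = readFirstLane(Base, Active) + C;
    for (std::size_t I = 0; I < Base.size(); ++I)
      if (Active[I] && Base[I] + C == Scalar)
        Active[I] = false;
    std::printf("iteration handles index %u\n", Scalar);
  }
  return 0;
}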