diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 3979e1e0c44aa..a116b57c85a88 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -173,6 +173,7 @@ struct FoldCandidate {
 
 class SIFoldOperandsImpl {
 public:
+  MachineFunction *MF;
   MachineRegisterInfo *MRI;
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
@@ -705,6 +706,36 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
   }
 
   MachineOperand *New = Fold.Def.OpToFold;
+
+  // Verify the register is compatible with the operand.
+  if (const TargetRegisterClass *OpRC =
+          TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI, *MF)) {
+    const TargetRegisterClass *OldRC = MRI->getRegClass(Old.getReg());
+    const TargetRegisterClass *NewRC = MRI->getRegClass(New->getReg());
+    unsigned NewSubReg = New->getSubReg();
+    unsigned OldSubReg = Old.getSubReg();
+
+    const TargetRegisterClass *ConstrainRC = OpRC;
+    if (NewSubReg && OldSubReg) {
+      unsigned PreA, PreB;
+      ConstrainRC = TRI->getCommonSuperRegClass(OpRC, OldSubReg, NewRC,
+                                                NewSubReg, PreA, PreB);
+    } else if (OldSubReg) {
+      ConstrainRC = TRI->getMatchingSuperRegClass(OldRC, OpRC, OldSubReg);
+    } else if (NewSubReg) {
+      ConstrainRC = TRI->getMatchingSuperRegClass(NewRC, OpRC, NewSubReg);
+    }
+
+    if (!ConstrainRC)
+      return false;
+
+    if (!MRI->constrainRegClass(New->getReg(), ConstrainRC)) {
+      LLVM_DEBUG(dbgs() << "Cannot constrain " << printReg(New->getReg(), TRI)
+                        << TRI->getRegClassName(ConstrainRC) << '\n');
+      return false;
+    }
+  }
+
   // Rework once the VS_16 register class is updated to include proper
   // 16-bit SGPRs instead of 32-bit ones.
   if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
@@ -1429,30 +1460,9 @@ void SIFoldOperandsImpl::foldOperand(
       return;
     }
 
-    if (!FoldingImmLike) {
-      if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
-        // Don't fold if OpToFold doesn't hold an aligned register.
-        const TargetRegisterClass *RC =
-            TRI->getRegClassForReg(*MRI, OpToFold.getReg());
-        assert(RC);
-        if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
-          unsigned SubReg = OpToFold.getSubReg();
-          if (const TargetRegisterClass *SubRC =
-                  TRI->getSubRegisterClass(RC, SubReg))
-            RC = SubRC;
-        }
-
-        if (!RC || !TRI->isProperlyAlignedRC(*RC))
-          return;
-      }
-
-      tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
-
-      // FIXME: We could try to change the instruction from 64-bit to 32-bit
-      // to enable more folding opportunities. The shrink operands pass
-      // already does this.
-      return;
-    }
+    // FIXME: We could try to change the instruction from 64-bit to 32-bit
+    // to enable more folding opportunities. The shrink operands pass
+    // already does this.
     tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
   }
 
@@ -2747,6 +2757,7 @@ bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
 }
 
 bool SIFoldOperandsImpl::run(MachineFunction &MF) {
+  this->MF = &MF;
   MRI = &MF.getRegInfo();
   ST = &MF.getSubtarget<GCNSubtarget>();
   TII = ST->getInstrInfo();
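The class selection added to updateOperand() above reduces to a small decision tree over the two subregister indices. A minimal standalone sketch of that logic (the free function and its parameter names are assumed here for illustration, not part of the patch):

#include "llvm/CodeGen/TargetRegisterInfo.h"
using namespace llvm;

// Pick the register class the folded-in (New) register must satisfy.
// Returns null when no class can reconcile the operand's constraint with
// the subregister indices in play, in which case the fold is rejected.
static const TargetRegisterClass *
pickConstraintRC(const TargetRegisterInfo &TRI, const TargetRegisterClass *OpRC,
                 const TargetRegisterClass *OldRC, unsigned OldSubReg,
                 const TargetRegisterClass *NewRC, unsigned NewSubReg) {
  if (NewSubReg && OldSubReg) {
    // Both operands carry subregister indices: find a class whose OldSubReg
    // and NewSubReg projections are simultaneously compatible.
    unsigned PreA, PreB;
    return TRI.getCommonSuperRegClass(OpRC, OldSubReg, NewRC, NewSubReg, PreA,
                                      PreB);
  }
  if (OldSubReg) // only the replaced operand used a subregister
    return TRI.getMatchingSuperRegClass(OldRC, OpRC, OldSubReg);
  if (NewSubReg) // only the folded-in value uses a subregister
    return TRI.getMatchingSuperRegClass(NewRC, OpRC, NewSubReg);
  return OpRC; // no subregisters: the operand's own class is the constraint
}

Because updateOperand() then constrains New's register to the resulting class and bails out on failure, folds with no legal common class are rejected up front. That is what allows the ad-hoc ST->needsAlignedVGPRs() check to be deleted from foldOperand() in the hunk above.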
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir
index a0ea04b1b9c0f..8326862706a02 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir
@@ -31,9 +31,8 @@ body:             |
     ; GFX90A: liveins: $vgpr0_vgpr1
     ; GFX90A-NEXT: {{  $}}
     ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
-    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_64 = IMPLICIT_DEF
-    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY killed [[DEF]]
-    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_64_align2 = IMPLICIT_DEF
+    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]], 0, 0, implicit $exec
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %1:areg_64 = IMPLICIT_DEF
     %2:areg_64_align2 = COPY killed %1
@@ -105,9 +104,8 @@ body:             |
     ; GFX90A: liveins: $vgpr0_vgpr1
     ; GFX90A-NEXT: {{  $}}
     ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
-    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_96 = IMPLICIT_DEF
-    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY killed [[DEF]]
-    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_96_align2 = IMPLICIT_DEF
+    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[DEF]], 0, 0, implicit $exec
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %1:areg_96 = IMPLICIT_DEF
     %2:areg_96_align2 = COPY killed %1
@@ -234,9 +232,8 @@ body:             |
     ; GFX90A: liveins: $vgpr0_vgpr1
     ; GFX90A-NEXT: {{  $}}
     ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
-    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_128 = IMPLICIT_DEF
-    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY killed [[DEF]]
-    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[DEF]], 0, 0, implicit $exec
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %1:areg_128 = IMPLICIT_DEF
     %2:areg_128_align2 = COPY killed %1
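The updated AGPR check lines fall out of the constraint step rather than any test-specific handling: constraining a virtual register only ever narrows its class, and areg_64_align2 (likewise the 96- and 128-bit variants) is a subclass of areg_64, so the store can legally be rewritten to read the IMPLICIT_DEF result directly, leaving the intermediate COPY dead. Roughly, for the first test (DefReg and StoreDataMO are assumed names, not code from the patch):

// %1:areg_64 = IMPLICIT_DEF feeds the store through a COPY; on gfx90a the
// store's data operand requires an even-aligned AGPR pair, so narrow %1.
if (MRI->constrainRegClass(DefReg, &AMDGPU::AReg_64_Align2RegClass))
  StoreDataMO.setReg(DefReg); // GLOBAL_STORE_DWORDX2 now uses %1; COPY is dead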
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
index a54c0accce783..9dd025a3da086 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
@@ -46,9 +46,8 @@ body:             |
     ; GFX90A: liveins: $vgpr0_vgpr1
     ; GFX90A-NEXT: {{  $}}
     ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
-    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[DEF]]
-    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]], 0, 0, implicit $exec
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %1:vreg_64 = IMPLICIT_DEF
     %2:vreg_64_align2 = COPY killed %1
@@ -148,9 +147,8 @@ body:             |
     ; GFX90A: liveins: $vgpr0_vgpr1
     ; GFX90A-NEXT: {{  $}}
     ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
-    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_96 = IMPLICIT_DEF
-    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vreg_96_align2 = COPY killed [[DEF]]
-    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_96_align2 = IMPLICIT_DEF
+    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[DEF]], 0, 0, implicit $exec
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %1:vreg_96 = IMPLICIT_DEF
     %2:vreg_96_align2 = COPY killed %1
@@ -326,9 +324,8 @@ body:             |
     ; GFX90A: liveins: $vgpr0_vgpr1
     ; GFX90A-NEXT: {{  $}}
     ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
-    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
-    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY killed [[DEF]]
-    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[DEF]], 0, 0, implicit $exec
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %1:vreg_128 = IMPLICIT_DEF
     %2:vreg_128_align2 = COPY killed %1
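The VGPR file follows the same pattern with the vreg classes. For reference, the GFX90A lines above are autogenerated check lines (llvm/utils/update_mir_test_checks.py), and the RUN lines sitting above these hunks, outside the diff context, are assumed to look roughly like:

# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=si-fold-operands -verify-machineinstrs -o - %s | FileCheck --check-prefix=GFX90A %s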