diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 00c7f0eb6e9f1..3412bb5acf28c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1872,6 +1872,23 @@ static SDValue matchZExtFromI32(SDValue Op) {
   return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
 }
 
+// If this matches *_extend i32:x, return x.
+// Otherwise, if the value is already i32, return x unchanged.
+static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned,
+                                    const SelectionDAG *DAG) {
+  if (Op.getValueType() == MVT::i32)
+    return Op;
+
+  if (Op.getOpcode() != (IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND) &&
+      Op.getOpcode() != ISD::ANY_EXTEND &&
+      !(DAG->SignBitIsZero(Op) &&
+        Op.getOpcode() == (IsSigned ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND)))
+    return SDValue();
+
+  SDValue ExtSrc = Op.getOperand(0);
+  return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
+}
+
 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
 bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
                                            SDValue Addr,
@@ -2159,17 +2176,59 @@ bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
   return true;
 }
 
+// Given \p Offset and load node \p N, check whether \p Offset is a multiple
+// of the load byte size. If it is, update \p Offset to the pre-scaled value
+// and return true.
+bool AMDGPUDAGToDAGISel::SelectScaleOffset(SDNode *N, SDValue &Offset,
+                                           bool IsSigned) const {
+  bool ScaleOffset = false;
+  if (!Subtarget->hasScaleOffset() || !Offset)
+    return false;
+
+  unsigned Size =
+      (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
+
+  SDValue Off = Offset;
+  if (SDValue Ext = matchExtFromI32orI32(Offset, IsSigned, CurDAG))
+    Off = Ext;
+
+  if (isPowerOf2_32(Size) && Off.getOpcode() == ISD::SHL) {
+    if (auto *C = dyn_cast<ConstantSDNode>(Off.getOperand(1)))
+      ScaleOffset = C->getZExtValue() == Log2_32(Size);
+  } else if (Offset.getOpcode() == ISD::MUL ||
+             (IsSigned && Offset.getOpcode() == AMDGPUISD::MUL_I24) ||
+             Offset.getOpcode() == AMDGPUISD::MUL_U24 ||
+             (Offset.isMachineOpcode() &&
+              Offset.getMachineOpcode() ==
+                  (IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO
+                            : AMDGPU::S_MUL_U64_U32_PSEUDO))) {
+    if (auto *C = dyn_cast<ConstantSDNode>(Offset.getOperand(1)))
+      ScaleOffset = C->getZExtValue() == Size;
+  }
+
+  if (ScaleOffset)
+    Offset = Off.getOperand(0);
+
+  return ScaleOffset;
+}
+
 // Match an immediate (if Offset is not null) or an SGPR (if SOffset is
 // not null) offset. If Imm32Only is true, match only 32-bit immediate
 // offsets available on CI.
-bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
+bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode,
                                           SDValue *SOffset, SDValue *Offset,
                                           bool Imm32Only, bool IsBuffer,
-                                          bool HasSOffset,
-                                          int64_t ImmOffset) const {
+                                          bool HasSOffset, int64_t ImmOffset,
+                                          bool *ScaleOffset) const {
   assert((!SOffset || !Offset) &&
          "Cannot match both soffset and offset at the same time!");
 
+  if (ScaleOffset) {
+    assert(N && SOffset);
+
+    *ScaleOffset = SelectScaleOffset(N, ByteOffsetNode, false /* IsSigned */);
+  }
+
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
   if (!C) {
     if (!SOffset)
       return false;
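As a standalone sketch of the rule SelectScaleOffset implements (plain C++, all names hypothetical, independent of the LLVM classes above): the register offset qualifies for scaling exactly when it computes index * size, written either as a multiply by the byte size or, for power-of-two sizes, as a left shift by log2(size).

    #include <cassert>
    #include <cstdint>

    // Toy offset expression: either Idx << Amt or Idx * Amt.
    struct OffExpr {
      enum Kind { Shl, Mul } K;
      uint64_t Idx, Amt;
    };

    // Return true (and the pre-scaled index) when the expression computes
    // Idx * SizeBytes, so the scaled addressing mode can absorb the multiply.
    bool selectScaleOffset(const OffExpr &E, unsigned SizeBytes, uint64_t &Out) {
      bool Pow2 = SizeBytes && (SizeBytes & (SizeBytes - 1)) == 0;
      if (E.K == OffExpr::Shl && Pow2 && (1ull << E.Amt) == SizeBytes) {
        Out = E.Idx;
        return true;
      }
      if (E.K == OffExpr::Mul && E.Amt == SizeBytes) {
        Out = E.Idx;
        return true;
      }
      return false;
    }

    int main() {
      uint64_t Idx;
      assert(selectScaleOffset({OffExpr::Shl, 5, 2}, 4, Idx) && Idx == 5); // x<<2 == x*4
      assert(!selectScaleOffset({OffExpr::Mul, 5, 8}, 4, Idx));            // wrong stride
    }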
@@ -2254,24 +2313,25 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
 // Match a base and an immediate (if Offset is not null) or an SGPR (if
 // SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
 // true, match only 32-bit immediate offsets available on CI.
-bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
-                                              SDValue *SOffset, SDValue *Offset,
-                                              bool Imm32Only, bool IsBuffer,
-                                              bool HasSOffset,
-                                              int64_t ImmOffset) const {
+bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDNode *N, SDValue Addr,
+                                              SDValue &SBase, SDValue *SOffset,
+                                              SDValue *Offset, bool Imm32Only,
+                                              bool IsBuffer, bool HasSOffset,
+                                              int64_t ImmOffset,
+                                              bool *ScaleOffset) const {
   if (SOffset && Offset) {
     assert(!Imm32Only && !IsBuffer);
     SDValue B;
-    if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
+    if (!SelectSMRDBaseOffset(N, Addr, B, nullptr, Offset, false, false, true))
       return false;
 
     int64_t ImmOff = 0;
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
       ImmOff = C->getSExtValue();
 
-    return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
-                                ImmOff);
+    return SelectSMRDBaseOffset(N, B, SBase, SOffset, nullptr, false, false,
+                                true, ImmOff, ScaleOffset);
   }
 
   // A 32-bit (address + offset) should not cause unsigned 32-bit integer
@@ -2291,23 +2351,25 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
   if (!N0 || !N1)
     return false;
 
-  if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
-                       ImmOffset)) {
+  if (SelectSMRDOffset(N, N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
+                       ImmOffset, ScaleOffset)) {
     SBase = N0;
     return true;
   }
-  if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
-                       ImmOffset)) {
+  if (SelectSMRDOffset(N, N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
+                       ImmOffset, ScaleOffset)) {
     SBase = N1;
     return true;
   }
   return false;
 }
 
-bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
+bool AMDGPUDAGToDAGISel::SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase,
                                     SDValue *SOffset, SDValue *Offset,
-                                    bool Imm32Only) const {
-  if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
+                                    bool Imm32Only, bool *ScaleOffset) const {
+  if (SelectSMRDBaseOffset(N, Addr, SBase, SOffset, Offset, Imm32Only,
+                           /* IsBuffer */ false, /* HasSOffset */ false,
+                           /* ImmOffset */ 0, ScaleOffset)) {
     SBase = Expand32BitAddress(SBase);
     return true;
   }
@@ -2323,36 +2385,51 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
 
 bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
                                        SDValue &Offset) const {
-  return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
+  return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
+                    &Offset);
 }
 
 bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
                                          SDValue &Offset) const {
   assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
-  return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
-                    /* Imm32Only */ true);
+  return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
+                    &Offset, /* Imm32Only */ true);
 }
 
-bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
-                                        SDValue &SOffset) const {
-  return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
+bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase,
+                                        SDValue &SOffset, SDValue &CPol) const {
+  bool ScaleOffset;
+  if (!SelectSMRD(N, Addr, SBase, &SOffset, /* Offset */ nullptr,
+                  /* Imm32Only */ false, &ScaleOffset))
+    return false;
+
+  CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
+                                   SDLoc(N), MVT::i32);
+  return true;
 }
 
-bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
-                                           SDValue &SOffset,
-                                           SDValue &Offset) const {
-  return SelectSMRD(Addr, SBase, &SOffset, &Offset);
+bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDNode *N, SDValue Addr,
+                                           SDValue &SBase, SDValue &SOffset,
+                                           SDValue &Offset,
+                                           SDValue &CPol) const {
+  bool ScaleOffset;
+  if (!SelectSMRD(N, Addr, SBase, &SOffset, &Offset, false, &ScaleOffset))
+    return false;
+
+  CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
+                                   SDLoc(N), MVT::i32);
+  return true;
 }
 
 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
-  return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
+  return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
                           /* Imm32Only */ false, /* IsBuffer */ true);
 }
 
 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
                                                SDValue &Offset) const {
   assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
-  return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
+  return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
                           /* Imm32Only */ true, /* IsBuffer */ true);
 }
 
@@ -2361,9 +2438,9 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
   // Match the (soffset + offset) pair as a 32-bit register base and
   // an immediate offset.
   return N.getValueType() == MVT::i32 &&
-         SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr,
-                              &Offset, /* Imm32Only */ false,
-                              /* IsBuffer */ true);
+         SelectSMRDBaseOffset(/* N */ nullptr, N, /* SBase */ SOffset,
+                              /* SOffset*/ nullptr, &Offset,
+                              /* Imm32Only */ false, /* IsBuffer */ true);
 }
 
 bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index acbab3d9e2d81..f7c7b3e144758 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -176,22 +176,28 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr,
                            SDValue &SAddr, SDValue &Offset) const;
 
-  bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SOffset,
+  bool SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode, SDValue *SOffset,
                         SDValue *Offset, bool Imm32Only = false,
                         bool IsBuffer = false, bool HasSOffset = false,
-                        int64_t ImmOffset = 0) const;
+                        int64_t ImmOffset = 0,
+                        bool *ScaleOffset = nullptr) const;
   SDValue Expand32BitAddress(SDValue Addr) const;
-  bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset,
-                            SDValue *Offset, bool Imm32Only = false,
-                            bool IsBuffer = false, bool HasSOffset = false,
-                            int64_t ImmOffset = 0) const;
-  bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset,
-                  SDValue *Offset, bool Imm32Only = false) const;
+  bool SelectSMRDBaseOffset(SDNode *N, SDValue Addr, SDValue &SBase,
+                            SDValue *SOffset, SDValue *Offset,
+                            bool Imm32Only = false, bool IsBuffer = false,
+                            bool HasSOffset = false, int64_t ImmOffset = 0,
+                            bool *ScaleOffset = nullptr) const;
+  bool SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase, SDValue *SOffset,
+                  SDValue *Offset, bool Imm32Only = false,
+                  bool *ScaleOffset = nullptr) const;
   bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
   bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
-  bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &SOffset) const;
-  bool SelectSMRDSgprImm(SDValue Addr, SDValue &SBase, SDValue &SOffset,
-                         SDValue &Offset) const;
+  bool SelectScaleOffset(SDNode *N, SDValue &Offset, bool IsSigned) const;
+  bool SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase, SDValue &SOffset,
+                      SDValue &CPol) const;
+  bool SelectSMRDSgprImm(SDNode *N, SDValue Addr, SDValue &SBase,
+                         SDValue &SOffset, SDValue &Offset,
+                         SDValue &CPol) const;
   bool SelectSMRDBufferImm(SDValue N, SDValue &Offset) const;
   bool SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const;
   bool SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 8975486caa770..d2e718c1272f8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3494,25 +3494,74 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
 }
 
 /// Match a zero extend from a 32-bit value to 64-bits.
-static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
+Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
   Register ZExtSrc;
-  if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
-    return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
+  if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc))))
+    return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
 
   // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
-  const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
+  const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
   if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
     return Register();
 
   assert(Def->getNumOperands() == 3 &&
-         MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
-  if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
+         MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
+  if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) {
     return Def->getOperand(1).getReg();
   }
 
   return Register();
 }
 
+/// Match a sign extend from a 32-bit value to 64-bits.
+Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
+  Register SExtSrc;
+  if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc))))
+    return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register();
+
+  // Match legalized form %sext = G_MERGE_VALUES (s32 %x), (G_ASHR %x, 31)
+  const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
+  if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
+    return Register();
+
+  assert(Def->getNumOperands() == 3 &&
+         MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
+  if (mi_match(Def->getOperand(2).getReg(), *MRI,
+               m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()),
+                       m_SpecificICst(31))))
+    return Def->getOperand(1).getReg();
+
+  if (VT->signBitIsZero(Reg))
+    return matchZeroExtendFromS32(Reg);
+
+  return Register();
+}
+
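The signBitIsZero fallback above rests on two simple facts, sketched here as a standalone check (plain C++, not LLVM code): a 64-bit sext splits into a low half x and a high half ashr(x, 31), which is the legalized G_MERGE_VALUES form, and when the sign bit is known clear, sign- and zero-extension agree, so a zext may stand in for a sext.

    #include <cassert>
    #include <cstdint>

    int main() {
      // Legalized form: low half is x, high half is x >> 31 (arithmetic).
      int32_t X = -5;
      int64_t Sext = X;
      assert((int32_t)Sext == X);                 // low half is x
      assert((int32_t)(Sext >> 32) == (X >> 31)); // high half is ashr(x, 31)

      // Fallback: with the sign bit clear, sext and zext coincide.
      uint32_t U = 0x7fffffffu;
      assert((uint64_t)(int64_t)(int32_t)U == (uint64_t)U);
    }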
+/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
+/// is 32-bit.
+Register
+AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
+  return MRI->getType(Reg) == LLT::scalar(32) ? Reg
+                                              : matchZeroExtendFromS32(Reg);
+}
+
+/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
+/// is 32-bit.
+Register
+AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
+  return MRI->getType(Reg) == LLT::scalar(32) ? Reg
+                                              : matchSignExtendFromS32(Reg);
+}
+
+Register
+AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
+                                                   bool IsSigned) const {
+  if (IsSigned)
+    return matchSignExtendFromS32OrS32(Reg);
+
+  return matchZeroExtendFromS32OrS32(Reg);
+}
+
 Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
   Register AnyExtSrc;
   if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
@@ -3581,7 +3630,7 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
       getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
   if (isSGPR(SAddr)) {
     Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
-    if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
+    if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
       Addr = SAddr;
       VOffset = Off;
     }
@@ -5223,7 +5272,7 @@ AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
       getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
 
   unsigned Key = 0;
-  Register S32 = matchZeroExtendFromS32(*MRI, Src);
+  Register S32 = matchZeroExtendFromS32(Src);
   if (!S32)
     S32 = matchAnyExtendFromS32(Src);
 
@@ -5296,10 +5345,68 @@ AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
   }};
 }
 
+// Given \p Offset and the load specified by the \p Root operand, check
+// whether \p Offset is a multiple of the load byte size. If it is, update
+// \p Offset to the pre-scaled value and return true.
+bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
+                                                  Register &Offset,
+                                                  bool IsSigned) const {
+  if (!Subtarget->hasScaleOffset())
+    return false;
+
+  const MachineInstr &MI = *Root.getParent();
+  MachineMemOperand *MMO = *MI.memoperands_begin();
+
+  if (!MMO->getSize().hasValue())
+    return false;
+
+  uint64_t Size = MMO->getSize().getValue();
+
+  Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);
+  if (!OffsetReg)
+    OffsetReg = Offset;
+
+  if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))
+    OffsetReg = Def->Reg;
+
+  Register Op0;
+  MachineInstr *Mul;
+  bool ScaleOffset =
+      (isPowerOf2_64(Size) &&
+       mi_match(OffsetReg, *MRI,
+                m_GShl(m_Reg(Op0),
+                       m_any_of(m_SpecificICst(Log2_64(Size)),
+                                m_Copy(m_SpecificICst(Log2_64(Size))))))) ||
+      mi_match(OffsetReg, *MRI,
+               m_GMul(m_Reg(Op0), m_any_of(m_SpecificICst(Size),
+                                           m_Copy(m_SpecificICst(Size))))) ||
+      mi_match(
+          OffsetReg, *MRI,
+          m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
+                  m_Reg(Op0), m_SpecificICst(Size))) ||
+      // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
+      (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&
+       (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
+                                      : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
+        (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
+         VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&
+       mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&
+       mi_match(Mul->getOperand(3).getReg(), *MRI,
+                m_GTrunc(m_any_of(m_SpecificICst(Size),
+                                  m_Copy(m_SpecificICst(Size))))) &&
+       mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));
+
+  if (ScaleOffset)
+    Offset = Op0;
+
+  return ScaleOffset;
+}
+
 bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
                                                  Register &Base,
                                                  Register *SOffset,
-                                                 int64_t *Offset) const {
+                                                 int64_t *Offset,
+                                                 bool *ScaleOffset) const {
   MachineInstr *MI = Root.getParent();
   MachineBasicBlock *MBB = MI->getParent();
 
@@ -5314,6 +5421,9 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
   const GEPInfo &GEPI = AddrInfo[0];
   std::optional<int64_t> EncodedImm;
 
+  if (ScaleOffset)
+    *ScaleOffset = false;
+
   if (SOffset && Offset) {
     EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
                                               /*HasSOffset=*/true);
@@ -5321,8 +5431,12 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
         AddrInfo.size() > 1) {
       const GEPInfo &GEPI2 = AddrInfo[1];
       if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
-        if (Register OffsetReg =
-                matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
+        Register OffsetReg = GEPI2.SgprParts[1];
+        if (ScaleOffset)
+          *ScaleOffset =
+              selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
+        OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
+        if (OffsetReg) {
           Base = GEPI2.SgprParts[0];
           *SOffset = OffsetReg;
           *Offset = *EncodedImm;
@@ -5367,7 +5481,11 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
   }
 
   if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
-    if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
+    Register OffsetReg = GEPI.SgprParts[1];
+    if (ScaleOffset)
+      *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
+    OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
+    if (OffsetReg) {
       Base = GEPI.SgprParts[0];
       *SOffset = OffsetReg;
       return true;
@@ -5381,7 +5499,8 @@ InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
   Register Base;
   int64_t Offset;
-  if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
+  if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset,
+                        /* ScaleOffset */ nullptr))
     return std::nullopt;
 
   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
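The MAD arm of the match above accepts a multiply that has already been combined into a 32x32->64 MAD, provided the addend is zero. A standalone sanity check of that identity (plain C++, not LLVM code):

    #include <cassert>
    #include <cstdint>

    // mad_u64_u32(a, b, c): 32x32->64-bit multiply plus a 64-bit addend.
    static uint64_t mad_u64_u32(uint32_t A, uint32_t B, uint64_t C) {
      return (uint64_t)A * B + C;
    }

    int main() {
      uint32_t Idx = 1234, Size = 8;
      // With a zero addend the MAD is exactly Idx * Size, i.e. still a
      // scalable offset with pre-scaled index Idx.
      assert(mad_u64_u32(Idx, Size, 0) == (uint64_t)Idx * Size);
    }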
@@ -5412,23 +5531,30 @@ AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
 
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
   Register Base, SOffset;
-  if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
+  bool ScaleOffset;
+  if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr,
+                        &ScaleOffset))
     return std::nullopt;
 
+  unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
-           [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
+           [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
+           [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
 }
 
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
   Register Base, SOffset;
   int64_t Offset;
-  if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
+  bool ScaleOffset;
+  if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset))
     return std::nullopt;
 
+  unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
            [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
-           [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
+           [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
+           [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
 }
 
 std::pair<Register, int64_t>
@@ -5565,7 +5691,7 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
 
       // It's possible voffset is an SGPR here, but the copy to VGPR will be
       // inserted later.
-      if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
+      if (Register VOffset = matchZeroExtendFromS32(PtrBaseOffset)) {
        return {{[=](MachineInstrBuilder &MIB) { // saddr
                    MIB.addReg(SAddr);
                  },
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 34bdf0a6d4ab2..e58fbb48ffb20 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -232,8 +232,10 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   InstructionSelector::ComplexRendererFns
   selectVINTERPModsHi(MachineOperand &Root) const;
 
+  bool selectScaleOffset(MachineOperand &Root, Register &Offset,
+                         bool IsSigned) const;
   bool selectSmrdOffset(MachineOperand &Root, Register &Base, Register *SOffset,
-                        int64_t *Offset) const;
+                        int64_t *Offset, bool *ScaleOffset) const;
   InstructionSelector::ComplexRendererFns
   selectSmrdImm(MachineOperand &Root) const;
   InstructionSelector::ComplexRendererFns
@@ -421,6 +423,19 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   // shift amount operand's `ShAmtBits` bits is unneeded.
   bool isUnneededShiftMask(const MachineInstr &MI, unsigned ShAmtBits) const;
 
+  /// Match a zero extend from a 32-bit value to 64-bits.
+  Register matchZeroExtendFromS32(Register Reg) const;
+  /// Match a sign extend from a 32-bit value to 64-bits.
+  Register matchSignExtendFromS32(Register Reg) const;
+  /// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if
+  /// it is 32-bit.
+  Register matchZeroExtendFromS32OrS32(Register Reg) const;
+  /// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if
+  /// it is 32-bit.
+  Register matchSignExtendFromS32OrS32(Register Reg) const;
+  /// Match either a sign or zero extend, depending on \p IsSigned, from a
+  /// 32-bit value to 64 bits, or \p Reg itself if it is 32-bit.
+  Register matchExtendFromS32OrS32(Register Reg, bool IsSigned) const;
   /// Match an any extend from a 32-bit value to 64-bit.
   Register matchAnyExtendFromS32(Register Reg) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 0850c41c933de..38cc51b8ab32b 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -864,8 +864,10 @@ def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
 
 def SMRDImm         : ComplexPattern<iPTR, 2, "SelectSMRDImm">;
 def SMRDImm32       : ComplexPattern<iPTR, 2, "SelectSMRDImm32">;
-def SMRDSgpr        : ComplexPattern<iPTR, 2, "SelectSMRDSgpr">;
-def SMRDSgprImm     : ComplexPattern<iPTR, 3, "SelectSMRDSgprImm">;
+let WantsRoot = true in {
+  def SMRDSgpr      : ComplexPattern<iPTR, 3, "SelectSMRDSgpr">;
+  def SMRDSgprImm   : ComplexPattern<iPTR, 4, "SelectSMRDSgprImm">;
+}
 def SMRDBufferImm   : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">;
 def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">;
 def SMRDBufferSgprImm : ComplexPattern<iPTR, 2, "SelectSMRDBufferSgprImm">;
@@ -906,15 +908,15 @@ multiclass SMRD_Patterns
   // 3. SGPR offset
   def : GCNPat <
-    (frag (SMRDSgpr i64:$sbase, i32:$soffset)),
-    (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, 0, 0))> {
+    (frag (SMRDSgpr i64:$sbase, i32:$soffset, CPol:$cpol)),
+    (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, 0, $cpol))> {
     let SubtargetPredicate = isGFX9Plus;
   }
 
   // 4. SGPR+IMM offset
   def : GCNPat <
-    (frag (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
-    (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, $offset, 0))> {
+    (frag (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset, CPol:$cpol)),
+    (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, $offset, $cpol))> {
     let SubtargetPredicate = isGFX9Plus;
   }
 
@@ -989,15 +991,15 @@ multiclass ScalarLoadWithExtensionPat
   // 2. SGPR offset
   def : GCNPat <
-    (node (SMRDSgpr i64:$sbase, i32:$soffset)),
-    (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))>{
+    (node (SMRDSgpr i64:$sbase, i32:$soffset, CPol:$cpol)),
+    (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, $cpol))>{
     let SubtargetPredicate = isGFX12Plus;
   }
 
   // 3. SGPR+IMM offset
   def : GCNPat <
-    (node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
-    (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))>{
+    (node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset, CPol:$cpol)),
+    (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, $cpol))>{
    let SubtargetPredicate = isGFX12Plus;
   }
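Two things happen in these pattern changes: WantsRoot lets the ComplexPattern selectors see the whole load (the root node), which is where the access size for scaling comes from, and the extra $cpol operand threads the cache-policy word whose SCAL bit requests hardware scaling of soffset. A simplified standalone model of the resulting addressing math (plain C++; the exact hardware encoding is an assumption here, not taken from the patch):

    #include <cassert>
    #include <cstdint>

    // Simplified SMEM address computation: the immediate offset is always in
    // bytes; with SCAL set the register offset is scaled by the access size.
    uint64_t smemAddr(uint64_t Base, uint64_t SOffset, uint64_t ImmBytes,
                      unsigned SizeBytes, bool Scal) {
      return Base + (Scal ? SOffset * SizeBytes : SOffset) + ImmBytes;
    }

    int main() {
      uint64_t Base = 0x1000, Idx = 7;
      unsigned Size = 4; // dword load
      // soffset = Idx with SCAL addresses the same dword as
      // soffset = Idx * Size without it.
      assert(smemAddr(Base, Idx, 0, Size, true) ==
             smemAddr(Base, Idx * Size, 0, Size, false));
    }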
diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll
new file mode 100644
index 0000000000000..b5bb68e1eaa89
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll
@@ -0,0 +1,372 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GISEL %s
+
+define amdgpu_ps float @s_load_b32_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b32_idxprom:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s0, s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxprom
+  %ret = load float, ptr addrspace(4) %arrayidx, align 4
+  ret float %ret
+}
+
+; 'i32 %idx' is a signed index while the SMRD soffset is unsigned, so the
+; scaled form is not selected.
+
+define amdgpu_ps float @s_load_b32_idx32(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; SDAG-LABEL: s_load_b32_idx32:
+; SDAG:       ; %bb.0: ; %entry
+; SDAG-NEXT:    s_ashr_i32 s3, s2, 31
+; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; SDAG-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; SDAG-NEXT:    s_wait_kmcnt 0x0
+; SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: s_load_b32_idx32:
+; GISEL:       ; %bb.0: ; %entry
+; GISEL-NEXT:    s_ashr_i32 s3, s2, 31
+; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GISEL-NEXT:    s_add_co_u32 s0, s0, s2
+; GISEL-NEXT:    s_add_co_ci_u32 s1, s1, s3
+; GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-NEXT:    ; return to shader part epilog
+entry:
+  %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i32 %idx
+  %ret = load float, ptr addrspace(4) %arrayidx, align 4
+  ret float %ret
+}
+
+define amdgpu_ps float @s_load_b32_idxprom_wrong_stride(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; SDAG-LABEL: s_load_b32_idxprom_wrong_stride:
+; SDAG:       ; %bb.0: ; %entry
+; SDAG-NEXT:    s_mov_b32 s3, 0
+; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; SDAG-NEXT:    s_lshl_b64 s[2:3], s[2:3], 3
+; SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; SDAG-NEXT:    s_wait_kmcnt 0x0
+; SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: s_load_b32_idxprom_wrong_stride:
+; GISEL:       ; %bb.0: ; %entry
+; GISEL-NEXT:    s_mov_b32 s3, 0
+; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT:    s_lshl_b64 s[2:3], s[2:3], 3
+; GISEL-NEXT:    s_add_co_u32 s0, s0, s2
+; GISEL-NEXT:    s_add_co_ci_u32 s1, s1, s3
+; GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-NEXT:    ; return to shader part epilog
+entry:
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(4) %p, i64 %idxprom
+  %ret = load float, ptr addrspace(4) %arrayidx, align 4
+  ret float %ret
+}
+
+define amdgpu_ps float @s_load_b16_idxprom_ioffset(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b16_idxprom_ioffset:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_u16 s0, s[0:1], s2 offset:0x20 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idxprom = zext i32 %idx to i64
+  %idxadd = add i64 %idxprom, 16
+  %arrayidx = getelementptr inbounds i16, ptr addrspace(4) %p, i64 %idxadd
+  %ld = load i16, ptr addrspace(4) %arrayidx, align 2
+  %ret.i32 = zext i16 %ld to i32
+  %ret = bitcast i32 %ret.i32 to float
+  ret float %ret
+}
+
+define amdgpu_ps <2 x float> @s_load_b64_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b64_idxprom:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(4) %p, i64 %idxprom
+  %ret = load <2 x float>, ptr addrspace(4) %arrayidx, align 4
+  ret <2 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @s_load_b96_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b96_idxprom:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b96 s[0:2], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(4) %p, i64 %idxprom
+  %ret = load <3 x float>, ptr addrspace(4) %arrayidx, align 4
+  ret <3 x float> %ret
+}
+
+define amdgpu_ps <4 x float> @s_load_b128_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b128_idxprom:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b128 s[0:3], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(4) %p, i64 %idxprom
+  %ret = load <4 x float>, ptr addrspace(4) %arrayidx, align 4
+  ret <4 x float> %ret
+}
+
+define amdgpu_ps <8 x float> @s_load_b256_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b256_idxprom:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b256 s[0:7], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GCN-NEXT:    v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds <8 x float>, ptr addrspace(4) %p, i64 %idxprom
+  %ret = load <8 x float>, ptr addrspace(4) %arrayidx, align 4
+  ret <8 x float> %ret
+}
+
+define amdgpu_ps <16 x float> @s_load_b512_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b512_idxprom:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b512 s[0:15], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GCN-NEXT:    v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GCN-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
+; GCN-NEXT:    v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
+; GCN-NEXT:    v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
+; GCN-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds <16 x float>, ptr addrspace(4) %p, i64 %idxprom
+  %ret = load <16 x float>, ptr addrspace(4) %arrayidx, align 4
+  ret <16 x float> %ret
+}
+
+define amdgpu_ps float @s_load_b32_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b32_idxprom_range:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_load_b32 s0, s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxprom
+  %ret = load float, ptr addrspace(4) %arrayidx, align 4
+  ret float %ret
+}
+
+define amdgpu_ps float @s_load_b32_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b32_idxprom_range_ioffset:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_load_b32 s0, s[0:1], s2 offset:0x40 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+  %idxprom = zext i32 %idx to i64
+  %idxadd = add i64 %idxprom, 16
+  %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxadd
+  %ret = load float, ptr addrspace(4) %arrayidx, align 4
+  ret float %ret
+}
+
+; Note: this is a byte load, there is nothing to scale
+
+define amdgpu_ps float @s_load_b8_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b8_idxprom_range_ioffset:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_load_u8 s0, s[0:1], s2 offset:0x10
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+  %idxprom = zext i32 %idx to i64
+  %idxadd = add i64 %idxprom, 16
+  %arrayidx = getelementptr inbounds i8, ptr addrspace(4) %p, i64 %idxadd
+  %ld = load i8, ptr addrspace(4) %arrayidx
+  %ret.i32 = zext i8 %ld to i32
+  %ret = bitcast i32 %ret.i32 to float
+  ret float %ret
+}
+
+define amdgpu_ps float @s_load_b16_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b16_idxprom_range:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_load_u16 s0, s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds i16, ptr addrspace(4) %p, i64 %idxprom
+  %ld = load i16, ptr addrspace(4) %arrayidx, align 2
+  %ret.i32 = zext i16 %ld to i32
+  %ret = bitcast i32 %ret.i32 to float
+  ret float %ret
+}
+
+define amdgpu_ps float @s_load_b16_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b16_idxprom_range_ioffset:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_load_u16 s0, s[0:1], s2 offset:0x20 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+  %idxprom = zext i32 %idx to i64
+  %idxadd = add i64 %idxprom, 16
+  %arrayidx = getelementptr inbounds i16, ptr addrspace(4) %p, i64 %idxadd
+  %ld = load i16, ptr addrspace(4) %arrayidx, align 2
+  %ret.i32 = zext i16 %ld to i32
+  %ret = bitcast i32 %ret.i32 to float
+  ret float %ret
+}
+
+define amdgpu_ps <2 x float> @s_load_b64_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b64_idxprom_range:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(4) %p, i64 %idxprom
+  %ret = load <2 x float>, ptr addrspace(4) %arrayidx, align 4
+  ret <2 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @s_load_b96_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b96_idxprom_range:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_load_b96 s[0:2], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(4) %p, i64 %idxprom
+  %ret = load <3 x float>, ptr addrspace(4) %arrayidx, align 4
+  ret <3 x float> %ret
+}
+
+define amdgpu_ps <4 x float> @s_load_b128_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b128_idxprom_range:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_load_b128 s[0:3], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(4) %p, i64 %idxprom
+  %ret = load <4 x float>, ptr addrspace(4) %arrayidx, align 4
+  ret <4 x float> %ret
+}
+
+define amdgpu_ps <8 x float> @s_load_b256_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b256_idxprom_range:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_load_b256 s[0:7], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GCN-NEXT:    v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds <8 x float>, ptr addrspace(4) %p, i64 %idxprom
+  %ret = load <8 x float>, ptr addrspace(4) %arrayidx, align 4
+  ret <8 x float> %ret
+}
+
+define amdgpu_ps <16 x float> @s_load_b512_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b512_idxprom_range:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_load_b512 s[0:15], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GCN-NEXT:    v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GCN-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
+; GCN-NEXT:    v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
+; GCN-NEXT:    v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
+; GCN-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds <16 x float>, ptr addrspace(4) %p, i64 %idxprom
+  %ret = load <16 x float>, ptr addrspace(4) %arrayidx, align 4
+  ret <16 x float> %ret
+}
+
+!0 = !{i32 0, i32 1024}
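The *_range tests rely on the !range !{i32 0, i32 1024} metadata: it bounds the loaded index to [0, 1024), so its sign bit is provably zero and the selectors' sign-bit checks permit the unsigned scale_offset form even when the index comes from memory. A standalone check of that reasoning (plain C++, not LLVM code):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Every index permitted by the !range metadata extends identically
      // whether treated as signed or unsigned.
      for (uint32_t Idx = 0; Idx < 1024; ++Idx)
        assert((uint64_t)(int64_t)(int32_t)Idx == (uint64_t)Idx);
    }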