diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index b45627d9c1c5d..9eac2717198ff 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -352,6 +352,31 @@ void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, MI.eraseFromParent(); } +void RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const { + Register Dst = MI.getDstReg(); + Register Ptr = MI.getPointerReg(); + MachineMemOperand &MMO = MI.getMMO(); + unsigned MemSize = 8 * MMO.getSize().getValue(); + + MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32); + + if (MI.getOpcode() == G_LOAD) { + B.buildLoad(Dst, Ptr, *WideMMO); + } else { + auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO); + + if (MI.getOpcode() == G_ZEXTLOAD) { + APInt Mask = APInt::getLowBitsSet(S32.getSizeInBits(), MemSize); + auto MaskCst = B.buildConstant(SgprRB_S32, Mask); + B.buildAnd(Dst, Load, MaskCst); + } else { + B.buildSExtInReg(Dst, Load, MemSize); + } + } + + MI.eraseFromParent(); +} + void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); LLT Ty = MRI.getType(Dst); @@ -744,6 +769,9 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, } break; } + case WidenMMOToS32: { + return widenMMOToS32(cast<GAnyLoad>(MI)); + } } if (!WaterfallSgprs.empty()) { @@ -759,6 +787,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { return LLT::scalar(1); case Sgpr16: case Vgpr16: + case UniInVgprS16: return LLT::scalar(16); case Sgpr32: case Sgpr32_WF: @@ -895,6 +924,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case SgprB256: case SgprB512: case UniInVcc: + case UniInVgprS16: case UniInVgprS32: case UniInVgprV2S16: case UniInVgprV4S32: @@ -1015,6 +1045,18 @@ void RegBankLegalizeHelper::applyMappingDst( B.buildTrunc(Reg, CopyS32_Vcc); break; } + case UniInVgprS16: { + assert(Ty == getTyFromID(MethodIDs[OpIdx])); + assert(RB == SgprRB); + Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16}); + Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32}); + Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32}); + Op.setReg(NewVgprDstS16); + B.buildAnyExt(NewVgprDstS32, NewVgprDstS16); + buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI); + B.buildTrunc(Reg, NewSgprDstS32); + break; + } case UniInVgprS32: case UniInVgprV2S16: case UniInVgprV4S32: { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h index db965d8c000d9..7affe5ab3da7f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -11,6 +11,7 @@ #include "AMDGPURegBankLegalizeRules.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/MachineRegisterInfo.h" namespace llvm { @@ -107,6 +108,7 @@ class RegBankLegalizeHelper { void splitLoad(MachineInstr &MI, ArrayRef<LLT> LLTBreakdown, LLT MergeTy = LLT()); void widenLoad(MachineInstr &MI, LLT WideTy, LLT MergeTy = LLT()); + void widenMMOToS32(GAnyLoad &MI) const; void lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, SmallSet<Register, 4>
&SgprWaterfallOperandRegs); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 8c56c21621121..0776d14a84067 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -467,6 +467,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, : ST(&_ST), MRI(&_MRI) { addRulesForGOpcs({G_ADD, G_SUB}, Standard) + .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}}) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); @@ -615,8 +617,9 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{UniS64, S64}, {{Sgpr64}, {Sgpr64}}}) .Any({{DivS64, S64}, {{Vgpr64}, {Vgpr64}, SplitTo32SExtInReg}}); - bool hasUnalignedLoads = ST->getGeneration() >= AMDGPUSubtarget::GFX12; + bool hasSMRDx3 = ST->hasScalarDwordx3Loads(); bool hasSMRDSmall = ST->hasScalarSubwordLoads(); + bool usesTrue16 = ST->useRealTrue16Insts(); Predicate isAlign16([](const MachineInstr &MI) -> bool { return (*MI.memoperands_begin())->getAlign() >= Align(16); @@ -654,54 +657,187 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, return (*MI.memoperands_begin())->getFlags() & MONoClobber; }); - Predicate isNaturalAlignedSmall([](const MachineInstr &MI) -> bool { + Predicate isNaturalAligned([](const MachineInstr &MI) -> bool { + const MachineMemOperand *MMO = *MI.memoperands_begin(); + return MMO->getAlign() >= Align(MMO->getSize().getValue()); + }); + + Predicate is8Or16BitMMO([](const MachineInstr &MI) -> bool { const MachineMemOperand *MMO = *MI.memoperands_begin(); const unsigned MemSize = 8 * MMO->getSize().getValue(); - return (MemSize == 16 && MMO->getAlign() >= Align(2)) || - (MemSize == 8 && MMO->getAlign() >= Align(1)); + return MemSize == 16 || MemSize == 8; + }); + + Predicate is32BitMMO([](const MachineInstr &MI) -> bool { + const MachineMemOperand *MMO = *MI.memoperands_begin(); + return 8 * MMO->getSize().getValue() == 32; }); auto isUL = !isAtomicMMO && isUniMMO && (isConst || !isVolatileMMO) && (isConst || isInvMMO || isNoClobberMMO); // clang-format off + // TODO: S32Dst, 16-bit any-extending load should not appear on True16 targets addRulesForGOpcs({G_LOAD}) - .Any({{DivB32, DivP0}, {{VgprB32}, {VgprP0}}}) - .Any({{DivB32, UniP0}, {{VgprB32}, {VgprP0}}}) - - .Any({{DivB32, DivP1}, {{VgprB32}, {VgprP1}}}) - .Any({{{UniB256, UniP1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}}) - .Any({{{UniB512, UniP1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}}) - .Any({{{UniB32, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}}) - .Any({{{UniB64, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP1}}}) - .Any({{{UniB96, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP1}}}) - .Any({{{UniB128, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP1}}}) - .Any({{{UniB256, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP1}, SplitLoad}}) - .Any({{{UniB512, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP1}, SplitLoad}}) - - .Any({{DivB32, UniP3}, {{VgprB32}, {VgprP3}}}) - .Any({{{UniB32, UniP3}, isAlign4 && isUL}, {{SgprB32}, {SgprP3}}}) - .Any({{{UniB32, UniP3}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP3}}}) - - .Any({{{DivB256, DivP4}}, {{VgprB256}, {VgprP4}, SplitLoad}}) - .Any({{{UniB32, UniP4}, isNaturalAlignedSmall && isUL}, {{SgprB32}, {SgprP4}}}, hasSMRDSmall) // i8 and i16 load - .Any({{{UniB32, UniP4}, isAlign4 && isUL}, {{SgprB32}, 
{SgprP4}}}) - .Any({{{UniB96, UniP4}, isAlign16 && isUL}, {{SgprB96}, {SgprP4}, WidenLoad}}, !hasUnalignedLoads) - .Any({{{UniB96, UniP4}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP4}, SplitLoad}}, !hasUnalignedLoads) - .Any({{{UniB96, UniP4}, isAlign4 && isUL}, {{SgprB96}, {SgprP4}}}, hasUnalignedLoads) - .Any({{{UniB128, UniP4}, isAlign4 && isUL}, {{SgprB128}, {SgprP4}}}) - .Any({{{UniB256, UniP4}, isAlign4 && isUL}, {{SgprB256}, {SgprP4}}}) - .Any({{{UniB512, UniP4}, isAlign4 && isUL}, {{SgprB512}, {SgprP4}}}) - .Any({{{UniB32, UniP4}, !isNaturalAlignedSmall || !isUL}, {{UniInVgprB32}, {VgprP4}}}, hasSMRDSmall) // i8 and i16 load - .Any({{{UniB32, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP4}}}) - .Any({{{UniB256, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP4}, SplitLoad}}) - .Any({{{UniB512, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP4}, SplitLoad}}) - .Any({{DivB32, P5}, {{VgprB32}, {VgprP5}}}); - addRulesForGOpcs({G_ZEXTLOAD}) // i8 and i16 zero-extending loads - .Any({{{UniB32, UniP3}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP3}}}) - .Any({{{UniB32, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP4}}}); + // flat, addrspace(0), never uniform - flat_load + .Any({{DivS16, P0}, {{Vgpr16}, {VgprP0}}}, usesTrue16) + .Any({{DivB32, P0}, {{VgprB32}, {VgprP0}}}) // 32-bit load, 8-bit and 16-bit any-extending load + .Any({{DivB64, P0}, {{VgprB64}, {VgprP0}}}) + .Any({{DivB96, P0}, {{VgprB96}, {VgprP0}}}) + .Any({{DivB128, P0}, {{VgprB128}, {VgprP0}}}) + + // global, addrspace(1) + // divergent - global_load + .Any({{DivS16, P1}, {{Vgpr16}, {VgprP1}}}, usesTrue16) + .Any({{DivB32, P1}, {{VgprB32}, {VgprP1}}}) // 32-bit load, 8-bit and 16-bit any-extending load + .Any({{DivB64, P1}, {{VgprB64}, {VgprP1}}}) + .Any({{DivB96, P1}, {{VgprB96}, {VgprP1}}}) + .Any({{DivB128, P1}, {{VgprB128}, {VgprP1}}}) + .Any({{DivB256, P1}, {{VgprB256}, {VgprP1}, SplitLoad}}) + .Any({{DivB512, P1}, {{VgprB512}, {VgprP1}, SplitLoad}}) + + // uniform - s_load + .Any({{{UniS16, P1}, isNaturalAligned && isUL}, {{Sgpr32Trunc}, {SgprP1}}}, usesTrue16 && hasSMRDSmall) // s16 load + .Any({{{UniS16, P1}, isAlign4 && isUL}, {{Sgpr32Trunc}, {SgprP1}, WidenMMOToS32}}, usesTrue16 && !hasSMRDSmall) // s16 load to 32-bit load + .Any({{{UniB32, P1}, isNaturalAligned && isUL}, {{SgprB32}, {SgprP1}}}, hasSMRDSmall) // 32-bit load, 8-bit and 16-bit any-extending load + // TODO: SplitLoad when !isNaturalAligned && isUL and target hasSMRDSmall + .Any({{{UniB32, P1}, is8Or16BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP1}, WidenMMOToS32}}, !hasSMRDSmall) // 8-bit and 16-bit any-extending load to 32-bit load + .Any({{{UniB32, P1}, is32BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP1}}}) // 32-bit load + .Any({{{UniB64, P1}, isAlign4 && isUL}, {{SgprB64}, {SgprP1}}}) + .Any({{{UniB96, P1}, isAlign16 && isUL}, {{SgprB96}, {SgprP1}, WidenLoad}}, !hasSMRDx3) + .Any({{{UniB96, P1}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP1}, SplitLoad}}, !hasSMRDx3) + .Any({{{UniB96, P1}, isAlign4 && isUL}, {{SgprB96}, {SgprP1}}}, hasSMRDx3) + .Any({{{UniB128, P1}, isAlign4 && isUL}, {{SgprB128}, {SgprP1}}}) + .Any({{{UniB256, P1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}}) + .Any({{{UniB512, P1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}}) + + // Uniform via global or buffer load, for example a volatile or non-aligned + // uniform load. Not using the standard {{UniInVgprTy}, {VgprP1}} mapping since + // that is always selected as global_load; use SgprP1 for the pointer instead + // to match patterns without flat-for-global, the default for GFX7 and older. + // -> +flat-for-global + {{UniInVgprTy}, {SgprP1}} - global_load + // -> -flat-for-global + {{UniInVgprTy}, {SgprP1}} - buffer_load + .Any({{{UniS16, P1}, !isNaturalAligned || !isUL}, {{UniInVgprS16}, {SgprP1}}}, usesTrue16 && hasSMRDSmall) // s16 load + .Any({{{UniS16, P1}, !isAlign4 || !isUL}, {{UniInVgprS16}, {SgprP1}}}, usesTrue16 && !hasSMRDSmall) // s16 load + .Any({{{UniB32, P1}, !isNaturalAligned || !isUL}, {{UniInVgprB32}, {SgprP1}}}, hasSMRDSmall) // 32-bit load, 8-bit and 16-bit any-extending load + .Any({{{UniB32, P1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}}, !hasSMRDSmall) // 32-bit load, 8-bit and 16-bit any-extending load + .Any({{{UniB64, P1}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP1}}}) + .Any({{{UniB96, P1}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP1}}}) + .Any({{{UniB128, P1}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP1}}}) + .Any({{{UniB256, P1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {SgprP1}, SplitLoad}}) + .Any({{{UniB512, P1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {SgprP1}, SplitLoad}}) + + // local, addrspace(3) - ds_load + .Any({{DivS16, P3}, {{Vgpr16}, {VgprP3}}}, usesTrue16) + .Any({{DivB32, P3}, {{VgprB32}, {VgprP3}}}) // 32-bit load, 8-bit and 16-bit any-extending load + .Any({{DivB64, P3}, {{VgprB64}, {VgprP3}}}) + .Any({{DivB96, P3}, {{VgprB96}, {VgprP3}}}) + .Any({{DivB128, P3}, {{VgprB128}, {VgprP3}}}) + + .Any({{UniS16, P3}, {{UniInVgprS16}, {VgprP3}}}, usesTrue16) // 16-bit load + .Any({{UniB32, P3}, {{UniInVgprB32}, {VgprP3}}}) // 32-bit load, 8-bit and 16-bit any-extending load + .Any({{UniB64, P3}, {{UniInVgprB64}, {VgprP3}}}) + .Any({{UniB96, P3}, {{UniInVgprB96}, {VgprP3}}}) + .Any({{UniB128, P3}, {{UniInVgprB128}, {VgprP3}}}) + + // constant, addrspace(4) + // divergent - global_load + .Any({{DivS16, P4}, {{Vgpr16}, {VgprP4}}}, usesTrue16) + .Any({{DivB32, P4}, {{VgprB32}, {VgprP4}}}) // 32-bit load, 8-bit and 16-bit any-extending load + .Any({{DivB64, P4}, {{VgprB64}, {VgprP4}}}) + .Any({{DivB96, P4}, {{VgprB96}, {VgprP4}}}) + .Any({{DivB128, P4}, {{VgprB128}, {VgprP4}}}) + .Any({{DivB256, P4}, {{VgprB256}, {VgprP4}, SplitLoad}}) + .Any({{DivB512, P4}, {{VgprB512}, {VgprP4}, SplitLoad}}) + + // uniform - s_load + .Any({{{UniS16, P4}, isNaturalAligned && isUL}, {{Sgpr32Trunc}, {SgprP4}}}, usesTrue16 && hasSMRDSmall) // s16 load + .Any({{{UniS16, P4}, isAlign4 && isUL}, {{Sgpr32Trunc}, {SgprP4}, WidenMMOToS32}}, usesTrue16 && !hasSMRDSmall) // s16 load to 32-bit load + .Any({{{UniB32, P4}, isNaturalAligned && isUL}, {{SgprB32}, {SgprP4}}}, hasSMRDSmall) // 32-bit load, 8-bit and 16-bit any-extending load + .Any({{{UniB32, P4}, is8Or16BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP4}, WidenMMOToS32}}, !hasSMRDSmall) // 8-bit and 16-bit any-extending load to 32-bit load + .Any({{{UniB32, P4}, is32BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP4}}}) // 32-bit load + .Any({{{UniB64, P4}, isAlign4 && isUL}, {{SgprB64}, {SgprP4}}}) + .Any({{{UniB96, P4}, isAlign16 && isUL}, {{SgprB96}, {SgprP4}, WidenLoad}}, !hasSMRDx3) + .Any({{{UniB96, P4}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP4}, SplitLoad}}, !hasSMRDx3) + .Any({{{UniB96, P4}, isAlign4 && isUL}, {{SgprB96}, {SgprP4}}}, hasSMRDx3) + .Any({{{UniB128, P4}, isAlign4 && isUL}, {{SgprB128}, {SgprP4}}}) + .Any({{{UniB256, P4}, isAlign4 && isUL}, {{SgprB256}, {SgprP4}}}) + .Any({{{UniB512,
P4}, isAlign4 && isUL}, {{SgprB512}, {SgprP4}}}) + + // uniform in vgpr - global_load or buffer_load + .Any({{{UniS16, P4}, !isNaturalAligned || !isUL}, {{UniInVgprS16}, {SgprP4}}}, usesTrue16 && hasSMRDSmall) // s16 load + .Any({{{UniS16, P4}, !isAlign4 || !isUL}, {{UniInVgprS16}, {SgprP4}}}, usesTrue16 && !hasSMRDSmall) // s16 load + .Any({{{UniB32, P4}, !isNaturalAligned || !isUL}, {{UniInVgprB32}, {SgprP4}}}, hasSMRDSmall) // 32-bit load, 8-bit and 16-bit any-extending load + .Any({{{UniB32, P4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP4}}}, !hasSMRDSmall) // 32-bit load, 8-bit and 16-bit any-extending load + .Any({{{UniB64, P4}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP4}}}) + .Any({{{UniB96, P4}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP4}}}) + .Any({{{UniB128, P4}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP4}}}) + .Any({{{UniB256, P4}, !isAlign4 || !isUL}, {{UniInVgprB256}, {SgprP4}, SplitLoad}}) + .Any({{{UniB512, P4}, !isAlign4 || !isUL}, {{UniInVgprB512}, {SgprP4}, SplitLoad}}) + + // private, addrspace(5), never uniform - scratch_load + .Any({{DivS16, P5}, {{Vgpr16}, {VgprP5}}}, usesTrue16) + .Any({{DivB32, P5}, {{VgprB32}, {VgprP5}}}) // 32-bit load, 8-bit and 16-bit any-extending load + .Any({{DivB64, P5}, {{VgprB64}, {VgprP5}}}) + .Any({{DivB96, P5}, {{VgprB96}, {VgprP5}}}) + .Any({{DivB128, P5}, {{VgprB128}, {VgprP5}}}) + + .Any({{DivS32, Ptr128}, {{Vgpr32}, {VgprPtr128}}}); + + + addRulesForGOpcs({G_ZEXTLOAD, G_SEXTLOAD}) // i8 and i16 zero- and sign-extending loads + .Any({{DivS32, P0}, {{Vgpr32}, {VgprP0}}}) + + .Any({{DivS32, P1}, {{Vgpr32}, {VgprP1}}}) + .Any({{{UniS32, P1}, isAlign4 && isUL}, {{Sgpr32}, {SgprP1}, WidenMMOToS32}}, !hasSMRDSmall) + .Any({{{UniS32, P1}, isNaturalAligned && isUL}, {{Sgpr32}, {SgprP1}}}, hasSMRDSmall) + .Any({{{UniS32, P1}, !isAlign4 || !isUL}, {{UniInVgprS32}, {SgprP1}}}, !hasSMRDSmall) + .Any({{{UniS32, P1}, !isNaturalAligned || !isUL}, {{UniInVgprS32}, {SgprP1}}}, hasSMRDSmall) + + .Any({{DivS32, P3}, {{Vgpr32}, {VgprP3}}}) + .Any({{UniS32, P3}, {{UniInVgprS32}, {VgprP3}}}) + + .Any({{DivS32, P4}, {{Vgpr32}, {VgprP4}}}) + .Any({{{UniS32, P4}, isAlign4 && isUL}, {{Sgpr32}, {SgprP4}, WidenMMOToS32}}, !hasSMRDSmall) + .Any({{{UniS32, P4}, isNaturalAligned && isUL}, {{Sgpr32}, {SgprP4}}}, hasSMRDSmall) + .Any({{{UniS32, P4}, !isAlign4 || !isUL}, {{UniInVgprS32}, {SgprP4}}}, !hasSMRDSmall) + .Any({{{UniS32, P4}, !isNaturalAligned || !isUL}, {{UniInVgprS32}, {SgprP4}}}, hasSMRDSmall) + + .Any({{DivS32, P5}, {{Vgpr32}, {VgprP5}}}); + + addRulesForGOpcs({G_STORE}) + // addrspace(0) + .Any({{S16, P0}, {{}, {Vgpr16, VgprP0}}}, usesTrue16) // 16-bit store + .Any({{B32, P0}, {{}, {VgprB32, VgprP0}}}) // 32-bit store, 8-bit and 16-bit truncating store + .Any({{B64, P0}, {{}, {VgprB64, VgprP0}}}) + .Any({{B96, P0}, {{}, {VgprB96, VgprP0}}}) + .Any({{B128, P0}, {{}, {VgprB128, VgprP0}}}) + + // addrspace(1), there are no stores to addrspace(4) + // For targets: + // - with "+flat-for-global" - global_store + // - without (-flat-for-global) - buffer_store addr64 + .Any({{S16, DivP1}, {{}, {Vgpr16, VgprP1}}}, usesTrue16) // 16-bit store + .Any({{B32, DivP1}, {{}, {VgprB32, VgprP1}}}) // 32-bit store, 8-bit and 16-bit truncating store + .Any({{B64, DivP1}, {{}, {VgprB64, VgprP1}}}) + .Any({{B96, DivP1}, {{}, {VgprB96, VgprP1}}}) + .Any({{B128, DivP1}, {{}, {VgprB128, VgprP1}}}) + + // For UniP1, use sgpr ptr to match flat-for-global patterns.
Targets: + // - with "+flat-for-global" - global_store for both sgpr and vgpr ptr + // - without (-flat-for-global) - need sgpr ptr to select buffer_store + .Any({{S16, UniP1}, {{}, {Vgpr16, SgprP1}}}, usesTrue16) // 16-bit store + .Any({{B32, UniP1}, {{}, {VgprB32, SgprP1}}}) // 32-bit store, 8-bit and 16-bit truncating store + .Any({{B64, UniP1}, {{}, {VgprB64, SgprP1}}}) + .Any({{B96, UniP1}, {{}, {VgprB96, SgprP1}}}) + .Any({{B128, UniP1}, {{}, {VgprB128, SgprP1}}}) + + // addrspace(3) and addrspace(5) + .Any({{S16, Ptr32}, {{}, {Vgpr16, VgprPtr32}}}, usesTrue16) // 16-bit store + .Any({{B32, Ptr32}, {{}, {VgprB32, VgprPtr32}}}) // 32-bit store, 8-bit and 16-bit truncating store + .Any({{B64, Ptr32}, {{}, {VgprB64, VgprPtr32}}}) + .Any({{B96, Ptr32}, {{}, {VgprB96, VgprPtr32}}}) + .Any({{B128, Ptr32}, {{}, {VgprB128, VgprPtr32}}}); // clang-format on addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, StandardB) @@ -714,12 +850,6 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Div(B128, {{VgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) .Uni(B128, {{UniInVgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}); - addRulesForGOpcs({G_STORE}) - .Any({{S32, P0}, {{}, {Vgpr32, VgprP0}}}) - .Any({{S32, P1}, {{}, {Vgpr32, VgprP1}}}) - .Any({{S64, P1}, {{}, {Vgpr64, VgprP1}}}) - .Any({{V4S32, P1}, {{}, {VgprV4S32, VgprP1}}}); - addRulesForGOpcs({G_AMDGPU_BUFFER_STORE}) .Any({{S32}, {{}, {Vgpr32, SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}}); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index 13914403c439e..d0c69105356b8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -176,6 +176,7 @@ enum RegBankLLTMappingApplyID { // Dst only modifiers: read-any-lane and truncs UniInVcc, + UniInVgprS16, UniInVgprS32, UniInVgprV2S16, UniInVgprV4S32, @@ -221,6 +222,7 @@ enum LoweringMethodID { UniCstExt, SplitLoad, WidenLoad, + WidenMMOToS32 }; enum FastRulesTypes { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll index 83912b1e77db2..97694f3304431 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s define i8 @atomic_load_flat_monotonic_i8(ptr %ptr) { ; GCN-LABEL: atomic_load_flat_monotonic_i8: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll index e2906c3d4fdb2..5d902d5ec98ab 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s define i8 @atomic_load_global_monotonic_i8(ptr addrspace(1) %ptr) { ; GFX6-LABEL: atomic_load_global_monotonic_i8: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local.ll index 70cd96338a0c9..c1dbf91aa9086 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local.ll @@ -1,5 +1,5 @@ -; RUN: llc -global-isel -global-isel-abort=0 -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,CI %s -; RUN: llc -global-isel -global-isel-abort=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -global-isel-abort=0 -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,CI %s +; RUN: llc -global-isel -new-reg-bank-select -global-isel-abort=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: {{^}}atomic_load_monotonic_i8: ; GCN: s_waitcnt diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll index 1656814d6fb06..31cdbbe1c4d73 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; TODO: Merge with atomic_load_local.ll diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_store_local.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_store_local.ll index dea42d62ec2d4..76850f0c0db17 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_store_local.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_store_local.ll @@ -1,5 +1,5 @@ -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,CI %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,CI %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: {{^}}atomic_store_monotonic_i8: ; GCN: s_waitcnt diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll index 18895f7867369..358ecd8fce3a9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll @@ -1,21 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,WAVE64 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1031 < %s | FileCheck -check-prefixes=GCN,WAVE32 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,WAVE64 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1031 < %s | FileCheck -check-prefixes=GCN,WAVE32 %s ; End to end tests for scalar vs. vector boolean legalization strategies. define amdgpu_ps float @select_vgpr_sgpr_trunc_cond(i32 inreg %a, i32 %b, i32 %c) { ; WAVE64-LABEL: select_vgpr_sgpr_trunc_cond: ; WAVE64: ; %bb.0: -; WAVE64-NEXT: s_and_b32 s0, 1, s0 -; WAVE64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; WAVE64-NEXT: s_cmp_lg_u32 s0, 0 +; WAVE64-NEXT: s_cselect_b64 vcc, exec, 0 ; WAVE64-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; WAVE64-NEXT: ; return to shader part epilog ; ; WAVE32-LABEL: select_vgpr_sgpr_trunc_cond: ; WAVE32: ; %bb.0: -; WAVE32-NEXT: s_and_b32 s0, 1, s0 -; WAVE32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; WAVE32-NEXT: s_cmp_lg_u32 s0, 0 +; WAVE32-NEXT: s_cselect_b32 vcc_lo, exec_lo, 0 ; WAVE32-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; WAVE32-NEXT: ; return to shader part epilog %cc = trunc i32 %a to i1 @@ -28,16 +28,16 @@ define amdgpu_ps float @select_vgpr_sgpr_trunc_and_cond(i32 inreg %a.0, i32 inre ; WAVE64-LABEL: select_vgpr_sgpr_trunc_and_cond: ; WAVE64: ; %bb.0: ; WAVE64-NEXT: s_and_b32 s0, s0, s1 -; WAVE64-NEXT: s_and_b32 s0, 1, s0 -; WAVE64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; WAVE64-NEXT: s_cmp_lg_u32 s0, 0 +; WAVE64-NEXT: s_cselect_b64 vcc, exec, 0 ; WAVE64-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; WAVE64-NEXT: ; return to shader part epilog ; ; WAVE32-LABEL: select_vgpr_sgpr_trunc_and_cond: ; WAVE32: ; %bb.0: ; WAVE32-NEXT: s_and_b32 s0, s0, s1 -; WAVE32-NEXT: s_and_b32 s0, 1, s0 -; WAVE32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; WAVE32-NEXT: s_cmp_lg_u32 s0, 0 +; WAVE32-NEXT: s_cselect_b32 vcc_lo, exec_lo, 0 ; WAVE32-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; WAVE32-NEXT: ; return to shader part epilog %cc.0 = trunc i32 %a.0 to i1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bug-legalization-artifact-combiner-dead-def.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bug-legalization-artifact-combiner-dead-def.ll index d317a3ef54162..a79e471b1b5bb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bug-legalization-artifact-combiner-dead-def.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/bug-legalization-artifact-combiner-dead-def.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s define void @value_finder_bug(ptr addrspace(5) %store_ptr, ptr addrspace(4) %ptr) { ; GFX10-LABEL: value_finder_bug: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combiner-crash.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combiner-crash.ll index 356ef52bf21b6..e1ae61be5a66b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combiner-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combiner-crash.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s +; RUN: llc -O0 -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s define amdgpu_kernel void @test_long_add4() { entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll index 8efe711c9267c..b153ff06b727e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - %s | FileCheck %s +; RUN: llc -global-isel -new-reg-bank-select -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - %s | FileCheck %s ; Make sure there's no crash at -O0 when matching MUBUF addressing ; modes for the stack. diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll index 4fc0488ec60cf..990e4f67e420d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -mattr=-xnack < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -mattr=-xnack < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s ; Check lowering of some large extractelement that use the stack ; instead of register indexing. 
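As an illustration of the WidenMMOToS32 path added above (a hypothetical sketch, not taken from the patch's tests): on a target without scalar subword loads, e.g. gfx900, a uniform align-4 zero-extending i8 load from the constant address space should now select roughly an s_load_dword followed by s_and_b32 with 0xff, instead of a VMEM load plus v_readfirstlane_b32. The function name below is invented.

; Matches .Any({{{UniS32, P4}, isAlign4 && isUL}, {{Sgpr32}, {SgprP4}, WidenMMOToS32}}, !hasSMRDSmall)
define amdgpu_ps i32 @sketch_s_zextload_i8(ptr addrspace(4) inreg %p) {
  %b = load i8, ptr addrspace(4) %p, align 4 ; expected to combine into G_ZEXTLOAD
  %e = zext i8 %b to i32
  ret i32 %e
}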
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll index 3605daef17bd3..405861d791169 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) { ; GFX9-LABEL: extractelement_sgpr_v4i128_sgpr_idx: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll index e4acee9ddda7e..798f6eb65e6aa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define amdgpu_ps i16 @extractelement_sgpr_v4i16_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) { ; GFX9-LABEL: extractelement_sgpr_v4i16_sgpr_idx: diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll index ac17dde1f9aa7..de1079196223a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define amdgpu_ps i8 @extractelement_sgpr_v4i8_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) { ; GFX9-LABEL: extractelement_sgpr_v4i8_sgpr_idx: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll index 3e1602625f197..c2129c20e4543 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN %s ; Check lowering of some large insertelement that use the stack ; instead of register indexing. 
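Similarly for the new G_STORE rules (again a hypothetical sketch with an invented name): a store through a uniform addrspace(1) pointer keeps the pointer in sgprs and only copies the data to a vgpr, so the same {VgprB32, SgprP1} mapping can select global_store on +flat-for-global targets and buffer_store on targets without it.

; Kernel arguments are uniform, so %out takes the UniP1 store mapping.
define amdgpu_kernel void @sketch_store_uniform_ptr(ptr addrspace(1) %out, i32 %val) {
  store i32 %val, ptr addrspace(1) %out
  ret void
}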
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll index e4135fae40006..7fd981c3f3fc6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire < %s | FileCheck %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire < %s | FileCheck %s ; TODO: Replace with existing DAG tests @lds_512_4 = internal unnamed_addr addrspace(3) global [128 x i32] poison, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.ptr.ll index 79760ce4a2a0b..5f529f5a3caaf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.ptr.ll @@ -1,4 +1,4 @@ -; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN %s ; FIXME: Error on non-HSA target diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll index ee9cf0b84868f..d37ade73daf5d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll @@ -1,6 +1,6 @@ -; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=CO-V4,HSA,ALL %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -check-prefixes=CO-V4,OS-MESA3D,ALL %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-unknown -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -check-prefixes=OS-UNKNOWN,ALL %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=CO-V4,HSA,ALL %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -check-prefixes=CO-V4,OS-MESA3D,ALL %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-unknown -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -check-prefixes=OS-UNKNOWN,ALL %s ; ALL-LABEL: {{^}}test: ; OS-MESA3D: enable_sgpr_kernarg_segment_ptr = 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll index 0467547e55374..eecd9ae13912e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll @@ -1,4 +1,4 @@ -; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN %s ; FIXME: Error on non-hsa target diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workgroup.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workgroup.id.ll index b2546700a935d..f491df8448a7a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workgroup.id.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workgroup.id.ll @@ -1,8 +1,8 @@ ; RUN: opt 
-mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s -o %t.bc -; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii < %t.bc | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s -; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga < %t.bc | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=hawaii < %t.bc | FileCheck -check-prefixes=ALL,MESA3D %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga < %t.bc | FileCheck -check-prefixes=ALL,MESA3D %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-- -mcpu=hawaii < %t.bc | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-- -mcpu=tonga < %t.bc | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-unknown-mesa3d -mcpu=hawaii < %t.bc | FileCheck -check-prefixes=ALL,MESA3D %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga < %t.bc | FileCheck -check-prefixes=ALL,MESA3D %s declare i32 @llvm.amdgcn.workgroup.id.x() #0 declare i32 @llvm.amdgcn.workgroup.id.y() #0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll index ab8d8c192187f..41fda6de82181 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX12-UNALIGNED %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX12-NOUNALIGNED %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX1250,GFX1250-UNALIGNED %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX1250,GFX1250-NOUNALIGNED %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-NOUNALIGNED %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-UNALIGNED %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-NOUNALIGNED %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX12-UNALIGNED %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX12-NOUNALIGNED %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX1250,GFX1250-UNALIGNED %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX1250,GFX1250-NOUNALIGNED %s +; RUN: llc 
-global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-NOUNALIGNED %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-UNALIGNED %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-NOUNALIGNED %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) { ; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align1: @@ -1000,32 +1000,50 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v0, s[0:1] offset:7 ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v0, s[0:1] offset:8 ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v0, s[0:1] offset:9 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v0, s[0:1] offset:11 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v0, s[0:1] offset:10 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v0, s[0:1] offset:10 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v0, s[0:1] offset:11 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(11) +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) -; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v2, 8, v1 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v2 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v3 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 24, v4 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s3, v4 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(7) +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s4, v5 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) -; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v3, v6, 8, v5 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s5, v6 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s6, v7 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v8 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v4, v5, v3 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s7, v8 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s8, v9 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v6, v10, 8, v9 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s9, v10 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v11 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s10, v11 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v7, v8, v6 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NOUNALIGNED-NEXT: 
v_readfirstlane_b32 s2, v2 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s11, v12 +; GFX9-NOUNALIGNED-NEXT: s_lshl_b32 s1, s1, 8 +; GFX9-NOUNALIGNED-NEXT: s_lshl_b32 s3, s3, 24 +; GFX9-NOUNALIGNED-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NOUNALIGNED-NEXT: s_lshl_b32 s5, s5, 8 +; GFX9-NOUNALIGNED-NEXT: s_lshl_b32 s7, s7, 24 +; GFX9-NOUNALIGNED-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NOUNALIGNED-NEXT: s_lshl_b32 s9, s9, 8 +; GFX9-NOUNALIGNED-NEXT: s_lshl_b32 s11, s11, 24 +; GFX9-NOUNALIGNED-NEXT: s_lshl_b32 s10, s10, 16 +; GFX9-NOUNALIGNED-NEXT: s_or_b32 s0, s1, s0 +; GFX9-NOUNALIGNED-NEXT: s_or_b32 s1, s3, s2 +; GFX9-NOUNALIGNED-NEXT: s_or_b32 s2, s5, s4 +; GFX9-NOUNALIGNED-NEXT: s_or_b32 s3, s7, s6 +; GFX9-NOUNALIGNED-NEXT: s_or_b32 s4, s9, s8 +; GFX9-NOUNALIGNED-NEXT: s_or_b32 s5, s11, s10 +; GFX9-NOUNALIGNED-NEXT: s_or_b32 s0, s1, s0 +; GFX9-NOUNALIGNED-NEXT: s_or_b32 s1, s3, s2 +; GFX9-NOUNALIGNED-NEXT: s_or_b32 s2, s5, s4 ; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog ; ; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align1: @@ -1043,102 +1061,120 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg ; GFX7-NOUNALIGNED: ; %bb.0: ; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s2, -1 ; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:1 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:3 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:1 ; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:2 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 offset:5 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v4, off, s[0:3], 0 offset:7 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v5, off, s[0:3], 0 offset:6 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v6, off, s[0:3], 0 offset:9 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v7, off, s[0:3], 0 offset:11 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 offset:10 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v9, off, s[0:3], 0 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v10, off, s[0:3], 0 offset:4 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, off, s[0:3], 0 offset:8 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 offset:3 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v4, off, s[0:3], 0 offset:4 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v5, off, s[0:3], 0 offset:5 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v6, off, s[0:3], 0 offset:6 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v7, off, s[0:3], 0 offset:7 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 offset:8 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v9, off, s[0:3], 0 offset:9 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v10, off, s[0:3], 0 offset:10 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, off, s[0:3], 0 offset:11 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(11) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s3, v3 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt 
vmcnt(7) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s4, v4 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s5, v5 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s6, v6 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s7, v7 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s8, v8 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v9 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s9, v9 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v3, v10 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s10, v10 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v4, v6, v11 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v5, v7, v8 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v3, v2 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v5, v4 -; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 -; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 -; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s11, v11 +; GFX7-NOUNALIGNED-NEXT: s_lshl_b32 s1, s1, 8 +; GFX7-NOUNALIGNED-NEXT: s_lshl_b32 s3, s3, 24 +; GFX7-NOUNALIGNED-NEXT: s_lshl_b32 s2, s2, 16 +; GFX7-NOUNALIGNED-NEXT: s_lshl_b32 s5, s5, 8 +; GFX7-NOUNALIGNED-NEXT: s_lshl_b32 s7, s7, 24 +; GFX7-NOUNALIGNED-NEXT: s_lshl_b32 s6, s6, 16 +; GFX7-NOUNALIGNED-NEXT: s_lshl_b32 s9, s9, 8 +; GFX7-NOUNALIGNED-NEXT: s_lshl_b32 s11, s11, 24 +; GFX7-NOUNALIGNED-NEXT: s_lshl_b32 s10, s10, 16 +; GFX7-NOUNALIGNED-NEXT: s_or_b32 s0, s1, s0 +; GFX7-NOUNALIGNED-NEXT: s_or_b32 s1, s3, s2 +; GFX7-NOUNALIGNED-NEXT: s_or_b32 s2, s5, s4 +; GFX7-NOUNALIGNED-NEXT: s_or_b32 s3, s7, s6 +; GFX7-NOUNALIGNED-NEXT: s_or_b32 s4, s9, s8 +; GFX7-NOUNALIGNED-NEXT: s_or_b32 s5, s11, s10 +; GFX7-NOUNALIGNED-NEXT: s_or_b32 s0, s1, s0 +; GFX7-NOUNALIGNED-NEXT: s_or_b32 s1, s3, s2 +; GFX7-NOUNALIGNED-NEXT: s_or_b32 s2, s5, s4 ; GFX7-NOUNALIGNED-NEXT: ; return to shader part epilog ; ; GFX6-LABEL: s_load_constant_v3i32_align1: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:1 -; GFX6-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:3 +; GFX6-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:1 ; GFX6-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:2 -; GFX6-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 offset:5 -; GFX6-NEXT: buffer_load_ubyte v4, off, s[0:3], 0 offset:7 -; GFX6-NEXT: buffer_load_ubyte v5, off, s[0:3], 0 offset:6 -; GFX6-NEXT: buffer_load_ubyte v6, off, s[0:3], 0 offset:9 -; GFX6-NEXT: buffer_load_ubyte v7, off, s[0:3], 0 offset:11 -; GFX6-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 offset:10 -; GFX6-NEXT: buffer_load_ubyte v9, off, s[0:3], 0 -; GFX6-NEXT: buffer_load_ubyte v10, off, s[0:3], 0 offset:4 -; GFX6-NEXT: buffer_load_ubyte v11, off, s[0:3], 0 offset:8 +; GFX6-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 offset:3 +; 
GFX6-NEXT: buffer_load_ubyte v4, off, s[0:3], 0 offset:4 +; GFX6-NEXT: buffer_load_ubyte v5, off, s[0:3], 0 offset:5 +; GFX6-NEXT: buffer_load_ubyte v6, off, s[0:3], 0 offset:6 +; GFX6-NEXT: buffer_load_ubyte v7, off, s[0:3], 0 offset:7 +; GFX6-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 offset:8 +; GFX6-NEXT: buffer_load_ubyte v9, off, s[0:3], 0 offset:9 +; GFX6-NEXT: buffer_load_ubyte v10, off, s[0:3], 0 offset:10 +; GFX6-NEXT: buffer_load_ubyte v11, off, s[0:3], 0 offset:11 ; GFX6-NEXT: s_waitcnt vmcnt(11) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-NEXT: s_waitcnt vmcnt(10) -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX6-NEXT: v_readfirstlane_b32 s1, v1 ; GFX6-NEXT: s_waitcnt vmcnt(9) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_readfirstlane_b32 s2, v2 ; GFX6-NEXT: s_waitcnt vmcnt(8) -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX6-NEXT: v_readfirstlane_b32 s3, v3 ; GFX6-NEXT: s_waitcnt vmcnt(7) -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX6-NEXT: v_readfirstlane_b32 s4, v4 ; GFX6-NEXT: s_waitcnt vmcnt(6) -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_readfirstlane_b32 s5, v5 ; GFX6-NEXT: s_waitcnt vmcnt(5) -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX6-NEXT: v_readfirstlane_b32 s6, v6 ; GFX6-NEXT: s_waitcnt vmcnt(4) -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX6-NEXT: v_readfirstlane_b32 s7, v7 ; GFX6-NEXT: s_waitcnt vmcnt(3) -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX6-NEXT: v_readfirstlane_b32 s8, v8 ; GFX6-NEXT: s_waitcnt vmcnt(2) -; GFX6-NEXT: v_or_b32_e32 v0, v0, v9 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_readfirstlane_b32 s9, v9 ; GFX6-NEXT: s_waitcnt vmcnt(1) -; GFX6-NEXT: v_or_b32_e32 v2, v3, v10 -; GFX6-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX6-NEXT: v_readfirstlane_b32 s10, v10 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_or_b32_e32 v4, v6, v11 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v8 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v3, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v5, v4 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v2 +; GFX6-NEXT: v_readfirstlane_b32 s11, v11 +; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_lshl_b32 s3, s3, 24 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_lshl_b32 s5, s5, 8 +; GFX6-NEXT: s_lshl_b32 s7, s7, 24 +; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_lshl_b32 s9, s9, 8 +; GFX6-NEXT: s_lshl_b32 s11, s11, 24 +; GFX6-NEXT: s_lshl_b32 s10, s10, 16 +; GFX6-NEXT: s_or_b32 s0, s1, s0 +; GFX6-NEXT: s_or_b32 s1, s3, s2 +; GFX6-NEXT: s_or_b32 s2, s5, s4 +; GFX6-NEXT: s_or_b32 s3, s7, s6 +; GFX6-NEXT: s_or_b32 s4, s9, s8 +; GFX6-NEXT: s_or_b32 s5, s11, s10 +; GFX6-NEXT: s_or_b32 s0, s1, s0 +; GFX6-NEXT: s_or_b32 s1, s3, s2 +; GFX6-NEXT: s_or_b32 s2, s5, s4 ; GFX6-NEXT: ; return to shader part epilog %load = load <3 x i32>, ptr addrspace(4) %ptr, align 1 ret <3 x i32> %load @@ -1220,15 +1256,24 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg ; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v0, s[0:1] offset:6 ; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v0, s[0:1] offset:8 ; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v0, s[0:1] offset:10 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v2, 16, v1 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NOUNALIGNED-NEXT: 
v_readfirstlane_b32 s1, v2 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v3 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s3, v4 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s4, v5 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v2, v6, 16, v5 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s5, v6 +; GFX9-NOUNALIGNED-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9-NOUNALIGNED-NEXT: s_lshl_b32 s3, s3, 16 +; GFX9-NOUNALIGNED-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NOUNALIGNED-NEXT: s_or_b32 s0, s1, s0 +; GFX9-NOUNALIGNED-NEXT: s_or_b32 s1, s3, s2 +; GFX9-NOUNALIGNED-NEXT: s_or_b32 s2, s5, s4 ; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog ; ; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align2: @@ -1246,54 +1291,60 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg ; GFX7-NOUNALIGNED: ; %bb.0: ; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s2, -1 ; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:6 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:10 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, off, s[0:3], 0 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:4 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:8 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:2 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:6 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:8 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:10 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s3, v3 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s4, v4 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 -; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 -; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s5, v5 +; GFX7-NOUNALIGNED-NEXT: s_lshl_b32 s1, s1, 16 +; GFX7-NOUNALIGNED-NEXT: s_lshl_b32 s3, s3, 16 +; GFX7-NOUNALIGNED-NEXT: s_lshl_b32 s5, s5, 16 +; GFX7-NOUNALIGNED-NEXT: s_or_b32 s0, s1, s0 +; GFX7-NOUNALIGNED-NEXT: s_or_b32 s1, s3, s2 +; GFX7-NOUNALIGNED-NEXT: s_or_b32 s2, s5, s4 ; GFX7-NOUNALIGNED-NEXT: ; return to shader part epilog ; ; GFX6-LABEL: s_load_constant_v3i32_align2: ; 
GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 -; GFX6-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:6 -; GFX6-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:10 -; GFX6-NEXT: buffer_load_ushort v3, off, s[0:3], 0 -; GFX6-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:4 -; GFX6-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:8 +; GFX6-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:2 +; GFX6-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4 +; GFX6-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:6 +; GFX6-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:8 +; GFX6-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:10 ; GFX6-NEXT: s_waitcnt vmcnt(5) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-NEXT: s_waitcnt vmcnt(4) -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_readfirstlane_b32 s1, v1 ; GFX6-NEXT: s_waitcnt vmcnt(3) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_readfirstlane_b32 s2, v2 ; GFX6-NEXT: s_waitcnt vmcnt(2) -; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_readfirstlane_b32 s3, v3 ; GFX6-NEXT: s_waitcnt vmcnt(1) -; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX6-NEXT: v_readfirstlane_b32 s4, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v2 +; GFX6-NEXT: v_readfirstlane_b32 s5, v5 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_lshl_b32 s5, s5, 16 +; GFX6-NEXT: s_or_b32 s0, s1, s0 +; GFX6-NEXT: s_or_b32 s1, s3, s2 +; GFX6-NEXT: s_or_b32 s2, s5, s4 ; GFX6-NEXT: ; return to shader part epilog %load = load <3 x i32>, ptr addrspace(4) %ptr, align 2 ret <3 x i32> %load diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant32bit.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant32bit.ll index 44e4320cddb22..0038a097174c6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant32bit.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; FIXME: Test should be redundant with constant-address-space-32bit.ll ; It's important to check with gfx8 and gfx9 to check access through global and flat. 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-divergent.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-divergent.ll new file mode 100644 index 0000000000000..197133441d3a5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-divergent.ll @@ -0,0 +1,492 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode,-real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-True16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode,+real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-NoTrue16 %s + +define amdgpu_ps void @load_divergent_P0_i8_any_extending(ptr addrspace(0) %ptra, ptr addrspace(0) %out) { +; GFX12-LABEL: load_divergent_P0_i8_any_extending: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_load_u8 v0, v[0:1] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_store_b8 v[2:3], v0 +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(0) %ptra + store i8 %a, ptr addrspace(0) %out + ret void +} + +; with true16, S16 16-bit load +; without true16, S32 16-bit any-extending load +define amdgpu_ps void @load_divergent_P0_i16(ptr addrspace(0) %ptra, ptr addrspace(0) %out) { +; GFX12-True16-LABEL: load_divergent_P0_i16: +; GFX12-True16: ; %bb.0: +; GFX12-True16-NEXT: flat_load_u16 v0, v[0:1] +; GFX12-True16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-True16-NEXT: flat_store_b16 v[2:3], v0 +; GFX12-True16-NEXT: s_endpgm +; +; GFX12-NoTrue16-LABEL: load_divergent_P0_i16: +; GFX12-NoTrue16: ; %bb.0: +; GFX12-NoTrue16-NEXT: flat_load_d16_b16 v0, v[0:1] +; GFX12-NoTrue16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NoTrue16-NEXT: flat_store_b16 v[2:3], v0 +; GFX12-NoTrue16-NEXT: s_endpgm + %a = load i16, ptr addrspace(0) %ptra + store i16 %a, ptr addrspace(0) %out + ret void +} + +define amdgpu_ps void @load_divergent_P0_i32(ptr addrspace(0) %ptra, ptr addrspace(0) %out) { +; GFX12-LABEL: load_divergent_P0_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_load_b32 v0, v[0:1] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_store_b32 v[2:3], v0 +; GFX12-NEXT: s_endpgm + %a = load i32, ptr addrspace(0) %ptra + store i32 %a, ptr addrspace(0) %out + ret void +} + +define amdgpu_ps void @load_divergent_P0_v2i32(ptr addrspace(0) %ptra, ptr addrspace(0) %out) { +; GFX12-LABEL: load_divergent_P0_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm + %a = load <2 x i32>, ptr addrspace(0) %ptra + store <2 x i32> %a, ptr addrspace(0) %out + ret void +} + +define amdgpu_ps void @load_divergent_P0_v3i32(ptr addrspace(0) %ptra, ptr addrspace(0) %out) { +; GFX12-LABEL: load_divergent_P0_v3i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_load_b96 v[4:6], v[0:1] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_store_b96 v[2:3], v[4:6] +; GFX12-NEXT: s_endpgm + %a = load <3 x i32>, ptr addrspace(0) %ptra + store <3 x i32> %a, ptr addrspace(0) %out + ret void +} + +define amdgpu_ps void @load_divergent_P0_v4i32(ptr addrspace(0) %ptra, ptr addrspace(0) %out) { +; GFX12-LABEL: load_divergent_P0_v4i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_load_b128 v[4:7], v[0:1] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_store_b128 v[2:3], v[4:7] +; GFX12-NEXT: s_endpgm + %a = load <4 x i32>, ptr addrspace(0) %ptra + store <4 x i32> %a, ptr addrspace(0) %out + ret 
void +} + + + +define amdgpu_ps void @load_divergent_P1_i8_any_extending(ptr addrspace(1) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: load_divergent_P1_i8_any_extending: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b8 v[2:3], v0, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(1) %ptra + store i8 %a, ptr addrspace(1) %out + ret void +} + +; with true16, S16 16-bit load +; without true16, S32 16-bit any-extending load +define amdgpu_ps void @load_divergent_P1_i16(ptr addrspace(1) %ptra, ptr addrspace(1) %out) { +; GFX12-True16-LABEL: load_divergent_P1_i16: +; GFX12-True16: ; %bb.0: +; GFX12-True16-NEXT: global_load_u16 v0, v[0:1], off +; GFX12-True16-NEXT: s_wait_loadcnt 0x0 +; GFX12-True16-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-True16-NEXT: s_endpgm +; +; GFX12-NoTrue16-LABEL: load_divergent_P1_i16: +; GFX12-NoTrue16: ; %bb.0: +; GFX12-NoTrue16-NEXT: global_load_d16_b16 v0, v[0:1], off +; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0 +; GFX12-NoTrue16-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-NoTrue16-NEXT: s_endpgm + %a = load i16, ptr addrspace(1) %ptra + store i16 %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @load_divergent_P1_i32(ptr addrspace(1) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: load_divergent_P1_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b32 v0, v[0:1], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v[2:3], v0, off +; GFX12-NEXT: s_endpgm + %a = load i32, ptr addrspace(1) %ptra + store i32 %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @load_divergent_P1_v2i32(ptr addrspace(1) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: load_divergent_P1_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX12-NEXT: s_endpgm + %a = load <2 x i32>, ptr addrspace(1) %ptra + store <2 x i32> %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @load_divergent_P1_v3i32(ptr addrspace(1) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: load_divergent_P1_v3i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b96 v[4:6], v[0:1], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b96 v[2:3], v[4:6], off +; GFX12-NEXT: s_endpgm + %a = load <3 x i32>, ptr addrspace(1) %ptra + store <3 x i32> %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @load_divergent_P1_v4i32(ptr addrspace(1) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: load_divergent_P1_v4i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b128 v[4:7], v[0:1], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_endpgm + %a = load <4 x i32>, ptr addrspace(1) %ptra + store <4 x i32> %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @load_divergent_P1_v8i32(ptr addrspace(1) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: load_divergent_P1_v8i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_load_b128 v[4:7], v[0:1], off +; GFX12-NEXT: global_load_b128 v[8:11], v[0:1], off offset:16 +; GFX12-NEXT: s_wait_loadcnt 0x1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX12-NEXT: s_endpgm + %a = load <8 x i32>, ptr addrspace(1) %ptra + store <8 x i32> %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@load_divergent_P1_v16i32(ptr addrspace(1) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: load_divergent_P1_v16i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_load_b128 v[4:7], v[0:1], off +; GFX12-NEXT: global_load_b128 v[8:11], v[0:1], off offset:16 +; GFX12-NEXT: global_load_b128 v[12:15], v[0:1], off offset:32 +; GFX12-NEXT: global_load_b128 v[16:19], v[0:1], off offset:48 +; GFX12-NEXT: s_wait_loadcnt 0x3 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_wait_loadcnt 0x2 +; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX12-NEXT: s_wait_loadcnt 0x1 +; GFX12-NEXT: global_store_b128 v[2:3], v[12:15], off offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b128 v[2:3], v[16:19], off offset:48 +; GFX12-NEXT: s_endpgm + %a = load <16 x i32>, ptr addrspace(1) %ptra + store <16 x i32> %a, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @load_divergent_P3_i8_any_extending(ptr addrspace(3) %ptra, ptr addrspace(3) %out) { +; GFX12-LABEL: load_divergent_P3_i8_any_extending: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_load_u8 v0, v0 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: ds_store_b8 v1, v0 +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(3) %ptra + store i8 %a, ptr addrspace(3) %out + ret void +} + +; with true16, S16 16-bit load +; without true16, S32 16-bit any-extending load +define amdgpu_ps void @load_divergent_P3_i16(ptr addrspace(3) %ptra, ptr addrspace(3) %out) { +; GFX12-True16-LABEL: load_divergent_P3_i16: +; GFX12-True16: ; %bb.0: +; GFX12-True16-NEXT: ds_load_u16 v0, v0 +; GFX12-True16-NEXT: s_wait_dscnt 0x0 +; GFX12-True16-NEXT: ds_store_b16 v1, v0 +; GFX12-True16-NEXT: s_endpgm +; +; GFX12-NoTrue16-LABEL: load_divergent_P3_i16: +; GFX12-NoTrue16: ; %bb.0: +; GFX12-NoTrue16-NEXT: ds_load_u16_d16 v0, v0 +; GFX12-NoTrue16-NEXT: s_wait_dscnt 0x0 +; GFX12-NoTrue16-NEXT: ds_store_b16 v1, v0 +; GFX12-NoTrue16-NEXT: s_endpgm + %a = load i16, ptr addrspace(3) %ptra + store i16 %a, ptr addrspace(3) %out + ret void +} + +define amdgpu_ps void @load_divergent_P3_i32(ptr addrspace(3) %ptra, ptr addrspace(3) %out) { +; GFX12-LABEL: load_divergent_P3_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_load_b32 v0, v0 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: ds_store_b32 v1, v0 +; GFX12-NEXT: s_endpgm + %a = load i32, ptr addrspace(3) %ptra + store i32 %a, ptr addrspace(3) %out + ret void +} + +define amdgpu_ps void @load_divergent_P3_v2i32(ptr addrspace(3) %ptra, ptr addrspace(3) %out) { +; GFX12-LABEL: load_divergent_P3_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_load_b64 v[2:3], v0 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: ds_store_b64 v1, v[2:3] +; GFX12-NEXT: s_endpgm + %a = load <2 x i32>, ptr addrspace(3) %ptra + store <2 x i32> %a, ptr addrspace(3) %out + ret void +} + +define amdgpu_ps void @load_divergent_P3_v3i32(ptr addrspace(3) %ptra, ptr addrspace(3) %out) { +; GFX12-LABEL: load_divergent_P3_v3i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_load_b96 v[2:4], v0 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: ds_store_b96 v1, v[2:4] +; GFX12-NEXT: s_endpgm + %a = load <3 x i32>, ptr addrspace(3) %ptra + store <3 x i32> %a, ptr addrspace(3) %out + ret void +} + +define amdgpu_ps void @load_divergent_P3_v4i32(ptr addrspace(3) %ptra, ptr addrspace(3) %out) { +; GFX12-LABEL: load_divergent_P3_v4i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_load_b128 v[2:5], v0 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: ds_store_b128 v1, v[2:5] +; GFX12-NEXT: s_endpgm + %a = load <4 x i32>, ptr addrspace(3) 
%ptra + store <4 x i32> %a, ptr addrspace(3) %out + ret void +} + + + +define amdgpu_ps void @load_divergent_P4_i8_any_extending(ptr addrspace(4) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: load_divergent_P4_i8_any_extending: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b8 v[2:3], v0, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(4) %ptra + store i8 %a, ptr addrspace(1) %out + ret void +} + +; with true16, S16 16-bit load +; without true16, S32 16-bit any-extending load +define amdgpu_ps void @load_divergent_P4_i16(ptr addrspace(4) %ptra, ptr addrspace(1) %out) { +; GFX12-True16-LABEL: load_divergent_P4_i16: +; GFX12-True16: ; %bb.0: +; GFX12-True16-NEXT: global_load_u16 v0, v[0:1], off +; GFX12-True16-NEXT: s_wait_loadcnt 0x0 +; GFX12-True16-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-True16-NEXT: s_endpgm +; +; GFX12-NoTrue16-LABEL: load_divergent_P4_i16: +; GFX12-NoTrue16: ; %bb.0: +; GFX12-NoTrue16-NEXT: global_load_d16_b16 v0, v[0:1], off +; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0 +; GFX12-NoTrue16-NEXT: global_store_b16 v[2:3], v0, off +; GFX12-NoTrue16-NEXT: s_endpgm + %a = load i16, ptr addrspace(4) %ptra + store i16 %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @load_divergent_P4_i32(ptr addrspace(4) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: load_divergent_P4_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b32 v0, v[0:1], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v[2:3], v0, off +; GFX12-NEXT: s_endpgm + %a = load i32, ptr addrspace(4) %ptra + store i32 %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @load_divergent_P4_v2i32(ptr addrspace(4) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: load_divergent_P4_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX12-NEXT: s_endpgm + %a = load <2 x i32>, ptr addrspace(4) %ptra + store <2 x i32> %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @load_divergent_P4_v3i32(ptr addrspace(4) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: load_divergent_P4_v3i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b96 v[4:6], v[0:1], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b96 v[2:3], v[4:6], off +; GFX12-NEXT: s_endpgm + %a = load <3 x i32>, ptr addrspace(4) %ptra + store <3 x i32> %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @load_divergent_P4_v4i32(ptr addrspace(4) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: load_divergent_P4_v4i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b128 v[4:7], v[0:1], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_endpgm + %a = load <4 x i32>, ptr addrspace(4) %ptra + store <4 x i32> %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @load_divergent_P4_v8i32(ptr addrspace(4) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: load_divergent_P4_v8i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_load_b128 v[4:7], v[0:1], off +; GFX12-NEXT: global_load_b128 v[8:11], v[0:1], off offset:16 +; GFX12-NEXT: s_wait_loadcnt 0x1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX12-NEXT: s_endpgm + %a = load <8 x i32>, ptr addrspace(4) %ptra + store <8 x i32> %a, ptr addrspace(1) 
%out + ret void +} + +define amdgpu_ps void @load_divergent_P4_v16i32(ptr addrspace(4) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: load_divergent_P4_v16i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_load_b128 v[4:7], v[0:1], off +; GFX12-NEXT: global_load_b128 v[8:11], v[0:1], off offset:16 +; GFX12-NEXT: global_load_b128 v[12:15], v[0:1], off offset:32 +; GFX12-NEXT: global_load_b128 v[16:19], v[0:1], off offset:48 +; GFX12-NEXT: s_wait_loadcnt 0x3 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_wait_loadcnt 0x2 +; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX12-NEXT: s_wait_loadcnt 0x1 +; GFX12-NEXT: global_store_b128 v[2:3], v[12:15], off offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b128 v[2:3], v[16:19], off offset:48 +; GFX12-NEXT: s_endpgm + %a = load <16 x i32>, ptr addrspace(4) %ptra + store <16 x i32> %a, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @load_divergent_P5_i8_any_extending(ptr addrspace(5) %ptra, ptr addrspace(5) %out) { +; GFX12-LABEL: load_divergent_P5_i8_any_extending: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_load_u8 v0, v0, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: scratch_store_b8 v1, v0, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(5) %ptra + store i8 %a, ptr addrspace(5) %out + ret void +} + +; with true16, S16 16-bit load +; without true16, S32 16-bit any-extending load +define amdgpu_ps void @load_divergent_P5_i16(ptr addrspace(5) %ptra, ptr addrspace(5) %out) { +; GFX12-True16-LABEL: load_divergent_P5_i16: +; GFX12-True16: ; %bb.0: +; GFX12-True16-NEXT: scratch_load_u16 v0, v0, off +; GFX12-True16-NEXT: s_wait_loadcnt 0x0 +; GFX12-True16-NEXT: scratch_store_b16 v1, v0, off +; GFX12-True16-NEXT: s_endpgm +; +; GFX12-NoTrue16-LABEL: load_divergent_P5_i16: +; GFX12-NoTrue16: ; %bb.0: +; GFX12-NoTrue16-NEXT: scratch_load_d16_b16 v0, v0, off +; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0 +; GFX12-NoTrue16-NEXT: scratch_store_b16 v1, v0, off +; GFX12-NoTrue16-NEXT: s_endpgm + %a = load i16, ptr addrspace(5) %ptra + store i16 %a, ptr addrspace(5) %out + ret void +} + +define amdgpu_ps void @load_divergent_P5_i32(ptr addrspace(5) %ptra, ptr addrspace(5) %out) { +; GFX12-LABEL: load_divergent_P5_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_load_b32 v0, v0, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: scratch_store_b32 v1, v0, off +; GFX12-NEXT: s_endpgm + %a = load i32, ptr addrspace(5) %ptra + store i32 %a, ptr addrspace(5) %out + ret void +} + +define amdgpu_ps void @load_divergent_P5_v2i32(ptr addrspace(5) %ptra, ptr addrspace(5) %out) { +; GFX12-LABEL: load_divergent_P5_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_load_b64 v[2:3], v0, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: scratch_store_b64 v1, v[2:3], off +; GFX12-NEXT: s_endpgm + %a = load <2 x i32>, ptr addrspace(5) %ptra + store <2 x i32> %a, ptr addrspace(5) %out + ret void +} + +define amdgpu_ps void @load_divergent_P5_v3i32(ptr addrspace(5) %ptra, ptr addrspace(5) %out) { +; GFX12-LABEL: load_divergent_P5_v3i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_load_b96 v[2:4], v0, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: scratch_store_b96 v1, v[2:4], off +; GFX12-NEXT: s_endpgm + %a = load <3 x i32>, ptr addrspace(5) %ptra + store <3 x i32> %a, ptr addrspace(5) %out + ret void +} + +define amdgpu_ps void @load_divergent_P5_v4i32(ptr addrspace(5) %ptra, ptr addrspace(5) %out) { +; GFX12-LABEL: load_divergent_P5_v4i32: +; GFX12: ; 
%bb.0: +; GFX12-NEXT: scratch_load_b128 v[2:5], v0, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: scratch_store_b128 v1, v[2:5], off +; GFX12-NEXT: s_endpgm + %a = load <4 x i32>, ptr addrspace(5) %ptra + store <4 x i32> %a, ptr addrspace(5) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll index caaface376f21..d7fcbd5d623c9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s ; FIXME: -; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s +; XUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s define <4 x i32> @load_lds_v4i32(ptr addrspace(3) %ptr) { ; GFX9-LABEL: load_lds_v4i32: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll index cbfdfd3286884..191f2e0670e15 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s ; FIXME: -; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s +; XUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s define <3 x i32> @load_lds_v3i32(ptr addrspace(3) %ptr) { ; GFX9-LABEL: load_lds_v3i32: diff 
--git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll index ed248b450582c..b1de0eff05d30 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX7 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX7 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX11 %s + +; FIXME: need merge/unmerge artifact combine ; Unaligned DS access in available from GFX9 onwards. ; LDS alignment enforcement is controlled by a configuration register: @@ -283,8 +285,24 @@ define amdgpu_ps void @test_s_load_constant_v8i32_align1(ptr addrspace(4) inreg ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: v_readfirstlane_b32 s5, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_readfirstlane_b32 s7, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s8, v4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_readfirstlane_b32 s9, v5 +; GFX9-NEXT: v_readfirstlane_b32 s10, v6 +; GFX9-NEXT: v_readfirstlane_b32 s11, v7 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3] -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] offset:16 ; GFX9-NEXT: s_endpgm ; @@ -298,8 +316,24 @@ define amdgpu_ps void @test_s_load_constant_v8i32_align1(ptr addrspace(4) inreg ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: v_readfirstlane_b32 s2, v2 +; GFX7-NEXT: v_readfirstlane_b32 s3, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s8, v4 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_readfirstlane_b32 s9, v5 +; GFX7-NEXT: v_readfirstlane_b32 s10, v6 +; GFX7-NEXT: v_readfirstlane_b32 s11, v7 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: v_mov_b32_e32
v4, s8 +; GFX7-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-NEXT: v_mov_b32_e32 v6, s10 +; GFX7-NEXT: v_mov_b32_e32 v7, s11 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -310,8 +344,24 @@ define amdgpu_ps void @test_s_load_constant_v8i32_align1(ptr addrspace(4) inreg ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1] ; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3] +; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: v_readfirstlane_b32 s5, v1 +; GFX10-NEXT: v_readfirstlane_b32 s6, v2 +; GFX10-NEXT: v_readfirstlane_b32 s7, v3 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s8, v4 +; GFX10-NEXT: v_readfirstlane_b32 s9, v5 +; GFX10-NEXT: v_readfirstlane_b32 s10, v6 +; GFX10-NEXT: v_readfirstlane_b32 s11, v7 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: v_mov_b32_e32 v4, s8 +; GFX10-NEXT: v_mov_b32_e32 v5, s9 +; GFX10-NEXT: v_mov_b32_e32 v6, s10 +; GFX10-NEXT: v_mov_b32_e32 v7, s11 +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3] ; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] offset:16 ; GFX10-NEXT: s_endpgm ; @@ -322,8 +372,22 @@ define amdgpu_ps void @test_s_load_constant_v8i32_align1(ptr addrspace(4) inreg ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[0:1] ; GFX11-NEXT: global_load_b128 v[4:7], v8, s[0:1] offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: global_store_b128 v8, v[0:3], s[2:3] +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-NEXT: v_readfirstlane_b32 s7, v3 ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s8, v4 +; GFX11-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-NEXT: v_readfirstlane_b32 s10, v6 +; GFX11-NEXT: v_readfirstlane_b32 s11, v7 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s11 +; GFX11-NEXT: v_dual_mov_b32 v5, s9 :: v_dual_mov_b32 v6, s10 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[2:3] ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[2:3] offset:16 ; GFX11-NEXT: s_endpgm %load = load <8 x i32>, ptr addrspace(4) %ptr, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll index 92e532b6cf340..4361e5c113708 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll @@ -1,95 +1,2135 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+unaligned-access-mode < %s | FileCheck %s - -define amdgpu_ps void @uniform_load_i32(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1, ptr addrspace(1) inreg %ptr2) { -; CHECK-LABEL: uniform_load_i32: -; CHECK: ; %bb.0: -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: global_load_dword v1, v0, s[0:1] glc dlc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_load_dword v2, v0, s[2:3] -; CHECK-NEXT: v_readfirstlane_b32 s0, 
v1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readfirstlane_b32 s1, v2 -; CHECK-NEXT: s_add_i32 s0, s0, s1 -; CHECK-NEXT: v_mov_b32_e32 v1, s0 -; CHECK-NEXT: global_store_dword v0, v1, s[4:5] -; CHECK-NEXT: s_endpgm - %load0 = load volatile i32, ptr addrspace(1) %ptr0 - %load1 = load i32, ptr addrspace(1) %ptr1, align 1 - %sum = add i32 %load0, %load1 - store i32 %sum, ptr addrspace(1) %ptr2 - ret void -} - -define amdgpu_ps void @uniform_load_v2i32(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1) { -; CHECK-LABEL: uniform_load_v2i32: -; CHECK: ; %bb.0: -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: v_readfirstlane_b32 s1, v1 -; CHECK-NEXT: s_add_i32 s0, s0, s1 -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: global_store_dword v2, v0, s[2:3] -; CHECK-NEXT: s_endpgm - %load = load volatile <2 x i32>, ptr addrspace(1) %ptr0 - %elt0 = extractelement <2 x i32> %load, i32 0 - %elt1 = extractelement <2 x i32> %load, i32 1 - %sum = add i32 %elt0, %elt1 - store i32 %sum, ptr addrspace(1) %ptr1 - ret void -} - -define amdgpu_ps void @uniform_load_v3i32(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1) { -; CHECK-LABEL: uniform_load_v3i32: -; CHECK: ; %bb.0: -; CHECK-NEXT: v_mov_b32_e32 v3, 0 -; CHECK-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: v_readfirstlane_b32 s1, v1 -; CHECK-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-NEXT: s_add_i32 s0, s0, s1 -; CHECK-NEXT: s_add_i32 s0, s0, s4 -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: global_store_dword v3, v0, s[2:3] -; CHECK-NEXT: s_endpgm - %load = load <3 x i32>, ptr addrspace(1) %ptr0, align 2 - %elt0 = extractelement <3 x i32> %load, i32 0 - %elt1 = extractelement <3 x i32> %load, i32 1 - %elt2 = extractelement <3 x i32> %load, i32 2 - %sum0 = add i32 %elt0, %elt1 - %sum = add i32 %sum0, %elt2 - store i32 %sum, ptr addrspace(1) %ptr1 - ret void -} - -define amdgpu_ps void @uniform_load_v4i32(ptr addrspace(1) inreg %ptr0, ptr addrspace(1) inreg %ptr1) { -; CHECK-LABEL: uniform_load_v4i32: -; CHECK: ; %bb.0: -; CHECK-NEXT: v_mov_b32_e32 v4, 0 -; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] glc dlc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: v_readfirstlane_b32 s1, v1 -; CHECK-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-NEXT: s_add_i32 s0, s0, s1 -; CHECK-NEXT: s_add_i32 s0, s0, s4 -; CHECK-NEXT: s_add_i32 s0, s0, s5 -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: global_store_dword v4, v0, s[2:3] -; CHECK-NEXT: s_endpgm - %load = load volatile <4 x i32>, ptr addrspace(1) %ptr0 - %elt0 = extractelement <4 x i32> %load, i32 0 - %elt1 = extractelement <4 x i32> %load, i32 1 - %elt2 = extractelement <4 x i32> %load, i32 2 - %elt3 = extractelement <4 x i32> %load, i32 3 - %sum0 = add i32 %elt0, %elt1 - %sum1 = add i32 %sum0, %elt2 - %sum = add i32 %sum1, %elt3 - store i32 %sum, ptr addrspace(1) %ptr1 +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefixes=GFX11,GFX11-True16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal 
-mcpu=gfx1100 -mattr=+unaligned-access-mode -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-NoTrue16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-True16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-NoTrue16 %s + +; global address space, addrspace(1) + +; gfx12 true16, not natural alignment or not uniform mmo +define amdgpu_ps void @load_uniform_P1_i16_b16_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { +; GFX7-LABEL: load_uniform_P1_i16_b16_gfx12: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 +; GFX7-NEXT: buffer_load_ushort v3, off, s[4:7], 0 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v2 +; GFX7-NEXT: v_readfirstlane_b32 s1, v3 +; GFX7-NEXT: s_add_i32 s0, s0, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_endpgm +; +; GFX11-True16-LABEL: load_uniform_P1_i16_b16_gfx12: +; GFX11-True16: ; %bb.0: +; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-True16-NEXT: s_clause 0x1 +; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1] +; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[2:3] glc dlc +; GFX11-True16-NEXT: s_waitcnt vmcnt(0) +; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-True16-NEXT: s_add_i32 s0, s0, s1 +; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-True16-NEXT: s_endpgm +; +; GFX11-NoTrue16-LABEL: load_uniform_P1_i16_b16_gfx12: +; GFX11-NoTrue16: ; %bb.0: +; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NoTrue16-NEXT: s_clause 0x1 +; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1] +; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[2:3] glc dlc +; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0) +; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NoTrue16-NEXT: s_endpgm +; +; GFX12-True16-LABEL: load_uniform_P1_i16_b16_gfx12: +; GFX12-True16: ; %bb.0: +; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-True16-NEXT: s_clause 0x1 +; GFX12-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1] +; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[2:3] scope:SCOPE_SYS +; GFX12-True16-NEXT: s_wait_loadcnt 0x0 +; GFX12-True16-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-True16-NEXT: v_readfirstlane_b32 s1, v2 +; GFX12-True16-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-True16-NEXT: s_endpgm +; +; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_b16_gfx12: +; GFX12-NoTrue16: ; %bb.0: +; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 +;
GFX12-NoTrue16-NEXT: s_clause 0x1 +; GFX12-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1] +; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[2:3] scope:SCOPE_SYS +; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0 +; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 +; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NoTrue16-NEXT: s_endpgm + %a = load i16, ptr addrspace(1) %ptra, align 1 + %b = load volatile i16, ptr addrspace(1) %ptrb + %sum = add i16 %a, %b + store i16 %sum, ptr addrspace(1) %out + ret void +} + +; gfx11 true16, 16-bit load, not align 4 or not uniform mmo +define amdgpu_ps void @load_uniform_P1_i16_b16_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { +; GFX7-LABEL: load_uniform_P1_i16_b16_gfx11: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 +; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v2 +; GFX7-NEXT: v_readfirstlane_b32 s1, v3 +; GFX7-NEXT: s_add_i32 s0, s0, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_endpgm +; +; GFX11-True16-LABEL: load_uniform_P1_i16_b16_gfx11: +; GFX11-True16: ; %bb.0: +; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-True16-NEXT: s_clause 0x1 +; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1] +; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] glc dlc +; GFX11-True16-NEXT: s_waitcnt vmcnt(0) +; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-True16-NEXT: s_add_i32 s0, s0, s1 +; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-True16-NEXT: s_endpgm +; +; GFX11-NoTrue16-LABEL: load_uniform_P1_i16_b16_gfx11: +; GFX11-NoTrue16: ; %bb.0: +; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NoTrue16-NEXT: s_clause 0x1 +; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1] +; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] glc dlc +; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0) +; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NoTrue16-NEXT: s_endpgm +; +; GFX12-True16-LABEL: load_uniform_P1_i16_b16_gfx11: +; GFX12-True16: ; %bb.0: +; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-True16-NEXT: s_load_u16 s2, s[0:1], 0x0 +; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] scope:SCOPE_SYS +; GFX12-True16-NEXT: s_wait_loadcnt 0x0 +; GFX12-True16-NEXT: v_readfirstlane_b32 s0, v2 +; GFX12-True16-NEXT: s_wait_kmcnt 0x0 +; GFX12-True16-NEXT: s_add_co_i32 s0, s2, s0 +; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-True16-NEXT: s_endpgm +; +; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_b16_gfx11: +; GFX12-NoTrue16: ; %bb.0: +; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 
0 +; GFX12-NoTrue16-NEXT: s_load_u16 s2, s[0:1], 0x0 +; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] scope:SCOPE_SYS +; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0 +; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s0, v2 +; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0 +; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s2, s0 +; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NoTrue16-NEXT: s_endpgm + %a = load i16, ptr addrspace(1) %ptra + %b = load volatile i16, ptr addrspace(1) %ptra, align 4 + %sum = add i16 %a, %b + store i16 %sum, ptr addrspace(1) %out + ret void +} + +; gfx12 without true16, 16-bit any-extending load, not natural alignment or not uniform mmo +define amdgpu_ps void @load_uniform_P1_i16_anyextending_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { +; GFX7-LABEL: load_uniform_P1_i16_anyextending_gfx12: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 +; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v2 +; GFX7-NEXT: v_readfirstlane_b32 s1, v3 +; GFX7-NEXT: s_add_i32 s0, s0, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_endpgm +; +; GFX11-True16-LABEL: load_uniform_P1_i16_anyextending_gfx12: +; GFX11-True16: ; %bb.0: +; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-True16-NEXT: s_clause 0x1 +; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1] +; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] glc dlc +; GFX11-True16-NEXT: s_waitcnt vmcnt(0) +; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-True16-NEXT: s_add_i32 s0, s0, s1 +; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-True16-NEXT: s_endpgm +; +; GFX11-NoTrue16-LABEL: load_uniform_P1_i16_anyextending_gfx12: +; GFX11-NoTrue16: ; %bb.0: +; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NoTrue16-NEXT: s_clause 0x1 +; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1] +; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] glc dlc +; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0) +; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NoTrue16-NEXT: s_endpgm +; +; GFX12-True16-LABEL: load_uniform_P1_i16_anyextending_gfx12: +; GFX12-True16: ; %bb.0: +; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-True16-NEXT: s_clause 0x1 +; GFX12-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1] +; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] scope:SCOPE_SYS +; GFX12-True16-NEXT: s_wait_loadcnt 0x0 +; GFX12-True16-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-True16-NEXT: v_readfirstlane_b32 s1, v2 +; GFX12-True16-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-True16-NEXT: s_endpgm +; +; GFX12-NoTrue16-LABEL: 
load_uniform_P1_i16_anyextending_gfx12: +; GFX12-NoTrue16: ; %bb.0: +; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NoTrue16-NEXT: s_clause 0x1 +; GFX12-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1] +; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] scope:SCOPE_SYS +; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0 +; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 +; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NoTrue16-NEXT: s_endpgm + %a = load i16, ptr addrspace(1) %ptra, align 1 + %b = load volatile i16, ptr addrspace(1) %ptra + %sum = add i16 %a, %b + store i16 %sum, ptr addrspace(1) %out + ret void +} + +; gfx11 (or older) without true16, S16 any-extending load, not align 4 or not uniform mmo +define amdgpu_ps void @load_uniform_P1_i16_anyextending_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { +; GFX7-LABEL: load_uniform_P1_i16_anyextending_gfx11: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 +; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v2 +; GFX7-NEXT: v_readfirstlane_b32 s1, v3 +; GFX7-NEXT: s_add_i32 s0, s0, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_endpgm +; +; GFX11-True16-LABEL: load_uniform_P1_i16_anyextending_gfx11: +; GFX11-True16: ; %bb.0: +; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-True16-NEXT: s_clause 0x1 +; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1] +; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] glc dlc +; GFX11-True16-NEXT: s_waitcnt vmcnt(0) +; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-True16-NEXT: s_add_i32 s0, s0, s1 +; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-True16-NEXT: s_endpgm +; +; GFX11-NoTrue16-LABEL: load_uniform_P1_i16_anyextending_gfx11: +; GFX11-NoTrue16: ; %bb.0: +; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NoTrue16-NEXT: s_clause 0x1 +; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1] +; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] glc dlc +; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0) +; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NoTrue16-NEXT: s_endpgm +; +; GFX12-True16-LABEL: load_uniform_P1_i16_anyextending_gfx11: +; GFX12-True16: ; %bb.0: +; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-True16-NEXT: s_load_u16 s2, s[0:1], 0x0 +; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] scope:SCOPE_SYS +; GFX12-True16-NEXT: s_wait_loadcnt 0x0 +; GFX12-True16-NEXT: v_readfirstlane_b32 s0, v2 +; GFX12-True16-NEXT: s_wait_kmcnt 0x0 +; GFX12-True16-NEXT: s_add_co_i32 s0, s2, s0 +; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX12-True16-NEXT:
global_store_b16 v[0:1], v2, off +; GFX12-True16-NEXT: s_endpgm +; +; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_anyextending_gfx11: +; GFX12-NoTrue16: ; %bb.0: +; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NoTrue16-NEXT: s_load_u16 s2, s[0:1], 0x0 +; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] scope:SCOPE_SYS +; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0 +; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s0, v2 +; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0 +; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s2, s0 +; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NoTrue16-NEXT: s_endpgm + %a = load i16, ptr addrspace(1) %ptra + %b = load volatile i16, ptr addrspace(1) %ptra, align 4 + %sum = add i16 %a, %b + store i16 %sum, ptr addrspace(1) %out + ret void +} + +; any target, 32-bit load, not align 4 or not uniform mmo +define amdgpu_ps void @load_uniform_P1_i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { +; GFX7-LABEL: load_uniform_P1_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], 0 +; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v2 +; GFX7-NEXT: v_readfirstlane_b32 s1, v3 +; GFX7-NEXT: s_add_i32 s0, s0, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_endpgm +; +; GFX11-LABEL: load_uniform_P1_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v3, v2, s[0:1] +; GFX11-NEXT: global_load_b32 v2, v2, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P1_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_load_b32 v3, v2, s[0:1] +; GFX12-NEXT: global_load_b32 v2, v2, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-NEXT: v_readfirstlane_b32 s1, v2 +; GFX12-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i32, ptr addrspace(1) %ptra, align 2 + %b = load volatile i32, ptr addrspace(1) %ptra + %sum = add i32 %a, %b + store i32 %sum, ptr addrspace(1) %out + ret void +} + +; any target, 64-bit load, not align 4 or not uniform mmo +define amdgpu_ps void @load_uniform_P1_v2i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { +; GFX7-LABEL: load_uniform_P1_v2i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: v_readfirstlane_b32 s1, v3 +; GFX7-NEXT: v_readfirstlane_b32 s5, v5 +; GFX7-NEXT: v_readfirstlane_b32 s0, v2 +; GFX7-NEXT: v_readfirstlane_b32 s4, v4 +;
GFX7-NEXT: s_add_i32 s1, s1, s5 +; GFX7-NEXT: s_add_i32 s0, s0, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_endpgm +; +; GFX11-LABEL: load_uniform_P1_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] +; GFX11-NEXT: global_load_b64 v[4:5], v4, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-NEXT: v_readfirstlane_b32 s3, v5 +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: v_readfirstlane_b32 s2, v4 +; GFX11-NEXT: s_add_i32 s1, s1, s3 +; GFX11-NEXT: s_add_i32 s0, s0, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P1_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_load_b64 v[2:3], v4, s[0:1] +; GFX12-NEXT: global_load_b64 v[4:5], v4, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12-NEXT: v_readfirstlane_b32 s3, v5 +; GFX12-NEXT: v_readfirstlane_b32 s0, v2 +; GFX12-NEXT: v_readfirstlane_b32 s2, v4 +; GFX12-NEXT: s_add_co_i32 s1, s1, s3 +; GFX12-NEXT: s_add_co_i32 s0, s0, s2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-NEXT: s_endpgm + %a = load <2 x i32>, ptr addrspace(1) %ptra, align 2 + %b = load volatile <2 x i32>, ptr addrspace(1) %ptra + %sum = add <2 x i32> %a, %b + store <2 x i32> %sum, ptr addrspace(1) %out + ret void +} + +; any target, 96-bit load, not align 4 or not uniform mmo +define amdgpu_ps void @load_uniform_P1_v3i32_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { +; GFX7-LABEL: load_uniform_P1_v3i32_gfx12: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_dwordx3 v[2:4], off, s[0:3], 0 +; GFX7-NEXT: buffer_load_dwordx3 v[5:7], off, s[0:3], 0 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v2 +; GFX7-NEXT: v_readfirstlane_b32 s4, v5 +; GFX7-NEXT: v_readfirstlane_b32 s1, v3 +; GFX7-NEXT: v_readfirstlane_b32 s6, v4 +; GFX7-NEXT: v_readfirstlane_b32 s5, v6 +; GFX7-NEXT: v_readfirstlane_b32 s7, v7 +; GFX7-NEXT: s_add_i32 s4, s0, s4 +; GFX7-NEXT: s_add_i32 s5, s1, s5 +; GFX7-NEXT: s_add_i32 s6, s6, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_store_dwordx3 v[2:4], v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_endpgm +; +; GFX11-LABEL: load_uniform_P1_v3i32_gfx12: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b96 v[2:4], v5, s[0:1] +; GFX11-NEXT: global_load_b96 v[5:7], v5, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s2, v4 +; GFX11-NEXT: v_readfirstlane_b32 s5, v7 +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-NEXT: v_readfirstlane_b32 s3, v5 +; GFX11-NEXT: v_readfirstlane_b32 s4, v6 +; GFX11-NEXT: s_add_i32 s2, s2, s5 +; GFX11-NEXT: s_add_i32 s0, s0, s3 +;
GFX11-NEXT: s_add_i32 s1, s1, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P1_v3i32_gfx12: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v5, 0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_load_b96 v[2:4], v5, s[0:1] +; GFX12-NEXT: global_load_b96 v[5:7], v5, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s2, v4 +; GFX12-NEXT: v_readfirstlane_b32 s5, v7 +; GFX12-NEXT: v_readfirstlane_b32 s0, v2 +; GFX12-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12-NEXT: v_readfirstlane_b32 s3, v5 +; GFX12-NEXT: v_readfirstlane_b32 s4, v6 +; GFX12-NEXT: s_add_co_i32 s2, s2, s5 +; GFX12-NEXT: s_add_co_i32 s0, s0, s3 +; GFX12-NEXT: s_add_co_i32 s1, s1, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX12-NEXT: s_endpgm + %a = load <3 x i32>, ptr addrspace(1) %ptra, align 2 + %b = load volatile <3 x i32>, ptr addrspace(1) %ptra + %sum = add <3 x i32> %a, %b + store <3 x i32> %sum, ptr addrspace(1) %out + ret void +} + +; any target, 128-bit load load, not align 4 or not uniform mmo +define amdgpu_ps void @load_uniform_P1_v4i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { +; GFX7-LABEL: load_uniform_P1_v4i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v2 +; GFX7-NEXT: v_readfirstlane_b32 s4, v6 +; GFX7-NEXT: v_readfirstlane_b32 s1, v3 +; GFX7-NEXT: v_readfirstlane_b32 s6, v4 +; GFX7-NEXT: v_readfirstlane_b32 s7, v5 +; GFX7-NEXT: v_readfirstlane_b32 s5, v7 +; GFX7-NEXT: v_readfirstlane_b32 s8, v8 +; GFX7-NEXT: v_readfirstlane_b32 s9, v9 +; GFX7-NEXT: s_add_i32 s4, s0, s4 +; GFX7-NEXT: s_add_i32 s5, s1, s5 +; GFX7-NEXT: s_add_i32 s6, s6, s8 +; GFX7-NEXT: s_add_i32 s7, s7, s9 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_endpgm +; +; GFX11-LABEL: load_uniform_P1_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[2:5], v6, s[0:1] +; GFX11-NEXT: global_load_b128 v[6:9], v6, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s3, v5 +; GFX11-NEXT: v_readfirstlane_b32 s7, v9 +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-NEXT: v_readfirstlane_b32 s2, v4 +; GFX11-NEXT: v_readfirstlane_b32 s4, v6 +; GFX11-NEXT: v_readfirstlane_b32 s5, v7 +; GFX11-NEXT: v_readfirstlane_b32 s6, v8 +; GFX11-NEXT: s_add_i32 s3, s3, s7 +; GFX11-NEXT: s_add_i32 s0, s0, s4 +; GFX11-NEXT: s_add_i32 s1, s1, s5 +; GFX11-NEXT: s_add_i32 s2, s2, s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-NEXT: 
s_endpgm +; +; GFX12-LABEL: load_uniform_P1_v4i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v6, 0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_load_b128 v[2:5], v6, s[0:1] +; GFX12-NEXT: global_load_b128 v[6:9], v6, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s3, v5 +; GFX12-NEXT: v_readfirstlane_b32 s7, v9 +; GFX12-NEXT: v_readfirstlane_b32 s0, v2 +; GFX12-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12-NEXT: v_readfirstlane_b32 s2, v4 +; GFX12-NEXT: v_readfirstlane_b32 s4, v6 +; GFX12-NEXT: v_readfirstlane_b32 s5, v7 +; GFX12-NEXT: v_readfirstlane_b32 s6, v8 +; GFX12-NEXT: s_add_co_i32 s3, s3, s7 +; GFX12-NEXT: s_add_co_i32 s0, s0, s4 +; GFX12-NEXT: s_add_co_i32 s1, s1, s5 +; GFX12-NEXT: s_add_co_i32 s2, s2, s6 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-NEXT: s_endpgm + %a = load <4 x i32>, ptr addrspace(1) %ptra, align 2 + %b = load volatile <4 x i32>, ptr addrspace(1) %ptra + %sum = add <4 x i32> %a, %b + store <4 x i32> %sum, ptr addrspace(1) %out + ret void +} + +; any target, 256bit load load, not align 4 or not uniform mmo +define amdgpu_ps void @load_uniform_P1_v8i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { +; GFX7-LABEL: load_uniform_P1_v8i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16 +; GFX7-NEXT: buffer_load_dwordx4 v[10:13], off, s[0:3], 0 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_load_dwordx4 v[14:17], off, s[0:3], 0 offset:16 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_readfirstlane_b32 s4, v2 +; GFX7-NEXT: v_readfirstlane_b32 s5, v3 +; GFX7-NEXT: v_readfirstlane_b32 s12, v10 +; GFX7-NEXT: v_readfirstlane_b32 s6, v4 +; GFX7-NEXT: v_readfirstlane_b32 s7, v5 +; GFX7-NEXT: v_readfirstlane_b32 s8, v6 +; GFX7-NEXT: v_readfirstlane_b32 s13, v11 +; GFX7-NEXT: v_readfirstlane_b32 s14, v12 +; GFX7-NEXT: v_readfirstlane_b32 s15, v13 +; GFX7-NEXT: v_readfirstlane_b32 s16, v14 +; GFX7-NEXT: s_add_i32 s4, s4, s12 +; GFX7-NEXT: v_readfirstlane_b32 s9, v7 +; GFX7-NEXT: v_readfirstlane_b32 s10, v8 +; GFX7-NEXT: v_readfirstlane_b32 s11, v9 +; GFX7-NEXT: v_readfirstlane_b32 s17, v15 +; GFX7-NEXT: v_readfirstlane_b32 s18, v16 +; GFX7-NEXT: v_readfirstlane_b32 s19, v17 +; GFX7-NEXT: s_add_i32 s5, s5, s13 +; GFX7-NEXT: s_add_i32 s6, s6, s14 +; GFX7-NEXT: s_add_i32 s7, s7, s15 +; GFX7-NEXT: s_add_i32 s8, s8, s16 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_add_i32 s9, s9, s17 +; GFX7-NEXT: s_add_i32 s10, s10, s18 +; GFX7-NEXT: s_add_i32 s11, s11, s19 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s8 +; GFX7-NEXT: v_mov_b32_e32 v7, s9 +; GFX7-NEXT: v_mov_b32_e32 v8, s10 +; GFX7-NEXT: v_mov_b32_e32 v9, s11 +; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[0:3], 0 addr64 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX11-LABEL: load_uniform_P1_v8i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v14, 0 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_load_b128 v[2:5], v14, s[0:1] +; GFX11-NEXT: 
global_load_b128 v[6:9], v14, s[0:1] offset:16 +; GFX11-NEXT: global_load_b128 v[10:13], v14, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:16 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s3, v5 +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: v_readfirstlane_b32 s11, v13 +; GFX11-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-NEXT: v_readfirstlane_b32 s2, v4 +; GFX11-NEXT: v_readfirstlane_b32 s7, v9 +; GFX11-NEXT: v_readfirstlane_b32 s8, v10 +; GFX11-NEXT: v_readfirstlane_b32 s9, v11 +; GFX11-NEXT: v_readfirstlane_b32 s10, v12 +; GFX11-NEXT: v_readfirstlane_b32 s15, v17 +; GFX11-NEXT: v_readfirstlane_b32 s4, v6 +; GFX11-NEXT: v_readfirstlane_b32 s5, v7 +; GFX11-NEXT: v_readfirstlane_b32 s6, v8 +; GFX11-NEXT: v_readfirstlane_b32 s12, v14 +; GFX11-NEXT: v_readfirstlane_b32 s13, v15 +; GFX11-NEXT: v_readfirstlane_b32 s14, v16 +; GFX11-NEXT: s_add_i32 s3, s3, s11 +; GFX11-NEXT: s_add_i32 s0, s0, s8 +; GFX11-NEXT: s_add_i32 s1, s1, s9 +; GFX11-NEXT: s_add_i32 s2, s2, s10 +; GFX11-NEXT: s_add_i32 s7, s7, s15 +; GFX11-NEXT: s_add_i32 s4, s4, s12 +; GFX11-NEXT: s_add_i32 s5, s5, s13 +; GFX11-NEXT: s_add_i32 s6, s6, s14 +; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 +; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P1_v8i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v14, 0 +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: global_load_b128 v[2:5], v14, s[0:1] +; GFX12-NEXT: global_load_b128 v[6:9], v14, s[0:1] offset:16 +; GFX12-NEXT: global_load_b128 v[10:13], v14, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:16 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s3, v5 +; GFX12-NEXT: v_readfirstlane_b32 s0, v2 +; GFX12-NEXT: v_readfirstlane_b32 s11, v13 +; GFX12-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12-NEXT: v_readfirstlane_b32 s2, v4 +; GFX12-NEXT: v_readfirstlane_b32 s7, v9 +; GFX12-NEXT: v_readfirstlane_b32 s8, v10 +; GFX12-NEXT: v_readfirstlane_b32 s9, v11 +; GFX12-NEXT: v_readfirstlane_b32 s10, v12 +; GFX12-NEXT: v_readfirstlane_b32 s15, v17 +; GFX12-NEXT: v_readfirstlane_b32 s4, v6 +; GFX12-NEXT: v_readfirstlane_b32 s5, v7 +; GFX12-NEXT: v_readfirstlane_b32 s6, v8 +; GFX12-NEXT: v_readfirstlane_b32 s12, v14 +; GFX12-NEXT: v_readfirstlane_b32 s13, v15 +; GFX12-NEXT: v_readfirstlane_b32 s14, v16 +; GFX12-NEXT: s_add_co_i32 s3, s3, s11 +; GFX12-NEXT: s_add_co_i32 s0, s0, s8 +; GFX12-NEXT: s_add_co_i32 s1, s1, s9 +; GFX12-NEXT: s_add_co_i32 s2, s2, s10 +; GFX12-NEXT: s_add_co_i32 s7, s7, s15 +; GFX12-NEXT: s_add_co_i32 s4, s4, s12 +; GFX12-NEXT: s_add_co_i32 s5, s5, s13 +; GFX12-NEXT: s_add_co_i32 s6, s6, s14 +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 +; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX12-NEXT: s_endpgm + %a = load <8 x i32>, ptr 
addrspace(1) %ptra, align 2 + %b = load volatile <8 x i32>, ptr addrspace(1) %ptra + %sum = add <8 x i32> %a, %b + store <8 x i32> %sum, ptr addrspace(1) %out + ret void +} + +; any target, 512bit load load, not align 4 or not uniform mmo +define amdgpu_ps void @load_uniform_P1_v16i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { +; GFX7-LABEL: load_uniform_P1_v16i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16 +; GFX7-NEXT: buffer_load_dwordx4 v[10:13], off, s[0:3], 0 offset:32 +; GFX7-NEXT: buffer_load_dwordx4 v[14:17], off, s[0:3], 0 offset:48 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: v_readfirstlane_b32 s4, v2 +; GFX7-NEXT: v_readfirstlane_b32 s5, v3 +; GFX7-NEXT: v_readfirstlane_b32 s6, v4 +; GFX7-NEXT: v_readfirstlane_b32 s7, v5 +; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s8, v6 +; GFX7-NEXT: v_readfirstlane_b32 s9, v7 +; GFX7-NEXT: v_readfirstlane_b32 s10, v8 +; GFX7-NEXT: v_readfirstlane_b32 s11, v9 +; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s12, v10 +; GFX7-NEXT: v_readfirstlane_b32 s13, v11 +; GFX7-NEXT: v_readfirstlane_b32 s14, v12 +; GFX7-NEXT: v_readfirstlane_b32 s15, v13 +; GFX7-NEXT: buffer_load_dwordx4 v[10:13], off, s[0:3], 0 offset:32 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s16, v14 +; GFX7-NEXT: v_readfirstlane_b32 s17, v15 +; GFX7-NEXT: v_readfirstlane_b32 s18, v16 +; GFX7-NEXT: v_readfirstlane_b32 s19, v17 +; GFX7-NEXT: buffer_load_dwordx4 v[14:17], off, s[0:3], 0 offset:48 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_readfirstlane_b32 s20, v2 +; GFX7-NEXT: v_readfirstlane_b32 s21, v3 +; GFX7-NEXT: v_readfirstlane_b32 s22, v4 +; GFX7-NEXT: v_readfirstlane_b32 s23, v5 +; GFX7-NEXT: s_add_i32 s4, s4, s20 +; GFX7-NEXT: v_readfirstlane_b32 s24, v6 +; GFX7-NEXT: v_readfirstlane_b32 s25, v7 +; GFX7-NEXT: v_readfirstlane_b32 s26, v8 +; GFX7-NEXT: v_readfirstlane_b32 s27, v9 +; GFX7-NEXT: s_add_i32 s5, s5, s21 +; GFX7-NEXT: v_readfirstlane_b32 s28, v10 +; GFX7-NEXT: v_readfirstlane_b32 s29, v11 +; GFX7-NEXT: v_readfirstlane_b32 s30, v12 +; GFX7-NEXT: v_readfirstlane_b32 s31, v13 +; GFX7-NEXT: s_add_i32 s6, s6, s22 +; GFX7-NEXT: v_readfirstlane_b32 s33, v14 +; GFX7-NEXT: v_readfirstlane_b32 s34, v15 +; GFX7-NEXT: v_readfirstlane_b32 s35, v16 +; GFX7-NEXT: v_readfirstlane_b32 s36, v17 +; GFX7-NEXT: s_add_i32 s7, s7, s23 +; GFX7-NEXT: s_add_i32 s8, s8, s24 +; GFX7-NEXT: s_add_i32 s12, s12, s28 +; GFX7-NEXT: s_add_i32 s16, s16, s33 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_add_i32 s9, s9, s25 +; GFX7-NEXT: s_add_i32 s10, s10, s26 +; GFX7-NEXT: s_add_i32 s11, s11, s27 +; GFX7-NEXT: s_add_i32 s13, s13, s29 +; GFX7-NEXT: s_add_i32 s14, s14, s30 +; GFX7-NEXT: s_add_i32 s15, s15, s31 +; GFX7-NEXT: s_add_i32 s17, s17, s34 +; GFX7-NEXT: s_add_i32 s18, s18, s35 +; GFX7-NEXT: s_add_i32 s19, s19, s36 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s8 +; GFX7-NEXT: v_mov_b32_e32 v10, s12 +; GFX7-NEXT: v_mov_b32_e32 v14, s16 +; GFX7-NEXT: v_mov_b32_e32 v7, s9 +; GFX7-NEXT: v_mov_b32_e32 v8, s10 +; GFX7-NEXT: 
v_mov_b32_e32 v9, s11 +; GFX7-NEXT: v_mov_b32_e32 v11, s13 +; GFX7-NEXT: v_mov_b32_e32 v12, s14 +; GFX7-NEXT: v_mov_b32_e32 v13, s15 +; GFX7-NEXT: v_mov_b32_e32 v15, s17 +; GFX7-NEXT: v_mov_b32_e32 v16, s18 +; GFX7-NEXT: v_mov_b32_e32 v17, s19 +; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[0:3], 0 addr64 offset:16 +; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[0:3], 0 addr64 offset:32 +; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[0:3], 0 addr64 offset:48 +; GFX7-NEXT: s_endpgm +; +; GFX11-LABEL: load_uniform_P1_v16i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v30, 0 +; GFX11-NEXT: s_clause 0x4 +; GFX11-NEXT: global_load_b128 v[2:5], v30, s[0:1] +; GFX11-NEXT: global_load_b128 v[6:9], v30, s[0:1] offset:16 +; GFX11-NEXT: global_load_b128 v[10:13], v30, s[0:1] offset:32 +; GFX11-NEXT: global_load_b128 v[14:17], v30, s[0:1] offset:48 +; GFX11-NEXT: global_load_b128 v[18:21], v30, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b128 v[22:25], v30, s[0:1] offset:16 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b128 v[26:29], v30, s[0:1] offset:32 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b128 v[30:33], v30, s[0:1] offset:48 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s3, v5 +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-NEXT: v_readfirstlane_b32 s2, v4 +; GFX11-NEXT: v_readfirstlane_b32 s19, v21 +; GFX11-NEXT: v_readfirstlane_b32 s7, v9 +; GFX11-NEXT: v_readfirstlane_b32 s16, v18 +; GFX11-NEXT: v_readfirstlane_b32 s17, v19 +; GFX11-NEXT: v_readfirstlane_b32 s18, v20 +; GFX11-NEXT: v_readfirstlane_b32 s23, v25 +; GFX11-NEXT: v_readfirstlane_b32 s4, v6 +; GFX11-NEXT: v_readfirstlane_b32 s5, v7 +; GFX11-NEXT: v_readfirstlane_b32 s6, v8 +; GFX11-NEXT: v_readfirstlane_b32 s11, v13 +; GFX11-NEXT: v_readfirstlane_b32 s20, v22 +; GFX11-NEXT: v_readfirstlane_b32 s21, v23 +; GFX11-NEXT: v_readfirstlane_b32 s22, v24 +; GFX11-NEXT: v_readfirstlane_b32 s27, v29 +; GFX11-NEXT: v_readfirstlane_b32 s8, v10 +; GFX11-NEXT: v_readfirstlane_b32 s9, v11 +; GFX11-NEXT: v_readfirstlane_b32 s10, v12 +; GFX11-NEXT: v_readfirstlane_b32 s15, v17 +; GFX11-NEXT: v_readfirstlane_b32 s24, v26 +; GFX11-NEXT: v_readfirstlane_b32 s25, v27 +; GFX11-NEXT: v_readfirstlane_b32 s26, v28 +; GFX11-NEXT: v_readfirstlane_b32 s31, v33 +; GFX11-NEXT: v_readfirstlane_b32 s12, v14 +; GFX11-NEXT: v_readfirstlane_b32 s13, v15 +; GFX11-NEXT: v_readfirstlane_b32 s14, v16 +; GFX11-NEXT: v_readfirstlane_b32 s28, v30 +; GFX11-NEXT: v_readfirstlane_b32 s29, v31 +; GFX11-NEXT: v_readfirstlane_b32 s30, v32 +; GFX11-NEXT: s_add_i32 s3, s3, s19 +; GFX11-NEXT: s_add_i32 s0, s0, s16 +; GFX11-NEXT: s_add_i32 s1, s1, s17 +; GFX11-NEXT: s_add_i32 s2, s2, s18 +; GFX11-NEXT: s_add_i32 s7, s7, s23 +; GFX11-NEXT: s_add_i32 s4, s4, s20 +; GFX11-NEXT: s_add_i32 s5, s5, s21 +; GFX11-NEXT: s_add_i32 s6, s6, s22 +; GFX11-NEXT: s_add_i32 s11, s11, s27 +; GFX11-NEXT: v_mov_b32_e32 v5, s3 +; GFX11-NEXT: s_add_i32 s8, s8, s24 +; GFX11-NEXT: s_add_i32 s9, s9, s25 +; GFX11-NEXT: s_add_i32 s10, s10, s26 +; GFX11-NEXT: s_add_i32 s15, s15, s31 +; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7 +; GFX11-NEXT: s_add_i32 s12, s12, s28 +; GFX11-NEXT: s_add_i32 s13, s13, s29 +; GFX11-NEXT: s_add_i32 s14, s14, s30 +; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: 
v_dual_mov_b32 v7, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v17, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v15, s13 +; GFX11-NEXT: v_mov_b32_e32 v14, s12 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P1_v16i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v30, 0 +; GFX12-NEXT: s_clause 0x4 +; GFX12-NEXT: global_load_b128 v[2:5], v30, s[0:1] +; GFX12-NEXT: global_load_b128 v[6:9], v30, s[0:1] offset:16 +; GFX12-NEXT: global_load_b128 v[10:13], v30, s[0:1] offset:32 +; GFX12-NEXT: global_load_b128 v[14:17], v30, s[0:1] offset:48 +; GFX12-NEXT: global_load_b128 v[18:21], v30, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b128 v[22:25], v30, s[0:1] offset:16 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b128 v[26:29], v30, s[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_b128 v[30:33], v30, s[0:1] offset:48 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s3, v5 +; GFX12-NEXT: v_readfirstlane_b32 s0, v2 +; GFX12-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12-NEXT: v_readfirstlane_b32 s2, v4 +; GFX12-NEXT: v_readfirstlane_b32 s19, v21 +; GFX12-NEXT: v_readfirstlane_b32 s7, v9 +; GFX12-NEXT: v_readfirstlane_b32 s16, v18 +; GFX12-NEXT: v_readfirstlane_b32 s17, v19 +; GFX12-NEXT: v_readfirstlane_b32 s18, v20 +; GFX12-NEXT: v_readfirstlane_b32 s23, v25 +; GFX12-NEXT: v_readfirstlane_b32 s4, v6 +; GFX12-NEXT: v_readfirstlane_b32 s5, v7 +; GFX12-NEXT: v_readfirstlane_b32 s6, v8 +; GFX12-NEXT: v_readfirstlane_b32 s11, v13 +; GFX12-NEXT: v_readfirstlane_b32 s20, v22 +; GFX12-NEXT: v_readfirstlane_b32 s21, v23 +; GFX12-NEXT: v_readfirstlane_b32 s22, v24 +; GFX12-NEXT: v_readfirstlane_b32 s27, v29 +; GFX12-NEXT: v_readfirstlane_b32 s8, v10 +; GFX12-NEXT: v_readfirstlane_b32 s9, v11 +; GFX12-NEXT: v_readfirstlane_b32 s10, v12 +; GFX12-NEXT: v_readfirstlane_b32 s15, v17 +; GFX12-NEXT: v_readfirstlane_b32 s24, v26 +; GFX12-NEXT: v_readfirstlane_b32 s25, v27 +; GFX12-NEXT: v_readfirstlane_b32 s26, v28 +; GFX12-NEXT: v_readfirstlane_b32 s31, v33 +; GFX12-NEXT: v_readfirstlane_b32 s12, v14 +; GFX12-NEXT: v_readfirstlane_b32 s13, v15 +; GFX12-NEXT: v_readfirstlane_b32 s14, v16 +; GFX12-NEXT: v_readfirstlane_b32 s28, v30 +; GFX12-NEXT: v_readfirstlane_b32 s29, v31 +; GFX12-NEXT: v_readfirstlane_b32 s30, v32 +; GFX12-NEXT: s_add_co_i32 s3, s3, s19 +; GFX12-NEXT: s_add_co_i32 s0, s0, s16 +; GFX12-NEXT: s_add_co_i32 s1, s1, s17 +; GFX12-NEXT: s_add_co_i32 s2, s2, s18 +; GFX12-NEXT: s_add_co_i32 s7, s7, s23 +; GFX12-NEXT: s_add_co_i32 s4, s4, s20 +; GFX12-NEXT: s_add_co_i32 s5, s5, s21 +; GFX12-NEXT: s_add_co_i32 s6, s6, s22 +; GFX12-NEXT: s_add_co_i32 s11, s11, s27 +; GFX12-NEXT: v_mov_b32_e32 v5, s3 +; GFX12-NEXT: s_add_co_i32 s8, s8, s24 +; GFX12-NEXT: s_add_co_i32 s9, s9, s25 +; GFX12-NEXT: s_add_co_i32 s10, s10, s26 +; GFX12-NEXT: s_add_co_i32 s15, s15, s31 +; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7 +; GFX12-NEXT: s_add_co_i32 s12, s12, s28 +; 
GFX12-NEXT: s_add_co_i32 s13, s13, s29 +; GFX12-NEXT: s_add_co_i32 s14, s14, s30 +; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5 +; GFX12-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11 +; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9 +; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v17, s15 +; GFX12-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v15, s13 +; GFX12-NEXT: v_mov_b32_e32 v14, s12 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX12-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX12-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX12-NEXT: s_endpgm + %a = load <16 x i32>, ptr addrspace(1) %ptra, align 2 + %b = load volatile <16 x i32>, ptr addrspace(1) %ptra + %sum = add <16 x i32> %a, %b + store <16 x i32> %sum, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @load_divergent_P3_i8_any_extending(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) { +; GFX7-LABEL: load_divergent_P3_i8_any_extending: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_u8 v1, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b8 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX11-LABEL: load_divergent_P3_i8_any_extending: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: ds_load_u8 v1, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ds_store_b8 v0, v1 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_divergent_P3_i8_any_extending: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NEXT: ds_load_u8 v1, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: ds_store_b8 v0, v1 +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(3) %ptra + store i8 %a, ptr addrspace(3) %out + ret void +} + +; with true16, S16 16-bit load +; without true16, S32 16-bit any-extending load +define amdgpu_ps void @load_divergent_P3_i16(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) { +; GFX7-LABEL: load_divergent_P3_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_u16 v1, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b16 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX11-True16-LABEL: load_divergent_P3_i16: +; GFX11-True16: ; %bb.0: +; GFX11-True16-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-True16-NEXT: ds_load_u16_d16 v1, v1 +; GFX11-True16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11-True16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-True16-NEXT: v_mov_b16_e32 v1.l, s0 +; GFX11-True16-NEXT: ds_store_b16 v0, v1 +; GFX11-True16-NEXT: s_endpgm +; +; GFX11-NoTrue16-LABEL: load_divergent_P3_i16: +; GFX11-NoTrue16: ; %bb.0: +; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NoTrue16-NEXT: ds_load_u16 v1, v1 +; GFX11-NoTrue16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NoTrue16-NEXT: ds_store_b16 v0, v1 +; GFX11-NoTrue16-NEXT: s_endpgm +; +; GFX12-True16-LABEL: load_divergent_P3_i16: +; GFX12-True16: ; %bb.0: +; GFX12-True16-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-True16-NEXT: ds_load_u16_d16 v1, v1 +; GFX12-True16-NEXT: s_wait_dscnt 0x0 +; GFX12-True16-NEXT: v_readfirstlane_b32 s0, v1 +; GFX12-True16-NEXT: s_wait_alu 0xf1ff +; GFX12-True16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-True16-NEXT: v_mov_b16_e32 v1.l, s0 +; GFX12-True16-NEXT: ds_store_b16 v0, v1 +; GFX12-True16-NEXT: s_endpgm +; +; GFX12-NoTrue16-LABEL: load_divergent_P3_i16: +; 
GFX12-NoTrue16: ; %bb.0: +; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NoTrue16-NEXT: ds_load_u16 v1, v1 +; GFX12-NoTrue16-NEXT: s_wait_dscnt 0x0 +; GFX12-NoTrue16-NEXT: ds_store_b16 v0, v1 +; GFX12-NoTrue16-NEXT: s_endpgm + %a = load i16, ptr addrspace(3) %ptra + store i16 %a, ptr addrspace(3) %out + ret void +} + +define amdgpu_ps void @load_divergent_P3_i32(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) { +; GFX7-LABEL: load_divergent_P3_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v1, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX11-LABEL: load_divergent_P3_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: ds_load_b32 v1, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ds_store_b32 v0, v1 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_divergent_P3_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NEXT: ds_load_b32 v1, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: ds_store_b32 v0, v1 +; GFX12-NEXT: s_endpgm + %a = load i32, ptr addrspace(3) %ptra + store i32 %a, ptr addrspace(3) %out + ret void +} + +define amdgpu_ps void @load_divergent_P3_v2i32(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) { +; GFX7-LABEL: load_divergent_P3_v2i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b64 v[1:2], v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b64 v0, v[1:2] +; GFX7-NEXT: s_endpgm +; +; GFX11-LABEL: load_divergent_P3_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: ds_load_b64 v[1:2], v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ds_store_b64 v0, v[1:2] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_divergent_P3_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NEXT: ds_load_b64 v[1:2], v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: ds_store_b64 v0, v[1:2] +; GFX12-NEXT: s_endpgm + %a = load <2 x i32>, ptr addrspace(3) %ptra + store <2 x i32> %a, ptr addrspace(3) %out + ret void +} + +define amdgpu_ps void @load_divergent_P3_v3i32(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) { +; GFX7-LABEL: load_divergent_P3_v3i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b96 v[1:3], v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b96 v0, v[1:3] +; GFX7-NEXT: s_endpgm +; +; GFX11-LABEL: load_divergent_P3_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: ds_load_b96 v[1:3], v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ds_store_b96 v0, v[1:3] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_divergent_P3_v3i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NEXT: ds_load_b96 v[1:3], v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: ds_store_b96 v0, v[1:3] +; GFX12-NEXT: s_endpgm + %a = load <3 x i32>, ptr addrspace(3) %ptra + store <3 x i32> %a, ptr addrspace(3) %out + ret void +} + +define amdgpu_ps void @load_divergent_P3_v4i32(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) { +; GFX7-LABEL: load_divergent_P3_v4i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b128 v[1:4], v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b128 v0, v[1:4] +; GFX7-NEXT: s_endpgm +; +; GFX11-LABEL: load_divergent_P3_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; 
GFX11-NEXT: ds_load_b128 v[1:4], v1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: ds_store_b128 v0, v[1:4]
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_divergent_P3_v4i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: ds_load_b128 v[1:4], v1
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: ds_store_b128 v0, v[1:4]
+; GFX12-NEXT: s_endpgm
+ %a = load <4 x i32>, ptr addrspace(3) %ptra
+ store <4 x i32> %a, ptr addrspace(3) %out
+ ret void
+}
+
+
+
+; constant address space, addrspace(4)
+; The not-uniform-MMO check on G_LOAD is for the case where the MMO somehow
+; ends up with an address space other than 4; there are no LLVM-IR tests for
+; it.
+; %b in these tests ends up as a uniform load in an sgpr
+
+; gfx12 true16, not natural alignment
+define amdgpu_ps void @load_uniform_P4_i16_b16_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX7-LABEL: load_uniform_P4_i16_b16_gfx12:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s4, s2
+; GFX7-NEXT: s_mov_b32 s5, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0
+; GFX7-NEXT: buffer_load_ushort v3, off, s[4:7], 0 glc
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s1, v3
+; GFX7-NEXT: s_add_i32 s0, s0, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-True16-LABEL: load_uniform_P4_i16_b16_gfx12:
+; GFX11-True16: ; %bb.0:
+; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-True16-NEXT: s_clause 0x1
+; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1]
+; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[2:3] glc dlc
+; GFX11-True16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-True16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-True16-NEXT: s_add_i32 s0, s0, s1
+; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-True16-NEXT: s_endpgm
+;
+; GFX11-NoTrue16-LABEL: load_uniform_P4_i16_b16_gfx12:
+; GFX11-NoTrue16: ; %bb.0:
+; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NoTrue16-NEXT: s_clause 0x1
+; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[2:3] glc dlc
+; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NoTrue16-NEXT: s_endpgm
+;
+; GFX12-True16-LABEL: load_uniform_P4_i16_b16_gfx12:
+; GFX12-True16: ; %bb.0:
+; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1]
+; GFX12-True16-NEXT: s_load_u16 s0, s[2:3], 0x0
+; GFX12-True16-NEXT: s_wait_loadcnt 0x0
+; GFX12-True16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-True16-NEXT: s_wait_kmcnt 0x0
+; GFX12-True16-NEXT: s_add_co_i32 s0, s1, s0
+; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX12-True16-NEXT:
global_store_b16 v[0:1], v2, off +; GFX12-True16-NEXT: s_endpgm +; +; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_b16_gfx12: +; GFX12-NoTrue16: ; %bb.0: +; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] +; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[2:3], 0x0 +; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0 +; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 +; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0 +; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s1, s0 +; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NoTrue16-NEXT: s_endpgm + %a = load i16, ptr addrspace(4) %ptra, align 1 + %b = load volatile i16, ptr addrspace(4) %ptrb + %sum = add i16 %a, %b + store i16 %sum, ptr addrspace(1) %out + ret void +} + +; gfx11 true16, 16-bit load, not align 4 +define amdgpu_ps void @load_uniform_P4_i16_b16_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { +; GFX7-LABEL: load_uniform_P4_i16_b16_gfx11: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s0, s1, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_endpgm +; +; GFX11-True16-LABEL: load_uniform_P4_i16_b16_gfx11: +; GFX11-True16: ; %bb.0: +; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] +; GFX11-True16-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-True16-NEXT: s_waitcnt vmcnt(0) +; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-True16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-True16-NEXT: s_add_i32 s0, s1, s0 +; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-True16-NEXT: s_endpgm +; +; GFX11-NoTrue16-LABEL: load_uniform_P4_i16_b16_gfx11: +; GFX11-NoTrue16: ; %bb.0: +; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] +; GFX11-NoTrue16-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0) +; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NoTrue16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NoTrue16-NEXT: s_add_i32 s0, s1, s0 +; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NoTrue16-NEXT: s_endpgm +; +; GFX12-True16-LABEL: load_uniform_P4_i16_b16_gfx11: +; GFX12-True16: ; %bb.0: +; GFX12-True16-NEXT: s_clause 0x1 +; GFX12-True16-NEXT: s_load_u16 s2, s[0:1], 0x0 +; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0 +; GFX12-True16-NEXT: s_wait_kmcnt 0x0 +; GFX12-True16-NEXT: s_add_co_i32 s0, s2, s0 +; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-True16-NEXT: s_endpgm +; +; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_b16_gfx11: +; GFX12-NoTrue16: ; %bb.0: +; GFX12-NoTrue16-NEXT: s_clause 0x1 +; GFX12-NoTrue16-NEXT: s_load_u16 s2, s[0:1], 0x0 +; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0 +; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s2, s0 +; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NoTrue16-NEXT: s_endpgm + %a = load i16, ptr addrspace(4) %ptra + %b = load volatile i16, ptr addrspace(4) %ptra, align 4 + %sum = add i16 %a, %b + store i16 %sum, ptr addrspace(1) %out + ret void +} + +; gfx12 without true16, 16-bit any-extending load, not natural alignment +define amdgpu_ps void @load_uniform_P4_i16_anyextending_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { +; GFX7-LABEL: load_uniform_P4_i16_anyextending_gfx12: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 +; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_readfirstlane_b32 s0, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s1, v3 +; GFX7-NEXT: s_add_i32 s0, s0, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_endpgm +; +; GFX11-True16-LABEL: load_uniform_P4_i16_anyextending_gfx12: +; GFX11-True16: ; %bb.0: +; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-True16-NEXT: s_clause 0x1 +; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1] +; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] glc dlc +; GFX11-True16-NEXT: s_waitcnt vmcnt(1) +; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-True16-NEXT: s_waitcnt vmcnt(0) +; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-True16-NEXT: s_add_i32 s0, s0, s1 +; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-True16-NEXT: s_endpgm +; +; GFX11-NoTrue16-LABEL: load_uniform_P4_i16_anyextending_gfx12: +; GFX11-NoTrue16: ; %bb.0: +; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NoTrue16-NEXT: s_clause 0x1 +; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1] +; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] glc dlc +; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(1) +; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0) +; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NoTrue16-NEXT: s_endpgm +; +; GFX12-True16-LABEL: load_uniform_P4_i16_anyextending_gfx12: +; GFX12-True16: ; %bb.0: +; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] +; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0 +; GFX12-True16-NEXT: s_wait_loadcnt 0x0 +; GFX12-True16-NEXT: v_readfirstlane_b32 s1, v2 +; GFX12-True16-NEXT: s_wait_kmcnt 0x0 +; GFX12-True16-NEXT: s_add_co_i32 s0, s1, s0 +; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-True16-NEXT: s_endpgm +; +; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_anyextending_gfx12: +; GFX12-NoTrue16: ; %bb.0: +; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] +; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0 +; GFX12-NoTrue16-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 +; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0 +; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s1, s0 +; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NoTrue16-NEXT: s_endpgm + %a = load i16, ptr addrspace(4) %ptra, align 1 + %b = load volatile i16, ptr addrspace(4) %ptra + %sum = add i16 %a, %b + store i16 %sum, ptr addrspace(1) %out + ret void +} + +; gfx11(or older) without true 16, s16 any-extending load, not align 4 +define amdgpu_ps void @load_uniform_P4_i16_anyextending_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { +; GFX7-LABEL: load_uniform_P4_i16_anyextending_gfx11: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s0, s1, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_endpgm +; +; GFX11-True16-LABEL: load_uniform_P4_i16_anyextending_gfx11: +; GFX11-True16: ; %bb.0: +; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] +; GFX11-True16-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-True16-NEXT: s_waitcnt vmcnt(0) +; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-True16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-True16-NEXT: s_add_i32 s0, s1, s0 +; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-True16-NEXT: s_endpgm +; +; GFX11-NoTrue16-LABEL: load_uniform_P4_i16_anyextending_gfx11: +; GFX11-NoTrue16: ; %bb.0: +; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] +; GFX11-NoTrue16-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0) +; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NoTrue16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NoTrue16-NEXT: s_add_i32 s0, s1, s0 +; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NoTrue16-NEXT: s_endpgm +; +; GFX12-True16-LABEL: load_uniform_P4_i16_anyextending_gfx11: +; GFX12-True16: ; %bb.0: +; GFX12-True16-NEXT: s_clause 0x1 +; GFX12-True16-NEXT: s_load_u16 s2, s[0:1], 0x0 +; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0 +; GFX12-True16-NEXT: s_wait_kmcnt 0x0 +; GFX12-True16-NEXT: s_add_co_i32 s0, s2, s0 +; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-True16-NEXT: s_endpgm +; +; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_anyextending_gfx11: +; GFX12-NoTrue16: ; %bb.0: +; GFX12-NoTrue16-NEXT: s_clause 0x1 +; GFX12-NoTrue16-NEXT: s_load_u16 s2, s[0:1], 0x0 +; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0 +; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0 +; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s2, s0 +; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NoTrue16-NEXT: s_endpgm + 
%a = load i16, ptr addrspace(4) %ptra
+ %b = load volatile i16, ptr addrspace(4) %ptra, align 4
+ %sum = add i16 %a, %b
+ store i16 %sum, ptr addrspace(1) %out
+ ret void
+}
+
+; any target, 32-bit load, not align 4
+define amdgpu_ps void @load_uniform_P4_i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX7-LABEL: load_uniform_P4_i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], 0
+; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s1, v2
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s0, s1, s0
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_uniform_P4_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_load_b32 v2, v2, s[0:1]
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_i32 s0, s1, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P4_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: global_load_b32 v2, v2, s[0:1]
+; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_co_i32 s0, s1, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %a = load i32, ptr addrspace(4) %ptra, align 2
+ %b = load volatile i32, ptr addrspace(4) %ptra
+ %sum = add i32 %a, %b
+ store i32 %sum, ptr addrspace(1) %out
+ ret void
+}
+
+; any target, 64-bit load, not align 4
+define amdgpu_ps void @load_uniform_P4_v2i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
+; GFX7-LABEL: load_uniform_P4_v2i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s5, v3
+; GFX7-NEXT: v_readfirstlane_b32 s4, v2
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s1, s5, s1
+; GFX7-NEXT: s_add_i32 s0, s4, s0
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_endpgm
+;
+; GFX11-LABEL: load_uniform_P4_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_load_b64 v[2:3], v2, s[0:1]
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s2, v2
+; GFX11-NEXT: v_readfirstlane_b32 s3, v3
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_i32 s0, s2, s0
+; GFX11-NEXT: s_add_i32 s1, s3, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: load_uniform_P4_v2i32:
+; GFX12: ;
%bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_load_b64 v[2:3], v2, s[0:1] +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s2, v2 +; GFX12-NEXT: v_readfirstlane_b32 s3, v3 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s2, s0 +; GFX12-NEXT: s_add_co_i32 s1, s3, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-NEXT: s_endpgm + %a = load <2 x i32>, ptr addrspace(4) %ptra, align 2 + %b = load volatile <2 x i32>, ptr addrspace(4) %ptra + %sum = add <2 x i32> %a, %b + store <2 x i32> %sum, ptr addrspace(1) %out + ret void +} + +; any target, 96bit load load, not align 4 +define amdgpu_ps void @load_uniform_P4_v3i32_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { +; GFX7-LABEL: load_uniform_P4_v3i32_gfx12: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_dwordx3 v[2:4], off, s[0:3], 0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v2 +; GFX7-NEXT: v_readfirstlane_b32 s1, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s7, v4 +; GFX7-NEXT: s_add_i32 s4, s0, s4 +; GFX7-NEXT: s_add_i32 s5, s1, s5 +; GFX7-NEXT: s_add_i32 s6, s7, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_store_dwordx3 v[2:4], v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_endpgm +; +; GFX11-LABEL: load_uniform_P4_v3i32_gfx12: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_load_b96 v[2:4], v2, s[0:1] +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s5, v4 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s3, v2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v3 +; GFX11-NEXT: s_add_i32 s2, s5, s2 +; GFX11-NEXT: s_add_i32 s0, s3, s0 +; GFX11-NEXT: s_add_i32 s1, s4, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P4_v3i32_gfx12: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_load_b96 v[2:4], v2, s[0:1] +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s5, v4 +; GFX12-NEXT: v_readfirstlane_b32 s3, v2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v3 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s2, s5, s2 +; GFX12-NEXT: s_add_co_i32 s0, s3, s0 +; GFX12-NEXT: s_add_co_i32 s1, s4, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX12-NEXT: s_endpgm + %a = load <3 x i32>, ptr addrspace(4) %ptra, align 2 + %b = load volatile <3 x i32>, ptr addrspace(4) %ptra + %sum = add <3 x i32> %a, %b + store <3 x i32> %sum, ptr addrspace(1) %out + ret void +} + +; any target, 128-bit load load, not align 4 +define amdgpu_ps void @load_uniform_P4_v4i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) 
%out) { +; GFX7-LABEL: load_uniform_P4_v4i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v2 +; GFX7-NEXT: v_readfirstlane_b32 s1, v3 +; GFX7-NEXT: v_readfirstlane_b32 s8, v4 +; GFX7-NEXT: v_readfirstlane_b32 s9, v5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s0, s4 +; GFX7-NEXT: s_add_i32 s5, s1, s5 +; GFX7-NEXT: s_add_i32 s6, s8, s6 +; GFX7-NEXT: s_add_i32 s7, s9, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_endpgm +; +; GFX11-LABEL: load_uniform_P4_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_load_b128 v[2:5], v2, s[0:1] +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s7, v5 +; GFX11-NEXT: v_readfirstlane_b32 s4, v2 +; GFX11-NEXT: v_readfirstlane_b32 s5, v3 +; GFX11-NEXT: v_readfirstlane_b32 s6, v4 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_i32 s3, s7, s3 +; GFX11-NEXT: s_add_i32 s0, s4, s0 +; GFX11-NEXT: s_add_i32 s1, s5, s1 +; GFX11-NEXT: s_add_i32 s2, s6, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P4_v4i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_load_b128 v[2:5], v2, s[0:1] +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s7, v5 +; GFX12-NEXT: v_readfirstlane_b32 s4, v2 +; GFX12-NEXT: v_readfirstlane_b32 s5, v3 +; GFX12-NEXT: v_readfirstlane_b32 s6, v4 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s3, s7, s3 +; GFX12-NEXT: s_add_co_i32 s0, s4, s0 +; GFX12-NEXT: s_add_co_i32 s1, s5, s1 +; GFX12-NEXT: s_add_co_i32 s2, s6, s2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-NEXT: s_endpgm + %a = load <4 x i32>, ptr addrspace(4) %ptra, align 2 + %b = load volatile <4 x i32>, ptr addrspace(4) %ptra + %sum = add <4 x i32> %a, %b + store <4 x i32> %sum, ptr addrspace(1) %out + ret void +} + +; any target, 256bit load load, not align 4 +define amdgpu_ps void @load_uniform_P4_v8i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { +; GFX7-LABEL: load_uniform_P4_v8i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16 +; GFX7-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x0 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_readfirstlane_b32 s12, v2 +; GFX7-NEXT: v_readfirstlane_b32 s13, v3 +; GFX7-NEXT: v_readfirstlane_b32 s14, v4 +; GFX7-NEXT: v_readfirstlane_b32 s15, v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: 
v_readfirstlane_b32 s16, v6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s12, s4 +; GFX7-NEXT: v_readfirstlane_b32 s17, v7 +; GFX7-NEXT: v_readfirstlane_b32 s18, v8 +; GFX7-NEXT: v_readfirstlane_b32 s19, v9 +; GFX7-NEXT: s_add_i32 s5, s13, s5 +; GFX7-NEXT: s_add_i32 s6, s14, s6 +; GFX7-NEXT: s_add_i32 s7, s15, s7 +; GFX7-NEXT: s_add_i32 s8, s16, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_add_i32 s9, s17, s9 +; GFX7-NEXT: s_add_i32 s10, s18, s10 +; GFX7-NEXT: s_add_i32 s11, s19, s11 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s8 +; GFX7-NEXT: v_mov_b32_e32 v7, s9 +; GFX7-NEXT: v_mov_b32_e32 v8, s10 +; GFX7-NEXT: v_mov_b32_e32 v9, s11 +; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[0:3], 0 addr64 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX11-LABEL: load_uniform_P4_v8i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[2:5], v6, s[0:1] +; GFX11-NEXT: global_load_b128 v[6:9], v6, s[0:1] offset:16 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_readfirstlane_b32 s11, v5 +; GFX11-NEXT: v_readfirstlane_b32 s8, v2 +; GFX11-NEXT: v_readfirstlane_b32 s9, v3 +; GFX11-NEXT: v_readfirstlane_b32 s10, v4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s15, v9 +; GFX11-NEXT: v_readfirstlane_b32 s12, v6 +; GFX11-NEXT: v_readfirstlane_b32 s13, v7 +; GFX11-NEXT: v_readfirstlane_b32 s14, v8 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_i32 s3, s11, s3 +; GFX11-NEXT: s_add_i32 s0, s8, s0 +; GFX11-NEXT: s_add_i32 s1, s9, s1 +; GFX11-NEXT: s_add_i32 s2, s10, s2 +; GFX11-NEXT: s_add_i32 s7, s15, s7 +; GFX11-NEXT: s_add_i32 s4, s12, s4 +; GFX11-NEXT: s_add_i32 s5, s13, s5 +; GFX11-NEXT: s_add_i32 s6, s14, s6 +; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 +; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P4_v8i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v6, 0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_load_b128 v[2:5], v6, s[0:1] +; GFX12-NEXT: global_load_b128 v[6:9], v6, s[0:1] offset:16 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 +; GFX12-NEXT: s_wait_loadcnt 0x1 +; GFX12-NEXT: v_readfirstlane_b32 s11, v5 +; GFX12-NEXT: v_readfirstlane_b32 s8, v2 +; GFX12-NEXT: v_readfirstlane_b32 s9, v3 +; GFX12-NEXT: v_readfirstlane_b32 s10, v4 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s15, v9 +; GFX12-NEXT: v_readfirstlane_b32 s12, v6 +; GFX12-NEXT: v_readfirstlane_b32 s13, v7 +; GFX12-NEXT: v_readfirstlane_b32 s14, v8 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s3, s11, s3 +; GFX12-NEXT: s_add_co_i32 s0, s8, s0 +; GFX12-NEXT: s_add_co_i32 s1, s9, s1 +; GFX12-NEXT: s_add_co_i32 s2, s10, s2 +; GFX12-NEXT: s_add_co_i32 s7, s15, s7 +; GFX12-NEXT: s_add_co_i32 s4, s12, s4 +; GFX12-NEXT: s_add_co_i32 s5, s13, s5 +; GFX12-NEXT: s_add_co_i32 s6, s14, s6 +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: 
v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 +; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX12-NEXT: s_endpgm + %a = load <8 x i32>, ptr addrspace(4) %ptra, align 2 + %b = load volatile <8 x i32>, ptr addrspace(4) %ptra + %sum = add <8 x i32> %a, %b + store <8 x i32> %sum, ptr addrspace(1) %out + ret void +} + +; any target, 512bit load load, not align 4 +define amdgpu_ps void @load_uniform_P4_v16i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { +; GFX7-LABEL: load_uniform_P4_v16i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16 +; GFX7-NEXT: buffer_load_dwordx4 v[10:13], off, s[0:3], 0 offset:32 +; GFX7-NEXT: buffer_load_dwordx4 v[14:17], off, s[0:3], 0 offset:48 +; GFX7-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: v_readfirstlane_b32 s20, v2 +; GFX7-NEXT: v_readfirstlane_b32 s21, v3 +; GFX7-NEXT: v_readfirstlane_b32 s22, v4 +; GFX7-NEXT: v_readfirstlane_b32 s23, v5 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_readfirstlane_b32 s24, v6 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_readfirstlane_b32 s28, v10 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s33, v14 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s20, s4 +; GFX7-NEXT: v_readfirstlane_b32 s25, v7 +; GFX7-NEXT: v_readfirstlane_b32 s26, v8 +; GFX7-NEXT: v_readfirstlane_b32 s27, v9 +; GFX7-NEXT: v_readfirstlane_b32 s29, v11 +; GFX7-NEXT: v_readfirstlane_b32 s30, v12 +; GFX7-NEXT: v_readfirstlane_b32 s31, v13 +; GFX7-NEXT: v_readfirstlane_b32 s34, v15 +; GFX7-NEXT: v_readfirstlane_b32 s35, v16 +; GFX7-NEXT: v_readfirstlane_b32 s36, v17 +; GFX7-NEXT: s_add_i32 s5, s21, s5 +; GFX7-NEXT: s_add_i32 s6, s22, s6 +; GFX7-NEXT: s_add_i32 s7, s23, s7 +; GFX7-NEXT: s_add_i32 s8, s24, s8 +; GFX7-NEXT: s_add_i32 s12, s28, s12 +; GFX7-NEXT: s_add_i32 s16, s33, s16 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_add_i32 s9, s25, s9 +; GFX7-NEXT: s_add_i32 s10, s26, s10 +; GFX7-NEXT: s_add_i32 s11, s27, s11 +; GFX7-NEXT: s_add_i32 s13, s29, s13 +; GFX7-NEXT: s_add_i32 s14, s30, s14 +; GFX7-NEXT: s_add_i32 s15, s31, s15 +; GFX7-NEXT: s_add_i32 s17, s34, s17 +; GFX7-NEXT: s_add_i32 s18, s35, s18 +; GFX7-NEXT: s_add_i32 s19, s36, s19 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s8 +; GFX7-NEXT: v_mov_b32_e32 v10, s12 +; GFX7-NEXT: v_mov_b32_e32 v14, s16 +; GFX7-NEXT: v_mov_b32_e32 v7, s9 +; GFX7-NEXT: v_mov_b32_e32 v8, s10 +; GFX7-NEXT: v_mov_b32_e32 v9, s11 +; GFX7-NEXT: v_mov_b32_e32 v11, s13 +; GFX7-NEXT: v_mov_b32_e32 v12, s14 +; GFX7-NEXT: v_mov_b32_e32 v13, s15 +; GFX7-NEXT: v_mov_b32_e32 v15, s17 +; GFX7-NEXT: v_mov_b32_e32 v16, s18 +; GFX7-NEXT: v_mov_b32_e32 v17, s19 +; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[0:3], 0 addr64 offset:16 +; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[0:3], 0 addr64 offset:32 +; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[0:3], 0 addr64 offset:48 +; GFX7-NEXT: s_endpgm +; +; GFX11-LABEL: load_uniform_P4_v16i32: +; 
GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v14, 0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_load_b128 v[2:5], v14, s[0:1] +; GFX11-NEXT: global_load_b128 v[6:9], v14, s[0:1] offset:16 +; GFX11-NEXT: global_load_b128 v[10:13], v14, s[0:1] offset:32 +; GFX11-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:48 +; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: v_readfirstlane_b32 s19, v5 +; GFX11-NEXT: v_readfirstlane_b32 s16, v2 +; GFX11-NEXT: v_readfirstlane_b32 s17, v3 +; GFX11-NEXT: v_readfirstlane_b32 s18, v4 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_readfirstlane_b32 s23, v9 +; GFX11-NEXT: v_readfirstlane_b32 s20, v6 +; GFX11-NEXT: v_readfirstlane_b32 s21, v7 +; GFX11-NEXT: v_readfirstlane_b32 s22, v8 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_readfirstlane_b32 s27, v13 +; GFX11-NEXT: v_readfirstlane_b32 s24, v10 +; GFX11-NEXT: v_readfirstlane_b32 s25, v11 +; GFX11-NEXT: v_readfirstlane_b32 s26, v12 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s31, v17 +; GFX11-NEXT: v_readfirstlane_b32 s28, v14 +; GFX11-NEXT: v_readfirstlane_b32 s29, v15 +; GFX11-NEXT: v_readfirstlane_b32 s30, v16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_i32 s3, s19, s3 +; GFX11-NEXT: s_add_i32 s0, s16, s0 +; GFX11-NEXT: s_add_i32 s1, s17, s1 +; GFX11-NEXT: s_add_i32 s2, s18, s2 +; GFX11-NEXT: s_add_i32 s7, s23, s7 +; GFX11-NEXT: s_add_i32 s4, s20, s4 +; GFX11-NEXT: s_add_i32 s5, s21, s5 +; GFX11-NEXT: s_add_i32 s6, s22, s6 +; GFX11-NEXT: s_add_i32 s11, s27, s11 +; GFX11-NEXT: v_mov_b32_e32 v5, s3 +; GFX11-NEXT: s_add_i32 s8, s24, s8 +; GFX11-NEXT: s_add_i32 s9, s25, s9 +; GFX11-NEXT: s_add_i32 s10, s26, s10 +; GFX11-NEXT: s_add_i32 s15, s31, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7 +; GFX11-NEXT: s_add_i32 s12, s28, s12 +; GFX11-NEXT: s_add_i32 s13, s29, s13 +; GFX11-NEXT: s_add_i32 s14, s30, s14 +; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v17, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v15, s13 +; GFX11-NEXT: v_mov_b32_e32 v14, s12 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P4_v16i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v14, 0 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_load_b128 v[2:5], v14, s[0:1] +; GFX12-NEXT: global_load_b128 v[6:9], v14, s[0:1] offset:16 +; GFX12-NEXT: global_load_b128 v[10:13], v14, s[0:1] offset:32 +; GFX12-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:48 +; GFX12-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 +; GFX12-NEXT: s_wait_loadcnt 0x3 +; GFX12-NEXT: v_readfirstlane_b32 s19, v5 +; GFX12-NEXT: v_readfirstlane_b32 s16, v2 +; GFX12-NEXT: v_readfirstlane_b32 s17, v3 +; GFX12-NEXT: v_readfirstlane_b32 s18, v4 +; GFX12-NEXT: s_wait_loadcnt 0x2 +; GFX12-NEXT: v_readfirstlane_b32 s23, v9 +; GFX12-NEXT: v_readfirstlane_b32 s20, v6 +; GFX12-NEXT: v_readfirstlane_b32 s21, v7 +; GFX12-NEXT: v_readfirstlane_b32 s22, v8 +; GFX12-NEXT: s_wait_loadcnt 0x1 +; GFX12-NEXT: 
v_readfirstlane_b32 s27, v13 +; GFX12-NEXT: v_readfirstlane_b32 s24, v10 +; GFX12-NEXT: v_readfirstlane_b32 s25, v11 +; GFX12-NEXT: v_readfirstlane_b32 s26, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s31, v17 +; GFX12-NEXT: v_readfirstlane_b32 s28, v14 +; GFX12-NEXT: v_readfirstlane_b32 s29, v15 +; GFX12-NEXT: v_readfirstlane_b32 s30, v16 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s3, s19, s3 +; GFX12-NEXT: s_add_co_i32 s0, s16, s0 +; GFX12-NEXT: s_add_co_i32 s1, s17, s1 +; GFX12-NEXT: s_add_co_i32 s2, s18, s2 +; GFX12-NEXT: s_add_co_i32 s7, s23, s7 +; GFX12-NEXT: s_add_co_i32 s4, s20, s4 +; GFX12-NEXT: s_add_co_i32 s5, s21, s5 +; GFX12-NEXT: s_add_co_i32 s6, s22, s6 +; GFX12-NEXT: s_add_co_i32 s11, s27, s11 +; GFX12-NEXT: v_mov_b32_e32 v5, s3 +; GFX12-NEXT: s_add_co_i32 s8, s24, s8 +; GFX12-NEXT: s_add_co_i32 s9, s25, s9 +; GFX12-NEXT: s_add_co_i32 s10, s26, s10 +; GFX12-NEXT: s_add_co_i32 s15, s31, s15 +; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7 +; GFX12-NEXT: s_add_co_i32 s12, s28, s12 +; GFX12-NEXT: s_add_co_i32 s13, s29, s13 +; GFX12-NEXT: s_add_co_i32 s14, s30, s14 +; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5 +; GFX12-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11 +; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9 +; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v17, s15 +; GFX12-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v15, s13 +; GFX12-NEXT: v_mov_b32_e32 v14, s12 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX12-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX12-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX12-NEXT: s_endpgm + %a = load <16 x i32>, ptr addrspace(4) %ptra, align 2 + %b = load volatile <16 x i32>, ptr addrspace(4) %ptra + %sum = add <16 x i32> %a, %b + store <16 x i32> %sum, ptr addrspace(1) %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform.ll new file mode 100644 index 0000000000000..bf36deac33380 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform.ll @@ -0,0 +1,602 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefixes=GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-True16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-NoTrue16 %s + +; global address space, addrspace(1) + +; gfx12, true16 is S16 16-bit load +; gfx12, without true16 is S32 16-bit any-extending load +define amdgpu_ps void @load_uniform_P1_i16_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: load_uniform_P1_i16_gfx12: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_load_d16_b16 v2, v2, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX11-NEXT: global_store_b16 
v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-True16-LABEL: load_uniform_P1_i16_gfx12: +; GFX12-True16: ; %bb.0: +; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0 +; GFX12-True16-NEXT: s_wait_kmcnt 0x0 +; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-True16-NEXT: s_endpgm +; +; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_gfx12: +; GFX12-NoTrue16: ; %bb.0: +; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0 +; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0 +; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NoTrue16-NEXT: s_endpgm + %a = load i16, ptr addrspace(1) %ptra + store i16 %a, ptr addrspace(1) %out + ret void +} + +; gfx11 and older, true16 is S16 16-bit load +; gfx11 and older, without true16 is S32 16-bit any-extending load +; both cases require align 4 and a uniform mmo to widen the mmo to a 32-bit load +define amdgpu_ps void @load_uniform_P1_i16_align4_widen_mmo_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: load_uniform_P1_i16_align4_widen_mmo_gfx11: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-True16-LABEL: load_uniform_P1_i16_align4_widen_mmo_gfx11: +; GFX12-True16: ; %bb.0: +; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0 +; GFX12-True16-NEXT: s_wait_kmcnt 0x0 +; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-True16-NEXT: s_endpgm +; +; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_align4_widen_mmo_gfx11: +; GFX12-NoTrue16: ; %bb.0: +; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0 +; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0 +; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NoTrue16-NEXT: s_endpgm + %a = load i16, ptr addrspace(1) %ptra, align 4 + store i16 %a, ptr addrspace(1) %out + ret void +} + +; gfx12, S32 8-bit any-extending load, no difference regarding true16 +define amdgpu_ps void @load_uniform_P1_i8_any_extending_load(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: load_uniform_P1_i8_any_extending_load: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_load_u8 v2, v2, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P1_i8_any_extending_load: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b8 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(1) %ptra + store i8 %a, ptr addrspace(1) %out + ret void +} + +; gfx11 and older, S32 8-bit any-extending load, no difference regarding true16 +define amdgpu_ps void @load_uniform_P1_i8_any_extending_load_align4_widen_mmo_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: load_uniform_P1_i8_any_extending_load_align4_widen_mmo_gfx11: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b8 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P1_i8_any_extending_load_align4_widen_mmo_gfx11: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: 
v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b8 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(1) %ptra, align 4 + store i8 %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @load_uniform_P1_i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: load_uniform_P1_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P1_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i32, ptr addrspace(1) %ptra + store i32 %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @load_uniform_P1_v2i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: load_uniform_P1_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P1_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-NEXT: s_endpgm + %a = load <2 x i32>, ptr addrspace(1) %ptra + store <2 x i32> %a, ptr addrspace(1) %out + ret void +} + +; gfx11, S96 load with align 16 (default) is widened to an S128 load +define amdgpu_ps void @load_uniform_P1_v3i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: load_uniform_P1_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P1_v3i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX12-NEXT: s_endpgm + %a = load <3 x i32>, ptr addrspace(1) %ptra + store <3 x i32> %a, ptr addrspace(1) %out + ret void +} + +; gfx11, S96 load with align 4 is split into load S64 + load S32 +define amdgpu_ps void @load_uniform_P1_v3i32_align4_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: load_uniform_P1_v3i32_align4_gfx11: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x8 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX11-NEXT: v_mov_b32_e32 v4, s6 +; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P1_v3i32_align4_gfx11: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX12-NEXT: s_endpgm + %a = load <3 x i32>, ptr addrspace(1) %ptra, align 4 + store <3 x i32> %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@load_uniform_P1_v4i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: load_uniform_P1_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P1_v4i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-NEXT: s_endpgm + %a = load <4 x i32>, ptr addrspace(1) %ptra + store <4 x i32> %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @load_uniform_P1_v8i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: load_uniform_P1_v8i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 +; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P1_v8i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 +; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX12-NEXT: s_endpgm + %a = load <8 x i32>, ptr addrspace(1) %ptra + store <8 x i32> %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @load_uniform_P1_v16i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: load_uniform_P1_v16i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 +; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 +; GFX11-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10 +; GFX11-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8 +; GFX11-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14 +; GFX11-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P1_v16i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: 
v_dual_mov_b32 v8, s6 +; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 +; GFX12-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10 +; GFX12-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8 +; GFX12-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14 +; GFX12-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX12-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX12-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX12-NEXT: s_endpgm + %a = load <16 x i32>, ptr addrspace(1) %ptra + store <16 x i32> %a, ptr addrspace(1) %out + ret void +} + +; constant address space, addrspace(4) + +define amdgpu_ps void @load_uniform_P4_i16_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: load_uniform_P4_i16_gfx12: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_load_d16_b16 v2, v2, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-True16-LABEL: load_uniform_P4_i16_gfx12: +; GFX12-True16: ; %bb.0: +; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0 +; GFX12-True16-NEXT: s_wait_kmcnt 0x0 +; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-True16-NEXT: s_endpgm +; +; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_gfx12: +; GFX12-NoTrue16: ; %bb.0: +; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0 +; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0 +; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NoTrue16-NEXT: s_endpgm + %a = load i16, ptr addrspace(4) %ptra + store i16 %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @load_uniform_P4_i16_align4_widen_mmo_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: load_uniform_P4_i16_align4_widen_mmo_gfx11: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-True16-LABEL: load_uniform_P4_i16_align4_widen_mmo_gfx11: +; GFX12-True16: ; %bb.0: +; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0 +; GFX12-True16-NEXT: s_wait_kmcnt 0x0 +; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-True16-NEXT: s_endpgm +; +; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_align4_widen_mmo_gfx11: +; GFX12-NoTrue16: ; %bb.0: +; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0 +; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0 +; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NoTrue16-NEXT: s_endpgm + %a = load i16, ptr addrspace(4) %ptra, align 4 + store i16 %a, ptr addrspace(1) %out + ret void +} + +; gfx12, S32 8-bit any-extending load, no difference regarding true16 +define amdgpu_ps void @load_uniform_P4_i8_any_extending_load(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: load_uniform_P4_i8_any_extending_load: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_load_u8 v2, v2, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v2, off +; GFX11-NEXT: 
s_endpgm +; +; GFX12-LABEL: load_uniform_P4_i8_any_extending_load: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b8 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(4) %ptra + store i8 %a, ptr addrspace(1) %out + ret void +} + +; gfx11 and older, S32 8-bit any-extending load, no difference regarding true16 +define amdgpu_ps void @load_uniform_P4_i8_any_extending_load_align4_widen_mmo_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: load_uniform_P4_i8_any_extending_load_align4_widen_mmo_gfx11: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b8 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P4_i8_any_extending_load_align4_widen_mmo_gfx11: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b8 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(4) %ptra, align 4 + store i8 %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @load_uniform_P4_i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: load_uniform_P4_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P4_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i32, ptr addrspace(4) %ptra + store i32 %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @load_uniform_P4_v2i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: load_uniform_P4_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P4_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-NEXT: s_endpgm + %a = load <2 x i32>, ptr addrspace(4) %ptra + store <2 x i32> %a, ptr addrspace(1) %out + ret void +} + +; gfx11, S96 load with align 16 (default) is widened to an S128 load +define amdgpu_ps void @load_uniform_P4_v3i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: load_uniform_P4_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P4_v3i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX12-NEXT: s_endpgm + %a = load <3 x i32>, ptr addrspace(4) %ptra + store <3 x i32> %a, ptr addrspace(1) %out + 
ret void +} + +; gfx11, S96 load with align 4 is split into load S64 + load S32 +define amdgpu_ps void @load_uniform_P4_v3i32_align4_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: load_uniform_P4_v3i32_align4_gfx11: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x8 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX11-NEXT: v_mov_b32_e32 v4, s6 +; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P4_v3i32_align4_gfx11: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX12-NEXT: s_endpgm + %a = load <3 x i32>, ptr addrspace(4) %ptra, align 4 + store <3 x i32> %a, ptr addrspace(1) %out + ret void +} + + +define amdgpu_ps void @load_uniform_P4_v4i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: load_uniform_P4_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P4_v4i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-NEXT: s_endpgm + %a = load <4 x i32>, ptr addrspace(4) %ptra + store <4 x i32> %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @load_uniform_P4_v8i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: load_uniform_P4_v8i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 +; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P4_v8i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 +; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX12-NEXT: s_endpgm + %a = load <8 x i32>, ptr addrspace(4) %ptra + store <8 x i32> %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @load_uniform_P4_v16i32(ptr addrspace(4) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: load_uniform_P4_v16i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX11-NEXT: 
v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 +; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 +; GFX11-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10 +; GFX11-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8 +; GFX11-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14 +; GFX11-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: load_uniform_P4_v16i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 +; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 +; GFX12-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10 +; GFX12-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8 +; GFX12-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14 +; GFX12-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12 +; GFX12-NEXT: s_clause 0x3 +; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX12-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX12-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX12-NEXT: s_endpgm + %a = load <16 x i32>, ptr addrspace(4) %ptra + store <16 x i32> %a, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-divergent.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-divergent.ll new file mode 100644 index 0000000000000..312d5b4e4c3bc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-divergent.ll @@ -0,0 +1,302 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12 %s + +define amdgpu_ps void @sextload_P0_i8(ptr addrspace(0) %ptra, ptr addrspace(0) %out) { +; GFX12-LABEL: sextload_P0_i8: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_load_i8 v0, v[0:1] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0 +; GFX12-NEXT: flat_store_b32 v[2:3], v0 +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(0) %ptra + %a32 = sext i8 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(0) %out + ret void +} + +define amdgpu_ps void @sextload_P0_i16(ptr addrspace(0) %ptra, ptr addrspace(0) %out) { +; GFX12-LABEL: sextload_P0_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_load_i16 v0, v[0:1] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0 +; GFX12-NEXT: flat_store_b32 v[2:3], v0 +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(0) %ptra + %a32 = sext i16 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(0) %out + ret void +} + +define amdgpu_ps void @zextload_P0_i8(ptr addrspace(0) %ptra, ptr addrspace(0) %out) { +; GFX12-LABEL: zextload_P0_i8: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_load_u8 v0, v[0:1] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0 +; 
GFX12-NEXT: flat_store_b32 v[2:3], v0 +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(0) %ptra + %a32 = zext i8 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(0) %out + ret void +} + +define amdgpu_ps void @zextload_P0_i16(ptr addrspace(0) %ptra, ptr addrspace(0) %out) { +; GFX12-LABEL: zextload_P0_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_load_u16 v0, v[0:1] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0 +; GFX12-NEXT: flat_store_b32 v[2:3], v0 +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(0) %ptra + %a32 = zext i16 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(0) %out + ret void +} + +define amdgpu_ps void @sextload_P1_i8(ptr addrspace(1) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: sextload_P1_i8: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_i8 v0, v[0:1], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0 +; GFX12-NEXT: global_store_b32 v[2:3], v0, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(1) %ptra + %a32 = sext i8 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @sextload_P1_i16(ptr addrspace(1) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: sextload_P1_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_i16 v0, v[0:1], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0 +; GFX12-NEXT: global_store_b32 v[2:3], v0, off +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(1) %ptra + %a32 = sext i16 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @zextload_P1_i8(ptr addrspace(1) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: zextload_P1_i8: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0 +; GFX12-NEXT: global_store_b32 v[2:3], v0, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(1) %ptra + %a32 = zext i8 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @zextload_P1_i16(ptr addrspace(1) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: zextload_P1_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_u16 v0, v[0:1], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0 +; GFX12-NEXT: global_store_b32 v[2:3], v0, off +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(1) %ptra + %a32 = zext i16 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @sextload_P3_i8(ptr addrspace(3) %ptra, ptr addrspace(3) %out) { +; GFX12-LABEL: sextload_P3_i8: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_load_i8 v0, v0 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0 +; GFX12-NEXT: ds_store_b32 v1, v0 +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(3) %ptra + %a32 = sext i8 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(3) %out + ret void +} + +define amdgpu_ps void @sextload_P3_i16(ptr addrspace(3) %ptra, ptr addrspace(3) %out) { +; GFX12-LABEL: sextload_P3_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_load_i16 v0, v0 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0 +; GFX12-NEXT: ds_store_b32 v1, v0 +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(3) %ptra + %a32 = sext i16 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(3) %out + ret void +} 
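+ +; Hand-written note (not autogenerated): for the divergent loads in this file the result stays on the VGPR bank, so the sign- or zero-extending load is selected as a single memory instruction and no v_readfirstlane_b32 is needed. A rough MIR sketch, with hypothetical virtual register names: +; %dst:vgpr(s32) = G_SEXTLOAD %ptr:vgpr(p3) :: (load (s8), addrspace 3) +; selects directly to ds_load_i8, and the G_ZEXTLOAD equivalent below to ds_load_u8; contrast this with the uniform-in-vgpr tests, where the loaded value must be copied back to the SGPR bank.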
+ +define amdgpu_ps void @zextload_P3_i8(ptr addrspace(3) %ptra, ptr addrspace(3) %out) { +; GFX12-LABEL: zextload_P3_i8: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_load_u8 v0, v0 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0 +; GFX12-NEXT: ds_store_b32 v1, v0 +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(3) %ptra + %a32 = zext i8 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(3) %out + ret void +} + +define amdgpu_ps void @zextload_P3_i16(ptr addrspace(3) %ptra, ptr addrspace(3) %out) { +; GFX12-LABEL: zextload_P3_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_load_u16 v0, v0 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0 +; GFX12-NEXT: ds_store_b32 v1, v0 +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(3) %ptra + %a32 = zext i16 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(3) %out + ret void +} + +define amdgpu_ps void @sextload_P4_i8(ptr addrspace(4) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: sextload_P4_i8: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_i8 v0, v[0:1], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0 +; GFX12-NEXT: global_store_b32 v[2:3], v0, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(4) %ptra + %a32 = sext i8 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @sextload_P4_i16(ptr addrspace(4) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: sextload_P4_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_i16 v0, v[0:1], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0 +; GFX12-NEXT: global_store_b32 v[2:3], v0, off +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(4) %ptra + %a32 = sext i16 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @zextload_P4_i8(ptr addrspace(4) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: zextload_P4_i8: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_u8 v0, v[0:1], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0 +; GFX12-NEXT: global_store_b32 v[2:3], v0, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(4) %ptra + %a32 = zext i8 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @zextload_P4_i16(ptr addrspace(4) %ptra, ptr addrspace(1) %out) { +; GFX12-LABEL: zextload_P4_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_u16 v0, v[0:1], off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0 +; GFX12-NEXT: global_store_b32 v[2:3], v0, off +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(4) %ptra + %a32 = zext i16 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @sextload_P5_i8(ptr addrspace(5) %ptra, ptr addrspace(5) %out) { +; GFX12-LABEL: sextload_P5_i8: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_load_i8 v0, v0, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0 +; GFX12-NEXT: scratch_store_b32 v1, v0, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(5) %ptra + %a32 = sext i8 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(5) %out + ret void +} + +define amdgpu_ps void @sextload_P5_i16(ptr addrspace(5) %ptra, ptr addrspace(5) %out) { +; GFX12-LABEL: sextload_P5_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_load_i16 v0, v0, off +; GFX12-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0 +; GFX12-NEXT: scratch_store_b32 v1, v0, off +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(5) %ptra + %a32 = sext i16 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(5) %out + ret void +} + +define amdgpu_ps void @zextload_P5_i8(ptr addrspace(5) %ptra, ptr addrspace(5) %out) { +; GFX12-LABEL: zextload_P5_i8: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_load_u8 v0, v0, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0 +; GFX12-NEXT: scratch_store_b32 v1, v0, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(5) %ptra + %a32 = zext i8 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(5) %out + ret void +} + +define amdgpu_ps void @zextload_P5_i16(ptr addrspace(5) %ptra, ptr addrspace(5) %out) { +; GFX12-LABEL: zextload_P5_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_load_u16 v0, v0, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v0, v0, v0 +; GFX12-NEXT: scratch_store_b32 v1, v0, off +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(5) %ptra + %a32 = zext i16 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(5) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-uniform-in-vgpr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-uniform-in-vgpr.ll new file mode 100644 index 0000000000000..f12ec4dff8549 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-uniform-in-vgpr.ll @@ -0,0 +1,608 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX12 %s + +define amdgpu_ps void @sextload_and_zextload_P1_i8_not_uniform_mmo_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { +; GFX11-LABEL: sextload_and_zextload_P1_i8_not_uniform_mmo_gfx12: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_load_i8 v3, v2, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_u8 v2, v2, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sextload_and_zextload_P1_i8_not_uniform_mmo_gfx12: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_load_i8 v3, v2, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_load_u8 v2, v2, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-NEXT: v_readfirstlane_b32 s1, v2 +; GFX12-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load volatile i8, ptr addrspace(1) %ptra + %a32 = sext i8 %a to i32 + %b = load volatile i8, ptr addrspace(1) %ptrb + %b32 = zext i8 %b to i32 + %res = add i32 %a32, %b32 + store i32 %res, ptr addrspace(1) %out + ret void 
+} + +define amdgpu_ps void @sextload_P1_i8_not_align4_or_not_uniform_mmo_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { +; GFX11-LABEL: sextload_P1_i8_not_align4_or_not_uniform_mmo_gfx11: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_i8 v3, v2, s[0:1] +; GFX11-NEXT: global_load_i8 v2, v2, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sextload_P1_i8_not_align4_or_not_uniform_mmo_gfx11: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_load_i8 s0, s[0:1], 0x0 +; GFX12-NEXT: global_load_i8 v2, v2, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s1, v2 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(1) %ptra + %a32 = sext i8 %a to i32 + %b = load volatile i8, ptr addrspace(1) %ptrb, align 4 + %b32 = sext i8 %b to i32 + %res = add i32 %a32, %b32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @zextload_P1_i8_not_align4_or_not_uniform_mmo_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { +; GFX11-LABEL: zextload_P1_i8_not_align4_or_not_uniform_mmo_gfx11: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_u8 v3, v2, s[0:1] +; GFX11-NEXT: global_load_u8 v2, v2, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: zextload_P1_i8_not_align4_or_not_uniform_mmo_gfx11: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-NEXT: global_load_u8 v2, v2, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s1, v2 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(1) %ptra + %a32 = zext i8 %a to i32 + %b = load volatile i8, ptr addrspace(1) %ptrb, align 4 + %b32 = zext i8 %b to i32 + %res = add i32 %a32, %b32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @sextload_P1_i16_not_natural_align_or_not_uniform_mmo_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { +; GFX11-LABEL: sextload_P1_i16_not_natural_align_or_not_uniform_mmo_gfx12: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_i16 v3, v2, s[0:1] +; GFX11-NEXT: global_load_i16 v2, v2, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NEXT: s_add_i32 s0, s0, s1 
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sextload_P1_i16_not_natural_align_or_not_uniform_mmo_gfx12: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_load_i16 v3, v2, s[0:1] +; GFX12-NEXT: global_load_i16 v2, v2, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-NEXT: v_readfirstlane_b32 s1, v2 +; GFX12-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(1) %ptra, align 1 + %a32 = sext i16 %a to i32 + %b = load volatile i16, ptr addrspace(1) %ptrb + %b32 = sext i16 %b to i32 + %res = add i32 %a32, %b32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @sextload_P1_i16_not_align4_or_not_uniform_mmo_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { +; GFX11-LABEL: sextload_P1_i16_not_align4_or_not_uniform_mmo_gfx11: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_i16 v3, v2, s[0:1] +; GFX11-NEXT: global_load_i16 v2, v2, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sextload_P1_i16_not_align4_or_not_uniform_mmo_gfx11: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_load_i16 s0, s[0:1], 0x0 +; GFX12-NEXT: global_load_i16 v2, v2, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s1, v2 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(1) %ptra + %a32 = sext i16 %a to i32 + %b = load volatile i16, ptr addrspace(1) %ptrb, align 4 + %b32 = sext i16 %b to i32 + %res = add i32 %a32, %b32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @zextload_P1_i16_not_natural_align_or_not_uniform_mmo_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { +; GFX11-LABEL: zextload_P1_i16_not_natural_align_or_not_uniform_mmo_gfx12: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_u16 v3, v2, s[0:1] +; GFX11-NEXT: global_load_u16 v2, v2, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: zextload_P1_i16_not_natural_align_or_not_uniform_mmo_gfx12: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_load_u16 v3, v2, s[0:1] +; GFX12-NEXT: global_load_u16 v2, v2, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v3 
+; GFX12-NEXT: v_readfirstlane_b32 s1, v2 +; GFX12-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(1) %ptra, align 1 + %a32 = zext i16 %a to i32 + %b = load volatile i16, ptr addrspace(1) %ptrb + %b32 = zext i16 %b to i32 + %res = add i32 %a32, %b32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @zextload_P1_i16_not_align4_or_not_uniform_mmo_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { +; GFX11-LABEL: zextload_P1_i16_not_align4_or_not_uniform_mmo_gfx11: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_u16 v3, v2, s[0:1] +; GFX11-NEXT: global_load_u16 v2, v2, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: zextload_P1_i16_not_align4_or_not_uniform_mmo_gfx11: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_load_u16 s0, s[0:1], 0x0 +; GFX12-NEXT: global_load_u16 v2, v2, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s1, v2 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(1) %ptra + %a32 = zext i16 %a to i32 + %b = load volatile i16, ptr addrspace(1) %ptrb, align 4 + %b32 = zext i16 %b to i32 + %res = add i32 %a32, %b32 + store i32 %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @sextload_and_zextload_P3_i8(ptr addrspace(3) inreg %ptra, ptr addrspace(3) inreg %ptrb, ptr addrspace(3) %out) { +; GFX11-LABEL: sextload_and_zextload_P3_i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: ds_load_i8 v1, v1 +; GFX11-NEXT: ds_load_u8 v2, v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: ds_store_b32 v0, v1 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sextload_and_zextload_P3_i8: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX12-NEXT: ds_load_i8 v1, v1 +; GFX12-NEXT: ds_load_u8 v2, v2 +; GFX12-NEXT: s_wait_dscnt 0x1 +; GFX12-NEXT: v_readfirstlane_b32 s0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s1, v2 +; GFX12-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NEXT: ds_store_b32 v0, v1 +; GFX12-NEXT: s_endpgm + %a = load volatile i8, ptr addrspace(3) %ptra + %a32 = sext i8 %a to i32 + %b = load volatile i8, ptr addrspace(3) %ptrb + %b32 = zext i8 %b to i32 + %res = add i32 %a32, %b32 + store i32 %res, ptr addrspace(3) %out + ret void +} + +define amdgpu_ps void @sextload_and_zextload_P3_i16(ptr addrspace(3) inreg %ptra, ptr addrspace(3) inreg %ptrb, ptr addrspace(3) %out) { +; GFX11-LABEL: 
sextload_and_zextload_P3_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: ds_load_i16 v1, v1 +; GFX11-NEXT: ds_load_u16 v2, v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: ds_store_b32 v0, v1 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sextload_and_zextload_P3_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX12-NEXT: ds_load_i16 v1, v1 +; GFX12-NEXT: ds_load_u16 v2, v2 +; GFX12-NEXT: s_wait_dscnt 0x1 +; GFX12-NEXT: v_readfirstlane_b32 s0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s1, v2 +; GFX12-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NEXT: ds_store_b32 v0, v1 +; GFX12-NEXT: s_endpgm + %a = load volatile i16, ptr addrspace(3) %ptra + %a32 = sext i16 %a to i32 + %b = load volatile i16, ptr addrspace(3) %ptrb + %b32 = zext i16 %b to i32 + %res = add i32 %a32, %b32 + store i32 %res, ptr addrspace(3) %out + ret void +} + + + +define amdgpu_ps void @sextload_and_zextload_P4_i8_not_uniform_mmo_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { +; GFX11-LABEL: sextload_and_zextload_P4_i8_not_uniform_mmo_gfx12: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_i8 v3, v2, s[0:1] glc dlc +; GFX11-NEXT: global_load_u8 v2, v2, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sextload_and_zextload_P4_i8_not_uniform_mmo_gfx12: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_i8 s0, s[0:1], 0x0 +; GFX12-NEXT: s_load_u8 s1, s[2:3], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load volatile i8, ptr addrspace(4) %ptra + %a32 = sext i8 %a to i32 + %b = load volatile i8, ptr addrspace(4) %ptrb + %b32 = zext i8 %b to i32 + %res = add i32 %a32, %b32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @sextload_P4_i8_not_align4_or_not_uniform_mmo_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { +; GFX11-LABEL: sextload_P4_i8_not_align4_or_not_uniform_mmo_gfx11: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_load_i8 v2, v2, s[0:1] +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_sext_i32_i8 s0, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NEXT: s_add_i32 s0, s1, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sextload_P4_i8_not_align4_or_not_uniform_mmo_gfx11: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_i8 s0, s[0:1], 0x0 +; GFX12-NEXT: s_load_i8 s1, s[2:3], 0x0 +; 
GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(4) %ptra + %a32 = sext i8 %a to i32 + %b = load volatile i8, ptr addrspace(4) %ptrb, align 4 + %b32 = sext i8 %b to i32 + %res = add i32 %a32, %b32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @zextload_P4_i8_not_align4_or_not_uniform_mmo_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { +; GFX11-LABEL: zextload_P4_i8_not_align4_or_not_uniform_mmo_gfx11: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_load_u8 v2, v2, s[0:1] +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NEXT: s_add_i32 s0, s1, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: zextload_P4_i8_not_align4_or_not_uniform_mmo_gfx11: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-NEXT: s_load_u8 s1, s[2:3], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(4) %ptra + %a32 = zext i8 %a to i32 + %b = load volatile i8, ptr addrspace(4) %ptrb, align 4 + %b32 = zext i8 %b to i32 + %res = add i32 %a32, %b32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @sextload_P4_i16_not_natural_align_or_not_uniform_mmo_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { +; GFX11-LABEL: sextload_P4_i16_not_natural_align_or_not_uniform_mmo_gfx12: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_i16 v3, v2, s[0:1] +; GFX11-NEXT: global_load_i16 v2, v2, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sextload_P4_i16_not_natural_align_or_not_uniform_mmo_gfx12: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_load_i16 v2, v2, s[0:1] +; GFX12-NEXT: s_load_i16 s0, s[2:3], 0x0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s1, v2 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s1, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(4) %ptra, align 1 + %a32 = sext i16 %a to i32 + %b = load volatile i16, ptr addrspace(4) %ptrb + %b32 = sext i16 %b to i32 + %res = add i32 %a32, %b32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @sextload_P4_i16_not_align4_or_not_uniform_mmo_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { +; GFX11-LABEL: 
sextload_P4_i16_not_align4_or_not_uniform_mmo_gfx11: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_load_i16 v2, v2, s[0:1] +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_sext_i32_i16 s0, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NEXT: s_add_i32 s0, s1, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sextload_P4_i16_not_align4_or_not_uniform_mmo_gfx11: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_i16 s0, s[0:1], 0x0 +; GFX12-NEXT: s_load_i16 s1, s[2:3], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(4) %ptra + %a32 = sext i16 %a to i32 + %b = load volatile i16, ptr addrspace(4) %ptrb, align 4 + %b32 = sext i16 %b to i32 + %res = add i32 %a32, %b32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @zextload_P4_i16_not_natural_align_or_not_uniform_mmo_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { +; GFX11-LABEL: zextload_P4_i16_not_natural_align_or_not_uniform_mmo_gfx12: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_u16 v3, v2, s[0:1] +; GFX11-NEXT: global_load_u16 v2, v2, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: zextload_P4_i16_not_natural_align_or_not_uniform_mmo_gfx12: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_load_u16 v2, v2, s[0:1] +; GFX12-NEXT: s_load_u16 s0, s[2:3], 0x0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s1, v2 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s1, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(4) %ptra, align 1 + %a32 = zext i16 %a to i32 + %b = load volatile i16, ptr addrspace(4) %ptrb + %b32 = zext i16 %b to i32 + %res = add i32 %a32, %b32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @zextload_P4_i16_not_align4_or_not_uniform_mmo_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { +; GFX11-LABEL: zextload_P4_i16_not_align4_or_not_uniform_mmo_gfx11: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_load_u16 v2, v2, s[0:1] +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NEXT: s_add_i32 s0, s1, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: zextload_P4_i16_not_align4_or_not_uniform_mmo_gfx11: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_u16 s0, 
s[0:1], 0x0 +; GFX12-NEXT: s_load_u16 s1, s[2:3], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(4) %ptra + %a32 = zext i16 %a to i32 + %b = load volatile i16, ptr addrspace(4) %ptrb, align 4 + %b32 = zext i16 %b to i32 + %res = add i32 %a32, %b32 + store i32 %res, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-uniform.ll new file mode 100644 index 0000000000000..e094a1451f42c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-uniform.ll @@ -0,0 +1,231 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12 %s + +define amdgpu_ps void @sextload_P1_i8_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: sextload_P1_i8_gfx12: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_load_i8 v2, v2, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: s_add_i32 s0, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sextload_P1_i8_gfx12: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_i8 s0, s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(1) %ptra + %a32 = sext i8 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @sextload_P1_i8_align4_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: sextload_P1_i8_align4_gfx11: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_sext_i32_i8 s0, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s0, s0, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sextload_P1_i8_align4_gfx11: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_i8 s0, s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(1) %ptra, align 4 + %a32 = sext i8 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @sextload_P1_i16_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: sextload_P1_i16_gfx12: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_load_i16 v2, v2, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: s_add_i32 s0, s0, s0 +; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sextload_P1_i16_gfx12: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_i16 s0, s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(1) %ptra + %a32 = sext i16 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @sextload_P1_i16_align4_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: sextload_P1_i16_align4_gfx11: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_sext_i32_i16 s0, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s0, s0, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sextload_P1_i16_align4_gfx11: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_i16 s0, s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(1) %ptra, align 4 + %a32 = sext i16 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @zextload_P1_i8_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: zextload_P1_i8_gfx12: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_load_u8 v2, v2, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: s_add_i32 s0, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: zextload_P1_i8_gfx12: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(1) %ptra + %a32 = zext i8 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @zextload_P1_i8_align4_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: zextload_P1_i8_align4_gfx11: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s0, s0, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: zextload_P1_i8_align4_gfx11: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i8, ptr addrspace(1) %ptra, align 4 + %a32 = zext i8 %a to i32 + 
%res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @zextload_P1_i16_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: zextload_P1_i16_gfx12: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_load_u16 v2, v2, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: s_add_i32 s0, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: zextload_P1_i16_gfx12: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_u16 s0, s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(1) %ptra + %a32 = zext i16 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @zextload_P1_i16_align4_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) %out) { +; GFX11-LABEL: zextload_P1_i16_align4_gfx11: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s0, s0, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: zextload_P1_i16_align4_gfx11: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_u16 s0, s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %a = load i16, ptr addrspace(1) %ptra, align 4 + %a32 = zext i16 %a to i32 + %res = add i32 %a32, %a32 + store i32 %res, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll index dc782aa08ae99..39eb41f387cf8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -o - %s | FileCheck %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -o - %s | FileCheck %s define amdgpu_cs void @test1(i32 %arg1, <4 x i32> inreg %arg2, i32, ptr addrspace(6) inreg %arg3) { ; CHECK-LABEL: test1: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.mir index dd7a3ebeab471..1c1cda2157c9f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.mir @@ -18,8 +18,7 @@ body: | ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[MV1:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (s32), addrspace 1) - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1) - ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[COPY4]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: G_STORE [[LOAD]](s32), 
[[MV1]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 @@ -74,8 +73,7 @@ body: | ; CHECK-NEXT: [[MV1:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s16>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<2 x s16>), addrspace 1) ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s32) = G_BITCAST [[LOAD]](<2 x s16>) - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1) - ; CHECK-NEXT: G_STORE [[BITCAST]](s32), [[COPY4]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: G_STORE [[BITCAST]](s32), [[MV1]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 @@ -132,8 +130,7 @@ body: | ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 ; CHECK-NEXT: [[MV1:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s64) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (s64), addrspace 1) - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1) - ; CHECK-NEXT: G_STORE [[LOAD]](s64), [[COPY4]](p1) :: (store (s64), addrspace 1) + ; CHECK-NEXT: G_STORE [[LOAD]](s64), [[MV1]](p1) :: (store (s64), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 @@ -188,8 +185,7 @@ body: | ; CHECK-NEXT: [[MV1:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<2 x s32>), addrspace 1) ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s64) = G_BITCAST [[LOAD]](<2 x s32>) - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1) - ; CHECK-NEXT: G_STORE [[BITCAST]](s64), [[COPY4]](p1) :: (store (s64), addrspace 1) + ; CHECK-NEXT: G_STORE [[BITCAST]](s64), [[MV1]](p1) :: (store (s64), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 @@ -247,8 +243,7 @@ body: | ; CHECK-NEXT: [[MV1:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<2 x s32>), addrspace 1) ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1) - ; CHECK-NEXT: G_STORE [[UV1]](s32), [[COPY4]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: G_STORE [[UV1]](s32), [[MV1]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 @@ -307,8 +302,7 @@ body: | ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s16>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<4 x s16>), addrspace 1) ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x s16>), [[UV1:%[0-9]+]]:vgpr(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>) ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s32) = G_BITCAST [[UV]](<2 x s16>) - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1) - ; CHECK-NEXT: G_STORE [[BITCAST]](s32), [[COPY4]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: G_STORE [[BITCAST]](s32), [[MV1]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir index e448c4cba0941..d52b5fe9df247 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir @@ -119,10 +119,9 @@ body: | ; GCN: liveins: $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY1]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v8i32, align 32, addrspace 1) - ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD [[COPY1]], [[C]](s64) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v8i32, align 32, addrspace 1) + ; GCN-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64) ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v8i32 + 16, basealign 32, addrspace 1) ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) @@ -152,10 +151,9 @@ body: | ; GCN: liveins: $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY1]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v4i64, align 32, addrspace 1) - ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD [[COPY1]], [[C]](s64) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v4i64, align 32, addrspace 1) + ; GCN-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64) ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v4i64 + 16, basealign 32, addrspace 1) ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s64), [[UV1:%[0-9]+]]:vgpr(s64), [[UV2:%[0-9]+]]:vgpr(s64), [[UV3:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s64>) @@ -192,16 +190,15 @@ body: | ; GCN: liveins: $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY1]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32, align 64, addrspace 1) - ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD [[COPY1]], [[C]](s64) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32, align 64, addrspace 1) + ; GCN-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64) ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32 + 16, 
basealign 64, addrspace 1) - ; GCN-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 - ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD [[COPY1]], [[C1]](s64) + ; GCN-NEXT: [[C1:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32 + ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64) ; GCN-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32 + 32, align 32, basealign 64, addrspace 1) - ; GCN-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 - ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD [[COPY1]], [[C2]](s64) + ; GCN-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48 + ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C2]](s64) ; GCN-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32 + 48, basealign 64, addrspace 1) ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>) ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) @@ -238,16 +235,15 @@ body: | ; GCN: liveins: $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY1]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64, align 64, addrspace 1) - ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD [[COPY1]], [[C]](s64) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64, align 64, addrspace 1) + ; GCN-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64) ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64 + 16, basealign 64, addrspace 1) - ; GCN-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 - ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD [[COPY1]], [[C1]](s64) + ; GCN-NEXT: [[C1:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32 + ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64) ; GCN-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD1]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64 + 32, align 32, basealign 64, addrspace 1) - ; GCN-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 - ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD [[COPY1]], [[C2]](s64) + ; GCN-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48 + ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C2]](s64) ; GCN-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD2]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64 + 48, basealign 64, addrspace 1) ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) 
= G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>), [[LOAD2]](<2 x s64>), [[LOAD3]](<2 x s64>) ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s64), [[UV1:%[0-9]+]]:vgpr(s64), [[UV2:%[0-9]+]]:vgpr(s64), [[UV3:%[0-9]+]]:vgpr(s64), [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64), [[UV6:%[0-9]+]]:vgpr(s64), [[UV7:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s64>) @@ -368,10 +364,9 @@ body: | ; GCN: liveins: $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY1]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v8i32, align 32, addrspace 4) - ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD [[COPY1]], [[C]](s64) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v8i32, align 32, addrspace 4) + ; GCN-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64) ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v8i32 + 16, basealign 32, addrspace 4) ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) @@ -400,10 +395,9 @@ body: | ; GCN: liveins: $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s128) = G_LOAD [[COPY1]](p4) :: (load (s128) from %ir.constant.not.uniform, align 32, addrspace 4) - ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD [[COPY1]], [[C]](s64) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s128) = G_LOAD [[COPY]](p4) :: (load (s128) from %ir.constant.not.uniform, align 32, addrspace 4) + ; GCN-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64) ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(s128) = G_LOAD [[PTR_ADD]](p4) :: (load (s128) from %ir.constant.not.uniform + 16, basealign 32, addrspace 4) ; GCN-NEXT: [[MV:%[0-9]+]]:vgpr(s256) = G_MERGE_VALUES [[LOAD]](s128), [[LOAD1]](s128) ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s256) @@ -433,10 +427,9 @@ body: | ; GCN: liveins: $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[COPY1]](p4) :: (load (<8 x s16>) from %ir.constant.not.uniform, align 32, addrspace 4) - ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD [[COPY1]], [[C]](s64) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[COPY]](p4) :: (load (<8 x s16>) 
from %ir.constant.not.uniform, align 32, addrspace 4) + ; GCN-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64) ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (load (<8 x s16>) from %ir.constant.not.uniform + 16, basealign 32, addrspace 4) ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s16>) = G_CONCAT_VECTORS [[LOAD]](<8 x s16>), [[LOAD1]](<8 x s16>) ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x s16>), [[UV1:%[0-9]+]]:vgpr(<2 x s16>), [[UV2:%[0-9]+]]:vgpr(<2 x s16>), [[UV3:%[0-9]+]]:vgpr(<2 x s16>), [[UV4:%[0-9]+]]:vgpr(<2 x s16>), [[UV5:%[0-9]+]]:vgpr(<2 x s16>), [[UV6:%[0-9]+]]:vgpr(<2 x s16>), [[UV7:%[0-9]+]]:vgpr(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s16>) @@ -465,10 +458,9 @@ body: | ; GCN: liveins: $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY1]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v4i64, align 32, addrspace 4) - ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD [[COPY1]], [[C]](s64) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v4i64, align 32, addrspace 4) + ; GCN-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64) ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v4i64 + 16, basealign 32, addrspace 4) ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s64), [[UV1:%[0-9]+]]:vgpr(s64), [[UV2:%[0-9]+]]:vgpr(s64), [[UV3:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s64>) @@ -505,16 +497,15 @@ body: | ; GCN: liveins: $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY1]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32, align 64, addrspace 4) - ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD [[COPY1]], [[C]](s64) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32, align 64, addrspace 4) + ; GCN-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64) ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32 + 16, basealign 64, addrspace 4) - ; GCN-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 - ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD [[COPY1]], [[C1]](s64) + ; GCN-NEXT: [[C1:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32 + ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64) ; GCN-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32 + 32, align 32, basealign 64, addrspace 4) - ; GCN-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 - ; GCN-NEXT: 
[[PTR_ADD2:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD [[COPY1]], [[C2]](s64) + ; GCN-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48 + ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C2]](s64) ; GCN-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32 + 48, basealign 64, addrspace 4) ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>) ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) @@ -551,16 +542,15 @@ body: | ; GCN: liveins: $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY1]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64, align 64, addrspace 4) - ; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD [[COPY1]], [[C]](s64) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64, align 64, addrspace 4) + ; GCN-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C]](s64) ; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64 + 16, basealign 64, addrspace 4) - ; GCN-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 - ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD [[COPY1]], [[C1]](s64) + ; GCN-NEXT: [[C1:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32 + ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64) ; GCN-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD1]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64 + 32, align 32, basealign 64, addrspace 4) - ; GCN-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 - ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD [[COPY1]], [[C2]](s64) + ; GCN-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48 + ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD [[COPY]], [[C2]](s64) ; GCN-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD2]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64 + 48, basealign 64, addrspace 4) ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>), [[LOAD2]](<2 x s64>), [[LOAD3]](<2 x s64>) ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s64), [[UV1:%[0-9]+]]:vgpr(s64), [[UV2:%[0-9]+]]:vgpr(s64), [[UV3:%[0-9]+]]:vgpr(s64), [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64), [[UV6:%[0-9]+]]:vgpr(s64), [[UV7:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s64>) @@ -736,8 +726,7 @@ body: | ; GFX7: liveins: $sgpr0_sgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) 
= COPY [[COPY]](p4) - ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s8), addrspace 4) + ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s8), addrspace 4) ; GFX7-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]] ; ; GFX12-LABEL: name: extload_constant_i8_to_i32_uniform @@ -762,8 +751,7 @@ body: | ; GCN: liveins: $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s8), addrspace 1) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s8), addrspace 1) ; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]] %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load (s8), addrspace 1, align 1) @@ -782,8 +770,7 @@ body: | ; GFX7: liveins: $sgpr0_sgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s16), addrspace 4) + ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s16), addrspace 4) ; GFX7-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]] ; ; GFX12-LABEL: name: extload_constant_i16_to_i32_uniform @@ -808,8 +795,7 @@ body: | ; GCN: liveins: $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s16), addrspace 1) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s16), addrspace 1) ; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]] %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load (s16), addrspace 1, align 2) @@ -845,8 +831,7 @@ body: | ; GCN: liveins: $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s32), align 2, addrspace 4) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s32), align 2, addrspace 4) ; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]] %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 4, align 2) @@ -865,8 +850,7 @@ body: | ; GCN: liveins: $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s32), align 1, addrspace 4) + ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p4) :: (load (s32), align 1, addrspace 4) ; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]] %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 4, align 1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sextload.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sextload.mir index b257db4f1e665..032357f611dcc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sextload.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sextload.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: 
llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -o - %s | FileCheck %s --- name: sextload_constant_i8_to_i32_uniform @@ -13,8 +12,8 @@ body: | ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p4) :: (load (s8), addrspace 4) + ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p4) :: (load (s8), addrspace 4) + ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]] %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_SEXTLOAD %0 :: (load (s8), addrspace 4, align 1) ... @@ -31,8 +30,8 @@ body: | ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p4) :: (load (s8), addrspace 1) + ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p4) :: (load (s8), addrspace 1) + ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]] %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_SEXTLOAD %0 :: (load (s8), addrspace 1, align 1) ... @@ -49,8 +48,8 @@ body: | ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p4) :: (load (s16), addrspace 4) + ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p4) :: (load (s16), addrspace 4) + ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]] %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_SEXTLOAD %0 :: (load (s16), addrspace 4, align 2) ... @@ -67,8 +66,8 @@ body: | ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p4) :: (load (s16), addrspace 1) + ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p4) :: (load (s16), addrspace 1) + ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]] %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_SEXTLOAD %0 :: (load (s16), addrspace 1, align 2) ... @@ -86,6 +85,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY [[COPY]](p3) ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p3) :: (load (s8), addrspace 3) + ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]] %0:_(p3) = COPY $sgpr0 %1:_(s32) = G_SEXTLOAD %0 :: (load (s8), addrspace 3, align 1) ... 
@@ -104,6 +104,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY [[COPY]](p3) ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p3) :: (load (s16), addrspace 3) + ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]] %0:_(p3) = COPY $sgpr0 %1:_(s32) = G_SEXTLOAD %0 :: (load (s16), addrspace 3, align 2) ... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-uniform-load-noclobber.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-uniform-load-noclobber.mir index efdf4b7f25fd7..3fa90e315fc27 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-uniform-load-noclobber.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-uniform-load-noclobber.mir @@ -1,6 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=kaveri -run-pass=regbankselect -mattr=+unaligned-access-mode %s -verify-machineinstrs -o - | FileCheck -check-prefixes=GFX7 %s -# RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -run-pass=regbankselect -mattr=+unaligned-access-mode %s -verify-machineinstrs -o - | FileCheck -check-prefixes=GFX1010 %s +# RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=kaveri -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -mattr=+unaligned-access-mode %s -verify-machineinstrs -o - | FileCheck -check-prefixes=GFX7 %s +# RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -mattr=+unaligned-access-mode %s -verify-machineinstrs -o - | FileCheck -check-prefixes=GFX1010 %s + +# FIXME: Need a merge/unmerge artifact combine. --- name: test_uniform_load_without_noclobber @@ -16,27 +18,46 @@ body: | ; GFX7-NEXT: %in_addr:sgpr(p1) = COPY $sgpr0_sgpr1 ; GFX7-NEXT: %out_addr:sgpr(p1) = COPY $sgpr2_sgpr3 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD %in_addr(p1) :: (load (<4 x s32>), align 4, addrspace 1) - ; GFX7-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD %in_addr, [[C]](s64) + ; GFX7-NEXT: %cst16:sgpr(s64) = G_CONSTANT i64 16 + ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD %in_addr, %cst16(s64) ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load (<4 x s32>) from unknown-address + 16, align 4, addrspace 1) - ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 - ; GFX7-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD %in_addr, [[C1]](s64) + ; GFX7-NEXT: %cst32:sgpr(s64) = G_CONSTANT i64 32 + ; GFX7-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD %in_addr, %cst32(s64) ; GFX7-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p1) :: (load (<4 x s32>) from unknown-address + 32, align 4, addrspace 1) - ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 - ; GFX7-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD %in_addr, [[C2]](s64) + ; GFX7-NEXT: %cst48:sgpr(s64) = G_CONSTANT i64 48 + ; GFX7-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD %in_addr, %cst48(s64) ; GFX7-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p1) :: (load (<4 x s32>) from unknown-address + 48, align 4, addrspace 1) - ; GFX7-NEXT: %load:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>) - ; GFX7-NEXT: %load0_3:vgpr(<4 x s32>), %load4_7:vgpr(<4 x s32>), %load8_11:vgpr(<4 x s32>),
%load12_15:vgpr(<4 x s32>) = G_UNMERGE_VALUES %load(<16 x s32>) - ; GFX7-NEXT: G_STORE %load0_3(<4 x s32>), %out_addr(p1) :: (store (<4 x s32>), align 4, addrspace 1) - ; GFX7-NEXT: %cst16:sgpr(s64) = G_CONSTANT i64 16 + ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>) + ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) + ; GFX7-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV]] + ; GFX7-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV1]] + ; GFX7-NEXT: [[AMDGPU_READANYLANE2:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV2]] + ; GFX7-NEXT: [[AMDGPU_READANYLANE3:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV3]] + ; GFX7-NEXT: [[AMDGPU_READANYLANE4:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV4]] + ; GFX7-NEXT: [[AMDGPU_READANYLANE5:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV5]] + ; GFX7-NEXT: [[AMDGPU_READANYLANE6:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV6]] + ; GFX7-NEXT: [[AMDGPU_READANYLANE7:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV7]] + ; GFX7-NEXT: [[AMDGPU_READANYLANE8:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV8]] + ; GFX7-NEXT: [[AMDGPU_READANYLANE9:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV9]] + ; GFX7-NEXT: [[AMDGPU_READANYLANE10:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV10]] + ; GFX7-NEXT: [[AMDGPU_READANYLANE11:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV11]] + ; GFX7-NEXT: [[AMDGPU_READANYLANE12:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV12]] + ; GFX7-NEXT: [[AMDGPU_READANYLANE13:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV13]] + ; GFX7-NEXT: [[AMDGPU_READANYLANE14:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV14]] + ; GFX7-NEXT: [[AMDGPU_READANYLANE15:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV15]] + ; GFX7-NEXT: %load:sgpr(<16 x s32>) = G_BUILD_VECTOR [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32), [[AMDGPU_READANYLANE2]](s32), [[AMDGPU_READANYLANE3]](s32), [[AMDGPU_READANYLANE4]](s32), [[AMDGPU_READANYLANE5]](s32), [[AMDGPU_READANYLANE6]](s32), [[AMDGPU_READANYLANE7]](s32), [[AMDGPU_READANYLANE8]](s32), [[AMDGPU_READANYLANE9]](s32), [[AMDGPU_READANYLANE10]](s32), [[AMDGPU_READANYLANE11]](s32), [[AMDGPU_READANYLANE12]](s32), [[AMDGPU_READANYLANE13]](s32), [[AMDGPU_READANYLANE14]](s32), [[AMDGPU_READANYLANE15]](s32) + ; GFX7-NEXT: %load0_3:sgpr(<4 x s32>), %load4_7:sgpr(<4 x s32>), %load8_11:sgpr(<4 x s32>), %load12_15:sgpr(<4 x s32>) = G_UNMERGE_VALUES %load(<16 x s32>) + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY %load0_3(<4 x s32>) + ; GFX7-NEXT: G_STORE [[COPY]](<4 x s32>), %out_addr(p1) :: (store (<4 x s32>), align 4, addrspace 1) ; GFX7-NEXT: %out_addr_plus_16:sgpr(p1) = G_PTR_ADD %out_addr, %cst16(s64) - ; GFX7-NEXT: G_STORE %load4_7(<4 x s32>), %out_addr_plus_16(p1) :: (store (<4 x s32>), align 4, addrspace 1) - ; GFX7-NEXT: %cst32:sgpr(s64) = G_CONSTANT i64 32 + ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(<4 x s32>) = COPY %load4_7(<4 x s32>) + ; GFX7-NEXT: G_STORE [[COPY1]](<4 x s32>), 
%out_addr_plus_16(p1) :: (store (<4 x s32>), align 4, addrspace 1) ; GFX7-NEXT: %out_addr_plus_32:sgpr(p1) = G_PTR_ADD %out_addr, %cst32(s64) - ; GFX7-NEXT: G_STORE %load8_11(<4 x s32>), %out_addr_plus_32(p1) :: (store (<4 x s32>), align 4, addrspace 1) - ; GFX7-NEXT: %cst48:sgpr(s64) = G_CONSTANT i64 48 + ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(<4 x s32>) = COPY %load8_11(<4 x s32>) + ; GFX7-NEXT: G_STORE [[COPY2]](<4 x s32>), %out_addr_plus_32(p1) :: (store (<4 x s32>), align 4, addrspace 1) ; GFX7-NEXT: %out_addr_plus_48:sgpr(p1) = G_PTR_ADD %out_addr, %cst48(s64) - ; GFX7-NEXT: G_STORE %load12_15(<4 x s32>), %out_addr_plus_48(p1) :: (store (<4 x s32>), align 4, addrspace 1) + ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(<4 x s32>) = COPY %load12_15(<4 x s32>) + ; GFX7-NEXT: G_STORE [[COPY3]](<4 x s32>), %out_addr_plus_48(p1) :: (store (<4 x s32>), align 4, addrspace 1) ; GFX7-NEXT: S_ENDPGM 0 ; ; GFX1010-LABEL: name: test_uniform_load_without_noclobber @@ -44,33 +65,47 @@ body: | ; GFX1010-NEXT: {{ $}} ; GFX1010-NEXT: %in_addr:sgpr(p1) = COPY $sgpr0_sgpr1 ; GFX1010-NEXT: %out_addr:sgpr(p1) = COPY $sgpr2_sgpr3 - ; GFX1010-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY %in_addr(p1) ; GFX1010-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD %in_addr(p1) :: (load (<4 x s32>), align 4, addrspace 1) - ; GFX1010-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; GFX1010-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD %in_addr, [[C]](s64) + ; GFX1010-NEXT: %cst16:sgpr(s64) = G_CONSTANT i64 16 + ; GFX1010-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD %in_addr, %cst16(s64) ; GFX1010-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load (<4 x s32>) from unknown-address + 16, align 4, addrspace 1) - ; GFX1010-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 - ; GFX1010-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD %in_addr, [[C1]](s64) + ; GFX1010-NEXT: %cst32:sgpr(s64) = G_CONSTANT i64 32 + ; GFX1010-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD %in_addr, %cst32(s64) ; GFX1010-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p1) :: (load (<4 x s32>) from unknown-address + 32, align 4, addrspace 1) - ; GFX1010-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 - ; GFX1010-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = nuw inbounds G_PTR_ADD %in_addr, [[C2]](s64) + ; GFX1010-NEXT: %cst48:sgpr(s64) = G_CONSTANT i64 48 + ; GFX1010-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = nuw inbounds G_PTR_ADD %in_addr, %cst48(s64) ; GFX1010-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p1) :: (load (<4 x s32>) from unknown-address + 48, align 4, addrspace 1) - ; GFX1010-NEXT: %load:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>) - ; GFX1010-NEXT: %load0_3:vgpr(<4 x s32>), %load4_7:vgpr(<4 x s32>), %load8_11:vgpr(<4 x s32>), %load12_15:vgpr(<4 x s32>) = G_UNMERGE_VALUES %load(<16 x s32>) - ; GFX1010-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY %out_addr(p1) - ; GFX1010-NEXT: G_STORE %load0_3(<4 x s32>), [[COPY1]](p1) :: (store (<4 x s32>), align 4, addrspace 1) - ; GFX1010-NEXT: %cst16:sgpr(s64) = G_CONSTANT i64 16 + ; GFX1010-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>) + ; GFX1010-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), 
[[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) + ; GFX1010-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV]] + ; GFX1010-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV1]] + ; GFX1010-NEXT: [[AMDGPU_READANYLANE2:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV2]] + ; GFX1010-NEXT: [[AMDGPU_READANYLANE3:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV3]] + ; GFX1010-NEXT: [[AMDGPU_READANYLANE4:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV4]] + ; GFX1010-NEXT: [[AMDGPU_READANYLANE5:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV5]] + ; GFX1010-NEXT: [[AMDGPU_READANYLANE6:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV6]] + ; GFX1010-NEXT: [[AMDGPU_READANYLANE7:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV7]] + ; GFX1010-NEXT: [[AMDGPU_READANYLANE8:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV8]] + ; GFX1010-NEXT: [[AMDGPU_READANYLANE9:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV9]] + ; GFX1010-NEXT: [[AMDGPU_READANYLANE10:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV10]] + ; GFX1010-NEXT: [[AMDGPU_READANYLANE11:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV11]] + ; GFX1010-NEXT: [[AMDGPU_READANYLANE12:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV12]] + ; GFX1010-NEXT: [[AMDGPU_READANYLANE13:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV13]] + ; GFX1010-NEXT: [[AMDGPU_READANYLANE14:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV14]] + ; GFX1010-NEXT: [[AMDGPU_READANYLANE15:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV15]] + ; GFX1010-NEXT: %load:sgpr(<16 x s32>) = G_BUILD_VECTOR [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32), [[AMDGPU_READANYLANE2]](s32), [[AMDGPU_READANYLANE3]](s32), [[AMDGPU_READANYLANE4]](s32), [[AMDGPU_READANYLANE5]](s32), [[AMDGPU_READANYLANE6]](s32), [[AMDGPU_READANYLANE7]](s32), [[AMDGPU_READANYLANE8]](s32), [[AMDGPU_READANYLANE9]](s32), [[AMDGPU_READANYLANE10]](s32), [[AMDGPU_READANYLANE11]](s32), [[AMDGPU_READANYLANE12]](s32), [[AMDGPU_READANYLANE13]](s32), [[AMDGPU_READANYLANE14]](s32), [[AMDGPU_READANYLANE15]](s32) + ; GFX1010-NEXT: %load0_3:sgpr(<4 x s32>), %load4_7:sgpr(<4 x s32>), %load8_11:sgpr(<4 x s32>), %load12_15:sgpr(<4 x s32>) = G_UNMERGE_VALUES %load(<16 x s32>) + ; GFX1010-NEXT: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY %load0_3(<4 x s32>) + ; GFX1010-NEXT: G_STORE [[COPY]](<4 x s32>), %out_addr(p1) :: (store (<4 x s32>), align 4, addrspace 1) ; GFX1010-NEXT: %out_addr_plus_16:sgpr(p1) = G_PTR_ADD %out_addr, %cst16(s64) - ; GFX1010-NEXT: [[COPY2:%[0-9]+]]:vgpr(p1) = COPY %out_addr_plus_16(p1) - ; GFX1010-NEXT: G_STORE %load4_7(<4 x s32>), [[COPY2]](p1) :: (store (<4 x s32>), align 4, addrspace 1) - ; GFX1010-NEXT: %cst32:sgpr(s64) = G_CONSTANT i64 32 + ; GFX1010-NEXT: [[COPY1:%[0-9]+]]:vgpr(<4 x s32>) = COPY %load4_7(<4 x s32>) + ; GFX1010-NEXT: G_STORE [[COPY1]](<4 x s32>), %out_addr_plus_16(p1) :: (store (<4 x s32>), align 4, addrspace 1) ; GFX1010-NEXT: %out_addr_plus_32:sgpr(p1) = G_PTR_ADD %out_addr, %cst32(s64) - ; GFX1010-NEXT: [[COPY3:%[0-9]+]]:vgpr(p1) = COPY %out_addr_plus_32(p1) - ; GFX1010-NEXT: G_STORE %load8_11(<4 x s32>), [[COPY3]](p1) :: (store (<4 x s32>), align 4, addrspace 1) - ; GFX1010-NEXT: %cst48:sgpr(s64) = G_CONSTANT i64 48 + ; GFX1010-NEXT: [[COPY2:%[0-9]+]]:vgpr(<4 x s32>) = 
COPY %load8_11(<4 x s32>) + ; GFX1010-NEXT: G_STORE [[COPY2]](<4 x s32>), %out_addr_plus_32(p1) :: (store (<4 x s32>), align 4, addrspace 1) ; GFX1010-NEXT: %out_addr_plus_48:sgpr(p1) = G_PTR_ADD %out_addr, %cst48(s64) - ; GFX1010-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY %out_addr_plus_48(p1) - ; GFX1010-NEXT: G_STORE %load12_15(<4 x s32>), [[COPY4]](p1) :: (store (<4 x s32>), align 4, addrspace 1) + ; GFX1010-NEXT: [[COPY3:%[0-9]+]]:vgpr(<4 x s32>) = COPY %load12_15(<4 x s32>) + ; GFX1010-NEXT: G_STORE [[COPY3]](<4 x s32>), %out_addr_plus_48(p1) :: (store (<4 x s32>), align 4, addrspace 1) ; GFX1010-NEXT: S_ENDPGM 0 %in_addr:_(p1) = COPY $sgpr0_sgpr1 %out_addr:_(p1) = COPY $sgpr2_sgpr3 @@ -103,15 +138,26 @@ body: | ; GFX7-NEXT: %ptr:sgpr(p4) = COPY $sgpr0_sgpr1 ; GFX7-NEXT: %out:sgpr(p1) = COPY $sgpr2_sgpr3 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD %ptr(p4) :: (load (<4 x s32>), align 1, addrspace 4) - ; GFX7-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD %ptr, [[C]](s64) - ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from unknown-address + 16, align 1, addrspace 4) - ; GFX7-NEXT: %load:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) - ; GFX7-NEXT: %load0_3:vgpr(<4 x s32>), %load4_7:vgpr(<4 x s32>) = G_UNMERGE_VALUES %load(<8 x s32>) - ; GFX7-NEXT: G_STORE %load0_3(<4 x s32>), %out(p1) :: (store (<4 x s32>), align 32, addrspace 1) ; GFX7-NEXT: %cst_16:sgpr(s64) = G_CONSTANT i64 16 + ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD %ptr, %cst_16(s64) + ; GFX7-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from unknown-address + 16, align 1, addrspace 4) + ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) + ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; GFX7-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV]] + ; GFX7-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV1]] + ; GFX7-NEXT: [[AMDGPU_READANYLANE2:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV2]] + ; GFX7-NEXT: [[AMDGPU_READANYLANE3:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV3]] + ; GFX7-NEXT: [[AMDGPU_READANYLANE4:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV4]] + ; GFX7-NEXT: [[AMDGPU_READANYLANE5:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV5]] + ; GFX7-NEXT: [[AMDGPU_READANYLANE6:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV6]] + ; GFX7-NEXT: [[AMDGPU_READANYLANE7:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV7]] + ; GFX7-NEXT: %load:sgpr(<8 x s32>) = G_BUILD_VECTOR [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32), [[AMDGPU_READANYLANE2]](s32), [[AMDGPU_READANYLANE3]](s32), [[AMDGPU_READANYLANE4]](s32), [[AMDGPU_READANYLANE5]](s32), [[AMDGPU_READANYLANE6]](s32), [[AMDGPU_READANYLANE7]](s32) + ; GFX7-NEXT: %load0_3:sgpr(<4 x s32>), %load4_7:sgpr(<4 x s32>) = G_UNMERGE_VALUES %load(<8 x s32>) + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY %load0_3(<4 x s32>) + ; GFX7-NEXT: G_STORE [[COPY]](<4 x s32>), %out(p1) :: (store (<4 x s32>), align 32, addrspace 1) ; GFX7-NEXT: %out_plus_16:sgpr(p1) = G_PTR_ADD %out, %cst_16(s64) - ; GFX7-NEXT: G_STORE 
%load4_7(<4 x s32>), %out_plus_16(p1) :: (store (<4 x s32>), align 32, addrspace 1) + ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(<4 x s32>) = COPY %load4_7(<4 x s32>) + ; GFX7-NEXT: G_STORE [[COPY1]](<4 x s32>), %out_plus_16(p1) :: (store (<4 x s32>), align 32, addrspace 1) ; GFX7-NEXT: S_ENDPGM 0 ; ; GFX1010-LABEL: name: test_s_load_constant_v8i32_align1 @@ -119,19 +165,27 @@ body: | ; GFX1010-NEXT: {{ $}} ; GFX1010-NEXT: %ptr:sgpr(p4) = COPY $sgpr0_sgpr1 ; GFX1010-NEXT: %out:sgpr(p1) = COPY $sgpr2_sgpr3 - ; GFX1010-NEXT: [[COPY:%[0-9]+]]:vgpr(p4) = COPY %ptr(p4) ; GFX1010-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD %ptr(p4) :: (load (<4 x s32>), align 1, addrspace 4) - ; GFX1010-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; GFX1010-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = nuw inbounds G_PTR_ADD %ptr, [[C]](s64) - ; GFX1010-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from unknown-address + 16, align 1, addrspace 4) - ; GFX1010-NEXT: %load:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) - ; GFX1010-NEXT: %load0_3:vgpr(<4 x s32>), %load4_7:vgpr(<4 x s32>) = G_UNMERGE_VALUES %load(<8 x s32>) - ; GFX1010-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY %out(p1) - ; GFX1010-NEXT: G_STORE %load0_3(<4 x s32>), [[COPY1]](p1) :: (store (<4 x s32>), align 32, addrspace 1) ; GFX1010-NEXT: %cst_16:sgpr(s64) = G_CONSTANT i64 16 + ; GFX1010-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = nuw inbounds G_PTR_ADD %ptr, %cst_16(s64) + ; GFX1010-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from unknown-address + 16, align 1, addrspace 4) + ; GFX1010-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) + ; GFX1010-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; GFX1010-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV]] + ; GFX1010-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV1]] + ; GFX1010-NEXT: [[AMDGPU_READANYLANE2:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV2]] + ; GFX1010-NEXT: [[AMDGPU_READANYLANE3:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV3]] + ; GFX1010-NEXT: [[AMDGPU_READANYLANE4:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV4]] + ; GFX1010-NEXT: [[AMDGPU_READANYLANE5:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV5]] + ; GFX1010-NEXT: [[AMDGPU_READANYLANE6:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV6]] + ; GFX1010-NEXT: [[AMDGPU_READANYLANE7:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV7]] + ; GFX1010-NEXT: %load:sgpr(<8 x s32>) = G_BUILD_VECTOR [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32), [[AMDGPU_READANYLANE2]](s32), [[AMDGPU_READANYLANE3]](s32), [[AMDGPU_READANYLANE4]](s32), [[AMDGPU_READANYLANE5]](s32), [[AMDGPU_READANYLANE6]](s32), [[AMDGPU_READANYLANE7]](s32) + ; GFX1010-NEXT: %load0_3:sgpr(<4 x s32>), %load4_7:sgpr(<4 x s32>) = G_UNMERGE_VALUES %load(<8 x s32>) + ; GFX1010-NEXT: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY %load0_3(<4 x s32>) + ; GFX1010-NEXT: G_STORE [[COPY]](<4 x s32>), %out(p1) :: (store (<4 x s32>), align 32, addrspace 1) ; GFX1010-NEXT: %out_plus_16:sgpr(p1) = G_PTR_ADD %out, %cst_16(s64) - ; GFX1010-NEXT: [[COPY2:%[0-9]+]]:vgpr(p1) = COPY %out_plus_16(p1) - ; GFX1010-NEXT: G_STORE %load4_7(<4 x s32>), [[COPY2]](p1) :: 
(store (<4 x s32>), align 32, addrspace 1) + ; GFX1010-NEXT: [[COPY1:%[0-9]+]]:vgpr(<4 x s32>) = COPY %load4_7(<4 x s32>) + ; GFX1010-NEXT: G_STORE [[COPY1]](<4 x s32>), %out_plus_16(p1) :: (store (<4 x s32>), align 32, addrspace 1) ; GFX1010-NEXT: S_ENDPGM 0 %ptr:_(p4) = COPY $sgpr0_sgpr1 %out:_(p1) = COPY $sgpr2_sgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-widen-scalar-loads.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-widen-scalar-loads.mir index f1f8d0b6b9df5..7838e979befef 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-widen-scalar-loads.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-widen-scalar-loads.mir @@ -1,7 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s -# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s -# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=regbankselect -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s +# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s + --- name: constant_load_i8_align8 legalized: true @@ -15,12 +16,14 @@ body: | ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), align 8, addrspace 4) ; GFX8-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32) + ; ; GFX9-LABEL: name: constant_load_i8_align8 ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), align 8, addrspace 4) ; GFX9-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32) + ; ; GFX10-LABEL: name: constant_load_i8_align8 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} @@ -44,12 +47,14 @@ body: | ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4) ; GFX8-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32) + ; ; GFX9-LABEL: name: constant_load_i8_align4 ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4) ; GFX9-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32) + ; ; GFX10-LABEL: name: constant_load_i8_align4 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} @@ -73,12 +78,14 @@ body: | ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4) ; GFX8-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32) + ; ; GFX9-LABEL: name: constant_load_i16_align4 ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant 
load (s32), addrspace 4) ; GFX9-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32) + ; ; GFX10-LABEL: name: constant_load_i16_align4 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} @@ -103,6 +110,7 @@ body: | ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4) ; GFX8-NEXT: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8 ; GFX8-NEXT: S_ENDPGM 0, implicit [[SEXT_INREG]](s32) + ; ; GFX9-LABEL: name: constant_sextload_i8_align4 ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9-NEXT: {{ $}} @@ -110,6 +118,7 @@ body: | ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4) ; GFX9-NEXT: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8 ; GFX9-NEXT: S_ENDPGM 0, implicit [[SEXT_INREG]](s32) + ; ; GFX10-LABEL: name: constant_sextload_i8_align4 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} @@ -135,6 +144,7 @@ body: | ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4) ; GFX8-NEXT: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 16 ; GFX8-NEXT: S_ENDPGM 0, implicit [[SEXT_INREG]](s32) + ; ; GFX9-LABEL: name: constant_sextload_i16_align4 ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9-NEXT: {{ $}} @@ -142,6 +152,7 @@ body: | ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 4) ; GFX9-NEXT: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 16 ; GFX9-NEXT: S_ENDPGM 0, implicit [[SEXT_INREG]](s32) + ; ; GFX10-LABEL: name: constant_sextload_i16_align4 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} @@ -169,6 +180,7 @@ body: | ; GFX8-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255 ; GFX8-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]] ; GFX8-NEXT: S_ENDPGM 0, implicit [[AND]](s32) + ; ; GFX9-LABEL: name: constant_zextload_i8_align4 ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9-NEXT: {{ $}} @@ -177,6 +189,7 @@ body: | ; GFX9-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255 ; GFX9-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]] ; GFX9-NEXT: S_ENDPGM 0, implicit [[AND]](s32) + ; ; GFX10-LABEL: name: constant_zextload_i8_align4 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} @@ -204,6 +217,7 @@ body: | ; GFX8-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 ; GFX8-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]] ; GFX8-NEXT: S_ENDPGM 0, implicit [[AND]](s32) + ; ; GFX9-LABEL: name: constant_zextload_i16_align4 ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9-NEXT: {{ $}} @@ -212,6 +226,7 @@ body: | ; GFX9-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 ; GFX9-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]] ; GFX9-NEXT: S_ENDPGM 0, implicit [[AND]](s32) + ; ; GFX10-LABEL: name: constant_zextload_i16_align4 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} @@ -237,12 +252,14 @@ body: | ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 1) ; GFX8-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32) + ; ; GFX9-LABEL: name: global_load_i8_align4 ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 1) ; GFX9-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32) + ; ; GFX10-LABEL: name: global_load_i8_align4 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} @@ -266,12 +283,14 @@ body: | ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 ; 
GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 1) ; GFX8-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32) + ; ; GFX9-LABEL: name: global_load_i16_align4 ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 1) ; GFX9-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32) + ; ; GFX10-LABEL: name: global_load_i16_align4 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} @@ -296,6 +315,7 @@ body: | ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 1) ; GFX8-NEXT: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8 ; GFX8-NEXT: S_ENDPGM 0, implicit [[SEXT_INREG]](s32) + ; ; GFX9-LABEL: name: global_sextload_i8_alig4 ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9-NEXT: {{ $}} @@ -303,6 +323,7 @@ body: | ; GFX9-NEXT: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s32), addrspace 1) ; GFX9-NEXT: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8 ; GFX9-NEXT: S_ENDPGM 0, implicit [[SEXT_INREG]](s32) + ; ; GFX10-LABEL: name: global_sextload_i8_alig4 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} @@ -329,6 +350,7 @@ body: | ; GFX8-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 ; GFX8-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]] ; GFX8-NEXT: S_ENDPGM 0, implicit [[AND]](s32) + ; ; GFX9-LABEL: name: global_zextload_i16_align4 ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9-NEXT: {{ $}} @@ -337,6 +359,7 @@ body: | ; GFX9-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 ; GFX9-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]] ; GFX9-NEXT: S_ENDPGM 0, implicit [[AND]](s32) + ; ; GFX10-LABEL: name: global_zextload_i16_align4 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} @@ -360,23 +383,25 @@ body: | ; GFX8: liveins: $sgpr0_sgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load (s8), align 2, addrspace 4) - ; GFX8-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32) + ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4) + ; GFX8-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]] + ; GFX8-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32) + ; ; GFX9-LABEL: name: constant_load_i8_align2 ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load (s8), align 2, addrspace 4) - ; GFX9-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4) + ; GFX9-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]] + ; GFX9-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32) + ; ; GFX10-LABEL: name: constant_load_i8_align2 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load (s8), align 2, addrspace 4) - ; GFX10-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32) + ; GFX10-NEXT: 
[[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4) + ; GFX10-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]] + ; GFX10-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (invariant load (s8), align 2, addrspace 4) S_ENDPGM 0, implicit %1 @@ -392,23 +417,25 @@ body: | ; GFX8: liveins: $sgpr0_sgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load (s16), addrspace 4) - ; GFX8-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32) + ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4) + ; GFX8-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]] + ; GFX8-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32) + ; ; GFX9-LABEL: name: constant_load_i16_align2 ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load (s16), addrspace 4) - ; GFX9-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4) + ; GFX9-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]] + ; GFX9-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32) + ; ; GFX10-LABEL: name: constant_load_i16_align2 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load (s16), addrspace 4) - ; GFX10-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4) + ; GFX10-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]] + ; GFX10-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (invariant load (s16), align 2, addrspace 4) S_ENDPGM 0, implicit %1 @@ -424,23 +451,25 @@ body: | ; GFX8: liveins: $sgpr0_sgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GFX8-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load (s8), align 2, addrspace 4) - ; GFX8-NEXT: S_ENDPGM 0, implicit [[SEXTLOAD]](s32) + ; GFX8-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4) + ; GFX8-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]] + ; GFX8-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32) + ; ; GFX9-LABEL: name: constant_sextload_i8_align2 ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GFX9-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load (s8), align 2, addrspace 4) - ; GFX9-NEXT: S_ENDPGM 0, implicit [[SEXTLOAD]](s32) + ; GFX9-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4) + ; GFX9-NEXT: 
[[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]] + ; GFX9-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32) + ; ; GFX10-LABEL: name: constant_sextload_i8_align2 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GFX10-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load (s8), align 2, addrspace 4) - ; GFX10-NEXT: S_ENDPGM 0, implicit [[SEXTLOAD]](s32) + ; GFX10-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4) + ; GFX10-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]] + ; GFX10-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(s32) = G_SEXTLOAD %0 :: (invariant load (s8), align 2, addrspace 4) S_ENDPGM 0, implicit %1 @@ -456,23 +485,25 @@ body: | ; GFX8: liveins: $sgpr0_sgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GFX8-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load (s16), addrspace 4) - ; GFX8-NEXT: S_ENDPGM 0, implicit [[SEXTLOAD]](s32) + ; GFX8-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4) + ; GFX8-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]] + ; GFX8-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32) + ; ; GFX9-LABEL: name: constant_sextload_i16_align2 ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GFX9-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load (s16), addrspace 4) - ; GFX9-NEXT: S_ENDPGM 0, implicit [[SEXTLOAD]](s32) + ; GFX9-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4) + ; GFX9-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]] + ; GFX9-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32) + ; ; GFX10-LABEL: name: constant_sextload_i16_align2 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GFX10-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load (s16), addrspace 4) - ; GFX10-NEXT: S_ENDPGM 0, implicit [[SEXTLOAD]](s32) + ; GFX10-NEXT: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4) + ; GFX10-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[SEXTLOAD]] + ; GFX10-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(s32) = G_SEXTLOAD %0 :: (invariant load (s16), align 2, addrspace 4) S_ENDPGM 0, implicit %1 @@ -488,23 +519,25 @@ body: | ; GFX8: liveins: $sgpr0_sgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GFX8-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load (s8), align 2, addrspace 4) - ; GFX8-NEXT: S_ENDPGM 0, implicit [[ZEXTLOAD]](s32) + ; GFX8-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4) + ; GFX8-NEXT: 
[[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]] + ; GFX8-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32) + ; ; GFX9-LABEL: name: constant_zextload_i8_align2 ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load (s8), align 2, addrspace 4) - ; GFX9-NEXT: S_ENDPGM 0, implicit [[ZEXTLOAD]](s32) + ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4) + ; GFX9-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]] + ; GFX9-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32) + ; ; GFX10-LABEL: name: constant_zextload_i8_align2 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load (s8), align 2, addrspace 4) - ; GFX10-NEXT: S_ENDPGM 0, implicit [[ZEXTLOAD]](s32) + ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p1) :: (invariant load (s8), align 2, addrspace 4) + ; GFX10-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]] + ; GFX10-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(s32) = G_ZEXTLOAD %0 :: (invariant load (s8), align 2, addrspace 4) S_ENDPGM 0, implicit %1 @@ -520,23 +553,25 @@ body: | ; GFX8: liveins: $sgpr0_sgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GFX8-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load (s16), addrspace 4) - ; GFX8-NEXT: S_ENDPGM 0, implicit [[ZEXTLOAD]](s32) + ; GFX8-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4) + ; GFX8-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]] + ; GFX8-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32) + ; ; GFX9-LABEL: name: constant_zextload_i16_align2 ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load (s16), addrspace 4) - ; GFX9-NEXT: S_ENDPGM 0, implicit [[ZEXTLOAD]](s32) + ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4) + ; GFX9-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]] + ; GFX9-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32) + ; ; GFX10-LABEL: name: constant_zextload_i16_align2 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load (s16), addrspace 4) - ; GFX10-NEXT: S_ENDPGM 0, implicit [[ZEXTLOAD]](s32) + ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p1) :: (invariant load (s16), addrspace 4) + ; GFX10-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]] + ; GFX10-NEXT: S_ENDPGM 0, implicit 
[[AMDGPU_READANYLANE]](s32) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(s32) = G_ZEXTLOAD %0 :: (invariant load (s16), align 2, addrspace 4) S_ENDPGM 0, implicit %1 @@ -552,23 +587,25 @@ body: | ; GFX8: liveins: $sgpr0_sgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load (s8), align 4, addrspace 3) - ; GFX8-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32) + ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (load (s8), align 4, addrspace 3) + ; GFX8-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]] + ; GFX8-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32) + ; ; GFX9-LABEL: name: local_load_i8_align4 ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load (s8), align 4, addrspace 3) - ; GFX9-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (load (s8), align 4, addrspace 3) + ; GFX9-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]] + ; GFX9-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32) + ; ; GFX10-LABEL: name: local_load_i8_align4 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load (s8), align 4, addrspace 3) - ; GFX10-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (load (s8), align 4, addrspace 3) + ; GFX10-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]] + ; GFX10-NEXT: S_ENDPGM 0, implicit [[AMDGPU_READANYLANE]](s32) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load (s8), align 4, addrspace 3) S_ENDPGM 0, implicit %1 @@ -587,6 +624,7 @@ body: | ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load (s8), align 4, addrspace 5) ; GFX8-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32) + ; ; GFX9-LABEL: name: private_load_i8_align4 ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9-NEXT: {{ $}} @@ -594,6 +632,7 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load (s8), align 4, addrspace 5) ; GFX9-NEXT: S_ENDPGM 0, implicit [[LOAD]](s32) + ; ; GFX10-LABEL: name: private_load_i8_align4 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zextload.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zextload.mir index 29db4cf9eedf5..7f48a30b2069f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zextload.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zextload.mir @@ -12,8 +12,7 @@ body: | ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p4) :: (load (s8), addrspace 4) + ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p4) :: (load (s8), addrspace 4) ; CHECK-NEXT: 
[[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]] %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_ZEXTLOAD %0 :: (load (s8), addrspace 4, align 1) @@ -31,8 +30,7 @@ body: | ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p4) :: (load (s8), addrspace 1) + ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p4) :: (load (s8), addrspace 1) ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]] %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_ZEXTLOAD %0 :: (load (s8), addrspace 1, align 1) @@ -50,8 +48,7 @@ body: | ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p4) :: (load (s16), addrspace 4) + ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p4) :: (load (s16), addrspace 4) ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]] %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_ZEXTLOAD %0 :: (load (s16), addrspace 4, align 2) @@ -69,8 +66,7 @@ body: | ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) - ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p4) :: (load (s16), addrspace 1) + ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY]](p4) :: (load (s16), addrspace 1) ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]] %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_ZEXTLOAD %0 :: (load (s16), addrspace 1, align 2) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir index a5711418a8000..3b5ec94aeb980 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global -run-pass=regbankselect %s -verify-machineinstrs -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s --- | define amdgpu_kernel void @load_constant(ptr addrspace(4) %ptr0) { @@ -110,8 +110,8 @@ body: | ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load (s32) from %ir.ptr1, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (load (s32) from %ir.ptr1, addrspace 1) + ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]] %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load (s32) from %ir.ptr1) ... 
@@ -127,8 +127,8 @@ body: | ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load (s32) from %ir.ptr1, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (load (s32) from %ir.ptr1, addrspace 1) + ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]] %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load (s32) from %ir.ptr1) ... @@ -144,8 +144,8 @@ body: | ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (volatile invariant load (s32) from %ir.ptr1, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (volatile invariant load (s32) from %ir.ptr1, addrspace 1) + ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]] %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (volatile invariant load (s32) from %ir.ptr1) ... @@ -161,8 +161,8 @@ body: | ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load acquire (s32) from %ir.ptr1, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load acquire (s32) from %ir.ptr1, addrspace 1) + ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]] %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (invariant load acquire (s32) from %ir.ptr1) ... @@ -178,8 +178,8 @@ body: | ; CHECK: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load (s32) from %ir.tmp1, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY]](p1) :: (load (s32) from %ir.tmp1, addrspace 1) + ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]] %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load (s32) from %ir.tmp1) ... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector.ll index 084f2400a536e..c82bd6b3a4c4b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -march=amdgcn -mtriple=amdgcn-amd-hmcsa -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s +; RUN: llc -global-isel -new-reg-bank-select -march=amdgcn -mtriple=amdgcn-amd-hmcsa -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s define void @shuffle_to_extract(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GFX942-LABEL: shuffle_to_extract: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll index 766b869aabe0f..5e6894b379a42 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll @@ -1,8 +1,8 @@ -; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti -show-mc-encoding -global-isel | FileCheck --check-prefixes=SI,GCN %s -; RUN: llc < %s -mtriple=amdgcn -mcpu=bonaire -show-mc-encoding -global-isel | FileCheck --check-prefixes=CI,GCN,SICIVI %s -; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -show-mc-encoding -global-isel | FileCheck --check-prefixes=VI,GCN,SICIVI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -show-mc-encoding -global-isel < %s | FileCheck --check-prefixes=GFX9_10,GCN,VIGFX9_10,SIVIGFX9_10 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -show-mc-encoding -global-isel < %s | FileCheck --check-prefixes=GFX9_10,GCN,VIGFX9_10,SIVIGFX9_10 %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti -show-mc-encoding -global-isel -new-reg-bank-select | FileCheck --check-prefixes=SI,GCN %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=bonaire -show-mc-encoding -global-isel -new-reg-bank-select | FileCheck --check-prefixes=CI,GCN,SICIVI %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -show-mc-encoding -global-isel -new-reg-bank-select | FileCheck --check-prefixes=VI,GCN,SICIVI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -show-mc-encoding -global-isel -new-reg-bank-select < %s | FileCheck --check-prefixes=GFX9_10,GCN,VIGFX9_10,SIVIGFX9_10 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -show-mc-encoding -global-isel -new-reg-bank-select < %s | FileCheck --check-prefixes=GFX9_10,GCN,VIGFX9_10,SIVIGFX9_10 %s ; SMRD load with an immediate offset. 
; GCN-LABEL: {{^}}smrd0: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-divergent-addr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-divergent-addr.ll new file mode 100644 index 0000000000000..bdd3cfe717aeb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-divergent-addr.ll @@ -0,0 +1,429 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12 %s + +define amdgpu_ps void @store_P0_i8(i8 %a, ptr addrspace(0) %out) { +; GFX7-LABEL: store_P0_i8: +; GFX7: ; %bb.0: +; GFX7-NEXT: flat_store_byte v[1:2], v0 +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_P0_i8: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_store_b8 v[1:2], v0 +; GFX12-NEXT: s_endpgm + store i8 %a, ptr addrspace(0) %out + ret void +} + +define amdgpu_ps void @store_P0_i16(i16 %a, ptr addrspace(0) %out) { +; GFX7-LABEL: store_P0_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: flat_store_short v[1:2], v0 +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_P0_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_store_b16 v[1:2], v0 +; GFX12-NEXT: s_endpgm + store i16 %a, ptr addrspace(0) %out + ret void +} + +define amdgpu_ps void @store_P0_i32(i32 %a, ptr addrspace(0) %out) { +; GFX7-LABEL: store_P0_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: flat_store_dword v[1:2], v0 +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_P0_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_store_b32 v[1:2], v0 +; GFX12-NEXT: s_endpgm + store i32 %a, ptr addrspace(0) %out + ret void +} + +define amdgpu_ps void @store_P0_v2i32(<2 x i32> %a, ptr addrspace(0) %out) { +; GFX7-LABEL: store_P0_v2i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v2 +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX7-NEXT: flat_store_dword v[2:3], v1 +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_P0_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm + store <2 x i32> %a, ptr addrspace(0) %out + ret void +} + +define amdgpu_ps void @store_P0_v3i32(<3 x i32> %a, ptr addrspace(0) %out) { +; GFX7-LABEL: store_P0_v3i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v3 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc +; GFX7-NEXT: flat_store_dword v[3:4], v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v3 +; GFX7-NEXT: flat_store_dword v[5:6], v1 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_P0_v3i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_store_b96 v[3:4], v[0:2] +; GFX12-NEXT: s_endpgm + store <3 x i32> %a, ptr addrspace(0) %out + ret void +} + +define amdgpu_ps void @store_P0_v4i32(<4 x i32> %a, ptr addrspace(0) %out) { +; GFX7-LABEL: store_P0_v4i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_add_i32_e32 v6, vcc, 4, v4 +; GFX7-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc +; GFX7-NEXT: flat_store_dword v[4:5], v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v4 +; GFX7-NEXT: flat_store_dword v[6:7], v1 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v4 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v3 +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_P0_v4i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: flat_store_b128 v[4:5], v[0:3] +; 
GFX12-NEXT: s_endpgm + store <4 x i32> %a, ptr addrspace(0) %out + ret void +} + +define amdgpu_ps void @store_P1_i8(i8 %a, ptr addrspace(1) %out) { +; GFX7-LABEL: store_P1_i8: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_store_byte v0, v[1:2], s[0:3], 0 addr64 +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_P1_i8: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b8 v[1:2], v0, off +; GFX12-NEXT: s_endpgm + store i8 %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @store_P1_i16(i16 %a, ptr addrspace(1) %out) { +; GFX7-LABEL: store_P1_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_store_short v0, v[1:2], s[0:3], 0 addr64 +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_P1_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b16 v[1:2], v0, off +; GFX12-NEXT: s_endpgm + store i16 %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @store_P1_i32(i32 %a, ptr addrspace(1) %out) { +; GFX7-LABEL: store_P1_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_P1_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b32 v[1:2], v0, off +; GFX12-NEXT: s_endpgm + store i32 %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @store_P1_v2i32(<2 x i32> %a, ptr addrspace(1) %out) { +; GFX7-LABEL: store_P1_v2i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[0:3], 0 addr64 +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_P1_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX12-NEXT: s_endpgm + store <2 x i32> %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @store_P1_v3i32(<3 x i32> %a, ptr addrspace(1) %out) { +; GFX7-LABEL: store_P1_v3i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_store_dwordx3 v[0:2], v[3:4], s[0:3], 0 addr64 +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_P1_v3i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b96 v[3:4], v[0:2], off +; GFX12-NEXT: s_endpgm + store <3 x i32> %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @store_P1_v4i32(<4 x i32> %a, ptr addrspace(1) %out) { +; GFX7-LABEL: store_P1_v4i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_P1_v4i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX12-NEXT: s_endpgm + store <4 x i32> %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @store_P3_i8(i8 %a, ptr addrspace(3) %out) { +; GFX7-LABEL: store_P3_i8: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_P3_i8: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_store_b8 v1, v0 +; GFX12-NEXT: s_endpgm + store i8 %a, ptr addrspace(3) %out + ret void +} + +define amdgpu_ps void @store_P3_i16(i16 %a, ptr addrspace(3) %out) { +; GFX7-LABEL: store_P3_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 m0, -1 +; 
GFX7-NEXT: ds_write_b16 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_P3_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_store_b16 v1, v0 +; GFX12-NEXT: s_endpgm + store i16 %a, ptr addrspace(3) %out + ret void +} + +define amdgpu_ps void @store_P3_i32(i32 %a, ptr addrspace(3) %out) { +; GFX7-LABEL: store_P3_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_P3_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_store_b32 v1, v0 +; GFX12-NEXT: s_endpgm + store i32 %a, ptr addrspace(3) %out + ret void +} + +define amdgpu_ps void @store_P3_v2i32(<2 x i32> %a, ptr addrspace(3) %out) { +; GFX7-LABEL: store_P3_v2i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_write_b64 v2, v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_P3_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_store_b64 v2, v[0:1] +; GFX12-NEXT: s_endpgm + store <2 x i32> %a, ptr addrspace(3) %out + ret void +} + +define amdgpu_ps void @store_P3_v3i32(<3 x i32> %a, ptr addrspace(3) %out) { +; GFX7-LABEL: store_P3_v3i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_write_b96 v3, v[0:2] +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_P3_v3i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_store_b96 v3, v[0:2] +; GFX12-NEXT: s_endpgm + store <3 x i32> %a, ptr addrspace(3) %out + ret void +} + +define amdgpu_ps void @store_P3_v4i32(<4 x i32> %a, ptr addrspace(3) %out) { +; GFX7-LABEL: store_P3_v4i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_write_b128 v4, v[0:3] +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_P3_v4i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: ds_store_b128 v4, v[0:3] +; GFX12-NEXT: s_endpgm + store <4 x i32> %a, ptr addrspace(3) %out + ret void +} + +define amdgpu_ps void @store_P5_i8(i8 %a, ptr addrspace(5) %out) { +; GFX7-LABEL: store_P5_i8: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_mov_b32 s4, s0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s4, s0 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: buffer_store_byte v0, v1, s[4:7], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_P5_i8: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_store_b8 v1, v0, off +; GFX12-NEXT: s_endpgm + store i8 %a, ptr addrspace(5) %out + ret void +} + +define amdgpu_ps void @store_P5_i16(i16 %a, ptr addrspace(5) %out) { +; GFX7-LABEL: store_P5_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_mov_b32 s4, s0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s4, s0 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: buffer_store_short v0, v1, s[4:7], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_P5_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_store_b16 v1, v0, off +; GFX12-NEXT: s_endpgm + store i16 %a, ptr addrspace(5) %out + ret void +} + +define amdgpu_ps void @store_P5_i32(i32 %a, ptr addrspace(5) %out) { +; GFX7-LABEL: store_P5_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_mov_b32 s4, s0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s4, s0 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_P5_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_store_b32 v1, v0, off +; GFX12-NEXT: s_endpgm + store i32 %a, ptr addrspace(5) %out + ret void +} + +define 
amdgpu_ps void @store_P5_v2i32(<2 x i32> %a, ptr addrspace(5) %out) { +; GFX7-LABEL: store_P5_v2i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_mov_b32 s4, s0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s4, s0 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: buffer_store_dword v0, v2, s[4:7], 0 offen +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v2 +; GFX7-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_P5_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: s_endpgm + store <2 x i32> %a, ptr addrspace(5) %out + ret void +} + +define amdgpu_ps void @store_P5_v3i32(<3 x i32> %a, ptr addrspace(5) %out) { +; GFX7-LABEL: store_P5_v3i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_mov_b32 s4, s0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s4, s0 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: buffer_store_dword v0, v3, s[4:7], 0 offen +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v3 +; GFX7-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v3 +; GFX7-NEXT: buffer_store_dword v2, v0, s[4:7], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_P5_v3i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_store_b96 v3, v[0:2], off +; GFX12-NEXT: s_endpgm + store <3 x i32> %a, ptr addrspace(5) %out + ret void +} + +define amdgpu_ps void @store_P5_v4i32(<4 x i32> %a, ptr addrspace(5) %out) { +; GFX7-LABEL: store_P5_v4i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_mov_b32 s4, s0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s4, s0 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: buffer_store_dword v0, v4, s[4:7], 0 offen +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GFX7-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v4 +; GFX7-NEXT: buffer_store_dword v2, v0, s[4:7], 0 offen +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v4 +; GFX7-NEXT: buffer_store_dword v3, v0, s[4:7], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_P5_v4i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: scratch_store_b128 v4, v[0:3], off +; GFX12-NEXT: s_endpgm + store <4 x i32> %a, ptr addrspace(5) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll index 38ef707fa65a2..1be50894838ab 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s +; RUN: llc -global-isel -new-reg-bank-select 
-mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s ; FIXME: -; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s +; XUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll index 1d2d330eeb61a..e16d5a7331165 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s ; FIXME: -; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s +; XUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-uniform-addr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-uniform-addr.ll new file mode 100644 index 0000000000000..3fa4892494998 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-uniform-addr.ll @@ -0,0 +1,105 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12 %s + +define amdgpu_ps void @store_uniform_P1_addr_i8(i8 %a, ptr addrspace(1) inreg %out) { +; GFX7-LABEL: store_uniform_P1_addr_i8: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_uniform_P1_addr_i8: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm + store i8 %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @store_uniform_P1_addr_i16(i16 %a, ptr addrspace(1) inreg %out) { +; GFX7-LABEL: store_uniform_P1_addr_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: 
s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_uniform_P1_addr_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm + store i16 %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @store_uniform_P1_addr_i32(i32 %a, ptr addrspace(1) inreg %out) { +; GFX7-LABEL: store_uniform_P1_addr_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_uniform_P1_addr_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm + store i32 %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @store_uniform_P1_addr_v2i32(<2 x i32> %a, ptr addrspace(1) inreg %out) { +; GFX7-LABEL: store_uniform_P1_addr_v2i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_uniform_P1_addr_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm + store <2 x i32> %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @store_uniform_P1_addr_v3i32(<3 x i32> %a, ptr addrspace(1) inreg %out) { +; GFX7-LABEL: store_uniform_P1_addr_v3i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_uniform_P1_addr_v3i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] +; GFX12-NEXT: s_endpgm + store <3 x i32> %a, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @store_uniform_P1_addr_v4i32(<4 x i32> %a, ptr addrspace(1) inreg %out) { +; GFX7-LABEL: store_uniform_P1_addr_v4i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX12-LABEL: store_uniform_P1_addr_v4i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_endpgm + store <4 x i32> %a, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/unsupported-load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/unsupported-load.ll index dccc55b7d045b..d6e8d0a0a4788 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/unsupported-load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/unsupported-load.ll @@ -1,4 +1,4 @@ -; RUN: not llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - < %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s +; RUN: not llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - < %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s ; GISEL-ERR: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_LOAD %{{[0-9]+}}:vgpr(p8) :: (load (s32) from %ir.rsrc, addrspace 8) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll index d28840d36ed65..2d3ce9469ee90 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have 
been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
+
+; FIXME: need to decide whether to move multiple instructions to VGPR

 define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr addrspace(4) %in) #0 {
 ; GFX8-LABEL: constant_load_i8_align4:
@@ -338,15 +340,17 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt
 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
 ; GFX8-NEXT: flat_load_sbyte v2, v[0:1]
 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_add_u32 s2, s0, 2
 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s2, v2
+; GFX8-NEXT: s_lshr_b32 s2, s2, 16
+; GFX8-NEXT: s_add_u32 s0, s0, 2
+; GFX8-NEXT: flat_store_short v[0:1], v2
+; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
 ; GFX8-NEXT: flat_store_short v[0:1], v2
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: flat_store_short v[0:1], v3
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: constant_sextload_i8_align2:
@@ -356,8 +360,11 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: global_load_sbyte v1, v0, s[2:3]
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: s_lshr_b32 s2, s2, 16
 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
-; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:2
 ; GFX9-NEXT: s_endpgm
 ;
 ; GFX10-LABEL: constant_sextload_i8_align2:
@@ -367,8 +374,11 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: global_load_sbyte v1, v0, s[2:3]
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10-NEXT: s_lshr_b32 s2, s2, 16
+; GFX10-NEXT: v_mov_b32_e32 v2, s2
 ; GFX10-NEXT: global_store_short v0, v1, s[0:1]
-; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
+; GFX10-NEXT: global_store_short v0, v2, s[0:1] offset:2
 ; GFX10-NEXT: s_endpgm
 %load = load i8, ptr addrspace(1) %in, align 2
 %sextload = sext i8 %load to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll
index 9cd9c4734fbe6..4511c364b8a7e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s

 define i32 @zextload_global_i1_to_i32(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: zextload_global_i1_to_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
index 93422e259b827..736fa409e4761 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
@@ -1,8 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-SDAG
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-GISEL
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-GISEL
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED,UNALIGNED-SDAG
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED,UNALIGNED-GISEL
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED,UNALIGNED-GISEL
+
+; FIXME: need to decide whether to move multiple instructions to VGPR so they can be folded by an isel pattern

 define amdgpu_kernel void @ds1align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
 ; GCN-LABEL: ds1align1:
@@ -43,11 +45,18 @@ define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out
 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0
 ; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:1
 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1
+; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s0, v1
 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 8, v1
-; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0
+; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s1, v0
+; ALIGNED-GISEL-NEXT: s_lshl_b32 s1, s1, 8
+; ALIGNED-GISEL-NEXT: s_or_b32 s0, s1, s0
+; ALIGNED-GISEL-NEXT: s_and_b32 s1, 0xffff, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s1, 8
 ; ALIGNED-GISEL-NEXT: ds_write_b8 v2, v0
-; ALIGNED-GISEL-NEXT: ds_write_b8 v2, v1 offset:1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: ds_write_b8 v2, v0 offset:1
 ; ALIGNED-GISEL-NEXT: s_endpgm
 ;
 ; UNALIGNED-LABEL: ds2align1:
@@ -105,27 +114,39 @@ define
amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ALIGNED-GISEL-LABEL: ds4align1: ; ALIGNED-GISEL: ; %bb.0: ; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, 8 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1 -; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:3 -; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:2 -; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s1 +; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2 +; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:3 +; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s0, v1 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) -; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v2 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v3 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v2, v0, v1 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 -; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v0 -; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:1 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v0 offset:2 -; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:3 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s2, s2, 8 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s4, s4, 24 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s3, s3, 16 +; ALIGNED-GISEL-NEXT: s_or_b32 s0, s2, s0 +; ALIGNED-GISEL-NEXT: s_or_b32 s2, s4, s3 +; ALIGNED-GISEL-NEXT: s_or_b32 s0, s2, s0 +; ALIGNED-GISEL-NEXT: s_and_b32 s3, 0xffff, s0 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s3, s3, 8 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s2, s0, 16 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:1 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s2, 8 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:2 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:3 ; ALIGNED-GISEL-NEXT: s_endpgm ; ; UNALIGNED-LABEL: ds4align1: @@ -166,10 +187,17 @@ define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out ; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 ; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:2 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1 +; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s0, v1 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s1, s1, 16 +; ALIGNED-GISEL-NEXT: s_or_b32 s0, s1, s0 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s0, 16 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v0 -; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v2, v0 offset:2 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1 +; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v0 offset:2 ; ALIGNED-GISEL-NEXT: s_endpgm ; ; UNALIGNED-LABEL: ds4align2: @@ -247,31 +275,56 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; 
ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6 ; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:7 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) -; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v2 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s0, v1 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s2, s2, 8 +; ALIGNED-GISEL-NEXT: s_or_b32 s0, s2, s0 +; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v3 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4 -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v4 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s3, s3, 24 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s2, s2, 16 +; ALIGNED-GISEL-NEXT: s_or_b32 s2, s3, s2 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) -; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v6, 8, v5 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v6 +; ALIGNED-GISEL-NEXT: s_or_b32 s0, s2, s0 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v5 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s3, s3, 8 +; ALIGNED-GISEL-NEXT: s_or_b32 s2, s3, s2 +; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v7 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v3, v2 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v2, 8, v1 -; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 -; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v2 offset:1 -; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, 8 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v3, v1 offset:2 -; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v4 offset:3 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 -; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v0 offset:4 -; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 offset:5 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v3, v0 offset:6 -; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 offset:7 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s4, s4, 24 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s3, s3, 16 +; ALIGNED-GISEL-NEXT: s_or_b32 s3, s4, s3 +; ALIGNED-GISEL-NEXT: s_and_b32 s4, 0xffff, s0 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s4, s4, 8 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; ALIGNED-GISEL-NEXT: s_or_b32 s2, s3, s2 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s3, s0, 16 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:1 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s3, 8 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:2 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-GISEL-NEXT: s_and_b32 s1, 0xffff, s2 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:3 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s1, 8 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s2, 16 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:4 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:5 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s0, 8 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; 
ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:6 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:7 ; ALIGNED-GISEL-NEXT: s_endpgm ; ; UNALIGNED-LABEL: ds8align1: @@ -319,15 +372,28 @@ define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out ; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2 ; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4 ; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:6 -; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 +; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s0, v1 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) -; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v2 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s2, s2, 16 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v3 -; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v1 -; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v1 offset:2 -; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v0 offset:4 -; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v0 offset:6 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v3 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s4, s4, 16 +; ALIGNED-GISEL-NEXT: s_or_b32 s0, s2, s0 +; ALIGNED-GISEL-NEXT: s_or_b32 s2, s4, s3 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s3, s0, 16 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:2 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s2, 16 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:4 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:6 ; ALIGNED-GISEL-NEXT: s_endpgm ; ; UNALIGNED-LABEL: ds8align2: @@ -428,48 +494,85 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6 ; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v0 offset:7 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) -; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v2 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s0, v1 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s2, s2, 8 +; ALIGNED-GISEL-NEXT: s_or_b32 s0, s2, s0 +; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v3 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4 -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1 -; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) -; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v6, 8, v5 -; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:8 -; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:9 -; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:10 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v4 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s3, s3, 24 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s2, s2, 16 +; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 offset:8 +; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:9 +; ALIGNED-GISEL-NEXT: s_or_b32 s2, s3, s2 +; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v6 +; ALIGNED-GISEL-NEXT: s_or_b32 s0, s2, s0 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v5 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s3, s3, 8 +; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:10 ; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:11 +; ALIGNED-GISEL-NEXT: 
s_or_b32 s2, s3, s2 +; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v7 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v6, 24, v8 -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v8 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s4, s4, 24 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s3, s3, 16 +; ALIGNED-GISEL-NEXT: s_or_b32 s3, s4, s3 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) -; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; ALIGNED-GISEL-NEXT: s_or_b32 s2, s3, s2 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s4, s4, 8 +; ALIGNED-GISEL-NEXT: s_or_b32 s3, s4, s3 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v3 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v4, v3 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v3, 8, v1 -; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 -; ALIGNED-GISEL-NEXT: v_or3_b32 v2, v6, v7, v2 -; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 -; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v3 offset:1 -; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, 8 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v5, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v1 offset:2 -; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v5 offset:3 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v2 -; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v2 offset:4 -; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:5 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v2 offset:6 -; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:7 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 -; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v0 offset:8 -; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:9 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v0 offset:10 -; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:11 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s5, s5, 24 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s4, s4, 16 +; ALIGNED-GISEL-NEXT: s_or_b32 s4, s5, s4 +; ALIGNED-GISEL-NEXT: s_and_b32 s5, 0xffff, s0 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s5, s5, 8 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; ALIGNED-GISEL-NEXT: s_or_b32 s3, s4, s3 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s4, s0, 16 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s5 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:1 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s4, 8 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:2 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-GISEL-NEXT: s_and_b32 s1, 0xffff, s2 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:3 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s1, 8 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s2, 16 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:4 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:5 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s0, 8 +; ALIGNED-GISEL-NEXT: 
v_mov_b32_e32 v0, s0 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:6 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1 +; ALIGNED-GISEL-NEXT: s_and_b32 s1, 0xffff, s3 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:7 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s1, 8 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s3, 16 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:8 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:9 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s0, 8 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:10 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:11 ; ALIGNED-GISEL-NEXT: s_endpgm ; ; UNALIGNED-LABEL: ds12align1: @@ -524,19 +627,37 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ALIGNED-GISEL-NEXT: ds_read_u16 v4, v0 offset:6 ; ALIGNED-GISEL-NEXT: ds_read_u16 v5, v0 offset:8 ; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:10 -; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v6, s1 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) -; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v2 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s0, v1 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) -; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v4 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s2, s2, 16 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v3 +; ALIGNED-GISEL-NEXT: s_or_b32 s0, s2, s0 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s2, s4, 16 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v5 -; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v1 -; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v6, v1 offset:2 -; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v2 offset:4 -; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v6, v2 offset:6 -; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v0 offset:8 -; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v6, v0 offset:10 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; ALIGNED-GISEL-NEXT: s_or_b32 s2, s2, s3 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v5 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s4, s4, 16 +; ALIGNED-GISEL-NEXT: s_or_b32 s3, s4, s3 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s4, s0, 16 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:2 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s2, 16 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:4 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:6 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s3, 16 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:8 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:10 ; ALIGNED-GISEL-NEXT: s_endpgm ; ; UNALIGNED-LABEL: ds12align2: @@ -730,63 +851,114 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6 ; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v0 offset:7 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) -; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v2 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s0, v1 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s2, s2, 8 +; ALIGNED-GISEL-NEXT: s_or_b32 s0, 
s2, s0 +; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v3 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4 -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v4 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s3, s3, 24 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s2, s2, 16 +; ALIGNED-GISEL-NEXT: s_or_b32 s2, s3, s2 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) -; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v6, 8, v5 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v6 +; ALIGNED-GISEL-NEXT: s_or_b32 s0, s2, s0 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v5 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s3, s3, 8 +; ALIGNED-GISEL-NEXT: s_or_b32 s2, s3, s2 +; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v7 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v8 -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; ALIGNED-GISEL-NEXT: v_or3_b32 v2, v3, v4, v2 -; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:8 -; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:9 -; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:10 -; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:11 -; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:12 -; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v0 offset:13 -; ALIGNED-GISEL-NEXT: ds_read_u8 v9, v0 offset:14 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v8 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s4, s4, 24 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s3, s3, 16 +; ALIGNED-GISEL-NEXT: s_or_b32 s3, s4, s3 +; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 offset:8 +; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:9 +; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:10 +; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:11 +; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:12 +; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:13 +; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:14 ; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:15 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) -; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; ALIGNED-GISEL-NEXT: s_or_b32 s2, s3, s2 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s4, s4, 8 +; ALIGNED-GISEL-NEXT: s_or_b32 s3, s4, s3 +; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v3 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 24, v6 -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; ALIGNED-GISEL-NEXT: v_or3_b32 v3, v4, v5, v3 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s5, v4 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s5, s5, 24 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s4, s4, 16 +; ALIGNED-GISEL-NEXT: s_or_b32 s4, s5, s4 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) -; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v4, v8, 8, v7 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s5, v6 +; ALIGNED-GISEL-NEXT: s_or_b32 s3, s4, s3 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v5 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s5, s5, 8 +; ALIGNED-GISEL-NEXT: s_or_b32 s4, s5, s4 +; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s5, v7 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v9 -; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v5, v4 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v4, 8, v1 -; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s1 -; 
ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 -; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v4 offset:1 -; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, 8 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v6, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v1 offset:2 -; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v6 offset:3 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v2 -; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v2 offset:4 -; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:5 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v2 offset:6 -; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:7 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v3 -; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v3 offset:8 -; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:9 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v3 offset:10 -; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:11 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 -; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v0 offset:12 -; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:13 -; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v0 offset:14 -; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:15 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s6, s6, 24 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s5, s5, 16 +; ALIGNED-GISEL-NEXT: s_or_b32 s5, s6, s5 +; ALIGNED-GISEL-NEXT: s_and_b32 s6, 0xffff, s0 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s6, s6, 8 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; ALIGNED-GISEL-NEXT: s_or_b32 s4, s5, s4 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s5, s0, 16 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:1 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s5, 8 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s5 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:2 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-GISEL-NEXT: s_and_b32 s1, 0xffff, s2 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:3 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s1, 8 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s2, 16 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:4 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:5 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s0, 8 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:6 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1 +; ALIGNED-GISEL-NEXT: s_and_b32 s1, 0xffff, s3 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:7 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s1, 8 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s3, 16 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:8 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:9 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s0, 8 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:10 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1 +; ALIGNED-GISEL-NEXT: s_and_b32 s1, 0xffff, s4 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:11 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s1, 8 +; 
ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s4, 16 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:12 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:13 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s1, s0, 8 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:14 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s1 +; ALIGNED-GISEL-NEXT: ds_write_b8 v1, v0 offset:15 ; ALIGNED-GISEL-NEXT: s_endpgm ; ; UNALIGNED-LABEL: ds16align1: @@ -847,22 +1019,46 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ALIGNED-GISEL-NEXT: ds_read_u16 v7, v0 offset:12 ; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:14 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) -; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v2 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s0, v1 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s2, s2, 16 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) -; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3 -; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v4 +; ALIGNED-GISEL-NEXT: s_or_b32 s0, s2, s0 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s3, s3, 16 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) -; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v6, 16, v5 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v6 +; ALIGNED-GISEL-NEXT: s_or_b32 s2, s3, s2 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s3, v5 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s4, s4, 16 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v7 -; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v1 -; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v1 offset:2 -; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v2 offset:4 -; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v2 offset:6 -; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v3 offset:8 -; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v3 offset:10 -; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v0 offset:12 -; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v0 offset:14 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; ALIGNED-GISEL-NEXT: s_or_b32 s3, s4, s3 +; ALIGNED-GISEL-NEXT: v_readfirstlane_b32 s4, v7 +; ALIGNED-GISEL-NEXT: s_lshl_b32 s5, s5, 16 +; ALIGNED-GISEL-NEXT: s_or_b32 s4, s5, s4 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s5, s0, 16 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s5 +; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:2 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s2, 16 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:4 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:6 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s3, 16 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:8 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:10 +; ALIGNED-GISEL-NEXT: s_lshr_b32 s0, s4, 16 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:12 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:14 ; ALIGNED-GISEL-NEXT: s_endpgm ; ; UNALIGNED-LABEL: ds16align2: diff --git a/llvm/test/CodeGen/AMDGPU/lds-size.ll b/llvm/test/CodeGen/AMDGPU/lds-size.ll index 655475c6543e2..75732a58eafc4 100644 --- 
a/llvm/test/CodeGen/AMDGPU/lds-size.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-size.ll @@ -1,5 +1,5 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=ALL -check-prefix=HSA %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=ALL -check-prefix=HSA %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=ALL -check-prefix=HSA %s ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=ALL -check-prefix=EG %s ; This test makes sure we do not double count global values when they are diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.addrspacecast.nonnull.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.addrspacecast.nonnull.ll index 265353675b349..cc1dd536020a7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.addrspacecast.nonnull.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.addrspacecast.nonnull.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=ASM,DAGISEL-ASM -; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=ASM,GISEL-ASM +; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel -new-reg-bank-select -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=ASM,GISEL-ASM define void @local_to_flat(ptr addrspace(3) %ptr) { ; ASM-LABEL: local_to_flat: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll index 50f1beba25227..2439949514ce9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll @@ -1,5 +1,5 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -global-isel -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN %s declare i64 @llvm.amdgcn.dispatch.id() #1 diff --git a/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll b/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll index 75e7a63c540e5..7ae57ff8ec276 100644 --- a/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll +++ b/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GISEL %s define i32 @range_metadata_sext_i8_signed_range_i32(ptr addrspace(1) %ptr) { ; GCN-LABEL: range_metadata_sext_i8_signed_range_i32: diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll index a3c38b17abf00..e50ed3ee95140 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll @@ -5,12 +5,12 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck 
-check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s ; Test splitting flat instruction offsets into the low and high bits ; when the offset doesn't fit in the offset field. 
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll index 20916a9a51d9e..ef9d4aff3065e 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s diff --git a/llvm/test/CodeGen/AMDGPU/read_register.ll b/llvm/test/CodeGen/AMDGPU/read_register.ll index f6a5af55840ac..bee329d8ba39a 100644 --- a/llvm/test/CodeGen/AMDGPU/read_register.ll +++ b/llvm/test/CodeGen/AMDGPU/read_register.ll @@ -1,5 +1,5 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire < %s | FileCheck %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire < %s | FileCheck %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire < %s | FileCheck %s declare i32 @llvm.read_register.i32(metadata) #0 declare i64 @llvm.read_register.i64(metadata) #0 diff --git a/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll b/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll index 902e3ef5c2397..cdb62ce664677 100644 --- a/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | 
FileCheck %s -check-prefixes=GCN -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GISEL +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GISEL define amdgpu_gfx i32 @sink_scratch_pointer(ptr addrspace(5) %stack, i32 inreg %flag) { ; GCN-LABEL: sink_scratch_pointer: diff --git a/llvm/test/CodeGen/AMDGPU/trap.ll b/llvm/test/CodeGen/AMDGPU/trap.ll index 9c7f393d35932..a7affb93c1c6a 100644 --- a/llvm/test/CodeGen/AMDGPU/trap.ll +++ b/llvm/test/CodeGen/AMDGPU/trap.ll @@ -1,27 +1,27 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s -; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s ; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mattr=+trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s -; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=+trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn--amdhsa -mattr=+trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s ; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mattr=-trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=NO-HSA-TRAP %s -; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=-trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=NO-HSA-TRAP %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn--amdhsa -mattr=-trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=NO-HSA-TRAP %s ; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mattr=-trap-handler < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s -; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=-trap-handler < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn--amdhsa -mattr=-trap-handler < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s ; enable trap handler feature ; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=TRAP-BIT -check-prefix=MESA-TRAP %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=TRAP-BIT -check-prefix=MESA-TRAP %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=TRAP-BIT -check-prefix=MESA-TRAP %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=TRAP-BIT %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=TRAP-BIT %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=TRAP-BIT %s ; disable trap handler feature ; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler < %s | FileCheck -check-prefix=GCN 
-check-prefix=NO-MESA-TRAP -check-prefix=NO-TRAP-BIT -check-prefix=NOMESA-TRAP %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=NO-TRAP-BIT -check-prefix=NOMESA-TRAP %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=NO-TRAP-BIT -check-prefix=NOMESA-TRAP %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=NO-TRAP-BIT %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=NO-TRAP-BIT %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=NO-TRAP-BIT %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s

 ; GCN-WARNING: warning: <unknown>:0:0: in function hsa_debugtrap void (ptr addrspace(1)): debugtrap handler not supported
diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
index fc323c6e66a3d..7a64e55abb8d3 100644
--- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=1 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s

 define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) {
 ;