From fd31b019d02b280c8c34a059cccc2dd0f5bc81f1 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 30 Jun 2025 19:01:57 -0700 Subject: [PATCH 1/5] New VPWidenStridedLoadRecipe --- .../Transforms/Vectorize/LoopVectorize.cpp | 3 +- llvm/lib/Transforms/Vectorize/VPlan.h | 50 +++++++++++++++- .../Transforms/Vectorize/VPlanAnalysis.cpp | 6 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 57 +++++++++++++++++-- .../Transforms/Vectorize/VPlanTransforms.cpp | 14 +++-- llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 + .../Transforms/Vectorize/VPlanVerifier.cpp | 3 +- 7 files changed, 119 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 5a1d1e75e2d5d..8ed55ecc9f6e6 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3965,7 +3965,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( [](const auto *R) { return Instruction::Select; }) .Case( [](const auto *R) { return Instruction::Store; }) - .Case( + .Case( [](const auto *R) { return Instruction::Load; }) .Case( [](const auto *R) { return Instruction::Call; }) @@ -4065,6 +4065,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, case VPDef::VPReductionPHISC: case VPDef::VPInterleaveEVLSC: case VPDef::VPInterleaveSC: + case VPDef::VPWidenStridedLoadSC: case VPDef::VPWidenLoadEVLSC: case VPDef::VPWidenLoadSC: case VPDef::VPWidenStoreEVLSC: diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 167d36b687580..ff5a021640073 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -563,6 +563,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPInterleaveEVLSC: case VPRecipeBase::VPInterleaveSC: case VPRecipeBase::VPIRInstructionSC: + case VPRecipeBase::VPWidenStridedLoadSC: case VPRecipeBase::VPWidenLoadEVLSC: case VPRecipeBase::VPWidenLoadSC: case VPRecipeBase::VPWidenStoreEVLSC: @@ -3156,7 +3157,8 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase, return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC || R->getVPDefID() == VPRecipeBase::VPWidenStoreSC || R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC || - R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC; + R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC || + R->getVPDefID() == VPRecipeBase::VPWidenStridedLoadSC; } static inline bool classof(const VPUser *U) { @@ -3277,6 +3279,52 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { } }; +/// A recipe for strided load operations, using the base address, stride, and an +/// optional mask. This recipe will generate an vp.strided.load intrinsic call +/// to represent memory accesses with a fixed stride. +struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe, + public VPValue { + VPWidenStridedLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Stride, + VPValue *VF, VPValue *Mask, + const VPIRMetadata &Metadata, DebugLoc DL) + : VPWidenMemoryRecipe( + VPDef::VPWidenStridedLoadSC, Load, {Addr, Stride, VF}, + /*Consecutive=*/false, /*Reverse=*/false, Metadata, DL), + VPValue(this, &Load) { + setMask(Mask); + } + + VPWidenStridedLoadRecipe *clone() override { + return new VPWidenStridedLoadRecipe(cast(Ingredient), getAddr(), + getStride(), getVF(), getMask(), *this, + getDebugLoc()); + } + + VP_CLASSOF_IMPL(VPDef::VPWidenStridedLoadSC); + + /// Return the stride operand. 
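The recipe introduced above carries a base address, a stride in bytes, an explicit vector length, and an optional mask, and lowers to llvm.experimental.vp.strided.load. As a reference for the memory pattern that intrinsic expresses, here is a minimal standalone C++ sketch of the scalar-equivalent semantics; the element type, function name, and raw byte buffer are illustrative assumptions, not LLVM code.

#include <cstdint>
#include <cstring>
#include <vector>

// Element I of the result comes from Base + I * StrideInBytes, for every
// active lane I < EVL; disabled lanes carry no defined value (a zero
// placeholder is used here).
std::vector<int32_t> stridedLoadRef(const uint8_t *Base, int64_t StrideInBytes,
                                    const std::vector<bool> &Mask,
                                    uint32_t EVL) {
  std::vector<int32_t> Result(EVL, 0);
  for (uint32_t I = 0; I < EVL; ++I) {
    if (!Mask[I])
      continue;
    int32_t Elt;
    std::memcpy(&Elt, Base + static_cast<int64_t>(I) * StrideInBytes,
                sizeof(Elt));
    Result[I] = Elt;
  }
  return Result;
}

Because the stride is an ordinary signed operand, the TODOs later in the series (reverse accesses as stride -1, uniform addresses as stride 0) fit the same form without new recipe kinds.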
+ VPValue *getStride() const { return getOperand(1); } + + /// Return the VF operand. + VPValue *getVF() const { return getOperand(2); } + + /// Generate a strided load. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// Returns true if the recipe only uses the first lane of operand \p Op. + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return Op == getAddr() || Op == getStride() || Op == getVF(); + } +}; + /// A recipe for widening store operations, using the stored value, the address /// to store to and an optional mask. struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe { diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index d400ceff7797c..ba76aad1b6485 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -187,8 +187,10 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) { } Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) { - assert((isa(R)) && - "Store recipes should not define any values"); + assert( + (isa( + R)) && + "Store recipes should not define any values"); return cast(&R->getIngredient())->getType(); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 5f3503d0ce57a..4554d833bcc47 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -82,6 +82,7 @@ bool VPRecipeBase::mayWriteToMemory() const { case VPWidenCastSC: case VPWidenGEPSC: case VPWidenIntOrFpInductionSC: + case VPWidenStridedLoadSC: case VPWidenLoadEVLSC: case VPWidenLoadSC: case VPWidenPHISC: @@ -105,6 +106,7 @@ bool VPRecipeBase::mayReadFromMemory() const { return cast(this)->mayReadOrWriteMemory(); case VPInstructionSC: return cast(this)->opcodeMayReadOrWriteFromMemory(); + case VPWidenStridedLoadSC: case VPWidenLoadEVLSC: case VPWidenLoadSC: return true; @@ -188,6 +190,7 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPInterleaveEVLSC: case VPInterleaveSC: return mayWriteToMemory(); + case VPWidenStridedLoadSC: case VPWidenLoadEVLSC: case VPWidenLoadSC: case VPWidenStoreEVLSC: @@ -3281,9 +3284,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, const Align Alignment = getLoadStoreAlignment(&Ingredient); unsigned AS = cast(Ctx.Types.inferScalarType(getAddr())) ->getAddressSpace(); - unsigned Opcode = isa(this) - ? Instruction::Load - : Instruction::Store; + unsigned Opcode = + isa( + this) + ? Instruction::Load + : Instruction::Store; if (!Consecutive) { // TODO: Using the original IR may not be accurate. @@ -3293,8 +3298,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, "Inconsecutive memory access should not have the order."); const Value *Ptr = getLoadStorePointerOperand(&Ingredient); - Type *PtrTy = Ptr->getType(); + if (isa(this)) + return Ctx.TTI.getStridedMemoryOpCost( + Opcode, Ty, Ptr, IsMasked, Alignment, Ctx.CostKind, &Ingredient); + Type *PtrTy = Ptr->getType(); // If the address value is uniform across all lanes, then the address can be // calculated with scalar type and broadcast. 
if (!vputils::isSingleScalar(getAddr())) @@ -3449,6 +3457,47 @@ void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, } #endif +void VPWidenStridedLoadRecipe::execute(VPTransformState &State) { + Type *ScalarDataTy = getLoadStoreType(&Ingredient); + auto *DataTy = VectorType::get(ScalarDataTy, State.VF); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + + auto &Builder = State.Builder; + Value *Addr = State.get(getAddr(), /*IsScalar*/ true); + Value *StrideInBytes = State.get(getStride(), /*IsScalar*/ true); + Value *Mask = nullptr; + if (VPValue *VPMask = getMask()) + Mask = State.get(VPMask); + else + Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); + Value *RunTimeVF = Builder.CreateZExtOrTrunc(State.get(getVF(), VPLane(0)), + Builder.getInt32Ty()); + + auto *PtrTy = Addr->getType(); + auto *StrideTy = StrideInBytes->getType(); + CallInst *NewLI = Builder.CreateIntrinsic( + Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy}, + {Addr, StrideInBytes, Mask, RunTimeVF}, nullptr, "wide.strided.load"); + NewLI->addParamAttr( + 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment)); + applyMetadata(*NewLI); + State.set(this, NewLI); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPWidenStridedLoadRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN "; + printAsOperand(O, SlotTracker); + O << " = load "; + getAddr()->printAsOperand(O, SlotTracker); + O << ", stride = "; + getStride()->printAsOperand(O, SlotTracker); + O << ", runtimeVF = "; + getVF()->printAsOperand(O, SlotTracker); +} +#endif + void VPWidenStoreRecipe::execute(VPTransformState &State) { VPValue *StoredVPValue = getStoredValue(); bool CreateScatter = !isConsecutive(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 7de94717f56e5..7730012c53576 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2497,10 +2497,12 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); - assert(all_of(Plan.getVF().users(), - IsaPred) && - "User of VF that we can't transform to EVL."); + assert( + all_of( + Plan.getVF().users(), + IsaPred) && + "User of VF that we can't transform to EVL."); Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) { return isa(U); }); @@ -2595,8 +2597,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { "New recipe must define the same number of values as the " "original."); EVLRecipe->insertBefore(CurRecipe); - if (isa( - EVLRecipe)) { + if (isa(EVLRecipe)) { for (unsigned I = 0; I < NumDefVal; ++I) { VPValue *CurVPV = CurRecipe->getVPValue(I); CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I)); diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 85c6c2c8d7965..0306972893f28 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -349,6 +349,7 @@ class VPDef { VPWidenCastSC, VPWidenGEPSC, VPWidenIntrinsicSC, + VPWidenStridedLoadSC, VPWidenLoadEVLSC, VPWidenLoadSC, VPWidenStoreEVLSC, diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 99f3bc367a548..61d97c71c2cc8 100644 --- 
a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -157,7 +157,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { return VerifyEVLUse(*S, S->getNumOperands() - 1); }) .Case( + VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe, + VPWidenStridedLoadRecipe>( [&](const VPRecipeBase *S) { return VerifyEVLUse(*S, 2); }) .Case([&](auto *R) { if (R->getNumOperands() != 3) { From 102c1c78047c376c8da4e0cef581bbee7cc29306 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 7 Jul 2025 01:02:03 -0700 Subject: [PATCH 2/5] Expand VPVectorPointerRecipe to support stride --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 5 ++++- llvm/lib/Transforms/Vectorize/VPlan.h | 17 ++++++++++------- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 15 ++++++++++++--- .../LoopVectorize/vplan-dot-printing.ll | 4 ++-- 4 files changed, 28 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 8ed55ecc9f6e6..dabb196ce416d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7552,7 +7552,10 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I), /*Stride*/ -1, Flags, I->getDebugLoc()); } else { - VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), + const DataLayout &DL = I->getDataLayout(); + auto *StrideTy = DL.getIndexType(Ptr->getUnderlyingValue()->getType()); + VPValue *StrideOne = Plan.getOrAddLiveIn(ConstantInt::get(StrideTy, 1)); + VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), StrideOne, GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), I->getDebugLoc()); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index ff5a021640073..87b5d84dceb9d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1885,20 +1885,23 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, #endif }; -/// A recipe to compute the pointers for widened memory accesses of IndexTy. +/// A recipe to compute the pointers for widened memory accesses of IndexedTy, +/// with the Stride expressed in units of IndexedTy. 
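The VPlanVerifier hunk above adds the strided load to the recipes whose EVL value must appear as operand 2, which matches the operand order fixed by the constructor in patch 1: {address, stride, VF}, with the optional mask appended last by setMask. A small sketch of that layout and check, using stand-in types rather than VPlan classes:

#include <cassert>
#include <vector>

// Stand-in for the recipe's operand list; plain ints play the role of
// VPValues. Index 0 is the address, 1 the byte stride, 2 the VF (rewritten
// to EVL by the EVL transform), and 3 the optional mask.
struct StridedLoadOperands {
  std::vector<int> Ops;
  bool hasMask() const { return Ops.size() == 4; }
};

// Mirrors the VerifyEVLUse(*S, 2) expectation for this recipe kind.
bool evlIsThirdOperand(const StridedLoadOperands &R, int EVL) {
  assert(R.Ops.size() >= 3 && "address, stride and VF/EVL always present");
  return R.Ops[2] == EVL;
}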
class VPVectorPointerRecipe : public VPRecipeWithIRFlags, - public VPUnrollPartAccessor<1> { + public VPUnrollPartAccessor<2> { Type *IndexedTy; public: - VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, GEPNoWrapFlags GEPFlags, - DebugLoc DL) - : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef(Ptr), - GEPFlags, DL), + VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, VPValue *Stride, + GEPNoWrapFlags GEPFlags, DebugLoc DL) + : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, + ArrayRef({Ptr, Stride}), GEPFlags, DL), IndexedTy(IndexedTy) {} VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC) + VPValue *getStride() const { return getOperand(1); } + void execute(VPTransformState &State) override; bool onlyFirstLaneUsed(const VPValue *Op) const override { @@ -1916,7 +1919,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags, } VPVectorPointerRecipe *clone() override { - return new VPVectorPointerRecipe(getOperand(0), IndexedTy, + return new VPVectorPointerRecipe(getOperand(0), IndexedTy, getStride(), getGEPNoWrapFlags(), getDebugLoc()); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 4554d833bcc47..3331050e175f4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2569,13 +2569,22 @@ void VPVectorEndPointerRecipe::print(raw_ostream &O, const Twine &Indent, void VPVectorPointerRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; unsigned CurrentPart = getUnrollPart(*this); - Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false, - /*IsUnitStride*/ true, CurrentPart, Builder); + Value *Stride = State.get(getStride(), /*IsScalar*/ true); + + auto *StrideC = dyn_cast(Stride); + bool IsStrideOne = StrideC && StrideC->isOne(); + bool IsUnitStride = IsStrideOne || (StrideC && StrideC->isMinusOne()); + Type *IndexTy = + getGEPIndexTy(State.VF.isScalable(), + /*IsReverse*/ false, IsUnitStride, CurrentPart, Builder); Value *Ptr = State.get(getOperand(0), VPLane(0)); + Stride = Builder.CreateSExtOrTrunc(Stride, IndexTy); Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart); + Value *Index = IsStrideOne ? 
Increment : Builder.CreateMul(Increment, Stride); + Value *ResultPtr = - Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags()); + Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags()); State.set(this, ResultPtr, /*IsScalar*/ true); } diff --git a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll index 528f2448616e8..2c757021e76ff 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll @@ -42,11 +42,11 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw ; CHECK-NEXT: " EMIT vp\<[[CAN_IV:%.+]]\> = CANONICAL-INDUCTION ir\<0\>, vp\<[[CAN_IV_NEXT:%.+]]\>\l" + ; CHECK-NEXT: " vp\<[[STEPS:%.+]]\> = SCALAR-STEPS vp\<[[CAN_IV]]\>, ir\<1\>, vp\<[[VF]]\>\l" + ; CHECK-NEXT: " CLONE ir\<%arrayidx\> = getelementptr inbounds ir\<%y\>, vp\<[[STEPS]]\>\l" + -; CHECK-NEXT: " vp\<[[VEC_PTR:%.+]]\> = vector-pointer ir\<%arrayidx\>\l" + +; CHECK-NEXT: " vp\<[[VEC_PTR:%.+]]\> = vector-pointer ir\<%arrayidx\>, ir\<1\>\l" + ; CHECK-NEXT: " WIDEN ir\<%lv\> = load vp\<[[VEC_PTR]]\>\l" + ; CHECK-NEXT: " WIDEN-INTRINSIC ir\<%call\> = call llvm.sqrt(ir\<%lv\>)\l" + ; CHECK-NEXT: " CLONE ir\<%arrayidx2\> = getelementptr inbounds ir\<%x\>, vp\<[[STEPS]]\>\l" + -; CHECK-NEXT: " vp\<[[VEC_PTR2:%.+]]\> = vector-pointer ir\<%arrayidx2\>\l" + +; CHECK-NEXT: " vp\<[[VEC_PTR2:%.+]]\> = vector-pointer ir\<%arrayidx2\>, ir\<1\>\l" + ; CHECK-NEXT: " WIDEN store vp\<[[VEC_PTR2]]\>, ir\<%call\>\l" + ; CHECK-NEXT: " EMIT vp\<[[CAN_IV_NEXT]]\> = add nuw vp\<[[CAN_IV]]\>, vp\<[[VFxUF]]\>\l" + ; CHECK-NEXT: " EMIT branch-on-count vp\<[[CAN_IV_NEXT]]\>, vp\<[[VEC_TC]]\>\l" + From bf4b88bee3cb033b6c8fafa2e30faa80375d6674 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 30 Jun 2025 20:38:38 -0700 Subject: [PATCH 3/5] Transform the gather to stride load --- .../Transforms/Vectorize/LoopVectorize.cpp | 18 +- llvm/lib/Transforms/Vectorize/VPlan.h | 28 ++- .../Transforms/Vectorize/VPlanTransforms.cpp | 181 ++++++++++++++++++ .../Transforms/Vectorize/VPlanTransforms.h | 6 + .../RISCV/blocks-with-dead-instructions.ll | 13 +- .../RISCV/masked_gather_scatter.ll | 44 ++++- .../LoopVectorize/RISCV/pr154103.ll | 55 +++++- .../LoopVectorize/RISCV/strided-accesses.ll | 166 ++++++++++------ .../RISCV/tail-folding-gather-scatter.ll | 83 ++++++-- .../RISCV/tail-folding-interleave.ll | 171 +++++++++-------- 10 files changed, 577 insertions(+), 188 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index dabb196ce416d..cb007c4b07eaa 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8650,19 +8650,14 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( *Plan)) return nullptr; + VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind); // Transform recipes to abstract recipes if it is legal and beneficial and // clamp the range for better cost estimation. // TODO: Enable following transform when the EVL-version of extended-reduction // and mulacc-reduction are implemented. 
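Patch 2's VPVectorPointerRecipe::execute, shown above, folds the new stride operand into the per-part address: each unroll part starts Part * VF * Stride elements past the base, and the multiply is only emitted when the stride is not the constant 1. A standalone sketch of that arithmetic, with concrete numbers standing in for the runtime VF:

#include <cassert>
#include <cstdint>

// Element offset at which unroll part `Part` begins, for a per-part vector
// length VF and an access that advances Stride elements per scalar iteration.
int64_t partStartElement(int64_t Part, int64_t VF, int64_t Stride) {
  int64_t Increment = Part * VF;          // createStepForVF analogue
  return Stride == 1 ? Increment          // unit stride: no multiply needed
                     : Increment * Stride;
}

int main() {
  // Taking vscale x 4 elements with vscale = 2 (so VF = 8) and an element
  // stride of 8, part 1 begins 64 i32 elements after the base, which is the
  // shape of the extra GEP in the CHECK-UF2 lines of patch 3's tests.
  assert(partStartElement(0, 8, 8) == 0);
  assert(partStartElement(1, 8, 8) == 64);
  return 0;
}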
- if (!CM.foldTailWithEVL()) { - VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind); + if (!CM.foldTailWithEVL()) VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); - } - - for (ElementCount VF : Range) - Plan->addVF(VF); - Plan->setName("Initial VPlan"); // Interleave memory: for each Interleave Group we marked earlier as relevant // for this VPlan, replace the Recipes widening its memory instructions with a @@ -8675,6 +8670,15 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( VPlanTransforms::runPass(VPlanTransforms::replaceSymbolicStrides, *Plan, PSE, Legal->getLAI()->getSymbolicStrides()); + // Convert memory recipes to strided access recipes if the strided access is + // legal and profitable. + VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan, + CostCtx, Range); + + for (ElementCount VF : Range) + Plan->addVF(VF); + Plan->setName("Initial VPlan"); + auto BlockNeedsPredication = [this](BasicBlock *BB) { return Legal->blockNeedsPredication(BB); }; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 87b5d84dceb9d..b000afa5308f8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1765,10 +1765,6 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags, /// A recipe for handling GEP instructions. class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags { - bool isPointerLoopInvariant() const { - return getOperand(0)->isDefinedOutsideLoopRegions(); - } - bool isIndexLoopInvariant(unsigned I) const { return getOperand(I + 1)->isDefinedOutsideLoopRegions(); } @@ -1797,6 +1793,30 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags { VP_CLASSOF_IMPL(VPDef::VPWidenGEPSC) + bool isPointerLoopInvariant() const { + return getOperand(0)->isDefinedOutsideLoopRegions(); + } + + std::optional getUniqueVariantIndex() const { + std::optional VarIdx; + for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) { + if (isIndexLoopInvariant(I)) + continue; + + if (VarIdx) + return std::nullopt; + VarIdx = I; + } + return VarIdx; + } + + Type *getIndexedType(unsigned I) const { + auto *GEP = cast(getUnderlyingInstr()); + Type *SourceElementType = GEP->getSourceElementType(); + SmallVector Ops(GEP->idx_begin(), GEP->idx_begin() + I); + return GetElementPtrInst::getIndexedType(SourceElementType, Ops); + } + /// Generate the gep nodes. void execute(VPTransformState &State) override; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 7730012c53576..960c8e8674607 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -4115,3 +4115,184 @@ void VPlanTransforms::addBranchWeightToMiddleTerminator( MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false); MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights); } + +static std::pair matchStridedStart(VPValue *CurIndex) { + // TODO: Support VPWidenPointerInductionRecipe. + if (auto *WidenIV = dyn_cast(CurIndex)) + return {WidenIV, WidenIV->getStepValue()}; + + auto *WidenR = dyn_cast(CurIndex); + if (!WidenR || !CurIndex->getUnderlyingValue()) + return {nullptr, nullptr}; + + unsigned Opcode = WidenR->getOpcode(); + // TODO: Support Instruction::Add and Instruction::Or. 
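matchStridedStart, continued below, decomposes the gather's index expression: a widened induction contributes its step, and each multiply or shift by a loop-invariant amount scales both the start and the stride of the expression it wraps, so the index stays in the affine form start + stride * iv. The transform builds recipes for those two pieces; the constant bookkeeping it encodes can be modelled with a small standalone sketch (plain integers standing in for VPValues, names chosen here):

#include <cassert>
#include <cstdint>

// Affine view of an index expression: value(iv) = Start + Stride * iv.
struct Affine {
  int64_t Start;
  int64_t Stride;
};

Affine widenIV(int64_t Start, int64_t Step) { return {Start, Step}; }
Affine mulByInvariant(Affine A, int64_t C) {
  return {A.Start * C, A.Stride * C};
}
Affine shlByInvariant(Affine A, int64_t S) {
  return {A.Start << S, A.Stride << S};
}

int main() {
  // "iv * 8" (single_constant_stride_int_scaled): 8 elements per iteration.
  Affine Scaled = mulByInvariant(widenIV(0, 1), 8);
  assert(Scaled.Stride == 8);

  // "(step-16 iv) << 1" (the masked gather over double): 32 elements per
  // iteration, i.e. the 256-byte stride seen in the updated checks.
  Affine Shifted = shlByInvariant(widenIV(0, 16), 1);
  assert(Shifted.Stride == 32);
  return 0;
}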
+ if (Opcode != Instruction::Shl && Opcode != Instruction::Mul) + return {nullptr, nullptr}; + + // Match the pattern binop(variant, invariant), or binop(invariant, variant) + // if the binary operator is commutative. + bool IsLHSUniform = vputils::isSingleScalar(WidenR->getOperand(0)); + if (IsLHSUniform == vputils::isSingleScalar(WidenR->getOperand(1)) || + (IsLHSUniform && !Instruction::isCommutative(Opcode))) + return {nullptr, nullptr}; + unsigned VarIdx = IsLHSUniform ? 1 : 0; + + auto [Start, Stride] = matchStridedStart(WidenR->getOperand(VarIdx)); + if (!Start) + return {nullptr, nullptr}; + + SmallVector StartOps(WidenR->operands()); + StartOps[VarIdx] = Start; + auto *StartR = new VPReplicateRecipe(WidenR->getUnderlyingInstr(), StartOps, + /*IsUniform*/ true); + StartR->insertBefore(WidenR); + + unsigned InvIdx = VarIdx == 0 ? 1 : 0; + auto *StrideR = + new VPInstruction(Opcode, {Stride, WidenR->getOperand(InvIdx)}); + StrideR->insertBefore(WidenR); + return {StartR, StrideR}; +} + +static std::tuple +determineBaseAndStride(VPWidenGEPRecipe *WidenGEP) { + // TODO: Check if the base pointer is strided. + if (!WidenGEP->isPointerLoopInvariant()) + return {nullptr, nullptr, nullptr}; + + // Find the only one variant index. + std::optional VarIndex = WidenGEP->getUniqueVariantIndex(); + if (!VarIndex) + return {nullptr, nullptr, nullptr}; + + Type *ElementTy = WidenGEP->getIndexedType(*VarIndex); + if (ElementTy->isScalableTy() || ElementTy->isStructTy() || + ElementTy->isVectorTy()) + return {nullptr, nullptr, nullptr}; + + unsigned VarOp = *VarIndex + 1; + VPValue *IndexVPV = WidenGEP->getOperand(VarOp); + auto [Start, Stride] = matchStridedStart(IndexVPV); + if (!Start) + return {nullptr, nullptr, nullptr}; + + SmallVector Ops(WidenGEP->operands()); + Ops[VarOp] = Start; + auto *BasePtr = new VPReplicateRecipe(WidenGEP->getUnderlyingInstr(), Ops, + /*IsUniform*/ true); + BasePtr->insertBefore(WidenGEP); + + return {BasePtr, Stride, ElementTy}; +} + +void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx, + VFRange &Range) { + if (Plan.hasScalarVFOnly()) + return; + + VPTypeAnalysis TypeInfo(Plan); + DenseMap> + StrideCache; + SmallVector ToErase; + SmallPtrSet PossiblyDead; + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( + vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + auto *MemR = dyn_cast(&R); + // TODO: Support strided store. + // TODO: Transform reverse access into strided access with -1 stride. + // TODO: Transform gather/scatter with uniform address into strided access + // with 0 stride. + // TODO: Transform interleave access into multiple strided accesses. + if (!MemR || !isa(MemR) || MemR->isConsecutive()) + continue; + + auto *Ptr = dyn_cast(MemR->getAddr()); + if (!Ptr) + continue; + + // Memory cost model requires the pointer operand of memory access + // instruction. + Value *PtrUV = Ptr->getUnderlyingValue(); + if (!PtrUV) + continue; + + // Try to get base and stride here. + VPValue *BasePtr, *StrideInElement; + Type *ElementTy; + auto It = StrideCache.find(Ptr); + if (It != StrideCache.end()) + std::tie(BasePtr, StrideInElement, ElementTy) = It->second; + else + std::tie(BasePtr, StrideInElement, ElementTy) = StrideCache[Ptr] = + determineBaseAndStride(Ptr); + + // Skip if the memory access is not a strided access. 
+ if (!BasePtr) { + assert(!StrideInElement && !ElementTy); + continue; + } + assert(StrideInElement && ElementTy); + + Instruction &Ingredient = MemR->getIngredient(); + auto IsProfitable = [&](ElementCount VF) -> bool { + Type *DataTy = toVectorTy(getLoadStoreType(&Ingredient), VF); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment)) + return false; + const InstructionCost CurrentCost = MemR->computeCost(VF, Ctx); + const InstructionCost StridedLoadStoreCost = + Ctx.TTI.getStridedMemoryOpCost(Instruction::Load, DataTy, PtrUV, + MemR->isMasked(), Alignment, + Ctx.CostKind, &Ingredient); + return StridedLoadStoreCost < CurrentCost; + }; + + if (!LoopVectorizationPlanner::getDecisionAndClampRange(IsProfitable, + Range)) { + PossiblyDead.insert(BasePtr); + PossiblyDead.insert(StrideInElement); + continue; + } + PossiblyDead.insert(Ptr); + + // Create a new vector pointer for strided access. + auto *GEP = dyn_cast(PtrUV->stripPointerCasts()); + auto *NewPtr = new VPVectorPointerRecipe( + BasePtr, ElementTy, StrideInElement, + GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), + Ptr->getDebugLoc()); + NewPtr->insertBefore(MemR); + + const DataLayout &DL = Ingredient.getDataLayout(); + TypeSize TS = DL.getTypeAllocSize(ElementTy); + unsigned TypeScale = TS.getFixedValue(); + VPValue *StrideInBytes = StrideInElement; + // Scale the stride by the size of the indexed type. + if (TypeScale != 1) { + VPValue *ScaleVPV = Plan.getOrAddLiveIn(ConstantInt::get( + TypeInfo.inferScalarType(StrideInElement), TypeScale)); + auto *ScaledStride = + new VPInstruction(Instruction::Mul, {StrideInElement, ScaleVPV}); + ScaledStride->insertBefore(MemR); + StrideInBytes = ScaledStride; + } + + auto *LoadR = cast(MemR); + auto *StridedLoad = new VPWidenStridedLoadRecipe( + *cast(&Ingredient), NewPtr, StrideInBytes, &Plan.getVF(), + LoadR->getMask(), *LoadR, LoadR->getDebugLoc()); + StridedLoad->insertBefore(LoadR); + LoadR->replaceAllUsesWith(StridedLoad); + + ToErase.push_back(LoadR); + } + } + + // Clean up dead memory access recipes, and unused base address and stride. + for (auto *R : ToErase) + R->eraseFromParent(); + for (auto *V : PossiblyDead) + recursivelyDeleteDeadRecipes(V); +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 1957428fab799..c8e510a956e1e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -239,6 +239,12 @@ struct VPlanTransforms { &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed); + /// Transform widen memory recipes into strided access recipes when legal + /// and profitable. Clamps \p Range to maintain consistency with widen + /// decisions of \p Plan, and uses \p Ctx to evaluate the cost. + static void convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx, + VFRange &Range); + /// Remove dead recipes from \p Plan. 
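The profitability gate and the stride scaling above are the two numeric decisions of the transform: a candidate is rewritten only when the target can legalize a strided access of that type and TTI prices it strictly below the existing gather, and the element-count stride is then converted to the byte stride the intrinsic wants. A small sketch of both, with plain integers standing in for InstructionCost and the data-layout query:

#include <cassert>
#include <cstdint>

// Per-VF decision: legal and strictly cheaper than the current recipe's cost.
bool preferStrided(bool LegalStridedLoadStore, int64_t StridedCost,
                   int64_t GatherCost) {
  return LegalStridedLoadStore && StridedCost < GatherCost;
}

// The recipe takes its stride in bytes, so an element-count stride is scaled
// by the allocation size of the indexed type; the multiply is skipped when
// that size is 1.
int64_t strideInBytes(int64_t StrideInElements, int64_t ElemAllocSize) {
  return ElemAllocSize == 1 ? StrideInElements
                            : StrideInElements * ElemAllocSize;
}

int main() {
  assert(preferStrided(true, 4, 10));
  assert(!preferStrided(true, 10, 10)); // a tie keeps the gather
  assert(strideInBytes(3, 2) == 6);     // i16 stepped by 3, as in the tests
  assert(strideInBytes(7, 1) == 7);     // i8 stepped by 7 (pr154103.ll)
  return 0;
}

In the tree the comparison goes through getDecisionAndClampRange, so one answer is applied to the whole clamped VF range rather than deciding per VF.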
static void removeDeadRecipes(VPlan &Plan); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll index 5a99f15b9f585..7450ffd0ee30b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll @@ -425,6 +425,8 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 % ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i1 [[IC]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = xor [[BROADCAST_SPLAT]], splat (i1 true) @@ -433,15 +435,23 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 % ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP13]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP27]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP27]] to i64 ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 3, [[TMP12]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP16]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = call @llvm.stepvector.nxv8i32() +; CHECK-NEXT: [[TMP19:%.*]] = icmp ult [[TMP18]], [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 3 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i16, ptr [[SRC]], [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv8i16.nxv8p0( align 2 [[TMP20]], splat (i1 true), i32 [[TMP27]]) +; CHECK-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP4]] to i32 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv8i16.p0.i64(ptr align 2 [[TMP21]], i64 6, [[TMP19]], i32 [[TMP15]]) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq [[WIDE_MASKED_GATHER]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = select [[TMP17]], [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP28:%.*]] = xor [[TMP17]], splat (i1 true) @@ -450,6 +460,7 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 % ; CHECK-NEXT: [[TMP24:%.*]] = or [[TMP22]], [[TMP23]] ; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0( zeroinitializer, align 2 [[TMP20]], [[TMP24]], i32 [[TMP27]]) ; CHECK-NEXT: [[TMP25:%.*]] = zext i32 [[TMP27]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP25]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 
[[AVL]], [[TMP25]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll index d0dac0e380cdc..e34fc3557fda3 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll @@ -30,29 +30,41 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] ; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; RV32: vector.ph: +; RV32-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 ; RV32-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv2i64() ; RV32-NEXT: [[TMP9:%.*]] = mul [[TMP7]], splat (i64 16) ; RV32-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP9]] ; RV32-NEXT: br label [[VECTOR_BODY:%.*]] ; RV32: vector.body: +; RV32-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; RV32-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; RV32-NEXT: [[AVL:%.*]] = phi i64 [ 625, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; RV32-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; RV32-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement poison, i32 [[TMP10]], i64 0 +; RV32-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector [[BROADCAST_SPLATINSERT6]], poison, zeroinitializer ; RV32-NEXT: [[TMP8:%.*]] = zext i32 [[TMP10]] to i64 ; RV32-NEXT: [[TMP11:%.*]] = mul i64 16, [[TMP8]] ; RV32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 ; RV32-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], [[VEC_IND]] -; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv2i32.nxv2p0( align 4 [[TMP13]], splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]] +; RV32-NEXT: [[TMP16:%.*]] = call @llvm.stepvector.nxv2i32() +; RV32-NEXT: [[TMP25:%.*]] = icmp ult [[TMP16]], [[BROADCAST_SPLAT7]] +; RV32-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 16 +; RV32-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[OFFSET_IDX]] +; RV32-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP1]] to i32 +; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv2i32.p0.i64(ptr align 4 [[TMP26]], i64 64, [[TMP25]], i32 [[TMP27]]), !alias.scope [[META0:![0-9]+]] ; RV32-NEXT: [[TMP14:%.*]] = icmp slt [[WIDE_MASKED_GATHER]], splat (i32 100) -; RV32-NEXT: [[TMP15:%.*]] = shl nuw nsw [[VEC_IND]], splat (i64 1) -; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[B]], [[TMP15]] -; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.vp.gather.nxv2f64.nxv2p0( align 8 [[TMP16]], [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3:![0-9]+]] +; RV32-NEXT: [[TMP12:%.*]] = select [[TMP25]], [[TMP14]], zeroinitializer +; RV32-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[OFFSET_IDX]], 1 +; RV32-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP13]] +; RV32-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP1]] to i32 +; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call 
@llvm.experimental.vp.strided.load.nxv2f64.p0.i64(ptr align 8 [[TMP28]], i64 256, [[TMP12]], i32 [[TMP15]]), !alias.scope [[META3:![0-9]+]] ; RV32-NEXT: [[TMP17:%.*]] = sitofp [[WIDE_MASKED_GATHER]] to ; RV32-NEXT: [[TMP18:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP17]] ; RV32-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] ; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0( [[TMP18]], align 8 [[TMP19]], [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] ; RV32-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 +; RV32-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]] ; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]] ; RV32-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; RV32-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 @@ -100,29 +112,41 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV64-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] ; RV64-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; RV64: vector.ph: +; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 ; RV64-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv2i64() ; RV64-NEXT: [[TMP9:%.*]] = mul [[TMP7]], splat (i64 16) ; RV64-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP9]] ; RV64-NEXT: br label [[VECTOR_BODY:%.*]] ; RV64: vector.body: +; RV64-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; RV64-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; RV64-NEXT: [[AVL:%.*]] = phi i64 [ 625, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; RV64-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; RV64-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement poison, i32 [[TMP10]], i64 0 +; RV64-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector [[BROADCAST_SPLATINSERT6]], poison, zeroinitializer ; RV64-NEXT: [[TMP8:%.*]] = zext i32 [[TMP10]] to i64 ; RV64-NEXT: [[TMP11:%.*]] = mul i64 16, [[TMP8]] ; RV64-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 ; RV64-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], [[VEC_IND]] -; RV64-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv2i32.nxv2p0( align 4 [[TMP13]], splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]] +; RV64-NEXT: [[TMP16:%.*]] = call @llvm.stepvector.nxv2i32() +; RV64-NEXT: [[TMP25:%.*]] = icmp ult [[TMP16]], [[BROADCAST_SPLAT7]] +; RV64-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 16 +; RV64-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[OFFSET_IDX]] +; RV64-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP1]] to i32 +; RV64-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv2i32.p0.i64(ptr align 4 [[TMP26]], i64 64, [[TMP25]], i32 [[TMP27]]), !alias.scope [[META0:![0-9]+]] ; RV64-NEXT: [[TMP14:%.*]] = icmp slt [[WIDE_MASKED_GATHER]], splat (i32 100) -; RV64-NEXT: [[TMP15:%.*]] = shl nuw nsw [[VEC_IND]], splat (i64 1) -; RV64-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[B]], [[TMP15]] -; RV64-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.vp.gather.nxv2f64.nxv2p0( align 8 [[TMP16]], [[TMP14]], i32 [[TMP10]]), !alias.scope 
[[META3:![0-9]+]] +; RV64-NEXT: [[TMP12:%.*]] = select [[TMP25]], [[TMP14]], zeroinitializer +; RV64-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[OFFSET_IDX]], 1 +; RV64-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP13]] +; RV64-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP1]] to i32 +; RV64-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.experimental.vp.strided.load.nxv2f64.p0.i64(ptr align 8 [[TMP28]], i64 256, [[TMP12]], i32 [[TMP15]]), !alias.scope [[META3:![0-9]+]] ; RV64-NEXT: [[TMP17:%.*]] = sitofp [[WIDE_MASKED_GATHER]] to ; RV64-NEXT: [[TMP18:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP17]] ; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] ; RV64-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0( [[TMP18]], align 8 [[TMP19]], [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] ; RV64-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 +; RV64-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]] ; RV64-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]] ; RV64-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; RV64-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll index c35a3d7b9269f..067b9908ab485 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll @@ -6,12 +6,53 @@ define void @pr154103(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalias %d) { ; CHECK-LABEL: define void @pr154103( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], ptr noalias [[D:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, ptr [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ -7905747460161236406, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP2]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.stepvector.nxv4i32() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult [[TMP3]], [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[EVL_BASED_IV]], 7 +; CHECK-NEXT: [[IV:%.*]] = add i64 1, [[TMP5]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i8.p0.i64(ptr align 1 [[GEP]], i64 7, [[TMP4]], i32 [[TMP7]]) +; CHECK-NEXT: [[TMP8:%.*]] = zext 
[[WIDE_STRIDED_LOAD]] to +; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.vp.merge.nxv4i64( splat (i1 true), [[TMP8]], splat (i64 1), i32 [[TMP2]]) +; CHECK-NEXT: [[TMP10:%.*]] = sdiv zeroinitializer, [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp sgt [[TMP10]], zeroinitializer +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv4i8.nxv4p0( align 1 [[BROADCAST_SPLAT]], [[TMP11]], i32 [[TMP2]]) +; CHECK-NEXT: [[TMP12:%.*]] = zext [[WIDE_MASKED_GATHER]] to +; CHECK-NEXT: [[TMP13:%.*]] = xor [[TMP12]], zeroinitializer +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP11]], [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = trunc [[PREDPHI]] to +; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i16.nxv4p0( [[TMP14]], align 2 [[BROADCAST_SPLAT2]], splat (i1 true), i32 [[TMP2]]) +; CHECK-NEXT: store i32 0, ptr [[D]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP2]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[X:%.*]] = load i8, ptr [[GEP]], align 1 +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 1, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ] +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV1]] +; CHECK-NEXT: [[X:%.*]] = load i8, ptr [[GEP1]], align 1 ; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[X]] to i64 ; CHECK-NEXT: [[DIV:%.*]] = sdiv i64 0, [[CONV]] ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[DIV]], 0 @@ -26,9 +67,9 @@ define void @pr154103(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalia ; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[COND]] to i16 ; CHECK-NEXT: store i16 [[TRUNC]], ptr [[C]], align 2 ; CHECK-NEXT: store i32 0, ptr [[D]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 7 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV]], 0 -; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 7 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV1]], 0 +; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll index 7a3d81b240394..1f797886635fb 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll @@ -10,23 +10,34 @@ define void @single_constant_stride_int_scaled(ptr %p) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH1:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[TMP10:%.*]] = mul [[TMP8]], splat (i64 1) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP10]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], 
[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP11]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i32() +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw i64 [[EVL_BASED_IV]], 8 ; CHECK-NEXT: [[TMP14:%.*]] = mul nuw nsw [[VEC_IND]], splat (i64 8) -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P:%.*]], [[TMP14]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP15]], splat (i1 true), i32 [[TMP11]]) +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P]], [[TMP14]] +; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP18]], i64 32, [[TMP7]], i32 [[TMP19]]) ; CHECK-NEXT: [[TMP16:%.*]] = add [[WIDE_MASKED_GATHER]], splat (i32 1) ; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[TMP16]], align 4 [[TMP15]], splat (i1 true), i32 [[TMP11]]) ; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP13]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 @@ -72,12 +83,20 @@ define void @single_constant_stride_int_scaled(ptr %p) { ; CHECK-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-UF2-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-UF2-NEXT: [[STEP_ADD:%.*]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-UF2-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[INDEX]], 8 ; CHECK-UF2-NEXT: [[TMP9:%.*]] = mul nuw nsw [[VEC_IND]], splat (i64 8) ; CHECK-UF2-NEXT: [[TMP10:%.*]] = mul nuw nsw [[STEP_ADD]], splat (i64 8) -; CHECK-UF2-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[P:%.*]], [[TMP9]] +; CHECK-UF2-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP21]] +; CHECK-UF2-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[P]], [[TMP9]] ; CHECK-UF2-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[P]], [[TMP10]] -; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP11]], i32 4, splat (i1 true), poison) -; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP12]], i32 4, splat (i1 true), poison) +; CHECK-UF2-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF2-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP23]], 2 +; CHECK-UF2-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 8 +; CHECK-UF2-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP17]] +; CHECK-UF2-NEXT: 
[[TMP19:%.*]] = trunc i64 [[TMP3]] to i32 +; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP22]], i64 32, splat (i1 true), i32 [[TMP19]]) +; CHECK-UF2-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP3]] to i32 +; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP18]], i64 32, splat (i1 true), i32 [[TMP20]]) ; CHECK-UF2-NEXT: [[TMP13:%.*]] = add [[WIDE_MASKED_GATHER]], splat (i32 1) ; CHECK-UF2-NEXT: [[TMP14:%.*]] = add [[WIDE_MASKED_GATHER1]], splat (i32 1) ; CHECK-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP13]], [[TMP11]], i32 4, splat (i1 true)) @@ -127,23 +146,34 @@ define void @single_constant_stride_int_iv(ptr %p) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[TMP8:%.*]] = mul [[TMP6]], splat (i64 64) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP7]] to i64 ; CHECK-NEXT: [[TMP9:%.*]] = mul i64 64, [[TMP11]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[P:%.*]], [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP12]], splat (i1 true), i32 [[TMP7]]) +; CHECK-NEXT: [[TMP15:%.*]] = call @llvm.stepvector.nxv4i32() +; CHECK-NEXT: [[TMP16:%.*]] = icmp ult [[TMP15]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 64 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[P]], [[VEC_IND]] +; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP17]], i64 256, [[TMP16]], i32 [[TMP18]]) ; CHECK-NEXT: [[TMP13:%.*]] = add [[WIDE_MASKED_GATHER]], splat (i32 1) ; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[TMP13]], align 4 [[TMP12]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP14]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 @@ -190,10 +220,18 @@ define void @single_constant_stride_int_iv(ptr %p) { ; CHECK-UF2-NEXT: 
[[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-UF2-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-UF2-NEXT: [[STEP_ADD:%.*]] = add [[VEC_IND]], [[TMP6]] -; CHECK-UF2-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[P:%.*]], [[VEC_IND]] +; CHECK-UF2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 64 +; CHECK-UF2-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET_IDX]] +; CHECK-UF2-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[P]], [[VEC_IND]] ; CHECK-UF2-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[P]], [[STEP_ADD]] -; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP9]], i32 4, splat (i1 true), poison) -; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP10]], i32 4, splat (i1 true), poison) +; CHECK-UF2-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF2-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP19]], 2 +; CHECK-UF2-NEXT: [[TMP14:%.*]] = mul i64 [[TMP20]], 64 +; CHECK-UF2-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP18]], i64 [[TMP14]] +; CHECK-UF2-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP3]] to i32 +; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP18]], i64 256, splat (i1 true), i32 [[TMP16]]) +; CHECK-UF2-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP3]] to i32 +; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP15]], i64 256, splat (i1 true), i32 [[TMP17]]) ; CHECK-UF2-NEXT: [[TMP11:%.*]] = add [[WIDE_MASKED_GATHER]], splat (i32 1) ; CHECK-UF2-NEXT: [[TMP12:%.*]] = add [[WIDE_MASKED_GATHER1]], splat (i32 1) ; CHECK-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP11]], [[TMP9]], i32 4, splat (i1 true)) @@ -823,6 +861,9 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; STRIDED-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; STRIDED: vector.ph: +; STRIDED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() +; STRIDED-NEXT: [[TMP45:%.*]] = mul nuw i64 [[TMP42]], 4 +; STRIDED-NEXT: [[TMP47:%.*]] = mul i64 [[STRIDE]], 4 ; STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[STRIDE]], i64 0 ; STRIDED-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; STRIDED-NEXT: [[TMP12:%.*]] = call @llvm.stepvector.nxv4i64() @@ -830,19 +871,27 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP14]] ; STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; STRIDED: vector.body: +; STRIDED-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; STRIDED-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; STRIDED-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; STRIDED-NEXT: [[TMP43:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; STRIDED-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement poison, i32 [[TMP43]], i64 0 +; STRIDED-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector [[BROADCAST_SPLATINSERT11]], poison, zeroinitializer ; STRIDED-NEXT: [[TMP44:%.*]] = zext i32 [[TMP43]] to i64 ; STRIDED-NEXT: 
[[BROADCAST_SPLATINSERT9:%.*]] = insertelement poison, i64 [[TMP44]], i64 0 ; STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT9]], poison, zeroinitializer +; STRIDED-NEXT: [[TMP48:%.*]] = call @llvm.stepvector.nxv4i32() +; STRIDED-NEXT: [[TMP49:%.*]] = icmp ult [[TMP48]], [[BROADCAST_SPLAT12]] +; STRIDED-NEXT: [[TMP50:%.*]] = mul nuw nsw i64 [[EVL_BASED_IV]], [[STRIDE]] ; STRIDED-NEXT: [[TMP18:%.*]] = mul nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT1]] -; STRIDED-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[P]], [[TMP18]] -; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP19]], splat (i1 true), i32 [[TMP43]]), !alias.scope [[META9:![0-9]+]] +; STRIDED-NEXT: [[TMP51:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP50]] +; STRIDED-NEXT: [[TMP52:%.*]] = trunc i64 [[TMP45]] to i32 +; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP51]], i64 [[TMP47]], [[TMP49]], i32 [[TMP52]]), !alias.scope [[META9:![0-9]+]] ; STRIDED-NEXT: [[TMP20:%.*]] = add [[WIDE_MASKED_GATHER]], splat (i32 1) ; STRIDED-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[P2]], [[TMP18]] ; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[TMP20]], align 4 [[TMP21]], splat (i1 true), i32 [[TMP43]]), !alias.scope [[META12:![0-9]+]], !noalias [[META9]] ; STRIDED-NEXT: [[TMP46:%.*]] = zext i32 [[TMP43]] to i64 +; STRIDED-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP46]], [[EVL_BASED_IV]] ; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP46]] ; STRIDED-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; STRIDED-NEXT: [[TMP41:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 @@ -928,6 +977,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-UF2-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 2 ; STRIDED-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP30]] ; STRIDED-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; STRIDED-UF2-NEXT: [[TMP35:%.*]] = mul i64 [[STRIDE]], 4 ; STRIDED-UF2-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement poison, i64 [[STRIDE]], i64 0 ; STRIDED-UF2-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector [[BROADCAST_SPLATINSERT10]], poison, zeroinitializer ; STRIDED-UF2-NEXT: [[TMP31:%.*]] = call @llvm.stepvector.nxv4i64() @@ -938,12 +988,18 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; STRIDED-UF2-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; STRIDED-UF2-NEXT: [[STEP_ADD:%.*]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] +; STRIDED-UF2-NEXT: [[TMP36:%.*]] = mul nuw nsw i64 [[INDEX]], [[STRIDE]] ; STRIDED-UF2-NEXT: [[TMP33:%.*]] = mul nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT11]] ; STRIDED-UF2-NEXT: [[TMP34:%.*]] = mul nuw nsw [[STEP_ADD]], [[BROADCAST_SPLAT11]] -; STRIDED-UF2-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[P]], [[TMP33]] -; STRIDED-UF2-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[P]], [[TMP34]] -; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP35]], i32 4, splat (i1 true), poison), !alias.scope [[META8:![0-9]+]] -; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP36]], i32 4, splat (i1 true), poison), !alias.scope [[META8]] +; STRIDED-UF2-NEXT: [[TMP44:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP36]] +; STRIDED-UF2-NEXT: [[TMP45:%.*]] = call 
i64 @llvm.vscale.i64() +; STRIDED-UF2-NEXT: [[TMP46:%.*]] = shl nuw i64 [[TMP45]], 2 +; STRIDED-UF2-NEXT: [[TMP47:%.*]] = mul i64 [[TMP46]], [[STRIDE]] +; STRIDED-UF2-NEXT: [[TMP48:%.*]] = getelementptr i32, ptr [[TMP44]], i64 [[TMP47]] +; STRIDED-UF2-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP29]] to i32 +; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP44]], i64 [[TMP35]], splat (i1 true), i32 [[TMP42]]), !alias.scope [[META8:![0-9]+]] +; STRIDED-UF2-NEXT: [[TMP43:%.*]] = trunc i64 [[TMP29]] to i32 +; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP48]], i64 [[TMP35]], splat (i1 true), i32 [[TMP43]]), !alias.scope [[META8]] ; STRIDED-UF2-NEXT: [[TMP37:%.*]] = add [[WIDE_MASKED_GATHER]], splat (i32 1) ; STRIDED-UF2-NEXT: [[TMP38:%.*]] = add [[WIDE_MASKED_GATHER12]], splat (i32 1) ; STRIDED-UF2-NEXT: [[TMP39:%.*]] = getelementptr i32, ptr [[P2]], [[TMP33]] @@ -1362,26 +1418,25 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) { ; NOSTRIDED-NEXT: entry: ; NOSTRIDED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; NOSTRIDED: vector.ph: -; NOSTRIDED-NEXT: [[TMP0:%.*]] = call @llvm.stepvector.nxv2i64() -; NOSTRIDED-NEXT: [[TMP1:%.*]] = mul [[TMP0]], splat (i64 1) -; NOSTRIDED-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP1]] +; NOSTRIDED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NOSTRIDED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 ; NOSTRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; NOSTRIDED: vector.body: ; NOSTRIDED-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; NOSTRIDED-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; NOSTRIDED-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; NOSTRIDED-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) -; NOSTRIDED-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 -; NOSTRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 -; NOSTRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; NOSTRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[IN:%.*]], [[VEC_IND]] -; NOSTRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv2i64.nxv2p0( align 8 [[TMP4]], splat (i1 true), i32 [[TMP2]]) +; NOSTRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP2]], i64 0 +; NOSTRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; NOSTRIDED-NEXT: [[TMP3:%.*]] = call @llvm.stepvector.nxv2i32() +; NOSTRIDED-NEXT: [[TMP4:%.*]] = icmp ult [[TMP3]], [[BROADCAST_SPLAT]] +; NOSTRIDED-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[IN:%.*]], i64 [[EVL_BASED_IV]] +; NOSTRIDED-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP1]] to i32 +; NOSTRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP9]], i64 4, [[TMP4]], i32 [[TMP10]]) ; NOSTRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT:%.*]], i64 [[EVL_BASED_IV]] ; NOSTRIDED-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[WIDE_MASKED_GATHER]], ptr align 8 [[TMP5]], splat (i1 true), i32 [[TMP2]]) ; NOSTRIDED-NEXT: [[TMP6:%.*]] = zext i32 [[TMP2]] to i64 ; NOSTRIDED-NEXT: 
[[INDEX_EVL_NEXT]] = add nuw i64 [[TMP6]], [[EVL_BASED_IV]] ; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP6]] -; NOSTRIDED-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; NOSTRIDED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; NOSTRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; NOSTRIDED: middle.block: @@ -1409,23 +1464,20 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) { ; NOSTRIDED-UF2: vector.ph: ; NOSTRIDED-UF2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; NOSTRIDED-UF2-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; NOSTRIDED-UF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 -; NOSTRIDED-UF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; NOSTRIDED-UF2-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 ; NOSTRIDED-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] ; NOSTRIDED-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; NOSTRIDED-UF2-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv2i64() -; NOSTRIDED-UF2-NEXT: [[TMP6:%.*]] = mul [[TMP5]], splat (i64 1) -; NOSTRIDED-UF2-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] ; NOSTRIDED-UF2-NEXT: br label [[VECTOR_BODY:%.*]] ; NOSTRIDED-UF2: vector.body: ; NOSTRIDED-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; NOSTRIDED-UF2-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; NOSTRIDED-UF2-NEXT: [[STEP_ADD:%.*]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] -; NOSTRIDED-UF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i32, ptr [[IN:%.*]], [[VEC_IND]] -; NOSTRIDED-UF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[IN]], [[STEP_ADD]] -; NOSTRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP7]], i32 8, splat (i1 true), poison) -; NOSTRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP8]], i32 8, splat (i1 true), poison) +; NOSTRIDED-UF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[IN:%.*]], i64 [[INDEX]] +; NOSTRIDED-UF2-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; NOSTRIDED-UF2-NEXT: [[TMP7:%.*]] = shl nuw i64 [[TMP6]], 1 +; NOSTRIDED-UF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i64 [[TMP7]] +; NOSTRIDED-UF2-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP3]] to i32 +; NOSTRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP5]], i64 4, splat (i1 true), i32 [[TMP15]]) +; NOSTRIDED-UF2-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP3]] to i32 +; NOSTRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP8]], i64 4, splat (i1 true), i32 [[TMP16]]) ; NOSTRIDED-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT:%.*]], i64 [[INDEX]] ; NOSTRIDED-UF2-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() ; NOSTRIDED-UF2-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 1 @@ -1433,7 +1485,6 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) { ; NOSTRIDED-UF2-NEXT: store [[WIDE_MASKED_GATHER]], ptr [[TMP9]], align 8 ; NOSTRIDED-UF2-NEXT: store [[WIDE_MASKED_GATHER1]], ptr [[TMP12]], align 8 ; NOSTRIDED-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] -; NOSTRIDED-UF2-NEXT: [[VEC_IND_NEXT]] = add [[STEP_ADD]], [[BROADCAST_SPLAT]] ; 
NOSTRIDED-UF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NOSTRIDED-UF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; NOSTRIDED-UF2: middle.block: @@ -1458,26 +1509,25 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) { ; STRIDED-NEXT: entry: ; STRIDED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; STRIDED: vector.ph: -; STRIDED-NEXT: [[TMP0:%.*]] = call @llvm.stepvector.nxv2i64() -; STRIDED-NEXT: [[TMP1:%.*]] = mul [[TMP0]], splat (i64 1) -; STRIDED-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP1]] +; STRIDED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; STRIDED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 ; STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; STRIDED: vector.body: ; STRIDED-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; STRIDED-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; STRIDED-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; STRIDED-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) -; STRIDED-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 -; STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 -; STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[IN:%.*]], [[VEC_IND]] -; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv2i64.nxv2p0( align 8 [[TMP4]], splat (i1 true), i32 [[TMP2]]) +; STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP2]], i64 0 +; STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; STRIDED-NEXT: [[TMP3:%.*]] = call @llvm.stepvector.nxv2i32() +; STRIDED-NEXT: [[TMP4:%.*]] = icmp ult [[TMP3]], [[BROADCAST_SPLAT]] +; STRIDED-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[IN:%.*]], i64 [[EVL_BASED_IV]] +; STRIDED-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP1]] to i32 +; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP9]], i64 4, [[TMP4]], i32 [[TMP10]]) ; STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT:%.*]], i64 [[EVL_BASED_IV]] ; STRIDED-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[WIDE_MASKED_GATHER]], ptr align 8 [[TMP5]], splat (i1 true), i32 [[TMP2]]) ; STRIDED-NEXT: [[TMP6:%.*]] = zext i32 [[TMP2]] to i64 ; STRIDED-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP6]], [[EVL_BASED_IV]] ; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP6]] -; STRIDED-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; STRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; STRIDED: middle.block: @@ -1505,23 +1555,20 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) { ; STRIDED-UF2: vector.ph: ; STRIDED-UF2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; STRIDED-UF2-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; STRIDED-UF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 -; STRIDED-UF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; 
STRIDED-UF2-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 ; STRIDED-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] ; STRIDED-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; STRIDED-UF2-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv2i64() -; STRIDED-UF2-NEXT: [[TMP6:%.*]] = mul [[TMP5]], splat (i64 1) -; STRIDED-UF2-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] ; STRIDED-UF2-NEXT: br label [[VECTOR_BODY:%.*]] ; STRIDED-UF2: vector.body: ; STRIDED-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; STRIDED-UF2-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; STRIDED-UF2-NEXT: [[STEP_ADD:%.*]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] -; STRIDED-UF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i32, ptr [[IN:%.*]], [[VEC_IND]] -; STRIDED-UF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[IN]], [[STEP_ADD]] -; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP7]], i32 8, splat (i1 true), poison) -; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP8]], i32 8, splat (i1 true), poison) +; STRIDED-UF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[IN:%.*]], i64 [[INDEX]] +; STRIDED-UF2-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; STRIDED-UF2-NEXT: [[TMP7:%.*]] = shl nuw i64 [[TMP6]], 1 +; STRIDED-UF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i64 [[TMP7]] +; STRIDED-UF2-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP3]] to i32 +; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP5]], i64 4, splat (i1 true), i32 [[TMP15]]) +; STRIDED-UF2-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP3]] to i32 +; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP8]], i64 4, splat (i1 true), i32 [[TMP16]]) ; STRIDED-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT:%.*]], i64 [[INDEX]] ; STRIDED-UF2-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() ; STRIDED-UF2-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 1 @@ -1529,7 +1576,6 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) { ; STRIDED-UF2-NEXT: store [[WIDE_MASKED_GATHER]], ptr [[TMP9]], align 8 ; STRIDED-UF2-NEXT: store [[WIDE_MASKED_GATHER1]], ptr [[TMP12]], align 8 ; STRIDED-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] -; STRIDED-UF2-NEXT: [[VEC_IND_NEXT]] = add [[STEP_ADD]], [[BROADCAST_SPLAT]] ; STRIDED-UF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; STRIDED-UF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; STRIDED-UF2: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll index ba7005f4f56dc..f9b95f26ce0a7 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll @@ -10,35 +10,90 @@ define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %index, i64 %n) { ; IF-EVL-LABEL: @gather_scatter( ; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP12:%.*]] = mul nuw 
i64 [[TMP11]], 2 ; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY1]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N:%.*]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[FOR_BODY1]] ] +; IF-EVL-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP2]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP3:%.*]] = call @llvm.stepvector.nxv2i32() +; IF-EVL-NEXT: [[TMP4:%.*]] = icmp ult [[TMP3]], [[BROADCAST_SPLAT]] +; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], i64 [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP12]] to i32 +; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP5]], i64 4, [[TMP4]], i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], [[WIDE_STRIDED_LOAD]] +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv2f32.nxv2p0( align 4 [[TMP7]], splat (i1 true), i32 [[TMP2]]) +; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], [[WIDE_STRIDED_LOAD]] +; IF-EVL-NEXT: call void @llvm.vp.scatter.nxv2f32.nxv2p0( [[WIDE_MASKED_GATHER]], align 4 [[TMP8]], splat (i1 true), i32 [[TMP2]]) +; IF-EVL-NEXT: [[TMP9:%.*]] = zext i32 [[TMP2]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]] +; IF-EVL-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; IF-EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br label [[FOR_END:%.*]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: -; IF-EVL-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT1:%.*]], [[FOR_BODY1]] ] -; IF-EVL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], i64 [[INDVARS_IV1]] +; IF-EVL-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT1:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX]], i64 [[INDVARS_IV1]] ; IF-EVL-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8 -; IF-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], i64 [[TMP0]] +; IF-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[TMP0]] ; IF-EVL-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 -; IF-EVL-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], i64 [[TMP0]] +; IF-EVL-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[TMP0]] ; IF-EVL-NEXT: store float [[TMP1]], ptr [[ARRAYIDX7]], align 4 ; IF-EVL-NEXT: [[INDVARS_IV_NEXT1]] = add nuw nsw i64 [[INDVARS_IV1]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT1]], [[N:%.*]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY1]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT1]], [[N]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: for.end: ; IF-EVL-NEXT: ret void ; ; NO-VP-LABEL: @gather_scatter( ; NO-VP-NEXT: entry: +; 
NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP4]], 1 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP9]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; NO-VP-NEXT: br label [[FOR_BODY1:%.*]] -; NO-VP: for.body: -; NO-VP-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT1:%.*]], [[FOR_BODY1]] ] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT1:%.*]], [[FOR_BODY1]] ] ; NO-VP-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], i64 [[INDVARS_IV1]] -; NO-VP-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8 -; NO-VP-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32 +; NO-VP-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[ARRAYIDX3]], i64 4, splat (i1 true), i32 [[TMP5]]) +; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], [[WIDE_STRIDED_LOAD]] +; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2f32.nxv2p0( [[TMP6]], i32 4, splat (i1 true), poison) +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], [[WIDE_STRIDED_LOAD]] +; NO-VP-NEXT: call void @llvm.masked.scatter.nxv2f32.nxv2p0( [[WIDE_MASKED_GATHER]], [[TMP7]], i32 4, splat (i1 true)) +; NO-VP-NEXT: [[INDVARS_IV_NEXT1]] = add nuw i64 [[INDVARS_IV1]], [[TMP3]] +; NO-VP-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT1]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.body: +; NO-VP-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[INDEX]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX4]], align 8 +; NO-VP-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[TMP0]] ; NO-VP-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 -; NO-VP-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[TMP0]] ; NO-VP-NEXT: store float [[TMP1]], ptr [[ARRAYIDX7]], align 4 -; NO-VP-NEXT: [[INDVARS_IV_NEXT1]] = add nuw nsw i64 [[INDVARS_IV1]], 1 -; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT1]], [[N:%.*]] -; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY1]] +; NO-VP-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; NO-VP: for.end: 
; NO-VP-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll index 1eab944ef1e87..a5c04f58a67f1 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll @@ -25,6 +25,12 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) { ; IF-EVL-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 ; IF-EVL-NEXT: [[TMP9:%.*]] = add nsw [[TMP15]], [[TMP14]] ; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]] +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[TMP9]], ptr align 4 [[TMP10]], splat (i1 true), i32 [[TMP16]]) +; IF-EVL-NEXT: [[TMP8:%.*]] = zext i32 [[TMP16]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP8]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] +; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; IF-EVL: scalar.ph: @@ -126,30 +132,33 @@ define i32 @load_factor_4_with_gap(i64 %n, ptr noalias %a) { ; IF-EVL-NEXT: entry: ; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IF-EVL: vector.ph: -; IF-EVL-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP3:%.*]] = mul [[TMP2]], splat (i64 1) -; IF-EVL-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP3]] +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: -; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N:%.*]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; IF-EVL-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], [[VEC_IND]], i32 0 -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP7]], splat (i1 true), i32 [[TMP4]]) +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP4]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP3:%.*]] = call @llvm.stepvector.nxv4i32() +; IF-EVL-NEXT: [[TMP7:%.*]] = icmp ult [[TMP3]], [[BROADCAST_SPLAT]] +; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], i64 [[EVL_BASED_IV]], i32 0 +; IF-EVL-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP1]] to i32 +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP5]], i64 16, [[TMP7]], i32 [[TMP6]]) ; IF-EVL-NEXT: [[TMP8:%.*]] = add [[VEC_PHI]], 
[[WIDE_MASKED_GATHER]] -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP7]], splat (i1 true), i32 [[TMP4]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP1]] to i32 +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP5]], i64 16, [[TMP7]], i32 [[TMP19]]) ; IF-EVL-NEXT: [[TMP9:%.*]] = add [[TMP8]], [[WIDE_MASKED_GATHER1]] -; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], [[VEC_IND]], i32 3 -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP10]], splat (i1 true), i32 [[TMP4]]) +; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[EVL_BASED_IV]], i32 3 +; IF-EVL-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP1]] to i32 +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 16, [[TMP7]], i32 [[TMP20]]) ; IF-EVL-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[WIDE_MASKED_GATHER2]] ; IF-EVL-NEXT: [[TMP12]] = call @llvm.vp.merge.nxv4i32( splat (i1 true), [[TMP11]], [[VEC_PHI]], i32 [[TMP4]]) ; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP4]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP13]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] -; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL: middle.block: @@ -189,26 +198,22 @@ define i32 @load_factor_4_with_gap(i64 %n, ptr noalias %a) { ; NO-VP-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; NO-VP-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]] ; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[TMP5]] -; NO-VP-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i64() -; NO-VP-NEXT: [[TMP7:%.*]] = mul [[TMP6]], splat (i64 1) -; NO-VP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] -; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 -; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] ; NO-VP: vector.body: ; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; NO-VP-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] -; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], [[VEC_IND]], i32 0 -; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP9]], i32 4, splat (i1 true), poison) +; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], i64 [[INDEX]], i32 0 +; NO-VP-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP3]] to i32 +; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP6]], i64 16, splat (i1 true), i32 [[TMP7]]) ; NO-VP-NEXT: [[TMP10:%.*]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] -; NO-VP-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP9]], i32 4, splat (i1 true), poison) +; NO-VP-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP3]] to i32 +; NO-VP-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call 
@llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP6]], i64 16, splat (i1 true), i32 [[TMP9]]) ; NO-VP-NEXT: [[TMP11:%.*]] = add [[TMP10]], [[WIDE_MASKED_GATHER1]] -; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], [[VEC_IND]], i32 3 -; NO-VP-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP12]], i32 4, splat (i1 true), poison) +; NO-VP-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[INDEX]], i32 3 +; NO-VP-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP3]] to i32 +; NO-VP-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP19]], i64 16, splat (i1 true), i32 [[TMP12]]) ; NO-VP-NEXT: [[TMP13]] = add [[TMP11]], [[WIDE_MASKED_GATHER2]] ; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; NO-VP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NO-VP-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; NO-VP: middle.block: @@ -398,30 +403,33 @@ define i32 @load_factor_4_with_tail_gap(i64 %n, ptr noalias %a) { ; IF-EVL-NEXT: entry: ; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IF-EVL: vector.ph: -; IF-EVL-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP3:%.*]] = mul [[TMP2]], splat (i64 1) -; IF-EVL-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP3]] +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: -; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N:%.*]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; IF-EVL-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], [[VEC_IND]], i32 0 -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP7]], splat (i1 true), i32 [[TMP4]]) +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP4]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP3:%.*]] = call @llvm.stepvector.nxv4i32() +; IF-EVL-NEXT: [[TMP7:%.*]] = icmp ult [[TMP3]], [[BROADCAST_SPLAT]] +; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], i64 [[EVL_BASED_IV]], i32 0 +; IF-EVL-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP1]] to i32 +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP5]], i64 16, [[TMP7]], i32 [[TMP6]]) ; IF-EVL-NEXT: [[TMP8:%.*]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP7]], 
splat (i1 true), i32 [[TMP4]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP1]] to i32 +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP5]], i64 16, [[TMP7]], i32 [[TMP19]]) ; IF-EVL-NEXT: [[TMP9:%.*]] = add [[TMP8]], [[WIDE_MASKED_GATHER1]] -; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], [[VEC_IND]], i32 2 -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP10]], splat (i1 true), i32 [[TMP4]]) +; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[EVL_BASED_IV]], i32 2 +; IF-EVL-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP1]] to i32 +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 16, [[TMP7]], i32 [[TMP20]]) ; IF-EVL-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[WIDE_MASKED_GATHER2]] ; IF-EVL-NEXT: [[TMP12]] = call @llvm.vp.merge.nxv4i32( splat (i1 true), [[TMP11]], [[VEC_PHI]], i32 [[TMP4]]) ; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP4]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP13]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] -; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; IF-EVL: middle.block: @@ -461,26 +469,22 @@ define i32 @load_factor_4_with_tail_gap(i64 %n, ptr noalias %a) { ; NO-VP-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; NO-VP-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]] ; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[TMP5]] -; NO-VP-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i64() -; NO-VP-NEXT: [[TMP7:%.*]] = mul [[TMP6]], splat (i64 1) -; NO-VP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] -; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 -; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] ; NO-VP: vector.body: ; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; NO-VP-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] -; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], [[VEC_IND]], i32 0 -; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP9]], i32 4, splat (i1 true), poison) +; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], i64 [[INDEX]], i32 0 +; NO-VP-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP3]] to i32 +; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP6]], i64 16, splat (i1 true), i32 [[TMP7]]) ; NO-VP-NEXT: [[TMP10:%.*]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] -; NO-VP-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP9]], i32 4, splat (i1 true), poison) +; NO-VP-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP3]] to i32 +; NO-VP-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP6]], i64 16, splat (i1 true), i32 [[TMP9]]) ; NO-VP-NEXT: [[TMP11:%.*]] = add [[TMP10]], 
[[WIDE_MASKED_GATHER1]] -; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], [[VEC_IND]], i32 2 -; NO-VP-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP12]], i32 4, splat (i1 true), poison) +; NO-VP-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[INDEX]], i32 2 +; NO-VP-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP3]] to i32 +; NO-VP-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP19]], i64 16, splat (i1 true), i32 [[TMP12]]) ; NO-VP-NEXT: [[TMP13]] = add [[TMP11]], [[WIDE_MASKED_GATHER2]] ; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; NO-VP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NO-VP-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; NO-VP: middle.block: @@ -663,36 +667,38 @@ define i32 @load_factor_4_reverse(i64 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[TMP1:%.*]] = sub i64 [[N]], [[SMIN]] ; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IF-EVL: vector.ph: -; IF-EVL-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv4i64() -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[N]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP5:%.*]] = mul [[TMP4]], splat (i64 -1) -; IF-EVL-NEXT: [[INDUCTION:%.*]] = add [[BROADCAST_SPLAT]], [[TMP5]] +; IF-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: -; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP1]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; IF-EVL-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 -; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 -1, [[TMP7]] -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], [[VEC_IND]], i32 0 -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP9]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP6]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv4i32() +; IF-EVL-NEXT: [[TMP9:%.*]] = icmp ult [[TMP5]], [[BROADCAST_SPLAT]] +; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0 +; IF-EVL-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i32 +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP7]], i64 -16, 
[[TMP9]], i32 [[TMP8]]) ; IF-EVL-NEXT: [[TMP10:%.*]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP9]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP3]] to i32 +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP7]], i64 -16, [[TMP9]], i32 [[TMP14]]) ; IF-EVL-NEXT: [[TMP11:%.*]] = add [[TMP10]], [[WIDE_MASKED_GATHER3]] -; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], [[VEC_IND]], i32 2 -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP12]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[OFFSET_IDX]], i32 2 +; IF-EVL-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP3]] to i32 +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP12]], i64 -16, [[TMP9]], i32 [[TMP24]]) ; IF-EVL-NEXT: [[TMP13:%.*]] = add [[TMP11]], [[WIDE_MASKED_GATHER4]] -; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], [[VEC_IND]], i32 3 -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP14]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[OFFSET_IDX]], i32 3 +; IF-EVL-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP3]] to i32 +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP25]], i64 -16, [[TMP9]], i32 [[TMP26]]) ; IF-EVL-NEXT: [[TMP15:%.*]] = add [[TMP13]], [[WIDE_MASKED_GATHER5]] ; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i32( splat (i1 true), [[TMP15]], [[VEC_PHI]], i32 [[TMP6]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP6]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP17]] -; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] ; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; IF-EVL: middle.block: @@ -737,32 +743,27 @@ define i32 @load_factor_4_reverse(i64 %n, ptr noalias %a) { ; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP5]] ; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] ; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[N]], [[N_VEC]] -; NO-VP-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv4i64() -; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[N]], i64 0 -; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; NO-VP-NEXT: [[TMP8:%.*]] = mul [[TMP7]], splat (i64 -1) -; NO-VP-NEXT: [[INDUCTION:%.*]] = add [[BROADCAST_SPLAT]], [[TMP8]] -; NO-VP-NEXT: [[TMP9:%.*]] = mul i64 -1, [[TMP5]] -; NO-VP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 -; NO-VP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] ; NO-VP: vector.body: ; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; NO-VP-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] 
], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], [[VEC_IND]], i32 0 -; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP10]], i32 4, splat (i1 true), poison) +; NO-VP-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[N]], [[INDEX]] +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0 +; NO-VP-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP5]] to i32 +; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP7]], i64 -16, splat (i1 true), i32 [[TMP8]]) ; NO-VP-NEXT: [[TMP11:%.*]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] -; NO-VP-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP10]], i32 4, splat (i1 true), poison) +; NO-VP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP5]] to i32 +; NO-VP-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP7]], i64 -16, splat (i1 true), i32 [[TMP10]]) ; NO-VP-NEXT: [[TMP12:%.*]] = add [[TMP11]], [[WIDE_MASKED_GATHER3]] -; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], [[VEC_IND]], i32 2 -; NO-VP-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP13]], i32 4, splat (i1 true), poison) +; NO-VP-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[OFFSET_IDX]], i32 2 +; NO-VP-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP5]] to i32 +; NO-VP-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP23]], i64 -16, splat (i1 true), i32 [[TMP13]]) ; NO-VP-NEXT: [[TMP14:%.*]] = add [[TMP12]], [[WIDE_MASKED_GATHER4]] -; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], [[VEC_IND]], i32 3 -; NO-VP-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP15]], i32 4, splat (i1 true), poison) +; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[OFFSET_IDX]], i32 3 +; NO-VP-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP5]] to i32 +; NO-VP-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP15]], i64 -16, splat (i1 true), i32 [[TMP24]]) ; NO-VP-NEXT: [[TMP16]] = add [[TMP14]], [[WIDE_MASKED_GATHER5]] ; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; NO-VP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] ; NO-VP-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NO-VP-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; NO-VP: middle.block: From 8bda7288210a846c7f50af5e5f4bc2cfad464878 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Fri, 22 Aug 2025 01:28:33 -0700 Subject: [PATCH 4/5] Support EVL --- .../Transforms/Vectorize/VPlanTransforms.cpp | 6 +++ .../RISCV/blocks-with-dead-instructions.ll | 9 +--- .../RISCV/masked_gather_scatter.ll | 26 ++-------- .../LoopVectorize/RISCV/pr154103.ll | 9 +--- .../LoopVectorize/RISCV/strided-accesses.ll | 45 ++--------------- .../RISCV/tail-folding-gather-scatter.ll | 9 +--- .../RISCV/tail-folding-interleave.ll | 48 ++++--------------- 7 files changed, 28 insertions(+), 124 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 960c8e8674607..719ca23c1d0c8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ 
b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2459,6 +2459,12 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, VPValue *NewAddr = GetNewAddr(L->getAddr()); return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask); }) + .Case([&](VPWidenStridedLoadRecipe *L) { + VPValue *NewMask = GetNewMask(L->getMask()); + return new VPWidenStridedLoadRecipe( + *cast(&L->getIngredient()), L->getAddr(), L->getStride(), + &EVL, NewMask, *L, L->getDebugLoc()); + }) .Case([&](VPWidenStoreRecipe *S) { VPValue *NewMask = GetNewMask(S->getMask()); VPValue *NewAddr = GetNewAddr(S->getAddr()); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll index 7450ffd0ee30b..9abd8c17ef96f 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll @@ -425,8 +425,6 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 % ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i1 [[IC]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = xor [[BROADCAST_SPLAT]], splat (i1 true) @@ -439,19 +437,14 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 % ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP27]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP27]] to i64 ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 3, [[TMP12]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP16]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = call @llvm.stepvector.nxv8i32() -; CHECK-NEXT: [[TMP19:%.*]] = icmp ult [[TMP18]], [[BROADCAST_SPLAT4]] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 3 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i16, ptr [[SRC]], [[VEC_IND]] -; CHECK-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP4]] to i32 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv8i16.p0.i64(ptr align 2 [[TMP21]], i64 6, [[TMP19]], i32 [[TMP15]]) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv8i16.p0.i64(ptr align 2 [[TMP21]], i64 6, splat (i1 true), i32 [[TMP27]]) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq [[WIDE_MASKED_GATHER]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = select [[TMP17]], [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP28:%.*]] = xor [[TMP17]], splat (i1 true) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll 
b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll index e34fc3557fda3..16c5a9d8818f7 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll @@ -30,8 +30,6 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] ; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; RV32: vector.ph: -; RV32-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 ; RV32-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv2i64() ; RV32-NEXT: [[TMP9:%.*]] = mul [[TMP7]], splat (i64 16) ; RV32-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP9]] @@ -41,24 +39,17 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; RV32-NEXT: [[AVL:%.*]] = phi i64 [ 625, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; RV32-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) -; RV32-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement poison, i32 [[TMP10]], i64 0 -; RV32-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector [[BROADCAST_SPLATINSERT6]], poison, zeroinitializer ; RV32-NEXT: [[TMP8:%.*]] = zext i32 [[TMP10]] to i64 ; RV32-NEXT: [[TMP11:%.*]] = mul i64 16, [[TMP8]] ; RV32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 ; RV32-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; RV32-NEXT: [[TMP16:%.*]] = call @llvm.stepvector.nxv2i32() -; RV32-NEXT: [[TMP25:%.*]] = icmp ult [[TMP16]], [[BROADCAST_SPLAT7]] ; RV32-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 16 ; RV32-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[OFFSET_IDX]] -; RV32-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP1]] to i32 -; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv2i32.p0.i64(ptr align 4 [[TMP26]], i64 64, [[TMP25]], i32 [[TMP27]]), !alias.scope [[META0:![0-9]+]] +; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv2i32.p0.i64(ptr align 4 [[TMP26]], i64 64, splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]] ; RV32-NEXT: [[TMP14:%.*]] = icmp slt [[WIDE_MASKED_GATHER]], splat (i32 100) -; RV32-NEXT: [[TMP12:%.*]] = select [[TMP25]], [[TMP14]], zeroinitializer ; RV32-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[OFFSET_IDX]], 1 ; RV32-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP13]] -; RV32-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP1]] to i32 -; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.experimental.vp.strided.load.nxv2f64.p0.i64(ptr align 8 [[TMP28]], i64 256, [[TMP12]], i32 [[TMP15]]), !alias.scope [[META3:![0-9]+]] +; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.experimental.vp.strided.load.nxv2f64.p0.i64(ptr align 8 [[TMP28]], i64 256, [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3:![0-9]+]] ; RV32-NEXT: [[TMP17:%.*]] = sitofp [[WIDE_MASKED_GATHER]] to ; RV32-NEXT: [[TMP18:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP17]] ; RV32-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] @@ -112,8 +103,6 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV64-NEXT: [[CONFLICT_RDX:%.*]] = or i1 
[[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] ; RV64-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; RV64: vector.ph: -; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 ; RV64-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv2i64() ; RV64-NEXT: [[TMP9:%.*]] = mul [[TMP7]], splat (i64 16) ; RV64-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP9]] @@ -123,24 +112,17 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV64-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; RV64-NEXT: [[AVL:%.*]] = phi i64 [ 625, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; RV64-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) -; RV64-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement poison, i32 [[TMP10]], i64 0 -; RV64-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector [[BROADCAST_SPLATINSERT6]], poison, zeroinitializer ; RV64-NEXT: [[TMP8:%.*]] = zext i32 [[TMP10]] to i64 ; RV64-NEXT: [[TMP11:%.*]] = mul i64 16, [[TMP8]] ; RV64-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 ; RV64-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; RV64-NEXT: [[TMP16:%.*]] = call @llvm.stepvector.nxv2i32() -; RV64-NEXT: [[TMP25:%.*]] = icmp ult [[TMP16]], [[BROADCAST_SPLAT7]] ; RV64-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 16 ; RV64-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[OFFSET_IDX]] -; RV64-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP1]] to i32 -; RV64-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv2i32.p0.i64(ptr align 4 [[TMP26]], i64 64, [[TMP25]], i32 [[TMP27]]), !alias.scope [[META0:![0-9]+]] +; RV64-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv2i32.p0.i64(ptr align 4 [[TMP26]], i64 64, splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]] ; RV64-NEXT: [[TMP14:%.*]] = icmp slt [[WIDE_MASKED_GATHER]], splat (i32 100) -; RV64-NEXT: [[TMP12:%.*]] = select [[TMP25]], [[TMP14]], zeroinitializer ; RV64-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[OFFSET_IDX]], 1 ; RV64-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP13]] -; RV64-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP1]] to i32 -; RV64-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.experimental.vp.strided.load.nxv2f64.p0.i64(ptr align 8 [[TMP28]], i64 256, [[TMP12]], i32 [[TMP15]]), !alias.scope [[META3:![0-9]+]] +; RV64-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.experimental.vp.strided.load.nxv2f64.p0.i64(ptr align 8 [[TMP28]], i64 256, [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3:![0-9]+]] ; RV64-NEXT: [[TMP17:%.*]] = sitofp [[WIDE_MASKED_GATHER]] to ; RV64-NEXT: [[TMP18:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP17]] ; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll index 067b9908ab485..efab7b5e4e0d5 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll @@ -9,8 +9,6 @@ define void @pr154103(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalia ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 
@llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, ptr [[C]], i64 0 @@ -20,15 +18,10 @@ define void @pr154103(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalia ; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ -7905747460161236406, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP2]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.stepvector.nxv4i32() -; CHECK-NEXT: [[TMP4:%.*]] = icmp ult [[TMP3]], [[BROADCAST_SPLAT4]] ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[EVL_BASED_IV]], 7 ; CHECK-NEXT: [[IV:%.*]] = add i64 1, [[TMP5]] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP1]] to i32 -; CHECK-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i8.p0.i64(ptr align 1 [[GEP]], i64 7, [[TMP4]], i32 [[TMP7]]) +; CHECK-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i8.p0.i64(ptr align 1 [[GEP]], i64 7, splat (i1 true), i32 [[TMP2]]) ; CHECK-NEXT: [[TMP8:%.*]] = zext [[WIDE_STRIDED_LOAD]] to ; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.vp.merge.nxv4i64( splat (i1 true), [[TMP8]], splat (i64 1), i32 [[TMP2]]) ; CHECK-NEXT: [[TMP10:%.*]] = sdiv zeroinitializer, [[TMP9]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll index 1f797886635fb..2192c7def4d1f 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll @@ -10,8 +10,6 @@ define void @single_constant_stride_int_scaled(ptr %p) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH1:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[TMP10:%.*]] = mul [[TMP8]], splat (i64 1) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP10]] @@ -21,19 +19,14 @@ define void @single_constant_stride_int_scaled(ptr %p) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP11]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] 
= call @llvm.stepvector.nxv4i32() -; CHECK-NEXT: [[TMP7:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT2]] ; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw i64 [[EVL_BASED_IV]], 8 ; CHECK-NEXT: [[TMP14:%.*]] = mul nuw nsw [[VEC_IND]], splat (i64 8) ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P]], [[TMP14]] -; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP1]] to i32 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP18]], i64 32, [[TMP7]], i32 [[TMP19]]) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP18]], i64 32, splat (i1 true), i32 [[TMP11]]) ; CHECK-NEXT: [[TMP16:%.*]] = add [[WIDE_MASKED_GATHER]], splat (i32 1) ; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[TMP16]], align 4 [[TMP15]], splat (i1 true), i32 [[TMP11]]) ; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP11]] to i64 @@ -146,8 +139,6 @@ define void @single_constant_stride_int_iv(ptr %p) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[TMP8:%.*]] = mul [[TMP6]], splat (i64 64) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] @@ -157,19 +148,14 @@ define void @single_constant_stride_int_iv(ptr %p) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP7]] to i64 ; CHECK-NEXT: [[TMP9:%.*]] = mul i64 64, [[TMP11]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = call @llvm.stepvector.nxv4i32() -; CHECK-NEXT: [[TMP16:%.*]] = icmp ult [[TMP15]], [[BROADCAST_SPLAT2]] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 64 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[P]], [[VEC_IND]] -; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP1]] to i32 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP17]], i64 256, [[TMP16]], i32 [[TMP18]]) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP17]], i64 256, splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[TMP13:%.*]] = add [[WIDE_MASKED_GATHER]], splat (i32 1) ; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[TMP13]], align 4 [[TMP12]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP7]] to i64 @@ -861,8 +847,6 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; STRIDED-NEXT: br i1 [[FOUND_CONFLICT]], 
label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; STRIDED: vector.ph: -; STRIDED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() -; STRIDED-NEXT: [[TMP45:%.*]] = mul nuw i64 [[TMP42]], 4 ; STRIDED-NEXT: [[TMP47:%.*]] = mul i64 [[STRIDE]], 4 ; STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[STRIDE]], i64 0 ; STRIDED-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer @@ -875,18 +859,13 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; STRIDED-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; STRIDED-NEXT: [[TMP43:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; STRIDED-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement poison, i32 [[TMP43]], i64 0 -; STRIDED-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector [[BROADCAST_SPLATINSERT11]], poison, zeroinitializer ; STRIDED-NEXT: [[TMP44:%.*]] = zext i32 [[TMP43]] to i64 ; STRIDED-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement poison, i64 [[TMP44]], i64 0 ; STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT9]], poison, zeroinitializer -; STRIDED-NEXT: [[TMP48:%.*]] = call @llvm.stepvector.nxv4i32() -; STRIDED-NEXT: [[TMP49:%.*]] = icmp ult [[TMP48]], [[BROADCAST_SPLAT12]] ; STRIDED-NEXT: [[TMP50:%.*]] = mul nuw nsw i64 [[EVL_BASED_IV]], [[STRIDE]] ; STRIDED-NEXT: [[TMP18:%.*]] = mul nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT1]] ; STRIDED-NEXT: [[TMP51:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP50]] -; STRIDED-NEXT: [[TMP52:%.*]] = trunc i64 [[TMP45]] to i32 -; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP51]], i64 [[TMP47]], [[TMP49]], i32 [[TMP52]]), !alias.scope [[META9:![0-9]+]] +; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP51]], i64 [[TMP47]], splat (i1 true), i32 [[TMP43]]), !alias.scope [[META9:![0-9]+]] ; STRIDED-NEXT: [[TMP20:%.*]] = add [[WIDE_MASKED_GATHER]], splat (i32 1) ; STRIDED-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[P2]], [[TMP18]] ; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[TMP20]], align 4 [[TMP21]], splat (i1 true), i32 [[TMP43]]), !alias.scope [[META12:![0-9]+]], !noalias [[META9]] @@ -1418,20 +1397,13 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) { ; NOSTRIDED-NEXT: entry: ; NOSTRIDED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; NOSTRIDED: vector.ph: -; NOSTRIDED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; NOSTRIDED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 ; NOSTRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; NOSTRIDED: vector.body: ; NOSTRIDED-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; NOSTRIDED-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; NOSTRIDED-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) -; NOSTRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP2]], i64 0 -; NOSTRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; NOSTRIDED-NEXT: [[TMP3:%.*]] = call @llvm.stepvector.nxv2i32() -; NOSTRIDED-NEXT: [[TMP4:%.*]] = icmp ult [[TMP3]], 
[[BROADCAST_SPLAT]] ; NOSTRIDED-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[IN:%.*]], i64 [[EVL_BASED_IV]] -; NOSTRIDED-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP1]] to i32 -; NOSTRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP9]], i64 4, [[TMP4]], i32 [[TMP10]]) +; NOSTRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP9]], i64 4, splat (i1 true), i32 [[TMP2]]) ; NOSTRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT:%.*]], i64 [[EVL_BASED_IV]] ; NOSTRIDED-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[WIDE_MASKED_GATHER]], ptr align 8 [[TMP5]], splat (i1 true), i32 [[TMP2]]) ; NOSTRIDED-NEXT: [[TMP6:%.*]] = zext i32 [[TMP2]] to i64 @@ -1509,20 +1481,13 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) { ; STRIDED-NEXT: entry: ; STRIDED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; STRIDED: vector.ph: -; STRIDED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; STRIDED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 ; STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; STRIDED: vector.body: ; STRIDED-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; STRIDED-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; STRIDED-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) -; STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP2]], i64 0 -; STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; STRIDED-NEXT: [[TMP3:%.*]] = call @llvm.stepvector.nxv2i32() -; STRIDED-NEXT: [[TMP4:%.*]] = icmp ult [[TMP3]], [[BROADCAST_SPLAT]] ; STRIDED-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[IN:%.*]], i64 [[EVL_BASED_IV]] -; STRIDED-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP1]] to i32 -; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP9]], i64 4, [[TMP4]], i32 [[TMP10]]) +; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP9]], i64 4, splat (i1 true), i32 [[TMP2]]) ; STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT:%.*]], i64 [[EVL_BASED_IV]] ; STRIDED-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[WIDE_MASKED_GATHER]], ptr align 8 [[TMP5]], splat (i1 true), i32 [[TMP2]]) ; STRIDED-NEXT: [[TMP6:%.*]] = zext i32 [[TMP2]] to i64 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll index f9b95f26ce0a7..0c72f3772b2c3 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll @@ -12,20 +12,13 @@ define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %inde ; IF-EVL-NEXT: entry: ; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IF-EVL: vector.ph: -; IF-EVL-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 2 ; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY1]] ] ; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N:%.*]], 
[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[FOR_BODY1]] ] ; IF-EVL-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP2]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP3:%.*]] = call @llvm.stepvector.nxv2i32() -; IF-EVL-NEXT: [[TMP4:%.*]] = icmp ult [[TMP3]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], i64 [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP12]] to i32 -; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP5]], i64 4, [[TMP4]], i32 [[TMP6]]) +; IF-EVL-NEXT: [[WIDE_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 8 [[TMP5]], i64 4, splat (i1 true), i32 [[TMP2]]) ; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], [[WIDE_STRIDED_LOAD]] ; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv2f32.nxv2p0( align 4 [[TMP7]], splat (i1 true), i32 [[TMP2]]) ; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], [[WIDE_STRIDED_LOAD]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll index a5c04f58a67f1..bb0fa160be245 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll @@ -132,28 +132,19 @@ define i32 @load_factor_4_with_gap(i64 %n, ptr noalias %a) { ; IF-EVL-NEXT: entry: ; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IF-EVL: vector.ph: -; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N:%.*]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP4]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP3:%.*]] = call @llvm.stepvector.nxv4i32() -; IF-EVL-NEXT: [[TMP7:%.*]] = icmp ult [[TMP3]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], i64 [[EVL_BASED_IV]], i32 0 -; IF-EVL-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP1]] to i32 -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP5]], i64 16, [[TMP7]], i32 [[TMP6]]) +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP5]], i64 16, splat (i1 true), i32 [[TMP4]]) ; IF-EVL-NEXT: [[TMP8:%.*]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] -; IF-EVL-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP1]] to i32 -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP5]], i64 16, [[TMP7]], i32 [[TMP19]]) +; IF-EVL-NEXT: 
[[WIDE_MASKED_GATHER1:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP5]], i64 16, splat (i1 true), i32 [[TMP4]]) ; IF-EVL-NEXT: [[TMP9:%.*]] = add [[TMP8]], [[WIDE_MASKED_GATHER1]] ; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[EVL_BASED_IV]], i32 3 -; IF-EVL-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP1]] to i32 -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 16, [[TMP7]], i32 [[TMP20]]) +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 16, splat (i1 true), i32 [[TMP4]]) ; IF-EVL-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[WIDE_MASKED_GATHER2]] ; IF-EVL-NEXT: [[TMP12]] = call @llvm.vp.merge.nxv4i32( splat (i1 true), [[TMP11]], [[VEC_PHI]], i32 [[TMP4]]) ; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP4]] to i64 @@ -403,28 +394,19 @@ define i32 @load_factor_4_with_tail_gap(i64 %n, ptr noalias %a) { ; IF-EVL-NEXT: entry: ; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IF-EVL: vector.ph: -; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N:%.*]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP4]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP3:%.*]] = call @llvm.stepvector.nxv4i32() -; IF-EVL-NEXT: [[TMP7:%.*]] = icmp ult [[TMP3]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], i64 [[EVL_BASED_IV]], i32 0 -; IF-EVL-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP1]] to i32 -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP5]], i64 16, [[TMP7]], i32 [[TMP6]]) +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP5]], i64 16, splat (i1 true), i32 [[TMP4]]) ; IF-EVL-NEXT: [[TMP8:%.*]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] -; IF-EVL-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP1]] to i32 -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP5]], i64 16, [[TMP7]], i32 [[TMP19]]) +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP5]], i64 16, splat (i1 true), i32 [[TMP4]]) ; IF-EVL-NEXT: [[TMP9:%.*]] = add [[TMP8]], [[WIDE_MASKED_GATHER1]] ; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[EVL_BASED_IV]], i32 2 -; IF-EVL-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP1]] to i32 -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 16, [[TMP7]], i32 [[TMP20]]) +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP10]], i64 16, splat (i1 true), i32 
[[TMP4]]) ; IF-EVL-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[WIDE_MASKED_GATHER2]] ; IF-EVL-NEXT: [[TMP12]] = call @llvm.vp.merge.nxv4i32( splat (i1 true), [[TMP11]], [[VEC_PHI]], i32 [[TMP4]]) ; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP4]] to i64 @@ -667,33 +649,23 @@ define i32 @load_factor_4_reverse(i64 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[TMP1:%.*]] = sub i64 [[N]], [[SMIN]] ; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IF-EVL: vector.ph: -; IF-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP1]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP6]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv4i32() -; IF-EVL-NEXT: [[TMP9:%.*]] = icmp ult [[TMP5]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0 -; IF-EVL-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i32 -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP7]], i64 -16, [[TMP9]], i32 [[TMP8]]) +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP7]], i64 -16, splat (i1 true), i32 [[TMP6]]) ; IF-EVL-NEXT: [[TMP10:%.*]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] -; IF-EVL-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP3]] to i32 -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP7]], i64 -16, [[TMP9]], i32 [[TMP14]]) +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP7]], i64 -16, splat (i1 true), i32 [[TMP6]]) ; IF-EVL-NEXT: [[TMP11:%.*]] = add [[TMP10]], [[WIDE_MASKED_GATHER3]] ; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[OFFSET_IDX]], i32 2 -; IF-EVL-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP3]] to i32 -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP12]], i64 -16, [[TMP9]], i32 [[TMP24]]) +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP12]], i64 -16, splat (i1 true), i32 [[TMP6]]) ; IF-EVL-NEXT: [[TMP13:%.*]] = add [[TMP11]], [[WIDE_MASKED_GATHER4]] ; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[OFFSET_IDX]], i32 3 -; IF-EVL-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP3]] to i32 -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP25]], i64 -16, [[TMP9]], i32 [[TMP26]]) +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 [[TMP25]], i64 -16, splat (i1 true), i32 [[TMP6]]) ; IF-EVL-NEXT: [[TMP15:%.*]] = 
add [[TMP13]], [[WIDE_MASKED_GATHER5]]
; IF-EVL-NEXT:    [[TMP16]] = call @llvm.vp.merge.nxv4i32( splat (i1 true), [[TMP15]], [[VEC_PHI]], i32 [[TMP6]])
; IF-EVL-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP6]] to i64

From ccbfb790409c3b68b29a52f1e6da5710bb8791f2 Mon Sep 17 00:00:00 2001
From: Mel Chen
Date: Thu, 4 Sep 2025 05:42:28 -0700
Subject: [PATCH 5/5] patch planContainsAdditionalSimplifications

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cb007c4b07eaa..02b1625f1538b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6926,6 +6926,12 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
             RepR->getUnderlyingInstr(), VF))
         return true;
     }
+
+    // The strided load is transformed from a gather by VPlanTransforms, and
+    // its cost will be lower than that of the original gather.
+    if (isa<VPWidenStridedLoadRecipe>(&R))
+      return true;
+
    if (Instruction *UI = GetInstructionForCost(&R)) {
      // If we adjusted the predicate of the recipe, the cost in the legacy
      // cost model may be different.
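
Note (not part of the patch series): the IR below is a minimal sketch of the rewrite the new recipe models, not code taken from the tests above. The function name @strided_vs_gather, the values %base and %evl, and the constant 16-byte stride are placeholders chosen for illustration; only the intrinsics themselves (vp.gather, experimental.vp.strided.load, stepvector) are the ones already exercised by the updated CHECK lines. A vp.gather whose per-lane addresses advance by a constant stride can instead be expressed as a single vp.strided.load from the scalar base pointer:

  declare <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
  declare <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr>, <vscale x 4 x i1>, i32)
  declare <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr, i64, <vscale x 4 x i1>, i32)

  define <vscale x 4 x i32> @strided_vs_gather(ptr %base, i32 %evl) {
    ; Gather form: needs a vector of per-lane addresses, base + 4 i32 elements
    ; (16 bytes) per lane.
    %step    = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
    %offsets = mul <vscale x 4 x i64> %step, splat (i64 4)
    %ptrs    = getelementptr inbounds i32, ptr %base, <vscale x 4 x i64> %offsets
    %g = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 %ptrs, <vscale x 4 x i1> splat (i1 true), i32 %evl)

    ; Strided form: only the scalar base pointer and the stride in bytes.
    %s = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(ptr align 4 %base, i64 16, <vscale x 4 x i1> splat (i1 true), i32 %evl)

    %sum = add <vscale x 4 x i32> %g, %s
    ret <vscale x 4 x i32> %sum
  }

Because the strided form takes the scalar base and the stride directly, no per-lane address vector has to be materialized, and with EVL tail folding the explicit step-vector mask in the old CHECK lines collapses to splat (i1 true) plus the EVL operand, as the updated tests show.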