diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index af78e0c1e4799..c60877de28e29 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1842,6 +1842,10 @@ class TargetTransformInfo {
   /// otherwise scalar epilogue loop.
   LLVM_ABI bool preferEpilogueVectorization() const;
 
+  /// \returns True if the target supports lowering the @llvm.vp.udiv,
+  /// @llvm.vp.sdiv, @llvm.vp.urem and @llvm.vp.srem intrinsics.
+  LLVM_ABI bool supportsPredicatedDivRem() const;
+
   /// \returns True if the target wants to expand the given reduction intrinsic
   /// into a shuffle sequence.
   LLVM_ABI bool shouldExpandReduction(const IntrinsicInst *II) const;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 9c2ebb1891cac..1c3700ef85f09 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1103,6 +1103,8 @@ class TargetTransformInfoImplBase {
 
   virtual bool preferEpilogueVectorization() const { return true; }
 
+  virtual bool supportsPredicatedDivRem() const { return false; }
+
   virtual bool shouldExpandReduction(const IntrinsicInst *II) const {
     return true;
   }
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index b4fa0d5964cb6..66c212b2507bd 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1424,6 +1424,10 @@ bool TargetTransformInfo::preferEpilogueVectorization() const {
   return TTIImpl->preferEpilogueVectorization();
 }
 
+bool TargetTransformInfo::supportsPredicatedDivRem() const {
+  return TTIImpl->supportsPredicatedDivRem();
+}
+
 TargetTransformInfo::VPLegalization
 TargetTransformInfo::getVPLegalizationStrategy(const VPIntrinsic &VPI) const {
   return TTIImpl->getVPLegalizationStrategy(VPI);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 6bd7d51daff69..81facdfc1c374 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -141,6 +141,8 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
     return false;
   }
 
+  bool supportsPredicatedDivRem() const override { return true; }
+
   InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                         Align Alignment, unsigned AddressSpace,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3fbeef1211954..33acdf30d468d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7838,8 +7838,8 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
       Range);
 }
 
-VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
-                                           ArrayRef<VPValue *> Operands) {
+VPRecipeWithIRFlags *VPRecipeBuilder::tryToWiden(Instruction *I,
+                                                 ArrayRef<VPValue *> Operands) {
   switch (I->getOpcode()) {
   default:
     return nullptr;
@@ -7847,11 +7847,25 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
   case Instruction::UDiv:
   case Instruction::SRem:
   case Instruction::URem: {
-    // If not provably safe, use a select to form a safe divisor before widening the
-    // div/rem operation itself. Otherwise fall through to general handling below.
+    // If not provably safe, use a predicated intrinsic to mask off trapping
+    // lanes if supported, or use a select to form a safe divisor before
+    // widening the div/rem operation itself. Otherwise fall through to general
+    // handling below.
     if (CM.isPredicatedInst(I)) {
       SmallVector<VPValue *> Ops(Operands);
       VPValue *Mask = getBlockInMask(Builder.getInsertBlock());
+
+      if (TTI->supportsPredicatedDivRem()) {
+        Ops.push_back(Mask);
+        Ops.push_back(Builder.createScalarZExtOrTrunc(
+            &Plan.getVF(), IntegerType::getInt32Ty(I->getContext()),
+            VPTypeAnalysis(Plan).inferScalarType(&Plan.getVF()),
+            I->getDebugLoc()));
+        return new VPWidenIntrinsicRecipe(
+            VPIntrinsic::getForOpcode(I->getOpcode()), Ops, I->getType(),
+            I->getDebugLoc());
+      }
+
       VPValue *One =
           Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
       auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 41878e3c648e3..6696caefa3a04 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -121,7 +121,7 @@ class VPRecipeBuilder {
   /// Check if \p I has an opcode that can be widened and return a VPWidenRecipe
   /// if it can. The function should only be called if the cost-model indicates
   /// that widening should be performed.
-  VPWidenRecipe *tryToWiden(Instruction *I, ArrayRef<VPValue *> Operands);
+  VPRecipeWithIRFlags *tryToWiden(Instruction *I, ArrayRef<VPValue *> Operands);
 
   /// Makes Histogram count operations safe for vectorization, by emitting a
   /// llvm.experimental.vector.histogram.add intrinsic in place of the
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 109156c1469c5..8ffde28482eac 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -348,6 +348,12 @@ inline AllRecipe_match<Instruction::Trunc, Op0_t> m_Trunc(const Op0_t &Op0) {
   return m_Unary<Instruction::Trunc, Op0_t>(Op0);
 }
 
+template <typename Op0_t>
+inline match_combine_or<AllRecipe_match<Instruction::Trunc, Op0_t>, Op0_t>
+m_TruncOrSelf(const Op0_t &Op0) {
+  return m_CombineOr(m_Trunc(Op0), Op0);
+}
+
 template <typename Op0_t>
 inline AllRecipe_match<Instruction::ZExt, Op0_t> m_ZExt(const Op0_t &Op0) {
   return m_Unary<Instruction::ZExt, Op0_t>(Op0);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 875d501f2a3dc..bb6920894d68e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2453,6 +2453,30 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
             Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL},
             TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
       })
+      .Case<VPWidenIntrinsicRecipe>(
+          [&](VPWidenIntrinsicRecipe *VPI) -> VPRecipeBase * {
+            switch (VPI->getVectorIntrinsicID()) {
+            case Intrinsic::vp_udiv:
+            case Intrinsic::vp_sdiv:
+            case Intrinsic::vp_urem:
+            case Intrinsic::vp_srem:
+              break;
+            default:
+              return nullptr;
+            }
+            VPValue *VF = &VPI->getParent()->getPlan()->getVF();
+            if (!match(VPI->getOperand(3), m_TruncOrSelf(m_Specific(VF))))
+              return nullptr;
+            VPValue *NewMask = GetNewMask(VPI->getOperand(2));
+            if (NewMask == HeaderMask)
+              return nullptr;
+            if (!NewMask)
+              NewMask = &AllOneMask;
+            return new VPWidenIntrinsicRecipe(
+                VPI->getVectorIntrinsicID(),
+                {VPI->getOperand(0), VPI->getOperand(1), NewMask, &EVL},
+                VPI->getResultType(), VPI->getDebugLoc());
+          })
       .Default([&](VPRecipeBase *R) { return nullptr; });
 }
 
@@ -2463,10 +2487,6 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
 
-  assert(all_of(Plan.getVF().users(),
-                IsaPred) &&
-         "User of VF that we can't transform to EVL.");
   Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
     return isa(U);
   });
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
index c35358b3eed0f..a69131f39f416 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
@@ -364,15 +364,9 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP12]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = call @llvm.stepvector.nxv2i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp ult [[TMP7]], [[BROADCAST_SPLAT2]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP8]], splat (i1 true), i32 [[TMP12]])
-; CHECK-NEXT:    [[TMP16:%.*]] = select [[TMP15]], [[TMP6]], zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = select [[TMP16]], [[BROADCAST_SPLAT]], splat (i64 1)
-; CHECK-NEXT:    [[TMP11:%.*]] = udiv [[WIDE_LOAD]], [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call @llvm.vp.udiv.nxv2i64( [[WIDE_LOAD]], [[BROADCAST_SPLAT]], [[TMP6]], i32 [[TMP12]])
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select [[TMP6]], [[TMP11]], [[WIDE_LOAD]]
 ; CHECK-NEXT:    call void @llvm.vp.store.nxv2i64.p0( [[PREDPHI]], ptr align 8 [[TMP8]], splat (i1 true), i32 [[TMP12]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP12]] to i64
@@ -409,13 +403,12 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0
 ; FIXED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; FIXED-NEXT:    [[TMP0:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
-; FIXED-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
 ; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
 ; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; FIXED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; FIXED-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
-; FIXED-NEXT:    [[TMP8:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[TMP5]]
+; FIXED-NEXT:    [[TMP8:%.*]] = call <4 x i64> @llvm.vp.udiv.v4i64(<4 x i64> [[WIDE_LOAD1]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i1> [[TMP0]], i32 4)
 ; FIXED-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD1]]
 ; FIXED-NEXT:    store <4 x i64> [[PREDPHI2]], ptr [[TMP2]], align 8
 ; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -479,15 +472,9 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP12]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = call @llvm.stepvector.nxv2i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp ult [[TMP7]], [[BROADCAST_SPLAT2]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP8]], splat (i1 true), i32 [[TMP12]])
-; CHECK-NEXT:    [[TMP16:%.*]] = select [[TMP15]], [[TMP6]], zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = select [[TMP16]], [[BROADCAST_SPLAT]], splat (i64 1)
-; CHECK-NEXT:    [[TMP11:%.*]] = sdiv [[WIDE_LOAD]], [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call @llvm.vp.sdiv.nxv2i64( [[WIDE_LOAD]], [[BROADCAST_SPLAT]], [[TMP6]], i32 [[TMP12]])
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select [[TMP6]], [[TMP11]], [[WIDE_LOAD]]
 ; CHECK-NEXT:    call void @llvm.vp.store.nxv2i64.p0( [[PREDPHI]], ptr align 8 [[TMP8]], splat (i1 true), i32 [[TMP12]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP12]] to i64
@@ -524,13 +511,12 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0
 ; FIXED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; FIXED-NEXT:    [[TMP0:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
-; FIXED-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
 ; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
 ; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; FIXED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; FIXED-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
-; FIXED-NEXT:    [[TMP8:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[TMP5]]
+; FIXED-NEXT:    [[TMP8:%.*]] = call <4 x i64> @llvm.vp.sdiv.v4i64(<4 x i64> [[WIDE_LOAD1]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i1> [[TMP0]], i32 4)
 ; FIXED-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD1]]
 ; FIXED-NEXT:    store <4 x i64> [[PREDPHI2]], ptr [[TMP2]], align 8
 ; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -799,16 +785,10 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP12]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP7]], splat (i1 true), i32 [[TMP12]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne [[WIDE_LOAD]], splat (i8 -128)
-; CHECK-NEXT:    [[TMP16:%.*]] = select [[TMP15]], [[TMP9]], zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = select [[TMP16]], splat (i8 -1), splat (i8 1)
-; CHECK-NEXT:    [[TMP11:%.*]] = sdiv [[WIDE_LOAD]], [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call @llvm.vp.sdiv.nxv16i8( [[WIDE_LOAD]], splat (i8 -1), [[TMP9]], i32 [[TMP12]])
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select [[TMP9]], [[TMP11]], [[WIDE_LOAD]]
 ; CHECK-NEXT:    call void @llvm.vp.store.nxv16i8.p0( [[PREDPHI]], ptr align 1 [[TMP7]], splat (i1 true), i32 [[TMP12]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP12]] to i64
@@ -848,8 +828,7 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) {
 ; FIXED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
 ; FIXED-NEXT:    [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP1]], align 1
 ; FIXED-NEXT:    [[TMP5:%.*]] = icmp ne <32 x i8> [[WIDE_LOAD1]], splat (i8 -128)
-; FIXED-NEXT:    [[TMP7:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> splat (i8 -1), <32 x i8> splat (i8 1)
-; FIXED-NEXT:    [[TMP9:%.*]] = sdiv <32 x i8> [[WIDE_LOAD1]], [[TMP7]]
+; FIXED-NEXT:    [[TMP9:%.*]] = call <32 x i8> @llvm.vp.sdiv.v32i8(<32 x i8> [[WIDE_LOAD1]], <32 x i8> splat (i8 -1), <32 x i1> [[TMP5]], i32 32)
 ; FIXED-NEXT:    [[PREDPHI2:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> [[TMP9]], <32 x i8> [[WIDE_LOAD1]]
 ; FIXED-NEXT:    store <32 x i8> [[PREDPHI2]], ptr [[TMP1]], align 1
 ; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll
index c35a3d7b9269f..de94e7cbd30ab 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll
@@ -6,10 +6,48 @@
 define void @pr154103(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalias %d) {
 ; CHECK-LABEL: define void @pr154103(
 ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], ptr noalias [[D:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, ptr [[C]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = call @llvm.stepvector.nxv4i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul [[TMP3]], splat (i64 7)
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add splat (i64 1), [[TMP4]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ -7905747460161236406, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 7, [[TMP6]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i64 [[TMP7]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], [[VEC_IND]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv4i8.nxv4p0( align 1 [[TMP10]], splat (i1 true), i32 [[TMP5]])
+; CHECK-NEXT:    [[TMP11:%.*]] = zext [[WIDE_MASKED_GATHER]] to
+; CHECK-NEXT:    [[TMP12:%.*]] = call @llvm.vp.sdiv.nxv4i64( zeroinitializer, [[TMP11]], splat (i1 true), i32 [[TMP5]])
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp sgt [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call @llvm.vp.gather.nxv4i8.nxv4p0( align 1 [[BROADCAST_SPLAT]], [[TMP13]], i32 [[TMP5]])
+; CHECK-NEXT:    [[TMP14:%.*]] = zext [[WIDE_MASKED_GATHER7]] to
+; CHECK-NEXT:    [[TMP15:%.*]] = xor [[TMP14]], zeroinitializer
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select [[TMP13]], [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = trunc [[PREDPHI]] to
+; CHECK-NEXT:    call void @llvm.vp.scatter.nxv4i16.nxv4p0( [[TMP16]], align 2 [[BROADCAST_SPLAT2]], splat (i1 true), i32 [[TMP5]])
+; CHECK-NEXT:    store i32 0, ptr [[D]], align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP17]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT4]]
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK:       [[SCALAR_PH]]:
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 1, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT:    [[X:%.*]] = load i8, ptr [[GEP]], align 1
 ; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[X]] to i64
@@ -28,7 +66,7 @@ define void @pr154103(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalia
 ; CHECK-NEXT:    store i32 0, ptr [[D]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 7
 ; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[IV]], 0
-; CHECK-NEXT:    br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-div.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-div.ll
index 0b3f4766daf6d..af1b3dbd2cf39 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-div.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-div.ll
@@ -22,8 +22,7 @@ define void @test_sdiv(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP7]], splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[B]], i64 [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[TMP11:%.*]] = call @llvm.vp.merge.nxv2i64( splat (i1 true), [[VP_OP_LOAD1]], splat (i64 1), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_OP:%.*]] = sdiv [[VP_OP_LOAD]], [[TMP11]]
+; IF-EVL-NEXT:    [[VP_OP:%.*]] = call @llvm.vp.sdiv.nxv2i64( [[VP_OP_LOAD]], [[VP_OP_LOAD1]], splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[C]], i64 [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    call void @llvm.vp.store.nxv2i64.p0( [[VP_OP]], ptr align 8 [[TMP12]], splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP5]] to i64
@@ -132,8 +131,7 @@ define void @test_udiv(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP7]], splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[B]], i64 [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[TMP11:%.*]] = call @llvm.vp.merge.nxv2i64( splat (i1 true), [[VP_OP_LOAD1]], splat (i64 1), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_OP:%.*]] = udiv [[VP_OP_LOAD]], [[TMP11]]
+; IF-EVL-NEXT:    [[VP_OP:%.*]] = call @llvm.vp.udiv.nxv2i64( [[VP_OP_LOAD]], [[VP_OP_LOAD1]], splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[C]], i64 [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    call void @llvm.vp.store.nxv2i64.p0( [[VP_OP]], ptr align 8 [[TMP12]], splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP5]] to i64
@@ -241,8 +239,7 @@ define void @test_srem(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP7]], splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[B]], i64 [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[TMP11:%.*]] = call @llvm.vp.merge.nxv2i64( splat (i1 true), [[VP_OP_LOAD1]], splat (i64 1), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_OP:%.*]] = srem [[VP_OP_LOAD]], [[TMP11]]
+; IF-EVL-NEXT:    [[VP_OP:%.*]] = call @llvm.vp.srem.nxv2i64( [[VP_OP_LOAD]], [[VP_OP_LOAD1]], splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[C]], i64 [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    call void @llvm.vp.store.nxv2i64.p0( [[VP_OP]], ptr align 8 [[TMP12]], splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP5]] to i64
@@ -350,8 +347,7 @@ define void @test_urem(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP7]], splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[B]], i64 [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[TMP11:%.*]] = call @llvm.vp.merge.nxv2i64( splat (i1 true), [[VP_OP_LOAD1]], splat (i64 1), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_OP:%.*]] = urem [[VP_OP_LOAD]], [[TMP11]]
+; IF-EVL-NEXT:    [[VP_OP:%.*]] = call @llvm.vp.urem.nxv2i64( [[VP_OP_LOAD]], [[VP_OP_LOAD1]], splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[C]], i64 [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    call void @llvm.vp.store.nxv2i64.p0( [[VP_OP]], ptr align 8 [[TMP12]], splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP5]] to i64
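
For illustration, the effect of the new tryToWiden path on a predicated division, written as plain LLVM IR. This is a minimal sketch rather than output copied from the tests above: the value names, the <vscale x 2 x i64> type and %evl are made up. At recipe-construction time the patch passes the block-in mask and the VF (zext/trunc'ed to i32) as the last two operands; the EVL transform added in optimizeMaskToEVL then swaps in the active vector length and the EVL-folded mask.

; Safe-divisor lowering used when the target cannot predicate div/rem:
; inactive lanes divide by 1, so they cannot trap.
  %safe.rhs = select <vscale x 2 x i1> %mask, <vscale x 2 x i64> %rhs, <vscale x 2 x i64> splat (i64 1)
  %div = sdiv <vscale x 2 x i64> %lhs, %safe.rhs

; Lowering when supportsPredicatedDivRem() returns true: lanes where %mask is
; false are not executed, so division by zero (or INT_MIN by -1) cannot trap.
  %div = call <vscale x 2 x i64> @llvm.vp.sdiv.nxv2i64(<vscale x 2 x i64> %lhs, <vscale x 2 x i64> %rhs, <vscale x 2 x i1> %mask, i32 %evl)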