Skip to content

Commit 5faed1a

Browse files
authored
[VPlan] Add VPlan-based addMinIterCheck, replace ILV for non-epilogue. (#153643)
This patch adds a new VPlan-based addMinimumIterationCheck, which replaced the ILV version for the non-epilogue case. The VPlan-based version constructs a SCEV expression to compute the minimum iterations, use that to check if the check is known true or false. Otherwise it creates a VPExpandSCEV recipe and emits a compare-and-branch. When using epilogue vectorization, we still need to create the minimum trip-count-check during the legacy skeleton creation. The patch moves the definitions out of ILV. PR: #153643
1 parent 4454152 commit 5faed1a

36 files changed

+368
-350
lines changed

llvm/include/llvm/Analysis/ScalarEvolution.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -569,7 +569,9 @@ class ScalarEvolution {
569569
LLVM_ABI const SCEV *getTruncateExpr(const SCEV *Op, Type *Ty,
570570
unsigned Depth = 0);
571571
LLVM_ABI const SCEV *getVScale(Type *Ty);
572-
LLVM_ABI const SCEV *getElementCount(Type *Ty, ElementCount EC);
572+
LLVM_ABI const SCEV *
573+
getElementCount(Type *Ty, ElementCount EC,
574+
SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap);
573575
LLVM_ABI const SCEV *getZeroExtendExpr(const SCEV *Op, Type *Ty,
574576
unsigned Depth = 0);
575577
LLVM_ABI const SCEV *getZeroExtendExprImpl(const SCEV *Op, Type *Ty,

llvm/lib/Analysis/ScalarEvolution.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -500,10 +500,11 @@ const SCEV *ScalarEvolution::getVScale(Type *Ty) {
500500
return S;
501501
}
502502

503-
const SCEV *ScalarEvolution::getElementCount(Type *Ty, ElementCount EC) {
503+
const SCEV *ScalarEvolution::getElementCount(Type *Ty, ElementCount EC,
504+
SCEV::NoWrapFlags Flags) {
504505
const SCEV *Res = getConstant(Ty, EC.getKnownMinValue());
505506
if (EC.isScalable())
506-
Res = getMulExpr(Res, getVScale(Ty));
507+
Res = getMulExpr(Res, getVScale(Ty), Flags);
507508
return Res;
508509
}
509510

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,10 @@ class VPBuilder {
334334
FPBinOp ? FPBinOp->getFastMathFlags() : FastMathFlags(), DL));
335335
}
336336

337+
VPExpandSCEVRecipe *createExpandSCEV(const SCEV *Expr) {
338+
return tryInsertInstruction(new VPExpandSCEVRecipe(Expr));
339+
}
340+
337341
//===--------------------------------------------------------------------===//
338342
// RAII helpers.
339343
//===--------------------------------------------------------------------===//
@@ -559,6 +563,11 @@ class LoopVectorizationPlanner {
559563
/// Emit remarks for recipes with invalid costs in the available VPlans.
560564
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE);
561565

566+
/// Create a check to \p Plan to see if the vector loop should be executed
567+
/// based on its trip count.
568+
void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF,
569+
ElementCount MinProfitableTripCount) const;
570+
562571
protected:
563572
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
564573
/// according to the information gathered by Legal when it checked if it is

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 72 additions & 127 deletions
Large diffs are not rendered by default.

llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -669,6 +669,90 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
669669
}
670670
}
671671

672+
void VPlanTransforms::addMinimumIterationCheck(
673+
VPlan &Plan, ElementCount VF, unsigned UF,
674+
ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue,
675+
bool TailFolded, bool CheckNeededWithTailFolding, Loop *OrigLoop,
676+
const uint32_t *MinItersBypassWeights, DebugLoc DL, ScalarEvolution &SE) {
677+
// Generate code to check if the loop's trip count is less than VF * UF, or
678+
// equal to it in case a scalar epilogue is required; this implies that the
679+
// vector trip count is zero. This check also covers the case where adding one
680+
// to the backedge-taken count overflowed leading to an incorrect trip count
681+
// of zero. In this case we will also jump to the scalar loop.
682+
CmpInst::Predicate CmpPred =
683+
RequiresScalarEpilogue ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
684+
// If tail is to be folded, vector loop takes care of all iterations.
685+
VPValue *TripCountVPV = Plan.getTripCount();
686+
const SCEV *TripCount = vputils::getSCEVExprForVPValue(TripCountVPV, SE);
687+
Type *TripCountTy = TripCount->getType();
688+
auto GetMinTripCount = [&]() -> const SCEV * {
689+
// Compute max(MinProfitableTripCount, UF * VF) and return it.
690+
const SCEV *VFxUF =
691+
SE.getElementCount(TripCountTy, (VF * UF), SCEV::FlagNUW);
692+
if (UF * VF.getKnownMinValue() >=
693+
MinProfitableTripCount.getKnownMinValue()) {
694+
// TODO: SCEV should be able to simplify test.
695+
return VFxUF;
696+
}
697+
const SCEV *MinProfitableTripCountSCEV =
698+
SE.getElementCount(TripCountTy, MinProfitableTripCount, SCEV::FlagNUW);
699+
return SE.getUMaxExpr(MinProfitableTripCountSCEV, VFxUF);
700+
};
701+
702+
VPBasicBlock *EntryVPBB = Plan.getEntry();
703+
VPBuilder Builder(EntryVPBB);
704+
VPValue *TripCountCheck = Plan.getFalse();
705+
const SCEV *Step = GetMinTripCount();
706+
if (TailFolded) {
707+
if (CheckNeededWithTailFolding) {
708+
// vscale is not necessarily a power-of-2, which means we cannot guarantee
709+
// an overflow to zero when updating induction variables and so an
710+
// additional overflow check is required before entering the vector loop.
711+
712+
// Get the maximum unsigned value for the type.
713+
VPValue *MaxUIntTripCount = Plan.getOrAddLiveIn(ConstantInt::get(
714+
TripCountTy, cast<IntegerType>(TripCountTy)->getMask()));
715+
VPValue *DistanceToMax = Builder.createNaryOp(
716+
Instruction::Sub, {MaxUIntTripCount, TripCountVPV},
717+
DebugLoc::getUnknown());
718+
719+
// Don't execute the vector loop if (UMax - n) < (VF * UF).
720+
// FIXME: Should only check VF * UF, but currently checks Step=max(VF*UF,
721+
// minProfitableTripCount).
722+
TripCountCheck = Builder.createICmp(ICmpInst::ICMP_ULT, DistanceToMax,
723+
Builder.createExpandSCEV(Step), DL);
724+
} else {
725+
// TripCountCheck = false, folding tail implies positive vector trip
726+
// count.
727+
}
728+
} else {
729+
// TODO: Emit unconditional branch to vector preheader instead of
730+
// conditional branch with known condition.
731+
TripCount = SE.applyLoopGuards(TripCount, OrigLoop);
732+
// Check if the trip count is < the step.
733+
if (SE.isKnownPredicate(CmpPred, TripCount, Step)) {
734+
// TODO: Ensure step is at most the trip count when determining max VF and
735+
// UF, w/o tail folding.
736+
TripCountCheck = Plan.getTrue();
737+
} else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(CmpPred),
738+
TripCount, Step)) {
739+
// Generate the minimum iteration check only if we cannot prove the
740+
// check is known to be true, or known to be false.
741+
VPValue *MinTripCountVPV = Builder.createExpandSCEV(Step);
742+
TripCountCheck = Builder.createICmp(
743+
CmpPred, TripCountVPV, MinTripCountVPV, DL, "min.iters.check");
744+
} // else step known to be < trip count, use TripCountCheck preset to false.
745+
}
746+
VPInstruction *Term =
747+
Builder.createNaryOp(VPInstruction::BranchOnCond, {TripCountCheck}, DL);
748+
if (MinItersBypassWeights) {
749+
MDBuilder MDB(Plan.getContext());
750+
MDNode *BranchWeights = MDB.createBranchWeights(
751+
ArrayRef(MinItersBypassWeights, 2), /*IsExpected=*/false);
752+
Term->addMetadata(LLVMContext::MD_prof, BranchWeights);
753+
}
754+
}
755+
672756
bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
673757
auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * {
674758
auto *MinMaxR = dyn_cast<VPRecipeWithIRFlags>(

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3525,6 +3525,21 @@ VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
35253525
Plan.resetTripCount(Exp);
35263526
ExpSCEV->eraseFromParent();
35273527
}
3528+
assert(none_of(*Entry, IsaPred<VPExpandSCEVRecipe>) &&
3529+
"VPExpandSCEVRecipes must be at the beginning of the entry block, "
3530+
"after any VPIRInstructions");
3531+
// Add IR instructions in the entry basic block but not in the VPIRBasicBlock
3532+
// to the VPIRBasicBlock.
3533+
auto EI = Entry->begin();
3534+
for (Instruction &I : drop_end(*EntryBB)) {
3535+
if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
3536+
&cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
3537+
EI++;
3538+
continue;
3539+
}
3540+
VPIRInstruction::create(I)->insertBefore(*Entry, EI);
3541+
}
3542+
35283543
return ExpandedSCEVs;
35293544
}
35303545

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,40 @@ struct VPlanTransforms {
6262
/// The created loop is wrapped in an initial skeleton to facilitate
6363
/// vectorization, consisting of a vector pre-header, an exit block for the
6464
/// main vector loop (middle.block) and a new block as preheader of the scalar
65-
/// loop (scalar.ph). It also adds a canonical IV and its increment, using \p
66-
/// InductionTy and \p IVDL, and creates a VPValue expression for the original
67-
/// trip count.
65+
/// loop (scalar.ph). See below for an illustration. It also adds a canonical
66+
/// IV and its increment, using \p InductionTy and \p IVDL, and creates a
67+
/// VPValue expression for the original trip count.
68+
///
69+
/// [ ] <-- Plan's entry VPIRBasicBlock, wrapping the original loop's
70+
/// / \ old preheader. Will contain iteration number check and SCEV
71+
/// | | expansions.
72+
/// | |
73+
/// / v
74+
/// | [ ] <-- vector loop bypass (may consist of multiple blocks) will be
75+
/// | / | added later.
76+
/// | / v
77+
/// || [ ] <-- vector pre header.
78+
/// |/ |
79+
/// | v
80+
/// | [ ] \ <-- plain CFG loop wrapping original loop to be vectorized.
81+
/// | [ ]_|
82+
/// | |
83+
/// | v
84+
/// | [ ] <--- middle-block with the branch to successors
85+
/// | / |
86+
/// | / |
87+
/// | | v
88+
/// \--->[ ] <--- scalar preheader (initial a VPBasicBlock, which will be
89+
/// | | replaced later by a VPIRBasicBlock wrapping the scalar
90+
/// | | preheader basic block.
91+
/// | |
92+
/// v <-- edge from middle to exit iff epilogue is not required.
93+
/// | [ ] \
94+
/// | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue,
95+
/// | | header wrapped in VPIRBasicBlock).
96+
/// \ |
97+
/// \ v
98+
/// >[ ] <-- original loop exit block(s), wrapped in VPIRBasicBlocks.
6899
LLVM_ABI_FOR_TEST static std::unique_ptr<VPlan>
69100
buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL,
70101
PredicatedScalarEvolution &PSE);
@@ -79,6 +110,13 @@ struct VPlanTransforms {
79110
bool RequiresScalarEpilogueCheck,
80111
bool TailFolded);
81112

113+
// Create a check to \p Plan to see if the vector loop should be executed.
114+
static void addMinimumIterationCheck(
115+
VPlan &Plan, ElementCount VF, unsigned UF,
116+
ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue,
117+
bool TailFolded, bool CheckNeededWithTailFolding, Loop *OrigLoop,
118+
const uint32_t *MinItersBypassWeights, DebugLoc DL, ScalarEvolution &SE);
119+
82120
/// Replace loops in \p Plan's flat CFG with VPRegionBlocks, turning \p Plan's
83121
/// flat CFG into a hierarchical CFG.
84122
LLVM_ABI_FOR_TEST static void createLoopRegions(VPlan &Plan);

llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1246,7 +1246,7 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 {
12461246
; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
12471247
; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
12481248
; DEFAULT-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
1249-
; DEFAULT-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP2]])
1249+
; DEFAULT-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2]], i64 8)
12501250
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
12511251
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
12521252
; DEFAULT: [[VECTOR_MEMCHECK]]:

llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
99
; CHECK-NEXT: [[CONV61:%.*]] = zext i32 [[X]] to i64
1010
; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
1111
; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP10]], 2
12-
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP1]])
12+
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8)
1313
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
1414
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
1515
; CHECK: [[VECTOR_SCEVCHECK]]:

llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,22 +9,20 @@ define void @f1(ptr %A) #0 {
99
; CHECK-LABEL: define void @f1
1010
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] {
1111
; CHECK-NEXT: entry:
12-
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
13-
; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
1412
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1513
; CHECK: vector.ph:
16-
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
17-
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
18-
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
14+
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
15+
; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
16+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
1917
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
2018
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
2119
; CHECK: vector.body:
2220
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
23-
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
24-
; CHECK-NEXT: store <vscale x 4 x i32> splat (i32 1), ptr [[TMP4]], align 4
25-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
26-
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
27-
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
21+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
22+
; CHECK-NEXT: store <vscale x 4 x i32> splat (i32 1), ptr [[TMP2]], align 4
23+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
24+
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
25+
; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
2826
; CHECK: middle.block:
2927
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
3028
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]

0 commit comments

Comments
 (0)