-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[VPlan] Add VPlan-based addMinIterCheck, replace ILV for non-epilogue. #153643
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-llvm-analysis Author: Florian Hahn (fhahn) ChangesThis patch adds a new VPlan-based addMinimumIterationCheck, which replaced the ILV version for the non-epilogue case. The VPlan-based version constructs a SCEV expression to compute the minimum iterations, use that to check if the check is known true or false. Otherwise it creates a VPExpandSCEV recipe and emits a compare-and-branch. When using epilogue vectorization, we still need to create the minimum trip-count-check during the legacy skeleton creation. The patch moves the definitions out of ILV. Patch is 127.29 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/153643.diff 34 Files Affected:
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index 167845ce646b9..858c1d5392071 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -569,7 +569,9 @@ class ScalarEvolution {
LLVM_ABI const SCEV *getTruncateExpr(const SCEV *Op, Type *Ty,
unsigned Depth = 0);
LLVM_ABI const SCEV *getVScale(Type *Ty);
- LLVM_ABI const SCEV *getElementCount(Type *Ty, ElementCount EC);
+ LLVM_ABI const SCEV *
+ getElementCount(Type *Ty, ElementCount EC,
+ SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap);
LLVM_ABI const SCEV *getZeroExtendExpr(const SCEV *Op, Type *Ty,
unsigned Depth = 0);
LLVM_ABI const SCEV *getZeroExtendExprImpl(const SCEV *Op, Type *Ty,
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index c20b1bdaf94c7..3b06cb24986e6 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -501,10 +501,11 @@ const SCEV *ScalarEvolution::getVScale(Type *Ty) {
return S;
}
-const SCEV *ScalarEvolution::getElementCount(Type *Ty, ElementCount EC) {
+const SCEV *ScalarEvolution::getElementCount(Type *Ty, ElementCount EC,
+ SCEV::NoWrapFlags Flags) {
const SCEV *Res = getConstant(Ty, EC.getKnownMinValue());
if (EC.isScalable())
- Res = getMulExpr(Res, getVScale(Ty));
+ Res = getMulExpr(Res, getVScale(Ty), Flags);
return Res;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 4856ebebb596f..579db63bb8628 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -332,6 +332,10 @@ class VPBuilder {
FPBinOp ? FPBinOp->getFastMathFlags() : FastMathFlags(), DL));
}
+ VPExpandSCEVRecipe *expandSCEV(const SCEV *Expr, ScalarEvolution &SE) {
+ return tryInsertInstruction(new VPExpandSCEVRecipe(Expr, SE));
+ }
+
//===--------------------------------------------------------------------===//
// RAII helpers.
//===--------------------------------------------------------------------===//
@@ -557,6 +561,10 @@ class LoopVectorizationPlanner {
/// Emit remarks for recipes with invalid costs in the available VPlans.
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE);
+ /// Create a check to \p Plan to see if the vector loop should be executed.
+ void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF,
+ ElementCount MinProfitableTripCount) const;
+
protected:
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 09c9e63ff6a25..fbcd2b1004971 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -500,26 +500,23 @@ class InnerLoopVectorizer {
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
LoopInfo *LI, DominatorTree *DT,
const TargetTransformInfo *TTI, AssumptionCache *AC,
- ElementCount VecWidth,
- ElementCount MinProfitableTripCount,
- unsigned UnrollFactor, LoopVectorizationCostModel *CM,
- BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
- GeneratedRTChecks &RTChecks, VPlan &Plan)
+ ElementCount VecWidth, unsigned UnrollFactor,
+ LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
+ VPlan &Plan)
: OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC),
- VF(VecWidth), MinProfitableTripCount(MinProfitableTripCount),
- UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Cost(CM),
- BFI(BFI), PSI(PSI), RTChecks(RTChecks), Plan(Plan),
+ VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
+ Cost(CM), BFI(BFI), PSI(PSI), RTChecks(RTChecks), Plan(Plan),
VectorPHVPBB(cast<VPBasicBlock>(
Plan.getVectorLoopRegion()->getSinglePredecessor())) {}
virtual ~InnerLoopVectorizer() = default;
- /// Create a new empty loop that will contain vectorized instructions later
- /// on, while the old loop will be used as the scalar remainder. Control flow
- /// is generated around the vectorized (and scalar epilogue) loops consisting
- /// of various checks and bypasses. Return the pre-header block of the new
- /// loop. In the case of epilogue vectorization, this function is overriden to
- /// handle the more complex control flow around the loops.
+ /// Create new basic blocks needed as entries and exits of the code generated
+ /// by executing a VPlan. For epilogue vectorization, it will create blocks
+ /// for the minimum iteration checks and IR basic blocks for the vector and
+ /// scalar preheaders. Otherwise it will create a basic block for the scalar
+ /// preheader only.
virtual BasicBlock *createVectorizedLoopSkeleton();
/// Fix the vectorized code, taking care of header phi's, and more.
@@ -547,16 +544,8 @@ class InnerLoopVectorizer {
protected:
friend class LoopVectorizationPlanner;
- // Create a check to see if the vector loop should be executed
- Value *createIterationCountCheck(ElementCount VF, unsigned UF) const;
-
- /// Emit a bypass check to see if the vector trip count is zero, including if
- /// it overflows.
- void emitIterationCountCheck(BasicBlock *Bypass);
-
- /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
- /// vector loop preheader, middle block and scalar preheader.
- void createVectorLoopSkeleton(StringRef Prefix);
+ /// Create a new IR basic block for the scalar preheader.
+ void createScalarPreheader(StringRef Prefix);
/// Allow subclasses to override and print debug traces before/after vplan
/// execution, when trace information is requested.
@@ -592,8 +581,6 @@ class InnerLoopVectorizer {
/// vector elements.
ElementCount VF;
- ElementCount MinProfitableTripCount;
-
/// The vectorization unroll factor to use. Each scalar is vectorized to this
/// many different vector instructions.
unsigned UF;
@@ -679,9 +666,8 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth,
ElementCount MinProfitableTripCount, unsigned UnrollFactor)
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, VecWidth,
- MinProfitableTripCount, UnrollFactor, CM, BFI, PSI,
- Checks, Plan),
- EPI(EPI) {}
+ UnrollFactor, CM, BFI, PSI, Checks, Plan),
+ EPI(EPI), MinProfitableTripCount(MinProfitableTripCount) {}
// Override this function to handle the more complex control flow around the
// three loops.
@@ -701,6 +687,9 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
/// iteration count of the loop is so small that the main vector loop is
/// completely skipped.
EpilogueLoopVectorizationInfo &EPI;
+
+protected:
+ ElementCount MinProfitableTripCount;
};
/// A specialized derived class of inner loop vectorizer that performs
@@ -724,6 +713,9 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
BasicBlock *createEpilogueVectorizedLoopSkeleton() final;
protected:
+ // Create a check to see if the vector loop should be executed
+ Value *createIterationCountCheck(ElementCount VF, unsigned UF) const;
+
/// Emits an iteration count bypass check once for the main loop (when \p
/// ForEpilogue is false) and once for the epilogue loop (when \p
/// ForEpilogue is true).
@@ -2292,7 +2284,8 @@ void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
}
}
-Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
+Value *
+EpilogueVectorizerMainLoop::createIterationCountCheck(ElementCount VF,
unsigned UF) const {
// Generate code to check if the loop's trip count is less than VF * UF, or
// equal to it in case a scalar epilogue is required; this implies that the
@@ -2363,25 +2356,6 @@ Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
return CheckMinIters;
}
-void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
- BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
- Value *CheckMinIters = createIterationCountCheck(VF, UF);
- // Create new preheader for vector loop.
- LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
- static_cast<DominatorTree *>(nullptr), LI,
- nullptr, "vector.ph");
-
- BranchInst &BI =
- *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
- if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
- setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
- ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
-
- assert(cast<VPIRBasicBlock>(Plan.getEntry())->getIRBasicBlock() ==
- TCCheckBlock &&
- "Plan's entry must be TCCCheckBlock");
-}
-
/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
@@ -2402,7 +2376,7 @@ static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPBasicBlock *VPBB,
return IRVPBB;
}
-void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
+void InnerLoopVectorizer::createScalarPreheader(StringRef Prefix) {
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
assert(LoopVectorPreHeader && "Invalid loop structure");
assert((OrigLoop->getUniqueLatchExitBlock() ||
@@ -2414,8 +2388,8 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
LI, nullptr, Twine(Prefix) + "scalar.ph");
// NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
// wrapping LoopScalarPreHeader here at the moment, because the Plan's scalar
- // preheader may be unreachable at this point. Instead it is replaced in
- // createVectorizedLoopSkeleton.
+ // preheader may be unreachable at this point and SCEV expansion may add to it
+ // later.
}
/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
@@ -2456,53 +2430,9 @@ static void addFullyUnrolledInstructionsToIgnore(
}
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
- /*
- In this function we generate a new loop. The new loop will contain
- the vectorized instructions while the old loop will continue to run the
- scalar remainder.
-
- [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
- / | preheader are expanded here. Eventually all required SCEV
- / | expansion should happen here.
- / v
- | [ ] <-- vector loop bypass (may consist of multiple blocks).
- | / |
- | / v
- || [ ] <-- vector pre header.
- |/ |
- | v
- | [ ] \
- | [ ]_| <-- vector loop (created during VPlan execution).
- | |
- | v
- \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
- | | successors created during VPlan execution)
- \/ |
- /\ v
- | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock).
- | |
- (opt) v <-- edge from middle to exit iff epilogue is not required.
- | [ ] \
- | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header
- | | wrapped in VPIRBasicBlock).
- \ |
- \ v
- >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock)
- ...
- */
-
- // Create an empty vector loop, and prepare basic blocks for the runtime
- // checks.
- createVectorLoopSkeleton("");
-
- // Now, compare the new count to zero. If it is zero skip the vector loop and
- // jump to the scalar loop. This check also covers the case where the
- // backedge-taken count is uint##_max: adding one to it will overflow leading
- // to an incorrect trip count of zero. In this (rare) case we will also jump
- // to the scalar loop.
- emitIterationCountCheck(LoopScalarPreHeader);
-
- replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader);
+ // Create a new IR basic block for the scalar preheader.
+ createScalarPreheader("");
+
return LoopVectorPreHeader;
}
@@ -7332,6 +7262,17 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
BestVPlan.resetTripCount(Exp);
ExpSCEV->eraseFromParent();
}
+ // SCEV expansion will add new instructions in the IRBB wrapped by Entry.
+ // Remove existing VPIRInstructions/VPIRPhi and re-create them to make sure
+ // all IR instructions are wrapped. Otherwise VPInstructions may be inserted
+ // at the wrong place.
+ for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
+ if (!isa<VPIRInstruction, VPIRPhi>(&R))
+ continue;
+ R.eraseFromParent();
+ }
+ for (Instruction &I : drop_begin(reverse(*Entry->getIRBasicBlock())))
+ VPIRInstruction::create(I)->insertBefore(*Entry, Entry->begin());
if (!ILV.getTripCount())
ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
@@ -7460,7 +7401,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
- createVectorLoopSkeleton("");
+ createScalarPreheader("");
// Generate the code to check the minimum iteration count of the vector
// epilogue (see below).
@@ -7547,7 +7488,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
BasicBlock *
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
- createVectorLoopSkeleton("vec.epilog.");
+ createScalarPreheader("vec.epilog.");
// Now, compare the remaining count and if there aren't enough iterations to
// execute the vectorized epilogue skip to the scalar part.
@@ -9353,6 +9294,28 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
}
}
+void LoopVectorizationPlanner::addMinimumIterationCheck(
+ VPlan &Plan, ElementCount VF, unsigned UF,
+ ElementCount MinProfitableTripCount) const {
+ // vscale is not necessarily a power-of-2, which means we cannot guarantee
+ // an overflow to zero when updating induction variables and so an
+ // additional overflow check is required before entering the vector loop.
+ bool CheckNeededWithTailFolding =
+ VF.isScalable() && !TTI.isVScaleKnownToBeAPowerOfTwo() &&
+ !isIndvarOverflowCheckKnownFalse(&CM, VF, 1) &&
+ CM.getTailFoldingStyle() !=
+ TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+ VPlanTransforms::addMinimumIterationCheck(
+ Plan, VF, UF, MinProfitableTripCount,
+ CM.requiresScalarEpilogue(VF.isVector()), CM.foldTailByMasking(),
+ CheckNeededWithTailFolding, OrigLoop,
+ hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())
+ ? &MinItersBypassWeights[0]
+ : nullptr,
+ OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(),
+ *PSE.getSE());
+}
+
void VPDerivedIVRecipe::execute(VPTransformState &State) {
assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
@@ -9468,10 +9431,13 @@ static bool processLoopInVPlanNativePath(
{
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
- InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, VF.Width, 1, &CM,
- BFI, PSI, Checks, BestPlan);
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, 1, &CM, BFI, PSI,
+ Checks, BestPlan);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
<< L->getHeader()->getParent()->getName() << "\"\n");
+ LVP.addMinimumIterationCheck(BestPlan, VF.Width, 1,
+ VF.MinProfitableTripCount);
+
LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
}
@@ -10250,16 +10216,17 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// If we decided that it is not legal to vectorize the loop, then
// interleave it.
VPlan &BestPlan = LVP.getPlanFor(VF.Width);
- InnerLoopVectorizer Unroller(
- L, PSE, LI, DT, TTI, AC, ElementCount::getFixed(1),
- ElementCount::getFixed(1), IC, &CM, BFI, PSI, Checks, BestPlan);
+ InnerLoopVectorizer Unroller(L, PSE, LI, DT, TTI, AC,
+ ElementCount::getFixed(1), IC, &CM, BFI, PSI,
+ Checks, BestPlan);
// TODO: Move to general VPlan pipeline once epilogue loops are also
// supported.
VPlanTransforms::runPass(
VPlanTransforms::materializeConstantVectorTripCount, BestPlan,
VF.Width, IC, PSE);
-
+ LVP.addMinimumIterationCheck(BestPlan, VF.Width, IC,
+ VF.MinProfitableTripCount);
LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
ORE->emit([&]() {
@@ -10320,14 +10287,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (!Checks.hasChecks())
DisableRuntimeUnroll = true;
} else {
- InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width,
- VF.MinProfitableTripCount, IC, &CM, BFI, PSI,
- Checks, BestPlan);
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, BFI,
+ PSI, Checks, BestPlan);
// TODO: Move to general VPlan pipeline once epilogue loops are also
// supported.
VPlanTransforms::runPass(
VPlanTransforms::materializeConstantVectorTripCount, BestPlan,
VF.Width, IC, PSE);
+ LVP.addMinimumIterationCheck(BestPlan, VF.Width, IC,
+ VF.MinProfitableTripCount);
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
++LoopsVectorized;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index b231a8429503f..4f36de870d067 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -670,6 +670,80 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
}
}
+void VPlanTransforms::addMinimumIterationCheck(
+ VPlan &Plan, ElementCount VF, unsigned UF,
+ ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue,
+ bool TailFolded, bool CheckNeededWithTailFolding, Loop *OrigLoop,
+ const uint32_t *MinItersBypassWeights, DebugLoc DL, ScalarEvolution &SE) {
+ // Generate code to check if the loop's trip count is less than VF * UF, or
+ // equal to it in case a scalar epilogue is required; this implies that the
+ // vector trip count is zero. This check also covers the case where adding one
+ // to the backedge-taken count overflowed leading to an incorrect trip count
+ // of zero. In this case we will also jump to the scalar loop.
+ auto P = RequiresScalarEpilogue ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+ // If tail is to be folded, vector loop takes care of all iterations.
+ const SCEV *Count = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
+ Type *CountTy = Count->getType();
+ auto CreateStep = [&]() -> const SCEV * {
+ const SCEV *VFxUF = SE.getElementCount(CountTy, (VF * UF), SCEV::FlagNUW);
+ // Create step with max(MinProTripCount, UF * VF).
+ i...
[truncated]
|
@llvm/pr-subscribers-backend-risc-v Author: Florian Hahn (fhahn) ChangesThis patch adds a new VPlan-based addMinimumIterationCheck, which replaced the ILV version for the non-epilogue case. The VPlan-based version constructs a SCEV expression to compute the minimum iterations, use that to check if the check is known true or false. Otherwise it creates a VPExpandSCEV recipe and emits a compare-and-branch. When using epilogue vectorization, we still need to create the minimum trip-count-check during the legacy skeleton creation. The patch moves the definitions out of ILV. Patch is 127.29 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/153643.diff 34 Files Affected:
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index 167845ce646b9..858c1d5392071 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -569,7 +569,9 @@ class ScalarEvolution {
LLVM_ABI const SCEV *getTruncateExpr(const SCEV *Op, Type *Ty,
unsigned Depth = 0);
LLVM_ABI const SCEV *getVScale(Type *Ty);
- LLVM_ABI const SCEV *getElementCount(Type *Ty, ElementCount EC);
+ LLVM_ABI const SCEV *
+ getElementCount(Type *Ty, ElementCount EC,
+ SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap);
LLVM_ABI const SCEV *getZeroExtendExpr(const SCEV *Op, Type *Ty,
unsigned Depth = 0);
LLVM_ABI const SCEV *getZeroExtendExprImpl(const SCEV *Op, Type *Ty,
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index c20b1bdaf94c7..3b06cb24986e6 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -501,10 +501,11 @@ const SCEV *ScalarEvolution::getVScale(Type *Ty) {
return S;
}
-const SCEV *ScalarEvolution::getElementCount(Type *Ty, ElementCount EC) {
+const SCEV *ScalarEvolution::getElementCount(Type *Ty, ElementCount EC,
+ SCEV::NoWrapFlags Flags) {
const SCEV *Res = getConstant(Ty, EC.getKnownMinValue());
if (EC.isScalable())
- Res = getMulExpr(Res, getVScale(Ty));
+ Res = getMulExpr(Res, getVScale(Ty), Flags);
return Res;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 4856ebebb596f..579db63bb8628 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -332,6 +332,10 @@ class VPBuilder {
FPBinOp ? FPBinOp->getFastMathFlags() : FastMathFlags(), DL));
}
+ VPExpandSCEVRecipe *expandSCEV(const SCEV *Expr, ScalarEvolution &SE) {
+ return tryInsertInstruction(new VPExpandSCEVRecipe(Expr, SE));
+ }
+
//===--------------------------------------------------------------------===//
// RAII helpers.
//===--------------------------------------------------------------------===//
@@ -557,6 +561,10 @@ class LoopVectorizationPlanner {
/// Emit remarks for recipes with invalid costs in the available VPlans.
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE);
+ /// Create a check to \p Plan to see if the vector loop should be executed.
+ void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF,
+ ElementCount MinProfitableTripCount) const;
+
protected:
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 09c9e63ff6a25..fbcd2b1004971 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -500,26 +500,23 @@ class InnerLoopVectorizer {
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
LoopInfo *LI, DominatorTree *DT,
const TargetTransformInfo *TTI, AssumptionCache *AC,
- ElementCount VecWidth,
- ElementCount MinProfitableTripCount,
- unsigned UnrollFactor, LoopVectorizationCostModel *CM,
- BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
- GeneratedRTChecks &RTChecks, VPlan &Plan)
+ ElementCount VecWidth, unsigned UnrollFactor,
+ LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
+ VPlan &Plan)
: OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC),
- VF(VecWidth), MinProfitableTripCount(MinProfitableTripCount),
- UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Cost(CM),
- BFI(BFI), PSI(PSI), RTChecks(RTChecks), Plan(Plan),
+ VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
+ Cost(CM), BFI(BFI), PSI(PSI), RTChecks(RTChecks), Plan(Plan),
VectorPHVPBB(cast<VPBasicBlock>(
Plan.getVectorLoopRegion()->getSinglePredecessor())) {}
virtual ~InnerLoopVectorizer() = default;
- /// Create a new empty loop that will contain vectorized instructions later
- /// on, while the old loop will be used as the scalar remainder. Control flow
- /// is generated around the vectorized (and scalar epilogue) loops consisting
- /// of various checks and bypasses. Return the pre-header block of the new
- /// loop. In the case of epilogue vectorization, this function is overriden to
- /// handle the more complex control flow around the loops.
+ /// Create new basic blocks needed as entries and exits of the code generated
+ /// by executing a VPlan. For epilogue vectorization, it will create blocks
+ /// for the minimum iteration checks and IR basic blocks for the vector and
+ /// scalar preheaders. Otherwise it will create a basic block for the scalar
+ /// preheader only.
virtual BasicBlock *createVectorizedLoopSkeleton();
/// Fix the vectorized code, taking care of header phi's, and more.
@@ -547,16 +544,8 @@ class InnerLoopVectorizer {
protected:
friend class LoopVectorizationPlanner;
- // Create a check to see if the vector loop should be executed
- Value *createIterationCountCheck(ElementCount VF, unsigned UF) const;
-
- /// Emit a bypass check to see if the vector trip count is zero, including if
- /// it overflows.
- void emitIterationCountCheck(BasicBlock *Bypass);
-
- /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
- /// vector loop preheader, middle block and scalar preheader.
- void createVectorLoopSkeleton(StringRef Prefix);
+ /// Create a new IR basic block for the scalar preheader.
+ void createScalarPreheader(StringRef Prefix);
/// Allow subclasses to override and print debug traces before/after vplan
/// execution, when trace information is requested.
@@ -592,8 +581,6 @@ class InnerLoopVectorizer {
/// vector elements.
ElementCount VF;
- ElementCount MinProfitableTripCount;
-
/// The vectorization unroll factor to use. Each scalar is vectorized to this
/// many different vector instructions.
unsigned UF;
@@ -679,9 +666,8 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth,
ElementCount MinProfitableTripCount, unsigned UnrollFactor)
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, VecWidth,
- MinProfitableTripCount, UnrollFactor, CM, BFI, PSI,
- Checks, Plan),
- EPI(EPI) {}
+ UnrollFactor, CM, BFI, PSI, Checks, Plan),
+ EPI(EPI), MinProfitableTripCount(MinProfitableTripCount) {}
// Override this function to handle the more complex control flow around the
// three loops.
@@ -701,6 +687,9 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
/// iteration count of the loop is so small that the main vector loop is
/// completely skipped.
EpilogueLoopVectorizationInfo &EPI;
+
+protected:
+ ElementCount MinProfitableTripCount;
};
/// A specialized derived class of inner loop vectorizer that performs
@@ -724,6 +713,9 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
BasicBlock *createEpilogueVectorizedLoopSkeleton() final;
protected:
+ // Create a check to see if the vector loop should be executed
+ Value *createIterationCountCheck(ElementCount VF, unsigned UF) const;
+
/// Emits an iteration count bypass check once for the main loop (when \p
/// ForEpilogue is false) and once for the epilogue loop (when \p
/// ForEpilogue is true).
@@ -2292,7 +2284,8 @@ void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
}
}
-Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
+Value *
+EpilogueVectorizerMainLoop::createIterationCountCheck(ElementCount VF,
unsigned UF) const {
// Generate code to check if the loop's trip count is less than VF * UF, or
// equal to it in case a scalar epilogue is required; this implies that the
@@ -2363,25 +2356,6 @@ Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
return CheckMinIters;
}
-void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
- BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
- Value *CheckMinIters = createIterationCountCheck(VF, UF);
- // Create new preheader for vector loop.
- LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
- static_cast<DominatorTree *>(nullptr), LI,
- nullptr, "vector.ph");
-
- BranchInst &BI =
- *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
- if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
- setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
- ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
-
- assert(cast<VPIRBasicBlock>(Plan.getEntry())->getIRBasicBlock() ==
- TCCheckBlock &&
- "Plan's entry must be TCCCheckBlock");
-}
-
/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
@@ -2402,7 +2376,7 @@ static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPBasicBlock *VPBB,
return IRVPBB;
}
-void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
+void InnerLoopVectorizer::createScalarPreheader(StringRef Prefix) {
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
assert(LoopVectorPreHeader && "Invalid loop structure");
assert((OrigLoop->getUniqueLatchExitBlock() ||
@@ -2414,8 +2388,8 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
LI, nullptr, Twine(Prefix) + "scalar.ph");
// NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
// wrapping LoopScalarPreHeader here at the moment, because the Plan's scalar
- // preheader may be unreachable at this point. Instead it is replaced in
- // createVectorizedLoopSkeleton.
+ // preheader may be unreachable at this point and SCEV expansion may add to it
+ // later.
}
/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
@@ -2456,53 +2430,9 @@ static void addFullyUnrolledInstructionsToIgnore(
}
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
- /*
- In this function we generate a new loop. The new loop will contain
- the vectorized instructions while the old loop will continue to run the
- scalar remainder.
-
- [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
- / | preheader are expanded here. Eventually all required SCEV
- / | expansion should happen here.
- / v
- | [ ] <-- vector loop bypass (may consist of multiple blocks).
- | / |
- | / v
- || [ ] <-- vector pre header.
- |/ |
- | v
- | [ ] \
- | [ ]_| <-- vector loop (created during VPlan execution).
- | |
- | v
- \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
- | | successors created during VPlan execution)
- \/ |
- /\ v
- | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock).
- | |
- (opt) v <-- edge from middle to exit iff epilogue is not required.
- | [ ] \
- | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header
- | | wrapped in VPIRBasicBlock).
- \ |
- \ v
- >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock)
- ...
- */
-
- // Create an empty vector loop, and prepare basic blocks for the runtime
- // checks.
- createVectorLoopSkeleton("");
-
- // Now, compare the new count to zero. If it is zero skip the vector loop and
- // jump to the scalar loop. This check also covers the case where the
- // backedge-taken count is uint##_max: adding one to it will overflow leading
- // to an incorrect trip count of zero. In this (rare) case we will also jump
- // to the scalar loop.
- emitIterationCountCheck(LoopScalarPreHeader);
-
- replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader);
+ // Create a new IR basic block for the scalar preheader.
+ createScalarPreheader("");
+
return LoopVectorPreHeader;
}
@@ -7332,6 +7262,17 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
BestVPlan.resetTripCount(Exp);
ExpSCEV->eraseFromParent();
}
+ // SCEV expansion will add new instructions in the IRBB wrapped by Entry.
+ // Remove existing VPIRInstructions/VPIRPhi and re-create them to make sure
+ // all IR instructions are wrapped. Otherwise VPInstructions may be inserted
+ // at the wrong place.
+ for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
+ if (!isa<VPIRInstruction, VPIRPhi>(&R))
+ continue;
+ R.eraseFromParent();
+ }
+ for (Instruction &I : drop_begin(reverse(*Entry->getIRBasicBlock())))
+ VPIRInstruction::create(I)->insertBefore(*Entry, Entry->begin());
if (!ILV.getTripCount())
ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
@@ -7460,7 +7401,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
- createVectorLoopSkeleton("");
+ createScalarPreheader("");
// Generate the code to check the minimum iteration count of the vector
// epilogue (see below).
@@ -7547,7 +7488,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
BasicBlock *
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
- createVectorLoopSkeleton("vec.epilog.");
+ createScalarPreheader("vec.epilog.");
// Now, compare the remaining count and if there aren't enough iterations to
// execute the vectorized epilogue skip to the scalar part.
@@ -9353,6 +9294,28 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
}
}
+void LoopVectorizationPlanner::addMinimumIterationCheck(
+ VPlan &Plan, ElementCount VF, unsigned UF,
+ ElementCount MinProfitableTripCount) const {
+ // vscale is not necessarily a power-of-2, which means we cannot guarantee
+ // an overflow to zero when updating induction variables and so an
+ // additional overflow check is required before entering the vector loop.
+ bool CheckNeededWithTailFolding =
+ VF.isScalable() && !TTI.isVScaleKnownToBeAPowerOfTwo() &&
+ !isIndvarOverflowCheckKnownFalse(&CM, VF, 1) &&
+ CM.getTailFoldingStyle() !=
+ TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+ VPlanTransforms::addMinimumIterationCheck(
+ Plan, VF, UF, MinProfitableTripCount,
+ CM.requiresScalarEpilogue(VF.isVector()), CM.foldTailByMasking(),
+ CheckNeededWithTailFolding, OrigLoop,
+ hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())
+ ? &MinItersBypassWeights[0]
+ : nullptr,
+ OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(),
+ *PSE.getSE());
+}
+
void VPDerivedIVRecipe::execute(VPTransformState &State) {
assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
@@ -9468,10 +9431,13 @@ static bool processLoopInVPlanNativePath(
{
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
- InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, VF.Width, 1, &CM,
- BFI, PSI, Checks, BestPlan);
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, 1, &CM, BFI, PSI,
+ Checks, BestPlan);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
<< L->getHeader()->getParent()->getName() << "\"\n");
+ LVP.addMinimumIterationCheck(BestPlan, VF.Width, 1,
+ VF.MinProfitableTripCount);
+
LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
}
@@ -10250,16 +10216,17 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// If we decided that it is not legal to vectorize the loop, then
// interleave it.
VPlan &BestPlan = LVP.getPlanFor(VF.Width);
- InnerLoopVectorizer Unroller(
- L, PSE, LI, DT, TTI, AC, ElementCount::getFixed(1),
- ElementCount::getFixed(1), IC, &CM, BFI, PSI, Checks, BestPlan);
+ InnerLoopVectorizer Unroller(L, PSE, LI, DT, TTI, AC,
+ ElementCount::getFixed(1), IC, &CM, BFI, PSI,
+ Checks, BestPlan);
// TODO: Move to general VPlan pipeline once epilogue loops are also
// supported.
VPlanTransforms::runPass(
VPlanTransforms::materializeConstantVectorTripCount, BestPlan,
VF.Width, IC, PSE);
-
+ LVP.addMinimumIterationCheck(BestPlan, VF.Width, IC,
+ VF.MinProfitableTripCount);
LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
ORE->emit([&]() {
@@ -10320,14 +10287,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (!Checks.hasChecks())
DisableRuntimeUnroll = true;
} else {
- InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width,
- VF.MinProfitableTripCount, IC, &CM, BFI, PSI,
- Checks, BestPlan);
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, BFI,
+ PSI, Checks, BestPlan);
// TODO: Move to general VPlan pipeline once epilogue loops are also
// supported.
VPlanTransforms::runPass(
VPlanTransforms::materializeConstantVectorTripCount, BestPlan,
VF.Width, IC, PSE);
+ LVP.addMinimumIterationCheck(BestPlan, VF.Width, IC,
+ VF.MinProfitableTripCount);
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
++LoopsVectorized;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index b231a8429503f..4f36de870d067 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -670,6 +670,80 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
}
}
+void VPlanTransforms::addMinimumIterationCheck(
+ VPlan &Plan, ElementCount VF, unsigned UF,
+ ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue,
+ bool TailFolded, bool CheckNeededWithTailFolding, Loop *OrigLoop,
+ const uint32_t *MinItersBypassWeights, DebugLoc DL, ScalarEvolution &SE) {
+ // Generate code to check if the loop's trip count is less than VF * UF, or
+ // equal to it in case a scalar epilogue is required; this implies that the
+ // vector trip count is zero. This check also covers the case where adding one
+ // to the backedge-taken count overflowed leading to an incorrect trip count
+ // of zero. In this case we will also jump to the scalar loop.
+ auto P = RequiresScalarEpilogue ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+ // If tail is to be folded, vector loop takes care of all iterations.
+ const SCEV *Count = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
+ Type *CountTy = Count->getType();
+ auto CreateStep = [&]() -> const SCEV * {
+ const SCEV *VFxUF = SE.getElementCount(CountTy, (VF * UF), SCEV::FlagNUW);
+ // Create step with max(MinProTripCount, UF * VF).
+ i...
[truncated]
|
@llvm/pr-subscribers-vectorizers Author: Florian Hahn (fhahn) ChangesThis patch adds a new VPlan-based addMinimumIterationCheck, which replaced the ILV version for the non-epilogue case. The VPlan-based version constructs a SCEV expression to compute the minimum iterations, use that to check if the check is known true or false. Otherwise it creates a VPExpandSCEV recipe and emits a compare-and-branch. When using epilogue vectorization, we still need to create the minimum trip-count-check during the legacy skeleton creation. The patch moves the definitions out of ILV. Patch is 127.29 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/153643.diff 34 Files Affected:
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index 167845ce646b9..858c1d5392071 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -569,7 +569,9 @@ class ScalarEvolution {
LLVM_ABI const SCEV *getTruncateExpr(const SCEV *Op, Type *Ty,
unsigned Depth = 0);
LLVM_ABI const SCEV *getVScale(Type *Ty);
- LLVM_ABI const SCEV *getElementCount(Type *Ty, ElementCount EC);
+ LLVM_ABI const SCEV *
+ getElementCount(Type *Ty, ElementCount EC,
+ SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap);
LLVM_ABI const SCEV *getZeroExtendExpr(const SCEV *Op, Type *Ty,
unsigned Depth = 0);
LLVM_ABI const SCEV *getZeroExtendExprImpl(const SCEV *Op, Type *Ty,
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index c20b1bdaf94c7..3b06cb24986e6 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -501,10 +501,11 @@ const SCEV *ScalarEvolution::getVScale(Type *Ty) {
return S;
}
-const SCEV *ScalarEvolution::getElementCount(Type *Ty, ElementCount EC) {
+const SCEV *ScalarEvolution::getElementCount(Type *Ty, ElementCount EC,
+ SCEV::NoWrapFlags Flags) {
const SCEV *Res = getConstant(Ty, EC.getKnownMinValue());
if (EC.isScalable())
- Res = getMulExpr(Res, getVScale(Ty));
+ Res = getMulExpr(Res, getVScale(Ty), Flags);
return Res;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 4856ebebb596f..579db63bb8628 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -332,6 +332,10 @@ class VPBuilder {
FPBinOp ? FPBinOp->getFastMathFlags() : FastMathFlags(), DL));
}
+ VPExpandSCEVRecipe *expandSCEV(const SCEV *Expr, ScalarEvolution &SE) {
+ return tryInsertInstruction(new VPExpandSCEVRecipe(Expr, SE));
+ }
+
//===--------------------------------------------------------------------===//
// RAII helpers.
//===--------------------------------------------------------------------===//
@@ -557,6 +561,10 @@ class LoopVectorizationPlanner {
/// Emit remarks for recipes with invalid costs in the available VPlans.
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE);
+ /// Create a check to \p Plan to see if the vector loop should be executed.
+ void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF,
+ ElementCount MinProfitableTripCount) const;
+
protected:
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 09c9e63ff6a25..fbcd2b1004971 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -500,26 +500,23 @@ class InnerLoopVectorizer {
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
LoopInfo *LI, DominatorTree *DT,
const TargetTransformInfo *TTI, AssumptionCache *AC,
- ElementCount VecWidth,
- ElementCount MinProfitableTripCount,
- unsigned UnrollFactor, LoopVectorizationCostModel *CM,
- BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
- GeneratedRTChecks &RTChecks, VPlan &Plan)
+ ElementCount VecWidth, unsigned UnrollFactor,
+ LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
+ VPlan &Plan)
: OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC),
- VF(VecWidth), MinProfitableTripCount(MinProfitableTripCount),
- UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Cost(CM),
- BFI(BFI), PSI(PSI), RTChecks(RTChecks), Plan(Plan),
+ VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
+ Cost(CM), BFI(BFI), PSI(PSI), RTChecks(RTChecks), Plan(Plan),
VectorPHVPBB(cast<VPBasicBlock>(
Plan.getVectorLoopRegion()->getSinglePredecessor())) {}
virtual ~InnerLoopVectorizer() = default;
- /// Create a new empty loop that will contain vectorized instructions later
- /// on, while the old loop will be used as the scalar remainder. Control flow
- /// is generated around the vectorized (and scalar epilogue) loops consisting
- /// of various checks and bypasses. Return the pre-header block of the new
- /// loop. In the case of epilogue vectorization, this function is overriden to
- /// handle the more complex control flow around the loops.
+ /// Create new basic blocks needed as entries and exits of the code generated
+ /// by executing a VPlan. For epilogue vectorization, it will create blocks
+ /// for the minimum iteration checks and IR basic blocks for the vector and
+ /// scalar preheaders. Otherwise it will create a basic block for the scalar
+ /// preheader only.
virtual BasicBlock *createVectorizedLoopSkeleton();
/// Fix the vectorized code, taking care of header phi's, and more.
@@ -547,16 +544,8 @@ class InnerLoopVectorizer {
protected:
friend class LoopVectorizationPlanner;
- // Create a check to see if the vector loop should be executed
- Value *createIterationCountCheck(ElementCount VF, unsigned UF) const;
-
- /// Emit a bypass check to see if the vector trip count is zero, including if
- /// it overflows.
- void emitIterationCountCheck(BasicBlock *Bypass);
-
- /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
- /// vector loop preheader, middle block and scalar preheader.
- void createVectorLoopSkeleton(StringRef Prefix);
+ /// Create a new IR basic block for the scalar preheader.
+ void createScalarPreheader(StringRef Prefix);
/// Allow subclasses to override and print debug traces before/after vplan
/// execution, when trace information is requested.
@@ -592,8 +581,6 @@ class InnerLoopVectorizer {
/// vector elements.
ElementCount VF;
- ElementCount MinProfitableTripCount;
-
/// The vectorization unroll factor to use. Each scalar is vectorized to this
/// many different vector instructions.
unsigned UF;
@@ -679,9 +666,8 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth,
ElementCount MinProfitableTripCount, unsigned UnrollFactor)
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, VecWidth,
- MinProfitableTripCount, UnrollFactor, CM, BFI, PSI,
- Checks, Plan),
- EPI(EPI) {}
+ UnrollFactor, CM, BFI, PSI, Checks, Plan),
+ EPI(EPI), MinProfitableTripCount(MinProfitableTripCount) {}
// Override this function to handle the more complex control flow around the
// three loops.
@@ -701,6 +687,9 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
/// iteration count of the loop is so small that the main vector loop is
/// completely skipped.
EpilogueLoopVectorizationInfo &EPI;
+
+protected:
+ ElementCount MinProfitableTripCount;
};
/// A specialized derived class of inner loop vectorizer that performs
@@ -724,6 +713,9 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
BasicBlock *createEpilogueVectorizedLoopSkeleton() final;
protected:
+ // Create a check to see if the vector loop should be executed
+ Value *createIterationCountCheck(ElementCount VF, unsigned UF) const;
+
/// Emits an iteration count bypass check once for the main loop (when \p
/// ForEpilogue is false) and once for the epilogue loop (when \p
/// ForEpilogue is true).
@@ -2292,7 +2284,8 @@ void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
}
}
-Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
+Value *
+EpilogueVectorizerMainLoop::createIterationCountCheck(ElementCount VF,
unsigned UF) const {
// Generate code to check if the loop's trip count is less than VF * UF, or
// equal to it in case a scalar epilogue is required; this implies that the
@@ -2363,25 +2356,6 @@ Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
return CheckMinIters;
}
-void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
- BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
- Value *CheckMinIters = createIterationCountCheck(VF, UF);
- // Create new preheader for vector loop.
- LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
- static_cast<DominatorTree *>(nullptr), LI,
- nullptr, "vector.ph");
-
- BranchInst &BI =
- *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
- if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
- setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
- ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
-
- assert(cast<VPIRBasicBlock>(Plan.getEntry())->getIRBasicBlock() ==
- TCCheckBlock &&
- "Plan's entry must be TCCCheckBlock");
-}
-
/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
@@ -2402,7 +2376,7 @@ static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPBasicBlock *VPBB,
return IRVPBB;
}
-void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
+void InnerLoopVectorizer::createScalarPreheader(StringRef Prefix) {
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
assert(LoopVectorPreHeader && "Invalid loop structure");
assert((OrigLoop->getUniqueLatchExitBlock() ||
@@ -2414,8 +2388,8 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
LI, nullptr, Twine(Prefix) + "scalar.ph");
// NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
// wrapping LoopScalarPreHeader here at the moment, because the Plan's scalar
- // preheader may be unreachable at this point. Instead it is replaced in
- // createVectorizedLoopSkeleton.
+ // preheader may be unreachable at this point and SCEV expansion may add to it
+ // later.
}
/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
@@ -2456,53 +2430,9 @@ static void addFullyUnrolledInstructionsToIgnore(
}
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
- /*
- In this function we generate a new loop. The new loop will contain
- the vectorized instructions while the old loop will continue to run the
- scalar remainder.
-
- [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
- / | preheader are expanded here. Eventually all required SCEV
- / | expansion should happen here.
- / v
- | [ ] <-- vector loop bypass (may consist of multiple blocks).
- | / |
- | / v
- || [ ] <-- vector pre header.
- |/ |
- | v
- | [ ] \
- | [ ]_| <-- vector loop (created during VPlan execution).
- | |
- | v
- \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
- | | successors created during VPlan execution)
- \/ |
- /\ v
- | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock).
- | |
- (opt) v <-- edge from middle to exit iff epilogue is not required.
- | [ ] \
- | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header
- | | wrapped in VPIRBasicBlock).
- \ |
- \ v
- >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock)
- ...
- */
-
- // Create an empty vector loop, and prepare basic blocks for the runtime
- // checks.
- createVectorLoopSkeleton("");
-
- // Now, compare the new count to zero. If it is zero skip the vector loop and
- // jump to the scalar loop. This check also covers the case where the
- // backedge-taken count is uint##_max: adding one to it will overflow leading
- // to an incorrect trip count of zero. In this (rare) case we will also jump
- // to the scalar loop.
- emitIterationCountCheck(LoopScalarPreHeader);
-
- replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader);
+ // Create a new IR basic block for the scalar preheader.
+ createScalarPreheader("");
+
return LoopVectorPreHeader;
}
@@ -7332,6 +7262,17 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
BestVPlan.resetTripCount(Exp);
ExpSCEV->eraseFromParent();
}
+ // SCEV expansion will add new instructions in the IRBB wrapped by Entry.
+ // Remove existing VPIRInstructions/VPIRPhi and re-create them to make sure
+ // all IR instructions are wrapped. Otherwise VPInstructions may be inserted
+ // at the wrong place.
+ for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
+ if (!isa<VPIRInstruction, VPIRPhi>(&R))
+ continue;
+ R.eraseFromParent();
+ }
+ for (Instruction &I : drop_begin(reverse(*Entry->getIRBasicBlock())))
+ VPIRInstruction::create(I)->insertBefore(*Entry, Entry->begin());
if (!ILV.getTripCount())
ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
@@ -7460,7 +7401,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
- createVectorLoopSkeleton("");
+ createScalarPreheader("");
// Generate the code to check the minimum iteration count of the vector
// epilogue (see below).
@@ -7547,7 +7488,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
BasicBlock *
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
- createVectorLoopSkeleton("vec.epilog.");
+ createScalarPreheader("vec.epilog.");
// Now, compare the remaining count and if there aren't enough iterations to
// execute the vectorized epilogue skip to the scalar part.
@@ -9353,6 +9294,28 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
}
}
+void LoopVectorizationPlanner::addMinimumIterationCheck(
+ VPlan &Plan, ElementCount VF, unsigned UF,
+ ElementCount MinProfitableTripCount) const {
+ // vscale is not necessarily a power-of-2, which means we cannot guarantee
+ // an overflow to zero when updating induction variables and so an
+ // additional overflow check is required before entering the vector loop.
+ bool CheckNeededWithTailFolding =
+ VF.isScalable() && !TTI.isVScaleKnownToBeAPowerOfTwo() &&
+ !isIndvarOverflowCheckKnownFalse(&CM, VF, 1) &&
+ CM.getTailFoldingStyle() !=
+ TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+ VPlanTransforms::addMinimumIterationCheck(
+ Plan, VF, UF, MinProfitableTripCount,
+ CM.requiresScalarEpilogue(VF.isVector()), CM.foldTailByMasking(),
+ CheckNeededWithTailFolding, OrigLoop,
+ hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())
+ ? &MinItersBypassWeights[0]
+ : nullptr,
+ OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(),
+ *PSE.getSE());
+}
+
void VPDerivedIVRecipe::execute(VPTransformState &State) {
assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
@@ -9468,10 +9431,13 @@ static bool processLoopInVPlanNativePath(
{
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
- InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, VF.Width, 1, &CM,
- BFI, PSI, Checks, BestPlan);
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, 1, &CM, BFI, PSI,
+ Checks, BestPlan);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
<< L->getHeader()->getParent()->getName() << "\"\n");
+ LVP.addMinimumIterationCheck(BestPlan, VF.Width, 1,
+ VF.MinProfitableTripCount);
+
LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
}
@@ -10250,16 +10216,17 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// If we decided that it is not legal to vectorize the loop, then
// interleave it.
VPlan &BestPlan = LVP.getPlanFor(VF.Width);
- InnerLoopVectorizer Unroller(
- L, PSE, LI, DT, TTI, AC, ElementCount::getFixed(1),
- ElementCount::getFixed(1), IC, &CM, BFI, PSI, Checks, BestPlan);
+ InnerLoopVectorizer Unroller(L, PSE, LI, DT, TTI, AC,
+ ElementCount::getFixed(1), IC, &CM, BFI, PSI,
+ Checks, BestPlan);
// TODO: Move to general VPlan pipeline once epilogue loops are also
// supported.
VPlanTransforms::runPass(
VPlanTransforms::materializeConstantVectorTripCount, BestPlan,
VF.Width, IC, PSE);
-
+ LVP.addMinimumIterationCheck(BestPlan, VF.Width, IC,
+ VF.MinProfitableTripCount);
LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
ORE->emit([&]() {
@@ -10320,14 +10287,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (!Checks.hasChecks())
DisableRuntimeUnroll = true;
} else {
- InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width,
- VF.MinProfitableTripCount, IC, &CM, BFI, PSI,
- Checks, BestPlan);
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, BFI,
+ PSI, Checks, BestPlan);
// TODO: Move to general VPlan pipeline once epilogue loops are also
// supported.
VPlanTransforms::runPass(
VPlanTransforms::materializeConstantVectorTripCount, BestPlan,
VF.Width, IC, PSE);
+ LVP.addMinimumIterationCheck(BestPlan, VF.Width, IC,
+ VF.MinProfitableTripCount);
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
++LoopsVectorized;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index b231a8429503f..4f36de870d067 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -670,6 +670,80 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
}
}
+void VPlanTransforms::addMinimumIterationCheck(
+ VPlan &Plan, ElementCount VF, unsigned UF,
+ ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue,
+ bool TailFolded, bool CheckNeededWithTailFolding, Loop *OrigLoop,
+ const uint32_t *MinItersBypassWeights, DebugLoc DL, ScalarEvolution &SE) {
+ // Generate code to check if the loop's trip count is less than VF * UF, or
+ // equal to it in case a scalar epilogue is required; this implies that the
+ // vector trip count is zero. This check also covers the case where adding one
+ // to the backedge-taken count overflowed leading to an incorrect trip count
+ // of zero. In this case we will also jump to the scalar loop.
+ auto P = RequiresScalarEpilogue ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+ // If tail is to be folded, vector loop takes care of all iterations.
+ const SCEV *Count = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
+ Type *CountTy = Count->getType();
+ auto CreateStep = [&]() -> const SCEV * {
+ const SCEV *VFxUF = SE.getElementCount(CountTy, (VF * UF), SCEV::FlagNUW);
+ // Create step with max(MinProTripCount, UF * VF).
+ i...
[truncated]
|
This patch adds a new VPlan-based addMinimumIterationCheck, which replaced the ILV version for the non-epilogue case. The VPlan-based version constructs a SCEV expression to compute the minimum iterations, use that to check if the check is known true or false. Otherwise it creates a VPExpandSCEV recipe and emits a compare-and-branch. When using epilogue vectorization, we still need to create the minimum trip-count-check during the legacy skeleton creation. The patch moves the definitions out of ILV.
302a390
to
cd7a170
Compare
ping :) |
After llvm#153643, there may be a BranchOnCond with constant condition in the entry block. Simplify those in removeBranchOnConst. This removes a number of redundant conditional branch from entry blocks. In some cases, it may also make the original scalar loop unreachable, because we know it will never execute. In that case, we need to remove the loop from LoopInfo, because all unreachable blocks may dominate each other, making LoopInfo invalid. In those cases, we can also completely remove the loop, for which I'll share a follow-up patch. A couple of places that assume the scalar loop to still be valid need updating. Depends on llvm#153643. (Included in PR).
@@ -332,6 +332,10 @@ class VPBuilder { | |||
FPBinOp ? FPBinOp->getFastMathFlags() : FastMathFlags(), DL)); | |||
} | |||
|
|||
VPExpandSCEVRecipe *expandSCEV(const SCEV *Expr, ScalarEvolution &SE) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
VPExpandSCEVRecipe *expandSCEV(const SCEV *Expr, ScalarEvolution &SE) { | |
VPExpandSCEVRecipe *createExpandSCEV(const SCEV *Expr, ScalarEvolution &SE) { |
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks
const SCEV *Res = getConstant(Ty, EC.getKnownMinValue()); | ||
if (EC.isScalable()) | ||
Res = getMulExpr(Res, getVScale(Ty)); | ||
Res = getMulExpr(Res, getVScale(Ty), Flags); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it allowed for computing an element count to wrap? If it wraps will everything else continue to work fine?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We are in a bit of an odd situation here at the moment, for the element counts generated by LV, it is assumed that they won't wrap, but in general they could wrap, hence adding the flag as optional. This is needed to avoid regressions w.r.t to legacy code
/// Emit basic blocks (prefixed with \p Prefix) for the iteration check, | ||
/// vector loop preheader, middle block and scalar preheader. | ||
void createVectorLoopSkeleton(StringRef Prefix); | ||
/// Create a new IR basic block for the scalar preheader. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/// Create a new IR basic block for the scalar preheader. | |
/// Create a new IR basic block for the scalar preheader whose name is prefixed with \p Prefix. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks!
@@ -724,6 +713,9 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { | |||
BasicBlock *createEpilogueVectorizedLoopSkeleton() final; | |||
|
|||
protected: | |||
// Create a check to see if the vector loop should be executed |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// Create a check to see if the vector loop should be executed | |
// Create a check to see if the main vector loop should be executed |
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep, updated thanks!
// preheader may be unreachable at this point. Instead it is replaced in | ||
// createVectorizedLoopSkeleton. | ||
// preheader may be unreachable at this point and SCEV expansion may add to it | ||
// later. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
will this replacement still take place later?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, straight after State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Worth retaining the "Instead it is replaced .." part of the comment?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks
CheckMinIters = Builder.createICmp(ICmpInst::ICMP_ULT, LHS, | ||
Builder.expandSCEV(Step, SE), DL); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is an upper bound rather than a lower bound, i.e., checks that the number of iterations is less than a maximal number rather than more than a minimal number. Perhaps better call the check a "TripCountCheck" rather than MinIters, in variables and method name?
Note that the original check of minIter was actually checking both a lower bound (VF*UF) and an upper bound (max unsigned plus 1).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep, updated the name. thanks!
if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue()) | ||
return VFxUF; | ||
|
||
const SCEV *MinProfTC = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
const SCEV *MinProfTC = | |
const SCEV *MinProfitableTripCountSCEV = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks
@@ -559,6 +563,10 @@ class LoopVectorizationPlanner { | |||
/// Emit remarks for recipes with invalid costs in the available VPlans. | |||
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE); | |||
|
|||
/// Create a check to \p Plan to see if the vector loop should be executed. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/// Create a check to \p Plan to see if the vector loop should be executed. | |
/// Create a check to \p Plan to see if the vector loop should be executed based on its trip count. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
added thanks
VPBuilder Builder(EntryVPBB); | ||
VPValue *CheckMinIters = Plan.getFalse(); | ||
const SCEV *Step = CreateStep(); | ||
if (!TailFolded) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would be clear to have here an
assert(!CheckNeededWithTailFolding && "Expecting IV bump to not wrap w/o tail folding");
or rather
assert(!IndvarBumpMayWrap && "Expecting IV bump to not wrap w/o tail folding");
|
||
/// Emit basic blocks (prefixed with \p Prefix) for the iteration check, | ||
/// vector loop preheader, middle block and scalar preheader. | ||
void createVectorLoopSkeleton(StringRef Prefix); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this removal resolve the discrepancy/ambiguity between
createVectorLoopSkeleton() and
createVectorizedloopSkeleton()?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, createVectorLoopSkeleton is removed now, replaced by createScalarPreheader
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Very good!
@@ -559,6 +563,11 @@ class LoopVectorizationPlanner { | |||
/// Emit remarks for recipes with invalid costs in the available VPlans. | |||
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE); | |||
|
|||
/// Create a check to \p Plan to see if the vector loop should be executed | |||
/// based on its trip count. | |||
void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF, | |
void addTripCountCheck(VPlan &Plan, ElementCount VF, unsigned UF, |
Renaming can be done separately to simplify the refactoring.
R.eraseFromParent(); | ||
} | ||
for (Instruction &I : drop_begin(reverse(*Entry->getIRBasicBlock()))) | ||
VPIRInstruction::create(I)->insertBefore(*Entry, Entry->begin()); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When vectorizing main loop with the epilog to be vectorized later, is EpilogueVectorizerMainLoop used instead of base ILV? Thought is to try and turn ILV::createVectorizedLoopSkeleton() unreachable (rather than pure virtual), as skeleton creation is essentially used only when both main and epilog are vectorized.
|
||
/// Emit basic blocks (prefixed with \p Prefix) for the iteration check, | ||
/// vector loop preheader, middle block and scalar preheader. | ||
void createVectorLoopSkeleton(StringRef Prefix); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Very good!
[ ] <-- old preheader - loop iteration number check and SCEVs in Plan's | ||
/ | preheader are expanded here. Eventually all required SCEV | ||
/ | expansion should happen here. | ||
/ v | ||
| [ ] <-- vector loop bypass (may consist of multiple blocks). | ||
| / | | ||
| / v | ||
|| [ ] <-- vector pre header. | ||
|/ | | ||
| v | ||
| [ ] \ | ||
| [ ]_| <-- vector loop (created during VPlan execution). | ||
| | | ||
| v | ||
\ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to | ||
| | successors created during VPlan execution) | ||
\/ | | ||
/\ v | ||
| ->[ ] <--- new preheader (wrapped in VPIRBasicBlock). | ||
| | | ||
(opt) v <-- edge from middle to exit iff epilogue is not required. | ||
| [ ] \ | ||
| [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header | ||
| | wrapped in VPIRBasicBlock). | ||
\ | | ||
\ v | ||
>[ ] <-- exit block(s). (wrapped in VPIRBasicBlock) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps this may fit somewhere at the outset. OTOH seems somewhat obsolete - needs to consider epilog vectorization, early exit.
{MaxUIntTripCount, Plan.getTripCount()}, | ||
DebugLoc::getUnknown()); | ||
|
||
// Don't execute the vector loop if (UMax - n) < (VF * UF). |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure, perhaps worth leaving a FIXME.
MDNode *BranchWeights = MDB.createBranchWeights( | ||
ArrayRef(MinItersBypassWeights, 2), /*IsExpected=*/false); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should know better when TripCountCheck is set to true or false.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, for now this is to keep existing behavior. This will be taken care of by simplifying branch-on-cond with constant condition to an unconditional branch: #154510
VPValue *TripCountVPV = Plan.getTripCount(); | ||
const SCEV *TripCount = vputils::getSCEVExprForVPValue(TripCountVPV, SE); | ||
Type *TripCountTy = TripCount->getType(); | ||
auto CreateMinTripCount = [&]() -> const SCEV * { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
auto CreateMinTripCount = [&]() -> const SCEV * { | |
auto GetMinTripCount = [&]() -> const SCEV * { |
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done thanks
const SCEV *TripCount = vputils::getSCEVExprForVPValue(TripCountVPV, SE); | ||
Type *TripCountTy = TripCount->getType(); | ||
auto CreateMinTripCount = [&]() -> const SCEV * { | ||
// Create or get max(MinProfitableTripCount, UF * VF) and return it. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// Create or get max(MinProfitableTripCount, UF * VF) and return it. | |
// Compute max(MinProfitableTripCount, UF * VF) and return it. |
now it's all "get()'s"
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks
if (!TailFolded) { | ||
// TODO: Emit unconditional branch to vector preheader instead of | ||
// conditional branch with known condition. | ||
TripCount = SE.applyLoopGuards(TripCount, OrigLoop); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Independent: should this apply to TailFold cases as well?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It could probably also help there, yes
// preheader may be unreachable at this point. Instead it is replaced in | ||
// createVectorizedLoopSkeleton. | ||
// preheader may be unreachable at this point and SCEV expansion may add to it | ||
// later. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Worth retaining the "Instead it is replaced .." part of the comment?
const SCEV *Max = SE.getUMaxExpr(MinProfitableTripCountSCEV, VFxUF); | ||
if (!VF.isScalable()) | ||
return Max; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: may be better to compress this and the below empty lines, being a lambda.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done thanks
ExpSCEV->eraseFromParent(); | ||
} | ||
assert(none_of(*Entry, IsaPred<VPExpandSCEVRecipe>) && | ||
"VPExpandSCEVRecipes must be be at the beginning of the entry block, " |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"VPExpandSCEVRecipes must be be at the beginning of the entry block, " | |
"VPExpandSCEVRecipes must be at the beginning of the entry block, " |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updated, thanks
@@ -3507,8 +3508,19 @@ VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) { | |||
ExpSCEV->replaceAllUsesWith(Exp); | |||
if (Plan.getTripCount() == ExpSCEV) | |||
Plan.resetTripCount(Exp); | |||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
removed thanks
ExpSCEV->eraseFromParent(); | ||
} | ||
assert(none_of(*Entry, IsaPred<VPExpandSCEVRecipe>) && | ||
"VPExpandSCEVRecipes must be be at the beginning of the entry block, " | ||
"after any VPIRInstructions"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"after any VPIRInstructions"); | |
"after any VPIRPhi and VPIRInstructions"); |
what else is there in the entry block?
This assert allows ExpandSCEV recipes to originally reside between VPInstructions and VPIRPhi recipes, rather than "after any VPIRInstructions".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
VPIRPhi is a sub-class of VPIRInstruction.
After VPExpandSCEV, there should only by VPInstructions, added by this patch
/// Create new basic blocks needed as entries and exits of the code generated | ||
/// by executing a VPlan. For epilogue vectorization, it will create blocks | ||
/// for the minimum iteration checks and IR basic blocks for the vector and | ||
/// scalar preheaders. Otherwise it will create a basic block for the scalar | ||
/// preheader only. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This explanation can be simplified, especially if the method is devoted to the case vectorizing both main and epilog loops.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, updated to say what it returns here and that it is specialized
if (!VF.isScalable()) | ||
return Max; | ||
|
||
if (UF * VF.getKnownMinValue() >= | ||
MinProfitableTripCount.getKnownMinValue()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if (!VF.isScalable()) | |
return Max; | |
if (UF * VF.getKnownMinValue() >= | |
MinProfitableTripCount.getKnownMinValue()) { | |
if (VF.isScalable() && UF * VF.getKnownMinValue() >= | |
MinProfitableTripCount.getKnownMinValue()) { |
? Could be hoisted above computing Max and MinProfitableTripCountSCEV.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done thanks
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This LGTM, thanks! Note several comments, including follow-up thoughts.
/// handle the more complex control flow around the loops. | ||
/// Creates a basic block for the scalar preheader. Both | ||
/// EpilogueVectorizerMainLoop and EpilogueVectorizerEpilogueLoop overwrite | ||
/// the method to create blocks and checks needed for epilogue vectorization. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/// the method to create blocks and checks needed for epilogue vectorization. | |
/// the method to create additional blocks and checks needed for epilogue | |
/// vectorization. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks
@@ -721,9 +698,12 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { | |||
EPI.MainLoopVF, EPI.MainLoopUF) {} | |||
/// Implements the interface for creating a vectorized skeleton using the | |||
/// *main loop* strategy (ie the first pass of vplan execution). |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/// *main loop* strategy (ie the first pass of vplan execution). | |
/// *main loop* strategy (i.e., the first pass of vplan execution). |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks
@@ -750,7 +730,7 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { | |||
} | |||
/// Implements the interface for creating a vectorized skeleton using the | |||
/// *epilogue loop* strategy (ie the second pass of vplan execution). |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/// *epilogue loop* strategy (ie the second pass of vplan execution). | |
/// *epilogue loop* strategy (i.e., the second pass of vplan execution). |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks
replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader); | ||
// Create a new IR basic block for the scalar preheader. | ||
createScalarPreheader(""); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks!
@@ -2423,7 +2385,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { | |||
// NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock | |||
// wrapping LoopScalarPreHeader here at the moment, because the Plan's scalar | |||
// preheader may be unreachable at this point. Instead it is replaced in | |||
// createVectorizedLoopSkeleton. | |||
// executePlan. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Better have createScalarPreheader()
return LoopScalarPreHeader
than pass it to its caller as a global variable. Can be done independently.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will do, thanks!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should be done in ffd0f5f
; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() | ||
; CHECK-NEXT: [[TMP12:%.*]] = shl nuw i64 [[TMP11]], 2 | ||
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP12]]) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice dce
; CHECK-NEXT: [[MUL1:%.*]] = mul nuw nsw i64 [[TMP0]], 12 | ||
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() | ||
; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2 | ||
; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP0]], 2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
vscale*12 is nsw, so vscale<<2 should be nsw
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yep
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] | ||
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] | ||
; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: | ||
; CHECK-NEXT: [[MIN_ITERS_CHECK2:%.*]] = icmp ult i32 [[TMP2]], 4 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Change here is merely hoisting the freeze, but shows a missed optimization: same comparison is repeated and branched upon
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep, although the generic epilogue tests are a bit misleading here, as they both use VF = 4 for main and epilogue loop.....
; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64 | ||
; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64 | ||
; CHECK-NEXT: IR %0 = sub i64 %end1, %start2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Example of SCEV expanded IR
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep
; CHECK-NEXT: ir-bb<entry>: | ||
; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%n>, ir<8> | ||
; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check> | ||
; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph | ||
; CHECK-EMPTY: | ||
; CHECK: vector.ph: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
At this point vector.ph is no longer an ir-bb, entry is
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep
…on-epilogue. (#153643) This patch adds a new VPlan-based addMinimumIterationCheck, which replaced the ILV version for the non-epilogue case. The VPlan-based version constructs a SCEV expression to compute the minimum iterations, use that to check if the check is known true or false. Otherwise it creates a VPExpandSCEV recipe and emits a compare-and-branch. When using epilogue vectorization, we still need to create the minimum trip-count-check during the legacy skeleton creation. The patch moves the definitions out of ILV. PR: llvm/llvm-project#153643
Follow-up suggested in #153643. Remove some more global state by directly returning the scalar preheader from createScalarPreheader.
Follow-up suggested in llvm/llvm-project#153643. Remove some more global state by directly returning the scalar preheader from createScalarPreheader.
This patch adds a new VPlan-based addMinimumIterationCheck, which replaced the ILV version for the non-epilogue case.
The VPlan-based version constructs a SCEV expression to compute the minimum iterations, use that to check if the check is known true or false. Otherwise it creates a VPExpandSCEV recipe and emits a compare-and-branch.
When using epilogue vectorization, we still need to create the minimum trip-count-check during the legacy skeleton creation. The patch moves the definitions out of ILV.