[VPlan] Simplify Plan's entry in removeBranchOnConst. #154510
Conversation
Force-pushed from 8c30518 to 1824795
@llvm/pr-subscribers-backend-powerpc @llvm/pr-subscribers-backend-risc-v

Author: Florian Hahn (fhahn)

Changes

After #153643, there may be a BranchOnCond with constant condition in the entry block. Simplify those in removeBranchOnConst. This removes a number of redundant conditional branches from entry blocks.

In some cases, it may also make the original scalar loop unreachable, because we know it will never execute. In that case, we need to remove the loop from LoopInfo, because all unreachable blocks may dominate each other, making LoopInfo invalid. In those cases, we can also completely remove the loop, for which I'll share a follow-up patch.

A couple of places that assume the scalar loop to still be valid need updating.

Depends on #153643.

Patch is 2.60 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154510.diff

332 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 98554310c74df..64cbf509a3118 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2357,9 +2357,9 @@ EpilogueVectorizerMainLoop::createIterationCountCheck(ElementCount VF,
/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
-static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPBasicBlock *VPBB,
+static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPlan &Plan, VPBasicBlock *VPBB,
BasicBlock *IRBB) {
- VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
+ VPIRBasicBlock *IRVPBB = Plan.createVPIRBasicBlock(IRBB);
auto IP = IRVPBB->begin();
for (auto &R : make_early_inc_range(VPBB->phis()))
R.moveBefore(*IRVPBB, IP);
@@ -2571,6 +2571,9 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
// Remove redundant induction instructions.
cse(HeaderBB);
+ if (Plan.getScalarPreheader()->getNumPredecessors() == 0)
+ return;
+
// Set/update profile weights for the vector and remainder loops as original
// loop iterations are now distributed among them. Note that original loop
// becomes the scalar remainder loop after vectorization.
@@ -7226,6 +7229,12 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
VPlanTransforms::simplifyRecipes(BestVPlan);
VPlanTransforms::removeBranchOnConst(BestVPlan);
+ if (BestVPlan.getEntry()->getSingleSuccessor() ==
+ BestVPlan.getScalarPreheader()) {
+ // TODO: Should not even try to vectorize.
+ return DenseMap<const SCEV *, Value *>();
+ }
+
VPlanTransforms::narrowInterleaveGroups(
BestVPlan, BestVF,
TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
@@ -7268,7 +7277,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
BasicBlock *EntryBB =
cast<VPIRBasicBlock>(BestVPlan.getEntry())->getIRBasicBlock();
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
- replaceVPBBWithIRVPBB(BestVPlan.getScalarPreheader(),
+ replaceVPBBWithIRVPBB(BestVPlan, BestVPlan.getScalarPreheader(),
State.CFG.PrevBB->getSingleSuccessor());
VPlanTransforms::removeDeadRecipes(BestVPlan);
@@ -7351,8 +7360,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
} else {
// Keep all loop hints from the original loop on the vector loop (we'll
// replace the vectorizer-specific hints below).
- if (MDNode *LID = OrigLoop->getLoopID())
- L->setLoopID(LID);
+ if (BestVPlan.getScalarPreheader()->getNumPredecessors() > 0)
+ if (MDNode *LID = OrigLoop->getLoopID())
+ L->setLoopID(LID);
LoopVectorizeHints Hints(L, true, *ORE);
Hints.setAlreadyVectorized();
@@ -7383,6 +7393,16 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
addRuntimeUnrollDisableMetaData(L);
}
+ if (BestVPlan.getScalarPreheader()->getNumPredecessors() == 0) {
+ // If the original loop became unreachable, we need to delete it.
+ auto Blocks = OrigLoop->getBlocksVector();
+ Blocks.push_back(cast<VPIRBasicBlock>(BestVPlan.getScalarPreheader())
+ ->getIRBasicBlock());
+ for (auto *BB : Blocks)
+ LI->removeBlock(BB);
+ LI->erase(OrigLoop);
+ }
+
// 3. Fix the vectorized code: take care of header phi's, live-outs,
// predication, updating analyses.
ILV.fixVectorizedLoop(State);
@@ -7460,7 +7480,8 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
// generated here dominates the vector epilog iter check.
EPI.TripCount = Count;
} else {
- VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader);
+ VectorPHVPBB =
+ replaceVPBBWithIRVPBB(Plan, VectorPHVPBB, LoopVectorPreHeader);
}
BranchInst &BI =
@@ -7493,7 +7514,7 @@ BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() {
BasicBlock *VecEpilogueIterationCountCheck =
SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI,
nullptr, "vec.epilog.iter.check", true);
- VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, LoopVectorPreHeader);
+ VectorPHVPBB = replaceVPBBWithIRVPBB(Plan, VectorPHVPBB, LoopVectorPreHeader);
emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
VecEpilogueIterationCountCheck);
@@ -10213,11 +10234,22 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
}
+ if (ORE->allowExtraAnalysis(LV_NAME))
+ checkMixedPrecision(L, ORE);
+
bool DisableRuntimeUnroll = false;
MDNode *OrigLoopID = L->getLoopID();
+ bool LoopRemoved = false;
{
using namespace ore;
if (!VectorizeLoop) {
+ ORE->emit([&]() {
+ return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
+ L->getHeader())
+ << "interleaved loop (interleaved count: "
+ << NV("InterleaveCount", IC) << ")";
+ });
+
assert(IC > 1 && "interleave count should not be 1 or 0");
// If we decided that it is not legal to vectorize the loop, then
// interleave it.
@@ -10234,14 +10266,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LVP.addMinimumIterationCheck(BestPlan, VF.Width, IC,
VF.MinProfitableTripCount);
LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
-
- ORE->emit([&]() {
- return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
- L->getHeader())
- << "interleaved loop (interleaved count: "
- << NV("InterleaveCount", IC) << ")";
- });
+ LoopRemoved = BestPlan.getScalarPreheader()->getNumPredecessors() == 0;
} else {
+ // Report the vectorization decision.
+ reportVectorization(ORE, L, VF, IC);
+
// If we decided that it is *legal* to vectorize the loop, then do it.
VPlan &BestPlan = LVP.getPlanFor(VF.Width);
@@ -10311,23 +10340,23 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// rarely used is not worth unrolling.
if (!Checks.hasChecks())
DisableRuntimeUnroll = true;
+ LoopRemoved = BestPlan.getScalarPreheader()->getNumPredecessors() == 0;
}
- // Report the vectorization decision.
- reportVectorization(ORE, L, VF, IC);
}
-
- if (ORE->allowExtraAnalysis(LV_NAME))
- checkMixedPrecision(L, ORE);
}
assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
"DT not preserved correctly");
+ if (LoopRemoved)
+ return true;
+
std::optional<MDNode *> RemainderLoopID =
makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
LLVMLoopVectorizeFollowupEpilogue});
if (RemainderLoopID) {
- L->setLoopID(*RemainderLoopID);
+ if (!LoopRemoved)
+ L->setLoopID(*RemainderLoopID);
} else {
if (DisableRuntimeUnroll)
addRuntimeUnrollDisableMetaData(L);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 1438dc366b55d..4a7618f40164b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -972,12 +972,14 @@ void VPlan::execute(VPTransformState *State) {
setName("Final VPlan");
LLVM_DEBUG(dump());
- // Disconnect scalar preheader and scalar header, as the dominator tree edge
- // will be updated as part of VPlan execution. This allows keeping the DTU
- // logic generic during VPlan execution.
BasicBlock *ScalarPh = State->CFG.ExitBB;
- State->CFG.DTU.applyUpdates(
- {{DominatorTree::Delete, ScalarPh, ScalarPh->getSingleSuccessor()}});
+ if (getScalarPreheader()->getNumPredecessors() > 0) {
+ // Disconnect scalar preheader and scalar header, as the dominator tree edge
+ // will be updated as part of VPlan execution. This allows keeping the DTU
+ // logic generic during VPlan execution.
+ State->CFG.DTU.applyUpdates(
+ {{DominatorTree::Delete, ScalarPh, ScalarPh->getSingleSuccessor()}});
+ }
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
Entry);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d32d2a9ad11f7..8e7fc24080c31 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1920,7 +1920,7 @@ void VPlanTransforms::removeBranchOnConst(VPlan &Plan) {
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getEntry()))) {
VPValue *Cond;
- if (VPBB->getNumSuccessors() != 2 || VPBB == Plan.getEntry() ||
+ if (VPBB->getNumSuccessors() != 2 || VPBB->empty() ||
!match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
continue;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 9e1d325a4d8d6..2959e9440e753 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -49,6 +49,7 @@ inline bool isSingleScalar(const VPValue *VPV) {
case Instruction::GetElementPtr:
case Instruction::ICmp:
case Instruction::FCmp:
+ case Instruction::Select:
case VPInstruction::Broadcast:
case VPInstruction::PtrAdd:
return true;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
index c18f9f2fae06b..ddfdb257ed49a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -52,8 +52,8 @@ define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) {
; CHECK-NEXT: [[TMP17]] = add <2 x i64> [[VEC_PHI]], [[PREDPHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
+; CHECK-NEXT: br i1 [[TMP18]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: for.end:
; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP17]])
; CHECK-NEXT: ret i64 [[TMP19]]
;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
index e44ddbce34fd5..58965c19ae1cc 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
@@ -202,8 +202,8 @@ exit:
define void @test_blend_feeding_replicated_store_2(ptr noalias %src, ptr %dst, i1 %c.0) {
; CHECK-LABEL: define void @test_blend_feeding_replicated_store_2(
; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr [[DST:%.*]], i1 [[C_0:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i1> poison, i1 [[C_0]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i1> [[BROADCAST_SPLATINSERT]], <16 x i1> poison, <16 x i32> zeroinitializer
@@ -366,12 +366,11 @@ define void @test_blend_feeding_replicated_store_2(ptr noalias %src, ptr %dst, i
; CHECK-NEXT: [[TMP71:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
; CHECK-NEXT: br i1 [[TMP71]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 96, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ 96, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[IV1]]
; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC1]], align 1
; CHECK-NEXT: [[C_1:%.*]] = icmp eq i8 [[L]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
index f099c22333c3e..387bb4302de60 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
@@ -6,8 +6,8 @@ target triple = "arm64-apple-macosx11.0.0"
define void @fshl_operand_first_order_recurrence(ptr %dst, ptr noalias %src) {
; CHECK-LABEL: define void @fshl_operand_first_order_recurrence(
; CHECK-SAME: ptr [[DST:%.*]], ptr noalias [[SRC:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
@@ -30,14 +30,12 @@ define void @fshl_operand_first_order_recurrence(ptr %dst, ptr noalias %src) {
; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i64> [[WIDE_LOAD1]], i32 1
-; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 100, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RECUR:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]]
; CHECK-NEXT: [[L]] = load i64, ptr [[GEP_SRC]], align 8
; CHECK-NEXT: [[OR:%.*]] = tail call i64 @llvm.fshl.i64(i64 1, i64 [[RECUR]], i64 1)
@@ -73,7 +71,7 @@ define void @powi_call(ptr %P) {
; CHECK-LABEL: define void @powi_call(
; CHECK-SAME: ptr [[P:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
@@ -83,7 +81,7 @@ define void @powi_call(ptr %P) {
; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH]]:
+; CHECK: [[SCALAR_PH:.*]]:
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -93,7 +91,7 @@ define void @powi_call(ptr %P) {
; CHECK-NEXT: store double [[POWI]], ptr [[GEP]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -224,5 +222,4 @@ declare i64 @llvm.fshl.i64(i64, i64, i64)
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
index 626242667e203..944f2699d6e62 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
@@ -5,7 +5,7 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
; CHECK-LABEL: define void @clamped_tc_8(
; CHECK-SAME: ptr captures(none) [[DST:%.*]], i32 [[N:%.*]], i64 [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
@@ -36,7 +36,7 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[P_OUT_TAIL_09:%.*]] = phi ptr [ [[DST]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3
; CHECK-NEXT: [[SHR3:%.*]] = lshr i64 [[VAL]], [[TMP19]]
@@ -45,7 +45,7 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[P_OUT_TAIL_09]], i64 1
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
;
@@ -79,7 +79,7 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[REM]], 7
; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ADD]], 3
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SHR]] to i64
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
@@ -104,13 +104,13 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = ph...
[truncated]
Build on top of llvm#154510 to completely remove dead scalar loops. Depends on llvm#154510. (Included in the PR)
Move the emission of remarks for the vectorization decision before executing the plan, in preparation for #154510.
Force-pushed from 1824795 to 6afb1b7
Force-pushed from 6afb1b7 to 33afce8
ping :)
…e (NFCI). Move the emission of remarks for the vectorization decision before executing the plan, in preparation for llvm/llvm-project#154510.
static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPlan &Plan, VPBasicBlock *VPBB,
BasicBlock *IRBB) {
VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
VPIRBasicBlock *IRVPBB = Plan.createVPIRBasicBlock(IRBB);
This change is needed because VPBB may no longer be reachable from Plan's entry. VPBB must still have a single predecessor, as documented above, but that pred might be pred free? If only some callers pass an unreachable VPBB, Plan could be an optional parameter, to keep other callers intact.
Yep, updated the comment to say that all predecessors/successors are rewired, if any, and added Plan as an optional parameter, which must be passed if the block may be unreachable. Thanks!
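For illustration, a minimal sketch of the optional-parameter shape discussed here; the default-argument form and the exact reason VPBB->getPlan() cannot be relied upon are assumptions, not the final code in the patch:

// Hypothetical: Plan may be passed explicitly so the helper also works when
// VPBB is no longer reachable from the plan's entry and VPBB->getPlan()
// therefore cannot be used; other callers keep the old behavior.
static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB,
                                             VPlan *PlanOrNull = nullptr) {
  VPlan &Plan = PlanOrNull ? *PlanOrNull : *VPBB->getPlan();
  VPIRBasicBlock *IRVPBB = Plan.createVPIRBasicBlock(IRBB);
  // ... move phis and remaining recipes from VPBB, then rewire any
  // predecessors and successors of VPBB to IRVPBB ...
  return IRVPBB;
}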
if (Plan.getScalarPreheader()->getNumPredecessors() == 0)
return;
Early exit if remainder/default scalar loop is dead, w/o setting/updating profile weights for the vector loop? Worth a comment.
I had to move the code, because preserving profile info on the vector loop when the remainder loop is dead requires retrieving the profile info before removing the remainder loop and conditionally setting it on the vector/remainder loops, as available.
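A rough sketch of the early-exit arrangement in fixVectorizedLoop shown in the hunk above; the helper and variable names (setProfileInfoAfterUnrolling, HeaderBB, LoopScalarBody) are assumed from the surrounding function and may not match the final code exactly:

// Sketch: skip the profile-weight update entirely when the scalar remainder
// loop has become unreachable; otherwise distribute the original loop's
// iterations between the vector and remainder loops as before.
if (Plan.getScalarPreheader()->getNumPredecessors() == 0)
  return;
setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
                             LI->getLoopFor(HeaderBB),
                             LI->getLoopFor(LoopScalarBody),
                             VF.getKnownMinValue() * UF);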
if (BestVPlan.getEntry()->getSingleSuccessor() ==
BestVPlan.getScalarPreheader()) {
removeBranchOnConst() could conceivably bypass the vector loop; this actually happens in a few tests. Worth emitting a missed-vectorization remark.
Yep, added an analysis remark. I am not sure if missed-vectorization would be accurate, because this is for cases where we would create a dead vector loop and should not even try to vectorize.
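A hedged sketch of what such an analysis remark could look like in executePlan, using the ORE lambda pattern already present in this file; the remark name is a placeholder and the availability of ORE/OrigLoop at this point is assumed:

// Placeholder remark: the entry branches straight to the scalar preheader, so
// executing the plan would only produce a dead vector loop.
if (BestVPlan.getEntry()->getSingleSuccessor() == BestVPlan.getScalarPreheader()) {
  ORE->emit([&]() {
    return OptimizationRemarkAnalysis(LV_NAME, "VectorLoopDead",
                                      OrigLoop->getStartLoc(),
                                      OrigLoop->getHeader())
           << "vector loop would never be entered; not vectorizing";
  });
  return DenseMap<const SCEV *, Value *>();
}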
@@ -7345,8 +7354,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
} else {
// Keep all loop hints from the original loop on the vector loop (we'll
// replace the vectorizer-specific hints below).
if (MDNode *LID = OrigLoop->getLoopID())
L->setLoopID(LID);
if (BestVPlan.getScalarPreheader()->getNumPredecessors() > 0)
Suggested change:
- if (BestVPlan.getScalarPreheader()->getNumPredecessors() > 0)
+ } else if (BestVPlan.getScalarPreheader()->getNumPredecessors() > 0) {
The code below will execute in the general else { as well, so I left it as-is for now.
@@ -7377,6 +7387,16 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
addRuntimeUnrollDisableMetaData(L);
}

if (BestVPlan.getScalarPreheader()->getNumPredecessors() == 0) {
// If the original loop became unreachable, we need to delete it.
// If the original loop became unreachable, we need to delete it. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"Native" VPlan constructs can simply be discarded when they become dead or unreached (even then lazily), whereas VPlan constructs that model the scalar loop are kept orphaned after becoming unreachable, to be processed here.
This raises an old thought: VPlan specifies code to be generated, using its blocks and recipes; how/could VPlan be extended to also dismantle existing code, perhaps using "anti-recipes" or "anti-blocks" whose execute() performs the desired clean-up.
Yep, unfortunately we cannot do this when the VPIRBasicBlock for the scalar preheader gets executed, as we only execute reachable VPBBs.
I moved it for now to VPlan::execute. Unfortunately doing it in the destructor of VPIRBasicBlock won't work either, because we need access to LoopInfo.
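A rough sketch of the VPlan::execute placement described here; the member access (State->LI, getScalarHeader()) and exact block set are assumptions for illustration, not the committed code:

// Sketch: at the end of VPlan::execute, drop the now-unreachable scalar loop
// from LoopInfo. This cannot happen when the scalar-preheader VPIRBasicBlock
// executes (unreachable VPBBs are skipped) nor in its destructor (no LoopInfo).
if (getScalarPreheader()->getNumPredecessors() == 0) {
  BasicBlock *ScalarHeaderBB = getScalarHeader()->getIRBasicBlock();
  Loop *OrigLoop = State->LI->getLoopFor(ScalarHeaderBB);
  auto Blocks = OrigLoop->getBlocksVector(); // copy; removeBlock mutates loops
  Blocks.push_back(cast<VPIRBasicBlock>(getScalarPreheader())->getIRBasicBlock());
  for (BasicBlock *BB : Blocks)
    State->LI->removeBlock(BB);
  State->LI->erase(OrigLoop);
}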
@@ -53,76 +53,35 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
; VF8UF2-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
; VF8UF2-NEXT: [[ENTRY:.*]]:
; VF8UF2-NEXT: [[AND:%.*]] = and i64 [[N]], 15
; VF8UF2-NEXT: br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
Another case skipping the vector loop.
yep
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1
; CHECK-NEXT: [[TMP1:%.*]] = udiv i64 [[TMP0]], [[G_64]]
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
ditto
Yep, removed the dead vector loop.
@@ -12,15 +12,13 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
; CHECK-NEXT: ir<16> = original trip-count
Simplification due to branch-on-const of entry?
I had to update the test because otherwise the vector loop would be dead
; CHECK-NEXT: }
;
entry:
%and = and i64 %N, 15
Ah, test changed?
; CHECK-NEXT: [[TMP4:%.*]] = fdiv fast <4 x float> splat (float 1.000000e+00), [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP2:%.*]] = fdiv fast <4 x float> splat (float 1.000000e+00), [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP15:%.*]] = fdiv fast <4 x float> splat (float 1.000000e+00), [[BROADCAST_SPLAT]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4, !tbaa [[TBAA3:![0-9]+]]
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[TMP0]]
; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[TMP1]], align 4, !tbaa [[TBAA3]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
; CHECK-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP7]], align 4, !tbaa [[TBAA3]]
; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[WIDE_LOAD_1]], [[TMP4]]
; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[TMP7]], align 4, !tbaa [[TBAA3]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i64 32
; CHECK-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x float>, ptr [[TMP10]], align 4, !tbaa [[TBAA3]]
; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <4 x float> [[WIDE_LOAD_2]], [[TMP2]]
; CHECK-NEXT: store <4 x float> [[TMP11]], ptr [[TMP10]], align 4, !tbaa [[TBAA3]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP12]], i64 48
; CHECK-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x float>, ptr [[TMP13]], align 4, !tbaa [[TBAA3]]
; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <4 x float> [[WIDE_LOAD_3]], [[TMP15]]
; CHECK-NEXT: store <4 x float> [[TMP14]], ptr [[TMP13]], align 4, !tbaa [[TBAA3]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 16
?
Fixed now, thanks
Split off from #154510, add helper to check if a block has any predecessors.
✅ With the latest revision this PR passed the C/C++ code formatter.
After #153643, there may be a
BranchOnCond with constant condition in the entry block.
Simplify those in removeBranchOnConst. This removes a number of
redundant conditional branches from entry blocks.
In some cases, it may also make the original scalar loop unreachable,
because we know it will never execute. In that case, we need to remove
the loop from LoopInfo, because all unreachable blocks may dominate each
other, making LoopInfo invalid. In those cases, we can also completely
remove the loop, for which I'll share a follow-up patch.
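As a minimal hand-written illustration (not taken from the patch's tests), the entry-block simplification looks like this in IR:

; Before: trip-count folding left a constant branch in the entry block, so the
; scalar loop is still formally reachable even though it can never run.
entry:
  br i1 false, label %scalar.ph, label %vector.ph

; After removeBranchOnConst: the branch becomes unconditional and scalar.ph
; (and the scalar loop it feeds) may be left with no predecessors at all.
entry:
  br label %vector.ph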
A couple of places that assume the scalar loop to still be valid need
updating.
Depends on #153643.