Skip to content

Commit af7e158

Browse files
committed
[LV] Vectorizer should adjust trip count in profile information
Summary: Vectorized loop processes VFxUF number of elements in one iteration thus total number of iterations decreases proportionally. In addition epilog loop may not have more than VFxUF - 1 iterations. This patch updates profile information accordingly. Reviewers: hsaito, Ayal, fhahn, reames, silvas, dcaballe, SjoerdMeijer, mkuper, DaniilSuchkov Reviewed By: Ayal, DaniilSuchkov Subscribers: fedor.sergeev, hiraditya, rkruppe, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67905
1 parent 1f946ee commit af7e158

File tree

5 files changed

+235
-13
lines changed

5 files changed

+235
-13
lines changed

llvm/include/llvm/Transforms/Utils/LoopUtils.h

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -262,10 +262,22 @@ TransformationMode hasLICMVersioningTransformation(Loop *L);
262262
void addStringMetadataToLoop(Loop *TheLoop, const char *MDString,
263263
unsigned V = 0);
264264

265-
/// Get a loop's estimated trip count based on branch weight metadata.
265+
/// Returns a loop's estimated trip count based on branch weight metadata.
266+
/// In addition if \p EstimatedLoopInvocationWeight is not null it is
267+
/// initialized with weight of loop's latch leading to the exit.
266268
/// Returns 0 when the count is estimated to be 0, or None when a meaningful
267269
/// estimate can not be made.
268-
Optional<unsigned> getLoopEstimatedTripCount(Loop *L);
270+
Optional<unsigned>
271+
getLoopEstimatedTripCount(Loop *L,
272+
unsigned *EstimatedLoopInvocationWeight = nullptr);
273+
274+
/// Set a loop's branch weight metadata to reflect that loop has \p
275+
/// EstimatedTripCount iterations and \p EstimatedLoopInvocationWeight exits
276+
/// through latch. Returns true if metadata is successfully updated, false
277+
/// otherwise. Note that loop must have a latch block which controls loop exit
278+
/// in order to succeed.
279+
bool setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount,
280+
unsigned EstimatedLoopInvocationWeight);
269281

270282
/// Check inner loop (L) backedge count is known to be invariant on all
271283
/// iterations of its outer loop. If the loop has no parent, this is trivially
@@ -370,6 +382,23 @@ int rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI,
370382
DominatorTree *DT, ReplaceExitVal ReplaceExitValue,
371383
SmallVector<WeakTrackingVH, 16> &DeadInsts);
372384

385+
/// Set weights for \p UnrolledLoop and \p RemainderLoop based on weights for
386+
/// \p OrigLoop and the following distribution of \p OrigLoop iteration among \p
387+
/// UnrolledLoop and \p RemainderLoop. \p UnrolledLoop receives weights that
388+
/// reflect TC/UF iterations, and \p RemainderLoop receives weights that reflect
389+
/// the remaining TC%UF iterations.
390+
///
391+
/// Note that \p OrigLoop may be equal to either \p UnrolledLoop or \p
392+
/// RemainderLoop in which case weights for \p OrigLoop are updated accordingly.
393+
/// Note also behavior is undefined if \p UnrolledLoop and \p RemainderLoop are
394+
/// equal. \p UF must be greater than zero.
395+
/// If \p OrigLoop has no profile info associated nothing happens.
396+
///
397+
/// This utility may be useful for such optimizations as unroller and
398+
/// vectorizer as it's typical transformation for them.
399+
void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop,
400+
Loop *RemainderLoop, uint64_t UF);
401+
373402
} // end namespace llvm
374403

375404
#endif // LLVM_TRANSFORMS_UTILS_LOOPUTILS_H

llvm/lib/Transforms/Utils/LoopUtils.cpp

Lines changed: 82 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include "llvm/IR/Dominators.h"
3333
#include "llvm/IR/Instructions.h"
3434
#include "llvm/IR/IntrinsicInst.h"
35+
#include "llvm/IR/MDBuilder.h"
3536
#include "llvm/IR/Module.h"
3637
#include "llvm/IR/PatternMatch.h"
3738
#include "llvm/IR/ValueHandle.h"
@@ -690,17 +691,17 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr,
690691
}
691692
}
692693

693-
Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
694-
// Support loops with an exiting latch and other existing exists only
695-
// deoptimize.
696-
697-
// Get the branch weights for the loop's backedge.
694+
/// Checks if \p L has single exit through latch block except possibly
695+
/// "deoptimizing" exits. Returns branch instruction terminating the loop
696+
/// latch if above check is successful, nullptr otherwise.
697+
static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) {
698698
BasicBlock *Latch = L->getLoopLatch();
699699
if (!Latch)
700-
return None;
700+
return nullptr;
701+
701702
BranchInst *LatchBR = dyn_cast<BranchInst>(Latch->getTerminator());
702703
if (!LatchBR || LatchBR->getNumSuccessors() != 2 || !L->isLoopExiting(Latch))
703-
return None;
704+
return nullptr;
704705

705706
assert((LatchBR->getSuccessor(0) == L->getHeader() ||
706707
LatchBR->getSuccessor(1) == L->getHeader()) &&
@@ -711,21 +712,36 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
711712
if (any_of(ExitBlocks, [](const BasicBlock *EB) {
712713
return !EB->getTerminatingDeoptimizeCall();
713714
}))
715+
return nullptr;
716+
717+
return LatchBR;
718+
}
719+
720+
Optional<unsigned>
721+
llvm::getLoopEstimatedTripCount(Loop *L,
722+
unsigned *EstimatedLoopInvocationWeight) {
723+
// Support loops with an exiting latch and other existing exists only
724+
// deoptimize.
725+
BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
726+
if (!LatchBranch)
714727
return None;
715728

716729
// To estimate the number of times the loop body was executed, we want to
717730
// know the number of times the backedge was taken, vs. the number of times
718731
// we exited the loop.
719732
uint64_t BackedgeTakenWeight, LatchExitWeight;
720-
if (!LatchBR->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight))
733+
if (!LatchBranch->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight))
721734
return None;
722735

723-
if (LatchBR->getSuccessor(0) != L->getHeader())
736+
if (LatchBranch->getSuccessor(0) != L->getHeader())
724737
std::swap(BackedgeTakenWeight, LatchExitWeight);
725738

726739
if (!LatchExitWeight)
727740
return None;
728741

742+
if (EstimatedLoopInvocationWeight)
743+
*EstimatedLoopInvocationWeight = LatchExitWeight;
744+
729745
// Estimated backedge taken count is a ratio of the backedge taken weight by
730746
// the weight of the edge exiting the loop, rounded to nearest.
731747
uint64_t BackedgeTakenCount =
@@ -734,6 +750,37 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
734750
return BackedgeTakenCount + 1;
735751
}
736752

753+
bool llvm::setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount,
754+
unsigned EstimatedloopInvocationWeight) {
755+
// Support loops with an exiting latch and other existing exists only
756+
// deoptimize.
757+
BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
758+
if (!LatchBranch)
759+
return false;
760+
761+
// Calculate taken and exit weights.
762+
unsigned LatchExitWeight = 0;
763+
unsigned BackedgeTakenWeight = 0;
764+
765+
if (EstimatedTripCount > 0) {
766+
LatchExitWeight = EstimatedloopInvocationWeight;
767+
BackedgeTakenWeight = (EstimatedTripCount - 1) * LatchExitWeight;
768+
}
769+
770+
// Make a swap if back edge is taken when condition is "false".
771+
if (LatchBranch->getSuccessor(0) != L->getHeader())
772+
std::swap(BackedgeTakenWeight, LatchExitWeight);
773+
774+
MDBuilder MDB(LatchBranch->getContext());
775+
776+
// Set/Update profile metadata.
777+
LatchBranch->setMetadata(
778+
LLVMContext::MD_prof,
779+
MDB.createBranchWeights(BackedgeTakenWeight, LatchExitWeight));
780+
781+
return true;
782+
}
783+
737784
bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop,
738785
ScalarEvolution &SE) {
739786
Loop *OuterL = InnerLoop->getParentLoop();
@@ -1351,3 +1398,29 @@ int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI,
13511398
Rewriter.clearInsertPoint();
13521399
return NumReplaced;
13531400
}
1401+
1402+
/// Set weights for \p UnrolledLoop and \p RemainderLoop based on weights for
1403+
/// \p OrigLoop.
1404+
void llvm::setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop,
1405+
Loop *RemainderLoop, uint64_t UF) {
1406+
assert(UF > 0 && "Zero unrolled factor is not supported");
1407+
assert(UnrolledLoop != RemainderLoop &&
1408+
"Unrolled and Remainder loops are expected to distinct");
1409+
1410+
// Get number of iterations in the original scalar loop.
1411+
unsigned OrigLoopInvocationWeight = 0;
1412+
Optional<unsigned> OrigAverageTripCount =
1413+
getLoopEstimatedTripCount(OrigLoop, &OrigLoopInvocationWeight);
1414+
if (!OrigAverageTripCount)
1415+
return;
1416+
1417+
// Calculate number of iterations in unrolled loop.
1418+
unsigned UnrolledAverageTripCount = *OrigAverageTripCount / UF;
1419+
// Calculate number of iterations for remainder loop.
1420+
unsigned RemainderAverageTripCount = *OrigAverageTripCount % UF;
1421+
1422+
setLoopEstimatedTripCount(UnrolledLoop, UnrolledAverageTripCount,
1423+
OrigLoopInvocationWeight);
1424+
setLoopEstimatedTripCount(RemainderLoop, RemainderAverageTripCount,
1425+
OrigLoopInvocationWeight);
1426+
}

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3483,6 +3483,19 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
34833483

34843484
// Remove redundant induction instructions.
34853485
cse(LoopVectorBody);
3486+
3487+
// Set/update profile weights for the vector and remainder loops as original
3488+
// loop iterations are now distributed among them. Note that original loop
3489+
// represented by LoopScalarBody becomes remainder loop after vectorization.
3490+
//
3491+
// For cases like foldTailByMasking() and requiresScalarEpiloque() we may
3492+
// end up getting slightly roughened result but that should be OK since
3493+
// profile is not inherently precise anyway. Note also possible bypass of
3494+
// vector code caused by legality checks is ignored, assigning all the weight
3495+
// to the vector loop, optimistically.
3496+
setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3497+
LI->getLoopFor(LoopVectorBody),
3498+
LI->getLoopFor(LoopScalarBody), VF * UF);
34863499
}
34873500

34883501
void InnerLoopVectorizer::fixCrossIterationPHIs() {
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt -passes="print<block-freq>,loop-vectorize" -force-vector-width=4 -force-vector-interleave=1 -S < %s | FileCheck %s
3+
; RUN: opt -passes="print<block-freq>,loop-vectorize" -force-vector-width=4 -force-vector-interleave=4 -S < %s | FileCheck %s -check-prefix=CHECK-MASKED
4+
5+
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
6+
7+
@a = dso_local global [1024 x i32] zeroinitializer, align 16
8+
@b = dso_local global [1024 x i32] zeroinitializer, align 16
9+
10+
; Check correctness of profile info for vectorization without epilog.
11+
; Function Attrs: nofree norecurse nounwind uwtable
12+
define dso_local void @_Z3foov() local_unnamed_addr #0 {
13+
; CHECK-LABEL: @_Z3foov(
14+
; CHECK: [[VECTOR_BODY:vector\.body]]:
15+
; CHECK: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_255:\!.*]],
16+
; CHECK: [[FOR_BODY:for\.body]]:
17+
; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP0_0:\!.*]],
18+
; CHECK-MASKED: [[VECTOR_BODY:vector\.body]]:
19+
; CHECK-MASKED: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_63:\!.*]],
20+
; CHECK-MASKED: [[FOR_BODY:for\.body]]:
21+
; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP0_0:\!.*]],
22+
;
23+
entry:
24+
br label %for.body
25+
26+
for.cond.cleanup: ; preds = %for.body
27+
ret void
28+
29+
for.body: ; preds = %for.body, %entry
30+
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
31+
%arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 %indvars.iv
32+
%0 = load i32, i32* %arrayidx, align 4, !tbaa !2
33+
%1 = trunc i64 %indvars.iv to i32
34+
%mul = mul nsw i32 %0, %1
35+
%arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %indvars.iv
36+
%2 = load i32, i32* %arrayidx2, align 4, !tbaa !2
37+
%add = add nsw i32 %2, %mul
38+
store i32 %add, i32* %arrayidx2, align 4, !tbaa !2
39+
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
40+
%exitcond = icmp eq i64 %indvars.iv.next, 1024
41+
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !6
42+
}
43+
44+
; Check correctness of profile info for vectorization with epilog.
45+
; Function Attrs: nofree norecurse nounwind uwtable
46+
define dso_local void @_Z3foo2v() local_unnamed_addr #0 {
47+
; CHECK-LABEL: @_Z3foo2v(
48+
; CHECK: [[VECTOR_BODY:vector\.body]]:
49+
; CHECK: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_255:\!.*]],
50+
; CHECK: [[FOR_BODY:for\.body]]:
51+
; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP1_2:\!.*]],
52+
; CHECK-MASKED: [[VECTOR_BODY:vector\.body]]:
53+
; CHECK-MASKED: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_63:\!.*]],
54+
; CHECK-MASKED: [[FOR_BODY:for\.body]]:
55+
; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP1_2:\!.*]],
56+
;
57+
entry:
58+
br label %for.body
59+
60+
for.cond.cleanup: ; preds = %for.body
61+
ret void
62+
63+
for.body: ; preds = %for.body, %entry
64+
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
65+
%arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 %indvars.iv
66+
%0 = load i32, i32* %arrayidx, align 4, !tbaa !2
67+
%1 = trunc i64 %indvars.iv to i32
68+
%mul = mul nsw i32 %0, %1
69+
%arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %indvars.iv
70+
%2 = load i32, i32* %arrayidx2, align 4, !tbaa !2
71+
%add = add nsw i32 %2, %mul
72+
store i32 %add, i32* %arrayidx2, align 4, !tbaa !2
73+
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
74+
%exitcond = icmp eq i64 %indvars.iv.next, 1027
75+
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !7
76+
}
77+
78+
attributes #0 = { "use-soft-float"="false" }
79+
80+
!llvm.module.flags = !{!0}
81+
!llvm.ident = !{!1}
82+
83+
; CHECK: [[LP1_255]] = !{!"branch_weights", i32 1, i32 255}
84+
; CHECK: [[LP0_0]] = !{!"branch_weights", i32 0, i32 0}
85+
; CHECK-MASKED: [[LP1_63]] = !{!"branch_weights", i32 1, i32 63}
86+
; CHECK-MASKED: [[LP0_0]] = !{!"branch_weights", i32 0, i32 0}
87+
; CHECK: [[LP1_2]] = !{!"branch_weights", i32 1, i32 2}
88+
89+
!0 = !{i32 1, !"wchar_size", i32 4}
90+
!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project c292b5b5e059e6ce3e6449e6827ef7e1037c21c4)"}
91+
!2 = !{!3, !3, i64 0}
92+
!3 = !{!"int", !4, i64 0}
93+
!4 = !{!"omnipotent char", !5, i64 0}
94+
!5 = !{!"Simple C++ TBAA"}
95+
!6 = !{!"branch_weights", i32 1, i32 1023}
96+
!7 = !{!"branch_weights", i32 1, i32 1026}

llvm/test/Transforms/LoopVectorize/tripcount.ll

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,10 @@ define i32 @foo_low_trip_count3(i1 %cond, i32 %bound) !prof !0 {
6161
; but has a high trip count per invocation. Vectorize it.
6262

6363
; CHECK-LABEL: @foo_low_trip_count3(
64-
; CHECK: vector.body:
65-
64+
; CHECK: [[VECTOR_BODY:vector\.body]]:
65+
; CHECK: br i1 [[TMP9:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP3:\!.*]],
66+
; CHECK: [[FOR_BODY:for\.body]]:
67+
; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP6:\!.*]],
6668
entry:
6769
br i1 %cond, label %for.preheader, label %for.end, !prof !2
6870

@@ -205,6 +207,15 @@ for.end: ; preds = %for.body
205207
ret i32 0
206208
}
207209

210+
; CHECK: [[LP3]] = !{!"branch_weights", i32 10, i32 2490}
211+
; CHECK: [[LP6]] = !{!"branch_weights", i32 10, i32 0}
212+
; original loop has latchExitWeight=10 and backedgeTakenWeight=10,000,
213+
; therefore estimatedBackedgeTakenCount=1,000 and estimatedTripCount=1,001.
214+
; Vectorizing by 4 produces estimatedTripCounts of 1,001/4=250 and 1,001%4=1
215+
; for vectorized and remainder loops, respectively, therefore their
216+
; estimatedBackedgeTakenCounts are 249 and 0, and so the weights recorded with
217+
; loop invocation weights of 10 are the above {10, 2490} and {10, 0}.
218+
208219
!0 = !{!"function_entry_count", i64 100}
209220
!1 = !{!"branch_weights", i32 100, i32 0}
210221
!2 = !{!"branch_weights", i32 10, i32 90}

0 commit comments

Comments
 (0)