-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[LV][AArch64] Prefer epilogue with fixed-width over scalable VF in case of equal costs. #155546
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-aarch64 Author: Hassnaa Hamdi (hassnaaHamdi) ChangesFor cases like post-LTO vectorization, epilogue with fixed-width VF Patch is 127.67 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/155546.diff 14 Files Affected:
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index c4ba8e9857dc4..b38e5e28363db 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1815,7 +1815,7 @@ class TargetTransformInfo {
/// \returns True if the targets prefers fixed width vectorization if the
/// loop vectorizer's cost-model assigns an equal cost to the fixed and
/// scalable version of the vectorized loop.
- LLVM_ABI bool preferFixedOverScalableIfEqualCost() const;
+ LLVM_ABI bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const;
/// \returns True if target prefers SLP vectorizer with altermate opcode
/// vectorization, false - otherwise.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 43813d2f3acb5..d5589113a14c9 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1092,7 +1092,9 @@ class TargetTransformInfoImplBase {
return VF;
}
- virtual bool preferFixedOverScalableIfEqualCost() const { return false; }
+ virtual bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const {
+ return false;
+ }
virtual bool preferInLoopReduction(RecurKind Kind, Type *Ty) const {
return false;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 4ac8f03e6dbf5..35e69857d8987 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1402,8 +1402,9 @@ unsigned TargetTransformInfo::getStoreVectorFactor(unsigned VF,
return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
}
-bool TargetTransformInfo::preferFixedOverScalableIfEqualCost() const {
- return TTIImpl->preferFixedOverScalableIfEqualCost();
+bool TargetTransformInfo::preferFixedOverScalableIfEqualCost(
+ bool IsEpilogue) const {
+ return TTIImpl->preferFixedOverScalableIfEqualCost(IsEpilogue);
}
bool TargetTransformInfo::preferInLoopReduction(RecurKind Kind,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 490f6391c15a0..f073e2c259632 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -6017,7 +6017,13 @@ static bool containsDecreasingPointers(Loop *TheLoop,
return false;
}
-bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost() const {
+bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost(bool IsEpilogue) const {
+ // For cases like post-LTO vectorization, when we eventually know the trip
+ // count, epilogue with fixed-width vectorization can be deleted if the trip
+ // count is less than the epilogue iterations. That's why we prefer
+ // fixed-width vectorization in epilogue in case of equal costs.
+ if (IsEpilogue)
+ return true;
if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
return SVEPreferFixedOverScalableIfEqualCost;
return ST->useFixedOverScalableIfEqualCost();
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 42ae962b3b426..8ca263ec1cb65 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -424,7 +424,7 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
return TailFoldingStyle::DataWithoutLaneMask;
}
- bool preferFixedOverScalableIfEqualCost() const override;
+ bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override;
unsigned getEpilogueVectorizationMinVF() const override;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 838476dcae661..0f6f991e866a3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -613,13 +613,15 @@ class LoopVectorizationPlanner {
/// Returns true if the per-lane cost of VectorizationFactor A is lower than
/// that of B.
bool isMoreProfitable(const VectorizationFactor &A,
- const VectorizationFactor &B, bool HasTail) const;
+ const VectorizationFactor &B, bool HasTail,
+ bool isEpilogue = false) const;
/// Returns true if the per-lane cost of VectorizationFactor A is lower than
/// that of B in the context of vectorizing a loop with known \p MaxTripCount.
bool isMoreProfitable(const VectorizationFactor &A,
const VectorizationFactor &B,
- const unsigned MaxTripCount, bool HasTail) const;
+ const unsigned MaxTripCount, bool HasTail,
+ bool isEpilogue = false) const;
/// Determines if we have the infrastructure to vectorize the loop and its
/// epilogue, assuming the main loop is vectorized by \p VF.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7fc87a0b49f70..677e1b4096645 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3939,7 +3939,8 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
const VectorizationFactor &B,
const unsigned MaxTripCount,
- bool HasTail) const {
+ bool HasTail,
+ bool isEpilogue) const {
InstructionCost CostA = A.Cost;
InstructionCost CostB = B.Cost;
@@ -3963,7 +3964,7 @@ bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
// Assume vscale may be larger than 1 (or the value being tuned for),
// so that scalable vectorization is slightly favorable over fixed-width
// vectorization.
- bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
+ bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(isEpilogue) &&
A.Width.isScalable() && !B.Width.isScalable();
auto CmpFn = [PreferScalable](const InstructionCost &LHS,
@@ -4001,10 +4002,11 @@ bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
const VectorizationFactor &B,
- bool HasTail) const {
+ bool HasTail,
+ bool isEpilogue) const {
const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
- return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount,
- HasTail);
+ return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail,
+ isEpilogue);
}
void LoopVectorizationPlanner::emitInvalidCostRemarks(
@@ -4398,16 +4400,10 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
if (TTI.getMaxInterleaveFactor(VF) <= 1)
return false;
- // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
- // VFs when deciding profitability.
- // See related "TODO: extend to support scalable VFs." in
- // selectEpilogueVectorizationFactor.
- unsigned Multiplier = VF.isFixed() ? IC : 1;
unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
? EpilogueVectorizationMinVF
: TTI.getEpilogueVectorizationMinVF();
- return estimateElementCount(VF * Multiplier, VScaleForTuning) >=
- MinVFThreshold;
+ return estimateElementCount(VF * IC, VScaleForTuning) >= MinVFThreshold;
}
VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
@@ -4511,7 +4507,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
}
if (Result.Width.isScalar() ||
- isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking()))
+ isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking(),
+ /*IsEpilogue*/ true))
Result = NextVF;
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index 137e07336fd50..80130305aaf04 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -8,24 +8,29 @@ target triple = "arm64-apple-macosx14.0.0"
define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; DEFAULT-LABEL: define void @iv_casts(
; DEFAULT-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; DEFAULT-NEXT: [[ENTRY:.*]]:
+; DEFAULT-NEXT: [[ITER_CHECK:.*]]:
; DEFAULT-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
; DEFAULT-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 4
+; DEFAULT-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; DEFAULT: [[VECTOR_MEMCHECK]]:
; DEFAULT-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
; DEFAULT-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
; DEFAULT-NEXT: [[TMP6:%.*]] = sub i64 [[DST1]], [[SRC2]]
; DEFAULT-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
-; DEFAULT-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
-; DEFAULT: [[VECTOR_PH]]:
+; DEFAULT-NEXT: br i1 [[DIFF_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; DEFAULT: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
; DEFAULT-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 16
+; DEFAULT-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP9]], 4
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[TMP0]], [[TMP8]]
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK3]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; DEFAULT: [[VECTOR_PH]]:
+; DEFAULT-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP11]], 16
; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP10]]
; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[X]], i64 0
@@ -63,25 +68,59 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; DEFAULT-NEXT: br i1 [[TMP44]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
-; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; DEFAULT: [[SCALAR_PH]]:
-; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; DEFAULT: [[VEC_EPILOG_ITER_CHECK]]:
+; DEFAULT-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; DEFAULT-NEXT: [[TMP47:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP48:%.*]] = shl nuw i64 [[TMP47]], 2
+; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP48]]
+; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
+; DEFAULT: [[VEC_EPILOG_PH]]:
+; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; DEFAULT-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP50:%.*]] = mul nuw i64 [[TMP49]], 4
+; DEFAULT-NEXT: [[N_MOD_VF5:%.*]] = urem i64 [[TMP0]], [[TMP50]]
+; DEFAULT-NEXT: [[N_VEC6:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF5]]
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT7]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP51:%.*]] = trunc <vscale x 4 x i32> [[BROADCAST_SPLAT8]] to <vscale x 4 x i16>
+; DEFAULT-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; DEFAULT: [[VEC_EPILOG_VECTOR_BODY]]:
+; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]]
+; DEFAULT-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 4 x i8>, ptr [[GEP_SRC]], align 1
+; DEFAULT-NEXT: [[TMP39:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD10]] to <vscale x 4 x i16>
+; DEFAULT-NEXT: [[TMP40:%.*]] = mul <vscale x 4 x i16> [[TMP39]], [[TMP51]]
+; DEFAULT-NEXT: [[TMP52:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD10]] to <vscale x 4 x i16>
+; DEFAULT-NEXT: [[TMP53:%.*]] = or <vscale x 4 x i16> [[TMP40]], [[TMP52]]
+; DEFAULT-NEXT: [[TMP54:%.*]] = lshr <vscale x 4 x i16> [[TMP53]], splat (i16 1)
+; DEFAULT-NEXT: [[TMP55:%.*]] = trunc <vscale x 4 x i16> [[TMP54]] to <vscale x 4 x i8>
+; DEFAULT-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; DEFAULT-NEXT: store <vscale x 4 x i8> [[TMP55]], ptr [[TMP45]], align 1
+; DEFAULT-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[IV]], [[TMP50]]
+; DEFAULT-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC6]]
+; DEFAULT-NEXT: br i1 [[TMP46]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; DEFAULT: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; DEFAULT-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC6]]
+; DEFAULT-NEXT: br i1 [[CMP_N12]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
+; DEFAULT: [[VEC_EPILOG_SCALAR_PH]]:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC6]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ITER_CHECK]] ]
; DEFAULT-NEXT: br label %[[LOOP:.*]]
; DEFAULT: [[LOOP]]:
-; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; DEFAULT-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]]
-; DEFAULT-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; DEFAULT-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; DEFAULT-NEXT: [[GEP_SRC1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV1]]
+; DEFAULT-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC1]], align 1
; DEFAULT-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32
; DEFAULT-NEXT: [[MUL16_US:%.*]] = mul i32 [[L_EXT]], [[X]]
-; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 1
; DEFAULT-NEXT: [[CONV25_US:%.*]] = zext i8 [[L]] to i32
; DEFAULT-NEXT: [[ADD34_US:%.*]] = or i32 [[MUL16_US]], [[CONV25_US]]
; DEFAULT-NEXT: [[SHR35_US:%.*]] = lshr i32 [[ADD34_US]], 1
; DEFAULT-NEXT: [[CONV36_US:%.*]] = trunc i32 [[SHR35_US]] to i8
-; DEFAULT-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; DEFAULT-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV1]]
; DEFAULT-NEXT: store i8 [[CONV36_US]], ptr [[GEP_DST]], align 1
-; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
-; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV1]], [[N]]
+; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
; DEFAULT: [[EXIT]]:
; DEFAULT-NEXT: ret void
;
@@ -218,7 +257,7 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 {
; DEFAULT-NEXT: store i32 1, ptr [[TMP21]], align 4
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; DEFAULT-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DEFAULT-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -234,7 +273,7 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 {
; DEFAULT-NEXT: store i32 1, ptr [[GEP]], align 4
; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
-; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; DEFAULT: [[EXIT]]:
; DEFAULT-NEXT: ret void
;
@@ -382,7 +421,7 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
; DEFAULT-NEXT: store i32 [[TMP15]], ptr [[TMP24]], align 4
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; DEFAULT-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DEFAULT-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -401,7 +440,7 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
; DEFAULT-NEXT: store i32 [[IV_2]], ptr [[GEP]], align 4
; DEFAULT-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1
; DEFAULT-NEXT: [[EXITCOND_3_NOT:%.*]] = icmp eq i64 [[IV_1]], [[N]]
-; DEFAULT-NEXT: br i1 [[EXITCOND_3_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[EXITCOND_3_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
; DEFAULT: [[EXIT]]:
; DEFAULT-NEXT: ret void
;
@@ -574,7 +613,7 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
; DEFAULT-NEXT: store i32 [[TMP14]], ptr [[TMP23]], align 4
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; DEFAULT-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DEFAULT-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -593,7 +632,7 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
; DEFAULT-NEXT: store i32 [[IV_2]], ptr [[GEP]], align 4
; DEFAULT-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1
; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_1]], [[N]]
-; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[EC]], label %[[EX...
[truncated]
|
// count, epilogue with fixed-width vectorization can be deleted if the trip | ||
// count is less than the epilogue iterations. That's why we prefer | ||
// fixed-width vectorization in epilogue in case of equal costs. | ||
if (IsEpilogue) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we probably want this to happen after checking SVEPreferFixedOverScalableIfEqualCost
so that we have a way to disable it. Also, it might be worth calling useFixedOverScalableIfEqualCost(IsEpilogue)
to get the answer and setting this on a per-CPU basis.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To set it on per-CPU, I think I have to create another new target feature for it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The reasoning above seems not CPU specific and should apply to all CPUs?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah that's probably true. I was just thinking of limiting this for now to specific CPUs where we have observed the problem, but maybe that's unnecessary. I guess we can leave like it this for now.
…se of equal costs. For cases like post-LTO vectorization, epilogue with fixed-width VF can be removed when we eventually know that the trip count is less than the epilogue iterations.
b8f68f6
to
3af468f
Compare
@@ -1815,7 +1815,7 @@ class TargetTransformInfo { | |||
/// \returns True if the targets prefers fixed width vectorization if the | |||
/// loop vectorizer's cost-model assigns an equal cost to the fixed and | |||
/// scalable version of the vectorized loop. | |||
LLVM_ABI bool preferFixedOverScalableIfEqualCost() const; | |||
LLVM_ABI bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should be documented
// count, epilogue with fixed-width vectorization can be deleted if the trip | ||
// count is less than the epilogue iterations. That's why we prefer | ||
// fixed-width vectorization in epilogue in case of equal costs. | ||
if (IsEpilogue) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The reasoning above seems not CPU specific and should apply to all CPUs?
// count, epilogue with fixed-width vectorization can be deleted if the trip | ||
// count is less than the epilogue iterations. That's why we prefer | ||
// fixed-width vectorization in epilogue in case of equal costs. | ||
if (IsEpilogue) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah that's probably true. I was just thinking of limiting this for now to specific CPUs where we have observed the problem, but maybe that's unnecessary. I guess we can leave like it this for now.
@@ -10,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu" | |||
define void @foo(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i64 %len) #0 { | |||
; CHECK-EPILOG: vec.epilog.ph: | |||
; CHECK-EPILOG: vec.epilog.vector.body: | |||
; CHECK-EPILOG: load <vscale x 4 x i16> | |||
; CHECK-EPILOG: load <8 x i16> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it's worth adding a new RUN line for neoverse-v1 with the flag -sve-prefer-fixed-over-scalable-if-equal=false
so that we also test the old behaviour, i.e.
CHECK-EPILOG-PREFER-SCALABLE: load <vscale x 4 x i16>
@@ -11,7 +11,7 @@ target triple = "aarch64-linux-gnu" | |||
|
|||
; DEBUG: LV: Checking a loop in 'main_vf_vscale_x_16' | |||
; DEBUG: Create Skeleton for epilogue vectorized loop (first pass) | |||
; DEBUG: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:vscale x 8, Epilogue Loop UF:1 | |||
; DEBUG: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, I think it's probably worth adding a variant of the first RUN line where we set the flag -sve-prefer-fixed-over-scalable-if-equal=false
just to defend the selection of scalable VFs for the epilogue.
For cases like post-LTO vectorization, epilogue with fixed-width VF
can be removed when we eventually know that the trip count is less
than the epilogue iterations.