-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[LV] Add initial legality checks for early exit loops with side effects #145663
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[LV] Add initial legality checks for early exit loops with side effects #145663
Conversation
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms Author: Graham Hunter (huntergr-arm) ChangesSeparating out the next piece of EE-with-stores Full diff: https://github.com/llvm/llvm-project/pull/145663.diff 4 Files Affected:
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index d654ac3ec9273..aa34daf596b6b 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -407,6 +407,14 @@ class LoopVectorizationLegality {
return hasUncountableEarlyExit() ? getUncountableEdge()->second : nullptr;
}
+ /// Returns true if this is an early exit loop containing a store.
+ bool isConditionCopyRequired() const { return EarlyExitLoad.has_value(); }
+
+ /// Returns the load instruction, if any, directly used for an exit comparison
+ /// in and early exit loop containing state-changing or potentially-faulting
+ /// operations.
+ std::optional<LoadInst *> getEarlyExitLoad() const { return EarlyExitLoad; }
+
/// Return true if there is store-load forwarding dependencies.
bool isSafeForAnyStoreLoadForwardDistances() const {
return LAI->getDepChecker().isSafeForAnyStoreLoadForwardDistances();
@@ -536,6 +544,12 @@ class LoopVectorizationLegality {
/// additional cases safely.
bool isVectorizableEarlyExitLoop();
+ /// Clears any current early exit data gathered if a check failed.
+ void clearEarlyExitData() {
+ UncountableEdge = std::nullopt;
+ EarlyExitLoad = std::nullopt;
+ }
+
/// Return true if all of the instructions in the block can be speculatively
/// executed, and record the loads/stores that require masking.
/// \p SafePtrs is a list of addresses that are known to be legal and we know
@@ -654,6 +668,10 @@ class LoopVectorizationLegality {
/// Keep track of the loop edge to an uncountable exit, comprising a pair
/// of (Exiting, Exit) blocks, if there is exactly one early exit.
std::optional<std::pair<BasicBlock *, BasicBlock *>> UncountableEdge;
+
+ /// Keep track of the load used for early exits where state-changing or
+ /// potentially faulting operations occur inside the loop.
+ std::optional<LoadInst *> EarlyExitLoad;
};
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 969d225c6ef2e..eec660e5958ca 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -17,6 +17,7 @@
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
@@ -1207,8 +1208,42 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
});
}
- if (!LAI->canVectorizeMemory())
- return canVectorizeIndirectUnsafeDependences();
+ if (LAI->canVectorizeMemory()) {
+ // FIXME: Remove or reduce this restriction. We're in a bit of an odd spot
+ // since we're (potentially) doing the load out of its normal order
+ // in the loop and that may throw off dependency checking.
+ // A forward dependency should be fine, but a backwards dep may not
+ // be even if LAA thinks it is due to performing the load for the
+ // vector iteration i+1 in vector iteration i.
+ if (isConditionCopyRequired()) {
+ const MemoryDepChecker &DepChecker = LAI->getDepChecker();
+ const auto *Deps = DepChecker.getDependences();
+
+ for (const MemoryDepChecker::Dependence &Dep : *Deps) {
+ if (Dep.getDestination(DepChecker) == EarlyExitLoad ||
+ Dep.getSource(DepChecker) == EarlyExitLoad) {
+ // Refine language a little? This currently only applies when a store
+ // is present in the early exit loop.
+ reportVectorizationFailure(
+ "No dependencies allowed for early exit condition load",
+ "Early exit condition loads may not have a dependence with "
+ "another"
+ " memory operation.",
+ "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
+ return false;
+ }
+ }
+ }
+ } else {
+ if (!isConditionCopyRequired())
+ return canVectorizeIndirectUnsafeDependences();
+ reportVectorizationFailure(
+ "Cannot vectorize unsafe dependencies in state-changing early exit "
+ "loop.",
+ "Unable to vectorize memory in an early exit loop with store",
+ "CantVectorizeUnsafeDependencyForEELoopWithStore", ORE, TheLoop);
+ return false;
+ }
if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
reportVectorizationFailure("We don't allow storing to uniform addresses",
@@ -1747,16 +1782,31 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
}
};
+ bool HasStore = false;
for (auto *BB : TheLoop->blocks())
for (auto &I : *BB) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+ HasStore = true;
+ if (SI->isSimple())
+ continue;
+
+ reportVectorizationFailure(
+ "Complex writes to memory unsupported in early exit loops",
+ "Cannot vectorize early exit loop with complex writes to memory",
+ "WritesInEarlyExitLoop", ORE, TheLoop);
+ return false;
+ }
+
if (I.mayWriteToMemory()) {
// We don't support writes to memory.
reportVectorizationFailure(
- "Writes to memory unsupported in early exit loops",
- "Cannot vectorize early exit loop with writes to memory",
+ "Complex writes to memory unsupported in early exit loops",
+ "Cannot vectorize early exit loop with complex writes to memory",
"WritesInEarlyExitLoop", ORE, TheLoop);
return false;
- } else if (!IsSafeOperation(&I)) {
+ }
+
+ if (!IsSafeOperation(&I)) {
reportVectorizationFailure("Early exit loop contains operations that "
"cannot be speculatively executed",
"UnsafeOperationsEarlyExitLoop", ORE,
@@ -1771,13 +1821,65 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
// TODO: Handle loops that may fault.
Predicates.clear();
- if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
- &Predicates)) {
- reportVectorizationFailure(
- "Loop may fault",
- "Cannot vectorize potentially faulting early exit loop",
- "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
- return false;
+ if (HasStore) {
+ // Record load for analysis by isDereferenceableAndAlignedInLoop
+ // and later by dependence analysis.
+ if (BranchInst *Br = dyn_cast<BranchInst>(
+ SingleUncountableEdge->first->getTerminator())) {
+ // FIXME: Handle exit conditions with multiple users, more complex exit
+ // conditions than br(icmp(load, loop_inv)).
+ ICmpInst *Cmp = dyn_cast<ICmpInst>(Br->getCondition());
+ if (Cmp && Cmp->hasOneUse() &&
+ TheLoop->isLoopInvariant(Cmp->getOperand(1))) {
+ LoadInst *Load = dyn_cast<LoadInst>(Cmp->getOperand(0));
+ if (Load && Load->hasOneUse() && !TheLoop->isLoopInvariant(Load)) {
+ if (isDereferenceableAndAlignedInLoop(Load, TheLoop, *PSE.getSE(),
+ *DT, AC, &Predicates)) {
+ ICFLoopSafetyInfo SafetyInfo;
+ SafetyInfo.computeLoopSafetyInfo(TheLoop);
+ // FIXME: We may have multiple levels of conditional loads, so will
+ // need to improve on outright rejection at some point.
+ if (SafetyInfo.isGuaranteedToExecute(*Load, DT, TheLoop)) {
+ EarlyExitLoad = Load;
+ } else {
+ reportVectorizationFailure(
+ "Early exit condition load not guaranteed to execute",
+ "Cannot vectorize early exit loop when condition load is not "
+ "guaranteed to execute",
+ "EarlyExitLoadNotGuaranteed", ORE, TheLoop);
+ }
+ } else {
+ reportVectorizationFailure(
+ "Uncounted loop condition not known safe",
+ "Cannot vectorize early exit loop with "
+ "possibly unsafe condition load",
+ "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
+ return false;
+ }
+ }
+ }
+ }
+
+ if (!EarlyExitLoad) {
+ reportVectorizationFailure(
+ "Early exit loop with store but no condition load",
+ "Cannot vectorize early exit loop with store but no condition load",
+ "NoConditionLoadForEarlyExitLoop", ORE, TheLoop);
+ return false;
+ }
+ } else {
+ // Read-only loop.
+ // FIXME: as with the loops with stores, only the loads contributing to
+ // the loop condition need to be guaranteed dereferenceable and
+ // aligned.
+ if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
+ &Predicates)) {
+ reportVectorizationFailure(
+ "Loop may fault",
+ "Cannot vectorize potentially faulting early exit loop",
+ "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
+ return false;
+ }
}
[[maybe_unused]] const SCEV *SymbolicMaxBTC =
@@ -1861,7 +1963,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
return false;
} else {
if (!isVectorizableEarlyExitLoop()) {
- UncountableEdge = std::nullopt;
+ clearEarlyExitData();
if (DoExtraAnalysis)
Result = false;
else
@@ -1879,6 +1981,15 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
return false;
}
+ // Bail out for state-changing EE loops for now.
+ if (EarlyExitLoad) {
+ reportVectorizationFailure(
+ "Writes to memory unsupported in early exit loops",
+ "Cannot vectorize early exit loop with writes to memory",
+ "WritesInEarlyExitLoop", ORE, TheLoop);
+ return false;
+ }
+
if (Result) {
LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
<< (LAI->getRuntimePointerChecking()->Need
diff --git a/llvm/test/Transforms/LoopVectorize/control-flow.ll b/llvm/test/Transforms/LoopVectorize/control-flow.ll
index 3a8aec34dfe43..2578260fe878d 100644
--- a/llvm/test/Transforms/LoopVectorize/control-flow.ll
+++ b/llvm/test/Transforms/LoopVectorize/control-flow.ll
@@ -10,7 +10,7 @@
; return 0;
; }
-; CHECK: remark: source.cpp:5:9: loop not vectorized: Cannot vectorize early exit loop with writes to memory
+; CHECK: remark: source.cpp:5:9: loop not vectorized: Cannot vectorize early exit loop with possibly unsafe condition load
; CHECK: remark: source.cpp:5:9: loop not vectorized
; CHECK: _Z4testPii
diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll
index 7c80dad006952..6724c703e4940 100644
--- a/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll
+++ b/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll
@@ -3,7 +3,7 @@
define i64 @loop_contains_store(ptr %dest) {
; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store'
-; CHECK: LV: Not vectorizing: Writes to memory unsupported in early exit loops
+; CHECK: LV: Not vectorizing: Early exit loop with store but no condition load.
entry:
%p1 = alloca [1024 x i8]
call void @init_mem(ptr %p1, i64 1024)
@@ -56,7 +56,7 @@ exit:
define void @loop_contains_store_ee_condition_is_invariant(ptr dereferenceable(40) noalias %array, i16 %ee.val) {
; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_ee_condition_is_invariant'
-; CHECK: LV: Not vectorizing: Writes to memory unsupported in early exit loops.
+; CHECK: LV: Not vectorizing: Early exit loop with store but no condition load.
entry:
br label %for.body
@@ -80,7 +80,7 @@ exit:
define void @loop_contains_store_fcmp_condition(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(40) readonly %pred) {
; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_fcmp_condition'
-; CHECK: LV: Not vectorizing: Writes to memory unsupported in early exit loops.
+; CHECK: LV: Not vectorizing: Early exit loop with store but no condition load.
entry:
br label %for.body
@@ -106,7 +106,7 @@ exit:
define void @loop_contains_store_safe_dependency(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(96) %pred) {
; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_safe_dependency'
-; CHECK: LV: Not vectorizing: Writes to memory unsupported in early exit loops.
+; CHECK: LV: Not vectorizing: No dependencies allowed for early exit condition load.
entry:
%pred.plus.8 = getelementptr inbounds nuw i16, ptr %pred, i64 8
br label %for.body
@@ -135,7 +135,7 @@ exit:
define void @loop_contains_store_unsafe_dependency(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(80) readonly %pred) {
; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_unsafe_dependency'
-; CHECK: LV: Not vectorizing: Writes to memory unsupported in early exit loops.
+; CHECK: LV: Not vectorizing: Uncounted loop condition not known safe.
entry:
%unknown.offset = call i64 @get_an_unknown_offset()
%unknown.cmp = icmp ult i64 %unknown.offset, 20
@@ -149,10 +149,10 @@ for.body:
%data = load i16, ptr %st.addr, align 2
%inc = add nsw i16 %data, 1
store i16 %inc, ptr %st.addr, align 2
- %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv
+ %ee.addr = getelementptr inbounds nuw i16, ptr %unknown.base, i64 %iv
%ee.val = load i16, ptr %ee.addr, align 2
%ee.cond = icmp sgt i16 %ee.val, 500
- %some.addr = getelementptr inbounds nuw i16, ptr %unknown.base, i64 %iv
+ %some.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv
store i16 42, ptr %some.addr, align 2
br i1 %ee.cond, label %exit, label %for.inc
@@ -194,7 +194,7 @@ exit:
define void @loop_contains_store_unknown_bounds(ptr align 2 dereferenceable(100) noalias %array, ptr align 2 dereferenceable(100) readonly %pred, i64 %n) {
; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_unknown_bounds'
-; CHECK: LV: Not vectorizing: Writes to memory unsupported in early exit loops.
+; CHECK: LV: Not vectorizing: Uncounted loop condition not known safe.
entry:
br label %for.body
@@ -220,7 +220,7 @@ exit:
define void @loop_contains_store_volatile(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(40) readonly %pred) {
; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_volatile'
-; CHECK: LV: Not vectorizing: Writes to memory unsupported in early exit loops.
+; CHECK: LV: Not vectorizing: Complex writes to memory unsupported in early exit loops.
entry:
br label %for.body
@@ -324,7 +324,7 @@ exit:
define void @loop_contains_store_condition_load_is_chained(ptr dereferenceable(40) noalias %array, ptr align 8 dereferenceable(160) readonly %offsets, ptr align 2 dereferenceable(40) readonly %pred) {
; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_condition_load_is_chained'
-; CHECK: LV: Not vectorizing: Writes to memory unsupported in early exit loops.
+; CHECK: LV: Not vectorizing: Uncounted loop condition not known safe.
entry:
br label %for.body
|
Can you expand the commit message please and add more detail? I am aware of how this fits together with your other work, but others may not be. On this PR as well it might be worth adding a link to any other related PR or RFC to explain the motivation. |
@@ -407,6 +407,14 @@ class LoopVectorizationLegality { | |||
return hasUncountableEarlyExit() ? getUncountableEdge()->second : nullptr; | |||
} | |||
|
|||
/// Returns true if this is an early exit loop containing a store. | |||
bool isConditionCopyRequired() const { return EarlyExitLoad.has_value(); } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this looks odd, i.e. the comment says Returns true if this is an early exit loop containing a store.
but the code says return EarlyExitLoad.has_value()
. I expect others reading this code will be confused as well. Can you add more comments explaining why it's using an early exit load to decide if the loop contains a store? Or is just that the comment itself is misleading?
/// Returns the load instruction, if any, directly used for an exit comparison | ||
/// in and early exit loop containing state-changing or potentially-faulting | ||
/// operations. | ||
std::optional<LoadInst *> getEarlyExitLoad() const { return EarlyExitLoad; } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Given the similarity between getEarlyExitLoad and isConditionCopyRequired, it might be better to rename isConditionCopyRequired to hasEarlyExitLoad? That way hasEarlyExitLoad
at least reflects what it's actually testing. If you still need a function called isConditionCopyRequired
you can always have one like this:
bool isConditionCopyRequired() const { return hasEarlyExitLoad(); }
|
||
/// Returns the load instruction, if any, directly used for an exit comparison | ||
/// in and early exit loop containing state-changing or potentially-faulting | ||
/// operations. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Given we already have support for vectorising early exit loops with loads in them, I think the name getEarlyExitLoad
could be confusing. It also makes it sound like there is only one load in the loop. How about something like getCriticalLoadInUncountableEarlyExit
? The problem is that we support vectorising loops with countable early exits so we have to be specific what we mean.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've refactored this to no longer keep track of the load in the class, as we only need it during initial analysis and not when actually vectorizing. This does mean I have to pass it as a parameter for a couple of functions, but I think that's ok. It can be expanded to a SmallVector later once we have more than one load contributing to the uncounted exit condition.
@@ -1207,8 +1208,42 @@ bool LoopVectorizationLegality::canVectorizeMemory() { | |||
}); | |||
} | |||
|
|||
if (!LAI->canVectorizeMemory()) | |||
return canVectorizeIndirectUnsafeDependences(); | |||
if (LAI->canVectorizeMemory()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Given the following code is still memory related I think it would be better to push these changes into canVectorizeMemory
itself.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure about adding this to LoopAccessAnalysis; are there other passes that could make use of the info?
const MemoryDepChecker &DepChecker = LAI->getDepChecker(); | ||
const auto *Deps = DepChecker.getDependences(); | ||
|
||
for (const MemoryDepChecker::Dependence &Dep : *Deps) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps worth using llvm::any_of here? i.e.
if (llvm::any_of(*Deps, [](auto &Dep) { return Dep.getDestination(DepChecker) == EarlyExitLoad || Dep.getSource(DepChecker) == EarlyExitLoad }) {
report...
return false;
}
} | ||
} else { | ||
// Read-only loop. | ||
// FIXME: as with the loops with stores, only the loads contributing to |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think this comment is true for two reasons:
- You can have a loop containing loads before the early exit that do not contribute to the early exit condition, for example there may be an outside use of the load. The outside user only cares about the value just before exiting and the original scalar loop would not load beyond the early exit. Therefore such loads need to be dereferenceable.
- We also support vectorising loops with loads after the early exit condition, for example with outside uses of the loaded values. You should be able to find some tests for them I think. We can only vectorise such early exit loops if such loads are proven to be dereferenceable.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was thinking in terms of the transformation I apply to handle the stores; the vector loop stops before reaching the iteration which would exit, and the scalar loop handles it instead. Done this way, only the loads that contribute to the uncounted exit(s) need to be guaranteed to be dereferenceable since we're in the scalar loop when the exit occurs.
The same would apply with a predicated tail-folded vector loop, though -- once you've determined the masks for the exit, you can use them for the loads and stores inside the loop to ensure you never access memory addresses that the scalar loop wouldn't.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd still prefer this FIXME to be removed because I think it's misleading. The existing method of vectorisation for loops without stores is to execute the vector loop up to the point of the early exit, which definitely requires the loads to be dereferenceable. Your FIXME implies that with that style of vectorisation we can remove the checks, which isn't true. Also, if we know at runtime the loop contains no stores and the loads are dereferenceable anyway, then why artificially bail out before the early exit when we don't need to? The FIXME suggests that in future we will unconditionally move to this new style of vectorisation you're proposing, whereas I expect there to be a choice based on the trade-offs for this specific loop.
I'm happy for you to add a TODO that says something like "TODO: With an alternative style of vectorisation where we exit the vector loop prior to the early exit we can remove the restriction that loads must be dereferenceable."
// conditions than br(icmp(load, loop_inv)). | ||
ICmpInst *Cmp = dyn_cast<ICmpInst>(Br->getCondition()); | ||
if (Cmp && Cmp->hasOneUse() && | ||
TheLoop->isLoopInvariant(Cmp->getOperand(1))) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is the loop invariant always guaranteed to be on the right-hand side? It may not be a constant.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, but I'm deliberately not trying to cover all bases here; similar to the transform code I intend to get the bare minimum in first, then increase the number of cases handled.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add a TODO here then so we don't forget?
} | ||
} | ||
|
||
if (!EarlyExitLoad) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think it's safe to leave here without also checking if the other loads are dereferenceable. See my comments below. We support vectorising early exit loops with loads before and after the early exit that don't contribute to the exit condition. Such loads can have outside users or perhaps one day we'll support reductions in early exit loops, etc.
"EarlyExitLoadNotGuaranteed", ORE, TheLoop); | ||
} | ||
} else { | ||
reportVectorizationFailure( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can't we just use the same error message as below?
reportVectorizationFailure(
"Loop may fault",
"Cannot vectorize potentially faulting early exit loop",
"PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
I'm not sure we need to distinguish between non-dereferenceable loads in loops with or without stores.
// FIXME: We may have multiple levels of conditional loads, so will | ||
// need to improve on outright rejection at some point. | ||
if (SafetyInfo.isGuaranteedToExecute(*Load, DT, TheLoop)) { | ||
EarlyExitLoad = Load; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this variable should be named something else because it gives the impression that for any arbitrary early exit loop there is only one load permitted inside it. Plus, the loop could have a countable early exit too.
How about CriticalUncountableEarlyExitLoad or something like that? It's hard to come up with a decent name.
0555fe6
to
4691a20
Compare
} | ||
} else { | ||
// Read-only loop. | ||
// FIXME: as with the loops with stores, only the loads contributing to |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd still prefer this FIXME to be removed because I think it's misleading. The existing method of vectorisation for loops without stores is to execute the vector loop up to the point of the early exit, which definitely requires the loads to be dereferenceable. Your FIXME implies that with that style of vectorisation we can remove the checks, which isn't true. Also, if we know at runtime the loop contains no stores and the loads are dereferenceable anyway, then why artificially bail out before the early exit when we don't need to? The FIXME suggests that in future we will unconditionally move to this new style of vectorisation you're proposing, whereas I expect there to be a choice based on the trade-offs for this specific loop.
I'm happy for you to add a TODO that says something like "TODO: With an alternative style of vectorisation where we exit the vector loop prior to the early exit we can remove the restriction that loads must be dereferenceable."
@@ -512,7 +520,7 @@ class LoopVectorizationLegality { | |||
/// we read and write from memory. This method checks if it is | |||
/// legal to vectorize the code, considering only memory constrains. | |||
/// Returns true if the loop is vectorizable | |||
bool canVectorizeMemory(); | |||
bool canVectorizeMemory(std::optional<LoadInst *>); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add names to the new argument here and on isVectorizableEarlyExitLoop
? That seems to match existing convention.
@@ -542,7 +550,13 @@ class LoopVectorizationLegality { | |||
/// The list above is not based on theoretical limitations of vectorization, | |||
/// but simply a statement that more work is needed to support these | |||
/// additional cases safely. | |||
bool isVectorizableEarlyExitLoop(); | |||
bool isVectorizableEarlyExitLoop(std::optional<LoadInst *> &); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it guaranteed to always be a load instruction that's critical for the early exit? Could it not be any arbitrary instruction where SCEV was unable to come up with a counted exit count?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm only dealing with cases involving a load of the condition at the moment; we can consider additional cases later. However, I think we'll only want to record loads here in any case, since we need to know which ones to look at in canVectorizeMemory.
// Refine language a little? This currently only applies when a store | ||
// is present in the early exit loop. | ||
reportVectorizationFailure( | ||
"No dependencies allowed for early exit condition load", |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It might be clearer if you change this to for critical early exit condition in loop with stores
"No dependencies allowed for early exit condition load", | ||
"Early exit condition loads may not have a dependence with " | ||
"another" | ||
" memory operation.", |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The formatting seems a bit odd here?
} | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would be good to test the else case and add a different error message for it, i.e. where the uncountable early exiting block does not have a branch as its terminator. I would assume we either a) we bailed out for such cases earlier on, or b) it would be a switch with only two cases including the default.
// conditions than br(icmp(load, loop_inv)). | ||
ICmpInst *Cmp = dyn_cast<ICmpInst>(Br->getCondition()); | ||
if (Cmp && Cmp->hasOneUse() && | ||
TheLoop->isLoopInvariant(Cmp->getOperand(1))) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add a TODO here then so we don't forget?
TheLoop->isLoopInvariant(Cmp->getOperand(1))) { | ||
LoadInst *Load = dyn_cast<LoadInst>(Cmp->getOperand(0)); | ||
if (Load && Load->hasOneUse() && !TheLoop->isLoopInvariant(Load)) { | ||
if (isDereferenceableAndAlignedInLoop(Load, TheLoop, *PSE.getSE(), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The fact you're relying upon a single dereferenceable check here implies that you know it's possible to move this load above all other memory accesses, which I don't think is true. I realise you're checking for memory dependences in canVectorizeMemory, but I don't expect that to cover cases like this:
%off = load i8, ptr %a
%off64 = zext i8 %off to i64
%load = load i32, ptr %b, i64 %off64
In this case there isn't a strict memory dependence between the two loads, but the second load requires the first to have been executed. It's perfectly possible for isDereferenceableAndAlignedInLoop
to return true for %load
due to the limited range of the offset. In this example you also need to check if the first load is dereferenceable too since it must be executed first.
I think you have two choices:
- Check that all loads prior to this one are also dereferenceable, or
- Bail out if the critical load depends upon an earlier load.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
isDereferenceableAndAlignedInLoop
requires that the SCEV for the load's pointer operand be an AddRecExpr in the specified loop with a constant step, which I believe rules that out? Is that likely to change at some point? I can see it's not explicitly documented in the comment for that function.
I agree that this will need to be improved in future; we can't check the gathers this way, but it's possible the second operand to the comparison also comes from a load and that will require checking.
I think I'll wait until we have some first-faulting support before tackling gathers.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK I'd like to play around with this using some test cases. I still feel nervous about this code because it feels like it's based on hope that such a situation will never occur, whereas at the legality stage we should be strict and explicitly rule it out. It may be fine for now, but if SCEV improves or we add new features to the vectoriser then this may suddenly be broken. I think we should either:
- Bail out if there is another load in the loop (the easiest to implement, but also the most restrictive), or
- Make sure all loads in the loop are dereferenceable (again, easy to implement, but also restrictive), or
- Do formal analysis to prove that the critical load can be rescheduled before all other memory accesses.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Determining whether we can reorder the load is why I have the dependency checks in canVectorizeMemory; I bail out even if there's any dependency at all right now, even safe ones. That can be refined over time.
In terms of reordering loads, though, it should be fine to do so as long as you only load the data that would have been loaded in the scalar loop alone. Since this load is what controls the (single) uncounted exit, we must execute this load up to the point either exit is reached. If we know dereferencing from the pointer is safe for the entirety of the maximum trip count then there is no problem reordering this wrt. other normal memory loads. Avoiding loading extra data from other pointers is handled by bailing out to the scalar loop in advance when we know there would be an exit in the middle of the next vector iteration. In future we could handle that by combining a partitioned mask from the critical load + comparison with the active lane mask for tail folded loops.
We only run into problems with stores (for which we have the dependency checks), or with non-standard loads (e.g. atomic, volatile), which we reject for vectorization in LoopAccessInfo::analyzeLoop
.
I'm happy to add some more test cases, and to throw a couple more checks here if you prefer (e.g. track back to a loop invariant pointer + offset from a PHI in the header?).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'll write an in-code comment explaining the intended transformation to hopefully make it easier to gauge whether the checks are sufficient.
// A forward dependency should be fine, but a backwards dep may not | ||
// be even if LAA thinks it is due to performing the load for the | ||
// vector iteration i+1 in vector iteration i. | ||
if (CriticalEELoad) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a reason for splitting up the checks between isVectorizableEarlyExitLoop
and here or can we check the dependencies also in isVectorizableEarlyExitLoop
? Otherwise isVectorizableEarlyExitLoop
may need renaming or its documentation updating, as it returning true doesn't mean that we can vectorize the early. exit loop?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The LAI
variable has not been initialized when isVectorizableEarlyExitLoop
is invoked, so we would have to reorder things a little in order to unify the code. If it's ok to initialize it there as well, I can do so and move the checks.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If I do initialize LAI earlier, that will remove the need to pass the load around; I'll try that.
// vector iteration i+1 in vector iteration i. | ||
if (CriticalEELoad) { | ||
const MemoryDepChecker &DepChecker = LAI->getDepChecker(); | ||
const auto *Deps = DepChecker.getDependences(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not all dependences may be recorded, need to check that Deps
is not nullptr I think and ideally add a test that this the recording limit.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good point. Have we thought about making getDependences return a std::optional to make it more obvious that needs to be considered?
I'll add the test.
/// | ||
/// This means we must ensure that it is safe to move the load for 'c[i]' | ||
/// before other memory operations (or any other observable side effects) in | ||
/// the loop. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This comment looks really helpful, thanks!
Is it worth adding as well that c[i]
is not permitted to have more than one use for now, because an outside use complicates things since you now have to use phi for c[i]
instead of c[i]
itself.
|
||
/// Clears any current early exit data gathered if a check failed. | ||
void clearEarlyExitData() { | ||
UncountableEdge = std::nullopt; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you may need to rebase as I thought @fhahn had removed the UncountableEdge
and replaced it with UncountableExitingBB
or something like that.
// FIXME: We're insisting on a single use for now, because otherwise we will | ||
// need to make PHI nodes for other users. That can be done once the initial | ||
// transform code lands. | ||
if (BranchInst *Br = dyn_cast<BranchInst>(ExitingBlock->getTerminator())) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: How about bailing out early if it's not a branch to remove some indentation? For example,
auto *Br = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
if (!Br) {
reportVectorizationFailure(
"Unsupported control flow in early exit loop with side effects",
"Cannot find branch instruction for uncounted exit in early exit loop "
"with side effects",
"UnsupportedUncountedExitTerminator", ORE, TheLoop);
return false;
}
// FIXME: Don't rely ...
if (Cmp && Cmp->hasOneUse() && | ||
TheLoop->isLoopInvariant(Cmp->getOperand(1))) { | ||
LoadInst *Load = dyn_cast<LoadInst>(Cmp->getOperand(0)); | ||
if (Load && Load->hasOneUse() && !TheLoop->isLoopInvariant(Load)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If the algorithm for vectorising the loop absolutely depends upon the Load being variant, then isLoopInvariant
isn't strong enough here. It will return true if the load is from an invariant address, but happens to be in the loop. If you really want to make sure it's loop invariant (to stop the loop vectoriser hoisting it to the preheader, etc.) then you probably need to call ScalarEvolution::isLoopInvariant
// The following call also checks that the load address is either | ||
// invariant or is an affine SCEVAddRecExpr with a constant step. | ||
// In either case, we're not relying on another load. | ||
// FIXME: Support gathers after first-faulting support lands. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: after first-faulting load support lands.
CriticalUncountedExitConditionLoad); | ||
})) { | ||
reportVectorizationFailure( | ||
"No dependencies allowed for critical early exit condition load " |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For some of these reports you can use a single string version of reportVectorizationFailure
if you don't really care about the debug message and report errors being different?
|
||
if (!CriticalUncountedExitConditionLoad) { | ||
reportVectorizationFailure( | ||
"Early exit loop with store but no condition load", |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This isn't strictly true I think? There could be a load feeding the comparison, but we've rejected it as we don't support multiple uses, etc. Perhaps better worded as Early exit loop with store but no supported condition load
} | ||
} else { | ||
reportVectorizationFailure( | ||
"Unsupported control flow in early exit loop with side effects", |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I can't see this message in any of the tests, although you have a test for it - @loop_contains_store_uncounted_exit_is_a_switch
. It suggests that we've caught the switch earlier on and already bailed out. Perhaps you can in fact completely remove the check for branches and simply assert it's a branch instead?
if (SafetyInfo.isGuaranteedToExecute(*Load, DT, TheLoop)) | ||
CriticalUncountedExitConditionLoad = Load; | ||
else | ||
reportVectorizationFailure( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it possible to write a test for this? I guess it's difficult because we've probably already rejected certain loop forms already. I wonder if you can expose it with a loop that has an invoke before it that may lead to an exception?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No; invoke isn't a supported terminator (based on canVectorizeWithIfConvert
).
I also tried with a separate branch to skip over the exiting block, but I get the message Early exit is not the latch predecessor.
So we may be able to test this once that constraint is relaxed.
ret void | ||
} | ||
|
||
define void @loop_contains_store_uncounted_exit_is_a_switch(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(40) readonly %pred) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it would be good to have an additional vectoriser test (doesn't have to be this file) somewhere for loops like this:
struct foo {
int a, b, c, d;
};
void foo(int *dst, struct foo *src) {
int array[(100 * 4) - 3];
memcpy(array, src, ((100 * 4) - 3) * 4);
for (int i = 0; i < 100; i++) {
struct foo tmp = array[i];
if (tmp.a > 3)
break;
dst[i] = tmp.b + tmp.c + tmp.d;
}
}
In this example I imagine this PR would consider such a loop legal to vectorise, but there are restrictions on how we vectorise. Given you are only checking if the load of foo.a is dereferenceable I don't think we should treat the load of struct foo as an interleave group and end up using a wider load, which could fault.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I guess that requires a separate PR, since LoopVectorizeLegality doesn't participate in detecting interleave groups?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah sure. I just meant it would be a good to have at least one vectoriser test for this, even if right not it doesn't vectorise. The test would have a name and a comment explaining that we should not treat this as an interleave group and end up using a wide load.
6801d72
to
5694286
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM! Thanks for being patient and addressing all the comments. The patch looks much tidier now.
/// before other memory operations (or any other observable side effects) in | ||
/// the loop. | ||
/// | ||
/// Currently, c[i] should only have one user (the comparison used for the |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/// Currently, c[i] should only have one user (the comparison used for the | |
/// Currently, c[i] must only have one user (the comparison used for the |
currently it must have only a single user?
@@ -647,6 +722,10 @@ class LoopVectorizationLegality { | |||
/// Keep track of an uncountable exiting block, if there is exactly one early | |||
/// exit. | |||
BasicBlock *UncountableExitingBB = nullptr; | |||
|
|||
/// If true, the loop has at least one uncounted exit and operations within |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/// If true, the loop has at least one uncounted exit and operations within | |
/// If true, the loop has at least one uncountable exit and operations within |
@@ -1853,6 +1978,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { | |||
} else { | |||
if (!isVectorizableEarlyExitLoop()) { | |||
assert(!hasUncountableEarlyExit() && | |||
!hasUncountedExitWithSideEffects() && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
!hasUncountedExitWithSideEffects() && | |
!hasUncountableExitWithSideEffects() && |
else | ||
reportVectorizationFailure( | ||
"Early exit condition load not guaranteed to execute", | ||
"EarlyExitLoadNotGuaranteed", ORE, TheLoop); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should this return false here?
It also seems like this code-path isn't covered. Not sure if it would be possible to add a test?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've tried finding a way to trigger this, but other restrictions prevent us from reaching it. We should be able to relax some of those constraints later, though -- e.g. the uncountable exit needing to be the direct predecessor of the latch.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for checking, that's what I thought. Perhaps that could be an assertion then?
if (!hasUncountedExitWithSideEffects()) | ||
return canVectorizeIndirectUnsafeDependences(); | ||
reportVectorizationFailure( | ||
"Cannot vectorize unsafe dependencies in early exit loop with " | ||
"side effects.", | ||
"Unable to vectorize memory in an early exit loop with side effects", | ||
"CantVectorizeUnsafeDependencyForEELoopWithSideEffects", ORE, TheLoop); | ||
return false; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if (!hasUncountedExitWithSideEffects()) | |
return canVectorizeIndirectUnsafeDependences(); | |
reportVectorizationFailure( | |
"Cannot vectorize unsafe dependencies in early exit loop with " | |
"side effects.", | |
"Unable to vectorize memory in an early exit loop with side effects", | |
"CantVectorizeUnsafeDependencyForEELoopWithSideEffects", ORE, TheLoop); | |
return false; | |
if (hasUncountedExitWithSideEffects()) { | |
reportVectorizationFailure( | |
"Cannot vectorize unsafe dependencies in early exit loop with " | |
"side effects.", | |
"Unable to vectorize memory in an early exit loop with side effects", | |
"CantVectorizeUnsafeDependencyForEELoopWithSideEffects", ORE, TheLoop); | |
return false; | |
} | |
return canVectorizeIndirectUnsafeDependences(); |
might be clearer to keep the check together with the EE specific message
@@ -1871,6 +1997,15 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { | |||
return false; | |||
} | |||
|
|||
// Bail out for state-changing EE loops for now. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// Bail out for state-changing EE loops for now. | |
// Bail out for state-changing loops with uncountable exits for now. |
if (!hasUncountedExitWithSideEffects()) | ||
return canVectorizeIndirectUnsafeDependences(); | ||
reportVectorizationFailure( | ||
"Cannot vectorize unsafe dependencies in early exit loop with " |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
looks like there's missing test coverage for that code path?
"Loop may fault", | ||
"Cannot vectorize potentially faulting early exit loop", | ||
"PotentiallyFaultingEarlyExitLoop", ORE, TheLoop); | ||
return false; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can exit early with this if not deferenceable to reduce nesting
const SCEV *PtrScev = PSE.getSE()->getSCEV(Load->getPointerOperand()); | ||
if (PSE.getSE()->isLoopInvariant(PtrScev, TheLoop)) { | ||
reportVectorizationFailure( | ||
"Uncounted exit condition depends on load from invariant address", |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
looks like there's no test coverage for that check.
|
||
// FIXME: Don't rely on operand ordering for the comparison. | ||
ICmpInst *Cmp = dyn_cast<ICmpInst>(Br->getCondition()); | ||
if (Cmp && Cmp->hasOneUse() && TheLoop->isLoopInvariant(Cmp->getOperand(1))) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Simpler to use IR pattern matching here?
* Rename state variable and accessor to something covering more cases * Simplified some code following suggestions * Sharing a remark when specific subcases aren't interesting * Rebased
5694286
to
0b114b2
Compare
Hi @huntergr-arm, can you add more detail to the commit message please explaining what the PR is doing and possibly what the next steps are? Just for reference so that once the patch is merged it's obvious in the git commit log. |
// Prohibit any potential aliasing with any instruction in the loop which | ||
// might store to memory. | ||
// FIXME: Relax this constraint where possible. | ||
AAResults *AA = LAIs.getAAResults(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure about getting it from LAA, I think it should be fairly straigth-forward to construct here?
This adds initial support to LoopVectorizationLegality to analyze loops with side effects
(particularly stores to memory) and an uncountable exit. This patch alone doesn't
enable any new transformations, but does give clearer reasons for rejecting vectorization
for such a loop. The intent is for a loop like the following to pass the specific checks,
and only be rejected at the end until the transformation code is committed: