-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[SimpleLoopUnswitch] Adjust cost multiplier accounting for parent loop size #155379
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[SimpleLoopUnswitch] Adjust cost multiplier accounting for parent loop size #155379
Conversation
…p size When estimating the cost to avoid exponential unswitches of non-trivial invariant conditions, also consider the parent loop basic blocks size, ensuring this does not grow unexpectedly. Fixes: llvm#138509.
@llvm/pr-subscribers-llvm-transforms Author: Antonio Frighetto (antoniofrighetto) ChangesWhen estimating the cost to avoid exponential unswitches of non-trivial invariant conditions, also consider the parent loop basic blocks size, ensuring this does not grow unexpectedly. Alternative solution might involve adding a threshold on the number of times LoopRotate is allowed to rotate the same loop across invocations (#138509 (comment)), though I think this would require a per-loop count to be serialized/deserialized in a metadata each time, which seems far more invasive. Fixes: #138509. Full diff: https://github.com/llvm/llvm-project/pull/155379.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 9b40fc03da6bb..e4ba70d1bce16 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -98,6 +98,9 @@ static cl::opt<bool> EnableUnswitchCostMultiplier(
static cl::opt<int> UnswitchSiblingsToplevelDiv(
"unswitch-siblings-toplevel-div", cl::init(2), cl::Hidden,
cl::desc("Toplevel siblings divisor for cost multiplier."));
+static cl::opt<int> UnswitchParentBlocksDiv(
+ "unswitch-parent-blocks-div", cl::init(8), cl::Hidden,
+ cl::desc("Outer loop size divisor for cost multiplier."));
static cl::opt<int> UnswitchNumInitialUnscaledCandidates(
"unswitch-num-initial-unscaled-candidates", cl::init(8), cl::Hidden,
cl::desc("Number of unswitch candidates that are ignored when calculating "
@@ -2809,9 +2812,9 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
}
/// Cost multiplier is a way to limit potentially exponential behavior
-/// of loop-unswitch. Cost is multipied in proportion of 2^number of unswitch
-/// candidates available. Also accounting for the number of "sibling" loops with
-/// the idea to account for previous unswitches that already happened on this
+/// of loop-unswitch. Cost is multiplied in proportion of 2^number of unswitch
+/// candidates available. Also consider the number of "sibling" loops with
+/// the idea of accounting for previous unswitches that already happened on this
/// cluster of loops. There was an attempt to keep this formula simple,
/// just enough to limit the worst case behavior. Even if it is not that simple
/// now it is still not an attempt to provide a detailed heuristic size
@@ -2842,7 +2845,19 @@ static int CalculateUnswitchCostMultiplier(
return 1;
}
+ // Each invariant non-trivial condition, after being unswitched, is supposed
+ // to have its own specialized sibling loop (the invariant condition has been
+ // hoisted out of the child loop into a newly-cloned loop). When unswitching
+ // conditions in nested loops, the basic block size of the outer loop should
+ // not be altered. If such a size significantly increases across unswitching
+ // invocations, something may be wrong; so adjust the final cost taking this
+ // into account.
auto *ParentL = L.getParentLoop();
+ int ParentLoopSizeMultiplier = 1;
+ if (ParentL)
+ ParentLoopSizeMultiplier =
+ std::max<int>(ParentL->getNumBlocks() / UnswitchParentBlocksDiv, 1);
+
int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size()
: std::distance(LI.begin(), LI.end()));
// Count amount of clones that all the candidates might cause during
@@ -2887,14 +2902,16 @@ static int CalculateUnswitchCostMultiplier(
// at an upper bound.
int CostMultiplier;
if (ClonesPower > Log2_32(UnswitchThreshold) ||
- SiblingsMultiplier > UnswitchThreshold)
+ SiblingsMultiplier > UnswitchThreshold ||
+ ParentLoopSizeMultiplier > UnswitchThreshold)
CostMultiplier = UnswitchThreshold;
else
CostMultiplier = std::min(SiblingsMultiplier * (1 << ClonesPower),
(int)UnswitchThreshold);
LLVM_DEBUG(dbgs() << " Computed multiplier " << CostMultiplier
- << " (siblings " << SiblingsMultiplier << " * clones "
+ << " (siblings " << SiblingsMultiplier << " * parent size "
+ << ParentLoopSizeMultiplier << " * clones "
<< (1 << ClonesPower) << ")"
<< " for unswitch candidate: " << TI << "\n");
return CostMultiplier;
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/pr138509.ll b/llvm/test/Transforms/SimpleLoopUnswitch/pr138509.ll
new file mode 100644
index 0000000000000..2ce0cb5c3f299
--- /dev/null
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/pr138509.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -enable-unswitch-cost-multiplier=true -unswitch-parent-blocks-div=1 \
+; RUN: -passes="loop-mssa(loop-simplifycfg,licm,loop-rotate,simple-loop-unswitch<nontrivial>),print<loops>" \
+; RUN: -disable-output 2>&1 | sort -b -k 1 | FileCheck %s --check-prefixes=LOOP-DIV-1
+
+; RUN: opt < %s -S -enable-unswitch-cost-multiplier=true -unswitch-parent-blocks-div=2 \
+; RUN: -passes="loop-mssa(loop-simplifycfg,licm,loop-rotate,simple-loop-unswitch<nontrivial>),print<loops>" \
+; RUN: -disable-output 2>&1 | sort -b -k 1 | FileCheck %s --check-prefixes=LOOP-DIV-2
+
+; LOOP-DIV-1-COUNT-6: Loop at depth 1 containing:
+; LOOP-DIV-2-COUNT-12: Loop at depth 1 containing:
+
+@a = global i32 0, align 4
+@b = global i32 0, align 4
+@c = global i32 0, align 4
+@d = global i32 0, align 4
+
+define i32 @main() {
+entry:
+ br label %outer.loop.header
+
+outer.loop.header: ; preds = %outer.loop.latch, %entry
+ br i1 false, label %exit, label %outer.loop.body
+
+outer.loop.body: ; preds = %inner.loop.header, %outer.loop.header
+ store i32 1, ptr @c, align 4
+ %cmp = icmp sgt i32 0, -1
+ br i1 %cmp, label %outer.loop.latch, label %exit
+
+inner.loop.header: ; preds = %outer.loop.latch, %inner.loop.body
+ %a_val = load i32, ptr @a, align 4
+ %c_val = load i32, ptr @c, align 4
+ %mul = mul nsw i32 %c_val, %a_val
+ store i32 %mul, ptr @b, align 4
+ %cmp2 = icmp sgt i32 %mul, -1
+ br i1 %cmp2, label %inner.loop.body, label %outer.loop.body
+
+inner.loop.body: ; preds = %inner.loop.header
+ %mul2 = mul nsw i32 %c_val, 3
+ store i32 %mul2, ptr @c, align 4
+ store i32 %c_val, ptr @d, align 4
+ %mul3 = mul nsw i32 %c_val, %a_val
+ %cmp3 = icmp sgt i32 %mul3, -1
+ br i1 %cmp3, label %inner.loop.header, label %exit
+
+outer.loop.latch: ; preds = %outer.loop.body
+ %d_val = load i32, ptr @d, align 4
+ store i32 %d_val, ptr @b, align 4
+ %cmp4 = icmp eq i32 %d_val, 0
+ br i1 %cmp4, label %inner.loop.header, label %outer.loop.header
+
+exit: ; preds = %inner.loop.body, %outer.loop.body, %outer.loop.header
+ ret i32 0
+}
|
Haven't really looked at the patch. Just wanted to say that I've tried adding this patch downstream, and I do not see the regressions that we detected using the earlier patch as mentioned here #141121 (comment). |
When estimating the cost to avoid exponential unswitches of non-trivial invariant conditions, also consider the parent loop basic blocks size, ensuring this does not grow unexpectedly.
Alternative solution might involve adding a threshold on the number of times LoopRotate is allowed to rotate the same loop across invocations (#138509 (comment)), though I think this would require a per-loop count to be serialized/deserialized in a metadata each time, which seems far more invasive.
Fixes: #138509.