From 4354ac6990fb93e490706b9560272c28eb1cd1a2 Mon Sep 17 00:00:00 2001 From: Antonio Frighetto Date: Tue, 26 Aug 2025 10:34:18 +0200 Subject: [PATCH] =?UTF-8?q?[SimpleLoopUnswitch]=C2=A0Adjust=20cost=20multi?= =?UTF-8?q?plier=20accounting=20for=20parent=20loop=20size?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When estimating the cost to avoid exponential unswitches of non-trivial invariant conditions, also consider the parent loop basic blocks size, ensuring this does not grow unexpectedly. Fixes: https://github.com/llvm/llvm-project/issues/138509. --- .../Transforms/Scalar/SimpleLoopUnswitch.cpp | 27 ++++++++-- .../Transforms/SimpleLoopUnswitch/pr138509.ll | 54 +++++++++++++++++++ 2 files changed, 76 insertions(+), 5 deletions(-) create mode 100644 llvm/test/Transforms/SimpleLoopUnswitch/pr138509.ll diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 9b40fc03da6bb..e4ba70d1bce16 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -98,6 +98,9 @@ static cl::opt EnableUnswitchCostMultiplier( static cl::opt UnswitchSiblingsToplevelDiv( "unswitch-siblings-toplevel-div", cl::init(2), cl::Hidden, cl::desc("Toplevel siblings divisor for cost multiplier.")); +static cl::opt UnswitchParentBlocksDiv( + "unswitch-parent-blocks-div", cl::init(8), cl::Hidden, + cl::desc("Outer loop size divisor for cost multiplier.")); static cl::opt UnswitchNumInitialUnscaledCandidates( "unswitch-num-initial-unscaled-candidates", cl::init(8), cl::Hidden, cl::desc("Number of unswitch candidates that are ignored when calculating " @@ -2809,9 +2812,9 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, } /// Cost multiplier is a way to limit potentially exponential behavior -/// of loop-unswitch. Cost is multipied in proportion of 2^number of unswitch -/// candidates available. Also accounting for the number of "sibling" loops with -/// the idea to account for previous unswitches that already happened on this +/// of loop-unswitch. Cost is multiplied in proportion of 2^number of unswitch +/// candidates available. Also consider the number of "sibling" loops with +/// the idea of accounting for previous unswitches that already happened on this /// cluster of loops. There was an attempt to keep this formula simple, /// just enough to limit the worst case behavior. Even if it is not that simple /// now it is still not an attempt to provide a detailed heuristic size @@ -2842,7 +2845,19 @@ static int CalculateUnswitchCostMultiplier( return 1; } + // Each invariant non-trivial condition, after being unswitched, is supposed + // to have its own specialized sibling loop (the invariant condition has been + // hoisted out of the child loop into a newly-cloned loop). When unswitching + // conditions in nested loops, the basic block size of the outer loop should + // not be altered. If such a size significantly increases across unswitching + // invocations, something may be wrong; so adjust the final cost taking this + // into account. auto *ParentL = L.getParentLoop(); + int ParentLoopSizeMultiplier = 1; + if (ParentL) + ParentLoopSizeMultiplier = + std::max(ParentL->getNumBlocks() / UnswitchParentBlocksDiv, 1); + int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size() : std::distance(LI.begin(), LI.end())); // Count amount of clones that all the candidates might cause during @@ -2887,14 +2902,16 @@ static int CalculateUnswitchCostMultiplier( // at an upper bound. int CostMultiplier; if (ClonesPower > Log2_32(UnswitchThreshold) || - SiblingsMultiplier > UnswitchThreshold) + SiblingsMultiplier > UnswitchThreshold || + ParentLoopSizeMultiplier > UnswitchThreshold) CostMultiplier = UnswitchThreshold; else CostMultiplier = std::min(SiblingsMultiplier * (1 << ClonesPower), (int)UnswitchThreshold); LLVM_DEBUG(dbgs() << " Computed multiplier " << CostMultiplier - << " (siblings " << SiblingsMultiplier << " * clones " + << " (siblings " << SiblingsMultiplier << " * parent size " + << ParentLoopSizeMultiplier << " * clones " << (1 << ClonesPower) << ")" << " for unswitch candidate: " << TI << "\n"); return CostMultiplier; diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/pr138509.ll b/llvm/test/Transforms/SimpleLoopUnswitch/pr138509.ll new file mode 100644 index 0000000000000..2ce0cb5c3f299 --- /dev/null +++ b/llvm/test/Transforms/SimpleLoopUnswitch/pr138509.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -S -enable-unswitch-cost-multiplier=true -unswitch-parent-blocks-div=1 \ +; RUN: -passes="loop-mssa(loop-simplifycfg,licm,loop-rotate,simple-loop-unswitch),print" \ +; RUN: -disable-output 2>&1 | sort -b -k 1 | FileCheck %s --check-prefixes=LOOP-DIV-1 + +; RUN: opt < %s -S -enable-unswitch-cost-multiplier=true -unswitch-parent-blocks-div=2 \ +; RUN: -passes="loop-mssa(loop-simplifycfg,licm,loop-rotate,simple-loop-unswitch),print" \ +; RUN: -disable-output 2>&1 | sort -b -k 1 | FileCheck %s --check-prefixes=LOOP-DIV-2 + +; LOOP-DIV-1-COUNT-6: Loop at depth 1 containing: +; LOOP-DIV-2-COUNT-12: Loop at depth 1 containing: + +@a = global i32 0, align 4 +@b = global i32 0, align 4 +@c = global i32 0, align 4 +@d = global i32 0, align 4 + +define i32 @main() { +entry: + br label %outer.loop.header + +outer.loop.header: ; preds = %outer.loop.latch, %entry + br i1 false, label %exit, label %outer.loop.body + +outer.loop.body: ; preds = %inner.loop.header, %outer.loop.header + store i32 1, ptr @c, align 4 + %cmp = icmp sgt i32 0, -1 + br i1 %cmp, label %outer.loop.latch, label %exit + +inner.loop.header: ; preds = %outer.loop.latch, %inner.loop.body + %a_val = load i32, ptr @a, align 4 + %c_val = load i32, ptr @c, align 4 + %mul = mul nsw i32 %c_val, %a_val + store i32 %mul, ptr @b, align 4 + %cmp2 = icmp sgt i32 %mul, -1 + br i1 %cmp2, label %inner.loop.body, label %outer.loop.body + +inner.loop.body: ; preds = %inner.loop.header + %mul2 = mul nsw i32 %c_val, 3 + store i32 %mul2, ptr @c, align 4 + store i32 %c_val, ptr @d, align 4 + %mul3 = mul nsw i32 %c_val, %a_val + %cmp3 = icmp sgt i32 %mul3, -1 + br i1 %cmp3, label %inner.loop.header, label %exit + +outer.loop.latch: ; preds = %outer.loop.body + %d_val = load i32, ptr @d, align 4 + store i32 %d_val, ptr @b, align 4 + %cmp4 = icmp eq i32 %d_val, 0 + br i1 %cmp4, label %inner.loop.header, label %outer.loop.header + +exit: ; preds = %inner.loop.body, %outer.loop.body, %outer.loop.header + ret i32 0 +}