-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[PPC] Set minimum of largest number of comparisons to use bit test for switch lowering #155910
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-backend-powerpc Author: Shimin Cui (scui-ibm) ChangesCurrently it is considered suitable to lower to a bit test for a set of switch case clusters when the the number of unique destinations ( However it is found for some cases on powerpc, for example, when NumDests is 3, and the number of comparisons for each destination is all 2, it's not profitable to lower the switch to bit test. This is to add an option to set the minimum of largest number of comparisons to use bit test for switch lowering. Full diff: https://github.com/llvm/llvm-project/pull/155910.diff 6 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 0a10b51f97c63..efd7580e10033 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -594,12 +594,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
// Check if suitable for a bit test
if (N <= DL.getIndexSizeInBits(0u)) {
- SmallPtrSet<const BasicBlock *, 4> Dests;
- for (auto I : SI.cases())
- Dests.insert(I.getCaseSuccessor());
+ DenseMap<const BasicBlock *, unsigned int> DestMap;
+ for (auto I : SI.cases()) {
+ const BasicBlock *BB = I.getCaseSuccessor();
+ DestMap[BB] = DestMap.count(BB) ? (DestMap.count(BB) + 1) : 1;
+ }
- if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
- DL))
+ if (TLI->isSuitableForBitTests(DestMap, MinCaseVal, MaxCaseVal, DL))
return 1;
}
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 438b6ff55c85f..4c94ed34123c7 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1441,9 +1441,10 @@ class LLVM_ABI TargetLoweringBase {
/// \p High as its lowest and highest case values, and expects \p NumCmps
/// case value comparisons. Check if the number of destinations, comparison
/// metric, and range are all suitable.
- bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps,
- const APInt &Low, const APInt &High,
- const DataLayout &DL) const {
+ bool
+ isSuitableForBitTests(DenseMap<const BasicBlock *, unsigned int> DestCmps,
+ const APInt &Low, const APInt &High,
+ const DataLayout &DL) const {
// FIXME: I don't think NumCmps is the correct metric: a single case and a
// range of cases both require only one branch to lower. Just looking at the
// number of clusters and destinations should be enough to decide whether to
@@ -1454,6 +1455,20 @@ class LLVM_ABI TargetLoweringBase {
if (!rangeFitsInWord(Low, High, DL))
return false;
+ unsigned NumDests = DestCmps.size();
+ unsigned NumCmps = 0;
+ unsigned int MaxBitTestEntry = 0;
+ for (auto &DestCmp : DestCmps) {
+ NumCmps += DestCmp.second;
+ if (DestCmp.second > MaxBitTestEntry)
+ MaxBitTestEntry = DestCmp.second;
+ }
+
+ // Comparisons might be cheaper for small number of comparisons, which can
+ // be Arch Target specific.
+ if (MaxBitTestEntry < getMinimumBitTestCmps())
+ return false;
+
// Decide whether it's profitable to lower this range with bit tests. Each
// destination requires a bit test and branch, and there is an overall range
// check branch. For a small number of clusters, separate comparisons might
@@ -2063,6 +2078,9 @@ class LLVM_ABI TargetLoweringBase {
virtual bool isJumpTableRelative() const;
+ /// Retuen the minimum of largest number of comparisons in BitTest.
+ virtual unsigned getMinimumBitTestCmps() const;
+
/// If a physical register, this specifies the register that
/// llvm.savestack/llvm.restorestack should save and restore.
Register getStackPointerRegisterToSaveRestore() const {
@@ -2579,6 +2597,9 @@ class LLVM_ABI TargetLoweringBase {
/// Set to zero to generate unlimited jump tables.
void setMaximumJumpTableSize(unsigned);
+ /// Set the minimum of largest of number of comparisons to generate BitTest.
+ void setMinimumBitTestCmps(unsigned Val);
+
/// If set to a physical register, this specifies the register that
/// llvm.savestack/llvm.restorestack should save and restore.
void setStackPointerRegisterToSaveRestore(Register R) {
diff --git a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp
index 038c499fe236e..8662f7801e2a6 100644
--- a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp
+++ b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp
@@ -206,12 +206,16 @@ bool SwitchCG::SwitchLowering::buildJumpTable(const CaseClusterVector &Clusters,
for (unsigned I = First; I <= Last; ++I)
JTProbs[Clusters[I].MBB] = BranchProbability::getZero();
+ DenseMap<const BasicBlock *, unsigned int> DestMap;
for (unsigned I = First; I <= Last; ++I) {
assert(Clusters[I].Kind == CC_Range);
Prob += Clusters[I].Prob;
const APInt &Low = Clusters[I].Low->getValue();
const APInt &High = Clusters[I].High->getValue();
- NumCmps += (Low == High) ? 1 : 2;
+ unsigned int NumCmp = (Low == High) ? 1 : 2;
+ const BasicBlock *BB = Clusters[I].MBB->getBasicBlock();
+ DestMap[BB] = DestMap.count(BB) ? (DestMap[BB] + NumCmp) : NumCmp;
+
if (I != First) {
// Fill the gap between this and the previous cluster.
const APInt &PreviousHigh = Clusters[I - 1].High->getValue();
@@ -226,9 +230,7 @@ bool SwitchCG::SwitchLowering::buildJumpTable(const CaseClusterVector &Clusters,
JTProbs[Clusters[I].MBB] += Clusters[I].Prob;
}
- unsigned NumDests = JTProbs.size();
- if (TLI->isSuitableForBitTests(NumDests, NumCmps,
- Clusters[First].Low->getValue(),
+ if (TLI->isSuitableForBitTests(DestMap, Clusters[First].Low->getValue(),
Clusters[Last].High->getValue(), *DL)) {
// Clusters[First..Last] should be lowered as bit tests instead.
return false;
@@ -372,20 +374,19 @@ bool SwitchCG::SwitchLowering::buildBitTests(CaseClusterVector &Clusters,
if (First == Last)
return false;
- BitVector Dests(FuncInfo.MF->getNumBlockIDs());
- unsigned NumCmps = 0;
+ DenseMap<const BasicBlock *, unsigned int> DestMap;
for (int64_t I = First; I <= Last; ++I) {
assert(Clusters[I].Kind == CC_Range);
- Dests.set(Clusters[I].MBB->getNumber());
- NumCmps += (Clusters[I].Low == Clusters[I].High) ? 1 : 2;
+ unsigned NumCmp = (Clusters[I].Low == Clusters[I].High) ? 1 : 2;
+ const BasicBlock *BB = Clusters[I].MBB->getBasicBlock();
+ DestMap[BB] = DestMap.count(BB) ? (DestMap[BB] + NumCmp) : NumCmp;
}
- unsigned NumDests = Dests.count();
APInt Low = Clusters[First].Low->getValue();
APInt High = Clusters[Last].High->getValue();
assert(Low.slt(High));
- if (!TLI->isSuitableForBitTests(NumDests, NumCmps, Low, High, *DL))
+ if (!TLI->isSuitableForBitTests(DestMap, Low, High, *DL))
return false;
APInt LowBound;
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 0549a947600dc..eeafe0605d654 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
@@ -90,6 +91,11 @@ static cl::opt<unsigned> OptsizeJumpTableDensity(
cl::desc("Minimum density for building a jump table in "
"an optsize function"));
+static cl::opt<unsigned>
+ MinimumBitTestCmps("min-bit-test-cmps", cl::init(2), cl::Hidden,
+ cl::desc("Set minimum of largest number of comparisons "
+ "to use bit test for switch."));
+
// FIXME: This option is only to test if the strict fp operation processed
// correctly by preventing mutating strict fp operation to normal fp operation
// during development. When the backend supports strict float operation, this
@@ -2120,6 +2126,14 @@ bool TargetLoweringBase::isJumpTableRelative() const {
return getTargetMachine().isPositionIndependent();
}
+unsigned TargetLoweringBase::getMinimumBitTestCmps() const {
+ return MinimumBitTestCmps;
+}
+
+void TargetLoweringBase::setMinimumBitTestCmps(unsigned Val) {
+ MinimumBitTestCmps = Val;
+}
+
Align TargetLoweringBase::getPrefLoopAlignment(MachineLoop *ML) const {
if (TM.Options.LoopAlignment)
return Align(TM.Options.LoopAlignment);
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 5039f5df7a128..0e83599f9c366 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -138,6 +138,11 @@ static cl::opt<unsigned> PPCMinimumJumpTableEntries(
"ppc-min-jump-table-entries", cl::init(64), cl::Hidden,
cl::desc("Set minimum number of entries to use a jump table on PPC"));
+static cl::opt<unsigned> PPCMinimumBitTestCmps(
+ "ppc-min-bit-test-cmps", cl::init(3), cl::Hidden,
+ cl::desc("Set minimum of largest number of comparisons to use bit test for "
+ "switch on PPC."));
+
static cl::opt<unsigned> PPCGatherAllAliasesMaxDepth(
"ppc-gather-alias-max-depth", cl::init(18), cl::Hidden,
cl::desc("max depth when checking alias info in GatherAllAliases()"));
@@ -1438,6 +1443,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// Re-evaluate this value on future HWs that can do better with mtctr.
setMinimumJumpTableEntries(PPCMinimumJumpTableEntries);
+ // The default minimum of largest number in a BitTest cluster is 3.
+ setMinimumBitTestCmps(PPCMinimumBitTestCmps);
+
setMinFunctionAlignment(Align(4));
setMinCmpXchgSizeInBits(Subtarget.hasPartwordAtomics() ? 8 : 32);
diff --git a/llvm/test/CodeGen/PowerPC/bittest.ll b/llvm/test/CodeGen/PowerPC/bittest.ll
new file mode 100644
index 0000000000000..cba56e3d5798f
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/bittest.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -verify-machineinstrs < %s -O3 -mcpu=ppc -mtriple powerpc-ibm-aix \
+; RUN: -ppc-asm-full-reg-names | FileCheck %s
+
+define i32 @foo(i32 noundef signext %x) {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: mflr r0
+; CHECK-NEXT: stwu r1, -64(r1)
+; CHECK-NEXT: stw r0, 72(r1)
+; CHECK-NEXT: cmpwi r3, 8
+; CHECK-NEXT: stw r31, 60(r1) # 4-byte Folded Spill
+; CHECK-NEXT: mr r31, r3
+; CHECK-NEXT: li r3, 0
+; CHECK-NEXT: ble cr0, L..BB0_4
+; CHECK-NEXT: # %bb.1: # %entry
+; CHECK-NEXT: cmpwi r31, 11
+; CHECK-NEXT: bge cr0, L..BB0_7
+; CHECK-NEXT: # %bb.2: # %entry
+; CHECK-NEXT: cmplwi r31, 9
+; CHECK-NEXT: beq cr0, L..BB0_9
+; CHECK-NEXT: # %bb.3: # %entry
+; CHECK-NEXT: cmplwi r31, 10
+; CHECK-NEXT: beq cr0, L..BB0_11
+; CHECK-NEXT: b L..BB0_13
+; CHECK-NEXT: L..BB0_4: # %entry
+; CHECK-NEXT: cmplwi r31, 4
+; CHECK-NEXT: beq cr0, L..BB0_12
+; CHECK-NEXT: # %bb.5: # %entry
+; CHECK-NEXT: cmplwi r31, 7
+; CHECK-NEXT: beq cr0, L..BB0_11
+; CHECK-NEXT: # %bb.6: # %entry
+; CHECK-NEXT: cmplwi r31, 8
+; CHECK-NEXT: beq cr0, L..BB0_10
+; CHECK-NEXT: b L..BB0_13
+; CHECK-NEXT: L..BB0_7: # %entry
+; CHECK-NEXT: beq cr0, L..BB0_10
+; CHECK-NEXT: # %bb.8: # %entry
+; CHECK-NEXT: cmplwi r31, 12
+; CHECK-NEXT: bne cr0, L..BB0_13
+; CHECK-NEXT: L..BB0_9: # %sw.bb2
+; CHECK-NEXT: mr r3, r31
+; CHECK-NEXT: bl .foo3[PR]
+; CHECK-NEXT: nop
+; CHECK-NEXT: mr r3, r31
+; CHECK-NEXT: b L..BB0_13
+; CHECK-NEXT: L..BB0_10: # %sw.bb1
+; CHECK-NEXT: mr r3, r31
+; CHECK-NEXT: bl .foo2[PR]
+; CHECK-NEXT: nop
+; CHECK-NEXT: mr r3, r31
+; CHECK-NEXT: b L..BB0_13
+; CHECK-NEXT: L..BB0_11: # %sw.bb
+; CHECK-NEXT: mr r3, r31
+; CHECK-NEXT: bl .foo1[PR]
+; CHECK-NEXT: nop
+; CHECK-NEXT: mr r3, r31
+; CHECK-NEXT: b L..BB0_13
+; CHECK-NEXT: L..BB0_12: # %sw.bb3
+; CHECK-NEXT: li r3, 4
+; CHECK-NEXT: bl .foo4[PR]
+; CHECK-NEXT: nop
+; CHECK-NEXT: li r3, 4
+; CHECK-NEXT: L..BB0_13: # %return
+; CHECK-NEXT: lwz r31, 60(r1) # 4-byte Folded Reload
+; CHECK-NEXT: addi r1, r1, 64
+; CHECK-NEXT: lwz r0, 8(r1)
+; CHECK-NEXT: mtlr r0
+; CHECK-NEXT: blr
+entry:
+ switch i32 %x, label %return [
+ i32 7, label %sw.bb
+ i32 10, label %sw.bb
+ i32 8, label %sw.bb1
+ i32 11, label %sw.bb1
+ i32 9, label %sw.bb2
+ i32 12, label %sw.bb2
+ i32 4, label %sw.bb3
+ ]
+
+sw.bb: ; preds = %entry, %entry
+ tail call void @foo1(i32 noundef signext %x)
+ br label %return
+
+sw.bb1: ; preds = %entry, %entry
+ tail call void @foo2(i32 noundef signext %x)
+ br label %return
+
+sw.bb2: ; preds = %entry, %entry
+ tail call void @foo3(i32 noundef signext %x)
+ br label %return
+
+sw.bb3: ; preds = %entry
+ tail call void @foo4(i32 noundef signext 4)
+ br label %return
+
+return: ; preds = %sw.bb, %sw.bb1, %sw.bb2, %sw.bb3, %entry
+ %retval.0 = phi i32 [ 0, %entry ], [ 4, %sw.bb3 ], [ %x, %sw.bb2 ], [ %x, %sw.bb1 ], [ %x, %sw.bb ]
+ ret i32 %retval.0
+}
+
+define i32 @goo(i32 noundef signext %x) {
+; CHECK-LABEL: goo:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: mflr r0
+; CHECK-NEXT: stwu r1, -64(r1)
+; CHECK-NEXT: stw r0, 72(r1)
+; CHECK-NEXT: cmplwi r3, 12
+; CHECK-NEXT: stw r31, 60(r1) # 4-byte Folded Spill
+; CHECK-NEXT: mr r31, r3
+; CHECK-NEXT: bgt cr0, L..BB1_7
+; CHECK-NEXT: # %bb.1: # %entry
+; CHECK-NEXT: li r3, 1
+; CHECK-NEXT: slw r3, r3, r31
+; CHECK-NEXT: andi. r4, r3, 5632
+; CHECK-NEXT: bne cr0, L..BB1_4
+; CHECK-NEXT: # %bb.2: # %entry
+; CHECK-NEXT: andi. r3, r3, 2304
+; CHECK-NEXT: beq cr0, L..BB1_5
+; CHECK-NEXT: # %bb.3: # %sw.bb1
+; CHECK-NEXT: mr r3, r31
+; CHECK-NEXT: bl .foo2[PR]
+; CHECK-NEXT: nop
+; CHECK-NEXT: b L..BB1_9
+; CHECK-NEXT: L..BB1_4: # %sw.bb2
+; CHECK-NEXT: mr r3, r31
+; CHECK-NEXT: bl .foo3[PR]
+; CHECK-NEXT: nop
+; CHECK-NEXT: b L..BB1_9
+; CHECK-NEXT: L..BB1_5: # %entry
+; CHECK-NEXT: cmplwi r31, 7
+; CHECK-NEXT: bne cr0, L..BB1_7
+; CHECK-NEXT: # %bb.6: # %sw.bb
+; CHECK-NEXT: li r3, 7
+; CHECK-NEXT: li r31, 7
+; CHECK-NEXT: bl .foo1[PR]
+; CHECK-NEXT: nop
+; CHECK-NEXT: b L..BB1_9
+; CHECK-NEXT: L..BB1_7: # %entry
+; CHECK-NEXT: cmplwi r31, 4
+; CHECK-NEXT: li r31, 0
+; CHECK-NEXT: bne cr0, L..BB1_9
+; CHECK-NEXT: # %bb.8: # %sw.bb3
+; CHECK-NEXT: li r3, 4
+; CHECK-NEXT: li r31, 4
+; CHECK-NEXT: bl .foo4[PR]
+; CHECK-NEXT: nop
+; CHECK-NEXT: L..BB1_9: # %return
+; CHECK-NEXT: mr r3, r31
+; CHECK-NEXT: lwz r31, 60(r1) # 4-byte Folded Reload
+; CHECK-NEXT: addi r1, r1, 64
+; CHECK-NEXT: lwz r0, 8(r1)
+; CHECK-NEXT: mtlr r0
+; CHECK-NEXT: blr
+entry:
+ switch i32 %x, label %return [
+ i32 7, label %sw.bb
+ i32 8, label %sw.bb1
+ i32 11, label %sw.bb1
+ i32 9, label %sw.bb2
+ i32 10, label %sw.bb2
+ i32 12, label %sw.bb2
+ i32 4, label %sw.bb3
+ ]
+
+sw.bb: ; preds = %entry
+ tail call void @foo1(i32 noundef signext 7)
+ br label %return
+
+sw.bb1: ; preds = %entry, %entry
+ tail call void @foo2(i32 noundef signext %x)
+ br label %return
+
+sw.bb2: ; preds = %entry, %entry, %entry
+ tail call void @foo3(i32 noundef signext %x)
+ br label %return
+
+sw.bb3: ; preds = %entry
+ tail call void @foo4(i32 noundef signext 4)
+ br label %return
+
+return: ; preds = %sw.bb, %sw.bb1, %sw.bb2, %sw.bb3, %entry
+ %retval.0 = phi i32 [ 0, %entry ], [ 4, %sw.bb3 ], [ %x, %sw.bb2 ], [ %x, %sw.bb1 ], [ 7, %sw.bb ]
+ ret i32 %retval.0
+}
+
+declare void @foo1(i32 noundef signext)
+
+declare void @foo2(i32 noundef signext)
+
+declare void @foo3(i32 noundef signext)
+
+declare void @foo4(i32 noundef signext)
|
It's probably worth trying to optimize the bit-testing sequence before deciding it should be suppressed by heuristics. Opened #155953 with some analysis. |
Currently it is considered suitable to lower to a bit test for a set of switch case clusters when the the number of unique destinations (
NumDests
) and the number of total comparisons (NumCmps
) satisfy:(NumDests == 1 && NumCmps >= 3) || (NumDests == 2 && NumCmps >= 5) || (NumDests == 3 && NumCmps >= 6)
However it is found for some cases on powerpc, for example, when NumDests is 3, and the number of comparisons for each destination is all 2, it's not profitable to lower the switch to bit test. This is to add an option to set the minimum of largest number of comparisons to use bit test for switch lowering.