Commit e511c4b

Temporarily Revert:
"[SLP] Generalization of stores vectorization." "[SLP] Fix -Wunused-variable. NFC" "[SLP] Vectorize jumbled stores." As they're causing significant (10-30x) compile time regressions on vectorizable code. The primary cause of the compile-time regression is f228b53. This reverts commits: f228b53 5503455 21d498c
1 parent e18f4db commit e511c4b

22 files changed (+506 additions, -1045 deletions)
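
For context, the reverted work generalized SLP store vectorization, which collapses runs of adjacent scalar stores into vector stores. A minimal, hypothetical C++ example of the kind of pattern this targets (illustrative only, not a test case from the reverted commits):

// Four consecutive stores to adjacent addresses form a store chain that the
// SLP vectorizer can replace with a single <4 x float> store when its cost
// model says the vector form is cheaper.
void store_chain(float *Out, float A, float B, float C, float D) {
  Out[0] = A;
  Out[1] = B;
  Out[2] = C;
  Out[3] = D;
}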

llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h

Lines changed: 1 addition & 1 deletion

@@ -138,7 +138,7 @@ struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
   bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
 
   bool vectorizeStoreChain(ArrayRef<Value *> Chain, slpvectorizer::BoUpSLP &R,
-                           unsigned Idx);
+                           unsigned VecRegSize);
 
   bool vectorizeStores(ArrayRef<StoreInst *> Stores, slpvectorizer::BoUpSLP &R);
 
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 98 additions & 169 deletions

@@ -26,7 +26,6 @@
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"

@@ -2678,74 +2677,24 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     }
     case Instruction::Store: {
       // Check if the stores are consecutive or if we need to swizzle them.
-      llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
-      // Make sure all stores in the bundle are simple - we can't vectorize
-      // atomic or volatile stores.
-      SmallVector<Value *, 4> PointerOps(VL.size());
-      ValueList Operands(VL.size());
-      auto POIter = PointerOps.begin();
-      auto OIter = Operands.begin();
-      for (Value *V : VL) {
-        auto *SI = cast<StoreInst>(V);
-        if (!SI->isSimple()) {
+      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
+        if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
           BS.cancelScheduling(VL, VL0);
           newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
                        ReuseShuffleIndicies);
-          LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
+          LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
           return;
         }
-        *POIter = SI->getPointerOperand();
-        *OIter = SI->getValueOperand();
-        ++POIter;
-        ++OIter;
-      }
 
-      OrdersType CurrentOrder;
-      // Check the order of pointer operands.
-      if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
-        Value *Ptr0;
-        Value *PtrN;
-        if (CurrentOrder.empty()) {
-          Ptr0 = PointerOps.front();
-          PtrN = PointerOps.back();
-        } else {
-          Ptr0 = PointerOps[CurrentOrder.front()];
-          PtrN = PointerOps[CurrentOrder.back()];
-        }
-        const SCEV *Scev0 = SE->getSCEV(Ptr0);
-        const SCEV *ScevN = SE->getSCEV(PtrN);
-        const auto *Diff =
-            dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
-        uint64_t Size = DL->getTypeAllocSize(ScalarTy);
-        // Check that the sorted pointer operands are consecutive.
-        if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
-          if (CurrentOrder.empty()) {
-            // Original stores are consecutive and does not require reordering.
-            ++NumOpsWantToKeepOriginalOrder;
-            TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
-                                         UserTreeIdx, ReuseShuffleIndicies);
-            TE->setOperandsInOrder();
-            buildTree_rec(Operands, Depth + 1, {TE, 0});
-            LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
-          } else {
-            // Need to reorder.
-            auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
-            ++(I->getSecond());
-            TreeEntry *TE =
-                newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                             ReuseShuffleIndicies, I->getFirst());
-            TE->setOperandsInOrder();
-            buildTree_rec(Operands, Depth + 1, {TE, 0});
-            LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
-          }
-          return;
-        }
-      }
+      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+                                   ReuseShuffleIndicies);
+      LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
 
-      BS.cancelScheduling(VL, VL0);
-      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
-                   ReuseShuffleIndicies);
-      LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
+      ValueList Operands;
+      for (Value *V : VL)
+        Operands.push_back(cast<Instruction>(V)->getOperand(0));
+      TE->setOperandsInOrder();
+      buildTree_rec(Operands, Depth + 1, {TE, 0});
       return;
     }
     case Instruction::Call: {
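
For contrast with the restored in-order check above, a hypothetical "jumbled" bundle, stores that are adjacent in memory but listed out of order, illustrates what the reverted code additionally handled (illustrative only; names are made up):

// With the bundle in this order, the restored isConsecutiveAccess(VL[i],
// VL[i + 1]) check fails and the bundle is gathered; the reverted code sorted
// the pointer operands (sortPtrAccesses) and, when they turned out to be
// consecutive, vectorized the stores and emitted a "reorder_shuffle".
void jumbled(int *Out, int A, int B, int C, int D) {
  Out[1] = B;
  Out[0] = A;
  Out[3] = D;
  Out[2] = C;
}
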
@@ -3243,22 +3192,15 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
     }
     case Instruction::Store: {
       // We know that we can merge the stores. Calculate the cost.
-      bool IsReorder = !E->ReorderIndices.empty();
-      auto *SI =
-          cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
-      MaybeAlign Alignment(SI->getAlignment());
+      MaybeAlign alignment(cast<StoreInst>(VL0)->getAlignment());
       int ScalarEltCost =
-          TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, VL0);
-      if (NeedToShuffleReuses)
-        ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
-      int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
-      int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
-                                           VecTy, Alignment, 0, VL0);
-      if (IsReorder) {
-        // TODO: Merge this shuffle with the ReuseShuffleCost.
-        VecStCost += TTI->getShuffleCost(
-            TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+          TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
+      if (NeedToShuffleReuses) {
+        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
       }
+      int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
+      int VecStCost =
+          TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0);
       return ReuseShuffleCost + VecStCost - ScalarStCost;
     }
     case Instruction::Call: {
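
To make the restored cost expression concrete, a worked example with assumed unit costs (the real numbers come from TTI and the target):

  ScalarEltCost = 1, VL.size() = 4, ReuseShuffleCost = 0            (assumed)
  ScalarStCost  = VecTy->getNumElements() * ScalarEltCost = 4 * 1 = 4
  VecStCost     = 1                                                 (assumed)
  returned cost = ReuseShuffleCost + VecStCost - ScalarStCost = 0 + 1 - 4 = -3

A negative entry cost means the single vector store is modeled as cheaper than the four scalar stores, which is what lets the total tree cost fall below -SLPCostThreshold in vectorizeStoreChain().
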
@@ -4120,25 +4062,15 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       return V;
     }
     case Instruction::Store: {
-      bool IsReorder = !E->ReorderIndices.empty();
-      auto *SI = cast<StoreInst>(
-          IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0);
+      StoreInst *SI = cast<StoreInst>(VL0);
       unsigned Alignment = SI->getAlignment();
       unsigned AS = SI->getPointerAddressSpace();
 
       setInsertPointAfterBundle(E);
 
       Value *VecValue = vectorizeTree(E->getOperand(0));
-      if (IsReorder) {
-        OrdersType Mask;
-        inversePermutation(E->ReorderIndices, Mask);
-        VecValue = Builder.CreateShuffleVector(
-            VecValue, UndefValue::get(VecValue->getType()), E->ReorderIndices,
-            "reorder_shuffle");
-      }
       Value *ScalarPtr = SI->getPointerOperand();
-      Value *VecPtr = Builder.CreateBitCast(
-          ScalarPtr, VecValue->getType()->getPointerTo(AS));
+      Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
       StoreInst *ST = Builder.CreateStore(VecValue, VecPtr);
 
       // The pointer operand uses an in-tree scalar, so add the new BitCast to

@@ -5427,135 +5359,125 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
 }
 
 bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
-                                            unsigned Idx) {
-  LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
+                                            unsigned VecRegSize) {
+  const unsigned ChainLen = Chain.size();
+  LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
                     << "\n");
   const unsigned Sz = R.getVectorElementSize(Chain[0]);
-  const unsigned MinVF = R.getMinVecRegSize() / Sz;
-  unsigned VF = Chain.size();
+  const unsigned VF = VecRegSize / Sz;
 
-  if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
+  if (!isPowerOf2_32(Sz) || VF < 2)
     return false;
 
-  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
-                    << "\n");
+  bool Changed = false;
+  // Look for profitable vectorizable trees at all offsets, starting at zero.
+  for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) {
+
+    ArrayRef<Value *> Operands = Chain.slice(i, VF);
+    // Check that a previous iteration of this loop did not delete the Value.
+    if (llvm::any_of(Operands, [&R](Value *V) {
+          auto *I = dyn_cast<Instruction>(V);
+          return I && R.isDeleted(I);
+        }))
+      continue;
 
-  R.buildTree(Chain);
-  Optional<ArrayRef<unsigned>> Order = R.bestOrder();
-  // TODO: Handle orders of size less than number of elements in the vector.
-  if (Order && Order->size() == Chain.size()) {
-    // TODO: reorder tree nodes without tree rebuilding.
-    SmallVector<Value *, 4> ReorderedOps(Chain.rbegin(), Chain.rend());
-    llvm::transform(*Order, ReorderedOps.begin(),
-                    [Chain](const unsigned Idx) { return Chain[Idx]; });
-    R.buildTree(ReorderedOps);
-  }
-  if (R.isTreeTinyAndNotFullyVectorizable())
-    return false;
+    LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
+                      << "\n");
 
-  R.computeMinimumValueSizes();
+    R.buildTree(Operands);
+    if (R.isTreeTinyAndNotFullyVectorizable())
+      continue;
 
-  int Cost = R.getTreeCost();
+    R.computeMinimumValueSizes();
 
-  LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
-  if (Cost < -SLPCostThreshold) {
-    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+    int Cost = R.getTreeCost();
 
-    using namespace ore;
+    LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF
+                      << "\n");
+    if (Cost < -SLPCostThreshold) {
+      LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
 
-    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
-                                        cast<StoreInst>(Chain[0]))
-                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
-                     << " and with tree size "
-                     << NV("TreeSize", R.getTreeSize()));
+      using namespace ore;
 
-    R.vectorizeTree();
-    return true;
+      R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
+                                          cast<StoreInst>(Chain[i]))
+                       << "Stores SLP vectorized with cost " << NV("Cost", Cost)
+                       << " and with tree size "
+                       << NV("TreeSize", R.getTreeSize()));
+
+      R.vectorizeTree();
+
+      // Move to the next bundle.
+      i += VF - 1;
+      Changed = true;
+    }
   }
 
-  return false;
+  return Changed;
 }
 
 bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
                                         BoUpSLP &R) {
+  SetVector<StoreInst *> Heads;
+  SmallDenseSet<StoreInst *> Tails;
+  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
+
   // We may run into multiple chains that merge into a single chain. We mark the
   // stores that we vectorized so that we don't visit the same store twice.
   BoUpSLP::ValueSet VectorizedStores;
   bool Changed = false;
 
-  int E = Stores.size();
-  SmallBitVector Tails(E, false);
-  SmallVector<int, 16> ConsecutiveChain(E, E + 1);
-  auto &&FindConsecutiveAccess = [this, &Stores, &Tails,
-                                  &ConsecutiveChain](int K, int Idx) {
-    if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE))
-      return false;
+  auto &&FindConsecutiveAccess =
+      [this, &Stores, &Heads, &Tails, &ConsecutiveChain] (int K, int Idx) {
+        if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE))
+          return false;
+
+        Tails.insert(Stores[Idx]);
+        Heads.insert(Stores[K]);
+        ConsecutiveChain[Stores[K]] = Stores[Idx];
+        return true;
+      };
 
-    Tails.set(Idx);
-    ConsecutiveChain[K] = Idx;
-    return true;
-  };
   // Do a quadratic search on all of the given stores in reverse order and find
   // all of the pairs of stores that follow each other.
+  int E = Stores.size();
   for (int Idx = E - 1; Idx >= 0; --Idx) {
     // If a store has multiple consecutive store candidates, search according
     // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
     // This is because usually pairing with immediate succeeding or preceding
     // candidate create the best chance to find slp vectorization opportunity.
-    const int MaxLookDepth = std::min(E - Idx, 16);
-    for (int Offset = 1, F = std::max(MaxLookDepth, Idx + 1); Offset < F;
-         ++Offset)
+    for (int Offset = 1, F = std::max(E - Idx, Idx + 1); Offset < F; ++Offset)
       if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) ||
           (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx)))
        break;
  }
 
   // For stores that start but don't end a link in the chain:
-  for (int Cnt = E; Cnt > 0; --Cnt) {
-    int I = Cnt - 1;
-    if (ConsecutiveChain[I] == E + 1 || Tails.test(I))
+  for (auto *SI : llvm::reverse(Heads)) {
+    if (Tails.count(SI))
      continue;
+
     // We found a store instr that starts a chain. Now follow the chain and try
     // to vectorize it.
     BoUpSLP::ValueList Operands;
+    StoreInst *I = SI;
     // Collect the chain into a list.
-    while (I != E + 1 && !VectorizedStores.count(Stores[I])) {
-      Operands.push_back(Stores[I]);
+    while ((Tails.count(I) || Heads.count(I)) && !VectorizedStores.count(I)) {
+      Operands.push_back(I);
       // Move to the next value in the chain.
       I = ConsecutiveChain[I];
     }
 
-    // If a vector register can't hold 1 element, we are done.
-    unsigned MaxVecRegSize = R.getMaxVecRegSize();
-    unsigned EltSize = R.getVectorElementSize(Stores[0]);
-    if (MaxVecRegSize % EltSize != 0)
-      continue;
-
-    unsigned MaxElts = MaxVecRegSize / EltSize;
     // FIXME: Is division-by-2 the correct step? Should we assert that the
     // register size is a power-of-2?
-    unsigned StartIdx = 0;
-    for (unsigned Size = llvm::PowerOf2Ceil(MaxElts); Size >= 2; Size /= 2) {
-      for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
-        ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size);
-        if (!VectorizedStores.count(Slice.front()) &&
-            !VectorizedStores.count(Slice.back()) &&
-            vectorizeStoreChain(Slice, R, Cnt)) {
-          // Mark the vectorized stores so that we don't vectorize them again.
-          VectorizedStores.insert(Slice.begin(), Slice.end());
-          Changed = true;
-          // If we vectorized initial block, no need to try to vectorize it
-          // again.
-          if (Cnt == StartIdx)
-            StartIdx += Size;
-          Cnt += Size;
-          continue;
-        }
-        ++Cnt;
-      }
-      // Check if the whole array was vectorized already - exit.
-      if (StartIdx >= Operands.size())
+    for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize();
+         Size /= 2) {
+      if (vectorizeStoreChain(Operands, R, Size)) {
+        // Mark the vectorized stores so that we don't vectorize them again.
+        VectorizedStores.insert(Operands.begin(), Operands.end());
+        Changed = true;
        break;
+      }
    }
  }
 
@@ -7223,7 +7145,14 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
     LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                       << it->second.size() << ".\n");
 
-    Changed |= vectorizeStores(it->second, R);
+    // Process the stores in chunks of 16.
+    // TODO: The limit of 16 inhibits greater vectorization factors.
+    //       For example, AVX2 supports v32i8. Increasing this limit, however,
+    //       may cause a significant compile-time increase.
+    for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) {
+      unsigned Len = std::min<unsigned>(CE - CI, 16);
+      Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R);
+    }
   }
   return Changed;
 }
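
Putting the restored pieces together, a rough walk-through under assumed target parameters (a 256-bit maximum and 128-bit minimum vector register, 32-bit store elements; the numbers are illustrative, not from the commit):

  - vectorizeStoreChains() hands a list of 40 candidate stores to
    vectorizeStores() in chunks of 16, 16 and 8.
  - For each collected chain, vectorizeStores() tries vectorizeStoreChain()
    with Size = 256 and then 128, i.e. VF = Size / Sz = 256 / 32 = 8 and then
    128 / 32 = 4, stopping at the first size that vectorizes.
  - vectorizeStoreChain() slides a window of VF stores over the chain and
    vectorizes each window whose tree cost satisfies Cost < -SLPCostThreshold.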
