 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
@@ -2678,74 +2677,24 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     }
     case Instruction::Store: {
       // Check if the stores are consecutive or if we need to swizzle them.
-      llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
-      // Make sure all stores in the bundle are simple - we can't vectorize
-      // atomic or volatile stores.
-      SmallVector<Value *, 4> PointerOps(VL.size());
-      ValueList Operands(VL.size());
-      auto POIter = PointerOps.begin();
-      auto OIter = Operands.begin();
-      for (Value *V : VL) {
-        auto *SI = cast<StoreInst>(V);
-        if (!SI->isSimple()) {
+      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
+        if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
           BS.cancelScheduling(VL, VL0);
           newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
                        ReuseShuffleIndicies);
-          LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
+          LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
           return;
         }
-        *POIter = SI->getPointerOperand();
-        *OIter = SI->getValueOperand();
-        ++POIter;
-        ++OIter;
-      }
 
-      OrdersType CurrentOrder;
-      // Check the order of pointer operands.
-      if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
-        Value *Ptr0;
-        Value *PtrN;
-        if (CurrentOrder.empty()) {
-          Ptr0 = PointerOps.front();
-          PtrN = PointerOps.back();
-        } else {
-          Ptr0 = PointerOps[CurrentOrder.front()];
-          PtrN = PointerOps[CurrentOrder.back()];
-        }
-        const SCEV *Scev0 = SE->getSCEV(Ptr0);
-        const SCEV *ScevN = SE->getSCEV(PtrN);
-        const auto *Diff =
-            dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
-        uint64_t Size = DL->getTypeAllocSize(ScalarTy);
-        // Check that the sorted pointer operands are consecutive.
-        if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
-          if (CurrentOrder.empty()) {
-            // Original stores are consecutive and does not require reordering.
-            ++NumOpsWantToKeepOriginalOrder;
-            TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
-                                         UserTreeIdx, ReuseShuffleIndicies);
-            TE->setOperandsInOrder();
-            buildTree_rec(Operands, Depth + 1, {TE, 0});
-            LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
-          } else {
-            // Need to reorder.
-            auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
-            ++(I->getSecond());
-            TreeEntry *TE =
-                newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                             ReuseShuffleIndicies, I->getFirst());
-            TE->setOperandsInOrder();
-            buildTree_rec(Operands, Depth + 1, {TE, 0});
-            LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
-          }
-          return;
-        }
-      }
+      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+                                   ReuseShuffleIndicies);
+      LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
 
-      BS.cancelScheduling(VL, VL0);
-      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
-                   ReuseShuffleIndicies);
-      LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
+      ValueList Operands;
+      for (Value *V : VL)
+        Operands.push_back(cast<Instruction>(V)->getOperand(0));
+      TE->setOperandsInOrder();
+      buildTree_rec(Operands, Depth + 1, {TE, 0});
       return;
     }
     case Instruction::Call: {
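The restored code above vectorizes a store bundle only when it is already in address order; the removed sortPtrAccesses path instead sorted the pointer operands first and accepted "jumbled" bundles via a reorder shuffle. A minimal standalone sketch of the pairwise check (plain C++, not the LLVM API; byte offsets from a common base stand in for what isConsecutiveAccess derives via ScalarEvolution):

#include <cstdint>
#include <vector>

// True when every access starts exactly one element past its predecessor,
// i.e. the bundle can be covered by a single wide store.
bool allConsecutive(const std::vector<int64_t> &ByteOffsets,
                    uint64_t ElemSize) {
  for (size_t i = 0; i + 1 < ByteOffsets.size(); ++i)
    if (ByteOffsets[i + 1] - ByteOffsets[i] != (int64_t)ElemSize)
      return false; // mirrors the "SLP: Non-consecutive store." bail-out
  return true;
}

For four i32 stores, allConsecutive({0, 4, 8, 12}, 4) holds, while {0, 8, 4, 12} fails; the latter is exactly the shape the removed path rescued by reordering.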
@@ -3243,22 +3192,15 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
     }
     case Instruction::Store: {
       // We know that we can merge the stores. Calculate the cost.
-      bool IsReorder = !E->ReorderIndices.empty();
-      auto *SI =
-          cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
-      MaybeAlign Alignment(SI->getAlignment());
+      MaybeAlign alignment(cast<StoreInst>(VL0)->getAlignment());
       int ScalarEltCost =
-          TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, VL0);
-      if (NeedToShuffleReuses)
-        ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
-      int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
-      int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
-                                           VecTy, Alignment, 0, VL0);
-      if (IsReorder) {
-        // TODO: Merge this shuffle with the ReuseShuffleCost.
-        VecStCost += TTI->getShuffleCost(
-            TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+          TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
+      if (NeedToShuffleReuses) {
+        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
       }
+      int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
+      int VecStCost =
+          TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0);
       return ReuseShuffleCost + VecStCost - ScalarStCost;
     }
     case Instruction::Call: {
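Both sides of this hunk compute the same profitability delta: one vector store (plus any reuse-shuffle adjustment) against VecTy->getNumElements() scalar stores; the revert merely drops the extra permute cost that reordered bundles required. A worked example with made-up costs (the real numbers come from TTI->getMemoryOpCost and are target-specific):

#include <cstdio>

int main() {
  int ScalarEltCost = 1;   // hypothetical cost of one scalar store
  int VecStCost = 1;       // hypothetical cost of one vector store
  int NumElements = 4;     // VecTy->getNumElements()
  int ReuseShuffleCost = 0;

  int ScalarStCost = NumElements * ScalarEltCost;
  // A negative delta favors the vector form; SLP vectorizes when the whole
  // tree's cost falls below -SLPCostThreshold.
  printf("%d\n", ReuseShuffleCost + VecStCost - ScalarStCost); // prints -3
  return 0;
}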
@@ -4120,25 +4062,15 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       return V;
     }
     case Instruction::Store: {
-      bool IsReorder = !E->ReorderIndices.empty();
-      auto *SI = cast<StoreInst>(
-          IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0);
+      StoreInst *SI = cast<StoreInst>(VL0);
       unsigned Alignment = SI->getAlignment();
       unsigned AS = SI->getPointerAddressSpace();
 
       setInsertPointAfterBundle(E);
 
       Value *VecValue = vectorizeTree(E->getOperand(0));
-      if (IsReorder) {
-        OrdersType Mask;
-        inversePermutation(E->ReorderIndices, Mask);
-        VecValue = Builder.CreateShuffleVector(
-            VecValue, UndefValue::get(VecValue->getType()), E->ReorderIndices,
-            "reorder_shuffle");
-      }
       Value *ScalarPtr = SI->getPointerOperand();
-      Value *VecPtr = Builder.CreateBitCast(
-          ScalarPtr, VecValue->getType()->getPointerTo(AS));
+      Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
       StoreInst *ST = Builder.CreateStore(VecValue, VecPtr);
 
       // The pointer operand uses an in-tree scalar, so add the new BitCast to
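With the reorder shuffle gone, codegen for a store bundle reduces to one bitcast of the first store's pointer plus one wide store. A rough C++ analogue of the emitted IR, assuming four contiguous, suitably aligned ints (the real code goes through IRBuilder's CreateBitCast and CreateStore):

#include <cstring>

struct V4 { int Lanes[4]; }; // stand-in for a <4 x i32> SSA value

void storeBundle(int *ScalarPtr, const V4 &VecValue) {
  // Analogue of: bitcast i32* to <4 x i32>*, then a single vector store.
  // memcpy models the reinterpretation without violating strict aliasing.
  std::memcpy(ScalarPtr, VecValue.Lanes, sizeof(VecValue.Lanes));
}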
@@ -5427,135 +5359,125 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
 }
 
 bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
-                                            unsigned Idx) {
-  LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
+                                            unsigned VecRegSize) {
+  const unsigned ChainLen = Chain.size();
+  LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
                     << "\n");
   const unsigned Sz = R.getVectorElementSize(Chain[0]);
-  const unsigned MinVF = R.getMinVecRegSize() / Sz;
-  unsigned VF = Chain.size();
+  const unsigned VF = VecRegSize / Sz;
 
-  if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
+  if (!isPowerOf2_32(Sz) || VF < 2)
     return false;
 
-  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
-                    << "\n");
+  bool Changed = false;
+  // Look for profitable vectorizable trees at all offsets, starting at zero.
+  for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) {
+
+    ArrayRef<Value *> Operands = Chain.slice(i, VF);
+    // Check that a previous iteration of this loop did not delete the Value.
+    if (llvm::any_of(Operands, [&R](Value *V) {
+          auto *I = dyn_cast<Instruction>(V);
+          return I && R.isDeleted(I);
+        }))
+      continue;
 
-  R.buildTree(Chain);
-  Optional<ArrayRef<unsigned>> Order = R.bestOrder();
-  // TODO: Handle orders of size less than number of elements in the vector.
-  if (Order && Order->size() == Chain.size()) {
-    // TODO: reorder tree nodes without tree rebuilding.
-    SmallVector<Value *, 4> ReorderedOps(Chain.rbegin(), Chain.rend());
-    llvm::transform(*Order, ReorderedOps.begin(),
-                    [Chain](const unsigned Idx) { return Chain[Idx]; });
-    R.buildTree(ReorderedOps);
-  }
-  if (R.isTreeTinyAndNotFullyVectorizable())
-    return false;
+    LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
+                      << "\n");
 
-  R.computeMinimumValueSizes();
+    R.buildTree(Operands);
+    if (R.isTreeTinyAndNotFullyVectorizable())
+      continue;
 
-  int Cost = R.getTreeCost();
+    R.computeMinimumValueSizes();
 
-  LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
-  if (Cost < -SLPCostThreshold) {
-    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+    int Cost = R.getTreeCost();
 
-    using namespace ore;
+    LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF
+                      << "\n");
+    if (Cost < -SLPCostThreshold) {
+      LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
 
-    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
-                                        cast<StoreInst>(Chain[0]))
-                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
-                     << " and with tree size "
-                     << NV("TreeSize", R.getTreeSize()));
+      using namespace ore;
 
-    R.vectorizeTree();
-    return true;
+      R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
+                                          cast<StoreInst>(Chain[i]))
+                       << "Stores SLP vectorized with cost " << NV("Cost", Cost)
+                       << " and with tree size "
+                       << NV("TreeSize", R.getTreeSize()));
+
+      R.vectorizeTree();
+
+      // Move to the next bundle.
+      i += VF - 1;
+      Changed = true;
+    }
   }
 
-  return false;
+  return Changed;
 }
 
 bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
                                         BoUpSLP &R) {
+  SetVector<StoreInst *> Heads;
+  SmallDenseSet<StoreInst *> Tails;
+  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
+
   // We may run into multiple chains that merge into a single chain. We mark the
   // stores that we vectorized so that we don't visit the same store twice.
   BoUpSLP::ValueSet VectorizedStores;
   bool Changed = false;
 
-  int E = Stores.size();
-  SmallBitVector Tails(E, false);
-  SmallVector<int, 16> ConsecutiveChain(E, E + 1);
-  auto &&FindConsecutiveAccess = [this, &Stores, &Tails,
-                                  &ConsecutiveChain](int K, int Idx) {
-    if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE))
-      return false;
+  auto &&FindConsecutiveAccess =
+      [this, &Stores, &Heads, &Tails, &ConsecutiveChain] (int K, int Idx) {
+        if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE))
+          return false;
+
+        Tails.insert(Stores[Idx]);
+        Heads.insert(Stores[K]);
+        ConsecutiveChain[Stores[K]] = Stores[Idx];
+        return true;
+      };
 
-    Tails.set(Idx);
-    ConsecutiveChain[K] = Idx;
-    return true;
-  };
   // Do a quadratic search on all of the given stores in reverse order and find
   // all of the pairs of stores that follow each other.
+  int E = Stores.size();
   for (int Idx = E - 1; Idx >= 0; --Idx) {
     // If a store has multiple consecutive store candidates, search according
    // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
     // This is because usually pairing with immediate succeeding or preceding
     // candidate create the best chance to find slp vectorization opportunity.
-    const int MaxLookDepth = std::min(E - Idx, 16);
-    for (int Offset = 1, F = std::max(MaxLookDepth, Idx + 1); Offset < F;
-         ++Offset)
+    for (int Offset = 1, F = std::max(E - Idx, Idx + 1); Offset < F; ++Offset)
      if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) ||
          (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx)))
        break;
   }
 
   // For stores that start but don't end a link in the chain:
-  for (int Cnt = E; Cnt > 0; --Cnt) {
-    int I = Cnt - 1;
-    if (ConsecutiveChain[I] == E + 1 || Tails.test(I))
+  for (auto *SI : llvm::reverse(Heads)) {
+    if (Tails.count(SI))
       continue;
+
     // We found a store instr that starts a chain. Now follow the chain and try
     // to vectorize it.
     BoUpSLP::ValueList Operands;
+    StoreInst *I = SI;
     // Collect the chain into a list.
-    while (I != E + 1 && !VectorizedStores.count(Stores[I])) {
-      Operands.push_back(Stores[I]);
+    while ((Tails.count(I) || Heads.count(I)) && !VectorizedStores.count(I)) {
+      Operands.push_back(I);
       // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }
 
-    // If a vector register can't hold 1 element, we are done.
-    unsigned MaxVecRegSize = R.getMaxVecRegSize();
-    unsigned EltSize = R.getVectorElementSize(Stores[0]);
-    if (MaxVecRegSize % EltSize != 0)
-      continue;
-
-    unsigned MaxElts = MaxVecRegSize / EltSize;
     // FIXME: Is division-by-2 the correct step? Should we assert that the
     // register size is a power-of-2?
-    unsigned StartIdx = 0;
-    for (unsigned Size = llvm::PowerOf2Ceil(MaxElts); Size >= 2; Size /= 2) {
-      for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
-        ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size);
-        if (!VectorizedStores.count(Slice.front()) &&
-            !VectorizedStores.count(Slice.back()) &&
-            vectorizeStoreChain(Slice, R, Cnt)) {
-          // Mark the vectorized stores so that we don't vectorize them again.
-          VectorizedStores.insert(Slice.begin(), Slice.end());
-          Changed = true;
-          // If we vectorized initial block, no need to try to vectorize it
-          // again.
-          if (Cnt == StartIdx)
-            StartIdx += Size;
-          Cnt += Size;
-          continue;
-        }
-        ++Cnt;
-      }
-      // Check if the whole array was vectorized already - exit.
-      if (StartIdx >= Operands.size())
+    for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize();
+         Size /= 2) {
+      if (vectorizeStoreChain(Operands, R, Size)) {
+        // Mark the vectorized stores so that we don't vectorize them again.
+        VectorizedStores.insert(Operands.begin(), Operands.end());
+        Changed = true;
        break;
+      }
    }
  }
 
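The restored vectorizeStores keeps its chains in pointer-keyed containers: FindConsecutiveAccess records head-to-tail links in ConsecutiveChain, and each store that heads a link without tailing one starts a chain that is followed until the links run out. A small sketch of that walk (plain C++, hypothetical integer ids standing in for StoreInst pointers):

#include <map>
#include <set>
#include <vector>

std::vector<int> collectChain(int Head, const std::set<int> &Heads,
                              const std::set<int> &Tails,
                              const std::map<int, int> &Next,
                              const std::set<int> &Vectorized) {
  std::vector<int> Chain;
  int I = Head;
  // Mirrors: while ((Tails.count(I) || Heads.count(I)) &&
  //                 !VectorizedStores.count(I))
  while ((Heads.count(I) || Tails.count(I)) && !Vectorized.count(I)) {
    Chain.push_back(I);
    auto It = Next.find(I);
    if (It == Next.end())
      break; // the final tail has no outgoing link
    I = It->second;
  }
  return Chain;
}

Each collected chain is then handed to vectorizeStoreChain repeatedly, halving the candidate register size from getMaxVecRegSize() toward getMinVecRegSize() until one width vectorizes.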
@@ -7223,7 +7145,14 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
     LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                       << it->second.size() << ".\n");
 
-    Changed |= vectorizeStores(it->second, R);
+    // Process the stores in chunks of 16.
+    // TODO: The limit of 16 inhibits greater vectorization factors.
+    //       For example, AVX2 supports v32i8. Increasing this limit, however,
+    //       may cause a significant compile-time increase.
+    for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) {
+      unsigned Len = std::min<unsigned>(CE - CI, 16);
+      Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R);
+    }
   }
   return Changed;
 }
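The chunking in the final hunk bounds the quadratic pairing search inside vectorizeStores, at the price of capping the vectorization factor at 16 stores per call, as the TODO notes. The pattern itself, lifted into a generic helper (hypothetical name, plain C++, not part of the patch):

#include <algorithm>
#include <vector>

// Invoke F on consecutive windows of at most ChunkSize elements, covering
// the whole range; the last window may be shorter.
template <typename T, typename Fn>
void forEachChunk(const std::vector<T> &V, unsigned ChunkSize, Fn F) {
  for (unsigned CI = 0, CE = V.size(); CI < CE; CI += ChunkSize)
    F(&V[CI], std::min<unsigned>(CE - CI, ChunkSize));
}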