diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 62ab3f522bb6f..3643e2d3ce4a1 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2025,6 +2025,7 @@ class BoUpSLP {
   void deleteTree() {
     VectorizableTree.clear();
     ScalarToTreeEntries.clear();
+    PostponedNodesWithNonVecUsers.clear();
     OperandsToTreeEntry.clear();
     ScalarsInSplitNodes.clear();
     MustGather.clear();
@@ -4033,6 +4034,9 @@ class BoUpSLP {
     /// Returns true if any scalar in the list is a copyable element.
    bool hasCopyableElements() const { return !CopyableElements.empty(); }
 
+    /// Returns the state of the operations.
+    const InstructionsState &getOperations() const { return S; }
+
     /// When ReuseReorderShuffleIndices is empty it just returns position of \p
     /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
     unsigned findLaneForValue(Value *V) const {
@@ -4436,6 +4440,13 @@ class BoUpSLP {
                        OrdersType &CurrentOrder,
                        SmallVectorImpl<Value *> &PointerOps);
 
+  /// Checks if it is profitable to vectorize the specified list of
+  /// instructions if not all users are vectorized.
+  bool isProfitableToVectorizeWithNonVecUsers(const InstructionsState &S,
+                                              const EdgeInfo &UserTreeIdx,
+                                              ArrayRef<Value *> VL,
+                                              ArrayRef<int> Mask);
+
   /// Maps a specific scalar to its tree entry(ies).
   SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
 
@@ -4446,6 +4457,9 @@ class BoUpSLP {
   /// Scalars, used in split vectorize nodes.
   SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
 
+  /// List of tree node indices which have non-vectorized users.
+  SmallSet<unsigned, 4> PostponedNodesWithNonVecUsers;
+
   /// Maps a value to the proposed vectorizable size.
   SmallDenseMap<Value *, unsigned> InstrElementSize;
 
@@ -9168,6 +9182,93 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
   return {IntrinsicCost, LibCost};
 }
 
+/// Check if extracts are cheaper than the original scalars.
+static bool
+areExtractsCheaperThanScalars(TargetTransformInfo &TTI, Type *UserScalarTy,
+                              VectorType *UserVecTy, const APInt &DemandedElts,
+                              const InstructionCost UserScalarsCost,
+                              Type *ScalarTy, unsigned VF, ArrayRef<int> Mask,
+                              InstructionCost UserEntryCost) {
+  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  // If extracts are cheaper than the original scalars - success.
+  InstructionCost ExtractCost =
+      ::getScalarizationOverhead(TTI, UserScalarTy, UserVecTy, DemandedElts,
+                                 /*Insert=*/false, /*Extract=*/true, CostKind);
+  if (ExtractCost <= UserScalarsCost)
+    return true;
+  // The node is profitable for vectorization - success.
+  if (ExtractCost <= UserEntryCost)
+    return true;
+  auto *VecTy = getWidenedType(ScalarTy, VF);
+  InstructionCost ScalarsCost =
+      ::getScalarizationOverhead(TTI, ScalarTy, VecTy, APInt::getAllOnes(VF),
+                                 /*Insert=*/true, /*Extract=*/false, CostKind);
+  if (!Mask.empty())
+    ScalarsCost +=
+        getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, VecTy, Mask, CostKind);
+  return ExtractCost < UserScalarsCost + ScalarsCost;
+}
+
+bool BoUpSLP::isProfitableToVectorizeWithNonVecUsers(
+    const InstructionsState &S, const EdgeInfo &UserTreeIdx,
+    ArrayRef<Value *> VL, ArrayRef<int> Mask) {
+  assert(S && "Expected valid instructions state.");
+  // Loads, extracts and geps are immediately scalarizable, so no need to
+  // check.
+  if (S.getOpcode() == Instruction::Load ||
+      S.getOpcode() == Instruction::ExtractElement ||
+      S.getOpcode() == Instruction::GetElementPtr)
+    return true;
+  // Check only vectorized users, others are scalarized (potentially, at
+  // least) already.
+  if (!UserTreeIdx.UserTE || UserTreeIdx.UserTE->isGather() ||
+      UserTreeIdx.UserTE->State == TreeEntry::SplitVectorize)
+    return true;
+  // PHI nodes may have cyclic deps, so cannot check here.
+  if (UserTreeIdx.UserTE->getOpcode() == Instruction::PHI)
+    return true;
+  // Do not check root reduction nodes, they do not have non-vectorized users.
+  if (UserIgnoreList && UserTreeIdx.UserTE->Idx == 0)
+    return true;
+  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  ArrayRef<Value *> UserVL = UserTreeIdx.UserTE->Scalars;
+  Type *UserScalarTy = getValueType(UserVL.front());
+  if (!isValidElementType(UserScalarTy))
+    return true;
+  Type *ScalarTy = getValueType(VL.front());
+  if (!isValidElementType(ScalarTy))
+    return true;
+  // Ignore subvector extracts for revectorized nodes, subvector extracts are
+  // always cheap as they do not require a vector-to-scalar move.
+  if (UserScalarTy->isVectorTy())
+    return true;
+  auto *UserVecTy =
+      getWidenedType(UserScalarTy, UserTreeIdx.UserTE->getVectorFactor());
+  APInt DemandedElts = APInt::getZero(UserTreeIdx.UserTE->getVectorFactor());
+  // Check the external uses and check if the vector node + extracts is not
+  // profitable for the vectorization.
+  InstructionCost UserScalarsCost = 0;
+  for (Value *V : UserVL) {
+    auto *I = dyn_cast<Instruction>(V);
+    if (!I)
+      continue;
+    if (areAllUsersVectorized(I, UserIgnoreList))
+      continue;
+    DemandedElts.setBit(UserTreeIdx.UserTE->findLaneForValue(V));
+    UserScalarsCost += TTI->getInstructionCost(I, CostKind);
+  }
+  // No non-vectorized users - success.
+  if (DemandedElts.isZero())
+    return true;
+
+  // User extracts are cheaper than user scalars + immediate scalars - success.
+  SmallPtrSet<Value *, 4> CheckedExtracts;
+  InstructionCost UserEntryCost =
+      getEntryCost(UserTreeIdx.UserTE, {}, CheckedExtracts);
+  return areExtractsCheaperThanScalars(*TTI, UserScalarTy, UserVecTy,
+                                       DemandedElts, UserScalarsCost, ScalarTy,
+                                       VL.size(), Mask, UserEntryCost);
+}
+
 BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     const InstructionsState &S, ArrayRef<Value *> VL,
     bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
@@ -10717,6 +10818,16 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
     return;
   }
 
+  // Postpone vectorization, if the node is not profitable because of the
+  // external uses of the user node, which will be represented as original
+  // scalars, not extracts. In this case, their operands must be kept scalar.
+  if (!isProfitableToVectorizeWithNonVecUsers(S, UserTreeIdx, VL,
+                                              ReuseShuffleIndices)) {
+    PostponedNodesWithNonVecUsers.insert(VectorizableTree.size());
+    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
+    return;
+  }
+
   Instruction *VL0 = S.getMainOp();
   BasicBlock *BB = VL0->getParent();
   auto &BSRef = BlocksSchedules[BB];
@@ -12102,6 +12213,27 @@ void BoUpSLP::transformNodes() {
     ArrayRef<Value *> VL = E.Scalars;
     const unsigned Sz = getVectorElementSize(VL.front());
     unsigned MinVF = getMinVF(2 * Sz);
+    const EdgeInfo &EI = E.UserTreeIndex;
+    // Try to vectorize postponed scalars, if external uses are vectorized.
+    if (PostponedNodesWithNonVecUsers.contains(E.Idx) &&
+        isProfitableToVectorizeWithNonVecUsers(
+            E.getOperations(), EI, E.Scalars, E.ReuseShuffleIndices)) {
+      assert(E.hasState() && "Expected to have state");
+      unsigned PrevSize = VectorizableTree.size();
+      [[maybe_unused]] unsigned PrevEntriesSize =
+          LoadEntriesToVectorize.size();
+      buildTreeRec(VL, 0, EdgeInfo(&E, UINT_MAX));
+      if (PrevSize + 1 == VectorizableTree.size() &&
+          VectorizableTree[PrevSize]->isGather()) {
+        VectorizableTree.pop_back();
+        assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
+               "LoadEntriesToVectorize expected to remain the same");
+      } else {
+        E.CombinedEntriesWithIndices.emplace_back(PrevSize, 0);
+        continue;
+      }
+    }
+
     // Do not try partial vectorization for small nodes (<= 2), nodes with the
     // same opcode and same parent block or all constants.
     if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
@@ -13262,7 +13394,10 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       });
       InVectors.front() = V;
     }
-    if (!SubVectors.empty()) {
+    bool FullSubvectorMatch =
+        SubVectors.size() == 1 && SubVectors.front().second == 0 &&
+        SubVectors.front().first->getVectorFactor() == CommonMask.size();
+    if (!SubVectors.empty() && !FullSubvectorMatch) {
       const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
       if (InVectors.size() == 2)
         Cost += createShuffle(Vec, InVectors.back(), CommonMask);
@@ -13373,6 +13508,16 @@ TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
 InstructionCost
 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                       SmallPtrSetImpl<Value *> &CheckedExtracts) {
+  // No need to count the cost for combined entries, they are combined and
+  // just skip their cost.
+  if (E->State == TreeEntry::CombinedVectorize) {
+    LLVM_DEBUG(
+        dbgs() << "SLP: Skipping cost for combined node that starts with "
+               << *E->Scalars[0] << ".\n";
+        E->dump());
+    return 0;
+  }
+
   ArrayRef<Value *> VL = E->Scalars;
 
   Type *ScalarTy = getValueType(VL[0]);
@@ -13784,7 +13929,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   case Instruction::Trunc:
   case Instruction::FPTrunc:
   case Instruction::BitCast: {
-    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
+    auto SrcIt =
+        MinBWs.empty() ? MinBWs.end() : MinBWs.find(getOperandEntry(E, 0));
     Type *SrcScalarTy = VL0->getOperand(0)->getType();
     auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
     unsigned Opcode = ShuffleOrOp;
@@ -14231,7 +14377,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
       auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
       if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
-        auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
+        auto SrcIt = MinBWs.empty() ? MinBWs.end()
+                                    : MinBWs.find(getOperandEntry(E, 0));
         unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
         unsigned SrcBWSz =
             DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
@@ -15036,15 +15183,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
   SmallPtrSet<Value *, 4> CheckedExtracts;
   for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
     TreeEntry &TE = *VectorizableTree[I];
-    // No need to count the cost for combined entries, they are combined and
-    // just skip their cost.
-    if (TE.State == TreeEntry::CombinedVectorize) {
-      LLVM_DEBUG(
-          dbgs() << "SLP: Skipping cost for combined node that starts with "
-                 << *TE.Scalars[0] << ".\n";
-          TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
-      continue;
-    }
     if (TE.hasState() &&
         (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
       if (const TreeEntry *E =
@@ -15259,6 +15397,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
     auto *Inst = cast<Instruction>(EU.Scalar);
     InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
     auto OperandIsScalar = [&](Value *V) {
+      if (!isa<Instruction>(V))
+        return true;
       if (!isVectorized(V)) {
         // Some extractelements might be not vectorized, but
         // transformed into shuffle and removed from the function,
@@ -17355,7 +17495,20 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
       });
      InVectors.front() = Vec;
    }
-    if (!SubVectors.empty()) {
+    const bool FullSubvectorMatch =
+        SubVectors.size() == 1 && SubVectors.front().second == 0 &&
+        SubVectors.front().first->getVectorFactor() == CommonMask.size();
+    if (FullSubvectorMatch) {
+      Value *Vec = SubVectors.front().first->VectorizedValue;
+      if (Vec->getType()->isIntOrIntVectorTy())
+        Vec = castToScalarTyElem(
+            Vec, any_of(SubVectors.front().first->Scalars, [&](Value *V) {
+              if (isa<PoisonValue>(V))
+                return false;
+              return !isKnownNonNegative(V, SimplifyQuery(*R.DL));
+            }));
+      transformMaskAfterShuffle(CommonMask, CommonMask);
+    } else if (!SubVectors.empty()) {
       Value *Vec = InVectors.front();
       if (InVectors.size() == 2) {
         Vec = createShuffle(Vec, InVectors.back(), CommonMask);
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
index d4e323819402c..b05bcf7b97abd 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
@@ -100,82 +100,86 @@ define fastcc i32 @test(i32 %0, i32 %add111.i.i, <4 x i32> %PredPel.i.sroa.86.72
 ; THRESH-NEXT:  [[ENTRY:.*:]]
 ; THRESH-NEXT:    [[LOOPARRAY_SROA_24_0_I_I3:%.*]] = ashr i32 [[TMP0]], 1
 ; THRESH-NEXT:    [[SHR143_5_I_I9:%.*]] = ashr i32 [[TMP0]], 1
-; THRESH-NEXT:    [[ADD1392_I:%.*]] = add i32 [[TMP0]], 1
+; THRESH-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
+; THRESH-NEXT:    [[TMP21:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> poison, <2 x i32> zeroinitializer
+; THRESH-NEXT:    [[TMP17:%.*]] = add <2 x i32> [[TMP21]], splat (i32 1)
 ; THRESH-NEXT:    [[MUL1445_I:%.*]] = shl i32 [[TMP0]], 1
-; THRESH-NEXT:    [[ADD2136_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], [[TMP0]]
-; THRESH-NEXT:    [[SHR2137_I:%.*]] = lshr i32 [[ADD2136_I]], 1
-; THRESH-NEXT:    [[CONV2138_I:%.*]] = trunc i32 [[SHR2137_I]] to i16
-; THRESH-NEXT:    [[ADD2174_I:%.*]] = add i32 [[MUL1445_I]], 2
-; THRESH-NEXT:    [[SHR2175_I:%.*]] = lshr i32 [[ADD2174_I]], 2
-; THRESH-NEXT:    [[CONV2176_I:%.*]] = trunc i32 [[SHR2175_I]] to i16
-; THRESH-NEXT:    [[ADD2190_I:%.*]] = or i32 [[ADD1392_I]], 1
-; THRESH-NEXT:    [[ADD2191_I:%.*]] = add i32 [[ADD2190_I]], [[TMP0]]
-; THRESH-NEXT:    [[CONV2193_I:%.*]] = trunc i32 [[ADD2191_I]] to i16
-; THRESH-NEXT:    [[ADD2203_I:%.*]] = or i32 [[TMP0]], 1
-; THRESH-NEXT:    [[ADD2204_I:%.*]] = add i32 [[ADD2203_I]], [[TMP0]]
-; THRESH-NEXT:    [[CONV2206_I:%.*]] = trunc i32 [[ADD2204_I]] to i16
+; THRESH-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32>
+; THRESH-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0
+; THRESH-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1)
+; THRESH-NEXT:    [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 1)
 ; THRESH-NEXT:    [[ADD2235_I16:%.*]] = or i32 [[TMP0]], 1
 ; THRESH-NEXT:    [[ADD2236_I:%.*]] = add i32 [[ADD2235_I16]], 1
 ; THRESH-NEXT:    [[SHR2237_I:%.*]] = lshr i32 [[ADD2236_I]], 1
-; THRESH-NEXT:    [[CONV2238_I:%.*]] = trunc i32 [[SHR2237_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8196), align 4
-; THRESH-NEXT:    store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8176), align 8
-; THRESH-NEXT:    [[ADD2258_I:%.*]] = or i32 [[ADD111_I_I]], [[TMP0]]
-; THRESH-NEXT:    [[SHR2259_I:%.*]] = lshr i32 [[ADD2258_I]], 1
-; THRESH-NEXT:    [[CONV2260_I:%.*]] = trunc i32 [[SHR2259_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8212), align 4
-; THRESH-NEXT:    store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8192), align 8
-; THRESH-NEXT:    store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8172), align 4
-; THRESH-NEXT:    [[ADD2302_I:%.*]] = add i32 [[TMP0]], 1
-; THRESH-NEXT:    [[SHR2303_I:%.*]] = lshr i32 [[ADD2302_I]], 1
-; THRESH-NEXT:    [[CONV2304_I:%.*]] = trunc i32 [[SHR2303_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8224), align 8
-; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8204), align 4
-; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8184), align 8
-; THRESH-NEXT:    [[ADD2323_I:%.*]] = add i32 [[TMP0]], 1
-; THRESH-NEXT:    [[ADD2324_I:%.*]] = or i32 [[ADD2323_I]], [[TMP0]]
-; THRESH-NEXT:    [[SHR2325_I:%.*]] = lshr i32 [[ADD2324_I]], 1
-; THRESH-NEXT:    [[CONV2326_I:%.*]] = trunc i32 [[SHR2325_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8220), align 4
-; THRESH-NEXT:    store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8200), align 8
-; THRESH-NEXT:    [[ADD2342_I:%.*]] = add i32 [[SHR143_5_I_I9]], 1
-; THRESH-NEXT:    [[SHR2343_I:%.*]] = lshr i32 [[ADD2342_I]], 1
-; THRESH-NEXT:    [[CONV2344_I:%.*]] = trunc i32 [[SHR2343_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2344_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8216), align 8
+; THRESH-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[ADD111_I_I]], i32 0
+; THRESH-NEXT:    [[TMP19:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 1
+; THRESH-NEXT:    [[TMP22:%.*]] = or <2 x i32> [[TMP19]], [[TMP21]]
+; THRESH-NEXT:    [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 1
+; THRESH-NEXT:    [[TMP12:%.*]] = insertelement <2 x i32> [[TMP25]], i32 [[SHR143_5_I_I9]], i32 0
+; THRESH-NEXT:    [[TMP13:%.*]] = add <2 x i32> [[TMP12]], splat (i32 1)
+; THRESH-NEXT:    [[TMP14:%.*]] = or <2 x i32> [[TMP12]], splat (i32 1)
+; THRESH-NEXT:    [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32>
 ; THRESH-NEXT:    [[ADD2355_I:%.*]] = or i32 [[SHR143_5_I_I9]], 1
 ; THRESH-NEXT:    [[ADD2356_I:%.*]] = add i32 [[ADD2355_I]], [[TMP0]]
 ; THRESH-NEXT:    [[CONV2358_I:%.*]] = trunc i32 [[ADD2356_I]] to i16
 ; THRESH-NEXT:    store i16 [[CONV2358_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8232), align 8
-; THRESH-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32>
-; THRESH-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0
-; THRESH-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1)
-; THRESH-NEXT:    [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 1)
-; THRESH-NEXT:    [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16>
-; THRESH-NEXT:    store <2 x i16> [[TMP6]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8180), align 4
 ; THRESH-NEXT:    [[ADD2393_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], 1
 ; THRESH-NEXT:    [[ADD2394_I:%.*]] = add i32 [[ADD2393_I]], [[TMP0]]
-; THRESH-NEXT:    [[CONV2396_I:%.*]] = trunc i32 [[ADD2394_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8198), align 2
-; THRESH-NEXT:    store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8178), align 2
-; THRESH-NEXT:    store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8214), align 2
-; THRESH-NEXT:    store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8194), align 2
-; THRESH-NEXT:    store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8174), align 2
+; THRESH-NEXT:    [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32>
 ; THRESH-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT]], <4 x i32> poison, <2 x i32>
 ; THRESH-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[ADD111_I_I]], i32 0
 ; THRESH-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP8]], splat (i32 1)
 ; THRESH-NEXT:    [[TMP10:%.*]] = lshr <2 x i32> [[TMP9]], splat (i32 1)
-; THRESH-NEXT:    [[TMP11:%.*]] = trunc <2 x i32> [[TMP10]] to <2 x i16>
-; THRESH-NEXT:    [[TMP12:%.*]] = extractelement <2 x i16> [[TMP11]], i32 1
-; THRESH-NEXT:    store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4
-; THRESH-NEXT:    store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8208), align 8
-; THRESH-NEXT:    store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8188), align 4
-; THRESH-NEXT:    store i16 [[TMP12]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2
-; THRESH-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8226), align 2
-; THRESH-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8206), align 2
-; THRESH-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8186), align 2
-; THRESH-NEXT:    store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8222), align 2
-; THRESH-NEXT:    store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8202), align 2
-; THRESH-NEXT:    store i16 [[CONV2206_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8218), align 2
+; THRESH-NEXT:    [[TMP26:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32>
+; THRESH-NEXT:    [[TMP27:%.*]] = trunc <2 x i32> [[TMP10]] to <2 x i16>
+; THRESH-NEXT:    store <2 x i16> [[TMP27]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4
+; THRESH-NEXT:    [[TMP52:%.*]] = insertelement <4 x i32> poison, i32 [[SHR2237_I]], i32 0
+; THRESH-NEXT:    [[TMP53:%.*]] = insertelement <4 x i32> [[TMP52]], i32 [[ADD2394_I]], i32 1
+; THRESH-NEXT:    [[CONV2396_I:%.*]] = trunc i32 [[ADD2394_I]] to i16
+; THRESH-NEXT:    store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8178), align 2
+; THRESH-NEXT:    [[TMP54:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 2
+; THRESH-NEXT:    [[TMP55:%.*]] = insertelement <4 x i32> [[TMP54]], i32 [[MUL1445_I]], i32 3
+; THRESH-NEXT:    [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP17]], <2 x i32> poison, <4 x i32>
+; THRESH-NEXT:    [[TMP29:%.*]] = shufflevector <4 x i32> [[TMP55]], <4 x i32> [[TMP28]], <4 x i32>
+; THRESH-NEXT:    [[TMP30:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 0
+; THRESH-NEXT:    [[TMP31:%.*]] = or <4 x i32> [[TMP29]], [[TMP30]]
+; THRESH-NEXT:    [[TMP32:%.*]] = add <4 x i32> [[TMP29]], [[TMP30]]
+; THRESH-NEXT:    [[TMP56:%.*]] = shufflevector <4 x i32> [[TMP31]], <4 x i32> [[TMP32]], <4 x i32>
+; THRESH-NEXT:    [[TMP57:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 1
+; THRESH-NEXT:    [[TMP58:%.*]] = lshr <4 x i32> [[TMP56]], [[TMP57]]
+; THRESH-NEXT:    [[TMP59:%.*]] = add <4 x i32> [[TMP56]], [[TMP57]]
+; THRESH-NEXT:    [[TMP60:%.*]] = shufflevector <4 x i32> [[TMP58]], <4 x i32> [[TMP59]], <4 x i32>
+; THRESH-NEXT:    [[TMP61:%.*]] = shufflevector <4 x i32> [[TMP60]], <4 x i32> poison, <8 x i32>
+; THRESH-NEXT:    [[TMP62:%.*]] = shufflevector <8 x i32> [[TMP16]], <8 x i32> [[TMP61]], <8 x i32>
+; THRESH-NEXT:    [[TMP63:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <8 x i32>
+; THRESH-NEXT:    [[TMP24:%.*]] = shufflevector <8 x i32> [[TMP62]], <8 x i32> [[TMP63]], <8 x i32>
+; THRESH-NEXT:    [[TMP33:%.*]] = shufflevector <4 x i32> [[TMP60]], <4 x i32> poison, <4 x i32>
+; THRESH-NEXT:    [[TMP34:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32>
+; THRESH-NEXT:    [[TMP35:%.*]] = shufflevector <4 x i32> [[TMP33]], <4 x i32> [[TMP34]], <4 x i32>
+; THRESH-NEXT:    [[TMP64:%.*]] = trunc <4 x i32> [[TMP35]] to <4 x i16>
+; THRESH-NEXT:    store <4 x i16> [[TMP64]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8204), align 4
+; THRESH-NEXT:    [[TMP65:%.*]] = shufflevector <4 x i32> [[TMP53]], <4 x i32> [[TMP60]], <4 x i32>
+; THRESH-NEXT:    [[TMP47:%.*]] = trunc <4 x i32> [[TMP65]] to <4 x i16>
+; THRESH-NEXT:    [[TMP48:%.*]] = trunc <4 x i32> [[TMP60]] to <4 x i16>
+; THRESH-NEXT:    store <4 x i16> [[TMP48]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8220), align 4
+; THRESH-NEXT:    store <4 x i16> [[TMP47]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8196), align 4
+; THRESH-NEXT:    [[TMP49:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> poison, <4 x i32>
+; THRESH-NEXT:    [[TMP50:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <4 x i32>
+; THRESH-NEXT:    [[TMP51:%.*]] = shufflevector <4 x i32> [[TMP49]], <4 x i32> [[TMP50]], <4 x i32>
+; THRESH-NEXT:    [[TMP36:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 3
+; THRESH-NEXT:    [[TMP37:%.*]] = lshr <4 x i32> [[TMP51]], [[TMP36]]
+; THRESH-NEXT:    [[TMP38:%.*]] = add <4 x i32> [[TMP51]], [[TMP36]]
+; THRESH-NEXT:    [[TMP39:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> [[TMP38]], <4 x i32>
+; THRESH-NEXT:    [[TMP40:%.*]] = shufflevector <4 x i32> [[TMP39]], <4 x i32> poison, <8 x i32>
+; THRESH-NEXT:    [[TMP41:%.*]] = shufflevector <8 x i32> [[TMP24]], <8 x i32> [[TMP40]], <8 x i32>
+; THRESH-NEXT:    [[TMP42:%.*]] = trunc <8 x i32> [[TMP41]] to <8 x i16>
+; THRESH-NEXT:    [[TMP43:%.*]] = shufflevector <4 x i32> [[TMP26]], <4 x i32> [[TMP39]], <4 x i32>
+; THRESH-NEXT:    [[TMP44:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[SHR2237_I]], i32 3
+; THRESH-NEXT:    [[TMP45:%.*]] = trunc <4 x i32> [[TMP44]] to <4 x i16>
+; THRESH-NEXT:    [[TMP46:%.*]] = trunc <4 x i32> [[TMP39]] to <4 x i16>
+; THRESH-NEXT:    store <4 x i16> [[TMP45]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2
+; THRESH-NEXT:    store <8 x i16> [[TMP42]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8180), align 4
+; THRESH-NEXT:    store <4 x i16> [[TMP46]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8212), align 4
 ; THRESH-NEXT:    ret i32 0
 ;
 entry:
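
Reviewer note (not part of the patch): the profitability gate added in isProfitableToVectorizeWithNonVecUsers ultimately reduces to the three-way cost comparison in areExtractsCheaperThanScalars. The standalone C++ sketch below models only that comparison with plain integers; the struct, field names, and sample values are hypothetical stand-ins for the TargetTransformInfo-based queries used in the patch.

// Reviewer sketch (hypothetical costs, not part of the patch): models the
// decision made by areExtractsCheaperThanScalars() with plain integers
// instead of TargetTransformInfo cost queries.
#include <cstdio>

struct Costs {
  int UserScalarsCost; // cost of the user scalars that stay non-vectorized
  int ExtractCost;     // cost of extracting those lanes from the user vector
  int UserEntryCost;   // cost of the vectorized user tree entry itself
  int BuildVecCost;    // cost of rebuilding the operand vector from scalars
                       // (a reuse shuffle is folded in here)
};

// Vectorization of the operand node is kept if the extracts for the
// non-vectorized user lanes are cheaper than (a) the scalars they replace,
// or (b) the user entry itself, or (c) the scalars plus rebuilding the
// operand vector from scalars.
static bool keepVectorized(const Costs &C) {
  if (C.ExtractCost <= C.UserScalarsCost)
    return true;
  if (C.ExtractCost <= C.UserEntryCost)
    return true;
  return C.ExtractCost < C.UserScalarsCost + C.BuildVecCost;
}

int main() {
  // Sample numbers only: extracts are expensive relative to everything else,
  // so the operand node would be postponed (gathered) rather than vectorized
  // immediately.
  Costs C{/*UserScalarsCost=*/2, /*ExtractCost=*/8, /*UserEntryCost=*/3,
          /*BuildVecCost=*/2};
  std::printf("keep vectorized: %s\n", keepVectorized(C) ? "yes" : "no");
  return 0;
}

If the node later becomes profitable (for example, once its users are themselves vectorized), the transformNodes() change above retries buildTreeRec for the postponed entry, which is what the updated RISC-V test exercises.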