diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 62ab3f522bb6f..3643e2d3ce4a1 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2025,6 +2025,7 @@ class BoUpSLP {
   void deleteTree() {
     VectorizableTree.clear();
     ScalarToTreeEntries.clear();
+    PostponedNodesWithNonVecUsers.clear();
     OperandsToTreeEntry.clear();
     ScalarsInSplitNodes.clear();
     MustGather.clear();
@@ -4033,6 +4034,9 @@ class BoUpSLP {
     /// Returns true if any scalar in the list is a copyable element.
    bool hasCopyableElements() const { return !CopyableElements.empty(); }
 
+    /// Returns the state of the operations.
+    const InstructionsState &getOperations() const { return S; }
+
     /// When ReuseReorderShuffleIndices is empty it just returns position of \p
     /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
     unsigned findLaneForValue(Value *V) const {
@@ -4436,6 +4440,13 @@ class BoUpSLP {
                        OrdersType &CurrentOrder,
                        SmallVectorImpl<Value *> &PointerOps);
 
+  /// Checks if it is profitable to vectorize the specified list of
+  /// instructions if not all users are vectorized.
+  bool isProfitableToVectorizeWithNonVecUsers(const InstructionsState &S,
+                                              const EdgeInfo &UserTreeIdx,
+                                              ArrayRef<Value *> VL,
+                                              ArrayRef<int> Mask);
+
   /// Maps a specific scalar to its tree entry(ies).
   SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
 
@@ -4446,6 +4457,9 @@ class BoUpSLP {
   /// Scalars, used in split vectorize nodes.
   SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
 
+  /// List of tree node indices which have non-vectorized users.
+  SmallSet<unsigned, 4> PostponedNodesWithNonVecUsers;
+
   /// Maps a value to the proposed vectorizable size.
   SmallDenseMap<Value *, unsigned> InstrElementSize;
 
@@ -9168,6 +9182,93 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
   return {IntrinsicCost, LibCost};
 }
 
+/// Check if extracts are cheaper than the original scalars.
+static bool
+areExtractsCheaperThanScalars(TargetTransformInfo &TTI, Type *UserScalarTy,
+                              VectorType *UserVecTy, const APInt &DemandedElts,
+                              const InstructionCost UserScalarsCost,
+                              Type *ScalarTy, unsigned VF, ArrayRef<int> Mask,
+                              InstructionCost UserEntryCost) {
+  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  // If extracts are cheaper than the original scalars - success.
+  InstructionCost ExtractCost =
+      ::getScalarizationOverhead(TTI, UserScalarTy, UserVecTy, DemandedElts,
+                                 /*Insert=*/false, /*Extract=*/true, CostKind);
+  if (ExtractCost <= UserScalarsCost)
+    return true;
+  // The node is profitable for vectorization - success.
+  if (ExtractCost <= UserEntryCost)
+    return true;
+  auto *VecTy = getWidenedType(ScalarTy, VF);
+  InstructionCost ScalarsCost =
+      ::getScalarizationOverhead(TTI, ScalarTy, VecTy, APInt::getAllOnes(VF),
+                                 /*Insert=*/true, /*Extract=*/false, CostKind);
+  if (!Mask.empty())
+    ScalarsCost +=
+        getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, VecTy, Mask, CostKind);
+  return ExtractCost < UserScalarsCost + ScalarsCost;
+}
+
+bool BoUpSLP::isProfitableToVectorizeWithNonVecUsers(
+    const InstructionsState &S, const EdgeInfo &UserTreeIdx,
+    ArrayRef<Value *> VL, ArrayRef<int> Mask) {
+  assert(S && "Expected valid instructions state.");
+  // Loads, extracts and geps are immediately scalarizable, so no need to
+  // check.
+  if (S.getOpcode() == Instruction::Load ||
+      S.getOpcode() == Instruction::ExtractElement ||
+      S.getOpcode() == Instruction::GetElementPtr)
+    return true;
+  // Check only vectorized users, others are scalarized (potentially, at
+  // least) already.
+  if (!UserTreeIdx.UserTE || UserTreeIdx.UserTE->isGather() ||
+      UserTreeIdx.UserTE->State == TreeEntry::SplitVectorize)
+    return true;
+  // PHI nodes may have cyclic deps, so cannot check here.
+  if (UserTreeIdx.UserTE->getOpcode() == Instruction::PHI)
+    return true;
+  // Do not check root reduction nodes, they do not have non-vectorized users.
+  if (UserIgnoreList && UserTreeIdx.UserTE->Idx == 0)
+    return true;
+  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  ArrayRef<Value *> UserVL = UserTreeIdx.UserTE->Scalars;
+  Type *UserScalarTy = getValueType(UserVL.front());
+  if (!isValidElementType(UserScalarTy))
+    return true;
+  Type *ScalarTy = getValueType(VL.front());
+  if (!isValidElementType(ScalarTy))
+    return true;
+  // Ignore subvector extracts for revectorized nodes, subvector extracts are
+  // always cheap as they do not require a vector-to-scalar move.
+  if (UserScalarTy->isVectorTy())
+    return true;
+  auto *UserVecTy =
+      getWidenedType(UserScalarTy, UserTreeIdx.UserTE->getVectorFactor());
+  APInt DemandedElts = APInt::getZero(UserTreeIdx.UserTE->getVectorFactor());
+  // Check the external uses and check if the vector node + extracts is not
+  // profitable for the vectorization.
+  InstructionCost UserScalarsCost = 0;
+  for (Value *V : UserVL) {
+    auto *I = dyn_cast<Instruction>(V);
+    if (!I)
+      continue;
+    if (areAllUsersVectorized(I, UserIgnoreList))
+      continue;
+    DemandedElts.setBit(UserTreeIdx.UserTE->findLaneForValue(V));
+    UserScalarsCost += TTI->getInstructionCost(I, CostKind);
+  }
+  // No non-vectorized users - success.
+  if (DemandedElts.isZero())
+    return true;
+
+  // User extracts are cheaper than user scalars + immediate scalars - success.
+  SmallPtrSet<Value *, 4> CheckedExtracts;
+  InstructionCost UserEntryCost =
+      getEntryCost(UserTreeIdx.UserTE, {}, CheckedExtracts);
+  return areExtractsCheaperThanScalars(*TTI, UserScalarTy, UserVecTy,
+                                       DemandedElts, UserScalarsCost, ScalarTy,
+                                       VL.size(), Mask, UserEntryCost);
+}
+
 BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     const InstructionsState &S, ArrayRef<Value *> VL,
     bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
@@ -10717,6 +10818,16 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
     return;
   }
 
+  // Postpone vectorization, if the node is not profitable because of the
+  // external uses of the user node, which will be represented as original
+  // scalars, not extracts. In this case, their operands must be kept scalar.
+  if (!isProfitableToVectorizeWithNonVecUsers(S, UserTreeIdx, VL,
+                                              ReuseShuffleIndices)) {
+    PostponedNodesWithNonVecUsers.insert(VectorizableTree.size());
+    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
+    return;
+  }
+
   Instruction *VL0 = S.getMainOp();
   BasicBlock *BB = VL0->getParent();
   auto &BSRef = BlocksSchedules[BB];
@@ -12102,6 +12213,27 @@ void BoUpSLP::transformNodes() {
     ArrayRef<Value *> VL = E.Scalars;
     const unsigned Sz = getVectorElementSize(VL.front());
     unsigned MinVF = getMinVF(2 * Sz);
+    const EdgeInfo &EI = E.UserTreeIndex;
+    // Try to vectorize postponed scalars, if external uses are vectorized.
+    if (PostponedNodesWithNonVecUsers.contains(E.Idx) &&
+        isProfitableToVectorizeWithNonVecUsers(
+            E.getOperations(), EI, E.Scalars, E.ReuseShuffleIndices)) {
+      assert(E.hasState() && "Expected to have state");
+      unsigned PrevSize = VectorizableTree.size();
+      [[maybe_unused]] unsigned PrevEntriesSize =
+          LoadEntriesToVectorize.size();
+      buildTreeRec(VL, 0, EdgeInfo(&E, UINT_MAX));
+      if (PrevSize + 1 == VectorizableTree.size() &&
+          VectorizableTree[PrevSize]->isGather()) {
+        VectorizableTree.pop_back();
+        assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
+               "LoadEntriesToVectorize expected to remain the same");
+      } else {
+        E.CombinedEntriesWithIndices.emplace_back(PrevSize, 0);
+        continue;
+      }
+    }
+
     // Do not try partial vectorization for small nodes (<= 2), nodes with the
     // same opcode and same parent block or all constants.
     if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
@@ -13262,7 +13394,10 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       });
       InVectors.front() = V;
     }
-    if (!SubVectors.empty()) {
+    bool FullSubvectorMatch =
+        SubVectors.size() == 1 && SubVectors.front().second == 0 &&
+        SubVectors.front().first->getVectorFactor() == CommonMask.size();
+    if (!SubVectors.empty() && !FullSubvectorMatch) {
       const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
       if (InVectors.size() == 2)
         Cost += createShuffle(Vec, InVectors.back(), CommonMask);
@@ -13373,6 +13508,16 @@ TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
 InstructionCost
 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                       SmallPtrSetImpl<Value *> &CheckedExtracts) {
+  // No need to count the cost for combined entries, they are combined and
+  // just skip their cost.
+  if (E->State == TreeEntry::CombinedVectorize) {
+    LLVM_DEBUG(
+        dbgs() << "SLP: Skipping cost for combined node that starts with "
+               << *E->Scalars[0] << ".\n";
+        E->dump());
+    return 0;
+  }
+
   ArrayRef<Value *> VL = E->Scalars;
 
   Type *ScalarTy = getValueType(VL[0]);
@@ -13784,7 +13929,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   case Instruction::Trunc:
   case Instruction::FPTrunc:
   case Instruction::BitCast: {
-    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
+    auto SrcIt =
+        MinBWs.empty() ? MinBWs.end() : MinBWs.find(getOperandEntry(E, 0));
     Type *SrcScalarTy = VL0->getOperand(0)->getType();
     auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
     unsigned Opcode = ShuffleOrOp;
@@ -14231,7 +14377,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
       auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
       if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
-        auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
+        auto SrcIt = MinBWs.empty() ? MinBWs.end()
+                                    : MinBWs.find(getOperandEntry(E, 0));
         unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
         unsigned SrcBWSz =
             DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
@@ -15036,15 +15183,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
   SmallPtrSet<Value *, 4> CheckedExtracts;
   for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
     TreeEntry &TE = *VectorizableTree[I];
-    // No need to count the cost for combined entries, they are combined and
-    // just skip their cost.
-    if (TE.State == TreeEntry::CombinedVectorize) {
-      LLVM_DEBUG(
-          dbgs() << "SLP: Skipping cost for combined node that starts with "
-                 << *TE.Scalars[0] << ".\n";
-          TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
-      continue;
-    }
     if (TE.hasState() &&
         (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
       if (const TreeEntry *E =
@@ -15259,6 +15397,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
     auto *Inst = cast<Instruction>(EU.Scalar);
     InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
     auto OperandIsScalar = [&](Value *V) {
+      if (!isa<Instruction>(V))
+        return true;
       if (!isVectorized(V)) {
         // Some extractelements might be not vectorized, but
         // transformed into shuffle and removed from the function,
@@ -17355,7 +17495,20 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
       });
      InVectors.front() = Vec;
    }
-    if (!SubVectors.empty()) {
+    const bool FullSubvectorMatch =
+        SubVectors.size() == 1 && SubVectors.front().second == 0 &&
+        SubVectors.front().first->getVectorFactor() == CommonMask.size();
+    if (FullSubvectorMatch) {
+      Value *Vec = SubVectors.front().first->VectorizedValue;
+      if (Vec->getType()->isIntOrIntVectorTy())
+        Vec = castToScalarTyElem(
+            Vec, any_of(SubVectors.front().first->Scalars, [&](Value *V) {
+              if (isa<PoisonValue>(V))
+                return false;
+              return !isKnownNonNegative(V, SimplifyQuery(*R.DL));
+            }));
+      transformMaskAfterShuffle(CommonMask, CommonMask);
+    } else if (!SubVectors.empty()) {
       Value *Vec = InVectors.front();
       if (InVectors.size() == 2) {
         Vec = createShuffle(Vec, InVectors.back(), CommonMask);
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
index d4e323819402c..b05bcf7b97abd 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
@@ -100,82 +100,86 @@ define fastcc i32 @test(i32 %0, i32 %add111.i.i, <4 x i32> %PredPel.i.sroa.86.72
 ; THRESH-NEXT:  [[ENTRY:.*:]]
 ; THRESH-NEXT:    [[LOOPARRAY_SROA_24_0_I_I3:%.*]] = ashr i32 [[TMP0]], 1
 ; THRESH-NEXT:    [[SHR143_5_I_I9:%.*]] = ashr i32 [[TMP0]], 1
-; THRESH-NEXT:    [[ADD1392_I:%.*]] = add i32 [[TMP0]], 1
+; THRESH-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
+; THRESH-NEXT:    [[TMP21:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> poison, <2 x i32> zeroinitializer
+; THRESH-NEXT:    [[TMP17:%.*]] = add <2 x i32> [[TMP21]], splat (i32 1)
 ; THRESH-NEXT:    [[MUL1445_I:%.*]] = shl i32 [[TMP0]], 1
-; THRESH-NEXT:    [[ADD2136_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], [[TMP0]]
-; THRESH-NEXT:    [[SHR2137_I:%.*]] = lshr i32 [[ADD2136_I]], 1
-; THRESH-NEXT:    [[CONV2138_I:%.*]] = trunc i32 [[SHR2137_I]] to i16
-; THRESH-NEXT:    [[ADD2174_I:%.*]] = add i32 [[MUL1445_I]], 2
-; THRESH-NEXT:    [[SHR2175_I:%.*]] = lshr i32 [[ADD2174_I]], 2
-; THRESH-NEXT:    [[CONV2176_I:%.*]] = trunc i32 [[SHR2175_I]] to i16
-; THRESH-NEXT:    [[ADD2190_I:%.*]] = or i32 [[ADD1392_I]], 1
-; THRESH-NEXT:    [[ADD2191_I:%.*]] = add i32 [[ADD2190_I]], [[TMP0]]
-; THRESH-NEXT:    [[CONV2193_I:%.*]] = trunc i32 [[ADD2191_I]] to i16
-; THRESH-NEXT:    [[ADD2203_I:%.*]] = or i32 [[TMP0]], 1
-; THRESH-NEXT:    [[ADD2204_I:%.*]] = add i32 [[ADD2203_I]], [[TMP0]]
-; THRESH-NEXT:    [[CONV2206_I:%.*]] = trunc i32 [[ADD2204_I]] to i16
+; THRESH-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32>
+; THRESH-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0
+; THRESH-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1)
+; THRESH-NEXT:    [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 1)
 ; THRESH-NEXT:    [[ADD2235_I16:%.*]] = or i32 [[TMP0]], 1
 ; THRESH-NEXT:    [[ADD2236_I:%.*]] = add i32 [[ADD2235_I16]], 1
 ; THRESH-NEXT:    [[SHR2237_I:%.*]] = lshr i32 [[ADD2236_I]], 1
-; THRESH-NEXT:    [[CONV2238_I:%.*]] = trunc i32 [[SHR2237_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8196), align 4
-; THRESH-NEXT:    store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8176), align 8
-; THRESH-NEXT:    [[ADD2258_I:%.*]] = or i32 [[ADD111_I_I]], [[TMP0]]
-; THRESH-NEXT:    [[SHR2259_I:%.*]] = lshr i32 [[ADD2258_I]], 1
-; THRESH-NEXT:    [[CONV2260_I:%.*]] = trunc i32 [[SHR2259_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8212), align 4
-; THRESH-NEXT:    store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8192), align 8
-; THRESH-NEXT:    store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8172), align 4
-; THRESH-NEXT:    [[ADD2302_I:%.*]] = add i32 [[TMP0]], 1
-; THRESH-NEXT:    [[SHR2303_I:%.*]] = lshr i32 [[ADD2302_I]], 1
-; THRESH-NEXT:    [[CONV2304_I:%.*]] = trunc i32 [[SHR2303_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8224), align 8
-; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8204), align 4
-; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8184), align 8
-; THRESH-NEXT:    [[ADD2323_I:%.*]] = add i32 [[TMP0]], 1
-; THRESH-NEXT:    [[ADD2324_I:%.*]] = or i32 [[ADD2323_I]], [[TMP0]]
-; THRESH-NEXT:    [[SHR2325_I:%.*]] = lshr i32 [[ADD2324_I]], 1
-; THRESH-NEXT:    [[CONV2326_I:%.*]] = trunc i32 [[SHR2325_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8220), align 4
-; THRESH-NEXT:    store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8200), align 8
-; THRESH-NEXT:    [[ADD2342_I:%.*]] = add i32 [[SHR143_5_I_I9]], 1
-; THRESH-NEXT:    [[SHR2343_I:%.*]] = lshr i32 [[ADD2342_I]], 1
-; THRESH-NEXT:    [[CONV2344_I:%.*]] = trunc i32 [[SHR2343_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2344_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8216), align 8
+; THRESH-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[ADD111_I_I]], i32 0
+; THRESH-NEXT:    [[TMP19:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 1
+; THRESH-NEXT:    [[TMP22:%.*]] = or <2 x i32> [[TMP19]], [[TMP21]]
+; THRESH-NEXT:    [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 1
+; THRESH-NEXT:    [[TMP12:%.*]] = insertelement <2 x i32> [[TMP25]], i32 [[SHR143_5_I_I9]], i32 0
+; THRESH-NEXT:    [[TMP13:%.*]] = add <2 x i32> [[TMP12]], splat (i32 1)
+; THRESH-NEXT:    [[TMP14:%.*]] = or <2 x i32> [[TMP12]], splat (i32 1)
+; THRESH-NEXT:    [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32>
 ; THRESH-NEXT:    [[ADD2355_I:%.*]] = or i32 [[SHR143_5_I_I9]], 1
 ; THRESH-NEXT:    [[ADD2356_I:%.*]] = add i32 [[ADD2355_I]], [[TMP0]]
 ; THRESH-NEXT:    [[CONV2358_I:%.*]] = trunc i32 [[ADD2356_I]] to i16
 ; THRESH-NEXT:    store i16 [[CONV2358_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8232), align 8
-; THRESH-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32>
-; THRESH-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0
-; THRESH-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1)
-; THRESH-NEXT:    [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 1)
-; THRESH-NEXT:    [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16>
-; THRESH-NEXT:    store <2 x i16> [[TMP6]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8180), align 4
 ; THRESH-NEXT:    [[ADD2393_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], 1
 ; THRESH-NEXT:    [[ADD2394_I:%.*]] = add i32 [[ADD2393_I]], [[TMP0]]
-; THRESH-NEXT:    [[CONV2396_I:%.*]] = trunc i32 [[ADD2394_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8198), align 2
-; THRESH-NEXT:    store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8178), align 2
-; THRESH-NEXT:    store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8214), align 2
-; THRESH-NEXT:    store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8194), align 2
-; THRESH-NEXT:    store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8174), align 2
+; THRESH-NEXT:    [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32>
 ; THRESH-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT]], <4 x i32> poison, <2 x i32>
 ; THRESH-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[ADD111_I_I]], i32 0
 ; THRESH-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP8]], splat (i32 1)
 ; THRESH-NEXT:    [[TMP10:%.*]] = lshr <2 x i32> [[TMP9]], splat (i32 1)
-; THRESH-NEXT:    [[TMP11:%.*]] = trunc <2 x i32> [[TMP10]] to <2 x i16>
-; THRESH-NEXT:    [[TMP12:%.*]] = extractelement <2 x i16> [[TMP11]], i32 1
-; THRESH-NEXT:    store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4
-; THRESH-NEXT:    store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8208), align 8
-; THRESH-NEXT:    store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8188), align 4
-; THRESH-NEXT:    store i16 [[TMP12]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2
-; THRESH-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8226), align 2
-; THRESH-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8206), align 2
-; THRESH-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8186), align 2
-; THRESH-NEXT:    store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8222), align 2
-; THRESH-NEXT:    store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8202), align 2
-; THRESH-NEXT:    store i16 [[CONV2206_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8218), align 2
+; THRESH-NEXT:    [[TMP26:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32>
+; THRESH-NEXT:    [[TMP27:%.*]] = trunc <2 x i32> [[TMP10]] to <2 x i16>
+; THRESH-NEXT:    store <2 x i16> [[TMP27]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4
+; THRESH-NEXT:    [[TMP52:%.*]] = insertelement <4 x i32> poison, i32 [[SHR2237_I]], i32 0
+; THRESH-NEXT:    [[TMP53:%.*]] = insertelement <4 x i32> [[TMP52]], i32 [[ADD2394_I]], i32 1
+; THRESH-NEXT:    [[CONV2396_I:%.*]] = trunc i32 [[ADD2394_I]] to i16
+; THRESH-NEXT:    store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8178), align 2
+; THRESH-NEXT:    [[TMP54:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 2
+; THRESH-NEXT:    [[TMP55:%.*]] = insertelement <4 x i32> [[TMP54]], i32 [[MUL1445_I]], i32 3
+; THRESH-NEXT:    [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP17]], <2 x i32> poison, <4 x i32>
+; THRESH-NEXT:    [[TMP29:%.*]] = shufflevector <4 x i32> [[TMP55]], <4 x i32> [[TMP28]], <4 x i32>
+; THRESH-NEXT:    [[TMP30:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 0
+; THRESH-NEXT:    [[TMP31:%.*]] = or <4 x i32> [[TMP29]], [[TMP30]]
+; THRESH-NEXT:    [[TMP32:%.*]] = add <4 x i32> [[TMP29]], [[TMP30]]
+; THRESH-NEXT:    [[TMP56:%.*]] = shufflevector <4 x i32> [[TMP31]], <4 x i32> [[TMP32]], <4 x i32>
+; THRESH-NEXT:    [[TMP57:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 1
+; THRESH-NEXT:    [[TMP58:%.*]] = lshr <4 x i32> [[TMP56]], [[TMP57]]
+; THRESH-NEXT:    [[TMP59:%.*]] = add <4 x i32> [[TMP56]], [[TMP57]]
+; THRESH-NEXT:    [[TMP60:%.*]] = shufflevector <4 x i32> [[TMP58]], <4 x i32> [[TMP59]], <4 x i32>
+; THRESH-NEXT:    [[TMP61:%.*]] = shufflevector <4 x i32> [[TMP60]], <4 x i32> poison, <8 x i32>
+; THRESH-NEXT:    [[TMP62:%.*]] = shufflevector <8 x i32> [[TMP16]], <8 x i32> [[TMP61]], <8 x i32>
+; THRESH-NEXT:    [[TMP63:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <8 x i32>
+; THRESH-NEXT:    [[TMP24:%.*]] = shufflevector <8 x i32> [[TMP62]], <8 x i32> [[TMP63]], <8 x i32>
+; THRESH-NEXT:    [[TMP33:%.*]] = shufflevector <4 x i32> [[TMP60]], <4 x i32> poison, <4 x i32>
+; THRESH-NEXT:    [[TMP34:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32>
+; THRESH-NEXT:    [[TMP35:%.*]] = shufflevector <4 x i32> [[TMP33]], <4 x i32> [[TMP34]], <4 x i32>
+; THRESH-NEXT:    [[TMP64:%.*]] = trunc <4 x i32> [[TMP35]] to <4 x i16>
+; THRESH-NEXT:    store <4 x i16> [[TMP64]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8204), align 4
+; THRESH-NEXT:    [[TMP65:%.*]] = shufflevector <4 x i32> [[TMP53]], <4 x i32> [[TMP60]], <4 x i32>
+; THRESH-NEXT:    [[TMP47:%.*]] = trunc <4 x i32> [[TMP65]] to <4 x i16>
+; THRESH-NEXT:    [[TMP48:%.*]] = trunc <4 x i32> [[TMP60]] to <4 x i16>
+; THRESH-NEXT:    store <4 x i16> [[TMP48]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8220), align 4
+; THRESH-NEXT:    store <4 x i16> [[TMP47]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8196), align 4
+; THRESH-NEXT:    [[TMP49:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> poison, <4 x i32>
+; THRESH-NEXT:    [[TMP50:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <4 x i32>
+; THRESH-NEXT:    [[TMP51:%.*]] = shufflevector <4 x i32> [[TMP49]], <4 x i32> [[TMP50]], <4 x i32>
+; THRESH-NEXT:    [[TMP36:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 3
+; THRESH-NEXT:    [[TMP37:%.*]] = lshr <4 x i32> [[TMP51]], [[TMP36]]
+; THRESH-NEXT:    [[TMP38:%.*]] = add <4 x i32> [[TMP51]], [[TMP36]]
+; THRESH-NEXT:    [[TMP39:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> [[TMP38]], <4 x i32>
+; THRESH-NEXT:    [[TMP40:%.*]] = shufflevector <4 x i32> [[TMP39]], <4 x i32> poison, <8 x i32>
+; THRESH-NEXT:    [[TMP41:%.*]] = shufflevector <8 x i32> [[TMP24]], <8 x i32> [[TMP40]], <8 x i32>
+; THRESH-NEXT:    [[TMP42:%.*]] = trunc <8 x i32> [[TMP41]] to <8 x i16>
+; THRESH-NEXT:    [[TMP43:%.*]] = shufflevector <4 x i32> [[TMP26]], <4 x i32> [[TMP39]], <4 x i32>
+; THRESH-NEXT:    [[TMP44:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[SHR2237_I]], i32 3
+; THRESH-NEXT:    [[TMP45:%.*]] = trunc <4 x i32> [[TMP44]] to <4 x i16>
+; THRESH-NEXT:    [[TMP46:%.*]] = trunc <4 x i32> [[TMP39]] to <4 x i16>
+; THRESH-NEXT:    store <4 x i16> [[TMP45]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2
+; THRESH-NEXT:    store <8 x i16> [[TMP42]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8180), align 4
+; THRESH-NEXT:    store <4 x i16> [[TMP46]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8212), align 4
 ; THRESH-NEXT:    ret i32 0
 ;
 entry:
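
Reviewer note (not part of the patch): the profitability gate added in isProfitableToVectorizeWithNonVecUsers ultimately reduces to the three-way cost comparison in areExtractsCheaperThanScalars. The standalone C++ sketch below models only that comparison with plain integers; the struct, field names, and sample values are hypothetical stand-ins for the TargetTransformInfo-based queries used in the patch.

// Reviewer sketch (hypothetical costs, not part of the patch): models the
// decision made by areExtractsCheaperThanScalars() with plain integers
// instead of TargetTransformInfo cost queries.
#include <cstdio>

struct Costs {
  int UserScalarsCost; // cost of the user scalars that stay non-vectorized
  int ExtractCost;     // cost of extracting those lanes from the user vector
  int UserEntryCost;   // cost of the vectorized user tree entry itself
  int BuildVecCost;    // cost of rebuilding the operand vector from scalars
                       // (a reuse shuffle is folded in here)
};

// Vectorization of the operand node is kept if the extracts for the
// non-vectorized user lanes are cheaper than (a) the scalars they replace,
// or (b) the user entry itself, or (c) the scalars plus rebuilding the
// operand vector from scalars.
static bool keepVectorized(const Costs &C) {
  if (C.ExtractCost <= C.UserScalarsCost)
    return true;
  if (C.ExtractCost <= C.UserEntryCost)
    return true;
  return C.ExtractCost < C.UserScalarsCost + C.BuildVecCost;
}

int main() {
  // Sample numbers only: extracts are expensive relative to everything else,
  // so the operand node would be postponed (gathered) rather than vectorized
  // immediately.
  Costs C{/*UserScalarsCost=*/2, /*ExtractCost=*/8, /*UserEntryCost=*/3,
          /*BuildVecCost=*/2};
  std::printf("keep vectorized: %s\n", keepVectorized(C) ? "yes" : "no");
  return 0;
}

If the node later becomes profitable (for example, once its users are themselves vectorized), the transformNodes() change above retries buildTreeRec for the postponed entry, which is what the updated RISC-V test exercises.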