diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 65528b3050fe5..4248b0144ef18 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -3338,6 +3338,14 @@ namespace ISD {
     return St && St->getAddressingMode() == ISD::UNINDEXED;
   }
 
+  /// Returns true if the specified node is a non-extending and unindexed
+  /// masked load.
+  inline bool isNormalMaskedLoad(const SDNode *N) {
+    auto *Ld = dyn_cast<MaskedLoadSDNode>(N);
+    return Ld && Ld->getExtensionType() == ISD::NON_EXTLOAD &&
+           Ld->getAddressingMode() == ISD::UNINDEXED;
+  }
+
   /// Attempt to match a unary predicate against a scalar/splat constant or
   /// every element of a constant BUILD_VECTOR.
   /// If AllowUndef is true, then UNDEF elements will pass nullptr to Match.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b8335113e4687..2f41fc1ae5655 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1179,6 +1179,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
 
   setTargetDAGCombine(ISD::SHL);
+  setTargetDAGCombine(ISD::VECTOR_DEINTERLEAVE);
 
   // In case of strict alignment, avoid an excessive number of byte wide stores.
   MaxStoresPerMemsetOptSize = 8;
@@ -27015,6 +27016,115 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   return NVCAST;
 }
 
+static SDValue performVectorDeinterleaveCombine(
+    SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  unsigned NumParts = N->getNumOperands();
+  if (NumParts != 2 && NumParts != 4)
+    return SDValue();
+
+  EVT SubVecTy = N->getValueType(0);
+
+  // At the moment we're unlikely to see a fixed-width vector deinterleave as
+  // we usually generate shuffles instead.
+  unsigned MinNumElements = SubVecTy.getVectorMinNumElements();
+  if (!SubVecTy.isScalableVector() ||
+      SubVecTy.getSizeInBits().getKnownMinValue() != 128 ||
+      !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy))
+    return SDValue();
+
+  // Make sure each input operand is the correct extract_subvector of the same
+  // wider vector.
+  SDValue Op0 = N->getOperand(0);
+  for (unsigned I = 0; I < NumParts; I++) {
+    SDValue OpI = N->getOperand(I);
+    if (OpI->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+        OpI->getOperand(0) != Op0->getOperand(0))
+      return SDValue();
+    if (OpI->getConstantOperandVal(1) != (I * MinNumElements))
+      return SDValue();
+  }
+
+  // Normal loads are currently already handled by the InterleavedAccessPass
+  // so we don't expect to see them here. Bail out if the masked load has an
+  // unexpected number of uses, since we want to avoid a situation where we
+  // have both deinterleaving loads and normal loads in the same block. Also,
+  // discard masked loads that are extending, indexed, have an unexpected
+  // offset or have an unsupported passthru value until we find a valid use
+  // case.
+  auto *MaskedLoad = dyn_cast<MaskedLoadSDNode>(Op0->getOperand(0));
+  if (!MaskedLoad || !MaskedLoad->hasNUsesOfValue(NumParts, 0) ||
+      !MaskedLoad->isSimple() || !ISD::isNormalMaskedLoad(MaskedLoad) ||
+      !MaskedLoad->getOffset().isUndef() ||
+      (!MaskedLoad->getPassThru()->isUndef() &&
+       !isZerosVector(MaskedLoad->getPassThru().getNode())))
+    return SDValue();
+
+  // Now prove that the mask is an interleave of identical masks.
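+  // The supported patterns are either a splat of the narrow mask, i.e.
+  //   Mask = splat_vector X
+  // or a concatenation of the sequential results of a VECTOR_INTERLEAVE of
+  // NumParts identical narrow masks, e.g. for NumParts == 2:
+  //   Mask = concat_vectors (vector_interleave P, P):0,
+  //                         (vector_interleave P, P):1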
+  SDValue Mask = MaskedLoad->getMask();
+  if (Mask->getOpcode() != ISD::SPLAT_VECTOR &&
+      Mask->getOpcode() != ISD::CONCAT_VECTORS)
+    return SDValue();
+
+  SDValue NarrowMask;
+  SDLoc DL(N);
+  if (Mask->getOpcode() == ISD::CONCAT_VECTORS) {
+    if (Mask->getNumOperands() != NumParts)
+      return SDValue();
+
+    // We should be concatenating each sequential result from a
+    // VECTOR_INTERLEAVE.
+    SDNode *InterleaveOp = Mask->getOperand(0).getNode();
+    if (InterleaveOp->getOpcode() != ISD::VECTOR_INTERLEAVE ||
+        InterleaveOp->getNumOperands() != NumParts)
+      return SDValue();
+
+    for (unsigned I = 0; I < NumParts; I++) {
+      if (Mask.getOperand(I) != SDValue(InterleaveOp, I))
+        return SDValue();
+    }
+
+    // Make sure the inputs to the vector interleave are identical.
+    if (!llvm::all_equal(InterleaveOp->op_values()))
+      return SDValue();
+
+    NarrowMask = InterleaveOp->getOperand(0);
+  } else { // ISD::SPLAT_VECTOR
+    ElementCount EC = Mask.getValueType().getVectorElementCount();
+    assert(EC.isKnownMultipleOf(NumParts) &&
+           "Expected element count divisible by number of parts");
+    EC = EC.divideCoefficientBy(NumParts);
+    NarrowMask =
+        DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::getVectorVT(MVT::i1, EC),
+                    Mask->getOperand(0));
+  }
+
+  const Intrinsic::ID IID = NumParts == 2 ? Intrinsic::aarch64_sve_ld2_sret
+                                          : Intrinsic::aarch64_sve_ld4_sret;
+  SDValue NewLdOps[] = {MaskedLoad->getChain(),
+                        DAG.getConstant(IID, DL, MVT::i32), NarrowMask,
+                        MaskedLoad->getBasePtr()};
+  SDValue Res;
+  if (NumParts == 2)
+    Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
+                      {SubVecTy, SubVecTy, MVT::Other}, NewLdOps);
+  else
+    Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
+                      {SubVecTy, SubVecTy, SubVecTy, SubVecTy, MVT::Other},
+                      NewLdOps);
+
+  // We can now generate a structured load!
+  SmallVector<SDValue, 4> ResOps(NumParts);
+  for (unsigned Idx = 0; Idx < NumParts; Idx++)
+    ResOps[Idx] = SDValue(Res.getNode(), Idx);
+
+  // Replace uses of the original chain result with the new chain result.
+  DAG.ReplaceAllUsesOfValueWith(SDValue(MaskedLoad, 1),
+                                SDValue(Res.getNode(), NumParts));
+  return DCI.CombineTo(N, ResOps, false);
+}
+
 /// If the operand is a bitwise AND with a constant RHS, and the shift has a
 /// constant RHS and is the only use, we can pull it out of the shift, i.e.
/// @@ -27083,6 +27193,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, default: LLVM_DEBUG(dbgs() << "Custom combining: skipping\n"); break; + case ISD::VECTOR_DEINTERLEAVE: + return performVectorDeinterleaveCombine(N, DCI, DAG); case ISD::VECREDUCE_AND: case ISD::VECREDUCE_OR: case ISD::VECREDUCE_XOR: diff --git a/llvm/test/CodeGen/AArch64/fixed_masked_deinterleaved_loads.ll b/llvm/test/CodeGen/AArch64/fixed_masked_deinterleaved_loads.ll new file mode 100644 index 0000000000000..730dfed5ff228 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/fixed_masked_deinterleaved_loads.ll @@ -0,0 +1,464 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +define { <16 x i8>, <16 x i8> } @foo_ld2_v16i8(<16 x i1> %mask, ptr %p) { +; CHECK-LABEL: foo_ld2_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: zip2 v1.16b, v0.16b, v0.16b +; CHECK-NEXT: zip1 v0.16b, v0.16b, v0.16b +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: shl v1.16b, v1.16b, #7 +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v1.16b, v1.16b, v2.16b +; CHECK-NEXT: zip1 v0.16b, v0.16b, v3.16b +; CHECK-NEXT: addv h1, v1.8h +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: bfi w8, w9, #16, #16 +; CHECK-NEXT: tbz w8, #0, .LBB0_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: ldr b1, [x0] +; CHECK-NEXT: tbnz w8, #1, .LBB0_3 +; CHECK-NEXT: b .LBB0_4 +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #1, .LBB0_4 +; CHECK-NEXT: .LBB0_3: // %cond.load1 +; CHECK-NEXT: add x9, x0, #1 +; CHECK-NEXT: ld1 { v1.b }[1], [x9] +; CHECK-NEXT: .LBB0_4: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB0_20 +; CHECK-NEXT: // %bb.5: // %else5 +; CHECK-NEXT: tbnz w8, #3, .LBB0_21 +; CHECK-NEXT: .LBB0_6: // %else8 +; CHECK-NEXT: tbnz w8, #4, .LBB0_22 +; CHECK-NEXT: .LBB0_7: // %else11 +; CHECK-NEXT: tbnz w8, #5, .LBB0_23 +; CHECK-NEXT: .LBB0_8: // %else14 +; CHECK-NEXT: tbnz w8, #6, .LBB0_24 +; CHECK-NEXT: .LBB0_9: // %else17 +; CHECK-NEXT: tbnz w8, #7, .LBB0_25 +; CHECK-NEXT: .LBB0_10: // %else20 +; CHECK-NEXT: tbnz w8, #8, .LBB0_26 +; CHECK-NEXT: .LBB0_11: // %else23 +; CHECK-NEXT: tbnz w8, #9, .LBB0_27 +; CHECK-NEXT: .LBB0_12: // %else26 +; CHECK-NEXT: tbnz w8, #10, .LBB0_28 +; CHECK-NEXT: .LBB0_13: // %else29 +; CHECK-NEXT: tbnz w8, #11, .LBB0_29 +; CHECK-NEXT: .LBB0_14: // %else32 +; CHECK-NEXT: tbnz w8, #12, .LBB0_30 +; CHECK-NEXT: .LBB0_15: // %else35 +; CHECK-NEXT: tbnz w8, #13, .LBB0_31 +; CHECK-NEXT: .LBB0_16: // %else38 +; CHECK-NEXT: tbnz w8, #14, .LBB0_32 +; CHECK-NEXT: .LBB0_17: // %else41 +; CHECK-NEXT: tbnz w8, #15, .LBB0_33 +; CHECK-NEXT: .LBB0_18: // %else44 +; CHECK-NEXT: tbz w8, #16, .LBB0_34 +; CHECK-NEXT: .LBB0_19: // %cond.load46 +; CHECK-NEXT: add x9, x0, #16 +; CHECK-NEXT: ld1 { v2.b }[0], [x9] +; CHECK-NEXT: tbnz w8, #17, .LBB0_35 +; CHECK-NEXT: b .LBB0_36 +; CHECK-NEXT: .LBB0_20: // %cond.load4 +; CHECK-NEXT: add x9, x0, #2 +; CHECK-NEXT: ld1 { v1.b }[2], [x9] +; CHECK-NEXT: tbz w8, #3, .LBB0_6 +; CHECK-NEXT: .LBB0_21: // %cond.load7 +; CHECK-NEXT: add x9, x0, #3 +; 
CHECK-NEXT: ld1 { v1.b }[3], [x9] +; CHECK-NEXT: tbz w8, #4, .LBB0_7 +; CHECK-NEXT: .LBB0_22: // %cond.load10 +; CHECK-NEXT: add x9, x0, #4 +; CHECK-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-NEXT: tbz w8, #5, .LBB0_8 +; CHECK-NEXT: .LBB0_23: // %cond.load13 +; CHECK-NEXT: add x9, x0, #5 +; CHECK-NEXT: ld1 { v1.b }[5], [x9] +; CHECK-NEXT: tbz w8, #6, .LBB0_9 +; CHECK-NEXT: .LBB0_24: // %cond.load16 +; CHECK-NEXT: add x9, x0, #6 +; CHECK-NEXT: ld1 { v1.b }[6], [x9] +; CHECK-NEXT: tbz w8, #7, .LBB0_10 +; CHECK-NEXT: .LBB0_25: // %cond.load19 +; CHECK-NEXT: add x9, x0, #7 +; CHECK-NEXT: ld1 { v1.b }[7], [x9] +; CHECK-NEXT: tbz w8, #8, .LBB0_11 +; CHECK-NEXT: .LBB0_26: // %cond.load22 +; CHECK-NEXT: add x9, x0, #8 +; CHECK-NEXT: ld1 { v1.b }[8], [x9] +; CHECK-NEXT: tbz w8, #9, .LBB0_12 +; CHECK-NEXT: .LBB0_27: // %cond.load25 +; CHECK-NEXT: add x9, x0, #9 +; CHECK-NEXT: ld1 { v1.b }[9], [x9] +; CHECK-NEXT: tbz w8, #10, .LBB0_13 +; CHECK-NEXT: .LBB0_28: // %cond.load28 +; CHECK-NEXT: add x9, x0, #10 +; CHECK-NEXT: ld1 { v1.b }[10], [x9] +; CHECK-NEXT: tbz w8, #11, .LBB0_14 +; CHECK-NEXT: .LBB0_29: // %cond.load31 +; CHECK-NEXT: add x9, x0, #11 +; CHECK-NEXT: ld1 { v1.b }[11], [x9] +; CHECK-NEXT: tbz w8, #12, .LBB0_15 +; CHECK-NEXT: .LBB0_30: // %cond.load34 +; CHECK-NEXT: add x9, x0, #12 +; CHECK-NEXT: ld1 { v1.b }[12], [x9] +; CHECK-NEXT: tbz w8, #13, .LBB0_16 +; CHECK-NEXT: .LBB0_31: // %cond.load37 +; CHECK-NEXT: add x9, x0, #13 +; CHECK-NEXT: ld1 { v1.b }[13], [x9] +; CHECK-NEXT: tbz w8, #14, .LBB0_17 +; CHECK-NEXT: .LBB0_32: // %cond.load40 +; CHECK-NEXT: add x9, x0, #14 +; CHECK-NEXT: ld1 { v1.b }[14], [x9] +; CHECK-NEXT: tbz w8, #15, .LBB0_18 +; CHECK-NEXT: .LBB0_33: // %cond.load43 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: ld1 { v1.b }[15], [x9] +; CHECK-NEXT: tbnz w8, #16, .LBB0_19 +; CHECK-NEXT: .LBB0_34: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: tbz w8, #17, .LBB0_36 +; CHECK-NEXT: .LBB0_35: // %cond.load49 +; CHECK-NEXT: add x9, x0, #17 +; CHECK-NEXT: ld1 { v2.b }[1], [x9] +; CHECK-NEXT: .LBB0_36: // %else50 +; CHECK-NEXT: tbnz w8, #18, .LBB0_52 +; CHECK-NEXT: // %bb.37: // %else53 +; CHECK-NEXT: tbnz w8, #19, .LBB0_53 +; CHECK-NEXT: .LBB0_38: // %else56 +; CHECK-NEXT: tbnz w8, #20, .LBB0_54 +; CHECK-NEXT: .LBB0_39: // %else59 +; CHECK-NEXT: tbnz w8, #21, .LBB0_55 +; CHECK-NEXT: .LBB0_40: // %else62 +; CHECK-NEXT: tbnz w8, #22, .LBB0_56 +; CHECK-NEXT: .LBB0_41: // %else65 +; CHECK-NEXT: tbnz w8, #23, .LBB0_57 +; CHECK-NEXT: .LBB0_42: // %else68 +; CHECK-NEXT: tbnz w8, #24, .LBB0_58 +; CHECK-NEXT: .LBB0_43: // %else71 +; CHECK-NEXT: tbnz w8, #25, .LBB0_59 +; CHECK-NEXT: .LBB0_44: // %else74 +; CHECK-NEXT: tbnz w8, #26, .LBB0_60 +; CHECK-NEXT: .LBB0_45: // %else77 +; CHECK-NEXT: tbnz w8, #27, .LBB0_61 +; CHECK-NEXT: .LBB0_46: // %else80 +; CHECK-NEXT: tbnz w8, #28, .LBB0_62 +; CHECK-NEXT: .LBB0_47: // %else83 +; CHECK-NEXT: tbnz w8, #29, .LBB0_63 +; CHECK-NEXT: .LBB0_48: // %else86 +; CHECK-NEXT: tbnz w8, #30, .LBB0_64 +; CHECK-NEXT: .LBB0_49: // %else89 +; CHECK-NEXT: tbz w8, #31, .LBB0_51 +; CHECK-NEXT: .LBB0_50: // %cond.load91 +; CHECK-NEXT: add x8, x0, #31 +; CHECK-NEXT: ld1 { v2.b }[15], [x8] +; CHECK-NEXT: .LBB0_51: // %else92 +; CHECK-NEXT: uzp1 v0.16b, v1.16b, v2.16b +; CHECK-NEXT: uzp2 v1.16b, v1.16b, v2.16b +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_52: // %cond.load52 +; CHECK-NEXT: add x9, x0, #18 +; CHECK-NEXT: ld1 { v2.b }[2], [x9] +; CHECK-NEXT: tbz w8, #19, .LBB0_38 +; CHECK-NEXT: .LBB0_53: // %cond.load55 +; CHECK-NEXT: add x9, x0, #19 +; CHECK-NEXT: ld1 { v2.b 
}[3], [x9] +; CHECK-NEXT: tbz w8, #20, .LBB0_39 +; CHECK-NEXT: .LBB0_54: // %cond.load58 +; CHECK-NEXT: add x9, x0, #20 +; CHECK-NEXT: ld1 { v2.b }[4], [x9] +; CHECK-NEXT: tbz w8, #21, .LBB0_40 +; CHECK-NEXT: .LBB0_55: // %cond.load61 +; CHECK-NEXT: add x9, x0, #21 +; CHECK-NEXT: ld1 { v2.b }[5], [x9] +; CHECK-NEXT: tbz w8, #22, .LBB0_41 +; CHECK-NEXT: .LBB0_56: // %cond.load64 +; CHECK-NEXT: add x9, x0, #22 +; CHECK-NEXT: ld1 { v2.b }[6], [x9] +; CHECK-NEXT: tbz w8, #23, .LBB0_42 +; CHECK-NEXT: .LBB0_57: // %cond.load67 +; CHECK-NEXT: add x9, x0, #23 +; CHECK-NEXT: ld1 { v2.b }[7], [x9] +; CHECK-NEXT: tbz w8, #24, .LBB0_43 +; CHECK-NEXT: .LBB0_58: // %cond.load70 +; CHECK-NEXT: add x9, x0, #24 +; CHECK-NEXT: ld1 { v2.b }[8], [x9] +; CHECK-NEXT: tbz w8, #25, .LBB0_44 +; CHECK-NEXT: .LBB0_59: // %cond.load73 +; CHECK-NEXT: add x9, x0, #25 +; CHECK-NEXT: ld1 { v2.b }[9], [x9] +; CHECK-NEXT: tbz w8, #26, .LBB0_45 +; CHECK-NEXT: .LBB0_60: // %cond.load76 +; CHECK-NEXT: add x9, x0, #26 +; CHECK-NEXT: ld1 { v2.b }[10], [x9] +; CHECK-NEXT: tbz w8, #27, .LBB0_46 +; CHECK-NEXT: .LBB0_61: // %cond.load79 +; CHECK-NEXT: add x9, x0, #27 +; CHECK-NEXT: ld1 { v2.b }[11], [x9] +; CHECK-NEXT: tbz w8, #28, .LBB0_47 +; CHECK-NEXT: .LBB0_62: // %cond.load82 +; CHECK-NEXT: add x9, x0, #28 +; CHECK-NEXT: ld1 { v2.b }[12], [x9] +; CHECK-NEXT: tbz w8, #29, .LBB0_48 +; CHECK-NEXT: .LBB0_63: // %cond.load85 +; CHECK-NEXT: add x9, x0, #29 +; CHECK-NEXT: ld1 { v2.b }[13], [x9] +; CHECK-NEXT: tbz w8, #30, .LBB0_49 +; CHECK-NEXT: .LBB0_64: // %cond.load88 +; CHECK-NEXT: add x9, x0, #30 +; CHECK-NEXT: ld1 { v2.b }[14], [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB0_50 +; CHECK-NEXT: b .LBB0_51 + %interleaved.mask = call <32 x i1> @llvm.vector.interleave2.v32i1(<16 x i1> %mask, <16 x i1> %mask) + %wide.masked.vec = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr %p, i32 1, <32 x i1> %interleaved.mask, <32 x i8> poison) + %deinterleaved.vec = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %wide.masked.vec) + ret { <16 x i8>, <16 x i8> } %deinterleaved.vec +} + +define { <8 x i16>, <8 x i16> } @foo_ld2_v8i16(<8 x i1> %mask, ptr %p) { +; CHECK-LABEL: foo_ld2_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: adrp x8, .LCPI1_0 +; CHECK-NEXT: zip1 v0.16b, v0.16b, v0.16b +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v0.16b, v0.16b, v1.16b +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tbz w8, #0, .LBB1_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: ldr h1, [x0] +; CHECK-NEXT: tbnz w8, #1, .LBB1_3 +; CHECK-NEXT: b .LBB1_4 +; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #1, .LBB1_4 +; CHECK-NEXT: .LBB1_3: // %cond.load1 +; CHECK-NEXT: add x9, x0, #2 +; CHECK-NEXT: ld1 { v1.h }[1], [x9] +; CHECK-NEXT: .LBB1_4: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB1_12 +; CHECK-NEXT: // %bb.5: // %else5 +; CHECK-NEXT: tbnz w8, #3, .LBB1_13 +; CHECK-NEXT: .LBB1_6: // %else8 +; CHECK-NEXT: tbnz w8, #4, .LBB1_14 +; CHECK-NEXT: .LBB1_7: // %else11 +; CHECK-NEXT: tbnz w8, #5, .LBB1_15 +; CHECK-NEXT: .LBB1_8: // %else14 +; CHECK-NEXT: tbnz w8, #6, .LBB1_16 +; CHECK-NEXT: .LBB1_9: // %else17 +; CHECK-NEXT: tbnz w8, #7, .LBB1_17 +; CHECK-NEXT: .LBB1_10: // %else20 +; CHECK-NEXT: tbz w8, #8, .LBB1_18 +; CHECK-NEXT: .LBB1_11: // %cond.load22 +; 
CHECK-NEXT: add x9, x0, #16 +; CHECK-NEXT: ld1 { v2.h }[0], [x9] +; CHECK-NEXT: tbnz w8, #9, .LBB1_19 +; CHECK-NEXT: b .LBB1_20 +; CHECK-NEXT: .LBB1_12: // %cond.load4 +; CHECK-NEXT: add x9, x0, #4 +; CHECK-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-NEXT: tbz w8, #3, .LBB1_6 +; CHECK-NEXT: .LBB1_13: // %cond.load7 +; CHECK-NEXT: add x9, x0, #6 +; CHECK-NEXT: ld1 { v1.h }[3], [x9] +; CHECK-NEXT: tbz w8, #4, .LBB1_7 +; CHECK-NEXT: .LBB1_14: // %cond.load10 +; CHECK-NEXT: add x9, x0, #8 +; CHECK-NEXT: ld1 { v1.h }[4], [x9] +; CHECK-NEXT: tbz w8, #5, .LBB1_8 +; CHECK-NEXT: .LBB1_15: // %cond.load13 +; CHECK-NEXT: add x9, x0, #10 +; CHECK-NEXT: ld1 { v1.h }[5], [x9] +; CHECK-NEXT: tbz w8, #6, .LBB1_9 +; CHECK-NEXT: .LBB1_16: // %cond.load16 +; CHECK-NEXT: add x9, x0, #12 +; CHECK-NEXT: ld1 { v1.h }[6], [x9] +; CHECK-NEXT: tbz w8, #7, .LBB1_10 +; CHECK-NEXT: .LBB1_17: // %cond.load19 +; CHECK-NEXT: add x9, x0, #14 +; CHECK-NEXT: ld1 { v1.h }[7], [x9] +; CHECK-NEXT: tbnz w8, #8, .LBB1_11 +; CHECK-NEXT: .LBB1_18: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: tbz w8, #9, .LBB1_20 +; CHECK-NEXT: .LBB1_19: // %cond.load25 +; CHECK-NEXT: add x9, x0, #18 +; CHECK-NEXT: ld1 { v2.h }[1], [x9] +; CHECK-NEXT: .LBB1_20: // %else26 +; CHECK-NEXT: tbnz w8, #10, .LBB1_28 +; CHECK-NEXT: // %bb.21: // %else29 +; CHECK-NEXT: tbnz w8, #11, .LBB1_29 +; CHECK-NEXT: .LBB1_22: // %else32 +; CHECK-NEXT: tbnz w8, #12, .LBB1_30 +; CHECK-NEXT: .LBB1_23: // %else35 +; CHECK-NEXT: tbnz w8, #13, .LBB1_31 +; CHECK-NEXT: .LBB1_24: // %else38 +; CHECK-NEXT: tbnz w8, #14, .LBB1_32 +; CHECK-NEXT: .LBB1_25: // %else41 +; CHECK-NEXT: tbz w8, #15, .LBB1_27 +; CHECK-NEXT: .LBB1_26: // %cond.load43 +; CHECK-NEXT: add x8, x0, #30 +; CHECK-NEXT: ld1 { v2.h }[7], [x8] +; CHECK-NEXT: .LBB1_27: // %else44 +; CHECK-NEXT: uzp1 v0.8h, v1.8h, v2.8h +; CHECK-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_28: // %cond.load28 +; CHECK-NEXT: add x9, x0, #20 +; CHECK-NEXT: ld1 { v2.h }[2], [x9] +; CHECK-NEXT: tbz w8, #11, .LBB1_22 +; CHECK-NEXT: .LBB1_29: // %cond.load31 +; CHECK-NEXT: add x9, x0, #22 +; CHECK-NEXT: ld1 { v2.h }[3], [x9] +; CHECK-NEXT: tbz w8, #12, .LBB1_23 +; CHECK-NEXT: .LBB1_30: // %cond.load34 +; CHECK-NEXT: add x9, x0, #24 +; CHECK-NEXT: ld1 { v2.h }[4], [x9] +; CHECK-NEXT: tbz w8, #13, .LBB1_24 +; CHECK-NEXT: .LBB1_31: // %cond.load37 +; CHECK-NEXT: add x9, x0, #26 +; CHECK-NEXT: ld1 { v2.h }[5], [x9] +; CHECK-NEXT: tbz w8, #14, .LBB1_25 +; CHECK-NEXT: .LBB1_32: // %cond.load40 +; CHECK-NEXT: add x9, x0, #28 +; CHECK-NEXT: ld1 { v2.h }[6], [x9] +; CHECK-NEXT: tbnz w8, #15, .LBB1_26 +; CHECK-NEXT: b .LBB1_27 + %interleaved.mask = call <16 x i1> @llvm.vector.interleave2.v16i1(<8 x i1> %mask, <8 x i1> %mask) + %wide.masked.vec = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr %p, i32 2, <16 x i1> %interleaved.mask, <16 x i16> poison) + %deinterleaved.vec = call { <8 x i16>, <8 x i16> } @llvm.vector.deinterleave2.v16i16(<16 x i16> %wide.masked.vec) + ret { <8 x i16>, <8 x i16> } %deinterleaved.vec +} + +define { <4 x float>, <4 x float> } @foo_ld2_v4f32(<4 x i1> %mask, ptr %p) { +; CHECK-LABEL: foo_ld2_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b +; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-NEXT: shl v0.8b, v0.8b, #7 +; CHECK-NEXT: cmlt v0.8b, v0.8b, #0 +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: addv b0, v0.8b +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tbz w8, #0, .LBB2_2 +; CHECK-NEXT: // %bb.1: 
// %cond.load +; CHECK-NEXT: ldr s1, [x0] +; CHECK-NEXT: tbnz w8, #1, .LBB2_3 +; CHECK-NEXT: b .LBB2_4 +; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #1, .LBB2_4 +; CHECK-NEXT: .LBB2_3: // %cond.load1 +; CHECK-NEXT: add x9, x0, #4 +; CHECK-NEXT: ld1 { v1.s }[1], [x9] +; CHECK-NEXT: .LBB2_4: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB2_8 +; CHECK-NEXT: // %bb.5: // %else5 +; CHECK-NEXT: tbnz w8, #3, .LBB2_9 +; CHECK-NEXT: .LBB2_6: // %else8 +; CHECK-NEXT: tbz w8, #4, .LBB2_10 +; CHECK-NEXT: .LBB2_7: // %cond.load10 +; CHECK-NEXT: add x9, x0, #16 +; CHECK-NEXT: ld1 { v2.s }[0], [x9] +; CHECK-NEXT: tbnz w8, #5, .LBB2_11 +; CHECK-NEXT: b .LBB2_12 +; CHECK-NEXT: .LBB2_8: // %cond.load4 +; CHECK-NEXT: add x9, x0, #8 +; CHECK-NEXT: ld1 { v1.s }[2], [x9] +; CHECK-NEXT: tbz w8, #3, .LBB2_6 +; CHECK-NEXT: .LBB2_9: // %cond.load7 +; CHECK-NEXT: add x9, x0, #12 +; CHECK-NEXT: ld1 { v1.s }[3], [x9] +; CHECK-NEXT: tbnz w8, #4, .LBB2_7 +; CHECK-NEXT: .LBB2_10: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: tbz w8, #5, .LBB2_12 +; CHECK-NEXT: .LBB2_11: // %cond.load13 +; CHECK-NEXT: add x9, x0, #20 +; CHECK-NEXT: ld1 { v2.s }[1], [x9] +; CHECK-NEXT: .LBB2_12: // %else14 +; CHECK-NEXT: tbnz w8, #6, .LBB2_16 +; CHECK-NEXT: // %bb.13: // %else17 +; CHECK-NEXT: tbz w8, #7, .LBB2_15 +; CHECK-NEXT: .LBB2_14: // %cond.load19 +; CHECK-NEXT: add x8, x0, #28 +; CHECK-NEXT: ld1 { v2.s }[3], [x8] +; CHECK-NEXT: .LBB2_15: // %else20 +; CHECK-NEXT: uzp1 v0.4s, v1.4s, v2.4s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB2_16: // %cond.load16 +; CHECK-NEXT: add x9, x0, #24 +; CHECK-NEXT: ld1 { v2.s }[2], [x9] +; CHECK-NEXT: tbnz w8, #7, .LBB2_14 +; CHECK-NEXT: b .LBB2_15 + %interleaved.mask = call <8 x i1> @llvm.vector.interleave2.v8i1(<4 x i1> %mask, <4 x i1> %mask) + %wide.masked.vec = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %p, i32 4, <8 x i1> %interleaved.mask, <8 x float> poison) + %deinterleaved.vec = call { <4 x float>, <4 x float> } @llvm.vector.deinterleave2.v16f32(<8 x float> %wide.masked.vec) + ret { <4 x float>, <4 x float> } %deinterleaved.vec +} + +define { <2 x double>, <2 x double> } @foo_ld2_v2f64(<2 x i1> %mask, ptr %p) { +; CHECK-LABEL: foo_ld2_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: zip1 v0.4h, v0.4h, v0.4h +; CHECK-NEXT: shl v0.4h, v0.4h, #15 +; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: addv h0, v0.4h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tbz w8, #0, .LBB3_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: tbnz w8, #1, .LBB3_3 +; CHECK-NEXT: b .LBB3_4 +; CHECK-NEXT: .LBB3_2: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #1, .LBB3_4 +; CHECK-NEXT: .LBB3_3: // %cond.load1 +; CHECK-NEXT: add x9, x0, #8 +; CHECK-NEXT: ld1 { v1.d }[1], [x9] +; CHECK-NEXT: .LBB3_4: // %else2 +; CHECK-NEXT: tbz w8, #2, .LBB3_6 +; CHECK-NEXT: // %bb.5: // %cond.load4 +; CHECK-NEXT: add x9, x0, #16 +; CHECK-NEXT: ld1 { v2.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #3, .LBB3_7 +; CHECK-NEXT: b .LBB3_8 +; CHECK-NEXT: .LBB3_6: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: tbz w8, #3, .LBB3_8 +; CHECK-NEXT: .LBB3_7: // %cond.load7 +; CHECK-NEXT: add x8, x0, #24 +; CHECK-NEXT: ld1 { v2.d }[1], [x8] +; CHECK-NEXT: .LBB3_8: // %else8 +; CHECK-NEXT: zip1 v0.2d, v1.2d, v2.2d +; CHECK-NEXT: zip2 v1.2d, v1.2d, v2.2d +; CHECK-NEXT: ret + 
%interleaved.mask = call <4 x i1> @llvm.vector.interleave2.v4i1(<2 x i1> %mask, <2 x i1> %mask) + %wide.masked.vec = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %p, i32 8, <4 x i1> %interleaved.mask, <4 x double> poison) + %deinterleaved.vec = call { <2 x double>, <2 x double> } @llvm.vector.deinterleave2.v4f64(<4 x double> %wide.masked.vec) + ret { <2 x double>, <2 x double> } %deinterleaved.vec +} + diff --git a/llvm/test/CodeGen/AArch64/scalable_masked_deinterleaved_loads.ll b/llvm/test/CodeGen/AArch64/scalable_masked_deinterleaved_loads.ll new file mode 100644 index 0000000000000..c9b77a2109dcf --- /dev/null +++ b/llvm/test/CodeGen/AArch64/scalable_masked_deinterleaved_loads.ll @@ -0,0 +1,287 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +define { , } @foo_ld2_nxv16i8( %mask, ptr %p) { +; CHECK-LABEL: foo_ld2_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call @llvm.vector.interleave2.nxv32i1( %mask, %mask) + %wide.masked.vec = call @llvm.masked.load.nxv32i8(ptr %p, i32 1, %interleaved.mask, poison) + %deinterleaved.vec = call { , } @llvm.vector.deinterleave2.nxv32i8( %wide.masked.vec) + ret { , } %deinterleaved.vec +} + +define { , } @foo_ld2_nxv8i16( %mask, ptr %p) { +; CHECK-LABEL: foo_ld2_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call @llvm.vector.interleave2.nxv16i1( %mask, %mask) + %wide.masked.vec = call @llvm.masked.load.nxv16i16.p0(ptr %p, i32 2, %interleaved.mask, poison) + %deinterleaved.vec = call { , } @llvm.vector.deinterleave2.nxv16i16( %wide.masked.vec) + ret { , } %deinterleaved.vec +} + +define { , } @foo_ld2_nxv4f32( %mask, ptr %p) { +; CHECK-LABEL: foo_ld2_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2w { z0.s, z1.s }, p0/z, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call @llvm.vector.interleave2.nxv8i1( %mask, %mask) + %wide.masked.vec = call @llvm.masked.load.nxv8f32(ptr %p, i32 4, %interleaved.mask, poison) + %deinterleaved.vec = call { , } @llvm.vector.deinterleave2.nxv8f32( %wide.masked.vec) + ret { , } %deinterleaved.vec +} + +define { , } @foo_ld2_nxv2f64( %mask, ptr %p) { +; CHECK-LABEL: foo_ld2_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call @llvm.vector.interleave2.nxv4i1( %mask, %mask) + %wide.masked.vec = call @llvm.masked.load.nxv4f64(ptr %p, i32 8, %interleaved.mask, poison) + %deinterleaved.vec = call { , } @llvm.vector.deinterleave2.nxv4f64( %wide.masked.vec) + ret { , } %deinterleaved.vec +} + +define { , , , } @foo_ld4_nxv16i8( %mask, ptr %p) { +; CHECK-LABEL: foo_ld4_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4b { z0.b - z3.b }, p0/z, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call @llvm.vector.interleave4.nxv64i1( %mask, %mask, %mask, %mask) + %wide.masked.vec = call @llvm.masked.load.nxv64i8(ptr %p, i32 1, %interleaved.mask, poison) + %deinterleaved.vec = call { , , , } @llvm.vector.deinterleave4.nxv64i8( %wide.masked.vec) + ret { , , , } %deinterleaved.vec +} + +define { , , , } @foo_ld4_nxv8i16( %mask, ptr %p) { +; CHECK-LABEL: foo_ld4_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4h { z0.h - z3.h }, p0/z, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call @llvm.vector.interleave4.nxv32i1( %mask, %mask, %mask, %mask) + %wide.masked.vec = call @llvm.masked.load.nxv32i16(ptr %p, i32 2, 
%interleaved.mask, poison) + %deinterleaved.vec = call { , , , } @llvm.vector.deinterleave4.nxv32i16( %wide.masked.vec) + ret { , , , } %deinterleaved.vec +} + +define { , , , } @foo_ld4_nxv4f32( %mask, ptr %p) { +; CHECK-LABEL: foo_ld4_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4w { z0.s - z3.s }, p0/z, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call @llvm.vector.interleave4.nxv16i1( %mask, %mask, %mask, %mask) + %wide.masked.vec = call @llvm.masked.load.nxv16f32(ptr %p, i32 4, %interleaved.mask, poison) + %deinterleaved.vec = call { , , , } @llvm.vector.deinterleave4.nxv16f32( %wide.masked.vec) + ret { , , , } %deinterleaved.vec +} + +define { , , , } @foo_ld4_nxv2f64( %mask, ptr %p) { +; CHECK-LABEL: foo_ld4_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4d { z0.d - z3.d }, p0/z, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call @llvm.vector.interleave4.nxv8i1( %mask, %mask, %mask, %mask) + %wide.masked.vec = call @llvm.masked.load.nxv8f64(ptr %p, i32 8, %interleaved.mask, poison) + %deinterleaved.vec = call { , , , } @llvm.vector.deinterleave4.nxv8f64( %wide.masked.vec) + ret { , , , } %deinterleaved.vec +} + + +define { , , , } @foo_ld4_nxv16i8_mul_use_of_mask( %mask, ptr %p, ptr %p2) { +; CHECK-LABEL: foo_ld4_nxv16i8_mul_use_of_mask: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 p2.b, p0.b, p0.b +; CHECK-NEXT: ld4b { z0.b - z3.b }, p0/z, [x0] +; CHECK-NEXT: zip2 p1.b, p0.b, p0.b +; CHECK-NEXT: zip1 p3.b, p2.b, p2.b +; CHECK-NEXT: zip2 p0.b, p1.b, p1.b +; CHECK-NEXT: zip1 p1.b, p1.b, p1.b +; CHECK-NEXT: zip2 p2.b, p2.b, p2.b +; CHECK-NEXT: // fake_use: $p3 +; CHECK-NEXT: // fake_use: $p2 +; CHECK-NEXT: // fake_use: $p1 +; CHECK-NEXT: // fake_use: $p0 +; CHECK-NEXT: ret + %interleaved.mask = call @llvm.vector.interleave4.nxv64i1( %mask, %mask, %mask, %mask) + %wide.masked.vec = call @llvm.masked.load.nxv64i8(ptr %p, i32 4, %interleaved.mask, poison) + %deinterleaved.vec = call { , , , } @llvm.vector.deinterleave4.nxv64i8( %wide.masked.vec) + call void (...) 
@llvm.fake.use( %interleaved.mask) + ret { , , , } %deinterleaved.vec +} + +define { , , , } @foo_ld4_nxv16i8_mask_of_interleaved_ones(ptr %p) { +; CHECK-LABEL: foo_ld4_nxv16i8_mask_of_interleaved_ones: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ld4b { z0.b - z3.b }, p0/z, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call @llvm.vector.interleave4.nxv64i1( splat(i1 1), splat(i1 1), splat(i1 1), splat(i1 1)) + %wide.masked.vec = call @llvm.masked.load.nxv64i8(ptr %p, i32 4, %interleaved.mask, poison) + %deinterleaved.vec = call { , , , } @llvm.vector.deinterleave4.nxv64i8( %wide.masked.vec) + ret { , , , } %deinterleaved.vec +} + +define { , , , } @foo_ld4_nxv16i8_mask_of_ones(ptr %p) { +; CHECK-LABEL: foo_ld4_nxv16i8_mask_of_ones: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ld4b { z0.b - z3.b }, p0/z, [x0] +; CHECK-NEXT: ret + %wide.masked.vec = call @llvm.masked.load.nxv64i8(ptr %p, i32 4, splat(i1 1), poison) + %deinterleaved.vec = call { , , , } @llvm.vector.deinterleave4.nxv64i8( %wide.masked.vec) + ret { , , , } %deinterleaved.vec +} + + +; Negative tests + +define { , } @foo_ld2_nxv16i8_mul_use_of_load( %mask, ptr %p, ptr %p2) { +; CHECK-LABEL: foo_ld2_nxv16i8_mul_use_of_load: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 p1.b, p0.b, p0.b +; CHECK-NEXT: zip2 p0.b, p0.b, p0.b +; CHECK-NEXT: ld1b { z3.b }, p1/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: uzp1 z0.b, z3.b, z2.b +; CHECK-NEXT: uzp2 z1.b, z3.b, z2.b +; CHECK-NEXT: // fake_use: $z3 +; CHECK-NEXT: // fake_use: $z2 +; CHECK-NEXT: ret + %interleaved.mask = call @llvm.vector.interleave2.nxv32i1( %mask, %mask) + %wide.masked.vec = call @llvm.masked.load.nxv32i8(ptr %p, i32 4, %interleaved.mask, poison) + %deinterleaved.vec = call { , } @llvm.vector.deinterleave2.nxv32i8( %wide.masked.vec) + call void (...) @llvm.fake.use( %wide.masked.vec) + ret { , } %deinterleaved.vec +} + +; Mask must be an interleave of identical masks. +define { , } @foo_ld2_nxv16i8_bad_mask( %mask, %mask2, ptr %p, ptr %p2) { +; CHECK-LABEL: foo_ld2_nxv16i8_bad_mask: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 p2.b, p0.b, p1.b +; CHECK-NEXT: zip2 p0.b, p0.b, p1.b +; CHECK-NEXT: ld1b { z2.b }, p2/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: uzp1 z0.b, z2.b, z1.b +; CHECK-NEXT: uzp2 z1.b, z2.b, z1.b +; CHECK-NEXT: ret + %interleaved.mask = call @llvm.vector.interleave2.nxv32i1( %mask, %mask2) + %wide.masked.vec = call @llvm.masked.load.nxv32i8(ptr %p, i32 4, %interleaved.mask, poison) + %deinterleaved.vec = call { , } @llvm.vector.deinterleave2.nxv32i8( %wide.masked.vec) + ret { , } %deinterleaved.vec +} + +; Number of parts in mask interleave must match deinterleave. 
+define { , , , } @foo_ld4_nxv16i8_bad_mask2( %mask, ptr %p, ptr %p2) { +; CHECK-LABEL: foo_ld4_nxv16i8_bad_mask2: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 p2.b, p1.b, p1.b +; CHECK-NEXT: zip2 p1.b, p1.b, p1.b +; CHECK-NEXT: zip2 p3.b, p0.b, p0.b +; CHECK-NEXT: ld1b { z3.b }, p2/z, [x0, #2, mul vl] +; CHECK-NEXT: zip1 p0.b, p0.b, p0.b +; CHECK-NEXT: ld1b { z2.b }, p1/z, [x0, #3, mul vl] +; CHECK-NEXT: ld1b { z0.b }, p3/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: uzp1 z4.b, z3.b, z2.b +; CHECK-NEXT: uzp2 z3.b, z3.b, z2.b +; CHECK-NEXT: uzp1 z5.b, z1.b, z0.b +; CHECK-NEXT: uzp2 z6.b, z1.b, z0.b +; CHECK-NEXT: uzp1 z0.b, z5.b, z4.b +; CHECK-NEXT: uzp1 z1.b, z6.b, z3.b +; CHECK-NEXT: uzp2 z2.b, z5.b, z4.b +; CHECK-NEXT: uzp2 z3.b, z6.b, z3.b +; CHECK-NEXT: ret + %interleaved.mask = call @llvm.vector.interleave2.nxv64i1( %mask, %mask) + %wide.masked.vec = call @llvm.masked.load.nxv64i8(ptr %p, i32 4, %interleaved.mask, poison) + %deinterleaved.vec = call { , , , } @llvm.vector.deinterleave4.nxv64i8( %wide.masked.vec) + ret { , , , } %deinterleaved.vec +} + +; Mask must come from an interleave or a splat. +define { , } @foo_ld2_nxv16i8_bad_mask3( %mask, ptr %p, ptr %p2) { +; CHECK-LABEL: foo_ld2_nxv16i8_bad_mask3: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0] +; CHECK-NEXT: uzp1 z0.b, z2.b, z1.b +; CHECK-NEXT: uzp2 z1.b, z2.b, z1.b +; CHECK-NEXT: ret + %wide.masked.vec = call @llvm.masked.load.nxv32i8(ptr %p, i32 4, %mask, poison) + %deinterleaved.vec = call { , } @llvm.vector.deinterleave2.nxv32i8( %wide.masked.vec) + ret { , } %deinterleaved.vec +} + +; Each deinterleaved vector must be exactly 128 bits. +define { , } @foo_ld2_nxv8i8( %mask, ptr %p) { +; CHECK-LABEL: foo_ld2_nxv8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: zip2 p1.h, p0.h, p0.h +; CHECK-NEXT: zip1 p0.h, p0.h, p0.h +; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: uunpkhi z1.h, z0.b +; CHECK-NEXT: uunpklo z2.h, z0.b +; CHECK-NEXT: uzp1 z0.h, z2.h, z1.h +; CHECK-NEXT: uzp2 z1.h, z2.h, z1.h +; CHECK-NEXT: ret + %interleaved.mask = call @llvm.vector.interleave2.nxv16i1( %mask, %mask) + %wide.masked.vec = call @llvm.masked.load.nxv16i8(ptr %p, i32 1, %interleaved.mask, poison) + %deinterleaved.vec = call { , } @llvm.vector.deinterleave2.nxv16i8( %wide.masked.vec) + ret { , } %deinterleaved.vec +} + +; Passthru must be poison or zero. 
+define { , } @foo_ld2_nxv16i8_bad_passthru( %mask, ptr %p) { +; CHECK-LABEL: foo_ld2_nxv16i8_bad_passthru: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 p1.b, p0.b, p0.b +; CHECK-NEXT: mov z0.b, #3 // =0x3 +; CHECK-NEXT: zip2 p0.b, p0.b, p0.b +; CHECK-NEXT: ld1b { z2.b }, p1/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: sel z2.b, p1, z2.b, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b +; CHECK-NEXT: uzp1 z0.b, z2.b, z1.b +; CHECK-NEXT: uzp2 z1.b, z2.b, z1.b +; CHECK-NEXT: ret + %interleaved.mask = call @llvm.vector.interleave2.nxv32i1( %mask, %mask) + %wide.masked.vec = call @llvm.masked.load.nxv32i8(ptr %p, i32 1, %interleaved.mask, splat(i8 3)) + %deinterleaved.vec = call { , } @llvm.vector.deinterleave2.nxv32i8( %wide.masked.vec) + ret { , } %deinterleaved.vec +} + +define { , } @foo_deinterleave2_not_load( %vec1, %vec2) { +; CHECK-LABEL: foo_deinterleave2_not_load: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 z2.h, z0.h, z1.h +; CHECK-NEXT: uzp2 z1.h, z0.h, z1.h +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %bad.vec.init = call @llvm.vector.insert.nxv16i16( poison, %vec1, i64 0) + %bad.vec = call @llvm.vector.insert.nxv16i16( %bad.vec.init, %vec2, i64 8) + %deinterleaved.vec = call { , } @llvm.vector.deinterleave2.nxv16i16( %bad.vec) + ret { , } %deinterleaved.vec +} + +define { , } @foo_ld2_nxv8i8_exti16( %mask, ptr %p) { +; CHECK-LABEL: foo_ld2_nxv8i8_exti16: +; CHECK: // %bb.0: +; CHECK-NEXT: zip2 p1.s, p0.s, p0.s +; CHECK-NEXT: zip1 p0.s, p0.s, p0.s +; CHECK-NEXT: uzp1 p0.h, p0.h, p1.h +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] +; CHECK-NEXT: uunpkhi z1.s, z0.h +; CHECK-NEXT: uunpklo z2.s, z0.h +; CHECK-NEXT: uzp1 z0.s, z2.s, z1.s +; CHECK-NEXT: uzp2 z1.s, z2.s, z1.s +; CHECK-NEXT: ret + %interleaved.mask = call @llvm.vector.interleave2.nxv8i1( %mask, %mask) + %wide.masked.vec = call @llvm.masked.load.nxv8i8(ptr %p, i32 1, %interleaved.mask, poison) + %wide.masked.vec.ext = zext %wide.masked.vec to + %deinterleaved.vec = call { , } @llvm.vector.deinterleave2.nxv8i16( %wide.masked.vec.ext) + ret { , } %deinterleaved.vec +}
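For reference, the combine's output corresponds to the SVE structured-load ("sret") intrinsics. A minimal IR-level sketch of the two-part case follows; it is illustrative only, since the combine builds the equivalent INTRINSIC_W_CHAIN node directly in the SelectionDAG, and the value names are hypothetical:

  %ldN = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> %mask, ptr %p)
  %part0 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %ldN, 0
  %part1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %ldN, 1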