[LoongArch] Split 256-bit build_vector to avoid using LASX element insertion #154918
base: users/zhaoqi5/broadcast-repeated-buildvector
Conversation
@llvm/pr-subscribers-backend-loongarch

Author: ZhaoQi (zhaoqi5)

Changes

Note: Only worse for v8i32/v8f32/v4i64/v4f64 types when the high part only has one non-undef element.

Patch is 45.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154918.diff

7 Files Affected:
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 0499e2c3f1d4c..53be8e5804229 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2578,24 +2578,47 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
return DAG.getBitcast(ResTy, SplatVec);
}
- // Use INSERT_VECTOR_ELT operations rather than expand to stores.
- // The resulting code is the same length as the expansion, but it doesn't
- // use memory operations.
- assert(ResTy.isVector());
+ // Use INSERT_VECTOR_ELT operations rather than expand to stores, because
+ // using memory operations is much slower.
+ EVT VecTy = ResTy;
+ unsigned VecNumElts = NumElts;
+
+ // Split the 256-bit vector into two halves and fill them separately, then
+ // concatenate the two parts to get the result vector.
+ if (Is256Vec) {
+ VecTy = ResTy.getHalfNumVectorElementsVT(*DAG.getContext());
+ VecNumElts = NumElts / 2;
+ }
+ SDValue Vector = DAG.getUNDEF(VecTy);
SDValue Op0 = Node->getOperand(0);
- SDValue Vector = DAG.getUNDEF(ResTy);
-
if (!Op0.isUndef())
- Vector = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ResTy, Op0);
- for (unsigned i = 1; i < NumElts; ++i) {
+ Vector = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecTy, Op0);
+ for (unsigned i = 1; i < VecNumElts; ++i) {
SDValue Opi = Node->getOperand(i);
if (Opi.isUndef())
continue;
- Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Vector, Opi,
+ Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecTy, Vector, Opi,
DAG.getConstant(i, DL, Subtarget.getGRLenVT()));
}
- return Vector;
+
+ if (Is128Vec)
+ return Vector;
+
+ SDValue VectorHi = DAG.getUNDEF(VecTy);
+ SDValue OpHi0 = Node->getOperand(VecNumElts);
+ if (!OpHi0.isUndef())
+ VectorHi = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecTy, OpHi0);
+ for (unsigned i = VecNumElts + 1; i < NumElts; ++i) {
+ SDValue Opi = Node->getOperand(i);
+ if (Opi.isUndef())
+ continue;
+ VectorHi = DAG.getNode(
+ ISD::INSERT_VECTOR_ELT, DL, VecTy, VectorHi, Opi,
+ DAG.getConstant(i - VecNumElts, DL, Subtarget.getGRLenVT()));
+ }
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResTy, Vector, VectorHi);
}
return SDValue();
diff --git a/llvm/test/CodeGen/LoongArch/lasx/bitreverse.ll b/llvm/test/CodeGen/LoongArch/lasx/bitreverse.ll
index 11f1bce55fad6..87ee4ad025395 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/bitreverse.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/bitreverse.ll
@@ -7,18 +7,19 @@ declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>)
define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
; CHECK-LABEL: test_bitreverse_v32i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
-; CHECK-NEXT: bitrev.8b $a0, $a0
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 0
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
-; CHECK-NEXT: bitrev.8b $a0, $a0
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 1
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
; CHECK-NEXT: bitrev.8b $a0, $a0
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 2
+; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 0
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
; CHECK-NEXT: bitrev.8b $a0, $a0
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 3
+; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 1
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
+; CHECK-NEXT: bitrev.8b $a0, $a0
+; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
+; CHECK-NEXT: bitrev.8b $a0, $a0
+; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1
+; CHECK-NEXT: xvpermi.q $xr1, $xr2, 2
; CHECK-NEXT: xvori.b $xr0, $xr1, 0
; CHECK-NEXT: ret
%b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
@@ -30,19 +31,20 @@ declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>)
define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; CHECK-LABEL: test_bitreverse_v16i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
-; CHECK-NEXT: bitrev.d $a0, $a0
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 0
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
-; CHECK-NEXT: bitrev.d $a0, $a0
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 1
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
; CHECK-NEXT: bitrev.d $a0, $a0
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 2
+; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
; CHECK-NEXT: bitrev.d $a0, $a0
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 3
-; CHECK-NEXT: xvshuf4i.h $xr0, $xr1, 27
+; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
+; CHECK-NEXT: bitrev.d $a0, $a0
+; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 0
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
+; CHECK-NEXT: bitrev.d $a0, $a0
+; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 1
+; CHECK-NEXT: xvpermi.q $xr2, $xr1, 2
+; CHECK-NEXT: xvshuf4i.h $xr0, $xr2, 27
; CHECK-NEXT: ret
%b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
ret <16 x i16> %b
@@ -53,19 +55,20 @@ declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>)
define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
; CHECK-LABEL: test_bitreverse_v8i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
-; CHECK-NEXT: bitrev.d $a0, $a0
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 0
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
-; CHECK-NEXT: bitrev.d $a0, $a0
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 1
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
; CHECK-NEXT: bitrev.d $a0, $a0
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 2
+; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
; CHECK-NEXT: bitrev.d $a0, $a0
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 3
-; CHECK-NEXT: xvshuf4i.w $xr0, $xr1, 177
+; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
+; CHECK-NEXT: bitrev.d $a0, $a0
+; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 0
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
+; CHECK-NEXT: bitrev.d $a0, $a0
+; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 1
+; CHECK-NEXT: xvpermi.q $xr2, $xr1, 2
+; CHECK-NEXT: xvshuf4i.w $xr0, $xr2, 177
; CHECK-NEXT: ret
%b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
ret <8 x i32> %b
@@ -76,18 +79,19 @@ declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>)
define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
; CHECK-LABEL: test_bitreverse_v4i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
-; CHECK-NEXT: bitrev.d $a0, $a0
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 0
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
-; CHECK-NEXT: bitrev.d $a0, $a0
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 1
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
; CHECK-NEXT: bitrev.d $a0, $a0
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 2
+; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 0
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
; CHECK-NEXT: bitrev.d $a0, $a0
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 3
+; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 1
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
+; CHECK-NEXT: bitrev.d $a0, $a0
+; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
+; CHECK-NEXT: bitrev.d $a0, $a0
+; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1
+; CHECK-NEXT: xvpermi.q $xr1, $xr2, 2
; CHECK-NEXT: xvori.b $xr0, $xr1, 0
; CHECK-NEXT: ret
%b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
diff --git a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll
index e759f5c98f1e7..83aaf3376cb29 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll
@@ -224,144 +224,65 @@ entry:
define void @buildvector_v32i8(ptr %dst, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind {
; CHECK-LABEL: buildvector_v32i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi.d $sp, $sp, -80
-; CHECK-NEXT: fst.d $fs0, $sp, 72 # 8-byte Folded Spill
-; CHECK-NEXT: fst.d $fs1, $sp, 64 # 8-byte Folded Spill
-; CHECK-NEXT: fst.d $fs2, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT: fst.d $fs3, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT: fst.d $fs4, $sp, 40 # 8-byte Folded Spill
-; CHECK-NEXT: fst.d $fs5, $sp, 32 # 8-byte Folded Spill
-; CHECK-NEXT: fst.d $fs6, $sp, 24 # 8-byte Folded Spill
-; CHECK-NEXT: fst.d $fs7, $sp, 16 # 8-byte Folded Spill
-; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 0
-; CHECK-NEXT: xvreplgr2vr.b $xr1, $a2
-; CHECK-NEXT: xvreplgr2vr.b $xr2, $a3
-; CHECK-NEXT: xvreplgr2vr.b $xr3, $a4
-; CHECK-NEXT: ld.b $a1, $sp, 264
-; CHECK-NEXT: xvreplgr2vr.b $xr4, $a5
-; CHECK-NEXT: ld.b $a2, $sp, 80
-; CHECK-NEXT: xvreplgr2vr.b $xr5, $a6
-; CHECK-NEXT: ld.b $a3, $sp, 88
-; CHECK-NEXT: xvreplgr2vr.b $xr6, $a7
-; CHECK-NEXT: ld.b $a4, $sp, 96
-; CHECK-NEXT: xvreplgr2vr.b $xr7, $a2
-; CHECK-NEXT: ld.b $a2, $sp, 104
-; CHECK-NEXT: xvreplgr2vr.b $xr8, $a3
-; CHECK-NEXT: ld.b $a3, $sp, 112
-; CHECK-NEXT: xvreplgr2vr.b $xr9, $a4
-; CHECK-NEXT: ld.b $a4, $sp, 120
-; CHECK-NEXT: xvreplgr2vr.b $xr10, $a2
-; CHECK-NEXT: ld.b $a2, $sp, 128
-; CHECK-NEXT: xvreplgr2vr.b $xr11, $a3
-; CHECK-NEXT: ld.b $a3, $sp, 136
-; CHECK-NEXT: xvreplgr2vr.b $xr12, $a4
-; CHECK-NEXT: ld.b $a4, $sp, 144
-; CHECK-NEXT: xvreplgr2vr.b $xr13, $a2
-; CHECK-NEXT: ld.b $a2, $sp, 152
-; CHECK-NEXT: xvreplgr2vr.b $xr14, $a3
-; CHECK-NEXT: ld.b $a3, $sp, 160
-; CHECK-NEXT: xvreplgr2vr.b $xr15, $a4
-; CHECK-NEXT: ld.b $a4, $sp, 168
-; CHECK-NEXT: xvreplgr2vr.b $xr16, $a2
-; CHECK-NEXT: ld.b $a2, $sp, 176
-; CHECK-NEXT: xvreplgr2vr.b $xr17, $a3
-; CHECK-NEXT: ld.b $a3, $sp, 184
-; CHECK-NEXT: xvreplgr2vr.b $xr18, $a4
-; CHECK-NEXT: ld.b $a4, $sp, 192
-; CHECK-NEXT: xvreplgr2vr.b $xr19, $a2
-; CHECK-NEXT: ld.b $a2, $sp, 200
-; CHECK-NEXT: xvreplgr2vr.b $xr20, $a3
-; CHECK-NEXT: ld.b $a3, $sp, 208
-; CHECK-NEXT: xvreplgr2vr.b $xr21, $a4
-; CHECK-NEXT: ld.b $a4, $sp, 216
-; CHECK-NEXT: xvreplgr2vr.b $xr22, $a2
-; CHECK-NEXT: ld.b $a2, $sp, 224
-; CHECK-NEXT: xvreplgr2vr.b $xr23, $a3
-; CHECK-NEXT: ld.b $a3, $sp, 232
-; CHECK-NEXT: xvreplgr2vr.b $xr24, $a4
-; CHECK-NEXT: ld.b $a4, $sp, 240
-; CHECK-NEXT: xvreplgr2vr.b $xr25, $a2
-; CHECK-NEXT: ld.b $a2, $sp, 248
-; CHECK-NEXT: xvreplgr2vr.b $xr26, $a3
-; CHECK-NEXT: ld.b $a3, $sp, 256
-; CHECK-NEXT: xvreplgr2vr.b $xr27, $a4
-; CHECK-NEXT: ld.b $a4, $sp, 272
-; CHECK-NEXT: xvreplgr2vr.b $xr28, $a2
-; CHECK-NEXT: xvreplgr2vr.b $xr29, $a3
-; CHECK-NEXT: xvreplgr2vr.b $xr30, $a1
-; CHECK-NEXT: xvreplgr2vr.b $xr31, $a4
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 18
-; CHECK-NEXT: xvextrins.b $xr0, $xr1, 17
-; CHECK-NEXT: xvpermi.q $xr2, $xr0, 18
-; CHECK-NEXT: xvextrins.b $xr0, $xr2, 34
-; CHECK-NEXT: xvpermi.q $xr3, $xr0, 18
-; CHECK-NEXT: xvextrins.b $xr0, $xr3, 51
-; CHECK-NEXT: xvpermi.q $xr4, $xr0, 18
-; CHECK-NEXT: xvextrins.b $xr0, $xr4, 68
-; CHECK-NEXT: xvpermi.q $xr5, $xr0, 18
-; CHECK-NEXT: xvextrins.b $xr0, $xr5, 85
-; CHECK-NEXT: xvpermi.q $xr6, $xr0, 18
-; CHECK-NEXT: xvextrins.b $xr0, $xr6, 102
-; CHECK-NEXT: xvpermi.q $xr7, $xr0, 18
-; CHECK-NEXT: xvextrins.b $xr0, $xr7, 119
-; CHECK-NEXT: xvpermi.q $xr8, $xr0, 18
-; CHECK-NEXT: xvextrins.b $xr0, $xr8, 136
-; CHECK-NEXT: xvpermi.q $xr9, $xr0, 18
-; CHECK-NEXT: xvextrins.b $xr0, $xr9, 153
-; CHECK-NEXT: xvpermi.q $xr10, $xr0, 18
-; CHECK-NEXT: xvextrins.b $xr0, $xr10, 170
-; CHECK-NEXT: xvpermi.q $xr11, $xr0, 18
-; CHECK-NEXT: xvextrins.b $xr0, $xr11, 187
-; CHECK-NEXT: xvpermi.q $xr12, $xr0, 18
-; CHECK-NEXT: xvextrins.b $xr0, $xr12, 204
-; CHECK-NEXT: xvpermi.q $xr13, $xr0, 18
-; CHECK-NEXT: xvextrins.b $xr0, $xr13, 221
-; CHECK-NEXT: xvpermi.q $xr14, $xr0, 18
-; CHECK-NEXT: xvextrins.b $xr0, $xr14, 238
-; CHECK-NEXT: xvpermi.q $xr15, $xr0, 18
-; CHECK-NEXT: xvextrins.b $xr0, $xr15, 255
-; CHECK-NEXT: xvpermi.q $xr16, $xr0, 48
-; CHECK-NEXT: xvextrins.b $xr0, $xr16, 0
-; CHECK-NEXT: xvpermi.q $xr17, $xr0, 48
-; CHECK-NEXT: xvextrins.b $xr0, $xr17, 17
-; CHECK-NEXT: xvpermi.q $xr18, $xr0, 48
-; CHECK-NEXT: xvextrins.b $xr0, $xr18, 34
-; CHECK-NEXT: xvpermi.q $xr19, $xr0, 48
-; CHECK-NEXT: xvextrins.b $xr0, $xr19, 51
-; CHECK-NEXT: xvpermi.q $xr20, $xr0, 48
-; CHECK-NEXT: xvextrins.b $xr0, $xr20, 68
-; CHECK-NEXT: xvpermi.q $xr21, $xr0, 48
-; CHECK-NEXT: xvextrins.b $xr0, $xr21, 85
-; CHECK-NEXT: xvpermi.q $xr22, $xr0, 48
-; CHECK-NEXT: xvextrins.b $xr0, $xr22, 102
-; CHECK-NEXT: xvpermi.q $xr23, $xr0, 48
-; CHECK-NEXT: xvextrins.b $xr0, $xr23, 119
-; CHECK-NEXT: xvpermi.q $xr24, $xr0, 48
-; CHECK-NEXT: xvextrins.b $xr0, $xr24, 136
-; CHECK-NEXT: xvpermi.q $xr25, $xr0, 48
-; CHECK-NEXT: xvextrins.b $xr0, $xr25, 153
-; CHECK-NEXT: xvpermi.q $xr26, $xr0, 48
-; CHECK-NEXT: xvextrins.b $xr0, $xr26, 170
-; CHECK-NEXT: xvpermi.q $xr27, $xr0, 48
-; CHECK-NEXT: xvextrins.b $xr0, $xr27, 187
-; CHECK-NEXT: xvpermi.q $xr28, $xr0, 48
-; CHECK-NEXT: xvextrins.b $xr0, $xr28, 204
-; CHECK-NEXT: xvpermi.q $xr29, $xr0, 48
-; CHECK-NEXT: xvextrins.b $xr0, $xr29, 221
-; CHECK-NEXT: xvpermi.q $xr30, $xr0, 48
-; CHECK-NEXT: xvextrins.b $xr0, $xr30, 238
-; CHECK-NEXT: xvpermi.q $xr31, $xr0, 48
-; CHECK-NEXT: xvextrins.b $xr0, $xr31, 255
-; CHECK-NEXT: xvst $xr0, $a0, 0
-; CHECK-NEXT: fld.d $fs7, $sp, 16 # 8-byte Folded Reload
-; CHECK-NEXT: fld.d $fs6, $sp, 24 # 8-byte Folded Reload
-; CHECK-NEXT: fld.d $fs5, $sp, 32 # 8-byte Folded Reload
-; CHECK-NEXT: fld.d $fs4, $sp, 40 # 8-byte Folded Reload
-; CHECK-NEXT: fld.d $fs3, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT: fld.d $fs2, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT: fld.d $fs1, $sp, 64 # 8-byte Folded Reload
-; CHECK-NEXT: fld.d $fs0, $sp, 72 # 8-byte Folded Reload
-; CHECK-NEXT: addi.d $sp, $sp, 80
+; CHECK-NEXT: ld.b $t0, $sp, 72
+; CHECK-NEXT: ld.b $t1, $sp, 184
+; CHECK-NEXT: ld.b $t2, $sp, 80
+; CHECK-NEXT: ld.b $t3, $sp, 88
+; CHECK-NEXT: vinsgr2vr.b $vr0, $t0, 0
+; CHECK-NEXT: ld.b $t0, $sp, 96
+; CHECK-NEXT: vinsgr2vr.b $vr0, $t2, 1
+; CHECK-NEXT: ld.b $t2, $sp, 104
+; CHECK-NEXT: vinsgr2vr.b $vr0, $t3, 2
+; CHECK-NEXT: ld.b $t3, $sp, 112
+; CHECK-NEXT: vinsgr2vr.b $vr0, $t0, 3
+; CHECK-NEXT: ld.b $t0, $sp, 120
+; CHECK-NEXT: vinsgr2vr.b $vr0, $t2, 4
+; CHECK-NEXT: ld.b $t2, $sp, 128
+; CHECK-NEXT: vinsgr2vr.b $vr0, $t3, 5
+; CHECK-NEXT: ld.b $t3, $sp, 136
+; CHECK-NEXT: vinsgr2vr.b $vr0, $t0, 6
+; CHECK-NEXT: ld.b $t0, $sp, 144
+; CHECK-NEXT: vinsgr2vr.b $vr0, $t2, 7
+; CHECK-NEXT: ld.b $t2, $sp, 152
+; CHECK-NEXT: vinsgr2vr.b $vr0, $t3, 8
+; CHECK-NEXT: ld.b $t3, $sp, 160
+; CHECK-NEXT: vinsgr2vr.b $vr0, $t0, 9
+; CHECK-NEXT: ld.b $t0, $sp, 168
+; CHECK-NEXT: vinsgr2vr.b $vr0, $t2, 10
+; CHECK-NEXT: ld.b $t2, $sp, 176
+; CHECK-NEXT: vinsgr2vr.b $vr0, $t3, 11
+; CHECK-NEXT: ld.b $t3, $sp, 192
+; CHECK-NEXT: vinsgr2vr.b $vr0, $t0, 12
+; CHECK-NEXT: vinsgr2vr.b $vr0, $t2, 13
+; CHECK-NEXT: vinsgr2vr.b $vr0, $t1, 14
+; CHECK-NEXT: vinsgr2vr.b $vr0, $t3, 15
+; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 0
+; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 1
+; CHECK-NEXT: vinsgr2vr.b $vr1, $a3, 2
+; CHECK-NEXT: vinsgr2vr.b $vr1, $a4, 3
+; CHECK-NEXT: ld.b $a1, $sp, 56
+; CHECK-NEXT: vinsgr2vr.b $vr1, $a5, 4
+; CHECK-NEXT: ld.b $a2, $sp, 0
+; CHECK-NEXT: vinsgr2vr.b $vr1, $a6, 5
+; CHECK-NEXT: ld.b $a3, $sp, 8
+; CHECK-NEXT: vinsgr2vr.b $vr1, $a7, 6
+; CHECK-NEXT: ld.b $a4, $sp, 16
+; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 7
+; CHECK-NEXT: ld.b $a2, $sp, 24
+; CHECK-NEXT: vinsgr2vr.b $vr1, $a3, 8
+; CHECK-NEXT: ld.b $a3, $sp, 32
+; CHECK-NEXT: vinsgr2vr.b $vr1, $a4, 9
+; CHECK-NEXT: ld.b $a4, $sp, 40
+; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 10
+; CHECK-NEXT: ld.b $a2, $sp, 48
+; CHECK-NEXT: vinsgr2vr.b $vr1, $a3, 11
+; CHECK-NEXT: ld.b $a3, $sp, 64
+; CHECK-NEXT: vinsgr2vr.b $vr1, $a4, 12
+; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 13
+; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 14
+; CHECK-NEXT: vinsgr2vr.b $vr1, $a3, 15
+; CHECK-NEXT: xvpermi.q $xr1, $xr0, 2
+; CHECK-NEXT: xvst $xr1, $a0, 0
; CHECK-NEXT: ret
entry:
%ins0 = insertelement <32 x i8> undef, i8 %a0, i32 0
@@ -412,48 +333,21 @@ define void @buildvector_v32i8_partial(ptr %dst, i8 %a0, i8 %a1, i8 %a2, i8 %a5,
; CHECK-NEXT: ld.b $t6, $sp, 8
; CHECK-NEXT: ld.b $t7, $sp, 0
; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 0
-; CHECK-NEXT: xvreplgr2vr.b $xr1, $a2
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 18
-; CHECK-NEXT: xvextrins.b $xr0, $xr1, 17
-; CHECK-NEXT: xvreplgr2vr.b $xr1, $a3
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 18
-; CHECK-NEXT: xvextrins.b $xr0, $xr1, 34
-; CHECK-NEXT: xvreplgr2vr.b $xr1, $a4
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 18
-; CHECK-NEXT: xvextrins.b $xr0, $xr1, 85
-; CHECK-NEXT: xvreplgr2vr.b $xr1, $a5
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 18
-; CHECK-NEXT: xvextrins.b $xr0, $xr1, 119
-; CHECK-NEXT: xvreplgr2vr.b $xr1, $a6
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 18
-; CHECK-NEXT: xvextrins.b $xr0, $xr1, 136
-; CHECK-NEXT: xvreplgr2vr.b $xr1, $a7
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 18
-; CHECK-NEXT: xvextrins.b $xr0, $xr1, 255
-; CHECK-NEXT: xvreplgr2vr.b $xr1, $t7
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 48
-; CHECK-NEXT: xvextrins.b $xr0, $xr1, 17
-; CHECK-NEXT: xvreplgr2vr.b $xr1, $t6
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 48
-; CHECK-NEXT: xvextrins.b $xr0, $xr1, 34
-; CHECK-NEXT: xvreplgr2vr.b $xr1, $t5
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 48
-; CHECK-NEXT: xvextrins.b $xr0, $xr1, 68
-; CHECK-NEXT: xvreplgr2vr.b $xr1, $t4
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 48
-; CHECK-NEXT: xvextrins.b $xr0, $xr1, 102
-; CHECK-NEXT: xvreplgr2vr.b $xr1, $t3
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 48
-; CHECK-NEXT: xvextrins.b $xr0, $xr1, 119
-; CHECK-NEXT: xvreplgr2vr.b $xr1, $t2
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 48
-; CHECK-NEXT: xvextrins.b $xr0, $xr1, 187
-; CHECK-NEXT: xvreplgr2vr.b $xr1, $t1
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 48
-; CHECK-NEXT: xvextrins.b $xr0, $xr1, 204
-; CHECK-NEXT: xvreplgr2vr.b $xr1, $t0
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 48
-; CHECK-NEXT: xvextrins.b $xr0, $xr1, 255
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a2, 1
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a3, 2
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a4, 5
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a5, 7
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a6, 8
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a7, 15
+; CHECK-NEXT: vinsgr2vr.b $vr1, $t7, 1
+; CHECK-NEXT: vinsgr2vr.b $vr1, $t6, 2
+; CHECK-NEXT: vinsgr2vr.b $vr1, $t5, 4
+; CHECK-NEXT: vinsgr2vr.b $vr1, $t4, 6
+; CHECK-NEXT: vinsgr2vr.b $vr1, $t3, 7
+; CHECK-NEXT: vinsgr2vr.b $vr1, $t2, 11
+; CHECK-NEXT: vinsgr2vr.b $vr1, $t1, 12
+; CHECK-NEXT: vinsgr2vr.b $vr1, $t0, 15
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -804,62 +698,33 @@ entry:
define void @buildvector_v16i16(ptr %dst, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
; CHECK-LABEL: buildvector_v16i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: ld.h $t0, $sp, 64
-; CHECK-NEXT: ld.h $t1, $sp, 56
-; CHECK-NEXT: ld.h $t2, $sp, 48
-; CHECK-NEXT: ld.h $t3, $sp, 40
-; CHECK-NEXT: ld.h $t4, $sp, 32
-; CHECK-NEXT: ld.h $t5, $sp, 24
+; CHECK-NEXT: ld.h $t0, $sp, 0
+; CHECK-NEXT: ld.h $t1, $sp, 64
+; CHECK-NEXT: ld.h $t2, $sp, 56
+; CHECK-NEXT: ld.h $t3, $sp, 48
+; CHECK-NEXT: ld.h $t4, $sp, 40
+; CHECK-NEXT: ld.h $t5, $sp, 8
;...
[truncated]
```diff
-    APInt Imm(64, SplatIndex);
     return DAG.getNode(LoongArchISD::VREPLVEI, DL, VT, V1,
-                       DAG.getConstant(Imm, DL, Subtarget.getGRLenVT()));
+                       DAG.getConstant(SplatIndex, DL, Subtarget.getGRLenVT()));
```
Tests for la32 passed.
Note: Only worse for v8i32/v8f32/v4i64/v4f64 types when the high part only has one non-undef element. Skip splitting to avoid this.