-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[PowerPC] Merge vsr(vsro(input, byte_shift), bit_shift) to vsrq(input, res_bit_shift) #154388
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-backend-powerpc Author: Tony Varghese (tonykuttai) Changes: This change implements a DAG combiner that combines consecutive shifts. Note:
Patch is 20.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154388.diff 5 Files Affected:
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 652edd4e04c60..c7c17c2d0f85a 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1697,6 +1697,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::XXPERM:
return "PPCISD::XXPERM";
case PPCISD::VECSHL: return "PPCISD::VECSHL";
+ case PPCISD::VSRQ: return "PPCISD::VSRQ";
case PPCISD::CMPB: return "PPCISD::CMPB";
case PPCISD::Hi: return "PPCISD::Hi";
case PPCISD::Lo: return "PPCISD::Lo";
@@ -16680,6 +16681,67 @@ SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN,
llvm_unreachable("Expected a load or store node here");
}
+// Combine VSR(VSRO input, shift), shift) to VSRQ(input, shift)
+//
+// PowerPC Vector Shift Instructions:
+// - vsro (Vector Shift Right by Octet): Shifts vector right by N bytes,
+// where N is specified in bits 121:124 of the shift vector (4 bits, 0-15
+// bytes)
+// - vsr (Vector Shift Right): Shifts vector right by N bits,
+// where N is specified in bits 125:127 of the shift vector (3 bits, 0-7 bits)
+// - vsrq (Vector Shift Right Quadword): Shifts vector right by N bits,
+// where N is specified in bits 57:63 of the shift vector (7 bits, 0-127 bits)
+//
+// Input DAG pattern: vsr(vsro(input, shift_vector), shift_vector)
+// performs the following shifts:
+// 1. vsro: input >> (bits[121:124] * 8) bits [byte shifts converted to
+// bits]
+// 2. vsr: result >> bits[125:127] bits [additional bit shifts]
+// Total shift = (bits[121:124] * 8) + bits[125:127] bits
+//
+// Since bits 121:127 form a 7-bit value representing the total shift amount,
+// and vsrq uses the same 7-bit shift amount (assuming bits 57:63 map to
+// 121:127), we can replace the two-instruction sequence with a single vsrq
+// instruction.
+//
+// Optimization: vsr(vsro(input, shift), shift) -> vsrq(input, shift)
+SDValue PPCTargetLowering::combineVSROVSRToVSRQ(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+
+ // Only available on ISA 3.1+ (Power10+)
+ if (!Subtarget.isISA3_1())
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue VSRInput = N->getOperand(1);
+ SDValue VSRShift = N->getOperand(2);
+
+ // Check if VSR input comes from a VSRO intrinsic
+ if (VSRInput.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
+ return SDValue();
+
+ unsigned VSROIntrinsicID = VSRInput->getConstantOperandVal(0);
+ if (VSROIntrinsicID != Intrinsic::ppc_altivec_vsro)
+ return SDValue();
+
+ // Check if VSRO uses the same shift amount register as VSR
+ SDValue VSROShift = VSRInput.getOperand(2);
+ if (VSRShift != VSROShift)
+ return SDValue();
+
+ // Check single use - VSRO result should only be used by this VSR
+ if (!VSRInput.hasOneUse())
+ return SDValue();
+
+ // Get the original input to VSRO instruction
+ SDValue VSROOrigInput = VSRInput.getOperand(1);
+
+ return DAG.getNode(PPCISD::VSRQ, SDLoc(N),
+ N->getValueType(0), // Preserve original result type
+ VSROOrigInput, // Original input vector
+ VSRShift); // Shift amount
+}
+
static bool isStoreConditional(SDValue Intrin, unsigned &StoreWidth) {
unsigned IntrinsicID = Intrin.getConstantOperandVal(1);
if (IntrinsicID == Intrinsic::ppc_stdcx)
@@ -17207,6 +17269,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
}
}
}
+
+ // combine VSRO + VSR intrinsic calls to optimize with VSRQ
+ if (IID == Intrinsic::ppc_altivec_vsr)
+ return combineVSROVSRToVSRQ(N, DCI);
}
break;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 5e0d6bf184f20..362ccdfb26efa 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -498,6 +498,9 @@ namespace llvm {
/// SETBCR - The ISA 3.1 (P10) SETBCR instruction.
SETBCR,
+ /// VSRQ - The ISA 3.1 (P10) Vector Shift right quadword instruction
+ VSRQ,
+
// NOTE: The nodes below may require PC-Rel specific patterns if the
// address could be PC-Relative. When adding new nodes below, consider
// whether or not the address can be PC-Relative and add the corresponding
@@ -1447,6 +1450,7 @@ namespace llvm {
SelectionDAG &DAG) const;
SDValue combineVReverseMemOP(ShuffleVectorSDNode *SVN, LSBaseSDNode *LSBase,
DAGCombinerInfo &DCI) const;
+ SDValue combineVSROVSRToVSRQ(SDNode *N, DAGCombinerInfo &DCI) const;
/// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces
/// SETCC with integer subtraction when (1) there is a legal way of doing it
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index c2f91ce8e6b96..1a57d622a5f6c 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -58,6 +58,10 @@ def SDT_PPCVecShift : SDTypeProfile<1, 3, [ SDTCisVec<0>,
SDTCisVec<1>, SDTCisVec<2>, SDTCisPtrTy<3>
]>;
+def SDT_PPCVecShiftQuad : SDTypeProfile<1, 2, [
+ SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>
+]>;
+
def SDT_PPCVecInsert : SDTypeProfile<1, 3, [ SDTCisVec<0>,
SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>
]>;
@@ -157,6 +161,8 @@ def PPCfctiwz : SDNode<"PPCISD::FCTIWZ", SDTFPUnaryOp, []>;
def PPCfctiduz: SDNode<"PPCISD::FCTIDUZ",SDTFPUnaryOp, []>;
def PPCfctiwuz: SDNode<"PPCISD::FCTIWUZ",SDTFPUnaryOp, []>;
+def PPCvsrq: SDNode<"PPCISD::VSRQ", SDT_PPCVecShiftQuad, []>;
+
def PPCstrict_fcfid : SDNode<"PPCISD::STRICT_FCFID",
SDTFPUnaryOp, [SDNPHasChain]>;
def PPCstrict_fcfidu : SDNode<"PPCISD::STRICT_FCFIDU",
diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td
index 98dd8464c0ac8..902c40544ac28 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrP10.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td
@@ -1919,7 +1919,8 @@ let Predicates = [IsISA3_1] in {
RegConstraint<"$VDi = $VD">, NoEncode<"$VDi">;
def VSLQ : VX1_VT5_VA5_VB5<261, "vslq", []>;
def VSRAQ : VX1_VT5_VA5_VB5<773, "vsraq", []>;
- def VSRQ : VX1_VT5_VA5_VB5<517, "vsrq", []>;
+ def VSRQ : VX1_VT5_VA5_VB5<517, "vsrq",
+ [(set v4i32:$VD, (PPCvsrq v4i32:$VA, v4i32:$VB))]>;
def VRLQ : VX1_VT5_VA5_VB5<5, "vrlq", []>;
def XSCVQPUQZ : X_VT5_XO5_VB5<63, 0, 836, "xscvqpuqz", []>;
def XSCVQPSQZ : X_VT5_XO5_VB5<63, 8, 836, "xscvqpsqz", []>;
diff --git a/llvm/test/CodeGen/PowerPC/vsro-vsr-vsrq-dag-combine.ll b/llvm/test/CodeGen/PowerPC/vsro-vsr-vsrq-dag-combine.ll
new file mode 100644
index 0000000000000..c2599c8f6af13
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/vsro-vsr-vsrq-dag-combine.ll
@@ -0,0 +1,337 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWER10-LE
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64-ibm-aix-xcoff \
+; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWER10-BE
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc-ibm-aix-xcoff \
+; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWER3210-BE
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWER9-LE
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64-ibm-aix-xcoff \
+; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWER9-BE
+
+; Test VSRO + VSR peephole optimization to VSRQ on Power10+
+; This should combine consecutive VSRO (Vector Shift Right Octet) and VSR (Vector Shift Right)
+; instructions using the same shift amount into a single VSRQ (Vector Shift Right Quadword)
+; instruction when targeting Power10 or later processors.
+declare <4 x i32> @llvm.ppc.altivec.vsr(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.ppc.altivec.vsro(<4 x i32>, <4 x i32>)
+
+define <16 x i8> @shiftright128_v16i8(<16 x i8> %in, i8 zeroext %sh) {
+; POWER10-LE-LABEL: shiftright128_v16i8:
+; POWER10-LE: # %bb.0: # %entry
+; POWER10-LE-NEXT: mtvsrd v3, r5
+; POWER10-LE-NEXT: vspltb v3, v3, 7
+; POWER10-LE-NEXT: vsrq v2, v2, v3
+; POWER10-LE-NEXT: blr
+;
+; POWER10-BE-LABEL: shiftright128_v16i8:
+; POWER10-BE: # %bb.0: # %entry
+; POWER10-BE-NEXT: mtvsrwz v3, r3
+; POWER10-BE-NEXT: vspltb v3, v3, 7
+; POWER10-BE-NEXT: vsrq v2, v2, v3
+; POWER10-BE-NEXT: blr
+;
+; POWER3210-BE-LABEL: shiftright128_v16i8:
+; POWER3210-BE: # %bb.0: # %entry
+; POWER3210-BE-NEXT: mtvsrwz v3, r3
+; POWER3210-BE-NEXT: vspltb v3, v3, 7
+; POWER3210-BE-NEXT: vsrq v2, v2, v3
+; POWER3210-BE-NEXT: blr
+;
+; POWER9-LE-LABEL: shiftright128_v16i8:
+; POWER9-LE: # %bb.0: # %entry
+; POWER9-LE-NEXT: mtvsrd v3, r5
+; POWER9-LE-NEXT: vspltb v3, v3, 7
+; POWER9-LE-NEXT: vsro v2, v2, v3
+; POWER9-LE-NEXT: vsr v2, v2, v3
+; POWER9-LE-NEXT: blr
+;
+; POWER9-BE-LABEL: shiftright128_v16i8:
+; POWER9-BE: # %bb.0: # %entry
+; POWER9-BE-NEXT: mtvsrwz v3, r3
+; POWER9-BE-NEXT: vspltb v3, v3, 7
+; POWER9-BE-NEXT: vsro v2, v2, v3
+; POWER9-BE-NEXT: vsr v2, v2, v3
+; POWER9-BE-NEXT: blr
+entry:
+ %splat.splatinsert.i = insertelement <16 x i8> poison, i8 %sh, i64 0
+ %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> poison, <16 x i32> zeroinitializer
+ %0 = bitcast <16 x i8> %in to <4 x i32>
+ %1 = bitcast <16 x i8> %splat.splat.i to <4 x i32>
+ %2 = tail call <4 x i32> @llvm.ppc.altivec.vsro(<4 x i32> %0, <4 x i32> %1)
+ %3 = tail call <4 x i32> @llvm.ppc.altivec.vsr(<4 x i32> %2, <4 x i32> %1)
+ %4 = bitcast <4 x i32> %3 to <16 x i8>
+ ret <16 x i8> %4
+}
+
+define <4 x i32> @shiftright128_v4i32(<4 x i32> %in, i8 zeroext %sh) {
+; POWER10-LE-LABEL: shiftright128_v4i32:
+; POWER10-LE: # %bb.0: # %entry
+; POWER10-LE-NEXT: mtvsrd v3, r5
+; POWER10-LE-NEXT: vspltb v3, v3, 7
+; POWER10-LE-NEXT: vsrq v2, v2, v3
+; POWER10-LE-NEXT: blr
+;
+; POWER10-BE-LABEL: shiftright128_v4i32:
+; POWER10-BE: # %bb.0: # %entry
+; POWER10-BE-NEXT: mtvsrwz v3, r3
+; POWER10-BE-NEXT: vspltb v3, v3, 7
+; POWER10-BE-NEXT: vsrq v2, v2, v3
+; POWER10-BE-NEXT: blr
+;
+; POWER3210-BE-LABEL: shiftright128_v4i32:
+; POWER3210-BE: # %bb.0: # %entry
+; POWER3210-BE-NEXT: mtvsrwz v3, r3
+; POWER3210-BE-NEXT: vspltb v3, v3, 7
+; POWER3210-BE-NEXT: vsrq v2, v2, v3
+; POWER3210-BE-NEXT: blr
+;
+; POWER9-LE-LABEL: shiftright128_v4i32:
+; POWER9-LE: # %bb.0: # %entry
+; POWER9-LE-NEXT: mtvsrd v3, r5
+; POWER9-LE-NEXT: vspltb v3, v3, 7
+; POWER9-LE-NEXT: vsro v2, v2, v3
+; POWER9-LE-NEXT: vsr v2, v2, v3
+; POWER9-LE-NEXT: blr
+;
+; POWER9-BE-LABEL: shiftright128_v4i32:
+; POWER9-BE: # %bb.0: # %entry
+; POWER9-BE-NEXT: mtvsrwz v3, r3
+; POWER9-BE-NEXT: vspltb v3, v3, 7
+; POWER9-BE-NEXT: vsro v2, v2, v3
+; POWER9-BE-NEXT: vsr v2, v2, v3
+; POWER9-BE-NEXT: blr
+entry:
+ %splat.splatinsert.i = insertelement <16 x i8> poison, i8 %sh, i64 0
+ %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> poison, <16 x i32> zeroinitializer
+ %0 = bitcast <16 x i8> %splat.splat.i to <4 x i32>
+ %1 = tail call <4 x i32> @llvm.ppc.altivec.vsro(<4 x i32> %in, <4 x i32> %0)
+ %2 = tail call <4 x i32> @llvm.ppc.altivec.vsr(<4 x i32> %1, <4 x i32> %0)
+ ret <4 x i32> %2
+}
+
+define <2 x i64> @shiftright128_v2i64(<2 x i64> %in, i8 zeroext %sh) {
+; POWER10-LE-LABEL: shiftright128_v2i64:
+; POWER10-LE: # %bb.0: # %entry
+; POWER10-LE-NEXT: mtvsrd v3, r5
+; POWER10-LE-NEXT: vspltb v3, v3, 7
+; POWER10-LE-NEXT: vsrq v2, v2, v3
+; POWER10-LE-NEXT: blr
+;
+; POWER10-BE-LABEL: shiftright128_v2i64:
+; POWER10-BE: # %bb.0: # %entry
+; POWER10-BE-NEXT: mtvsrwz v3, r3
+; POWER10-BE-NEXT: vspltb v3, v3, 7
+; POWER10-BE-NEXT: vsrq v2, v2, v3
+; POWER10-BE-NEXT: blr
+;
+; POWER3210-BE-LABEL: shiftright128_v2i64:
+; POWER3210-BE: # %bb.0: # %entry
+; POWER3210-BE-NEXT: mtvsrwz v3, r3
+; POWER3210-BE-NEXT: vspltb v3, v3, 7
+; POWER3210-BE-NEXT: vsrq v2, v2, v3
+; POWER3210-BE-NEXT: blr
+;
+; POWER9-LE-LABEL: shiftright128_v2i64:
+; POWER9-LE: # %bb.0: # %entry
+; POWER9-LE-NEXT: mtvsrd v3, r5
+; POWER9-LE-NEXT: vspltb v3, v3, 7
+; POWER9-LE-NEXT: vsro v2, v2, v3
+; POWER9-LE-NEXT: vsr v2, v2, v3
+; POWER9-LE-NEXT: blr
+;
+; POWER9-BE-LABEL: shiftright128_v2i64:
+; POWER9-BE: # %bb.0: # %entry
+; POWER9-BE-NEXT: mtvsrwz v3, r3
+; POWER9-BE-NEXT: vspltb v3, v3, 7
+; POWER9-BE-NEXT: vsro v2, v2, v3
+; POWER9-BE-NEXT: vsr v2, v2, v3
+; POWER9-BE-NEXT: blr
+entry:
+ %splat.splatinsert.i = insertelement <16 x i8> poison, i8 %sh, i64 0
+ %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> poison, <16 x i32> zeroinitializer
+ %0 = bitcast <2 x i64> %in to <4 x i32>
+ %1 = bitcast <16 x i8> %splat.splat.i to <4 x i32>
+ %2 = tail call <4 x i32> @llvm.ppc.altivec.vsro(<4 x i32> %0, <4 x i32> %1)
+ %3 = tail call <4 x i32> @llvm.ppc.altivec.vsr(<4 x i32> %2, <4 x i32> %1)
+ %4 = bitcast <4 x i32> %3 to <2 x i64>
+ ret <2 x i64> %4
+}
+
+define <8 x i16> @shiftright128_v8i16(<8 x i16> %in, i8 zeroext %sh) {
+; POWER10-LE-LABEL: shiftright128_v8i16:
+; POWER10-LE: # %bb.0: # %entry
+; POWER10-LE-NEXT: mtvsrd v3, r5
+; POWER10-LE-NEXT: vspltb v3, v3, 7
+; POWER10-LE-NEXT: vsrq v2, v2, v3
+; POWER10-LE-NEXT: blr
+;
+; POWER10-BE-LABEL: shiftright128_v8i16:
+; POWER10-BE: # %bb.0: # %entry
+; POWER10-BE-NEXT: mtvsrwz v3, r3
+; POWER10-BE-NEXT: vspltb v3, v3, 7
+; POWER10-BE-NEXT: vsrq v2, v2, v3
+; POWER10-BE-NEXT: blr
+;
+; POWER3210-BE-LABEL: shiftright128_v8i16:
+; POWER3210-BE: # %bb.0: # %entry
+; POWER3210-BE-NEXT: mtvsrwz v3, r3
+; POWER3210-BE-NEXT: vspltb v3, v3, 7
+; POWER3210-BE-NEXT: vsrq v2, v2, v3
+; POWER3210-BE-NEXT: blr
+;
+; POWER9-LE-LABEL: shiftright128_v8i16:
+; POWER9-LE: # %bb.0: # %entry
+; POWER9-LE-NEXT: mtvsrd v3, r5
+; POWER9-LE-NEXT: vspltb v3, v3, 7
+; POWER9-LE-NEXT: vsro v2, v2, v3
+; POWER9-LE-NEXT: vsr v2, v2, v3
+; POWER9-LE-NEXT: blr
+;
+; POWER9-BE-LABEL: shiftright128_v8i16:
+; POWER9-BE: # %bb.0: # %entry
+; POWER9-BE-NEXT: mtvsrwz v3, r3
+; POWER9-BE-NEXT: vspltb v3, v3, 7
+; POWER9-BE-NEXT: vsro v2, v2, v3
+; POWER9-BE-NEXT: vsr v2, v2, v3
+; POWER9-BE-NEXT: blr
+entry:
+ %splat.splatinsert.i = insertelement <16 x i8> poison, i8 %sh, i64 0
+ %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> poison, <16 x i32> zeroinitializer
+ %0 = bitcast <8 x i16> %in to <4 x i32>
+ %1 = bitcast <16 x i8> %splat.splat.i to <4 x i32>
+ %2 = tail call <4 x i32> @llvm.ppc.altivec.vsro(<4 x i32> %0, <4 x i32> %1)
+ %3 = tail call <4 x i32> @llvm.ppc.altivec.vsr(<4 x i32> %2, <4 x i32> %1)
+ %4 = bitcast <4 x i32> %3 to <8 x i16>
+ ret <8 x i16> %4
+}
+
+; Test case with different vectors (should not optimize - different shift amounts)
+define <16 x i8> @no_optimization_different_shifts(<16 x i8> %in, i8 zeroext %sh1, i8 zeroext %sh2) {
+; POWER10-LE-LABEL: no_optimization_different_shifts:
+; POWER10-LE: # %bb.0: # %entry
+; POWER10-LE-NEXT: mtvsrd v3, r5
+; POWER10-LE-NEXT: mtvsrd v4, r6
+; POWER10-LE-NEXT: vspltb v3, v3, 7
+; POWER10-LE-NEXT: vspltb v4, v4, 7
+; POWER10-LE-NEXT: vsro v2, v2, v3
+; POWER10-LE-NEXT: vsr v2, v2, v4
+; POWER10-LE-NEXT: blr
+;
+; POWER10-BE-LABEL: no_optimization_different_shifts:
+; POWER10-BE: # %bb.0: # %entry
+; POWER10-BE-NEXT: mtvsrwz v3, r3
+; POWER10-BE-NEXT: mtvsrwz v4, r4
+; POWER10-BE-NEXT: vspltb v3, v3, 7
+; POWER10-BE-NEXT: vspltb v4, v4, 7
+; POWER10-BE-NEXT: vsro v2, v2, v3
+; POWER10-BE-NEXT: vsr v2, v2, v4
+; POWER10-BE-NEXT: blr
+;
+; POWER3210-BE-LABEL: no_optimization_different_shifts:
+; POWER3210-BE: # %bb.0: # %entry
+; POWER3210-BE-NEXT: mtvsrwz v3, r3
+; POWER3210-BE-NEXT: mtvsrwz v4, r4
+; POWER3210-BE-NEXT: vspltb v3, v3, 7
+; POWER3210-BE-NEXT: vspltb v4, v4, 7
+; POWER3210-BE-NEXT: vsro v2, v2, v3
+; POWER3210-BE-NEXT: vsr v2, v2, v4
+; POWER3210-BE-NEXT: blr
+;
+; POWER9-LE-LABEL: no_optimization_different_shifts:
+; POWER9-LE: # %bb.0: # %entry
+; POWER9-LE-NEXT: mtvsrd v3, r5
+; POWER9-LE-NEXT: mtvsrd v4, r6
+; POWER9-LE-NEXT: vspltb v3, v3, 7
+; POWER9-LE-NEXT: vspltb v4, v4, 7
+; POWER9-LE-NEXT: vsro v2, v2, v3
+; POWER9-LE-NEXT: vsr v2, v2, v4
+; POWER9-LE-NEXT: blr
+;
+; POWER9-BE-LABEL: no_optimization_different_shifts:
+; POWER9-BE: # %bb.0: # %entry
+; POWER9-BE-NEXT: mtvsrwz v3, r3
+; POWER9-BE-NEXT: mtvsrwz v4, r4
+; POWER9-BE-NEXT: vspltb v3, v3, 7
+; POWER9-BE-NEXT: vspltb v4, v4, 7
+; POWER9-BE-NEXT: vsro v2, v2, v3
+; POWER9-BE-NEXT: vsr v2, v2, v4
+; POWER9-BE-NEXT: blr
+entry:
+ %splat.splatinsert.i = insertelement <16 x i8> poison, i8 %sh1, i64 0
+ %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> poison, <16 x i32> zeroinitializer
+ %splat.splatinsert.i2 = insertelement <16 x i8> poison, i8 %sh2, i64 0
+ %splat.splat.i2 = shufflevector <16 x i8> %splat.splatinsert.i2, <16 x i8> poison, <16 x i32> zeroinitializer
+ %0 = bitcast <16 x i8> %in to <4 x i32>
+ %1 = bitcast <16 x i8> %splat.splat.i to <4 x i32>
+ %2 = bitcast <16 x i8> %splat.splat.i2 to <4 x i32>
+ %3 = tail call <4 x i32> @llvm.ppc.altivec.vsro(<4 x i32> %0, <4 x i32> %1)
+ %4 = tail call <4 x i32> @llvm.ppc.altivec.vsr(<4 x i32> %3, <4 x i32> %2)
+ %5 = bitcast <4 x i32> %4 to <16 x i8>
+ ret <16 x i8> %5
+}
+
+; Test case with multiple uses of VSRO result (should not optimize)
+define <16 x i8> @no_optimization_multiple_uses(<16 x i8> %in, i8 zeroext %sh) {
+; POWER10-LE-LABEL: no_optimization_multiple_uses:
+; POWER10-LE: # %bb.0: # %entry
+; POWER10-LE-NEXT: mtvsrd v3, r5
+; POWER10-LE-NEXT: vspltb v3, v3, 7
+; POWER10-LE-NEXT: vsro v2, v2, v3
+; POWER10-LE-NEXT: vsr v3, v2, v3
+; POWER10-LE-NEXT: vaddubm v2, v2, v3
+; POWER10-LE-NEXT: blr
+;
+; POWER10-BE-LABEL: no_optimization_multiple_uses:
+; POWER10-BE: # %bb.0: # %entry
+; POWER10-BE-NEXT: mtvsrwz v3, r3
+; POWER10-BE-NEXT: vspltb v3, v3, 7
+; POWER10-BE-NEXT: vsro v2, v2, v3
+; POWER10-BE-NEXT: vsr v3, v2, v3
+; POWER10-BE-NEXT: vaddubm v2, v2, v3
+; POWER10-BE-NEXT: blr
+;
+; POWER3210-BE-LABEL: no_optimization_multiple_uses:
+; POWER3210-BE: # %bb.0: # %entry
+; POWER3210-BE-NEXT: mtvsrwz v3, r3
+; POWER3210-BE-NEXT: vspltb v3, v3, 7
+; POWER3210-BE-NEXT: vsro v2, v2, v3
+; POWER3210-BE-NEXT: vsr v3, v2, v3
+; POWER3210-BE-NEXT: vaddubm v2, v2, v3
+; POWER3210-BE-NEXT: blr
+;
+; POWER9-LE-LABEL: no_optimization_multiple_uses:
+; POWER9-LE: # %bb.0: # %entry
+; POWER9-LE-NEXT: mtvsrd v3, r5
+; POWER9-LE-NEXT: vspltb v3, v3, 7
+; POWER9-LE-NEXT: vsro v2, v2, v3
+; POWER9-LE-NEXT: vsr v3, v2, v3
+; POWER9-LE-NEXT: vaddubm v2, v2, v3
+; POWER9-LE-NEXT: blr
+;
+; POWER9-BE-LABEL: no_optimization_multiple_uses:
+; POWER9-BE: # %bb.0: # %entry
+; POWER9-BE-NEXT: mtvsrwz v3, r3
+; POWER9-BE-NEXT: vspltb v3, v3, 7
+; POWER9-BE-NEXT: vsro v2, v2, v3
+; POWER9-BE-NEXT: vsr v3, v2, v3
+; POWER9-BE-NEXT: vaddubm v2, v2, v3
+; POWER9-BE-NEXT: blr
+entry:
+ %splat.splatinsert.i = insertelement <16 x i8> poison, i8 %sh, i64 0
+ %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> poison, <16 x i32> zeroinitializer
+ %0 = bitcast <16 x i8> %in to <4 x i32>
+ %1 = bitcast <16 x i8> %splat.splat.i to <4 x i32>
+ %2 = tail call <4 x i32> @llvm.ppc.altivec.vsro(<4 x i32> %0, <4 x i32> %1)
+ %3 = tail call <4 x i32> @llvm.ppc.altivec.vsr(<...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
a972eb8
to
d288a26
Compare
gentle ping @RolandF77 @amy-kwan @redstar @lei137 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
This change implements a PatFrag-based pattern-matching
DAG combiner that combines consecutive VSRO (Vector Shift Right Octet)
and VSR (Vector Shift Right)
instructions into a single VSRQ (Vector Shift Right Quadword)
instruction on Power10+ processors. Vector right shift operations like
vec_srl(vec_sro(input, byte_shift), bit_shift)
generate two separate instructions (VSRO + VSR)
when they could be optimised into a single VSRQ
instruction that performs the equivalent operation. Note: