-
Notifications
You must be signed in to change notification settings - Fork 14.9k
s390x: optimize 128-bit fshl and fshr by high values #154919
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-systemz Author: Folkert de Vries (folkertdev) ChangesTurn a funnel shift by N in the range This additional rule is useful because LLVM appears to canonicalize I reported this in #129955 (comment), where a https://godbolt.org/z/ossMvr31E define <16 x i8> @<!-- -->vec_sld_manual(<16 x i8> %a, <16 x i8> %b) unnamed_addr {
start:
%0 = bitcast <16 x i8> %a to i128
%1 = bitcast <16 x i8> %b to i128
%_3 = tail call i128 @<!-- -->llvm.fshl.i128(i128 %0, i128 %1, i128 123)
%2 = bitcast i128 %_3 to <16 x i8>
ret <16 x i8> %2
}
define <16 x i8> @<!-- -->vec_sld_builtin(<16 x i8> %a, <16 x i8> %b) unnamed_addr {
start:
%_0 = tail call <16 x i8> @<!-- -->llvm.s390.vsrd(<16 x i8> %a, <16 x i8> %b, i32 noundef 5) #<!-- -->4
ret <16 x i8> %_0
}
declare <16 x i8> @<!-- -->llvm.s390.vsrd(<16 x i8>, <16 x i8>, i32 immarg) unnamed_addr #<!-- -->2
declare i128 @<!-- -->llvm.fshl.i128(i128, i128, i128) #<!-- -->3 resulting in vec_sld_manual:
vsldb %v0, %v24, %v26, 15
vsldb %v1, %v26, %v26, 15
vsld %v24, %v0, %v1, 3
br %r14
vec_sld_builtin:
vsrd %v24, %v24, %v26, 5
br %r14 with this PR both functions generate the same, smaller assembly. cc @uweigand Full diff: https://github.com/llvm/llvm-project/pull/154919.diff 2 Files Affected:
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index c73dc3021eb42..2c8b4f478a6a4 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -6706,30 +6706,70 @@ SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
return Op;
}
-SDValue SystemZTargetLowering::lowerFSHL(SDValue Op, SelectionDAG &DAG) const {
+SDValue lowerFSHLHelp(SDValue Op, SelectionDAG &DAG, uint64_t ShiftAmt);
+SDValue lowerFSHRHelp(SDValue Op, SelectionDAG &DAG, uint64_t ShiftAmt);
+
+SDValue lowerFSHLHelp(SDValue Op, SelectionDAG &DAG, uint64_t ShiftAmt) {
SDLoc DL(Op);
+ // A 128-bit FSHR by small N is canonicalized to a fshl (128 - N).
+ // Convert back to a FSHR make to use of SHR_DOUBLE_BIT.
+ if (ShiftAmt > 120)
+ return lowerFSHRHelp(Op, DAG, 128 - ShiftAmt);
+
// i128 FSHL with a constant amount that is a multiple of 8 can be
- // implemented via VECTOR_SHUFFLE. If we have the vector-enhancements-2
+ // implemented via VECTOR_SHUFFLE. If we have the vector-enhancements-2
// facility, FSHL with a constant amount less than 8 can be implemented
// via SHL_DOUBLE_BIT, and FSHL with other constant amounts by a
// combination of the two.
+ SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
+ SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
+ SmallVector<int, 16> Mask(16);
+ for (unsigned Elt = 0; Elt < 16; Elt++)
+ Mask[Elt] = (ShiftAmt >> 3) + Elt;
+ SDValue Shuf1 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op1, Mask);
+ if ((ShiftAmt & 7) == 0)
+ return DAG.getBitcast(MVT::i128, Shuf1);
+ SDValue Shuf2 = DAG.getVectorShuffle(MVT::v16i8, DL, Op1, Op1, Mask);
+ SDValue Val =
+ DAG.getNode(SystemZISD::SHL_DOUBLE_BIT, DL, MVT::v16i8, Shuf1, Shuf2,
+ DAG.getTargetConstant(ShiftAmt & 7, DL, MVT::i32));
+ return DAG.getBitcast(MVT::i128, Val);
+}
+
+SDValue lowerFSHRHelp(SDValue Op, SelectionDAG &DAG, uint64_t ShiftAmt) {
+ SDLoc DL(Op);
+
+ // Unlikely to come up organically because LLVM prefers FSHL.
+ // Convert back to a FSHL make to use of SHL_DOUBLE_BIT.
+ if (ShiftAmt > 120)
+ return lowerFSHLHelp(Op, DAG, 128 - ShiftAmt);
+
+ // i128 FSHR with a constant amount that is a multiple of 8 can be
+ // implemented via VECTOR_SHUFFLE. If we have the vector-enhancements-2
+ // facility, FSHR with a constant amount less than 8 can be implemented
+ // via SHR_DOUBLE_BIT, and FSHR with other constant amounts by a
+ // combination of the two.
+ SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
+ SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
+ SmallVector<int, 16> Mask(16);
+ for (unsigned Elt = 0; Elt < 16; Elt++)
+ Mask[Elt] = 16 - (ShiftAmt >> 3) + Elt;
+ SDValue Shuf1 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op1, Mask);
+ if ((ShiftAmt & 7) == 0)
+ return DAG.getBitcast(MVT::i128, Shuf1);
+ SDValue Shuf2 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op0, Mask);
+ SDValue Val =
+ DAG.getNode(SystemZISD::SHR_DOUBLE_BIT, DL, MVT::v16i8, Shuf2, Shuf1,
+ DAG.getTargetConstant(ShiftAmt & 7, DL, MVT::i32));
+ return DAG.getBitcast(MVT::i128, Val);
+}
+
+SDValue SystemZTargetLowering::lowerFSHL(SDValue Op, SelectionDAG &DAG) const {
if (auto *ShiftAmtNode = dyn_cast<ConstantSDNode>(Op.getOperand(2))) {
uint64_t ShiftAmt = ShiftAmtNode->getZExtValue() & 127;
if ((ShiftAmt & 7) == 0 || Subtarget.hasVectorEnhancements2()) {
- SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
- SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
- SmallVector<int, 16> Mask(16);
- for (unsigned Elt = 0; Elt < 16; Elt++)
- Mask[Elt] = (ShiftAmt >> 3) + Elt;
- SDValue Shuf1 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op1, Mask);
- if ((ShiftAmt & 7) == 0)
- return DAG.getBitcast(MVT::i128, Shuf1);
- SDValue Shuf2 = DAG.getVectorShuffle(MVT::v16i8, DL, Op1, Op1, Mask);
- SDValue Val =
- DAG.getNode(SystemZISD::SHL_DOUBLE_BIT, DL, MVT::v16i8, Shuf1, Shuf2,
- DAG.getTargetConstant(ShiftAmt & 7, DL, MVT::i32));
- return DAG.getBitcast(MVT::i128, Val);
+ return lowerFSHLHelp(Op, DAG, ShiftAmt);
}
}
@@ -6737,29 +6777,10 @@ SDValue SystemZTargetLowering::lowerFSHL(SDValue Op, SelectionDAG &DAG) const {
}
SDValue SystemZTargetLowering::lowerFSHR(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
-
- // i128 FSHR with a constant amount that is a multiple of 8 can be
- // implemented via VECTOR_SHUFFLE. If we have the vector-enhancements-2
- // facility, FSHR with a constant amount less than 8 can be implemented
- // via SHL_DOUBLE_BIT, and FSHR with other constant amounts by a
- // combination of the two.
if (auto *ShiftAmtNode = dyn_cast<ConstantSDNode>(Op.getOperand(2))) {
uint64_t ShiftAmt = ShiftAmtNode->getZExtValue() & 127;
if ((ShiftAmt & 7) == 0 || Subtarget.hasVectorEnhancements2()) {
- SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
- SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
- SmallVector<int, 16> Mask(16);
- for (unsigned Elt = 0; Elt < 16; Elt++)
- Mask[Elt] = 16 - (ShiftAmt >> 3) + Elt;
- SDValue Shuf1 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op1, Mask);
- if ((ShiftAmt & 7) == 0)
- return DAG.getBitcast(MVT::i128, Shuf1);
- SDValue Shuf2 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op0, Mask);
- SDValue Val =
- DAG.getNode(SystemZISD::SHR_DOUBLE_BIT, DL, MVT::v16i8, Shuf2, Shuf1,
- DAG.getTargetConstant(ShiftAmt & 7, DL, MVT::i32));
- return DAG.getBitcast(MVT::i128, Val);
+ return lowerFSHRHelp(Op, DAG, ShiftAmt);
}
}
diff --git a/llvm/test/CodeGen/SystemZ/shift-17.ll b/llvm/test/CodeGen/SystemZ/shift-17.ll
index 45f4ed4d70d20..8f5f9abd0540b 100644
--- a/llvm/test/CodeGen/SystemZ/shift-17.ll
+++ b/llvm/test/CodeGen/SystemZ/shift-17.ll
@@ -249,3 +249,54 @@ define i128 @f8(i128 %a, i128 %b, i128 %sh) {
ret i128 %res
}
+; Funnel shift left by constant N in 121..128, in such cases fshl N == fshr (128 - N)
+define i128 @f9(i128 %a, i128 %b) {
+; CHECK-LABEL: f9:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v1, 0(%r4), 3
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vrepib %v2, 5
+; CHECK-NEXT: vsrl %v1, %v1, %v2
+; CHECK-NEXT: vrepib %v2, 123
+; CHECK-NEXT: vslb %v0, %v0, %v2
+; CHECK-NEXT: vsl %v0, %v0, %v2
+; CHECK-NEXT: vo %v0, %v0, %v1
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+;
+; Z15-LABEL: f9:
+; Z15: # %bb.0:
+; Z15-NEXT: vl %v0, 0(%r4), 3
+; Z15-NEXT: vl %v1, 0(%r3), 3
+; Z15-NEXT: vsrd %v0, %v1, %v0, 5
+; Z15-NEXT: vst %v0, 0(%r2), 3
+; Z15-NEXT: br %r14
+ %res = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 123)
+ ret i128 %res
+}
+
+; Funnel shift right by constant N in 121..128, in such cases fshr N == fshl (128 - N)
+define i128 @f10(i128 %a, i128 %b) {
+; CHECK-LABEL: f10:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v1, 0(%r3), 3
+; CHECK-NEXT: vl %v0, 0(%r4), 3
+; CHECK-NEXT: vrepib %v2, 5
+; CHECK-NEXT: vsl %v1, %v1, %v2
+; CHECK-NEXT: vrepib %v2, 123
+; CHECK-NEXT: vsrlb %v0, %v0, %v2
+; CHECK-NEXT: vsrl %v0, %v0, %v2
+; CHECK-NEXT: vo %v0, %v1, %v0
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+;
+; Z15-LABEL: f10:
+; Z15: # %bb.0:
+; Z15-NEXT: vl %v0, 0(%r4), 3
+; Z15-NEXT: vl %v1, 0(%r3), 3
+; Z15-NEXT: vsld %v0, %v1, %v0, 5
+; Z15-NEXT: vst %v0, 0(%r2), 3
+; Z15-NEXT: br %r14
+ %res = tail call i128 @llvm.fshr.i128(i128 %a, i128 %b, i128 123)
+ ret i128 %res
+}
|
Turn a funnel shift by N in the range 121..128 into a funnel shift in the opposite direction by `128 - N`. Because there are dedicated instructions for funnel shifts by values smaller than 8, this emits fewer instructions. This additional rule is useful because LLVM appears to canonicalize `fshr` into `fshl`, meaning that the rules for `fshr` on values less than 8 would not match on organic input.
87ec324
to
b3dc118
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks for taking care of this!
Can you merge this one as well? |
Done, thanks for the reminder! |
Turn a funnel shift by N in the range
121..128
into a funnel shift in the opposite direction by128 - N
. Because there are dedicated instructions for funnel shifts by values smaller than 8, this emits fewer instructions.This additional rule is useful because LLVM appears to canonicalize
fshr
intofshl
, meaning that the rules forfshr
on values less than 8 would not match on organic input.I reported this in #129955 (comment), where a
fshr(a, b, 5)
is canonicalized intofshl(a, b, 123)
.https://godbolt.org/z/ossMvr31E
resulting in
with this PR both functions generate the same, smaller assembly.
cc @uweigand