Skip to content

Conversation

folkertdev
Copy link
Contributor

Turn a funnel shift by N in the range 121..128 into a funnel shift in the opposite direction by 128 - N. Because there are dedicated instructions for funnel shifts by values smaller than 8, this emits fewer instructions.

This additional rule is useful because LLVM appears to canonicalize fshr into fshl, meaning that the rules for fshr on values less than 8 would not match on organic input.

I reported this in #129955 (comment), where a fshr(a, b, 5) is canonicalized into fshl(a, b, 123).

https://godbolt.org/z/ossMvr31E

define <16 x i8> @vec_sld_manual(<16 x i8> %a, <16 x i8> %b) unnamed_addr {
start:
  %0 = bitcast <16 x i8> %a to i128
  %1 = bitcast <16 x i8> %b to i128
  %_3 = tail call i128 @llvm.fshl.i128(i128 %0, i128 %1, i128 123)
  %2 = bitcast i128 %_3 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @vec_sld_builtin(<16 x i8> %a, <16 x i8> %b) unnamed_addr {
start:
  %_0 = tail call <16 x i8> @llvm.s390.vsrd(<16 x i8> %a, <16 x i8> %b, i32 noundef 5) #4
  ret <16 x i8> %_0
}

declare <16 x i8> @llvm.s390.vsrd(<16 x i8>, <16 x i8>, i32 immarg) unnamed_addr #2

declare i128 @llvm.fshl.i128(i128, i128, i128) #3

resulting in

vec_sld_manual:
        vsldb   %v0, %v24, %v26, 15
        vsldb   %v1, %v26, %v26, 15
        vsld    %v24, %v0, %v1, 3
        br      %r14

vec_sld_builtin:
        vsrd    %v24, %v24, %v26, 5
        br      %r14

with this PR both functions generate the same, smaller assembly.


cc @uweigand

@llvmbot
Copy link
Member

llvmbot commented Aug 22, 2025

@llvm/pr-subscribers-backend-systemz

Author: Folkert de Vries (folkertdev)

Changes

Turn a funnel shift by N in the range 121..128 into a funnel shift in the opposite direction by 128 - N. Because there are dedicated instructions for funnel shifts by values smaller than 8, this emits fewer instructions.

This additional rule is useful because LLVM appears to canonicalize fshr into fshl, meaning that the rules for fshr on values less than 8 would not match on organic input.

I reported this in #129955 (comment), where a fshr(a, b, 5) is canonicalized into fshl(a, b, 123).

https://godbolt.org/z/ossMvr31E

define &lt;16 x i8&gt; @<!-- -->vec_sld_manual(&lt;16 x i8&gt; %a, &lt;16 x i8&gt; %b) unnamed_addr {
start:
  %0 = bitcast &lt;16 x i8&gt; %a to i128
  %1 = bitcast &lt;16 x i8&gt; %b to i128
  %_3 = tail call i128 @<!-- -->llvm.fshl.i128(i128 %0, i128 %1, i128 123)
  %2 = bitcast i128 %_3 to &lt;16 x i8&gt;
  ret &lt;16 x i8&gt; %2
}

define &lt;16 x i8&gt; @<!-- -->vec_sld_builtin(&lt;16 x i8&gt; %a, &lt;16 x i8&gt; %b) unnamed_addr {
start:
  %_0 = tail call &lt;16 x i8&gt; @<!-- -->llvm.s390.vsrd(&lt;16 x i8&gt; %a, &lt;16 x i8&gt; %b, i32 noundef 5) #<!-- -->4
  ret &lt;16 x i8&gt; %_0
}

declare &lt;16 x i8&gt; @<!-- -->llvm.s390.vsrd(&lt;16 x i8&gt;, &lt;16 x i8&gt;, i32 immarg) unnamed_addr #<!-- -->2

declare i128 @<!-- -->llvm.fshl.i128(i128, i128, i128) #<!-- -->3

resulting in

vec_sld_manual:
        vsldb   %v0, %v24, %v26, 15
        vsldb   %v1, %v26, %v26, 15
        vsld    %v24, %v0, %v1, 3
        br      %r14

vec_sld_builtin:
        vsrd    %v24, %v24, %v26, 5
        br      %r14

with this PR both functions generate the same, smaller assembly.


cc @uweigand


Full diff: https://github.com/llvm/llvm-project/pull/154919.diff

2 Files Affected:

  • (modified) llvm/lib/Target/SystemZ/SystemZISelLowering.cpp (+56-35)
  • (modified) llvm/test/CodeGen/SystemZ/shift-17.ll (+51)
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index c73dc3021eb42..2c8b4f478a6a4 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -6706,30 +6706,70 @@ SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
   return Op;
 }
 
-SDValue SystemZTargetLowering::lowerFSHL(SDValue Op, SelectionDAG &DAG) const {
+SDValue lowerFSHLHelp(SDValue Op, SelectionDAG &DAG, uint64_t ShiftAmt);
+SDValue lowerFSHRHelp(SDValue Op, SelectionDAG &DAG, uint64_t ShiftAmt);
+
+SDValue lowerFSHLHelp(SDValue Op, SelectionDAG &DAG, uint64_t ShiftAmt) {
   SDLoc DL(Op);
 
+  // A 128-bit FSHR by small N is canonicalized to a fshl (128 - N).
+  // Convert back to a FSHR make to use of SHR_DOUBLE_BIT.
+  if (ShiftAmt > 120)
+    return lowerFSHRHelp(Op, DAG, 128 - ShiftAmt);
+
   // i128 FSHL with a constant amount that is a multiple of 8 can be
-  // implemented via VECTOR_SHUFFLE.  If we have the vector-enhancements-2
+  // implemented via VECTOR_SHUFFLE. If we have the vector-enhancements-2
   // facility, FSHL with a constant amount less than 8 can be implemented
   // via SHL_DOUBLE_BIT, and FSHL with other constant amounts by a
   // combination of the two.
+  SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
+  SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
+  SmallVector<int, 16> Mask(16);
+  for (unsigned Elt = 0; Elt < 16; Elt++)
+    Mask[Elt] = (ShiftAmt >> 3) + Elt;
+  SDValue Shuf1 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op1, Mask);
+  if ((ShiftAmt & 7) == 0)
+    return DAG.getBitcast(MVT::i128, Shuf1);
+  SDValue Shuf2 = DAG.getVectorShuffle(MVT::v16i8, DL, Op1, Op1, Mask);
+  SDValue Val =
+      DAG.getNode(SystemZISD::SHL_DOUBLE_BIT, DL, MVT::v16i8, Shuf1, Shuf2,
+                  DAG.getTargetConstant(ShiftAmt & 7, DL, MVT::i32));
+  return DAG.getBitcast(MVT::i128, Val);
+}
+
+SDValue lowerFSHRHelp(SDValue Op, SelectionDAG &DAG, uint64_t ShiftAmt) {
+  SDLoc DL(Op);
+
+  // Unlikely to come up organically because LLVM prefers FSHL.
+  // Convert back to a FSHL make to use of SHL_DOUBLE_BIT.
+  if (ShiftAmt > 120)
+    return lowerFSHLHelp(Op, DAG, 128 - ShiftAmt);
+
+  // i128 FSHR with a constant amount that is a multiple of 8 can be
+  // implemented via VECTOR_SHUFFLE. If we have the vector-enhancements-2
+  // facility, FSHR with a constant amount less than 8 can be implemented
+  // via SHR_DOUBLE_BIT, and FSHR with other constant amounts by a
+  // combination of the two.
+  SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
+  SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
+  SmallVector<int, 16> Mask(16);
+  for (unsigned Elt = 0; Elt < 16; Elt++)
+    Mask[Elt] = 16 - (ShiftAmt >> 3) + Elt;
+  SDValue Shuf1 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op1, Mask);
+  if ((ShiftAmt & 7) == 0)
+    return DAG.getBitcast(MVT::i128, Shuf1);
+  SDValue Shuf2 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op0, Mask);
+  SDValue Val =
+      DAG.getNode(SystemZISD::SHR_DOUBLE_BIT, DL, MVT::v16i8, Shuf2, Shuf1,
+                  DAG.getTargetConstant(ShiftAmt & 7, DL, MVT::i32));
+  return DAG.getBitcast(MVT::i128, Val);
+}
+
+SDValue SystemZTargetLowering::lowerFSHL(SDValue Op, SelectionDAG &DAG) const {
   if (auto *ShiftAmtNode = dyn_cast<ConstantSDNode>(Op.getOperand(2))) {
     uint64_t ShiftAmt = ShiftAmtNode->getZExtValue() & 127;
     if ((ShiftAmt & 7) == 0 || Subtarget.hasVectorEnhancements2()) {
-      SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
-      SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
-      SmallVector<int, 16> Mask(16);
-      for (unsigned Elt = 0; Elt < 16; Elt++)
-        Mask[Elt] = (ShiftAmt >> 3) + Elt;
-      SDValue Shuf1 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op1, Mask);
-      if ((ShiftAmt & 7) == 0)
-        return DAG.getBitcast(MVT::i128, Shuf1);
-      SDValue Shuf2 = DAG.getVectorShuffle(MVT::v16i8, DL, Op1, Op1, Mask);
-      SDValue Val =
-          DAG.getNode(SystemZISD::SHL_DOUBLE_BIT, DL, MVT::v16i8, Shuf1, Shuf2,
-                      DAG.getTargetConstant(ShiftAmt & 7, DL, MVT::i32));
-      return DAG.getBitcast(MVT::i128, Val);
+      return lowerFSHLHelp(Op, DAG, ShiftAmt);
     }
   }
 
@@ -6737,29 +6777,10 @@ SDValue SystemZTargetLowering::lowerFSHL(SDValue Op, SelectionDAG &DAG) const {
 }
 
 SDValue SystemZTargetLowering::lowerFSHR(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc DL(Op);
-
-  // i128 FSHR with a constant amount that is a multiple of 8 can be
-  // implemented via VECTOR_SHUFFLE.  If we have the vector-enhancements-2
-  // facility, FSHR with a constant amount less than 8 can be implemented
-  // via SHL_DOUBLE_BIT, and FSHR with other constant amounts by a
-  // combination of the two.
   if (auto *ShiftAmtNode = dyn_cast<ConstantSDNode>(Op.getOperand(2))) {
     uint64_t ShiftAmt = ShiftAmtNode->getZExtValue() & 127;
     if ((ShiftAmt & 7) == 0 || Subtarget.hasVectorEnhancements2()) {
-      SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
-      SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
-      SmallVector<int, 16> Mask(16);
-      for (unsigned Elt = 0; Elt < 16; Elt++)
-        Mask[Elt] = 16 - (ShiftAmt >> 3) + Elt;
-      SDValue Shuf1 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op1, Mask);
-      if ((ShiftAmt & 7) == 0)
-        return DAG.getBitcast(MVT::i128, Shuf1);
-      SDValue Shuf2 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op0, Mask);
-      SDValue Val =
-          DAG.getNode(SystemZISD::SHR_DOUBLE_BIT, DL, MVT::v16i8, Shuf2, Shuf1,
-                      DAG.getTargetConstant(ShiftAmt & 7, DL, MVT::i32));
-      return DAG.getBitcast(MVT::i128, Val);
+      return lowerFSHRHelp(Op, DAG, ShiftAmt);
     }
   }
 
diff --git a/llvm/test/CodeGen/SystemZ/shift-17.ll b/llvm/test/CodeGen/SystemZ/shift-17.ll
index 45f4ed4d70d20..8f5f9abd0540b 100644
--- a/llvm/test/CodeGen/SystemZ/shift-17.ll
+++ b/llvm/test/CodeGen/SystemZ/shift-17.ll
@@ -249,3 +249,54 @@ define i128 @f8(i128 %a, i128 %b, i128 %sh) {
   ret i128 %res
 }
 
+; Funnel shift left by constant N in 121..128, in such cases fshl N == fshr (128 - N)
+define i128 @f9(i128 %a, i128 %b) {
+; CHECK-LABEL: f9:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl %v1, 0(%r4), 3
+; CHECK-NEXT:    vl %v0, 0(%r3), 3
+; CHECK-NEXT:    vrepib %v2, 5
+; CHECK-NEXT:    vsrl %v1, %v1, %v2
+; CHECK-NEXT:    vrepib %v2, 123
+; CHECK-NEXT:    vslb %v0, %v0, %v2
+; CHECK-NEXT:    vsl %v0, %v0, %v2
+; CHECK-NEXT:    vo %v0, %v0, %v1
+; CHECK-NEXT:    vst %v0, 0(%r2), 3
+; CHECK-NEXT:    br %r14
+;
+; Z15-LABEL: f9:
+; Z15:       # %bb.0:
+; Z15-NEXT:    vl %v0, 0(%r4), 3
+; Z15-NEXT:    vl %v1, 0(%r3), 3
+; Z15-NEXT:    vsrd %v0, %v1, %v0, 5
+; Z15-NEXT:    vst %v0, 0(%r2), 3
+; Z15-NEXT:    br %r14
+  %res = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 123)
+  ret i128 %res
+}
+
+; Funnel shift right by constant N in 121..128, in such cases fshr N == fshl (128 - N)
+define i128 @f10(i128 %a, i128 %b) {
+; CHECK-LABEL: f10:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl %v1, 0(%r3), 3
+; CHECK-NEXT:    vl %v0, 0(%r4), 3
+; CHECK-NEXT:    vrepib %v2, 5
+; CHECK-NEXT:    vsl %v1, %v1, %v2
+; CHECK-NEXT:    vrepib %v2, 123
+; CHECK-NEXT:    vsrlb %v0, %v0, %v2
+; CHECK-NEXT:    vsrl %v0, %v0, %v2
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vst %v0, 0(%r2), 3
+; CHECK-NEXT:    br %r14
+;
+; Z15-LABEL: f10:
+; Z15:       # %bb.0:
+; Z15-NEXT:    vl %v0, 0(%r4), 3
+; Z15-NEXT:    vl %v1, 0(%r3), 3
+; Z15-NEXT:    vsld %v0, %v1, %v0, 5
+; Z15-NEXT:    vst %v0, 0(%r2), 3
+; Z15-NEXT:    br %r14
+  %res = tail call i128 @llvm.fshr.i128(i128 %a, i128 %b, i128 123)
+  ret i128 %res
+}

Turn a funnel shift by N in the range 121..128 into a funnel shift in
the opposite direction by `128 - N`. Because there are dedicated
instructions for funnel shifts by values smaller than 8, this emits
fewer instructions.

This additional rule is useful because LLVM appears to canonicalize
`fshr` into `fshl`, meaning that the rules for `fshr` on values less
than 8 would not match on organic input.
@folkertdev folkertdev force-pushed the s390x-funnel-shift-high-value branch from 87ec324 to b3dc118 Compare August 22, 2025 10:46
Copy link
Member

@uweigand uweigand left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM, thanks for taking care of this!

@folkertdev
Copy link
Contributor Author

Can you merge this one as well?

@uweigand uweigand merged commit 8a9e333 into llvm:main Aug 27, 2025
9 checks passed
@uweigand
Copy link
Member

Done, thanks for the reminder!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants