Commit 254b4f2
[ARM,MVE] Add intrinsics for scalar shifts.
This fills in the small family of MVE intrinsics that have nothing to
do with vectors: they implement bit-shift operations on 32- or 64-bit
values held in one or two general-purpose registers. Most of these
shift operations saturate if shifting left, and round to nearest if
shifting right, although LSLL and ASRL behave like ordinary shifts.

When these instructions take a variable shift count in a register,
they pay attention to its sign, so that (for example) LSLL or UQRSHLL
will shift left if given a positive number but right if given a
negative one. That makes even LSLL and ASRL different enough from
standard LLVM IR shift semantics that I couldn't see any better
alternative than to simply model the whole family as a set of
MVE-specific IR intrinsics.

(The immediate forms of LSLL and ASRL, on the other hand, do behave
exactly like a standard IR shift of a 64-bit value. In fact, those
forms don't have ACLE intrinsics defined at all, because you can just
write an ordinary C shift operation if you want one of those.)

The 64-bit shifts have to be instruction-selected in C++, because
they deliver two output values. But the 32-bit ones are simple enough
that I could write a DAG isel pattern directly into each Instruction
record.

Reviewers: ostannard, MarkMurrayARM, dmgreen

Reviewed By: dmgreen

Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D70319
1 parent 33d93c3 commit 254b4f2
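
For orientation, here is a small usage sketch (mine, not part of the patch; it relies only on the ACLE intrinsic names exercised by the new test file below) of the sign-sensitive register-count behaviour the message describes:

#include <arm_mve.h>

// LSLL with a count in a register: a positive count shifts left, a
// negative count shifts right by the magnitude of the count.
uint64_t scale(uint64_t value, int32_t count)
{
    return lsll(value, count);   // e.g. count == -3 acts as a right shift by 3
}

// UQRSHLL follows the same sign convention, but saturates on left
// shifts and rounds to nearest on right shifts.
uint64_t scale_saturating(uint64_t value, int32_t count)
{
    return uqrshll(value, count);
}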

File tree: 7 files changed (+606, -23 lines)

clang/include/clang/Basic/arm_mve.td

Lines changed: 50 additions & 7 deletions
@@ -388,13 +388,56 @@ defm vstrhq: scatter_offset_both<!listconcat(T.All16, T.Int32), u16, 1>;
 defm vstrwq: scatter_offset_both<T.All32, u32, 2>;
 defm vstrdq: scatter_offset_both<T.Int64, u64, 3>;
 
-let params = [Void], pnt = PNT_None in
-def urshrl: Intrinsic<u64, (args u64:$value, imm_1to32:$shift),
-    (seq (u32 (lshr $value, (u64 32))):$hi,
-         (u32 $value):$lo,
-         (IRInt<"urshrl"> $lo, $hi, $shift):$pair,
-         (or (shl (u64 (xval $pair, 1)), (u64 32)),
-             (u64 (xval $pair, 0))))>;
+// Base class for the scalar shift intrinsics.
+class ScalarShift<Type argtype, dag shiftCountArg, dag shiftCodeGen>:
+  Intrinsic<argtype, !con((args argtype:$value), shiftCountArg), shiftCodeGen> {
+  let params = [Void];
+  let pnt = PNT_None;
+}
+
+// Subclass that includes the machinery to take a 64-bit input apart
+// into halves, retrieve the two halves of a shifted output as a pair,
+// and glue the pieces of the pair back into an i64 for output.
+class LongScalarShift<Type argtype, dag shiftCountArg, dag shiftCodeGen>:
+  ScalarShift<argtype, shiftCountArg,
+              (seq (u32 (lshr $value, (argtype 32))):$hi,
+                   (u32 $value):$lo,
+                   shiftCodeGen:$pair,
+                   (or (shl (u64 (xval $pair, 1)), (u64 32)),
+                       (u64 (xval $pair, 0))))>;
+
+// The family of saturating/rounding scalar shifts that take an
+// immediate shift count. They come in matched 32- and 64-bit pairs.
+multiclass ScalarSaturatingShiftImm<Type arg32, Type arg64> {
+  def "": ScalarShift<arg32, (args imm_1to32:$sh),
+                      (IRInt<NAME> $value, $sh)>;
+  def l: LongScalarShift<arg64, (args imm_1to32:$sh),
+                         (IRInt<NAME # "l"> $lo, $hi, $sh)>;
+}
+defm uqshl: ScalarSaturatingShiftImm<u32, u64>;
+defm urshr: ScalarSaturatingShiftImm<u32, u64>;
+defm sqshl: ScalarSaturatingShiftImm<s32, s64>;
+defm srshr: ScalarSaturatingShiftImm<s32, s64>;
+
+// The family of saturating/rounding scalar shifts that take a
+// register shift count. They also have 32- and 64-bit forms, but the
+// 64-bit form also has a version that saturates to 48 bits, so the IR
+// intrinsic takes an extra saturation-type operand.
+multiclass ScalarSaturatingShiftReg<Type arg32, Type arg64> {
+  def "": ScalarShift<arg32, (args s32:$sh),
+                      (IRInt<NAME> $value, $sh)>;
+  def l: LongScalarShift<arg64, (args s32:$sh),
+                         (IRInt<NAME # "l"> $lo, $hi, $sh, 64)>;
+  def l_sat48: LongScalarShift<arg64, (args s32:$sh),
+                               (IRInt<NAME # "l"> $lo, $hi, $sh, 48)>;
+}
+defm uqrshl: ScalarSaturatingShiftReg<u32, u64>;
+defm sqrshr: ScalarSaturatingShiftReg<s32, s64>;
+
+// The intrinsics for LSLL and ASRL come in 64-bit versions only, with
+// no saturation count.
+def lsll: LongScalarShift<u64, (args s32:$sh), (IRInt<"lsll"> $lo, $hi, $sh)>;
+def asrl: LongScalarShift<s64, (args s32:$sh), (IRInt<"asrl"> $lo, $hi, $sh)>;
 
 let params = T.Int32 in {
 def vadcq: Intrinsic<Vector, (args Vector:$a, Vector:$b, Ptr<uint>:$carry),
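
To make the LongScalarShift machinery above easier to follow, here is a rough C rendering (my own sketch; do_shift_pair stands in for the generated @llvm.arm.mve.* intrinsic call) of the split/shift/reassemble sequence that the seq dag expands to:

#include <stdint.h>

// Stand-in for the two-result IR intrinsic: it returns the low and
// high halves of the shifted 64-bit value as a pair of i32s.
typedef struct { uint32_t lo, hi; } pair32;

static uint64_t long_scalar_shift(uint64_t value, int32_t shift,
                                  pair32 (*do_shift_pair)(uint32_t, uint32_t, int32_t))
{
    uint32_t hi = (uint32_t)(value >> 32);       // (u32 (lshr $value, 32)):$hi
    uint32_t lo = (uint32_t)value;               // (u32 $value):$lo
    pair32 pair = do_shift_pair(lo, hi, shift);  // the IRInt call, bound to $pair
    return ((uint64_t)pair.hi << 32) | pair.lo;  // (or (shl (xval $pair, 1), 32),
                                                 //     (xval $pair, 0))
}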

clang/include/clang/Basic/arm_mve_defs.td

Lines changed: 1 addition & 1 deletion
@@ -312,7 +312,7 @@ def imm_lane : Immediate<sint, IB_LaneIndex>;
 
 // imm_1to32 can be in the range 1 to 32, unconditionally. (e.g. scalar shift
 // intrinsics)
-def imm_1to32 : Immediate<u32, IB_ConstRange<1, 32>>;
+def imm_1to32 : Immediate<sint, IB_ConstRange<1, 32>>;
 
 // imm_1248 can be 1, 2, 4 or 8. (e.g. vidupq)
 def imm_1248 : Immediate<u32, IB_ConstRange<1, 8>> {
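
As a hedged illustration of what the IB_ConstRange<1, 32> bound means at the C level (the function names here are mine, not from the patch): the immediate-form intrinsics only accept a compile-time constant from 1 to 32, and a variable count has to go through one of the register forms.

#include <arm_mve.h>

uint32_t imm_form(uint32_t v)            { return urshr(v, 22); }  // constant in [1,32]: accepted
uint32_t reg_form(uint32_t v, int32_t n) { return uqrshl(v, n); }  // variable count: register form
// uint32_t bad(uint32_t v, int n)       { return urshr(v, n); }   // rejected: count not a constant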

clang/test/CodeGen/arm-mve-intrinsics/scalar-shifts.c

Lines changed: 231 additions & 0 deletions
@@ -3,6 +3,237 @@
 
 #include <arm_mve.h>
 
+// CHECK-LABEL: @test_asrl(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.asrl(i32 [[TMP2]], i32 [[TMP1]], i32 [[SHIFT:%.*]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+int64_t test_asrl(int64_t value, int32_t shift)
+{
+    return asrl(value, shift);
+}
+
+// CHECK-LABEL: @test_lsll(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.lsll(i32 [[TMP2]], i32 [[TMP1]], i32 [[SHIFT:%.*]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+uint64_t test_lsll(uint64_t value, int32_t shift)
+{
+    return lsll(value, shift);
+}
+
+// CHECK-LABEL: @test_sqrshr(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.sqrshr(i32 [[VALUE:%.*]], i32 [[SHIFT:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_sqrshr(int32_t value, int32_t shift)
+{
+    return sqrshr(value, shift);
+}
+
+// CHECK-LABEL: @test_sqrshrl(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 [[TMP2]], i32 [[TMP1]], i32 [[SHIFT:%.*]], i32 64)
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+int64_t test_sqrshrl(int64_t value, int32_t shift)
+{
+    return sqrshrl(value, shift);
+}
+
+// CHECK-LABEL: @test_sqrshrl_sat48(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 [[TMP2]], i32 [[TMP1]], i32 [[SHIFT:%.*]], i32 48)
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+int64_t test_sqrshrl_sat48(int64_t value, int32_t shift)
+{
+    return sqrshrl_sat48(value, shift);
+}
+
+// CHECK-LABEL: @test_sqshl(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.sqshl(i32 [[VALUE:%.*]], i32 2)
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_sqshl(int32_t value)
+{
+    return sqshl(value, 2);
+}
+
+// CHECK-LABEL: @test_sqshll(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.sqshll(i32 [[TMP2]], i32 [[TMP1]], i32 17)
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+int64_t test_sqshll(int64_t value)
+{
+    return sqshll(value, 17);
+}
+
+// CHECK-LABEL: @test_srshr(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.srshr(i32 [[VALUE:%.*]], i32 6)
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_srshr(int32_t value)
+{
+    return srshr(value, 6);
+}
+
+// CHECK-LABEL: @test_srshrl(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.srshrl(i32 [[TMP2]], i32 [[TMP1]], i32 26)
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+int64_t test_srshrl(int64_t value)
+{
+    return srshrl(value, 26);
+}
+
+// CHECK-LABEL: @test_uqrshl(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.uqrshl(i32 [[VALUE:%.*]], i32 [[SHIFT:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_uqrshl(uint32_t value, int32_t shift)
+{
+    return uqrshl(value, shift);
+}
+
+// CHECK-LABEL: @test_uqrshll(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.uqrshll(i32 [[TMP2]], i32 [[TMP1]], i32 [[SHIFT:%.*]], i32 64)
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+uint64_t test_uqrshll(uint64_t value, int32_t shift)
+{
+    return uqrshll(value, shift);
+}
+
+// CHECK-LABEL: @test_uqrshll_sat48(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.uqrshll(i32 [[TMP2]], i32 [[TMP1]], i32 [[SHIFT:%.*]], i32 48)
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+uint64_t test_uqrshll_sat48(uint64_t value, int32_t shift)
+{
+    return uqrshll_sat48(value, shift);
+}
+
+// CHECK-LABEL: @test_uqshl(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.uqshl(i32 [[VALUE:%.*]], i32 21)
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_uqshl(uint32_t value)
+{
+    return uqshl(value, 21);
+}
+
+// CHECK-LABEL: @test_uqshll(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.mve.uqshll(i32 [[TMP2]], i32 [[TMP1]], i32 16)
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+uint64_t test_uqshll(uint64_t value)
+{
+    return uqshll(value, 16);
+}
+
+// CHECK-LABEL: @test_urshr(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.urshr(i32 [[VALUE:%.*]], i32 22)
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_urshr(uint32_t value)
+{
+    return urshr(value, 22);
+}
+
 // CHECK-LABEL: @test_urshrl(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32

llvm/include/llvm/IR/IntrinsicsARM.td

Lines changed: 19 additions & 3 deletions
@@ -850,9 +850,25 @@ defm int_arm_mve_vstr_scatter_offset: MVEPredicated<
   [], [llvm_anyptr_ty, llvm_anyvector_ty, llvm_anyvector_ty,
    llvm_i32_ty, llvm_i32_ty], llvm_anyvector_ty, [IntrWriteMem]>;
 
-def int_arm_mve_urshrl: Intrinsic<
-  [llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-  [IntrNoMem]>;
+// MVE scalar shifts.
+class ARM_MVE_qrshift_single<list<LLVMType> value,
+                             list<LLVMType> saturate = []> :
+  Intrinsic<value, value # [llvm_i32_ty] # saturate, [IntrNoMem]>;
+multiclass ARM_MVE_qrshift<list<LLVMType> saturate = []> {
+  // Most of these shifts come in 32- and 64-bit versions. But only
+  // the 64-bit ones have the extra saturation argument (if any).
+  def "": ARM_MVE_qrshift_single<[llvm_i32_ty]>;
+  def l: ARM_MVE_qrshift_single<[llvm_i32_ty, llvm_i32_ty], saturate>;
+}
+defm int_arm_mve_urshr: ARM_MVE_qrshift;
+defm int_arm_mve_uqshl: ARM_MVE_qrshift;
+defm int_arm_mve_srshr: ARM_MVE_qrshift;
+defm int_arm_mve_sqshl: ARM_MVE_qrshift;
+defm int_arm_mve_uqrshl: ARM_MVE_qrshift<[llvm_i32_ty]>;
+defm int_arm_mve_sqrshr: ARM_MVE_qrshift<[llvm_i32_ty]>;
+// LSLL and ASRL only have 64-bit versions, not 32.
+def int_arm_mve_lsll: ARM_MVE_qrshift_single<[llvm_i32_ty, llvm_i32_ty]>;
+def int_arm_mve_asrl: ARM_MVE_qrshift_single<[llvm_i32_ty, llvm_i32_ty]>;
 
 def int_arm_mve_vadc: Intrinsic<
   [llvm_anyvector_ty, llvm_i32_ty],
