-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[GISel] Combine shift + trunc + shift pattern #155583
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-aarch64 Author: None (jyli0116) Changes: Folds the shift(trunc(shift(...))) pattern into trunc(shift(...)) by combining the two shift instructions. Patch is 68.83 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/155583.diff 13 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 6dba689e8af71..40f612cc98bcc 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -81,6 +81,13 @@ struct ShiftOfShiftedLogic {
uint64_t ValSum;
};
+struct ShiftOfTruncOfShift {
+ Register Src;
+ uint64_t ShiftAmt;
+ LLT ShiftAmtTy;
+ LLT InnerShiftTy;
+};
+
using BuildFnTy = std::function<void(MachineIRBuilder &)>;
using OperandBuildSteps =
@@ -338,6 +345,12 @@ class CombinerHelper {
bool matchCommuteShift(MachineInstr &MI, BuildFnTy &MatchInfo) const;
+ /// Fold (shift (trunc (shift x, C1)), C2) -> trunc (shift x, (C1 + C2))
+ bool matchShiftOfTruncOfShift(MachineInstr &MI,
+ ShiftOfTruncOfShift &MatchInfo) const;
+ void applyShiftOfTruncOfShift(MachineInstr &MI,
+ ShiftOfTruncOfShift &MatchInfo) const;
+
/// Transform a multiply by a power-of-2 value to a left shift.
bool matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal) const;
void applyCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 9564b581c5ebb..46e41a5cc4c79 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -396,6 +396,14 @@ def commute_shift : GICombineRule<
[{ return Helper.matchCommuteShift(*${d}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFn(*${d}, ${matchinfo}); }])>;
+// Fold (shift (trunc (shift x, C1)), C2) -> trunc (shift x, (C1 + C2))
+def shift_of_trunc_of_shift_matchdata : GIDefMatchData<"ShiftOfTruncOfShift">;
+def shift_of_trunc_of_shift : GICombineRule<
+ (defs root:$root, shift_of_trunc_of_shift_matchdata:$matchinfo),
+ (match (wip_match_opcode G_LSHR, G_ASHR):$root,
+ [{ return Helper.matchShiftOfTruncOfShift(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyShiftOfTruncOfShift(*${root}, ${matchinfo}); }])>;
+
def narrow_binop_feeding_and : GICombineRule<
(defs root:$root, build_fn_matchinfo:$matchinfo),
(match (wip_match_opcode G_AND):$root,
@@ -2103,7 +2111,8 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
simplify_neg_minmax, combine_concat_vector,
sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines,
- combine_use_vector_truncate, merge_combines, overflow_combines, truncsat_combines]>;
+ combine_use_vector_truncate, merge_combines, overflow_combines,
+ truncsat_combines, shift_of_trunc_of_shift]>;
// A combine group used to for prelegalizer combiners at -O0. The combines in
// this group have been selected based on experiments to balance code size and
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 0674f5fd1ae06..d3f0731955353 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -2094,6 +2094,60 @@ bool CombinerHelper::matchCommuteShift(MachineInstr &MI,
return true;
}
+bool CombinerHelper::matchShiftOfTruncOfShift(
+ MachineInstr &MI, ShiftOfTruncOfShift &MatchInfo) const {
+ unsigned ShiftOpcode = MI.getOpcode();
+ assert(ShiftOpcode == TargetOpcode::G_LSHR ||
+ ShiftOpcode == TargetOpcode::G_ASHR);
+
+ Register N0 = MI.getOperand(1).getReg();
+ Register N1 = MI.getOperand(2).getReg();
+ unsigned OpSizeInBits = MRI.getType(N0).getScalarSizeInBits();
+
+ APInt N1C;
+ Register InnerShift;
+ if (!mi_match(N1, MRI, m_ICstOrSplat(N1C)) ||
+ !mi_match(N0, MRI, m_GTrunc(m_Reg(InnerShift))))
+ return false;
+
+ auto *InnerMI = MRI.getVRegDef(InnerShift);
+ if (InnerMI->getOpcode() != ShiftOpcode)
+ return false;
+
+ APInt N001C;
+ auto N001 = InnerMI->getOperand(2).getReg();
+ if (!mi_match(N001, MRI, m_ICstOrSplat(N001C)))
+ return false;
+
+ uint64_t c1 = N001C.getZExtValue();
+ uint64_t c2 = N1C.getZExtValue();
+ LLT InnerShiftTy = MRI.getType(InnerShift);
+ uint64_t InnerShiftSize = InnerShiftTy.getScalarSizeInBits();
+ if (!(c1 + OpSizeInBits == InnerShiftSize) || !(c1 + c2 < InnerShiftSize))
+ return false;
+
+ MatchInfo.Src = InnerMI->getOperand(1).getReg();
+ MatchInfo.ShiftAmt = c1 + c2;
+ MatchInfo.ShiftAmtTy = MRI.getType(N001);
+ MatchInfo.InnerShiftTy = InnerShiftTy;
+ return true;
+}
+
+void CombinerHelper::applyShiftOfTruncOfShift(
+ MachineInstr &MI, ShiftOfTruncOfShift &MatchInfo) const {
+ unsigned ShiftOpcode = MI.getOpcode();
+ assert(ShiftOpcode == TargetOpcode::G_LSHR ||
+ ShiftOpcode == TargetOpcode::G_ASHR);
+
+ Register Dst = MI.getOperand(0).getReg();
+ auto ShiftAmt =
+ Builder.buildConstant(MatchInfo.ShiftAmtTy, MatchInfo.ShiftAmt);
+ auto Shift = Builder.buildInstr(ShiftOpcode, {MatchInfo.InnerShiftTy},
+ {MatchInfo.Src, ShiftAmt});
+ Builder.buildTrunc(Dst, Shift);
+ MI.eraseFromParent();
+}
+
bool CombinerHelper::matchCombineMulToShl(MachineInstr &MI,
unsigned &ShiftVal) const {
assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL");
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 5f499e5e9700a..e44819ad5a4ae 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -349,6 +349,8 @@ def AArch64PostLegalizerLowering
}
// Post-legalization combines which are primarily optimizations.
+
+
def AArch64PostLegalizerCombiner
: GICombiner<"AArch64PostLegalizerCombinerImpl",
[copy_prop, cast_of_cast_combines,
@@ -369,5 +371,5 @@ def AArch64PostLegalizerCombiner
commute_constant_to_rhs, extract_vec_elt_combines,
push_freeze_to_prevent_poison_from_propagating,
combine_mul_cmlt, combine_use_vector_truncate,
- extmultomull, truncsat_combines]> {
+ extmultomull, truncsat_combines, shift_of_trunc_of_shift]> {
}
diff --git a/llvm/test/CodeGen/AArch64/combine-sdiv.ll b/llvm/test/CodeGen/AArch64/combine-sdiv.ll
index 9d0ade2480428..014eaee5ebb2f 100644
--- a/llvm/test/CodeGen/AArch64/combine-sdiv.ll
+++ b/llvm/test/CodeGen/AArch64/combine-sdiv.ll
@@ -1684,24 +1684,14 @@ define i32 @combine_i32_sdiv_const7(i32 %x) {
}
define i32 @combine_i32_sdiv_const100(i32 %x) {
-; CHECK-SD-LABEL: combine_i32_sdiv_const100:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: mov w8, #34079 // =0x851f
-; CHECK-SD-NEXT: movk w8, #20971, lsl #16
-; CHECK-SD-NEXT: smull x8, w0, w8
-; CHECK-SD-NEXT: asr x8, x8, #37
-; CHECK-SD-NEXT: add w0, w8, w8, lsr #31
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: combine_i32_sdiv_const100:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov w8, #34079 // =0x851f
-; CHECK-GI-NEXT: movk w8, #20971, lsl #16
-; CHECK-GI-NEXT: smull x8, w0, w8
-; CHECK-GI-NEXT: asr x8, x8, #32
-; CHECK-GI-NEXT: asr w8, w8, #5
-; CHECK-GI-NEXT: add w0, w8, w8, lsr #31
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: combine_i32_sdiv_const100:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #34079 // =0x851f
+; CHECK-NEXT: movk w8, #20971, lsl #16
+; CHECK-NEXT: smull x8, w0, w8
+; CHECK-NEXT: asr x8, x8, #37
+; CHECK-NEXT: add w0, w8, w8, lsr #31
+; CHECK-NEXT: ret
%1 = sdiv i32 %x, 100
ret i32 %1
}
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
index c57383ad9b1e7..f36a87794be35 100644
--- a/llvm/test/CodeGen/AArch64/rem-by-const.ll
+++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll
@@ -276,28 +276,16 @@ entry:
}
define i32 @si32_100(i32 %a, i32 %b) {
-; CHECK-SD-LABEL: si32_100:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mov w8, #34079 // =0x851f
-; CHECK-SD-NEXT: mov w9, #100 // =0x64
-; CHECK-SD-NEXT: movk w8, #20971, lsl #16
-; CHECK-SD-NEXT: smull x8, w0, w8
-; CHECK-SD-NEXT: asr x8, x8, #37
-; CHECK-SD-NEXT: add w8, w8, w8, lsr #31
-; CHECK-SD-NEXT: msub w0, w8, w9, w0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: si32_100:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov w8, #34079 // =0x851f
-; CHECK-GI-NEXT: mov w9, #100 // =0x64
-; CHECK-GI-NEXT: movk w8, #20971, lsl #16
-; CHECK-GI-NEXT: smull x8, w0, w8
-; CHECK-GI-NEXT: asr x8, x8, #32
-; CHECK-GI-NEXT: asr w8, w8, #5
-; CHECK-GI-NEXT: add w8, w8, w8, lsr #31
-; CHECK-GI-NEXT: msub w0, w8, w9, w0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: si32_100:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #34079 // =0x851f
+; CHECK-NEXT: mov w9, #100 // =0x64
+; CHECK-NEXT: movk w8, #20971, lsl #16
+; CHECK-NEXT: smull x8, w0, w8
+; CHECK-NEXT: asr x8, x8, #37
+; CHECK-NEXT: add w8, w8, w8, lsr #31
+; CHECK-NEXT: msub w0, w8, w9, w0
+; CHECK-NEXT: ret
entry:
%s = srem i32 %a, 100
ret i32 %s
@@ -336,26 +324,15 @@ entry:
}
define i32 @ui32_100(i32 %a, i32 %b) {
-; CHECK-SD-LABEL: ui32_100:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mov w8, #34079 // =0x851f
-; CHECK-SD-NEXT: mov w9, #100 // =0x64
-; CHECK-SD-NEXT: movk w8, #20971, lsl #16
-; CHECK-SD-NEXT: umull x8, w0, w8
-; CHECK-SD-NEXT: lsr x8, x8, #37
-; CHECK-SD-NEXT: msub w0, w8, w9, w0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ui32_100:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov w8, #34079 // =0x851f
-; CHECK-GI-NEXT: mov w9, #100 // =0x64
-; CHECK-GI-NEXT: movk w8, #20971, lsl #16
-; CHECK-GI-NEXT: umull x8, w0, w8
-; CHECK-GI-NEXT: lsr x8, x8, #32
-; CHECK-GI-NEXT: lsr w8, w8, #5
-; CHECK-GI-NEXT: msub w0, w8, w9, w0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ui32_100:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #34079 // =0x851f
+; CHECK-NEXT: mov w9, #100 // =0x64
+; CHECK-NEXT: movk w8, #20971, lsl #16
+; CHECK-NEXT: umull x8, w0, w8
+; CHECK-NEXT: lsr x8, x8, #37
+; CHECK-NEXT: msub w0, w8, w9, w0
+; CHECK-NEXT: ret
entry:
%s = urem i32 %a, 100
ret i32 %s
@@ -1118,13 +1095,12 @@ define <8 x i8> @sv8i8_100(<8 x i8> %d, <8 x i8> %e) {
; CHECK-GI-LABEL: sv8i8_100:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi v1.8b, #41
-; CHECK-GI-NEXT: movi v3.8b, #100
+; CHECK-GI-NEXT: movi v2.8b, #100
; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b
-; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8
-; CHECK-GI-NEXT: sshr v2.8b, v1.8b, #4
-; CHECK-GI-NEXT: ushr v2.8b, v2.8b, #7
-; CHECK-GI-NEXT: ssra v2.8b, v1.8b, #4
-; CHECK-GI-NEXT: mls v0.8b, v2.8b, v3.8b
+; CHECK-GI-NEXT: sshr v1.8h, v1.8h, #12
+; CHECK-GI-NEXT: xtn v1.8b, v1.8h
+; CHECK-GI-NEXT: usra v1.8b, v1.8b, #7
+; CHECK-GI-NEXT: mls v0.8b, v1.8b, v2.8b
; CHECK-GI-NEXT: ret
entry:
%s = srem <8 x i8> %d, <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100>
@@ -1619,15 +1595,25 @@ entry:
}
define <8 x i8> @uv8i8_100(<8 x i8> %d, <8 x i8> %e) {
-; CHECK-LABEL: uv8i8_100:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: movi v1.8b, #41
-; CHECK-NEXT: movi v2.8b, #100
-; CHECK-NEXT: umull v1.8h, v0.8b, v1.8b
-; CHECK-NEXT: shrn v1.8b, v1.8h, #8
-; CHECK-NEXT: ushr v1.8b, v1.8b, #4
-; CHECK-NEXT: mls v0.8b, v1.8b, v2.8b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uv8i8_100:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v1.8b, #41
+; CHECK-SD-NEXT: movi v2.8b, #100
+; CHECK-SD-NEXT: umull v1.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: shrn v1.8b, v1.8h, #8
+; CHECK-SD-NEXT: ushr v1.8b, v1.8b, #4
+; CHECK-SD-NEXT: mls v0.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uv8i8_100:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v1.8b, #41
+; CHECK-GI-NEXT: movi v2.8b, #100
+; CHECK-GI-NEXT: umull v1.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: ushr v1.8h, v1.8h, #12
+; CHECK-GI-NEXT: xtn v1.8b, v1.8h
+; CHECK-GI-NEXT: mls v0.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT: ret
entry:
%s = urem <8 x i8> %d, <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100>
ret <8 x i8> %s
@@ -1904,14 +1890,13 @@ define <4 x i16> @sv4i16_7(<4 x i16> %d, <4 x i16> %e) {
; CHECK-GI-LABEL: sv4i16_7:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: adrp x8, .LCPI44_0
-; CHECK-GI-NEXT: movi v3.4h, #7
+; CHECK-GI-NEXT: movi v2.4h, #7
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI44_0]
; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h
-; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16
-; CHECK-GI-NEXT: sshr v2.4h, v1.4h, #1
-; CHECK-GI-NEXT: ushr v2.4h, v2.4h, #15
-; CHECK-GI-NEXT: ssra v2.4h, v1.4h, #1
-; CHECK-GI-NEXT: mls v0.4h, v2.4h, v3.4h
+; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #17
+; CHECK-GI-NEXT: xtn v1.4h, v1.4s
+; CHECK-GI-NEXT: usra v1.4h, v1.4h, #15
+; CHECK-GI-NEXT: mls v0.4h, v1.4h, v2.4h
; CHECK-GI-NEXT: ret
entry:
%s = srem <4 x i16> %d, <i16 7, i16 7, i16 7, i16 7>
@@ -1934,14 +1919,13 @@ define <4 x i16> @sv4i16_100(<4 x i16> %d, <4 x i16> %e) {
; CHECK-GI-LABEL: sv4i16_100:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: adrp x8, .LCPI45_0
-; CHECK-GI-NEXT: movi v3.4h, #100
+; CHECK-GI-NEXT: movi v2.4h, #100
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI45_0]
; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h
-; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16
-; CHECK-GI-NEXT: sshr v2.4h, v1.4h, #3
-; CHECK-GI-NEXT: ushr v2.4h, v2.4h, #15
-; CHECK-GI-NEXT: ssra v2.4h, v1.4h, #3
-; CHECK-GI-NEXT: mls v0.4h, v2.4h, v3.4h
+; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #19
+; CHECK-GI-NEXT: xtn v1.4h, v1.4s
+; CHECK-GI-NEXT: usra v1.4h, v1.4h, #15
+; CHECK-GI-NEXT: mls v0.4h, v1.4h, v2.4h
; CHECK-GI-NEXT: ret
entry:
%s = srem <4 x i16> %d, <i16 100, i16 100, i16 100, i16 100>
@@ -2301,8 +2285,8 @@ define <4 x i16> @uv4i16_100(<4 x i16> %d, <4 x i16> %e) {
; CHECK-GI-NEXT: ldr d2, [x8, :lo12:.LCPI53_0]
; CHECK-GI-NEXT: umull v1.4s, v1.4h, v2.4h
; CHECK-GI-NEXT: movi v2.4h, #100
-; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16
-; CHECK-GI-NEXT: ushr v1.4h, v1.4h, #1
+; CHECK-GI-NEXT: ushr v1.4s, v1.4s, #17
+; CHECK-GI-NEXT: xtn v1.4h, v1.4s
; CHECK-GI-NEXT: mls v0.4h, v1.4h, v2.4h
; CHECK-GI-NEXT: ret
entry:
@@ -2424,14 +2408,13 @@ define <2 x i32> @sv2i32_100(<2 x i32> %d, <2 x i32> %e) {
; CHECK-GI-LABEL: sv2i32_100:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: adrp x8, .LCPI57_0
-; CHECK-GI-NEXT: movi v3.2s, #100
+; CHECK-GI-NEXT: movi v2.2s, #100
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI57_0]
; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32
-; CHECK-GI-NEXT: sshr v2.2s, v1.2s, #5
-; CHECK-GI-NEXT: ushr v2.2s, v2.2s, #31
-; CHECK-GI-NEXT: ssra v2.2s, v1.2s, #5
-; CHECK-GI-NEXT: mls v0.2s, v2.2s, v3.2s
+; CHECK-GI-NEXT: sshr v1.2d, v1.2d, #37
+; CHECK-GI-NEXT: xtn v1.2s, v1.2d
+; CHECK-GI-NEXT: usra v1.2s, v1.2s, #31
+; CHECK-GI-NEXT: mls v0.2s, v1.2s, v2.2s
; CHECK-GI-NEXT: ret
entry:
%s = srem <2 x i32> %d, <i32 100, i32 100>
@@ -2656,8 +2639,8 @@ define <2 x i32> @uv2i32_100(<2 x i32> %d, <2 x i32> %e) {
; CHECK-GI-NEXT: movi v2.2s, #100
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI63_0]
; CHECK-GI-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32
-; CHECK-GI-NEXT: ushr v1.2s, v1.2s, #5
+; CHECK-GI-NEXT: ushr v1.2d, v1.2d, #37
+; CHECK-GI-NEXT: xtn v1.2s, v1.2d
; CHECK-GI-NEXT: mls v0.2s, v1.2s, v2.2s
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/urem-lkk.ll b/llvm/test/CodeGen/AArch64/urem-lkk.ll
index 0dd6685555826..40016c7e4ce0f 100644
--- a/llvm/test/CodeGen/AArch64/urem-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/urem-lkk.ll
@@ -20,26 +20,15 @@ define i32 @fold_urem_positive_odd(i32 %x) {
}
define i32 @fold_urem_positive_even(i32 %x) {
-; CHECK-SD-LABEL: fold_urem_positive_even:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: mov w8, #16323 // =0x3fc3
-; CHECK-SD-NEXT: mov w9, #1060 // =0x424
-; CHECK-SD-NEXT: movk w8, #63310, lsl #16
-; CHECK-SD-NEXT: umull x8, w0, w8
-; CHECK-SD-NEXT: lsr x8, x8, #42
-; CHECK-SD-NEXT: msub w0, w8, w9, w0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: fold_urem_positive_even:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov w8, #16323 // =0x3fc3
-; CHECK-GI-NEXT: mov w9, #1060 // =0x424
-; CHECK-GI-NEXT: movk w8, #63310, lsl #16
-; CHECK-GI-NEXT: umull x8, w0, w8
-; CHECK-GI-NEXT: lsr x8, x8, #32
-; CHECK-GI-NEXT: lsr w8, w8, #10
-; CHECK-GI-NEXT: msub w0, w8, w9, w0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: fold_urem_positive_even:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #16323 // =0x3fc3
+; CHECK-NEXT: mov w9, #1060 // =0x424
+; CHECK-NEXT: movk w8, #63310, lsl #16
+; CHECK-NEXT: umull x8, w0, w8
+; CHECK-NEXT: lsr x8, x8, #42
+; CHECK-NEXT: msub w0, w8, w9, w0
+; CHECK-NEXT: ret
%1 = urem i32 %x, 1060
ret i32 %1
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index fc81e16d68e98..7869a81cb9705 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -3686,22 +3686,21 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
;
; GFX8-LABEL: s_fshl_v2i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s4, s1, 16
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_lshr_b32 s5, s2, 16
-; GFX8-NEXT: s_and_b32 s6, s2, 15
-; GFX8-NEXT: s_andn2_b32 s2, 15, s2
-; GFX8-NEXT: s_lshr_b32 s1, s1, 1
+; GFX8-NEXT: s_and_b32 s5, s2, 15
; GFX8-NEXT: s_lshr_b32 s3, s0, 16
-; GFX8-NEXT: s_lshl_b32 s0, s0, s6
-; GFX8-NEXT: s_lshr_b32 s1, s1, s2
-; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: s_and_b32 s1, s5, 15
-; GFX8-NEXT: s_andn2_b32 s2, 15, s5
-; GFX8-NEXT: s_lshl_b32 s1, s3, s1
-; GFX8-NEXT: s_lshr_b32 s3, s4, 1
-; GFX8-NEXT: s_lshr_b32 s2, s3, s2
-; GFX8-NEXT: s_or_b32 s1, s1, s2
+; GFX8-NEXT: s_lshl_b32 s0, s0, s5
+; GFX8-NEXT: s_and_b32 s5, 0xffff, s1
+; GFX8-NEXT: s_lshr_b32 s4, s2, 16
+; GFX8-NEXT: s_andn2_b32 s2, 15, s2
+; GFX8-NEXT: s_lshr_b32 s5, s5, 1
+; GFX8-NEXT: s_lshr_b32 s2, s5, s2
+; GFX8-NEXT: s_or_b32 s0, s0, s2
+; GFX8-NEXT: s_and_b32 s2, s4, 15
+; GFX8-NEXT: s_andn2_b32 s4, 15, s4
+; GFX8-NEXT: s_lshr_b32 s1, s1, 17
+; GFX8-NEXT: s_lshl_b32 s2, s3, s2
+; GFX8-NEXT: s_lshr_b32 s1, s1, s4
+; GFX8-NEXT: s_or_b32 s1, s2, s1
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
@@ -3813,13 +3812,12 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v5
; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
; GFX8-NEXT: v_mov_b32_e32 v4, 15
-; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v5, -1
+; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_mov_b32_e32 v4, 1
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 17, v1
+; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -3886,14 +3884,14 @@ define <2 x i16> @v_fshl_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_...
[truncated]
|
@llvm/pr-subscribers-backend-amdgpu Author: None (jyli0116) Changes: Folds the shift(trunc(shift(...))) pattern into trunc(shift(...)) by combining the two shift instructions. Patch is 68.83 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/155583.diff 13 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 6dba689e8af71..40f612cc98bcc 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -81,6 +81,13 @@ struct ShiftOfShiftedLogic {
uint64_t ValSum;
};
+struct ShiftOfTruncOfShift {
+ Register Src;
+ uint64_t ShiftAmt;
+ LLT ShiftAmtTy;
+ LLT InnerShiftTy;
+};
+
using BuildFnTy = std::function<void(MachineIRBuilder &)>;
using OperandBuildSteps =
@@ -338,6 +345,12 @@ class CombinerHelper {
bool matchCommuteShift(MachineInstr &MI, BuildFnTy &MatchInfo) const;
+ /// Fold (shift (trunc (shift x, C1)), C2) -> trunc (shift x, (C1 + C2))
+ bool matchShiftOfTruncOfShift(MachineInstr &MI,
+ ShiftOfTruncOfShift &MatchInfo) const;
+ void applyShiftOfTruncOfShift(MachineInstr &MI,
+ ShiftOfTruncOfShift &MatchInfo) const;
+
/// Transform a multiply by a power-of-2 value to a left shift.
bool matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal) const;
void applyCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 9564b581c5ebb..46e41a5cc4c79 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -396,6 +396,14 @@ def commute_shift : GICombineRule<
[{ return Helper.matchCommuteShift(*${d}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFn(*${d}, ${matchinfo}); }])>;
+// Fold (shift (trunc (shift x, C1)), C2) -> trunc (shift x, (C1 + C2))
+def shift_of_trunc_of_shift_matchdata : GIDefMatchData<"ShiftOfTruncOfShift">;
+def shift_of_trunc_of_shift : GICombineRule<
+ (defs root:$root, shift_of_trunc_of_shift_matchdata:$matchinfo),
+ (match (wip_match_opcode G_LSHR, G_ASHR):$root,
+ [{ return Helper.matchShiftOfTruncOfShift(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyShiftOfTruncOfShift(*${root}, ${matchinfo}); }])>;
+
def narrow_binop_feeding_and : GICombineRule<
(defs root:$root, build_fn_matchinfo:$matchinfo),
(match (wip_match_opcode G_AND):$root,
@@ -2103,7 +2111,8 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
simplify_neg_minmax, combine_concat_vector,
sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines,
- combine_use_vector_truncate, merge_combines, overflow_combines, truncsat_combines]>;
+ combine_use_vector_truncate, merge_combines, overflow_combines,
+ truncsat_combines, shift_of_trunc_of_shift]>;
// A combine group used to for prelegalizer combiners at -O0. The combines in
// this group have been selected based on experiments to balance code size and
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 0674f5fd1ae06..d3f0731955353 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -2094,6 +2094,60 @@ bool CombinerHelper::matchCommuteShift(MachineInstr &MI,
return true;
}
+bool CombinerHelper::matchShiftOfTruncOfShift(
+ MachineInstr &MI, ShiftOfTruncOfShift &MatchInfo) const {
+ unsigned ShiftOpcode = MI.getOpcode();
+ assert(ShiftOpcode == TargetOpcode::G_LSHR ||
+ ShiftOpcode == TargetOpcode::G_ASHR);
+
+ Register N0 = MI.getOperand(1).getReg();
+ Register N1 = MI.getOperand(2).getReg();
+ unsigned OpSizeInBits = MRI.getType(N0).getScalarSizeInBits();
+
+ APInt N1C;
+ Register InnerShift;
+ if (!mi_match(N1, MRI, m_ICstOrSplat(N1C)) ||
+ !mi_match(N0, MRI, m_GTrunc(m_Reg(InnerShift))))
+ return false;
+
+ auto *InnerMI = MRI.getVRegDef(InnerShift);
+ if (InnerMI->getOpcode() != ShiftOpcode)
+ return false;
+
+ APInt N001C;
+ auto N001 = InnerMI->getOperand(2).getReg();
+ if (!mi_match(N001, MRI, m_ICstOrSplat(N001C)))
+ return false;
+
+ uint64_t c1 = N001C.getZExtValue();
+ uint64_t c2 = N1C.getZExtValue();
+ LLT InnerShiftTy = MRI.getType(InnerShift);
+ uint64_t InnerShiftSize = InnerShiftTy.getScalarSizeInBits();
+ if (!(c1 + OpSizeInBits == InnerShiftSize) || !(c1 + c2 < InnerShiftSize))
+ return false;
+
+ MatchInfo.Src = InnerMI->getOperand(1).getReg();
+ MatchInfo.ShiftAmt = c1 + c2;
+ MatchInfo.ShiftAmtTy = MRI.getType(N001);
+ MatchInfo.InnerShiftTy = InnerShiftTy;
+ return true;
+}
+
+void CombinerHelper::applyShiftOfTruncOfShift(
+ MachineInstr &MI, ShiftOfTruncOfShift &MatchInfo) const {
+ unsigned ShiftOpcode = MI.getOpcode();
+ assert(ShiftOpcode == TargetOpcode::G_LSHR ||
+ ShiftOpcode == TargetOpcode::G_ASHR);
+
+ Register Dst = MI.getOperand(0).getReg();
+ auto ShiftAmt =
+ Builder.buildConstant(MatchInfo.ShiftAmtTy, MatchInfo.ShiftAmt);
+ auto Shift = Builder.buildInstr(ShiftOpcode, {MatchInfo.InnerShiftTy},
+ {MatchInfo.Src, ShiftAmt});
+ Builder.buildTrunc(Dst, Shift);
+ MI.eraseFromParent();
+}
+
bool CombinerHelper::matchCombineMulToShl(MachineInstr &MI,
unsigned &ShiftVal) const {
assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL");
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 5f499e5e9700a..e44819ad5a4ae 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -349,6 +349,8 @@ def AArch64PostLegalizerLowering
}
// Post-legalization combines which are primarily optimizations.
+
+
def AArch64PostLegalizerCombiner
: GICombiner<"AArch64PostLegalizerCombinerImpl",
[copy_prop, cast_of_cast_combines,
@@ -369,5 +371,5 @@ def AArch64PostLegalizerCombiner
commute_constant_to_rhs, extract_vec_elt_combines,
push_freeze_to_prevent_poison_from_propagating,
combine_mul_cmlt, combine_use_vector_truncate,
- extmultomull, truncsat_combines]> {
+ extmultomull, truncsat_combines, shift_of_trunc_of_shift]> {
}
diff --git a/llvm/test/CodeGen/AArch64/combine-sdiv.ll b/llvm/test/CodeGen/AArch64/combine-sdiv.ll
index 9d0ade2480428..014eaee5ebb2f 100644
--- a/llvm/test/CodeGen/AArch64/combine-sdiv.ll
+++ b/llvm/test/CodeGen/AArch64/combine-sdiv.ll
@@ -1684,24 +1684,14 @@ define i32 @combine_i32_sdiv_const7(i32 %x) {
}
define i32 @combine_i32_sdiv_const100(i32 %x) {
-; CHECK-SD-LABEL: combine_i32_sdiv_const100:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: mov w8, #34079 // =0x851f
-; CHECK-SD-NEXT: movk w8, #20971, lsl #16
-; CHECK-SD-NEXT: smull x8, w0, w8
-; CHECK-SD-NEXT: asr x8, x8, #37
-; CHECK-SD-NEXT: add w0, w8, w8, lsr #31
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: combine_i32_sdiv_const100:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov w8, #34079 // =0x851f
-; CHECK-GI-NEXT: movk w8, #20971, lsl #16
-; CHECK-GI-NEXT: smull x8, w0, w8
-; CHECK-GI-NEXT: asr x8, x8, #32
-; CHECK-GI-NEXT: asr w8, w8, #5
-; CHECK-GI-NEXT: add w0, w8, w8, lsr #31
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: combine_i32_sdiv_const100:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #34079 // =0x851f
+; CHECK-NEXT: movk w8, #20971, lsl #16
+; CHECK-NEXT: smull x8, w0, w8
+; CHECK-NEXT: asr x8, x8, #37
+; CHECK-NEXT: add w0, w8, w8, lsr #31
+; CHECK-NEXT: ret
%1 = sdiv i32 %x, 100
ret i32 %1
}
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
index c57383ad9b1e7..f36a87794be35 100644
--- a/llvm/test/CodeGen/AArch64/rem-by-const.ll
+++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll
@@ -276,28 +276,16 @@ entry:
}
define i32 @si32_100(i32 %a, i32 %b) {
-; CHECK-SD-LABEL: si32_100:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mov w8, #34079 // =0x851f
-; CHECK-SD-NEXT: mov w9, #100 // =0x64
-; CHECK-SD-NEXT: movk w8, #20971, lsl #16
-; CHECK-SD-NEXT: smull x8, w0, w8
-; CHECK-SD-NEXT: asr x8, x8, #37
-; CHECK-SD-NEXT: add w8, w8, w8, lsr #31
-; CHECK-SD-NEXT: msub w0, w8, w9, w0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: si32_100:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov w8, #34079 // =0x851f
-; CHECK-GI-NEXT: mov w9, #100 // =0x64
-; CHECK-GI-NEXT: movk w8, #20971, lsl #16
-; CHECK-GI-NEXT: smull x8, w0, w8
-; CHECK-GI-NEXT: asr x8, x8, #32
-; CHECK-GI-NEXT: asr w8, w8, #5
-; CHECK-GI-NEXT: add w8, w8, w8, lsr #31
-; CHECK-GI-NEXT: msub w0, w8, w9, w0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: si32_100:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #34079 // =0x851f
+; CHECK-NEXT: mov w9, #100 // =0x64
+; CHECK-NEXT: movk w8, #20971, lsl #16
+; CHECK-NEXT: smull x8, w0, w8
+; CHECK-NEXT: asr x8, x8, #37
+; CHECK-NEXT: add w8, w8, w8, lsr #31
+; CHECK-NEXT: msub w0, w8, w9, w0
+; CHECK-NEXT: ret
entry:
%s = srem i32 %a, 100
ret i32 %s
@@ -336,26 +324,15 @@ entry:
}
define i32 @ui32_100(i32 %a, i32 %b) {
-; CHECK-SD-LABEL: ui32_100:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mov w8, #34079 // =0x851f
-; CHECK-SD-NEXT: mov w9, #100 // =0x64
-; CHECK-SD-NEXT: movk w8, #20971, lsl #16
-; CHECK-SD-NEXT: umull x8, w0, w8
-; CHECK-SD-NEXT: lsr x8, x8, #37
-; CHECK-SD-NEXT: msub w0, w8, w9, w0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ui32_100:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov w8, #34079 // =0x851f
-; CHECK-GI-NEXT: mov w9, #100 // =0x64
-; CHECK-GI-NEXT: movk w8, #20971, lsl #16
-; CHECK-GI-NEXT: umull x8, w0, w8
-; CHECK-GI-NEXT: lsr x8, x8, #32
-; CHECK-GI-NEXT: lsr w8, w8, #5
-; CHECK-GI-NEXT: msub w0, w8, w9, w0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ui32_100:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #34079 // =0x851f
+; CHECK-NEXT: mov w9, #100 // =0x64
+; CHECK-NEXT: movk w8, #20971, lsl #16
+; CHECK-NEXT: umull x8, w0, w8
+; CHECK-NEXT: lsr x8, x8, #37
+; CHECK-NEXT: msub w0, w8, w9, w0
+; CHECK-NEXT: ret
entry:
%s = urem i32 %a, 100
ret i32 %s
@@ -1118,13 +1095,12 @@ define <8 x i8> @sv8i8_100(<8 x i8> %d, <8 x i8> %e) {
; CHECK-GI-LABEL: sv8i8_100:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi v1.8b, #41
-; CHECK-GI-NEXT: movi v3.8b, #100
+; CHECK-GI-NEXT: movi v2.8b, #100
; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b
-; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8
-; CHECK-GI-NEXT: sshr v2.8b, v1.8b, #4
-; CHECK-GI-NEXT: ushr v2.8b, v2.8b, #7
-; CHECK-GI-NEXT: ssra v2.8b, v1.8b, #4
-; CHECK-GI-NEXT: mls v0.8b, v2.8b, v3.8b
+; CHECK-GI-NEXT: sshr v1.8h, v1.8h, #12
+; CHECK-GI-NEXT: xtn v1.8b, v1.8h
+; CHECK-GI-NEXT: usra v1.8b, v1.8b, #7
+; CHECK-GI-NEXT: mls v0.8b, v1.8b, v2.8b
; CHECK-GI-NEXT: ret
entry:
%s = srem <8 x i8> %d, <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100>
@@ -1619,15 +1595,25 @@ entry:
}
define <8 x i8> @uv8i8_100(<8 x i8> %d, <8 x i8> %e) {
-; CHECK-LABEL: uv8i8_100:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: movi v1.8b, #41
-; CHECK-NEXT: movi v2.8b, #100
-; CHECK-NEXT: umull v1.8h, v0.8b, v1.8b
-; CHECK-NEXT: shrn v1.8b, v1.8h, #8
-; CHECK-NEXT: ushr v1.8b, v1.8b, #4
-; CHECK-NEXT: mls v0.8b, v1.8b, v2.8b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: uv8i8_100:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v1.8b, #41
+; CHECK-SD-NEXT: movi v2.8b, #100
+; CHECK-SD-NEXT: umull v1.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: shrn v1.8b, v1.8h, #8
+; CHECK-SD-NEXT: ushr v1.8b, v1.8b, #4
+; CHECK-SD-NEXT: mls v0.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: uv8i8_100:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v1.8b, #41
+; CHECK-GI-NEXT: movi v2.8b, #100
+; CHECK-GI-NEXT: umull v1.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: ushr v1.8h, v1.8h, #12
+; CHECK-GI-NEXT: xtn v1.8b, v1.8h
+; CHECK-GI-NEXT: mls v0.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT: ret
entry:
%s = urem <8 x i8> %d, <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100>
ret <8 x i8> %s
@@ -1904,14 +1890,13 @@ define <4 x i16> @sv4i16_7(<4 x i16> %d, <4 x i16> %e) {
; CHECK-GI-LABEL: sv4i16_7:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: adrp x8, .LCPI44_0
-; CHECK-GI-NEXT: movi v3.4h, #7
+; CHECK-GI-NEXT: movi v2.4h, #7
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI44_0]
; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h
-; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16
-; CHECK-GI-NEXT: sshr v2.4h, v1.4h, #1
-; CHECK-GI-NEXT: ushr v2.4h, v2.4h, #15
-; CHECK-GI-NEXT: ssra v2.4h, v1.4h, #1
-; CHECK-GI-NEXT: mls v0.4h, v2.4h, v3.4h
+; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #17
+; CHECK-GI-NEXT: xtn v1.4h, v1.4s
+; CHECK-GI-NEXT: usra v1.4h, v1.4h, #15
+; CHECK-GI-NEXT: mls v0.4h, v1.4h, v2.4h
; CHECK-GI-NEXT: ret
entry:
%s = srem <4 x i16> %d, <i16 7, i16 7, i16 7, i16 7>
@@ -1934,14 +1919,13 @@ define <4 x i16> @sv4i16_100(<4 x i16> %d, <4 x i16> %e) {
; CHECK-GI-LABEL: sv4i16_100:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: adrp x8, .LCPI45_0
-; CHECK-GI-NEXT: movi v3.4h, #100
+; CHECK-GI-NEXT: movi v2.4h, #100
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI45_0]
; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h
-; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16
-; CHECK-GI-NEXT: sshr v2.4h, v1.4h, #3
-; CHECK-GI-NEXT: ushr v2.4h, v2.4h, #15
-; CHECK-GI-NEXT: ssra v2.4h, v1.4h, #3
-; CHECK-GI-NEXT: mls v0.4h, v2.4h, v3.4h
+; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #19
+; CHECK-GI-NEXT: xtn v1.4h, v1.4s
+; CHECK-GI-NEXT: usra v1.4h, v1.4h, #15
+; CHECK-GI-NEXT: mls v0.4h, v1.4h, v2.4h
; CHECK-GI-NEXT: ret
entry:
%s = srem <4 x i16> %d, <i16 100, i16 100, i16 100, i16 100>
@@ -2301,8 +2285,8 @@ define <4 x i16> @uv4i16_100(<4 x i16> %d, <4 x i16> %e) {
; CHECK-GI-NEXT: ldr d2, [x8, :lo12:.LCPI53_0]
; CHECK-GI-NEXT: umull v1.4s, v1.4h, v2.4h
; CHECK-GI-NEXT: movi v2.4h, #100
-; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16
-; CHECK-GI-NEXT: ushr v1.4h, v1.4h, #1
+; CHECK-GI-NEXT: ushr v1.4s, v1.4s, #17
+; CHECK-GI-NEXT: xtn v1.4h, v1.4s
; CHECK-GI-NEXT: mls v0.4h, v1.4h, v2.4h
; CHECK-GI-NEXT: ret
entry:
@@ -2424,14 +2408,13 @@ define <2 x i32> @sv2i32_100(<2 x i32> %d, <2 x i32> %e) {
; CHECK-GI-LABEL: sv2i32_100:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: adrp x8, .LCPI57_0
-; CHECK-GI-NEXT: movi v3.2s, #100
+; CHECK-GI-NEXT: movi v2.2s, #100
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI57_0]
; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32
-; CHECK-GI-NEXT: sshr v2.2s, v1.2s, #5
-; CHECK-GI-NEXT: ushr v2.2s, v2.2s, #31
-; CHECK-GI-NEXT: ssra v2.2s, v1.2s, #5
-; CHECK-GI-NEXT: mls v0.2s, v2.2s, v3.2s
+; CHECK-GI-NEXT: sshr v1.2d, v1.2d, #37
+; CHECK-GI-NEXT: xtn v1.2s, v1.2d
+; CHECK-GI-NEXT: usra v1.2s, v1.2s, #31
+; CHECK-GI-NEXT: mls v0.2s, v1.2s, v2.2s
; CHECK-GI-NEXT: ret
entry:
%s = srem <2 x i32> %d, <i32 100, i32 100>
@@ -2656,8 +2639,8 @@ define <2 x i32> @uv2i32_100(<2 x i32> %d, <2 x i32> %e) {
; CHECK-GI-NEXT: movi v2.2s, #100
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI63_0]
; CHECK-GI-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32
-; CHECK-GI-NEXT: ushr v1.2s, v1.2s, #5
+; CHECK-GI-NEXT: ushr v1.2d, v1.2d, #37
+; CHECK-GI-NEXT: xtn v1.2s, v1.2d
; CHECK-GI-NEXT: mls v0.2s, v1.2s, v2.2s
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/urem-lkk.ll b/llvm/test/CodeGen/AArch64/urem-lkk.ll
index 0dd6685555826..40016c7e4ce0f 100644
--- a/llvm/test/CodeGen/AArch64/urem-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/urem-lkk.ll
@@ -20,26 +20,15 @@ define i32 @fold_urem_positive_odd(i32 %x) {
}
define i32 @fold_urem_positive_even(i32 %x) {
-; CHECK-SD-LABEL: fold_urem_positive_even:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: mov w8, #16323 // =0x3fc3
-; CHECK-SD-NEXT: mov w9, #1060 // =0x424
-; CHECK-SD-NEXT: movk w8, #63310, lsl #16
-; CHECK-SD-NEXT: umull x8, w0, w8
-; CHECK-SD-NEXT: lsr x8, x8, #42
-; CHECK-SD-NEXT: msub w0, w8, w9, w0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: fold_urem_positive_even:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov w8, #16323 // =0x3fc3
-; CHECK-GI-NEXT: mov w9, #1060 // =0x424
-; CHECK-GI-NEXT: movk w8, #63310, lsl #16
-; CHECK-GI-NEXT: umull x8, w0, w8
-; CHECK-GI-NEXT: lsr x8, x8, #32
-; CHECK-GI-NEXT: lsr w8, w8, #10
-; CHECK-GI-NEXT: msub w0, w8, w9, w0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: fold_urem_positive_even:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #16323 // =0x3fc3
+; CHECK-NEXT: mov w9, #1060 // =0x424
+; CHECK-NEXT: movk w8, #63310, lsl #16
+; CHECK-NEXT: umull x8, w0, w8
+; CHECK-NEXT: lsr x8, x8, #42
+; CHECK-NEXT: msub w0, w8, w9, w0
+; CHECK-NEXT: ret
%1 = urem i32 %x, 1060
ret i32 %1
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index fc81e16d68e98..7869a81cb9705 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -3686,22 +3686,21 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
;
; GFX8-LABEL: s_fshl_v2i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s4, s1, 16
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_lshr_b32 s5, s2, 16
-; GFX8-NEXT: s_and_b32 s6, s2, 15
-; GFX8-NEXT: s_andn2_b32 s2, 15, s2
-; GFX8-NEXT: s_lshr_b32 s1, s1, 1
+; GFX8-NEXT: s_and_b32 s5, s2, 15
; GFX8-NEXT: s_lshr_b32 s3, s0, 16
-; GFX8-NEXT: s_lshl_b32 s0, s0, s6
-; GFX8-NEXT: s_lshr_b32 s1, s1, s2
-; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: s_and_b32 s1, s5, 15
-; GFX8-NEXT: s_andn2_b32 s2, 15, s5
-; GFX8-NEXT: s_lshl_b32 s1, s3, s1
-; GFX8-NEXT: s_lshr_b32 s3, s4, 1
-; GFX8-NEXT: s_lshr_b32 s2, s3, s2
-; GFX8-NEXT: s_or_b32 s1, s1, s2
+; GFX8-NEXT: s_lshl_b32 s0, s0, s5
+; GFX8-NEXT: s_and_b32 s5, 0xffff, s1
+; GFX8-NEXT: s_lshr_b32 s4, s2, 16
+; GFX8-NEXT: s_andn2_b32 s2, 15, s2
+; GFX8-NEXT: s_lshr_b32 s5, s5, 1
+; GFX8-NEXT: s_lshr_b32 s2, s5, s2
+; GFX8-NEXT: s_or_b32 s0, s0, s2
+; GFX8-NEXT: s_and_b32 s2, s4, 15
+; GFX8-NEXT: s_andn2_b32 s4, 15, s4
+; GFX8-NEXT: s_lshr_b32 s1, s1, 17
+; GFX8-NEXT: s_lshl_b32 s2, s3, s2
+; GFX8-NEXT: s_lshr_b32 s1, s1, s4
+; GFX8-NEXT: s_or_b32 s1, s2, s1
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
@@ -3813,13 +3812,12 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v5
; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
; GFX8-NEXT: v_mov_b32_e32 v4, 15
-; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v5, -1
+; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_mov_b32_e32 v4, 1
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 17, v1
+; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -3886,14 +3884,14 @@ define <2 x i16> @v_fshl_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_...
[truncated]
|
def shift_of_trunc_of_shift_matchdata : GIDefMatchData<"ShiftOfTruncOfShift">; | ||
def shift_of_trunc_of_shift : GICombineRule< | ||
(defs root:$root, shift_of_trunc_of_shift_matchdata:$matchinfo), | ||
(match (wip_match_opcode G_LSHR, G_ASHR):$root, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We shouldn't be adding new combines with `wip_match_opcode` anymore; please use a proper match pattern instead.
uint64_t c1 = N001C.getZExtValue(); | ||
uint64_t c2 = N1C.getZExtValue(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Rename `c1`/`c2` to `C1`/`C2` to follow the LLVM naming convention.
Also be careful with `getZExtValue` for values wider than 64 bits — it asserts in that case.
uint64_t c2 = N1C.getZExtValue(); | ||
LLT InnerShiftTy = MRI.getType(InnerShift); | ||
uint64_t InnerShiftSize = InnerShiftTy.getScalarSizeInBits(); | ||
if (!(c1 + OpSizeInBits == InnerShiftSize) || !(c1 + c2 < InnerShiftSize)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Invert the !'s.
Why is the `c1 + OpSizeInBits != InnerShiftSize` check needed?
; CHECK-GI-NEXT: asr w8, w8, #5 | ||
; CHECK-GI-NEXT: add w0, w8, w8, lsr #31 | ||
; CHECK-GI-NEXT: ret | ||
; CHECK-LABEL: combine_i32_sdiv_const100: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we have any direct tests for the various shift(trunc(shift(...))) combinations?
Folds the shift(trunc(shift(...))) pattern into trunc(shift(...)) by combining the two shift instructions.