[AMDGPU] Remove setcc by using add/sub carryout #155255
@llvm/pr-subscribers-backend-amdgpu

Author: None (LU-JOHN)

Changes: Remove setcc instruction by utilizing add/sub carryout. Addresses #152992.

Patch is 111.34 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/155255.diff

9 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 66c1dfc71c2f5..e38af343beeaf 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15972,6 +15972,62 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
}
}
+ // Eliminate setcc by using carryout from add/sub instruction
+
+ // X = ADD i64 Y, Z Xlo = UADDO i32 Ylo, Zlo
+ // setcc X ult Y -> XHi = UADDO_CARRY i32 Yhi, Zhi
+ // similarly for subtraction
+
+ // X = ADD i64 Y, 1 Xlo = UADDO i32 Ylo, 1
+ // setcc X eq 0 -> XHi = UADDO_CARRY i32 Yhi, 0
+
+ // Don't split a 64-bit add/sub into two 32-bit add/sub instructions for
+ // non-divergent operations. This can result in lo/hi 32-bit operations
+ // being done in SGPR and VGPR with additional operations being needed
+ // to move operands and/or generate the intermediate carry.
+ if (VT == MVT::i64 && N->isDivergent() &&
+ ((((LHS.getOpcode() == ISD::ADD && CC == ISD::SETULT) ||
+ (LHS.getOpcode() == ISD::SUB && CC == ISD::SETUGT)) &&
+ LHS.getOperand(0) == RHS) ||
+ (LHS.getOpcode() == ISD::ADD && CC == ISD::SETEQ && CRHS &&
+ CRHS->isZero() && dyn_cast<ConstantSDNode>(LHS.getOperand(1)) &&
+ dyn_cast<ConstantSDNode>(LHS.getOperand(1))->isOne()))) {
+ EVT TargetType = MVT::i32;
+ EVT CarryVT = MVT::i1;
+ const SDValue One = DAG.getConstant(1, SL, TargetType);
+ bool IsAdd = LHS.getOpcode() == ISD::ADD;
+
+ SDValue Op0 = LHS.getOperand(0);
+ SDValue Op1 = LHS.getOperand(1);
+
+ SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, Op0);
+ SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, Op1);
+
+ SDValue Op0Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, TargetType, Op0, One);
+ SDValue Op1Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, TargetType, Op1, One);
+
+ SDValue NodeLo =
+ DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL,
+ DAG.getVTList(TargetType, CarryVT), {Op0Lo, Op1Lo});
+
+ SDValue CarryInHi = SDValue(NodeLo.getNode(), 1);
+ SDValue NodeHi = DAG.getNode(IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
+ SL, DAG.getVTList(TargetType, CarryVT),
+ {Op0Hi, Op1Hi, CarryInHi});
+
+ SDValue ResultLo = SDValue(NodeLo.getNode(), 0);
+ SDValue ResultHi = SDValue(NodeHi.getNode(), 0);
+
+ EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
+ SDValue JoinedResult =
+ DAG.getBuildVector(ConcatType, SL, {ResultLo, ResultHi});
+
+ SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult);
+ SDValue Overflow = SDValue(NodeHi.getNode(), 1);
+ DCI.CombineTo(LHS.getNode(), Result);
+ return Overflow;
+ }
+
if (VT != MVT::f32 && VT != MVT::f64 &&
(!Subtarget->has16BitInsts() || VT != MVT::f16))
return SDValue();
diff --git a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
new file mode 100644
index 0000000000000..ba0648116e029
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
@@ -0,0 +1,177 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+;; Test that carryout from 64-bit add/sub (synthesized from two 32-bit adds/subs) is utilized
+;; (i.e. no additional compare is generated).
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+%0 = type { i64, i64, i32, i32 }
+%1 = type { [64 x [8 x i64]] }
+%struct.uint96 = type { i64, i32 }
+%struct.uint64pair = type { i64, i64 }
+
+declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64)
+declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64)
+
+declare {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+declare {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+
+define hidden %struct.uint96 @add64_32(i64 %val64A, i64 %val64B, i32 %val32) {
+; CHECK-LABEL: add64_32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; CHECK-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %sum64 = add i64 %val64A, %val64B
+ %obit = icmp ult i64 %sum64, %val64A
+ %obit32 = zext i1 %obit to i32
+ %sum32 = add i32 %val32, %obit32
+ %.fca.0.insert = insertvalue %struct.uint96 poison, i64 %sum64, 0
+ %.fca.1.insert = insertvalue %struct.uint96 %.fca.0.insert, i32 %sum32, 1
+ ret %struct.uint96 %.fca.1.insert
+}
+
+define <2 x i64> @uadd_v2i64(<2 x i64> %val0, <2 x i64> %val1, ptr %ptrval) {
+; CHECK-LABEL: uadd_v2i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, v2, v6
+; CHECK-NEXT: v_add_co_u32_e64 v4, s[4:5], v0, v4
+; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v3, v7, vcc
+; CHECK-NEXT: v_addc_co_u32_e64 v5, s[4:5], v1, v5, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: v_mov_b32_e32 v3, v2
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %pair = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
+ %val = extractvalue {<2 x i64>, <2 x i1>} %pair, 0
+ %obit = extractvalue {<2 x i64>, <2 x i1>} %pair, 1
+ %res = sext <2 x i1> %obit to <2 x i64>
+ store <2 x i64> %val, ptr %ptrval
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @usub_v2i64(<2 x i64> %val0, <2 x i64> %val1, ptr %ptrval) {
+; CHECK-LABEL: usub_v2i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_sub_co_u32_e32 v6, vcc, v2, v6
+; CHECK-NEXT: v_sub_co_u32_e64 v4, s[4:5], v0, v4
+; CHECK-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v7, vcc
+; CHECK-NEXT: v_subb_co_u32_e64 v5, s[4:5], v1, v5, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: v_mov_b32_e32 v3, v2
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %pair = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
+ %val = extractvalue {<2 x i64>, <2 x i1>} %pair, 0
+ %obit = extractvalue {<2 x i64>, <2 x i1>} %pair, 1
+ %res = sext <2 x i1> %obit to <2 x i64>
+ store <2 x i64> %val, ptr %ptrval
+ ret <2 x i64> %res
+}
+
+define i64 @uadd_i64(i64 %val0, i64 %val1, ptr %ptrval) {
+; CHECK-LABEL: uadd_i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 %val1)
+ %val = extractvalue {i64, i1} %pair, 0
+ %obit = extractvalue {i64, i1} %pair, 1
+ %res = sext i1 %obit to i64
+ store i64 %val, ptr %ptrval
+ ret i64 %res
+}
+
+define i64 @uadd_p1(i64 %val0, i64 %val1, ptr %ptrval) {
+; CHECK-LABEL: uadd_p1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 1)
+ %val = extractvalue {i64, i1} %pair, 0
+ %obit = extractvalue {i64, i1} %pair, 1
+ %res = sext i1 %obit to i64
+ store i64 %val, ptr %ptrval
+ ret i64 %res
+}
+
+; FIXME: Intermediate compare to generate carryout was transformed. Extend
+; to recognize this.
+define i64 @uadd_n1(i64 %val0, i64 %val1, ptr %ptrval) {
+; CHECK-LABEL: uadd_n1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, -1, v0
+; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 -1)
+ %val = extractvalue {i64, i1} %pair, 0
+ %obit = extractvalue {i64, i1} %pair, 1
+ %res = sext i1 %obit to i64
+ store i64 %val, ptr %ptrval
+ ret i64 %res
+}
+
+define i64 @usub_p1(i64 %val0, i64 %val1, ptr %ptrval) {
+; CHECK-LABEL: usub_p1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_subrev_co_u32_e32 v0, vcc, 1, v0
+; CHECK-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %pair = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %val0, i64 1)
+ %val = extractvalue {i64, i1} %pair, 0
+ %obit = extractvalue {i64, i1} %pair, 1
+ %res = sext i1 %obit to i64
+ store i64 %val, ptr %ptrval
+ ret i64 %res
+}
+
+define i64 @usub_n1(i64 %val0, i64 %val1, ptr %ptrval) {
+; CHECK-LABEL: usub_n1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_subrev_co_u32_e32 v0, vcc, -1, v0
+; CHECK-NEXT: v_subbrev_co_u32_e32 v1, vcc, -1, v1, vcc
+; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %pair = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %val0, i64 -1)
+ %val = extractvalue {i64, i1} %pair, 0
+ %obit = extractvalue {i64, i1} %pair, 1
+ %res = sext i1 %obit to i64
+ store i64 %val, ptr %ptrval
+ ret i64 %res
+}
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index b71885b54b5a2..1158d73c0c152 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -841,7 +841,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GCN-ISEL-LABEL: name: vuaddo64
; GCN-ISEL-LABEL: body:
; GCN-ISEL-LABEL: bb.0
-; GCN-ISEL: V_ADD_U64_PSEUDO
+; GCN-ISEL: V_ADD_CO_U32_e64
define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 {
; CISI-LABEL: vuaddo64:
@@ -854,9 +854,8 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_mov_b32 s4, s0
; CISI-NEXT: v_mov_b32_e32 v1, s9
; CISI-NEXT: v_add_i32_e32 v0, vcc, s8, v0
-; CISI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CISI-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1]
; CISI-NEXT: s_mov_b32 s5, s1
+; CISI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CISI-NEXT: s_mov_b32 s0, s2
; CISI-NEXT: s_mov_b32 s1, s3
; CISI-NEXT: s_mov_b32 s2, s6
@@ -876,7 +875,6 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI-NEXT: v_mov_b32_e32 v6, s5
; VI-NEXT: v_add_u32_e32 v5, vcc, s4, v0
; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
-; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[5:6]
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: v_mov_b32_e32 v4, s3
@@ -894,7 +892,6 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_byte v2, v0, s[2:3]
@@ -909,8 +906,7 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: v_add_co_u32 v0, s4, s6, v0
; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4
-; GFX1010-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1010-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1010-NEXT: s_endpgm
@@ -923,9 +919,8 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W32-NEXT: v_add_co_u32 v0, s4, s6, v0
-; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4
-; GFX1030W32-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4
+; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1030W32-NEXT: s_endpgm
@@ -938,9 +933,8 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W64-NEXT: v_add_co_u32 v0, s[4:5], s6, v0
-; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s[4:5]
-; GFX1030W64-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, s[4:5], s7, 0, s[4:5]
+; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1030W64-NEXT: s_endpgm
@@ -955,10 +949,9 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s4, s6, v0
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
@@ -969,16 +962,17 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[6:7], v[0:1]
-; GFX1250-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[2:3]
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_add_co_u32 v0, s4, s6, v0
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1250-NEXT: s_clause 0x1
-; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
-; GFX1250-NEXT: global_store_b8 v1, v0, s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3]
; GFX1250-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -1821,7 +1815,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GCN-ISEL-LABEL: name: vusubo64
; GCN-ISEL-LABEL: body:
; GCN-ISEL-LABEL: bb.0
-; GCN-ISEL: V_SUB_U64_PSEUDO
+; GCN-ISEL: V_SUBB_U32_e64
define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 {
; CISI-LABEL: vusubo64:
@@ -1834,9 +1828,8 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_mov_b32 s4, s0
; CISI-NEXT: v_mov_b32_e32 v1, s9
; CISI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0
-; CISI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CISI-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
; CISI-NEXT: s_mov_b32 s5, s1
+; CISI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CISI-NEXT: s_mov_b32 s0, s2
; CISI-NEXT: s_mov_b32 s1, s3
; CISI-NEXT: s_mov_b32 s2, s6
@@ -1856,7 +1849,6 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI-NEXT: v_mov_b32_e32 v6, s5
; VI-NEXT: v_sub_u32_e32 v5, vcc, s4, v0
; VI-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[5:6]
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: v_mov_b32_e32 v4, s3
@@ -1874,7 +1866,6 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_byte v2, v0, s[2:3]
@@ -1889,8 +1880,7 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: v_sub_co_u32 v0, s4, s6, v0
; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
-; GFX1010-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1010-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1010-NEXT: s_endpgm
@@ -1903,9 +1893,8 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W32-NEXT: v_sub_co_u32 v0, s4, s6, v0
-; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s4
-; GFX1030W32-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
+; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1030W32-NEXT: s_endpgm
@@ -1918,9 +1907,8 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W64-NEXT: v_sub_co_u32 v0, s[4:5], s6, v0
-; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s[4:5]
-; GFX1030W64-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, s[4:5], s7, 0, s[4:5]
+; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1030W64-NEXT: s_endpgm
@@ -1935,10 +1923,9 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1...
[truncated]
      LHS.getOperand(0) == RHS) ||
      (LHS.getOpcode() == ISD::ADD && CC == ISD::SETEQ && CRHS &&
       CRHS->isZero() && dyn_cast<ConstantSDNode>(LHS.getOperand(1)) &&
       dyn_cast<ConstantSDNode>(LHS.getOperand(1))->isOne()))) {
Unchecked dyn_cast.
Use sd_match. dyn_cast is unnecessary.
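For illustration, a minimal sketch of the sd_match form, assuming the matchers in llvm/CodeGen/SDPatternMatch.h (m_Add, m_Value, m_One); this is not necessarily the exact code that landed:

#include "llvm/CodeGen/SDPatternMatch.h"
using namespace llvm::SDPatternMatch;

// Match "LHS = add Y, 1" declaratively. m_One() only succeeds on a
// constant-1 operand, so the unchecked ConstantSDNode cast is no
// longer needed.
SDValue Y;
bool IsIncEqZero = CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
                   sd_match(LHS, m_Add(m_Value(Y), m_One()));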
      ((((LHS.getOpcode() == ISD::ADD && CC == ISD::SETULT) ||
         (LHS.getOpcode() == ISD::SUB && CC == ISD::SETUGT)) &&
Can you simplify this with sd_match?
Simplified with sd_match.
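Roughly, the sd_match form of this check could read as below (a sketch under the same assumptions as above, with LHS, RHS, and CC in scope; the landed code may differ):

using namespace llvm::SDPatternMatch;

// Fold the opcode test into the match. m_Specific(RHS) pins the first
// add/sub operand to the setcc's RHS, mirroring
// "LHS.getOperand(0) == RHS" in the original condition.
SDValue Z;
bool IsAddCarryOut = CC == ISD::SETULT &&
                     sd_match(LHS, m_Add(m_Specific(RHS), m_Value(Z)));
bool IsSubCarryOut = CC == ISD::SETUGT &&
                     sd_match(LHS, m_Sub(m_Specific(RHS), m_Value(Z)));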
    SDValue Op0Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, TargetType, Op0, One);
    SDValue Op1Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, TargetType, Op1, One);
Using EXTRACT_ELEMENT is strange. If this is target code, the canonical way would be to extract_vector_elt from index 1. EXTRACT_ELEMENT or lshr by 32 will just add extra steps to get there
Updated to use getHiHalf64.
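For context, AMDGPUTargetLowering already provides the getLoHalf64/getHiHalf64 helpers, which split a 64-bit value via a v2i32 bitcast; a simplified sketch of the revised splitting (not necessarily the exact final code):

// Use the target's own helpers instead of TRUNCATE plus EXTRACT_ELEMENT;
// each one bitcasts the i64 to v2i32 and extracts element 0 or 1.
SDValue Op0Lo = getLoHalf64(Op0, DAG);
SDValue Op0Hi = getHiHalf64(Op0, DAG);
SDValue Op1Lo = getLoHalf64(Op1, DAG);
SDValue Op1Hi = getHiHalf64(Op1, DAG);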