Skip to content

Commit 1607c47

Browse files
committed
Remove setcc by using add/sub carryout
Signed-off-by: John Lu <John.Lu@amd.com>
1 parent f1f45e9 commit 1607c47

File tree

9 files changed

+766
-765
lines changed

9 files changed

+766
-765
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15972,6 +15972,62 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
1597215972
}
1597315973
}
1597415974

15975+
// Eliminate setcc by using carryout from add/sub instruction
15976+
15977+
// X = ADD i64 Y, Z Xlo = UADDO i32 Ylo, Zlo
15978+
// setcc X ult Y -> Xhi = UADDO_CARRY i32 Yhi, Zhi
15979+
// Similarly for subtraction: X = SUB i64 Y, Z with setcc X ugt Y uses USUBO / USUBO_CARRY
15980+
15981+
// X = ADD i64 Y, 1 Xlo = UADDO i32 Ylo, 1
15982+
// setcc X eq 0 -> Xhi = UADDO_CARRY i32 Yhi, 0
15983+
15984+
// Don't split a 64-bit add/sub into two 32-bit add/sub instructions for
15985+
// non-divergent operations. Splitting can result in lo/hi 32-bit operations
15986+
// being done in SGPR and VGPR with additional operations being needed
15987+
// to move operands and/or generate the intermediate carry.
15988+
if (VT == MVT::i64 && N->isDivergent() &&
15989+
((((LHS.getOpcode() == ISD::ADD && CC == ISD::SETULT) ||
15990+
(LHS.getOpcode() == ISD::SUB && CC == ISD::SETUGT)) &&
15991+
LHS.getOperand(0) == RHS) ||
15992+
(LHS.getOpcode() == ISD::ADD && CC == ISD::SETEQ && CRHS &&
15993+
CRHS->isZero() && dyn_cast<ConstantSDNode>(LHS.getOperand(1)) &&
15994+
dyn_cast<ConstantSDNode>(LHS.getOperand(1))->isOne()))) {
15995+
EVT TargetType = MVT::i32;
15996+
EVT CarryVT = MVT::i1;
15997+
const SDValue One = DAG.getConstant(1, SL, TargetType);
15998+
bool IsAdd = LHS.getOpcode() == ISD::ADD;
15999+
16000+
SDValue Op0 = LHS.getOperand(0);
16001+
SDValue Op1 = LHS.getOperand(1);
16002+
16003+
SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, Op0);
16004+
SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, Op1);
16005+
16006+
SDValue Op0Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, TargetType, Op0, One);
16007+
SDValue Op1Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, TargetType, Op1, One);
16008+
16009+
SDValue NodeLo =
16010+
DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL,
16011+
DAG.getVTList(TargetType, CarryVT), {Op0Lo, Op1Lo});
16012+
16013+
SDValue CarryInHi = SDValue(NodeLo.getNode(), 1);
16014+
SDValue NodeHi = DAG.getNode(IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
16015+
SL, DAG.getVTList(TargetType, CarryVT),
16016+
{Op0Hi, Op1Hi, CarryInHi});
16017+
16018+
SDValue ResultLo = SDValue(NodeLo.getNode(), 0);
16019+
SDValue ResultHi = SDValue(NodeHi.getNode(), 0);
16020+
16021+
EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
16022+
SDValue JoinedResult =
16023+
DAG.getBuildVector(ConcatType, SL, {ResultLo, ResultHi});
16024+
16025+
SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult);
16026+
SDValue Overflow = SDValue(NodeHi.getNode(), 1);
16027+
DCI.CombineTo(LHS.getNode(), Result);
16028+
return Overflow;
16029+
}
16030+
1597516031
if (VT != MVT::f32 && VT != MVT::f64 &&
1597616032
(!Subtarget->has16BitInsts() || VT != MVT::f16))
1597716033
return SDValue();

llvm/test/CodeGen/AMDGPU/addsub64_carry.ll

Lines changed: 23 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,9 @@ define hidden %struct.uint96 @add64_32(i64 %val64A, i64 %val64B, i32 %val32) {
1919
; CHECK-LABEL: add64_32:
2020
; CHECK: ; %bb.0:
2121
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22-
; CHECK-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
23-
; CHECK-NEXT: v_addc_co_u32_e32 v6, vcc, v1, v3, vcc
24-
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, v[5:6], v[0:1]
25-
; CHECK-NEXT: v_mov_b32_e32 v0, v5
22+
; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
23+
; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
2624
; CHECK-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc
27-
; CHECK-NEXT: v_mov_b32_e32 v1, v6
2825
; CHECK-NEXT: s_setpc_b64 s[30:31]
2926
%sum64 = add i64 %val64A, %val64B
3027
%obit = icmp ult i64 %sum64, %val64A
@@ -40,16 +37,14 @@ define <2 x i64> @uadd_v2i64(<2 x i64> %val0, <2 x i64> %val1, ptr %ptrval) {
4037
; CHECK: ; %bb.0:
4138
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4239
; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, v2, v6
40+
; CHECK-NEXT: v_add_co_u32_e64 v4, s[4:5], v0, v4
4341
; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v3, v7, vcc
44-
; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, v0, v4
45-
; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v5, vcc
46-
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
47-
; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
48-
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
49-
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
50-
; CHECK-NEXT: v_mov_b32_e32 v1, v0
42+
; CHECK-NEXT: v_addc_co_u32_e64 v5, s[4:5], v1, v5, s[4:5]
43+
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
5144
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
45+
; CHECK-NEXT: v_mov_b32_e32 v1, v0
5246
; CHECK-NEXT: v_mov_b32_e32 v3, v2
47+
; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
5348
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5449
; CHECK-NEXT: s_setpc_b64 s[30:31]
5550
%pair = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
@@ -65,16 +60,14 @@ define <2 x i64> @usub_v2i64(<2 x i64> %val0, <2 x i64> %val1, ptr %ptrval) {
6560
; CHECK: ; %bb.0:
6661
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6762
; CHECK-NEXT: v_sub_co_u32_e32 v6, vcc, v2, v6
63+
; CHECK-NEXT: v_sub_co_u32_e64 v4, s[4:5], v0, v4
6864
; CHECK-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v7, vcc
69-
; CHECK-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v4
70-
; CHECK-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v5, vcc
71-
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[0:1]
72-
; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
73-
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
74-
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
75-
; CHECK-NEXT: v_mov_b32_e32 v1, v0
65+
; CHECK-NEXT: v_subb_co_u32_e64 v5, s[4:5], v1, v5, s[4:5]
66+
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
7667
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
68+
; CHECK-NEXT: v_mov_b32_e32 v1, v0
7769
; CHECK-NEXT: v_mov_b32_e32 v3, v2
70+
; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
7871
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7972
; CHECK-NEXT: s_setpc_b64 s[30:31]
8073
%pair = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
@@ -89,10 +82,9 @@ define i64 @uadd_i64(i64 %val0, i64 %val1, ptr %ptrval) {
8982
; CHECK-LABEL: uadd_i64:
9083
; CHECK: ; %bb.0:
9184
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
92-
; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
93-
; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
94-
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
95-
; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[2:3]
85+
; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
86+
; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
87+
; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
9688
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
9789
; CHECK-NEXT: v_mov_b32_e32 v1, v0
9890
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -111,7 +103,6 @@ define i64 @uadd_p1(i64 %val0, i64 %val1, ptr %ptrval) {
111103
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
112104
; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
113105
; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
114-
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
115106
; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
116107
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
117108
; CHECK-NEXT: v_mov_b32_e32 v1, v0
@@ -125,6 +116,8 @@ define i64 @uadd_p1(i64 %val0, i64 %val1, ptr %ptrval) {
125116
ret i64 %res
126117
}
127118

119+
; FIXME: The intermediate compare that generates the carryout was transformed
120+
; into a different form, so it is not matched. Extend the combine to recognize it.
128121
define i64 @uadd_n1(i64 %val0, i64 %val1, ptr %ptrval) {
129122
; CHECK-LABEL: uadd_n1:
130123
; CHECK: ; %bb.0:
@@ -149,10 +142,9 @@ define i64 @usub_p1(i64 %val0, i64 %val1, ptr %ptrval) {
149142
; CHECK-LABEL: usub_p1:
150143
; CHECK: ; %bb.0:
151144
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
152-
; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, -1, v0
153-
; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
154-
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
155-
; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[2:3]
145+
; CHECK-NEXT: v_subrev_co_u32_e32 v0, vcc, 1, v0
146+
; CHECK-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
147+
; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
156148
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
157149
; CHECK-NEXT: v_mov_b32_e32 v1, v0
158150
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -169,10 +161,9 @@ define i64 @usub_n1(i64 %val0, i64 %val1, ptr %ptrval) {
169161
; CHECK-LABEL: usub_n1:
170162
; CHECK: ; %bb.0:
171163
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
172-
; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, 1, v0
173-
; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
174-
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
175-
; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[2:3]
164+
; CHECK-NEXT: v_subrev_co_u32_e32 v0, vcc, -1, v0
165+
; CHECK-NEXT: v_subbrev_co_u32_e32 v1, vcc, -1, v1, vcc
166+
; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
176167
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
177168
; CHECK-NEXT: v_mov_b32_e32 v1, v0
178169
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)

llvm/test/CodeGen/AMDGPU/carryout-selection.ll

Lines changed: 34 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -841,7 +841,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
841841
; GCN-ISEL-LABEL: name: vuaddo64
842842
; GCN-ISEL-LABEL: body:
843843
; GCN-ISEL-LABEL: bb.0
844-
; GCN-ISEL: V_ADD_U64_PSEUDO
844+
; GCN-ISEL: V_ADD_CO_U32_e64
845845

846846
define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 {
847847
; CISI-LABEL: vuaddo64:
@@ -854,9 +854,8 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
854854
; CISI-NEXT: s_mov_b32 s4, s0
855855
; CISI-NEXT: v_mov_b32_e32 v1, s9
856856
; CISI-NEXT: v_add_i32_e32 v0, vcc, s8, v0
857-
; CISI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
858-
; CISI-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1]
859857
; CISI-NEXT: s_mov_b32 s5, s1
858+
; CISI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
860859
; CISI-NEXT: s_mov_b32 s0, s2
861860
; CISI-NEXT: s_mov_b32 s1, s3
862861
; CISI-NEXT: s_mov_b32 s2, s6
@@ -876,7 +875,6 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
876875
; VI-NEXT: v_mov_b32_e32 v6, s5
877876
; VI-NEXT: v_add_u32_e32 v5, vcc, s4, v0
878877
; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
879-
; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[5:6]
880878
; VI-NEXT: v_mov_b32_e32 v2, s1
881879
; VI-NEXT: v_mov_b32_e32 v3, s2
882880
; VI-NEXT: v_mov_b32_e32 v4, s3
@@ -894,7 +892,6 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
894892
; GFX9-NEXT: v_mov_b32_e32 v1, s7
895893
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
896894
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
897-
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
898895
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
899896
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
900897
; GFX9-NEXT: global_store_byte v2, v0, s[2:3]
@@ -909,8 +906,7 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
909906
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
910907
; GFX1010-NEXT: v_add_co_u32 v0, s4, s6, v0
911908
; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4
912-
; GFX1010-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
913-
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
909+
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
914910
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
915911
; GFX1010-NEXT: global_store_byte v2, v3, s[2:3]
916912
; GFX1010-NEXT: s_endpgm
@@ -923,9 +919,8 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
923919
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
924920
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
925921
; GFX1030W32-NEXT: v_add_co_u32 v0, s4, s6, v0
926-
; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4
927-
; GFX1030W32-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
928-
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
922+
; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4
923+
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
929924
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
930925
; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
931926
; GFX1030W32-NEXT: s_endpgm
@@ -938,9 +933,8 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
938933
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
939934
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
940935
; GFX1030W64-NEXT: v_add_co_u32 v0, s[4:5], s6, v0
941-
; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s[4:5]
942-
; GFX1030W64-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
943-
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
936+
; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, s[4:5], s7, 0, s[4:5]
937+
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
944938
; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
945939
; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
946940
; GFX1030W64-NEXT: s_endpgm
@@ -955,10 +949,9 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
955949
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
956950
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
957951
; GFX11-NEXT: v_add_co_u32 v0, s4, s6, v0
958-
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4
952+
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4
959953
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
960-
; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
961-
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
954+
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
962955
; GFX11-NEXT: s_clause 0x1
963956
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
964957
; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
@@ -969,16 +962,17 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
969962
; GFX1250-NEXT: s_clause 0x1
970963
; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
971964
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
972-
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
973965
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
966+
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
974967
; GFX1250-NEXT: s_wait_kmcnt 0x0
975-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
976-
; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[6:7], v[0:1]
977-
; GFX1250-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[2:3]
978-
; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
968+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
969+
; GFX1250-NEXT: v_add_co_u32 v0, s4, s6, v0
970+
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4
971+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
972+
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
979973
; GFX1250-NEXT: s_clause 0x1
980-
; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
981-
; GFX1250-NEXT: global_store_b8 v1, v0, s[2:3]
974+
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
975+
; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3]
982976
; GFX1250-NEXT: s_endpgm
983977
%tid = call i32 @llvm.amdgcn.workitem.id.x()
984978
%tid.ext = sext i32 %tid to i64
@@ -1821,7 +1815,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
18211815
; GCN-ISEL-LABEL: name: vusubo64
18221816
; GCN-ISEL-LABEL: body:
18231817
; GCN-ISEL-LABEL: bb.0
1824-
; GCN-ISEL: V_SUB_U64_PSEUDO
1818+
; GCN-ISEL: V_SUBB_U32_e64
18251819

18261820
define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 {
18271821
; CISI-LABEL: vusubo64:
@@ -1834,9 +1828,8 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
18341828
; CISI-NEXT: s_mov_b32 s4, s0
18351829
; CISI-NEXT: v_mov_b32_e32 v1, s9
18361830
; CISI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0
1837-
; CISI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
1838-
; CISI-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
18391831
; CISI-NEXT: s_mov_b32 s5, s1
1832+
; CISI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
18401833
; CISI-NEXT: s_mov_b32 s0, s2
18411834
; CISI-NEXT: s_mov_b32 s1, s3
18421835
; CISI-NEXT: s_mov_b32 s2, s6
@@ -1856,7 +1849,6 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
18561849
; VI-NEXT: v_mov_b32_e32 v6, s5
18571850
; VI-NEXT: v_sub_u32_e32 v5, vcc, s4, v0
18581851
; VI-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
1859-
; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[5:6]
18601852
; VI-NEXT: v_mov_b32_e32 v2, s1
18611853
; VI-NEXT: v_mov_b32_e32 v3, s2
18621854
; VI-NEXT: v_mov_b32_e32 v4, s3
@@ -1874,7 +1866,6 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
18741866
; GFX9-NEXT: v_mov_b32_e32 v1, s7
18751867
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0
18761868
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
1877-
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
18781869
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
18791870
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
18801871
; GFX9-NEXT: global_store_byte v2, v0, s[2:3]
@@ -1889,8 +1880,7 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
18891880
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
18901881
; GFX1010-NEXT: v_sub_co_u32 v0, s4, s6, v0
18911882
; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
1892-
; GFX1010-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[0:1]
1893-
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
1883+
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
18941884
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
18951885
; GFX1010-NEXT: global_store_byte v2, v3, s[2:3]
18961886
; GFX1010-NEXT: s_endpgm
@@ -1903,9 +1893,8 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
19031893
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
19041894
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
19051895
; GFX1030W32-NEXT: v_sub_co_u32 v0, s4, s6, v0
1906-
; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s4
1907-
; GFX1030W32-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[0:1]
1908-
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
1896+
; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
1897+
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
19091898
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
19101899
; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
19111900
; GFX1030W32-NEXT: s_endpgm
@@ -1918,9 +1907,8 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
19181907
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
19191908
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
19201909
; GFX1030W64-NEXT: v_sub_co_u32 v0, s[4:5], s6, v0
1921-
; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s[4:5]
1922-
; GFX1030W64-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
1923-
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
1910+
; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, s[4:5], s7, 0, s[4:5]
1911+
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
19241912
; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
19251913
; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
19261914
; GFX1030W64-NEXT: s_endpgm
@@ -1935,10 +1923,9 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
19351923
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
19361924
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
19371925
; GFX11-NEXT: v_sub_co_u32 v0, s4, s6, v0
1938-
; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s4
1926+
; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
19391927
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1940-
; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[0:1]
1941-
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
1928+
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
19421929
; GFX11-NEXT: s_clause 0x1
19431930
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
19441931
; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
@@ -1949,16 +1936,17 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
19491936
; GFX1250-NEXT: s_clause 0x1
19501937
; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
19511938
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1952-
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
19531939
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1940+
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
19541941
; GFX1250-NEXT: s_wait_kmcnt 0x0
1955-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1956-
; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], s[6:7], v[0:1]
1957-
; GFX1250-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[2:3]
1958-
; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
1942+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1943+
; GFX1250-NEXT: v_sub_co_u32 v0, s4, s6, v0
1944+
; GFX1250-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
1945+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
1946+
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
19591947
; GFX1250-NEXT: s_clause 0x1
1960-
; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
1961-
; GFX1250-NEXT: global_store_b8 v1, v0, s[2:3]
1948+
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1949+
; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3]
19621950
; GFX1250-NEXT: s_endpgm
19631951
%tid = call i32 @llvm.amdgcn.workitem.id.x()
19641952
%tid.ext = sext i32 %tid to i64

0 commit comments

Comments
 (0)