
Conversation

@LU-JOHN (Contributor) commented Aug 23, 2025

Pre-commit test for setcc removal by using add/sub carryout.

@llvmbot (Member) commented Aug 23, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: None (LU-JOHN)

Changes

Pre-commit test for setcc removal by using add/sub carryout.
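
For context, the redundant pattern these tests capture is a 64-bit add/sub whose carry-out is recomputed with a separate compare. A minimal IR sketch (illustrative only, not part of the patch):

; Currently the 64-bit add lowers to v_add_co_u32 / v_addc_co_u32, which already
; leave the carry in VCC, yet the icmp below still lowers to an extra v_cmp_lt_u64.
define i32 @carry_sketch(i64 %a, i64 %b) {
  %sum = add i64 %a, %b
  %carry = icmp ult i64 %sum, %a   ; carry-out of the 64-bit add
  %c32 = zext i1 %carry to i32
  ret i32 %c32
}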


Full diff: https://github.com/llvm/llvm-project/pull/155118.diff

1 file affected:

  • (added) llvm/test/CodeGen/AMDGPU/addsub64_carry.ll (+186)
diff --git a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
new file mode 100644
index 0000000000000..5cdb0b2407bbc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
@@ -0,0 +1,186 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+;; Test that carryout from 64-bit add/sub (synthesized from two 32-bit adds/subs) is utilized
+;; (i.e. no additional compare is generated).
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+%0 = type { i64, i64, i32, i32 }
+%1 = type { [64 x [8 x i64]] }
+%struct.uint96 = type { i64, i32 }
+%struct.uint64pair = type { i64, i64 }
+
+declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64)
+declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64)
+
+declare {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+declare {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+
+define hidden %struct.uint96 @add64_32(i64 %val64A, i64 %val64B, i32 %val32) {
+; CHECK-LABEL: add64_32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_add_co_u32_e32 v5, vcc, v0, v2
+; CHECK-NEXT:    v_addc_co_u32_e32 v6, vcc, v1, v3, vcc
+; CHECK-NEXT:    v_cmp_lt_u64_e32 vcc, v[5:6], v[0:1]
+; CHECK-NEXT:    v_mov_b32_e32 v0, v5
+; CHECK-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v4, vcc
+; CHECK-NEXT:    v_mov_b32_e32 v1, v6
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %sum64 = add i64 %val64A, %val64B
+  %obit = icmp ult i64 %sum64, %val64A
+  %obit32 = zext i1 %obit to i32
+  %sum32 = add i32 %val32, %obit32
+  %.fca.0.insert = insertvalue %struct.uint96 poison, i64 %sum64, 0
+  %.fca.1.insert = insertvalue %struct.uint96 %.fca.0.insert, i32 %sum32, 1
+  ret %struct.uint96 %.fca.1.insert
+}
+
+define <2 x i64> @uadd_v2i64(<2 x i64> %val0, <2 x i64> %val1, ptr %ptrval) {
+; CHECK-LABEL: uadd_v2i64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_add_co_u32_e32 v6, vcc, v2, v6
+; CHECK-NEXT:    v_addc_co_u32_e32 v7, vcc, v3, v7, vcc
+; CHECK-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v4
+; CHECK-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v5, vcc
+; CHECK-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
+; CHECK-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT:    v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; CHECK-NEXT:    v_mov_b32_e32 v1, v0
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; CHECK-NEXT:    v_mov_b32_e32 v3, v2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %pair = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
+  %val = extractvalue {<2 x i64>, <2 x i1>} %pair, 0
+  %obit = extractvalue {<2 x i64>, <2 x i1>} %pair, 1
+  %res = sext <2 x i1> %obit to <2 x i64>
+  store <2 x i64> %val, ptr %ptrval
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @usub_v2i64(<2 x i64> %val0, <2 x i64> %val1, ptr %ptrval) {
+; CHECK-LABEL: usub_v2i64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_sub_co_u32_e32 v6, vcc, v2, v6
+; CHECK-NEXT:    v_subb_co_u32_e32 v7, vcc, v3, v7, vcc
+; CHECK-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v4
+; CHECK-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v5, vcc
+; CHECK-NEXT:    v_cmp_gt_u64_e32 vcc, v[4:5], v[0:1]
+; CHECK-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT:    v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
+; CHECK-NEXT:    v_mov_b32_e32 v1, v0
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; CHECK-NEXT:    v_mov_b32_e32 v3, v2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %pair = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
+  %val = extractvalue {<2 x i64>, <2 x i1>} %pair, 0
+  %obit = extractvalue {<2 x i64>, <2 x i1>} %pair, 1
+  %res = sext <2 x i1> %obit to <2 x i64>
+  store <2 x i64> %val, ptr %ptrval
+  ret <2 x i64> %res
+}
+
+define i64 @uadd_i64(i64 %val0, i64 %val1, ptr %ptrval) {
+; CHECK-LABEL: uadd_i64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
+; CHECK-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
+; CHECK-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
+; CHECK-NEXT:    flat_store_dwordx2 v[4:5], v[2:3]
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT:    v_mov_b32_e32 v1, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 %val1)
+  %val = extractvalue {i64, i1} %pair, 0
+  %obit = extractvalue {i64, i1} %pair, 1
+  %res = sext i1 %obit to i64
+  store i64 %val, ptr %ptrval
+  ret i64 %res
+}
+
+define i64 @uadd_p1(i64 %val0, i64 %val1, ptr %ptrval) {
+; CHECK-LABEL: uadd_p1:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; CHECK-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; CHECK-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT:    v_mov_b32_e32 v1, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 1)
+  %val = extractvalue {i64, i1} %pair, 0
+  %obit = extractvalue {i64, i1} %pair, 1
+  %res = sext i1 %obit to i64
+  store i64 %val, ptr %ptrval
+  ret i64 %res
+}
+
+define i64 @uadd_n1(i64 %val0, i64 %val1, ptr %ptrval) {
+; CHECK-LABEL: uadd_n1:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_add_co_u32_e32 v2, vcc, -1, v0
+; CHECK-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; CHECK-NEXT:    flat_store_dwordx2 v[4:5], v[2:3]
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT:    v_mov_b32_e32 v1, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 -1)
+  %val = extractvalue {i64, i1} %pair, 0
+  %obit = extractvalue {i64, i1} %pair, 1
+  %res = sext i1 %obit to i64
+  store i64 %val, ptr %ptrval
+  ret i64 %res
+}
+
+define i64 @usub_p1(i64 %val0, i64 %val1, ptr %ptrval) {
+; CHECK-LABEL: usub_p1:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_add_co_u32_e32 v2, vcc, -1, v0
+; CHECK-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; CHECK-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
+; CHECK-NEXT:    flat_store_dwordx2 v[4:5], v[2:3]
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT:    v_mov_b32_e32 v1, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %pair = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %val0, i64 1)
+  %val = extractvalue {i64, i1} %pair, 0
+  %obit = extractvalue {i64, i1} %pair, 1
+  %res = sext i1 %obit to i64
+  store i64 %val, ptr %ptrval
+  ret i64 %res
+}
+
+define i64 @usub_n1(i64 %val0, i64 %val1, ptr %ptrval) {
+; CHECK-LABEL: usub_n1:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_add_co_u32_e32 v2, vcc, 1, v0
+; CHECK-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; CHECK-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
+; CHECK-NEXT:    flat_store_dwordx2 v[4:5], v[2:3]
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT:    v_mov_b32_e32 v1, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %pair = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %val0, i64 -1)
+  %val = extractvalue {i64, i1} %pair, 0
+  %obit = extractvalue {i64, i1} %pair, 1
+  %res = sext i1 %obit to i64
+  store i64 %val, ptr %ptrval
+  ret i64 %res
+}

@LU-JOHN force-pushed the remove_setcc_test branch from 3617949 to f1f45e9 on August 25, 2025 13:34
@shiltian (Contributor) commented:

Can you stack your next PR that can show the difference?

@LU-JOHN (Author) commented Aug 25, 2025

Can you stack your next PR that can show the difference?

Stacked PR made at #155255

@LU-JOHN force-pushed the remove_setcc_test branch from f1f45e9 to ad39058 on August 25, 2025 15:54
declare {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64>, <2 x i64>)
declare {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64>, <2 x i64>)

define hidden %struct.uint96 @add64_32(i64 %val64A, i64 %val64B, i32 %val32) {
Contributor:
Don't need the hidden. Can you add a v_ prefix to these tests, and add some variants with SGPR inputs?

Contributor Author:
Removed hidden. Added v_ prefix to tests. Made SGPR variants of the tests prefixed with s_.

@LU-JOHN force-pushed the remove_setcc_test branch from d8c0929 to 1d6794d on August 26, 2025 15:12
@LU-JOHN requested a review from arsenm on August 26, 2025 15:52
Signed-off-by: John Lu <John.Lu@amd.com>
Signed-off-by: John Lu <John.Lu@amd.com>
@LU-JOHN force-pushed the remove_setcc_test branch from 1d6794d to 4fbecc2 on August 26, 2025 18:16
; test SGPR
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

define %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B, i32 inreg %val32) {
Contributor:
Suggested change
define %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B, i32 inreg %val32) {
define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B, i32 inreg %val32) {

Contributor Author:
Changed.

;; Test that carryout from 64-bit add/sub (synthesized from two 32-bit adds/subs) is utilized
;; (i.e. no additional compare is generated).

; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
Contributor:
Suggested change
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s

Contributor Author:
Updated to amdpal

Comment on lines 204 to 206
; CHECK-NEXT: v_mov_b32_e32 v1, s5
; CHECK-NEXT: v_mov_b32_e32 v2, s6
; CHECK-NEXT: s_setpc_b64 s[30:31]
Contributor:
None of these SGPR cases have scalar uses, since they use the normal return-in-VGPR convention. This could break if we had a proper optimization that pulls scalar operations whose results are only copied into VGPRs down into VALU operations.

You should either use amdgpu_ps, which uses integer types for SGPR returns, or use inline asm with an "s" use constraint.
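
For illustration, a minimal sketch of the inline-asm alternative (function name and asm string are placeholders, not from this PR); the "s" constraints force the values to be consumed in SGPRs, so the carry computation cannot simply be copied into VGPRs for a vector return:

define void @s_add64_use_sketch(i64 inreg %a, i64 inreg %b) {
  %sum = add i64 %a, %b
  %carry = icmp ult i64 %sum, %a
  %carry32 = zext i1 %carry to i32
  ; Keep both results live in scalar registers via "s" use constraints.
  call void asm sideeffect "; use $0, $1", "s,s"(i64 %sum, i32 %carry32)
  ret void
}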

Contributor Author:
Changed to have SGPR returns and thus SGPR uses.

Signed-off-by: John Lu <John.Lu@amd.com>
Signed-off-by: John Lu <John.Lu@amd.com>
@LU-JOHN requested a review from arsenm on August 27, 2025 16:45
@shiltian (Contributor) left a comment:

Seems like @arsenm's concerns have been resolved. LGTM.

@LU-JOHN merged commit 3e10bdd into llvm:main on Sep 4, 2025
9 checks passed