-
Notifications
You must be signed in to change notification settings - Fork 14.9k
AMDGPU: Add more tests for flat/global atomicrmw with agprs #156874
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: Add more tests for flat/global atomicrmw with agprs #156874
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes: Add comprehensive tests for global atomics with return in agpr / AV usage contexts. Patch is 1.58 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/156874.diff 2 Files Affected:
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
new file mode 100644
index 0000000000000..6b6eb43baf856
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
@@ -0,0 +1,21350 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=CHECK,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=CHECK,GFX950 %s
+
+;---------------------------------------------------------------------
+; xchg i32 cases
+;---------------------------------------------------------------------
+
+; Input and result use AGPR
+define void @flat_atomic_xchg_i32_ret_a_a(ptr %ptr) #0 {
+; GFX90A-LABEL: flat_atomic_xchg_i32_ret_a_a:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def a0
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_invl2
+; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use a0
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: flat_atomic_xchg_i32_ret_a_a:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: ;;#ASMSTART
+; GFX950-NEXT: ; def a0
+; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: buffer_wbl2 sc0 sc1
+; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: buffer_inv sc0 sc1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: ;;#ASMSTART
+; GFX950-NEXT: ; use a0
+; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=a"()
+ %result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
+ call void asm "; use $0", "a"(i32 %result)
+ ret void
+}
+
+; Input is AGPR, result used as VGPR.
+define void @flat_atomic_xchg_i32_ret_a_v(ptr %ptr) #0 {
+; GFX90A-LABEL: flat_atomic_xchg_i32_ret_a_v:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def a0
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_invl2
+; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use v0
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: flat_atomic_xchg_i32_ret_a_v:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: ;;#ASMSTART
+; GFX950-NEXT: ; def a0
+; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: buffer_wbl2 sc0 sc1
+; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: buffer_inv sc0 sc1
+; GFX950-NEXT: ;;#ASMSTART
+; GFX950-NEXT: ; use v0
+; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=a"()
+ %result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
+ call void asm "; use $0", "v"(i32 %result)
+ ret void
+}
+
+; Input is VGPR, result used as AGPR
+define void @flat_atomic_xchg_i32_ret_v_a(ptr %ptr) #0 {
+; GFX90A-LABEL: flat_atomic_xchg_i32_ret_v_a:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_invl2
+; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use a0
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: flat_atomic_xchg_i32_ret_v_a:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: ;;#ASMSTART
+; GFX950-NEXT: ; def v2
+; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: buffer_wbl2 sc0 sc1
+; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: buffer_inv sc0 sc1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: ;;#ASMSTART
+; GFX950-NEXT: ; use a0
+; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=v"()
+ %result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
+ call void asm "; use $0", "a"(i32 %result)
+ ret void
+}
+
+; Input is AV, result also used as AV
+define void @flat_atomic_xchg_i32_ret_av_av(ptr %ptr) #0 {
+; GFX90A-LABEL: flat_atomic_xchg_i32_ret_av_av:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_invl2
+; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use v0
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: flat_atomic_xchg_i32_ret_av_av:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: ;;#ASMSTART
+; GFX950-NEXT: ; def v2
+; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: buffer_wbl2 sc0 sc1
+; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: buffer_inv sc0 sc1
+; GFX950-NEXT: ;;#ASMSTART
+; GFX950-NEXT: ; use v0
+; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=^VA"()
+ %result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+; Input is AV, result used as VGPR
+define void @flat_atomic_xchg_i32_ret_av_v(ptr %ptr) #0 {
+; GFX90A-LABEL: flat_atomic_xchg_i32_ret_av_v:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_invl2
+; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use v0
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: flat_atomic_xchg_i32_ret_av_v:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: ;;#ASMSTART
+; GFX950-NEXT: ; def v2
+; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: buffer_wbl2 sc0 sc1
+; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: buffer_inv sc0 sc1
+; GFX950-NEXT: ;;#ASMSTART
+; GFX950-NEXT: ; use v0
+; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=^VA"()
+ %result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
+ call void asm "; use $0", "v"(i32 %result)
+ ret void
+}
+
+; Input is AV, result used as AGPR
+define void @flat_atomic_xchg_i32_ret_av_a(ptr %ptr) #0 {
+; GFX90A-LABEL: flat_atomic_xchg_i32_ret_av_a:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_invl2
+; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use a0
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: flat_atomic_xchg_i32_ret_av_a:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: ;;#ASMSTART
+; GFX950-NEXT: ; def v2
+; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: buffer_wbl2 sc0 sc1
+; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: buffer_inv sc0 sc1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: ;;#ASMSTART
+; GFX950-NEXT: ; use a0
+; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=^VA"()
+ %result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
+ call void asm "; use $0", "a"(i32 %result)
+ ret void
+}
+
+; Input is AGPR, result used as AV
+define void @flat_atomic_xchg_i32_ret_a_av(ptr %ptr) #0 {
+; GFX90A-LABEL: flat_atomic_xchg_i32_ret_a_av:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def a0
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_invl2
+; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use v0
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: flat_atomic_xchg_i32_ret_a_av:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: ;;#ASMSTART
+; GFX950-NEXT: ; def a0
+; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: buffer_wbl2 sc0 sc1
+; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: buffer_inv sc0 sc1
+; GFX950-NEXT: ;;#ASMSTART
+; GFX950-NEXT: ; use v0
+; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=a"()
+ %result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+; Input is VGPR, result used as AV
+define void @flat_atomic_xchg_i32_ret_v_av(ptr %ptr) #0 {
+; GFX90A-LABEL: flat_atomic_xchg_i32_ret_v_av:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_invl2
+; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use v0
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: flat_atomic_xchg_i32_ret_v_av:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: ;;#ASMSTART
+; GFX950-NEXT: ; def v2
+; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: buffer_wbl2 sc0 sc1
+; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: buffer_inv sc0 sc1
+; GFX950-NEXT: ;;#ASMSTART
+; GFX950-NEXT: ; use v0
+; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=v"()
+ %result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+define void @flat_atomic_xchg_i32_ret_av_av_no_agprs(ptr %ptr) #0 {
+; GFX90A-LABEL: flat_atomic_xchg_i32_ret_av_av_no_agprs:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:31]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def a2
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_invl2
+; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use a0
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use v[0:31]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
+; GFX...
[truncated]
|
This would select the pseudo and then crash when the MC instruction was used. I believe this has been broken since 9912ccb
Add comprehensive tests for global atomics with return in agpr / AV usage contexts.
249e734
to
522784f
Compare
68ea4cd
to
0b03d7f
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The contents of the test files are huge and I'm unable to view the changes in the testcase llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll. There is a suggestion to use a local Git Client to view the changes. Do others see a similar recommendation?
@@ -1,381 +1,633 @@ | |||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 | |||
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s | |||
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=CHECK,GFX90A %s |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Better to use the full triple.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It doesn't really matter; these aren't even testing different entry points.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The files would be better named something like flat-atomicrmw-a-v.ll, so that they do not pop up at the top of the directory sort order but instead sort close to the other flat/global tests.
This matches the identically patterned existing a-v-atomic tests; if you want to rename them, the others should be renamed at the same time. |
I see. It would be good for those who, like me, use console file managers. |
Add comprehensive tests for global atomics with return in
agpr / AV usage contexts.