-
Notifications
You must be signed in to change notification settings - Fork 14.9k
AMDGPU: Add tests for ds_write2 formation with agprs #155765
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
AMDGPU: Add tests for ds_write2 formation with agprs #155765
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes: The current handling for write2 formation is overly conservative and cannot form write2s with AGPR inputs. Patch is 34.46 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/155765.diff 1 file affected:
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-ds-write2.ll b/llvm/test/CodeGen/AMDGPU/a-v-ds-write2.ll
new file mode 100644
index 0000000000000..23ad7e6afa362
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/a-v-ds-write2.ll
@@ -0,0 +1,740 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+
+; Make sure the register class requirements of ds_write2_* instructions
+; are properly respected when they can use AGPRs. Both data operands
+; together must be VGPR or AGPR.
+
+;---------------------------------------------------------------------
+; b32 cases
+;---------------------------------------------------------------------
+
+; Test a pattern that can form ds_write_b32 with data in AGPRs
+; Both stored i32 values are defined by "=a" inline asm and are written to
+; float elements 10 and 24 of %lds. The checks below expect two separate
+; ds_write_b32 instructions (byte offsets 40 and 96), not a ds_write2_b32.
+define void @ds_write2_b32_a_a(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b32_a_a:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def a0
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def a1
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ds_write_b32 v0, a0 offset:40
+; GCN-NEXT: ds_write_b32 v0, a1 offset:96
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+ %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+ %a0 = call i32 asm "; def $0", "=a"()
+ %a1 = call i32 asm "; def $0", "=a"()
+ store i32 %a0, ptr addrspace(3) %gep.0
+ store i32 %a1, ptr addrspace(3) %gep.1
+ ret void
+}
+
+; Mixed case: the value stored to element 10 is defined with "=a" and the
+; value stored to element 24 is defined with "=v". The checks below expect
+; two separate ds_write_b32 instructions (byte offsets 40 and 96).
+define void @ds_write2_b32_a_v(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b32_a_v:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def a0
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def v1
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ds_write_b32 v0, a0 offset:40
+; GCN-NEXT: ds_write_b32 v0, v1 offset:96
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+ %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+ %a0 = call i32 asm "; def $0", "=a"()
+ %v0 = call i32 asm "; def $0", "=v"()
+ store i32 %a0, ptr addrspace(3) %gep.0
+ store i32 %v0, ptr addrspace(3) %gep.1
+ ret void
+}
+
+; Mixed case with the "=v" asm placed before the "=a" asm in the IR.
+; NOTE(review): despite the _v_a name, the "=a" value is still stored to
+; %gep.0 and the "=v" value to %gep.1, which is the same store order as
+; ds_write2_b32_a_v, and the checks below match that test instruction for
+; instruction. ds_write2st64_b32_v_a stores the "=v" value first instead.
+; Confirm which order was intended before relying on this as separate
+; coverage; changing the stores requires regenerating the checks.
+define void @ds_write2_b32_v_a(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b32_v_a:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def a0
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def v1
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ds_write_b32 v0, a0 offset:40
+; GCN-NEXT: ds_write_b32 v0, v1 offset:96
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+ %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+ %v0 = call i32 asm "; def $0", "=v"()
+ %a0 = call i32 asm "; def $0", "=a"()
+ store i32 %a0, ptr addrspace(3) %gep.0
+ store i32 %v0, ptr addrspace(3) %gep.1
+ ret void
+}
+
+; Both stored values are defined with "=v". The checks below expect the two
+; stores to be emitted as a single ds_write2_b32 with offset0:10 offset1:24.
+define void @ds_write2_b32_v_v(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b32_v_v:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def v1
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def v2
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ds_write2_b32 v0, v1, v2 offset0:10 offset1:24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+ %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+ %v0 = call i32 asm "; def $0", "=v"()
+ %v1 = call i32 asm "; def $0", "=v"()
+ store i32 %v0, ptr addrspace(3) %gep.0
+ store i32 %v1, ptr addrspace(3) %gep.1
+ ret void
+}
+
+; Both stored values are defined with the "=^VA" constraint. The checks
+; below expect them to be defined in v1 and v2 and the two stores to be
+; emitted as a single ds_write2_b32 with offset0:10 offset1:24.
+define void @ds_write2_b32_av_av(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b32_av_av:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def v1
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def v2
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ds_write2_b32 v0, v1, v2 offset0:10 offset1:24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+ %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+ %av0 = call i32 asm "; def $0", "=^VA"()
+ %av1 = call i32 asm "; def $0", "=^VA"()
+ store i32 %av0, ptr addrspace(3) %gep.0
+ store i32 %av1, ptr addrspace(3) %gep.1
+ ret void
+}
+
+; Same shape as ds_write2_b32_a_a but the stores target float elements 64
+; and 256. Both values are defined with "=a"; the checks below expect two
+; separate ds_write_b32 instructions (byte offsets 256 and 1024).
+define void @ds_write2st64_b32_a_a(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2st64_b32_a_a:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def a0
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def a1
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ds_write_b32 v0, a0 offset:256
+; GCN-NEXT: ds_write_b32 v0, a1 offset:1024
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 64
+ %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 256
+ %a0 = call i32 asm "; def $0", "=a"()
+ %a1 = call i32 asm "; def $0", "=a"()
+ store i32 %a0, ptr addrspace(3) %gep.0
+ store i32 %a1, ptr addrspace(3) %gep.1
+ ret void
+}
+
+; Mixed case for elements 64 and 256: the first stored value is defined
+; with "=a" and the second with "=v". The checks below expect two separate
+; ds_write_b32 instructions (byte offsets 256 and 1024).
+define void @ds_write2st64_b32_a_v(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2st64_b32_a_v:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def a0
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def v1
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ds_write_b32 v0, a0 offset:256
+; GCN-NEXT: ds_write_b32 v0, v1 offset:1024
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 64
+ %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 256
+ %a0 = call i32 asm "; def $0", "=a"()
+ %v0 = call i32 asm "; def $0", "=v"()
+ store i32 %a0, ptr addrspace(3) %gep.0
+ store i32 %v0, ptr addrspace(3) %gep.1
+ ret void
+}
+
+; Mixed case for elements 64 and 256: the value stored first (%gep.0) is
+; defined with "=v" and the value stored second (%gep.1) with "=a". The
+; checks below expect two separate ds_write_b32 instructions (byte offsets
+; 256 and 1024). The IR value names follow their constraints (%a0 is "=a",
+; %v0 is "=v"); the asm call order and the store operands are unchanged.
+define void @ds_write2st64_b32_v_a(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2st64_b32_v_a:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def v1
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def a0
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ds_write_b32 v0, v1 offset:256
+; GCN-NEXT: ds_write_b32 v0, a0 offset:1024
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 64
+ %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 256
+ %a0 = call i32 asm "; def $0", "=a"()
+ %v0 = call i32 asm "; def $0", "=v"()
+ store i32 %v0, ptr addrspace(3) %gep.0
+ store i32 %a0, ptr addrspace(3) %gep.1
+ ret void
+}
+
+; Both stored values are defined with "=v" and target float elements 64 and
+; 256. The checks below expect a single ds_write2st64_b32 with offset0:1
+; offset1:4.
+define void @ds_write2st64_b32_v_v(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2st64_b32_v_v:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def v1
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def v2
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ds_write2st64_b32 v0, v1, v2 offset0:1 offset1:4
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 64
+ %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 256
+ %v0 = call i32 asm "; def $0", "=v"()
+ %v1 = call i32 asm "; def $0", "=v"()
+ store i32 %v0, ptr addrspace(3) %gep.0
+ store i32 %v1, ptr addrspace(3) %gep.1
+ ret void
+}
+
+; Both stored values are defined with the "=^VA" constraint and target
+; float elements 64 and 256. The checks below expect them to be defined in
+; v1 and v2 and written with a single ds_write2st64_b32 (offset0:1
+; offset1:4).
+define void @ds_write2st64_b32_av_av(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2st64_b32_av_av:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def v1
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def v2
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ds_write2st64_b32 v0, v1, v2 offset0:1 offset1:4
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 64
+ %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 256
+ %av0 = call i32 asm "; def $0", "=^VA"()
+ %av1 = call i32 asm "; def $0", "=^VA"()
+ store i32 %av0, ptr addrspace(3) %gep.0
+ store i32 %av1, ptr addrspace(3) %gep.1
+ ret void
+}
+
+; Both stored values use the "=^VA" constraint, while sideeffect inline asm
+; defines v[0:31] and v[32:63] before the stores and uses both afterwards.
+; The checks below expect the two values to be defined in a1 and a2,
+; copied into v1 and v2 with v_accvgpr_read_b32, and written with a single
+; ds_write2_b32 (offset0:10 offset1:24), surrounded by the spill and
+; reload code needed to free those VGPRs.
+define void @ds_write2_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b32_av_av_no_vgprs:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a0, v0
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def a1
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def a2
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def v[0:31]
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: ds_write2_b32 v0, v1, v2 offset0:10 offset1:24
+; GCN-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GCN-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; use v[0:31]
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+ %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+ %av0 = call i32 asm sideeffect "; def $0", "=^VA"()
+ %av1 = call i32 asm sideeffect "; def $0", "=^VA"()
+ %vgpr.def = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"()
+ %vgpr.0 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 0
+ %vgpr.1 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 1
+ store i32 %av0, ptr addrspace(3) %gep.0
+ store i32 %av1, ptr addrspace(3) %gep.1
+ call void asm sideeffect "; use $0", "{v[0:31]},{v[32:63]}"(<32 x i32> %vgpr.0, <32 x i32> %vgpr.1)
+ ret void
+}
+
+;---------------------------------------------------------------------
+; b64 cases
+;---------------------------------------------------------------------
+
+; Test a pattern that can form ds_write_b64 with data in AGPRs
+; Both stored i64 values are defined by "=a" inline asm and are written at
+; the addresses of float elements 10 and 24 of %lds. The checks below
+; expect two separate ds_write_b64 instructions (byte offsets 40 and 96).
+define void @ds_write2_b64_a_a(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b64_a_a:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def a[0:1]
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def a[2:3]
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ds_write_b64 v0, a[0:1] offset:40
+; GCN-NEXT: ds_write_b64 v0, a[2:3] offset:96
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+ %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+ %a0 = call i64 asm "; def $0", "=a"()
+ %a1 = call i64 asm "; def $0", "=a"()
+ store i64 %a0, ptr addrspace(3) %gep.0
+ store i64 %a1, ptr addrspace(3) %gep.1
+ ret void
+}
+
+; Mixed 64-bit case: the first stored i64 is defined with "=a" and the
+; second with "=v". The checks below expect two separate ds_write_b64
+; instructions (byte offsets 40 and 96).
+define void @ds_write2_b64_a_v(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b64_a_v:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def a[0:1]
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def v[2:3]
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ds_write_b64 v0, a[0:1] offset:40
+; GCN-NEXT: ds_write_b64 v0, v[2:3] offset:96
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+ %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+ %a0 = call i64 asm "; def $0", "=a"()
+ %v0 = call i64 asm "; def $0", "=v"()
+ store i64 %a0, ptr addrspace(3) %gep.0
+ store i64 %v0, ptr addrspace(3) %gep.1
+ ret void
+}
+
+define void @ds_write2_b64_v_a(ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: ds_write2_b64_v_a:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def a[0:1]
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; def v[2:3]
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ds_write_b64 v0, a[0:1] offset:40
+; GCN-NEXT: ds_write_b64 v0, v[2:3] offset:96
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 10
+ %gep.1 = getelementptr inbounds [512 x float], ptr addrspace(3) %lds, i32 0, i32 24
+ %v0 = call i64 asm "; def $0", "=v"()
+ %a0 = call i64 asm "; def $0", "=a"()
+ store i...
[truncated]
|
f315e8b
to
7757076
Compare
7757076
to
e600683
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. Maybe rename the file so that it starts with ds_write2, so that the DS tests sit close together in the directory sort order.
The current handling for write2 formation is overly conservative and cannot form write2s with AGPR inputs.
e600683
to
1e02d4a
Compare
The current handling for write2 formation is overly conservative
and cannot form write2s with AGPR inputs.