@@ -629,7 +629,7 @@ define <4 x half> @shuffle_v8f16_0123(<8 x half> addrspace(1)* %arg0, <8 x half>
629
629
; GFX9-LABEL: shuffle_v8f16_0123:
630
630
; GFX9: ; %bb.0:
631
631
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
632
- ; GFX9-NEXT: global_load_dwordx4 v[0:3 ], v[0:1], off
632
+ ; GFX9-NEXT: global_load_dwordx2 v[0:1 ], v[0:1], off
633
633
; GFX9-NEXT: s_waitcnt vmcnt(0)
634
634
; GFX9-NEXT: s_setpc_b64 s[30:31]
635
635
%val0 = load <8 x half >, <8 x half > addrspace (1 )* %arg0
@@ -656,8 +656,10 @@ define <4 x half> @shuffle_v8f16_10_11_2_3(<8 x half> addrspace(1)* %arg0, <8 x
656
656
; GFX9-LABEL: shuffle_v8f16_10_11_2_3:
657
657
; GFX9: ; %bb.0:
658
658
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
659
+ ; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4
659
660
; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4
660
- ; GFX9-NEXT: global_load_dword v0, v[2:3], off offset:4
661
+ ; GFX9-NEXT: s_waitcnt vmcnt(1)
662
+ ; GFX9-NEXT: v_mov_b32_e32 v0, v2
661
663
; GFX9-NEXT: s_waitcnt vmcnt(0)
662
664
; GFX9-NEXT: s_setpc_b64 s[30:31]
663
665
%val0 = load <8 x half >, <8 x half > addrspace (1 )* %arg0
@@ -670,12 +672,12 @@ define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x
670
672
; GFX9-LABEL: shuffle_v8f16_13_14_2_3:
671
673
; GFX9: ; %bb.0:
672
674
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
673
- ; GFX9-NEXT: global_load_dwordx4 v[2:5 ], v[2:3], off
675
+ ; GFX9-NEXT: global_load_dwordx2 v[2:3 ], v[2:3], off offset:8
674
676
; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4
675
677
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
676
678
; GFX9-NEXT: s_waitcnt vmcnt(1)
677
- ; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
678
- ; GFX9-NEXT: v_lshl_or_b32 v0, v5 , 16, v0
679
+ ; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
680
+ ; GFX9-NEXT: v_lshl_or_b32 v0, v3 , 16, v0
679
681
; GFX9-NEXT: s_waitcnt vmcnt(0)
680
682
; GFX9-NEXT: s_setpc_b64 s[30:31]
681
683
%val0 = load <8 x half >, <8 x half > addrspace (1 )* %arg0
@@ -809,6 +811,27 @@ define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half>
809
811
ret <4 x half > %shuffle
810
812
}
811
813
814
; Test kernel: loads an <8 x i32> vector through %in, keeps only elements
; 0..3 with a shufflevector, and stores the resulting <4 x i32> through %out.
; The expected output below reads the vector data with a single four-dword
; scalar load (s_load_dwordx4 into s[4:7]) rather than an eight-dword load,
; then copies s4..s7 into v0..v3 for one global_store_dwordx4.
; NOTE(review): the first s_load_dwordx4 (s[0:3] from s[4:5]) presumably
; fetches the two pointer kernel arguments; s[0:1] is then used as the load
; address and s2/s3 as the store address. Confirm against the kernarg layout.
; NOTE(review): addrspace(4) / addrspace(1) are presumably the AMDGPU constant
; and global address spaces; verify against the AMDGPU usage guide.
+ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123 (<8 x i32 > addrspace (4 )* %in , <4 x i32 > addrspace (1 )* %out ) {
815
+ ; GFX9-LABEL: shuffle_scalar_load_v8i32_0123:
816
+ ; GFX9: ; %bb.0:
817
+ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
818
+ ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
819
+ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
820
+ ; GFX9-NEXT: v_mov_b32_e32 v4, s2
821
+ ; GFX9-NEXT: v_mov_b32_e32 v5, s3
822
+ ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
823
+ ; GFX9-NEXT: v_mov_b32_e32 v0, s4
824
+ ; GFX9-NEXT: v_mov_b32_e32 v1, s5
825
+ ; GFX9-NEXT: v_mov_b32_e32 v2, s6
826
+ ; GFX9-NEXT: v_mov_b32_e32 v3, s7
827
+ ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
828
+ ; GFX9-NEXT: s_endpgm
829
; Full-width load of all eight i32 elements, declared 16-byte aligned.
+ %ld8 = load <8 x i32 >, <8 x i32 > addrspace (4 )* %in , align 16
830
; Identity mask <0,1,2,3> selects the first four elements of %ld8; the second
; shuffle operand is undef and is never referenced by the mask.
+ %id = shufflevector <8 x i32 > %ld8 , <8 x i32 > undef , <4 x i32 > <i32 0 , i32 1 , i32 2 , i32 3 >
831
; Only the four selected elements are written out.
+ store <4 x i32 > %id , <4 x i32 > addrspace (1 )* %out , align 8
832
+ ret void
833
+ }
834
+
812
835
declare <2 x half > @llvm.fma.v2f16 (<2 x half >, <2 x half >, <2 x half >) #0
813
836
declare i32 @llvm.amdgcn.workitem.id.x () #0
814
837
0 commit comments