-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[AMDGPU][True16][Codegen] remove another build_vector pattern from true16 #149861
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
4759607
to
705d7f2
Compare
@llvm/pr-subscribers-backend-amdgpu Author: Brox Chen (broxigarchen) ChangesRemove a 16bit-vgpr32 build_vector pattern from true16 mode. This stop isel from generating illegal "vgpr_32 = COPY vgpr_16". ISel will use vgpr16 build vector pattern in true16 mode instead Patch is 1.95 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/149861.diff 38 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index e8b4501226732..06e40236f65b2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2174,7 +2174,6 @@ def : GCNPat <
}
foreach fp16vt = [f16, bf16] in {
-
def : GCNPat <
(fcopysign fp16vt:$src0, fp16vt:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
@@ -3637,13 +3636,24 @@ def : GCNPat <
>;
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
-let True16Predicate = p in
+let True16Predicate = p in {
// Take the lower 16 bits from each VGPR_32 and concat them
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), (Ty VGPR_32:$b))),
(V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100)))
>;
+// Take the lower 16 bits from V[0] and the upper 16 bits from V[1]
+// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000)
+def : GCNPat <
+ (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a),
+ (Ty !if(!eq(Ty, i16),
+ (Ty (trunc (srl VGPR_32:$b, (i32 16)))),
+ (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))),
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x0000ffff)), VGPR_32:$a, VGPR_32:$b)
+>;
+}
+
let True16Predicate = UseRealTrue16Insts in {
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$a), (Ty VGPR_16:$b))),
@@ -3669,18 +3679,6 @@ def : GCNPat <
(V_AND_B32_e64 (S_MOV_B32 (i32 0xffff0000)), VGPR_32:$b)
>;
-
-// Take the lower 16 bits from V[0] and the upper 16 bits from V[1]
-// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000)
-def : GCNPat <
- (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a),
- (Ty !if(!eq(Ty, i16),
- (Ty (trunc (srl VGPR_32:$b, (i32 16)))),
- (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))),
- (V_BFI_B32_e64 (S_MOV_B32 (i32 0x0000ffff)), VGPR_32:$a, VGPR_32:$b)
->;
-
-
// Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
// Special case, can use V_ALIGNBIT (always uses encoded literal)
let True16Predicate = NotHasTrue16BitInsts in {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index d03d6a8940b2f..46b82d3a3d651 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -25961,22 +25961,64 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32i32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
-; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_2
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v63, v31 :: v_dual_mov_b32 v62, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v61, v29 :: v_dual_mov_b32 v60, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v59, v27 :: v_dual_mov_b32 v58, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v57, v25 :: v_dual_mov_b32 v56, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v23 :: v_dual_mov_b32 v54, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v21 :: v_dual_mov_b32 v52, v20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v19 :: v_dual_mov_b32 v50, v18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v17 :: v_dual_mov_b32 v48, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v47, v15 :: v_dual_mov_b32 v46, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v45, v13 :: v_dual_mov_b32 v44, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v43, v11 :: v_dual_mov_b32 v42, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v41, v9 :: v_dual_mov_b32 v40, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v7 :: v_dual_mov_b32 v38, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v5 :: v_dual_mov_b32 v36, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v3 :: v_dual_mov_b32 v34, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v1 :: v_dual_mov_b32 v32, v0
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
+; GFX11-TRUE16-NEXT: .LBB18_2: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_4
+; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v14
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v32, 16, v15
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v33, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v32, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v32
@@ -25984,539 +26026,551 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v32, 0x7fff
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v35, v38 :: v_dual_and_b32 v15, 0xffff0000, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 0x40c00000, v15 :: v_dual_lshlrev_b32 v14, 16, v14
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v39, v33, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v32.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v15, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v15
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_cndmask_b32 v15, v36, v37
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_cndmask_b32 v47, v36, v37
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v14, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v14
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v33
-; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v32, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, v32.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v33
; GFX11-TRUE16-NEXT: v_add3_u32 v38, v48, v14, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v38, v49, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v35, v36 :: v_dual_add_f32 v34, 0x40c00000, v34
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v33
-; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v13, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34
-; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v36, v38, v13, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v13
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v36, v37 :: v_dual_and_b32 v38, 0xffff0000, v12
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v35, v39, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 0x40c00000, v34 :: v_dual_cndmask_b32 v46, v35, v32
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v12
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36
+; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v33, v34, v15, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v15
+; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v13, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v32, v34, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_cndmask_b32 v45, v33, v35
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, v13.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v12, 0x40c00000, v12 :: v_dual_lshlrev_b32 v11, 16, v11
+; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v14
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v11, 0x40c00000, v11
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add3_u32 v33, v35, v36, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v12, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v12
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v12, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v32, v35, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v11, 16, 1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v11, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v14, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v15, v15, v12, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v15, v33, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v44, v32, v34, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v11
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v35, v37, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, v12.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v15, v15, v13, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v14, v14, v11, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v14, v32 :: v_dual_and_b32 v12, 0xffff0000, v10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v33
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v10, 0x40c00000, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v12, 0x40c00000, v12 :: v_dual_cndmask_b32 v43, v15, v33
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v9
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_lshlrev_b32 v9, 16, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v10
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v10, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v35, v37, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1
-; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v32
-; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v14, v14, v12, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v10, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v13, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v42, v14, v32, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v9
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, v10.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v11, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v12, v12, v9, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v9, v12, v14 :: v_dual_and_b32 v10, 0xffff0000, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v9, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VA...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Its not obvious from the test changes why this was illegal. May need a specific test that fails to compile without this patch.
@@ -201,11 +201,6 @@ define <10 x i16> @bitcast_i160_to_v10i16(i160 %int) { | |||
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 | |||
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 | |||
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 | |||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks wrong
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This removed block is equivalent to:
v0 (line 207)<= v5.l | v0.h (line 204)<= v0.l | v0.h = v0
v2 (line 208)<= v6.l | v2.h (line 205)<= v2.l | v2.h = v2
It's removed since it's a NOP
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see thanks. Then this is a nice change.
This patch aims to fix a functional failure so it might be a bit difficult to find a ll test which triggers the functional error (could be a large test). I'll add a mir test just like in #153143 |
705d7f2
to
8a22470
Compare
added a mir test which includes this pattern and pattern removed from #148715. This is making sure no "vgpr32 = copy vgpr16" is generated from isel |
8a22470
to
6ba4810
Compare
Remove another build_vector pattern which takes a i16 but placed in a VGPR_32 from true16 mode. This stop isel from generating illegal "vgpr_32 = COPY vgpr_16".
ISel will use vgpr16 build vector pattern in true16 mode instead