diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 46eab2a0a98c7..9cc9af7575db6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2480,6 +2480,22 @@ def : AMDGPUPatIgnoreCopies <
                 (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
 >;
 
+// (z & ~x)
+def : AMDGPUPatIgnoreCopies <
+  (DivergentBinFrag<and> i32:$z, (not_oneuse i32:$x)),
+  (V_BFI_B32_e64 VSrc_b32:$x, (i32 0), VSrc_b32:$z)
+>;
+
+// 64-bit version
+def : AMDGPUPatIgnoreCopies <
+  (DivergentBinFrag<and> i64:$z, (not_oneuse i64:$x)),
+  (REG_SEQUENCE VReg_64,
+    (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), (i32 0),
+      (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0,
+    (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), (i32 0),
+      (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
+>;
+
 // SHA-256 Ch function
 // z ^ (x & (y ^ z))
 def : AMDGPUPatIgnoreCopies <
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
index e1ef3f9be0a5d..aa38c63dc9dcd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -99,15 +99,13 @@ define i32 @v_andn2_i32(i32 %src0, i32 %src1) {
 ; GCN-LABEL: v_andn2_i32:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_not_b32_e32 v1, v1
-; GCN-NEXT: v_and_b32_e32 v0, v0, v1
+; GCN-NEXT: v_bfi_b32 v0, v1, 0, v0
 ; GCN-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_andn2_i32:
 ; GFX10PLUS: ; %bb.0:
 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1
-; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX10PLUS-NEXT: v_bfi_b32 v0, v1, 0, v0
 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
 %not.src1 = xor i32 %src1, -1
 %and = and i32 %src0, %not.src1
@@ -117,14 +115,12 @@ define i32 @v_andn2_i32(i32 %src0, i32 %src1) {
 define amdgpu_ps float @v_andn2_i32_sv(i32 inreg %src0, i32 %src1) {
 ; GCN-LABEL: v_andn2_i32_sv:
 ; GCN: ; %bb.0:
-; GCN-NEXT: v_not_b32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, s2, v0
+; GCN-NEXT: v_bfi_b32 v0, v0, 0, s2
 ; GCN-NEXT: ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: v_andn2_i32_sv:
 ; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0
-; GFX10PLUS-NEXT: v_and_b32_e32 v0, s2, v0
+; GFX10PLUS-NEXT: v_bfi_b32 v0, v0, 0, s2
 ; GFX10PLUS-NEXT: ; return to shader part epilog
 %not.src1 = xor i32 %src1, -1
 %and = and i32 %src0, %not.src1
@@ -135,14 +131,12 @@ define amdgpu_ps float @v_andn2_i32_sv(i32 inreg %src0, i32 %src1) {
 define amdgpu_ps float @v_andn2_i32_vs(i32 %src0, i32 inreg %src1) {
 ; GCN-LABEL: v_andn2_i32_vs:
 ; GCN: ; %bb.0:
-; GCN-NEXT: s_not_b32 s0, s2
-; GCN-NEXT: v_and_b32_e32 v0, s0, v0
+; GCN-NEXT: v_bfi_b32 v0, s2, 0, v0
 ; GCN-NEXT: ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: v_andn2_i32_vs:
 ; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_not_b32 s0, s2
-; GFX10PLUS-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX10PLUS-NEXT: v_bfi_b32 v0, s2, 0, v0
 ; GFX10PLUS-NEXT: ; return to shader part epilog
 %not.src1 = xor i32 %src1, -1
 %and = and i32 %src0, %not.src1
@@ -247,19 +241,15 @@ define i64 @v_andn2_i64(i64 %src0, i64 %src1) {
 ; GCN-LABEL: v_andn2_i64:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_not_b32_e32 v2, v2
-; GCN-NEXT: v_not_b32_e32 v3, v3
-; GCN-NEXT: v_and_b32_e32 v0, v0, v2
-; GCN-NEXT: v_and_b32_e32 v1, v1, v3
+; GCN-NEXT: v_bfi_b32 v0, v2, 0, v0
+; GCN-NEXT: v_bfi_b32 v1, v3, 0, v1
 ; GCN-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_andn2_i64:
 ;
GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_not_b32_e32 v2, v2 -; GFX10PLUS-NEXT: v_not_b32_e32 v3, v3 -; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX10PLUS-NEXT: v_and_b32_e32 v1, v1, v3 +; GFX10PLUS-NEXT: v_bfi_b32 v0, v2, 0, v0 +; GFX10PLUS-NEXT: v_bfi_b32 v1, v3, 0, v1 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor i64 %src1, -1 %and = and i64 %src0, %not.src1 @@ -269,18 +259,14 @@ define i64 @v_andn2_i64(i64 %src0, i64 %src1) { define amdgpu_ps <2 x float> @v_andn2_i64_sv(i64 inreg %src0, i64 %src1) { ; GCN-LABEL: v_andn2_i64_sv: ; GCN: ; %bb.0: -; GCN-NEXT: v_not_b32_e32 v0, v0 -; GCN-NEXT: v_not_b32_e32 v1, v1 -; GCN-NEXT: v_and_b32_e32 v0, s2, v0 -; GCN-NEXT: v_and_b32_e32 v1, s3, v1 +; GCN-NEXT: v_bfi_b32 v0, v0, 0, s2 +; GCN-NEXT: v_bfi_b32 v1, v1, 0, s3 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: v_andn2_i64_sv: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0 -; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1 -; GFX10PLUS-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX10PLUS-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX10PLUS-NEXT: v_bfi_b32 v0, v0, 0, s2 +; GFX10PLUS-NEXT: v_bfi_b32 v1, v1, 0, s3 ; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i64 %src1, -1 %and = and i64 %src0, %not.src1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index fc81e16d68e98..fd329e230e78b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -396,8 +396,7 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX6-NEXT: v_not_b32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX6-NEXT: v_bfi_b32 v2, v2, 0, 7 ; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 @@ -784,19 +783,17 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX6-LABEL: v_fshl_v2i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 8, v2 ; GFX6-NEXT: v_and_b32_e32 v5, 7, v2 -; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX6-NEXT: v_bfi_b32 v2, v2, 0, 7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v5, v0 ; GFX6-NEXT: v_bfe_u32 v5, v1, 1, 7 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v5 +; GFX6-NEXT: v_bfe_u32 v1, v1, 8, 8 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v4 -; GFX6-NEXT: v_not_b32_e32 v4, v4 -; GFX6-NEXT: v_bfe_u32 v1, v1, 8, 8 -; GFX6-NEXT: v_and_b32_e32 v4, 7, v4 +; GFX6-NEXT: v_bfi_b32 v4, v4, 0, 7 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v4, v1 @@ -1184,38 +1181,34 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX6-LABEL: v_fshl_v4i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, 24, v2 ; GFX6-NEXT: v_and_b32_e32 v9, 7, v2 -; GFX6-NEXT: v_not_b32_e32 v2, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 
16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX6-NEXT: v_bfi_b32 v2, v2, 0, 7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v9, v0 ; GFX6-NEXT: v_bfe_u32 v9, v1, 1, 7 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v9 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v6 -; GFX6-NEXT: v_not_b32_e32 v6, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_bfe_u32 v3, v1, 8, 8 -; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX6-NEXT: v_bfi_b32 v6, v6, 0, 7 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 7, v7 -; GFX6-NEXT: v_not_b32_e32 v6, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_bfe_u32 v4, v1, 16, 8 -; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX6-NEXT: v_bfi_b32 v6, v7, 0, 7 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4 -; GFX6-NEXT: v_not_b32_e32 v6, v8 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 7, v8 -; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX6-NEXT: v_bfi_b32 v6, v8, 0, 7 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 25, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v4, v5 @@ -5023,10 +5016,9 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX6-LABEL: v_fshl_i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 63, v4 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: v_not_b32_e32 v4, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 63, v4 +; GFX6-NEXT: v_bfi_b32 v4, v4, 0, 63 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v5 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 @@ -5036,10 +5028,9 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX8-LABEL: v_fshl_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v5, 63, v4 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: v_not_b32_e32 v4, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 63, v4 +; GFX8-NEXT: v_bfi_b32 v4, v4, 0, 63 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] ; GFX8-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 @@ -5049,10 +5040,9 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX9-LABEL: v_fshl_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v5, 63, v4 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: v_not_b32_e32 v4, v4 -; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 63, v4 +; GFX9-NEXT: v_bfi_b32 v4, v4, 0, 63 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 @@ -5062,12 +5052,11 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX10-LABEL: v_fshl_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_not_b32_e32 v5, v4 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 63, v5 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] +; GFX10-NEXT: v_and_b32_e32 v5, 63, v4 +; GFX10-NEXT: v_bfi_b32 v4, v4, 0, 63 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5075,16 +5064,14 
@@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX11-LABEL: v_fshl_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_not_b32_e32 v5, v4 ; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX11-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v5, 63, v5 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] +; GFX11-NEXT: v_and_b32_e32 v5, 63, v4 +; GFX11-NEXT: v_bfi_b32 v4, v4, 0, 63 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt) @@ -5204,10 +5191,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 ; GFX6-LABEL: v_fshl_i64_ssv: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_and_b32_e32 v1, 63, v0 -; GFX6-NEXT: v_not_b32_e32 v0, v0 ; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v1 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 -; GFX6-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX6-NEXT: v_bfi_b32 v0, v0, 0, 63 ; GFX6-NEXT: v_lshr_b64 v[3:4], s[0:1], v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v4 @@ -5216,10 +5202,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 ; GFX8-LABEL: v_fshl_i64_ssv: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_and_b32_e32 v1, 63, v0 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 -; GFX8-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, 63 ; GFX8-NEXT: v_lshrrev_b64 v[3:4], v0, s[0:1] ; GFX8-NEXT: v_or_b32_e32 v0, v1, v3 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v4 @@ -5228,10 +5213,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 ; GFX9-LABEL: v_fshl_i64_ssv: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_and_b32_e32 v1, 63, v0 -; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 -; GFX9-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX9-NEXT: v_bfi_b32 v0, v0, 0, 63 ; GFX9-NEXT: v_lshrrev_b64 v[3:4], v0, s[0:1] ; GFX9-NEXT: v_or_b32_e32 v0, v1, v3 ; GFX9-NEXT: v_or_b32_e32 v1, v2, v4 @@ -5239,11 +5223,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 ; ; GFX10-LABEL: v_fshl_i64_ssv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_not_b32_e32 v1, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 63, v0 +; GFX10-NEXT: v_bfi_b32 v2, v0, 0, 63 ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: v_and_b32_e32 v2, 63, v1 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1] ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3] ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 @@ -5251,16 +5234,14 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 ; ; GFX11-LABEL: v_fshl_i64_ssv: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_not_b32_e32 v1, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 
63, v0 +; GFX11-NEXT: v_bfi_b32 v2, v0, 0, 63 ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v2, 63, v1 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1] ; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt) @@ -5466,18 +5447,16 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX6-LABEL: v_fshl_v2i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v9, 63, v8 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], 1 -; GFX6-NEXT: v_not_b32_e32 v8, v8 -; GFX6-NEXT: v_and_b32_e32 v8, 63, v8 +; GFX6-NEXT: v_and_b32_e32 v9, 63, v8 +; GFX6-NEXT: v_bfi_b32 v8, v8, 0, 63 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v9 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v8 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], 1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v10 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v4 -; GFX6-NEXT: v_not_b32_e32 v4, v10 -; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX6-NEXT: v_bfi_b32 v4, v10, 0, 63 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 @@ -5487,18 +5466,16 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX8-LABEL: v_fshl_v2i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v9, 63, v8 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] -; GFX8-NEXT: v_not_b32_e32 v8, v8 -; GFX8-NEXT: v_and_b32_e32 v8, 63, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 63, v8 +; GFX8-NEXT: v_bfi_b32 v8, v8, 0, 63 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 63, v10 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX8-NEXT: v_not_b32_e32 v4, v10 -; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX8-NEXT: v_bfi_b32 v4, v10, 0, 63 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v6 @@ -5508,18 +5485,16 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX9-LABEL: v_fshl_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v9, 63, v8 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] -; GFX9-NEXT: v_not_b32_e32 v8, v8 -; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 63, v8 +; GFX9-NEXT: v_bfi_b32 v8, v8, 0, 63 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX9-NEXT: v_and_b32_e32 v4, 63, v10 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX9-NEXT: v_not_b32_e32 v4, v10 -; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX9-NEXT: v_bfi_b32 v4, v10, 0, 63 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v6 @@ -5529,18 
+5504,16 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX10-LABEL: v_fshl_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_not_b32_e32 v9, v8 -; GFX10-NEXT: v_not_b32_e32 v11, v10 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX10-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] -; GFX10-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX10-NEXT: v_and_b32_e32 v9, 63, v9 -; GFX10-NEXT: v_and_b32_e32 v10, 63, v10 -; GFX10-NEXT: v_and_b32_e32 v11, 63, v11 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] -; GFX10-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3] -; GFX10-NEXT: v_lshrrev_b64 v[6:7], v11, v[6:7] +; GFX10-NEXT: v_and_b32_e32 v9, 63, v8 +; GFX10-NEXT: v_bfi_b32 v8, v8, 0, 63 +; GFX10-NEXT: v_and_b32_e32 v11, 63, v10 +; GFX10-NEXT: v_bfi_b32 v10, v10, 0, 63 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] +; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7] ; GFX10-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 @@ -5550,20 +5523,18 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX11-LABEL: v_fshl_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_not_b32_e32 v9, v8 -; GFX11-NEXT: v_not_b32_e32 v11, v10 ; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX11-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] -; GFX11-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX11-NEXT: v_and_b32_e32 v9, 63, v9 -; GFX11-NEXT: v_and_b32_e32 v10, 63, v10 -; GFX11-NEXT: v_and_b32_e32 v11, 63, v11 +; GFX11-NEXT: v_and_b32_e32 v9, 63, v8 +; GFX11-NEXT: v_bfi_b32 v8, v8, 0, 63 +; GFX11-NEXT: v_and_b32_e32 v11, 63, v10 +; GFX11-NEXT: v_bfi_b32 v10, v10, 0, 63 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] -; GFX11-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX11-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3] -; GFX11-NEXT: v_lshrrev_b64 v[6:7], v11, v[6:7] +; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] +; GFX11-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX11-NEXT: v_or_b32_e32 v1, v1, v5 @@ -5818,32 +5789,32 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX6-LABEL: v_fshl_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 64, v15 -; GFX6-NEXT: v_add_i32_e32 v17, vcc, 0xffffffc0, v15 +; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v8 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 64, v16 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, 0xffffffc0, v16 ; GFX6-NEXT: v_lshr_b64 v[9:10], v[0:1], v9 -; GFX6-NEXT: v_lshl_b64 v[11:12], v[2:3], v15 -; GFX6-NEXT: v_lshl_b64 v[13:14], v[0:1], v15 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v17 +; GFX6-NEXT: v_lshl_b64 v[11:12], v[2:3], v16 +; GFX6-NEXT: v_lshl_b64 v[13:14], v[0:1], v16 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v18 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX6-NEXT: v_or_b32_e32 v10, v10, v12 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, 
v16 ; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 ; GFX6-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc ; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], 1 +; GFX6-NEXT: v_mov_b32_e32 v15, 0x7f ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v6 -; GFX6-NEXT: v_not_b32_e32 v4, v8 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[6:7], 1 -; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v4 -; GFX6-NEXT: v_not_b32_e32 v16, 63 +; GFX6-NEXT: v_bfi_b32 v14, v8, 0, v15 +; GFX6-NEXT: v_not_b32_e32 v17, 63 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v14 -; GFX6-NEXT: v_add_i32_e32 v15, vcc, v14, v16 +; GFX6-NEXT: v_add_i32_e32 v15, vcc, v14, v17 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], v14 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], v6 ; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v14 @@ -5867,32 +5838,32 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX8-LABEL: v_fshl_i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX8-NEXT: v_sub_u32_e32 v9, vcc, 64, v15 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xffffffc0, v15 +; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v8 +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, 64, v16 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xffffffc0, v16 ; GFX8-NEXT: v_lshrrev_b64 v[9:10], v9, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[13:14], v15, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v17, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[11:12], v16, v[2:3] +; GFX8-NEXT: v_lshlrev_b64 v[13:14], v16, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v18, v[0:1] ; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX8-NEXT: v_or_b32_e32 v10, v10, v12 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 ; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v15, 0x7f ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v6 -; GFX8-NEXT: v_not_b32_e32 v4, v8 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[6:7] -; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v4 -; GFX8-NEXT: v_not_b32_e32 v16, 63 +; GFX8-NEXT: v_bfi_b32 v14, v8, 0, v15 +; GFX8-NEXT: v_not_b32_e32 v17, 63 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v14 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, v14, v16 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v14, v17 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] ; GFX8-NEXT: v_lshrrev_b64 v[8:9], v14, v[2:3] @@ -5916,27 +5887,27 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX9-LABEL: v_fshl_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX9-NEXT: v_sub_u32_e32 v9, 64, v15 -; GFX9-NEXT: v_add_u32_e32 v16, 0xffffffc0, v15 +; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v8 +; GFX9-NEXT: v_sub_u32_e32 v9, 64, v16 +; GFX9-NEXT: v_add_u32_e32 v17, 0xffffffc0, v16 ; GFX9-NEXT: v_lshrrev_b64 v[9:10], v9, v[0:1] -; 
GFX9-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[13:14], v15, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v16, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[11:12], v16, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[13:14], v16, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v17, v[0:1] ; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX9-NEXT: v_or_b32_e32 v10, v10, v12 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 ; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v9, v1, v10, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v15, 0x7f ; GFX9-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5] -; GFX9-NEXT: v_not_b32_e32 v4, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v9, v3, vcc ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[6:7] -; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v4 +; GFX9-NEXT: v_bfi_b32 v14, v8, 0, v15 ; GFX9-NEXT: v_lshl_or_b32 v1, v6, 31, v1 ; GFX9-NEXT: v_sub_u32_e32 v6, 64, v14 ; GFX9-NEXT: v_add_u32_e32 v15, 0xffffffc0, v14 @@ -5963,99 +5934,96 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX10-LABEL: v_fshl_i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v18, 0x7f, v8 -; GFX10-NEXT: v_not_b32_e32 v12, v8 +; GFX10-NEXT: v_and_b32_e32 v17, 0x7f, v8 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] -; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v18 -; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v12 -; GFX10-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v18 -; GFX10-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] +; GFX10-NEXT: v_bfi_b32 v18, v8, 0, 0x7f +; GFX10-NEXT: v_lshrrev_b64 v[9:10], 1, v[6:7] +; GFX10-NEXT: v_sub_nc_u32_e32 v11, 64, v17 +; GFX10-NEXT: v_lshlrev_b64 v[7:8], v17, v[2:3] +; GFX10-NEXT: v_add_nc_u32_e32 v15, 0xffffffc0, v17 ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 31, v5 -; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] -; GFX10-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] -; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19 -; GFX10-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1] -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 -; GFX10-NEXT: v_or_b32_e32 v8, v10, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v19 -; GFX10-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] -; GFX10-NEXT: v_or_b32_e32 v11, v11, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v20, v0, v8, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[8:9], v10, v[6:7] -; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v19 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v18 -; GFX10-NEXT: v_or_b32_e32 v0, v14, v16 -; GFX10-NEXT: v_or_b32_e32 v10, v15, v17 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v19 -; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v0, s5 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] -; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v10, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v18 +; GFX10-NEXT: v_lshrrev_b64 v[11:12], v11, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[13:14], v17, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v15, v[0:1] +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v17 +; GFX10-NEXT: v_add_nc_u32_e32 v19, 0xffffffc0, v18 +; GFX10-NEXT: v_lshlrev_b64 v[15:16], v16, v[9:10] +; GFX10-NEXT: v_or_b32_e32 
v11, v11, v7 +; GFX10-NEXT: v_lshrrev_b64 v[6:7], v18, v[4:5] +; GFX10-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v18 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v20, v0, v11, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[11:12], v19, v[9:10] +; GFX10-NEXT: v_or_b32_e32 v0, v6, v15 +; GFX10-NEXT: v_or_b32_e32 v6, v7, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v8, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v18 +; GFX10-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v8, v11, v0, s5 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v18, v[9:10] +; GFX10-NEXT: v_cndmask_b32_e64 v6, v12, v6, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v9, 0, v14, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v20, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v4, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, v5, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v0, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v1, s5 -; GFX10-NEXT: v_or_b32_e32 v0, v12, v4 -; GFX10-NEXT: v_or_b32_e32 v1, v7, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, v1, s5 +; GFX10-NEXT: v_or_b32_e32 v0, v13, v4 +; GFX10-NEXT: v_or_b32_e32 v1, v9, v5 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshl_i128: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v18, 0x7f, v8 -; GFX11-NEXT: v_not_b32_e32 v12, v8 +; GFX11-NEXT: v_and_b32_e32 v17, 0x7f, v8 ; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v18 -; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v12 -; GFX11-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] -; GFX11-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v18 -; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] -; GFX11-NEXT: v_lshl_or_b32 v5, v6, 31, v5 -; GFX11-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] -; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v19 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1] -; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v19 -; GFX11-NEXT: v_or_b32_e32 v8, v10, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v19 -; GFX11-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] -; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] -; GFX11-NEXT: v_or_b32_e32 v11, v11, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v20, v0, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b64 v[8:9], v10, v[6:7] -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v19 -; GFX11-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v0, v14, v16 -; GFX11-NEXT: v_or_b32_e32 v10, v15, v17 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v18 +; GFX11-NEXT: v_bfi_b32 v18, v8, 0, 0x7f +; GFX11-NEXT: v_lshrrev_b64 v[9:10], 1, v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v0, s1 -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] -; GFX11-NEXT: v_cndmask_b32_e64 v6, v9, v10, s1 -; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo +; GFX11-NEXT: v_sub_nc_u32_e32 v11, 64, v17 +; GFX11-NEXT: v_lshlrev_b64 v[7:8], v17, v[2:3] +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0xffffffc0, v17 +; GFX11-NEXT: v_lshl_or_b32 v5, v6, 31, v5 +; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v18 +; GFX11-NEXT: v_lshrrev_b64 
v[11:12], v11, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[13:14], v17, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v15, v[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 0xffffffc0, v18 +; GFX11-NEXT: v_lshlrev_b64 v[15:16], v16, v[9:10] +; GFX11-NEXT: v_or_b32_e32 v11, v11, v7 +; GFX11-NEXT: v_lshrrev_b64 v[6:7], v18, v[4:5] +; GFX11-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v18 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v0, v11, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[11:12], v19, v[9:10] +; GFX11-NEXT: v_or_b32_e32 v0, v6, v15 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v8, v11, v0, s1 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v18, v[9:10] +; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v6, s1 +; GFX11-NEXT: v_cndmask_b32_e32 v9, 0, v14, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v20, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v6, v5, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, v0, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v1, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, v1, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v0, v12, v4 -; GFX11-NEXT: v_or_b32_e32 v1, v7, v5 +; GFX11-NEXT: v_or_b32_e32 v0, v13, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v9, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX11-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) ret i128 %result @@ -6064,264 +6032,260 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) { ; GFX6-LABEL: v_fshl_i128_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7 +; GFX6-NEXT: v_and_b32_e32 v8, 0x7f, v0 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v8 ; GFX6-NEXT: v_lshr_b64 v[1:2], s[0:1], v1 -; GFX6-NEXT: v_lshl_b64 v[3:4], s[2:3], v7 -; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0xffffffc0, v7 -; GFX6-NEXT: v_lshl_b64 v[5:6], s[0:1], v7 +; GFX6-NEXT: v_lshl_b64 v[3:4], s[2:3], v8 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, 0xffffffc0, v8 +; GFX6-NEXT: v_lshl_b64 v[5:6], s[0:1], v8 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_or_b32_e32 v4, v2, v4 -; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v9 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX6-NEXT: v_not_b32_e32 v0, v0 +; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v10 +; GFX6-NEXT: v_mov_b32_e32 v7, 0x7f +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 ; GFX6-NEXT: s_mov_b32 s8, 0 -; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v5, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_mov_b32_e32 v3, s2 ; GFX6-NEXT: v_mov_b32_e32 v4, s3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX6-NEXT: s_lshl_b32 s9, s6, 31 -; GFX6-NEXT: v_and_b32_e32 v11, 0x7f, v0 -; 
GFX6-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v10, v2, v4, vcc +; GFX6-NEXT: v_bfi_b32 v7, v0, 0, v7 +; GFX6-NEXT: v_cndmask_b32_e32 v8, v1, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v11, v2, v4, vcc ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX6-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v11 -; GFX6-NEXT: v_not_b32_e32 v8, 63 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v11 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v7 +; GFX6-NEXT: v_not_b32_e32 v9, 63 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v7 ; GFX6-NEXT: v_lshl_b64 v[2:3], s[2:3], v2 -; GFX6-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v7, v9 ; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[2:3], v8 -; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v11 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[2:3], v9 +; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v7 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v10, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v8, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v11, v3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshl_i128_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v0 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7 +; GFX8-NEXT: v_and_b32_e32 v8, 0x7f, v0 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v8 ; GFX8-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3] -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffffc0, v7 -; GFX8-NEXT: v_lshlrev_b64 v[5:6], v7, s[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[3:4], v8, s[2:3] +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xffffffc0, v8 +; GFX8-NEXT: v_lshlrev_b64 v[5:6], v8, s[0:1] ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_or_b32_e32 v4, v2, v4 -; GFX8-NEXT: v_lshlrev_b64 v[1:2], v9, s[0:1] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX8-NEXT: v_not_b32_e32 v0, v0 +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v10, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v7, 0x7f +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 ; GFX8-NEXT: s_mov_b32 s8, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX8-NEXT: s_lshl_b32 s9, s6, 31 -; GFX8-NEXT: v_and_b32_e32 v11, 0x7f, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v2, v4, vcc +; GFX8-NEXT: v_bfi_b32 v7, v0, 0, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v1, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v11, v2, v4, vcc ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX8-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 -; GFX8-NEXT: 
v_sub_u32_e32 v2, vcc, 64, v11 -; GFX8-NEXT: v_not_b32_e32 v8, 63 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[0:1] +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v7 +; GFX8-NEXT: v_not_b32_e32 v9, 63 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v7, s[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v11, v8 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v7, v9 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v8, s[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v11, s[2:3] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v9, s[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v7, s[2:3] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v10, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v8, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v11, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshl_i128_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0x7f, v0 +; GFX9-NEXT: v_sub_u32_e32 v1, 64, v8 ; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3] -; GFX9-NEXT: v_add_u32_e32 v8, 0xffffffc0, v7 -; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, s[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[3:4], v8, s[2:3] +; GFX9-NEXT: v_add_u32_e32 v9, 0xffffffc0, v8 +; GFX9-NEXT: v_lshlrev_b64 v[5:6], v8, s[0:1] ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_or_b32_e32 v4, v2, v4 -; GFX9-NEXT: v_lshlrev_b64 v[1:2], v8, s[0:1] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX9-NEXT: v_not_b32_e32 v0, v0 +; GFX9-NEXT: v_lshlrev_b64 v[1:2], v9, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v7, 0x7f +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 ; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX9-NEXT: s_lshl_b32 s9, s6, 31 -; GFX9-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX9-NEXT: v_bfi_b32 v7, v0, 0, v7 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v2, v4, vcc ; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX9-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 -; GFX9-NEXT: v_sub_u32_e32 v2, 64, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v10, s[0:1] +; GFX9-NEXT: v_sub_u32_e32 v2, 64, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v7, s[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] -; GFX9-NEXT: v_add_u32_e32 v11, 0xffffffc0, v10 +; GFX9-NEXT: v_add_u32_e32 v11, 0xffffffc0, v7 ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; 
GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v10, s[2:3] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v7, s[2:3] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; GFX9-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX9-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX9-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX9-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX9-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX9-NEXT: v_or_b32_e32 v2, v8, v2 +; GFX9-NEXT: v_or_b32_e32 v3, v10, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshl_i128_ssv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v0 -; GFX10-NEXT: v_not_b32_e32 v6, v0 +; GFX10-NEXT: v_and_b32_e32 v11, 0x7f, v0 +; GFX10-NEXT: v_bfi_b32 v12, v0, 0, 0x7f ; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; GFX10-NEXT: s_lshl_b32 s9, s6, 31 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3] -; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v6 -; GFX10-NEXT: v_add_nc_u32_e32 v7, 0xffffffc0, v12 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 64, v11 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0xffffffc0, v11 +; GFX10-NEXT: v_lshlrev_b64 v[1:2], v11, s[2:3] +; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v12 ; GFX10-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[3:4], v3, s[0:1] ; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 -; GFX10-NEXT: v_lshlrev_b64 v[6:7], v7, s[0:1] -; GFX10-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v13 -; GFX10-NEXT: v_lshrrev_b64 v[8:9], v13, s[8:9] -; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v13 -; GFX10-NEXT: v_lshlrev_b64 v[4:5], v12, s[0:1] -; GFX10-NEXT: v_cmp_gt_u32_e64 s1, 64, v13 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo -; GFX10-NEXT: v_lshlrev_b64 v[10:11], v2, s[6:7] -; GFX10-NEXT: v_or_b32_e32 v2, v3, v1 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7] -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v13 -; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v3, v8, v10 -; GFX10-NEXT: v_or_b32_e32 v8, v9, v11 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s1 +; GFX10-NEXT: v_lshlrev_b64 v[7:8], v7, s[0:1] +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v11 +; GFX10-NEXT: v_add_nc_u32_e32 v13, 0xffffffc0, v12 +; GFX10-NEXT: v_lshlrev_b64 v[9:10], v9, s[6:7] +; GFX10-NEXT: v_or_b32_e32 v3, v3, v1 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v12, s[8:9] +; GFX10-NEXT: v_or_b32_e32 v4, v4, v2 +; GFX10-NEXT: v_lshlrev_b64 v[5:6], v11, s[0:1] +; GFX10-NEXT: v_cmp_gt_u32_e64 s1, 64, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s3, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s8, s4 -; 
GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s9, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s1 -; GFX10-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX10-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX10-NEXT: v_or_b32_e32 v9, v1, v10 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v0, s1 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v12, s[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v9, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s3, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s8, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s9, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v0, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v1, s1 +; GFX10-NEXT: v_or_b32_e32 v0, v5, v2 +; GFX10-NEXT: v_or_b32_e32 v1, v6, v3 +; GFX10-NEXT: v_or_b32_e32 v2, v7, v8 +; GFX10-NEXT: v_or_b32_e32 v3, v4, v9 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_fshl_i128_ssv: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v0 -; GFX11-NEXT: v_not_b32_e32 v6, v0 -; GFX11-NEXT: s_lshl_b32 s9, s6, 31 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 +; GFX11-NEXT: v_and_b32_e32 v11, 0x7f, v0 +; GFX11-NEXT: v_bfi_b32 v12, v0, 0, 0x7f ; GFX11-NEXT: s_mov_b32 s8, 0 -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v12 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 -; GFX11-NEXT: v_and_b32_e32 v13, 0x7f, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0xffffffc0, v12 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] ; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_lshl_b32 s9, s6, 31 +; GFX11-NEXT: v_lshlrev_b64 v[5:6], v11, s[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v11 +; GFX11-NEXT: v_sub_nc_u32_e32 v9, 64, v12 ; GFX11-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] -; GFX11-NEXT: v_lshlrev_b64 v[6:7], v7, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v13 -; GFX11-NEXT: v_lshlrev_b64 v[10:11], v2, s[6:7] -; GFX11-NEXT: v_or_b32_e32 v2, v3, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v2, vcc_lo -; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, s[0:1] -; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v12 -; GFX11-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo -; GFX11-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v13 -; GFX11-NEXT: v_lshrrev_b64 v[8:9], v13, s[8:9] -; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v13 -; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v13 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v6 :: v_dual_add_nc_u32 v13, 0xffffffc0, v12 +; GFX11-NEXT: v_sub_nc_u32_e32 v3, 64, v11 +; GFX11-NEXT: v_lshlrev_b64 v[1:2], v11, s[2:3] +; GFX11-NEXT: v_lshlrev_b64 v[9:10], v9, s[6:7] +; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 ; GFX11-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7] -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s2, s0 
-; GFX11-NEXT: v_or_b32_e32 v3, v8, v10 -; GFX11-NEXT: v_or_b32_e32 v8, v9, v11 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s3, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v3, s1 +; GFX11-NEXT: v_lshrrev_b64 v[3:4], v3, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v4, v4, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0xffffffc0, v11 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v1 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v12, s[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b64 v[7:8], v7, s[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v11 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX11-NEXT: v_or_b32_e32 v9, v1, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc_lo ; GFX11-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v8, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s8, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s9, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s1 -; GFX11-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s2, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v0, s1 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v12, s[6:7] +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v9, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s3, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s8, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s9, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v1, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX11-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v5, v2 +; GFX11-NEXT: v_or_b32_e32 v1, v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v9 ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> @@ -7445,185 +7409,183 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-LABEL: v_fshl_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v23 -; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v23 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[8:9], 1 -; GFX6-NEXT: v_not_b32_e32 v16, v16 -; GFX6-NEXT: v_or_b32_e32 v21, v17, v21 -; GFX6-NEXT: v_lshlrev_b32_e32 v17, 31, v10 -; GFX6-NEXT: v_lshr_b64 v[10:11], v[10:11], 1 -; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX6-NEXT: v_or_b32_e32 v9, v9, v17 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v24 -; GFX6-NEXT: v_or_b32_e32 v22, v18, v22 +; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, 
v16 +; GFX6-NEXT: v_not_b32_e32 v18, 63 +; GFX6-NEXT: v_sub_i32_e32 v23, vcc, 64, v19 +; GFX6-NEXT: v_add_i32_e32 v27, vcc, v19, v18 +; GFX6-NEXT: v_lshr_b64 v[23:24], v[0:1], v23 +; GFX6-NEXT: v_lshl_b64 v[25:26], v[2:3], v19 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[0:1], v19 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v27 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX6-NEXT: v_or_b32_e32 v19, v23, v25 +; GFX6-NEXT: v_or_b32_e32 v23, v24, v26 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v19, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v23, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v19, v0, v2, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v23, v1, v3, s[4:5] +; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], 1 +; GFX6-NEXT: v_mov_b32_e32 v17, 0x7f +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v10 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], 1 +; GFX6-NEXT: v_bfi_b32 v10, v16, 0, v17 +; GFX6-NEXT: v_cndmask_b32_e32 v24, 0, v21, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v25, 0, v22, vcc +; GFX6-NEXT: v_add_i32_e32 v16, vcc, v10, v18 +; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 64, v10 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v10 +; GFX6-NEXT: v_lshr_b64 v[10:11], v[0:1], v10 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v21 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v16 +; GFX6-NEXT: v_or_b32_e32 v10, v10, v21 +; GFX6-NEXT: v_or_b32_e32 v11, v11, v22 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v20 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v16 +; GFX6-NEXT: v_lshr_b64 v[10:11], v[4:5], v10 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[6:7], v16 +; GFX6-NEXT: v_or_b32_e32 v2, v19, v2 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v16, v18 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], v16 +; GFX6-NEXT: v_or_b32_e32 v16, v10, v21 +; GFX6-NEXT: v_or_b32_e32 v21, v11, v22 +; GFX6-NEXT: v_lshl_b64 v[10:11], v[4:5], v19 +; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v8, v10, v16, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, v11, v21, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[4:5] +; GFX6-NEXT: v_lshr_b64 v[8:9], v[12:13], 1 +; GFX6-NEXT: v_lshlrev_b32_e32 v10, 31, v14 +; GFX6-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX6-NEXT: v_lshr_b64 v[10:11], v[14:15], 1 +; GFX6-NEXT: v_bfi_b32 v14, v20, 0, v17 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v14, v18 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v14 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 +; GFX6-NEXT: v_lshr_b64 v[12:13], v[10:11], v14 +; GFX6-NEXT: v_lshr_b64 v[14:15], v[8:9], v14 ; GFX6-NEXT: v_lshl_b64 v[16:17], v[10:11], v16 -; GFX6-NEXT: v_lshr_b64 v[18:19], v[8:9], v24 -; GFX6-NEXT: v_not_b32_e32 v25, 63 -; GFX6-NEXT: v_or_b32_e32 v18, v18, v16 -; GFX6-NEXT: v_add_i32_e32 v16, vcc, v23, v25 -; GFX6-NEXT: v_or_b32_e32 v19, v19, v17 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[0:1], v16 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v23 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX6-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc -; 
GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 -; GFX6-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] -; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], v24, v25 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], v0 -; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc -; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v24 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v26, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v17, v8 -; GFX6-NEXT: v_and_b32_e32 v17, 0x7f, v20 -; GFX6-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5] -; GFX6-NEXT: v_or_b32_e32 v1, v18, v3 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 64, v17 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v3 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v17 -; GFX6-NEXT: v_or_b32_e32 v3, v16, v19 -; GFX6-NEXT: v_add_i32_e32 v16, vcc, v17, v25 -; GFX6-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX6-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], v17 -; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v16 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 -; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v9, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; GFX6-NEXT: v_cndmask_b32_e32 v17, v4, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc -; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], 1 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 31, v14 -; GFX6-NEXT: v_not_b32_e32 v8, v20 -; GFX6-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], 1 -; GFX6-NEXT: v_and_b32_e32 v12, 0x7f, v8 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v12 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v12 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v10 -; GFX6-NEXT: v_add_i32_e32 v13, vcc, v12, v25 -; GFX6-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX6-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[6:7], v12 -; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v13 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 -; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 -; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v4, v16, v4 -; GFX6-NEXT: v_or_b32_e32 v5, v18, v5 -; GFX6-NEXT: v_or_b32_e32 v6, v17, v6 -; GFX6-NEXT: v_or_b32_e32 v7, v19, v7 +; GFX6-NEXT: v_lshr_b64 v[10:11], v[10:11], v18 +; GFX6-NEXT: v_or_b32_e32 v14, v14, v16 +; GFX6-NEXT: v_or_b32_e32 v15, v15, v17 +; GFX6-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc +; GFX6-NEXT: v_or_b32_e32 v0, v24, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v25, v1 +; GFX6-NEXT: v_or_b32_e32 v3, v23, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX6-NEXT: 
v_or_b32_e32 v6, v6, v10 +; GFX6-NEXT: v_or_b32_e32 v7, v7, v11 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v23 -; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] -; GFX8-NEXT: v_not_b32_e32 v16, v16 -; GFX8-NEXT: v_or_b32_e32 v21, v17, v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 31, v10 -; GFX8-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] -; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX8-NEXT: v_or_b32_e32 v9, v9, v17 -; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v24 -; GFX8-NEXT: v_or_b32_e32 v22, v18, v22 +; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v16 +; GFX8-NEXT: v_not_b32_e32 v18, 63 +; GFX8-NEXT: v_sub_u32_e32 v23, vcc, 64, v19 +; GFX8-NEXT: v_add_u32_e32 v27, vcc, v19, v18 +; GFX8-NEXT: v_lshrrev_b64 v[23:24], v23, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[25:26], v19, v[2:3] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v27, v[0:1] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX8-NEXT: v_or_b32_e32 v19, v23, v25 +; GFX8-NEXT: v_or_b32_e32 v23, v24, v26 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v19, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v23, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v19, v0, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v23, v1, v3, s[4:5] +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v17, 0x7f +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v10 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11] +; GFX8-NEXT: v_bfi_b32 v10, v16, 0, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v24, 0, v21, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v25, 0, v22, vcc +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v10, v18 +; GFX8-NEXT: v_sub_u32_e32 v21, vcc, 64, v10 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v10, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v21, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v16, v[2:3] +; GFX8-NEXT: v_or_b32_e32 v10, v10, v21 +; GFX8-NEXT: v_or_b32_e32 v11, v11, v22 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v20 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc +; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v16 +; GFX8-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v16, v[6:7] +; GFX8-NEXT: v_or_b32_e32 v2, v19, v2 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v16, v18 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; GFX8-NEXT: v_lshlrev_b64 v[8:9], v16, v[4:5] +; GFX8-NEXT: v_or_b32_e32 v16, v10, v21 +; GFX8-NEXT: v_or_b32_e32 v21, v11, v22 +; GFX8-NEXT: v_lshlrev_b64 v[10:11], v19, v[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v8, v10, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v21, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 
31, v14 +; GFX8-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX8-NEXT: v_lshrrev_b64 v[10:11], 1, v[14:15] +; GFX8-NEXT: v_bfi_b32 v14, v20, 0, v17 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v14, v18 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v14 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 +; GFX8-NEXT: v_lshrrev_b64 v[12:13], v14, v[10:11] +; GFX8-NEXT: v_lshrrev_b64 v[14:15], v14, v[8:9] ; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11] -; GFX8-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9] -; GFX8-NEXT: v_not_b32_e32 v25, 63 -; GFX8-NEXT: v_or_b32_e32 v18, v18, v16 -; GFX8-NEXT: v_add_u32_e32 v16, vcc, v23, v25 -; GFX8-NEXT: v_or_b32_e32 v19, v19, v17 -; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 -; GFX8-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] -; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v24, v25 -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v0, v[10:11] -; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v26, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v17, v8 -; GFX8-NEXT: v_and_b32_e32 v17, 0x7f, v20 -; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5] -; GFX8-NEXT: v_or_b32_e32 v1, v18, v3 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 64, v17 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v3, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v17, v[6:7] -; GFX8-NEXT: v_or_b32_e32 v3, v16, v19 -; GFX8-NEXT: v_add_u32_e32 v16, vcc, v17, v25 -; GFX8-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX8-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX8-NEXT: v_lshlrev_b64 v[8:9], v17, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v17, v4, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc -; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 31, v14 -; GFX8-NEXT: v_not_b32_e32 v8, v20 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] -; GFX8-NEXT: v_and_b32_e32 v12, 0x7f, v8 -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v12 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v12, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] -; GFX8-NEXT: v_add_u32_e32 v13, vcc, v12, v25 -; GFX8-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX8-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v12, v[6:7] -; GFX8-NEXT: v_lshrrev_b64 v[6:7], v13, v[6:7] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] -; GFX8-NEXT: 
v_cndmask_b32_e64 v5, v7, v5, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v4, v16, v4 -; GFX8-NEXT: v_or_b32_e32 v5, v18, v5 -; GFX8-NEXT: v_or_b32_e32 v6, v17, v6 -; GFX8-NEXT: v_or_b32_e32 v7, v19, v7 +; GFX8-NEXT: v_lshrrev_b64 v[10:11], v18, v[10:11] +; GFX8-NEXT: v_or_b32_e32 v14, v14, v16 +; GFX8-NEXT: v_or_b32_e32 v15, v15, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc +; GFX8-NEXT: v_or_b32_e32 v0, v24, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v25, v1 +; GFX8-NEXT: v_or_b32_e32 v3, v23, v3 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v10 +; GFX8-NEXT: v_or_b32_e32 v7, v7, v11 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshl_v2i128: @@ -7632,17 +7594,17 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] ; GFX9-NEXT: v_sub_u32_e32 v17, 64, v23 -; GFX9-NEXT: v_not_b32_e32 v16, v16 +; GFX9-NEXT: v_mov_b32_e32 v24, 0x7f ; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] ; GFX9-NEXT: v_lshl_or_b32 v9, v10, 31, v9 ; GFX9-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] -; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX9-NEXT: v_sub_u32_e32 v16, 64, v24 +; GFX9-NEXT: v_bfi_b32 v25, v16, 0, v24 +; GFX9-NEXT: v_sub_u32_e32 v16, 64, v25 ; GFX9-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX9-NEXT: v_or_b32_e32 v22, v18, v22 ; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11] -; GFX9-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9] +; GFX9-NEXT: v_lshrrev_b64 v[18:19], v25, v[8:9] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 ; GFX9-NEXT: v_or_b32_e32 v18, v18, v16 ; GFX9-NEXT: v_add_u32_e32 v16, 0xffffffc0, v23 @@ -7650,48 +7612,47 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v25, 0, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] -; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v24 +; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v25 ; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] ; GFX9-NEXT: v_lshrrev_b64 v[2:3], v0, v[10:11] -; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v25, v[10:11] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, v1, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] -; GFX9-NEXT: v_or_b32_e32 v1, v18, v3 -; GFX9-NEXT: v_or_b32_e32 v3, v16, v9 -; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v20 -; GFX9-NEXT: v_or_b32_e32 v0, v25, 
v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_or_b32_e32 v0, v26, v2 ; GFX9-NEXT: v_or_b32_e32 v2, v17, v8 -; GFX9-NEXT: v_sub_u32_e32 v8, 64, v16 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] -; GFX9-NEXT: v_add_u32_e32 v17, 0xffffffc0, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0x7f, v20 +; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5] +; GFX9-NEXT: v_or_b32_e32 v1, v18, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, 64, v17 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v3, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v17, v[6:7] +; GFX9-NEXT: v_or_b32_e32 v3, v16, v19 +; GFX9-NEXT: v_add_u32_e32 v16, 0xffffffc0, v17 ; GFX9-NEXT: v_or_b32_e32 v10, v8, v10 ; GFX9-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX9-NEXT: v_lshlrev_b64 v[8:9], v16, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v17, v[4:5] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v8, vcc +; GFX9-NEXT: v_lshlrev_b64 v[8:9], v17, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v8, v5, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v4, v6, vcc ; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] ; GFX9-NEXT: v_cndmask_b32_e32 v12, v8, v7, vcc -; GFX9-NEXT: v_not_b32_e32 v8, v20 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] -; GFX9-NEXT: v_and_b32_e32 v13, 0x7f, v8 +; GFX9-NEXT: v_bfi_b32 v13, v20, 0, v24 ; GFX9-NEXT: v_lshl_or_b32 v5, v14, 31, v5 ; GFX9-NEXT: v_sub_u32_e32 v10, 64, v13 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v13, v[4:5] @@ -7709,68 +7670,66 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v4, v17, v4 +; GFX9-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX9-NEXT: v_or_b32_e32 v5, v18, v5 -; GFX9-NEXT: v_or_b32_e32 v6, v16, v6 +; GFX9-NEXT: v_or_b32_e32 v6, v17, v6 ; GFX9-NEXT: v_or_b32_e32 v7, v12, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshl_v2i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v27, 0x7f, v16 -; GFX10-NEXT: v_not_b32_e32 v21, v16 +; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v16 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] -; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v27 -; GFX10-NEXT: v_and_b32_e32 v28, 0x7f, v21 -; GFX10-NEXT: v_add_nc_u32_e32 v23, 0xffffffc0, v27 -; GFX10-NEXT: v_lshlrev_b64 v[21:22], v27, v[2:3] +; GFX10-NEXT: v_bfi_b32 v29, v16, 0, 0x7f +; GFX10-NEXT: v_sub_nc_u32_e32 v21, 64, v19 +; GFX10-NEXT: v_add_nc_u32_e32 v25, 0xffffffc0, v19 +; GFX10-NEXT: v_lshlrev_b64 v[23:24], v19, v[2:3] ; GFX10-NEXT: v_lshl_or_b32 v9, v10, 31, v9 -; GFX10-NEXT: v_lshrrev_b64 v[18:19], v18, v[0:1] ; GFX10-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] -; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v28 -; GFX10-NEXT: v_lshlrev_b64 v[16:17], v27, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27 -; GFX10-NEXT: v_or_b32_e32 v18, v18, v21 -; GFX10-NEXT: v_add_nc_u32_e32 v21, 0xffffffc0, v28 -; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] -; GFX10-NEXT: v_lshlrev_b64 v[25:26], v25, 
v[10:11] -; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v28 -; GFX10-NEXT: v_cndmask_b32_e32 v29, v0, v18, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v0, v19, v22 -; GFX10-NEXT: v_lshrrev_b64 v[18:19], v21, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v21, v23, v25 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v27 -; GFX10-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[21:22], v21, v[0:1] +; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v29 +; GFX10-NEXT: v_lshlrev_b64 v[17:18], v19, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1] +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v19 +; GFX10-NEXT: v_lshrrev_b64 v[25:26], v29, v[8:9] +; GFX10-NEXT: v_or_b32_e32 v21, v21, v23 +; GFX10-NEXT: v_add_nc_u32_e32 v23, 0xffffffc0, v29 +; GFX10-NEXT: v_lshlrev_b64 v[27:28], v16, v[10:11] +; GFX10-NEXT: v_cndmask_b32_e32 v30, 0, v17, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v19 +; GFX10-NEXT: v_cndmask_b32_e32 v21, v0, v21, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v0, v22, v24 +; GFX10-NEXT: v_lshrrev_b64 v[16:17], v23, v[10:11] +; GFX10-NEXT: v_or_b32_e32 v19, v25, v27 +; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v29 +; GFX10-NEXT: v_or_b32_e32 v22, v26, v28 ; GFX10-NEXT: v_cndmask_b32_e32 v23, v1, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v28 -; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v21, s5 -; GFX10-NEXT: v_or_b32_e32 v22, v24, v26 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e64 v21, v29, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v10, v19, v22, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v18, 0, v18, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v29 +; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v19, s5 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v29, v[10:11] +; GFX10-NEXT: v_cndmask_b32_e64 v10, v17, v22, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v22, v23, v3, s4 ; GFX10-NEXT: v_and_b32_e32 v23, 0x7f, v20 -; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, v0, s5 -; GFX10-NEXT: v_or_b32_e32 v0, v16, v2 -; GFX10-NEXT: v_not_b32_e32 v16, v20 +; GFX10-NEXT: v_cndmask_b32_e64 v21, v21, v2, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v16, v8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v23 +; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v23 +; GFX10-NEXT: v_bfi_b32 v20, v20, 0, 0x7f +; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, v0, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, v1, s5 -; GFX10-NEXT: v_and_b32_e32 v20, 0x7f, v16 -; GFX10-NEXT: v_or_b32_e32 v1, v17, v3 -; GFX10-NEXT: v_add_nc_u32_e32 v17, 0xffffffc0, v23 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] +; GFX10-NEXT: v_or_b32_e32 v0, v30, v2 +; GFX10-NEXT: v_or_b32_e32 v1, v18, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v23 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v10, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[10:11], v23, v[6:7] ; GFX10-NEXT: v_lshl_or_b32 v9, v14, 31, v9 ; GFX10-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] ; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v20 ; GFX10-NEXT: v_lshlrev_b64 v[12:13], v23, v[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[4:5], v17, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23 ; GFX10-NEXT: v_or_b32_e32 v10, v2, v10 ; GFX10-NEXT: v_add_nc_u32_e32 v26, 0xffffffc0, v20 @@ -7807,96 +7766,91 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX11-LABEL: v_fshl_v2i128: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GFX11-NEXT: v_and_b32_e32 v27, 0x7f, v16 -; GFX11-NEXT: v_not_b32_e32 v21, v16 +; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v16 ; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b64 v[16:17], v27, v[0:1] -; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v27 -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 0xffffffc0, v27 +; GFX11-NEXT: v_bfi_b32 v29, v16, 0, 0x7f +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b64 v[17:18], v19, v[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v19 ; GFX11-NEXT: v_lshl_or_b32 v9, v10, 31, v9 ; GFX11-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] -; GFX11-NEXT: v_lshrrev_b64 v[18:19], v18, v[0:1] -; GFX11-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc_lo -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] -; GFX11-NEXT: v_and_b32_e32 v28, 0x7f, v21 -; GFX11-NEXT: v_lshlrev_b64 v[21:22], v27, v[2:3] -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v27 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v18, v18, v21 -; GFX11-NEXT: v_cndmask_b32_e32 v29, v0, v18, vcc_lo -; GFX11-NEXT: v_sub_nc_u32_e32 v25, 64, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 0xffffffc0, v28 -; GFX11-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] -; GFX11-NEXT: v_or_b32_e32 v0, v19, v22 -; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v28 -; GFX11-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] -; GFX11-NEXT: v_lshrrev_b64 v[18:19], v21, v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v21, v23, v25 +; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v29 +; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v29 +; GFX11-NEXT: v_cndmask_b32_e32 v30, 0, v17, vcc_lo +; GFX11-NEXT: v_sub_nc_u32_e32 v21, 64, v19 +; GFX11-NEXT: v_dual_cndmask_b32 v18, 0, v18 :: v_dual_add_nc_u32 v25, 0xffffffc0, v19 +; GFX11-NEXT: v_lshlrev_b64 v[23:24], v19, v[2:3] +; GFX11-NEXT: v_lshlrev_b64 v[27:28], v16, v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b64 v[21:22], v21, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1] +; GFX11-NEXT: v_lshrrev_b64 v[25:26], v29, v[8:9] +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v21, v21, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 0xffffffc0, v29 +; GFX11-NEXT: v_or_b32_e32 v19, v25, v27 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v21, v0, v21, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v0, v22, v24 +; GFX11-NEXT: v_lshrrev_b64 v[16:17], v23, v[10:11] +; GFX11-NEXT: v_or_b32_e32 v22, v26, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v21, v21, v2, s0 ; GFX11-NEXT: v_cndmask_b32_e32 v23, v1, v0, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v22, v24, v26 -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v21, s1 -; GFX11-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v28 -; GFX11-NEXT: v_cndmask_b32_e64 v21, v29, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v19, v22, s1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v29 +; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v19, s1 +; GFX11-NEXT: v_lshrrev_b64 
v[0:1], v29, v[10:11] +; GFX11-NEXT: v_cndmask_b32_e64 v10, v17, v22, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v22, v23, v3, s0 ; GFX11-NEXT: v_and_b32_e32 v23, 0x7f, v20 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v18, v8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v24, 0, v0, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v25, 0, v1, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v23 -; GFX11-NEXT: v_or_b32_e32 v0, v16, v2 -; GFX11-NEXT: v_not_b32_e32 v16, v20 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v16, v8, vcc_lo +; GFX11-NEXT: v_bfi_b32 v20, v20, 0, 0x7f ; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc_lo ; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v23 -; GFX11-NEXT: v_lshlrev_b64 v[12:13], v23, v[4:5] -; GFX11-NEXT: v_and_b32_e32 v20, 0x7f, v16 -; GFX11-NEXT: v_or_b32_e32 v1, v17, v3 -; GFX11-NEXT: v_lshlrev_b64 v[10:11], v23, v[6:7] -; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 0xffffffc0, v23 +; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v23 +; GFX11-NEXT: v_cndmask_b32_e64 v24, 0, v0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v25, 0, v1, s1 +; GFX11-NEXT: v_or_b32_e32 v0, v30, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v23 ; GFX11-NEXT: v_lshl_or_b32 v9, v14, 31, v9 ; GFX11-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] -; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v20 -; GFX11-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo -; GFX11-NEXT: v_lshlrev_b64 v[4:5], v17, v[4:5] -; GFX11-NEXT: v_or_b32_e32 v10, v2, v10 ; GFX11-NEXT: v_add_nc_u32_e32 v26, 0xffffffc0, v20 +; GFX11-NEXT: v_or_b32_e32 v1, v18, v3 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v10, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[10:11], v23, v[6:7] +; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v20 +; GFX11-NEXT: v_lshlrev_b64 v[12:13], v23, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23 ; GFX11-NEXT: v_lshrrev_b64 v[16:17], v20, v[8:9] +; GFX11-NEXT: v_or_b32_e32 v10, v2, v10 ; GFX11-NEXT: v_lshlrev_b64 v[18:19], v18, v[14:15] ; GFX11-NEXT: v_or_b32_e32 v2, v21, v24 ; GFX11-NEXT: v_or_b32_e32 v11, v3, v11 -; GFX11-NEXT: v_cndmask_b32_e32 v21, v4, v10, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v12, 0, v12 :: v_dual_cndmask_b32 v21, v4, v10 ; GFX11-NEXT: v_lshrrev_b64 v[3:4], v26, v[14:15] -; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v20 ; GFX11-NEXT: v_or_b32_e32 v10, v16, v18 +; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v20 ; GFX11-NEXT: v_or_b32_e32 v16, v17, v19 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v23 ; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v20 -; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v10, s1 ; GFX11-NEXT: v_lshrrev_b64 v[10:11], v20, v[14:15] ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v16, s1 +; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v6, v21, v6, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v5, v7, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v3, v8, s2 -; GFX11-NEXT: v_or_b32_e32 v3, v22, v25 ; GFX11-NEXT: v_cndmask_b32_e64 v8, v4, v9, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v10, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v11, s1 +; GFX11-NEXT: v_or_b32_e32 v3, v22, v25 ; GFX11-NEXT: v_or_b32_e32 v4, v12, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v5, v13, v8 ; 
GFX11-NEXT: v_or_b32_e32 v6, v6, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v7, v7, v10 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index 238cc06fc7f7c..d1ba24673043d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -398,8 +398,7 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX6-NEXT: v_not_b32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX6-NEXT: v_bfi_b32 v2, v2, 0, 7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 @@ -785,19 +784,17 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX6-LABEL: v_fshr_v2i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 8, v2 ; GFX6-NEXT: v_and_b32_e32 v5, 7, v2 -; GFX6-NEXT: v_not_b32_e32 v2, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX6-NEXT: v_bfi_b32 v2, v2, 0, 7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v4 -; GFX6-NEXT: v_not_b32_e32 v4, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 7, v4 +; GFX6-NEXT: v_bfi_b32 v4, v4, 0, 7 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_bfe_u32 v1, v1, 8, 8 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 @@ -1187,40 +1184,36 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX6-LABEL: v_fshr_v4i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 8, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX6-NEXT: v_and_b32_e32 v10, 7, v2 -; GFX6-NEXT: v_not_b32_e32 v2, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX6-NEXT: v_bfi_b32 v2, v2, 0, 7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v10, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v7 -; GFX6-NEXT: v_not_b32_e32 v7, v7 -; GFX6-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX6-NEXT: v_bfi_b32 v7, v7, 0, 7 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_bfe_u32 v7, v1, 8, 8 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v7 -; GFX6-NEXT: v_not_b32_e32 v7, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 7, v8 -; GFX6-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX6-NEXT: v_bfi_b32 v7, v8, 0, 7 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v3, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX6-NEXT: v_not_b32_e32 v4, v9 ; GFX6-NEXT: v_and_b32_e32 v3, 
7, v9 -; GFX6-NEXT: v_and_b32_e32 v4, 7, v4 +; GFX6-NEXT: v_bfi_b32 v4, v9, 0, 7 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5 ; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v4, v5 @@ -5052,8 +5045,7 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GFX6-NEXT: v_not_b32_e32 v5, v4 -; GFX6-NEXT: v_and_b32_e32 v5, 63, v5 +; GFX6-NEXT: v_bfi_b32 v5, v4, 0, 63 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v5 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v4 @@ -5065,8 +5057,7 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX8-NEXT: v_not_b32_e32 v5, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 63, v5 +; GFX8-NEXT: v_bfi_b32 v5, v4, 0, 63 ; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] ; GFX8-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] @@ -5078,8 +5069,7 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX9-NEXT: v_not_b32_e32 v5, v4 -; GFX9-NEXT: v_and_b32_e32 v5, 63, v5 +; GFX9-NEXT: v_bfi_b32 v5, v4, 0, 63 ; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] @@ -5090,12 +5080,11 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX10-LABEL: v_fshr_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_not_b32_e32 v5, v4 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 63, v5 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX10-NEXT: v_and_b32_e32 v5, 63, v4 +; GFX10-NEXT: v_bfi_b32 v4, v4, 0, 63 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5103,16 +5092,14 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX11-LABEL: v_fshr_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_not_b32_e32 v5, v4 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX11-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v5, 63, v5 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX11-NEXT: v_and_b32_e32 v5, 63, v4 +; GFX11-NEXT: v_bfi_b32 v4, v4, 0, 63 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt) @@ -5228,9 +5215,8 @@ define i64 @v_fshr_i64_48(i64 %lhs, i64 %rhs) { define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 %amt) { 
; GFX6-LABEL: v_fshr_i64_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_not_b32_e32 v1, v0 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX6-NEXT: v_and_b32_e32 v1, 63, v1 +; GFX6-NEXT: v_bfi_b32 v1, v0, 0, 63 ; GFX6-NEXT: v_and_b32_e32 v0, 63, v0 ; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v1 ; GFX6-NEXT: v_lshr_b64 v[3:4], s[2:3], v0 @@ -5240,9 +5226,8 @@ define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 ; ; GFX8-LABEL: v_fshr_i64_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_not_b32_e32 v1, v0 ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX8-NEXT: v_and_b32_e32 v1, 63, v1 +; GFX8-NEXT: v_bfi_b32 v1, v0, 0, 63 ; GFX8-NEXT: v_and_b32_e32 v0, 63, v0 ; GFX8-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1] ; GFX8-NEXT: v_lshrrev_b64 v[3:4], v0, s[2:3] @@ -5252,9 +5237,8 @@ define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 ; ; GFX9-LABEL: v_fshr_i64_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_not_b32_e32 v1, v0 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX9-NEXT: v_and_b32_e32 v1, 63, v1 +; GFX9-NEXT: v_bfi_b32 v1, v0, 0, 63 ; GFX9-NEXT: v_and_b32_e32 v0, 63, v0 ; GFX9-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], v0, s[2:3] @@ -5264,29 +5248,27 @@ define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 ; ; GFX10-LABEL: v_fshr_i64_ssv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_not_b32_e32 v1, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX10-NEXT: v_bfi_b32 v1, v0, 0, 63 +; GFX10-NEXT: v_and_b32_e32 v2, 63, v0 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: v_and_b32_e32 v2, 63, v1 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, s[0:1] -; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX10-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3] +; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_fshr_i64_ssv: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_not_b32_e32 v1, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX11-NEXT: v_bfi_b32 v1, v0, 0, 63 +; GFX11-NEXT: v_and_b32_e32 v2, 63, v0 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v2, 63, v1 -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, s[0:1] -; GFX11-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3] +; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt) %cast = bitcast i64 %result to <2 x float> @@ -5492,15 +5474,13 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GFX6-NEXT: v_not_b32_e32 v9, v8 -; GFX6-NEXT: v_and_b32_e32 v9, 63, v9 +; GFX6-NEXT: v_bfi_b32 v9, v8, 0, 63 ; GFX6-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v9 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v8 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 
1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX6-NEXT: v_not_b32_e32 v4, v10 -; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX6-NEXT: v_bfi_b32 v4, v10, 0, 63 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v4 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v10 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v4 @@ -5513,15 +5493,13 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX8-NEXT: v_not_b32_e32 v9, v8 -; GFX8-NEXT: v_and_b32_e32 v9, 63, v9 +; GFX8-NEXT: v_bfi_b32 v9, v8, 0, 63 ; GFX8-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_not_b32_e32 v4, v10 -; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX8-NEXT: v_bfi_b32 v4, v10, 0, 63 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] ; GFX8-NEXT: v_and_b32_e32 v4, 63, v10 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7] @@ -5534,15 +5512,13 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX9-NEXT: v_not_b32_e32 v9, v8 -; GFX9-NEXT: v_and_b32_e32 v9, 63, v9 +; GFX9-NEXT: v_bfi_b32 v9, v8, 0, 63 ; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX9-NEXT: v_not_b32_e32 v4, v10 -; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX9-NEXT: v_bfi_b32 v4, v10, 0, 63 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] ; GFX9-NEXT: v_and_b32_e32 v4, 63, v10 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7] @@ -5554,16 +5530,14 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX10-LABEL: v_fshr_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_not_b32_e32 v9, v8 -; GFX10-NEXT: v_not_b32_e32 v11, v10 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX10-NEXT: v_bfi_b32 v9, v8, 0, 63 ; GFX10-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX10-NEXT: v_and_b32_e32 v9, 63, v9 -; GFX10-NEXT: v_and_b32_e32 v11, 63, v11 +; GFX10-NEXT: v_bfi_b32 v11, v10, 0, 63 ; GFX10-NEXT: v_and_b32_e32 v10, 63, v10 -; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] ; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7] ; GFX10-NEXT: v_or_b32_e32 v0, v0, v4 @@ -5575,17 +5549,15 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX11-LABEL: v_fshr_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_not_b32_e32 v9, v8 -; GFX11-NEXT: v_not_b32_e32 v11, v10 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX11-NEXT: v_bfi_b32 v9, v8, 0, 63 ; GFX11-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX11-NEXT: v_and_b32_e32 v9, 63, v9 -; GFX11-NEXT: v_and_b32_e32 v11, 63, v11 +; GFX11-NEXT: v_bfi_b32 v11, v10, 0, 63 ; GFX11-NEXT: v_and_b32_e32 v10, 63, v10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; 
GFX11-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] ; GFX11-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7] @@ -5848,8 +5820,8 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX6-NEXT: v_lshl_b64 v[9:10], v[0:1], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX6-NEXT: v_not_b32_e32 v0, v8 -; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, 0x7f +; GFX6-NEXT: v_bfi_b32 v15, v8, 0, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v15 ; GFX6-NEXT: v_not_b32_e32 v16, 63 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[9:10], v0 @@ -5897,8 +5869,8 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX8-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX8-NEXT: v_not_b32_e32 v0, v8 -; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0x7f +; GFX8-NEXT: v_bfi_b32 v15, v8, 0, v0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v15 ; GFX8-NEXT: v_not_b32_e32 v16, 63 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[9:10] @@ -5946,8 +5918,8 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX9-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX9-NEXT: v_not_b32_e32 v0, v8 -; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7f +; GFX9-NEXT: v_bfi_b32 v15, v8, 0, v0 ; GFX9-NEXT: v_sub_u32_e32 v0, 64, v15 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[9:10] ; GFX9-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] @@ -5990,107 +5962,103 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX10-LABEL: v_fshr_i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_not_b32_e32 v9, v8 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 31, v1 +; GFX10-NEXT: v_bfi_b32 v18, v8, 0, 0x7f +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 31, v1 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: v_and_b32_e32 v21, 0x7f, v8 -; GFX10-NEXT: v_and_b32_e32 v20, 0x7f, v9 -; GFX10-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v21 -; GFX10-NEXT: v_sub_nc_u32_e32 v12, 64, v20 -; GFX10-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v20 -; GFX10-NEXT: v_lshlrev_b64 v[10:11], v20, v[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[8:9], v20, v[0:1] -; GFX10-NEXT: v_add_nc_u32_e32 v18, 0xffffffc0, v21 -; GFX10-NEXT: v_lshrrev_b64 v[12:13], v12, v[0:1] +; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v8 +; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v18 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v9 +; GFX10-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v18 +; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19 +; GFX10-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[8:9], v10, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[10:11], v18, v[2:3] ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1] -; GFX10-NEXT: v_lshrrev_b64 v[14:15], v21, v[4:5] +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 +; GFX10-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v20 -; GFX10-NEXT: v_lshrrev_b64 v[18:19], v18, v[6:7] -; GFX10-NEXT: v_or_b32_e32 v10, v12, v10 -; GFX10-NEXT: v_or_b32_e32 v11, v13, v11 -; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v21 -; GFX10-NEXT: v_or_b32_e32 v12, v15, v17 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v20 -; 
GFX10-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo +; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v19 +; GFX10-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX10-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v19 +; GFX10-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v18 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v19 +; GFX10-NEXT: v_cndmask_b32_e32 v20, v0, v8, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[8:9], v10, v[6:7] ; GFX10-NEXT: v_or_b32_e32 v0, v14, v16 +; GFX10-NEXT: v_or_b32_e32 v10, v15, v17 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v21 -; GFX10-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v18, v0, s5 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v21, v[6:7] -; GFX10-NEXT: v_cndmask_b32_e64 v6, v19, v12, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v20, v2, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v0, s5 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v10, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v13, v4, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v4, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, v5, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v0, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v1, s5 -; GFX10-NEXT: v_or_b32_e32 v0, v8, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v1, s5 +; GFX10-NEXT: v_or_b32_e32 v0, v12, v4 ; GFX10-NEXT: v_or_b32_e32 v1, v7, v5 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v9 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshr_i128: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_not_b32_e32 v9, v8 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 31, v1 +; GFX11-NEXT: v_bfi_b32 v18, v8, 0, 0x7f +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 31, v1 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v20, 0x7f, v9 -; GFX11-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_sub_nc_u32_e32 v12, 64, v20 -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v20 -; GFX11-NEXT: v_lshlrev_b64 v[10:11], v20, v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b64 v[12:13], v12, v[0:1] -; GFX11-NEXT: v_or_b32_e32 v10, v12, v10 -; GFX11-NEXT: v_and_b32_e32 v21, 0x7f, v8 -; GFX11-NEXT: v_lshlrev_b64 v[8:9], v20, v[0:1] -; GFX11-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v20 -; GFX11-NEXT: v_or_b32_e32 v11, v13, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1] -; GFX11-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo -; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 0xffffffc0, v21 -; GFX11-NEXT: v_lshrrev_b64 v[14:15], v21, v[4:5] -; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v21 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v18 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v9 +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v19 +; GFX11-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1] +; GFX11-NEXT: v_lshrrev_b64 v[8:9], v10, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[10:11], v18, v[2:3] +; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v19 ; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] -; GFX11-NEXT: v_lshrrev_b64 v[18:19], v18, v[6:7] -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v20 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v21 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v18 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v18 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1] +; GFX11-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v20, v0, v8, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b64 v[8:9], v10, v[6:7] ; GFX11-NEXT: v_or_b32_e32 v0, v14, v16 -; GFX11-NEXT: v_or_b32_e32 v12, v15, v17 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v10, v15, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v20, v2, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v0, s1 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] +; GFX11-NEXT: v_cndmask_b32_e64 v6, v9, v10, s1 +; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v13, v18, v0, s1 -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v21, v[6:7] -; GFX11-NEXT: v_cndmask_b32_e64 v6, v19, v12, s1 -; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v13, v4, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v5, v6, v5, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, v0, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v1, s1 -; GFX11-NEXT: v_or_b32_e32 v0, v8, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v1, s1 +; GFX11-NEXT: v_or_b32_e32 v0, v12, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v1, v7, v5 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v3, v3, v9 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v8 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt) ret i128 %result @@ -6099,12 +6067,12 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) { ; GFX6-LABEL: v_fshr_i128_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_not_b32_e32 v1, v0 +; GFX6-NEXT: 
v_mov_b32_e32 v1, 0x7f ; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX6-NEXT: s_lshr_b32 s0, s1, 31 ; GFX6-NEXT: s_mov_b32 s1, 0 -; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v1 +; GFX6-NEXT: v_bfi_b32 v7, v0, 0, v1 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7 ; GFX6-NEXT: v_not_b32_e32 v8, 63 @@ -6152,12 +6120,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; ; GFX8-LABEL: v_fshr_i128_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_not_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x7f ; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX8-NEXT: s_lshr_b32 s0, s1, 31 ; GFX8-NEXT: s_mov_b32 s1, 0 -; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v1 +; GFX8-NEXT: v_bfi_b32 v7, v0, 0, v1 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7 ; GFX8-NEXT: v_not_b32_e32 v8, 63 @@ -6205,12 +6173,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; ; GFX9-LABEL: v_fshr_i128_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_not_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7f ; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX9-NEXT: s_lshr_b32 s0, s1, 31 ; GFX9-NEXT: s_mov_b32 s1, 0 -; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v1 +; GFX9-NEXT: v_bfi_b32 v7, v0, 0, v1 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7 ; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9] @@ -6257,101 +6225,99 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; ; GFX10-LABEL: v_fshr_i128_ssv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_not_b32_e32 v1, v0 -; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: v_bfi_b32 v11, v0, 0, 0x7f ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX10-NEXT: s_lshr_b32 s8, s1, 31 -; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v0 -; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v1 -; GFX10-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v11 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GFX10-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] -; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v13 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12 -; GFX10-NEXT: v_add_nc_u32_e32 v6, 0xffffffc0, v12 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[8:9] -; GFX10-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v13 -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[10:11] -; GFX10-NEXT: v_lshlrev_b64 v[6:7], v6, s[10:11] -; GFX10-NEXT: v_lshrrev_b64 v[8:9], v13, s[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[10:11], v10, s[6:7] -; GFX10-NEXT: v_cmp_gt_u32_e64 s1, 64, v13 -; GFX10-NEXT: v_lshlrev_b64 v[4:5], v12, s[10:11] -; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX10-NEXT: v_or_b32_e32 v2, v3, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v12 -; GFX10-NEXT: v_or_b32_e32 v3, v8, v10 -; GFX10-NEXT: v_or_b32_e32 v8, v9, v11 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7] -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v13 -; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s8, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0xffffffc0, v11 +; GFX10-NEXT: v_lshlrev_b64 v[3:4], v11, s[8:9] +; GFX10-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1] +; 
GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v12 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v11 +; GFX10-NEXT: v_add_nc_u32_e32 v13, 0xffffffc0, v12 +; GFX10-NEXT: v_lshrrev_b64 v[7:8], v12, s[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[5:6], v11, s[0:1] +; GFX10-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[9:10], v9, s[6:7] +; GFX10-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX10-NEXT: v_cmp_gt_u32_e64 s1, 64, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v11 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v14, v0, v3, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s9, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s1 -; GFX10-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX10-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX10-NEXT: v_or_b32_e32 v0, v7, v9 +; GFX10-NEXT: v_or_b32_e32 v7, v8, v10 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v0, s1 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v12, s[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v14, s8, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s9, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s5, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v0, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v1, s1 +; GFX10-NEXT: v_or_b32_e32 v0, v5, v2 +; GFX10-NEXT: v_or_b32_e32 v1, v6, v3 +; GFX10-NEXT: v_or_b32_e32 v2, v7, v8 +; GFX10-NEXT: v_or_b32_e32 v3, v4, v9 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_fshr_i128_ssv: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_not_b32_e32 v1, v0 -; GFX11-NEXT: s_mov_b32 s9, 0 -; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: v_bfi_b32 v11, v0, 0, 0x7f ; GFX11-NEXT: s_lshr_b32 s8, s1, 31 -; GFX11-NEXT: v_and_b32_e32 v13, 0x7f, v0 -; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v1 -; GFX11-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_mov_b32 s9, 0 +; GFX11-NEXT: v_sub_nc_u32_e32 v1, 64, v11 +; GFX11-NEXT: v_lshlrev_b64 v[5:6], v11, s[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v0 ; GFX11-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v13 -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v12 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v12, s[8:9] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0xffffffc0, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v13 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[10:11] -; GFX11-NEXT: v_lshrrev_b64 v[8:9], v13, s[4:5] -; GFX11-NEXT: v_lshlrev_b64 v[10:11], v10, s[6:7] -; GFX11-NEXT: v_lshlrev_b64 v[6:7], v6, s[10:11] -; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v13 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v13 -; GFX11-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-NEXT: v_or_b32_e32 v2, v3, v1 -; GFX11-NEXT: v_or_b32_e32 v3, v8, v10 -; GFX11-NEXT: v_or_b32_e32 v8, v9, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7] -; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v2, vcc_lo -; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, s[10:11] -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v12 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v3, s1 +; GFX11-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[3:4], v11, s[8:9] +; GFX11-NEXT: v_dual_cndmask_b32 v5, 0, v5 :: v_dual_add_nc_u32 v0, 0xffffffc0, v11 +; GFX11-NEXT: v_sub_nc_u32_e32 v9, 64, v12 +; GFX11-NEXT: v_lshrrev_b64 v[7:8], v12, s[4:5] +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v12 +; GFX11-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] +; GFX11-NEXT: v_add_nc_u32_e32 v13, 0xffffffc0, v12 +; GFX11-NEXT: v_lshlrev_b64 v[9:10], v9, s[6:7] +; GFX11-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v14, v0, v3, vcc_lo ; GFX11-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v8, s1 -; GFX11-NEXT: v_dual_cndmask_b32 v4, 0, v4 :: v_dual_cndmask_b32 v5, 0, v5 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s8, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s9, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s5, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX11-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v7, v9 +; GFX11-NEXT: v_or_b32_e32 v7, v8, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v0, s1 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v12, s[6:7] +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v14, s8, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s5, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v1, s1 +; GFX11-NEXT: v_or_b32_e32 v0, v5, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX11-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v6, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v7, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v3, v4, v9 ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> @@ -7486,226 +7452,224 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: v_lshl_b64 v[17:18], v[0:1], 1 +; GFX6-NEXT: v_mov_b32_e32 v18, 0x7f +; GFX6-NEXT: v_lshl_b64 v[21:22], v[0:1], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 +; GFX6-NEXT: v_bfi_b32 v19, v16, 0, v18 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX6-NEXT: v_not_b32_e32 v0, v16 -; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v0 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v19 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[17:18], 
v0 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19 -; GFX6-NEXT: v_and_b32_e32 v25, 0x7f, v16 -; GFX6-NEXT: v_or_b32_e32 v23, v0, v21 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v25 -; GFX6-NEXT: v_or_b32_e32 v24, v1, v22 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[10:11], v0 -; GFX6-NEXT: v_lshr_b64 v[21:22], v[8:9], v25 -; GFX6-NEXT: v_not_b32_e32 v26, 63 -; GFX6-NEXT: v_or_b32_e32 v21, v21, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v19, v26 -; GFX6-NEXT: v_or_b32_e32 v22, v22, v1 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[17:18], v0 +; GFX6-NEXT: v_not_b32_e32 v17, 63 +; GFX6-NEXT: v_sub_i32_e32 v23, vcc, 64, v19 +; GFX6-NEXT: v_add_i32_e32 v27, vcc, v19, v17 +; GFX6-NEXT: v_lshr_b64 v[23:24], v[21:22], v23 +; GFX6-NEXT: v_lshl_b64 v[25:26], v[2:3], v19 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[21:22], v19 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[21:22], v27 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v23, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5] -; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], v25, v26 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[17:18], v19 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v0 -; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25 -; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc -; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v25 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 -; GFX6-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc +; GFX6-NEXT: v_or_b32_e32 v19, v23, v25 +; GFX6-NEXT: v_or_b32_e32 v23, v24, v26 +; GFX6-NEXT: v_cndmask_b32_e32 v24, 0, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v21, v19, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v19, v0, v2, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v2, 0x7f, v16 +; GFX6-NEXT: v_cndmask_b32_e32 v25, 0, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v22, v23, vcc +; GFX6-NEXT: v_add_i32_e32 v16, vcc, v2, v17 +; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 64, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v23, v1, v3, s[4:5] +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v2 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[8:9], v2 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[10:11], v21 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 -; GFX6-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v18, 0, v1, s[4:5] -; GFX6-NEXT: v_or_b32_e32 v0, v16, v8 -; GFX6-NEXT: v_or_b32_e32 v1, v17, v9 +; GFX6-NEXT: v_or_b32_e32 v21, v2, v21 +; GFX6-NEXT: v_or_b32_e32 v22, v3, v22 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], v16 +; GFX6-NEXT: v_bfi_b32 v16, v20, 0, v18 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v21, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v22, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc +; GFX6-NEXT: v_or_b32_e32 v0, v24, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v25, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v19, v8 +; GFX6-NEXT: v_or_b32_e32 v3, v23, v9 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5 ; GFX6-NEXT: v_or_b32_e32 v6, v6, v4 -; GFX6-NEXT: v_not_b32_e32 v4, v20 -; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v4 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v16 -; GFX6-NEXT: 
v_or_b32_e32 v2, v2, v10 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[8:9], v4 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v16 -; GFX6-NEXT: v_add_i32_e32 v17, vcc, v16, v26 -; GFX6-NEXT: v_or_b32_e32 v10, v4, v10 -; GFX6-NEXT: v_or_b32_e32 v11, v5, v11 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v16 +; GFX6-NEXT: v_add_i32_e32 v21, vcc, v16, v17 +; GFX6-NEXT: v_lshr_b64 v[10:11], v[8:9], v10 +; GFX6-NEXT: v_lshl_b64 v[18:19], v[6:7], v16 ; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v16 -; GFX6-NEXT: v_lshl_b64 v[8:9], v[8:9], v17 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[8:9], v21 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v18 -; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc +; GFX6-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc +; GFX6-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX6-NEXT: v_cndmask_b32_e64 v10, v4, v6, s[4:5] +; GFX6-NEXT: v_and_b32_e32 v6, 0x7f, v20 +; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; GFX6-NEXT: v_and_b32_e32 v10, 0x7f, v20 -; GFX6-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v9, v5, v7, vcc -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v10 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], v10 -; GFX6-NEXT: v_lshl_b64 v[6:7], v[14:15], v6 -; GFX6-NEXT: v_add_i32_e32 v11, vcc, v10, v26 -; GFX6-NEXT: v_or_b32_e32 v16, v4, v6 -; GFX6-NEXT: v_or_b32_e32 v19, v5, v7 -; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], v11 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], v10 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v16, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v19, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v6, v17 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v6 +; GFX6-NEXT: v_cndmask_b32_e64 v11, v5, v7, s[4:5] +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], v6 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[12:13], v6 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[14:15], v8 +; GFX6-NEXT: v_or_b32_e32 v8, v6, v8 +; GFX6-NEXT: v_or_b32_e32 v9, v7, v9 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], v17 +; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc -; GFX6-NEXT: v_or_b32_e32 v4, v17, v6 +; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc +; GFX6-NEXT: v_or_b32_e32 v4, v16, v6 ; GFX6-NEXT: v_or_b32_e32 v5, v18, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v10 -; GFX6-NEXT: v_or_b32_e32 v7, v9, v11 +; GFX6-NEXT: v_or_b32_e32 v6, v10, v8 +; GFX6-NEXT: v_or_b32_e32 v7, v11, v9 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshr_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[17:18], 1, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v18, 0x7f +; GFX8-NEXT: v_lshlrev_b64 v[21:22], 1, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1 +; GFX8-NEXT: v_bfi_b32 v19, v16, 0, v18 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX8-NEXT: v_not_b32_e32 v0, v16 -; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v0 -; 
GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v19 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18] -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v25, 0x7f, v16 -; GFX8-NEXT: v_or_b32_e32 v23, v0, v21 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v25 -; GFX8-NEXT: v_or_b32_e32 v24, v1, v22 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v0, v[10:11] -; GFX8-NEXT: v_lshrrev_b64 v[21:22], v25, v[8:9] -; GFX8-NEXT: v_not_b32_e32 v26, 63 -; GFX8-NEXT: v_or_b32_e32 v21, v21, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v19, v26 -; GFX8-NEXT: v_or_b32_e32 v22, v22, v1 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v0, v[17:18] +; GFX8-NEXT: v_not_b32_e32 v17, 63 +; GFX8-NEXT: v_sub_u32_e32 v23, vcc, 64, v19 +; GFX8-NEXT: v_add_u32_e32 v27, vcc, v19, v17 +; GFX8-NEXT: v_lshrrev_b64 v[23:24], v23, v[21:22] +; GFX8-NEXT: v_lshlrev_b64 v[25:26], v19, v[2:3] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v19, v[21:22] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v27, v[21:22] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v23, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5] -; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v25, v26 -; GFX8-NEXT: v_lshlrev_b64 v[16:17], v19, v[17:18] -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] -; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25 -; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v25, v[10:11] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc +; GFX8-NEXT: v_or_b32_e32 v19, v23, v25 +; GFX8-NEXT: v_or_b32_e32 v23, v24, v26 +; GFX8-NEXT: v_cndmask_b32_e32 v24, 0, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v21, v19, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v19, v0, v2, s[4:5] +; GFX8-NEXT: v_and_b32_e32 v2, 0x7f, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v25, 0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v22, v23, vcc +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v2, v17 +; GFX8-NEXT: v_sub_u32_e32 v21, vcc, 64, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v23, v1, v3, s[4:5] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v2, v[10:11] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v2, v[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v21, v[10:11] ; GFX8-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, v1, s[4:5] -; GFX8-NEXT: v_or_b32_e32 v0, v16, v8 -; GFX8-NEXT: v_or_b32_e32 v1, v17, v9 +; GFX8-NEXT: v_or_b32_e32 v21, v2, v21 +; GFX8-NEXT: v_or_b32_e32 v22, v3, v22 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v16, v[10:11] +; GFX8-NEXT: v_bfi_b32 v16, v20, 0, v18 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v21, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v22, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc +; GFX8-NEXT: v_or_b32_e32 v0, v24, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v25, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v19, v8 +; GFX8-NEXT: v_or_b32_e32 v3, v23, v9 ; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5 ; GFX8-NEXT: v_or_b32_e32 v6, v6, v4 
-; GFX8-NEXT: v_not_b32_e32 v4, v20 -; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v4 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v16 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] -; GFX8-NEXT: v_add_u32_e32 v17, vcc, v16, v26 -; GFX8-NEXT: v_or_b32_e32 v10, v4, v10 -; GFX8-NEXT: v_or_b32_e32 v11, v5, v11 +; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v16 +; GFX8-NEXT: v_add_u32_e32 v21, vcc, v16, v17 +; GFX8-NEXT: v_lshrrev_b64 v[10:11], v10, v[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[18:19], v16, v[6:7] ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v16, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 v[8:9], v17, v[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[8:9], v21, v[8:9] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc +; GFX8-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc +; GFX8-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX8-NEXT: v_cndmask_b32_e64 v10, v4, v6, s[4:5] +; GFX8-NEXT: v_and_b32_e32 v6, 0x7f, v20 +; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; GFX8-NEXT: v_and_b32_e32 v10, 0x7f, v20 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, v5, v7, vcc -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v10 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, v[12:13] -; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[14:15] -; GFX8-NEXT: v_add_u32_e32 v11, vcc, v10, v26 -; GFX8-NEXT: v_or_b32_e32 v16, v4, v6 -; GFX8-NEXT: v_or_b32_e32 v19, v5, v7 -; GFX8-NEXT: v_lshrrev_b64 v[6:7], v11, v[14:15] -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, v[14:15] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v19, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v6, v17 +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v5, v7, s[4:5] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v6, v[14:15] +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v6, v[12:13] +; GFX8-NEXT: v_lshlrev_b64 v[8:9], v8, v[14:15] +; GFX8-NEXT: v_or_b32_e32 v8, v6, v8 +; GFX8-NEXT: v_or_b32_e32 v9, v7, v9 +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v17, v[14:15] +; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc -; GFX8-NEXT: v_or_b32_e32 v4, v17, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc +; GFX8-NEXT: v_or_b32_e32 v4, v16, v6 ; GFX8-NEXT: v_or_b32_e32 v5, v18, v7 -; GFX8-NEXT: v_or_b32_e32 v6, v8, v10 -; GFX8-NEXT: v_or_b32_e32 v7, v9, v11 +; GFX8-NEXT: v_or_b32_e32 v6, v10, v8 +; GFX8-NEXT: v_or_b32_e32 v7, v11, v9 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v19, 0x7f ; GFX9-NEXT: v_lshlrev_b64 v[17:18], 1, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1 +; GFX9-NEXT: v_bfi_b32 v23, 
v16, 0, v19 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX9-NEXT: v_not_b32_e32 v0, v16 -; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, 64, v19 +; GFX9-NEXT: v_sub_u32_e32 v0, 64, v23 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18] -; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v25, 0x7f, v16 -; GFX9-NEXT: v_or_b32_e32 v23, v0, v21 -; GFX9-NEXT: v_sub_u32_e32 v0, 64, v25 -; GFX9-NEXT: v_or_b32_e32 v24, v1, v22 +; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] +; GFX9-NEXT: v_and_b32_e32 v26, 0x7f, v16 +; GFX9-NEXT: v_or_b32_e32 v24, v0, v21 +; GFX9-NEXT: v_sub_u32_e32 v0, 64, v26 +; GFX9-NEXT: v_or_b32_e32 v25, v1, v22 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[10:11] -; GFX9-NEXT: v_lshrrev_b64 v[21:22], v25, v[8:9] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX9-NEXT: v_lshrrev_b64 v[21:22], v26, v[8:9] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 ; GFX9-NEXT: v_or_b32_e32 v21, v21, v0 -; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v19 +; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v23 ; GFX9-NEXT: v_or_b32_e32 v22, v22, v1 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[17:18] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v23, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v24, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v25, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[4:5] -; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v25 -; GFX9-NEXT: v_lshlrev_b64 v[16:17], v19, v[17:18] +; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v26 +; GFX9-NEXT: v_lshlrev_b64 v[16:17], v23, v[17:18] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5] ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] -; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 ; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v21, v1, v22, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v25, v[10:11] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v26, v[10:11] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v21, v9, vcc ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] @@ -7713,9 +7677,8 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_or_b32_e32 v1, v17, v9 ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5 +; GFX9-NEXT: v_bfi_b32 v16, v20, 0, v19 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v4 -; GFX9-NEXT: v_not_b32_e32 v4, v20 -; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v4 ; GFX9-NEXT: v_sub_u32_e32 v4, 64, v16 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v10 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v11 @@ -7760,14 +7723,12 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10-LABEL: v_fshr_v2i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_not_b32_e32 v17, v16 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: v_and_b32_e32 v26, 0x7f, v16 -; GFX10-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX10-NEXT: v_and_b32_e32 v25, 0x7f, v17 +; GFX10-NEXT: v_bfi_b32 v25, 
v16, 0, 0x7f ; GFX10-NEXT: v_lshrrev_b32_e32 v17, 31, v1 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: v_add_nc_u32_e32 v27, 0xffffffc0, v26 +; GFX10-NEXT: v_and_b32_e32 v26, 0x7f, v16 +; GFX10-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] ; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v25 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v17 ; GFX10-NEXT: v_add_nc_u32_e32 v19, 0xffffffc0, v25 @@ -7776,54 +7737,54 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10-NEXT: v_lshrrev_b64 v[17:18], v18, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[21:22], v25, v[2:3] ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v19, v[0:1] -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v25 +; GFX10-NEXT: v_add_nc_u32_e32 v27, 0xffffffc0, v26 ; GFX10-NEXT: v_cndmask_b32_e32 v23, 0, v23, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v25 ; GFX10-NEXT: v_cndmask_b32_e32 v24, 0, v24, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v22, v18, v22 ; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v26 ; GFX10-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX10-NEXT: v_lshrrev_b64 v[16:17], v26, v[8:9] +; GFX10-NEXT: v_bfi_b32 v25, v20, 0, 0x7f ; GFX10-NEXT: v_cndmask_b32_e32 v22, v1, v22, vcc_lo ; GFX10-NEXT: v_lshlrev_b64 v[18:19], v18, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e32 v21, v0, v21, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[0:1], v27, v[10:11] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v26 ; GFX10-NEXT: v_cndmask_b32_e64 v22, v22, v3, s4 +; GFX10-NEXT: v_and_b32_e32 v20, 0x7f, v20 ; GFX10-NEXT: v_or_b32_e32 v16, v16, v18 ; GFX10-NEXT: v_cndmask_b32_e64 v21, v21, v2, s4 ; GFX10-NEXT: v_or_b32_e32 v17, v17, v19 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v26 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v26, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo -; GFX10-NEXT: v_not_b32_e32 v16, v20 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 31, v5 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v25 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s4 -; GFX10-NEXT: v_and_b32_e32 v25, 0x7f, v16 +; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v25 ; GFX10-NEXT: v_or_b32_e32 v6, v6, v10 -; GFX10-NEXT: v_and_b32_e32 v20, 0x7f, v20 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v26, 0, v2, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v25 ; GFX10-NEXT: v_cndmask_b32_e32 v27, 0, v3, vcc_lo -; GFX10-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v25 -; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v20 -; GFX10-NEXT: v_lshlrev_b64 v[10:11], v25, v[4:5] ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v8, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[8:9], v25, v[6:7] +; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v20 +; GFX10-NEXT: v_lshlrev_b64 v[10:11], v25, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5] ; GFX10-NEXT: v_or_b32_e32 v0, v23, v0 ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25 +; GFX10-NEXT: v_or_b32_e32 v8, v2, v8 ; GFX10-NEXT: v_add_nc_u32_e32 v23, 0xffffffc0, v20 ; GFX10-NEXT: v_lshrrev_b64 v[16:17], v20, v[12:13] -; GFX10-NEXT: v_or_b32_e32 v8, v2, v8 ; GFX10-NEXT: v_lshlrev_b64 v[18:19], v18, v[14:15] ; GFX10-NEXT: v_or_b32_e32 v2, v21, v26 ; GFX10-NEXT: v_or_b32_e32 v9, v3, v9 -; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v20 ; GFX10-NEXT: v_cndmask_b32_e32 v21, v4, v8, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[3:4], v23, v[14:15] +; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v20 ; GFX10-NEXT: v_or_b32_e32 v8, v16, v18 ; GFX10-NEXT: v_or_b32_e32 v16, v17, v19 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo @@ -7851,99 +7812,95 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x 
i128> %rhs, <2 x i128> %a ; GFX11-LABEL: v_fshr_v2i128: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_not_b32_e32 v17, v16 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX11-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v25, 0x7f, v17 +; GFX11-NEXT: v_bfi_b32 v25, v16, 0, 0x7f ; GFX11-NEXT: v_lshrrev_b32_e32 v17, 31, v1 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v25 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v2, v2, v17 ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25 ; GFX11-NEXT: v_lshlrev_b64 v[23:24], v25, v[0:1] ; GFX11-NEXT: v_and_b32_e32 v26, 0x7f, v16 ; GFX11-NEXT: v_lshrrev_b64 v[17:18], v18, v[0:1] ; GFX11-NEXT: v_lshlrev_b64 v[21:22], v25, v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_cndmask_b32 v24, 0, v24 :: v_dual_add_nc_u32 v19, 0xffffffc0, v25 -; GFX11-NEXT: v_cndmask_b32_e32 v23, 0, v23, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v19, 0xffffffc0, v25 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v25 +; GFX11-NEXT: v_dual_cndmask_b32 v23, 0, v23 :: v_dual_cndmask_b32 v24, 0, v24 +; GFX11-NEXT: v_bfi_b32 v25, v20, 0, 0x7f ; GFX11-NEXT: v_or_b32_e32 v22, v18, v22 ; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v26 ; GFX11-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v19, v[0:1] ; GFX11-NEXT: v_lshrrev_b64 v[16:17], v26, v[8:9] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v20, 0x7f, v20 ; GFX11-NEXT: v_lshlrev_b64 v[18:19], v18, v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_dual_cndmask_b32 v21, v0, v21 :: v_dual_cndmask_b32 v22, v1, v22 ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v16, v16, v18 ; GFX11-NEXT: v_add_nc_u32_e32 v27, 0xffffffc0, v26 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v25 -; GFX11-NEXT: v_or_b32_e32 v17, v17, v19 -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v27, v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v21, v21, v2, s0 +; GFX11-NEXT: v_or_b32_e32 v17, v17, v19 ; GFX11-NEXT: v_cndmask_b32_e64 v22, v22, v3, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v26 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v27, v[10:11] ; GFX11-NEXT: v_lshrrev_b64 v[2:3], v26, v[10:11] ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 31, v5 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo -; GFX11-NEXT: v_not_b32_e32 v16, v20 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v26 +; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v20 +; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v20 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v16 :: v_dual_cndmask_b32 v1, v1, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v26, 0, v2, vcc_lo ; GFX11-NEXT: v_or_b32_e32 v6, v6, v10 -; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v17 :: v_dual_and_b32 v20, 0x7f, v20 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_cndmask_b32 v26, 0, v2 :: v_dual_and_b32 v25, 
0x7f, v16 ; GFX11-NEXT: v_cndmask_b32_e32 v27, 0, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v8, s0 +; GFX11-NEXT: v_sub_nc_u32_e32 v8, 64, v25 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[10:11], v25, v[4:5] ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25 -; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v20 +; GFX11-NEXT: v_lshlrev_b64 v[10:11], v25, v[4:5] ; GFX11-NEXT: v_or_b32_e32 v0, v23, v0 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v8, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[8:9], v25, v[6:7] ; GFX11-NEXT: v_add_nc_u32_e32 v23, 0xffffffc0, v20 -; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v20 -; GFX11-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc_lo -; GFX11-NEXT: v_sub_nc_u32_e32 v8, 64, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v25 ; GFX11-NEXT: v_lshlrev_b64 v[18:19], v18, v[14:15] +; GFX11-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v25 ; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v20 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], v8, v[4:5] -; GFX11-NEXT: v_lshlrev_b64 v[8:9], v25, v[6:7] -; GFX11-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5] -; GFX11-NEXT: v_lshrrev_b64 v[16:17], v20, v[12:13] -; GFX11-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v1, v24, v1 ; GFX11-NEXT: v_or_b32_e32 v8, v2, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v25 ; GFX11-NEXT: v_or_b32_e32 v2, v21, v26 ; GFX11-NEXT: v_or_b32_e32 v9, v3, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v1, v24, v1 +; GFX11-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5] +; GFX11-NEXT: v_lshrrev_b64 v[16:17], v20, v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v21, v4, v8, vcc_lo ; GFX11-NEXT: v_lshrrev_b64 v[3:4], v23, v[14:15] ; GFX11-NEXT: v_or_b32_e32 v8, v16, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v16, v17, v19 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v6, v21, v6, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v8, s1 ; GFX11-NEXT: v_lshrrev_b64 v[8:9], v20, v[14:15] ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v16, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v5, v7, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v5, v3, v12, s2 ; GFX11-NEXT: v_or_b32_e32 v3, v22, v27 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v12, v4, v13, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v8, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v9, s1 ; GFX11-NEXT: v_or_b32_e32 v4, v10, v5 -; GFX11-NEXT: v_or_b32_e32 v5, v11, v12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v5, v11, v12 ; GFX11-NEXT: v_or_b32_e32 v6, v6, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v7, v7, v9 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll index cae833b0d64e3..0e1bbbd1ea92b 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -123,9 +123,8 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(ptr addrspace(1) %ptr, i16 inre ; GFX8-NEXT: s_lshl_b32 s0, s0, 4 ; GFX8-NEXT: s_lshl_b32 s1, s1, s0 ; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX8-NEXT: v_bfi_b32 v2, s0, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, s1, v2 @@ -143,11 +142,10 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(ptr addrspace(1) %ptr, i16 inre ; GFX7-NEXT: s_lshl_b32 s0, s0, 4 ; GFX7-NEXT: s_lshl_b32 s1, s1, s0 ; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, s1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm @@ -302,9 +300,8 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 +; GFX8-NEXT: v_bfi_b32 v3, v0, 0, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 @@ -319,9 +316,8 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX7-NEXT: s_and_b32 s1, s4, 0xffff ; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_not_b32_e32 v0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_bfi_b32 v0, v0, 0, s0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -393,9 +389,8 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX8-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v1, v2 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX8-NEXT: v_bfi_b32 v2, v0, 0, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 @@ -410,9 +405,8 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, s0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -482,12 +476,11 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX8-NEXT: s_and_b32 s0, s2, 0xffff -; GFX8-NEXT: v_not_b32_e32 v1, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v2, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, v0, v1 +; 
GFX8-NEXT: v_bfi_b32 v2, v1, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 @@ -505,11 +498,10 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_bfi_b32 v0, v1, 0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm @@ -576,10 +568,9 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(ptr addrspace(1) %ptr, i16 %val ; GFX8-NEXT: s_lshl_b32 s0, s0, 4 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 +; GFX8-NEXT: v_bfi_b32 v3, s0, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 @@ -597,11 +588,10 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(ptr addrspace(1) %ptr, i16 %val ; GFX7-NEXT: s_lshl_b32 s0, s0, 4 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, s0, v1 ; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm @@ -668,10 +658,9 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, v0, v1 +; GFX8-NEXT: v_bfi_b32 v3, v1, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 @@ -689,11 +678,10 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_bfi_b32 v0, v1, 0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -820,19 +808,18 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(ptr addrspace(1) %ptr, i16 inre ; GFX8-LABEL: insertelement_v_v4i16_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_and_b32 s1, s3, 1 ; GFX8-NEXT: s_lshr_b32 s0, s3, 1 +; GFX8-NEXT: s_and_b32 s1, s3, 1 ; GFX8-NEXT: s_and_b32 s2, s2, 0xffff ; GFX8-NEXT: s_lshl_b32 s1, s1, 4 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX8-NEXT: s_lshl_b32 s2, s2, s1 ; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX8-NEXT: s_not_b32 s1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: 
s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s1, v4 +; GFX8-NEXT: v_bfi_b32 v4, s1, 0, v4 ; GFX8-NEXT: v_or_b32_e32 v4, s2, v4 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] @@ -846,19 +833,18 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(ptr addrspace(1) %ptr, i16 inre ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_and_b32 s1, s3, 1 ; GFX7-NEXT: s_lshr_b32 s0, s3, 1 +; GFX7-NEXT: s_and_b32 s1, s3, 1 ; GFX7-NEXT: s_and_b32 s2, s2, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 4 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX7-NEXT: s_lshl_b32 s2, s2, s1 ; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX7-NEXT: s_not_b32 s1, s1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, s1, v2 +; GFX7-NEXT: v_bfi_b32 v2, s1, 0, v2 ; GFX7-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] @@ -1090,8 +1076,7 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1 ; GFX8-NEXT: v_or_b32_e32 v4, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1117,8 +1102,7 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_not_b32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1 ; GFX7-NEXT: v_or_b32_e32 v3, v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -1228,8 +1212,7 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_not_b32_e32 v1, v1 -; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v3 ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1246,17 +1229,16 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: v_mov_b32_e32 v4, s1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX7-NEXT: v_not_b32_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: 
v_mov_b32_e32 v1, s1 @@ -1356,16 +1338,15 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre ; GFX8-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX8-NEXT: s_and_b32 s0, s2, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX8-NEXT: v_lshlrev_b32_e64 v7, v2, s0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, v2, v5 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_bfi_b32 v2, v2, 0, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -1382,16 +1363,15 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX7-NEXT: s_and_b32 s0, s2, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX7-NEXT: v_lshl_b32_e32 v4, s0, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_bfi_b32 v2, v2, 0, v5 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -1479,15 +1459,14 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(ptr addrspace(1) %ptr, i16 %val ; GFX8-NEXT: s_lshr_b32 s0, s2, 1 ; GFX8-NEXT: s_lshl_b32 s1, s1, 4 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: s_not_b32 s1, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v5, s1, v5 +; GFX8-NEXT: v_bfi_b32 v5, s1, 0, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] @@ -1501,19 +1480,18 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(ptr addrspace(1) %ptr, i16 %val ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_and_b32 s1, s2, 1 ; GFX7-NEXT: s_lshr_b32 s0, s2, 1 +; GFX7-NEXT: s_and_b32 s1, s2, 1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: s_lshl_b32 s1, s1, 4 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v2 ; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX7-NEXT: s_not_b32 s1, s1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v3, s1, v3 +; GFX7-NEXT: v_bfi_b32 v3, s1, 0, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] @@ -1601,16 +1579,15 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val 
; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX8-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v3, v6 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX8-NEXT: v_not_b32_e32 v3, v3 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v3, v6, v3 +; GFX8-NEXT: v_bfi_b32 v3, v3, 0, v6 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -1627,16 +1604,15 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val ; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_bfi_b32 v3, v3, 0, v5 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -1910,14 +1886,13 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre ; GFX8-LABEL: insertelement_v_v8i16_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: s_and_b32 s0, s3, 1 ; GFX8-NEXT: s_lshr_b32 s4, s3, 1 +; GFX8-NEXT: s_and_b32 s0, s3, 1 ; GFX8-NEXT: s_and_b32 s1, s2, 0xffff ; GFX8-NEXT: s_lshl_b32 s0, s0, 4 -; GFX8-NEXT: s_lshl_b32 s5, s1, s0 -; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX8-NEXT: s_not_b32 s6, s0 +; GFX8-NEXT: s_lshl_b32 s5, s1, s0 +; GFX8-NEXT: s_lshl_b32 s6, 0xffff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 @@ -1926,7 +1901,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre ; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v6, s6, v6 +; GFX8-NEXT: v_bfi_b32 v6, s6, 0, v6 ; GFX8-NEXT: v_or_b32_e32 v6, s5, v6 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] @@ -1942,14 +1917,13 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_and_b32 s0, s3, 1 ; GFX7-NEXT: s_lshr_b32 s4, s3, 1 +; GFX7-NEXT: s_and_b32 s0, s3, 1 ; GFX7-NEXT: s_and_b32 s1, s2, 0xffff ; GFX7-NEXT: s_lshl_b32 s0, s0, 4 -; GFX7-NEXT: s_lshl_b32 s5, s1, s0 -; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: s_not_b32 s6, s0 +; GFX7-NEXT: s_lshl_b32 s5, s1, s0 +; GFX7-NEXT: s_lshl_b32 s6, 0xffff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: s_mov_b64 
s[8:9], 0 @@ -1958,7 +1932,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre ; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v4, s6, v4 +; GFX7-NEXT: v_bfi_b32 v4, s6, 0, v4 ; GFX7-NEXT: v_or_b32_e32 v4, s5, v4 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] @@ -2263,17 +2237,16 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX8-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_and_b32 s4, s4, 0xffff ; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX8-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] -; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1 ; GFX8-NEXT: v_or_b32_e32 v6, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 @@ -2294,23 +2267,22 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s9 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX7-NEXT: s_and_b32 s4, s4, 0xffff ; GFX7-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX7-NEXT: s_and_b32 s4, s4, 0xffff +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] -; GFX7-NEXT: v_not_b32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1 ; GFX7-NEXT: v_or_b32_e32 v5, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 @@ -2441,23 +2413,22 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX8-NEXT: 
v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] -; GFX8-NEXT: v_not_b32_e32 v1, v1 -; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v2 ; GFX8-NEXT: v_or_b32_e32 v6, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2478,23 +2449,22 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] -; GFX7-NEXT: v_not_b32_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v2 ; GFX7-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2628,7 +2598,6 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v7, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, 0 @@ -2636,7 +2605,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v2 ; GFX8-NEXT: v_or_b32_e32 v9, v0, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc @@ -2658,9 +2627,8 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX7-NEXT: v_not_b32_e32 v1, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s10, -1 @@ -2668,7 +2636,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre ; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v1, v7, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc @@ -2773,13 +2741,12 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val 
; GFX8-LABEL: insertelement_v_v8i16_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: s_and_b32 s0, s2, 1 ; GFX8-NEXT: s_lshr_b32 s4, s2, 1 +; GFX8-NEXT: s_and_b32 s0, s2, 1 ; GFX8-NEXT: s_lshl_b32 s0, s0, 4 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX8-NEXT: s_not_b32 s5, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_lshl_b32 s5, 0xffff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -2789,7 +2756,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX8-NEXT: v_bfi_b32 v1, s5, 0, v1 ; GFX8-NEXT: v_or_b32_e32 v9, v1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] @@ -2805,14 +2772,13 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_and_b32 s0, s2, 1 ; GFX7-NEXT: s_lshr_b32 s4, s2, 1 +; GFX7-NEXT: s_and_b32 s0, s2, 1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX7-NEXT: s_lshl_b32 s0, s0, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 -; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: s_not_b32 s5, s0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: s_lshl_b32 s5, 0xffff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -2821,7 +2787,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val ; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX7-NEXT: v_bfi_b32 v1, s5, 0, v1 ; GFX7-NEXT: v_or_b32_e32 v7, v1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5] @@ -2935,7 +2901,6 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 @@ -2943,7 +2908,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc @@ -2959,15 +2924,14 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s10, -1 @@ -2975,7 +2939,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc @@ -3283,19 +3247,18 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(ptr addrspace(1) %ptr, i16 inr ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GFX8-NEXT: s_and_b32 s0, s3, 1 +; GFX8-NEXT: s_lshr_b32 m0, s3, 1 ; GFX8-NEXT: s_and_b32 s1, s2, 0xffff ; GFX8-NEXT: s_lshl_b32 s0, s0, 4 -; GFX8-NEXT: s_lshr_b32 m0, s3, 1 ; GFX8-NEXT: s_lshl_b32 s1, s1, s0 ; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_movrels_b32_e32 v12, v0 -; GFX8-NEXT: v_and_b32_e32 v12, s0, v12 +; GFX8-NEXT: v_bfi_b32 v12, s0, 0, v12 ; GFX8-NEXT: v_or_b32_e32 v12, s1, v12 ; GFX8-NEXT: v_movreld_b32_e32 v0, v12 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -3310,17 +3273,16 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(ptr addrspace(1) %ptr, i16 inr ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16 ; GFX7-NEXT: s_and_b32 s0, s3, 1 +; GFX7-NEXT: s_lshr_b32 m0, s3, 1 ; GFX7-NEXT: s_and_b32 s1, s2, 0xffff ; GFX7-NEXT: s_lshl_b32 s0, s0, 4 -; GFX7-NEXT: s_lshr_b32 m0, s3, 1 ; GFX7-NEXT: s_lshl_b32 s1, s1, s0 ; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_movrels_b32_e32 v0, v2 -; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, s1, v0 ; GFX7-NEXT: v_movreld_b32_e32 v2, v0 ; GFX7-NEXT: buffer_store_dwordx4 v[2:5], off, s[4:7], 0 @@ -3644,21 +3606,20 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i ; GFX8-NEXT: v_mov_b32_e32 v6, s21 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_mov_b32_e32 v7, s22 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX8-NEXT: s_and_b32 s4, s4, 0xffff -; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_mov_b32_e32 v9, s23 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX8-NEXT: s_and_b32 s4, s4, 0xffff +; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX8-NEXT: v_cndmask_b32_e64 
v1, v1, v9, s[10:11] ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] -; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1 ; GFX8-NEXT: v_or_b32_e32 v9, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 @@ -3705,20 +3666,19 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i ; GFX7-NEXT: v_mov_b32_e32 v6, s21 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_mov_b32_e32 v7, s22 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX7-NEXT: s_and_b32 s4, s4, 0xffff +; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_mov_b32_e32 v9, s23 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX7-NEXT: s_and_b32 s4, s4, 0xffff +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] ; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] -; GFX7-NEXT: v_not_b32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1 ; GFX7-NEXT: v_or_b32_e32 v9, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, s16 ; GFX7-NEXT: v_mov_b32_e32 v1, s17 @@ -3936,20 +3896,19 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i ; GFX8-NEXT: v_mov_b32_e32 v7, s17 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX8-NEXT: v_mov_b32_e32 v9, s18 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX8-NEXT: v_mov_b32_e32 v10, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] -; GFX8-NEXT: v_not_b32_e32 v1, v1 -; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v2 ; GFX8-NEXT: v_or_b32_e32 v9, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 @@ -3996,20 +3955,19 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i ; GFX7-NEXT: v_mov_b32_e32 v7, s17 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 -; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX7-NEXT: v_mov_b32_e32 v9, s18 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX7-NEXT: v_mov_b32_e32 v10, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] ; GFX7-NEXT: 
v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] -; GFX7-NEXT: v_not_b32_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v2 ; GFX7-NEXT: v_or_b32_e32 v9, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s12 ; GFX7-NEXT: v_mov_b32_e32 v1, s13 @@ -4216,7 +4174,6 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v1 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v1 ; GFX8-NEXT: v_mov_b32_e32 v12, 0 @@ -4231,7 +4188,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[6:7] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] -; GFX8-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v2 ; GFX8-NEXT: v_or_b32_e32 v15, v0, v15 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v15, s[12:13] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc @@ -4263,9 +4220,8 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 -; GFX7-NEXT: v_not_b32_e32 v1, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GFX7-NEXT: s_mov_b64 s[16:17], 0 ; GFX7-NEXT: s_mov_b32 s18, -1 @@ -4278,7 +4234,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr ; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7] ; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9] ; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11] -; GFX7-NEXT: v_and_b32_e32 v1, v11, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v11 ; GFX7-NEXT: v_or_b32_e32 v11, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc @@ -4452,14 +4408,13 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(ptr addrspace(1) %ptr, i16 %va ; GFX8-NEXT: v_mov_b32_e32 v13, s0 ; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_mov_b32_e32 v11, 16 ; GFX8-NEXT: v_mov_b32_e32 v12, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_movrels_b32_e32 v13, v3 -; GFX8-NEXT: v_and_b32_e32 v13, s0, v13 +; GFX8-NEXT: v_bfi_b32 v13, s0, 0, v13 ; GFX8-NEXT: v_or_b32_e32 v2, v13, v2 ; GFX8-NEXT: v_movreld_b32_e32 v3, v2 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[3:6] @@ -4474,17 +4429,16 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(ptr addrspace(1) %ptr, i16 %va ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:16 ; GFX7-NEXT: s_and_b32 s0, s2, 1 +; GFX7-NEXT: s_lshr_b32 m0, s2, 1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX7-NEXT: s_lshl_b32 s0, s0, 4 -; GFX7-NEXT: s_lshr_b32 m0, s2, 1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 ; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: 
s_waitcnt vmcnt(0) ; GFX7-NEXT: v_movrels_b32_e32 v1, v3 -; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_bfi_b32 v1, s0, 0, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_movreld_b32_e32 v3, v0 ; GFX7-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 @@ -4611,7 +4565,6 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v1 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v12, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v1 ; GFX8-NEXT: v_mov_b32_e32 v13, 0 @@ -4626,7 +4579,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] -; GFX8-NEXT: v_and_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v3 ; GFX8-NEXT: v_or_b32_e32 v16, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v16, s[12:13] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v16, vcc @@ -4654,13 +4607,12 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 -; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GFX7-NEXT: s_mov_b64 s[16:17], 0 ; GFX7-NEXT: s_mov_b32 s18, -1 @@ -4673,7 +4625,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] -; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3 ; GFX7-NEXT: v_or_b32_e32 v12, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll index fe7d421d27f84..4598bcc04a505 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -910,9 +910,8 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX8-NEXT: s_lshl_b32 s0, s0, 3 ; GFX8-NEXT: s_lshl_b32 s1, s1, s0 ; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 -; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX8-NEXT: v_bfi_b32 v2, s0, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, s1, v2 @@ -930,11 +929,10 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: s_lshl_b32 s0, s0, 3 ; GFX7-NEXT: s_lshl_b32 s1, s1, s0 ; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 -; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, s1, v0 ; GFX7-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm @@ -1089,9 +1087,8 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX8-NEXT: v_mov_b32_e32 v1, 0xff ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 +; GFX8-NEXT: v_bfi_b32 v3, v0, 0, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 @@ -1106,9 +1103,8 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX7-NEXT: s_and_b32 s1, s4, 0xff ; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_not_b32_e32 v0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_bfi_b32 v0, v0, 0, s0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -1180,9 +1176,8 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX8-NEXT: v_mov_b32_e32 v2, 0xff ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v1, v2 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX8-NEXT: v_bfi_b32 v2, v0, 0, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1197,9 +1192,8 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, s0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -1269,12 +1263,11 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX8-NEXT: s_and_b32 s0, s2, 0xff -; GFX8-NEXT: v_not_b32_e32 v1, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v2, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, v0, v1 +; GFX8-NEXT: v_bfi_b32 v2, v1, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1292,11 +1285,10 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_bfi_b32 v0, v1, 0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm @@ -1363,10 +1355,9 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX8-NEXT: s_lshl_b32 s0, s0, 3 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 -; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_waitcnt 
vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 +; GFX8-NEXT: v_bfi_b32 v3, s0, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 @@ -1384,11 +1375,10 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: s_lshl_b32 s0, s0, 3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, s0, v1 ; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 -; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm @@ -1455,10 +1445,9 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX8-NEXT: v_mov_b32_e32 v1, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, v0, v1 +; GFX8-NEXT: v_bfi_b32 v3, v1, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 @@ -1476,11 +1465,10 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_bfi_b32 v0, v1, 0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1683,19 +1671,18 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX8-LABEL: insertelement_v_v8i8_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_and_b32 s1, s3, 3 ; GFX8-NEXT: s_lshr_b32 s0, s3, 2 +; GFX8-NEXT: s_and_b32 s1, s3, 3 ; GFX8-NEXT: s_and_b32 s2, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 3 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX8-NEXT: s_lshl_b32 s2, s2, s1 ; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX8-NEXT: s_not_b32 s1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s1, v4 +; GFX8-NEXT: v_bfi_b32 v4, s1, 0, v4 ; GFX8-NEXT: v_or_b32_e32 v4, s2, v4 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] @@ -1709,19 +1696,18 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_and_b32 s1, s3, 3 ; GFX7-NEXT: s_lshr_b32 s0, s3, 2 +; GFX7-NEXT: s_and_b32 s1, s3, 3 ; GFX7-NEXT: s_and_b32 s2, s2, 0xff ; GFX7-NEXT: s_lshl_b32 s1, s1, 3 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX7-NEXT: s_lshl_b32 s2, s2, s1 ; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX7-NEXT: s_not_b32 s1, s1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, s1, v2 +; GFX7-NEXT: 
v_bfi_b32 v2, s1, 0, v2 ; GFX7-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] @@ -1953,8 +1939,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1 ; GFX8-NEXT: v_or_b32_e32 v4, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1980,8 +1965,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_not_b32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1 ; GFX7-NEXT: v_or_b32_e32 v3, v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -2091,8 +2075,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_not_b32_e32 v1, v1 -; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v3 ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2109,17 +2092,16 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 2, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: v_mov_b32_e32 v4, s1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX7-NEXT: v_not_b32_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -2219,16 +2201,15 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_and_b32 s0, s2, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX8-NEXT: v_lshlrev_b32_e64 v7, v2, s0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, v2, v5 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_bfi_b32 v2, v2, 0, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -2245,16 +2226,15 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX7-NEXT: s_and_b32 s0, s2, 0xff ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: 
v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX7-NEXT: v_lshl_b32_e32 v4, s0, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xff, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_bfi_b32 v2, v2, 0, v5 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -2342,15 +2322,14 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX8-NEXT: s_lshr_b32 s0, s2, 2 ; GFX8-NEXT: s_lshl_b32 s1, s1, 3 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: s_not_b32 s1, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v5, s1, v5 +; GFX8-NEXT: v_bfi_b32 v5, s1, 0, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] @@ -2364,19 +2343,18 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_and_b32 s1, s2, 3 ; GFX7-NEXT: s_lshr_b32 s0, s2, 2 +; GFX7-NEXT: s_and_b32 s1, s2, 3 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: s_lshl_b32 s1, s1, 3 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v2 ; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX7-NEXT: s_not_b32 s1, s1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v3, s1, v3 +; GFX7-NEXT: v_bfi_b32 v3, s1, 0, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] @@ -2464,16 +2442,15 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX8-NEXT: v_mov_b32_e32 v6, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v3, v6 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX8-NEXT: v_not_b32_e32 v3, v3 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v3, v6, v3 +; GFX8-NEXT: v_bfi_b32 v3, v3, 0, v6 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -2490,16 +2467,15 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 ; 
GFX7-NEXT: v_lshl_b32_e32 v3, 0xff, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_bfi_b32 v3, v3, 0, v5 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -2773,14 +2749,13 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX8-LABEL: insertelement_v_v16i8_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: s_and_b32 s0, s3, 3 ; GFX8-NEXT: s_lshr_b32 s4, s3, 2 +; GFX8-NEXT: s_and_b32 s0, s3, 3 ; GFX8-NEXT: s_and_b32 s1, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s0, 3 -; GFX8-NEXT: s_lshl_b32 s5, s1, s0 -; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX8-NEXT: s_not_b32 s6, s0 +; GFX8-NEXT: s_lshl_b32 s5, s1, s0 +; GFX8-NEXT: s_lshl_b32 s6, 0xff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 @@ -2789,7 +2764,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v6, s6, v6 +; GFX8-NEXT: v_bfi_b32 v6, s6, 0, v6 ; GFX8-NEXT: v_or_b32_e32 v6, s5, v6 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] @@ -2805,14 +2780,13 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_and_b32 s0, s3, 3 ; GFX7-NEXT: s_lshr_b32 s4, s3, 2 +; GFX7-NEXT: s_and_b32 s0, s3, 3 ; GFX7-NEXT: s_and_b32 s1, s2, 0xff ; GFX7-NEXT: s_lshl_b32 s0, s0, 3 -; GFX7-NEXT: s_lshl_b32 s5, s1, s0 -; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: s_not_b32 s6, s0 +; GFX7-NEXT: s_lshl_b32 s5, s1, s0 +; GFX7-NEXT: s_lshl_b32 s6, 0xff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -2821,7 +2795,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v4, s6, v4 +; GFX7-NEXT: v_bfi_b32 v4, s6, 0, v4 ; GFX7-NEXT: v_or_b32_e32 v4, s5, v4 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] @@ -3126,17 +3100,16 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX8-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_and_b32 s4, s4, 0xff ; GFX8-NEXT: v_mov_b32_e32 v3, 0xff -; GFX8-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; 
GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] -; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1 ; GFX8-NEXT: v_or_b32_e32 v6, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 @@ -3157,23 +3130,22 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s9 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX7-NEXT: s_and_b32 s4, s4, 0xff ; GFX7-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX7-NEXT: s_and_b32 s4, s4, 0xff +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] -; GFX7-NEXT: v_not_b32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1 ; GFX7-NEXT: v_or_b32_e32 v5, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 @@ -3304,23 +3276,22 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 2, v1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, 0xff ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, 0xff +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] -; GFX8-NEXT: v_not_b32_e32 v1, v1 -; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v2 ; GFX8-NEXT: v_or_b32_e32 v6, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -3341,23 +3312,22 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX7-NEXT: 
v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] -; GFX7-NEXT: v_not_b32_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v2 ; GFX7-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -3491,7 +3461,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v7, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, 0 @@ -3499,7 +3468,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v2 ; GFX8-NEXT: v_or_b32_e32 v9, v0, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc @@ -3521,9 +3490,8 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX7-NEXT: v_not_b32_e32 v1, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s10, -1 @@ -3531,7 +3499,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v1, v7, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc @@ -3636,13 +3604,12 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX8-LABEL: insertelement_v_v16i8_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: s_and_b32 s0, s2, 3 ; GFX8-NEXT: s_lshr_b32 s4, s2, 2 +; GFX8-NEXT: s_and_b32 s0, s2, 3 ; GFX8-NEXT: s_lshl_b32 s0, s0, 3 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX8-NEXT: s_not_b32 s5, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_lshl_b32 s5, 0xff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -3652,7 +3619,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX8-NEXT: v_bfi_b32 v1, s5, 0, v1 ; GFX8-NEXT: v_or_b32_e32 v9, v1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] @@ -3668,14 +3635,13 
@@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_and_b32 s0, s2, 3 ; GFX7-NEXT: s_lshr_b32 s4, s2, 2 +; GFX7-NEXT: s_and_b32 s0, s2, 3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v2 ; GFX7-NEXT: s_lshl_b32 s0, s0, 3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 -; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: s_not_b32 s5, s0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: s_lshl_b32 s5, 0xff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -3684,7 +3650,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX7-NEXT: v_bfi_b32 v1, s5, 0, v1 ; GFX7-NEXT: v_or_b32_e32 v7, v1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5] @@ -3798,7 +3764,6 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 @@ -3806,7 +3771,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc @@ -3822,15 +3787,14 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 2, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 3, v3 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s10, -1 @@ -3838,7 +3802,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc diff --git a/llvm/test/CodeGen/AMDGPU/andorn2.ll b/llvm/test/CodeGen/AMDGPU/andorn2.ll index e22cee87e17da..d0e32fc205144 100644 --- a/llvm/test/CodeGen/AMDGPU/andorn2.ll +++ 
b/llvm/test/CodeGen/AMDGPU/andorn2.ll @@ -48,8 +48,7 @@ entry: } ; GCN-LABEL: {{^}}vector_andn2_i32_s_v_one_use -; GCN: v_not_b32 -; GCN: v_and_b32 +; GCN: v_bfi_b32 define amdgpu_kernel void @vector_andn2_i32_s_v_one_use( ptr addrspace(1) %r0, i32 %s) { entry: @@ -61,8 +60,7 @@ entry: } ; GCN-LABEL: {{^}}vector_andn2_i32_v_s_one_use -; GCN: s_not_b32 -; GCN: v_and_b32 +; GCN: v_bfi_b32 define amdgpu_kernel void @vector_andn2_i32_v_s_one_use( ptr addrspace(1) %r0, i32 %s) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll index 089d6f5bf57ca..1492119a6022d 100644 --- a/llvm/test/CodeGen/AMDGPU/anyext.ll +++ b/llvm/test/CodeGen/AMDGPU/anyext.ll @@ -77,8 +77,7 @@ define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace ; GCN-NEXT: s_mov_b32 s9, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GCN-NEXT: v_not_b32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_bfi_b32 v0, v0, 0, 1 ; GCN-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GCN-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/bitop3.ll b/llvm/test/CodeGen/AMDGPU/bitop3.ll index 187f19f653858..52d4780005aad 100644 --- a/llvm/test/CodeGen/AMDGPU/bitop3.ll +++ b/llvm/test/CodeGen/AMDGPU/bitop3.ll @@ -99,9 +99,8 @@ define amdgpu_ps float @and_not_and_and(i32 %a, i32 %b, i32 %c) { ; ; GFX950-GISEL-LABEL: and_not_and_and: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_not_b32_e32 v1, v1 ; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX950-GISEL-NEXT: v_bfi_b32 v0, v1, 0, v0 ; GFX950-GISEL-NEXT: ; return to shader part epilog ; ; GFX1250-SDAG-LABEL: and_not_and_and: @@ -111,10 +110,9 @@ define amdgpu_ps float @and_not_and_and(i32 %a, i32 %b, i32 %c) { ; ; GFX1250-GISEL-LABEL: and_not_and_and: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_not_b32_e32 v1, v1 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX1250-GISEL-NEXT: v_bfi_b32 v0, v1, 0, v0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog %notb = xor i32 %b, -1 %and1 = and i32 %a, %c @@ -268,9 +266,8 @@ define amdgpu_ps float @test_12_src_overflow(i32 %a, i32 %b, i32 %c) { ; GFX950-GISEL-LABEL: test_12_src_overflow: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_not_b32_e32 v3, v0 -; GFX950-GISEL-NEXT: v_not_b32_e32 v4, v2 ; GFX950-GISEL-NEXT: v_bitop3_b32 v0, v0, v2, v0 bitop3:0xc -; GFX950-GISEL-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX950-GISEL-NEXT: v_bfi_b32 v2, v2, 0, v3 ; GFX950-GISEL-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0xc8 ; GFX950-GISEL-NEXT: ; return to shader part epilog ; @@ -282,10 +279,9 @@ define amdgpu_ps float @test_12_src_overflow(i32 %a, i32 %b, i32 %c) { ; GFX1250-GISEL-LABEL: test_12_src_overflow: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_not_b32_e32 v3, v0 -; GFX1250-GISEL-NEXT: v_not_b32_e32 v4, v2 ; GFX1250-GISEL-NEXT: v_bitop3_b32 v0, v0, v2, v0 bitop3:0xc ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX1250-GISEL-NEXT: v_bfi_b32 v2, v2, 0, v3 ; GFX1250-GISEL-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0xc8 ; GFX1250-GISEL-NEXT: ; return to shader part epilog %nota = xor i32 %a, -1 @@ -312,13 +308,12 @@ define amdgpu_ps float @test_100_src_overflow(i32 %a, i32 %b, i32 %c) { ; ; GFX950-GISEL-LABEL: test_100_src_overflow: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: 
v_bitop3_b32 v3, v2, v0, v2 bitop3:3 -; GFX950-GISEL-NEXT: v_and_b32_e32 v3, v1, v3 +; GFX950-GISEL-NEXT: v_or_b32_e32 v3, v2, v0 ; GFX950-GISEL-NEXT: v_bitop3_b32 v4, v0, v1, v0 bitop3:0x30 ; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v1, v0 -; GFX950-GISEL-NEXT: v_not_b32_e32 v1, v2 +; GFX950-GISEL-NEXT: v_bfi_b32 v3, v3, 0, v1 ; GFX950-GISEL-NEXT: v_and_b32_e32 v4, v4, v2 -; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX950-GISEL-NEXT: v_bfi_b32 v0, v2, 0, v0 ; GFX950-GISEL-NEXT: v_or3_b32 v0, v3, v4, v0 ; GFX950-GISEL-NEXT: ; return to shader part epilog ; @@ -333,16 +328,15 @@ define amdgpu_ps float @test_100_src_overflow(i32 %a, i32 %b, i32 %c) { ; ; GFX1250-GISEL-LABEL: test_100_src_overflow: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_bitop3_b32 v3, v2, v0, v2 bitop3:3 +; GFX1250-GISEL-NEXT: v_or_b32_e32 v3, v2, v0 ; GFX1250-GISEL-NEXT: v_bitop3_b32 v4, v0, v1, v0 bitop3:0x30 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, v1, v0 -; GFX1250-GISEL-NEXT: v_not_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, v1, v3 -; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, v4, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-GISEL-NEXT: v_bfi_b32 v1, v3, 0, v1 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v3, v4, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, v0, v5 -; GFX1250-GISEL-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX1250-GISEL-NEXT: v_bfi_b32 v0, v2, 0, v0 +; GFX1250-GISEL-NEXT: v_or3_b32 v0, v1, v3, v0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog %or1 = or i32 %c, %a %not1 = xor i32 %or1, -1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index 355f77acfd302..ba5ce8bb5fae7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -85,10 +85,8 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: v_add_i32_e32 v6, vcc, s4, v4 ; SI-NEXT: v_lshr_b64 v[4:5], s[2:3], v6 ; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3 -; SI-NEXT: v_not_b32_e32 v5, v5 -; SI-NEXT: v_not_b32_e32 v4, v4 -; SI-NEXT: v_and_b32_e32 v5, v3, v5 -; SI-NEXT: v_and_b32_e32 v4, v2, v4 +; SI-NEXT: v_bfi_b32 v5, v5, 0, v3 +; SI-NEXT: v_bfi_b32 v4, v4, 0, v2 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v6 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc