Skip to content

Conversation

jayfoad
Copy link
Contributor

@jayfoad jayfoad commented Sep 3, 2025

No description provided.

// (z & ~x)
def : AMDGPUPatIgnoreCopies <
  (DivergentBinFrag<and> i32:$z, (not_oneuse i32:$x)),
  (V_BFI_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32), (i32 0),
                 (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32))
>;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am really not sure if/why we need COPY_TO_REGCLASS here. This is just copied from above.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suspect this is a really old workaround. I remember using COPY_TO_REGCLASS to work around an old tablegen bug with multiple results

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, since the general bfi pattern has three inputs, I think this handles the case where two of them are in sgprs. Without the COPY_TO_REGCLASS we could select an instruction that violates the constant bus constraint (pre GFX10).

For this new pattern there are only two inputs so that is not a problem.

Copy link
Contributor

@arsenm arsenm left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM but can probably drop the COPY_TO_REGCLASSes

@jayfoad jayfoad enabled auto-merge (squash) September 3, 2025 10:33
@jayfoad jayfoad merged commit d4de780 into llvm:main Sep 3, 2025
9 checks passed
@jayfoad jayfoad deleted the bfi-and-not branch September 3, 2025 12:07
@llvmbot
Copy link
Member

llvmbot commented Sep 3, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Jay Foad (jayfoad)

Changes

Patch is 236.81 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/156636.diff

10 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+16)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll (+14-28)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll (+604-650)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll (+423-466)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll (+117-165)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll (+82-118)
  • (modified) llvm/test/CodeGen/AMDGPU/andorn2.ll (+2-4)
  • (modified) llvm/test/CodeGen/AMDGPU/anyext.ll (+1-2)
  • (modified) llvm/test/CodeGen/AMDGPU/bitop3.ll (+13-19)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll (+2-4)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 46eab2a0a98c7..9cc9af7575db6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2480,6 +2480,22 @@ def : AMDGPUPatIgnoreCopies <
               (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
 >;
 
+// (z & ~x)
+def : AMDGPUPatIgnoreCopies <
+  (DivergentBinFrag<and> i32:$z, (not_oneuse i32:$x)),
+  (V_BFI_B32_e64 VSrc_b32:$x, (i32 0), VSrc_b32:$z)
+>;
+
+// 64-bit version
+def : AMDGPUPatIgnoreCopies <
+  (DivergentBinFrag<and> i64:$z, (not_oneuse i64:$x)),
+  (REG_SEQUENCE VReg_64,
+    (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), (i32 0),
+                   (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0,
+    (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), (i32 0),
+                   (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
+>;
+
 // SHA-256 Ch function
 // z ^ (x & (y ^ z))
 def : AMDGPUPatIgnoreCopies <
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
index e1ef3f9be0a5d..aa38c63dc9dcd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -99,15 +99,13 @@ define i32 @v_andn2_i32(i32 %src0, i32 %src1) {
 ; GCN-LABEL: v_andn2_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_not_b32_e32 v1, v1
-; GCN-NEXT:    v_and_b32_e32 v0, v0, v1
+; GCN-NEXT:    v_bfi_b32 v0, v1, 0, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_andn2_i32:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT:    v_not_b32_e32 v1, v1
-; GFX10PLUS-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    v_bfi_b32 v0, v1, 0, v0
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %not.src1 = xor i32 %src1, -1
   %and = and i32 %src0, %not.src1
@@ -117,14 +115,12 @@ define i32 @v_andn2_i32(i32 %src0, i32 %src1) {
 define amdgpu_ps float @v_andn2_i32_sv(i32 inreg %src0, i32 %src1) {
 ; GCN-LABEL: v_andn2_i32_sv:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    v_not_b32_e32 v0, v0
-; GCN-NEXT:    v_and_b32_e32 v0, s2, v0
+; GCN-NEXT:    v_bfi_b32 v0, v0, 0, s2
 ; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: v_andn2_i32_sv:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    v_not_b32_e32 v0, v0
-; GFX10PLUS-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX10PLUS-NEXT:    v_bfi_b32 v0, v0, 0, s2
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %not.src1 = xor i32 %src1, -1
   %and = and i32 %src0, %not.src1
@@ -135,14 +131,12 @@ define amdgpu_ps float @v_andn2_i32_sv(i32 inreg %src0, i32 %src1) {
 define amdgpu_ps float @v_andn2_i32_vs(i32 %src0, i32 inreg %src1) {
 ; GCN-LABEL: v_andn2_i32_vs:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_not_b32 s0, s2
-; GCN-NEXT:    v_and_b32_e32 v0, s0, v0
+; GCN-NEXT:    v_bfi_b32 v0, s2, 0, v0
 ; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: v_andn2_i32_vs:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_not_b32 s0, s2
-; GFX10PLUS-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX10PLUS-NEXT:    v_bfi_b32 v0, s2, 0, v0
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %not.src1 = xor i32 %src1, -1
   %and = and i32 %src0, %not.src1
@@ -247,19 +241,15 @@ define i64 @v_andn2_i64(i64 %src0, i64 %src1) {
 ; GCN-LABEL: v_andn2_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_not_b32_e32 v2, v2
-; GCN-NEXT:    v_not_b32_e32 v3, v3
-; GCN-NEXT:    v_and_b32_e32 v0, v0, v2
-; GCN-NEXT:    v_and_b32_e32 v1, v1, v3
+; GCN-NEXT:    v_bfi_b32 v0, v2, 0, v0
+; GCN-NEXT:    v_bfi_b32 v1, v3, 0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_andn2_i64:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT:    v_not_b32_e32 v2, v2
-; GFX10PLUS-NEXT:    v_not_b32_e32 v3, v3
-; GFX10PLUS-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX10PLUS-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX10PLUS-NEXT:    v_bfi_b32 v0, v2, 0, v0
+; GFX10PLUS-NEXT:    v_bfi_b32 v1, v3, 0, v1
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %not.src1 = xor i64 %src1, -1
   %and = and i64 %src0, %not.src1
@@ -269,18 +259,14 @@ define i64 @v_andn2_i64(i64 %src0, i64 %src1) {
 define amdgpu_ps <2 x float> @v_andn2_i64_sv(i64 inreg %src0, i64 %src1) {
 ; GCN-LABEL: v_andn2_i64_sv:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    v_not_b32_e32 v0, v0
-; GCN-NEXT:    v_not_b32_e32 v1, v1
-; GCN-NEXT:    v_and_b32_e32 v0, s2, v0
-; GCN-NEXT:    v_and_b32_e32 v1, s3, v1
+; GCN-NEXT:    v_bfi_b32 v0, v0, 0, s2
+; GCN-NEXT:    v_bfi_b32 v1, v1, 0, s3
 ; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: v_andn2_i64_sv:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    v_not_b32_e32 v0, v0
-; GFX10PLUS-NEXT:    v_not_b32_e32 v1, v1
-; GFX10PLUS-NEXT:    v_and_b32_e32 v0, s2, v0
-; GFX10PLUS-NEXT:    v_and_b32_e32 v1, s3, v1
+; GFX10PLUS-NEXT:    v_bfi_b32 v0, v0, 0, s2
+; GFX10PLUS-NEXT:    v_bfi_b32 v1, v1, 0, s3
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %not.src1 = xor i64 %src1, -1
   %and = and i64 %src0, %not.src1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index fc81e16d68e98..fd329e230e78b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -396,8 +396,7 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v3, 7, v2
-; GFX6-NEXT:    v_not_b32_e32 v2, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX6-NEXT:    v_bfi_b32 v2, v2, 0, 7
 ; GFX6-NEXT:    v_bfe_u32 v1, v1, 1, 7
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
@@ -784,19 +783,17 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX6-LABEL: v_fshl_v2i8:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
 ; GFX6-NEXT:    v_and_b32_e32 v5, 7, v2
-; GFX6-NEXT:    v_not_b32_e32 v2, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
-; GFX6-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX6-NEXT:    v_bfi_b32 v2, v2, 0, 7
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v5, v0
 ; GFX6-NEXT:    v_bfe_u32 v5, v1, 1, 7
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v2, v5
+; GFX6-NEXT:    v_bfe_u32 v1, v1, 8, 8
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 7, v4
-; GFX6-NEXT:    v_not_b32_e32 v4, v4
-; GFX6-NEXT:    v_bfe_u32 v1, v1, 8, 8
-; GFX6-NEXT:    v_and_b32_e32 v4, 7, v4
+; GFX6-NEXT:    v_bfi_b32 v4, v4, 0, 7
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v4, v1
@@ -1184,38 +1181,34 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX6-LABEL: v_fshl_v4i8:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
 ; GFX6-NEXT:    v_and_b32_e32 v9, 7, v2
-; GFX6-NEXT:    v_not_b32_e32 v2, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
-; GFX6-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
+; GFX6-NEXT:    v_bfi_b32 v2, v2, 0, 7
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v9, v0
 ; GFX6-NEXT:    v_bfe_u32 v9, v1, 1, 7
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v2, v9
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 7, v6
-; GFX6-NEXT:    v_not_b32_e32 v6, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_bfe_u32 v3, v1, 8, 8
-; GFX6-NEXT:    v_and_b32_e32 v6, 7, v6
+; GFX6-NEXT:    v_bfi_b32 v6, v6, 0, 7
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v6, v3
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_and_b32_e32 v3, 7, v7
-; GFX6-NEXT:    v_not_b32_e32 v6, v7
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    v_bfe_u32 v4, v1, 16, 8
-; GFX6-NEXT:    v_and_b32_e32 v6, 7, v6
+; GFX6-NEXT:    v_bfi_b32 v6, v7, 0, 7
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v6, v4
-; GFX6-NEXT:    v_not_b32_e32 v6, v8
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 7, v8
-; GFX6-NEXT:    v_and_b32_e32 v6, 7, v6
+; GFX6-NEXT:    v_bfi_b32 v6, v8, 0, 7
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 25, v1
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
@@ -5023,10 +5016,9 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
 ; GFX6-LABEL: v_fshl_i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v5, 63, v4
 ; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], 1
-; GFX6-NEXT:    v_not_b32_e32 v4, v4
-; GFX6-NEXT:    v_and_b32_e32 v4, 63, v4
+; GFX6-NEXT:    v_and_b32_e32 v5, 63, v4
+; GFX6-NEXT:    v_bfi_b32 v4, v4, 0, 63
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], v5
 ; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], v4
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -5036,10 +5028,9 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
 ; GFX8-LABEL: v_fshl_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v5, 63, v4
 ; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX8-NEXT:    v_not_b32_e32 v4, v4
-; GFX8-NEXT:    v_and_b32_e32 v4, 63, v4
+; GFX8-NEXT:    v_and_b32_e32 v5, 63, v4
+; GFX8-NEXT:    v_bfi_b32 v4, v4, 0, 63
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
 ; GFX8-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -5049,10 +5040,9 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
 ; GFX9-LABEL: v_fshl_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v5, 63, v4
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT:    v_not_b32_e32 v4, v4
-; GFX9-NEXT:    v_and_b32_e32 v4, 63, v4
+; GFX9-NEXT:    v_and_b32_e32 v5, 63, v4
+; GFX9-NEXT:    v_bfi_b32 v4, v4, 0, 63
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -5062,12 +5052,11 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
 ; GFX10-LABEL: v_fshl_i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_not_b32_e32 v5, v4
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX10-NEXT:    v_and_b32_e32 v4, 63, v4
-; GFX10-NEXT:    v_and_b32_e32 v5, 63, v5
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
-; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v5, v[2:3]
+; GFX10-NEXT:    v_and_b32_e32 v5, 63, v4
+; GFX10-NEXT:    v_bfi_b32 v4, v4, 0, 63
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -5075,16 +5064,14 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
 ; GFX11-LABEL: v_fshl_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_not_b32_e32 v5, v4
 ; GFX11-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX11-NEXT:    v_and_b32_e32 v4, 63, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v5, 63, v5
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v5, v[2:3]
+; GFX11-NEXT:    v_and_b32_e32 v5, 63, v4
+; GFX11-NEXT:    v_bfi_b32 v4, v4, 0, 63
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
+; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt)
@@ -5204,10 +5191,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64
 ; GFX6-LABEL: v_fshl_i64_ssv:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_and_b32_e32 v1, 63, v0
-; GFX6-NEXT:    v_not_b32_e32 v0, v0
 ; GFX6-NEXT:    v_lshl_b64 v[1:2], s[0:1], v1
 ; GFX6-NEXT:    s_lshr_b64 s[0:1], s[2:3], 1
-; GFX6-NEXT:    v_and_b32_e32 v0, 63, v0
+; GFX6-NEXT:    v_bfi_b32 v0, v0, 0, 63
 ; GFX6-NEXT:    v_lshr_b64 v[3:4], s[0:1], v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v3
 ; GFX6-NEXT:    v_or_b32_e32 v1, v2, v4
@@ -5216,10 +5202,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64
 ; GFX8-LABEL: v_fshl_i64_ssv:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    v_and_b32_e32 v1, 63, v0
-; GFX8-NEXT:    v_not_b32_e32 v0, v0
 ; GFX8-NEXT:    v_lshlrev_b64 v[1:2], v1, s[0:1]
 ; GFX8-NEXT:    s_lshr_b64 s[0:1], s[2:3], 1
-; GFX8-NEXT:    v_and_b32_e32 v0, 63, v0
+; GFX8-NEXT:    v_bfi_b32 v0, v0, 0, 63
 ; GFX8-NEXT:    v_lshrrev_b64 v[3:4], v0, s[0:1]
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v3
 ; GFX8-NEXT:    v_or_b32_e32 v1, v2, v4
@@ -5228,10 +5213,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64
 ; GFX9-LABEL: v_fshl_i64_ssv:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_and_b32_e32 v1, 63, v0
-; GFX9-NEXT:    v_not_b32_e32 v0, v0
 ; GFX9-NEXT:    v_lshlrev_b64 v[1:2], v1, s[0:1]
 ; GFX9-NEXT:    s_lshr_b64 s[0:1], s[2:3], 1
-; GFX9-NEXT:    v_and_b32_e32 v0, 63, v0
+; GFX9-NEXT:    v_bfi_b32 v0, v0, 0, 63
 ; GFX9-NEXT:    v_lshrrev_b64 v[3:4], v0, s[0:1]
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v3
 ; GFX9-NEXT:    v_or_b32_e32 v1, v2, v4
@@ -5239,11 +5223,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64
 ;
 ; GFX10-LABEL: v_fshl_i64_ssv:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_not_b32_e32 v1, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 63, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 63, v0
+; GFX10-NEXT:    v_bfi_b32 v2, v0, 0, 63
 ; GFX10-NEXT:    s_lshr_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT:    v_and_b32_e32 v2, 63, v1
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, s[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v1, s[0:1]
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v2, s[2:3]
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
@@ -5251,16 +5234,14 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64
 ;
 ; GFX11-LABEL: v_fshl_i64_ssv:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_not_b32_e32 v1, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 63, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 63, v0
+; GFX11-NEXT:    v_bfi_b32 v2, v0, 0, 63
 ; GFX11-NEXT:    s_lshr_b64 s[2:3], s[2:3], 1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v2, 63, v1
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v1, s[0:1]
 ; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v2, s[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt)
@@ -5466,18 +5447,16 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
 ; GFX6-LABEL: v_fshl_v2i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v9, 63, v8
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[4:5], 1
-; GFX6-NEXT:    v_not_b32_e32 v8, v8
-; GFX6-NEXT:    v_and_b32_e32 v8, 63, v8
+; GFX6-NEXT:    v_and_b32_e32 v9, 63, v8
+; GFX6-NEXT:    v_bfi_b32 v8, v8, 0, 63
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], v9
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[4:5], v8
 ; GFX6-NEXT:    v_lshr_b64 v[6:7], v[6:7], 1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 63, v10
 ; GFX6-NEXT:    v_lshl_b64 v[2:3], v[2:3], v4
-; GFX6-NEXT:    v_not_b32_e32 v4, v10
-; GFX6-NEXT:    v_and_b32_e32 v4, 63, v4
+; GFX6-NEXT:    v_bfi_b32 v4, v10, 0, 63
 ; GFX6-NEXT:    v_lshr_b64 v[6:7], v[6:7], v4
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v6
@@ -5487,18 +5466,16 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
 ; GFX8-LABEL: v_fshl_v2i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v9, 63, v8
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], 1, v[4:5]
-; GFX8-NEXT:    v_not_b32_e32 v8, v8
-; GFX8-NEXT:    v_and_b32_e32 v8, 63, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 63, v8
+; GFX8-NEXT:    v_bfi_b32 v8, v8, 0, 63
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
 ; GFX8-NEXT:    v_lshrrev_b64 v[6:7], 1, v[6:7]
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_and_b32_e32 v4, 63, v10
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX8-NEXT:    v_not_b32_e32 v4, v10
-; GFX8-NEXT:    v_and_b32_e32 v4, 63, v4
+; GFX8-NEXT:    v_bfi_b32 v4, v10, 0, 63
 ; GFX8-NEXT:    v_lshrrev_b64 v[6:7], v4, v[6:7]
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v6
@@ -5508,18 +5485,16 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
 ; GFX9-LABEL: v_fshl_v2i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v9, 63, v8
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], 1, v[4:5]
-; GFX9-NEXT:    v_not_b32_e32 v8, v8
-; GFX9-NEXT:    v_and_b32_e32 v8, 63, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 63, v8
+; GFX9-NEXT:    v_bfi_b32 v8, v8, 0, 63
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
 ; GFX9-NEXT:    v_lshrrev_b64 v[6:7], 1, v[6:7]
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX9-NEXT:    v_and_b32_e32 v4, 63, v10
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX9-NEXT:    v_not_b32_e32 v4, v10
-; GFX9-NEXT:    v_and_b32_e32 v4, 63, v4
+; GFX9-NEXT:    v_bfi_b32 v4, v10, 0, 63
 ; GFX9-NEXT:    v_lshrrev_b64 v[6:7], v4, v[6:7]
 ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX9-NEXT:    v_or_b32_e32 v2, v2, v6
@@ -5529,18 +5504,16 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
 ; GFX10-LABEL: v_fshl_v2i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_not_b32_e32 v9, v8
-; GFX10-NEXT:    v_not_b32_e32 v11, v10
 ; GFX10-NEXT:    v_lshrrev_b64 v[4:5], 1, v[4:5]
 ; GFX10-NEXT:    v_lshrrev_b64 v[6:7], 1, v[6:7]
-; GFX10-NEXT:    v_and_b32_e32 v8, 63, v8
-; GFX10-NEXT:    v_and_b32_e32 v9, 63, v9
-; GFX10-NEXT:    v_and_b32_e32 v10, 63, v10
-; GFX10-NEXT:    v_and_b32_e32 v11, 63, v11
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
-; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v9, v[4:5]
-; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v10, v[2:3]
-; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v11, v[6:7]
+; GFX10-NEXT:    v_and_b32_e32 v9, 63, v8
+; GFX10-NEXT:    v_bfi_b32 v8, v8, 0, 63
+; GFX10-NEXT:    v_and_b32_e32 v11, 63, v10
+; GFX10-NEXT:    v_bfi_b32 v10, v10, 0, 63
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v11, v[2:3]
+; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v10, v[6:7]
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v6
@@ -5550,20 +5523,18 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %l...
[truncated]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants