Skip to content

Conversation

gandhi56
Copy link
Contributor

@gandhi56 gandhi56 commented Sep 4, 2025

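Scalarize G_FSHR only if the subtarget does not support the V2S16 type, i.e. when `ST.hasVOP3PInsts()` is false; subtargets with VOP3P instructions keep the existing V2S16 legalization rules.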

@llvmbot
Copy link
Member

llvmbot commented Sep 4, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Anshil Gandhi (gandhi56)

Changes

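Scalarize G_FSHR only if the subtarget does not support the V2S16 type, i.e. when `ST.hasVOP3PInsts()` is false; subtargets with VOP3P instructions keep the existing V2S16 legalization rules.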


Patch is 116.31 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/156796.diff

3 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+13-7)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll (+271-554)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir (+327-623)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 55a76f1172cb9..197c7009d8e86 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2082,13 +2082,19 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
     .scalarize(0)
     .lower();
 
-  // TODO: Only Try to form v2s16 with legal packed instructions.
-  getActionDefinitionsBuilder(G_FSHR)
-    .legalFor({{S32, S32}})
-    .lowerFor({{V2S16, V2S16}})
-    .clampMaxNumElementsStrict(0, S16, 2)
-    .scalarize(0)
-    .lower();
+  if (ST.hasVOP3PInsts()) {
+    getActionDefinitionsBuilder(G_FSHR)
+      .legalFor({{S32, S32}})
+      .lowerFor({{V2S16, V2S16}})
+      .clampMaxNumElementsStrict(0, S16, 2)
+      .scalarize(0)
+      .lower();
+  } else {
+    getActionDefinitionsBuilder(G_FSHR)
+      .legalFor({{S32, S32}})
+      .scalarize(0)
+      .lower();
+  }
 
   if (ST.hasVOP3PInsts()) {
     getActionDefinitionsBuilder(G_FSHL)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index d1ba24673043d..7338bf830a652 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -3404,32 +3404,19 @@ define amdgpu_ps half @v_fshr_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
 define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) {
 ; GFX6-LABEL: s_fshr_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
-; GFX6-NEXT:    s_and_b32 s4, s4, 0xffff
-; GFX6-NEXT:    s_or_b32 s4, s5, s4
-; GFX6-NEXT:    s_bfe_u32 s5, s2, 0xf0001
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX6-NEXT:    s_lshr_b32 s5, s5, 14
-; GFX6-NEXT:    s_or_b32 s0, s0, s5
-; GFX6-NEXT:    s_bfe_u32 s5, s3, 0xf0001
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 1
-; GFX6-NEXT:    s_lshr_b32 s5, s5, 14
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 1
-; GFX6-NEXT:    s_xor_b32 s4, s4, -1
-; GFX6-NEXT:    s_or_b32 s1, s1, s5
-; GFX6-NEXT:    s_lshr_b32 s5, s4, 16
 ; GFX6-NEXT:    s_and_b32 s6, s4, 15
 ; GFX6-NEXT:    s_andn2_b32 s4, 15, s4
-; GFX6-NEXT:    s_bfe_u32 s2, s2, 0xf0001
-; GFX6-NEXT:    s_lshl_b32 s0, s0, s6
-; GFX6-NEXT:    s_lshr_b32 s2, s2, s4
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s4
+; GFX6-NEXT:    s_lshr_b32 s2, s2, s6
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_and_b32 s2, s5, 15
 ; GFX6-NEXT:    s_andn2_b32 s4, 15, s5
-; GFX6-NEXT:    s_lshl_b32 s1, s1, s2
-; GFX6-NEXT:    s_bfe_u32 s2, s3, 0xf0001
-; GFX6-NEXT:    s_lshr_b32 s2, s2, s4
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX6-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s1, s1, s4
+; GFX6-NEXT:    s_lshr_b32 s2, s3, s2
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX6-NEXT:    s_and_b32 s0, 0xffff, s0
@@ -3439,33 +3426,22 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 ;
 ; GFX8-LABEL: s_fshr_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_and_b32 s5, 0xffff, s1
 ; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX8-NEXT:    s_lshr_b32 s5, s5, 15
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 1
-; GFX8-NEXT:    s_or_b32 s0, s0, s5
-; GFX8-NEXT:    s_lshl_b32 s3, s3, 1
-; GFX8-NEXT:    s_lshr_b32 s5, s4, 15
-; GFX8-NEXT:    s_xor_b32 s2, s2, -1
-; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT:    s_or_b32 s3, s3, s5
 ; GFX8-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX8-NEXT:    s_and_b32 s6, s2, 15
 ; GFX8-NEXT:    s_andn2_b32 s2, 15, s2
-; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
-; GFX8-NEXT:    s_lshl_b32 s0, s0, s6
-; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
-; GFX8-NEXT:    s_lshl_b32 s4, s4, 1
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX8-NEXT:    s_lshr_b32 s1, s1, s6
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s5, 15
-; GFX8-NEXT:    s_lshl_b32 s1, s3, s1
-; GFX8-NEXT:    s_and_b32 s3, 0xffff, s4
 ; GFX8-NEXT:    s_andn2_b32 s2, 15, s5
-; GFX8-NEXT:    s_lshr_b32 s3, s3, 1
-; GFX8-NEXT:    s_lshr_b32 s2, s3, s2
-; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX8-NEXT:    s_lshl_b32 s2, s3, s2
+; GFX8-NEXT:    s_lshr_b32 s1, s4, s1
+; GFX8-NEXT:    s_or_b32 s1, s2, s1
 ; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
@@ -3547,65 +3523,43 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX6-LABEL: v_fshr_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX6-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX6-NEXT:    v_bfe_u32 v5, v2, 1, 15
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 14, v5
-; GFX6-NEXT:    v_or_b32_e32 v0, v0, v5
-; GFX6-NEXT:    v_bfe_u32 v5, v3, 1, 15
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 14, v5
-; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v4
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; GFX6-NEXT:    v_and_b32_e32 v6, 15, v4
 ; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v4
-; GFX6-NEXT:    v_bfe_u32 v2, v2, 1, 15
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v6, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 1, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v4, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v5
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v5
-; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v5
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v2, v1
-; GFX6-NEXT:    v_bfe_u32 v2, v3, 1, 15
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v4, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 15, v1
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX8-NEXT:    v_mov_b32_e32 v4, 1
-; GFX8-NEXT:    v_mov_b32_e32 v5, 15
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_lshrrev_b16_sdwa v6, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
-; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 1, v1
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_and_b32_e32 v4, 15, v2
-; GFX8-NEXT:    v_xor_b32_e32 v7, -1, v2
-; GFX8-NEXT:    v_and_b32_e32 v7, 15, v7
-; GFX8-NEXT:    v_lshlrev_b16_e32 v3, v4, v3
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 1, v6
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v7, v4
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX8-NEXT:    v_and_b32_sdwa v4, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v2
+; GFX8-NEXT:    v_and_b32_e32 v3, 15, v2
+; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
+; GFX8-NEXT:    v_lshlrev_b16_e32 v5, 1, v0
+; GFX8-NEXT:    v_lshlrev_b16_e32 v4, v4, v5
+; GFX8-NEXT:    v_lshrrev_b16_e32 v3, v3, v1
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT:    v_mov_b32_e32 v4, 15
 ; GFX8-NEXT:    v_mov_b32_e32 v5, -1
+; GFX8-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_xor_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v5, 1
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 1, v1
-; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v4, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v2, v0
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
@@ -3657,13 +3611,11 @@ define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: v_fshr_v2i16_4_8:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_bfe_u32 v2, v2, 1, 15
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 12, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 3, v2
+; GFX6-NEXT:    v_bfe_u32 v2, v2, 4, 12
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX6-NEXT:    v_bfe_u32 v2, v3, 1, 15
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 7, v2
+; GFX6-NEXT:    v_bfe_u32 v2, v3, 8, 8
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3716,35 +3668,22 @@ define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
 define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> %amt) {
 ; GFX6-LABEL: v_fshr_v2i16_ssv:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX6-NEXT:    s_bfe_u32 s4, s2, 0xf0001
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX6-NEXT:    s_lshr_b32 s4, s4, 14
-; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX6-NEXT:    s_or_b32 s0, s0, s4
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX6-NEXT:    v_lshl_b32_e32 v2, s0, v2
-; GFX6-NEXT:    s_bfe_u32 s0, s2, 0xf0001
-; GFX6-NEXT:    s_bfe_u32 s4, s3, 0xf0001
-; GFX6-NEXT:    v_lshr_b32_e32 v0, s0, v0
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 1
-; GFX6-NEXT:    s_lshr_b32 s4, s4, 14
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 1
-; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX6-NEXT:    v_lshl_b32_e32 v0, s0, v0
+; GFX6-NEXT:    s_and_b32 s0, s2, 0xffff
+; GFX6-NEXT:    v_lshr_b32_e32 v2, s0, v2
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v1
 ; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v1
-; GFX6-NEXT:    s_or_b32 s1, s1, s4
 ; GFX6-NEXT:    v_and_b32_e32 v1, 15, v1
-; GFX6-NEXT:    s_bfe_u32 s0, s3, 0xf0001
-; GFX6-NEXT:    v_lshl_b32_e32 v2, s1, v2
-; GFX6-NEXT:    v_lshr_b32_e32 v1, s0, v1
-; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    s_lshl_b32 s0, s1, 1
+; GFX6-NEXT:    v_lshl_b32_e32 v1, s0, v1
+; GFX6-NEXT:    s_and_b32 s0, s3, 0xffff
+; GFX6-NEXT:    v_lshr_b32_e32 v2, s0, v2
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -3753,36 +3692,24 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
 ;
 ; GFX8-LABEL: v_fshr_v2i16_ssv:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_and_b32 s4, 0xffff, s1
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v0
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX8-NEXT:    s_lshr_b32 s4, s4, 15
-; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    s_or_b32 s0, s0, s4
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 1
 ; GFX8-NEXT:    v_and_b32_e32 v1, 15, v0
-; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v0
-; GFX8-NEXT:    v_lshlrev_b16_e64 v1, v1, s0
-; GFX8-NEXT:    s_and_b32 s0, 0xffff, s1
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v2, v2, s0
-; GFX8-NEXT:    s_lshr_b32 s4, s3, 15
-; GFX8-NEXT:    s_lshl_b32 s3, s3, 1
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX8-NEXT:    v_lshlrev_b16_e64 v2, v2, s0
+; GFX8-NEXT:    v_lshrrev_b16_e64 v1, v1, s1
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX8-NEXT:    v_mov_b32_e32 v3, -1
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 1
 ; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_xor_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    s_and_b32 s0, 0xffff, s3
-; GFX8-NEXT:    s_or_b32 s2, s2, s4
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 1
-; GFX8-NEXT:    v_lshlrev_b16_e64 v2, v2, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, v0, s0
-; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_lshl_b32 s0, s2, 1
+; GFX8-NEXT:    v_lshlrev_b16_e64 v0, v0, s0
+; GFX8-NEXT:    v_lshrrev_b16_e64 v2, v2, s3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -3838,33 +3765,20 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
 define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <2 x i16> inreg %amt) {
 ; GFX6-LABEL: v_fshr_v2i16_svs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    v_bfe_u32 v2, v0, 1, 15
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX6-NEXT:    s_and_b32 s4, s2, 15
+; GFX6-NEXT:    s_andn2_b32 s2, 15, s2
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 14, v2
-; GFX6-NEXT:    v_bfe_u32 v3, v1, 1, 15
-; GFX6-NEXT:    s_or_b32 s2, s3, s2
-; GFX6-NEXT:    v_or_b32_e32 v2, s0, v2
-; GFX6-NEXT:    s_lshl_b32 s0, s1, 1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 14, v3
-; GFX6-NEXT:    v_or_b32_e32 v3, s0, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_xor_b32 s0, s2, -1
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
-; GFX6-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX6-NEXT:    s_and_b32 s2, s0, 15
-; GFX6-NEXT:    s_andn2_b32 s0, 15, s0
-; GFX6-NEXT:    v_bfe_u32 v0, v0, 1, 15
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, s2, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
-; GFX6-NEXT:    s_and_b32 s0, s1, 15
-; GFX6-NEXT:    s_andn2_b32 s1, 15, s1
-; GFX6-NEXT:    v_bfe_u32 v1, v1, 1, 15
-; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, s0, v3
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s1, v1
-; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX6-NEXT:    s_and_b32 s0, s3, 15
+; GFX6-NEXT:    s_andn2_b32 s2, 15, s3
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    s_lshl_b32 s1, s1, s2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s0, v1
+; GFX6-NEXT:    v_or_b32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -3874,31 +3788,21 @@ define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
 ; GFX8-LABEL: v_fshr_v2i16_svs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    s_and_b32 s4, s1, 15
+; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 15, v0
-; GFX8-NEXT:    v_mov_b32_e32 v2, 15
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, s4, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, s0, v1
-; GFX8-NEXT:    s_lshl_b32 s0, s2, 1
-; GFX8-NEXT:    v_lshrrev_b16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_or_b32_e32 v2, s0, v2
-; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v4, 1
-; GFX8-NEXT:    s_xor_b32 s0, s1, -1
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_and_b32 s2, s0, 15
-; GFX8-NEXT:    s_andn2_b32 s0, 15, s0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 1, v3
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, s0, v3
-; GFX8-NEXT:    s_and_b32 s0, s1, 15
-; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 1, v0
-; GFX8-NEXT:    v_lshlrev_b16_e32 v2, s0, v2
-; GFX8-NEXT:    v_lshrrev_b16_e32 v0, s1, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT:    v_lshlrev_b16_e32 v1, s2, v1
+; GFX8-NEXT:    s_and_b32 s0, s3, 15
+; GFX8-NEXT:    s_andn2_b32 s1, 15, s3
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    s_lshl_b32 s1, s2, s1
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_e32 v0, s1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -3963,32 +3867,19 @@ define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
 define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) {
 ; GFX6-LABEL: v_fshr_v2i16_vss:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX6-NEXT:    s_or_b32 s2, s3, s2
-; GFX6-NEXT:    s_bfe_u32 s3, s0, 0xf0001
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_lshr_b32 s3, s3, 14
-; GFX6-NEXT:    v_or_b32_e32 v0, s3, v0
-; GFX6-NEXT:    s_bfe_u32 s3, s1, 0xf0001
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
-; GFX6-NEXT:    s_lshr_b32 s3, s3, 14
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX6-NEXT:    s_xor_b32 s2, s2, -1
-; GFX6-NEXT:    v_or_b32_e32 v1, s3, v1
-; GFX6-NEXT:    s_lshr_b32 s3, s2, 16
 ; GFX6-NEXT:    s_and_b32 s4, s2, 15
 ; GFX6-NEXT:    s_andn2_b32 s2, 15, s2
-; GFX6-NEXT:    s_bfe_u32 s0, s0, 0xf0001
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
-; GFX6-NEXT:    s_lshr_b32 s0, s0, s2
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
+; GFX6-NEXT:    s_lshr_b32 s0, s0, s4
 ; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    s_and_b32 s0, s3, 15
 ; GFX6-NEXT:    s_andn2_b32 s2, 15, s3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s0, v1
-; GFX6-NEXT:    s_bfe_u32 s0, s1, 0xf0001
-; GFX6-NEXT:    s_lshr_b32 s0, s0, s2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s2, v1
+; GFX6-NEXT:    s_lshr_b32 s0, s1, s0
 ; GFX6-NEXT:    v_or_b32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -3998,32 +3889,21 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
 ;
 ; GFX8-LABEL: v_fshr_v2i16_vss:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_and_b32 s3, 0xffff, s0
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 1, v0
-; GFX8-NEXT:    s_lshr_b32 s3, s3, 15
-; GFX8-NEXT:    v_mov_b32_e32 v2, 1
-; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX8-NEXT:    v_or_b32_e32 v1, s3, v1
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    s_lshr_b32 s3, s2, 15
-; GFX8-NEXT:    s_xor_b32 s1, s1, -1
-; GFX8-NEXT...
[truncated]

@llvmbot
Copy link
Member

llvmbot commented Sep 4, 2025

@llvm/pr-subscribers-llvm-globalisel

Author: Anshil Gandhi (gandhi56)

Changes

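Scalarize G_FSHR only if the subtarget does not support the V2S16 type, i.e. when `ST.hasVOP3PInsts()` is false; subtargets with VOP3P instructions keep the existing V2S16 legalization rules.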


Patch is 116.31 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/156796.diff

3 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+13-7)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll (+271-554)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir (+327-623)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 55a76f1172cb9..197c7009d8e86 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2082,13 +2082,19 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
     .scalarize(0)
     .lower();
 
-  // TODO: Only Try to form v2s16 with legal packed instructions.
-  getActionDefinitionsBuilder(G_FSHR)
-    .legalFor({{S32, S32}})
-    .lowerFor({{V2S16, V2S16}})
-    .clampMaxNumElementsStrict(0, S16, 2)
-    .scalarize(0)
-    .lower();
+  if (ST.hasVOP3PInsts()) {
+    getActionDefinitionsBuilder(G_FSHR)
+      .legalFor({{S32, S32}})
+      .lowerFor({{V2S16, V2S16}})
+      .clampMaxNumElementsStrict(0, S16, 2)
+      .scalarize(0)
+      .lower();
+  } else {
+    getActionDefinitionsBuilder(G_FSHR)
+      .legalFor({{S32, S32}})
+      .scalarize(0)
+      .lower();
+  }
 
   if (ST.hasVOP3PInsts()) {
     getActionDefinitionsBuilder(G_FSHL)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index d1ba24673043d..7338bf830a652 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -3404,32 +3404,19 @@ define amdgpu_ps half @v_fshr_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
 define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) {
 ; GFX6-LABEL: s_fshr_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
-; GFX6-NEXT:    s_and_b32 s4, s4, 0xffff
-; GFX6-NEXT:    s_or_b32 s4, s5, s4
-; GFX6-NEXT:    s_bfe_u32 s5, s2, 0xf0001
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX6-NEXT:    s_lshr_b32 s5, s5, 14
-; GFX6-NEXT:    s_or_b32 s0, s0, s5
-; GFX6-NEXT:    s_bfe_u32 s5, s3, 0xf0001
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 1
-; GFX6-NEXT:    s_lshr_b32 s5, s5, 14
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 1
-; GFX6-NEXT:    s_xor_b32 s4, s4, -1
-; GFX6-NEXT:    s_or_b32 s1, s1, s5
-; GFX6-NEXT:    s_lshr_b32 s5, s4, 16
 ; GFX6-NEXT:    s_and_b32 s6, s4, 15
 ; GFX6-NEXT:    s_andn2_b32 s4, 15, s4
-; GFX6-NEXT:    s_bfe_u32 s2, s2, 0xf0001
-; GFX6-NEXT:    s_lshl_b32 s0, s0, s6
-; GFX6-NEXT:    s_lshr_b32 s2, s2, s4
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s4
+; GFX6-NEXT:    s_lshr_b32 s2, s2, s6
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_and_b32 s2, s5, 15
 ; GFX6-NEXT:    s_andn2_b32 s4, 15, s5
-; GFX6-NEXT:    s_lshl_b32 s1, s1, s2
-; GFX6-NEXT:    s_bfe_u32 s2, s3, 0xf0001
-; GFX6-NEXT:    s_lshr_b32 s2, s2, s4
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX6-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s1, s1, s4
+; GFX6-NEXT:    s_lshr_b32 s2, s3, s2
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX6-NEXT:    s_and_b32 s0, 0xffff, s0
@@ -3439,33 +3426,22 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 ;
 ; GFX8-LABEL: s_fshr_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_and_b32 s5, 0xffff, s1
 ; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX8-NEXT:    s_lshr_b32 s5, s5, 15
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 1
-; GFX8-NEXT:    s_or_b32 s0, s0, s5
-; GFX8-NEXT:    s_lshl_b32 s3, s3, 1
-; GFX8-NEXT:    s_lshr_b32 s5, s4, 15
-; GFX8-NEXT:    s_xor_b32 s2, s2, -1
-; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT:    s_or_b32 s3, s3, s5
 ; GFX8-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX8-NEXT:    s_and_b32 s6, s2, 15
 ; GFX8-NEXT:    s_andn2_b32 s2, 15, s2
-; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
-; GFX8-NEXT:    s_lshl_b32 s0, s0, s6
-; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
-; GFX8-NEXT:    s_lshl_b32 s4, s4, 1
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX8-NEXT:    s_lshr_b32 s1, s1, s6
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s5, 15
-; GFX8-NEXT:    s_lshl_b32 s1, s3, s1
-; GFX8-NEXT:    s_and_b32 s3, 0xffff, s4
 ; GFX8-NEXT:    s_andn2_b32 s2, 15, s5
-; GFX8-NEXT:    s_lshr_b32 s3, s3, 1
-; GFX8-NEXT:    s_lshr_b32 s2, s3, s2
-; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX8-NEXT:    s_lshl_b32 s2, s3, s2
+; GFX8-NEXT:    s_lshr_b32 s1, s4, s1
+; GFX8-NEXT:    s_or_b32 s1, s2, s1
 ; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
@@ -3547,65 +3523,43 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX6-LABEL: v_fshr_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX6-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX6-NEXT:    v_bfe_u32 v5, v2, 1, 15
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 14, v5
-; GFX6-NEXT:    v_or_b32_e32 v0, v0, v5
-; GFX6-NEXT:    v_bfe_u32 v5, v3, 1, 15
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 14, v5
-; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v4
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; GFX6-NEXT:    v_and_b32_e32 v6, 15, v4
 ; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v4
-; GFX6-NEXT:    v_bfe_u32 v2, v2, 1, 15
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v6, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 1, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v4, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v5
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v5
-; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v5
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v2, v1
-; GFX6-NEXT:    v_bfe_u32 v2, v3, 1, 15
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v4, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 15, v1
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX8-NEXT:    v_mov_b32_e32 v4, 1
-; GFX8-NEXT:    v_mov_b32_e32 v5, 15
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_lshrrev_b16_sdwa v6, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
-; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 1, v1
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_and_b32_e32 v4, 15, v2
-; GFX8-NEXT:    v_xor_b32_e32 v7, -1, v2
-; GFX8-NEXT:    v_and_b32_e32 v7, 15, v7
-; GFX8-NEXT:    v_lshlrev_b16_e32 v3, v4, v3
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 1, v6
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v7, v4
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX8-NEXT:    v_and_b32_sdwa v4, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v2
+; GFX8-NEXT:    v_and_b32_e32 v3, 15, v2
+; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
+; GFX8-NEXT:    v_lshlrev_b16_e32 v5, 1, v0
+; GFX8-NEXT:    v_lshlrev_b16_e32 v4, v4, v5
+; GFX8-NEXT:    v_lshrrev_b16_e32 v3, v3, v1
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT:    v_mov_b32_e32 v4, 15
 ; GFX8-NEXT:    v_mov_b32_e32 v5, -1
+; GFX8-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_xor_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v5, 1
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 1, v1
-; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v4, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v2, v0
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
@@ -3657,13 +3611,11 @@ define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: v_fshr_v2i16_4_8:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_bfe_u32 v2, v2, 1, 15
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 12, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 3, v2
+; GFX6-NEXT:    v_bfe_u32 v2, v2, 4, 12
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX6-NEXT:    v_bfe_u32 v2, v3, 1, 15
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 7, v2
+; GFX6-NEXT:    v_bfe_u32 v2, v3, 8, 8
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3716,35 +3668,22 @@ define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
 define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> %amt) {
 ; GFX6-LABEL: v_fshr_v2i16_ssv:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX6-NEXT:    s_bfe_u32 s4, s2, 0xf0001
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX6-NEXT:    s_lshr_b32 s4, s4, 14
-; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX6-NEXT:    s_or_b32 s0, s0, s4
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX6-NEXT:    v_lshl_b32_e32 v2, s0, v2
-; GFX6-NEXT:    s_bfe_u32 s0, s2, 0xf0001
-; GFX6-NEXT:    s_bfe_u32 s4, s3, 0xf0001
-; GFX6-NEXT:    v_lshr_b32_e32 v0, s0, v0
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 1
-; GFX6-NEXT:    s_lshr_b32 s4, s4, 14
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 1
-; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX6-NEXT:    v_lshl_b32_e32 v0, s0, v0
+; GFX6-NEXT:    s_and_b32 s0, s2, 0xffff
+; GFX6-NEXT:    v_lshr_b32_e32 v2, s0, v2
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v1
 ; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v1
-; GFX6-NEXT:    s_or_b32 s1, s1, s4
 ; GFX6-NEXT:    v_and_b32_e32 v1, 15, v1
-; GFX6-NEXT:    s_bfe_u32 s0, s3, 0xf0001
-; GFX6-NEXT:    v_lshl_b32_e32 v2, s1, v2
-; GFX6-NEXT:    v_lshr_b32_e32 v1, s0, v1
-; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    s_lshl_b32 s0, s1, 1
+; GFX6-NEXT:    v_lshl_b32_e32 v1, s0, v1
+; GFX6-NEXT:    s_and_b32 s0, s3, 0xffff
+; GFX6-NEXT:    v_lshr_b32_e32 v2, s0, v2
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -3753,36 +3692,24 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
 ;
 ; GFX8-LABEL: v_fshr_v2i16_ssv:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_and_b32 s4, 0xffff, s1
+; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v0
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX8-NEXT:    s_lshr_b32 s4, s4, 15
-; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
-; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    s_or_b32 s0, s0, s4
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 1
 ; GFX8-NEXT:    v_and_b32_e32 v1, 15, v0
-; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v0
-; GFX8-NEXT:    v_lshlrev_b16_e64 v1, v1, s0
-; GFX8-NEXT:    s_and_b32 s0, 0xffff, s1
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v2, v2, s0
-; GFX8-NEXT:    s_lshr_b32 s4, s3, 15
-; GFX8-NEXT:    s_lshl_b32 s3, s3, 1
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX8-NEXT:    v_lshlrev_b16_e64 v2, v2, s0
+; GFX8-NEXT:    v_lshrrev_b16_e64 v1, v1, s1
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX8-NEXT:    v_mov_b32_e32 v3, -1
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 1
 ; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_xor_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    s_and_b32 s0, 0xffff, s3
-; GFX8-NEXT:    s_or_b32 s2, s2, s4
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 1
-; GFX8-NEXT:    v_lshlrev_b16_e64 v2, v2, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, v0, s0
-; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_lshl_b32 s0, s2, 1
+; GFX8-NEXT:    v_lshlrev_b16_e64 v0, v0, s0
+; GFX8-NEXT:    v_lshrrev_b16_e64 v2, v2, s3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -3838,33 +3765,20 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
 define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <2 x i16> inreg %amt) {
 ; GFX6-LABEL: v_fshr_v2i16_svs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    v_bfe_u32 v2, v0, 1, 15
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX6-NEXT:    s_and_b32 s4, s2, 15
+; GFX6-NEXT:    s_andn2_b32 s2, 15, s2
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 14, v2
-; GFX6-NEXT:    v_bfe_u32 v3, v1, 1, 15
-; GFX6-NEXT:    s_or_b32 s2, s3, s2
-; GFX6-NEXT:    v_or_b32_e32 v2, s0, v2
-; GFX6-NEXT:    s_lshl_b32 s0, s1, 1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 14, v3
-; GFX6-NEXT:    v_or_b32_e32 v3, s0, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_xor_b32 s0, s2, -1
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
-; GFX6-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX6-NEXT:    s_and_b32 s2, s0, 15
-; GFX6-NEXT:    s_andn2_b32 s0, 15, s0
-; GFX6-NEXT:    v_bfe_u32 v0, v0, 1, 15
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, s2, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
-; GFX6-NEXT:    s_and_b32 s0, s1, 15
-; GFX6-NEXT:    s_andn2_b32 s1, 15, s1
-; GFX6-NEXT:    v_bfe_u32 v1, v1, 1, 15
-; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, s0, v3
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s1, v1
-; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX6-NEXT:    s_and_b32 s0, s3, 15
+; GFX6-NEXT:    s_andn2_b32 s2, 15, s3
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    s_lshl_b32 s1, s1, s2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s0, v1
+; GFX6-NEXT:    v_or_b32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -3874,31 +3788,21 @@ define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
 ; GFX8-LABEL: v_fshr_v2i16_svs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    s_and_b32 s4, s1, 15
+; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 15, v0
-; GFX8-NEXT:    v_mov_b32_e32 v2, 15
+; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, s4, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, s0, v1
-; GFX8-NEXT:    s_lshl_b32 s0, s2, 1
-; GFX8-NEXT:    v_lshrrev_b16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_or_b32_e32 v2, s0, v2
-; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v4, 1
-; GFX8-NEXT:    s_xor_b32 s0, s1, -1
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_and_b32 s2, s0, 15
-; GFX8-NEXT:    s_andn2_b32 s0, 15, s0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 1, v3
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, s0, v3
-; GFX8-NEXT:    s_and_b32 s0, s1, 15
-; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 1, v0
-; GFX8-NEXT:    v_lshlrev_b16_e32 v2, s0, v2
-; GFX8-NEXT:    v_lshrrev_b16_e32 v0, s1, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT:    v_lshlrev_b16_e32 v1, s2, v1
+; GFX8-NEXT:    s_and_b32 s0, s3, 15
+; GFX8-NEXT:    s_andn2_b32 s1, 15, s3
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    s_lshl_b32 s1, s2, s1
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_e32 v0, s1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -3963,32 +3867,19 @@ define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
 define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) {
 ; GFX6-LABEL: v_fshr_v2i16_vss:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX6-NEXT:    s_or_b32 s2, s3, s2
-; GFX6-NEXT:    s_bfe_u32 s3, s0, 0xf0001
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_lshr_b32 s3, s3, 14
-; GFX6-NEXT:    v_or_b32_e32 v0, s3, v0
-; GFX6-NEXT:    s_bfe_u32 s3, s1, 0xf0001
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
-; GFX6-NEXT:    s_lshr_b32 s3, s3, 14
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX6-NEXT:    s_xor_b32 s2, s2, -1
-; GFX6-NEXT:    v_or_b32_e32 v1, s3, v1
-; GFX6-NEXT:    s_lshr_b32 s3, s2, 16
 ; GFX6-NEXT:    s_and_b32 s4, s2, 15
 ; GFX6-NEXT:    s_andn2_b32 s2, 15, s2
-; GFX6-NEXT:    s_bfe_u32 s0, s0, 0xf0001
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
-; GFX6-NEXT:    s_lshr_b32 s0, s0, s2
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
+; GFX6-NEXT:    s_lshr_b32 s0, s0, s4
 ; GFX6-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    s_and_b32 s0, s3, 15
 ; GFX6-NEXT:    s_andn2_b32 s2, 15, s3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s0, v1
-; GFX6-NEXT:    s_bfe_u32 s0, s1, 0xf0001
-; GFX6-NEXT:    s_lshr_b32 s0, s0, s2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s2, v1
+; GFX6-NEXT:    s_lshr_b32 s0, s1, s0
 ; GFX6-NEXT:    v_or_b32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -3998,32 +3889,21 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
 ;
 ; GFX8-LABEL: v_fshr_v2i16_vss:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_and_b32 s3, 0xffff, s0
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 1, v0
-; GFX8-NEXT:    s_lshr_b32 s3, s3, 15
-; GFX8-NEXT:    v_mov_b32_e32 v2, 1
-; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX8-NEXT:    v_or_b32_e32 v1, s3, v1
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    s_lshr_b32 s3, s2, 15
-; GFX8-NEXT:    s_xor_b32 s1, s1, -1
-; GFX8-NEXT...
[truncated]

Copy link
Contributor

@Copilot Copilot AI left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pull Request Overview

This PR optimizes G_FSHR legalization in AMDGPU GlobalISel by no longer forming V2S16 operations on subtargets that lack packed (VOP3P) instructions. Rather than clamping vectors to V2S16 and lowering them, which introduces pack/unpack operations that cause instruction bloat, the legalizer now scalarizes G_FSHR directly on those older subtargets while maintaining the existing behavior for subtargets with VOP3P support.

Key changes:

  • Conditionally handle G_FSHR based on VOP3P instruction availability
  • Simplify code generation for older subtargets without V2S16 support

Reviewed Changes

Copilot reviewed 3 out of 3 changed files in this pull request and generated 1 comment.

File Description
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp Modified G_FSHR legalization to check for VOP3P support before attempting V2S16 operations
llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll Updated test expectations showing optimized instruction sequences
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir Updated MIR test expectations reflecting simplified legalization

Copy link

github-actions bot commented Sep 4, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@gandhi56 gandhi56 force-pushed the globalisel/legalize-fshr branch 2 times, most recently from 9d05e05 to 7f22ec8 Compare September 4, 2025 05:06
Scalarize G_FSHR only if the subtarget does not support the V2S16 type.
@gandhi56 gandhi56 force-pushed the globalisel/legalize-fshr branch from 7f22ec8 to dd42b0e Compare September 4, 2025 14:45
Co-authored-by: Matt Arsenault <arsenm2@gmail.com>
@gandhi56 gandhi56 merged commit c689919 into llvm:main Sep 4, 2025
5 checks passed
@gandhi56
Copy link
Contributor Author

gandhi56 commented Sep 4, 2025

Thanks for the review.

@gandhi56 gandhi56 deleted the globalisel/legalize-fshr branch September 4, 2025 23:15
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants