Skip to content

Conversation

petar-avramovic
Copy link
Collaborator

Add G_AMDGPU_LOAD_D16 generic instructions and GINodeEquivs for them;
this will import D16 load patterns into GlobalISel's TableGen-generated
instruction selector.
For the newly imported patterns to work, add combines for G_AMDGPU_LOAD_D16
in AMDGPURegBankCombiner.

Copy link
Collaborator Author

petar-avramovic commented Aug 12, 2025

Warning

This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
Learn more

This stack of pull requests is managed by Graphite. Learn more about stacking.

@llvmbot
Copy link
Member

llvmbot commented Aug 12, 2025

@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-amdgpu

Author: Petar Avramovic (petar-avramovic)

Changes

Add G_AMDGPU_LOAD_D16 generic instructions and GINodeEquivs for them;
this will import D16 load patterns into GlobalISel's TableGen-generated
instruction selector.
For the newly imported patterns to work, add combines for G_AMDGPU_LOAD_D16
in AMDGPURegBankCombiner.


Patch is 39.20 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/153178.diff

6 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPUCombine.td (+10-1)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUGISel.td (+7)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp (+105)
  • (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+15)
  • (added) llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll (+412)
  • (modified) llvm/test/CodeGen/AMDGPU/global-saddr-load.ll (+72-174)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 9587fad1ecd63..30e5362ba7adf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -71,6 +71,14 @@ def int_minmax_to_med3 : GICombineRule<
          [{ return matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]),
   (apply [{ applyMed3(*${min_or_max}, ${matchinfo}); }])>;
 
+def d16_matchdata : GIDefMatchData<"D16MatchInfo">;
+
+def d16_load : GICombineRule<
+  (defs root:$bitcast, d16_matchdata:$matchinfo),
+  (match (wip_match_opcode G_BITCAST):$bitcast,
+         [{ return matchD16Load(*${bitcast}, ${matchinfo}); }]),
+  (apply [{ applyD16Load(*${bitcast}, ${matchinfo}); }])>;
+
 def fp_minmax_to_med3 : GICombineRule<
   (defs root:$min_or_max, med3_matchdata:$matchinfo),
   (match (wip_match_opcode G_FMAXNUM,
@@ -198,5 +206,6 @@ def AMDGPURegBankCombiner : GICombiner<
    zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
    fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
    identity_combines, redundant_and, constant_fold_cast_op,
-   cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines]> {
+   cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines,
+   d16_load]> {
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 394a143dd3086..a4ccf368f7745 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -309,6 +309,13 @@ def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>;
 def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>;
 def : GINodeEquiv<G_AMDGPU_S_BUFFER_PREFETCH, SIsbuffer_prefetch>;
 
+def : GINodeEquiv<G_AMDGPU_LOAD_D16_LO, SIload_d16_lo>;
+def : GINodeEquiv<G_AMDGPU_LOAD_D16_LO_U8, SIload_d16_lo_u8>;
+def : GINodeEquiv<G_AMDGPU_LOAD_D16_LO_I8, SIload_d16_lo_i8>;
+def : GINodeEquiv<G_AMDGPU_LOAD_D16_HI, SIload_d16_hi>;
+def : GINodeEquiv<G_AMDGPU_LOAD_D16_HI_U8, SIload_d16_hi_u8>;
+def : GINodeEquiv<G_AMDGPU_LOAD_D16_HI_I8, SIload_d16_hi_i8>;
+
 def : GINodeEquiv<G_AMDGPU_WHOLE_WAVE_FUNC_SETUP, AMDGPUwhole_wave_setup>;
 // G_AMDGPU_WHOLE_WAVE_FUNC_RETURN is simpler than AMDGPUwhole_wave_return,
 // so we don't mark it as equivalent.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index ee324a5e93f0f..946a3361aed29 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -74,6 +74,12 @@ class AMDGPURegBankCombinerImpl : public Combiner {
     Register Val0, Val1, Val2;
   };
 
+  struct D16MatchInfo {
+    unsigned Opc;
+    Register Dst;
+    MachineInstr *Load;
+  };
+
   MinMaxMedOpc getMinMaxPair(unsigned Opc) const;
 
   template <class m_Cst, typename CstTy>
@@ -89,6 +95,9 @@ class AMDGPURegBankCombinerImpl : public Combiner {
 
   void applyCanonicalizeZextShiftAmt(MachineInstr &MI, MachineInstr &Ext) const;
 
+  bool matchD16Load(MachineInstr &MI, D16MatchInfo &MatchInfo) const;
+  void applyD16Load(MachineInstr &MI, D16MatchInfo &MatchInfo) const;
+
 private:
   SIModeRegisterDefaults getMode() const;
   bool getIEEE() const;
@@ -392,6 +401,102 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt(
   MI.eraseFromParent();
 }
 
+bool AMDGPURegBankCombinerImpl::matchD16Load(MachineInstr &MI,
+                                             D16MatchInfo &MatchInfo) const {
+  if (!STI.d16PreservesUnusedBits())
+    return false;
+
+  Register Dst;
+  MachineInstr *Load, *SextLoad;
+  const int64_t CleanLo16 = 0xFFFFFFFFFFFF0000;
+  const int64_t CleanHi16 = 0x000000000000FFFF;
+
+  // Load lo
+  if (mi_match(MI.getOperand(1).getReg(), MRI,
+               m_GOr(m_GAnd(m_GBitcast(m_Reg(Dst)),
+                            m_Copy(m_SpecificICst(CleanLo16))),
+                     m_MInstr(Load)))) {
+
+    if (Load->getOpcode() == AMDGPU::G_ZEXTLOAD) {
+      const MachineMemOperand *MMO = *Load->memoperands_begin();
+      if (MMO->isAtomic())
+        return false;
+
+      unsigned LoadSize = MMO->getSizeInBits().getValue();
+      if (LoadSize == 8) {
+        MatchInfo = {AMDGPU::G_AMDGPU_LOAD_D16_LO_U8, Dst, Load};
+      } else if (LoadSize == 16) {
+        MatchInfo = {AMDGPU::G_AMDGPU_LOAD_D16_LO, Dst, Load};
+      } else
+        return false;
+      return true;
+    }
+
+    if (mi_match(
+            Load, MRI,
+            m_GAnd(m_MInstr(SextLoad), m_Copy(m_SpecificICst(CleanHi16))))) {
+      if (SextLoad->getOpcode() != AMDGPU::G_SEXTLOAD)
+        return false;
+
+      const MachineMemOperand *MMO = *SextLoad->memoperands_begin();
+      if (MMO->isAtomic() || MMO->getSizeInBits().getValue() != 8)
+        return false;
+
+      MatchInfo = {AMDGPU::G_AMDGPU_LOAD_D16_LO_I8, Dst, SextLoad};
+      return true;
+    }
+
+    return false;
+  }
+
+  // Load hi
+  if (mi_match(MI.getOperand(1).getReg(), MRI,
+               m_GOr(m_GAnd(m_GBitcast(m_Reg(Dst)),
+                            m_Copy(m_SpecificICst(CleanHi16))),
+                     m_GShl(m_MInstr(Load), m_Copy(m_SpecificICst(16)))))) {
+
+    if (Load->getOpcode() == AMDGPU::G_ZEXTLOAD) {
+      const MachineMemOperand *MMO = *Load->memoperands_begin();
+      if (MMO->isAtomic())
+        return false;
+
+      unsigned LoadSize = MMO->getSizeInBits().getValue();
+      if (LoadSize == 8) {
+        MatchInfo = {AMDGPU::G_AMDGPU_LOAD_D16_HI_U8, Dst, Load};
+      } else if (LoadSize == 16) {
+        MatchInfo = {AMDGPU::G_AMDGPU_LOAD_D16_HI, Dst, Load};
+      } else
+        return false;
+      return true;
+    }
+
+    if (mi_match(
+            Load, MRI,
+            m_GAnd(m_MInstr(SextLoad), m_Copy(m_SpecificICst(CleanHi16))))) {
+      if (SextLoad->getOpcode() != AMDGPU::G_SEXTLOAD)
+        return false;
+      const MachineMemOperand *MMO = *SextLoad->memoperands_begin();
+      if (MMO->isAtomic() || MMO->getSizeInBits().getValue() != 8)
+        return false;
+
+      MatchInfo = {AMDGPU::G_AMDGPU_LOAD_D16_HI_I8, Dst, SextLoad};
+      return true;
+    }
+
+    return false;
+  }
+
+  return false;
+}
+
+void AMDGPURegBankCombinerImpl::applyD16Load(MachineInstr &MI,
+                                             D16MatchInfo &MatchInfo) const {
+  B.buildInstr(MatchInfo.Opc, {MI.getOperand(0).getReg()},
+               {MatchInfo.Load->getOperand(1).getReg(), MatchInfo.Dst})
+      .addMemOperand(*MatchInfo.Load->memoperands_begin());
+  MI.eraseFromParent();
+}
+
 SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const {
   return MF.getInfo<SIMachineFunctionInfo>()->getMode();
 }
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index bd5dfa92a8e43..ec761207351e1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -4206,6 +4206,21 @@ def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction;
 def G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction;
 def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction;
 
+class D16LoadGenericInstruction : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins ptype1:$addr);
+  let hasSideEffects = 0;
+  let mayLoad = 1;
+}
+
+def G_AMDGPU_LOAD_D16_LO : D16LoadGenericInstruction;
+def G_AMDGPU_LOAD_D16_LO_U8 : D16LoadGenericInstruction;
+def G_AMDGPU_LOAD_D16_LO_I8 : D16LoadGenericInstruction;
+def G_AMDGPU_LOAD_D16_HI : D16LoadGenericInstruction;
+def G_AMDGPU_LOAD_D16_HI_U8 : D16LoadGenericInstruction;
+def G_AMDGPU_LOAD_D16_HI_I8 : D16LoadGenericInstruction;
+
+
 class BufferStoreGenericInstruction : AMDGPUGenericInstruction {
   let OutOperandList = (outs);
   let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll
new file mode 100644
index 0000000000000..62459247cc440
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll
@@ -0,0 +1,412 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200  -global-isel -new-reg-bank-select < %s | FileCheck --check-prefixes=GFX12 %s
+
+define amdgpu_ps void @load_P0_B16_D16(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: load_P0_B16_D16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    flat_load_d16_b16 v0, v[1:2]
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_store_b32 v[3:4], v0
+; GFX12-NEXT:    s_endpgm
+  %a = load i16, ptr addrspace(0) %ptra
+  %res = insertelement <2 x i16> %vec, i16 %a, i32 0
+  store <2 x i16> %res, ptr addrspace(0) %out
+  ret void
+}
+
+define amdgpu_ps void @load_P0_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: load_P0_B16_D16_Hi:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    flat_load_d16_hi_b16 v0, v[1:2]
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_store_b32 v[3:4], v0
+; GFX12-NEXT:    s_endpgm
+  %a = load i16, ptr addrspace(0) %ptra
+  %res = insertelement <2 x i16> %vec, i16 %a, i32 1
+  store <2 x i16> %res, ptr addrspace(0) %out
+  ret void
+}
+
+define amdgpu_ps void @sextload_P0_i8_D16(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: sextload_P0_i8_D16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    flat_load_d16_i8 v0, v[1:2]
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_store_b32 v[3:4], v0
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(0) %ptra
+  %a16 = sext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+  store <2 x i16> %res, ptr addrspace(0) %out
+  ret void
+}
+
+define amdgpu_ps void @sextload_P0_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: sextload_P0_i8_D16_Hi:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    flat_load_d16_hi_i8 v0, v[1:2]
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_store_b32 v[3:4], v0
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(0) %ptra
+  %a16 = sext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+  store <2 x i16> %res, ptr addrspace(0) %out
+  ret void
+}
+
+define amdgpu_ps void @zextload_P0_i8_D16(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: zextload_P0_i8_D16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    flat_load_d16_u8 v0, v[1:2]
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_store_b32 v[3:4], v0
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(0) %ptra
+  %a16 = zext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+  store <2 x i16> %res, ptr addrspace(0) %out
+  ret void
+}
+
+define amdgpu_ps void @zextload_P0_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: zextload_P0_i8_D16_Hi:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    flat_load_d16_hi_u8 v0, v[1:2]
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_store_b32 v[3:4], v0
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(0) %ptra
+  %a16 = zext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+  store <2 x i16> %res, ptr addrspace(0) %out
+  ret void
+}
+
+define amdgpu_ps void @load_P1_B16_D16(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_P1_B16_D16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_d16_b16 v0, v[1:2], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a = load i16, ptr addrspace(1) %ptra
+  %res = insertelement <2 x i16> %vec, i16 %a, i32 0
+  store <2 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @load_P1_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_P1_B16_D16_Hi:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_d16_hi_b16 v0, v[1:2], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a = load i16, ptr addrspace(1) %ptra
+  %res = insertelement <2 x i16> %vec, i16 %a, i32 1
+  store <2 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @sextload_P1_i8_D16(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: sextload_P1_i8_D16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_d16_i8 v0, v[1:2], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(1) %ptra
+  %a16 = sext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+  store <2 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @sextload_P1_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: sextload_P1_i8_D16_Hi:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_d16_hi_i8 v0, v[1:2], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(1) %ptra
+  %a16 = sext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+  store <2 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @zextload_P1_i8_D16(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: zextload_P1_i8_D16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_d16_u8 v0, v[1:2], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(1) %ptra
+  %a16 = zext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+  store <2 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @zextload_P1_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: zextload_P1_i8_D16_Hi:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_d16_hi_u8 v0, v[1:2], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(1) %ptra
+  %a16 = zext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+  store <2 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @load_P3_B16_D16(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: load_P3_B16_D16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    ds_load_u16_d16 v0, v1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    ds_store_b32 v2, v0
+; GFX12-NEXT:    s_endpgm
+  %a = load i16, ptr addrspace(3) %ptra
+  %res = insertelement <2 x i16> %vec, i16 %a, i32 0
+  store <2 x i16> %res, ptr addrspace(3) %out
+  ret void
+}
+
+define amdgpu_ps void @load_P3_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: load_P3_B16_D16_Hi:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    ds_load_u16_d16_hi v0, v1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    ds_store_b32 v2, v0
+; GFX12-NEXT:    s_endpgm
+  %a = load i16, ptr addrspace(3) %ptra
+  %res = insertelement <2 x i16> %vec, i16 %a, i32 1
+  store <2 x i16> %res, ptr addrspace(3) %out
+  ret void
+}
+
+define amdgpu_ps void @sextload_P3_i8_D16(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: sextload_P3_i8_D16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    ds_load_i8_d16 v0, v1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    ds_store_b32 v2, v0
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(3) %ptra
+  %a16 = sext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+  store <2 x i16> %res, ptr addrspace(3) %out
+  ret void
+}
+
+define amdgpu_ps void @sextload_P3_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: sextload_P3_i8_D16_Hi:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    ds_load_i8_d16_hi v0, v1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    ds_store_b32 v2, v0
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(3) %ptra
+  %a16 = sext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+  store <2 x i16> %res, ptr addrspace(3) %out
+  ret void
+}
+
+define amdgpu_ps void @zextload_P3_i8_D16(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: zextload_P3_i8_D16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    ds_load_u8_d16 v0, v1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    ds_store_b32 v2, v0
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(3) %ptra
+  %a16 = zext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+  store <2 x i16> %res, ptr addrspace(3) %out
+  ret void
+}
+
+define amdgpu_ps void @zextload_P3_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: zextload_P3_i8_D16_Hi:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    ds_load_u8_d16_hi v0, v1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    ds_store_b32 v2, v0
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(3) %ptra
+  %a16 = zext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+  store <2 x i16> %res, ptr addrspace(3) %out
+  ret void
+}
+
+define amdgpu_ps void @load_P4_B16_D16(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_P4_B16_D16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_d16_b16 v0, v[1:2], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a = load i16, ptr addrspace(4) %ptra
+  %res = insertelement <2 x i16> %vec, i16 %a, i32 0
+  store <2 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @load_P4_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_P4_B16_D16_Hi:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_d16_hi_b16 v0, v[1:2], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a = load i16, ptr addrspace(4) %ptra
+  %res = insertelement <2 x i16> %vec, i16 %a, i32 1
+  store <2 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @sextload_P4_i8_D16(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: sextload_P4_i8_D16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_d16_i8 v0, v[1:2], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(4) %ptra
+  %a16 = sext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+  store <2 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @sextload_P4_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: sextload_P4_i8_D16_Hi:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_d16_hi_i8 v0, v[1:2], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(4) %ptra
+  %a16 = sext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+  store <2 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @zextload_P4_i8_D16(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: zextload_P4_i8_D16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_d16_u8 v0, v[1:2], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(4) %ptra
+  %a16 = zext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+  store <2 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @zextload_P4_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: zextload_P4_i8_D16_Hi:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_d16_hi_u8 v0, v[1:2], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(4) %ptra
+  %a16 = zext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a...
[truncated]

@@ -392,6 +401,102 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt(
MI.eraseFromParent();
}

bool AMDGPURegBankCombinerImpl::matchD16Load(MachineInstr &MI,
D16MatchInfo &MatchInfo) const {
if (!STI.d16PreservesUnusedBits())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We really need a way to attach predicates to the pattern in tablegen

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do have a way, it's the field called Predicates in the combine rule.

Comment on lines 422 to 423
if (MMO->isAtomic())
return false;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think atomic is an issue for d16, this doesn't change the memory properties only the destination register. If it is a problem, it's also a problem for volatile. So either this check is unnecessarily conservative or missing a check for volatile too

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure what I was missing, removing isAtomic check made changes to a few files, looks correct.

D16MatchInfo &MatchInfo) const {
B.buildInstr(MatchInfo.Opc, {MI.getOperand(0).getReg()},
{MatchInfo.Load->getOperand(1).getReg(), MatchInfo.Dst})
.addMemOperand(*MatchInfo.Load->memoperands_begin());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

copyMemRefs. Also can this be done inline in tablegen

@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/load-rules branch from 2a2bdbf to 696b1b2 Compare August 12, 2025 13:05
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/d16-loads branch from b7c4e4a to db26062 Compare August 12, 2025 13:06
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/load-rules branch from 696b1b2 to 788368e Compare August 12, 2025 13:16
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/d16-loads branch from db26062 to 033a3b7 Compare August 12, 2025 13:16
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/load-rules branch from 788368e to 1464e56 Compare August 12, 2025 13:28
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/d16-loads branch 2 times, most recently from a7fa75b to d3106eb Compare August 13, 2025 12:46
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/load-rules branch 2 times, most recently from d1da8f1 to d8c91e6 Compare August 13, 2025 13:16
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/d16-loads branch from d3106eb to b0b1c30 Compare August 13, 2025 13:16
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/load-rules branch from d8c91e6 to e863c35 Compare August 13, 2025 13:51
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/d16-loads branch from b0b1c30 to 34bde4a Compare August 13, 2025 13:52
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/load-rules branch from e863c35 to 87b9b39 Compare August 22, 2025 14:49
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/d16-loads branch 2 times, most recently from 441b892 to 617ca54 Compare August 22, 2025 14:56
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/load-rules branch from 87b9b39 to ccc9546 Compare August 22, 2025 14:57
@@ -392,6 +401,102 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt(
MI.eraseFromParent();
}

bool AMDGPURegBankCombinerImpl::matchD16Load(MachineInstr &MI,
D16MatchInfo &MatchInfo) const {
if (!STI.d16PreservesUnusedBits())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do have a way, it's the field called Predicates in the combine rule.

Add G_AMDGPU_LOAD_D16 generic instructions and GINodeEquivs for them;
this will import D16 load patterns into GlobalISel's TableGen-generated
instruction selector.
For the newly imported patterns to work, add combines for G_AMDGPU_LOAD_D16
in AMDGPURegBankCombiner.
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/d16-loads branch from 617ca54 to acde25e Compare August 26, 2025 12:17
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/load-rules branch from ccc9546 to 0a46b28 Compare August 26, 2025 12:17
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants