-
Notifications
You must be signed in to change notification settings - Fork 14.9k
AMDGPU/GlobalISel: Import D16 load patterns and add combines for them #153178
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: users/petar-avramovic/load-rules
Are you sure you want to change the base?
AMDGPU/GlobalISel: Import D16 load patterns and add combines for them #153178
Conversation
Warning This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Petar Avramovic (petar-avramovic) Changes: Add G_AMDGPU_LOAD_D16 generic instructions and GINodeEquivs for them. Patch is 39.20 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/153178.diff — 6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 9587fad1ecd63..30e5362ba7adf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -71,6 +71,14 @@ def int_minmax_to_med3 : GICombineRule<
[{ return matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]),
(apply [{ applyMed3(*${min_or_max}, ${matchinfo}); }])>;
+def d16_matchdata : GIDefMatchData<"D16MatchInfo">;
+
+def d16_load : GICombineRule<
+ (defs root:$bitcast, d16_matchdata:$matchinfo),
+ (match (wip_match_opcode G_BITCAST):$bitcast,
+ [{ return matchD16Load(*${bitcast}, ${matchinfo}); }]),
+ (apply [{ applyD16Load(*${bitcast}, ${matchinfo}); }])>;
+
def fp_minmax_to_med3 : GICombineRule<
(defs root:$min_or_max, med3_matchdata:$matchinfo),
(match (wip_match_opcode G_FMAXNUM,
@@ -198,5 +206,6 @@ def AMDGPURegBankCombiner : GICombiner<
zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
identity_combines, redundant_and, constant_fold_cast_op,
- cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines]> {
+ cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines,
+ d16_load]> {
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 394a143dd3086..a4ccf368f7745 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -309,6 +309,13 @@ def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_PREFETCH, SIsbuffer_prefetch>;
+def : GINodeEquiv<G_AMDGPU_LOAD_D16_LO, SIload_d16_lo>;
+def : GINodeEquiv<G_AMDGPU_LOAD_D16_LO_U8, SIload_d16_lo_u8>;
+def : GINodeEquiv<G_AMDGPU_LOAD_D16_LO_I8, SIload_d16_lo_i8>;
+def : GINodeEquiv<G_AMDGPU_LOAD_D16_HI, SIload_d16_hi>;
+def : GINodeEquiv<G_AMDGPU_LOAD_D16_HI_U8, SIload_d16_hi_u8>;
+def : GINodeEquiv<G_AMDGPU_LOAD_D16_HI_I8, SIload_d16_hi_i8>;
+
def : GINodeEquiv<G_AMDGPU_WHOLE_WAVE_FUNC_SETUP, AMDGPUwhole_wave_setup>;
// G_AMDGPU_WHOLE_WAVE_FUNC_RETURN is simpler than AMDGPUwhole_wave_return,
// so we don't mark it as equivalent.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index ee324a5e93f0f..946a3361aed29 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -74,6 +74,12 @@ class AMDGPURegBankCombinerImpl : public Combiner {
Register Val0, Val1, Val2;
};
+ struct D16MatchInfo {
+ unsigned Opc;
+ Register Dst;
+ MachineInstr *Load;
+ };
+
MinMaxMedOpc getMinMaxPair(unsigned Opc) const;
template <class m_Cst, typename CstTy>
@@ -89,6 +95,9 @@ class AMDGPURegBankCombinerImpl : public Combiner {
void applyCanonicalizeZextShiftAmt(MachineInstr &MI, MachineInstr &Ext) const;
+ bool matchD16Load(MachineInstr &MI, D16MatchInfo &MatchInfo) const;
+ void applyD16Load(MachineInstr &MI, D16MatchInfo &MatchInfo) const;
+
private:
SIModeRegisterDefaults getMode() const;
bool getIEEE() const;
@@ -392,6 +401,102 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt(
MI.eraseFromParent();
}
+bool AMDGPURegBankCombinerImpl::matchD16Load(MachineInstr &MI,
+ D16MatchInfo &MatchInfo) const {
+ if (!STI.d16PreservesUnusedBits())
+ return false;
+
+ Register Dst;
+ MachineInstr *Load, *SextLoad;
+ const int64_t CleanLo16 = 0xFFFFFFFFFFFF0000;
+ const int64_t CleanHi16 = 0x000000000000FFFF;
+
+ // Load lo
+ if (mi_match(MI.getOperand(1).getReg(), MRI,
+ m_GOr(m_GAnd(m_GBitcast(m_Reg(Dst)),
+ m_Copy(m_SpecificICst(CleanLo16))),
+ m_MInstr(Load)))) {
+
+ if (Load->getOpcode() == AMDGPU::G_ZEXTLOAD) {
+ const MachineMemOperand *MMO = *Load->memoperands_begin();
+ if (MMO->isAtomic())
+ return false;
+
+ unsigned LoadSize = MMO->getSizeInBits().getValue();
+ if (LoadSize == 8) {
+ MatchInfo = {AMDGPU::G_AMDGPU_LOAD_D16_LO_U8, Dst, Load};
+ } else if (LoadSize == 16) {
+ MatchInfo = {AMDGPU::G_AMDGPU_LOAD_D16_LO, Dst, Load};
+ } else
+ return false;
+ return true;
+ }
+
+ if (mi_match(
+ Load, MRI,
+ m_GAnd(m_MInstr(SextLoad), m_Copy(m_SpecificICst(CleanHi16))))) {
+ if (SextLoad->getOpcode() != AMDGPU::G_SEXTLOAD)
+ return false;
+
+ const MachineMemOperand *MMO = *SextLoad->memoperands_begin();
+ if (MMO->isAtomic() || MMO->getSizeInBits().getValue() != 8)
+ return false;
+
+ MatchInfo = {AMDGPU::G_AMDGPU_LOAD_D16_LO_I8, Dst, SextLoad};
+ return true;
+ }
+
+ return false;
+ }
+
+ // Load hi
+ if (mi_match(MI.getOperand(1).getReg(), MRI,
+ m_GOr(m_GAnd(m_GBitcast(m_Reg(Dst)),
+ m_Copy(m_SpecificICst(CleanHi16))),
+ m_GShl(m_MInstr(Load), m_Copy(m_SpecificICst(16)))))) {
+
+ if (Load->getOpcode() == AMDGPU::G_ZEXTLOAD) {
+ const MachineMemOperand *MMO = *Load->memoperands_begin();
+ if (MMO->isAtomic())
+ return false;
+
+ unsigned LoadSize = MMO->getSizeInBits().getValue();
+ if (LoadSize == 8) {
+ MatchInfo = {AMDGPU::G_AMDGPU_LOAD_D16_HI_U8, Dst, Load};
+ } else if (LoadSize == 16) {
+ MatchInfo = {AMDGPU::G_AMDGPU_LOAD_D16_HI, Dst, Load};
+ } else
+ return false;
+ return true;
+ }
+
+ if (mi_match(
+ Load, MRI,
+ m_GAnd(m_MInstr(SextLoad), m_Copy(m_SpecificICst(CleanHi16))))) {
+ if (SextLoad->getOpcode() != AMDGPU::G_SEXTLOAD)
+ return false;
+ const MachineMemOperand *MMO = *SextLoad->memoperands_begin();
+ if (MMO->isAtomic() || MMO->getSizeInBits().getValue() != 8)
+ return false;
+
+ MatchInfo = {AMDGPU::G_AMDGPU_LOAD_D16_HI_I8, Dst, SextLoad};
+ return true;
+ }
+
+ return false;
+ }
+
+ return false;
+}
+
+void AMDGPURegBankCombinerImpl::applyD16Load(MachineInstr &MI,
+ D16MatchInfo &MatchInfo) const {
+ B.buildInstr(MatchInfo.Opc, {MI.getOperand(0).getReg()},
+ {MatchInfo.Load->getOperand(1).getReg(), MatchInfo.Dst})
+ .addMemOperand(*MatchInfo.Load->memoperands_begin());
+ MI.eraseFromParent();
+}
+
SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const {
return MF.getInfo<SIMachineFunctionInfo>()->getMode();
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index bd5dfa92a8e43..ec761207351e1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -4206,6 +4206,21 @@ def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction;
def G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction;
def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction;
+class D16LoadGenericInstruction : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins ptype1:$addr);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+}
+
+def G_AMDGPU_LOAD_D16_LO : D16LoadGenericInstruction;
+def G_AMDGPU_LOAD_D16_LO_U8 : D16LoadGenericInstruction;
+def G_AMDGPU_LOAD_D16_LO_I8 : D16LoadGenericInstruction;
+def G_AMDGPU_LOAD_D16_HI : D16LoadGenericInstruction;
+def G_AMDGPU_LOAD_D16_HI_U8 : D16LoadGenericInstruction;
+def G_AMDGPU_LOAD_D16_HI_I8 : D16LoadGenericInstruction;
+
+
class BufferStoreGenericInstruction : AMDGPUGenericInstruction {
let OutOperandList = (outs);
let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll
new file mode 100644
index 0000000000000..62459247cc440
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll
@@ -0,0 +1,412 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel -new-reg-bank-select < %s | FileCheck --check-prefixes=GFX12 %s
+
+define amdgpu_ps void @load_P0_B16_D16(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: load_P0_B16_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_load_d16_b16 v0, v[1:2]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b32 v[3:4], v0
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(0) %ptra
+ %res = insertelement <2 x i16> %vec, i16 %a, i32 0
+ store <2 x i16> %res, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @load_P0_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: load_P0_B16_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b32 v[3:4], v0
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(0) %ptra
+ %res = insertelement <2 x i16> %vec, i16 %a, i32 1
+ store <2 x i16> %res, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P0_i8_D16(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: sextload_P0_i8_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_load_d16_i8 v0, v[1:2]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b32 v[3:4], v0
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(0) %ptra
+ %a16 = sext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+ store <2 x i16> %res, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P0_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: sextload_P0_i8_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_load_d16_hi_i8 v0, v[1:2]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b32 v[3:4], v0
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(0) %ptra
+ %a16 = sext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+ store <2 x i16> %res, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P0_i8_D16(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: zextload_P0_i8_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_load_d16_u8 v0, v[1:2]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b32 v[3:4], v0
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(0) %ptra
+ %a16 = zext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+ store <2 x i16> %res, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P0_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: zextload_P0_i8_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: flat_load_d16_hi_u8 v0, v[1:2]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b32 v[3:4], v0
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(0) %ptra
+ %a16 = zext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+ store <2 x i16> %res, ptr addrspace(0) %out
+ ret void
+}
+
+define amdgpu_ps void @load_P1_B16_D16(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_P1_B16_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_b16 v0, v[1:2], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %ptra
+ %res = insertelement <2 x i16> %vec, i16 %a, i32 0
+ store <2 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_P1_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_P1_B16_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_hi_b16 v0, v[1:2], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %ptra
+ %res = insertelement <2 x i16> %vec, i16 %a, i32 1
+ store <2 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P1_i8_D16(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: sextload_P1_i8_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_i8 v0, v[1:2], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(1) %ptra
+ %a16 = sext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+ store <2 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P1_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: sextload_P1_i8_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_hi_i8 v0, v[1:2], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(1) %ptra
+ %a16 = sext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+ store <2 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P1_i8_D16(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: zextload_P1_i8_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_u8 v0, v[1:2], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(1) %ptra
+ %a16 = zext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+ store <2 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P1_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: zextload_P1_i8_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_hi_u8 v0, v[1:2], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(1) %ptra
+ %a16 = zext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+ store <2 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_P3_B16_D16(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: load_P3_B16_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_load_u16_d16 v0, v1
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: ds_store_b32 v2, v0
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(3) %ptra
+ %res = insertelement <2 x i16> %vec, i16 %a, i32 0
+ store <2 x i16> %res, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @load_P3_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: load_P3_B16_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_load_u16_d16_hi v0, v1
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: ds_store_b32 v2, v0
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(3) %ptra
+ %res = insertelement <2 x i16> %vec, i16 %a, i32 1
+ store <2 x i16> %res, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P3_i8_D16(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: sextload_P3_i8_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_load_i8_d16 v0, v1
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: ds_store_b32 v2, v0
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(3) %ptra
+ %a16 = sext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+ store <2 x i16> %res, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P3_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: sextload_P3_i8_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_load_i8_d16_hi v0, v1
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: ds_store_b32 v2, v0
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(3) %ptra
+ %a16 = sext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+ store <2 x i16> %res, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P3_i8_D16(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: zextload_P3_i8_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_load_u8_d16 v0, v1
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: ds_store_b32 v2, v0
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(3) %ptra
+ %a16 = zext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+ store <2 x i16> %res, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P3_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: zextload_P3_i8_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: ds_load_u8_d16_hi v0, v1
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: ds_store_b32 v2, v0
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(3) %ptra
+ %a16 = zext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+ store <2 x i16> %res, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_ps void @load_P4_B16_D16(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_P4_B16_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_b16 v0, v[1:2], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(4) %ptra
+ %res = insertelement <2 x i16> %vec, i16 %a, i32 0
+ store <2 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @load_P4_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_P4_B16_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_hi_b16 v0, v[1:2], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(4) %ptra
+ %res = insertelement <2 x i16> %vec, i16 %a, i32 1
+ store <2 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P4_i8_D16(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: sextload_P4_i8_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_i8 v0, v[1:2], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(4) %ptra
+ %a16 = sext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+ store <2 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @sextload_P4_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: sextload_P4_i8_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_hi_i8 v0, v[1:2], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(4) %ptra
+ %a16 = sext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+ store <2 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P4_i8_D16(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: zextload_P4_i8_D16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_u8 v0, v[1:2], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(4) %ptra
+ %a16 = zext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+ store <2 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @zextload_P4_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: zextload_P4_i8_D16_Hi:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_d16_hi_u8 v0, v[1:2], off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(4) %ptra
+ %a16 = zext i8 %a to i16
+ %res = insertelement <2 x i16> %vec, i16 %a...
[truncated]
|
@@ -392,6 +401,102 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt( | |||
MI.eraseFromParent(); | |||
} | |||
|
|||
bool AMDGPURegBankCombinerImpl::matchD16Load(MachineInstr &MI, | |||
D16MatchInfo &MatchInfo) const { | |||
if (!STI.d16PreservesUnusedBits()) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We really need a way to attach predicates to the pattern in tablegen
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We do have a way, it's the field called Predicates
in the combine rule.
if (MMO->isAtomic()) | ||
return false; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think atomic is an issue for D16; this doesn't change the memory properties, only the destination register. If it is a problem, it's also a problem for volatile. So either this check is unnecessarily conservative, or it is missing a check for volatile as well.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure what I was missing; removing the isAtomic check made changes to a few files, and the results look correct.
D16MatchInfo &MatchInfo) const { | ||
B.buildInstr(MatchInfo.Opc, {MI.getOperand(0).getReg()}, | ||
{MatchInfo.Load->getOperand(1).getReg(), MatchInfo.Dst}) | ||
.addMemOperand(*MatchInfo.Load->memoperands_begin()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use copyMemRefs instead. Also, can this be done inline in TableGen?
2a2bdbf
to
696b1b2
Compare
b7c4e4a
to
db26062
Compare
696b1b2
to
788368e
Compare
db26062
to
033a3b7
Compare
788368e
to
1464e56
Compare
a7fa75b
to
d3106eb
Compare
d1da8f1
to
d8c91e6
Compare
d3106eb
to
b0b1c30
Compare
d8c91e6
to
e863c35
Compare
b0b1c30
to
34bde4a
Compare
e863c35
to
87b9b39
Compare
441b892
to
617ca54
Compare
87b9b39
to
ccc9546
Compare
@@ -392,6 +401,102 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt( | |||
MI.eraseFromParent(); | |||
} | |||
|
|||
bool AMDGPURegBankCombinerImpl::matchD16Load(MachineInstr &MI, | |||
D16MatchInfo &MatchInfo) const { | |||
if (!STI.d16PreservesUnusedBits()) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We do have a way, it's the field called Predicates
in the combine rule.
Add G_AMDGPU_LOAD_D16 generic instructions and GINodeEquivs for them; this will import D16 load patterns into GlobalISel's TableGen'ed instruction selector. For the newly imported patterns to work, add combines for G_AMDGPU_LOAD_D16 in AMDGPURegBankCombiner.
617ca54
to
acde25e
Compare
ccc9546
to
0a46b28
Compare
Add G_AMDGPU_LOAD_D16 generic instructions and GINodeEquivs for them;
this will import D16 load patterns into GlobalISel's TableGen'ed
instruction selector.
For the newly imported patterns to work, add combines for
G_AMDGPU_LOAD_D16 in AMDGPURegBankCombiner.