Skip to content

Commit 2605adb

Browse files
committed
[AMDGPU][GlobalISel] Select 8-byte LDS Ops with 4-byte alignment
Reviewers: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D73585
1 parent ce07cde commit 2605adb

File tree

6 files changed

+304
-170
lines changed

6 files changed

+304
-170
lines changed

llvm/lib/Target/AMDGPU/AMDGPUGISel.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,10 @@ def gi_ds_1addr_1offset :
8888
GIComplexOperandMatcher<s32, "selectDS1Addr1Offset">,
8989
GIComplexPatternEquiv<DS1Addr1Offset>;
9090

91+
// GlobalISel counterpart of the DS64Bit4ByteAligned complex pattern: operands
// matched by it are produced by the C++ selector routine named
// "selectDS64Bit4ByteAligned".
def gi_ds_64bit_4byte_aligned :
92+
GIComplexOperandMatcher<s64, "selectDS64Bit4ByteAligned">,
93+
GIComplexPatternEquiv<DS64Bit4ByteAligned>;
94+
9195
def gi_mubuf_addr64 :
9296
GIComplexOperandMatcher<s64, "selectMUBUFAddr64">,
9397
GIComplexPatternEquiv<MUBUFAddr64>;

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2399,6 +2399,50 @@ AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
23992399
}};
24002400
}
24012401

2402+
/// Complex-operand matcher for the DS64Bit4ByteAligned pattern.
///
/// Produces three renderers, in order: the address operand, offset0 and
/// offset1. The two offsets are expressed in dwords (byte offset / 4) and
/// always differ by exactly one.
///
/// If \p Root is a base plus a constant byte offset that can be folded, the
/// base register and the two dword offsets are rendered. Otherwise \p Root
/// itself is rendered with offsets 0 and 1.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
  // Offset folding is only attempted when the root register has a defining
  // instruction; without one we go straight to the generic form below.
  if (MRI->getVRegDef(Root.getReg())) {
    Register PtrBase;
    int64_t Offset;
    std::tie(PtrBase, Offset) =
      getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

    // The rendered offsets count dwords, so a byte offset can only be folded
    // when it is an exact multiple of 4; dividing anything else would silently
    // truncate and address the wrong location. Negative offsets are rejected
    // here as well, because only the larger of the two dword offsets is
    // range-checked below and the smaller one could otherwise be rendered as
    // a negative immediate.
    if (Offset > 0 && Offset % 4 == 0) {
      const int64_t DWordOffset0 = Offset / 4;
      const int64_t DWordOffset1 = DWordOffset0 + 1;
      if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
        // (add n0, c0)
        return {{
            [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); },
            [=](MachineInstrBuilder &MIB) { MIB.addImm(DWordOffset0); },
            [=](MachineInstrBuilder &MIB) { MIB.addImm(DWordOffset1); }
          }};
      }
    }

    // TODO: Handle a root defined by G_SUB.
    // TODO: Handle a root that is a constant integer address.
  }

  // Generic form: use the root operand unchanged and address the two
  // consecutive dwords starting at it.
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(1); }
    }};
}
2445+
24022446
/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
24032447
/// the base value with the constant offset. There may be intervening copies
24042448
/// between \p Root and the identified constant. Returns \p Root, 0 if this does

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,8 @@ class AMDGPUInstructionSelector : public InstructionSelector {
179179

180180
InstructionSelector::ComplexRendererFns
181181
selectDS1Addr1Offset(MachineOperand &Root) const;
182+
InstructionSelector::ComplexRendererFns
183+
selectDS64Bit4ByteAligned(MachineOperand &Root) const;
182184

183185
std::pair<Register, int64_t>
184186
getPtrBaseWithConstantOffset(Register Root,

llvm/lib/Target/AMDGPU/DSInstructions.td

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -737,31 +737,35 @@ def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_hi16_local>;
737737
def : DSWritePat <DS_WRITE_B8_D16_HI, i32, truncstorei8_hi16_local>;
738738
}
739739

740-
741-
class DS64Bit4ByteAlignedReadPat<DS_Pseudo inst, PatFrag frag> : GCNPat <
742-
(v2i32 (frag (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))),
740+
class DS64Bit4ByteAlignedReadPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
741+
(vt:$value (frag (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))),
743742
(inst $ptr, $offset0, $offset1, (i1 0))
744743
>;
745744

746-
class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, PatFrag frag> : GCNPat<
747-
(frag v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)),
748-
(inst $ptr, (i32 (EXTRACT_SUBREG $value, sub0)),
749-
(i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1,
745+
class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat<
746+
(frag vt:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)),
747+
(inst $ptr, (i32 (EXTRACT_SUBREG VReg_64:$value, sub0)),
748+
(i32 (EXTRACT_SUBREG VReg_64:$value, sub1)), $offset0, $offset1,
750749
(i1 0))
751750
>;
752751

753-
// v2i32 loads are split into i32 loads on SI during lowering, due to a bug
754-
// related to bounds checking.
755-
let OtherPredicates = [LDSRequiresM0Init, isGFX7Plus] in {
756-
def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, load_local_m0>;
757-
def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, store_local_m0>;
758-
}
752+
multiclass DS64Bit4ByteAlignedPat_mc<ValueType vt> {
753+
let OtherPredicates = [LDSRequiresM0Init, isGFX7Plus] in {
754+
def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, vt, load_local_m0>;
755+
def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, vt, store_local_m0>;
756+
}
759757

760-
let OtherPredicates = [NotLDSRequiresM0Init] in {
761-
def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32_gfx9, load_local>;
762-
def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, store_local>;
758+
let OtherPredicates = [NotLDSRequiresM0Init] in {
759+
def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32_gfx9, vt, load_local>;
760+
def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, vt, store_local>;
761+
}
763762
}
764763

764+
// v2i32 loads are split into i32 loads on SI during lowering, due to a bug
765+
// related to bounds checking.
766+
foreach vt = VReg_64.RegTypes in {
767+
defm : DS64Bit4ByteAlignedPat_mc<vt>;
768+
}
765769

766770
let AddedComplexity = 100 in {
767771

0 commit comments

Comments
 (0)