Skip to content

Commit 533d650

Browse files
committed
AMDGPU/GlobalISel: Move llvm.amdgcn.raw.buffer.store handling
Treat this the same way as loads. There's less value to the intermediate nodes, but it's good to be consistent.
1 parent e6d2583 commit 533d650

11 files changed

+254
-345
lines changed

llvm/lib/Target/AMDGPU/AMDGPUGISel.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,11 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_FORMAT, SIbuffer_load_format>;
139139
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_FORMAT_D16, SIbuffer_load_format_d16>;
140140
def : GINodeEquiv<G_AMDGPU_TBUFFER_LOAD_FORMAT, SItbuffer_load>;
141141
def : GINodeEquiv<G_AMDGPU_TBUFFER_LOAD_FORMAT_D16, SItbuffer_load_d16>;
142+
def : GINodeEquiv<G_AMDGPU_BUFFER_STORE, SIbuffer_store>;
143+
def : GINodeEquiv<G_AMDGPU_BUFFER_STORE_SHORT, SIbuffer_store_short>;
144+
def : GINodeEquiv<G_AMDGPU_BUFFER_STORE_BYTE, SIbuffer_store_byte>;
145+
def : GINodeEquiv<G_AMDGPU_BUFFER_STORE_FORMAT, SIbuffer_store_format>;
146+
def : GINodeEquiv<G_AMDGPU_BUFFER_STORE_FORMAT_D16, SIbuffer_store_format_d16>;
142147

143148
// FIXME: Check MMO is atomic
144149
def : GINodeEquiv<G_AMDGPU_ATOMIC_INC, SIatomic_inc>;

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 0 additions & 229 deletions
Original file line numberDiff line numberDiff line change
@@ -856,177 +856,6 @@ bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
856856
return Ret;
857857
}
858858

859-
static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
860-
int64_t C;
861-
if (mi_match(Reg, MRI, m_ICst(C)) && C == 0)
862-
return true;
863-
864-
// FIXME: matcher should ignore copies
865-
return mi_match(Reg, MRI, m_Copy(m_ICst(C))) && C == 0;
866-
}
867-
868-
static unsigned extractGLC(unsigned AuxiliaryData) {
869-
return AuxiliaryData & 1;
870-
}
871-
872-
static unsigned extractSLC(unsigned AuxiliaryData) {
873-
return (AuxiliaryData >> 1) & 1;
874-
}
875-
876-
static unsigned extractDLC(unsigned AuxiliaryData) {
877-
return (AuxiliaryData >> 2) & 1;
878-
}
879-
880-
static unsigned extractSWZ(unsigned AuxiliaryData) {
881-
return (AuxiliaryData >> 3) & 1;
882-
}
883-
884-
static unsigned getBufferStoreOpcode(LLT Ty,
885-
const unsigned MemSize,
886-
const bool Offen) {
887-
const int Size = Ty.getSizeInBits();
888-
switch (8 * MemSize) {
889-
case 8:
890-
return Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
891-
AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
892-
case 16:
893-
return Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
894-
AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
895-
default:
896-
unsigned Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
897-
AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
898-
if (Size > 32)
899-
Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
900-
return Opc;
901-
}
902-
}
903-
904-
static unsigned getBufferStoreFormatOpcode(LLT Ty,
905-
const unsigned MemSize,
906-
const bool Offen) {
907-
bool IsD16Packed = Ty.getScalarSizeInBits() == 16;
908-
bool IsD16Unpacked = 8 * MemSize < Ty.getSizeInBits();
909-
int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
910-
911-
if (IsD16Packed) {
912-
switch (NumElts) {
913-
case 1:
914-
return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact :
915-
AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact;
916-
case 2:
917-
return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact :
918-
AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFSET_exact;
919-
case 3:
920-
return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFEN_exact :
921-
AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFSET_exact;
922-
case 4:
923-
return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact :
924-
AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFSET_exact;
925-
default:
926-
return -1;
927-
}
928-
}
929-
930-
if (IsD16Unpacked) {
931-
switch (NumElts) {
932-
case 1:
933-
return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact :
934-
AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact;
935-
case 2:
936-
return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact :
937-
AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFSET_exact;
938-
case 3:
939-
return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFEN_exact :
940-
AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFSET_exact;
941-
case 4:
942-
return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact :
943-
AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFSET_exact;
944-
default:
945-
return -1;
946-
}
947-
}
948-
949-
switch (NumElts) {
950-
case 1:
951-
return Offen ? AMDGPU::BUFFER_STORE_FORMAT_X_OFFEN_exact :
952-
AMDGPU::BUFFER_STORE_FORMAT_X_OFFSET_exact;
953-
case 2:
954-
return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XY_OFFEN_exact :
955-
AMDGPU::BUFFER_STORE_FORMAT_XY_OFFSET_exact;
956-
case 3:
957-
return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFEN_exact :
958-
AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFSET_exact;
959-
case 4:
960-
return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFEN_exact :
961-
AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFSET_exact;
962-
default:
963-
return -1;
964-
}
965-
966-
llvm_unreachable("unhandled buffer store");
967-
}
968-
969-
// TODO: Move this to combiner
970-
// Returns base register, imm offset, total constant offset.
971-
std::tuple<Register, unsigned, unsigned>
972-
AMDGPUInstructionSelector::splitBufferOffsets(MachineIRBuilder &B,
973-
Register OrigOffset) const {
974-
const unsigned MaxImm = 4095;
975-
Register BaseReg;
976-
unsigned TotalConstOffset;
977-
MachineInstr *OffsetDef;
978-
979-
std::tie(BaseReg, TotalConstOffset, OffsetDef)
980-
= AMDGPU::getBaseWithConstantOffset(*MRI, OrigOffset);
981-
982-
unsigned ImmOffset = TotalConstOffset;
983-
984-
// If the immediate value is too big for the immoffset field, put the value
985-
// & 4095 into the immoffset field so that the value that is copied/added
986-
// for the voffset field is a multiple of 4096, and it stands more chance
987-
// of being CSEd with the copy/add for another similar load/store.
988-
// However, do not do that rounding down to a multiple of 4096 if that is a
989-
// negative number, as it appears to be illegal to have a negative offset
990-
// in the vgpr, even if adding the immediate offset makes it positive.
991-
unsigned Overflow = ImmOffset & ~MaxImm;
992-
ImmOffset -= Overflow;
993-
if ((int32_t)Overflow < 0) {
994-
Overflow += ImmOffset;
995-
ImmOffset = 0;
996-
}
997-
998-
if (Overflow != 0) {
999-
// In case this is in a waterfall loop, insert offset code at the def point
1000-
// of the offset, not inside the loop.
1001-
MachineBasicBlock::iterator OldInsPt = B.getInsertPt();
1002-
MachineBasicBlock &OldMBB = B.getMBB();
1003-
B.setInstr(*OffsetDef);
1004-
1005-
if (!BaseReg) {
1006-
BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1007-
B.buildInstr(AMDGPU::V_MOV_B32_e32)
1008-
.addDef(BaseReg)
1009-
.addImm(Overflow);
1010-
} else {
1011-
Register OverflowVal = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1012-
B.buildInstr(AMDGPU::V_MOV_B32_e32)
1013-
.addDef(OverflowVal)
1014-
.addImm(Overflow);
1015-
1016-
Register NewBaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1017-
TII.getAddNoCarry(B.getMBB(), B.getInsertPt(), B.getDebugLoc(), NewBaseReg)
1018-
.addReg(BaseReg)
1019-
.addReg(OverflowVal, RegState::Kill)
1020-
.addImm(0);
1021-
BaseReg = NewBaseReg;
1022-
}
1023-
1024-
B.setInsertPt(OldMBB, OldInsPt);
1025-
}
1026-
1027-
return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
1028-
}
1029-
1030859
bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1031860
// FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1032861
// SelectionDAG uses for wave32 vs wave64.
@@ -1042,60 +871,6 @@ bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1042871
return true;
1043872
}
1044873

1045-
bool AMDGPUInstructionSelector::selectStoreIntrinsic(MachineInstr &MI,
1046-
bool IsFormat) const {
1047-
MachineIRBuilder B(MI);
1048-
MachineFunction &MF = B.getMF();
1049-
Register VData = MI.getOperand(1).getReg();
1050-
LLT Ty = MRI->getType(VData);
1051-
1052-
int Size = Ty.getSizeInBits();
1053-
if (Size % 32 != 0)
1054-
return false;
1055-
1056-
// FIXME: Verifier should enforce 1 MMO for these intrinsics.
1057-
MachineMemOperand *MMO = *MI.memoperands_begin();
1058-
const int MemSize = MMO->getSize();
1059-
1060-
Register RSrc = MI.getOperand(2).getReg();
1061-
Register VOffset = MI.getOperand(3).getReg();
1062-
Register SOffset = MI.getOperand(4).getReg();
1063-
unsigned AuxiliaryData = MI.getOperand(5).getImm();
1064-
unsigned ImmOffset;
1065-
unsigned TotalOffset;
1066-
1067-
std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
1068-
if (TotalOffset != 0)
1069-
MMO = MF.getMachineMemOperand(MMO, TotalOffset, MemSize);
1070-
1071-
const bool Offen = !isZero(VOffset, *MRI);
1072-
1073-
int Opc = IsFormat ? getBufferStoreFormatOpcode(Ty, MemSize, Offen) :
1074-
getBufferStoreOpcode(Ty, MemSize, Offen);
1075-
if (Opc == -1)
1076-
return false;
1077-
1078-
MachineInstrBuilder MIB = B.buildInstr(Opc)
1079-
.addUse(VData);
1080-
1081-
if (Offen)
1082-
MIB.addUse(VOffset);
1083-
1084-
MIB.addUse(RSrc)
1085-
.addUse(SOffset)
1086-
.addImm(ImmOffset)
1087-
.addImm(extractGLC(AuxiliaryData))
1088-
.addImm(extractSLC(AuxiliaryData))
1089-
.addImm(0) // tfe: FIXME: Remove from inst
1090-
.addImm(extractDLC(AuxiliaryData))
1091-
.addImm(extractSWZ(AuxiliaryData))
1092-
.addMemOperand(MMO);
1093-
1094-
MI.eraseFromParent();
1095-
1096-
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1097-
}
1098-
1099874
static unsigned getDSShaderTypeValue(const MachineFunction &MF) {
1100875
switch (MF.getFunction().getCallingConv()) {
1101876
case CallingConv::AMDGPU_PS:
@@ -1325,10 +1100,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
13251100
switch (IntrinsicID) {
13261101
case Intrinsic::amdgcn_end_cf:
13271102
return selectEndCfIntrinsic(I);
1328-
case Intrinsic::amdgcn_raw_buffer_store:
1329-
return selectStoreIntrinsic(I, false);
1330-
case Intrinsic::amdgcn_raw_buffer_store_format:
1331-
return selectStoreIntrinsic(I, true);
13321103
case Intrinsic::amdgcn_ds_ordered_add:
13331104
case Intrinsic::amdgcn_ds_ordered_swap:
13341105
return selectDSOrderedIntrinsic(I, IntrinsicID);

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,11 +100,7 @@ class AMDGPUInstructionSelector : public InstructionSelector {
100100
bool selectInterpP1F16(MachineInstr &MI) const;
101101
bool selectG_INTRINSIC(MachineInstr &I) const;
102102

103-
std::tuple<Register, unsigned, unsigned>
104-
splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const;
105-
106103
bool selectEndCfIntrinsic(MachineInstr &MI) const;
107-
bool selectStoreIntrinsic(MachineInstr &MI, bool IsFormat) const;
108104
bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
109105
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
110106
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;

0 commit comments

Comments
 (0)