-
Notifications
You must be signed in to change notification settings - Fork 14.9k
AMDGPU/GlobalISel: Add regbanklegalize rules for load and store #153176
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel Author: Petar Avramovic (petar-avramovic) Changes: Cover all the missing cases and add very detailed tests for each rule.
Some tests have code size regression since they use more sgpr instructions. Patch is 396.54 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/153176.diff 60 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index b45627d9c1c5d..ce2de50b4a118 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -352,6 +352,34 @@ void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
MI.eraseFromParent();
}
+void RegBankLegalizeHelper::widenMMO(MachineInstr &MI) {
+ MachineFunction &MF = B.getMF();
+ assert(MI.getNumMemOperands() == 1);
+ MachineMemOperand *MMO = *MI.memoperands_begin();
+
+ const unsigned MemSize = 8 * MMO->getSize().getValue();
+ Register Dst = MI.getOperand(0).getReg();
+ Register Ptr = MI.getOperand(1).getReg();
+
+ MachineMemOperand *WideMMO = MF.getMachineMemOperand(MMO, 0, S32);
+
+ if (MI.getOpcode() == G_LOAD) {
+ B.buildLoad(Dst, Ptr, *WideMMO);
+ } else {
+ auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
+
+ if (MI.getOpcode() == G_ZEXTLOAD) {
+ auto Mask =
+ B.buildConstant(SgprRB_S32, APInt::getLowBitsSet(32, MemSize));
+ B.buildAnd(Dst, Load, Mask);
+ } else {
+ B.buildSExtInReg(Dst, Load, MemSize);
+ }
+ }
+
+ MI.eraseFromParent();
+}
+
void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
Register Dst = MI.getOperand(0).getReg();
LLT Ty = MRI.getType(Dst);
@@ -744,6 +772,9 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
}
break;
}
+ case WidenMMO: {
+ return widenMMO(MI);
+ }
}
if (!WaterfallSgprs.empty()) {
@@ -759,6 +790,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
return LLT::scalar(1);
case Sgpr16:
case Vgpr16:
+ case UniInVgprS16:
return LLT::scalar(16);
case Sgpr32:
case Sgpr32_WF:
@@ -895,6 +927,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case SgprB256:
case SgprB512:
case UniInVcc:
+ case UniInVgprS16:
case UniInVgprS32:
case UniInVgprV2S16:
case UniInVgprV4S32:
@@ -1015,6 +1048,18 @@ void RegBankLegalizeHelper::applyMappingDst(
B.buildTrunc(Reg, CopyS32_Vcc);
break;
}
+ case UniInVgprS16: {
+ assert(Ty == getTyFromID(MethodIDs[OpIdx]));
+ assert(RB == SgprRB);
+ Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
+ Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
+ Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
+ Op.setReg(NewVgprDstS16);
+ B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
+ buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
+ B.buildTrunc(Reg, NewSgprDstS32);
+ break;
+ }
case UniInVgprS32:
case UniInVgprV2S16:
case UniInVgprV4S32: {
@@ -1257,6 +1302,7 @@ void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
return;
}
+ MI.dump();
LLVM_DEBUG(dbgs() << "G_PHI not handled: "; MI.dump(););
llvm_unreachable("type not supported");
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
index db965d8c000d9..44bf031b6884f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
@@ -107,6 +107,7 @@ class RegBankLegalizeHelper {
void splitLoad(MachineInstr &MI, ArrayRef<LLT> LLTBreakdown,
LLT MergeTy = LLT());
void widenLoad(MachineInstr &MI, LLT WideTy, LLT MergeTy = LLT());
+ void widenMMO(MachineInstr &MI);
void lower(MachineInstr &MI, const RegBankLLTMapping &Mapping,
SmallSet<Register, 4> &SgprWaterfallOperandRegs);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 8c56c21621121..5c66b91d5f364 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -467,6 +467,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
: ST(&_ST), MRI(&_MRI) {
addRulesForGOpcs({G_ADD, G_SUB}, Standard)
+ .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}})
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
@@ -615,8 +617,9 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{UniS64, S64}, {{Sgpr64}, {Sgpr64}}})
.Any({{DivS64, S64}, {{Vgpr64}, {Vgpr64}, SplitTo32SExtInReg}});
- bool hasUnalignedLoads = ST->getGeneration() >= AMDGPUSubtarget::GFX12;
+ bool hasSMRDx3 = ST->hasScalarDwordx3Loads();
bool hasSMRDSmall = ST->hasScalarSubwordLoads();
+ bool usesTrue16 = ST->useRealTrue16Insts();
Predicate isAlign16([](const MachineInstr &MI) -> bool {
return (*MI.memoperands_begin())->getAlign() >= Align(16);
@@ -654,54 +657,153 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
return (*MI.memoperands_begin())->getFlags() & MONoClobber;
});
- Predicate isNaturalAlignedSmall([](const MachineInstr &MI) -> bool {
+ Predicate isNaturalAligned([](const MachineInstr &MI) -> bool {
+ const MachineMemOperand *MMO = *MI.memoperands_begin();
+ return MMO->getAlign() >= Align(MMO->getSize().getValue());
+ });
+
+ Predicate is8Or16BitMMO([](const MachineInstr &MI) -> bool {
const MachineMemOperand *MMO = *MI.memoperands_begin();
const unsigned MemSize = 8 * MMO->getSize().getValue();
- return (MemSize == 16 && MMO->getAlign() >= Align(2)) ||
- (MemSize == 8 && MMO->getAlign() >= Align(1));
+ return MemSize == 16 || MemSize == 8;
+ });
+
+ Predicate is32BitMMO([](const MachineInstr &MI) -> bool {
+ const MachineMemOperand *MMO = *MI.memoperands_begin();
+ return 8 * MMO->getSize().getValue() == 32;
});
auto isUL = !isAtomicMMO && isUniMMO && (isConst || !isVolatileMMO) &&
(isConst || isInvMMO || isNoClobberMMO);
// clang-format off
+ // TODO: S32Dst, 16bit any-extending load should not appear on True16 targets
addRulesForGOpcs({G_LOAD})
- .Any({{DivB32, DivP0}, {{VgprB32}, {VgprP0}}})
- .Any({{DivB32, UniP0}, {{VgprB32}, {VgprP0}}})
-
- .Any({{DivB32, DivP1}, {{VgprB32}, {VgprP1}}})
- .Any({{{UniB256, UniP1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}})
- .Any({{{UniB512, UniP1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}})
- .Any({{{UniB32, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}})
- .Any({{{UniB64, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP1}}})
- .Any({{{UniB96, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP1}}})
- .Any({{{UniB128, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP1}}})
- .Any({{{UniB256, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP1}, SplitLoad}})
- .Any({{{UniB512, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP1}, SplitLoad}})
-
- .Any({{DivB32, UniP3}, {{VgprB32}, {VgprP3}}})
- .Any({{{UniB32, UniP3}, isAlign4 && isUL}, {{SgprB32}, {SgprP3}}})
- .Any({{{UniB32, UniP3}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP3}}})
-
- .Any({{{DivB256, DivP4}}, {{VgprB256}, {VgprP4}, SplitLoad}})
- .Any({{{UniB32, UniP4}, isNaturalAlignedSmall && isUL}, {{SgprB32}, {SgprP4}}}, hasSMRDSmall) // i8 and i16 load
- .Any({{{UniB32, UniP4}, isAlign4 && isUL}, {{SgprB32}, {SgprP4}}})
- .Any({{{UniB96, UniP4}, isAlign16 && isUL}, {{SgprB96}, {SgprP4}, WidenLoad}}, !hasUnalignedLoads)
- .Any({{{UniB96, UniP4}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP4}, SplitLoad}}, !hasUnalignedLoads)
- .Any({{{UniB96, UniP4}, isAlign4 && isUL}, {{SgprB96}, {SgprP4}}}, hasUnalignedLoads)
- .Any({{{UniB128, UniP4}, isAlign4 && isUL}, {{SgprB128}, {SgprP4}}})
- .Any({{{UniB256, UniP4}, isAlign4 && isUL}, {{SgprB256}, {SgprP4}}})
- .Any({{{UniB512, UniP4}, isAlign4 && isUL}, {{SgprB512}, {SgprP4}}})
- .Any({{{UniB32, UniP4}, !isNaturalAlignedSmall || !isUL}, {{UniInVgprB32}, {VgprP4}}}, hasSMRDSmall) // i8 and i16 load
- .Any({{{UniB32, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP4}}})
- .Any({{{UniB256, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP4}, SplitLoad}})
- .Any({{{UniB512, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP4}, SplitLoad}})
-
- .Any({{DivB32, P5}, {{VgprB32}, {VgprP5}}});
-
- addRulesForGOpcs({G_ZEXTLOAD}) // i8 and i16 zero-extending loads
- .Any({{{UniB32, UniP3}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP3}}})
- .Any({{{UniB32, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP4}}});
+ // flat, addrspace(0), never uniform - flat_load
+ .Any({{DivS16, P0}, {{Vgpr16}, {VgprP0}}}, usesTrue16)
+ .Any({{DivB32, P0}, {{VgprB32}, {VgprP0}}}) // 32bit load, 8bit and 16bit any-extending load
+ .Any({{DivB64, P0}, {{VgprB64}, {VgprP0}}})
+ .Any({{DivB96, P0}, {{VgprB96}, {VgprP0}}})
+ .Any({{DivB128, P0}, {{VgprB128}, {VgprP0}}})
+
+ // global, addrspace(1)
+ // divergent - global_load
+ .Any({{DivS16, P1}, {{Vgpr16}, {VgprP1}}}, usesTrue16)
+ .Any({{DivB32, P1}, {{VgprB32}, {VgprP1}}}) //32bit load, 8bit and 16bit any-extending load
+ .Any({{DivB64, P1}, {{VgprB64}, {VgprP1}}})
+ .Any({{DivB96, P1}, {{VgprB96}, {VgprP1}}})
+ .Any({{DivB128, P1}, {{VgprB128}, {VgprP1}}})
+ .Any({{DivB256, P1}, {{VgprB256}, {VgprP1}, SplitLoad}})
+ .Any({{DivB512, P1}, {{VgprB512}, {VgprP1}, SplitLoad}})
+
+ // uniform - s_load
+ .Any({{{UniS16, P1}, isNaturalAligned && isUL}, {{Sgpr32Trunc}, {SgprP1}}}, usesTrue16 && hasSMRDSmall) // s16 load
+ .Any({{{UniS16, P1}, isAlign4 && isUL}, {{Sgpr32Trunc}, {SgprP1}, WidenMMO}}, usesTrue16 && !hasSMRDSmall) // s16 load to 32bit load
+ .Any({{{UniB32, P1}, isNaturalAligned && isUL}, {{SgprB32}, {SgprP1}}}, hasSMRDSmall) //32bit load, 8bit and 16bit any-extending load
+ // TODO: SplitLoad when !isNaturalAligned && isUL and target hasSMRDSmall
+ .Any({{{UniB32, P1}, is8Or16BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP1}, WidenMMO}}, !hasSMRDSmall) //8bit and 16bit any-extending load to 32bit load
+ .Any({{{UniB32, P1}, is32BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP1}}}) //32bit load
+ .Any({{{UniB64, P1}, isAlign4 && isUL}, {{SgprB64}, {SgprP1}}})
+ .Any({{{UniB96, P1}, isAlign16 && isUL}, {{SgprB96}, {SgprP1}, WidenLoad}}, !hasSMRDx3)
+ .Any({{{UniB96, P1}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP1}, SplitLoad}}, !hasSMRDx3)
+ .Any({{{UniB96, P1}, isAlign4 && isUL}, {{SgprB96}, {SgprP1}}}, hasSMRDx3)
+ .Any({{{UniB128, P1}, isAlign4 && isUL}, {{SgprB128}, {SgprP1}}})
+ .Any({{{UniB256, P1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}})
+ .Any({{{UniB512, P1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}})
+
+ // Uniform via global or buffer load, for example volatile or non-aligned
+ // uniform load. Not using standard {{UniInVgprTy}, {VgprP1}} since it is
+ // selected as global_load, use SgprP1 for pointer instead to match
+ // patterns without flat-for-global, default before GFX8.
+ // -> +flat-for-global + {{UniInVgprTy}, {SgprP1}} - global_load
+ // -> -flat-for-global + {{UniInVgprTy}, {SgprP1}} - buffer_load
+ .Any({{{UniS16, P1}, !isNaturalAligned || !isUL}, {{UniInVgprS16}, {SgprP1}}}, usesTrue16 && hasSMRDSmall) // s16 load
+ .Any({{{UniS16, P1}, !isAlign4 || !isUL}, {{UniInVgprS16}, {SgprP1}}}, usesTrue16 && !hasSMRDSmall) // s16 load to 32bit load
+ .Any({{{UniB32, P1}, !isNaturalAligned || !isUL}, {{UniInVgprB32}, {SgprP1}}}, hasSMRDSmall) //32bit load, 8bit and 16bit any-extending load
+ .Any({{{UniB32, P1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}}, !hasSMRDSmall) //32bit load, 8bit and 16bit any-extending load
+ .Any({{{UniB64, P1}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP1}}})
+ .Any({{{UniB96, P1}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP1}}})
+ .Any({{{UniB128, P1}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP1}}})
+ .Any({{{UniB256, P1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {SgprP1}, SplitLoad}})
+ .Any({{{UniB512, P1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {SgprP1}, SplitLoad}})
+
+ // local, addrspace(3) - ds_load
+ .Any({{DivS16, P3}, {{Vgpr16}, {VgprP3}}}, usesTrue16)
+ .Any({{DivB32, P3}, {{VgprB32}, {VgprP3}}}) // 32bit load, 8bit and 16bit any-extending load
+ .Any({{DivB64, P3}, {{VgprB64}, {VgprP3}}})
+ .Any({{DivB96, P3}, {{VgprB96}, {VgprP3}}})
+ .Any({{DivB128, P3}, {{VgprB128}, {VgprP3}}})
+
+ .Any({{UniS16, P3}, {{UniInVgprS16}, {SgprP3}}}, usesTrue16) // 16bit load
+ .Any({{UniB32, P3}, {{UniInVgprB32}, {VgprP3}}}) // 32bit load, 8bit and 16bit any-extending load
+ .Any({{UniB64, P3}, {{UniInVgprB64}, {VgprP3}}})
+ .Any({{UniB96, P3}, {{UniInVgprB96}, {VgprP3}}})
+ .Any({{UniB128, P3}, {{UniInVgprB128}, {VgprP3}}})
+
+ // constant, addrspace(4)
+ // divergent - global_load
+ .Any({{DivS16, P4}, {{Vgpr16}, {VgprP4}}}, usesTrue16)
+ .Any({{DivB32, P4}, {{VgprB32}, {VgprP4}}}) //32bit load, 8bit and 16bit any-extending load
+ .Any({{DivB64, P4}, {{VgprB64}, {VgprP4}}})
+ .Any({{DivB96, P4}, {{VgprB96}, {VgprP4}}})
+ .Any({{DivB128, P4}, {{VgprB128}, {VgprP4}}})
+ .Any({{DivB256, P4}, {{VgprB256}, {VgprP4}, SplitLoad}})
+ .Any({{DivB512, P4}, {{VgprB512}, {VgprP4}, SplitLoad}})
+
+ // uniform - s_load
+ .Any({{{UniS16, P4}, isNaturalAligned && isUL}, {{Sgpr32Trunc}, {SgprP4}}}, usesTrue16 && hasSMRDSmall) // s16 load
+ .Any({{{UniS16, P4}, isAlign4 && isUL}, {{Sgpr32Trunc}, {SgprP4}, WidenMMO}}, usesTrue16 && !hasSMRDSmall) // s16 load to 32bit load
+ .Any({{{UniB32, P4}, isNaturalAligned && isUL}, {{SgprB32}, {SgprP4}}}, hasSMRDSmall) //32bit load, 8bit and 16bit any-extending load
+ .Any({{{UniB32, P4}, is8Or16BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP4}, WidenMMO}}, !hasSMRDSmall) //8bit and 16bit any-extending load to 32bit load
+ .Any({{{UniB32, P4}, is32BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP4}}}) //32bit load
+ .Any({{{UniB64, P4}, isAlign4 && isUL}, {{SgprB64}, {SgprP4}}})
+ .Any({{{UniB96, P4}, isAlign16 && isUL}, {{SgprB96}, {SgprP4}, WidenLoad}}, !hasSMRDx3)
+ .Any({{{UniB96, P4}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP4}, SplitLoad}}, !hasSMRDx3)
+ .Any({{{UniB96, P4}, isAlign4 && isUL}, {{SgprB96}, {SgprP4}}}, hasSMRDx3)
+ .Any({{{UniB128, P4}, isAlign4 && isUL}, {{SgprB128}, {SgprP4}}})
+ .Any({{{UniB256, P4}, isAlign4 && isUL}, {{SgprB256}, {SgprP4}}})
+ .Any({{{UniB512, P4}, isAlign4 && isUL}, {{SgprB512}, {SgprP4}}})
+
+ // uniform in vgpr - global_load or buffer_load
+ .Any({{{UniS16, P4}, !isNaturalAligned || !isUL}, {{UniInVgprS16}, {SgprP4}}}, usesTrue16 && hasSMRDSmall) // s16 load
+ .Any({{{UniS16, P4}, !isAlign4 || !isUL}, {{UniInVgprS16}, {SgprP4}}}, usesTrue16 && !hasSMRDSmall) // s16 load to 32bit load
+ .Any({{{UniB32, P4}, !isNaturalAligned || !isUL}, {{UniInVgprB32}, {SgprP4}}}, hasSMRDSmall) //32bit load, 8bit and 16bit any-extending load
+ .Any({{{UniB32, P4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP4}}}, !hasSMRDSmall) //32bit load, 8bit and 16bit any-extending load
+ .Any({{{UniB64, P4}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP4}}})
+ .Any({{{UniB96, P4}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP4}}})
+ .Any({{{UniB128, P4}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP4}}})
+ .Any({{{UniB256, P4}, !isAlign4 || !isUL}, {{UniInVgprB256}, {SgprP4}, SplitLoad}})
+ .Any({{{UniB512, P4}, !isAlign4 || !isUL}, {{UniInVgprB512}, {SgprP4}, SplitLoad}})
+
+ // private, addrspace(5), never uniform - scratch_load
+ .Any({{DivS16, P5}, {{Vgpr16}, {VgprP5}}}, usesTrue16)
+ .Any({{DivB32, P5}, {{VgprB32}, {VgprP5}}}) // 32bit load, 8bit and 16bit any-extending load
+ .Any({{DivB64, P5}, {{VgprB64}, {VgprP5}}})
+ .Any({{DivB96, P5}, {{VgprB96}, {VgprP5}}})
+ .Any({{DivB128, P5}, {{VgprB128}, {VgprP5}}})
+
+ .Any({{DivS32, Ptr128}, {{Vgpr32}, {VgprPtr128}}});
+
+
+ addRulesForGOpcs({G_ZEXTLOAD, G_SEXTLOAD}) // i8 and i16 zeroextending loads
+ .Any({{DivS32, P0}, {{Vgpr32}, {VgprP0}}})
+
+ .Any({{DivS32, P1}, {{Vgpr32}, {VgprP1}}})
+ .Any({{{UniS32, P1}, isAlign4 && isUL}, {{Sgpr32}, {SgprP1}, WidenMMO}}, !hasSMRDSmall)
+ .Any({{{UniS32, P1}, isNaturalAligned && isUL}, {{Sgpr32}, {SgprP1}}}, hasSMRDSmall)
+ .Any({{{UniS32, P1}, !isAlign4 || !isUL}, {{UniInVgprS32}, {SgprP1}}}, !hasSMRDSmall)
+ .Any({{{UniS32, P1}, !isNaturalAligned || !isUL}, {{UniInVgprS32}, {SgprP1}}}, hasSMRDSmall)
+
+ .Any({{DivS32, P3}, {{Vgpr32}, {VgprP3}}})
+ .Any({{UniS32, P3}, {{UniInVgprS32}, {VgprP3}}})
+
+ .Any({{DivS32, P4}, {{Vgpr32}, {VgprP4}}})
+ .Any({{{UniS32, P4}, isAlign4 && isUL}, {{Sgpr32}, {SgprP4}, WidenMMO}}, !hasSMRDSmall)
+ .Any({{{UniS32, P4}, isNaturalAligned && isUL}, {{Sgpr32}, {SgprP4}}}, hasSMRDSmall)
+ .Any({{{UniS32, P4}, !isAlign4 || !isUL}, {{UniInVgprS32}, {SgprP4}}}, !hasSMRDSmall)
+ .Any({{{UniS32, P4}, !isNaturalAligned || !isUL}, {{UniInVgprS32}, {SgprP4}}}, hasSMRDSmall)
+
+ .Any({{DivS32, P5}, {{Vgpr32}, {VgprP5}}});
// clang-format on
addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, StandardB)
@@ -715,10 +817,19 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Uni(B128, {{UniInVgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}});
addRulesForGOpcs({G_STORE})
- .Any({{S32, P0}, {{}, {Vgpr32, VgprP0}}})
- .Any({{S32, P1}, {{}, {Vgpr32, VgprP1}}})
- .Any({{S64, P1}, {{}, {Vgpr64, VgprP1}}})
- .Any({{V4S32, P1}, {{}, {VgprV4S32, VgprP1}}});
+ // addrspace(0) and addrspace(1), there are no stores to addrspace(4)
+ .Any({{S16, Ptr64}, {{}, {Vgpr16, VgprPtr64}}})
+ .Any({{B32, Ptr64}, {{}, {VgprB32, VgprPtr64}}})
+ .Any({{B64, Ptr64}, {{}, {VgprB64, VgprPtr64}}})
+ .Any({{B96, Ptr64}, {{}, {VgprB96, VgprPtr64}}})
+ .Any({{B128, Ptr64}, {{}, {VgprB128, VgprPtr64}}})
+
+ // addrspace(3) and addrspace(5)
+ .Any({{S16, Ptr32}, {{}, {Vgpr16, VgprPtr32}}})
+ .Any({{B32, Ptr32}, {{}, {VgprB32, VgprPtr32}}})
+ .Any({{B64, Ptr32}, {{}, {VgprB64, VgprPtr32}}})
+ .Any({{B96, Ptr32}, {{}, {VgprB96, VgprPtr32}}})
+ .Any({{B128, Ptr32}, {{}, {VgprB128, VgprPtr32}}});
addRulesForGOpcs({G_AMDGPU_BUFFER_STORE})
.Any({{S32}, {{}, {Vgpr32, SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}});
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index 13914403c439e..75e8b6ffa534d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -176,6 +176,7 @@ enum RegBankLLTMappingApplyID {
// Dst only modifiers: read-any-lane and truncs
UniInVcc,
+ UniInVgprS16,
UniInVgprS32,
UniInVgprV2S16,
UniInVgprV4S32,
@@ -221,6 +222,7 @@ enum LoweringMethodID {
UniCstExt,
SplitLoad,
WidenLoad,
+ WidenMMO
};
enum FastRulesTypes {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll
index 83912b1e77db2..97694f3304431 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
...
[truncated]
|
d79a3f8
to
477c501
Compare
2a2bdbf
to
696b1b2
Compare
477c501
to
f9314b8
Compare
788368e
to
1464e56
Compare
|
||
if (MI.getOpcode() == G_ZEXTLOAD) { | ||
auto Mask = | ||
B.buildConstant(SgprRB_S32, APInt::getLowBitsSet(32, MemSize)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Take the bitwidth from the type and don't hardcode the 32?
// uniform - s_load | ||
.Any({{{UniS16, P1}, isNaturalAligned && isUL}, {{Sgpr32Trunc}, {SgprP1}}}, usesTrue16 && hasSMRDSmall) // s16 load | ||
.Any({{{UniS16, P1}, isAlign4 && isUL}, {{Sgpr32Trunc}, {SgprP1}, WidenMMO}}, usesTrue16 && !hasSMRDSmall) // s16 load to 32bit load | ||
.Any({{{UniB32, P1}, isNaturalAligned && isUL}, {{SgprB32}, {SgprP1}}}, hasSMRDSmall) //32bit load, 8bit and 16bit any-extending load |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we actually ever care about natural alignment? I thought 4 byte alignment was good enough for all scalar loads
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Implemented as in AMDGPURegisterBankInfo, AFAIK isAlign4 is required for gfx11 and older. For 8-bit and 16-bit s_loads on gfx12, natural alignment is enough, and for larger loads 4-byte alignment is enough.
Speaking of 4 byte alignment being required, for some reason B96 -> B128 load
.Any({{{UniB96, P1}, isAlign16 && isUL}, {{SgprB96}, {SgprP1}, WidenLoad}}, !hasSMRDx3)
requires 16-byte alignment, while a plain B128 load requires only isAlign4.
if load has only align 4, then it does split into B32 + B64 load. Should we always WidenLoad for B96?
@@ -352,6 +352,34 @@ void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, | |||
MI.eraseFromParent(); | |||
} | |||
|
|||
void RegBankLegalizeHelper::widenMMO(MachineInstr &MI) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
void RegBankLegalizeHelper::widenMMO(MachineInstr &MI) { | |
void RegBankLegalizeHelper::widenMMO(MachineInstr &MI) const { |
d1da8f1
to
d8c91e6
Compare
f9314b8
to
8571417
Compare
d8c91e6
to
e863c35
Compare
addRulesForGOpcs({G_ZEXTLOAD}) // i8 and i16 zero-extending loads | ||
.Any({{{UniB32, UniP3}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP3}}}) | ||
.Any({{{UniB32, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP4}}}); | ||
// flat, addrspace(0), never uniform - flat_load |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Speaking of some loads being always divergent from SIInstrInfo::getGenericInstructionUniformity
if (opcode == AMDGPU::G_LOAD) {
if (MI.memoperands_empty())
return InstructionUniformity::NeverUniform; // conservative assumption
if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
})) {
// At least one MMO in a non-global address space.
return InstructionUniformity::NeverUniform;
}
return InstructionUniformity::Default;
}
G_ZEXTLOAD and G_SEXTLOAD are not covered and can end up being uniform for P0 and P5.
Should we also force them to be always divergent?
e863c35
to
87b9b39
Compare
Added tests for stores, in particular older targets that match buffer_store patterns. Fixed store rules for older targets. |
✅ With the latest revision this PR passed the C/C++ code formatter. |
87b9b39
to
ccc9546
Compare
Cover all the missing cases and add very detailed tests for each rule. In summary: - Flat and Scratch, addrspace(0) and addrspace(5), loads are always divergent. - Global and Constant, addrspace(1) and addrspace(4), have real uniform loads, s_load, but require additional checks for align and flags in mmo. For not naturally aligned or not uniform mmo, do uniform-in-vgpr lowering. - Local, addrspace(3), only has instructions for divergent load; for uniform, do uniform-in-vgpr lowering. - Store rules are simplified using Ptr32. Operand to be stored needs to be vgpr. - Store for GFX7 and older supports buffer_store patterns: - divergent addrspace(1) -> buffer_store addr64 - uniform addrspace(1) -> buffer_store offset - addrspace(5) -> buffer_store offen Some tests have code size regression since they use more sgpr instructions, marked with FixMe comment to get back to later.
ccc9546
to
0a46b28
Compare
Cover all the missing cases and add very detailed tests for each rule.
In summary:
divergent.
loads, s_load, but require additional checks for align and flags in mmo.
For not natural align or not uniform mmo do uniform-in-vgpr lowering.
uniform do uniform-in-vgpr lowering.
All operands need to be vgpr.
Some tests have code size regression since they use more sgpr instructions,
marked with FixMe comment to get back to later.