Skip to content

Commit db0ed3e

Browse files
committed
AMDGPU: Refactor treatment of denormal mode
Start moving towards treating this as a property of the calling convention, and not of the subtarget. The default denormal mode should not be part of the subtarget, and should be moved into a separate function attribute. This patch is still NFC (no functional change). The denormal mode remains a subtarget feature for now, but this patch makes the necessary changes to switch to using an attribute.
1 parent ea23b64 commit db0ed3e

19 files changed

+153
-90
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -69,15 +69,14 @@ using namespace llvm::AMDGPU::HSAMD;
6969
// We want to use these instructions, and using fp32 denormals also causes
7070
// instructions to run at the double precision rate for the device so it's
7171
// probably best to just report no single precision denormals.
72-
static uint32_t getFPMode(const MachineFunction &F) {
73-
const GCNSubtarget& ST = F.getSubtarget<GCNSubtarget>();
74-
// TODO: Is there any real use for the flush in only / flush out only modes?
72+
static uint32_t getFPMode(AMDGPU::SIModeRegisterDefaults Mode) {
7573

74+
// TODO: Is there any real use for the flush in only / flush out only modes?
7675
uint32_t FP32Denormals =
77-
ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
76+
Mode.FP32Denormals ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
7877

7978
uint32_t FP64Denormals =
80-
ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
79+
Mode.FP64FP16Denormals ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
8180

8281
return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
8382
FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
@@ -1033,11 +1032,12 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
10331032
ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
10341033
&STM, ProgInfo.NumVGPRsForWavesPerEU);
10351034

1035+
const SIModeRegisterDefaults Mode = MFI->getMode();
1036+
10361037
// Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
10371038
// register.
1038-
ProgInfo.FloatMode = getFPMode(MF);
1039+
ProgInfo.FloatMode = getFPMode(Mode);
10391040

1040-
const SIModeRegisterDefaults Mode = MFI->getMode();
10411041
ProgInfo.IEEEMode = Mode.IEEE;
10421042

10431043
// Make the clamp modifier return 0 on NaN input.

llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
7070
Module *Mod = nullptr;
7171
const DataLayout *DL = nullptr;
7272
bool HasUnsafeFPMath = false;
73+
bool HasFP32Denormals = false;
7374

7475
/// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
7576
/// binary operation \p V.
@@ -575,7 +576,6 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
575576

576577
Value *NewFDiv = nullptr;
577578

578-
bool HasDenormals = ST->hasFP32Denormals();
579579
if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
580580
NewFDiv = UndefValue::get(VT);
581581

@@ -586,7 +586,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
586586
Value *DenEltI = Builder.CreateExtractElement(Den, I);
587587
Value *NewElt;
588588

589-
if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {
589+
if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasFP32Denormals)) {
590590
NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
591591
} else {
592592
NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
@@ -595,7 +595,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
595595
NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
596596
}
597597
} else {
598-
if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))
598+
if (!shouldKeepFDivF32(Num, UnsafeDiv, HasFP32Denormals))
599599
NewFDiv = Builder.CreateCall(Decl, { Num, Den });
600600
}
601601

@@ -1034,6 +1034,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
10341034
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
10351035
DA = &getAnalysis<LegacyDivergenceAnalysis>();
10361036
HasUnsafeFPMath = hasUnsafeFPMath(F);
1037+
HasFP32Denormals = ST->hasFP32Denormals(F);
10371038

10381039
bool MadeChange = false;
10391040

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,10 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
128128
// Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
129129
// make the right decision when generating code for different targets.
130130
const GCNSubtarget *Subtarget;
131+
132+
// Default FP mode for the current function.
133+
AMDGPU::SIModeRegisterDefaults Mode;
134+
131135
bool EnableLateStructurizeCFG;
132136

133137
public:
@@ -393,6 +397,7 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
393397
}
394398
#endif
395399
Subtarget = &MF.getSubtarget<GCNSubtarget>();
400+
Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
396401
return SelectionDAGISel::runOnMachineFunction(MF);
397402
}
398403

@@ -2104,7 +2109,7 @@ void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
21042109
bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
21052110
bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);
21062111

2107-
assert((IsFMA || !Subtarget->hasFP32Denormals()) &&
2112+
assert((IsFMA || !Mode.FP32Denormals) &&
21082113
"fmad selected with denormals enabled");
21092114
// TODO: We can select this with f32 denormals enabled if all the sources are
21102115
// converted from f16 (in which case fmad isn't legal).

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1581,8 +1581,11 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
15811581
// float fqneg = -fq;
15821582
SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
15831583

1584+
MachineFunction &MF = DAG.getMachineFunction();
1585+
const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
1586+
15841587
// float fr = mad(fqneg, fb, fa);
1585-
unsigned OpCode = Subtarget->hasFP32Denormals() ?
1588+
unsigned OpCode = MFI->getMode().FP32Denormals ?
15861589
(unsigned)AMDGPUISD::FMAD_FTZ :
15871590
(unsigned)ISD::FMAD;
15881591
SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
@@ -1663,8 +1666,11 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
16631666
}
16641667

16651668
if (isTypeLegal(MVT::i64)) {
1669+
MachineFunction &MF = DAG.getMachineFunction();
1670+
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1671+
16661672
// Compute denominator reciprocal.
1667-
unsigned FMAD = Subtarget->hasFP32Denormals() ?
1673+
unsigned FMAD = MFI->getMode().FP32Denormals ?
16681674
(unsigned)AMDGPUISD::FMAD_FTZ :
16691675
(unsigned)ISD::FMAD;
16701676

llvm/lib/Target/AMDGPU/AMDGPUInstructions.td

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -100,13 +100,16 @@ class PredicateControl {
100100
class AMDGPUPat<dag pattern, dag result> : Pat<pattern, result>,
101101
PredicateControl;
102102

103-
def FP16Denormals : Predicate<"Subtarget->hasFP16Denormals()">;
104-
def FP32Denormals : Predicate<"Subtarget->hasFP32Denormals()">;
105-
def FP64Denormals : Predicate<"Subtarget->hasFP64Denormals()">;
106-
def NoFP16Denormals : Predicate<"!Subtarget->hasFP16Denormals()">;
107-
def NoFP32Denormals : Predicate<"!Subtarget->hasFP32Denormals()">;
108-
def NoFP64Denormals : Predicate<"!Subtarget->hasFP64Denormals()">;
103+
let RecomputePerFunction = 1 in {
104+
def FP16Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">;
105+
def FP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals">;
106+
def FP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">;
107+
def NoFP16Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">;
108+
def NoFP32Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals">;
109+
def NoFP64Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">;
109110
def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
111+
}
112+
110113
def FMA : Predicate<"Subtarget->hasFMA()">;
111114

112115
def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
1818
LocalMemoryObjects(),
1919
ExplicitKernArgSize(0),
2020
LDSSize(0),
21+
Mode(MF.getFunction(), MF.getSubtarget<GCNSubtarget>()),
2122
IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())),
2223
NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath),
2324
MemoryBound(false),

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
#include "llvm/ADT/DenseMap.h"
1313
#include "llvm/CodeGen/MachineFunction.h"
14+
#include "Utils/AMDGPUBaseInfo.h"
1415

1516
namespace llvm {
1617

@@ -28,6 +29,9 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
2829
/// Number of bytes in the LDS that are being used.
2930
unsigned LDSSize;
3031

32+
// State of MODE register, assumed FP mode.
33+
AMDGPU::SIModeRegisterDefaults Mode;
34+
3135
// Kernels + shaders. i.e. functions called by the driver and not called
3236
// by other functions.
3337
bool IsEntryFunction;
@@ -53,6 +57,10 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
5357
return LDSSize;
5458
}
5559

60+
AMDGPU::SIModeRegisterDefaults getMode() const {
61+
return Mode;
62+
}
63+
5664
bool isEntryFunction() const {
5765
return IsEntryFunction;
5866
}

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,12 @@ class AMDGPUSubtarget {
148148
return HasMadMixInsts;
149149
}
150150

151-
bool hasFP32Denormals() const {
151+
bool hasFP32Denormals(const Function &F) const {
152+
// FIXME: This should not be a property of the subtarget. This should be a
153+
// property with a default set by the calling convention which can be
154+
// overridden by attributes. For now, use the subtarget feature as a
155+
// placeholder attribute. The function argument's only purpose is to
156+
// discourage use without a function context until this is removed.
152157
return FP32Denormals;
153158
}
154159

@@ -612,11 +617,17 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
612617
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
613618
const Function &) const;
614619

615-
bool hasFP16Denormals() const {
620+
/// Alias for hasFP64FP16Denormals
621+
bool hasFP16Denormals(const Function &F) const {
616622
return FP64FP16Denormals;
617623
}
618624

619-
bool hasFP64Denormals() const {
625+
/// Alias for hasFP64FP16Denormals
626+
bool hasFP64Denormals(const Function &F) const {
627+
return FP64FP16Denormals;
628+
}
629+
630+
bool hasFP64FP16Denormals(const Function &F) const {
620631
return FP64FP16Denormals;
621632
}
622633

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -412,7 +412,7 @@ int GCNTTIImpl::getArithmeticInstrCost(
412412

413413
if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
414414
// TODO: This is more complicated, unsafe flags etc.
415-
if ((SLT == MVT::f32 && !ST->hasFP32Denormals()) ||
415+
if ((SLT == MVT::f32 && !HasFP32Denormals) ||
416416
(SLT == MVT::f16 && ST->has16BitInsts())) {
417417
return LT.first * getQuarterRateInstrCost() * NElts;
418418
}
@@ -431,7 +431,7 @@ int GCNTTIImpl::getArithmeticInstrCost(
431431
if (SLT == MVT::f32 || SLT == MVT::f16) {
432432
int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
433433

434-
if (!ST->hasFP32Denormals()) {
434+
if (!HasFP32Denormals) {
435435
// FP mode switches.
436436
Cost += 2 * getFullRateInstrCost();
437437
}
@@ -671,10 +671,13 @@ unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
671671
bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
672672
const Function *Callee) const {
673673
const TargetMachine &TM = getTLI()->getTargetMachine();
674-
const FeatureBitset &CallerBits =
675-
TM.getSubtargetImpl(*Caller)->getFeatureBits();
676-
const FeatureBitset &CalleeBits =
677-
TM.getSubtargetImpl(*Callee)->getFeatureBits();
674+
const GCNSubtarget *CallerST
675+
= static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
676+
const GCNSubtarget *CalleeST
677+
= static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
678+
679+
const FeatureBitset &CallerBits = CallerST->getFeatureBits();
680+
const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
678681

679682
FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
680683
FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
@@ -683,8 +686,8 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
683686

684687
// FIXME: dx10_clamp can just take the caller setting, but there seems to be
685688
// no way to support merge for backend defined attributes.
686-
AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
687-
AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
689+
AMDGPU::SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
690+
AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
688691
return CallerMode.isInlineCompatible(CalleeMode);
689692
}
690693

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
4646

4747
Triple TargetTriple;
4848

49-
const TargetSubtargetInfo *ST;
49+
const GCNSubtarget *ST;
5050
const TargetLoweringBase *TLI;
5151

5252
const TargetSubtargetInfo *getST() const { return ST; }
@@ -73,6 +73,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
7373
const AMDGPUTargetLowering *TLI;
7474
AMDGPUTTIImpl CommonTTI;
7575
bool IsGraphicsShader;
76+
bool HasFP32Denormals;
7677

7778
const FeatureBitset InlineFeatureIgnoreList = {
7879
// Codegen control options which don't matter.
@@ -131,7 +132,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
131132
ST(static_cast<const GCNSubtarget*>(TM->getSubtargetImpl(F))),
132133
TLI(ST->getTargetLowering()),
133134
CommonTTI(TM, F),
134-
IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())) {}
135+
IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())),
136+
HasFP32Denormals(ST->hasFP32Denormals(F)) { }
135137

136138
bool hasBranchDivergence() { return true; }
137139

0 commit comments

Comments
 (0)