supython-coder
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Lines changed: 7 additions & 7 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
Lines changed: 4 additions & 3 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
Lines changed: 6 additions & 1 deletion
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Lines changed: 8 additions & 2 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
Lines changed: 9 additions & 6 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
Lines changed: 1 addition & 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
Lines changed: 8 additions & 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
Lines changed: 14 additions & 3 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Lines changed: 11 additions & 8 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
Lines changed: 4 additions & 2 deletions
@@ -69,15 +69,14 @@ using namespace llvm::AMDGPU::HSAMD;
 // We want to use these instructions, and using fp32 denormals also causes
 // instructions to run at the double precision rate for the device so it's
 // probably best to just report no single precision denormals.
-static uint32_t getFPMode(const MachineFunction &F) {
-  const GCNSubtarget& ST = F.getSubtarget<GCNSubtarget>();
-  // TODO: Is there any real use for the flush in only / flush out only modes?
+static uint32_t getFPMode(AMDGPU::SIModeRegisterDefaults Mode) {
 
+  // TODO: Is there any real use for the flush in only / flush out only modes?
   uint32_t FP32Denormals =
-    ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
+    Mode.FP32Denormals ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
 
   uint32_t FP64Denormals =
-    ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
+    Mode.FP64FP16Denormals ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
 
   return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
          FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
@@ -1033,11 +1032,12 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
       &STM, ProgInfo.NumVGPRsForWavesPerEU);
 
+  const SIModeRegisterDefaults Mode = MFI->getMode();
+
   // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
   // register.
-  ProgInfo.FloatMode = getFPMode(MF);
+  ProgInfo.FloatMode = getFPMode(Mode);
 
-  const SIModeRegisterDefaults Mode = MFI->getMode();
   ProgInfo.IEEEMode = Mode.IEEE;
 
   // Make clamp modifier on NaN input returns 0.
 
@@ -70,6 +70,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
   Module *Mod = nullptr;
   const DataLayout *DL = nullptr;
   bool HasUnsafeFPMath = false;
+  bool HasFP32Denormals = false;
 
   /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
   /// binary operation \p V.
@@ -575,7 +576,6 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
 
   Value *NewFDiv = nullptr;
 
-  bool HasDenormals = ST->hasFP32Denormals();
   if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
     NewFDiv = UndefValue::get(VT);
 
@@ -586,7 +586,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
       Value *DenEltI = Builder.CreateExtractElement(Den, I);
       Value *NewElt;
 
-      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {
+      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasFP32Denormals)) {
         NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
       } else {
         NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
@@ -595,7 +595,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
       NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
     }
   } else {
-    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))
+    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasFP32Denormals))
       NewFDiv = Builder.CreateCall(Decl, { Num, Den });
   }
 
@@ -1034,6 +1034,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   DA = &getAnalysis<LegacyDivergenceAnalysis>();
   HasUnsafeFPMath = hasUnsafeFPMath(F);
+  HasFP32Denormals = ST->hasFP32Denormals(F);
 
   bool MadeChange = false;
 
 
@@ -128,6 +128,10 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
   // make the right decision when generating code for different targets.
   const GCNSubtarget *Subtarget;
+
+  // Default FP mode for the current function.
+  AMDGPU::SIModeRegisterDefaults Mode;
+
   bool EnableLateStructurizeCFG;
 
 public:
@@ -393,6 +397,7 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
   }
 #endif
   Subtarget = &MF.getSubtarget<GCNSubtarget>();
+  Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
   return SelectionDAGISel::runOnMachineFunction(MF);
 }
 
@@ -2104,7 +2109,7 @@ void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
   bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
   bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);
 
-  assert((IsFMA || !Subtarget->hasFP32Denormals()) &&
+  assert((IsFMA || !Mode.FP32Denormals) &&
          "fmad selected with denormals enabled");
   // TODO: We can select this with f32 denormals enabled if all the sources are
   // converted from f16 (in which case fmad isn't legal).
 
@@ -1581,8 +1581,11 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
   // float fqneg = -fq;
   SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
 
+  MachineFunction &MF = DAG.getMachineFunction();
+  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
+
   // float fr = mad(fqneg, fb, fa);
-  unsigned OpCode = Subtarget->hasFP32Denormals() ?
+  unsigned OpCode = MFI->getMode().FP32Denormals ?
                     (unsigned)AMDGPUISD::FMAD_FTZ :
                     (unsigned)ISD::FMAD;
   SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
@@ -1663,8 +1666,11 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
   }
 
   if (isTypeLegal(MVT::i64)) {
+    MachineFunction &MF = DAG.getMachineFunction();
+    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
     // Compute denominator reciprocal.
-    unsigned FMAD = Subtarget->hasFP32Denormals() ?
+    unsigned FMAD = MFI->getMode().FP32Denormals ?
                     (unsigned)AMDGPUISD::FMAD_FTZ :
                     (unsigned)ISD::FMAD;
 
 
@@ -100,13 +100,16 @@ class PredicateControl {
 class AMDGPUPat<dag pattern, dag result> : Pat<pattern, result>,
       PredicateControl;
 
-def FP16Denormals : Predicate<"Subtarget->hasFP16Denormals()">;
-def FP32Denormals : Predicate<"Subtarget->hasFP32Denormals()">;
-def FP64Denormals : Predicate<"Subtarget->hasFP64Denormals()">;
-def NoFP16Denormals : Predicate<"!Subtarget->hasFP16Denormals()">;
-def NoFP32Denormals : Predicate<"!Subtarget->hasFP32Denormals()">;
-def NoFP64Denormals : Predicate<"!Subtarget->hasFP64Denormals()">;
+let RecomputePerFunction = 1 in {
+def FP16Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">;
+def FP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals">;
+def FP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">;
+def NoFP16Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">;
+def NoFP32Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals">;
+def NoFP64Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">;
 def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
+}
+
 def FMA : Predicate<"Subtarget->hasFMA()">;
 
 def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
 
@@ -18,6 +18,7 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
   LocalMemoryObjects(),
   ExplicitKernArgSize(0),
   LDSSize(0),
+  Mode(MF.getFunction(), MF.getSubtarget<GCNSubtarget>()),
   IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())),
   NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath),
   MemoryBound(false),
 
@@ -11,6 +11,7 @@
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "Utils/AMDGPUBaseInfo.h"
 
 namespace llvm {
 
@@ -28,6 +29,9 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
   /// Number of bytes in the LDS that are being used.
   unsigned LDSSize;
 
+  // State of MODE register, assumed FP mode.
+  AMDGPU::SIModeRegisterDefaults Mode;
+
   // Kernels + shaders. i.e. functions called by the driver and not called
   // by other functions.
   bool IsEntryFunction;
@@ -53,6 +57,10 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
     return LDSSize;
   }
 
+  AMDGPU::SIModeRegisterDefaults getMode() const {
+    return Mode;
+  }
+
   bool isEntryFunction() const {
     return IsEntryFunction;
   }
 
@@ -148,7 +148,12 @@ class AMDGPUSubtarget {
     return HasMadMixInsts;
   }
 
-  bool hasFP32Denormals() const {
+  bool hasFP32Denormals(const Function &F) const {
+    // FIXME: This should not be a property of the subtarget. It should be a
+    // property with a default set by the calling convention, which can be
+    // overridden by attributes. For now, use the subtarget feature as a
+    // placeholder attribute. The function argument's only purpose is to
+    // discourage use without a function context until this is removed.
     return FP32Denormals;
   }
 
@@ -612,11 +617,17 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
                                            const Function &) const;
 
-  bool hasFP16Denormals() const {
+  /// Alias for hasFP64FP16Denormals(); f16 and f64 share one flag.
+  bool hasFP16Denormals(const Function &F) const {
     return FP64FP16Denormals;
   }
 
-  bool hasFP64Denormals() const {
+  /// Alias for hasFP64FP16Denormals(); f16 and f64 share one flag.
+  bool hasFP64Denormals(const Function &F) const {
+    return FP64FP16Denormals;
+  }
+
+  bool hasFP64FP16Denormals(const Function &F) const {
     return FP64FP16Denormals;
   }
 
 
@@ -412,7 +412,7 @@ int GCNTTIImpl::getArithmeticInstrCost(
 
     if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
       // TODO: This is more complicated, unsafe flags etc.
-      if ((SLT == MVT::f32 && !ST->hasFP32Denormals()) ||
+      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
           (SLT == MVT::f16 && ST->has16BitInsts())) {
         return LT.first * getQuarterRateInstrCost() * NElts;
       }
@@ -431,7 +431,7 @@ int GCNTTIImpl::getArithmeticInstrCost(
     if (SLT == MVT::f32 || SLT == MVT::f16) {
       int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
 
-      if (!ST->hasFP32Denormals()) {
+      if (!HasFP32Denormals) {
         // FP mode switches.
         Cost += 2 * getFullRateInstrCost();
       }
@@ -671,10 +671,13 @@ unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
 bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                      const Function *Callee) const {
   const TargetMachine &TM = getTLI()->getTargetMachine();
-  const FeatureBitset &CallerBits =
-    TM.getSubtargetImpl(*Caller)->getFeatureBits();
-  const FeatureBitset &CalleeBits =
-    TM.getSubtargetImpl(*Callee)->getFeatureBits();
+  const GCNSubtarget *CallerST
+    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
+  const GCNSubtarget *CalleeST
+    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
+
+  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
+  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
 
   FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
   FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
@@ -683,8 +686,8 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
 
   // FIXME: dx10_clamp can just take the caller setting, but there seems to be
   // no way to support merge for backend defined attributes.
-  AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
-  AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
+  AMDGPU::SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
+  AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
   return CallerMode.isInlineCompatible(CalleeMode);
 }
 
 
@@ -46,7 +46,7 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
 
   Triple TargetTriple;
 
-  const TargetSubtargetInfo *ST;
+  const GCNSubtarget *ST;
   const TargetLoweringBase *TLI;
 
   const TargetSubtargetInfo *getST() const { return ST; }
@@ -73,6 +73,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   const AMDGPUTargetLowering *TLI;
   AMDGPUTTIImpl CommonTTI;
   bool IsGraphicsShader;
+  bool HasFP32Denormals;
 
   const FeatureBitset InlineFeatureIgnoreList = {
     // Codegen control options which don't matter.
@@ -131,7 +132,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
       ST(static_cast<const GCNSubtarget*>(TM->getSubtargetImpl(F))),
       TLI(ST->getTargetLowering()),
       CommonTTI(TM, F),
-      IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())) {}
+      IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())),
+      HasFP32Denormals(ST->hasFP32Denormals(F)) { }
 
   bool hasBranchDivergence() { return true; }