supython-coder
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
Lines changed: 110 additions & 32 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
Lines changed: 110 additions & 32 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Lines changed: 44 additions & 34 deletions b/‎llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Lines changed: 44 additions & 34 deletions
@@ -606,12 +606,64 @@ bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
   return true;
 }
 
-static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
+// Perform RCP optimizations:
+//
+// 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with
+//                                                denormals flushed.
+//
+// a/b -> a*rcp(b) when fast unsafe rcp is legal.
+static Value *performRCPOpt(Value *Num, Value *Den, bool FastUnsafeRcpLegal,
+                            IRBuilder<> Builder, MDNode *FPMath, Module *Mod,
+                            bool HasDenormals, bool NeedHighAccuracy) {
+
+  Type *Ty = Den->getType();
+  if (!FastUnsafeRcpLegal && Ty->isFloatTy() &&
+                             (HasDenormals || NeedHighAccuracy))
+    return nullptr;
+
+  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, Ty);
+  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
+    if (FastUnsafeRcpLegal || Ty->isFloatTy() || Ty->isHalfTy()) {
+      if (CLHS->isExactlyValue(1.0)) {
+        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
+        // the CI documentation has a worst case error of 1 ulp.
+        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
+        // use it as long as we aren't trying to use denormals.
+        //
+        // v_rcp_f16 and v_rsq_f16 DO support denormals.
+
+        // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
+        //       insert rsq intrinsic here.
+
+        // 1.0 / x -> rcp(x)
+        return Builder.CreateCall(Decl, { Den });
+       }
+
+       // Same as for 1.0, but expand the sign out of the constant.
+       if (CLHS->isExactlyValue(-1.0)) {
+         // -1.0 / x -> rcp (fneg x)
+         Value *FNeg = Builder.CreateFNeg(Den);
+         return Builder.CreateCall(Decl, { FNeg });
+       }
+    }
+  }
+
+  if (FastUnsafeRcpLegal) {
+    // Turn into multiply by the reciprocal.
+    // x / y -> x * (1.0 / y)
+    Value *Recip = Builder.CreateCall(Decl, { Den });
+    return Builder.CreateFMul(Num, Recip, "", FPMath);
+  }
+  return nullptr;
+}
+
+static bool shouldKeepFDivF32(Value *Num, bool FastUnsafeRcpLegal,
+                              bool HasDenormals) {
   const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
   if (!CNum)
     return HasDenormals;
 
-  if (UnsafeDiv)
+  if (FastUnsafeRcpLegal)
     return true;
 
   bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);
@@ -620,64 +672,90 @@ static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
   return HasDenormals ^ IsOne;
 }
 
-// Insert an intrinsic for fast fdiv for safe math situations where we can
-// reduce precision. Leave fdiv for situations where the generic node is
-// expected to be optimized.
+
+// Optimizations is performed based on fpmath, fast math flags as wells as
+// denormals to lower fdiv using either rcp or fdiv.fast.
+//
+// FastUnsafeRcpLegal: We determine whether it is legal to use rcp based on
+//                     unsafe-fp-math, fast math flags, denormals and fpmath
+//                     accuracy request.
+//
+// RCP Optimizations:
+//   1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with
+//                                                  denormals flushed.
+//   a/b -> a*rcp(b) when fast unsafe rcp is legal.
+//
+// Use fdiv.fast:
+//   a/b -> fdiv.fast(a, b) when RCP optimization is not performed and
+//                          fpmath >= 2.5ULP with denormals flushed.
+//
+//   1/x -> fdiv.fast(1,x)  when RCP optimization is not performed and
+//                          fpmath >= 2.5ULP with denormals.
 bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
-  Type *Ty = FDiv.getType();
 
-  if (!Ty->getScalarType()->isFloatTy())
-    return false;
+  Type *Ty = FDiv.getType()->getScalarType();
 
-  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
-  if (!FPMath)
+  // No intrinsic for fdiv16 if target does not support f16.
+  if (Ty->isHalfTy() && !ST->has16BitInsts())
     return false;
 
   const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
-  float ULP = FPOp->getFPAccuracy();
-  if (ULP < 2.5f)
-    return false;
+  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
+  const bool NeedHighAccuracy = !FPMath || FPOp->getFPAccuracy() < 2.5f;
 
   FastMathFlags FMF = FPOp->getFastMathFlags();
-  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
-                                      FMF.allowReciprocal();
+  // Determine whether it is ok to use rcp based on unsafe-fp-math,
+  // fast math flags, denormals and accuracy request.
+  const bool FastUnsafeRcpLegal = HasUnsafeFPMath || FMF.isFast() ||
+          (FMF.allowReciprocal() && ((!HasFP32Denormals && !NeedHighAccuracy)
+                                     || FMF.approxFunc()));
 
-  // With UnsafeDiv node will be optimized to just rcp and mul.
-  if (UnsafeDiv)
-    return false;
+  // Use fdiv.fast for only f32, fpmath >= 2.5ULP and rcp is not used.
+  const bool UseFDivFast = Ty->isFloatTy() && !NeedHighAccuracy &&
+                           !FastUnsafeRcpLegal;
 
-  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
+  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
   Builder.setFastMathFlags(FMF);
   Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
 
-  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
-
   Value *Num = FDiv.getOperand(0);
   Value *Den = FDiv.getOperand(1);
 
   Value *NewFDiv = nullptr;
-
-  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
+  if (VectorType *VT = dyn_cast<VectorType>(FDiv.getType())) {
     NewFDiv = UndefValue::get(VT);
 
     // FIXME: Doesn't do the right thing for cases where the vector is partially
     // constant. This works when the scalarizer pass is run first.
     for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
       Value *NumEltI = Builder.CreateExtractElement(Num, I);
       Value *DenEltI = Builder.CreateExtractElement(Den, I);
-      Value *NewElt;
-
-      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasFP32Denormals)) {
-        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
-      } else {
-        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
+      Value *NewElt = nullptr;
+      if (UseFDivFast && !shouldKeepFDivF32(NumEltI, FastUnsafeRcpLegal,
+                                           HasFP32Denormals)) {
+        Function *Decl =
+                 Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
+        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }, "", FPMath);
       }
+      if (!NewElt) // Try rcp.
+        NewElt = performRCPOpt(NumEltI, DenEltI, FastUnsafeRcpLegal, Builder,
+                               FPMath, Mod, HasFP32Denormals, NeedHighAccuracy);
+      if (!NewElt)
+        NewElt = Builder.CreateFDiv(NumEltI, DenEltI, "", FPMath);
 
       NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
     }
-  } else {
-    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasFP32Denormals))
-      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
+  } else { // Scalar.
+    if (UseFDivFast && !shouldKeepFDivF32(Num, FastUnsafeRcpLegal,
+                                          HasFP32Denormals)) {
+      Function *Decl =
+               Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
+      NewFDiv = Builder.CreateCall(Decl, { Num, Den }, "", FPMath);
+    }
+    if (!NewFDiv) { // Try rcp.
+      NewFDiv = performRCPOpt(Num, Den, FastUnsafeRcpLegal, Builder, FPMath,
+                              Mod, HasFP32Denormals, NeedHighAccuracy);
+    }
   }
 
   if (NewFDiv) {
 
@@ -7474,49 +7474,54 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
   SDValue RHS = Op.getOperand(1);
   EVT VT = Op.getValueType();
   const SDNodeFlags Flags = Op->getFlags();
-  bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();
 
-  if (!Unsafe && VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction()))
+  bool FastUnsafeRcpLegal = DAG.getTarget().Options.UnsafeFPMath ||
+         (Flags.hasAllowReciprocal() &&
+          ((VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction())) ||
+            VT == MVT::f16 ||
+            Flags.hasApproximateFuncs()));
+
+  // Do rcp optimization only when fast unsafe rcp is legal here.
+  // NOTE: We already performed RCP optimization to insert intrinsics in
+  // AMDGPUCodeGenPrepare. Ideally there should have no opportunity here to
+  // rcp optimization.
+  //   However, there are cases like FREM, which is expended into a sequence
+  // of instructions including FDIV, which may expose new opportunities.
+  if (!FastUnsafeRcpLegal)
     return SDValue();
 
   if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
-    if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
-      if (CLHS->isExactlyValue(1.0)) {
-        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
-        // the CI documentation has a worst case error of 1 ulp.
-        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
-        // use it as long as we aren't trying to use denormals.
-        //
-        // v_rcp_f16 and v_rsq_f16 DO support denormals.
-
-        // 1.0 / sqrt(x) -> rsq(x)
-
-        // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
-        // error seems really high at 2^29 ULP.
-        if (RHS.getOpcode() == ISD::FSQRT)
-          return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
-
-        // 1.0 / x -> rcp(x)
-        return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
-      }
+    if (CLHS->isExactlyValue(1.0)) {
+      // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
+      // the CI documentation has a worst case error of 1 ulp.
+      // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
+      // use it as long as we aren't trying to use denormals.
+      //
+      // v_rcp_f16 and v_rsq_f16 DO support denormals.
 
-      // Same as for 1.0, but expand the sign out of the constant.
-      if (CLHS->isExactlyValue(-1.0)) {
-        // -1.0 / x -> rcp (fneg x)
-        SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
-        return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
-      }
+      // 1.0 / sqrt(x) -> rsq(x)
+
+      // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
+      // error seems really high at 2^29 ULP.
+      if (RHS.getOpcode() == ISD::FSQRT)
+        return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
+
+      // 1.0 / x -> rcp(x)
+      return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
     }
-  }
 
-  if (Unsafe) {
-    // Turn into multiply by the reciprocal.
-    // x / y -> x * (1.0 / y)
-    SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
-    return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
+    // Same as for 1.0, but expand the sign out of the constant.
+    if (CLHS->isExactlyValue(-1.0)) {
+      // -1.0 / x -> rcp (fneg x)
+      SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+      return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
+    }
   }
 
-  return SDValue();
+  // Turn into multiply by the reciprocal.
+  // x / y -> x * (1.0 / y)
+  SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
+  return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
 }
 
 static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
@@ -8663,6 +8668,11 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N,
                            N->getFlags());
   }
 
+  if ((VT == MVT::f32 || VT == MVT::f16) && N0.getOpcode() == ISD::FSQRT) {
+    return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
+                           N0.getOperand(0), N->getFlags());
+  }
+
   return AMDGPUTargetLowering::performRcpCombine(N, DCI);
 }