@@ -606,12 +606,64 @@ bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
606
606
return true ;
607
607
}
608
608
609
- static bool shouldKeepFDivF32 (Value *Num, bool UnsafeDiv, bool HasDenormals) {
609
+ // Perform RCP optimizations:
610
+ //
611
+ // 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with
612
+ // denormals flushed.
613
+ //
614
+ // a/b -> a*rcp(b) when fast unsafe rcp is legal.
615
+ static Value *performRCPOpt (Value *Num, Value *Den, bool FastUnsafeRcpLegal,
616
+ IRBuilder<> Builder, MDNode *FPMath, Module *Mod,
617
+ bool HasDenormals, bool NeedHighAccuracy) {
618
+
619
+ Type *Ty = Den->getType ();
620
+ if (!FastUnsafeRcpLegal && Ty->isFloatTy () &&
621
+ (HasDenormals || NeedHighAccuracy))
622
+ return nullptr ;
623
+
624
+ Function *Decl = Intrinsic::getDeclaration (Mod, Intrinsic::amdgcn_rcp, Ty);
625
+ if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
626
+ if (FastUnsafeRcpLegal || Ty->isFloatTy () || Ty->isHalfTy ()) {
627
+ if (CLHS->isExactlyValue (1.0 )) {
628
+ // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
629
+ // the CI documentation has a worst case error of 1 ulp.
630
+ // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
631
+ // use it as long as we aren't trying to use denormals.
632
+ //
633
+ // v_rcp_f16 and v_rsq_f16 DO support denormals.
634
+
635
+ // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
636
+ // insert rsq intrinsic here.
637
+
638
+ // 1.0 / x -> rcp(x)
639
+ return Builder.CreateCall (Decl, { Den });
640
+ }
641
+
642
+ // Same as for 1.0, but expand the sign out of the constant.
643
+ if (CLHS->isExactlyValue (-1.0 )) {
644
+ // -1.0 / x -> rcp (fneg x)
645
+ Value *FNeg = Builder.CreateFNeg (Den);
646
+ return Builder.CreateCall (Decl, { FNeg });
647
+ }
648
+ }
649
+ }
650
+
651
+ if (FastUnsafeRcpLegal) {
652
+ // Turn into multiply by the reciprocal.
653
+ // x / y -> x * (1.0 / y)
654
+ Value *Recip = Builder.CreateCall (Decl, { Den });
655
+ return Builder.CreateFMul (Num, Recip, " " , FPMath);
656
+ }
657
+ return nullptr ;
658
+ }
659
+
660
+ static bool shouldKeepFDivF32 (Value *Num, bool FastUnsafeRcpLegal,
661
+ bool HasDenormals) {
610
662
const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
611
663
if (!CNum)
612
664
return HasDenormals;
613
665
614
- if (UnsafeDiv )
666
+ if (FastUnsafeRcpLegal )
615
667
return true ;
616
668
617
669
bool IsOne = CNum->isExactlyValue (+1.0 ) || CNum->isExactlyValue (-1.0 );
@@ -620,64 +672,90 @@ static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
620
672
return HasDenormals ^ IsOne;
621
673
}
622
674
623
- // Insert an intrinsic for fast fdiv for safe math situations where we can
624
- // reduce precision. Leave fdiv for situations where the generic node is
625
- // expected to be optimized.
675
+
676
+ // Optimization is performed based on fpmath, fast math flags, as well as
677
+ // denormals to lower fdiv using either rcp or fdiv.fast.
678
+ //
679
+ // FastUnsafeRcpLegal: We determine whether it is legal to use rcp based on
680
+ // unsafe-fp-math, fast math flags, denormals and fpmath
681
+ // accuracy request.
682
+ //
683
+ // RCP Optimizations:
684
+ // 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with
685
+ // denormals flushed.
686
+ // a/b -> a*rcp(b) when fast unsafe rcp is legal.
687
+ //
688
+ // Use fdiv.fast:
689
+ // a/b -> fdiv.fast(a, b) when RCP optimization is not performed and
690
+ // fpmath >= 2.5ULP with denormals flushed.
691
+ //
692
+ // 1/x -> fdiv.fast(1,x) when RCP optimization is not performed and
693
+ // fpmath >= 2.5ULP with denormals.
626
694
bool AMDGPUCodeGenPrepare::visitFDiv (BinaryOperator &FDiv) {
627
- Type *Ty = FDiv.getType ();
628
695
629
- if (!Ty->getScalarType ()->isFloatTy ())
630
- return false ;
696
+ Type *Ty = FDiv.getType ()->getScalarType ();
631
697
632
- MDNode *FPMath = FDiv. getMetadata (LLVMContext::MD_fpmath);
633
- if (!FPMath )
698
+ // No intrinsic for fdiv16 if target does not support f16.
699
+ if (Ty-> isHalfTy () && !ST-> has16BitInsts () )
634
700
return false ;
635
701
636
702
const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
637
- float ULP = FPOp->getFPAccuracy ();
638
- if (ULP < 2 .5f )
639
- return false ;
703
+ MDNode *FPMath = FDiv.getMetadata (LLVMContext::MD_fpmath);
704
+ const bool NeedHighAccuracy = !FPMath || FPOp->getFPAccuracy () < 2 .5f ;
640
705
641
706
FastMathFlags FMF = FPOp->getFastMathFlags ();
642
- bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast () ||
643
- FMF.allowReciprocal ();
707
+ // Determine whether it is ok to use rcp based on unsafe-fp-math,
708
+ // fast math flags, denormals and accuracy request.
709
+ const bool FastUnsafeRcpLegal = HasUnsafeFPMath || FMF.isFast () ||
710
+ (FMF.allowReciprocal () && ((!HasFP32Denormals && !NeedHighAccuracy)
711
+ || FMF.approxFunc ()));
644
712
645
- // With UnsafeDiv node will be optimized to just rcp and mul .
646
- if (UnsafeDiv)
647
- return false ;
713
+ // Use fdiv.fast for only f32, fpmath >= 2.5ULP and rcp is not used .
714
+ const bool UseFDivFast = Ty-> isFloatTy () && !NeedHighAccuracy &&
715
+ !FastUnsafeRcpLegal ;
648
716
649
- IRBuilder<> Builder (FDiv.getParent (), std::next (FDiv.getIterator ()), FPMath );
717
+ IRBuilder<> Builder (FDiv.getParent (), std::next (FDiv.getIterator ()));
650
718
Builder.setFastMathFlags (FMF);
651
719
Builder.SetCurrentDebugLocation (FDiv.getDebugLoc ());
652
720
653
- Function *Decl = Intrinsic::getDeclaration (Mod, Intrinsic::amdgcn_fdiv_fast);
654
-
655
721
Value *Num = FDiv.getOperand (0 );
656
722
Value *Den = FDiv.getOperand (1 );
657
723
658
724
Value *NewFDiv = nullptr ;
659
-
660
- if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
725
+ if (VectorType *VT = dyn_cast<VectorType>(FDiv.getType ())) {
661
726
NewFDiv = UndefValue::get (VT);
662
727
663
728
// FIXME: Doesn't do the right thing for cases where the vector is partially
664
729
// constant. This works when the scalarizer pass is run first.
665
730
for (unsigned I = 0 , E = VT->getNumElements (); I != E; ++I) {
666
731
Value *NumEltI = Builder.CreateExtractElement (Num, I);
667
732
Value *DenEltI = Builder.CreateExtractElement (Den, I);
668
- Value *NewElt;
669
-
670
- if ( shouldKeepFDivF32 (NumEltI, UnsafeDiv, HasFP32Denormals)) {
671
- NewElt = Builder. CreateFDiv (NumEltI, DenEltI);
672
- } else {
673
- NewElt = Builder.CreateCall (Decl, { NumEltI, DenEltI });
733
+ Value *NewElt = nullptr ;
734
+ if (UseFDivFast && ! shouldKeepFDivF32 (NumEltI, FastUnsafeRcpLegal,
735
+ HasFP32Denormals)) {
736
+ Function *Decl =
737
+ Intrinsic::getDeclaration (Mod, Intrinsic::amdgcn_fdiv_fast);
738
+ NewElt = Builder.CreateCall (Decl, { NumEltI, DenEltI }, " " , FPMath );
674
739
}
740
+ if (!NewElt) // Try rcp.
741
+ NewElt = performRCPOpt (NumEltI, DenEltI, FastUnsafeRcpLegal, Builder,
742
+ FPMath, Mod, HasFP32Denormals, NeedHighAccuracy);
743
+ if (!NewElt)
744
+ NewElt = Builder.CreateFDiv (NumEltI, DenEltI, " " , FPMath);
675
745
676
746
NewFDiv = Builder.CreateInsertElement (NewFDiv, NewElt, I);
677
747
}
678
- } else {
679
- if (!shouldKeepFDivF32 (Num, UnsafeDiv, HasFP32Denormals))
680
- NewFDiv = Builder.CreateCall (Decl, { Num, Den });
748
+ } else { // Scalar.
749
+ if (UseFDivFast && !shouldKeepFDivF32 (Num, FastUnsafeRcpLegal,
750
+ HasFP32Denormals)) {
751
+ Function *Decl =
752
+ Intrinsic::getDeclaration (Mod, Intrinsic::amdgcn_fdiv_fast);
753
+ NewFDiv = Builder.CreateCall (Decl, { Num, Den }, " " , FPMath);
754
+ }
755
+ if (!NewFDiv) { // Try rcp.
756
+ NewFDiv = performRCPOpt (Num, Den, FastUnsafeRcpLegal, Builder, FPMath,
757
+ Mod, HasFP32Denormals, NeedHighAccuracy);
758
+ }
681
759
}
682
760
683
761
if (NewFDiv) {
0 commit comments