arrayfire · umar456 · Nov 26, 2022 · Nov 25, 2022 · Nov 25, 2022
diff --git a/src/backend/cuda/CMakeLists.txt b/src/backend/cuda/CMakeLists.txt
@@ -636,6 +636,7 @@ endif()
 
 target_compile_options(afcuda
   PRIVATE
+    $<$<BOOL:${AF_WITH_FAST_MATH}>:$<$<COMPILE_LANGUAGE:CUDA>:-use_fast_math>>
     $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
     $<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe --diag_suppress=unrecognized_gcc_pragma>
     $<$<COMPILE_LANGUAGE:CUDA>: $<$<CXX_COMPILER_ID:MSVC>:  -Xcompiler=/wd4251

diff --git a/src/backend/cuda/compile_module.cpp b/src/backend/cuda/compile_module.cpp
@@ -261,6 +261,10 @@ Module compileModule(const string &moduleKey, span<const string> sources,
         arch.data(),
         "--std=c++14",
         "--device-as-default-execution-space",
+#ifdef AF_WITH_FAST_MATH
+        "--use_fast_math",
+        "-DAF_WITH_FAST_MATH",
+#endif
 #if !(defined(NDEBUG) || defined(__aarch64__) || defined(__LP64__))
         "--device-debug",
         "--generate-line-info"

diff --git a/src/backend/cuda/kernel/jit.cuh b/src/backend/cuda/kernel/jit.cuh
@@ -59,8 +59,13 @@ typedef cuDoubleComplex cdouble;
 #define __rem(lhs, rhs) ((lhs) % (rhs))
 #define __mod(lhs, rhs) ((lhs) % (rhs))
 
+#ifdef AF_WITH_FAST_MATH  // pow evaluated in double; expression macro, no ';'
+#define __pow(lhs, rhs) \
+    static_cast<double>(pow(static_cast<double>(lhs), static_cast<double>(rhs)))
+#else  // !AF_WITH_FAST_MATH: int -> float pow, rounded back with _rn
 #define __pow(lhs, rhs) \
     __float2int_rn(pow(__int2float_rn((int)lhs), __int2float_rn((int)rhs)))
+#endif  // AF_WITH_FAST_MATH
 #define __powll(lhs, rhs) \
     __double2ll_rn(pow(__ll2double_rn(lhs), __ll2double_rn(rhs)))
 #define __powul(lhs, rhs) \

diff --git a/src/backend/cuda/math.hpp b/src/backend/cuda/math.hpp
@@ -32,6 +32,12 @@
 
 namespace cuda {
 
+#ifdef AF_WITH_FAST_MATH
+constexpr bool fast_math = true;  // compile-time mirror of AF_WITH_FAST_MATH
+#else
+constexpr bool fast_math = false;  // AF_WITH_FAST_MATH is not defined
+#endif  // AF_WITH_FAST_MATH
+
 template<typename T>
 static inline __DH__ T abs(T val) {
     return ::abs(val);
@@ -138,29 +144,22 @@ __DH__ static To scalar(Ti real, Ti imag) {
 }
 
 #ifndef __CUDA_ARCH__
+
 template<typename T>
 inline T maxval() {
-    return std::numeric_limits<T>::max();
+    if constexpr (std::is_floating_point_v<T> && !fast_math) {
+        return std::numeric_limits<T>::infinity();  // +inf (fast_math off)
+    } else {
+        return std::numeric_limits<T>::max();  // largest finite T
+    }
 }
 template<typename T>
 inline T minval() {
-    return std::numeric_limits<T>::min();
-}
-template<>
-inline float maxval() {
-    return std::numeric_limits<float>::infinity();
-}
-template<>
-inline double maxval() {
-    return std::numeric_limits<double>::infinity();
-}
-template<>
-inline float minval() {
-    return -std::numeric_limits<float>::infinity();
-}
-template<>
-inline double minval() {
-    return -std::numeric_limits<double>::infinity();
+    if constexpr (std::is_floating_point_v<T> && !fast_math) {
+        return -std::numeric_limits<T>::infinity();  // -inf (fast_math off)
+    } else {
+        return std::numeric_limits<T>::lowest();  // lowest finite T
+    }
 }
 #else
 template<typename T>

diff --git a/src/backend/cuda/platform.cpp b/src/backend/cuda/platform.cpp
@@ -94,6 +94,12 @@ unique_handle<cublasHandle_t> *cublasManager(const int deviceId) {
         // call outside of call_once scope.
         CUBLAS_CHECK(
             cublasSetStream(handles[deviceId], cuda::getStream(deviceId)));
+#ifdef AF_WITH_FAST_MATH
+        CUBLAS_CHECK(
+            cublasSetMathMode(handles[deviceId], CUBLAS_TF32_TENSOR_OP_MATH));
+        CUBLAS_CHECK(
+            cublasSetAtomicsMode(handles[deviceId], CUBLAS_ATOMICS_ALLOWED));
+#endif
     });
 
     return &handles[deviceId];

diff --git a/src/backend/opencl/compile_module.cpp b/src/backend/opencl/compile_module.cpp
@@ -126,6 +126,10 @@ Program buildProgram(span<const string> kernelSources,
         ostringstream options;
         for (auto &opt : compileOpts) { options << opt; }
 
+#ifdef AF_WITH_FAST_MATH
+        options << " -cl-fast-relaxed-math -DAF_WITH_FAST_MATH";
+#endif
+
         retVal.build({device}, (cl_std + defaults + options.str()).c_str());
     } catch (Error &err) {
         if (err.err() == CL_BUILD_PROGRAM_FAILURE) {

diff --git a/src/backend/opencl/math.hpp b/src/backend/opencl/math.hpp
@@ -106,40 +106,27 @@ static To scalar(Ti real, Ti imag) {
     return cval;
 }
 
+#ifdef AF_WITH_FAST_MATH
+constexpr bool fast_math = true;  // compile-time mirror of AF_WITH_FAST_MATH
+#else
+constexpr bool fast_math = false;  // AF_WITH_FAST_MATH is not defined
+#endif  // AF_WITH_FAST_MATH
+
 template<typename T>
 inline T maxval() {
-    return std::numeric_limits<T>::max();
+    if constexpr (std::is_floating_point_v<T> && !fast_math) {
+        return std::numeric_limits<T>::infinity();  // +inf (fast_math off)
+    } else {  // NOTE(review): confirm common::half is not routed here
+        return std::numeric_limits<T>::max();  // largest finite T
+    }
 }
 template<typename T>
 inline T minval() {
-    return std::numeric_limits<T>::min();
-}
-template<>
-inline float maxval() {
-    return std::numeric_limits<float>::infinity();
-}
-template<>
-inline double maxval() {
-    return std::numeric_limits<double>::infinity();
-}
-
-template<>
-inline common::half maxval() {
-    return std::numeric_limits<common::half>::infinity();
-}
-
-template<>
-inline float minval() {
-    return -std::numeric_limits<float>::infinity();
-}
-
-template<>
-inline double minval() {
-    return -std::numeric_limits<double>::infinity();
-}
-template<>
-inline common::half minval() {
-    return -std::numeric_limits<common::half>::infinity();
+    if constexpr (std::is_floating_point_v<T> && !fast_math) {
+        return -std::numeric_limits<T>::infinity();  // -inf (fast_math off)
+    } else {  // NOTE(review): confirm common::half is not routed here
+        return std::numeric_limits<T>::lowest();  // lowest finite T
+    }
 }
 
 static inline double real(cdouble in) { return in.s[0]; }

diff --git a/test/reduce.cpp b/test/reduce.cpp
@@ -2296,6 +2296,7 @@ TEST(Reduce, Test_Sum_Global_Array_nanval) {
 }
 
 TEST(Reduce, nanval_issue_3255) {
+    SKIP_IF_FAST_MATH_ENABLED();
     char *info_str;
     af_array  ikeys, ivals, okeys, ovals;
     dim_t dims[1] = {8};