Merge pull request opencv#8518 from alalek:fix_fp16

alalek · alalek · commit 739f87da6de1 · 2017-04-04T20:06:59.000Z
diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake
@@ -273,35 +273,6 @@ if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_OPENCV_GCC_VERSION_NUM GREATER 399)
   add_extra_compiler_option(-fvisibility-inlines-hidden)
 endif()
 
-# TODO !!!!!
-if(NOT OPENCV_FP16_DISABLE AND NOT IOS)
-  if(ARM AND ENABLE_NEON)
-    set(FP16_OPTION "-mfpu=neon-fp16")
-  elseif((X86 OR X86_64) AND NOT MSVC AND ENABLE_AVX)
-    set(FP16_OPTION "-mf16c")
-  endif()
-  try_compile(__VALID_FP16
-    "${OpenCV_BINARY_DIR}"
-    "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp"
-    COMPILE_DEFINITIONS "-DCHECK_FP16" "${FP16_OPTION}"
-    OUTPUT_VARIABLE TRY_OUT
-    )
-  if(NOT __VALID_FP16)
-    if((X86 OR X86_64) AND NOT MSVC AND NOT ENABLE_AVX)
-      # GCC enables AVX when mf16c is passed
-      message(STATUS "FP16: Feature disabled")
-    else()
-      message(STATUS "FP16: Compiler support is not available")
-    endif()
-  else()
-    message(STATUS "FP16: Compiler support is available")
-    set(HAVE_FP16 1)
-    if(NOT ${FP16_OPTION} STREQUAL "")
-      add_extra_compiler_option(${FP16_OPTION})
-    endif()
-  endif()
-endif()
-
 #combine all "extra" options
 set(CMAKE_C_FLAGS           "${CMAKE_C_FLAGS} ${OPENCV_EXTRA_FLAGS} ${OPENCV_EXTRA_C_FLAGS}")
 set(CMAKE_CXX_FLAGS         "${CMAKE_CXX_FLAGS} ${OPENCV_EXTRA_FLAGS} ${OPENCV_EXTRA_CXX_FLAGS}")
diff --git a/cmake/templates/cvconfig.h.in b/cmake/templates/cvconfig.h.in
@@ -207,9 +207,6 @@
 /* Lapack */
 #cmakedefine HAVE_LAPACK
 
-/* FP16 */
-#cmakedefine HAVE_FP16
-
 /* Library was compiled with functions instrumentation */
 #cmakedefine ENABLE_INSTRUMENTATION
 
diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
@@ -70,6 +70,16 @@
 #  include <immintrin.h>
 #  define CV_AVX 1
 #endif
+#ifdef CV_CPU_COMPILE_FP16
+// FP16 conversion code exists for both x86 (intrin_sse.hpp) and ARM NEON (intrin_neon.hpp).
+// <immintrin.h> is an x86-only header, so select the intrinsics header by target architecture.
+#  if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64)
+#    include <arm_neon.h>
+#  else
+#    include <immintrin.h>
+#  endif
+#  define CV_FP16 1
+#endif
 #ifdef CV_CPU_COMPILE_AVX2
 #  include <immintrin.h>
 #  define CV_AVX2 1
@@ -154,6 +164,10 @@ struct VZeroUpperGuard {
 #ifndef CV_AVX
 #  define CV_AVX 0
 #endif
+// Fallback: CV_FP16 is always defined, so `#if CV_FP16` is safe when FP16 code is not compiled in.
+#ifndef CV_FP16
+#  define CV_FP16 0
+#endif
 #ifndef CV_AVX2
 #  define CV_AVX2 0
 #endif
diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -721,7 +721,9 @@ inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_absdiff(const v_reg<_Tp,
 {
     typedef typename V_TypeTraits<_Tp>::abs_type rtype;
     v_reg<rtype, n> c;
-    const rtype mask = std::numeric_limits<_Tp>::is_signed ? (1 << (sizeof(rtype)*8 - 1)) : 0;
+    // Top-bit mask of rtype for signed lane types, 0 for unsigned ones. The shift is performed on an
+    // rtype-typed 1: a plain int `1` overflows once the shift count reaches the width of int.
+    const rtype mask = (rtype)(std::numeric_limits<_Tp>::is_signed ? ((rtype)1 << (sizeof(rtype)*8 - 1)) : 0);
     for( int i = 0; i < n; i++ )
     {
         rtype ua = a.s[i] ^ mask;
diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -278,7 +278,7 @@ struct v_float64x2
 };
 #endif
 
-#if defined (HAVE_FP16)
+#if CV_FP16
 // Workaround for old comiplers
 template <typename T> static inline int16x4_t vreinterpret_s16_f16(T a)
 { return (int16x4_t)a; }
@@ -775,7 +775,7 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
 #endif
 
-#if defined (HAVE_FP16)
+#if CV_FP16
 // Workaround for old comiplers
 inline v_float16x4 v_load_f16(const short* ptr)
 { return v_float16x4(vld1_f16(ptr)); }
@@ -1223,7 +1223,7 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
 }
 #endif
 
-#if defined (HAVE_FP16)
+#if CV_FP16
 inline v_float32x4 v_cvt_f32(const v_float16x4& a)
 {
     return v_float32x4(vcvt_f32_f16(a.val));
diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -255,7 +255,7 @@ struct v_float64x2
     __m128d val;
 };
 
-#if defined(HAVE_FP16)
+#if CV_FP16
 struct v_float16x4
 {
     typedef short lane_type;
@@ -1056,7 +1056,7 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
 
-#if defined(HAVE_FP16)
+#if CV_FP16
 inline v_float16x4 v_load_f16(const short* ptr)
 { return v_float16x4(_mm_loadl_epi64((const __m128i*)ptr)); }
 inline void v_store_f16(short* ptr, v_float16x4& a)
@@ -1776,7 +1776,7 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
     return v_float64x2(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(a.val),8))));
 }
 
-#if defined(HAVE_FP16)
+#if CV_FP16
 inline v_float32x4 v_cvt_f32(const v_float16x4& a)
 {
     return v_float32x4(_mm_cvtph_ps(a.val));
diff --git a/modules/core/include/opencv2/core/private.hpp b/modules/core/include/opencv2/core/private.hpp
@@ -66,17 +66,6 @@
 #  undef max
 #endif
 
-#if defined HAVE_FP16 && (defined __F16C__ || (defined _MSC_VER && _MSC_VER >= 1700))
-#  include <immintrin.h>
-#  define CV_FP16 1
-#elif defined HAVE_FP16 && defined __GNUC__
-#  define CV_FP16 1
-#endif
-
-#ifndef CV_FP16
-#  define CV_FP16 0
-#endif
-
 //! @cond IGNORED
 
 namespace cv
diff --git a/modules/core/test/test_intrin.cpp b/modules/core/test/test_intrin.cpp
@@ -743,7 +743,7 @@ template<typename R> struct TheTest
 
     TheTest & test_loadstore_fp16()
     {
-#if CV_FP16
+#if CV_FP16 && CV_SIMD128
         AlignedData<R> data;
         AlignedData<R> out;
 
@@ -775,7 +775,7 @@ template<typename R> struct TheTest
 
     TheTest & test_float_cvt_fp16()
     {
-#if CV_FP16
+#if CV_FP16 && CV_SIMD128
         AlignedData<v_float32x4> data;
 
         if(checkHardwareSupport(CV_CPU_FP16))
@@ -1008,7 +1008,7 @@ TEST(hal_intrin, float64x2) {
 }
 #endif
 
-#if CV_FP16
+#if CV_FP16 && CV_SIMD128
 TEST(hal_intrin, float16x4) {
     TheTest<v_float16x4>()
         .test_loadstore_fp16()

Original file line number	Diff line number	Diff line change
`@@ -721,7 +721,7 @@ inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_absdiff(const v_reg<_Tp,`
`721`	`721`	`{`
`722`	`722`	`typedef typename V_TypeTraits<_Tp>::abs_type rtype;`
`723`	`723`	`v_reg<rtype, n> c;`
`724`		`- const rtype mask = std::numeric_limits<_Tp>::is_signed ? (1 << (sizeof(rtype)*8 - 1)) : 0;`
	`724`	`+ const rtype mask = (rtype)(std::numeric_limits<_Tp>::is_signed ? (1 << (sizeof(rtype)*8 - 1)) : 0);`
`725`	`725`	`for( int i = 0; i < n; i++ )`
`726`	`726`	`{`
`727`	`727`	`rtype ua = a.s[i] ^ mask;`