Skip to content

Commit 125abe2

Browse files
committed
Merge pull request opencv#8838 from tomoaki0705:dispatchFp16
2 parents ebd98ea + e269ef9 commit 125abe2

File tree

4 files changed

+371
-238
lines changed

4 files changed

+371
-238
lines changed

modules/core/src/convert.cpp

Lines changed: 26 additions & 237 deletions
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,7 @@
4444
#include "precomp.hpp"
4545

4646
#include "opencl_kernels_core.hpp"
47-
#include "opencv2/core/hal/intrin.hpp"
47+
#include "convert.hpp"
4848

4949
#include "opencv2/core/openvx/ovx_defs.hpp"
5050

@@ -4573,256 +4573,40 @@ struct Cvt_SIMD<float, int>
45734573

45744574
#endif
45754575

4576-
#if !CV_FP16_TYPE
4577-
// const numbers for floating points format
4578-
const unsigned int kShiftSignificand = 13;
4579-
const unsigned int kMaskFp16Significand = 0x3ff;
4580-
const unsigned int kBiasFp16Exponent = 15;
4581-
const unsigned int kBiasFp32Exponent = 127;
4582-
#endif
4583-
4584-
#if CV_FP16_TYPE
4585-
static float convertFp16SW(short fp16)
4586-
{
4587-
// Fp16 -> Fp32
4588-
Cv16suf a;
4589-
a.i = fp16;
4590-
return (float)a.h;
4591-
}
4592-
#else
4593-
static float convertFp16SW(short fp16)
4594-
{
4595-
// Fp16 -> Fp32
4596-
Cv16suf b;
4597-
b.i = fp16;
4598-
int exponent = b.fmt.exponent - kBiasFp16Exponent;
4599-
int significand = b.fmt.significand;
4600-
4601-
Cv32suf a;
4602-
a.i = 0;
4603-
a.fmt.sign = b.fmt.sign; // sign bit
4604-
if( exponent == 16 )
4605-
{
4606-
// Inf or NaN
4607-
a.i = a.i | 0x7F800000;
4608-
if( significand != 0 )
4609-
{
4610-
// NaN
4611-
#if defined(__x86_64__) || defined(_M_X64)
4612-
// 64bit
4613-
a.i = a.i | 0x7FC00000;
4614-
#endif
4615-
a.fmt.significand = a.fmt.significand | (significand << kShiftSignificand);
4616-
}
4617-
return a.f;
4618-
}
4619-
else if ( exponent == -15 )
4620-
{
4621-
// subnormal in Fp16
4622-
if( significand == 0 )
4623-
{
4624-
// zero
4625-
return a.f;
4626-
}
4627-
else
4628-
{
4629-
int shift = -1;
4630-
while( ( significand & 0x400 ) == 0 )
4631-
{
4632-
significand = significand << 1;
4633-
shift++;
4634-
}
4635-
significand = significand & kMaskFp16Significand;
4636-
exponent -= shift;
4637-
}
4638-
}
4639-
4640-
a.fmt.exponent = (exponent+kBiasFp32Exponent);
4641-
a.fmt.significand = significand << kShiftSignificand;
4642-
return a.f;
4643-
}
4644-
#endif
4645-
4646-
#if CV_FP16_TYPE
4647-
static short convertFp16SW(float fp32)
4648-
{
4649-
// Fp32 -> Fp16
4650-
Cv16suf a;
4651-
a.h = (__fp16)fp32;
4652-
return a.i;
4653-
}
4654-
#else
4655-
static short convertFp16SW(float fp32)
4656-
{
4657-
// Fp32 -> Fp16
4658-
Cv32suf a;
4659-
a.f = fp32;
4660-
int exponent = a.fmt.exponent - kBiasFp32Exponent;
4661-
int significand = a.fmt.significand;
4662-
4663-
Cv16suf result;
4664-
result.i = 0;
4665-
unsigned int absolute = a.i & 0x7fffffff;
4666-
if( 0x477ff000 <= absolute )
4667-
{
4668-
// Inf in Fp16
4669-
result.i = result.i | 0x7C00;
4670-
if( exponent == 128 && significand != 0 )
4671-
{
4672-
// NaN
4673-
result.i = (short)( result.i | 0x200 | ( significand >> kShiftSignificand ) );
4674-
}
4675-
}
4676-
else if ( absolute < 0x33000001 )
4677-
{
4678-
// too small for fp16
4679-
result.i = 0;
4680-
}
4681-
else if ( absolute < 0x33c00000 )
4682-
{
4683-
result.i = 1;
4684-
}
4685-
else if ( absolute < 0x34200001 )
4686-
{
4687-
result.i = 2;
4688-
}
4689-
else if ( absolute < 0x387fe000 )
4690-
{
4691-
// subnormal in Fp16
4692-
int fp16Significand = significand | 0x800000;
4693-
int bitShift = (-exponent) - 1;
4694-
fp16Significand = fp16Significand >> bitShift;
4695-
4696-
// special cases to round up
4697-
bitShift = exponent + 24;
4698-
int threshold = ( ( 0x400000 >> bitShift ) | ( ( ( significand & ( 0x800000 >> bitShift ) ) >> ( 126 - a.fmt.exponent ) ) ^ 1 ) );
4699-
if( threshold <= ( significand & ( 0xffffff >> ( exponent + 25 ) ) ) )
4700-
{
4701-
fp16Significand++;
4702-
}
4703-
result.i = (short)fp16Significand;
4704-
}
4705-
else
4706-
{
4707-
// usual situation
4708-
// exponent
4709-
result.fmt.exponent = ( exponent + kBiasFp16Exponent );
4710-
4711-
// significand;
4712-
short fp16Significand = (short)(significand >> kShiftSignificand);
4713-
result.fmt.significand = fp16Significand;
4714-
4715-
// special cases to round up
4716-
short lsb10bitsFp32 = (significand & 0x1fff);
4717-
short threshold = 0x1000 + ( ( fp16Significand & 0x1 ) ? 0 : 1 );
4718-
if( threshold <= lsb10bitsFp32 )
4719-
{
4720-
result.i++;
4721-
}
4722-
else if ( fp16Significand == 0x3ff && exponent == -15)
4723-
{
4724-
result.i++;
4725-
}
4726-
}
4727-
4728-
// sign bit
4729-
result.fmt.sign = a.fmt.sign;
4730-
return result.i;
4731-
}
4732-
#endif
4733-
47344576
// template for FP16 HW conversion function
47354577
template<typename T, typename DT> static void
47364578
cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size);
47374579

47384580
template<> void
4739-
cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t dstep, Size size)
4581+
cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t dstep, Size size )
47404582
{
4583+
CV_CPU_CALL_FP16(cvtScaleHalf_SIMD32f16f, (src, sstep, dst, dstep, size));
4584+
47414585
sstep /= sizeof(src[0]);
47424586
dstep /= sizeof(dst[0]);
47434587

4744-
if( checkHardwareSupport(CV_CPU_FP16) )
4745-
{
4746-
for( ; size.height--; src += sstep, dst += dstep )
4747-
{
4748-
int x = 0;
4749-
4750-
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
4751-
if ( ( (intptr_t)dst & 0xf ) == 0 )
4752-
#endif
4753-
{
4754-
#if CV_FP16 && CV_SIMD128
4755-
for ( ; x <= size.width - 4; x += 4)
4756-
{
4757-
v_float32x4 v_src = v_load(src + x);
4758-
4759-
v_float16x4 v_dst = v_cvt_f16(v_src);
4760-
4761-
v_store_f16(dst + x, v_dst);
4762-
}
4763-
#endif
4764-
}
4765-
for ( ; x < size.width; x++ )
4766-
{
4767-
dst[x] = convertFp16SW(src[x]);
4768-
}
4769-
}
4770-
}
4771-
else
4588+
for( ; size.height--; src += sstep, dst += dstep )
47724589
{
4773-
for( ; size.height--; src += sstep, dst += dstep )
4590+
for ( int x = 0; x < size.width; x++ )
47744591
{
4775-
int x = 0;
4776-
for ( ; x < size.width; x++ )
4777-
{
4778-
dst[x] = convertFp16SW(src[x]);
4779-
}
4592+
dst[x] = convertFp16SW(src[x]);
47804593
}
47814594
}
47824595
}
47834596

47844597
template<> void
4785-
cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t dstep, Size size)
4598+
cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t dstep, Size size )
47864599
{
4600+
CV_CPU_CALL_FP16(cvtScaleHalf_SIMD16f32f, (src, sstep, dst, dstep, size));
4601+
47874602
sstep /= sizeof(src[0]);
47884603
dstep /= sizeof(dst[0]);
47894604

4790-
if( checkHardwareSupport(CV_CPU_FP16) )
4791-
{
4792-
for( ; size.height--; src += sstep, dst += dstep )
4793-
{
4794-
int x = 0;
4795-
4796-
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
4797-
if ( ( (intptr_t)src & 0xf ) == 0 )
4798-
#endif
4799-
{
4800-
#if CV_FP16 && CV_SIMD128
4801-
for ( ; x <= size.width - 4; x += 4)
4802-
{
4803-
v_float16x4 v_src = v_load_f16(src + x);
4804-
4805-
v_float32x4 v_dst = v_cvt_f32(v_src);
4806-
4807-
v_store(dst + x, v_dst);
4808-
}
4809-
#endif
4810-
}
4811-
for ( ; x < size.width; x++ )
4812-
{
4813-
dst[x] = convertFp16SW(src[x]);
4814-
}
4815-
}
4816-
}
4817-
else
4605+
for( ; size.height--; src += sstep, dst += dstep )
48184606
{
4819-
for( ; size.height--; src += sstep, dst += dstep )
4607+
for ( int x = 0; x < size.width; x++ )
48204608
{
4821-
int x = 0;
4822-
for ( ; x < size.width; x++ )
4823-
{
4824-
dst[x] = convertFp16SW(src[x]);
4825-
}
4609+
dst[x] = convertFp16SW(src[x]);
48264610
}
48274611
}
48284612
}
@@ -5027,12 +4811,13 @@ static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, s
50274811
}
50284812

50294813
#define DEF_CVT_SCALE_FP16_FUNC(suffix, stype, dtype) \
5030-
static void cvtScaleHalf##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
5031-
dtype* dst, size_t dstep, Size size, double*) \
4814+
static void cvtScaleHalf##suffix( const stype* src, size_t sstep, \
4815+
dtype* dst, size_t dstep, Size size) \
50324816
{ \
50334817
cvtScaleHalf_<stype,dtype>(src, sstep, dst, dstep, size); \
50344818
}
50354819

4820+
50364821
#define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \
50374822
static void cvtScale##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
50384823
dtype* dst, size_t dstep, Size size, double* scale) \
@@ -5213,12 +4998,16 @@ static BinaryFunc getCvtScaleAbsFunc(int depth)
52134998
return cvtScaleAbsTab[depth];
52144999
}
52155000

5216-
BinaryFunc getConvertFuncFp16(int ddepth)
5001+
typedef void (*UnaryFunc)(const uchar* src1, size_t step1,
5002+
uchar* dst, size_t step, Size sz,
5003+
void*);
5004+
5005+
static UnaryFunc getConvertFuncFp16(int ddepth)
52175006
{
5218-
static BinaryFunc cvtTab[] =
5007+
static UnaryFunc cvtTab[] =
52195008
{
52205009
0, 0, 0,
5221-
(BinaryFunc)(cvtScaleHalf32f16f), 0, (BinaryFunc)(cvtScaleHalf16f32f),
5010+
(UnaryFunc)(cvtScaleHalf32f16f), 0, (UnaryFunc)(cvtScaleHalf16f32f),
52225011
0, 0,
52235012
};
52245013
return cvtTab[CV_MAT_DEPTH(ddepth)];
@@ -5464,14 +5253,14 @@ void cv::convertFp16( InputArray _src, OutputArray _dst)
54645253
int type = CV_MAKETYPE(ddepth, src.channels());
54655254
_dst.create( src.dims, src.size, type );
54665255
Mat dst = _dst.getMat();
5467-
BinaryFunc func = getConvertFuncFp16(ddepth);
5256+
UnaryFunc func = getConvertFuncFp16(ddepth);
54685257
int cn = src.channels();
54695258
CV_Assert( func != 0 );
54705259

54715260
if( src.dims <= 2 )
54725261
{
54735262
Size sz = getContinuousSize(src, dst, cn);
5474-
func( src.data, src.step, 0, 0, dst.data, dst.step, sz, 0);
5263+
func( src.data, src.step, dst.data, dst.step, sz, 0);
54755264
}
54765265
else
54775266
{
@@ -5481,7 +5270,7 @@ void cv::convertFp16( InputArray _src, OutputArray _dst)
54815270
Size sz((int)(it.size*cn), 1);
54825271

54835272
for( size_t i = 0; i < it.nplanes; i++, ++it )
5484-
func(ptrs[0], 1, 0, 0, ptrs[1], 1, sz, 0);
5273+
func(ptrs[0], 1, ptrs[1], 1, sz, 0);
54855274
}
54865275
}
54875276

0 commit comments

Comments
 (0)