44
44
#include " precomp.hpp"
45
45
46
46
#include " opencl_kernels_core.hpp"
47
- #include " opencv2/core/hal/intrin .hpp"
47
+ #include " convert .hpp"
48
48
49
49
#include " opencv2/core/openvx/ovx_defs.hpp"
50
50
@@ -4573,256 +4573,40 @@ struct Cvt_SIMD<float, int>
4573
4573
4574
4574
#endif
4575
4575
4576
- #if !CV_FP16_TYPE
4577
- // const numbers for floating points format
4578
- const unsigned int kShiftSignificand = 13 ;
4579
- const unsigned int kMaskFp16Significand = 0x3ff ;
4580
- const unsigned int kBiasFp16Exponent = 15 ;
4581
- const unsigned int kBiasFp32Exponent = 127 ;
4582
- #endif
4583
-
4584
- #if CV_FP16_TYPE
4585
- static float convertFp16SW (short fp16)
4586
- {
4587
- // Fp16 -> Fp32
4588
- Cv16suf a;
4589
- a.i = fp16;
4590
- return (float )a.h ;
4591
- }
4592
- #else
4593
- static float convertFp16SW (short fp16)
4594
- {
4595
- // Fp16 -> Fp32
4596
- Cv16suf b;
4597
- b.i = fp16;
4598
- int exponent = b.fmt .exponent - kBiasFp16Exponent ;
4599
- int significand = b.fmt .significand ;
4600
-
4601
- Cv32suf a;
4602
- a.i = 0 ;
4603
- a.fmt .sign = b.fmt .sign ; // sign bit
4604
- if ( exponent == 16 )
4605
- {
4606
- // Inf or NaN
4607
- a.i = a.i | 0x7F800000 ;
4608
- if ( significand != 0 )
4609
- {
4610
- // NaN
4611
- #if defined(__x86_64__) || defined(_M_X64)
4612
- // 64bit
4613
- a.i = a.i | 0x7FC00000 ;
4614
- #endif
4615
- a.fmt .significand = a.fmt .significand | (significand << kShiftSignificand );
4616
- }
4617
- return a.f ;
4618
- }
4619
- else if ( exponent == -15 )
4620
- {
4621
- // subnormal in Fp16
4622
- if ( significand == 0 )
4623
- {
4624
- // zero
4625
- return a.f ;
4626
- }
4627
- else
4628
- {
4629
- int shift = -1 ;
4630
- while ( ( significand & 0x400 ) == 0 )
4631
- {
4632
- significand = significand << 1 ;
4633
- shift++;
4634
- }
4635
- significand = significand & kMaskFp16Significand ;
4636
- exponent -= shift;
4637
- }
4638
- }
4639
-
4640
- a.fmt .exponent = (exponent+kBiasFp32Exponent );
4641
- a.fmt .significand = significand << kShiftSignificand ;
4642
- return a.f ;
4643
- }
4644
- #endif
4645
-
4646
- #if CV_FP16_TYPE
4647
- static short convertFp16SW (float fp32)
4648
- {
4649
- // Fp32 -> Fp16
4650
- Cv16suf a;
4651
- a.h = (__fp16)fp32;
4652
- return a.i ;
4653
- }
4654
- #else
4655
- static short convertFp16SW (float fp32)
4656
- {
4657
- // Fp32 -> Fp16
4658
- Cv32suf a;
4659
- a.f = fp32;
4660
- int exponent = a.fmt .exponent - kBiasFp32Exponent ;
4661
- int significand = a.fmt .significand ;
4662
-
4663
- Cv16suf result;
4664
- result.i = 0 ;
4665
- unsigned int absolute = a.i & 0x7fffffff ;
4666
- if ( 0x477ff000 <= absolute )
4667
- {
4668
- // Inf in Fp16
4669
- result.i = result.i | 0x7C00 ;
4670
- if ( exponent == 128 && significand != 0 )
4671
- {
4672
- // NaN
4673
- result.i = (short )( result.i | 0x200 | ( significand >> kShiftSignificand ) );
4674
- }
4675
- }
4676
- else if ( absolute < 0x33000001 )
4677
- {
4678
- // too small for fp16
4679
- result.i = 0 ;
4680
- }
4681
- else if ( absolute < 0x33c00000 )
4682
- {
4683
- result.i = 1 ;
4684
- }
4685
- else if ( absolute < 0x34200001 )
4686
- {
4687
- result.i = 2 ;
4688
- }
4689
- else if ( absolute < 0x387fe000 )
4690
- {
4691
- // subnormal in Fp16
4692
- int fp16Significand = significand | 0x800000 ;
4693
- int bitShift = (-exponent) - 1 ;
4694
- fp16Significand = fp16Significand >> bitShift;
4695
-
4696
- // special cases to round up
4697
- bitShift = exponent + 24 ;
4698
- int threshold = ( ( 0x400000 >> bitShift ) | ( ( ( significand & ( 0x800000 >> bitShift ) ) >> ( 126 - a.fmt .exponent ) ) ^ 1 ) );
4699
- if ( threshold <= ( significand & ( 0xffffff >> ( exponent + 25 ) ) ) )
4700
- {
4701
- fp16Significand++;
4702
- }
4703
- result.i = (short )fp16Significand;
4704
- }
4705
- else
4706
- {
4707
- // usual situation
4708
- // exponent
4709
- result.fmt .exponent = ( exponent + kBiasFp16Exponent );
4710
-
4711
- // significand;
4712
- short fp16Significand = (short )(significand >> kShiftSignificand );
4713
- result.fmt .significand = fp16Significand;
4714
-
4715
- // special cases to round up
4716
- short lsb10bitsFp32 = (significand & 0x1fff );
4717
- short threshold = 0x1000 + ( ( fp16Significand & 0x1 ) ? 0 : 1 );
4718
- if ( threshold <= lsb10bitsFp32 )
4719
- {
4720
- result.i ++;
4721
- }
4722
- else if ( fp16Significand == 0x3ff && exponent == -15 )
4723
- {
4724
- result.i ++;
4725
- }
4726
- }
4727
-
4728
- // sign bit
4729
- result.fmt .sign = a.fmt .sign ;
4730
- return result.i ;
4731
- }
4732
- #endif
4733
-
4734
4576
// template for FP16 HW conversion function
4735
4577
template <typename T, typename DT> static void
4736
4578
cvtScaleHalf_ ( const T* src, size_t sstep, DT* dst, size_t dstep, Size size);
4737
4579
4738
4580
template <> void
4739
- cvtScaleHalf_<float , short >( const float * src, size_t sstep, short * dst, size_t dstep, Size size)
4581
+ cvtScaleHalf_<float , short >( const float * src, size_t sstep, short * dst, size_t dstep, Size size )
4740
4582
{
4583
+ CV_CPU_CALL_FP16 (cvtScaleHalf_SIMD32f16f, (src, sstep, dst, dstep, size));
4584
+
4741
4585
sstep /= sizeof (src[0 ]);
4742
4586
dstep /= sizeof (dst[0 ]);
4743
4587
4744
- if ( checkHardwareSupport (CV_CPU_FP16) )
4745
- {
4746
- for ( ; size.height --; src += sstep, dst += dstep )
4747
- {
4748
- int x = 0 ;
4749
-
4750
- #if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
4751
- if ( ( (intptr_t )dst & 0xf ) == 0 )
4752
- #endif
4753
- {
4754
- #if CV_FP16 && CV_SIMD128
4755
- for ( ; x <= size.width - 4 ; x += 4 )
4756
- {
4757
- v_float32x4 v_src = v_load (src + x);
4758
-
4759
- v_float16x4 v_dst = v_cvt_f16 (v_src);
4760
-
4761
- v_store_f16 (dst + x, v_dst);
4762
- }
4763
- #endif
4764
- }
4765
- for ( ; x < size.width ; x++ )
4766
- {
4767
- dst[x] = convertFp16SW (src[x]);
4768
- }
4769
- }
4770
- }
4771
- else
4588
+ for ( ; size.height --; src += sstep, dst += dstep )
4772
4589
{
4773
- for ( ; size.height --; src += sstep, dst += dstep )
4590
+ for ( int x = 0 ; x < size.width ; x++ )
4774
4591
{
4775
- int x = 0 ;
4776
- for ( ; x < size.width ; x++ )
4777
- {
4778
- dst[x] = convertFp16SW (src[x]);
4779
- }
4592
+ dst[x] = convertFp16SW (src[x]);
4780
4593
}
4781
4594
}
4782
4595
}
4783
4596
4784
4597
template <> void
4785
- cvtScaleHalf_<short , float >( const short * src, size_t sstep, float * dst, size_t dstep, Size size)
4598
+ cvtScaleHalf_<short , float >( const short * src, size_t sstep, float * dst, size_t dstep, Size size )
4786
4599
{
4600
+ CV_CPU_CALL_FP16 (cvtScaleHalf_SIMD16f32f, (src, sstep, dst, dstep, size));
4601
+
4787
4602
sstep /= sizeof (src[0 ]);
4788
4603
dstep /= sizeof (dst[0 ]);
4789
4604
4790
- if ( checkHardwareSupport (CV_CPU_FP16) )
4791
- {
4792
- for ( ; size.height --; src += sstep, dst += dstep )
4793
- {
4794
- int x = 0 ;
4795
-
4796
- #if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
4797
- if ( ( (intptr_t )src & 0xf ) == 0 )
4798
- #endif
4799
- {
4800
- #if CV_FP16 && CV_SIMD128
4801
- for ( ; x <= size.width - 4 ; x += 4 )
4802
- {
4803
- v_float16x4 v_src = v_load_f16 (src + x);
4804
-
4805
- v_float32x4 v_dst = v_cvt_f32 (v_src);
4806
-
4807
- v_store (dst + x, v_dst);
4808
- }
4809
- #endif
4810
- }
4811
- for ( ; x < size.width ; x++ )
4812
- {
4813
- dst[x] = convertFp16SW (src[x]);
4814
- }
4815
- }
4816
- }
4817
- else
4605
+ for ( ; size.height --; src += sstep, dst += dstep )
4818
4606
{
4819
- for ( ; size.height --; src += sstep, dst += dstep )
4607
+ for ( int x = 0 ; x < size.width ; x++ )
4820
4608
{
4821
- int x = 0 ;
4822
- for ( ; x < size.width ; x++ )
4823
- {
4824
- dst[x] = convertFp16SW (src[x]);
4825
- }
4609
+ dst[x] = convertFp16SW (src[x]);
4826
4610
}
4827
4611
}
4828
4612
}
@@ -5027,12 +4811,13 @@ static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, s
5027
4811
}
5028
4812
5029
4813
#define DEF_CVT_SCALE_FP16_FUNC (suffix, stype, dtype ) \
5030
- static void cvtScaleHalf##suffix( const stype* src, size_t sstep, const uchar*, size_t , \
5031
- dtype* dst, size_t dstep, Size size, double * ) \
4814
+ static void cvtScaleHalf##suffix( const stype* src, size_t sstep, \
4815
+ dtype* dst, size_t dstep, Size size) \
5032
4816
{ \
5033
4817
cvtScaleHalf_<stype,dtype>(src, sstep, dst, dstep, size); \
5034
4818
}
5035
4819
4820
+
5036
4821
#define DEF_CVT_SCALE_FUNC (suffix, stype, dtype, wtype ) \
5037
4822
static void cvtScale##suffix( const stype* src, size_t sstep, const uchar*, size_t , \
5038
4823
dtype* dst, size_t dstep, Size size, double * scale) \
@@ -5213,12 +4998,16 @@ static BinaryFunc getCvtScaleAbsFunc(int depth)
5213
4998
return cvtScaleAbsTab[depth];
5214
4999
}
5215
5000
5216
- BinaryFunc getConvertFuncFp16 (int ddepth)
5001
+ typedef void (*UnaryFunc)(const uchar* src1, size_t step1,
5002
+ uchar* dst, size_t step, Size sz,
5003
+ void *);
5004
+
5005
+ static UnaryFunc getConvertFuncFp16 (int ddepth)
5217
5006
{
5218
- static BinaryFunc cvtTab[] =
5007
+ static UnaryFunc cvtTab[] =
5219
5008
{
5220
5009
0 , 0 , 0 ,
5221
- (BinaryFunc )(cvtScaleHalf32f16f), 0 , (BinaryFunc )(cvtScaleHalf16f32f),
5010
+ (UnaryFunc )(cvtScaleHalf32f16f), 0 , (UnaryFunc )(cvtScaleHalf16f32f),
5222
5011
0 , 0 ,
5223
5012
};
5224
5013
return cvtTab[CV_MAT_DEPTH (ddepth)];
@@ -5464,14 +5253,14 @@ void cv::convertFp16( InputArray _src, OutputArray _dst)
5464
5253
int type = CV_MAKETYPE (ddepth, src.channels ());
5465
5254
_dst.create ( src.dims , src.size , type );
5466
5255
Mat dst = _dst.getMat ();
5467
- BinaryFunc func = getConvertFuncFp16 (ddepth);
5256
+ UnaryFunc func = getConvertFuncFp16 (ddepth);
5468
5257
int cn = src.channels ();
5469
5258
CV_Assert ( func != 0 );
5470
5259
5471
5260
if ( src.dims <= 2 )
5472
5261
{
5473
5262
Size sz = getContinuousSize (src, dst, cn);
5474
- func ( src.data , src.step , 0 , 0 , dst.data , dst.step , sz, 0 );
5263
+ func ( src.data , src.step , dst.data , dst.step , sz, 0 );
5475
5264
}
5476
5265
else
5477
5266
{
@@ -5481,7 +5270,7 @@ void cv::convertFp16( InputArray _src, OutputArray _dst)
5481
5270
Size sz ((int )(it.size *cn), 1 );
5482
5271
5483
5272
for ( size_t i = 0 ; i < it.nplanes ; i++, ++it )
5484
- func (ptrs[0 ], 1 , 0 , 0 , ptrs[1 ], 1 , sz, 0 );
5273
+ func (ptrs[0 ], 1 , ptrs[1 ], 1 , sz, 0 );
5485
5274
}
5486
5275
}
5487
5276
0 commit comments