@@ -42,6 +42,7 @@
 //M*/

 #include "precomp.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 #include "opencl_kernels_imgproc.hpp"

 #include "opencv2/core/openvx/ovx_defs.hpp"
@@ -467,6 +468,8 @@ template<>
 struct ColumnSum<ushort, uchar> :
 public BaseColumnFilter
 {
+    enum { SHIFT = 23 };
+
     ColumnSum( int _ksize, int _anchor, double _scale ) :
         BaseColumnFilter()
     {
@@ -479,7 +482,7 @@ public BaseColumnFilter
         if( scale != 1 )
         {
             int d = cvRound(1./scale);
-            double scalef = ((double)(1 << 16))/d;
+            double scalef = ((double)(1 << SHIFT))/d;
             divScale = cvFloor(scalef);
             scalef -= divScale;
             divDelta = d/2;
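
Note on the hunk above (an illustration, not part of the patch): divScale is floor((1 << SHIFT)/d), where d = cvRound(1./scale) is the divisor (the kernel area for a normalized box filter), so the filter replaces division by d with an integer multiply and a right shift. SHIFT grows from 16 to 23 because the old SSE2 path relied on _mm_mulhi_epu16, which fixes the shift at 16, while the new path multiplies in 32-bit lanes and can afford the extra precision. A minimal standalone sketch of the basic arithmetic, with an illustrative kernel size d = 25:

    // Hypothetical standalone check of the (sum + d/2) * divScale >> SHIFT trick.
    #include <cmath>
    #include <cstdio>

    int main()
    {
        const int SHIFT = 23;
        const int d = 25;                                 // e.g. a 5x5 box kernel (illustrative)
        const unsigned ds = (unsigned)std::floor(((double)(1 << SHIFT)) / d); // divScale
        const unsigned dd = (unsigned)(d / 2);            // divDelta, the rounding offset

        // Column sums of a uchar image stay below 255*d, so the 32-bit product fits.
        for (int s = 0; s <= 255 * d; s += 611)
        {
            unsigned fixed = (((unsigned)s + dd) * ds) >> SHIFT; // what the filter computes
            unsigned exact = ((unsigned)s + dd) / d;             // plain integer division
            std::printf("s=%5d  fixed=%3u  exact=%3u\n", s, fixed, exact);
        }
        return 0;
    }
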
@@ -554,35 +557,43 @@ public BaseColumnFilter
         if( haveScale )
         {
             int i = 0;
-        #if CV_SSE2
-            if(haveSSE2)
+        #if CV_SIMD128
+            v_uint32x4 ds4 = v_setall_u32((unsigned)ds);
+            v_uint16x8 dd8 = v_setall_u16((ushort)dd);
+
+            for( ; i <= width-16; i+=16 )
             {
-                __m128i ds8 = _mm_set1_epi16((short)ds);
-                __m128i dd8 = _mm_set1_epi16((short)dd);
+                v_uint16x8 _sm0 = v_load(Sm + i);
+                v_uint16x8 _sm1 = v_load(Sm + i + 8);

-                for( ; i <= width-16; i+=16 )
-                {
-                    __m128i _sm0 = _mm_loadu_si128((const __m128i*)(Sm+i));
-                    __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+8));
+                v_uint16x8 _s0 = v_add_wrap(v_load(SUM + i), v_load(Sp + i));
+                v_uint16x8 _s1 = v_add_wrap(v_load(SUM + i + 8), v_load(Sp + i + 8));

-                    __m128i _s0 = _mm_add_epi16(_mm_loadu_si128((const __m128i*)(SUM+i)),
-                                                _mm_loadu_si128((const __m128i*)(Sp+i)));
-                    __m128i _s1 = _mm_add_epi16(_mm_loadu_si128((const __m128i*)(SUM+i+8)),
-                                                _mm_loadu_si128((const __m128i*)(Sp+i+8)));
-                    __m128i _s2 = _mm_mulhi_epu16(_mm_adds_epu16(_s0, dd8), ds8);
-                    __m128i _s3 = _mm_mulhi_epu16(_mm_adds_epu16(_s1, dd8), ds8);
-                    _s0 = _mm_sub_epi16(_s0, _sm0);
-                    _s1 = _mm_sub_epi16(_s1, _sm1);
-                    _mm_storeu_si128((__m128i*)(D+i), _mm_packus_epi16(_s2, _s3));
-                    _mm_storeu_si128((__m128i*)(SUM+i), _s0);
-                    _mm_storeu_si128((__m128i*)(SUM+i+8), _s1);
-                }
+                v_uint32x4 _s00, _s01, _s10, _s11;
+
+                v_expand(_s0 + dd8, _s00, _s01);
+                v_expand(_s1 + dd8, _s10, _s11);
+
+                _s00 = v_shr<SHIFT>(_s00*ds4);
+                _s01 = v_shr<SHIFT>(_s01*ds4);
+                _s10 = v_shr<SHIFT>(_s10*ds4);
+                _s11 = v_shr<SHIFT>(_s11*ds4);
+
+                v_int16x8 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01));
+                v_int16x8 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11));
+
+                _s0 = v_sub_wrap(_s0, _sm0);
+                _s1 = v_sub_wrap(_s1, _sm1);
+
+                v_store(D + i, v_pack_u(r0, r1));
+                v_store(SUM + i, _s0);
+                v_store(SUM + i + 8, _s1);
             }
-        #endif
+        #endif
             for( ; i < width; i++ )
             {
                 int s0 = SUM[i] + Sp[i];
-                D[i] = (uchar)((s0 + dd)*ds >> 16);
+                D[i] = (uchar)((s0 + dd)*ds >> SHIFT);
                 SUM[i] = (ushort)(s0 - Sm[i]);
             }
         }
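
For reference (again an illustration, not code from this commit): the per-lane work of the CV_SIMD128 loop, written out as plain scalar C++, assuming the same SUM/Sp/Sm/D buffers and the same ds/dd values as the surrounding code. v_add_wrap and v_sub_wrap wrap like ordinary ushort arithmetic, the 16-bit + with dd8 is a saturating add (it replaces _mm_adds_epu16), and the v_pack/v_pack_u pair narrows to uchar with saturation.

    // Rough scalar equivalent of one 16-pixel iteration of the vectorized loop above.
    #include <algorithm>
    #include <cstdint>

    enum { SHIFT = 23 };

    static void column_sum_block16(uint16_t* SUM, const uint16_t* Sp, const uint16_t* Sm,
                                   uint8_t* D, unsigned ds, unsigned dd, int i)
    {
        for (int k = 0; k < 16; ++k)
        {
            // v_add_wrap: wrapping 16-bit addition of the incoming row
            uint16_t s = (uint16_t)(SUM[i + k] + Sp[i + k]);
            // saturating add of the rounding offset, then widen to 32 bits,
            // multiply by the fixed-point reciprocal and shift back
            uint32_t biased = std::min<uint32_t>((uint32_t)s + dd, 65535u);
            uint32_t scaled = (biased * ds) >> SHIFT;
            // v_pack / v_pack_u: narrow with saturation to uchar
            D[i + k] = (uint8_t)std::min<uint32_t>(scaled, 255u);
            // v_sub_wrap: subtract the outgoing row to update the running column sum
            SUM[i + k] = (uint16_t)(s - Sm[i + k]);
        }
    }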