Skip to content

Commit 19464a3

Browse files
committed
Merge pull request opencv#8780 from vpisarev:fix_boxfilter
2 parents 246f47f + 883d925 commit 19464a3

File tree

1 file changed

+34 -23 lines changed

1 file changed

+34
-23
lines changed

modules/imgproc/src/smooth.cpp

Lines changed: 34 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,7 @@
4242
//M*/
4343

4444
#include "precomp.hpp"
45+
#include "opencv2/core/hal/intrin.hpp"
4546
#include "opencl_kernels_imgproc.hpp"
4647

4748
#include "opencv2/core/openvx/ovx_defs.hpp"
@@ -467,6 +468,8 @@ template<>
467468
struct ColumnSum<ushort, uchar> :
468469
public BaseColumnFilter
469470
{
471+
enum { SHIFT = 23 };
472+
470473
ColumnSum( int _ksize, int _anchor, double _scale ) :
471474
BaseColumnFilter()
472475
{
@@ -479,7 +482,7 @@ public BaseColumnFilter
479482
if( scale != 1 )
480483
{
481484
int d = cvRound(1./scale);
482-
double scalef = ((double)(1 << 16))/d;
485+
double scalef = ((double)(1 << SHIFT))/d;
483486
divScale = cvFloor(scalef);
484487
scalef -= divScale;
485488
divDelta = d/2;
@@ -554,35 +557,43 @@ public BaseColumnFilter
554557
if( haveScale )
555558
{
556559
int i = 0;
557-
#if CV_SSE2
558-
if(haveSSE2)
560+
#if CV_SIMD128
561+
v_uint32x4 ds4 = v_setall_u32((unsigned)ds);
562+
v_uint16x8 dd8 = v_setall_u16((ushort)dd);
563+
564+
for( ; i <= width-16; i+=16 )
559565
{
560-
__m128i ds8 = _mm_set1_epi16((short)ds);
561-
__m128i dd8 = _mm_set1_epi16((short)dd);
566+
v_uint16x8 _sm0 = v_load(Sm + i);
567+
v_uint16x8 _sm1 = v_load(Sm + i + 8);
562568

563-
for( ; i <= width-16; i+=16 )
564-
{
565-
__m128i _sm0 = _mm_loadu_si128((const __m128i*)(Sm+i));
566-
__m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+8));
569+
v_uint16x8 _s0 = v_add_wrap(v_load(SUM + i), v_load(Sp + i));
570+
v_uint16x8 _s1 = v_add_wrap(v_load(SUM + i + 8), v_load(Sp + i + 8));
567571

568-
__m128i _s0 = _mm_add_epi16(_mm_loadu_si128((const __m128i*)(SUM+i)),
569-
_mm_loadu_si128((const __m128i*)(Sp+i)));
570-
__m128i _s1 = _mm_add_epi16(_mm_loadu_si128((const __m128i*)(SUM+i+8)),
571-
_mm_loadu_si128((const __m128i*)(Sp+i+8)));
572-
__m128i _s2 = _mm_mulhi_epu16(_mm_adds_epu16(_s0, dd8), ds8);
573-
__m128i _s3 = _mm_mulhi_epu16(_mm_adds_epu16(_s1, dd8), ds8);
574-
_s0 = _mm_sub_epi16(_s0, _sm0);
575-
_s1 = _mm_sub_epi16(_s1, _sm1);
576-
_mm_storeu_si128((__m128i*)(D+i), _mm_packus_epi16(_s2, _s3));
577-
_mm_storeu_si128((__m128i*)(SUM+i), _s0);
578-
_mm_storeu_si128((__m128i*)(SUM+i+8), _s1);
579-
}
572+
v_uint32x4 _s00, _s01, _s10, _s11;
573+
574+
v_expand(_s0 + dd8, _s00, _s01);
575+
v_expand(_s1 + dd8, _s10, _s11);
576+
577+
_s00 = v_shr<SHIFT>(_s00*ds4);
578+
_s01 = v_shr<SHIFT>(_s01*ds4);
579+
_s10 = v_shr<SHIFT>(_s10*ds4);
580+
_s11 = v_shr<SHIFT>(_s11*ds4);
581+
582+
v_int16x8 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01));
583+
v_int16x8 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11));
584+
585+
_s0 = v_sub_wrap(_s0, _sm0);
586+
_s1 = v_sub_wrap(_s1, _sm1);
587+
588+
v_store(D + i, v_pack_u(r0, r1));
589+
v_store(SUM + i, _s0);
590+
v_store(SUM + i + 8, _s1);
580591
}
581-
#endif
592+
#endif
582593
for( ; i < width; i++ )
583594
{
584595
int s0 = SUM[i] + Sp[i];
585-
D[i] = (uchar)((s0 + dd)*ds >> 16);
596+
D[i] = (uchar)((s0 + dd)*ds >> SHIFT);
586597
SUM[i] = (ushort)(s0 - Sm[i]);
587598
}
588599
}

0 commit comments

Comments
 (0)