Skip to content

Commit c370cc1

Browse files
author
Woody Chow
committed
Optimize normL2Sqr_ with AVX2
1 parent a83a1ca commit c370cc1

File tree

1 file changed

+16
-1
lines changed

1 file changed

+16
-1
lines changed

modules/core/src/stat.cpp

Lines changed: 16 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4422,7 +4422,22 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
44224422
float normL2Sqr_(const float* a, const float* b, int n)
44234423
{
44244424
int j = 0; float d = 0.f;
4425-
#if CV_SSE
4425+
#if CV_AVX2
4426+
float CV_DECL_ALIGNED(32) buf[8];
4427+
__m256 d0 = _mm256_setzero_ps();
4428+
4429+
for( ; j <= n - 8; j += 8 )
4430+
{
4431+
__m256 t0 = _mm256_sub_ps(_mm256_loadu_ps(a + j), _mm256_loadu_ps(b + j));
4432+
#ifdef CV_FMA3
4433+
d0 = _mm256_fmadd_ps(t0, t0, d0);
4434+
#else
4435+
d0 = _mm256_add_ps(d0, _mm256_mul_ps(t0, t0));
4436+
#endif
4437+
}
4438+
_mm256_store_ps(buf, d0);
4439+
d = buf[0] + buf[1] + buf[2] + buf[3] + buf[4] + buf[5] + buf[6] + buf[7];
4440+
#elif CV_SSE
44264441
float CV_DECL_ALIGNED(16) buf[4];
44274442
__m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
44284443

0 commit comments

Comments
 (0)