Skip to content

Commit 820fdbf

Browse files
committed
Brush up the AVX optimization in popcount
- Make sure the SIMD optimization works even when AVX is not available
1 parent aa5caf8 commit 820fdbf

File tree

1 file changed

+18
-8
lines changed

1 file changed

+18
-8
lines changed

modules/core/src/stat.cpp

Lines changed: 18 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -4261,7 +4261,9 @@ int normHamming(const uchar* a, int n)
42614261
_r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
42624262
result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
42634263
}
4264-
#elif CV_POPCNT
4264+
#endif // CV_AVX2
4265+
4266+
#if CV_POPCNT
42654267
if(checkHardwareSupport(CV_CPU_POPCNT))
42664268
{
42674269
# if defined CV_POPCNT_U64
@@ -4275,17 +4277,20 @@ int normHamming(const uchar* a, int n)
42754277
result += CV_POPCNT_U32(*(uint*)(a + i));
42764278
}
42774279
}
4278-
#elif CV_SIMD128
4280+
#endif // CV_POPCNT
4281+
4282+
#if CV_SIMD128
42794283
if(hasSIMD128())
42804284
{
42814285
v_uint32x4 t = v_setzero_u32();
42824286
for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
42834287
{
42844288
t += v_popcount(v_load(a + i));
42854289
}
4286-
result = v_reduce_sum(t);
4290+
result += v_reduce_sum(t);
42874291
}
4288-
#endif
4292+
#endif // CV_SIMD128
4293+
42894294
for(; i <= n - 4; i += 4)
42904295
{
42914296
result += popCountTable[a[i]] + popCountTable[a[i+1]] +
@@ -4327,7 +4332,9 @@ int normHamming(const uchar* a, const uchar* b, int n)
43274332
_r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
43284333
result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
43294334
}
4330-
#elif CV_POPCNT
4335+
#endif // CV_AVX2
4336+
4337+
#if CV_POPCNT
43314338
if(checkHardwareSupport(CV_CPU_POPCNT))
43324339
{
43334340
# if defined CV_POPCNT_U64
@@ -4341,17 +4348,20 @@ int normHamming(const uchar* a, const uchar* b, int n)
43414348
result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));
43424349
}
43434350
}
4344-
#elif CV_SIMD128
4351+
#endif // CV_POPCNT
4352+
4353+
#if CV_SIMD128
43454354
if(hasSIMD128())
43464355
{
43474356
v_uint32x4 t = v_setzero_u32();
43484357
for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
43494358
{
43504359
t += v_popcount(v_load(a + i) ^ v_load(b + i));
43514360
}
4352-
result = v_reduce_sum(t);
4361+
result += v_reduce_sum(t);
43534362
}
4354-
#endif
4363+
#endif // CV_SIMD128
4364+
43554365
for(; i <= n - 4; i += 4)
43564366
{
43574367
result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +

0 commit comments

Comments
 (0)