Skip to content

Commit 4e39d03

Browse files
authored
Merge pull request opencv#9074 from alalek:cpu_dispatch_core_hamming
cpu dispatch(core): hamming
2 parents 11feae6 + b3f5e3b commit 4e39d03

File tree

6 files changed

+210
-162
lines changed

6 files changed

+210
-162
lines changed

cmake/OpenCVCompilerOptimizations.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,7 @@ if(X86 OR X86_64)
238238
endif()
239239

240240
if(NOT DEFINED CPU_DISPATCH)
241-
set(CPU_DISPATCH "SSE4_1;AVX;FP16;AVX2" CACHE STRING "${HELP_CPU_DISPATCH}")
241+
set(CPU_DISPATCH "SSE4_1;SSE4_2;AVX;FP16;AVX2" CACHE STRING "${HELP_CPU_DISPATCH}")
242242
endif()
243243

244244
if(NOT DEFINED CPU_BASELINE)

modules/core/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
set(the_description "The Core Functionality")
22

33
ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2)
4+
ocv_add_dispatched_file(stat SSE4_2 AVX2)
45

56
ocv_add_module(core
67
"${OPENCV_HAL_LINKER_LIBS}"

modules/core/include/opencv2/core/cv_cpu_dispatch.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,12 @@ struct VZeroUpperGuard {
111111
#define __CV_AVX_GUARD VZeroUpperGuard __vzeroupper_guard; (void)__vzeroupper_guard;
112112
#endif
113113

114+
// CV_AVX_GUARD: alias for __CV_AVX_GUARD when that macro is defined (above it
// expands to a local VZeroUpperGuard declaration); otherwise it expands to
// nothing, so code can use CV_AVX_GUARD unconditionally.
#ifdef __CV_AVX_GUARD
115+
#define CV_AVX_GUARD __CV_AVX_GUARD
116+
#else
117+
#define CV_AVX_GUARD
118+
#endif
119+
114120
#endif // __OPENCV_BUILD
115121

116122

modules/core/src/stat.cpp

Lines changed: 3 additions & 161 deletions
Original file line numberDiff line numberDiff line change
@@ -53,16 +53,6 @@
5353
namespace cv
5454
{
5555

56-
// Converts a (possibly multi-channel) value `v` into a Scalar.
// The object is reinterpreted as an array of DataType<T>::channel_type and its
// first DataType<T>::channels elements are copied into s.val[0..n-1].
// Entries of `s` past n are not written here and keep whatever the
// default-constructed Scalar holds.
template<typename T> static inline Scalar rawToScalar(const T& v)
57-
{
58-
Scalar s;
59-
typedef typename DataType<T>::channel_type T1;
60-
int i, n = DataType<T>::channels;
61-
for( i = 0; i < n; i++ )
62-
s.val[i] = ((T1*)&v)[i];
63-
return s;
64-
}
65-
6656
/****************************************************************************************\
6757
* sum *
6858
\****************************************************************************************/
@@ -4249,7 +4239,7 @@ cvNorm( const void* imgA, const void* imgB, int normType, const void* maskarr )
42494239

42504240
namespace cv { namespace hal {
42514241

4252-
static const uchar popCountTable[] =
4242+
extern const uchar popCountTable[256] =
42534243
{
42544244
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
42554245
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
@@ -4285,154 +4275,6 @@ static const uchar popCountTable4[] =
42854275
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
42864276
};
42874277

4288-
#if CV_AVX2
4289-
// Returns the 32-bit element with index `i` (0..7, checked by a debug-only
// assertion) of the 256-bit register `reg`.
// The register is spilled to a local array of 8 ints and then indexed, so `i`
// does not have to be a compile-time constant.
static inline int _mm256_extract_epi32_(__m256i reg, const int i)
4290-
{
4291-
// _mm256_store_si256 is the aligned store, hence the 32-byte aligned buffer.
CV_DECL_ALIGNED(32) int reg_data[8];
4292-
CV_DbgAssert(0 <= i && i < 8);
4293-
_mm256_store_si256((__m256i*)reg_data, reg);
4294-
return reg_data[i];
4295-
}
4296-
#endif
4297-
4298-
// Counts the set bits in the `n` bytes starting at `a` and returns the total.
//
// The function is a cascade of stages (AVX2, POPCNT, 128-bit universal
// intrinsics, table lookup). All stages share the running index `i`, so each
// one continues where the previous one stopped and the scalar loops at the
// end handle whatever is left.
int normHamming(const uchar* a, int n)
4299-
{
4300-
int i = 0;
4301-
int result = 0;
4302-
// AVX2 stage: 32 bytes per iteration. Each byte is split into its low and
// high nibble; the per-nibble bit count comes from a 16-entry table applied
// with _mm256_shuffle_epi8. The table is repeated twice because the byte
// shuffle works independently inside each 128-bit half.
#if CV_AVX2
4303-
if(USE_AVX2)
4304-
{
4305-
__m256i _r0 = _mm256_setzero_si256();
4306-
__m256i _0 = _mm256_setzero_si256();
4307-
__m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
4308-
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
4309-
__m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
4310-
4311-
for(; i <= n - 32; i+= 32)
4312-
{
4313-
__m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i));
4314-
4315-
// _popc0: counts for the low nibbles, _popc1: counts for the high nibbles.
__m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_a0, _popcnt_mask));
4316-
__m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table,
4317-
_mm256_and_si256(_mm256_srli_epi16(_a0, 4), _popcnt_mask));
4318-
4319-
// SAD against zero adds up the per-byte counts of every 8-byte group.
_r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1)));
4320-
}
4321-
// Horizontal reduction: fold the two partial sums of each 128-bit half into
// element 0, add the swapped halves, then read element 0.
// `result` is still 0 at this point, so plain assignment is sufficient.
_r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
4322-
result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
4323-
}
4324-
#endif // CV_AVX2
4325-
4326-
// POPCNT stage (guarded by a runtime CPU check): 8 bytes per iteration when
// CV_POPCNT_U64 is defined, then 4 bytes per iteration.
// NOTE(review): the loads go through a uint64*/uint* cast of the byte pointer
// and drop const; alignment of `a + i` is not checked here.
#if CV_POPCNT
4327-
if(checkHardwareSupport(CV_CPU_POPCNT))
4328-
{
4329-
# if defined CV_POPCNT_U64
4330-
for(; i <= n - 8; i += 8)
4331-
{
4332-
result += (int)CV_POPCNT_U64(*(uint64*)(a + i));
4333-
}
4334-
# endif
4335-
for(; i <= n - 4; i += 4)
4336-
{
4337-
result += CV_POPCNT_U32(*(uint*)(a + i));
4338-
}
4339-
}
4340-
#endif // CV_POPCNT
4341-
4342-
// Universal-intrinsics stage: v_uint8x16::nlanes bytes per iteration,
// accumulated in `t` and reduced to a single sum after the loop.
#if CV_SIMD128
4343-
if(hasSIMD128())
4344-
{
4345-
v_uint32x4 t = v_setzero_u32();
4346-
for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
4347-
{
4348-
t += v_popcount(v_load(a + i));
4349-
}
4350-
result += v_reduce_sum(t);
4351-
}
4352-
#endif // CV_SIMD128
4353-
4354-
// Scalar fallback via popCountTable, unrolled four bytes at a time.
for(; i <= n - 4; i += 4)
4355-
{
4356-
result += popCountTable[a[i]] + popCountTable[a[i+1]] +
4357-
popCountTable[a[i+2]] + popCountTable[a[i+3]];
4358-
}
4359-
// Remaining bytes, one at a time.
for(; i < n; i++)
4360-
{
4361-
result += popCountTable[a[i]];
4362-
}
4363-
return result;
4364-
}
4365-
4366-
// Hamming distance between two byte buffers: counts the set bits of
// a[k] ^ b[k] over the first `n` bytes of `a` and `b` and returns the total.
//
// Same cascade as the single-buffer overload (AVX2, POPCNT, 128-bit universal
// intrinsics, table lookup); the only difference is the XOR of the two inputs
// before counting. All stages share the running index `i`.
int normHamming(const uchar* a, const uchar* b, int n)
4367-
{
4368-
int i = 0;
4369-
int result = 0;
4370-
// AVX2 stage: 32 bytes per iteration, nibble-wise table lookup with
// _mm256_shuffle_epi8. The 16-entry table is repeated because the byte
// shuffle works independently inside each 128-bit half.
#if CV_AVX2
4371-
if(USE_AVX2)
4372-
{
4373-
__m256i _r0 = _mm256_setzero_si256();
4374-
__m256i _0 = _mm256_setzero_si256();
4375-
__m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
4376-
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
4377-
__m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
4378-
4379-
for(; i <= n - 32; i+= 32)
4380-
{
4381-
__m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i));
4382-
__m256i _b0 = _mm256_loadu_si256((const __m256i*)(b + i));
4383-
4384-
// Bits that differ between the two inputs.
__m256i _xor = _mm256_xor_si256(_a0, _b0);
4385-
4386-
// _popc0: counts for the low nibbles, _popc1: counts for the high nibbles.
__m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_xor, _popcnt_mask));
4387-
__m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table,
4388-
_mm256_and_si256(_mm256_srli_epi16(_xor, 4), _popcnt_mask));
4389-
4390-
// SAD against zero adds up the per-byte counts of every 8-byte group.
_r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1)));
4391-
}
4392-
// Horizontal reduction: fold the two partial sums of each 128-bit half into
// element 0, add the swapped halves, then read element 0.
// `result` is still 0 at this point, so plain assignment is sufficient.
_r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
4393-
result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
4394-
}
4395-
#endif // CV_AVX2
4396-
4397-
// POPCNT stage (guarded by a runtime CPU check): 8 bytes per iteration when
// CV_POPCNT_U64 is defined, then 4 bytes per iteration.
// NOTE(review): the loads go through uint64*/uint* casts of the byte pointers
// and drop const; alignment of `a + i` / `b + i` is not checked here.
#if CV_POPCNT
4398-
if(checkHardwareSupport(CV_CPU_POPCNT))
4399-
{
4400-
# if defined CV_POPCNT_U64
4401-
for(; i <= n - 8; i += 8)
4402-
{
4403-
result += (int)CV_POPCNT_U64(*(uint64*)(a + i) ^ *(uint64*)(b + i));
4404-
}
4405-
# endif
4406-
for(; i <= n - 4; i += 4)
4407-
{
4408-
result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));
4409-
}
4410-
}
4411-
#endif // CV_POPCNT
4412-
4413-
// Universal-intrinsics stage: v_uint8x16::nlanes bytes per iteration,
// accumulated in `t` and reduced to a single sum after the loop.
#if CV_SIMD128
4414-
if(hasSIMD128())
4415-
{
4416-
v_uint32x4 t = v_setzero_u32();
4417-
for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
4418-
{
4419-
t += v_popcount(v_load(a + i) ^ v_load(b + i));
4420-
}
4421-
result += v_reduce_sum(t);
4422-
}
4423-
#endif // CV_SIMD128
4424-
4425-
// Scalar fallback via popCountTable, unrolled four bytes at a time.
for(; i <= n - 4; i += 4)
4426-
{
4427-
result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
4428-
popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
4429-
}
4430-
// Remaining bytes, one at a time.
for(; i < n; i++)
4431-
{
4432-
result += popCountTable[a[i] ^ b[i]];
4433-
}
4434-
return result;
4435-
}
44364278

44374279
int normHamming(const uchar* a, int n, int cellSize)
44384280
{
@@ -4469,11 +4311,11 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
44694311
return -1;
44704312
int i = 0;
44714313
int result = 0;
4472-
#if CV_ENABLE_UNROLLED
4314+
#if CV_ENABLE_UNROLLED
44734315
for( ; i <= n - 4; i += 4 )
44744316
result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
44754317
tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]];
4476-
#endif
4318+
#endif
44774319
for( ; i < n; i++ )
44784320
result += tab[a[i] ^ b[i]];
44794321
return result;

modules/core/src/stat.dispatch.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
// This file is part of OpenCV project.
2+
// It is subject to the license terms in the LICENSE file found in the top-level directory
3+
// of this distribution and at http://opencv.org/license.html.
4+
5+
#include "precomp.hpp"
6+
7+
#include "stat.simd.hpp"
8+
#include "stat.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
9+
10+
// Dispatch wrappers for the two cv::hal::normHamming overloads.
// The bodies contain no computation of their own: each call is forwarded via
// CV_CPU_DISPATCH to one of the implementations listed in
// CV_CPU_DISPATCH_MODES_ALL.
namespace cv { namespace hal {
11+
12+
// Single-buffer overload: forwards (a, n) to the selected implementation.
// NOTE(review): there is no explicit return statement here; presumably
// CV_CPU_DISPATCH expands to one -- confirm against the macro definition.
int normHamming(const uchar* a, int n)
13+
{
14+
CV_INSTRUMENT_REGION()
15+
16+
CV_CPU_DISPATCH(normHamming, (a, n),
17+
CV_CPU_DISPATCH_MODES_ALL);
18+
}
19+
20+
// Two-buffer overload: forwards (a, b, n) to the selected implementation.
// NOTE(review): same remark as above regarding the missing explicit return.
int normHamming(const uchar* a, const uchar* b, int n)
21+
{
22+
CV_INSTRUMENT_REGION()
23+
24+
CV_CPU_DISPATCH(normHamming, (a, b, n),
25+
CV_CPU_DISPATCH_MODES_ALL);
26+
}
27+
28+
}} //cv::hal

0 commit comments

Comments
 (0)