Skip to content

Commit 85afbd4

Browse files
committed
core(stat): move implementations into .hpp file w/o changes
1 parent 03c3e0e commit 85afbd4

File tree

2 files changed

+170
-150
lines changed

2 files changed

+170
-150
lines changed

modules/core/src/stat.cpp

Lines changed: 0 additions & 150 deletions
Original file line numberDiff line numberDiff line change
@@ -4269,156 +4269,6 @@ static const uchar popCountTable4[] =
42694269
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
42704270
};
42714271

4272-
#if CV_AVX2
4273-
// Returns 32-bit element i of reg.
// The register is written to a local array declared with CV_DECL_ALIGNED(32),
// and the array is then indexed with the run-time value i.
// i must satisfy 0 <= i < 8; this is checked with CV_DbgAssert.
static inline int _mm256_extract_epi32_(__m256i reg, const int i)
4274-
{
4275-
CV_DECL_ALIGNED(32) int reg_data[8];
4276-
CV_DbgAssert(0 <= i && i < 8);
4277-
_mm256_store_si256((__m256i*)reg_data, reg);
4278-
return reg_data[i];
4279-
}
4280-
#endif
4281-
4282-
// Returns the sum of the per-byte bit counts of the n bytes starting at a.
// The work is split into stages that are compiled in and/or selected at run
// time (AVX2, POPCNT, SIMD128, unrolled table lookup, scalar table lookup).
// All stages share the index i, so each stage resumes where the previous one
// stopped and the final scalar loop handles whatever is left.
// popCountTable is defined outside this block; presumably popCountTable[v] is
// the number of set bits in byte v - verify against its definition.
int normHamming(const uchar* a, int n)
4283-
{
4284-
int i = 0;
4285-
int result = 0;
4286-
#if CV_AVX2
4287-
if(USE_AVX2)
4288-
{
4289-
__m256i _r0 = _mm256_setzero_si256();
4290-
__m256i _0 = _mm256_setzero_si256();
4291-
// Bit counts of the 16 possible nibble values, repeated for both 128-bit halves.
__m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
4292-
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
4293-
__m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
4294-
4295-
// 32 input bytes per iteration.
for(; i <= n - 32; i+= 32)
4296-
{
4297-
__m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i));
4298-
4299-
// Table lookup on the low nibble of every byte.
__m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_a0, _popcnt_mask));
4300-
// Table lookup on the high nibble: shift right by 4, then mask.
__m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table,
4301-
_mm256_and_si256(_mm256_srli_epi16(_a0, 4), _popcnt_mask));
4302-
4303-
// Add both nibble counts per byte, sum the bytes via SAD against zero, accumulate.
_r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1)));
4304-
}
4305-
// Fold the partial sums so that element 0 holds the total.
_r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
4306-
// result is still 0 at this point, so plain assignment loses nothing.
result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
4307-
}
4308-
#endif // CV_AVX2
4309-
4310-
#if CV_POPCNT
4311-
if(checkHardwareSupport(CV_CPU_POPCNT))
4312-
{
4313-
# if defined CV_POPCNT_U64
4314-
// 8 bytes per iteration.
for(; i <= n - 8; i += 8)
4315-
{
4316-
// NOTE(review): reads a 64-bit word through a cast of a byte pointer (the cast
// also drops const); nothing in this block establishes the alignment of a + i -
// confirm this is acceptable on every target where CV_POPCNT is enabled.
result += (int)CV_POPCNT_U64(*(uint64*)(a + i));
4317-
}
4318-
# endif
4319-
// 4 bytes per iteration.
for(; i <= n - 4; i += 4)
4320-
{
4321-
// NOTE(review): same cast-pointer load as above, 32 bits wide.
result += CV_POPCNT_U32(*(uint*)(a + i));
4322-
}
4323-
}
4324-
#endif // CV_POPCNT
4325-
4326-
#if CV_SIMD128
4327-
if(hasSIMD128())
4328-
{
4329-
v_uint32x4 t = v_setzero_u32();
4330-
// v_uint8x16::nlanes bytes per iteration; counts are accumulated in t.
for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
4331-
{
4332-
t += v_popcount(v_load(a + i));
4333-
}
4334-
result += v_reduce_sum(t);
4335-
}
4336-
#endif // CV_SIMD128
4337-
#if CV_ENABLE_UNROLLED
4338-
// Four table lookups per iteration.
for(; i <= n - 4; i += 4)
4339-
{
4340-
result += popCountTable[a[i]] + popCountTable[a[i+1]] +
4341-
popCountTable[a[i+2]] + popCountTable[a[i+3]];
4342-
}
4343-
#endif
4344-
// Scalar tail: one table lookup per remaining byte.
for(; i < n; i++)
4345-
{
4346-
result += popCountTable[a[i]];
4347-
}
4348-
return result;
4349-
}
4350-
4351-
// Returns the sum of the per-byte bit counts of a[k] ^ b[k] for k in [0, n),
// i.e. the bit counts of the XOR of the two n-byte buffers a and b.
// The work is split into stages that are compiled in and/or selected at run
// time (AVX2, POPCNT, SIMD128, unrolled table lookup, scalar table lookup).
// All stages share the index i, so each stage resumes where the previous one
// stopped and the final scalar loop handles whatever is left.
// popCountTable is defined outside this block; presumably popCountTable[v] is
// the number of set bits in byte v - verify against its definition.
int normHamming(const uchar* a, const uchar* b, int n)
4352-
{
4353-
int i = 0;
4354-
int result = 0;
4355-
#if CV_AVX2
4356-
if(USE_AVX2)
4357-
{
4358-
__m256i _r0 = _mm256_setzero_si256();
4359-
__m256i _0 = _mm256_setzero_si256();
4360-
// Bit counts of the 16 possible nibble values, repeated for both 128-bit halves.
__m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
4361-
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
4362-
__m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
4363-
4364-
// 32 bytes of each input per iteration.
for(; i <= n - 32; i+= 32)
4365-
{
4366-
__m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i));
4367-
__m256i _b0 = _mm256_loadu_si256((const __m256i*)(b + i));
4368-
4369-
// Bits that differ between the two inputs.
__m256i _xor = _mm256_xor_si256(_a0, _b0);
4370-
4371-
// Table lookup on the low nibble of every byte.
__m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_xor, _popcnt_mask));
4372-
// Table lookup on the high nibble: shift right by 4, then mask.
__m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table,
4373-
_mm256_and_si256(_mm256_srli_epi16(_xor, 4), _popcnt_mask));
4374-
4375-
// Add both nibble counts per byte, sum the bytes via SAD against zero, accumulate.
_r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1)));
4376-
}
4377-
// Fold the partial sums so that element 0 holds the total.
_r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
4378-
// result is still 0 at this point, so plain assignment loses nothing.
result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
4379-
}
4380-
#endif // CV_AVX2
4381-
4382-
#if CV_POPCNT
4383-
if(checkHardwareSupport(CV_CPU_POPCNT))
4384-
{
4385-
# if defined CV_POPCNT_U64
4386-
// 8 bytes per iteration.
for(; i <= n - 8; i += 8)
4387-
{
4388-
// NOTE(review): reads 64-bit words through casts of byte pointers (the casts
// also drop const); nothing in this block establishes the alignment of a + i
// or b + i - confirm this is acceptable on every target where CV_POPCNT is enabled.
result += (int)CV_POPCNT_U64(*(uint64*)(a + i) ^ *(uint64*)(b + i));
4389-
}
4390-
# endif
4391-
// 4 bytes per iteration.
for(; i <= n - 4; i += 4)
4392-
{
4393-
// NOTE(review): same cast-pointer loads as above, 32 bits wide.
result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));
4394-
}
4395-
}
4396-
#endif // CV_POPCNT
4397-
4398-
#if CV_SIMD128
4399-
if(hasSIMD128())
4400-
{
4401-
v_uint32x4 t = v_setzero_u32();
4402-
// v_uint8x16::nlanes bytes per iteration; counts are accumulated in t.
for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
4403-
{
4404-
t += v_popcount(v_load(a + i) ^ v_load(b + i));
4405-
}
4406-
result += v_reduce_sum(t);
4407-
}
4408-
#endif // CV_SIMD128
4409-
#if CV_ENABLE_UNROLLED
4410-
// Four table lookups per iteration.
for(; i <= n - 4; i += 4)
4411-
{
4412-
result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
4413-
popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
4414-
}
4415-
#endif
4416-
// Scalar tail: one table lookup per remaining byte pair.
for(; i < n; i++)
4417-
{
4418-
result += popCountTable[a[i] ^ b[i]];
4419-
}
4420-
return result;
4421-
}
44224272

44234273
int normHamming(const uchar* a, int n, int cellSize)
44244274
{

modules/core/src/stat.simd.hpp

Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
// This file is part of OpenCV project.
2+
// It is subject to the license terms in the LICENSE file found in the top-level directory
3+
// of this distribution and at http://opencv.org/license.html.
4+
5+
#include "opencv2/core/hal/intrin.hpp"
6+
7+
namespace cv { namespace hal {
8+
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
9+
10+
// forward declarations
11+
int normHamming(const uchar* a, int n);
12+
int normHamming(const uchar* a, const uchar* b, int n);
13+
14+
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
15+
16+
#if CV_AVX2
17+
// Returns 32-bit element i of reg.
// The register is written to a local array declared with CV_DECL_ALIGNED(32),
// and the array is then indexed with the run-time value i.
// i must satisfy 0 <= i < 8; this is checked with CV_DbgAssert.
static inline int _mm256_extract_epi32_(__m256i reg, const int i)
18+
{
19+
CV_DECL_ALIGNED(32) int reg_data[8];
20+
CV_DbgAssert(0 <= i && i < 8);
21+
_mm256_store_si256((__m256i*)reg_data, reg);
22+
return reg_data[i];
23+
}
24+
#endif
25+
26+
// Returns the sum of the per-byte bit counts of the n bytes starting at a.
// The work is split into stages that are compiled in and/or selected at run
// time (AVX2, POPCNT, SIMD128, unrolled table lookup, scalar table lookup).
// All stages share the index i, so each stage resumes where the previous one
// stopped and the final scalar loop handles whatever is left.
// popCountTable is not defined in this block; presumably popCountTable[v] is
// the number of set bits in byte v - verify against its definition.
int normHamming(const uchar* a, int n)
27+
{
28+
int i = 0;
29+
int result = 0;
30+
#if CV_AVX2
31+
if(USE_AVX2)
32+
{
33+
__m256i _r0 = _mm256_setzero_si256();
34+
__m256i _0 = _mm256_setzero_si256();
35+
// Bit counts of the 16 possible nibble values, repeated for both 128-bit halves.
__m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
36+
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
37+
__m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
38+
39+
// 32 input bytes per iteration.
for(; i <= n - 32; i+= 32)
40+
{
41+
__m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i));
42+
43+
// Table lookup on the low nibble of every byte.
__m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_a0, _popcnt_mask));
44+
// Table lookup on the high nibble: shift right by 4, then mask.
__m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table,
45+
_mm256_and_si256(_mm256_srli_epi16(_a0, 4), _popcnt_mask));
46+
47+
// Add both nibble counts per byte, sum the bytes via SAD against zero, accumulate.
_r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1)));
48+
}
49+
// Fold the partial sums so that element 0 holds the total.
_r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
50+
// result is still 0 at this point, so plain assignment loses nothing.
result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
51+
}
52+
#endif // CV_AVX2
53+
54+
#if CV_POPCNT
55+
if(checkHardwareSupport(CV_CPU_POPCNT))
56+
{
57+
# if defined CV_POPCNT_U64
58+
// 8 bytes per iteration.
for(; i <= n - 8; i += 8)
59+
{
60+
// NOTE(review): reads a 64-bit word through a cast of a byte pointer (the cast
// also drops const); nothing in this block establishes the alignment of a + i -
// confirm this is acceptable on every target where CV_POPCNT is enabled.
result += (int)CV_POPCNT_U64(*(uint64*)(a + i));
61+
}
62+
# endif
63+
// 4 bytes per iteration.
for(; i <= n - 4; i += 4)
64+
{
65+
// NOTE(review): same cast-pointer load as above, 32 bits wide.
result += CV_POPCNT_U32(*(uint*)(a + i));
66+
}
67+
}
68+
#endif // CV_POPCNT
69+
70+
#if CV_SIMD128
71+
if(hasSIMD128())
72+
{
73+
v_uint32x4 t = v_setzero_u32();
74+
// v_uint8x16::nlanes bytes per iteration; counts are accumulated in t.
for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
75+
{
76+
t += v_popcount(v_load(a + i));
77+
}
78+
result += v_reduce_sum(t);
79+
}
80+
#endif // CV_SIMD128
81+
#if CV_ENABLE_UNROLLED
82+
// Four table lookups per iteration.
for(; i <= n - 4; i += 4)
83+
{
84+
result += popCountTable[a[i]] + popCountTable[a[i+1]] +
85+
popCountTable[a[i+2]] + popCountTable[a[i+3]];
86+
}
87+
#endif
88+
// Scalar tail: one table lookup per remaining byte.
for(; i < n; i++)
89+
{
90+
result += popCountTable[a[i]];
91+
}
92+
return result;
93+
}
94+
95+
// Returns the sum of the per-byte bit counts of a[k] ^ b[k] for k in [0, n),
// i.e. the bit counts of the XOR of the two n-byte buffers a and b.
// The work is split into stages that are compiled in and/or selected at run
// time (AVX2, POPCNT, SIMD128, unrolled table lookup, scalar table lookup).
// All stages share the index i, so each stage resumes where the previous one
// stopped and the final scalar loop handles whatever is left.
// popCountTable is not defined in this block; presumably popCountTable[v] is
// the number of set bits in byte v - verify against its definition.
int normHamming(const uchar* a, const uchar* b, int n)
96+
{
97+
int i = 0;
98+
int result = 0;
99+
#if CV_AVX2
100+
if(USE_AVX2)
101+
{
102+
__m256i _r0 = _mm256_setzero_si256();
103+
__m256i _0 = _mm256_setzero_si256();
104+
// Bit counts of the 16 possible nibble values, repeated for both 128-bit halves.
__m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
105+
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
106+
__m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
107+
108+
// 32 bytes of each input per iteration.
for(; i <= n - 32; i+= 32)
109+
{
110+
__m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i));
111+
__m256i _b0 = _mm256_loadu_si256((const __m256i*)(b + i));
112+
113+
// Bits that differ between the two inputs.
__m256i _xor = _mm256_xor_si256(_a0, _b0);
114+
115+
// Table lookup on the low nibble of every byte.
__m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_xor, _popcnt_mask));
116+
// Table lookup on the high nibble: shift right by 4, then mask.
__m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table,
117+
_mm256_and_si256(_mm256_srli_epi16(_xor, 4), _popcnt_mask));
118+
119+
// Add both nibble counts per byte, sum the bytes via SAD against zero, accumulate.
_r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1)));
120+
}
121+
// Fold the partial sums so that element 0 holds the total.
_r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
122+
// result is still 0 at this point, so plain assignment loses nothing.
result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
123+
}
124+
#endif // CV_AVX2
125+
126+
#if CV_POPCNT
127+
if(checkHardwareSupport(CV_CPU_POPCNT))
128+
{
129+
# if defined CV_POPCNT_U64
130+
// 8 bytes per iteration.
for(; i <= n - 8; i += 8)
131+
{
132+
// NOTE(review): reads 64-bit words through casts of byte pointers (the casts
// also drop const); nothing in this block establishes the alignment of a + i
// or b + i - confirm this is acceptable on every target where CV_POPCNT is enabled.
result += (int)CV_POPCNT_U64(*(uint64*)(a + i) ^ *(uint64*)(b + i));
133+
}
134+
# endif
135+
// 4 bytes per iteration.
for(; i <= n - 4; i += 4)
136+
{
137+
// NOTE(review): same cast-pointer loads as above, 32 bits wide.
result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));
138+
}
139+
}
140+
#endif // CV_POPCNT
141+
142+
#if CV_SIMD128
143+
if(hasSIMD128())
144+
{
145+
v_uint32x4 t = v_setzero_u32();
146+
// v_uint8x16::nlanes bytes per iteration; counts are accumulated in t.
for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
147+
{
148+
t += v_popcount(v_load(a + i) ^ v_load(b + i));
149+
}
150+
result += v_reduce_sum(t);
151+
}
152+
#endif // CV_SIMD128
153+
#if CV_ENABLE_UNROLLED
154+
// Four table lookups per iteration.
for(; i <= n - 4; i += 4)
155+
{
156+
result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
157+
popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
158+
}
159+
#endif
160+
// Scalar tail: one table lookup per remaining byte pair.
for(; i < n; i++)
161+
{
162+
result += popCountTable[a[i] ^ b[i]];
163+
}
164+
return result;
165+
}
166+
167+
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
168+
169+
CV_CPU_OPTIMIZATION_NAMESPACE_END
170+
}} //cv::hal

0 commit comments

Comments
 (0)