Skip to content

Commit 1d01e43

Browse files
committed
Use universal intrinsics in FAST
1 parent bb5c40d commit 1d01e43

File tree

1 file changed

+62
-56
lines changed

1 file changed

+62
-56
lines changed

modules/features2d/src/fast.cpp

Lines changed: 62 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ The references are:
4444
#include "precomp.hpp"
4545
#include "fast_score.hpp"
4646
#include "opencl_kernels_features2d.hpp"
47+
#include "opencv2/core/hal/intrin.hpp"
4748

4849
#include "opencv2/core/openvx/ovx_defs.hpp"
4950
#if defined _MSC_VER
@@ -58,9 +59,10 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
5859
{
5960
Mat img = _img.getMat();
6061
const int K = patternSize/2, N = patternSize + K + 1;
61-
#if CV_SSE2
62+
#if CV_SIMD128
6263
const int quarterPatternSize = patternSize/4;
63-
(void)quarterPatternSize;
64+
v_uint8x16 delta = v_setall_u8(0x80), t = v_setall_u8((char)threshold), K16 = v_setall_u8((char)K);
65+
bool hasSimd = hasSIMD128();
6466
#endif
6567
int i, j, k, pixel[25];
6668
makeOffsets(pixel, (int)img.step, patternSize);
@@ -69,12 +71,6 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
6971

7072
threshold = std::min(std::max(threshold, 0), 255);
7173

72-
#if CV_SSE2
73-
__m128i delta = _mm_set1_epi8(-128), t = _mm_set1_epi8((char)threshold), K16 = _mm_set1_epi8((char)K);
74-
(void)K16;
75-
(void)delta;
76-
(void)t;
77-
#endif
7874
uchar threshold_tab[512];
7975
for( i = -255; i <= 255; i++ )
8076
threshold_tab[i+255] = (uchar)(i < -threshold ? 1 : i > threshold ? 2 : 0);
@@ -99,66 +95,76 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
9995
if( i < img.rows - 3 )
10096
{
10197
j = 3;
102-
#if CV_SSE2
103-
if( patternSize == 16 )
98+
#if CV_SIMD128
99+
if( hasSimd )
104100
{
105-
for(; j < img.cols - 16 - 3; j += 16, ptr += 16)
101+
if( patternSize == 16 )
106102
{
107-
__m128i m0, m1;
108-
__m128i v0 = _mm_loadu_si128((const __m128i*)ptr);
109-
__m128i v1 = _mm_xor_si128(_mm_subs_epu8(v0, t), delta);
110-
v0 = _mm_xor_si128(_mm_adds_epu8(v0, t), delta);
111-
112-
__m128i x0 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[0])), delta);
113-
__m128i x1 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[quarterPatternSize])), delta);
114-
__m128i x2 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[2*quarterPatternSize])), delta);
115-
__m128i x3 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[3*quarterPatternSize])), delta);
116-
m0 = _mm_and_si128(_mm_cmpgt_epi8(x0, v0), _mm_cmpgt_epi8(x1, v0));
117-
m1 = _mm_and_si128(_mm_cmpgt_epi8(v1, x0), _mm_cmpgt_epi8(v1, x1));
118-
m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x1, v0), _mm_cmpgt_epi8(x2, v0)));
119-
m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x1), _mm_cmpgt_epi8(v1, x2)));
120-
m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x2, v0), _mm_cmpgt_epi8(x3, v0)));
121-
m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x2), _mm_cmpgt_epi8(v1, x3)));
122-
m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x3, v0), _mm_cmpgt_epi8(x0, v0)));
123-
m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x3), _mm_cmpgt_epi8(v1, x0)));
124-
m0 = _mm_or_si128(m0, m1);
125-
int mask = _mm_movemask_epi8(m0);
126-
if( mask == 0 )
127-
continue;
128-
if( (mask & 255) == 0 )
103+
for(; j < img.cols - 16 - 3; j += 16, ptr += 16)
129104
{
130-
j -= 8;
131-
ptr -= 8;
132-
continue;
133-
}
105+
v_uint8x16 v = v_load(ptr);
106+
v_int8x16 v0 = v_reinterpret_as_s8((v + t) ^ delta);
107+
v_int8x16 v1 = v_reinterpret_as_s8((v - t) ^ delta);
108+
109+
v_int8x16 x0 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[0]), delta));
110+
v_int8x16 x1 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[quarterPatternSize]), delta));
111+
v_int8x16 x2 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[2*quarterPatternSize]), delta));
112+
v_int8x16 x3 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[3*quarterPatternSize]), delta));
113+
114+
v_int8x16 m0, m1;
115+
m0 = (v0 < x0) & (v0 < x1);
116+
m1 = (x0 < v1) & (x1 < v1);
117+
m0 = m0 | ((v0 < x1) & (v0 < x2));
118+
m1 = m1 | ((x1 < v1) & (x2 < v1));
119+
m0 = m0 | ((v0 < x2) & (v0 < x3));
120+
m1 = m1 | ((x2 < v1) & (x3 < v1));
121+
m0 = m0 | ((v0 < x3) & (v0 < x0));
122+
m1 = m1 | ((x3 < v1) & (x0 < v1));
123+
m0 = m0 | m1;
124+
125+
int mask = v_signmask(m0);
126+
if( mask == 0 )
127+
continue;
128+
if( (mask & 255) == 0 )
129+
{
130+
j -= 8;
131+
ptr -= 8;
132+
continue;
133+
}
134134

135-
__m128i c0 = _mm_setzero_si128(), c1 = c0, max0 = c0, max1 = c0;
136-
for( k = 0; k < N; k++ )
137-
{
138-
__m128i x = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(ptr + pixel[k])), delta);
139-
m0 = _mm_cmpgt_epi8(x, v0);
140-
m1 = _mm_cmpgt_epi8(v1, x);
135+
v_int8x16 c0 = v_setzero_s8();
136+
v_int8x16 c1 = v_setzero_s8();
137+
v_uint8x16 max0 = v_setzero_u8();
138+
v_uint8x16 max1 = v_setzero_u8();
139+
for( k = 0; k < N; k++ )
140+
{
141+
v_int8x16 x = v_reinterpret_as_s8(v_load((ptr + pixel[k])) ^ delta);
142+
m0 = v0 < x;
143+
m1 = x < v1;
141144

142-
c0 = _mm_and_si128(_mm_sub_epi8(c0, m0), m0);
143-
c1 = _mm_and_si128(_mm_sub_epi8(c1, m1), m1);
145+
c0 = v_sub_wrap(c0, m0) & m0;
146+
c1 = v_sub_wrap(c1, m1) & m1;
144147

145-
max0 = _mm_max_epu8(max0, c0);
146-
max1 = _mm_max_epu8(max1, c1);
147-
}
148+
max0 = v_max(max0, v_reinterpret_as_u8(c0));
149+
max1 = v_max(max1, v_reinterpret_as_u8(c1));
150+
}
148151

149-
max0 = _mm_max_epu8(max0, max1);
150-
int m = _mm_movemask_epi8(_mm_cmpgt_epi8(max0, K16));
152+
max0 = v_max(max0, max1);
153+
int m = v_signmask(K16 < max0);
151154

152-
for( k = 0; m > 0 && k < 16; k++, m >>= 1 )
153-
if(m & 1)
155+
for( k = 0; m > 0 && k < 16; k++, m >>= 1 )
154156
{
155-
cornerpos[ncorners++] = j+k;
156-
if(nonmax_suppression)
157-
curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold);
157+
if(m & 1)
158+
{
159+
cornerpos[ncorners++] = j+k;
160+
if(nonmax_suppression)
161+
curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold);
162+
}
158163
}
164+
}
159165
}
160166
}
161-
#endif
167+
#endif
162168
for( ; j < img.cols - 3; j++, ptr++ )
163169
{
164170
int v = ptr[0];

0 commit comments

Comments
 (0)