@@ -44,6 +44,7 @@ The references are:
44
44
#include " precomp.hpp"
45
45
#include " fast_score.hpp"
46
46
#include " opencl_kernels_features2d.hpp"
47
+ #include " opencv2/core/hal/intrin.hpp"
47
48
48
49
#include " opencv2/core/openvx/ovx_defs.hpp"
49
50
#if defined _MSC_VER
@@ -58,9 +59,10 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
58
59
{
59
60
Mat img = _img.getMat ();
60
61
const int K = patternSize/2 , N = patternSize + K + 1 ;
61
- #if CV_SSE2
62
+ #if CV_SIMD128
62
63
const int quarterPatternSize = patternSize/4 ;
63
- (void )quarterPatternSize;
64
+ v_uint8x16 delta = v_setall_u8 (0x80 ), t = v_setall_u8 ((char )threshold), K16 = v_setall_u8 ((char )K);
65
+ bool hasSimd = hasSIMD128 ();
64
66
#endif
65
67
int i, j, k, pixel[25 ];
66
68
makeOffsets (pixel, (int )img.step , patternSize);
@@ -69,12 +71,6 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
69
71
70
72
threshold = std::min (std::max (threshold, 0 ), 255 );
71
73
72
- #if CV_SSE2
73
- __m128i delta = _mm_set1_epi8 (-128 ), t = _mm_set1_epi8 ((char )threshold), K16 = _mm_set1_epi8 ((char )K);
74
- (void )K16;
75
- (void )delta;
76
- (void )t;
77
- #endif
78
74
uchar threshold_tab[512 ];
79
75
for ( i = -255 ; i <= 255 ; i++ )
80
76
threshold_tab[i+255 ] = (uchar)(i < -threshold ? 1 : i > threshold ? 2 : 0 );
@@ -99,66 +95,76 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
99
95
if ( i < img.rows - 3 )
100
96
{
101
97
j = 3 ;
102
- #if CV_SSE2
103
- if ( patternSize == 16 )
98
+ #if CV_SIMD128
99
+ if ( hasSimd )
104
100
{
105
- for (; j < img. cols - 16 - 3 ; j += 16 , ptr + = 16 )
101
+ if ( patternSize = = 16 )
106
102
{
107
- __m128i m0, m1;
108
- __m128i v0 = _mm_loadu_si128 ((const __m128i*)ptr);
109
- __m128i v1 = _mm_xor_si128 (_mm_subs_epu8 (v0, t), delta);
110
- v0 = _mm_xor_si128 (_mm_adds_epu8 (v0, t), delta);
111
-
112
- __m128i x0 = _mm_sub_epi8 (_mm_loadu_si128 ((const __m128i*)(ptr + pixel[0 ])), delta);
113
- __m128i x1 = _mm_sub_epi8 (_mm_loadu_si128 ((const __m128i*)(ptr + pixel[quarterPatternSize])), delta);
114
- __m128i x2 = _mm_sub_epi8 (_mm_loadu_si128 ((const __m128i*)(ptr + pixel[2 *quarterPatternSize])), delta);
115
- __m128i x3 = _mm_sub_epi8 (_mm_loadu_si128 ((const __m128i*)(ptr + pixel[3 *quarterPatternSize])), delta);
116
- m0 = _mm_and_si128 (_mm_cmpgt_epi8 (x0, v0), _mm_cmpgt_epi8 (x1, v0));
117
- m1 = _mm_and_si128 (_mm_cmpgt_epi8 (v1, x0), _mm_cmpgt_epi8 (v1, x1));
118
- m0 = _mm_or_si128 (m0, _mm_and_si128 (_mm_cmpgt_epi8 (x1, v0), _mm_cmpgt_epi8 (x2, v0)));
119
- m1 = _mm_or_si128 (m1, _mm_and_si128 (_mm_cmpgt_epi8 (v1, x1), _mm_cmpgt_epi8 (v1, x2)));
120
- m0 = _mm_or_si128 (m0, _mm_and_si128 (_mm_cmpgt_epi8 (x2, v0), _mm_cmpgt_epi8 (x3, v0)));
121
- m1 = _mm_or_si128 (m1, _mm_and_si128 (_mm_cmpgt_epi8 (v1, x2), _mm_cmpgt_epi8 (v1, x3)));
122
- m0 = _mm_or_si128 (m0, _mm_and_si128 (_mm_cmpgt_epi8 (x3, v0), _mm_cmpgt_epi8 (x0, v0)));
123
- m1 = _mm_or_si128 (m1, _mm_and_si128 (_mm_cmpgt_epi8 (v1, x3), _mm_cmpgt_epi8 (v1, x0)));
124
- m0 = _mm_or_si128 (m0, m1);
125
- int mask = _mm_movemask_epi8 (m0);
126
- if ( mask == 0 )
127
- continue ;
128
- if ( (mask & 255 ) == 0 )
103
+ for (; j < img.cols - 16 - 3 ; j += 16 , ptr += 16 )
129
104
{
130
- j -= 8 ;
131
- ptr -= 8 ;
132
- continue ;
133
- }
105
+ v_uint8x16 v = v_load (ptr);
106
+ v_int8x16 v0 = v_reinterpret_as_s8 ((v + t) ^ delta);
107
+ v_int8x16 v1 = v_reinterpret_as_s8 ((v - t) ^ delta);
108
+
109
+ v_int8x16 x0 = v_reinterpret_as_s8 (v_sub_wrap (v_load (ptr + pixel[0 ]), delta));
110
+ v_int8x16 x1 = v_reinterpret_as_s8 (v_sub_wrap (v_load (ptr + pixel[quarterPatternSize]), delta));
111
+ v_int8x16 x2 = v_reinterpret_as_s8 (v_sub_wrap (v_load (ptr + pixel[2 *quarterPatternSize]), delta));
112
+ v_int8x16 x3 = v_reinterpret_as_s8 (v_sub_wrap (v_load (ptr + pixel[3 *quarterPatternSize]), delta));
113
+
114
+ v_int8x16 m0, m1;
115
+ m0 = (v0 < x0) & (v0 < x1);
116
+ m1 = (x0 < v1) & (x1 < v1);
117
+ m0 = m0 | ((v0 < x1) & (v0 < x2));
118
+ m1 = m1 | ((x1 < v1) & (x2 < v1));
119
+ m0 = m0 | ((v0 < x2) & (v0 < x3));
120
+ m1 = m1 | ((x2 < v1) & (x3 < v1));
121
+ m0 = m0 | ((v0 < x3) & (v0 < x0));
122
+ m1 = m1 | ((x3 < v1) & (x0 < v1));
123
+ m0 = m0 | m1;
124
+
125
+ int mask = v_signmask (m0);
126
+ if ( mask == 0 )
127
+ continue ;
128
+ if ( (mask & 255 ) == 0 )
129
+ {
130
+ j -= 8 ;
131
+ ptr -= 8 ;
132
+ continue ;
133
+ }
134
134
135
- __m128i c0 = _mm_setzero_si128 (), c1 = c0, max0 = c0, max1 = c0;
136
- for ( k = 0 ; k < N; k++ )
137
- {
138
- __m128i x = _mm_xor_si128 (_mm_loadu_si128 ((const __m128i*)(ptr + pixel[k])), delta);
139
- m0 = _mm_cmpgt_epi8 (x, v0);
140
- m1 = _mm_cmpgt_epi8 (v1, x);
135
+ v_int8x16 c0 = v_setzero_s8 ();
136
+ v_int8x16 c1 = v_setzero_s8 ();
137
+ v_uint8x16 max0 = v_setzero_u8 ();
138
+ v_uint8x16 max1 = v_setzero_u8 ();
139
+ for ( k = 0 ; k < N; k++ )
140
+ {
141
+ v_int8x16 x = v_reinterpret_as_s8 (v_load ((ptr + pixel[k])) ^ delta);
142
+ m0 = v0 < x;
143
+ m1 = x < v1;
141
144
142
- c0 = _mm_and_si128 ( _mm_sub_epi8 ( c0, m0), m0) ;
143
- c1 = _mm_and_si128 ( _mm_sub_epi8 ( c1, m1), m1) ;
145
+ c0 = v_sub_wrap ( c0, m0) & m0 ;
146
+ c1 = v_sub_wrap ( c1, m1) & m1 ;
144
147
145
- max0 = _mm_max_epu8 (max0, c0 );
146
- max1 = _mm_max_epu8 (max1, c1 );
147
- }
148
+ max0 = v_max (max0, v_reinterpret_as_u8 (c0) );
149
+ max1 = v_max (max1, v_reinterpret_as_u8 (c1) );
150
+ }
148
151
149
- max0 = _mm_max_epu8 (max0, max1);
150
- int m = _mm_movemask_epi8 ( _mm_cmpgt_epi8 (max0, K16) );
152
+ max0 = v_max (max0, max1);
153
+ int m = v_signmask ( K16 < max0 );
151
154
152
- for ( k = 0 ; m > 0 && k < 16 ; k++, m >>= 1 )
153
- if (m & 1 )
155
+ for ( k = 0 ; m > 0 && k < 16 ; k++, m >>= 1 )
154
156
{
155
- cornerpos[ncorners++] = j+k;
156
- if (nonmax_suppression)
157
- curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold);
157
+ if (m & 1 )
158
+ {
159
+ cornerpos[ncorners++] = j+k;
160
+ if (nonmax_suppression)
161
+ curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold);
162
+ }
158
163
}
164
+ }
159
165
}
160
166
}
161
- #endif
167
+ #endif
162
168
for ( ; j < img.cols - 3 ; j++, ptr++ )
163
169
{
164
170
int v = ptr[0 ];
0 commit comments