
Commit 1a495a5

Merge pull request opencv#9875 from terfendail:fast_avx
2 parents: 09d283a + 2eb61a4
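
This merge adds an AVX2-optimized kernel for FAST with patternSize == 16, selected at runtime when the CPU supports AVX2. For context, a minimal usage sketch (not part of this commit) of the public entry point that reaches the new code path; the file name and threshold are placeholder values:

// Minimal sketch (assumption: not part of this commit). cv::FAST with the
// default 9/16 pattern dispatches into the new AVX2 kernel on capable CPUs.
#include <opencv2/features2d.hpp>
#include <opencv2/imgcodecs.hpp>
#include <cstdio>
#include <vector>

int main()
{
    cv::Mat img = cv::imread("image.png", cv::IMREAD_GRAYSCALE); // placeholder file
    if (img.empty())
        return 1;
    std::vector<cv::KeyPoint> kp;
    cv::FAST(img, kp, /*threshold=*/20, /*nonmaxSuppression=*/true); // placeholder threshold
    std::printf("%d keypoints\n", (int)kp.size());
    return 0;
}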

File tree: 3 files changed, +314 −53 lines changed


modules/features2d/src/fast.avx2.cpp

Lines changed: 184 additions & 0 deletions
@@ -0,0 +1,184 @@
/* This is FAST corner detector, contributed to OpenCV by the author, Edward Rosten.
   Below is the original copyright and the references */

/*
Copyright (c) 2006, 2008 Edward Rosten
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

  *Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

  *Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

  *Neither the name of the University of Cambridge nor the names of
   its contributors may be used to endorse or promote products derived
   from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
The references are:
* Machine learning for high-speed corner detection,
  E. Rosten and T. Drummond, ECCV 2006
* Faster and better: A machine learning approach to corner detection
  E. Rosten, R. Porter and T. Drummond, PAMI, 2009
*/

#include "precomp.hpp"
#include "fast.hpp"
#include "opencv2/core/hal/intrin.hpp"

namespace cv
{
namespace opt_AVX2
{

class FAST_t_patternSize16_AVX2_Impl: public FAST_t_patternSize16_AVX2
{
public:
    FAST_t_patternSize16_AVX2_Impl(int _cols, int _threshold, bool _nonmax_suppression, const int* _pixel):
        cols(_cols), nonmax_suppression(_nonmax_suppression), pixel(_pixel)
    {
        //patternSize = 16
        t256c = (char)_threshold;
        threshold = std::min(std::max(_threshold, 0), 255);
    }

    virtual void process(int &j, const uchar* &ptr, uchar* curr, int* cornerpos, int &ncorners)
    {
        // delta256 biases unsigned bytes by 0x80 so that AVX2's signed byte
        // comparisons reproduce the unsigned ordering; K16_256 holds the
        // contiguous-arc length (patternSize/2 = 8) a corner must exceed.
        static const __m256i delta256 = _mm256_broadcastsi128_si256(_mm_set1_epi8((char)(-128))), K16_256 = _mm256_broadcastsi128_si256(_mm_set1_epi8((char)8));
        const __m256i t256 = _mm256_broadcastsi128_si256(_mm_set1_epi8(t256c));
        // Process 32 candidate pixels per iteration.
        for (; j < cols - 32 - 3; j += 32, ptr += 32)
        {
            __m256i m0, m1;
            __m256i v0 = _mm256_loadu_si256((const __m256i*)ptr);

            // v1/v0 become the biased lower (center - t) and upper (center + t)
            // thresholds, saturated in the unsigned domain before biasing.
            __m256i v1 = _mm256_xor_si256(_mm256_subs_epu8(v0, t256), delta256);
            v0 = _mm256_xor_si256(_mm256_adds_epu8(v0, t256), delta256);

            // Quick rejection test on the four "compass" pixels of the circle
            // (pattern offsets 0, 4, 8, 12).
            __m256i x0 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[0])), delta256);
            __m256i x1 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[4])), delta256);
            __m256i x2 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[8])), delta256);
            __m256i x3 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[12])), delta256);

            // m0 marks lanes where two adjacent compass pixels are both brighter
            // than v0, m1 lanes where both are darker than v1; one of the two is
            // a cheap necessary condition for a corner.
            m0 = _mm256_and_si256(_mm256_cmpgt_epi8(x0, v0), _mm256_cmpgt_epi8(x1, v0));
            m1 = _mm256_and_si256(_mm256_cmpgt_epi8(v1, x0), _mm256_cmpgt_epi8(v1, x1));
            m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x1, v0), _mm256_cmpgt_epi8(x2, v0)));
            m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x1), _mm256_cmpgt_epi8(v1, x2)));
            m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x2, v0), _mm256_cmpgt_epi8(x3, v0)));
            m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x2), _mm256_cmpgt_epi8(v1, x3)));
            m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x3, v0), _mm256_cmpgt_epi8(x0, v0)));
            m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x3), _mm256_cmpgt_epi8(v1, x0)));
            m0 = _mm256_or_si256(m0, m1);

            unsigned int mask = _mm256_movemask_epi8(m0); //unsigned is important!
            if (mask == 0){
                continue;
            }
            if ((mask & 0xffff) == 0)
            {
                // No candidates in the lower 16 lanes: rewind by 16 so the
                // next 32-wide iteration starts at the upper half.
                j -= 16;
                ptr -= 16;
                continue;
            }

            // Full test: count runs of consecutive brighter (c0) / darker (c1)
            // pixels around the circle. pixel[] holds 25 offsets (16 plus 9
            // wrapped around), so circular runs are counted too. With each lane
            // of m being 0x00 or 0xFF, c = (c - m) & m increments the run on a
            // hit and resets it on a miss.
            __m256i c0 = _mm256_setzero_si256(), c1 = c0, max0 = c0, max1 = c0;
            for (int k = 0; k < 25; k++)
            {
                __m256i x = _mm256_xor_si256(_mm256_loadu_si256((const __m256i*)(ptr + pixel[k])), delta256);
                m0 = _mm256_cmpgt_epi8(x, v0);
                m1 = _mm256_cmpgt_epi8(v1, x);

                c0 = _mm256_and_si256(_mm256_sub_epi8(c0, m0), m0);
                c1 = _mm256_and_si256(_mm256_sub_epi8(c1, m1), m1);

                max0 = _mm256_max_epu8(max0, c0);
                max1 = _mm256_max_epu8(max1, c1);
            }

            // A lane is a corner when its longest run exceeds 8 pixels.
            max0 = _mm256_max_epu8(max0, max1);
            unsigned int m = _mm256_movemask_epi8(_mm256_cmpgt_epi8(max0, K16_256));

            for (int k = 0; m > 0 && k < 32; k++, m >>= 1)
                if (m & 1)
                {
                    cornerpos[ncorners++] = j + k;
                    if (nonmax_suppression)
                    {
                        // Corner score for non-maximum suppression, computed
                        // with 128-bit universal intrinsics over all 9-pixel
                        // arcs of center-to-circle differences.
                        short d[25];
                        for (int q = 0; q < 25; q++)
                            d[q] = (short)(ptr[k] - ptr[k + pixel[q]]);
                        v_int16x8 q0 = v_setall_s16(-1000), q1 = v_setall_s16(1000);
                        for (int q = 0; q < 16; q += 8)
                        {
                            v_int16x8 v0_ = v_load(d + q + 1);
                            v_int16x8 v1_ = v_load(d + q + 2);
                            v_int16x8 a = v_min(v0_, v1_);
                            v_int16x8 b = v_max(v0_, v1_);
                            v0_ = v_load(d + q + 3);
                            a = v_min(a, v0_);
                            b = v_max(b, v0_);
                            v0_ = v_load(d + q + 4);
                            a = v_min(a, v0_);
                            b = v_max(b, v0_);
                            v0_ = v_load(d + q + 5);
                            a = v_min(a, v0_);
                            b = v_max(b, v0_);
                            v0_ = v_load(d + q + 6);
                            a = v_min(a, v0_);
                            b = v_max(b, v0_);
                            v0_ = v_load(d + q + 7);
                            a = v_min(a, v0_);
                            b = v_max(b, v0_);
                            v0_ = v_load(d + q + 8);
                            a = v_min(a, v0_);
                            b = v_max(b, v0_);
                            v0_ = v_load(d + q);
                            q0 = v_max(q0, v_min(a, v0_));
                            q1 = v_min(q1, v_max(b, v0_));
                            v0_ = v_load(d + q + 9);
                            q0 = v_max(q0, v_min(a, v0_));
                            q1 = v_min(q1, v_max(b, v0_));
                        }
                        q0 = v_max(q0, v_setzero_s16() - q1);
                        curr[j + k] = (uchar)(v_reduce_max(q0) - 1);
                    }
                }
        }
        // Clear the upper halves of the YMM registers to avoid AVX-SSE
        // transition penalties in the 128-bit code that follows.
        _mm256_zeroupper();
    }

    virtual ~FAST_t_patternSize16_AVX2_Impl() {};

private:
    int cols;
    char t256c;
    int threshold;
    bool nonmax_suppression;
    const int* pixel;
};

Ptr<FAST_t_patternSize16_AVX2> FAST_t_patternSize16_AVX2::getImpl(int _cols, int _threshold, bool _nonmax_suppression, const int* _pixel)
{
    return Ptr<FAST_t_patternSize16_AVX2>(new FAST_t_patternSize16_AVX2_Impl(_cols, _threshold, _nonmax_suppression, _pixel));
}

}
}
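
A note on the biasing trick used throughout process(): AVX2 offers only signed byte comparisons, so the kernel XORs (equivalently, adds 0x80 to) every value with delta256 to translate unsigned ordering into signed ordering, after doing the saturating threshold add/subtract in the unsigned domain. A scalar model of one such comparison (illustrative only, the function name is mine, not part of the commit):

#include <algorithm>
#include <cstdint>

// Scalar model of the 0x80-bias trick: for unsigned bytes a and b,
// a < b holds exactly when (int8_t)(a ^ 0x80) < (int8_t)(b ^ 0x80).
static bool brighter_than_center(uint8_t neighbor, uint8_t center, int t)
{
    uint8_t v0u = (uint8_t)std::min(center + t, 255); // saturating center + t, as _mm256_adds_epu8
    int8_t v0 = (int8_t)(v0u ^ 0x80);                 // biased, as the XOR with delta256
    int8_t x  = (int8_t)(neighbor ^ 0x80);
    return x > v0;                                    // matches _mm256_cmpgt_epi8(x, v0)
}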

modules/features2d/src/fast.cpp

Lines changed: 68 additions & 53 deletions
@@ -42,6 +42,7 @@ The references are:
 */
 
 #include "precomp.hpp"
+#include "fast.hpp"
 #include "fast_score.hpp"
 #include "opencl_kernels_features2d.hpp"
 #include "opencv2/core/hal/intrin.hpp"
@@ -59,13 +60,20 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
 {
     Mat img = _img.getMat();
     const int K = patternSize/2, N = patternSize + K + 1;
+    int i, j, k, pixel[25];
+    makeOffsets(pixel, (int)img.step, patternSize);
+
 #if CV_SIMD128
     const int quarterPatternSize = patternSize/4;
     v_uint8x16 delta = v_setall_u8(0x80), t = v_setall_u8((char)threshold), K16 = v_setall_u8((char)K);
     bool hasSimd = hasSIMD128();
+#if CV_TRY_AVX2
+    Ptr<opt_AVX2::FAST_t_patternSize16_AVX2> fast_t_impl_avx2;
+    if(CV_CPU_HAS_SUPPORT_AVX2)
+        fast_t_impl_avx2 = opt_AVX2::FAST_t_patternSize16_AVX2::getImpl(img.cols, threshold, nonmax_suppression, pixel);
+#endif
+
 #endif
-    int i, j, k, pixel[25];
-    makeOffsets(pixel, (int)img.step, patternSize);
 
     keypoints.clear();
 
@@ -100,65 +108,72 @@
         {
             if( patternSize == 16 )
             {
-                for(; j < img.cols - 16 - 3; j += 16, ptr += 16)
+#if CV_TRY_AVX2
+                if (fast_t_impl_avx2)
+                    fast_t_impl_avx2->process(j, ptr, curr, cornerpos, ncorners);
+#endif
+                //vz if (j <= (img.cols - 27)) //it doesn't make sense using vectors for less than 8 elements
                 {
-                    v_uint8x16 v = v_load(ptr);
-                    v_int8x16 v0 = v_reinterpret_as_s8((v + t) ^ delta);
-                    v_int8x16 v1 = v_reinterpret_as_s8((v - t) ^ delta);
-
-                    v_int8x16 x0 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[0]), delta));
-                    v_int8x16 x1 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[quarterPatternSize]), delta));
-                    v_int8x16 x2 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[2*quarterPatternSize]), delta));
-                    v_int8x16 x3 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[3*quarterPatternSize]), delta));
-
-                    v_int8x16 m0, m1;
-                    m0 = (v0 < x0) & (v0 < x1);
-                    m1 = (x0 < v1) & (x1 < v1);
-                    m0 = m0 | ((v0 < x1) & (v0 < x2));
-                    m1 = m1 | ((x1 < v1) & (x2 < v1));
-                    m0 = m0 | ((v0 < x2) & (v0 < x3));
-                    m1 = m1 | ((x2 < v1) & (x3 < v1));
-                    m0 = m0 | ((v0 < x3) & (v0 < x0));
-                    m1 = m1 | ((x3 < v1) & (x0 < v1));
-                    m0 = m0 | m1;
-
-                    int mask = v_signmask(m0);
-                    if( mask == 0 )
-                        continue;
-                    if( (mask & 255) == 0 )
+                    for (; j < img.cols - 16 - 3; j += 16, ptr += 16)
                     {
-                        j -= 8;
-                        ptr -= 8;
-                        continue;
-                    }
+                        v_uint8x16 v = v_load(ptr);
+                        v_int8x16 v0 = v_reinterpret_as_s8((v + t) ^ delta);
+                        v_int8x16 v1 = v_reinterpret_as_s8((v - t) ^ delta);
+
+                        v_int8x16 x0 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[0]), delta));
+                        v_int8x16 x1 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[quarterPatternSize]), delta));
+                        v_int8x16 x2 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[2*quarterPatternSize]), delta));
+                        v_int8x16 x3 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[3*quarterPatternSize]), delta));
+
+                        v_int8x16 m0, m1;
+                        m0 = (v0 < x0) & (v0 < x1);
+                        m1 = (x0 < v1) & (x1 < v1);
+                        m0 = m0 | ((v0 < x1) & (v0 < x2));
+                        m1 = m1 | ((x1 < v1) & (x2 < v1));
+                        m0 = m0 | ((v0 < x2) & (v0 < x3));
+                        m1 = m1 | ((x2 < v1) & (x3 < v1));
+                        m0 = m0 | ((v0 < x3) & (v0 < x0));
+                        m1 = m1 | ((x3 < v1) & (x0 < v1));
+                        m0 = m0 | m1;
+
+                        int mask = v_signmask(m0);
+                        if( mask == 0 )
+                            continue;
+                        if( (mask & 255) == 0 )
+                        {
+                            j -= 8;
+                            ptr -= 8;
+                            continue;
+                        }
 
-                    v_int8x16 c0 = v_setzero_s8();
-                    v_int8x16 c1 = v_setzero_s8();
-                    v_uint8x16 max0 = v_setzero_u8();
-                    v_uint8x16 max1 = v_setzero_u8();
-                    for( k = 0; k < N; k++ )
-                    {
-                        v_int8x16 x = v_reinterpret_as_s8(v_load((ptr + pixel[k])) ^ delta);
-                        m0 = v0 < x;
-                        m1 = x < v1;
+                        v_int8x16 c0 = v_setzero_s8();
+                        v_int8x16 c1 = v_setzero_s8();
+                        v_uint8x16 max0 = v_setzero_u8();
+                        v_uint8x16 max1 = v_setzero_u8();
+                        for( k = 0; k < N; k++ )
+                        {
+                            v_int8x16 x = v_reinterpret_as_s8(v_load((ptr + pixel[k])) ^ delta);
+                            m0 = v0 < x;
+                            m1 = x < v1;
 
-                        c0 = v_sub_wrap(c0, m0) & m0;
-                        c1 = v_sub_wrap(c1, m1) & m1;
+                            c0 = v_sub_wrap(c0, m0) & m0;
+                            c1 = v_sub_wrap(c1, m1) & m1;
 
-                        max0 = v_max(max0, v_reinterpret_as_u8(c0));
-                        max1 = v_max(max1, v_reinterpret_as_u8(c1));
-                    }
+                            max0 = v_max(max0, v_reinterpret_as_u8(c0));
+                            max1 = v_max(max1, v_reinterpret_as_u8(c1));
+                        }
 
-                    max0 = v_max(max0, max1);
-                    int m = v_signmask(K16 < max0);
+                        max0 = v_max(max0, max1);
+                        int m = v_signmask(K16 < max0);
 
-                    for( k = 0; m > 0 && k < 16; k++, m >>= 1 )
-                    {
-                        if(m & 1)
+                        for( k = 0; m > 0 && k < 16; k++, m >>= 1 )
                         {
-                            cornerpos[ncorners++] = j+k;
-                            if(nonmax_suppression)
-                                curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold);
+                            if(m & 1)
+                            {
+                                cornerpos[ncorners++] = j+k;
+                                if(nonmax_suppression)
+                                    curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold);
+                            }
                         }
                     }
                 }
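
A design note on the dispatch above: process() takes j and ptr by reference and walks the row in 32-pixel steps, leaving both at the first unprocessed column, so the 16-wide universal-intrinsics loop that follows it simply continues from there and handles the tail. Both loops count contiguous arcs with the same mask arithmetic; a scalar model of that trick (illustrative only, the function name is mine):

#include <cstdint>

// Scalar model of the run-length trick from both SIMD loops. With m = 0xFF
// where the brighter/darker test passes and 0x00 where it fails,
// c = (c - m) & m increments the current streak on a pass
// (c - 0xFF == c + 1 mod 256) and resets it to zero on a fail.
static uint8_t longest_run(const bool* pass, int n)
{
    uint8_t c = 0, best = 0;
    for (int i = 0; i < n; i++)
    {
        uint8_t m = pass[i] ? 0xFFu : 0x00u;
        c = (uint8_t)((uint8_t)(c - m) & m);
        if (c > best)
            best = c;
    }
    return best;
}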
