Skip to content

Commit 2eb61a4

Browse files
committed
AVX2 optimized implementation of FAST corner tracking migrated to separate file
1 parent 8d5a5d5 commit 2eb61a4

File tree

3 files changed

+255
-69
lines changed

3 files changed

+255
-69
lines changed

modules/features2d/src/fast.avx2.cpp

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
/* This is FAST corner detector, contributed to OpenCV by the author, Edward Rosten.
2+
Below is the original copyright and the references */
3+
4+
/*
5+
Copyright (c) 2006, 2008 Edward Rosten
6+
All rights reserved.
7+
8+
Redistribution and use in source and binary forms, with or without
9+
modification, are permitted provided that the following conditions
10+
are met:
11+
12+
*Redistributions of source code must retain the above copyright
13+
notice, this list of conditions and the following disclaimer.
14+
15+
*Redistributions in binary form must reproduce the above copyright
16+
notice, this list of conditions and the following disclaimer in the
17+
documentation and/or other materials provided with the distribution.
18+
19+
*Neither the name of the University of Cambridge nor the names of
20+
its contributors may be used to endorse or promote products derived
21+
from this software without specific prior written permission.
22+
23+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
27+
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28+
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29+
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30+
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31+
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32+
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34+
*/
35+
36+
/*
37+
The references are:
38+
* Machine learning for high-speed corner detection,
39+
E. Rosten and T. Drummond, ECCV 2006
40+
* Faster and better: A machine learning approach to corner detection
41+
E. Rosten, R. Porter and T. Drummond, PAMI, 2009
42+
*/
43+
44+
#include "precomp.hpp"
45+
#include "fast.hpp"
46+
#include "opencv2/core/hal/intrin.hpp"
47+
48+
namespace cv
49+
{
50+
namespace opt_AVX2
51+
{
52+
53+
class FAST_t_patternSize16_AVX2_Impl: public FAST_t_patternSize16_AVX2
54+
{
55+
public:
56+
FAST_t_patternSize16_AVX2_Impl(int _cols, int _threshold, bool _nonmax_suppression, const int* _pixel):
57+
cols(_cols), nonmax_suppression(_nonmax_suppression), pixel(_pixel)
58+
{
59+
//patternSize = 16
60+
t256c = (char)_threshold;
61+
threshold = std::min(std::max(_threshold, 0), 255);
62+
}
63+
64+
virtual void process(int &j, const uchar* &ptr, uchar* curr, int* cornerpos, int &ncorners)
65+
{
66+
static const __m256i delta256 = _mm256_broadcastsi128_si256(_mm_set1_epi8((char)(-128))), K16_256 = _mm256_broadcastsi128_si256(_mm_set1_epi8((char)8));
67+
const __m256i t256 = _mm256_broadcastsi128_si256(_mm_set1_epi8(t256c));
68+
for (; j < cols - 32 - 3; j += 32, ptr += 32)
69+
{
70+
__m256i m0, m1;
71+
__m256i v0 = _mm256_loadu_si256((const __m256i*)ptr);
72+
73+
__m256i v1 = _mm256_xor_si256(_mm256_subs_epu8(v0, t256), delta256);
74+
v0 = _mm256_xor_si256(_mm256_adds_epu8(v0, t256), delta256);
75+
76+
__m256i x0 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[0])), delta256);
77+
__m256i x1 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[4])), delta256);
78+
__m256i x2 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[8])), delta256);
79+
__m256i x3 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[12])), delta256);
80+
81+
m0 = _mm256_and_si256(_mm256_cmpgt_epi8(x0, v0), _mm256_cmpgt_epi8(x1, v0));
82+
m1 = _mm256_and_si256(_mm256_cmpgt_epi8(v1, x0), _mm256_cmpgt_epi8(v1, x1));
83+
m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x1, v0), _mm256_cmpgt_epi8(x2, v0)));
84+
m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x1), _mm256_cmpgt_epi8(v1, x2)));
85+
m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x2, v0), _mm256_cmpgt_epi8(x3, v0)));
86+
m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x2), _mm256_cmpgt_epi8(v1, x3)));
87+
m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x3, v0), _mm256_cmpgt_epi8(x0, v0)));
88+
m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x3), _mm256_cmpgt_epi8(v1, x0)));
89+
m0 = _mm256_or_si256(m0, m1);
90+
91+
unsigned int mask = _mm256_movemask_epi8(m0); //unsigned is important!
92+
if (mask == 0){
93+
continue;
94+
}
95+
if ((mask & 0xffff) == 0)
96+
{
97+
j -= 16;
98+
ptr -= 16;
99+
continue;
100+
}
101+
102+
__m256i c0 = _mm256_setzero_si256(), c1 = c0, max0 = c0, max1 = c0;
103+
for (int k = 0; k < 25; k++)
104+
{
105+
__m256i x = _mm256_xor_si256(_mm256_loadu_si256((const __m256i*)(ptr + pixel[k])), delta256);
106+
m0 = _mm256_cmpgt_epi8(x, v0);
107+
m1 = _mm256_cmpgt_epi8(v1, x);
108+
109+
c0 = _mm256_and_si256(_mm256_sub_epi8(c0, m0), m0);
110+
c1 = _mm256_and_si256(_mm256_sub_epi8(c1, m1), m1);
111+
112+
max0 = _mm256_max_epu8(max0, c0);
113+
max1 = _mm256_max_epu8(max1, c1);
114+
}
115+
116+
max0 = _mm256_max_epu8(max0, max1);
117+
unsigned int m = _mm256_movemask_epi8(_mm256_cmpgt_epi8(max0, K16_256));
118+
119+
for (int k = 0; m > 0 && k < 32; k++, m >>= 1)
120+
if (m & 1)
121+
{
122+
cornerpos[ncorners++] = j + k;
123+
if (nonmax_suppression)
124+
{
125+
short d[25];
126+
for (int q = 0; q < 25; q++)
127+
d[q] = (short)(ptr[k] - ptr[k + pixel[q]]);
128+
v_int16x8 q0 = v_setall_s16(-1000), q1 = v_setall_s16(1000);
129+
for (int q = 0; q < 16; q += 8)
130+
{
131+
v_int16x8 v0_ = v_load(d + q + 1);
132+
v_int16x8 v1_ = v_load(d + q + 2);
133+
v_int16x8 a = v_min(v0_, v1_);
134+
v_int16x8 b = v_max(v0_, v1_);
135+
v0_ = v_load(d + q + 3);
136+
a = v_min(a, v0_);
137+
b = v_max(b, v0_);
138+
v0_ = v_load(d + q + 4);
139+
a = v_min(a, v0_);
140+
b = v_max(b, v0_);
141+
v0_ = v_load(d + q + 5);
142+
a = v_min(a, v0_);
143+
b = v_max(b, v0_);
144+
v0_ = v_load(d + q + 6);
145+
a = v_min(a, v0_);
146+
b = v_max(b, v0_);
147+
v0_ = v_load(d + q + 7);
148+
a = v_min(a, v0_);
149+
b = v_max(b, v0_);
150+
v0_ = v_load(d + q + 8);
151+
a = v_min(a, v0_);
152+
b = v_max(b, v0_);
153+
v0_ = v_load(d + q);
154+
q0 = v_max(q0, v_min(a, v0_));
155+
q1 = v_min(q1, v_max(b, v0_));
156+
v0_ = v_load(d + q + 9);
157+
q0 = v_max(q0, v_min(a, v0_));
158+
q1 = v_min(q1, v_max(b, v0_));
159+
}
160+
q0 = v_max(q0, v_setzero_s16() - q1);
161+
curr[j + k] = (uchar)(v_reduce_max(q0) - 1);
162+
}
163+
}
164+
}
165+
_mm256_zeroupper();
166+
}
167+
168+
virtual ~FAST_t_patternSize16_AVX2_Impl() {};
169+
170+
private:
171+
int cols;
172+
char t256c;
173+
int threshold;
174+
bool nonmax_suppression;
175+
const int* pixel;
176+
};
177+
178+
Ptr<FAST_t_patternSize16_AVX2> FAST_t_patternSize16_AVX2::getImpl(int _cols, int _threshold, bool _nonmax_suppression, const int* _pixel)
179+
{
180+
return Ptr<FAST_t_patternSize16_AVX2>(new FAST_t_patternSize16_AVX2_Impl(_cols, _threshold, _nonmax_suppression, _pixel));
181+
}
182+
183+
}
184+
}

modules/features2d/src/fast.cpp

Lines changed: 9 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ The references are:
4242
*/
4343

4444
#include "precomp.hpp"
45+
#include "fast.hpp"
4546
#include "fast_score.hpp"
4647
#include "opencl_kernels_features2d.hpp"
4748
#include "opencv2/core/hal/intrin.hpp"
@@ -59,21 +60,20 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
5960
{
6061
Mat img = _img.getMat();
6162
const int K = patternSize/2, N = patternSize + K + 1;
63+
int i, j, k, pixel[25];
64+
makeOffsets(pixel, (int)img.step, patternSize);
65+
6266
#if CV_SIMD128
6367
const int quarterPatternSize = patternSize/4;
6468
v_uint8x16 delta = v_setall_u8(0x80), t = v_setall_u8((char)threshold), K16 = v_setall_u8((char)K);
6569
bool hasSimd = hasSIMD128();
6670
#if CV_TRY_AVX2
67-
__m256i delta256, t256, K16_256;
68-
if (CV_CPU_HAS_SUPPORT_AVX2)
69-
{
70-
delta256 = _mm256_broadcastsi128_si256(delta.val), t256 = _mm256_broadcastsi128_si256(t.val), K16_256 = _mm256_broadcastsi128_si256(K16.val);
71-
}
71+
Ptr<opt_AVX2::FAST_t_patternSize16_AVX2> fast_t_impl_avx2;
72+
if(CV_CPU_HAS_SUPPORT_AVX2)
73+
fast_t_impl_avx2 = opt_AVX2::FAST_t_patternSize16_AVX2::getImpl(img.cols, threshold, nonmax_suppression, pixel);
7274
#endif
7375

7476
#endif
75-
int i, j, k, pixel[25];
76-
makeOffsets(pixel, (int)img.step, patternSize);
7777

7878
keypoints.clear();
7979

@@ -109,68 +109,8 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
109109
if( patternSize == 16 )
110110
{
111111
#if CV_TRY_AVX2
112-
if (CV_CPU_HAS_SUPPORT_AVX2)
113-
{
114-
for(; j < img.cols - 32 - 3; j += 32, ptr += 32)
115-
{
116-
__m256i m0, m1;
117-
__m256i v0 = _mm256_loadu_si256((const __m256i*)ptr);
118-
119-
__m256i v1 = _mm256_xor_si256(_mm256_subs_epu8(v0, t256), delta256);
120-
v0 = _mm256_xor_si256(_mm256_adds_epu8(v0, t256), delta256);
121-
122-
__m256i x0 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[0])), delta256);
123-
__m256i x1 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[4])), delta256);
124-
__m256i x2 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[8])), delta256);
125-
__m256i x3 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[12])), delta256);
126-
127-
m0 = _mm256_and_si256(_mm256_cmpgt_epi8(x0, v0), _mm256_cmpgt_epi8(x1, v0));
128-
m1 = _mm256_and_si256(_mm256_cmpgt_epi8(v1, x0), _mm256_cmpgt_epi8(v1, x1));
129-
m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x1, v0), _mm256_cmpgt_epi8(x2, v0)));
130-
m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x1), _mm256_cmpgt_epi8(v1, x2)));
131-
m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x2, v0), _mm256_cmpgt_epi8(x3, v0)));
132-
m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x2), _mm256_cmpgt_epi8(v1, x3)));
133-
m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x3, v0), _mm256_cmpgt_epi8(x0, v0)));
134-
m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x3), _mm256_cmpgt_epi8(v1, x0)));
135-
m0 = _mm256_or_si256(m0, m1);
136-
137-
unsigned int mask = _mm256_movemask_epi8(m0); //unsigned is important!
138-
if (mask == 0){
139-
continue;
140-
}
141-
if ((mask & 0xffff) == 0)
142-
{
143-
j -= 16;
144-
ptr -= 16;
145-
continue;
146-
}
147-
148-
__m256i c0 = _mm256_setzero_si256(), c1 = c0, max0 = c0, max1 = c0;
149-
for (k = 0; k < N; k++)
150-
{
151-
__m256i x = _mm256_xor_si256(_mm256_loadu_si256((const __m256i*)(ptr + pixel[k])), delta256);
152-
m0 = _mm256_cmpgt_epi8(x, v0);
153-
m1 = _mm256_cmpgt_epi8(v1, x);
154-
155-
c0 = _mm256_and_si256(_mm256_sub_epi8(c0, m0), m0);
156-
c1 = _mm256_and_si256(_mm256_sub_epi8(c1, m1), m1);
157-
158-
max0 = _mm256_max_epu8(max0, c0);
159-
max1 = _mm256_max_epu8(max1, c1);
160-
}
161-
162-
max0 = _mm256_max_epu8(max0, max1);
163-
unsigned int m = _mm256_movemask_epi8(_mm256_cmpgt_epi8(max0, K16_256));
164-
165-
for (k = 0; m > 0 && k < 32; k++, m >>= 1)
166-
if (m & 1)
167-
{
168-
cornerpos[ncorners++] = j + k;
169-
if (nonmax_suppression)
170-
curr[j + k] = (uchar)cornerScore<patternSize>(ptr + k, pixel, threshold);
171-
}
172-
}
173-
} //CV_CPU_HAS_SUPPORT_AVX2
112+
if (fast_t_impl_avx2)
113+
fast_t_impl_avx2->process(j, ptr, curr, cornerpos, ncorners);
174114
#endif
175115
//vz if (j <= (img.cols - 27)) //it doesn't make sense using vectors for less than 8 elements
176116
{

modules/features2d/src/fast.hpp

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
/* This is FAST corner detector, contributed to OpenCV by the author, Edward Rosten.
2+
Below is the original copyright and the references */
3+
4+
/*
5+
Copyright (c) 2006, 2008 Edward Rosten
6+
All rights reserved.
7+
8+
Redistribution and use in source and binary forms, with or without
9+
modification, are permitted provided that the following conditions
10+
are met:
11+
12+
*Redistributions of source code must retain the above copyright
13+
notice, this list of conditions and the following disclaimer.
14+
15+
*Redistributions in binary form must reproduce the above copyright
16+
notice, this list of conditions and the following disclaimer in the
17+
documentation and/or other materials provided with the distribution.
18+
19+
*Neither the name of the University of Cambridge nor the names of
20+
its contributors may be used to endorse or promote products derived
21+
from this software without specific prior written permission.
22+
23+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
27+
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28+
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29+
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30+
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31+
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32+
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34+
*/
35+
36+
/*
37+
The references are:
38+
* Machine learning for high-speed corner detection,
39+
E. Rosten and T. Drummond, ECCV 2006
40+
* Faster and better: A machine learning approach to corner detection
41+
E. Rosten, R. Porter and T. Drummond, PAMI, 2009
42+
*/
43+
44+
#ifndef OPENCV_FEATURES2D_FAST_HPP
45+
#define OPENCV_FEATURES2D_FAST_HPP
46+
47+
namespace cv
48+
{
49+
namespace opt_AVX2
50+
{
51+
#if CV_TRY_AVX2
52+
class FAST_t_patternSize16_AVX2
53+
{
54+
public:
55+
static Ptr<FAST_t_patternSize16_AVX2> getImpl(int _cols, int _threshold, bool _nonmax_suppression, const int* _pixel);
56+
virtual void process(int &j, const uchar* &ptr, uchar* curr, int* cornerpos, int &ncorners) = 0;
57+
virtual ~FAST_t_patternSize16_AVX2() {};
58+
};
59+
#endif
60+
}
61+
}
62+
#endif

0 commit comments

Comments (0)