
Commit 1a495a5

Merge pull request opencv#9875 from terfendail:fast_avx
2 parents: 09d283a + 2eb61a4
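
This merge adds an AVX2-optimized kernel for FAST with patternSize == 16, selected at runtime when the CPU supports AVX2. For context, a minimal usage sketch (not part of this commit) of the public entry point that reaches the new code path; the file name and threshold are placeholder values:

// Minimal sketch (assumption: not part of this commit). cv::FAST with the
// default 9/16 pattern dispatches into the new AVX2 kernel on capable CPUs.
#include <opencv2/features2d.hpp>
#include <opencv2/imgcodecs.hpp>
#include <cstdio>
#include <vector>

int main()
{
    cv::Mat img = cv::imread("image.png", cv::IMREAD_GRAYSCALE); // placeholder file
    if (img.empty())
        return 1;
    std::vector<cv::KeyPoint> kp;
    cv::FAST(img, kp, /*threshold=*/20, /*nonmaxSuppression=*/true); // placeholder threshold
    std::printf("%d keypoints\n", (int)kp.size());
    return 0;
}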

File tree: 3 files changed, +314 −53 lines changed


modules/features2d/src/fast.avx2.cpp

Lines changed: 184 additions & 0 deletions
@@ -0,0 +1,184 @@
/* This is FAST corner detector, contributed to OpenCV by the author, Edward Rosten.
   Below is the original copyright and the references */

/*
Copyright (c) 2006, 2008 Edward Rosten
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

  *Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

  *Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

  *Neither the name of the University of Cambridge nor the names of
   its contributors may be used to endorse or promote products derived
   from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
The references are:
* Machine learning for high-speed corner detection,
  E. Rosten and T. Drummond, ECCV 2006
* Faster and better: A machine learning approach to corner detection
  E. Rosten, R. Porter and T. Drummond, PAMI, 2009
*/

#include "precomp.hpp"
#include "fast.hpp"
#include "opencv2/core/hal/intrin.hpp"

namespace cv
{
namespace opt_AVX2
{

class FAST_t_patternSize16_AVX2_Impl: public FAST_t_patternSize16_AVX2
{
public:
    FAST_t_patternSize16_AVX2_Impl(int _cols, int _threshold, bool _nonmax_suppression, const int* _pixel):
        cols(_cols), nonmax_suppression(_nonmax_suppression), pixel(_pixel)
    {
        //patternSize = 16
        t256c = (char)_threshold;
        threshold = std::min(std::max(_threshold, 0), 255);
    }

    virtual void process(int &j, const uchar* &ptr, uchar* curr, int* cornerpos, int &ncorners)
    {
        // delta256 biases unsigned bytes by 0x80 so that AVX2's signed byte
        // comparisons reproduce the unsigned ordering; K16_256 holds the
        // contiguous-arc length (patternSize/2 = 8) a corner must exceed.
        static const __m256i delta256 = _mm256_broadcastsi128_si256(_mm_set1_epi8((char)(-128))), K16_256 = _mm256_broadcastsi128_si256(_mm_set1_epi8((char)8));
        const __m256i t256 = _mm256_broadcastsi128_si256(_mm_set1_epi8(t256c));
        // Process 32 candidate pixels per iteration.
        for (; j < cols - 32 - 3; j += 32, ptr += 32)
        {
            __m256i m0, m1;
            __m256i v0 = _mm256_loadu_si256((const __m256i*)ptr);

            // v1/v0 become the biased lower (center - t) and upper (center + t)
            // thresholds, saturated in the unsigned domain before biasing.
            __m256i v1 = _mm256_xor_si256(_mm256_subs_epu8(v0, t256), delta256);
            v0 = _mm256_xor_si256(_mm256_adds_epu8(v0, t256), delta256);

            // Quick rejection test on the four "compass" pixels of the circle
            // (pattern offsets 0, 4, 8, 12).
            __m256i x0 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[0])), delta256);
            __m256i x1 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[4])), delta256);
            __m256i x2 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[8])), delta256);
            __m256i x3 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[12])), delta256);

            // m0 marks lanes where two adjacent compass pixels are both brighter
            // than v0, m1 lanes where both are darker than v1; one of the two is
            // a cheap necessary condition for a corner.
            m0 = _mm256_and_si256(_mm256_cmpgt_epi8(x0, v0), _mm256_cmpgt_epi8(x1, v0));
            m1 = _mm256_and_si256(_mm256_cmpgt_epi8(v1, x0), _mm256_cmpgt_epi8(v1, x1));
            m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x1, v0), _mm256_cmpgt_epi8(x2, v0)));
            m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x1), _mm256_cmpgt_epi8(v1, x2)));
            m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x2, v0), _mm256_cmpgt_epi8(x3, v0)));
            m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x2), _mm256_cmpgt_epi8(v1, x3)));
            m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x3, v0), _mm256_cmpgt_epi8(x0, v0)));
            m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x3), _mm256_cmpgt_epi8(v1, x0)));
            m0 = _mm256_or_si256(m0, m1);

            unsigned int mask = _mm256_movemask_epi8(m0); //unsigned is important!
            if (mask == 0){
                continue;
            }
            if ((mask & 0xffff) == 0)
            {
                // No candidates in the lower 16 lanes: rewind by 16 so the
                // next 32-wide iteration starts at the upper half.
                j -= 16;
                ptr -= 16;
                continue;
            }

            // Full test: count runs of consecutive brighter (c0) / darker (c1)
            // pixels around the circle. pixel[] holds 25 offsets (16 plus 9
            // wrapped around), so circular runs are counted too. With each lane
            // of m being 0x00 or 0xFF, c = (c - m) & m increments the run on a
            // hit and resets it on a miss.
            __m256i c0 = _mm256_setzero_si256(), c1 = c0, max0 = c0, max1 = c0;
            for (int k = 0; k < 25; k++)
            {
                __m256i x = _mm256_xor_si256(_mm256_loadu_si256((const __m256i*)(ptr + pixel[k])), delta256);
                m0 = _mm256_cmpgt_epi8(x, v0);
                m1 = _mm256_cmpgt_epi8(v1, x);

                c0 = _mm256_and_si256(_mm256_sub_epi8(c0, m0), m0);
                c1 = _mm256_and_si256(_mm256_sub_epi8(c1, m1), m1);

                max0 = _mm256_max_epu8(max0, c0);
                max1 = _mm256_max_epu8(max1, c1);
            }

            // A lane is a corner when its longest run exceeds 8 pixels.
            max0 = _mm256_max_epu8(max0, max1);
            unsigned int m = _mm256_movemask_epi8(_mm256_cmpgt_epi8(max0, K16_256));

            for (int k = 0; m > 0 && k < 32; k++, m >>= 1)
                if (m & 1)
                {
                    cornerpos[ncorners++] = j + k;
                    if (nonmax_suppression)
                    {
                        // Corner score for non-maximum suppression, computed
                        // with 128-bit universal intrinsics over all 9-pixel
                        // arcs of center-to-circle differences.
                        short d[25];
                        for (int q = 0; q < 25; q++)
                            d[q] = (short)(ptr[k] - ptr[k + pixel[q]]);
                        v_int16x8 q0 = v_setall_s16(-1000), q1 = v_setall_s16(1000);
                        for (int q = 0; q < 16; q += 8)
                        {
                            v_int16x8 v0_ = v_load(d + q + 1);
                            v_int16x8 v1_ = v_load(d + q + 2);
                            v_int16x8 a = v_min(v0_, v1_);
                            v_int16x8 b = v_max(v0_, v1_);
                            v0_ = v_load(d + q + 3);
                            a = v_min(a, v0_);
                            b = v_max(b, v0_);
                            v0_ = v_load(d + q + 4);
                            a = v_min(a, v0_);
                            b = v_max(b, v0_);
                            v0_ = v_load(d + q + 5);
                            a = v_min(a, v0_);
                            b = v_max(b, v0_);
                            v0_ = v_load(d + q + 6);
                            a = v_min(a, v0_);
                            b = v_max(b, v0_);
                            v0_ = v_load(d + q + 7);
                            a = v_min(a, v0_);
                            b = v_max(b, v0_);
                            v0_ = v_load(d + q + 8);
                            a = v_min(a, v0_);
                            b = v_max(b, v0_);
                            v0_ = v_load(d + q);
                            q0 = v_max(q0, v_min(a, v0_));
                            q1 = v_min(q1, v_max(b, v0_));
                            v0_ = v_load(d + q + 9);
                            q0 = v_max(q0, v_min(a, v0_));
                            q1 = v_min(q1, v_max(b, v0_));
                        }
                        q0 = v_max(q0, v_setzero_s16() - q1);
                        curr[j + k] = (uchar)(v_reduce_max(q0) - 1);
                    }
                }
        }
        // Clear the upper halves of the YMM registers to avoid AVX-SSE
        // transition penalties in the 128-bit code that follows.
        _mm256_zeroupper();
    }

    virtual ~FAST_t_patternSize16_AVX2_Impl() {};

private:
    int cols;
    char t256c;
    int threshold;
    bool nonmax_suppression;
    const int* pixel;
};

Ptr<FAST_t_patternSize16_AVX2> FAST_t_patternSize16_AVX2::getImpl(int _cols, int _threshold, bool _nonmax_suppression, const int* _pixel)
{
    return Ptr<FAST_t_patternSize16_AVX2>(new FAST_t_patternSize16_AVX2_Impl(_cols, _threshold, _nonmax_suppression, _pixel));
}

}
}
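
A note on the biasing trick used throughout process(): AVX2 offers only signed byte comparisons, so the kernel XORs (equivalently, adds 0x80 to) every value with delta256 to translate unsigned ordering into signed ordering, after doing the saturating threshold add/subtract in the unsigned domain. A scalar model of one such comparison (illustrative only, the function name is mine, not part of the commit):

#include <algorithm>
#include <cstdint>

// Scalar model of the 0x80-bias trick: for unsigned bytes a and b,
// a < b holds exactly when (int8_t)(a ^ 0x80) < (int8_t)(b ^ 0x80).
static bool brighter_than_center(uint8_t neighbor, uint8_t center, int t)
{
    uint8_t v0u = (uint8_t)std::min(center + t, 255); // saturating center + t, as _mm256_adds_epu8
    int8_t v0 = (int8_t)(v0u ^ 0x80);                 // biased, as the XOR with delta256
    int8_t x  = (int8_t)(neighbor ^ 0x80);
    return x > v0;                                    // matches _mm256_cmpgt_epi8(x, v0)
}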

modules/features2d/src/fast.cpp

Lines changed: 68 additions & 53 deletions
@@ -42,6 +42,7 @@ The references are:
 */
 
 #include "precomp.hpp"
+#include "fast.hpp"
 #include "fast_score.hpp"
 #include "opencl_kernels_features2d.hpp"
 #include "opencv2/core/hal/intrin.hpp"
@@ -59,13 +60,20 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
 {
     Mat img = _img.getMat();
     const int K = patternSize/2, N = patternSize + K + 1;
+    int i, j, k, pixel[25];
+    makeOffsets(pixel, (int)img.step, patternSize);
+
 #if CV_SIMD128
     const int quarterPatternSize = patternSize/4;
     v_uint8x16 delta = v_setall_u8(0x80), t = v_setall_u8((char)threshold), K16 = v_setall_u8((char)K);
     bool hasSimd = hasSIMD128();
+#if CV_TRY_AVX2
+    Ptr<opt_AVX2::FAST_t_patternSize16_AVX2> fast_t_impl_avx2;
+    if(CV_CPU_HAS_SUPPORT_AVX2)
+        fast_t_impl_avx2 = opt_AVX2::FAST_t_patternSize16_AVX2::getImpl(img.cols, threshold, nonmax_suppression, pixel);
+#endif
+
 #endif
-    int i, j, k, pixel[25];
-    makeOffsets(pixel, (int)img.step, patternSize);
 
     keypoints.clear();
 
@@ -100,65 +108,72 @@
         {
             if( patternSize == 16 )
             {
-                for(; j < img.cols - 16 - 3; j += 16, ptr += 16)
+#if CV_TRY_AVX2
+                if (fast_t_impl_avx2)
+                    fast_t_impl_avx2->process(j, ptr, curr, cornerpos, ncorners);
+#endif
+                //vz if (j <= (img.cols - 27)) //it doesn't make sense using vectors for less than 8 elements
                 {
-                    v_uint8x16 v = v_load(ptr);
-                    v_int8x16 v0 = v_reinterpret_as_s8((v + t) ^ delta);
-                    v_int8x16 v1 = v_reinterpret_as_s8((v - t) ^ delta);
-
-                    v_int8x16 x0 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[0]), delta));
-                    v_int8x16 x1 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[quarterPatternSize]), delta));
-                    v_int8x16 x2 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[2*quarterPatternSize]), delta));
-                    v_int8x16 x3 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[3*quarterPatternSize]), delta));
-
-                    v_int8x16 m0, m1;
-                    m0 = (v0 < x0) & (v0 < x1);
-                    m1 = (x0 < v1) & (x1 < v1);
-                    m0 = m0 | ((v0 < x1) & (v0 < x2));
-                    m1 = m1 | ((x1 < v1) & (x2 < v1));
-                    m0 = m0 | ((v0 < x2) & (v0 < x3));
-                    m1 = m1 | ((x2 < v1) & (x3 < v1));
-                    m0 = m0 | ((v0 < x3) & (v0 < x0));
-                    m1 = m1 | ((x3 < v1) & (x0 < v1));
-                    m0 = m0 | m1;
-
-                    int mask = v_signmask(m0);
-                    if( mask == 0 )
-                        continue;
-                    if( (mask & 255) == 0 )
+                    for (; j < img.cols - 16 - 3; j += 16, ptr += 16)
                     {
-                        j -= 8;
-                        ptr -= 8;
-                        continue;
-                    }
+                        v_uint8x16 v = v_load(ptr);
+                        v_int8x16 v0 = v_reinterpret_as_s8((v + t) ^ delta);
+                        v_int8x16 v1 = v_reinterpret_as_s8((v - t) ^ delta);
+
+                        v_int8x16 x0 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[0]), delta));
+                        v_int8x16 x1 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[quarterPatternSize]), delta));
+                        v_int8x16 x2 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[2*quarterPatternSize]), delta));
+                        v_int8x16 x3 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[3*quarterPatternSize]), delta));
+
+                        v_int8x16 m0, m1;
+                        m0 = (v0 < x0) & (v0 < x1);
+                        m1 = (x0 < v1) & (x1 < v1);
+                        m0 = m0 | ((v0 < x1) & (v0 < x2));
+                        m1 = m1 | ((x1 < v1) & (x2 < v1));
+                        m0 = m0 | ((v0 < x2) & (v0 < x3));
+                        m1 = m1 | ((x2 < v1) & (x3 < v1));
+                        m0 = m0 | ((v0 < x3) & (v0 < x0));
+                        m1 = m1 | ((x3 < v1) & (x0 < v1));
+                        m0 = m0 | m1;
+
+                        int mask = v_signmask(m0);
+                        if( mask == 0 )
+                            continue;
+                        if( (mask & 255) == 0 )
+                        {
+                            j -= 8;
+                            ptr -= 8;
+                            continue;
+                        }
 
-                    v_int8x16 c0 = v_setzero_s8();
-                    v_int8x16 c1 = v_setzero_s8();
-                    v_uint8x16 max0 = v_setzero_u8();
-                    v_uint8x16 max1 = v_setzero_u8();
-                    for( k = 0; k < N; k++ )
-                    {
-                        v_int8x16 x = v_reinterpret_as_s8(v_load((ptr + pixel[k])) ^ delta);
-                        m0 = v0 < x;
-                        m1 = x < v1;
+                        v_int8x16 c0 = v_setzero_s8();
+                        v_int8x16 c1 = v_setzero_s8();
+                        v_uint8x16 max0 = v_setzero_u8();
+                        v_uint8x16 max1 = v_setzero_u8();
+                        for( k = 0; k < N; k++ )
+                        {
+                            v_int8x16 x = v_reinterpret_as_s8(v_load((ptr + pixel[k])) ^ delta);
+                            m0 = v0 < x;
+                            m1 = x < v1;
 
-                        c0 = v_sub_wrap(c0, m0) & m0;
-                        c1 = v_sub_wrap(c1, m1) & m1;
+                            c0 = v_sub_wrap(c0, m0) & m0;
+                            c1 = v_sub_wrap(c1, m1) & m1;
 
-                        max0 = v_max(max0, v_reinterpret_as_u8(c0));
-                        max1 = v_max(max1, v_reinterpret_as_u8(c1));
-                    }
+                            max0 = v_max(max0, v_reinterpret_as_u8(c0));
+                            max1 = v_max(max1, v_reinterpret_as_u8(c1));
+                        }
 
-                    max0 = v_max(max0, max1);
-                    int m = v_signmask(K16 < max0);
+                        max0 = v_max(max0, max1);
+                        int m = v_signmask(K16 < max0);
 
-                    for( k = 0; m > 0 && k < 16; k++, m >>= 1 )
-                    {
-                        if(m & 1)
+                        for( k = 0; m > 0 && k < 16; k++, m >>= 1 )
                         {
-                            cornerpos[ncorners++] = j+k;
-                            if(nonmax_suppression)
-                                curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold);
+                            if(m & 1)
+                            {
+                                cornerpos[ncorners++] = j+k;
+                                if(nonmax_suppression)
+                                    curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold);
+                            }
                         }
                     }
                 }
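
A design note on the dispatch above: process() takes j and ptr by reference and walks the row in 32-pixel steps, leaving both at the first unprocessed column, so the 16-wide universal-intrinsics loop that follows it simply continues from there and handles the tail. Both loops count contiguous arcs with the same mask arithmetic; a scalar model of that trick (illustrative only, the function name is mine):

#include <cstdint>

// Scalar model of the run-length trick from both SIMD loops. With m = 0xFF
// where the brighter/darker test passes and 0x00 where it fails,
// c = (c - m) & m increments the current streak on a pass
// (c - 0xFF == c + 1 mod 256) and resets it to zero on a fail.
static uint8_t longest_run(const bool* pass, int n)
{
    uint8_t c = 0, best = 0;
    for (int i = 0; i < n; i++)
    {
        uint8_t m = pass[i] ? 0xFFu : 0x00u;
        c = (uint8_t)((uint8_t)(c - m) & m);
        if (c > best)
            best = c;
    }
    return best;
}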
