/* This is FAST corner detector, contributed to OpenCV by the author, Edward Rosten.
   Below is the original copyright and the references */

/*
Copyright (c) 2006, 2008 Edward Rosten
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

  *Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

  *Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

  *Neither the name of the University of Cambridge nor the names of
   its contributors may be used to endorse or promote products derived
   from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
The references are:
 * Machine learning for high-speed corner detection,
   E. Rosten and T. Drummond, ECCV 2006
 * Faster and better: A machine learning approach to corner detection
   E. Rosten, R. Porter and T. Drummond, PAMI, 2009
*/

#include "precomp.hpp"
#include "fast.hpp"
#include "opencv2/core/hal/intrin.hpp"

namespace cv
{
namespace opt_AVX2
{

class FAST_t_patternSize16_AVX2_Impl: public FAST_t_patternSize16_AVX2
{
public:
    FAST_t_patternSize16_AVX2_Impl(int _cols, int _threshold, bool _nonmax_suppression, const int* _pixel):
        cols(_cols), nonmax_suppression(_nonmax_suppression), pixel(_pixel)
    {
        //patternSize = 16
        threshold = std::min(std::max(_threshold, 0), 255);
        t256c = (char)threshold;   // cast the clamped value so an out-of-range threshold cannot wrap
    }

    virtual void process(int &j, const uchar* &ptr, uchar* curr, int* cornerpos, int &ncorners)
    {
        static const __m256i delta256 = _mm256_broadcastsi128_si256(_mm_set1_epi8((char)(-128))), K16_256 = _mm256_broadcastsi128_si256(_mm_set1_epi8((char)8));
        const __m256i t256 = _mm256_broadcastsi128_si256(_mm_set1_epi8(t256c));
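        // delta256 (0x80 in every byte) remaps unsigned pixel values into the signed byte range so
        // that _mm256_cmpgt_epi8 gives the right ordering; K16_256 holds K = 8, the run length a
        // patternSize-16 corner must exceed; t256 is the detection threshold broadcast to 32 lanes.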
        for (; j < cols - 32 - 3; j += 32, ptr += 32)
        {
            __m256i m0, m1;
            __m256i v0 = _mm256_loadu_si256((const __m256i*)ptr);

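            // v1 and v0 become the "darker than p - t" and "brighter than p + t" bounds for each of
            // the 32 center pixels, computed with saturating byte arithmetic and then shifted into
            // the signed range via delta256.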
            __m256i v1 = _mm256_xor_si256(_mm256_subs_epu8(v0, t256), delta256);
            v0 = _mm256_xor_si256(_mm256_adds_epu8(v0, t256), delta256);

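            // Quick rejection test: x0..x3 sample the four quarter points of the 16-pixel circle
            // (offsets 0, 4, 8, 12). A corner needs at least 9 contiguous brighter or darker circle
            // pixels, which forces two adjacent quarter points onto the same side of the threshold,
            // so any lane failing this test cannot be a corner.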
            __m256i x0 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[0])), delta256);
            __m256i x1 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[4])), delta256);
            __m256i x2 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[8])), delta256);
            __m256i x3 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[12])), delta256);

            m0 = _mm256_and_si256(_mm256_cmpgt_epi8(x0, v0), _mm256_cmpgt_epi8(x1, v0));
            m1 = _mm256_and_si256(_mm256_cmpgt_epi8(v1, x0), _mm256_cmpgt_epi8(v1, x1));
            m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x1, v0), _mm256_cmpgt_epi8(x2, v0)));
            m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x1), _mm256_cmpgt_epi8(v1, x2)));
            m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x2, v0), _mm256_cmpgt_epi8(x3, v0)));
            m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x2), _mm256_cmpgt_epi8(v1, x3)));
            m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x3, v0), _mm256_cmpgt_epi8(x0, v0)));
            m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x3), _mm256_cmpgt_epi8(v1, x0)));
            m0 = _mm256_or_si256(m0, m1);

            unsigned int mask = _mm256_movemask_epi8(m0); //unsigned is important!
            if (mask == 0)
            {
                continue;
            }
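            // No candidates in the lower 16 lanes: back up half a vector so the loop increment
            // advances by only 16 columns and the upper-half candidates are re-examined as the
            // lower half of the next window, instead of running the counting loop for a half-empty one.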
            if ((mask & 0xffff) == 0)
            {
                j -= 16;
                ptr -= 16;
                continue;
            }

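            // For every lane, count the longest run of contiguous circle pixels that are all
            // brighter than p + t (c0) or all darker than p - t (c1). The comparison masks are
            // 0 or -1, so c = (c - m) & m increments the counter while the test holds and resets
            // it otherwise; pixel[] holds 16 + 9 wrapped offsets so runs crossing index 0 are counted.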
            __m256i c0 = _mm256_setzero_si256(), c1 = c0, max0 = c0, max1 = c0;
            for (int k = 0; k < 25; k++)
            {
                __m256i x = _mm256_xor_si256(_mm256_loadu_si256((const __m256i*)(ptr + pixel[k])), delta256);
                m0 = _mm256_cmpgt_epi8(x, v0);
                m1 = _mm256_cmpgt_epi8(v1, x);

                c0 = _mm256_and_si256(_mm256_sub_epi8(c0, m0), m0);
                c1 = _mm256_and_si256(_mm256_sub_epi8(c1, m1), m1);

                max0 = _mm256_max_epu8(max0, c0);
                max1 = _mm256_max_epu8(max1, c1);
            }

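            // A lane is a corner when its longest run exceeds K16_256 (8), i.e. at least 9
            // contiguous circle pixels pass the test; m collects one bit per lane.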
            max0 = _mm256_max_epu8(max0, max1);
            unsigned int m = _mm256_movemask_epi8(_mm256_cmpgt_epi8(max0, K16_256));

            for (int k = 0; m > 0 && k < 32; k++, m >>= 1)
                if (m & 1)
                {
                    cornerpos[ncorners++] = j + k;
                    if (nonmax_suppression)
                    {
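                        // Corner score for non-max suppression: the largest threshold at which
                        // the pixel would still pass the segment test, evaluated over the 16
                        // possible 9-pixel arcs with universal intrinsics (equivalent to the
                        // scalar corner score used elsewhere in OpenCV's FAST).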
                        short d[25];
                        for (int q = 0; q < 25; q++)
                            d[q] = (short)(ptr[k] - ptr[k + pixel[q]]);
                        v_int16x8 q0 = v_setall_s16(-1000), q1 = v_setall_s16(1000);
                        for (int q = 0; q < 16; q += 8)
                        {
                            v_int16x8 v0_ = v_load(d + q + 1);
                            v_int16x8 v1_ = v_load(d + q + 2);
                            v_int16x8 a = v_min(v0_, v1_);
                            v_int16x8 b = v_max(v0_, v1_);
                            v0_ = v_load(d + q + 3);
                            a = v_min(a, v0_);
                            b = v_max(b, v0_);
                            v0_ = v_load(d + q + 4);
                            a = v_min(a, v0_);
                            b = v_max(b, v0_);
                            v0_ = v_load(d + q + 5);
                            a = v_min(a, v0_);
                            b = v_max(b, v0_);
                            v0_ = v_load(d + q + 6);
                            a = v_min(a, v0_);
                            b = v_max(b, v0_);
                            v0_ = v_load(d + q + 7);
                            a = v_min(a, v0_);
                            b = v_max(b, v0_);
                            v0_ = v_load(d + q + 8);
                            a = v_min(a, v0_);
                            b = v_max(b, v0_);
                            v0_ = v_load(d + q);
                            q0 = v_max(q0, v_min(a, v0_));
                            q1 = v_min(q1, v_max(b, v0_));
                            v0_ = v_load(d + q + 9);
                            q0 = v_max(q0, v_min(a, v0_));
                            q1 = v_min(q1, v_max(b, v0_));
                        }
                        q0 = v_max(q0, v_setzero_s16() - q1);
                        curr[j + k] = (uchar)(v_reduce_max(q0) - 1);
                    }
                }
        }
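        // Clear the upper halves of the YMM registers to avoid AVX/SSE transition penalties
        // in subsequent non-AVX code.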
        _mm256_zeroupper();
    }

    virtual ~FAST_t_patternSize16_AVX2_Impl() {}

private:
    int cols;
    char t256c;
    int threshold;
    bool nonmax_suppression;
    const int* pixel;
};

Ptr<FAST_t_patternSize16_AVX2> FAST_t_patternSize16_AVX2::getImpl(int _cols, int _threshold, bool _nonmax_suppression, const int* _pixel)
{
    return Ptr<FAST_t_patternSize16_AVX2>(new FAST_t_patternSize16_AVX2_Impl(_cols, _threshold, _nonmax_suppression, _pixel));
}

} // namespace opt_AVX2
} // namespace cv