namespace cv
{

+ #if CV_AVX
+ // load three 8-packed float vectors and deinterleave them
+ // (this helper probably belongs in a shared SIMD header rather than here)
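+ //
+ // plain AVX has no per-element cross-lane float shuffle (that arrived with
+ // AVX2), so the 3-channel deinterleave is built from unpacks and 128-bit
+ // permutes in three rounds. Scalar equivalent (illustrative sketch only):
+ //   for (int i = 0; i < 8; i++)
+ //   { a[i] = ptr[3*i]; b[i] = ptr[3*i + 1]; c[i] = ptr[3*i + 2]; }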
+ static inline void load_deinterleave(const float* ptr, __m256& a, __m256& b, __m256& c)
+ {
+     __m256 s0 = _mm256_loadu_ps(ptr);                     // a0, b0, c0, a1, b1, c1, a2, b2,
+     __m256 s1 = _mm256_loadu_ps(ptr + 8);                 // c2, a3, b3, c3, a4, b4, c4, a5,
+     __m256 s2 = _mm256_loadu_ps(ptr + 16);                // b5, c5, a6, b6, c6, a7, b7, c7,
+     __m256 s3 = _mm256_permute2f128_ps(s1, s2, 0x21);     // a4, b4, c4, a5, b5, c5, a6, b6,
+     __m256 s4 = _mm256_permute2f128_ps(s2, s2, 0x33);     // c6, a7, b7, c7, c6, a7, b7, c7,
+
+     __m256 v00 = _mm256_unpacklo_ps(s0, s3);              // a0, a4, b0, b4, b1, b5, c1, c5,
+     __m256 v01 = _mm256_unpackhi_ps(s0, s3);              // c0, c4, a1, a5, a2, a6, b2, b6,
+     __m256 v02 = _mm256_unpacklo_ps(s1, s4);              // c2, c6, a3, a7, x, x, x, x,
+     __m256 v03 = _mm256_unpackhi_ps(s1, s4);              // b3, b7, c3, c7, x, x, x, x,
+     __m256 v04 = _mm256_permute2f128_ps(v02, v03, 0x20);  // c2, c6, a3, a7, b3, b7, c3, c7,
+     __m256 v05 = _mm256_permute2f128_ps(v01, v03, 0x21);  // a2, a6, b2, b6, b3, b7, c3, c7,
+
+     __m256 v10 = _mm256_unpacklo_ps(v00, v05);            // a0, a2, a4, a6, b1, b3, b5, b7,
+     __m256 v11 = _mm256_unpackhi_ps(v00, v05);            // b0, b2, b4, b6, c1, c3, c5, c7,
+     __m256 v12 = _mm256_unpacklo_ps(v01, v04);            // c0, c2, c4, c6, x, x, x, x,
+     __m256 v13 = _mm256_unpackhi_ps(v01, v04);            // a1, a3, a5, a7, x, x, x, x,
+     __m256 v14 = _mm256_permute2f128_ps(v11, v12, 0x20);  // b0, b2, b4, b6, c0, c2, c4, c6,
+     __m256 v15 = _mm256_permute2f128_ps(v10, v11, 0x31);  // b1, b3, b5, b7, c1, c3, c5, c7,
+
+     __m256 v20 = _mm256_unpacklo_ps(v14, v15);            // b0, b1, b2, b3, c0, c1, c2, c3,
+     __m256 v21 = _mm256_unpackhi_ps(v14, v15);            // b4, b5, b6, b7, c4, c5, c6, c7,
+     __m256 v22 = _mm256_unpacklo_ps(v10, v13);            // a0, a1, a2, a3, x, x, x, x,
+     __m256 v23 = _mm256_unpackhi_ps(v10, v13);            // a4, a5, a6, a7, x, x, x, x,
+
+     a = _mm256_permute2f128_ps(v22, v23, 0x20);           // a0, a1, a2, a3, a4, a5, a6, a7,
+     b = _mm256_permute2f128_ps(v20, v21, 0x20);           // b0, b1, b2, b3, b4, b5, b6, b7,
+     c = _mm256_permute2f128_ps(v20, v21, 0x31);           // c0, c1, c2, c3, c4, c5, c6, c7,
+ }
+
+ // realign four 3-packed vectors into three 4-packed vectors
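+ // each s_i carries one 3-float group in its low 12 bytes; byte shifts and
+ // ORs splice the four groups into three fully packed 16-byte registers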
+ static inline void v_pack4x3to3x4(const __m128i& s0, const __m128i& s1, const __m128i& s2, const __m128i& s3, __m128i& d0, __m128i& d1, __m128i& d2)
+ {
+     d0 = _mm_or_si128(s0, _mm_slli_si128(s1, 12));
+     d1 = _mm_or_si128(_mm_srli_si128(s1, 4), _mm_slli_si128(s2, 8));
+     d2 = _mm_or_si128(_mm_srli_si128(s2, 8), _mm_slli_si128(s3, 4));
+ }
+
+ // split a 256-bit register into its low and high 128-bit halves, reinterpreted as __m128i
+ static inline void v_separate_lo_hi(const __m256& src, __m128i& lo, __m128i& hi)
+ {
+     lo = _mm_castps_si128(_mm256_castps256_ps128(src));
+     hi = _mm_castps_si128(_mm256_extractf128_ps(src, 1));
+ }
+
+ // interleave three 8-float vectors and store them
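+ // (scalar equivalent: for each i, ptr[3*i] = a[i]; ptr[3*i+1] = b[i]; ptr[3*i+2] = c[i])
+ // each 128-bit half is transposed as a 4x4 block padded with a zero column,
+ // then v_pack4x3to3x4 drops the padding before the three contiguous stores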
+ static inline void store_interleave(float* ptr, const __m256& a, const __m256& b, const __m256& c)
+ {
+     __m128i a0, a1, b0, b1, c0, c1;
+     v_separate_lo_hi(a, a0, a1);
+     v_separate_lo_hi(b, b0, b1);
+     v_separate_lo_hi(c, c0, c1);
+
+     v_uint32x4 z = v_setzero_u32();
+     v_uint32x4 u0, u1, u2, u3;
+     v_transpose4x4(v_uint32x4(a0), v_uint32x4(b0), v_uint32x4(c0), z, u0, u1, u2, u3);
+     v_pack4x3to3x4(u0.val, u1.val, u2.val, u3.val, a0, b0, c0);
+     v_transpose4x4(v_uint32x4(a1), v_uint32x4(b1), v_uint32x4(c1), z, u0, u1, u2, u3);
+     v_pack4x3to3x4(u0.val, u1.val, u2.val, u3.val, a1, b1, c1);
+
+     _mm256_storeu_ps(ptr, _mm256_setr_m128(_mm_castsi128_ps(a0), _mm_castsi128_ps(b0)));
+     _mm256_storeu_ps(ptr + 8, _mm256_setr_m128(_mm_castsi128_ps(c0), _mm_castsi128_ps(a1)));
+     _mm256_storeu_ps(ptr + 16, _mm256_setr_m128(_mm_castsi128_ps(b1), _mm_castsi128_ps(c1)));
+ }
+ #endif // CV_AVX
+
static void calcMinEigenVal( const Mat& _cov, Mat& _dst )
{
    int i, j;
    Size size = _cov.size();
+ #if CV_AVX
+     bool haveAvx = checkHardwareSupport(CV_CPU_AVX);
+ #endif
  #if CV_SIMD128
-     bool simd = hasSIMD128();
+     bool haveSimd = hasSIMD128();
  #endif

    if( _cov.isContinuous() && _dst.isContinuous() )
@@ -67,8 +141,25 @@ static void calcMinEigenVal( const Mat& _cov, Mat& _dst )
        const float* cov = _cov.ptr<float>(i);
        float* dst = _dst.ptr<float>(i);
        j = 0;
- #if CV_SIMD128
-         if( simd )
+ #if CV_AVX
+         if( haveAvx )
+         {
+             __m256 half = _mm256_set1_ps(0.5f);
+             for( ; j <= size.width - 8; j += 8 )
+             {
+                 __m256 v_a, v_b, v_c, v_t;
+                 load_deinterleave(cov + j*3, v_a, v_b, v_c);
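+                 // smaller eigenvalue of the 2x2 tensor [[a, b], [b, c]]:
+                 // (a + c)/2 - sqrt(((a - c)/2)^2 + b^2); a and c are pre-halved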
+                 v_a = _mm256_mul_ps(v_a, half);
+                 v_c = _mm256_mul_ps(v_c, half);
+                 v_t = _mm256_sub_ps(v_a, v_c);
+                 v_t = _mm256_add_ps(_mm256_mul_ps(v_b, v_b), _mm256_mul_ps(v_t, v_t));
+                 _mm256_storeu_ps(dst + j, _mm256_sub_ps(_mm256_add_ps(v_a, v_c), _mm256_sqrt_ps(v_t)));
+             }
+         }
+ #endif // CV_AVX
+
+ #if CV_SIMD128
+         if( haveSimd )
        {
            v_float32x4 half = v_setall_f32(0.5f);
            for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes )
@@ -82,7 +173,8 @@ static void calcMinEigenVal( const Mat& _cov, Mat& _dst )
                v_store(dst + j, (v_a + v_c) - v_sqrt(v_t));
            }
        }
- #endif
+ #endif // CV_SIMD128
+
        for( ; j < size.width; j++ )
        {
            float a = cov[j*3]*0.5f;
@@ -98,8 +190,11 @@ static void calcHarris( const Mat& _cov, Mat& _dst, double k )
{
    int i, j;
    Size size = _cov.size();
+ #if CV_AVX
+     bool haveAvx = checkHardwareSupport(CV_CPU_AVX);
+ #endif
  #if CV_SIMD128
-     bool simd = hasSIMD128();
+     bool haveSimd = hasSIMD128();
  #endif

    if( _cov.isContinuous() && _dst.isContinuous() )
@@ -114,8 +209,26 @@ static void calcHarris( const Mat& _cov, Mat& _dst, double k )
        float* dst = _dst.ptr<float>(i);
        j = 0;

- #if CV_SIMD128
-         if( simd )
+ #if CV_AVX
+         if( haveAvx )
+         {
+             __m256 v_k = _mm256_set1_ps((float)k);
+
+             for( ; j <= size.width - 8; j += 8 )
+             {
+                 __m256 v_a, v_b, v_c;
+                 load_deinterleave(cov + j * 3, v_a, v_b, v_c);
+
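+                 // Harris response: det(M) - k*trace(M)^2 = (a*c - b*b) - k*(a + c)^2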
+                 __m256 v_ac_bb = _mm256_sub_ps(_mm256_mul_ps(v_a, v_c), _mm256_mul_ps(v_b, v_b));
+                 __m256 v_ac = _mm256_add_ps(v_a, v_c);
+                 __m256 v_dst = _mm256_sub_ps(v_ac_bb, _mm256_mul_ps(v_k, _mm256_mul_ps(v_ac, v_ac)));
+                 _mm256_storeu_ps(dst + j, v_dst);
+             }
+         }
+ #endif // CV_AVX
+
+ #if CV_SIMD128
+         if( haveSimd )
        {
            v_float32x4 v_k = v_setall_f32((float)k);

@@ -130,7 +243,7 @@ static void calcHarris( const Mat& _cov, Mat& _dst, double k )
                v_store(dst + j, v_dst);
            }
        }
- #endif
+ #endif // CV_SIMD128

        for( ; j < size.width; j++ )
        {
@@ -231,6 +344,9 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size,
    if (tegra::useTegra() && tegra::cornerEigenValsVecs(src, eigenv, block_size, aperture_size, op_type, k, borderType))
        return;
  #endif
+ #if CV_AVX
+     bool haveAvx = checkHardwareSupport(CV_CPU_AVX);
+ #endif
  #if CV_SIMD128
      bool haveSimd = hasSIMD128();
  #endif
@@ -268,8 +384,26 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size,
        const float* dydata = Dy.ptr<float>(i);
        j = 0;

- #if CV_SIMD128
-         if (haveSimd)
+ #if CV_AVX
+         if( haveAvx )
+         {
+             for( ; j <= size.width - 8; j += 8 )
+             {
+                 __m256 v_dx = _mm256_loadu_ps(dxdata + j);
+                 __m256 v_dy = _mm256_loadu_ps(dydata + j);
+
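+                 // per-pixel structure tensor entries (Dx*Dx, Dx*Dy, Dy*Dy),
+                 // written back interleaved as a 3-channel row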
+                 __m256 v_dst0, v_dst1, v_dst2;
+                 v_dst0 = _mm256_mul_ps(v_dx, v_dx);
+                 v_dst1 = _mm256_mul_ps(v_dx, v_dy);
+                 v_dst2 = _mm256_mul_ps(v_dy, v_dy);
+
+                 store_interleave(cov_data + j * 3, v_dst0, v_dst1, v_dst2);
+             }
+         }
+ #endif // CV_AVX
+
+ #if CV_SIMD128
+         if( haveSimd )
        {
            for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes )
            {
@@ -284,7 +418,7 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size,
                v_store_interleave(cov_data + j * 3, v_dst0, v_dst1, v_dst2);
            }
        }
- #endif
+ #endif // CV_SIMD128

        for( ; j < size.width; j++ )
        {