@@ -136,6 +136,10 @@ void cv::initUndistortRectifyMap( InputArray _cameraMatrix, InputArray _distCoef
cv::Matx33d matTilt = cv::Matx33d::eye();
cv::detail::computeTiltProjectionMatrix(tauX, tauY, &matTilt);

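+ // Check AVX2 availability once at runtime; the vectorized block below is taken only when the CPU supports it.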
+ #if CV_AVX2
+ bool USE_AVX2 = cv::checkHardwareSupport(CV_CPU_AVX2);
+ #endif
+
for( int i = 0; i < size.height; i++ )
{
float* m1f = map1.ptr<float>(i);
@@ -144,7 +148,150 @@ void cv::initUndistortRectifyMap( InputArray _cameraMatrix, InputArray _distCoef
ushort* m2 = (ushort*)m2f;
double _x = i*ir[1] + ir[2], _y = i*ir[4] + ir[5], _w = i*ir[7] + ir[8];

- for( int j = 0; j < size.width; j++, _x += ir[0], _y += ir[3], _w += ir[6] )
+ int j = 0;
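+ // The column index now lives outside the loop so the AVX2 block can advance it four columns at a time and the scalar loop can finish the rest.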
+ #if CV_AVX2
+ if( USE_AVX2 )
+ {
+ static const __m256d __one = _mm256_set1_pd(1.0);
+ static const __m256d __two = _mm256_set1_pd(2.0);
+
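+ // Broadcast each element of the 3x3 tilt projection matrix into its own register.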
+ const __m256d __matTilt_00 = _mm256_set1_pd(matTilt(0, 0));
+ const __m256d __matTilt_10 = _mm256_set1_pd(matTilt(1, 0));
+ const __m256d __matTilt_20 = _mm256_set1_pd(matTilt(2, 0));
+
+ const __m256d __matTilt_01 = _mm256_set1_pd(matTilt(0, 1));
+ const __m256d __matTilt_11 = _mm256_set1_pd(matTilt(1, 1));
+ const __m256d __matTilt_21 = _mm256_set1_pd(matTilt(2, 1));
+
+ const __m256d __matTilt_02 = _mm256_set1_pd(matTilt(0, 2));
+ const __m256d __matTilt_12 = _mm256_set1_pd(matTilt(1, 2));
+ const __m256d __matTilt_22 = _mm256_set1_pd(matTilt(2, 2));
+
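+ // Main vector loop: four pixels per iteration; columns that do not fill a whole vector are left to the scalar loop below.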
+ for( ; j <= size.width - 4; j += 4, _x += 4*ir[0], _y += 4*ir[3], _w += 4*ir[6] )
+ {
+ // Question: Should we load the constants first?
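+ // Undo the perspective division of the rectified ray for four consecutive columns: x = _x/_w, y = _y/_w.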
+ __m256d __w = _mm256_div_pd(__one, _mm256_set_pd(_w + 3*ir[6], _w + 2*ir[6], _w + ir[6], _w));
+ __m256d __x = _mm256_mul_pd(_mm256_set_pd(_x + 3*ir[0], _x + 2*ir[0], _x + ir[0], _x), __w);
+ __m256d __y = _mm256_mul_pd(_mm256_set_pd(_y + 3*ir[3], _y + 2*ir[3], _y + ir[3], _y), __w);
+ __m256d __x2 = _mm256_mul_pd(__x, __x);
+ __m256d __y2 = _mm256_mul_pd(__y, __y);
+ __m256d __r2 = _mm256_add_pd(__x2, __y2);
+ __m256d __2xy = _mm256_mul_pd(__two, _mm256_mul_pd(__x, __y));
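+ // Rational radial factor kr = (1 + k1*r^2 + k2*r^4 + k3*r^6) / (1 + k4*r^2 + k5*r^4 + k6*r^6), Horner form (FMA when available).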
+ __m256d __kr = _mm256_div_pd(
+ #if CV_FMA3
+ _mm256_fmadd_pd(_mm256_fmadd_pd(_mm256_fmadd_pd(_mm256_set1_pd(k3), __r2, _mm256_set1_pd(k2)), __r2, _mm256_set1_pd(k1)), __r2, __one),
+ _mm256_fmadd_pd(_mm256_fmadd_pd(_mm256_fmadd_pd(_mm256_set1_pd(k6), __r2, _mm256_set1_pd(k5)), __r2, _mm256_set1_pd(k4)), __r2, __one)
+ #else
+ _mm256_add_pd(__one, _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set1_pd(k3), __r2), _mm256_set1_pd(k2)), __r2), _mm256_set1_pd(k1)), __r2)),
+ _mm256_add_pd(__one, _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set1_pd(k6), __r2), _mm256_set1_pd(k5)), __r2), _mm256_set1_pd(k4)), __r2))
+ #endif
+ );
+ __m256d __r22 = _mm256_mul_pd(__r2, __r2);
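+ // Distorted coordinates: the radial term x*kr (resp. y*kr) plus the tangential (p1, p2) and thin-prism (s1..s4) terms.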
+ #if CV_FMA3
+ __m256d __xd = _mm256_fmadd_pd(__x, __kr,
+ _mm256_add_pd(
+ _mm256_fmadd_pd(_mm256_set1_pd(p1), __2xy, _mm256_mul_pd(_mm256_set1_pd(p2), _mm256_fmadd_pd(__two, __x2, __r2))),
+ _mm256_fmadd_pd(_mm256_set1_pd(s1), __r2, _mm256_mul_pd(_mm256_set1_pd(s2), __r22))));
+ __m256d __yd = _mm256_fmadd_pd(__y, __kr,
+ _mm256_add_pd(
+ _mm256_fmadd_pd(_mm256_set1_pd(p1), _mm256_fmadd_pd(__two, __y2, __r2), _mm256_mul_pd(_mm256_set1_pd(p2), __2xy)),
+ _mm256_fmadd_pd(_mm256_set1_pd(s3), __r2, _mm256_mul_pd(_mm256_set1_pd(s4), __r22))));
+
+ __m256d __vecTilt2 = _mm256_fmadd_pd(__matTilt_20, __xd, _mm256_fmadd_pd(__matTilt_21, __yd, __matTilt_22));
+ #else
+ __m256d __xd = _mm256_add_pd(
+ _mm256_mul_pd(__x, __kr),
+ _mm256_add_pd(
+ _mm256_add_pd(
+ _mm256_mul_pd(_mm256_set1_pd(p1), __2xy),
+ _mm256_mul_pd(_mm256_set1_pd(p2), _mm256_add_pd(__r2, _mm256_mul_pd(__two, __x2)))),
+ _mm256_add_pd(
+ _mm256_mul_pd(_mm256_set1_pd(s1), __r2),
+ _mm256_mul_pd(_mm256_set1_pd(s2), __r22))));
+ __m256d __yd = _mm256_add_pd(
+ _mm256_mul_pd(__y, __kr),
+ _mm256_add_pd(
+ _mm256_add_pd(
+ _mm256_mul_pd(_mm256_set1_pd(p1), _mm256_add_pd(__r2, _mm256_mul_pd(__two, __y2))),
+ _mm256_mul_pd(_mm256_set1_pd(p2), __2xy)),
+ _mm256_add_pd(
+ _mm256_mul_pd(_mm256_set1_pd(s3), __r2),
+ _mm256_mul_pd(_mm256_set1_pd(s4), __r22))));
+
+ __m256d __vecTilt2 = _mm256_add_pd(_mm256_add_pd(
+ _mm256_mul_pd(__matTilt_20, __xd), _mm256_mul_pd(__matTilt_21, __yd)), __matTilt_22);
+ #endif
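+ // Perspective factor of the tilted sensor model: 1/vecTilt(2), falling back to 1 where that component is zero (as in the scalar path).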
+ // blendv selects its second operand where the mask is set, so zero lanes get 1.0 and all others get 1/__vecTilt2
+ __m256d __invProj = _mm256_blendv_pd(
+ _mm256_div_pd(__one, __vecTilt2), __one,
+ _mm256_cmp_pd(__vecTilt2, _mm256_setzero_pd(), _CMP_EQ_OQ));
+
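+ // Apply the tilt projection and the pinhole intrinsics: u = fx*invProj*vecTilt(0) + u0, v = fy*invProj*vecTilt(1) + v0.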
+ #if CV_FMA3
+ __m256d __u = _mm256_fmadd_pd(__matTilt_00, __xd, _mm256_fmadd_pd(__matTilt_01, __yd, __matTilt_02));
+ __u = _mm256_fmadd_pd(_mm256_mul_pd(_mm256_set1_pd(fx), __invProj), __u, _mm256_set1_pd(u0));
+
+ __m256d __v = _mm256_fmadd_pd(__matTilt_10, __xd, _mm256_fmadd_pd(__matTilt_11, __yd, __matTilt_12));
+ __v = _mm256_fmadd_pd(_mm256_mul_pd(_mm256_set1_pd(fy), __invProj), __v, _mm256_set1_pd(v0));
+ #else
+ __m256d __u = _mm256_add_pd(_mm256_add_pd(
+ _mm256_mul_pd(__matTilt_00, __xd), _mm256_mul_pd(__matTilt_01, __yd)), __matTilt_02);
+ __u = _mm256_add_pd(_mm256_mul_pd(_mm256_mul_pd(_mm256_set1_pd(fx), __invProj), __u), _mm256_set1_pd(u0));
+
+ __m256d __v = _mm256_add_pd(_mm256_add_pd(
+ _mm256_mul_pd(__matTilt_10, __xd), _mm256_mul_pd(__matTilt_11, __yd)), __matTilt_12);
+ __v = _mm256_add_pd(_mm256_mul_pd(_mm256_mul_pd(_mm256_set1_pd(fy), __invProj), __v), _mm256_set1_pd(v0));
+ #endif
+
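+ // Write the four (u, v) results in the layout requested by m1type.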
+ if( m1type == CV_32FC1 )
+ {
+ _mm_storeu_ps(&m1f[j], _mm256_cvtpd_ps(__u));
+ _mm_storeu_ps(&m2f[j], _mm256_cvtpd_ps(__v));
+ }
+ else if( m1type == CV_32FC2 )
+ {
+ __m128 __u_float = _mm256_cvtpd_ps(__u);
+ __m128 __v_float = _mm256_cvtpd_ps(__v);
+
+ _mm_storeu_ps(&m1f[j*2], _mm_unpacklo_ps(__u_float, __v_float));
+ _mm_storeu_ps(&m1f[j*2 + 4], _mm_unpackhi_ps(__u_float, __v_float));
+ }
+ else // m1type == CV_16SC2
+ {
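+ // Fixed-point output: scale by INTER_TAB_SIZE so the integer part (>> INTER_BITS) goes to m1 and the low INTER_BITS bits index the interpolation tables through m2.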
+ __u = _mm256_mul_pd(__u, _mm256_set1_pd(INTER_TAB_SIZE));
+ __v = _mm256_mul_pd(__v, _mm256_set1_pd(INTER_TAB_SIZE));
+
+ __m128 __u_float = _mm256_cvtpd_ps(__u);
+ __m128 __v_float = _mm256_cvtpd_ps(__v);
+ _mm256_zeroupper();
+ static const __m128 __int_max = _mm_set1_ps(std::numeric_limits<int>::max());
+ static const __m128 __int_min = _mm_set1_ps(std::numeric_limits<int>::min());
+ __u_float = _mm_max_ps(_mm_min_ps(__u_float, __int_max), __int_min);
+ __v_float = _mm_max_ps(_mm_min_ps(__v_float, __int_max), __int_min);
+
+ __m128i __iu = _mm_cvtps_epi32(__u_float);
+ __m128i __iv = _mm_cvtps_epi32(__v_float);
+
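+ // m2 packs the fractional parts, (iv & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (iu & (INTER_TAB_SIZE-1)), one ushort per pixel.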
+ static const __m128i __INTER_TAB_SIZE_m1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);
+ __m128i __m2 = _mm_add_epi32(
+ _mm_mullo_epi32(_mm_and_si128(__iv, __INTER_TAB_SIZE_m1), _mm_set1_epi32(INTER_TAB_SIZE)),
+ _mm_and_si128(__iu, __INTER_TAB_SIZE_m1));
+ __m2 = _mm_packus_epi32(__m2, __m2); // saturate the four 32-bit values down to ushort
+ _mm_maskstore_epi64((long long int*) &m2[j], _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF), __m2); // store only the low 64 bits (four ushorts)
+
+ // gcc4.9 does not support _mm256_set_m128
+ // __m256i __m1 = _mm256_set_m128i(__iv, __iu);
+ __m256i __m1 = _mm256_setzero_si256();
+ __m1 = _mm256_inserti128_si256(__m1, __iu, 0);
+ __m1 = _mm256_inserti128_si256(__m1, __iv, 1);
+ __m1 = _mm256_srli_epi32(__m1, INTER_BITS); // v3 v2 v1 v0 u3 u2 u1 u0 (int32_t)
+ static const __m256i __permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+ __m1 = _mm256_permutevar8x32_epi32(__m1, __permute_mask); // v3 u3 v2 u2 v1 u1 v0 u0 (int32_t)
+ __m1 = _mm256_packs_epi32(__m1, __m1); // x x x x v3 u3 v2 u2 x x x x v1 u1 v0 u0 (int16_t)
+ _mm_storeu_si128((__m128i*) &m1[j*2], _mm256_extracti128_si256(_mm256_permute4x64_epi64(__m1, (2 << 2) + 0), 0));
+ }
+ }
+ }
+ #endif
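+ // Scalar fallback: finishes the remaining columns, or the whole row when the AVX2 path is not taken.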
+ for( ; j < size.width; j++, _x += ir[0], _y += ir[3], _w += ir[6] )
{
double w = 1./_w, x = _x*w, y = _y*w;
double x2 = x*x, y2 = y*y;