
Commit f6dd549

Merge pull request opencv#9027 from terfendail:undistort_avx
2 parents: 454bc7a + 526d1d6

File tree

3 files changed: +265 additions, -142 deletions

Lines changed: 200 additions & 0 deletions
@@ -0,0 +1,200 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
#include "undistort.hpp"

namespace cv
{

int initUndistortRectifyMapLine_AVX(float* m1f, float* m2f, short* m1, ushort* m2, double* matTilt, const double* ir,
                                    double& _x, double& _y, double& _w, int width, int m1type,
                                    double& k1, double& k2, double& k3, double& k4, double& k5, double& k6,
                                    double& p1, double& p2, double& s1, double& s2, double& s3, double& s4,
                                    double& u0, double& v0, double& fx, double& fy)
{
    int j = 0;

    static const __m256d __one = _mm256_set1_pd(1.0);
    static const __m256d __two = _mm256_set1_pd(2.0);

    const __m256d __matTilt_00 = _mm256_set1_pd(matTilt[0]);
    const __m256d __matTilt_10 = _mm256_set1_pd(matTilt[3]);
    const __m256d __matTilt_20 = _mm256_set1_pd(matTilt[6]);

    const __m256d __matTilt_01 = _mm256_set1_pd(matTilt[1]);
    const __m256d __matTilt_11 = _mm256_set1_pd(matTilt[4]);
    const __m256d __matTilt_21 = _mm256_set1_pd(matTilt[7]);

    const __m256d __matTilt_02 = _mm256_set1_pd(matTilt[2]);
    const __m256d __matTilt_12 = _mm256_set1_pd(matTilt[5]);
    const __m256d __matTilt_22 = _mm256_set1_pd(matTilt[8]);

    for (; j <= width - 4; j += 4, _x += 4 * ir[0], _y += 4 * ir[3], _w += 4 * ir[6])
    {
        // Question: Should we load the constants first?
        __m256d __w = _mm256_div_pd(__one, _mm256_set_pd(_w + 3 * ir[6], _w + 2 * ir[6], _w + ir[6], _w));
        __m256d __x = _mm256_mul_pd(_mm256_set_pd(_x + 3 * ir[0], _x + 2 * ir[0], _x + ir[0], _x), __w);
        __m256d __y = _mm256_mul_pd(_mm256_set_pd(_y + 3 * ir[3], _y + 2 * ir[3], _y + ir[3], _y), __w);
        __m256d __x2 = _mm256_mul_pd(__x, __x);
        __m256d __y2 = _mm256_mul_pd(__y, __y);
        __m256d __r2 = _mm256_add_pd(__x2, __y2);
        __m256d __2xy = _mm256_mul_pd(__two, _mm256_mul_pd(__x, __y));
        __m256d __kr = _mm256_div_pd(
#if CV_FMA3
            _mm256_fmadd_pd(_mm256_fmadd_pd(_mm256_fmadd_pd(_mm256_set1_pd(k3), __r2, _mm256_set1_pd(k2)), __r2, _mm256_set1_pd(k1)), __r2, __one),
            _mm256_fmadd_pd(_mm256_fmadd_pd(_mm256_fmadd_pd(_mm256_set1_pd(k6), __r2, _mm256_set1_pd(k5)), __r2, _mm256_set1_pd(k4)), __r2, __one)
#else
            _mm256_add_pd(__one, _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set1_pd(k3), __r2), _mm256_set1_pd(k2)), __r2), _mm256_set1_pd(k1)), __r2)),
            _mm256_add_pd(__one, _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set1_pd(k6), __r2), _mm256_set1_pd(k5)), __r2), _mm256_set1_pd(k4)), __r2))
#endif
        );
        __m256d __r22 = _mm256_mul_pd(__r2, __r2);
#if CV_FMA3
        __m256d __xd = _mm256_fmadd_pd(__x, __kr,
            _mm256_add_pd(
                _mm256_fmadd_pd(_mm256_set1_pd(p1), __2xy, _mm256_mul_pd(_mm256_set1_pd(p2), _mm256_fmadd_pd(__two, __x2, __r2))),
                _mm256_fmadd_pd(_mm256_set1_pd(s1), __r2, _mm256_mul_pd(_mm256_set1_pd(s2), __r22))));
        __m256d __yd = _mm256_fmadd_pd(__y, __kr,
            _mm256_add_pd(
                _mm256_fmadd_pd(_mm256_set1_pd(p1), _mm256_fmadd_pd(__two, __y2, __r2), _mm256_mul_pd(_mm256_set1_pd(p2), __2xy)),
                _mm256_fmadd_pd(_mm256_set1_pd(s3), __r2, _mm256_mul_pd(_mm256_set1_pd(s4), __r22))));

        __m256d __vecTilt2 = _mm256_fmadd_pd(__matTilt_20, __xd, _mm256_fmadd_pd(__matTilt_21, __yd, __matTilt_22));
#else
        __m256d __xd = _mm256_add_pd(
            _mm256_mul_pd(__x, __kr),
            _mm256_add_pd(
                _mm256_add_pd(
                    _mm256_mul_pd(_mm256_set1_pd(p1), __2xy),
                    _mm256_mul_pd(_mm256_set1_pd(p2), _mm256_add_pd(__r2, _mm256_mul_pd(__two, __x2)))),
                _mm256_add_pd(
                    _mm256_mul_pd(_mm256_set1_pd(s1), __r2),
                    _mm256_mul_pd(_mm256_set1_pd(s2), __r22))));
        __m256d __yd = _mm256_add_pd(
            _mm256_mul_pd(__y, __kr),
            _mm256_add_pd(
                _mm256_add_pd(
                    _mm256_mul_pd(_mm256_set1_pd(p1), _mm256_add_pd(__r2, _mm256_mul_pd(__two, __y2))),
                    _mm256_mul_pd(_mm256_set1_pd(p2), __2xy)),
                _mm256_add_pd(
                    _mm256_mul_pd(_mm256_set1_pd(s3), __r2),
                    _mm256_mul_pd(_mm256_set1_pd(s4), __r22))));

        __m256d __vecTilt2 = _mm256_add_pd(_mm256_add_pd(
            _mm256_mul_pd(__matTilt_20, __xd), _mm256_mul_pd(__matTilt_21, __yd)), __matTilt_22);
#endif
        __m256d __invProj = _mm256_blendv_pd(
            __one, _mm256_div_pd(__one, __vecTilt2),
            _mm256_cmp_pd(__vecTilt2, _mm256_setzero_pd(), _CMP_EQ_OQ));

#if CV_FMA3
        __m256d __u = _mm256_fmadd_pd(__matTilt_00, __xd, _mm256_fmadd_pd(__matTilt_01, __yd, __matTilt_02));
        __u = _mm256_fmadd_pd(_mm256_mul_pd(_mm256_set1_pd(fx), __invProj), __u, _mm256_set1_pd(u0));

        __m256d __v = _mm256_fmadd_pd(__matTilt_10, __xd, _mm256_fmadd_pd(__matTilt_11, __yd, __matTilt_12));
        __v = _mm256_fmadd_pd(_mm256_mul_pd(_mm256_set1_pd(fy), __invProj), __v, _mm256_set1_pd(v0));
#else
        __m256d __u = _mm256_add_pd(_mm256_add_pd(
            _mm256_mul_pd(__matTilt_00, __xd), _mm256_mul_pd(__matTilt_01, __yd)), __matTilt_02);
        __u = _mm256_add_pd(_mm256_mul_pd(_mm256_mul_pd(_mm256_set1_pd(fx), __invProj), __u), _mm256_set1_pd(u0));

        __m256d __v = _mm256_add_pd(_mm256_add_pd(
            _mm256_mul_pd(__matTilt_10, __xd), _mm256_mul_pd(__matTilt_11, __yd)), __matTilt_12);
        __v = _mm256_add_pd(_mm256_mul_pd(_mm256_mul_pd(_mm256_set1_pd(fy), __invProj), __v), _mm256_set1_pd(v0));
#endif

        if (m1type == CV_32FC1)
        {
            _mm_storeu_ps(&m1f[j], _mm256_cvtpd_ps(__u));
            _mm_storeu_ps(&m2f[j], _mm256_cvtpd_ps(__v));
        }
        else if (m1type == CV_32FC2)
        {
            __m128 __u_float = _mm256_cvtpd_ps(__u);
            __m128 __v_float = _mm256_cvtpd_ps(__v);

            _mm_storeu_ps(&m1f[j * 2], _mm_unpacklo_ps(__u_float, __v_float));
            _mm_storeu_ps(&m1f[j * 2 + 4], _mm_unpackhi_ps(__u_float, __v_float));
        }
        else // m1type == CV_16SC2
        {
            __u = _mm256_mul_pd(__u, _mm256_set1_pd(INTER_TAB_SIZE));
            __v = _mm256_mul_pd(__v, _mm256_set1_pd(INTER_TAB_SIZE));

            __m128 __u_float = _mm256_cvtpd_ps(__u);
            __m128 __v_float = _mm256_cvtpd_ps(__v);
            _mm256_zeroupper();
            static const __m128 __int_max = _mm_set1_ps((float)(std::numeric_limits<int>::max()));
            static const __m128 __int_min = _mm_set1_ps((float)(std::numeric_limits<int>::min()));
            __u_float = _mm_max_ps(_mm_min_ps(__u_float, __int_max), __int_min);
            __v_float = _mm_max_ps(_mm_min_ps(__v_float, __int_max), __int_min);

            __m128i __iu = _mm_cvtps_epi32(__u_float);
            __m128i __iv = _mm_cvtps_epi32(__v_float);

            static const __m128i __INTER_TAB_SIZE_m1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);
            __m128i __m2 = _mm_add_epi32(
                _mm_mullo_epi32(_mm_and_si128(__iv, __INTER_TAB_SIZE_m1), _mm_set1_epi32(INTER_TAB_SIZE)),
                _mm_and_si128(__iu, __INTER_TAB_SIZE_m1));
            __m2 = _mm_packus_epi32(__m2, __m2);
            _mm_maskstore_epi64((long long int*) &m2[j], _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF), __m2);

            // gcc4.9 does not support _mm256_set_m128
            // __m256i __m1 = _mm256_set_m128i(__iv, __iu);
            __m256i __m1 = _mm256_setzero_si256();
            __m1 = _mm256_inserti128_si256(__m1, __iu, 0);
            __m1 = _mm256_inserti128_si256(__m1, __iv, 1);
            __m1 = _mm256_srai_epi32(__m1, INTER_BITS); // v3 v2 v1 v0 u3 u2 u1 u0 (int32_t)
            static const __m256i __permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
            __m1 = _mm256_permutevar8x32_epi32(__m1, __permute_mask); // v3 u3 v2 u2 v1 u1 v0 u0 (int32_t)
            __m1 = _mm256_packs_epi32(__m1, __m1); // x x x x v3 u3 v2 u2 x x x x v1 u1 v0 u0 (int16_t)
            _mm_storeu_si128((__m128i*) &m1[j * 2], _mm256_extracti128_si256(_mm256_permute4x64_epi64(__m1, (2 << 2) + 0), 0));
        }
    }

    return j;
}

}

/* End of file */
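For readers checking the vector lanes above against the scalar model: each lane evaluates OpenCV's rational radial, tangential, and thin-prism distortion before the tilt projection. Below is a scalar sketch of that per-point math, not part of the patch; the helper name distortPoint is illustrative, while the coefficients mirror the k1..k6, p1, p2, s1..s4 parameters of the function above.

// Illustrative scalar equivalent of one AVX lane in
// initUndistortRectifyMapLine_AVX (a sketch only, not part of the patch).
static void distortPoint(double x, double y,
                         double k1, double k2, double k3,
                         double k4, double k5, double k6,
                         double p1, double p2,
                         double s1, double s2, double s3, double s4,
                         double& xd, double& yd)
{
    double x2 = x * x, y2 = y * y;
    double r2 = x2 + y2, r4 = r2 * r2, r6 = r4 * r2;
    double _2xy = 2.0 * x * y;
    // Rational radial factor: what the vector code builds as __kr.
    double kr = (1.0 + k1 * r2 + k2 * r4 + k3 * r6)
              / (1.0 + k4 * r2 + k5 * r4 + k6 * r6);
    // Radial + tangential (p1, p2) + thin-prism (s1..s4) terms: __xd / __yd.
    xd = x * kr + p1 * _2xy + p2 * (r2 + 2.0 * x2) + s1 * r2 + s2 * r4;
    yd = y * kr + p1 * (r2 + 2.0 * y2) + p2 * _2xy + s3 * r2 + s4 * r4;
}

The tilt step then maps (xd, yd, 1) through matTilt, and the final u, v apply fx, fy and the principal point (u0, v0), exactly as the __u/__v computations do.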

modules/imgproc/src/undistort.cpp

Lines changed: 6 additions & 142 deletions
@@ -42,6 +42,7 @@
 
 #include "precomp.hpp"
 #include "opencv2/imgproc/detail/distortion_model.hpp"
+#include "undistort.hpp"
 
 cv::Mat cv::getDefaultNewCameraMatrix( InputArray _cameraMatrix, Size imgsize,
                                        bool centerPrincipalPoint )
@@ -136,7 +137,7 @@ void cv::initUndistortRectifyMap( InputArray _cameraMatrix, InputArray _distCoef
     cv::Matx33d matTilt = cv::Matx33d::eye();
     cv::detail::computeTiltProjectionMatrix(tauX, tauY, &matTilt);
 
-#if CV_AVX2
+#if CV_TRY_AVX2
     bool USE_AVX2 = cv::checkHardwareSupport(CV_CPU_AVX2);
 #endif
 
@@ -157,147 +158,10 @@ void cv::initUndistortRectifyMap( InputArray _cameraMatrix, InputArray _distCoef
         else
             CV_Assert(m1 != NULL);
 
-#if CV_AVX2
-        if( USE_AVX2 )
-        {
-            static const __m256d __one = _mm256_set1_pd(1.0);
-            static const __m256d __two = _mm256_set1_pd(2.0);
-
-            const __m256d __matTilt_00 = _mm256_set1_pd(matTilt(0, 0));
-            const __m256d __matTilt_10 = _mm256_set1_pd(matTilt(1, 0));
-            const __m256d __matTilt_20 = _mm256_set1_pd(matTilt(2, 0));
-
-            const __m256d __matTilt_01 = _mm256_set1_pd(matTilt(0, 1));
-            const __m256d __matTilt_11 = _mm256_set1_pd(matTilt(1, 1));
-            const __m256d __matTilt_21 = _mm256_set1_pd(matTilt(2, 1));
-
-            const __m256d __matTilt_02 = _mm256_set1_pd(matTilt(0, 2));
-            const __m256d __matTilt_12 = _mm256_set1_pd(matTilt(1, 2));
-            const __m256d __matTilt_22 = _mm256_set1_pd(matTilt(2, 2));
-
-            for( ; j <= size.width - 4; j += 4, _x += 4 * ir[0], _y += 4 * ir[3], _w += 4 * ir[6] )
-            {
-                // Question: Should we load the constants first?
-                __m256d __w = _mm256_div_pd(__one, _mm256_set_pd(_w + 3 * ir[6], _w + 2 * ir[6], _w + ir[6], _w));
-                __m256d __x = _mm256_mul_pd(_mm256_set_pd(_x + 3 * ir[0], _x + 2 * ir[0], _x + ir[0], _x), __w);
-                __m256d __y = _mm256_mul_pd(_mm256_set_pd(_y + 3 * ir[3], _y + 2 * ir[3], _y + ir[3], _y), __w);
-                __m256d __x2 = _mm256_mul_pd(__x, __x);
-                __m256d __y2 = _mm256_mul_pd(__y, __y);
-                __m256d __r2 = _mm256_add_pd(__x2, __y2);
-                __m256d __2xy = _mm256_mul_pd(__two, _mm256_mul_pd(__x, __y));
-                __m256d __kr = _mm256_div_pd(
-#if CV_FMA3
-                    _mm256_fmadd_pd(_mm256_fmadd_pd(_mm256_fmadd_pd(_mm256_set1_pd(k3), __r2, _mm256_set1_pd(k2)), __r2, _mm256_set1_pd(k1)), __r2, __one),
-                    _mm256_fmadd_pd(_mm256_fmadd_pd(_mm256_fmadd_pd(_mm256_set1_pd(k6), __r2, _mm256_set1_pd(k5)), __r2, _mm256_set1_pd(k4)), __r2, __one)
-#else
-                    _mm256_add_pd(__one, _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set1_pd(k3), __r2), _mm256_set1_pd(k2)), __r2), _mm256_set1_pd(k1)), __r2)),
-                    _mm256_add_pd(__one, _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set1_pd(k6), __r2), _mm256_set1_pd(k5)), __r2), _mm256_set1_pd(k4)), __r2))
-#endif
-                );
-                __m256d __r22 = _mm256_mul_pd(__r2, __r2);
-#if CV_FMA3
-                __m256d __xd = _mm256_fmadd_pd(__x, __kr,
-                    _mm256_add_pd(
-                        _mm256_fmadd_pd(_mm256_set1_pd(p1), __2xy, _mm256_mul_pd(_mm256_set1_pd(p2), _mm256_fmadd_pd(__two, __x2, __r2))),
-                        _mm256_fmadd_pd(_mm256_set1_pd(s1), __r2, _mm256_mul_pd(_mm256_set1_pd(s2), __r22))));
-                __m256d __yd = _mm256_fmadd_pd(__y, __kr,
-                    _mm256_add_pd(
-                        _mm256_fmadd_pd(_mm256_set1_pd(p1), _mm256_fmadd_pd(__two, __y2, __r2), _mm256_mul_pd(_mm256_set1_pd(p2), __2xy)),
-                        _mm256_fmadd_pd(_mm256_set1_pd(s3), __r2, _mm256_mul_pd(_mm256_set1_pd(s4), __r22))));
-
-                __m256d __vecTilt2 = _mm256_fmadd_pd(__matTilt_20, __xd, _mm256_fmadd_pd(__matTilt_21, __yd, __matTilt_22));
-#else
-                __m256d __xd = _mm256_add_pd(
-                    _mm256_mul_pd(__x, __kr),
-                    _mm256_add_pd(
-                        _mm256_add_pd(
-                            _mm256_mul_pd(_mm256_set1_pd(p1), __2xy),
-                            _mm256_mul_pd(_mm256_set1_pd(p2), _mm256_add_pd(__r2, _mm256_mul_pd(__two, __x2)))),
-                        _mm256_add_pd(
-                            _mm256_mul_pd(_mm256_set1_pd(s1), __r2),
-                            _mm256_mul_pd(_mm256_set1_pd(s2), __r22))));
-                __m256d __yd = _mm256_add_pd(
-                    _mm256_mul_pd(__y, __kr),
-                    _mm256_add_pd(
-                        _mm256_add_pd(
-                            _mm256_mul_pd(_mm256_set1_pd(p1), _mm256_add_pd(__r2, _mm256_mul_pd(__two, __y2))),
-                            _mm256_mul_pd(_mm256_set1_pd(p2), __2xy)),
-                        _mm256_add_pd(
-                            _mm256_mul_pd(_mm256_set1_pd(s3), __r2),
-                            _mm256_mul_pd(_mm256_set1_pd(s4), __r22))));
-
-                __m256d __vecTilt2 = _mm256_add_pd(_mm256_add_pd(
-                    _mm256_mul_pd(__matTilt_20, __xd), _mm256_mul_pd(__matTilt_21, __yd)), __matTilt_22);
-#endif
-                __m256d __invProj = _mm256_blendv_pd(
-                    __one, _mm256_div_pd(__one, __vecTilt2),
-                    _mm256_cmp_pd(__vecTilt2, _mm256_setzero_pd(), _CMP_EQ_OQ));
-
-#if CV_FMA3
-                __m256d __u = _mm256_fmadd_pd(__matTilt_00, __xd, _mm256_fmadd_pd(__matTilt_01, __yd, __matTilt_02));
-                __u = _mm256_fmadd_pd(_mm256_mul_pd(_mm256_set1_pd(fx), __invProj), __u, _mm256_set1_pd(u0));
-
-                __m256d __v = _mm256_fmadd_pd(__matTilt_10, __xd, _mm256_fmadd_pd(__matTilt_11, __yd, __matTilt_12));
-                __v = _mm256_fmadd_pd(_mm256_mul_pd(_mm256_set1_pd(fy), __invProj), __v, _mm256_set1_pd(v0));
-#else
-                __m256d __u = _mm256_add_pd(_mm256_add_pd(
-                    _mm256_mul_pd(__matTilt_00, __xd), _mm256_mul_pd(__matTilt_01, __yd)), __matTilt_02);
-                __u = _mm256_add_pd(_mm256_mul_pd(_mm256_mul_pd(_mm256_set1_pd(fx), __invProj), __u), _mm256_set1_pd(u0));
-
-                __m256d __v = _mm256_add_pd(_mm256_add_pd(
-                    _mm256_mul_pd(__matTilt_10, __xd), _mm256_mul_pd(__matTilt_11, __yd)), __matTilt_12);
-                __v = _mm256_add_pd(_mm256_mul_pd(_mm256_mul_pd(_mm256_set1_pd(fy), __invProj), __v), _mm256_set1_pd(v0));
-#endif
-
-                if ( m1type == CV_32FC1 )
-                {
-                    _mm_storeu_ps(&m1f[j], _mm256_cvtpd_ps(__u));
-                    _mm_storeu_ps(&m2f[j], _mm256_cvtpd_ps(__v));
-                }
-                else if ( m1type == CV_32FC2 )
-                {
-                    __m128 __u_float = _mm256_cvtpd_ps(__u);
-                    __m128 __v_float = _mm256_cvtpd_ps(__v);
-
-                    _mm_storeu_ps(&m1f[j*2], _mm_unpacklo_ps(__u_float, __v_float));
-                    _mm_storeu_ps(&m1f[j*2 + 4], _mm_unpackhi_ps(__u_float, __v_float));
-                }
-                else // m1type == CV_16SC2
-                {
-                    __u = _mm256_mul_pd(__u, _mm256_set1_pd(INTER_TAB_SIZE));
-                    __v = _mm256_mul_pd(__v, _mm256_set1_pd(INTER_TAB_SIZE));
-
-                    __m128 __u_float = _mm256_cvtpd_ps(__u);
-                    __m128 __v_float = _mm256_cvtpd_ps(__v);
-                    _mm256_zeroupper();
-                    static const __m128 __int_max = _mm_set1_ps(std::numeric_limits<int>::max());
-                    static const __m128 __int_min = _mm_set1_ps(std::numeric_limits<int>::min());
-                    __u_float = _mm_max_ps(_mm_min_ps(__u_float, __int_max), __int_min);
-                    __v_float = _mm_max_ps(_mm_min_ps(__v_float, __int_max), __int_min);
-
-                    __m128i __iu = _mm_cvtps_epi32(__u_float);
-                    __m128i __iv = _mm_cvtps_epi32(__v_float);
-
-                    static const __m128i __INTER_TAB_SIZE_m1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
-                    __m128i __m2 = _mm_add_epi32(
-                        _mm_mul_epi32(_mm_and_si128(__iv, __INTER_TAB_SIZE_m1), _mm_set1_epi32(INTER_TAB_SIZE)),
-                        _mm_and_si128(__iu, __INTER_TAB_SIZE_m1));
-                    __m2 = _mm_packus_epi16(__m2, __m2);
-                    _mm_maskstore_epi64((long long int*) &m2[j], _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF), __m2);
-
-                    // gcc4.9 does not support _mm256_set_m128
-                    // __m256i __m1 = _mm256_set_m128i(__iv, __iu);
-                    __m256i __m1;
-                    __m1 = _mm256_inserti128_si256(__m1, __iu, 0);
-                    __m1 = _mm256_inserti128_si256(__m1, __iv, 1);
-                    __m1 = _mm256_srli_epi32(__m1, INTER_BITS); // v3 v2 v1 v0 u3 u2 u1 u0 (int32_t)
-                    static const __m256i __permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1 ,4, 0);
-                    __m1 = _mm256_permutevar8x32_epi32(__m1, __permute_mask); // v3 u3 v2 u2 v1 u1 v0 u0 (int32_t)
-                    __m1 = _mm256_packs_epi32(__m1, __m1); // x x x x v3 u3 v2 u2 x x x x v1 u1 v0 u0 (int16_t)
-                    _mm_storeu_si128((__m128i*) &m1[j*2], _mm256_extracti128_si256(_mm256_permute4x64_epi64(__m1, (2 << 2) + 0), 0));
-                }
-            }
-        }
+#if CV_TRY_AVX2
+        if( USE_AVX2 )
+            j = cv::initUndistortRectifyMapLine_AVX(m1f, m2f, m1, m2, matTilt.val, ir, _x, _y, _w, size.width, m1type,
+                                                    k1, k2, k3, k4, k5, k6, p1, p2, s1, s2, s3, s4, u0, v0, fx, fy);
 #endif
         for( ; j < size.width; j++, _x += ir[0], _y += ir[3], _w += ir[6] )
         {