Commit 9ef742b

Merge pull request opencv#9082 from terfendail:imgwarp_avx
AVX and SSE4.1 optimized implementation of resize and warp functions migrated
2 parents 928bfe0 + fadf25a commit 9ef742b

File tree

4 files changed (+1111, -760 lines changed)
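Nothing changes at the API level: cv::resize and cv::warpAffine pick up these kernels internally on CPUs with the corresponding instruction sets. As a rough illustration (a sketch, assuming a default OpenCV build on an AVX2-capable machine), the following calls are the kind that can be routed through the paths added in this commit:

```cpp
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>

int main()
{
    // 4-byte pixels (CV_8UC4) can take the resizeNN4_AVX2 path added below;
    // 2-byte pixels (CV_8UC2 / CV_16UC1) can take resizeNN2_AVX2.
    cv::Mat src(1080, 1920, CV_8UC4, cv::Scalar(0, 128, 255, 255));
    cv::Mat dst;
    cv::resize(src, dst, cv::Size(640, 360), 0, 0, cv::INTER_NEAREST);

    // An affine warp goes through the block pipeline that warpAffineBlockline accelerates.
    cv::Mat M = cv::getRotationMatrix2D(cv::Point2f(960.f, 540.f), 30.0, 1.0);
    cv::Mat rotated;
    cv::warpAffine(src, rotated, M, src.size(), cv::INTER_LINEAR);
    return 0;
}
```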

modules/imgproc/src/imgwarp.avx2.cpp

Lines changed: 297 additions & 0 deletions
@@ -0,0 +1,297 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
//  Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */

#include "precomp.hpp"
#include "imgwarp.hpp"

namespace cv
{
namespace opt_AVX2
{

class resizeNNInvokerAVX4 :
    public ParallelLoopBody
{
public:
    resizeNNInvokerAVX4(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
        ify(_ify)
    {
    }

#if defined(__INTEL_COMPILER)
#pragma optimization_parameter target_arch=AVX
#endif
    virtual void operator() (const Range& range) const
    {
        Size ssize = src.size(), dsize = dst.size();
        int y, x;
        int width = dsize.width;
        int avxWidth = width - (width & 0x7);
        const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1);
        if(((int64)(dst.data + dst.step) & 0x1f) == 0)
        {
            for(y = range.start; y < range.end; y++)
            {
                uchar* D = dst.data + dst.step*y;
                uchar* Dstart = D;
                int sy = std::min(cvFloor(y*ify), ssize.height-1);
                const uchar* S = src.data + sy*src.step;
#ifdef CV_ICC
#pragma unroll(4)
#endif
                for(x = 0; x < avxWidth; x += 8)
                {
                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
                    __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
                    _mm256_maskstore_epi32((int*)D, mask, pixels);
                    D += 32;
                }
                for(; x < width; x++)
                {
                    *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
                }
            }
        }
        else
        {
            for(y = range.start; y < range.end; y++)
            {
                uchar* D = dst.data + dst.step*y;
                uchar* Dstart = D;
                int sy = std::min(cvFloor(y*ify), ssize.height-1);
                const uchar* S = src.data + sy*src.step;
#ifdef CV_ICC
#pragma unroll(4)
#endif
                for(x = 0; x < avxWidth; x += 8)
                {
                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
                    __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
                    _mm256_storeu_si256((__m256i*)D, pixels);
                    D += 32;
                }
                for(; x < width; x++)
                {
                    *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
                }
            }
        }
    }

private:
    const Mat src;
    Mat dst;
    int* x_ofs, pix_size4;
    double ify;

    resizeNNInvokerAVX4(const resizeNNInvokerAVX4&);
    resizeNNInvokerAVX4& operator=(const resizeNNInvokerAVX4&);
};
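// Note: resizeNNInvokerAVX2 below handles 2-byte pixels (e.g. CV_8UC2, CV_16UC1).
// Each _mm256_i32gather_epi32 loads a full 32-bit word per offset, so the loop
// issues two gathers per 16 output pixels: one based at S (the pixel sits in the
// low 16 bits of each gathered dword) and one based at S2 = S - 2 (the pixel sits
// in the high 16 bits). _mm256_blend_epi16 with mask 0xaa merges the two, and the
// shuffle/permute pair restores sequential pixel order before each 32-byte store.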
class resizeNNInvokerAVX2 :
    public ParallelLoopBody
{
public:
    resizeNNInvokerAVX2(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
        ify(_ify)
    {
    }

#if defined(__INTEL_COMPILER)
#pragma optimization_parameter target_arch=AVX
#endif
    virtual void operator() (const Range& range) const
    {
        Size ssize = src.size(), dsize = dst.size();
        int y, x;
        int width = dsize.width;
        //int avxWidth = (width - 1) - ((width - 1) & 0x7);
        int avxWidth = width - (width & 0xf);
        const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1);
        const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
                                                                         15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
        const __m256i CV_DECL_ALIGNED(64) permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
        //const __m256i CV_DECL_ALIGNED(64) shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,
        //                                                                       13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2);
        if(((int64)(dst.data + dst.step) & 0x1f) == 0)
        {
            for(y = range.start; y < range.end; y++)
            {
                uchar* D = dst.data + dst.step*y;
                uchar* Dstart = D;
                int sy = std::min(cvFloor(y*ify), ssize.height-1);
                const uchar* S = src.data + sy*src.step;
                const uchar* S2 = S - 2;
#ifdef CV_ICC
#pragma unroll(4)
#endif
                for(x = 0; x < avxWidth; x += 16)
                {
                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
                    __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
                    const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
                    __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2);
                    __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
                    __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);

                    __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
                    __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
                    _mm256_maskstore_epi32((int*)D, mask, ints_permuted);
                    D += 32;
                }
                for(; x < width; x++)
                {
                    *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
                }

            }
        }
        else
        {
            for(y = range.start; y < range.end; y++)
            {
                uchar* D = dst.data + dst.step*y;
                uchar* Dstart = D;
                int sy = std::min(cvFloor(y*ify), ssize.height-1);
                const uchar* S = src.data + sy*src.step;
                const uchar* S2 = S - 2;
#ifdef CV_ICC
#pragma unroll(4)
#endif
                for(x = 0; x < avxWidth; x += 16)
                {
                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
                    __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
                    const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
                    __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2);
                    __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
                    __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);

                    __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
                    __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
                    _mm256_storeu_si256((__m256i*)D, ints_permuted);
                    D += 32;
                }
                for(; x < width; x++)
                {
                    *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
                }
            }
        }
    }

private:
    const Mat src;
    Mat dst;
    int* x_ofs, pix_size4;
    double ify;

    resizeNNInvokerAVX2(const resizeNNInvokerAVX2&);
    resizeNNInvokerAVX2& operator=(const resizeNNInvokerAVX2&);
};
void resizeNN2_AVX2(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int pix_size4, double ify)
{
    resizeNNInvokerAVX2 invoker(src, dst, x_ofs, pix_size4, ify);
    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
}

void resizeNN4_AVX2(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int pix_size4, double ify)
{
    resizeNNInvokerAVX4 invoker(src, dst, x_ofs, pix_size4, ify);
    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
}
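// warpAffineBlockline computes, for one row of a warpAffine processing block, the
// integer source coordinates (interleaved x,y shorts written to xy) and the
// sub-pixel interpolation table indices (written to alpha), 16 elements per
// iteration. It returns how many elements it processed so the caller can finish
// any remainder with scalar code.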
int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw)
{
    const int AB_BITS = MAX(10, (int)INTER_BITS);
    int x1 = 0;
    __m256i fxy_mask = _mm256_set1_epi32(INTER_TAB_SIZE - 1);
    __m256i XX = _mm256_set1_epi32(X0), YY = _mm256_set1_epi32(Y0);
    for (; x1 <= bw - 16; x1 += 16)
    {
        __m256i tx0, tx1, ty0, ty1;
        tx0 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(adelta + x1)), XX);
        ty0 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(bdelta + x1)), YY);
        tx1 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(adelta + x1 + 8)), XX);
        ty1 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(bdelta + x1 + 8)), YY);

        tx0 = _mm256_srai_epi32(tx0, AB_BITS - INTER_BITS);
        ty0 = _mm256_srai_epi32(ty0, AB_BITS - INTER_BITS);
        tx1 = _mm256_srai_epi32(tx1, AB_BITS - INTER_BITS);
        ty1 = _mm256_srai_epi32(ty1, AB_BITS - INTER_BITS);

        __m256i fx_ = _mm256_packs_epi32(_mm256_and_si256(tx0, fxy_mask),
                                         _mm256_and_si256(tx1, fxy_mask));
        __m256i fy_ = _mm256_packs_epi32(_mm256_and_si256(ty0, fxy_mask),
                                         _mm256_and_si256(ty1, fxy_mask));
        tx0 = _mm256_packs_epi32(_mm256_srai_epi32(tx0, INTER_BITS),
                                 _mm256_srai_epi32(tx1, INTER_BITS));
        ty0 = _mm256_packs_epi32(_mm256_srai_epi32(ty0, INTER_BITS),
                                 _mm256_srai_epi32(ty1, INTER_BITS));
        fx_ = _mm256_adds_epi16(fx_, _mm256_slli_epi16(fy_, INTER_BITS));
        fx_ = _mm256_permute4x64_epi64(fx_, (3 << 6) + (1 << 4) + (2 << 2) + 0);

        _mm256_storeu_si256((__m256i*)(xy + x1 * 2), _mm256_unpacklo_epi16(tx0, ty0));
        _mm256_storeu_si256((__m256i*)(xy + x1 * 2 + 16), _mm256_unpackhi_epi16(tx0, ty0));
        _mm256_storeu_si256((__m256i*)(alpha + x1), fx_);
    }
    _mm256_zeroupper();
    return x1;
}

}
}
/* End of file. */
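For readers following the intrinsics, here is a scalar sketch of what each warpAffineBlockline iteration computes per element. It is illustrative only: the helper name warpAffineBlocklineScalar is made up for this note, the remainder handling in OpenCV proper lives in the caller, and the sketch assumes the same MAX/INTER_BITS/INTER_TAB_SIZE definitions and cv::saturate_cast from the headers included above.

```cpp
// Hypothetical scalar equivalent of warpAffineBlockline (illustration only).
static int warpAffineBlocklineScalar(int *adelta, int *bdelta, short* xy, short* alpha,
                                     int X0, int Y0, int bw)
{
    const int AB_BITS = MAX(10, (int)INTER_BITS);
    for (int x1 = 0; x1 < bw; x1++)
    {
        // Fixed-point source coordinates for destination column x1.
        int X = (X0 + adelta[x1]) >> (AB_BITS - INTER_BITS);
        int Y = (Y0 + bdelta[x1]) >> (AB_BITS - INTER_BITS);
        // Integer parts, interleaved as (x, y) shorts.
        xy[x1 * 2]     = cv::saturate_cast<short>(X >> INTER_BITS);
        xy[x1 * 2 + 1] = cv::saturate_cast<short>(Y >> INTER_BITS);
        // Fractional parts combined into one interpolation-table index.
        alpha[x1] = (short)((Y & (INTER_TAB_SIZE - 1)) * INTER_TAB_SIZE +
                            (X & (INTER_TAB_SIZE - 1)));
    }
    return bw;
}
```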

0 commit comments
