
Commit 3681dce

AVX-optimized implementation of resize and warp functions migrated to a separate file
1 parent 20f603a commit 3681dce

4 files changed (+569, -360 lines)

modules/imgproc/src/imgwarp.avx2.cpp

Lines changed: 289 additions & 0 deletions
@@ -0,0 +1,289 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
//  Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */

#include "precomp.hpp"
#include "imgwarp.hpp"

namespace cv
{
namespace opt_AVX2
{

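// Parallel body for nearest-neighbour resize of 4-byte pixels.  x_ofs holds the
// precomputed source byte offset for every destination column; each inner
// iteration gathers 8 pixels with _mm256_i32gather_epi32 and writes 32 bytes of
// the destination row, and the scalar tail loop handles the remaining columns.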
class resizeNNInvokerAVX4 :
    public ParallelLoopBody
{
public:
    resizeNNInvokerAVX4(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
        ify(_ify)
    {
    }

#if defined(__INTEL_COMPILER)
#pragma optimization_parameter target_arch=AVX
#endif
    virtual void operator() (const Range& range) const
    {
        Size ssize = src.size(), dsize = dst.size();
        int y, x;
        int width = dsize.width;
        int avxWidth = width - (width & 0x7);
        const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1);
        if(((int64)(dst.data + dst.step) & 0x1f) == 0)
        {
            for(y = range.start; y < range.end; y++)
            {
                uchar* D = dst.data + dst.step*y;
                uchar* Dstart = D;
                int sy = std::min(cvFloor(y*ify), ssize.height-1);
                const uchar* S = src.data + sy*src.step;
#pragma unroll(4)
                for(x = 0; x < avxWidth; x += 8)
                {
                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
                    __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
                    _mm256_maskstore_epi32((int*)D, mask, pixels);
                    D += 32;
                }
                for(; x < width; x++)
                {
                    *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
                }
            }
        }
        else
        {
            for(y = range.start; y < range.end; y++)
            {
                uchar* D = dst.data + dst.step*y;
                uchar* Dstart = D;
                int sy = std::min(cvFloor(y*ify), ssize.height-1);
                const uchar* S = src.data + sy*src.step;
#pragma unroll(4)
                for(x = 0; x < avxWidth; x += 8)
                {
                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
                    __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
                    _mm256_storeu_si256((__m256i*)D, pixels);
                    D += 32;
                }
                for(; x < width; x++)
                {
                    *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
                }
            }
        }
    }

private:
    const Mat src;
    Mat dst;
    int* x_ofs, pix_size4;
    double ify;

    resizeNNInvokerAVX4(const resizeNNInvokerAVX4&);
    resizeNNInvokerAVX4& operator=(const resizeNNInvokerAVX4&);
};

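// Parallel body for nearest-neighbour resize of 2-byte pixels.  Two 32-bit
// gathers fetch 16 source pixels per iteration (the second gather reads from
// S - 2 so the wanted pixel lands in the high half of each dword); the
// blend / byte-shuffle / cross-lane permute sequence then packs the 16 correct
// 16-bit values into a single 256-bit store.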
class resizeNNInvokerAVX2 :
    public ParallelLoopBody
{
public:
    resizeNNInvokerAVX2(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
        ify(_ify)
    {
    }

#if defined(__INTEL_COMPILER)
#pragma optimization_parameter target_arch=AVX
#endif
    virtual void operator() (const Range& range) const
    {
        Size ssize = src.size(), dsize = dst.size();
        int y, x;
        int width = dsize.width;
        //int avxWidth = (width - 1) - ((width - 1) & 0x7);
        int avxWidth = width - (width & 0xf);
        const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1);
        const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
                                                                         15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
        const __m256i CV_DECL_ALIGNED(64) permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
        const __m256i CV_DECL_ALIGNED(64) shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,
                                                                               13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2);
        if(((int64)(dst.data + dst.step) & 0x1f) == 0)
        {
            for(y = range.start; y < range.end; y++)
            {
                uchar* D = dst.data + dst.step*y;
                uchar* Dstart = D;
                int sy = std::min(cvFloor(y*ify), ssize.height-1);
                const uchar* S = src.data + sy*src.step;
                const uchar* S2 = S - 2;
#pragma unroll(4)
                for(x = 0; x < avxWidth; x += 16)
                {
                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
                    __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
                    const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
                    __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2);
                    __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
                    __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);

                    __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
                    __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
                    _mm256_maskstore_epi32((int*)D, mask, ints_permuted);
                    D += 32;
                }
                for(; x < width; x++)
                {
                    *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
                }

            }
        }
        else
        {
            for(y = range.start; y < range.end; y++)
            {
                uchar* D = dst.data + dst.step*y;
                uchar* Dstart = D;
                int sy = std::min(cvFloor(y*ify), ssize.height-1);
                const uchar* S = src.data + sy*src.step;
                const uchar* S2 = S - 2;
#pragma unroll(4)
                for(x = 0; x < avxWidth; x += 16)
                {
                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
                    __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
                    const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
                    __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2);
                    __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
                    __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);

                    __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
                    __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
                    _mm256_storeu_si256((__m256i*)D, ints_permuted);
                    D += 32;
                }
                for(; x < width; x++)
                {
                    *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
                }
            }
        }
    }

private:
    const Mat src;
    Mat dst;
    int* x_ofs, pix_size4;
    double ify;

    resizeNNInvokerAVX2(const resizeNNInvokerAVX2&);
    resizeNNInvokerAVX2& operator=(const resizeNNInvokerAVX2&);
};

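// Entry points declared in imgwarp.hpp; each one simply splits the requested
// row range across the thread pool via parallel_for_.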
void resizeNN2_AVX2(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int pix_size4, double ify)
{
    resizeNNInvokerAVX2 invoker(src, dst, x_ofs, pix_size4, ify);
    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
}

void resizeNN4_AVX2(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int pix_size4, double ify)
{
    resizeNNInvokerAVX4 invoker(src, dst, x_ofs, pix_size4, ify);
    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
}

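// Computes one block row of the warpAffine coordinate maps: for bw destination
// pixels it stores the integer source coordinates interleaved in xy and the
// fractional (fy*INTER_TAB_SIZE + fx) interpolation-table indices in alpha,
// 16 pixels per iteration.  Returns how many pixels were processed so the
// caller can finish the scalar tail.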
int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw)
{
    const int AB_BITS = MAX(10, (int)INTER_BITS);
    int x1 = 0;
    __m256i fxy_mask = _mm256_set1_epi32(INTER_TAB_SIZE - 1);
    __m256i XX = _mm256_set1_epi32(X0), YY = _mm256_set1_epi32(Y0);
    for (; x1 <= bw - 16; x1 += 16)
    {
        __m256i tx0, tx1, ty0, ty1;
        tx0 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(adelta + x1)), XX);
        ty0 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(bdelta + x1)), YY);
        tx1 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(adelta + x1 + 8)), XX);
        ty1 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(bdelta + x1 + 8)), YY);

        tx0 = _mm256_srai_epi32(tx0, AB_BITS - INTER_BITS);
        ty0 = _mm256_srai_epi32(ty0, AB_BITS - INTER_BITS);
        tx1 = _mm256_srai_epi32(tx1, AB_BITS - INTER_BITS);
        ty1 = _mm256_srai_epi32(ty1, AB_BITS - INTER_BITS);

        __m256i fx_ = _mm256_packs_epi32(_mm256_and_si256(tx0, fxy_mask),
                                         _mm256_and_si256(tx1, fxy_mask));
        __m256i fy_ = _mm256_packs_epi32(_mm256_and_si256(ty0, fxy_mask),
                                         _mm256_and_si256(ty1, fxy_mask));
        tx0 = _mm256_packs_epi32(_mm256_srai_epi32(tx0, INTER_BITS),
                                 _mm256_srai_epi32(tx1, INTER_BITS));
        ty0 = _mm256_packs_epi32(_mm256_srai_epi32(ty0, INTER_BITS),
                                 _mm256_srai_epi32(ty1, INTER_BITS));
        fx_ = _mm256_adds_epi16(fx_, _mm256_slli_epi16(fy_, INTER_BITS));
        fx_ = _mm256_permute4x64_epi64(fx_, (3 << 6) + (1 << 4) + (2 << 2) + 0);

        _mm256_storeu_si256((__m256i*)(xy + x1 * 2), _mm256_unpacklo_epi16(tx0, ty0));
        _mm256_storeu_si256((__m256i*)(xy + x1 * 2 + 16), _mm256_unpackhi_epi16(tx0, ty0));
        _mm256_storeu_si256((__m256i*)(alpha + x1), fx_);
    }
    _mm256_zeroupper();
    return x1;
}

}
}
/* End of file. */
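For context, below is a minimal sketch of how a caller in imgwarp.cpp could dispatch to the two resize entry points added by this commit. It assumes the x_ofs byte-offset table and pixel-size values already built by the existing nearest-neighbour path; the tryResizeNN_AVX2 helper name and its exact placement are illustrative, while checkHardwareSupport(CV_CPU_AVX2) and the opt_AVX2 functions above are the real APIs.

#include "precomp.hpp"   // OpenCV-internal precompiled header (pulls in core utilities)
#include "imgwarp.hpp"   // declarations of cv::opt_AVX2::resizeNN2_AVX2 / resizeNN4_AVX2

// Hypothetical helper (not part of this commit): use the AVX2 bodies when the
// CPU supports AVX2 and the pixel size matches, otherwise tell the caller to
// fall back to the existing scalar/SSE invoker.
static bool tryResizeNN_AVX2(const cv::Range& range, const cv::Mat& src, cv::Mat& dst,
                             int* x_ofs, int pix_size, int pix_size4, double ify)
{
    if (!cv::checkHardwareSupport(CV_CPU_AVX2))
        return false;                         // runtime CPU feature test from core
    if (pix_size == 2)
        cv::opt_AVX2::resizeNN2_AVX2(range, src, dst, x_ofs, pix_size4, ify);
    else if (pix_size == 4)
        cv::opt_AVX2::resizeNN4_AVX2(range, src, dst, x_ofs, pix_size4, ify);
    else
        return false;                         // other pixel sizes keep the generic path
    return true;
}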
