Commit 42fbbfe

Merge pull request opencv#9522 from terfendail:resize_move

2 parents: 791a11f + e8caa9b
8 files changed: 4053 additions, 3744 deletions

modules/imgproc/src/imgwarp.avx2.cpp

Lines changed: 0 additions & 201 deletions
@@ -55,207 +55,6 @@ namespace cv
 namespace opt_AVX2
 {
 
-class resizeNNInvokerAVX4 :
-    public ParallelLoopBody
-{
-public:
-    resizeNNInvokerAVX4(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
-        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
-        ify(_ify)
-    {
-    }
-
-#if defined(__INTEL_COMPILER)
-#pragma optimization_parameter target_arch=AVX
-#endif
-    virtual void operator() (const Range& range) const
-    {
-        Size ssize = src.size(), dsize = dst.size();
-        int y, x;
-        int width = dsize.width;
-        int avxWidth = width - (width & 0x7);
-        const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1);
-        if(((int64)(dst.data + dst.step) & 0x1f) == 0)
-        {
-            for(y = range.start; y < range.end; y++)
-            {
-                uchar* D = dst.data + dst.step*y;
-                uchar* Dstart = D;
-                int sy = std::min(cvFloor(y*ify), ssize.height-1);
-                const uchar* S = src.data + sy*src.step;
-#ifdef CV_ICC
-#pragma unroll(4)
-#endif
-                for(x = 0; x < avxWidth; x += 8)
-                {
-                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
-                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
-                    __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
-                    _mm256_maskstore_epi32((int*)D, mask, pixels);
-                    D += 32;
-                }
-                for(; x < width; x++)
-                {
-                    *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
-                }
-            }
-        }
-        else
-        {
-            for(y = range.start; y < range.end; y++)
-            {
-                uchar* D = dst.data + dst.step*y;
-                uchar* Dstart = D;
-                int sy = std::min(cvFloor(y*ify), ssize.height-1);
-                const uchar* S = src.data + sy*src.step;
-#ifdef CV_ICC
-#pragma unroll(4)
-#endif
-                for(x = 0; x < avxWidth; x += 8)
-                {
-                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
-                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
-                    __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
-                    _mm256_storeu_si256((__m256i*)D, pixels);
-                    D += 32;
-                }
-                for(; x < width; x++)
-                {
-                    *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
-                }
-            }
-        }
-        _mm256_zeroupper();
-    }
-
-private:
-    const Mat src;
-    Mat dst;
-    int* x_ofs, pix_size4;
-    double ify;
-
-    resizeNNInvokerAVX4(const resizeNNInvokerAVX4&);
-    resizeNNInvokerAVX4& operator=(const resizeNNInvokerAVX4&);
-};
-
-class resizeNNInvokerAVX2 :
-    public ParallelLoopBody
-{
-public:
-    resizeNNInvokerAVX2(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
-        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
-        ify(_ify)
-    {
-    }
-
-#if defined(__INTEL_COMPILER)
-#pragma optimization_parameter target_arch=AVX
-#endif
-    virtual void operator() (const Range& range) const
-    {
-        Size ssize = src.size(), dsize = dst.size();
-        int y, x;
-        int width = dsize.width;
-        //int avxWidth = (width - 1) - ((width - 1) & 0x7);
-        int avxWidth = width - (width & 0xf);
-        const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1);
-        const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
-                                                                         15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
-        const __m256i CV_DECL_ALIGNED(64) permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
-        //const __m256i CV_DECL_ALIGNED(64) shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,
-        //                                                                       13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2);
-        if(((int64)(dst.data + dst.step) & 0x1f) == 0)
-        {
-            for(y = range.start; y < range.end; y++)
-            {
-                uchar* D = dst.data + dst.step*y;
-                uchar* Dstart = D;
-                int sy = std::min(cvFloor(y*ify), ssize.height-1);
-                const uchar* S = src.data + sy*src.step;
-                const uchar* S2 = S - 2;
-#ifdef CV_ICC
-#pragma unroll(4)
-#endif
-                for(x = 0; x < avxWidth; x += 16)
-                {
-                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
-                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
-                    __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
-                    const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
-                    __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2);
-                    __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
-                    __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);
-
-                    __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
-                    __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
-                    _mm256_maskstore_epi32((int*)D, mask, ints_permuted);
-                    D += 32;
-                }
-                for(; x < width; x++)
-                {
-                    *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
-                }
-
-            }
-        }
-        else
-        {
-            for(y = range.start; y < range.end; y++)
-            {
-                uchar* D = dst.data + dst.step*y;
-                uchar* Dstart = D;
-                int sy = std::min(cvFloor(y*ify), ssize.height-1);
-                const uchar* S = src.data + sy*src.step;
-                const uchar* S2 = S - 2;
-#ifdef CV_ICC
-#pragma unroll(4)
-#endif
-                for(x = 0; x < avxWidth; x += 16)
-                {
-                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
-                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
-                    __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
-                    const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
-                    __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2);
-                    __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
-                    __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);
-
-                    __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
-                    __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
-                    _mm256_storeu_si256((__m256i*)D, ints_permuted);
-                    D += 32;
-                }
-                for(; x < width; x++)
-                {
-                    *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
-                }
-            }
-        }
-        _mm256_zeroupper();
-    }
-
-private:
-    const Mat src;
-    Mat dst;
-    int* x_ofs, pix_size4;
-    double ify;
-
-    resizeNNInvokerAVX2(const resizeNNInvokerAVX2&);
-    resizeNNInvokerAVX2& operator=(const resizeNNInvokerAVX2&);
-};
-
-void resizeNN2_AVX2(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int pix_size4, double ify)
-{
-    resizeNNInvokerAVX2 invoker(src, dst, x_ofs, pix_size4, ify);
-    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
-}
-
-void resizeNN4_AVX2(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int pix_size4, double ify)
-{
-    resizeNNInvokerAVX4 invoker(src, dst, x_ofs, pix_size4, ify);
-    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
-}
-
 int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw)
 {
     const int AB_BITS = MAX(10, (int)INTER_BITS);
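
For reference, both deleted invokers vectorize the same scalar nearest-neighbor copy: destination row y reads source row min(floor(y*ify), height-1), and destination column x copies one pixel at the precomputed byte offset x_ofs[x]. Below is a minimal scalar sketch of that logic, folding in the x_ofs setup that the caller in imgwarp.cpp performs before dispatching; the function name and structure are illustrative reconstructions, not OpenCV API or code quoted from this diff.

#include <opencv2/core.hpp>
#include <algorithm>
#include <cstring>
#include <vector>

// Illustrative scalar equivalent of the deleted AVX2 invokers.
// x_ofs[x] holds a clamped source-column *byte* offset for destination
// column x, matching its use as S + x_ofs[x] in the code above.
static void resizeNN_scalar(const cv::Mat& src, cv::Mat& dst)
{
    int pix_size = (int)src.elemSize();        // bytes per pixel (2 or 4 in the deleted paths)
    double ifx = (double)src.cols / dst.cols;  // inverse scale along x
    double ify = (double)src.rows / dst.rows;  // inverse scale along y

    // Precompute per-column source byte offsets, as the caller does for the invokers.
    std::vector<int> x_ofs(dst.cols);
    for (int x = 0; x < dst.cols; x++)
        x_ofs[x] = std::min(cvFloor(x * ifx), src.cols - 1) * pix_size;

    for (int y = 0; y < dst.rows; y++)
    {
        int sy = std::min(cvFloor(y * ify), src.rows - 1);  // nearest source row
        const uchar* S = src.ptr(sy);
        uchar* D = dst.ptr(y);
        for (int x = 0; x < dst.cols; x++)
            std::memcpy(D + x * pix_size, S + x_ofs[x], pix_size);
    }
}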

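The 2-byte path is the subtle part: _mm256_i32gather_epi32 fetches only 32-bit lanes, so resizeNNInvokerAVX2 gathers its second batch of eight pixels from a base shifted down two bytes (S2 = S - 2), which parks each wanted ushort in the high half of its lane; _mm256_blend_epi16 with mask 0xaa then interleaves the two batches, and the shuffle/permute masks restore sequential order. The following self-contained demo reproduces that reordering on synthetic data, mirroring the deleted intrinsic sequence (assumes an AVX2 build, e.g. -mavx2; with the identity mapping used here the expected output is 0..15):

#include <immintrin.h>
#include <cstdio>

int main()
{
    // 16 ushort "pixels" 0..15 and identity byte offsets, standing in for one
    // iteration of the deleted x-loop (x == 0, avxWidth >= 16).
    alignas(32) unsigned short S[16];
    alignas(32) int x_ofs[16];
    for (int i = 0; i < 16; i++) { S[i] = (unsigned short)i; x_ofs[i] = 2 * i; }

    const unsigned char* Sb = (const unsigned char*)S;
    const unsigned char* S2 = Sb - 2;  // shifted base, exactly as in the deleted code;
                                       // every address actually gathered stays inside S here

    __m256i indices  = _mm256_lddqu_si256((const __m256i*)x_ofs);
    __m256i indices2 = _mm256_lddqu_si256((const __m256i*)(x_ofs + 8));
    // pixels1: wanted ushort in the LOW half of each 32-bit lane.
    __m256i pixels1 = _mm256_i32gather_epi32((const int*)Sb, indices, 1);
    // pixels2: gathered from Sb - 2, so the wanted ushort lands in the HIGH half.
    __m256i pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
    // 0xaa keeps even 16-bit words from pixels1 and odd words from pixels2.
    __m256i unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);

    // Same masks as the deleted code: regroup bytes within each 128-bit lane,
    // then reorder the eight dwords across lanes into sequential output.
    const __m256i shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
                                                 15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
    const __m256i permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
    __m256i bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
    __m256i ints_permuted  = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);

    alignas(32) unsigned short D[16];
    _mm256_storeu_si256((__m256i*)D, ints_permuted);
    for (int i = 0; i < 16; i++) printf("%d ", (int)D[i]);  // 0 1 2 ... 15
    printf("\n");
    return 0;
}
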
0 commit comments