@@ -55,207 +55,6 @@ namespace cv
 namespace opt_AVX2
 {
 
-class resizeNNInvokerAVX4 :
-    public ParallelLoopBody
-{
-public:
-    resizeNNInvokerAVX4(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
-        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
-        ify(_ify)
-    {
-    }
-
-#if defined(__INTEL_COMPILER)
-#pragma optimization_parameter target_arch=AVX
-#endif
-    virtual void operator() (const Range& range) const
-    {
-        Size ssize = src.size(), dsize = dst.size();
-        int y, x;
-        int width = dsize.width;
-        int avxWidth = width - (width & 0x7);
-        const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1);
-        if(((int64)(dst.data + dst.step) & 0x1f) == 0)
-        {
-            for(y = range.start; y < range.end; y++)
-            {
-                uchar* D = dst.data + dst.step*y;
-                uchar* Dstart = D;
-                int sy = std::min(cvFloor(y*ify), ssize.height-1);
-                const uchar* S = src.data + sy*src.step;
-#ifdef CV_ICC
-#pragma unroll(4)
-#endif
-                for(x = 0; x < avxWidth; x += 8)
-                {
-                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
-                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
-                    __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
-                    _mm256_maskstore_epi32((int*)D, mask, pixels);
-                    D += 32;
-                }
-                for(; x < width; x++)
-                {
-                    *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
-                }
-            }
-        }
-        else
-        {
-            for(y = range.start; y < range.end; y++)
-            {
-                uchar* D = dst.data + dst.step*y;
-                uchar* Dstart = D;
-                int sy = std::min(cvFloor(y*ify), ssize.height-1);
-                const uchar* S = src.data + sy*src.step;
-#ifdef CV_ICC
-#pragma unroll(4)
-#endif
-                for(x = 0; x < avxWidth; x += 8)
-                {
-                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
-                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
-                    __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
-                    _mm256_storeu_si256((__m256i*)D, pixels);
-                    D += 32;
-                }
-                for(; x < width; x++)
-                {
-                    *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
-                }
-            }
-        }
-        _mm256_zeroupper();
-    }
-
-private:
-    const Mat src;
-    Mat dst;
-    int* x_ofs, pix_size4;
-    double ify;
-
-    resizeNNInvokerAVX4(const resizeNNInvokerAVX4&);
-    resizeNNInvokerAVX4& operator=(const resizeNNInvokerAVX4&);
-};
-
-class resizeNNInvokerAVX2 :
-    public ParallelLoopBody
-{
-public:
-    resizeNNInvokerAVX2(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
-        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
-        ify(_ify)
-    {
-    }
-
-#if defined(__INTEL_COMPILER)
-#pragma optimization_parameter target_arch=AVX
-#endif
-    virtual void operator() (const Range& range) const
-    {
-        Size ssize = src.size(), dsize = dst.size();
-        int y, x;
-        int width = dsize.width;
-        //int avxWidth = (width - 1) - ((width - 1) & 0x7);
-        int avxWidth = width - (width & 0xf);
-        const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1);
-        const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
-                                                                         15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
-        const __m256i CV_DECL_ALIGNED(64) permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
-        //const __m256i CV_DECL_ALIGNED(64) shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,
-        //                                                                       13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2);
-        if(((int64)(dst.data + dst.step) & 0x1f) == 0)
-        {
-            for(y = range.start; y < range.end; y++)
-            {
-                uchar* D = dst.data + dst.step*y;
-                uchar* Dstart = D;
-                int sy = std::min(cvFloor(y*ify), ssize.height-1);
-                const uchar* S = src.data + sy*src.step;
-                const uchar* S2 = S - 2;
-#ifdef CV_ICC
-#pragma unroll(4)
-#endif
-                for(x = 0; x < avxWidth; x += 16)
-                {
-                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
-                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
-                    __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
-                    const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
-                    __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2);
-                    __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
-                    __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);
-
-                    __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
-                    __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
-                    _mm256_maskstore_epi32((int*)D, mask, ints_permuted);
-                    D += 32;
-                }
-                for(; x < width; x++)
-                {
-                    *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
-                }
-
-            }
-        }
-        else
-        {
-            for(y = range.start; y < range.end; y++)
-            {
-                uchar* D = dst.data + dst.step*y;
-                uchar* Dstart = D;
-                int sy = std::min(cvFloor(y*ify), ssize.height-1);
-                const uchar* S = src.data + sy*src.step;
-                const uchar* S2 = S - 2;
-#ifdef CV_ICC
-#pragma unroll(4)
-#endif
-                for(x = 0; x < avxWidth; x += 16)
-                {
-                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
-                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
-                    __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
-                    const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
-                    __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2);
-                    __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
-                    __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);
-
-                    __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
-                    __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
-                    _mm256_storeu_si256((__m256i*)D, ints_permuted);
-                    D += 32;
-                }
-                for(; x < width; x++)
-                {
-                    *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
-                }
-            }
-        }
-        _mm256_zeroupper();
-    }
-
-private:
-    const Mat src;
-    Mat dst;
-    int* x_ofs, pix_size4;
-    double ify;
-
-    resizeNNInvokerAVX2(const resizeNNInvokerAVX2&);
-    resizeNNInvokerAVX2& operator=(const resizeNNInvokerAVX2&);
-};
-
-void resizeNN2_AVX2(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int pix_size4, double ify)
-{
-    resizeNNInvokerAVX2 invoker(src, dst, x_ofs, pix_size4, ify);
-    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
-}
-
-void resizeNN4_AVX2(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int pix_size4, double ify)
-{
-    resizeNNInvokerAVX4 invoker(src, dst, x_ofs, pix_size4, ify);
-    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
-}
-
 int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw)
 {
     const int AB_BITS = MAX(10, (int)INTER_BITS);
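
For context on what this hunk deletes: both invokers implement nearest-neighbour resize by gathering pixels through the precomputed byte-offset table `x_ofs[]` with `_mm256_i32gather_epi32`, producing eight 4-byte pixels (resizeNNInvokerAVX4) or sixteen 2-byte pixels (resizeNNInvokerAVX2) per iteration. A minimal scalar sketch of what the 4-byte path computes is below; the standalone function name and signature are illustrative assumptions, not OpenCV API.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>

// Illustrative scalar equivalent of the removed resizeNNInvokerAVX4 loop
// (hypothetical helper, not OpenCV API): pick the nearest source row for each
// destination row via ify, then copy one 4-byte pixel per destination column
// through the precomputed byte offsets x_ofs[x] into that source row.
static void resizeNN4_scalar(const uint8_t* src, size_t src_step, int src_h,
                             uint8_t* dst, size_t dst_step, int dst_w, int dst_h,
                             const int* x_ofs, double ify)
{
    for (int y = 0; y < dst_h; y++)
    {
        int sy = std::min((int)std::floor(y * ify), src_h - 1); // nearest source row
        const uint8_t* S = src + (size_t)sy * src_step;
        uint8_t* D = dst + (size_t)y * dst_step;
        for (int x = 0; x < dst_w; x++)
            std::memcpy(D + (size_t)x * 4, S + x_ofs[x], 4);    // one 4-byte pixel
    }
}
```

The deleted AVX2 code replaces the inner x loop above with 256-bit gathers and masked/unaligned stores; the 2-byte variant additionally blends two gathers and shuffles/permutes the result back into packed 16-bit pixels.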