#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"
+ #include "opencv2/core/hal/intrin.hpp"

namespace cv {

+ #if CV_SIMD128
+ // Per-lane weighted blend: (src1*w1 + src2*w2) / (w1 + w2), with a small
+ // epsilon in the denominator so w1 == w2 == 0 does not divide by zero.
+ static inline v_float32x4 blend(const v_float32x4& v_src1, const v_float32x4& v_src2, const v_float32x4& v_w1, const v_float32x4& v_w2)
+ {
+     const v_float32x4 v_eps = v_setall_f32(1e-5f);
+     v_float32x4 v_denom = v_w1 + v_w2 + v_eps;
+     return (v_src1 * v_w1 + v_src2 * v_w2) / v_denom;
+ }
+ static inline v_float32x4 blend(const v_float32x4& v_src1, const v_float32x4& v_src2, const float* w_ptr1, const float* w_ptr2, int offset)
+ {
+     v_float32x4 v_w1 = v_load(w_ptr1 + offset);
+     v_float32x4 v_w2 = v_load(w_ptr2 + offset);
+     return blend(v_src1, v_src2, v_w1, v_w2);
+ }
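
For reference, each lane of the helper above computes the same normalized blend that the scalar loop performs per element. A minimal scalar sketch of the formula (the helper name is illustrative, not part of the patch):

    // One lane of blend(): weighted average with an epsilon guard so that
    // w1 == w2 == 0 produces 0 instead of a division by zero.
    static inline float blend_scalar(float s1, float s2, float w1, float w2)
    {
        const float eps = 1e-5f;
        return (s1 * w1 + s2 * w2) / (w1 + w2 + eps);
    }
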
+ // Round to nearest and clamp to [0, 255] so the float result fits a byte lane.
+ static inline v_uint32x4 saturate_f32_u32(const v_float32x4& vec)
+ {
+     const v_int32x4 z = v_setzero_s32();
+     const v_int32x4 x = v_setall_s32(255);
+     return v_reinterpret_as_u32(v_min(v_max(v_round(vec), z), x));
+ }
+ // Narrow four float vectors (16 values) into one vector of 16 uchars.
+ static inline v_uint8x16 pack_f32tou8(v_float32x4& val0, v_float32x4& val1, v_float32x4& val2, v_float32x4& val3)
+ {
+     v_uint32x4 a = saturate_f32_u32(val0);
+     v_uint32x4 b = saturate_f32_u32(val1);
+     v_uint32x4 c = saturate_f32_u32(val2);
+     v_uint32x4 d = saturate_f32_u32(val3);
+     v_uint16x8 e = v_pack(a, b);
+     v_uint16x8 f = v_pack(c, d);
+     return v_pack(e, f);
+ }
+ static inline void store_pack_f32tou8(uchar* ptr, v_float32x4& val0, v_float32x4& val1, v_float32x4& val2, v_float32x4& val3)
+ {
+     v_store(ptr, pack_f32tou8(val0, val1, val2, val3));
+ }
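
To see what the saturating pack does at the boundaries, a hedged illustration (lane values chosen for this note; the 4-argument v_float32x4 constructor is part of the universal intrinsics API):

    // -3.2 rounds to -3 and clamps to 0; 300.0 clamps to 255.
    v_float32x4 v(-3.2f, 0.4f, 254.6f, 300.0f);
    v_uint32x4 u = saturate_f32_u32(v);   // lanes: {0, 0, 255, 255}
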
+ // Widen 16 uchars into four float vectors (u8 -> u16 -> u32 -> f32).
+ static inline void expand_u8tof32(const v_uint8x16& src, v_float32x4& dst0, v_float32x4& dst1, v_float32x4& dst2, v_float32x4& dst3)
+ {
+     v_uint16x8 a0, a1;
+     v_expand(src, a0, a1);
+     v_uint32x4 b0, b1, b2, b3;
+     v_expand(a0, b0, b1);
+     v_expand(a1, b2, b3);
+     dst0 = v_cvt_f32(v_reinterpret_as_s32(b0));
+     dst1 = v_cvt_f32(v_reinterpret_as_s32(b1));
+     dst2 = v_cvt_f32(v_reinterpret_as_s32(b2));
+     dst3 = v_cvt_f32(v_reinterpret_as_s32(b3));
+ }
+ static inline void load_expand_u8tof32(const uchar* ptr, v_float32x4& dst0, v_float32x4& dst1, v_float32x4& dst2, v_float32x4& dst3)
+ {
+     v_uint8x16 a = v_load(ptr);
+     expand_u8tof32(a, dst0, dst1, dst2, dst3);
+ }
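
These widen/narrow helpers are inverses for in-range values, so a round trip preserves the original bytes. A small sketch with a hypothetical buffer, not part of the patch:

    uchar buf_in[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
    uchar buf_out[16];
    v_float32x4 f0, f1, f2, f3;
    load_expand_u8tof32(buf_in, f0, f1, f2, f3);    // u8 -> f32, 16 values
    store_pack_f32tou8(buf_out, f0, f1, f2, f3);    // f32 -> u8, buf_out == buf_in
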
+ int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn)
+ {
+     const v_float32x4 v_eps = v_setall_f32(1e-5f);
+     int weight_offset = 0;
+     int step = v_uint8x16::nlanes * cn;
+     // Weights are per pixel, so every 16-pixel iteration consumes 16 weights
+     // for cn = 1..3; the 4-channel case overrides both strides below.
+     int weight_step = v_uint8x16::nlanes;
+     switch (cn)
+     {
+     case 1:
+         for ( ; x <= width - step; x += step, weight_offset += weight_step)
+         {
+             v_float32x4 v_src10, v_src11, v_src12, v_src13;
+             v_float32x4 v_src20, v_src21, v_src22, v_src23;
+             load_expand_u8tof32(src1 + x, v_src10, v_src11, v_src12, v_src13);
+             load_expand_u8tof32(src2 + x, v_src20, v_src21, v_src22, v_src23);
+
+             v_float32x4 v_dst0 = blend(v_src10, v_src20, weights1, weights2, weight_offset);
+             v_float32x4 v_dst1 = blend(v_src11, v_src21, weights1, weights2, weight_offset + 4);
+             v_float32x4 v_dst2 = blend(v_src12, v_src22, weights1, weights2, weight_offset + 8);
+             v_float32x4 v_dst3 = blend(v_src13, v_src23, weights1, weights2, weight_offset + 12);
+
+             store_pack_f32tou8(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
+         }
+         break;
+     case 2:
+         for ( ; x <= width - step; x += step, weight_offset += weight_step)
+         {
+             v_uint8x16 v_src10, v_src11, v_src20, v_src21;
+             v_load_deinterleave(src1 + x, v_src10, v_src11);
+             v_load_deinterleave(src2 + x, v_src20, v_src21);
+             v_float32x4 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113;
+             v_float32x4 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213;
+             expand_u8tof32(v_src10, v_src100, v_src101, v_src102, v_src103);
+             expand_u8tof32(v_src11, v_src110, v_src111, v_src112, v_src113);
+             expand_u8tof32(v_src20, v_src200, v_src201, v_src202, v_src203);
+             expand_u8tof32(v_src21, v_src210, v_src211, v_src212, v_src213);
+
+             v_float32x4 v_dst0 = blend(v_src100, v_src200, weights1, weights2, weight_offset);
+             v_float32x4 v_dst1 = blend(v_src110, v_src210, weights1, weights2, weight_offset);
+             v_float32x4 v_dst2 = blend(v_src101, v_src201, weights1, weights2, weight_offset + 4);
+             v_float32x4 v_dst3 = blend(v_src111, v_src211, weights1, weights2, weight_offset + 4);
+             v_float32x4 v_dst4 = blend(v_src102, v_src202, weights1, weights2, weight_offset + 8);
+             v_float32x4 v_dst5 = blend(v_src112, v_src212, weights1, weights2, weight_offset + 8);
+             v_float32x4 v_dst6 = blend(v_src103, v_src203, weights1, weights2, weight_offset + 12);
+             v_float32x4 v_dst7 = blend(v_src113, v_src213, weights1, weights2, weight_offset + 12);
+
+             v_uint8x16 v_dsta = pack_f32tou8(v_dst0, v_dst2, v_dst4, v_dst6);
+             v_uint8x16 v_dstb = pack_f32tou8(v_dst1, v_dst3, v_dst5, v_dst7);
+             v_store_interleave(dst + x, v_dsta, v_dstb);
+         }
+         break;
+     case 3:
+         for ( ; x <= width - step; x += step, weight_offset += weight_step)
+         {
+             v_uint8x16 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
+             v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12);
+             v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22);
+
+             v_float32x4 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113, v_src120, v_src121, v_src122, v_src123;
+             v_float32x4 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213, v_src220, v_src221, v_src222, v_src223;
+             expand_u8tof32(v_src10, v_src100, v_src101, v_src102, v_src103);
+             expand_u8tof32(v_src11, v_src110, v_src111, v_src112, v_src113);
+             expand_u8tof32(v_src12, v_src120, v_src121, v_src122, v_src123);
+             expand_u8tof32(v_src20, v_src200, v_src201, v_src202, v_src203);
+             expand_u8tof32(v_src21, v_src210, v_src211, v_src212, v_src213);
+             expand_u8tof32(v_src22, v_src220, v_src221, v_src222, v_src223);
+
+             v_float32x4 v_w10 = v_load(weights1 + weight_offset);
+             v_float32x4 v_w11 = v_load(weights1 + weight_offset + 4);
+             v_float32x4 v_w12 = v_load(weights1 + weight_offset + 8);
+             v_float32x4 v_w13 = v_load(weights1 + weight_offset + 12);
+             v_float32x4 v_w20 = v_load(weights2 + weight_offset);
+             v_float32x4 v_w21 = v_load(weights2 + weight_offset + 4);
+             v_float32x4 v_w22 = v_load(weights2 + weight_offset + 8);
+             v_float32x4 v_w23 = v_load(weights2 + weight_offset + 12);
+             v_src100 = blend(v_src100, v_src200, v_w10, v_w20);
+             v_src110 = blend(v_src110, v_src210, v_w10, v_w20);
+             v_src120 = blend(v_src120, v_src220, v_w10, v_w20);
+             v_src101 = blend(v_src101, v_src201, v_w11, v_w21);
+             v_src111 = blend(v_src111, v_src211, v_w11, v_w21);
+             v_src121 = blend(v_src121, v_src221, v_w11, v_w21);
+             v_src102 = blend(v_src102, v_src202, v_w12, v_w22);
+             v_src112 = blend(v_src112, v_src212, v_w12, v_w22);
+             v_src122 = blend(v_src122, v_src222, v_w12, v_w22);
+             v_src103 = blend(v_src103, v_src203, v_w13, v_w23);
+             v_src113 = blend(v_src113, v_src213, v_w13, v_w23);
+             v_src123 = blend(v_src123, v_src223, v_w13, v_w23);
+
+             v_uint8x16 v_dst0 = pack_f32tou8(v_src100, v_src101, v_src102, v_src103);
+             v_uint8x16 v_dst1 = pack_f32tou8(v_src110, v_src111, v_src112, v_src113);
+             v_uint8x16 v_dst2 = pack_f32tou8(v_src120, v_src121, v_src122, v_src123);
+
+             v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2);
+         }
+         break;
+     case 4:
+         step = v_uint8x16::nlanes;
+         weight_step = v_float32x4::nlanes;
+         for ( ; x <= width - step; x += step, weight_offset += weight_step)
+         {
+             v_float32x4 v_src10, v_src11, v_src12, v_src13, v_src14, v_src15, v_src16, v_src17;
+             v_float32x4 v_src20, v_src21, v_src22, v_src23, v_src24, v_src25, v_src26, v_src27;
+             load_expand_u8tof32(src1 + x, v_src10, v_src11, v_src12, v_src13);
+             load_expand_u8tof32(src2 + x, v_src20, v_src21, v_src22, v_src23);
+             // Transpose so each vector holds one channel of four pixels,
+             // letting a single per-pixel weight vector apply to all channels.
+             v_transpose4x4(v_src10, v_src11, v_src12, v_src13, v_src14, v_src15, v_src16, v_src17);
+             v_transpose4x4(v_src20, v_src21, v_src22, v_src23, v_src24, v_src25, v_src26, v_src27);
+
+             v_float32x4 v_w1 = v_load(weights1 + weight_offset);
+             v_float32x4 v_w2 = v_load(weights2 + weight_offset);
+             v_float32x4 v_denom = v_w1 + v_w2 + v_eps;
+             v_src10 = (v_src14 * v_w1 + v_src24 * v_w2) / v_denom;
+             v_src11 = (v_src15 * v_w1 + v_src25 * v_w2) / v_denom;
+             v_src12 = (v_src16 * v_w1 + v_src26 * v_w2) / v_denom;
+             v_src13 = (v_src17 * v_w1 + v_src27 * v_w2) / v_denom;
+             v_float32x4 v_dst0, v_dst1, v_dst2, v_dst3;
+             v_transpose4x4(v_src10, v_src11, v_src12, v_src13, v_dst0, v_dst1, v_dst2, v_dst3);
+
+             store_pack_f32tou8(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
+         }
+         break;
+     default:
+         break;
+     }
+     return x;
+ }
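
The function returns the first index the vector loops did not reach, so the caller finishes the row with the scalar formula. A hedged sketch of the intended calling pattern (variable names are illustrative; the invoker change below is the authoritative version):

    int x = 0;
    x = blendLinearSimd128(src1_row, src2_row, w1_row, w2_row, dst_row, x, width, cn);
    for ( ; x < width; ++x)   // scalar tail
    {
        float w1 = w1_row[x / cn], w2 = w2_row[x / cn];
        dst_row[x] = saturate_cast<uchar>(
            (src1_row[x] * w1 + src2_row[x] * w2) / (w1 + w2 + 1e-5f));
    }
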
+
+ int blendLinearSimd128(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn)
+ {
+     const v_float32x4 v_eps = v_setall_f32(1e-5f);
+     int weight_offset = 0;
+     int step = v_float32x4::nlanes * cn;
+     switch (cn)
+     {
+     case 1:
+         for ( ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
+         {
+             v_float32x4 v_src1 = v_load(src1 + x);
+             v_float32x4 v_src2 = v_load(src2 + x);
+             v_float32x4 v_w1 = v_load(weights1 + weight_offset);
+             v_float32x4 v_w2 = v_load(weights2 + weight_offset);
+             v_float32x4 v_denom = v_w1 + v_w2 + v_eps;
+             v_float32x4 v_dst = (v_src1 * v_w1 + v_src2 * v_w2) / v_denom;
+
+             v_store(dst + x, v_dst);
+         }
+         break;
+     case 2:
+         for ( ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
+         {
+             v_float32x4 v_src10, v_src11, v_src20, v_src21;
+             v_load_deinterleave(src1 + x, v_src10, v_src11);
+             v_load_deinterleave(src2 + x, v_src20, v_src21);
+             v_float32x4 v_w1 = v_load(weights1 + weight_offset);
+             v_float32x4 v_w2 = v_load(weights2 + weight_offset);
+             v_float32x4 v_denom = v_w1 + v_w2 + v_eps;
+             v_float32x4 v_dst0 = (v_src10 * v_w1 + v_src20 * v_w2) / v_denom;
+             v_float32x4 v_dst1 = (v_src11 * v_w1 + v_src21 * v_w2) / v_denom;
+
+             v_store_interleave(dst + x, v_dst0, v_dst1);
+         }
+         break;
+     case 3:
+         for ( ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
+         {
+             v_float32x4 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
+             v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12);
+             v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22);
+             v_float32x4 v_w1 = v_load(weights1 + weight_offset);
+             v_float32x4 v_w2 = v_load(weights2 + weight_offset);
+             v_float32x4 v_denom = v_w1 + v_w2 + v_eps;
+             v_float32x4 v_dst0 = (v_src10 * v_w1 + v_src20 * v_w2) / v_denom;
+             v_float32x4 v_dst1 = (v_src11 * v_w1 + v_src21 * v_w2) / v_denom;
+             v_float32x4 v_dst2 = (v_src12 * v_w1 + v_src22 * v_w2) / v_denom;
+
+             v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2);
+         }
+         break;
+     case 4:
+         for ( ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
+         {
+             v_float32x4 v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23;
+             v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12, v_src13);
+             v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22, v_src23);
+             v_float32x4 v_w1 = v_load(weights1 + weight_offset);
+             v_float32x4 v_w2 = v_load(weights2 + weight_offset);
+             v_float32x4 v_denom = v_w1 + v_w2 + v_eps;
+             v_float32x4 v_dst0 = (v_src10 * v_w1 + v_src20 * v_w2) / v_denom;
+             v_float32x4 v_dst1 = (v_src11 * v_w1 + v_src21 * v_w2) / v_denom;
+             v_float32x4 v_dst2 = (v_src12 * v_w1 + v_src22 * v_w2) / v_denom;
+             v_float32x4 v_dst3 = (v_src13 * v_w1 + v_src23 * v_w2) / v_denom;
+
+             v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
+         }
+         break;
+     default:
+         break;
+     }
+     return x;
+ }
+ #endif
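
For context, these kernels sit behind the public cv::blendLinear entry point. A minimal usage example, assuming only the documented API (weights are single-channel CV_32F maps of the same size as the images):

    #include <opencv2/imgproc.hpp>

    cv::Mat a(480, 640, CV_8UC3, cv::Scalar(255, 0, 0));
    cv::Mat b(480, 640, CV_8UC3, cv::Scalar(0, 0, 255));
    cv::Mat w1(480, 640, CV_32FC1, cv::Scalar(0.25f));
    cv::Mat w2(480, 640, CV_32FC1, cv::Scalar(0.75f));
    cv::Mat dst;
    cv::blendLinear(a, b, w1, w2, dst);   // per pixel: (a*w1 + b*w2) / (w1 + w2)
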

template <typename T>
class BlendLinearInvoker :
@@ -71,7 +323,12 @@ class BlendLinearInvoker :
        const T * const src2_row = src2->ptr<T>(y);
        T * const dst_row = dst->ptr<T>(y);

-         for (int x = 0; x < width; ++x)
+         int x = 0;
+ #if CV_SIMD128
+         x = blendLinearSimd128(src1_row, src2_row, weights1_row, weights2_row, dst_row, x, width, cn);
+ #endif
+
+         for ( ; x < width; ++x)
        {
            int x1 = x / cn;
            float w1 = weights1_row[x1], w2 = weights2_row[x1];