 #include "precomp.hpp"
 #include "opencl_kernels_imgproc.hpp"
+#include "opencv2/core/hal/intrin.hpp"

 namespace cv {

+#if CV_SIMD128
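+// Per-lane weighted average: (v_src1*v_w1 + v_src2*v_w2) / (v_w1 + v_w2 + eps).
+// The epsilon keeps the division well-defined where both weights are zero.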
+static inline v_float32x4 blend(const v_float32x4& v_src1, const v_float32x4& v_src2, const v_float32x4& v_w1, const v_float32x4& v_w2)
+{
+    const v_float32x4 v_eps = v_setall_f32(1e-5f);
+    v_float32x4 v_denom = v_w1 + v_w2 + v_eps;
+    return (v_src1 * v_w1 + v_src2 * v_w2) / v_denom;
+}
+static inline v_float32x4 blend(const v_float32x4& v_src1, const v_float32x4& v_src2, const float* w_ptr1, const float* w_ptr2, int offset)
+{
+    v_float32x4 v_w1 = v_load(w_ptr1 + offset);
+    v_float32x4 v_w2 = v_load(w_ptr2 + offset);
+    return blend(v_src1, v_src2, v_w1, v_w2);
+}
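+// Round to nearest and clamp to [0, 255]; the result is reinterpreted as unsigned.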
+static inline v_uint32x4 saturate_f32_u32(const v_float32x4& vec)
+{
+    const v_int32x4 z = v_setzero_s32();
+    const v_int32x4 x = v_setall_s32(255);
+    return v_reinterpret_as_u32(v_min(v_max(v_round(vec), z), x));
+}
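+// Saturating narrowing: four float vectors (16 values) packed into one v_uint8x16.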
+static inline v_uint8x16 pack_f32tou8(v_float32x4& val0, v_float32x4& val1, v_float32x4& val2, v_float32x4& val3)
+{
+    v_uint32x4 a = saturate_f32_u32(val0);
+    v_uint32x4 b = saturate_f32_u32(val1);
+    v_uint32x4 c = saturate_f32_u32(val2);
+    v_uint32x4 d = saturate_f32_u32(val3);
+    v_uint16x8 e = v_pack(a, b);
+    v_uint16x8 f = v_pack(c, d);
+    return v_pack(e, f);
+}
+static inline void store_pack_f32tou8(uchar* ptr, v_float32x4& val0, v_float32x4& val1, v_float32x4& val2, v_float32x4& val3)
+{
+    v_store(ptr, pack_f32tou8(val0, val1, val2, val3));
+}
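+// Widening: expand one v_uint8x16 into four float vectors of four lanes each.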
+static inline void expand_u8tof32(const v_uint8x16& src, v_float32x4& dst0, v_float32x4& dst1, v_float32x4& dst2, v_float32x4& dst3)
+{
+    v_uint16x8 a0, a1;
+    v_expand(src, a0, a1);
+    v_uint32x4 b0, b1, b2, b3;
+    v_expand(a0, b0, b1);
+    v_expand(a1, b2, b3);
+    dst0 = v_cvt_f32(v_reinterpret_as_s32(b0));
+    dst1 = v_cvt_f32(v_reinterpret_as_s32(b1));
+    dst2 = v_cvt_f32(v_reinterpret_as_s32(b2));
+    dst3 = v_cvt_f32(v_reinterpret_as_s32(b3));
+}
+static inline void load_expand_u8tof32(const uchar* ptr, v_float32x4& dst0, v_float32x4& dst1, v_float32x4& dst2, v_float32x4& dst3)
+{
+    v_uint8x16 a = v_load(ptr);
+    expand_u8tof32(a, dst0, dst1, dst2, dst3);
+}
+int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn);
+int blendLinearSimd128(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn);
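+// 8-bit kernel: blends as many whole vectors as fit in [x, width) and returns
+// the index at which the caller's scalar loop should resume.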
+int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn)
+{
+    int step = v_uint8x16::nlanes * cn;
+    int weight_step = v_uint8x16::nlanes;
+    switch (cn)
+    {
+    case 1:
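+        // cn == 1: 16 pixels per iteration; weights map one-to-one onto lanes.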
+        for (int weight_offset = 0; x <= width - step; x += step, weight_offset += weight_step)
+        {
+            v_float32x4 v_src10, v_src11, v_src12, v_src13;
+            v_float32x4 v_src20, v_src21, v_src22, v_src23;
+            load_expand_u8tof32(src1 + x, v_src10, v_src11, v_src12, v_src13);
+            load_expand_u8tof32(src2 + x, v_src20, v_src21, v_src22, v_src23);
+
+            v_float32x4 v_dst0 = blend(v_src10, v_src20, weights1, weights2, weight_offset);
+            v_float32x4 v_dst1 = blend(v_src11, v_src21, weights1, weights2, weight_offset + 4);
+            v_float32x4 v_dst2 = blend(v_src12, v_src22, weights1, weights2, weight_offset + 8);
+            v_float32x4 v_dst3 = blend(v_src13, v_src23, weights1, weights2, weight_offset + 12);
+
+            store_pack_f32tou8(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
+        }
+        break;
+    case 2:
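+        // cn == 2: deinterleave the channels; both channels of a pixel share its weight.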
+        for (int weight_offset = 0; x <= width - step; x += step, weight_offset += weight_step)
+        {
+            v_uint8x16 v_src10, v_src11, v_src20, v_src21;
+            v_load_deinterleave(src1 + x, v_src10, v_src11);
+            v_load_deinterleave(src2 + x, v_src20, v_src21);
+            v_float32x4 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113;
+            v_float32x4 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213;
+            expand_u8tof32(v_src10, v_src100, v_src101, v_src102, v_src103);
+            expand_u8tof32(v_src11, v_src110, v_src111, v_src112, v_src113);
+            expand_u8tof32(v_src20, v_src200, v_src201, v_src202, v_src203);
+            expand_u8tof32(v_src21, v_src210, v_src211, v_src212, v_src213);
+
+            v_float32x4 v_dst0 = blend(v_src100, v_src200, weights1, weights2, weight_offset);
+            v_float32x4 v_dst1 = blend(v_src110, v_src210, weights1, weights2, weight_offset);
+            v_float32x4 v_dst2 = blend(v_src101, v_src201, weights1, weights2, weight_offset + 4);
+            v_float32x4 v_dst3 = blend(v_src111, v_src211, weights1, weights2, weight_offset + 4);
+            v_float32x4 v_dst4 = blend(v_src102, v_src202, weights1, weights2, weight_offset + 8);
+            v_float32x4 v_dst5 = blend(v_src112, v_src212, weights1, weights2, weight_offset + 8);
+            v_float32x4 v_dst6 = blend(v_src103, v_src203, weights1, weights2, weight_offset + 12);
+            v_float32x4 v_dst7 = blend(v_src113, v_src213, weights1, weights2, weight_offset + 12);
+
+            v_uint8x16 v_dsta = pack_f32tou8(v_dst0, v_dst2, v_dst4, v_dst6);
+            v_uint8x16 v_dstb = pack_f32tou8(v_dst1, v_dst3, v_dst5, v_dst7);
+            v_store_interleave(dst + x, v_dsta, v_dstb);
+        }
+        break;
+    case 3:
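+        // cn == 3: the same scheme over three deinterleaved channel planes.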
+        for (int weight_offset = 0; x <= width - step; x += step, weight_offset += weight_step)
+        {
+            v_uint8x16 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
+            v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12);
+            v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22);
+
+            v_float32x4 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113, v_src120, v_src121, v_src122, v_src123;
+            v_float32x4 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213, v_src220, v_src221, v_src222, v_src223;
+            expand_u8tof32(v_src10, v_src100, v_src101, v_src102, v_src103);
+            expand_u8tof32(v_src11, v_src110, v_src111, v_src112, v_src113);
+            expand_u8tof32(v_src12, v_src120, v_src121, v_src122, v_src123);
+            expand_u8tof32(v_src20, v_src200, v_src201, v_src202, v_src203);
+            expand_u8tof32(v_src21, v_src210, v_src211, v_src212, v_src213);
+            expand_u8tof32(v_src22, v_src220, v_src221, v_src222, v_src223);
+
+            v_float32x4 v_w10 = v_load(weights1 + weight_offset);
+            v_float32x4 v_w11 = v_load(weights1 + weight_offset + 4);
+            v_float32x4 v_w12 = v_load(weights1 + weight_offset + 8);
+            v_float32x4 v_w13 = v_load(weights1 + weight_offset + 12);
+            v_float32x4 v_w20 = v_load(weights2 + weight_offset);
+            v_float32x4 v_w21 = v_load(weights2 + weight_offset + 4);
+            v_float32x4 v_w22 = v_load(weights2 + weight_offset + 8);
+            v_float32x4 v_w23 = v_load(weights2 + weight_offset + 12);
+            v_src100 = blend(v_src100, v_src200, v_w10, v_w20);
+            v_src110 = blend(v_src110, v_src210, v_w10, v_w20);
+            v_src120 = blend(v_src120, v_src220, v_w10, v_w20);
+            v_src101 = blend(v_src101, v_src201, v_w11, v_w21);
+            v_src111 = blend(v_src111, v_src211, v_w11, v_w21);
+            v_src121 = blend(v_src121, v_src221, v_w11, v_w21);
+            v_src102 = blend(v_src102, v_src202, v_w12, v_w22);
+            v_src112 = blend(v_src112, v_src212, v_w12, v_w22);
+            v_src122 = blend(v_src122, v_src222, v_w12, v_w22);
+            v_src103 = blend(v_src103, v_src203, v_w13, v_w23);
+            v_src113 = blend(v_src113, v_src213, v_w13, v_w23);
+            v_src123 = blend(v_src123, v_src223, v_w13, v_w23);
+
+            v_uint8x16 v_dst0 = pack_f32tou8(v_src100, v_src101, v_src102, v_src103);
+            v_uint8x16 v_dst1 = pack_f32tou8(v_src110, v_src111, v_src112, v_src113);
+            v_uint8x16 v_dst2 = pack_f32tou8(v_src120, v_src121, v_src122, v_src123);
+            v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2);
+        }
+        break;
+    case 4:
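+        // cn == 4: 4 pixels per iteration. Transpose to channel-major order so one
+        // weight vector (one weight per pixel) applies to every channel vector,
+        // then transpose back before packing.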
+        step = v_uint8x16::nlanes;
+        weight_step = v_float32x4::nlanes;
+        for (int weight_offset = 0; x <= width - step; x += step, weight_offset += weight_step)
+        {
+            v_float32x4 v_src10, v_src11, v_src12, v_src13, v_src14, v_src15, v_src16, v_src17;
+            v_float32x4 v_src20, v_src21, v_src22, v_src23, v_src24, v_src25, v_src26, v_src27;
+            load_expand_u8tof32(src1 + x, v_src10, v_src11, v_src12, v_src13);
+            load_expand_u8tof32(src2 + x, v_src20, v_src21, v_src22, v_src23);
+
+            v_transpose4x4(v_src10, v_src11, v_src12, v_src13, v_src14, v_src15, v_src16, v_src17);
+            v_transpose4x4(v_src20, v_src21, v_src22, v_src23, v_src24, v_src25, v_src26, v_src27);
+
+            v_float32x4 v_w1 = v_load(weights1 + weight_offset);
+            v_float32x4 v_w2 = v_load(weights2 + weight_offset);
+            v_src10 = blend(v_src14, v_src24, v_w1, v_w2);
+            v_src11 = blend(v_src15, v_src25, v_w1, v_w2);
+            v_src12 = blend(v_src16, v_src26, v_w1, v_w2);
+            v_src13 = blend(v_src17, v_src27, v_w1, v_w2);
+
+            v_float32x4 v_dst0, v_dst1, v_dst2, v_dst3;
+            v_transpose4x4(v_src10, v_src11, v_src12, v_src13, v_dst0, v_dst1, v_dst2, v_dst3);
+            store_pack_f32tou8(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
+        }
+        break;
+    default:
+        break;
+    }
+    return x;
+}
+
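+// 32-bit float kernel: same structure, v_float32x4::nlanes pixels per vector,
+// with no widening or narrowing needed.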
+int blendLinearSimd128(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn)
+{
+    int step = v_float32x4::nlanes * cn;
+    switch (cn)
+    {
+    case 1:
+        for (int weight_offset = 0; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
+        {
+            v_float32x4 v_src1 = v_load(src1 + x);
+            v_float32x4 v_src2 = v_load(src2 + x);
+            v_float32x4 v_w1 = v_load(weights1 + weight_offset);
+            v_float32x4 v_w2 = v_load(weights2 + weight_offset);
+
+            v_float32x4 v_dst = blend(v_src1, v_src2, v_w1, v_w2);
+
+            v_store(dst + x, v_dst);
+        }
+        break;
+    case 2:
+        for (int weight_offset = 0; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
+        {
+            v_float32x4 v_src10, v_src11, v_src20, v_src21;
+            v_load_deinterleave(src1 + x, v_src10, v_src11);
+            v_load_deinterleave(src2 + x, v_src20, v_src21);
+            v_float32x4 v_w1 = v_load(weights1 + weight_offset);
+            v_float32x4 v_w2 = v_load(weights2 + weight_offset);
+
+            v_float32x4 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2);
+            v_float32x4 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2);
+
+            v_store_interleave(dst + x, v_dst0, v_dst1);
+        }
+        break;
+    case 3:
+        for (int weight_offset = 0; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
+        {
+            v_float32x4 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
+            v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12);
+            v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22);
+            v_float32x4 v_w1 = v_load(weights1 + weight_offset);
+            v_float32x4 v_w2 = v_load(weights2 + weight_offset);
+
+            v_float32x4 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2);
+            v_float32x4 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2);
+            v_float32x4 v_dst2 = blend(v_src12, v_src22, v_w1, v_w2);
+
+            v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2);
+        }
+        break;
+    case 4:
+        for (int weight_offset = 0; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
+        {
+            v_float32x4 v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23;
+            v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12, v_src13);
+            v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22, v_src23);
+            v_float32x4 v_w1 = v_load(weights1 + weight_offset);
+            v_float32x4 v_w2 = v_load(weights2 + weight_offset);
+
+            v_float32x4 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2);
+            v_float32x4 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2);
+            v_float32x4 v_dst2 = blend(v_src12, v_src22, v_w1, v_w2);
+            v_float32x4 v_dst3 = blend(v_src13, v_src23, v_w1, v_w2);
+
+            v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
+        }
+        break;
+    default:
+        break;
+    }
+    return x;
+}
+#endif

 template <typename T>
 class BlendLinearInvoker :
@@ -71,7 +320,12 @@ class BlendLinearInvoker :
             const T * const src2_row = src2->ptr<T>(y);
             T * const dst_row = dst->ptr<T>(y);

-            for (int x = 0; x < width; ++x)
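+            // Vector fast path; it returns how far it got so the scalar loop
+            // below finishes the remaining tail.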
+            int x = 0;
+            #if CV_SIMD128
+            x = blendLinearSimd128(src1_row, src2_row, weights1_row, weights2_row, dst_row, x, width, cn);
+            #endif
+
+            for ( ; x < width; ++x)
             {
                 int x1 = x / cn;
                 float w1 = weights1_row[x1], w2 = weights2_row[x1];
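
A minimal usage sketch of the public entry point this commit accelerates (not
part of the diff; the helper name and the ramp weights are illustrative). The
weight matrices are single-channel CV_32F and the same size as the sources:

    #include <algorithm>
    #include <opencv2/imgproc.hpp>

    // Cross-fade two same-size images with a horizontal ramp:
    // the left edge shows only 'a', the right edge only 'b'.
    void blendExample(const cv::Mat& a, const cv::Mat& b, cv::Mat& dst)
    {
        cv::Mat w1(a.size(), CV_32FC1), w2(a.size(), CV_32FC1);
        for (int x = 0; x < a.cols; ++x)
        {
            float t = float(x) / float(std::max(a.cols - 1, 1));
            w1.col(x).setTo(1.0f - t);  // weight of 'a'
            w2.col(x).setTo(t);         // weight of 'b'
        }
        // Element-wise, per channel: dst = (a*w1 + b*w2) / (w1 + w2 + 1e-5f)
        cv::blendLinear(a, b, w1, w2, dst);
    }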