@@ -98,16 +98,16 @@ static inline void load_expand_u8tof32(const uchar* ptr, v_float32x4& dst0, v_fl
98
98
v_uint8x16 a = v_load ((ptr));
99
99
expand_u8tof32 (a, dst0, dst1, dst2, dst3);
100
100
}
101
+ int blendLinearSimd128 (const uchar* src1, const uchar* src2, const float * weights1, const float * weights2, uchar* dst, int x, int width, int cn);
102
+ int blendLinearSimd128 (const float * src1, const float * src2, const float * weights1, const float * weights2, float * dst, int x, int width, int cn);
101
103
int blendLinearSimd128 (const uchar* src1, const uchar* src2, const float * weights1, const float * weights2, uchar* dst, int x, int width, int cn)
102
104
{
103
- const v_float32x4 v_eps = v_setall_f32 (1e-5f );
104
- int weight_offset = 0 ;
105
105
int step = v_uint8x16::nlanes * cn;
106
- int weight_step = v_uint8x16::nlanes*cn ;
106
+ int weight_step = v_uint8x16::nlanes;
107
107
switch (cn)
108
108
{
109
109
case 1 :
110
- for ( ; x <= width - step; x += step, weight_offset += weight_step)
110
+ for (int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step)
111
111
{
112
112
v_float32x4 v_src10, v_src11, v_src12, v_src13;
113
113
v_float32x4 v_src20, v_src21, v_src22, v_src23;
@@ -123,7 +123,7 @@ int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weight
123
123
}
124
124
break ;
125
125
case 2 :
126
- for ( ; x <= width - step; x += step, weight_offset += weight_step)
126
+ for (int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step)
127
127
{
128
128
v_uint8x16 v_src10, v_src11, v_src20, v_src21;
129
129
v_load_deinterleave (src1 + x, v_src10, v_src11);
@@ -150,7 +150,7 @@ int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weight
150
150
}
151
151
break ;
152
152
case 3 :
153
- for ( ; x <= width - step; x += step, weight_offset += weight_step)
153
+ for (int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step)
154
154
{
155
155
v_uint8x16 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
156
156
v_load_deinterleave (src1 + x, v_src10, v_src11, v_src12);
@@ -190,32 +190,31 @@ int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weight
190
190
v_uint8x16 v_dst0 = pack_f32tou8 (v_src100, v_src101, v_src102, v_src103);
191
191
v_uint8x16 v_dst1 = pack_f32tou8 (v_src110, v_src111, v_src112, v_src113);
192
192
v_uint8x16 v_dst2 = pack_f32tou8 (v_src120, v_src121, v_src122, v_src123);
193
-
194
193
v_store_interleave (dst + x, v_dst0, v_dst1, v_dst2);
195
194
}
196
195
break ;
197
196
case 4 :
198
197
step = v_uint8x16::nlanes;
199
198
weight_step = v_float32x4::nlanes;
200
- for ( ; x <= width - step; x += step, weight_offset += weight_step)
199
+ for (int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step)
201
200
{
202
201
v_float32x4 v_src10, v_src11, v_src12, v_src13, v_src14, v_src15, v_src16, v_src17;
203
202
v_float32x4 v_src20, v_src21, v_src22, v_src23, v_src24, v_src25, v_src26, v_src27;
204
203
load_expand_u8tof32 (src1 + x, v_src10, v_src11, v_src12, v_src13);
205
204
load_expand_u8tof32 (src2 + x, v_src20, v_src21, v_src22, v_src23);
205
+
206
206
v_transpose4x4 (v_src10, v_src11, v_src12, v_src13, v_src14, v_src15, v_src16, v_src17);
207
207
v_transpose4x4 (v_src20, v_src21, v_src22, v_src23, v_src24, v_src25, v_src26, v_src27);
208
208
209
209
v_float32x4 v_w1 = v_load (weights1 + weight_offset);
210
210
v_float32x4 v_w2 = v_load (weights2 + weight_offset);
211
- v_float32x4 v_denom = v_w1 + v_w2 + v_eps ;
212
- v_src10 = (v_src14 * v_w1 + v_src24 * v_w2) / v_denom ;
213
- v_src11 = (v_src15 * v_w1 + v_src25 * v_w2) / v_denom ;
214
- v_src12 = (v_src16 * v_w1 + v_src26 * v_w2) / v_denom ;
215
- v_src13 = (v_src17 * v_w1 + v_src27 * v_w2) / v_denom;
211
+ v_src10 = blend (v_src14, v_src24, v_w1, v_w2) ;
212
+ v_src11 = blend (v_src15, v_src25, v_w1, v_w2);
213
+ v_src12 = blend (v_src16, v_src26, v_w1, v_w2);
214
+ v_src13 = blend (v_src17, v_src27, v_w1, v_w2);
215
+
216
216
v_float32x4 v_dst0, v_dst1, v_dst2, v_dst3;
217
217
v_transpose4x4 (v_src10, v_src11, v_src12, v_src13, v_dst0, v_dst1, v_dst2, v_dst3);
218
-
219
218
store_pack_f32tou8 (dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
220
219
}
221
220
break ;
@@ -227,68 +226,66 @@ int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weight
227
226
228
227
int blendLinearSimd128 (const float * src1, const float * src2, const float * weights1, const float * weights2, float * dst, int x, int width, int cn)
229
228
{
230
- const v_float32x4 v_eps = v_setall_f32 (1e-5f );
231
- int weight_offset = 0 ;
232
229
int step = v_float32x4::nlanes*cn;
233
230
switch (cn)
234
231
{
235
232
case 1 :
236
- for ( ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
233
+ for (int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
237
234
{
238
235
v_float32x4 v_src1 = v_load (src1 + x);
239
236
v_float32x4 v_src2 = v_load (src2 + x);
240
237
v_float32x4 v_w1 = v_load (weights1 + weight_offset);
241
238
v_float32x4 v_w2 = v_load (weights2 + weight_offset);
242
- v_float32x4 v_denom = v_w1 + v_w2 + v_eps;
243
- v_float32x4 v_dst = (v_src1 * v_w1 + v_src2 * v_w2) / v_denom ;
239
+
240
+ v_float32x4 v_dst = blend (v_src1, v_src2, v_w1, v_w2);
244
241
245
242
v_store (dst + x, v_dst);
246
243
}
247
244
break ;
248
245
case 2 :
249
- for ( ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
246
+ for (int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
250
247
{
251
248
v_float32x4 v_src10, v_src11, v_src20, v_src21;
252
249
v_load_deinterleave (src1 + x, v_src10, v_src11);
253
250
v_load_deinterleave (src2 + x, v_src20, v_src21);
254
251
v_float32x4 v_w1 = v_load (weights1 + weight_offset);
255
252
v_float32x4 v_w2 = v_load (weights2 + weight_offset);
256
- v_float32x4 v_denom = v_w1 + v_w2 + v_eps;
257
- v_float32x4 v_dst0 = (v_src10 * v_w1 + v_src20 * v_w2) / v_denom ;
258
- v_float32x4 v_dst1 = (v_src11 * v_w1 + v_src21 * v_w2) / v_denom ;
253
+
254
+ v_float32x4 v_dst0 = blend (v_src10, v_src20, v_w1, v_w2);
255
+ v_float32x4 v_dst1 = blend (v_src11, v_src21, v_w1, v_w2);
259
256
260
257
v_store_interleave (dst + x, v_dst0, v_dst1);
261
258
}
262
259
break ;
263
260
case 3 :
264
- for ( ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
261
+ for (int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
265
262
{
266
263
v_float32x4 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
267
264
v_load_deinterleave (src1 + x, v_src10, v_src11, v_src12);
268
265
v_load_deinterleave (src2 + x, v_src20, v_src21, v_src22);
269
266
v_float32x4 v_w1 = v_load (weights1 + weight_offset);
270
267
v_float32x4 v_w2 = v_load (weights2 + weight_offset);
271
- v_float32x4 v_denom = v_w1 + v_w2 + v_eps;
272
- v_float32x4 v_dst0 = (v_src10 * v_w1 + v_src20 * v_w2) / v_denom ;
273
- v_float32x4 v_dst1 = (v_src11 * v_w1 + v_src21 * v_w2) / v_denom ;
274
- v_float32x4 v_dst2 = (v_src12 * v_w1 + v_src22 * v_w2) / v_denom ;
268
+
269
+ v_float32x4 v_dst0 = blend (v_src10, v_src20, v_w1, v_w2);
270
+ v_float32x4 v_dst1 = blend (v_src11, v_src21, v_w1, v_w2);
271
+ v_float32x4 v_dst2 = blend (v_src12, v_src22, v_w1, v_w2);
275
272
276
273
v_store_interleave (dst + x, v_dst0, v_dst1, v_dst2);
277
274
}
278
275
break ;
279
276
case 4 :
280
- for ( ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
277
+ for (int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
281
278
{
282
279
v_float32x4 v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23;
283
280
v_load_deinterleave (src1 + x, v_src10, v_src11, v_src12, v_src13);
284
281
v_load_deinterleave (src2 + x, v_src20, v_src21, v_src22, v_src23);
285
282
v_float32x4 v_w1 = v_load (weights1 + weight_offset);
286
283
v_float32x4 v_w2 = v_load (weights2 + weight_offset);
287
- v_float32x4 v_denom = v_w1 + v_w2 + v_eps;
288
- v_float32x4 v_dst0 = (v_src10 * v_w1 + v_src20 * v_w2) / v_denom ;
289
- v_float32x4 v_dst1 = (v_src11 * v_w1 + v_src21 * v_w2) / v_denom ;
290
- v_float32x4 v_dst2 = (v_src12 * v_w1 + v_src22 * v_w2) / v_denom ;
291
- v_float32x4 v_dst3 = (v_src13 * v_w1 + v_src23 * v_w2) / v_denom ;
284
+
285
+ v_float32x4 v_dst0 = blend (v_src10, v_src20, v_w1, v_w2);
286
+ v_float32x4 v_dst1 = blend (v_src11, v_src21, v_w1, v_w2);
287
+ v_float32x4 v_dst2 = blend (v_src12, v_src22, v_w1, v_w2);
288
+ v_float32x4 v_dst3 = blend (v_src13, v_src23, v_w1, v_w2);
292
289
293
290
v_store_interleave (dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
294
291
}
0 commit comments