Skip to content

Commit aec59ab

Browse files
committed
suppress warnings
- brush up the implementation
1 parent 64cf206 commit aec59ab

File tree

2 files changed

+32
-35
lines changed

2 files changed

+32
-35
lines changed

modules/imgproc/perf/opencl/perf_blend.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ namespace ocl {
5656

5757
// Fixture alias named in the OCL_PERF_TEST_P(BlendLinearFixture, BlendLinear, ...) test that follows.
typedef Size_MatType BlendLinearFixture;
5858

59-
OCL_PERF_TEST_P(BlendLinearFixture, BlendLinear, ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32FC1, CV_32FC3, CV_32FC4, CV_8UC1, CV_8UC3, CV_8UC4)))
59+
OCL_PERF_TEST_P(BlendLinearFixture, BlendLinear, ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134))
6060
{
6161
Size_MatType_t params = GetParam();
6262
const Size srcSize = get<0>(params);

modules/imgproc/src/blend.cpp

Lines changed: 31 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -98,16 +98,16 @@ static inline void load_expand_u8tof32(const uchar* ptr, v_float32x4& dst0, v_fl
9898
v_uint8x16 a = v_load((ptr));
9999
expand_u8tof32(a, dst0, dst1, dst2, dst3);
100100
}
101+
// Prototype for the uchar overload of blendLinearSimd128; the matching definition follows directly below.
// NOTE(review): presumably added so the non-static definition has a prior declaration
// (the commit message says "suppress warnings") - confirm which compiler warning this targets.
int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn);
102+
// Prototype for the float overload of blendLinearSimd128; the matching definition appears later in this file.
// NOTE(review): presumably added so the non-static definition has a prior declaration
// (the commit message says "suppress warnings") - confirm which compiler warning this targets.
int blendLinearSimd128(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn);
101103
int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn)
102104
{
103-
const v_float32x4 v_eps = v_setall_f32(1e-5f);
104-
int weight_offset = 0;
105105
int step = v_uint8x16::nlanes * cn;
106-
int weight_step = v_uint8x16::nlanes*cn;
106+
int weight_step = v_uint8x16::nlanes;
107107
switch(cn)
108108
{
109109
case 1:
110-
for( ; x <= width - step; x += step, weight_offset += weight_step)
110+
for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step)
111111
{
112112
v_float32x4 v_src10, v_src11, v_src12, v_src13;
113113
v_float32x4 v_src20, v_src21, v_src22, v_src23;
@@ -123,7 +123,7 @@ int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weight
123123
}
124124
break;
125125
case 2:
126-
for( ; x <= width - step; x += step, weight_offset += weight_step)
126+
for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step)
127127
{
128128
v_uint8x16 v_src10, v_src11, v_src20, v_src21;
129129
v_load_deinterleave(src1 + x, v_src10, v_src11);
@@ -150,7 +150,7 @@ int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weight
150150
}
151151
break;
152152
case 3:
153-
for( ; x <= width - step; x += step, weight_offset += weight_step)
153+
for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step)
154154
{
155155
v_uint8x16 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
156156
v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12);
@@ -190,32 +190,31 @@ int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weight
190190
v_uint8x16 v_dst0 = pack_f32tou8(v_src100, v_src101, v_src102, v_src103);
191191
v_uint8x16 v_dst1 = pack_f32tou8(v_src110, v_src111, v_src112, v_src113);
192192
v_uint8x16 v_dst2 = pack_f32tou8(v_src120, v_src121, v_src122, v_src123);
193-
194193
v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2);
195194
}
196195
break;
197196
case 4:
198197
step = v_uint8x16::nlanes;
199198
weight_step = v_float32x4::nlanes;
200-
for( ; x <= width - step; x += step, weight_offset += weight_step)
199+
for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step)
201200
{
202201
v_float32x4 v_src10, v_src11, v_src12, v_src13, v_src14, v_src15, v_src16, v_src17;
203202
v_float32x4 v_src20, v_src21, v_src22, v_src23, v_src24, v_src25, v_src26, v_src27;
204203
load_expand_u8tof32(src1 + x, v_src10, v_src11, v_src12, v_src13);
205204
load_expand_u8tof32(src2 + x, v_src20, v_src21, v_src22, v_src23);
205+
206206
v_transpose4x4(v_src10, v_src11, v_src12, v_src13, v_src14, v_src15, v_src16, v_src17);
207207
v_transpose4x4(v_src20, v_src21, v_src22, v_src23, v_src24, v_src25, v_src26, v_src27);
208208

209209
v_float32x4 v_w1 = v_load(weights1 + weight_offset);
210210
v_float32x4 v_w2 = v_load(weights2 + weight_offset);
211-
v_float32x4 v_denom = v_w1 + v_w2 + v_eps;
212-
v_src10 = (v_src14 * v_w1 + v_src24 * v_w2) / v_denom;
213-
v_src11 = (v_src15 * v_w1 + v_src25 * v_w2) / v_denom;
214-
v_src12 = (v_src16 * v_w1 + v_src26 * v_w2) / v_denom;
215-
v_src13 = (v_src17 * v_w1 + v_src27 * v_w2) / v_denom;
211+
v_src10 = blend(v_src14, v_src24, v_w1, v_w2);
212+
v_src11 = blend(v_src15, v_src25, v_w1, v_w2);
213+
v_src12 = blend(v_src16, v_src26, v_w1, v_w2);
214+
v_src13 = blend(v_src17, v_src27, v_w1, v_w2);
215+
216216
v_float32x4 v_dst0, v_dst1, v_dst2, v_dst3;
217217
v_transpose4x4(v_src10, v_src11, v_src12, v_src13, v_dst0, v_dst1, v_dst2, v_dst3);
218-
219218
store_pack_f32tou8(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
220219
}
221220
break;
@@ -227,68 +226,66 @@ int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weight
227226

228227
int blendLinearSimd128(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn)
229228
{
230-
const v_float32x4 v_eps = v_setall_f32(1e-5f);
231-
int weight_offset = 0;
232229
int step = v_float32x4::nlanes*cn;
233230
switch(cn)
234231
{
235232
case 1:
236-
for( ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
233+
for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
237234
{
238235
v_float32x4 v_src1 = v_load(src1 + x);
239236
v_float32x4 v_src2 = v_load(src2 + x);
240237
v_float32x4 v_w1 = v_load(weights1 + weight_offset);
241238
v_float32x4 v_w2 = v_load(weights2 + weight_offset);
242-
v_float32x4 v_denom = v_w1 + v_w2 + v_eps;
243-
v_float32x4 v_dst = (v_src1 * v_w1 + v_src2 * v_w2) / v_denom;
239+
240+
v_float32x4 v_dst = blend(v_src1, v_src2, v_w1, v_w2);
244241

245242
v_store(dst + x, v_dst);
246243
}
247244
break;
248245
case 2:
249-
for( ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
246+
for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
250247
{
251248
v_float32x4 v_src10, v_src11, v_src20, v_src21;
252249
v_load_deinterleave(src1 + x, v_src10, v_src11);
253250
v_load_deinterleave(src2 + x, v_src20, v_src21);
254251
v_float32x4 v_w1 = v_load(weights1 + weight_offset);
255252
v_float32x4 v_w2 = v_load(weights2 + weight_offset);
256-
v_float32x4 v_denom = v_w1 + v_w2 + v_eps;
257-
v_float32x4 v_dst0 = (v_src10 * v_w1 + v_src20 * v_w2) / v_denom;
258-
v_float32x4 v_dst1 = (v_src11 * v_w1 + v_src21 * v_w2) / v_denom;
253+
254+
v_float32x4 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2);
255+
v_float32x4 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2);
259256

260257
v_store_interleave(dst + x, v_dst0, v_dst1);
261258
}
262259
break;
263260
case 3:
264-
for( ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
261+
for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
265262
{
266263
v_float32x4 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
267264
v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12);
268265
v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22);
269266
v_float32x4 v_w1 = v_load(weights1 + weight_offset);
270267
v_float32x4 v_w2 = v_load(weights2 + weight_offset);
271-
v_float32x4 v_denom = v_w1 + v_w2 + v_eps;
272-
v_float32x4 v_dst0 = (v_src10 * v_w1 + v_src20 * v_w2) / v_denom;
273-
v_float32x4 v_dst1 = (v_src11 * v_w1 + v_src21 * v_w2) / v_denom;
274-
v_float32x4 v_dst2 = (v_src12 * v_w1 + v_src22 * v_w2) / v_denom;
268+
269+
v_float32x4 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2);
270+
v_float32x4 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2);
271+
v_float32x4 v_dst2 = blend(v_src12, v_src22, v_w1, v_w2);
275272

276273
v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2);
277274
}
278275
break;
279276
case 4:
280-
for( ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
277+
for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
281278
{
282279
v_float32x4 v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23;
283280
v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12, v_src13);
284281
v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22, v_src23);
285282
v_float32x4 v_w1 = v_load(weights1 + weight_offset);
286283
v_float32x4 v_w2 = v_load(weights2 + weight_offset);
287-
v_float32x4 v_denom = v_w1 + v_w2 + v_eps;
288-
v_float32x4 v_dst0 = (v_src10 * v_w1 + v_src20 * v_w2) / v_denom;
289-
v_float32x4 v_dst1 = (v_src11 * v_w1 + v_src21 * v_w2) / v_denom;
290-
v_float32x4 v_dst2 = (v_src12 * v_w1 + v_src22 * v_w2) / v_denom;
291-
v_float32x4 v_dst3 = (v_src13 * v_w1 + v_src23 * v_w2) / v_denom;
284+
285+
v_float32x4 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2);
286+
v_float32x4 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2);
287+
v_float32x4 v_dst2 = blend(v_src12, v_src22, v_w1, v_w2);
288+
v_float32x4 v_dst3 = blend(v_src13, v_src23, v_w1, v_w2);
292289

293290
v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
294291
}

0 commit comments

Comments
 (0)