Skip to content

Commit c624d82

Browse files
committed
Merge pull request opencv#8239 from tomoaki0705:buildUniversalIntrinsicBlend
2 parents 0c00242 + aec59ab commit c624d82

File tree

3 files changed

+283
-2
lines changed

3 files changed

+283
-2
lines changed

modules/core/include/opencv2/core/hal/intrin_sse.hpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1368,6 +1368,24 @@ OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NO
13681368
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
13691369

13701370
// adopted from sse_utils.hpp
1371+
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
1372+
{
1373+
__m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
1374+
__m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
1375+
1376+
__m128i t10 = _mm_unpacklo_epi8(t00, t01);
1377+
__m128i t11 = _mm_unpackhi_epi8(t00, t01);
1378+
1379+
__m128i t20 = _mm_unpacklo_epi8(t10, t11);
1380+
__m128i t21 = _mm_unpackhi_epi8(t10, t11);
1381+
1382+
__m128i t30 = _mm_unpacklo_epi8(t20, t21);
1383+
__m128i t31 = _mm_unpackhi_epi8(t20, t21);
1384+
1385+
a.val = _mm_unpacklo_epi8(t30, t31);
1386+
b.val = _mm_unpackhi_epi8(t30, t31);
1387+
}
1388+
13711389
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
13721390
{
13731391
__m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
@@ -1507,6 +1525,15 @@ inline void v_store_interleave( short* ptr, const v_int16x8& a, const v_int16x8&
15071525
_mm_storeu_si128((__m128i*)(ptr + 8), t1);
15081526
}
15091527

1528+
// Writes 32 bytes to ptr holding the lanes of `a` and `b` interleaved
// (a0 b0 a1 b1 ... a15 b15). Both 16-byte stores are unaligned.
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b)
{
    // Both halves are computed before anything is written.
    const __m128i low_pairs  = _mm_unpacklo_epi8(a.val, b.val);
    const __m128i high_pairs = _mm_unpackhi_epi8(a.val, b.val);

    __m128i* const out_lo = (__m128i*)(ptr);
    __m128i* const out_hi = (__m128i*)(ptr + 16);
    _mm_storeu_si128(out_lo, low_pairs);
    _mm_storeu_si128(out_hi, high_pairs);
}
1536+
15101537
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
15111538
const v_uint8x16& c )
15121539
{

modules/imgproc/perf/opencl/perf_blend.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ namespace ocl {
5656

5757
typedef Size_MatType BlendLinearFixture;
5858

59-
OCL_PERF_TEST_P(BlendLinearFixture, BlendLinear, ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32FC1, CV_32FC4)))
59+
OCL_PERF_TEST_P(BlendLinearFixture, BlendLinear, ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134))
6060
{
6161
Size_MatType_t params = GetParam();
6262
const Size srcSize = get<0>(params);

modules/imgproc/src/blend.cpp

Lines changed: 255 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,257 @@
4545

4646
#include "precomp.hpp"
4747
#include "opencl_kernels_imgproc.hpp"
48+
#include "opencv2/core/hal/intrin.hpp"
4849

4950
namespace cv {
51+
#if CV_SIMD128
52+
// Per-lane weighted average of two sources:
//   (v_src1 * v_w1 + v_src2 * v_w2) / (v_w1 + v_w2 + 1e-5f)
// The small epsilon keeps the denominator non-zero when both weights are zero.
static inline v_float32x4 blend(const v_float32x4& v_src1, const v_float32x4& v_src2, const v_float32x4& v_w1, const v_float32x4& v_w2)
{
    const v_float32x4 weighted_sum = v_src1 * v_w1 + v_src2 * v_w2;
    const v_float32x4 weight_total = v_w1 + v_w2 + v_setall_f32(1e-5f);
    return weighted_sum / weight_total;
}
58+
// Convenience overload: loads four weights from w_ptr1 + offset and
// w_ptr2 + offset, then blends v_src1 and v_src2 with them using the
// vector-weight overload of blend().
static inline v_float32x4 blend(const v_float32x4& v_src1, const v_float32x4& v_src2, const float* w_ptr1, const float* w_ptr2, int offset)
{
    const float* const first_weights = w_ptr1 + offset;
    const float* const second_weights = w_ptr2 + offset;
    return blend(v_src1, v_src2, v_load(first_weights), v_load(second_weights));
}
64+
// Rounds each float lane to an integer, clamps the result to [0, 255] and
// returns the lanes reinterpreted as unsigned 32-bit integers.
static inline v_uint32x4 saturate_f32_u32(const v_float32x4& vec)
{
    const v_int32x4 lower_bound = v_setzero_s32();
    const v_int32x4 upper_bound = v_setall_s32(255);

    const v_int32x4 rounded = v_round(vec);
    const v_int32x4 at_least_zero = v_max(rounded, lower_bound);
    const v_int32x4 clamped = v_min(at_least_zero, upper_bound);
    return v_reinterpret_as_u32(clamped);
}
70+
static inline v_uint8x16 pack_f32tou8(v_float32x4& val0, v_float32x4& val1, v_float32x4& val2, v_float32x4& val3)
71+
{
72+
v_uint32x4 a = saturate_f32_u32(val0);
73+
v_uint32x4 b = saturate_f32_u32(val1);
74+
v_uint32x4 c = saturate_f32_u32(val2);
75+
v_uint32x4 d = saturate_f32_u32(val3);
76+
v_uint16x8 e = v_pack(a, b);
77+
v_uint16x8 f = v_pack(c, d);
78+
return v_pack(e, f);
79+
}
80+
static inline void store_pack_f32tou8(uchar* ptr, v_float32x4& val0, v_float32x4& val1, v_float32x4& val2, v_float32x4& val3)
81+
{
82+
v_store((ptr), pack_f32tou8(val0, val1, val2, val3));
83+
}
84+
static inline void expand_u8tof32(const v_uint8x16& src, v_float32x4& dst0, v_float32x4& dst1, v_float32x4& dst2, v_float32x4& dst3)
85+
{
86+
v_uint16x8 a0, a1;
87+
v_expand(src, a0, a1);
88+
v_uint32x4 b0, b1,b2,b3;
89+
v_expand(a0, b0, b1);
90+
v_expand(a1, b2, b3);
91+
dst0 = v_cvt_f32(v_reinterpret_as_s32(b0));
92+
dst1 = v_cvt_f32(v_reinterpret_as_s32(b1));
93+
dst2 = v_cvt_f32(v_reinterpret_as_s32(b2));
94+
dst3 = v_cvt_f32(v_reinterpret_as_s32(b3));
95+
}
96+
static inline void load_expand_u8tof32(const uchar* ptr, v_float32x4& dst0, v_float32x4& dst1, v_float32x4& dst2, v_float32x4& dst3)
97+
{
98+
v_uint8x16 a = v_load((ptr));
99+
expand_u8tof32(a, dst0, dst1, dst2, dst3);
100+
}
101+
int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn);
102+
int blendLinearSimd128(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn);
103+
int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn)
104+
{
105+
int step = v_uint8x16::nlanes * cn;
106+
int weight_step = v_uint8x16::nlanes;
107+
switch(cn)
108+
{
109+
case 1:
110+
for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step)
111+
{
112+
v_float32x4 v_src10, v_src11, v_src12, v_src13;
113+
v_float32x4 v_src20, v_src21, v_src22, v_src23;
114+
load_expand_u8tof32(src1 + x, v_src10, v_src11, v_src12, v_src13);
115+
load_expand_u8tof32(src2 + x, v_src20, v_src21, v_src22, v_src23);
116+
117+
v_float32x4 v_dst0 = blend(v_src10, v_src20, weights1, weights2, weight_offset);
118+
v_float32x4 v_dst1 = blend(v_src11, v_src21, weights1, weights2, weight_offset + 4);
119+
v_float32x4 v_dst2 = blend(v_src12, v_src22, weights1, weights2, weight_offset + 8);
120+
v_float32x4 v_dst3 = blend(v_src13, v_src23, weights1, weights2, weight_offset + 12);
121+
122+
store_pack_f32tou8(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
123+
}
124+
break;
125+
case 2:
126+
for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step)
127+
{
128+
v_uint8x16 v_src10, v_src11, v_src20, v_src21;
129+
v_load_deinterleave(src1 + x, v_src10, v_src11);
130+
v_load_deinterleave(src2 + x, v_src20, v_src21);
131+
v_float32x4 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113;
132+
v_float32x4 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213;
133+
expand_u8tof32(v_src10, v_src100, v_src101, v_src102, v_src103);
134+
expand_u8tof32(v_src11, v_src110, v_src111, v_src112, v_src113);
135+
expand_u8tof32(v_src20, v_src200, v_src201, v_src202, v_src203);
136+
expand_u8tof32(v_src21, v_src210, v_src211, v_src212, v_src213);
137+
138+
v_float32x4 v_dst0 = blend(v_src100, v_src200, weights1, weights2, weight_offset);
139+
v_float32x4 v_dst1 = blend(v_src110, v_src210, weights1, weights2, weight_offset);
140+
v_float32x4 v_dst2 = blend(v_src101, v_src201, weights1, weights2, weight_offset + 4);
141+
v_float32x4 v_dst3 = blend(v_src111, v_src211, weights1, weights2, weight_offset + 4);
142+
v_float32x4 v_dst4 = blend(v_src102, v_src202, weights1, weights2, weight_offset + 8);
143+
v_float32x4 v_dst5 = blend(v_src112, v_src212, weights1, weights2, weight_offset + 8);
144+
v_float32x4 v_dst6 = blend(v_src103, v_src203, weights1, weights2, weight_offset + 12);
145+
v_float32x4 v_dst7 = blend(v_src113, v_src213, weights1, weights2, weight_offset + 12);
146+
147+
v_uint8x16 v_dsta = pack_f32tou8(v_dst0, v_dst2, v_dst4, v_dst6);
148+
v_uint8x16 v_dstb = pack_f32tou8(v_dst1, v_dst3, v_dst5, v_dst7);
149+
v_store_interleave(dst + x, v_dsta, v_dstb);
150+
}
151+
break;
152+
case 3:
153+
for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step)
154+
{
155+
v_uint8x16 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
156+
v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12);
157+
v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22);
158+
159+
v_float32x4 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113, v_src120, v_src121, v_src122, v_src123;
160+
v_float32x4 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213, v_src220, v_src221, v_src222, v_src223;
161+
expand_u8tof32(v_src10, v_src100, v_src101, v_src102, v_src103);
162+
expand_u8tof32(v_src11, v_src110, v_src111, v_src112, v_src113);
163+
expand_u8tof32(v_src12, v_src120, v_src121, v_src122, v_src123);
164+
expand_u8tof32(v_src20, v_src200, v_src201, v_src202, v_src203);
165+
expand_u8tof32(v_src21, v_src210, v_src211, v_src212, v_src213);
166+
expand_u8tof32(v_src22, v_src220, v_src221, v_src222, v_src223);
167+
168+
v_float32x4 v_w10 = v_load(weights1 + weight_offset);
169+
v_float32x4 v_w11 = v_load(weights1 + weight_offset + 4);
170+
v_float32x4 v_w12 = v_load(weights1 + weight_offset + 8);
171+
v_float32x4 v_w13 = v_load(weights1 + weight_offset + 12);
172+
v_float32x4 v_w20 = v_load(weights2 + weight_offset);
173+
v_float32x4 v_w21 = v_load(weights2 + weight_offset + 4);
174+
v_float32x4 v_w22 = v_load(weights2 + weight_offset + 8);
175+
v_float32x4 v_w23 = v_load(weights2 + weight_offset + 12);
176+
v_src100 = blend(v_src100, v_src200, v_w10, v_w20);
177+
v_src110 = blend(v_src110, v_src210, v_w10, v_w20);
178+
v_src120 = blend(v_src120, v_src220, v_w10, v_w20);
179+
v_src101 = blend(v_src101, v_src201, v_w11, v_w21);
180+
v_src111 = blend(v_src111, v_src211, v_w11, v_w21);
181+
v_src121 = blend(v_src121, v_src221, v_w11, v_w21);
182+
v_src102 = blend(v_src102, v_src202, v_w12, v_w22);
183+
v_src112 = blend(v_src112, v_src212, v_w12, v_w22);
184+
v_src122 = blend(v_src122, v_src222, v_w12, v_w22);
185+
v_src103 = blend(v_src103, v_src203, v_w13, v_w23);
186+
v_src113 = blend(v_src113, v_src213, v_w13, v_w23);
187+
v_src123 = blend(v_src123, v_src223, v_w13, v_w23);
188+
189+
190+
v_uint8x16 v_dst0 = pack_f32tou8(v_src100, v_src101, v_src102, v_src103);
191+
v_uint8x16 v_dst1 = pack_f32tou8(v_src110, v_src111, v_src112, v_src113);
192+
v_uint8x16 v_dst2 = pack_f32tou8(v_src120, v_src121, v_src122, v_src123);
193+
v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2);
194+
}
195+
break;
196+
case 4:
197+
step = v_uint8x16::nlanes;
198+
weight_step = v_float32x4::nlanes;
199+
for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step)
200+
{
201+
v_float32x4 v_src10, v_src11, v_src12, v_src13, v_src14, v_src15, v_src16, v_src17;
202+
v_float32x4 v_src20, v_src21, v_src22, v_src23, v_src24, v_src25, v_src26, v_src27;
203+
load_expand_u8tof32(src1 + x, v_src10, v_src11, v_src12, v_src13);
204+
load_expand_u8tof32(src2 + x, v_src20, v_src21, v_src22, v_src23);
205+
206+
v_transpose4x4(v_src10, v_src11, v_src12, v_src13, v_src14, v_src15, v_src16, v_src17);
207+
v_transpose4x4(v_src20, v_src21, v_src22, v_src23, v_src24, v_src25, v_src26, v_src27);
208+
209+
v_float32x4 v_w1 = v_load(weights1 + weight_offset);
210+
v_float32x4 v_w2 = v_load(weights2 + weight_offset);
211+
v_src10 = blend(v_src14, v_src24, v_w1, v_w2);
212+
v_src11 = blend(v_src15, v_src25, v_w1, v_w2);
213+
v_src12 = blend(v_src16, v_src26, v_w1, v_w2);
214+
v_src13 = blend(v_src17, v_src27, v_w1, v_w2);
215+
216+
v_float32x4 v_dst0, v_dst1, v_dst2, v_dst3;
217+
v_transpose4x4(v_src10, v_src11, v_src12, v_src13, v_dst0, v_dst1, v_dst2, v_dst3);
218+
store_pack_f32tou8(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
219+
}
220+
break;
221+
default:
222+
break;
223+
}
224+
return x;
225+
}
226+
227+
int blendLinearSimd128(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn)
228+
{
229+
int step = v_float32x4::nlanes*cn;
230+
switch(cn)
231+
{
232+
case 1:
233+
for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
234+
{
235+
v_float32x4 v_src1 = v_load(src1 + x);
236+
v_float32x4 v_src2 = v_load(src2 + x);
237+
v_float32x4 v_w1 = v_load(weights1 + weight_offset);
238+
v_float32x4 v_w2 = v_load(weights2 + weight_offset);
239+
240+
v_float32x4 v_dst = blend(v_src1, v_src2, v_w1, v_w2);
241+
242+
v_store(dst + x, v_dst);
243+
}
244+
break;
245+
case 2:
246+
for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
247+
{
248+
v_float32x4 v_src10, v_src11, v_src20, v_src21;
249+
v_load_deinterleave(src1 + x, v_src10, v_src11);
250+
v_load_deinterleave(src2 + x, v_src20, v_src21);
251+
v_float32x4 v_w1 = v_load(weights1 + weight_offset);
252+
v_float32x4 v_w2 = v_load(weights2 + weight_offset);
253+
254+
v_float32x4 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2);
255+
v_float32x4 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2);
256+
257+
v_store_interleave(dst + x, v_dst0, v_dst1);
258+
}
259+
break;
260+
case 3:
261+
for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
262+
{
263+
v_float32x4 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
264+
v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12);
265+
v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22);
266+
v_float32x4 v_w1 = v_load(weights1 + weight_offset);
267+
v_float32x4 v_w2 = v_load(weights2 + weight_offset);
268+
269+
v_float32x4 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2);
270+
v_float32x4 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2);
271+
v_float32x4 v_dst2 = blend(v_src12, v_src22, v_w1, v_w2);
272+
273+
v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2);
274+
}
275+
break;
276+
case 4:
277+
for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
278+
{
279+
v_float32x4 v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23;
280+
v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12, v_src13);
281+
v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22, v_src23);
282+
v_float32x4 v_w1 = v_load(weights1 + weight_offset);
283+
v_float32x4 v_w2 = v_load(weights2 + weight_offset);
284+
285+
v_float32x4 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2);
286+
v_float32x4 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2);
287+
v_float32x4 v_dst2 = blend(v_src12, v_src22, v_w1, v_w2);
288+
v_float32x4 v_dst3 = blend(v_src13, v_src23, v_w1, v_w2);
289+
290+
v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
291+
}
292+
break;
293+
default:
294+
break;
295+
}
296+
return x;
297+
}
298+
#endif
50299

51300
template <typename T>
52301
class BlendLinearInvoker :
@@ -71,7 +320,12 @@ class BlendLinearInvoker :
71320
const T * const src2_row = src2->ptr<T>(y);
72321
T * const dst_row = dst->ptr<T>(y);
73322

74-
for (int x = 0; x < width; ++x)
323+
int x = 0;
324+
#if CV_SIMD128
325+
x = blendLinearSimd128(src1_row, src2_row, weights1_row, weights2_row, dst_row, x, width, cn);
326+
#endif
327+
328+
for ( ; x < width; ++x)
75329
{
76330
int x1 = x / cn;
77331
float w1 = weights1_row[x1], w2 = weights2_row[x1];

0 commit comments

Comments
 (0)