Skip to content

Commit 595fd27

Browse files
committed
Merge pull request opencv#7182 from mself:two_channel_universal_intrinsics
2 parents d4ae7f3 + 9678d48 commit 595fd27

File tree

4 files changed

+111
-8
lines changed

4 files changed

+111
-8
lines changed

modules/core/include/opencv2/core/hal/intrin_cpp.hpp

Lines changed: 47 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ block and to save contents of the register to memory block.
103103
104104
These operations allow reordering or recombining elements in one or multiple vectors.
105105
106-
- Interleave, deinterleave (3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
106+
- Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
107107
- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand
108108
- Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
109109
@ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
@@ -1075,12 +1075,31 @@ v_load_expand_q(const _Tp* ptr)
10751075
return c;
10761076
}
10771077

1078-
/** @brief Load and deinterleave (4 channels)
1078+
/** @brief Load and deinterleave (2 channels)
10791079
1080-
Load data from memory deinterleave and store to 4 registers.
1080+
Load data from memory, deinterleave it and store to 2 registers.
10811081
Scheme:
10821082
@code
1083-
{A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
1083+
{A1 B1 A2 B2 ...} ==> {A1 A2 ...}, {B1 B2 ...}
1084+
@endcode
1085+
For all types except 64-bit. */
1086+
template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
1087+
v_reg<_Tp, n>& b)
1088+
{
1089+
int i, i2;
1090+
for( i = i2 = 0; i < n; i++, i2 += 2 )
1091+
{
1092+
a.s[i] = ptr[i2];
1093+
b.s[i] = ptr[i2+1];
1094+
}
1095+
}
1096+
1097+
/** @brief Load and deinterleave (3 channels)
1098+
1099+
Load data from memory, deinterleave it and store to 3 registers.
1100+
Scheme:
1101+
@code
1102+
{A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
10841103
@endcode
10851104
For all types except 64-bit. */
10861105
template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
@@ -1095,12 +1114,12 @@ template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_
10951114
}
10961115
}
10971116

1098-
/** @brief Load and deinterleave (3 channels)
1117+
/** @brief Load and deinterleave (4 channels)
10991118
1100-
Load data from memory deinterleave and store to 3 registers.
1119+
Load data from memory, deinterleave it and store to 4 registers.
11011120
Scheme:
11021121
@code
1103-
{A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
1122+
{A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
11041123
@endcode
11051124
For all types except 64-bit. */
11061125
template<typename _Tp, int n>
@@ -1118,12 +1137,32 @@ inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
11181137
}
11191138
}
11201139

1140+
/** @brief Interleave and store (2 channels)
1141+
1142+
Interleave and store data from 2 registers to memory.
1143+
Scheme:
1144+
@code
1145+
{A1 A2 ...}, {B1 B2 ...} ==> {A1 B1 A2 B2 ...}
1146+
@endcode
1147+
For all types except 64-bit. */
1148+
template<typename _Tp, int n>
1149+
inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
1150+
const v_reg<_Tp, n>& b)
1151+
{
1152+
int i, i2;
1153+
for( i = i2 = 0; i < n; i++, i2 += 2 )
1154+
{
1155+
ptr[i2] = a.s[i];
1156+
ptr[i2+1] = b.s[i];
1157+
}
1158+
}
1159+
11211160
/** @brief Interleave and store (3 channels)
11221161
11231162
Interleave and store data from 3 registers to memory.
11241163
Scheme:
11251164
@code
1126-
{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...}
1165+
{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} ==> {A1 B1 C1 A2 B2 C2 ...}
11271166
@endcode
11281167
For all types except 64-bit. */
11291168
template<typename _Tp, int n>

modules/core/include/opencv2/core/hal/intrin_neon.hpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -809,6 +809,12 @@ OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
809809
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
810810

811811
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
812+
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
813+
{ \
814+
_Tpvec##x2_t v = vld2q_##suffix(ptr); \
815+
a.val = v.val[0]; \
816+
b.val = v.val[1]; \
817+
} \
812818
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
813819
{ \
814820
_Tpvec##x3_t v = vld3q_##suffix(ptr); \
@@ -825,6 +831,13 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
825831
c.val = v.val[2]; \
826832
d.val = v.val[3]; \
827833
} \
834+
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b) \
835+
{ \
836+
_Tpvec##x2_t v; \
837+
v.val[0] = a.val; \
838+
v.val[1] = b.val; \
839+
vst2q_##suffix(ptr, v); \
840+
} \
828841
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
829842
{ \
830843
_Tpvec##x3_t v; \

modules/core/include/opencv2/core/hal/intrin_sse.hpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1374,6 +1374,18 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4&
13741374
v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
13751375
}
13761376

1377+
// 2-channel, float only
1378+
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
1379+
{
1380+
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
1381+
1382+
__m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1
1383+
__m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3
1384+
1385+
a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
1386+
b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 b2 b3
1387+
}
1388+
13771389
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
13781390
const v_uint8x16& c )
13791391
{
@@ -1529,6 +1541,18 @@ inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint3
15291541
v_store(ptr + 12, t3);
15301542
}
15311543

1544+
// 2-channel, float only
1545+
inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b)
1546+
{
1547+
// a0 a1 a2 a3 ...
1548+
// b0 b1 b2 b3 ...
1549+
__m128 u0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
1550+
__m128 u1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3
1551+
1552+
_mm_storeu_ps(ptr, u0);
1553+
_mm_storeu_ps((ptr + 4), u1);
1554+
}
1555+
15321556
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
15331557
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
15341558
_Tpvec& b0, _Tpvec& c0 ) \

modules/core/test/test_intrin.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,32 @@ template<typename R> struct TheTest
132132
return *this;
133133
}
134134

135+
// float32x4 only
136+
TheTest & test_interleave_2channel()
137+
{
138+
Data<R> data1, data2;
139+
data2 += 20;
140+
141+
R a = data1, b = data2;
142+
143+
LaneType buf2[R::nlanes * 2];
144+
145+
v_store_interleave(buf2, a, b);
146+
147+
Data<R> z(0);
148+
a = b = z;
149+
150+
v_load_deinterleave(buf2, a, b);
151+
152+
for (int i = 0; i < R::nlanes; ++i)
153+
{
154+
EXPECT_EQ(data1, Data<R>(a));
155+
EXPECT_EQ(data2, Data<R>(b));
156+
}
157+
158+
return *this;
159+
}
160+
135161
// v_expand and v_load_expand
136162
TheTest & test_expand()
137163
{
@@ -846,6 +872,7 @@ TEST(hal_intrin, float32x4) {
846872
TheTest<v_float32x4>()
847873
.test_loadstore()
848874
.test_interleave()
875+
.test_interleave_2channel()
849876
.test_addsub()
850877
.test_mul()
851878
.test_div()

0 commit comments

Comments
 (0)