Skip to content

Commit 1fbdca8

Browse files
committed
Merge pull request opencv#10083 from alalek:core_intrinsics_load_low
2 parents fcdd833 + 3a0039d commit 1fbdca8

File tree

6 files changed

+47
-8
lines changed

6 files changed

+47
-8
lines changed

modules/core/include/opencv2/core/hal/intrin_cpp.hpp

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ block and to save contents of the register to memory block.
9999
@ref v_setall_s8, @ref v_setall_u8, ...,
100100
@ref v_setzero_u8, @ref v_setzero_s8, ...
101101
- Memory operations:
102-
@ref v_load, @ref v_load_aligned, @ref v_load_halves,
102+
@ref v_load, @ref v_load_aligned, @ref v_load_low, @ref v_load_halves,
103103
@ref v_store, @ref v_store_aligned,
104104
@ref v_store_high, @ref v_store_low
105105
@@ -1080,6 +1080,26 @@ inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_aligned(const _Tp* ptr)
10801080
return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr);
10811081
}
10821082

1083+
/** @brief Load 64-bits of data to lower part (high part is undefined).
1084+
1085+
@param ptr memory block containing data for first half (0..n/2)
1086+
1087+
@code{.cpp}
1088+
int lo[2] = { 1, 2 };
1089+
v_int32x4 r = v_load_low(lo);
1090+
@endcode
1091+
*/
1092+
template<typename _Tp>
1093+
inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_low(const _Tp* ptr)
1094+
{
1095+
v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c;
1096+
for( int i = 0; i < c.nlanes/2; i++ )
1097+
{
1098+
c.s[i] = ptr[i];
1099+
}
1100+
return c;
1101+
}
1102+
10831103
/** @brief Load register contents from two memory blocks
10841104
10851105
@param loptr memory block containing data for first half (0..n/2)

modules/core/include/opencv2/core/hal/intrin_neon.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -763,6 +763,8 @@ inline _Tpvec v_load(const _Tp* ptr) \
763763
{ return _Tpvec(vld1q_##suffix(ptr)); } \
764764
inline _Tpvec v_load_aligned(const _Tp* ptr) \
765765
{ return _Tpvec(vld1q_##suffix(ptr)); } \
766+
inline _Tpvec v_load_low(const _Tp* ptr) \
767+
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); } \
766768
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
767769
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
768770
inline void v_store(_Tp* ptr, const _Tpvec& a) \

modules/core/include/opencv2/core/hal/intrin_sse.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1016,6 +1016,8 @@ inline _Tpvec v_load(const _Tp* ptr) \
10161016
{ return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
10171017
inline _Tpvec v_load_aligned(const _Tp* ptr) \
10181018
{ return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
1019+
inline _Tpvec v_load_low(const _Tp* ptr) \
1020+
{ return _Tpvec(_mm_loadl_epi64((const __m128i*)ptr)); } \
10191021
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
10201022
{ \
10211023
return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
@@ -1044,6 +1046,8 @@ inline _Tpvec v_load(const _Tp* ptr) \
10441046
{ return _Tpvec(_mm_loadu_##suffix(ptr)); } \
10451047
inline _Tpvec v_load_aligned(const _Tp* ptr) \
10461048
{ return _Tpvec(_mm_load_##suffix(ptr)); } \
1049+
inline _Tpvec v_load_low(const _Tp* ptr) \
1050+
{ return _Tpvec(_mm_castsi128_##suffix(_mm_loadl_epi64((const __m128i*)ptr))); } \
10471051
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
10481052
{ \
10491053
return _Tpvec(_mm_castsi128_##suffix( \

modules/core/include/opencv2/core/hal/intrin_vsx.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,8 @@ inline _Tpvec v_load(const _Tp* ptr) \
281281
{ return _Tpvec(ld_func(0, ptr)); } \
282282
inline _Tpvec v_load_aligned(const _Tp* ptr) \
283283
{ return _Tpvec(ld_func(0, ptr)); } \
284+
inline _Tpvec v_load_low(const _Tp* ptr) \
285+
{ return _Tpvec(vec_ld_l8(ptr)); } \
284286
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
285287
{ return _Tpvec(vec_mergesqh(vec_ld_l8(ptr0), vec_ld_l8(ptr1))); } \
286288
inline void v_store(_Tp* ptr, const _Tpvec& a) \

modules/core/include/opencv2/core/vsx_utils.hpp

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -556,17 +556,12 @@ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
556556
* vec_ld_l8(ptr) -> Load 64 bits of integer data to lower part
557557
* vec_ldz_l8(ptr) -> Load 64 bits of integer data to lower part and zero upper part
558558
**/
559-
#if defined(__clang__) && !defined(__IBMCPP__)
560-
# define __VSX_LOAD_L8(Tvec, p) (Tvec)((vec_udword2)*((uint64*)(p)))
561-
#else
562-
# define __VSX_LOAD_L8(Tvec, p) *((Tvec*)(p))
563-
#endif
564-
565559
#define VSX_IMPL_LOAD_L8(Tvec, Tp) \
566560
FORCE_INLINE(Tvec) vec_ld_l8(const Tp *p) \
567-
{ return __VSX_LOAD_L8(Tvec, p); } \
561+
{ return ((Tvec)vec_promote(*((uint64*)p), 0)); } \
568562
FORCE_INLINE(Tvec) vec_ldz_l8(const Tp *p) \
569563
{ \
564+
/* TODO: try (Tvec)(vec_udword2{*((uint64*)p), 0}) */ \
570565
static const vec_bdword2 mask = {0xFFFFFFFFFFFFFFFF, 0x0000000000000000}; \
571566
return vec_and(vec_ld_l8(p), (Tvec)mask); \
572567
}

modules/core/test/test_intrin_utils.hpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,22 @@ template<typename R> struct TheTest
198198
EXPECT_EQ(data.a[0], r3.get0());
199199
EXPECT_EQ(data.u[0], r4.get0());
200200

201+
R r_low = v_load_low((LaneType*)data.u.d);
202+
EXPECT_EQ(data.u[0], r_low.get0());
203+
v_store(out.u.d, r_low);
204+
for (int i = 0; i < R::nlanes/2; ++i)
205+
{
206+
EXPECT_EQ((LaneType)data.u[i], (LaneType)out.u[i]);
207+
}
208+
209+
R r_low_align8byte = v_load_low((LaneType*)((char*)data.u.d + 8));
210+
EXPECT_EQ(data.u[R::nlanes/2], r_low_align8byte.get0());
211+
v_store(out.u.d, r_low_align8byte);
212+
for (int i = 0; i < R::nlanes/2; ++i)
213+
{
214+
EXPECT_EQ((LaneType)data.u[i + R::nlanes/2], (LaneType)out.u[i]);
215+
}
216+
201217
// check some store methods
202218
out.u.clear();
203219
out.a.clear();

0 commit comments

Comments
 (0)