@@ -899,6 +899,15 @@ inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)

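+// SSE2 has no 64-bit integer compare (_mm_cmpeq_epi64 requires SSE4.1), so
+// == and != on 64-bit lanes reinterpret them as f64 and reuse the pd compares
+// above; note this inherits FP semantics (e.g. the +0.0 and -0.0 bit patterns
+// compare equal).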
+#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec, cast) \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+{ return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); }
+
+OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64);
+OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64);
+
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
@@ -1520,6 +1529,35 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4&
    v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
}

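+// 3-channel 64-bit deinterleave: ptr holds a0 b0 c0 a1 b1 c1, loaded below as
+// t0 = (a0 b0), t1 = (c0 a1), t2 = (b1 c1)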
+inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
+{
+    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
+    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2));
+    __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4));
+
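+    // regroup the channel halves: a = (a0 a1), b = (b0 b1), c = (c0 c1)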
+    a = v_uint64x2(_mm_unpacklo_epi64(t0, _mm_unpackhi_epi64(t1, t1)));
+    b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
+    c = v_uint64x2(_mm_unpacklo_epi64(t1, _mm_unpackhi_epi64(t2, t2)));
+}
+
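+// the int64 and double overloads deinterleave as uint64 and reinterpret the
+// result, since the shuffles only move bits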
+inline void v_load_deinterleave(const int64 *ptr, v_int64x2& a, v_int64x2& b, v_int64x2& c)
+{
+    v_uint64x2 t0, t1, t2;
+    v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
+    a = v_reinterpret_as_s64(t0);
+    b = v_reinterpret_as_s64(t1);
+    c = v_reinterpret_as_s64(t2);
+}
+
+inline void v_load_deinterleave(const double *ptr, v_float64x2& a, v_float64x2& b, v_float64x2& c)
+{
+    v_uint64x2 t0, t1, t2;
+    v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
+    a = v_reinterpret_as_f64(t0);
+    b = v_reinterpret_as_f64(t1);
+    c = v_reinterpret_as_f64(t2);
+}
+
// 2-channel, float only
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
{
@@ -1717,6 +1755,27 @@ inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32
    _mm_storeu_ps((ptr + 4), u1);
}

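+// inverse of the 3-channel 64-bit deinterleave: pack t0 = (a0 b0),
+// t1 = (c0 a1), t2 = (b1 c1) and store them back to back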
+inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c)
+{
+    __m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
+    __m128i t1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val));
+    __m128i t2 = _mm_unpackhi_epi64(b.val, c.val);
+
+    _mm_storeu_si128((__m128i*)ptr, t0);
+    _mm_storeu_si128((__m128i*)(ptr + 2), t1);
+    _mm_storeu_si128((__m128i*)(ptr + 4), t2);
+}
+
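+// as with the loads, signed and double stores defer to the unsigned version
+// through bit-pattern reinterpretation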
+inline void v_store_interleave(int64 *ptr, const v_int64x2& a, const v_int64x2& b, const v_int64x2& c)
+{
+    v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
+}
+
+inline void v_store_interleave(double *ptr, const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{
+    v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
+}
+
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
                                 _Tpvec& b0, _Tpvec& c0 ) \