Skip to content

Commit 3cbe60c

Browse files
tomoaki0705alalek
authored andcommitted
Merge pull request opencv#9753 from tomoaki0705:universalMatmul
* add accuracy test and performance check for matmul * add performance tests for transform and dotProduct * add test Core_TransformLargeTest for 8u version of transform * remove raw SSE2/NEON implementation from matmul.cpp * use universal intrinsic instead of raw intrinsic * remove unused templated function * add v_matmuladd which multiply 3x3 matrix and add 3x1 vector * add v_rotate_left/right in universal intrinsic * suppress intrinsic on some function and platform * add pure SW implementation of new universal intrinsics * add test for new universal intrinsics * core: prevent memory access after the end of buffer * fix perf tests
1 parent 2674c6b commit 3cbe60c

File tree

9 files changed

+572
-383
lines changed

9 files changed

+572
-383
lines changed

modules/core/include/opencv2/core/hal/intrin_cpp.hpp

Lines changed: 73 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -885,12 +885,59 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg
885885
/** @brief Bitwise shift left
886886
887887
For 16-, 32- and 64-bit integer values. */
888-
OPENCV_HAL_IMPL_SHIFT_OP(<<)
888+
OPENCV_HAL_IMPL_SHIFT_OP(<< )
889889

890890
/** @brief Bitwise shift right
891891
892892
For 16-, 32- and 64-bit integer values. */
893-
OPENCV_HAL_IMPL_SHIFT_OP(>>)
893+
OPENCV_HAL_IMPL_SHIFT_OP(>> )
894+
895+
/** @brief Element shift left among vector
896+
897+
For all type */
898+
#define OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(suffix,opA,opB) \
899+
template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a) \
900+
{ \
901+
v_reg<_Tp, n> b; \
902+
for (int i = 0; i < n; i++) \
903+
{ \
904+
int sIndex = i opA imm; \
905+
if (0 <= sIndex && sIndex < n) \
906+
{ \
907+
b.s[i] = a.s[sIndex]; \
908+
} \
909+
else \
910+
{ \
911+
b.s[i] = 0; \
912+
} \
913+
} \
914+
return b; \
915+
} \
916+
template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
917+
{ \
918+
v_reg<_Tp, n> c; \
919+
for (int i = 0; i < n; i++) \
920+
{ \
921+
int aIndex = i opA imm; \
922+
int bIndex = i opA imm opB n; \
923+
if (0 <= bIndex && bIndex < n) \
924+
{ \
925+
c.s[i] = b.s[bIndex]; \
926+
} \
927+
else if (0 <= aIndex && aIndex < n) \
928+
{ \
929+
c.s[i] = a.s[aIndex]; \
930+
} \
931+
else \
932+
{ \
933+
c.s[i] = 0; \
934+
} \
935+
} \
936+
return c; \
937+
}
938+
939+
OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(left, -, +)
940+
OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(right, +, -)
894941

895942
/** @brief Sum packed values
896943
@@ -1860,6 +1907,30 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
18601907
v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]);
18611908
}
18621909

1910+
/** @brief Matrix multiplication and add
1911+
1912+
Scheme:
1913+
@code
1914+
{A0 A1 A2 } |V0| |D0|
1915+
{B0 B1 B2 } |V1| |D1|
1916+
{C0 C1 C2 } x |V2| + |D2|
1917+
====================
1918+
{R0 R1 R2 R3}, where:
1919+
R0 = A0V0 + A1V1 + A2V2 + D0,
1920+
R1 = B0V0 + B1V1 + B2V2 + D1
1921+
...
1922+
@endcode
1923+
*/
1924+
inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
1925+
const v_float32x4& m1, const v_float32x4& m2,
1926+
const v_float32x4& m3)
1927+
{
1928+
return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + m3.s[0],
1929+
v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + m3.s[1],
1930+
v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + m3.s[2],
1931+
v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]);
1932+
}
1933+
18631934
//! @}
18641935

18651936
//! @name Check SIMD support

modules/core/include/opencv2/core/hal/intrin_neon.hpp

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,18 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
407407
return v_float32x4(res);
408408
}
409409

410+
inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
411+
const v_float32x4& m1, const v_float32x4& m2,
412+
const v_float32x4& a)
413+
{
414+
float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
415+
float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
416+
res = vmlaq_lane_f32(res, m1.val, vl, 1);
417+
res = vmlaq_lane_f32(res, m2.val, vh, 0);
418+
res = vaddq_f32(res, a.val);
419+
return v_float32x4(res);
420+
}
421+
410422
#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
411423
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
412424
{ \
@@ -747,7 +759,15 @@ template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
747759
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
748760
{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
749761
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
750-
{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); }
762+
{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); } \
763+
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
764+
{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
765+
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
766+
{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
767+
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
768+
{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
769+
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
770+
{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); }
751771

752772
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
753773
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)

modules/core/include/opencv2/core/hal/intrin_sse.hpp

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -602,6 +602,16 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
602602
return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3)));
603603
}
604604

605+
inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
606+
const v_float32x4& m1, const v_float32x4& m2,
607+
const v_float32x4& a)
608+
{
609+
__m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
610+
__m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
611+
__m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
612+
613+
return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, a.val)));
614+
}
605615

606616
#define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
607617
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
@@ -1011,6 +1021,29 @@ OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
10111021
OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
10121022
OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64)
10131023

1024+
template<int imm, typename _Tpvec>
1025+
inline _Tpvec v_rotate_right(const _Tpvec &a)
1026+
{
1027+
return _Tpvec(_mm_srli_si128(a.val, imm*(sizeof(typename _Tpvec::lane_type))));
1028+
}
1029+
template<int imm, typename _Tpvec>
1030+
inline _Tpvec v_rotate_left(const _Tpvec &a)
1031+
{
1032+
return _Tpvec(_mm_slli_si128(a.val, imm*(sizeof(typename _Tpvec::lane_type))));
1033+
}
1034+
template<int imm, typename _Tpvec>
1035+
inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
1036+
{
1037+
const int cWidth = sizeof(typename _Tpvec::lane_type);
1038+
return _Tpvec(_mm_or_si128(_mm_srli_si128(a.val, imm*cWidth), _mm_slli_si128(b.val, (16 - imm*cWidth))));
1039+
}
1040+
template<int imm, typename _Tpvec>
1041+
inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
1042+
{
1043+
const int cWidth = sizeof(typename _Tpvec::lane_type);
1044+
return _Tpvec(_mm_or_si128(_mm_slli_si128(a.val, imm*cWidth), _mm_srli_si128(b.val, (16 - imm*cWidth))));
1045+
}
1046+
10141047
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
10151048
inline _Tpvec v_load(const _Tp* ptr) \
10161049
{ return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \

modules/core/perf/opencl/perf_arithm.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1062,6 +1062,34 @@ OCL_PERF_TEST_P(ScaleAddFixture, ScaleAdd,
10621062
SANITY_CHECK(dst, 1e-6);
10631063
}
10641064

1065+
///////////// Transform ////////////////////////
1066+
1067+
typedef Size_MatType TransformFixture;
1068+
1069+
OCL_PERF_TEST_P(TransformFixture, Transform,
1070+
::testing::Combine(OCL_TEST_SIZES,
1071+
::testing::Values(CV_8UC3, CV_8SC3, CV_16UC3, CV_16SC3, CV_32SC3, CV_32FC3, CV_64FC3)))
1072+
{
1073+
const Size_MatType_t params = GetParam();
1074+
const Size srcSize = get<0>(params);
1075+
const int type = get<1>(params);
1076+
1077+
checkDeviceMaxMemoryAllocSize(srcSize, type);
1078+
1079+
const float transform[] = { 0.5f, 0.f, 0.86602540378f, 128,
1080+
0.f, 1.f, 0.f, -64,
1081+
0.86602540378f, 0.f, 0.5f, 32,};
1082+
Mat mtx(Size(4, 3), CV_32FC1, (void*)transform);
1083+
1084+
UMat src(srcSize, type), dst(srcSize, type);
1085+
randu(src, 0, 30);
1086+
declare.in(src).out(dst);
1087+
1088+
OCL_TEST_CYCLE() cv::transform(src, dst, mtx);
1089+
1090+
SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
1091+
}
1092+
10651093
///////////// PSNR ////////////////////////
10661094

10671095
typedef Size_MatType PSNRFixture;

modules/core/perf/perf_mat.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,3 +96,31 @@ PERF_TEST_P(Size_MatType, Mat_Clone_Roi,
9696

9797
SANITY_CHECK(destination, 1);
9898
}
99+
100+
///////////// Transform ////////////////////////
101+
102+
PERF_TEST_P(Size_MatType, Mat_Transform,
103+
testing::Combine(testing::Values(TYPICAL_MAT_SIZES),
104+
testing::Values(CV_8UC3, CV_8SC3, CV_16UC3, CV_16SC3, CV_32SC3, CV_32FC3, CV_64FC3))
105+
)
106+
{
107+
const Size_MatType_t params = GetParam();
108+
const Size srcSize0 = get<0>(params);
109+
const Size srcSize = Size(1, srcSize0.width*srcSize0.height);
110+
const int type = get<1>(params);
111+
const float transform[] = { 0.5f, 0.f, 0.86602540378f, 128,
112+
0.f, 1.f, 0.f, -64,
113+
0.86602540378f, 0.f, 0.5f, 32,};
114+
Mat mtx(Size(4, 3), CV_32FC1, (void*)transform);
115+
116+
Mat src(srcSize, type), dst(srcSize, type);
117+
randu(src, 0, 30);
118+
declare.in(src).out(dst);
119+
120+
TEST_CYCLE()
121+
{
122+
cv::transform(src, dst, mtx);
123+
}
124+
125+
SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
126+
}

0 commit comments

Comments
 (0)