Skip to content

Commit f49f056

Browse files
committed
Merge pull request opencv#8907 from vpisarev:dnn_fast_conv
2 parents bd1334a + fbafc70 commit f49f056

File tree

4 files changed

+45
-1
lines changed

4 files changed

+45
-1
lines changed

modules/core/include/opencv2/core/hal/intrin_cpp.hpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -907,6 +907,27 @@ template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_redu
907907
return c;
908908
}
909909

910+
/** @brief Sums all elements of each input vector, returns the vector of sums
911+
912+
Scheme:
913+
@code
914+
result[0] = a[0] + a[1] + a[2] + a[3]
915+
result[1] = b[0] + b[1] + b[2] + b[3]
916+
result[2] = c[0] + c[1] + c[2] + c[3]
917+
result[3] = d[0] + d[1] + d[2] + d[3]
918+
@endcode
919+
*/
920+
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
921+
const v_float32x4& c, const v_float32x4& d)
922+
{
923+
v_float32x4 r;
924+
r.s[0] = a.s[0] + a.s[1] + a.s[2] + a.s[3];
925+
r.s[1] = b.s[0] + b.s[1] + b.s[2] + b.s[3];
926+
r.s[2] = c.s[0] + c.s[1] + c.s[2] + c.s[3];
927+
r.s[3] = d.s[0] + d.s[1] + d.s[2] + d.s[3];
928+
return r;
929+
}
930+
910931
/** @brief Get negative values mask
911932
912933
Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes.

modules/core/include/opencv2/core/hal/intrin_neon.hpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -815,6 +815,21 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
815815
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
816816
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
817817

818+
// Sums the four lanes of each input vector and returns the four totals packed
// as { sum(a), sum(b), sum(c), sum(d) }.
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    // Interleave pairs of inputs: val[0] holds the even lanes, val[1] the odd lanes.
    const float32x4x2_t trn_ab = vtrnq_f32(a.val, b.val);
    const float32x4x2_t trn_cd = vtrnq_f32(c.val, d.val);

    // Adding even and odd lanes yields pairwise partial sums:
    //   pair_ab = a0+a1 b0+b1 a2+a3 b2+b3
    //   pair_cd = c0+c1 d0+d1 c2+c3 d2+d3
    const float32x4_t pair_ab = vaddq_f32(trn_ab.val[0], trn_ab.val[1]);
    const float32x4_t pair_cd = vaddq_f32(trn_cd.val[0], trn_cd.val[1]);

    // Gather the "lanes 0..1" partial sums and the "lanes 2..3" partial sums
    // of all four inputs into two vectors ordered a, b, c, d.
    const float32x4_t lo_sums = vcombine_f32(vget_low_f32(pair_ab), vget_low_f32(pair_cd));
    const float32x4_t hi_sums = vcombine_f32(vget_high_f32(pair_ab), vget_high_f32(pair_cd));

    // Final add completes each per-input total.
    const float32x4_t totals = vaddq_f32(lo_sums, hi_sums);
    return v_float32x4(totals);
}
832+
818833
#define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \
819834
inline v_uint32x4 v_popcount(const _Tpvec& a) \
820835
{ \

modules/core/include/opencv2/core/hal/intrin_sse.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1126,6 +1126,14 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV
11261126
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
11271127
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
11281128

1129+
// Sums the four lanes of each input vector and returns the four totals packed
// as { sum(a), sum(b), sum(c), sum(d) }.
//
// _mm_hadd_ps is an SSE3 instruction, so it is only used when the build
// enables SSE3 (CV_SSE3). Otherwise the same result is assembled from
// unpack/add intrinsics that need nothing newer than SSE.
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
#if CV_SSE3
    // ab = a0+a1 a2+a3 b0+b1 b2+b3, cd = c0+c1 c2+c3 d0+d1 d2+d3
    __m128 ab = _mm_hadd_ps(a.val, b.val);
    __m128 cd = _mm_hadd_ps(c.val, d.val);
    // A second horizontal add folds each pair of partial sums into one lane.
    return v_float32x4(_mm_hadd_ps(ab, cd));
#else
    // ac = a0+a2 c0+c2 a1+a3 c1+c3
    __m128 ac = _mm_add_ps(_mm_unpacklo_ps(a.val, c.val), _mm_unpackhi_ps(a.val, c.val));
    // bd = b0+b2 d0+d2 b1+b3 d1+d3
    __m128 bd = _mm_add_ps(_mm_unpacklo_ps(b.val, d.val), _mm_unpackhi_ps(b.val, d.val));
    // unpacklo(ac, bd) = a0+a2 b0+b2 c0+c2 d0+d2
    // unpackhi(ac, bd) = a1+a3 b1+b3 c1+c3 d1+d3
    return v_float32x4(_mm_add_ps(_mm_unpacklo_ps(ac, bd), _mm_unpackhi_ps(ac, bd)));
#endif
}
1136+
11291137
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
11301138
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
11311139
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)

modules/core/src/parallel.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -425,7 +425,7 @@ int cv::getNumThreads(void)
425425

426426
#elif defined HAVE_GCD
427427

428-
return 512; // the GCD thread pool limit
428+
return cv::getNumberOfCPUs(); // the GCD thread pool limit
429429

430430
#elif defined WINRT
431431

0 commit comments

Comments
 (0)