
Commit 3476440

Merge pull request opencv#8078 from tomoaki0705:universalIntrinsicLapack
2 parents: b333b1f + fd71121

2 files changed: +120 −118 lines

modules/core/include/opencv2/core/hal/intrin_sse.hpp

Lines changed: 13 additions & 3 deletions
@@ -1101,6 +1101,15 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768)
 OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768)
 OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(int16x8, short, 16)
 
+#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) \
+inline scalartype v_reduce_sum(const _Tpvec& a) \
+{ \
+    regtype val = a.val; \
+    val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 8))); \
+    val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 4))); \
+    return (scalartype)_mm_cvt##extract(val); \
+}
+
 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
 inline scalartype v_reduce_##func(const _Tpvec& a) \
 { \
@@ -1111,13 +1120,14 @@ inline scalartype v_reduce_##func(const _Tpvec& a) \
     return scalar_func(s0, s1); \
 }
 
-OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
+
 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
-OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD)
 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)
 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)
-OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD)
 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)
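
For reference, hand-expanding the new macro for the v_int32x4 instantiation gives the following v_reduce_sum (a minimal sketch, assuming OPENCV_HAL_NOP is the identity cast, as its name suggests):

    inline int v_reduce_sum(const v_int32x4& a)
    {
        __m128i val = a.val;
        val = _mm_add_epi32(val, _mm_srli_si128(val, 8));  // add the high 64 bits onto the low 64 bits
        val = _mm_add_epi32(val, _mm_srli_si128(val, 4));  // add lane 1 onto lane 0
        return (int)_mm_cvtsi128_si32(val);                // extract lane 0
    }

The byte shifts of 8 and 4 fold the four 32-bit lanes pairwise, so lane 0 ends up holding the total.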

modules/core/src/lapack.cpp

Lines changed: 107 additions & 115 deletions
@@ -262,25 +262,21 @@ template<typename T> struct VBLAS
     int givensx(T*, T*, int, T, T, T*, T*) const { return 0; }
 };
 
-#if CV_SSE2
+#if CV_SIMD128
 template<> inline int VBLAS<float>::dot(const float* a, const float* b, int n, float* result) const
 {
     if( n < 8 )
         return 0;
     int k = 0;
-    __m128 s0 = _mm_setzero_ps(), s1 = _mm_setzero_ps();
-    for( ; k <= n - 8; k += 8 )
+    v_float32x4 s0 = v_setzero_f32();
+    for( ; k <= n - v_float32x4::nlanes; k += v_float32x4::nlanes )
     {
-        __m128 a0 = _mm_load_ps(a + k), a1 = _mm_load_ps(a + k + 4);
-        __m128 b0 = _mm_load_ps(b + k), b1 = _mm_load_ps(b + k + 4);
+        v_float32x4 a0 = v_load(a + k);
+        v_float32x4 b0 = v_load(b + k);
 
-        s0 = _mm_add_ps(s0, _mm_mul_ps(a0, b0));
-        s1 = _mm_add_ps(s1, _mm_mul_ps(a1, b1));
+        s0 += a0 * b0;
     }
-    s0 = _mm_add_ps(s0, s1);
-    float sbuf[4];
-    _mm_storeu_ps(sbuf, s0);
-    *result = sbuf[0] + sbuf[1] + sbuf[2] + sbuf[3];
+    *result = v_reduce_sum(s0);
    return k;
 }
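
The change replaces the raw SSE registers with OpenCV's universal intrinsic types and the hard-coded step of 8 with v_float32x4::nlanes; the function still returns k so the caller can finish the remaining elements in scalar code. A minimal standalone sketch of the same pattern (illustrative only, not part of the commit, assuming CV_SIMD128 is enabled):

    #include <opencv2/core/hal/intrin.hpp>

    static float dot_simd(const float* a, const float* b, int n)
    {
        cv::v_float32x4 s = cv::v_setzero_f32();
        int k = 0;
        for( ; k <= n - cv::v_float32x4::nlanes; k += cv::v_float32x4::nlanes )
            s += cv::v_load(a + k) * cv::v_load(b + k);  // 4 lanes per iteration
        float sum = cv::v_reduce_sum(s);                 // horizontal sum of the 4 lanes
        for( ; k < n; k++ )                              // scalar tail
            sum += a[k] * b[k];
        return sum;
    }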

@@ -290,15 +286,15 @@ template<> inline int VBLAS<float>::givens(float* a, float* b, int n, float c, f
     if( n < 4 )
         return 0;
     int k = 0;
-    __m128 c4 = _mm_set1_ps(c), s4 = _mm_set1_ps(s);
-    for( ; k <= n - 4; k += 4 )
+    v_float32x4 c4 = v_setall_f32(c), s4 = v_setall_f32(s);
+    for( ; k <= n - v_float32x4::nlanes; k += v_float32x4::nlanes )
     {
-        __m128 a0 = _mm_load_ps(a + k);
-        __m128 b0 = _mm_load_ps(b + k);
-        __m128 t0 = _mm_add_ps(_mm_mul_ps(a0, c4), _mm_mul_ps(b0, s4));
-        __m128 t1 = _mm_sub_ps(_mm_mul_ps(b0, c4), _mm_mul_ps(a0, s4));
-        _mm_store_ps(a + k, t0);
-        _mm_store_ps(b + k, t1);
+        v_float32x4 a0 = v_load(a + k);
+        v_float32x4 b0 = v_load(b + k);
+        v_float32x4 t0 = (a0 * c4) + (b0 * s4);
+        v_float32x4 t1 = (b0 * c4) - (a0 * s4);
+        v_store(a + k, t0);
+        v_store(b + k, t1);
     }
     return k;
 }
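
The loop applies a Givens plane rotation lane-wise: each pair (a[k], b[k]) becomes (c*a[k] + s*b[k], c*b[k] - s*a[k]). A scalar reference version, for comparison only (not from the commit):

    static void givens_scalar(float* a, float* b, int n, float c, float s)
    {
        for( int k = 0; k < n; k++ )
        {
            float t0 = a[k]*c + b[k]*s;   // rotated first component
            float t1 = b[k]*c - a[k]*s;   // rotated second component
            a[k] = t0;
            b[k] = t1;
        }
    }
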
@@ -310,45 +306,40 @@ template<> inline int VBLAS<float>::givensx(float* a, float* b, int n, float c,
     if( n < 4 )
         return 0;
     int k = 0;
-    __m128 c4 = _mm_set1_ps(c), s4 = _mm_set1_ps(s);
-    __m128 sa = _mm_setzero_ps(), sb = _mm_setzero_ps();
-    for( ; k <= n - 4; k += 4 )
+    v_float32x4 c4 = v_setall_f32(c), s4 = v_setall_f32(s);
+    v_float32x4 sa = v_setzero_f32(), sb = v_setzero_f32();
+    for( ; k <= n - v_float32x4::nlanes; k += v_float32x4::nlanes )
     {
-        __m128 a0 = _mm_load_ps(a + k);
-        __m128 b0 = _mm_load_ps(b + k);
-        __m128 t0 = _mm_add_ps(_mm_mul_ps(a0, c4), _mm_mul_ps(b0, s4));
-        __m128 t1 = _mm_sub_ps(_mm_mul_ps(b0, c4), _mm_mul_ps(a0, s4));
-        _mm_store_ps(a + k, t0);
-        _mm_store_ps(b + k, t1);
-        sa = _mm_add_ps(sa, _mm_mul_ps(t0, t0));
-        sb = _mm_add_ps(sb, _mm_mul_ps(t1, t1));
+        v_float32x4 a0 = v_load(a + k);
+        v_float32x4 b0 = v_load(b + k);
+        v_float32x4 t0 = (a0 * c4) + (b0 * s4);
+        v_float32x4 t1 = (b0 * c4) - (a0 * s4);
+        v_store(a + k, t0);
+        v_store(b + k, t1);
+        sa += t0 + t0;
+        sb += t1 + t1;
     }
-    float abuf[4], bbuf[4];
-    _mm_storeu_ps(abuf, sa);
-    _mm_storeu_ps(bbuf, sb);
-    *anorm = abuf[0] + abuf[1] + abuf[2] + abuf[3];
-    *bnorm = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3];
+    *anorm = v_reduce_sum(sa);
+    *bnorm = v_reduce_sum(sb);
     return k;
 }
 
-
+#if CV_SIMD128_64F
 template<> inline int VBLAS<double>::dot(const double* a, const double* b, int n, double* result) const
 {
     if( n < 4 )
         return 0;
     int k = 0;
-    __m128d s0 = _mm_setzero_pd(), s1 = _mm_setzero_pd();
-    for( ; k <= n - 4; k += 4 )
+    v_float64x2 s0 = v_setzero_f64();
+    for( ; k <= n - v_float64x2::nlanes; k += v_float64x2::nlanes )
     {
-        __m128d a0 = _mm_load_pd(a + k), a1 = _mm_load_pd(a + k + 2);
-        __m128d b0 = _mm_load_pd(b + k), b1 = _mm_load_pd(b + k + 2);
+        v_float64x2 a0 = v_load(a + k);
+        v_float64x2 b0 = v_load(b + k);
 
-        s0 = _mm_add_pd(s0, _mm_mul_pd(a0, b0));
-        s1 = _mm_add_pd(s1, _mm_mul_pd(a1, b1));
+        s0 += a0 * b0;
     }
-    s0 = _mm_add_pd(s0, s1);
     double sbuf[2];
-    _mm_storeu_pd(sbuf, s0);
+    v_store(sbuf, s0);
     *result = sbuf[0] + sbuf[1];
     return k;
 }
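
The givensx variant additionally returns, through anorm and bnorm, the accumulated squared norms of the rotated vectors; the removed SSE code builds them by multiplying each result with itself before summing. A scalar sketch of that accumulation (illustrative only, not from the commit):

    static void givensx_scalar(float* a, float* b, int n, float c, float s,
                               float* anorm, float* bnorm)
    {
        float sa = 0.f, sb = 0.f;
        for( int k = 0; k < n; k++ )
        {
            float t0 = a[k]*c + b[k]*s;
            float t1 = b[k]*c - a[k]*s;
            a[k] = t0;
            b[k] = t1;
            sa += t0*t0;   // squared norm of the updated a
            sb += t1*t1;   // squared norm of the updated b
        }
        *anorm = sa;
        *bnorm = sb;
    }
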
@@ -357,15 +348,15 @@ template<> inline int VBLAS<double>::dot(const double* a, const double* b, int n
 template<> inline int VBLAS<double>::givens(double* a, double* b, int n, double c, double s) const
 {
     int k = 0;
-    __m128d c2 = _mm_set1_pd(c), s2 = _mm_set1_pd(s);
-    for( ; k <= n - 2; k += 2 )
+    v_float64x2 c2 = v_setall_f64(c), s2 = v_setall_f64(s);
+    for( ; k <= n - v_float64x2::nlanes; k += v_float64x2::nlanes )
     {
-        __m128d a0 = _mm_load_pd(a + k);
-        __m128d b0 = _mm_load_pd(b + k);
-        __m128d t0 = _mm_add_pd(_mm_mul_pd(a0, c2), _mm_mul_pd(b0, s2));
-        __m128d t1 = _mm_sub_pd(_mm_mul_pd(b0, c2), _mm_mul_pd(a0, s2));
-        _mm_store_pd(a + k, t0);
-        _mm_store_pd(b + k, t1);
+        v_float64x2 a0 = v_load(a + k);
+        v_float64x2 b0 = v_load(b + k);
+        v_float64x2 t0 = (a0 * c2) + (b0 * s2);
+        v_float64x2 t1 = (b0 * c2) - (a0 * s2);
+        v_store(a + k, t0);
+        v_store(b + k, t1);
     }
     return k;
 }
@@ -375,27 +366,28 @@ template<> inline int VBLAS<double>::givensx(double* a, double* b, int n, double
                                              double* anorm, double* bnorm) const
 {
     int k = 0;
-    __m128d c2 = _mm_set1_pd(c), s2 = _mm_set1_pd(s);
-    __m128d sa = _mm_setzero_pd(), sb = _mm_setzero_pd();
-    for( ; k <= n - 2; k += 2 )
+    v_float64x2 c2 = v_setall_f64(c), s2 = v_setall_f64(s);
+    v_float64x2 sa = v_setzero_f64(), sb = v_setzero_f64();
+    for( ; k <= n - v_float64x2::nlanes; k += v_float64x2::nlanes )
     {
-        __m128d a0 = _mm_load_pd(a + k);
-        __m128d b0 = _mm_load_pd(b + k);
-        __m128d t0 = _mm_add_pd(_mm_mul_pd(a0, c2), _mm_mul_pd(b0, s2));
-        __m128d t1 = _mm_sub_pd(_mm_mul_pd(b0, c2), _mm_mul_pd(a0, s2));
-        _mm_store_pd(a + k, t0);
-        _mm_store_pd(b + k, t1);
-        sa = _mm_add_pd(sa, _mm_mul_pd(t0, t0));
-        sb = _mm_add_pd(sb, _mm_mul_pd(t1, t1));
+        v_float64x2 a0 = v_load(a + k);
+        v_float64x2 b0 = v_load(b + k);
+        v_float64x2 t0 = (a0 * c2) + (b0 * s2);
+        v_float64x2 t1 = (b0 * c2) - (a0 * s2);
+        v_store(a + k, t0);
+        v_store(b + k, t1);
+        sa += t0 * t0;
+        sb += t1 * t1;
     }
     double abuf[2], bbuf[2];
-    _mm_storeu_pd(abuf, sa);
-    _mm_storeu_pd(bbuf, sb);
+    v_store(abuf, sa);
+    v_store(bbuf, sb);
     *anorm = abuf[0] + abuf[1];
     *bnorm = bbuf[0] + bbuf[1];
     return k;
 }
-#endif
+#endif //CV_SIMD128_64F
+#endif //CV_SIMD128
 
 template<typename _Tp> void
 JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep,
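
The two labelled #endif lines close nested guards: the float specializations require CV_SIMD128, while the double specializations additionally require CV_SIMD128_64F, since 64-bit floating-point lanes are not available on every 128-bit SIMD target. A minimal sketch of a caller using the same guard with a scalar fallback (illustrative only, not part of the commit):

    #include <opencv2/core/hal/intrin.hpp>

    static double dot64(const double* a, const double* b, int n)
    {
        double sum = 0;
        int k = 0;
    #if CV_SIMD128_64F
        cv::v_float64x2 s = cv::v_setzero_f64();
        for( ; k <= n - cv::v_float64x2::nlanes; k += cv::v_float64x2::nlanes )
            s += cv::v_load(a + k) * cv::v_load(b + k);
        double buf[2];
        cv::v_store(buf, s);
        sum = buf[0] + buf[1];
    #endif
        for( ; k < n; k++ )   // scalar path, or the tail of the vector loop
            sum += a[k] * b[k];
        return sum;
    }
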
@@ -905,24 +897,24 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
                 d = 1./d;
 
                 #if CV_SSE2
-                    if(USE_SSE2)
-                    {
-                        __m128 zero = _mm_setzero_ps();
-                        __m128 t0 = _mm_loadl_pi(zero, (const __m64*)srcdata); //t0 = sf(0,0) sf(0,1)
-                        __m128 t1 = _mm_loadh_pi(zero, (const __m64*)(srcdata+srcstep)); //t1 = sf(1,0) sf(1,1)
-                        __m128 s0 = _mm_or_ps(t0, t1);
-                        __m128 det =_mm_set1_ps((float)d);
-                        s0 = _mm_mul_ps(s0, det);
-                        static const uchar CV_DECL_ALIGNED(16) inv[16] = {0,0,0,0,0,0,0,0x80,0,0,0,0x80,0,0,0,0};
-                        __m128 pattern = _mm_load_ps((const float*)inv);
-                        s0 = _mm_xor_ps(s0, pattern);//==-1*s0
-                        s0 = _mm_shuffle_ps(s0, s0, _MM_SHUFFLE(0,2,1,3));
-                        _mm_storel_pi((__m64*)dstdata, s0);
-                        _mm_storeh_pi((__m64*)((float*)(dstdata+dststep)), s0);
-                    }
-                    else
+                if(USE_SSE2)
+                {
+                    __m128 zero = _mm_setzero_ps();
+                    __m128 t0 = _mm_loadl_pi(zero, (const __m64*)srcdata); //t0 = sf(0,0) sf(0,1)
+                    __m128 t1 = _mm_loadh_pi(zero, (const __m64*)(srcdata+srcstep)); //t1 = sf(1,0) sf(1,1)
+                    __m128 s0 = _mm_or_ps(t0, t1);
+                    __m128 det =_mm_set1_ps((float)d);
+                    s0 = _mm_mul_ps(s0, det);
+                    static const uchar CV_DECL_ALIGNED(16) inv[16] = {0,0,0,0,0,0,0,0x80,0,0,0,0x80,0,0,0,0};
+                    __m128 pattern = _mm_load_ps((const float*)inv);
+                    s0 = _mm_xor_ps(s0, pattern);//==-1*s0
+                    s0 = _mm_shuffle_ps(s0, s0, _MM_SHUFFLE(0,2,1,3));
+                    _mm_storel_pi((__m64*)dstdata, s0);
+                    _mm_storeh_pi((__m64*)((float*)(dstdata+dststep)), s0);
+                }
+                else
                 #endif
-                    {
+                {
                     double t0, t1;
                     t0 = Sf(0,0)*d;
                     t1 = Sf(1,1)*d;
@@ -932,7 +924,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
                     t1 = -Sf(1,0)*d;
                     Df(0,1) = (float)t0;
                     Df(1,0) = (float)t1;
-                    }
+                }
 
             }
         }
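
The SSE2 block above negates the off-diagonal entries without a multiply: XOR-ing a float's sign bit with 0x80000000 flips its sign, and the aligned inv[] byte pattern sets exactly the sign bits of lanes 1 and 2. A self-contained illustration of the trick (not from the commit):

    #include <emmintrin.h>
    #include <cstdio>

    int main()
    {
        __m128 v = _mm_set_ps(4.f, 3.f, 2.f, 1.f);        // lanes 0..3 hold 1, 2, 3, 4
        alignas(16) static const unsigned char signbits[16] =
            {0,0,0,0, 0,0,0,0x80, 0,0,0,0x80, 0,0,0,0};    // sign bits of lanes 1 and 2
        __m128 mask = _mm_load_ps((const float*)signbits);
        __m128 r = _mm_xor_ps(v, mask);                    // flips the sign of lanes 1 and 2 only
        float out[4];
        _mm_storeu_ps(out, r);
        std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // prints: 1 -2 -3 4
        return 0;
    }
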
@@ -944,38 +936,38 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
                 result = true;
                 d = 1./d;
                 #if CV_SSE2
-                    if(USE_SSE2)
-                    {
-                        __m128d s0 = _mm_loadu_pd((const double*)srcdata); //s0 = sf(0,0) sf(0,1)
-                        __m128d s1 = _mm_loadu_pd ((const double*)(srcdata+srcstep));//s1 = sf(1,0) sf(1,1)
-                        __m128d sm = _mm_unpacklo_pd(s0, _mm_load_sd((const double*)(srcdata+srcstep)+1)); //sm = sf(0,0) sf(1,1) - main diagonal
-                        __m128d ss = _mm_shuffle_pd(s0, s1, _MM_SHUFFLE2(0,1)); //ss = sf(0,1) sf(1,0) - secondary diagonal
-                        __m128d det = _mm_load1_pd((const double*)&d);
-                        sm = _mm_mul_pd(sm, det);
-
-                        static const uchar CV_DECL_ALIGNED(16) inv[8] = {0,0,0,0,0,0,0,0x80};
-                        __m128d pattern = _mm_load1_pd((double*)inv);
-                        ss = _mm_mul_pd(ss, det);
-                        ss = _mm_xor_pd(ss, pattern);//==-1*ss
-
-                        s0 = _mm_shuffle_pd(sm, ss, _MM_SHUFFLE2(0,1));
-                        s1 = _mm_shuffle_pd(ss, sm, _MM_SHUFFLE2(0,1));
-                        _mm_storeu_pd((double*)dstdata, s0);
-                        _mm_storeu_pd((double*)(dstdata+dststep), s1);
-                    }
-                    else
+                if(USE_SSE2)
+                {
+                    __m128d s0 = _mm_loadu_pd((const double*)srcdata); //s0 = sf(0,0) sf(0,1)
+                    __m128d s1 = _mm_loadu_pd ((const double*)(srcdata+srcstep));//s1 = sf(1,0) sf(1,1)
+                    __m128d sm = _mm_unpacklo_pd(s0, _mm_load_sd((const double*)(srcdata+srcstep)+1)); //sm = sf(0,0) sf(1,1) - main diagonal
+                    __m128d ss = _mm_shuffle_pd(s0, s1, _MM_SHUFFLE2(0,1)); //ss = sf(0,1) sf(1,0) - secondary diagonal
+                    __m128d det = _mm_load1_pd((const double*)&d);
+                    sm = _mm_mul_pd(sm, det);
+
+                    static const uchar CV_DECL_ALIGNED(16) inv[8] = {0,0,0,0,0,0,0,0x80};
+                    __m128d pattern = _mm_load1_pd((double*)inv);
+                    ss = _mm_mul_pd(ss, det);
+                    ss = _mm_xor_pd(ss, pattern);//==-1*ss
+
+                    s0 = _mm_shuffle_pd(sm, ss, _MM_SHUFFLE2(0,1));
+                    s1 = _mm_shuffle_pd(ss, sm, _MM_SHUFFLE2(0,1));
+                    _mm_storeu_pd((double*)dstdata, s0);
+                    _mm_storeu_pd((double*)(dstdata+dststep), s1);
+                }
+                else
                 #endif
-                    {
-                        double t0, t1;
-                        t0 = Sd(0,0)*d;
-                        t1 = Sd(1,1)*d;
-                        Dd(1,1) = t0;
-                        Dd(0,0) = t1;
-                        t0 = -Sd(0,1)*d;
-                        t1 = -Sd(1,0)*d;
-                        Dd(0,1) = t0;
-                        Dd(1,0) = t1;
-                    }
+                {
+                    double t0, t1;
+                    t0 = Sd(0,0)*d;
+                    t1 = Sd(1,1)*d;
+                    Dd(1,1) = t0;
+                    Dd(0,0) = t1;
+                    t0 = -Sd(0,1)*d;
+                    t1 = -Sd(1,0)*d;
+                    Dd(0,1) = t0;
+                    Dd(1,0) = t1;
+                }
             }
         }
     }
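
Both the SSE2 and the scalar branch compute the closed-form inverse of a 2x2 matrix: swap the main-diagonal entries, negate the off-diagonal entries, and scale everything by the reciprocal of the determinant (d already holds 1/det after the d = 1./d above). A scalar sketch of the same computation (illustrative only):

    // inverse of M = [[m00, m01], [m10, m11]], assuming det = m00*m11 - m01*m10 != 0
    static void invert2x2(const double m[2][2], double out[2][2])
    {
        double d = 1. / (m[0][0]*m[1][1] - m[0][1]*m[1][0]);
        out[0][0] =  m[1][1]*d;   // main-diagonal entries swap ...
        out[1][1] =  m[0][0]*d;
        out[0][1] = -m[0][1]*d;   // ... off-diagonal entries are negated
        out[1][0] = -m[1][0]*d;
    }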
