@@ -262,25 +262,21 @@ template<typename T> struct VBLAS
     int givensx(T*, T*, int, T, T, T*, T*) const { return 0; }
 };

-#if CV_SSE2
+#if CV_SIMD128
 template<> inline int VBLAS<float>::dot(const float* a, const float* b, int n, float* result) const
 {
     if( n < 8 )
         return 0;
     int k = 0;
-    __m128 s0 = _mm_setzero_ps(), s1 = _mm_setzero_ps();
-    for( ; k <= n - 8; k += 8 )
+    v_float32x4 s0 = v_setzero_f32();
+    for( ; k <= n - v_float32x4::nlanes; k += v_float32x4::nlanes )
     {
-        __m128 a0 = _mm_load_ps(a + k), a1 = _mm_load_ps(a + k + 4);
-        __m128 b0 = _mm_load_ps(b + k), b1 = _mm_load_ps(b + k + 4);
+        v_float32x4 a0 = v_load(a + k);
+        v_float32x4 b0 = v_load(b + k);

-        s0 = _mm_add_ps(s0, _mm_mul_ps(a0, b0));
-        s1 = _mm_add_ps(s1, _mm_mul_ps(a1, b1));
+        s0 += a0 * b0;
     }
-    s0 = _mm_add_ps(s0, s1);
-    float sbuf[4];
-    _mm_storeu_ps(sbuf, s0);
-    *result = sbuf[0] + sbuf[1] + sbuf[2] + sbuf[3];
+    *result = v_reduce_sum(s0);

     return k;
 }
@@ -290,15 +286,15 @@ template<> inline int VBLAS<float>::givens(float* a, float* b, int n, float c, f
     if( n < 4 )
         return 0;
     int k = 0;
-    __m128 c4 = _mm_set1_ps(c), s4 = _mm_set1_ps(s);
-    for( ; k <= n - 4; k += 4 )
+    v_float32x4 c4 = v_setall_f32(c), s4 = v_setall_f32(s);
+    for( ; k <= n - v_float32x4::nlanes; k += v_float32x4::nlanes )
     {
-        __m128 a0 = _mm_load_ps(a + k);
-        __m128 b0 = _mm_load_ps(b + k);
-        __m128 t0 = _mm_add_ps(_mm_mul_ps(a0, c4), _mm_mul_ps(b0, s4));
-        __m128 t1 = _mm_sub_ps(_mm_mul_ps(b0, c4), _mm_mul_ps(a0, s4));
-        _mm_store_ps(a + k, t0);
-        _mm_store_ps(b + k, t1);
+        v_float32x4 a0 = v_load(a + k);
+        v_float32x4 b0 = v_load(b + k);
+        v_float32x4 t0 = (a0 * c4) + (b0 * s4);
+        v_float32x4 t1 = (b0 * c4) - (a0 * s4);
+        v_store(a + k, t0);
+        v_store(b + k, t1);
     }
     return k;
 }
@@ -310,45 +306,40 @@ template<> inline int VBLAS<float>::givensx(float* a, float* b, int n, float c,
     if( n < 4 )
         return 0;
     int k = 0;
-    __m128 c4 = _mm_set1_ps(c), s4 = _mm_set1_ps(s);
-    __m128 sa = _mm_setzero_ps(), sb = _mm_setzero_ps();
-    for( ; k <= n - 4; k += 4 )
+    v_float32x4 c4 = v_setall_f32(c), s4 = v_setall_f32(s);
+    v_float32x4 sa = v_setzero_f32(), sb = v_setzero_f32();
+    for( ; k <= n - v_float32x4::nlanes; k += v_float32x4::nlanes )
     {
-        __m128 a0 = _mm_load_ps(a + k);
-        __m128 b0 = _mm_load_ps(b + k);
-        __m128 t0 = _mm_add_ps(_mm_mul_ps(a0, c4), _mm_mul_ps(b0, s4));
-        __m128 t1 = _mm_sub_ps(_mm_mul_ps(b0, c4), _mm_mul_ps(a0, s4));
-        _mm_store_ps(a + k, t0);
-        _mm_store_ps(b + k, t1);
-        sa = _mm_add_ps(sa, _mm_mul_ps(t0, t0));
-        sb = _mm_add_ps(sb, _mm_mul_ps(t1, t1));
+        v_float32x4 a0 = v_load(a + k);
+        v_float32x4 b0 = v_load(b + k);
+        v_float32x4 t0 = (a0 * c4) + (b0 * s4);
+        v_float32x4 t1 = (b0 * c4) - (a0 * s4);
+        v_store(a + k, t0);
+        v_store(b + k, t1);
+        sa += t0 * t0;
+        sb += t1 * t1;
     }
-    float abuf[4], bbuf[4];
-    _mm_storeu_ps(abuf, sa);
-    _mm_storeu_ps(bbuf, sb);
-    *anorm = abuf[0] + abuf[1] + abuf[2] + abuf[3];
-    *bnorm = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3];
+    *anorm = v_reduce_sum(sa);
+    *bnorm = v_reduce_sum(sb);
     return k;
 }

-
+#if CV_SIMD128_64F
 template<> inline int VBLAS<double>::dot(const double* a, const double* b, int n, double* result) const
 {
     if( n < 4 )
         return 0;
     int k = 0;
-    __m128d s0 = _mm_setzero_pd(), s1 = _mm_setzero_pd();
-    for( ; k <= n - 4; k += 4 )
+    v_float64x2 s0 = v_setzero_f64();
+    for( ; k <= n - v_float64x2::nlanes; k += v_float64x2::nlanes )
     {
-        __m128d a0 = _mm_load_pd(a + k), a1 = _mm_load_pd(a + k + 2);
-        __m128d b0 = _mm_load_pd(b + k), b1 = _mm_load_pd(b + k + 2);
+        v_float64x2 a0 = v_load(a + k);
+        v_float64x2 b0 = v_load(b + k);

-        s0 = _mm_add_pd(s0, _mm_mul_pd(a0, b0));
-        s1 = _mm_add_pd(s1, _mm_mul_pd(a1, b1));
+        s0 += a0 * b0;
     }
-    s0 = _mm_add_pd(s0, s1);
     double sbuf[2];
-    _mm_storeu_pd(sbuf, s0);
+    v_store(sbuf, s0);
     *result = sbuf[0] + sbuf[1];
     return k;
 }
@@ -357,15 +348,15 @@ template<> inline int VBLAS<double>::dot(const double* a, const double* b, int n
 template<> inline int VBLAS<double>::givens(double* a, double* b, int n, double c, double s) const
 {
     int k = 0;
-    __m128d c2 = _mm_set1_pd(c), s2 = _mm_set1_pd(s);
-    for( ; k <= n - 2; k += 2 )
+    v_float64x2 c2 = v_setall_f64(c), s2 = v_setall_f64(s);
+    for( ; k <= n - v_float64x2::nlanes; k += v_float64x2::nlanes )
     {
-        __m128d a0 = _mm_load_pd(a + k);
-        __m128d b0 = _mm_load_pd(b + k);
-        __m128d t0 = _mm_add_pd(_mm_mul_pd(a0, c2), _mm_mul_pd(b0, s2));
-        __m128d t1 = _mm_sub_pd(_mm_mul_pd(b0, c2), _mm_mul_pd(a0, s2));
-        _mm_store_pd(a + k, t0);
-        _mm_store_pd(b + k, t1);
+        v_float64x2 a0 = v_load(a + k);
+        v_float64x2 b0 = v_load(b + k);
+        v_float64x2 t0 = (a0 * c2) + (b0 * s2);
+        v_float64x2 t1 = (b0 * c2) - (a0 * s2);
+        v_store(a + k, t0);
+        v_store(b + k, t1);
     }
     return k;
 }
@@ -375,27 +366,28 @@ template<> inline int VBLAS<double>::givensx(double* a, double* b, int n, double
                                              double* anorm, double* bnorm) const
 {
     int k = 0;
-    __m128d c2 = _mm_set1_pd(c), s2 = _mm_set1_pd(s);
-    __m128d sa = _mm_setzero_pd(), sb = _mm_setzero_pd();
-    for( ; k <= n - 2; k += 2 )
+    v_float64x2 c2 = v_setall_f64(c), s2 = v_setall_f64(s);
+    v_float64x2 sa = v_setzero_f64(), sb = v_setzero_f64();
+    for( ; k <= n - v_float64x2::nlanes; k += v_float64x2::nlanes )
     {
-        __m128d a0 = _mm_load_pd(a + k);
-        __m128d b0 = _mm_load_pd(b + k);
-        __m128d t0 = _mm_add_pd(_mm_mul_pd(a0, c2), _mm_mul_pd(b0, s2));
-        __m128d t1 = _mm_sub_pd(_mm_mul_pd(b0, c2), _mm_mul_pd(a0, s2));
-        _mm_store_pd(a + k, t0);
-        _mm_store_pd(b + k, t1);
-        sa = _mm_add_pd(sa, _mm_mul_pd(t0, t0));
-        sb = _mm_add_pd(sb, _mm_mul_pd(t1, t1));
+        v_float64x2 a0 = v_load(a + k);
+        v_float64x2 b0 = v_load(b + k);
+        v_float64x2 t0 = (a0 * c2) + (b0 * s2);
+        v_float64x2 t1 = (b0 * c2) - (a0 * s2);
+        v_store(a + k, t0);
+        v_store(b + k, t1);
+        sa += t0 * t0;
+        sb += t1 * t1;
     }
     double abuf[2], bbuf[2];
-    _mm_storeu_pd(abuf, sa);
-    _mm_storeu_pd(bbuf, sb);
+    v_store(abuf, sa);
+    v_store(bbuf, sb);
     *anorm = abuf[0] + abuf[1];
     *bnorm = bbuf[0] + bbuf[1];
     return k;
 }
-#endif
+#endif // CV_SIMD128_64F
+#endif // CV_SIMD128

 template<typename _Tp> void
 JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep,
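For reference outside the patch: the loop pattern introduced above can be exercised on its own. The sketch below is illustrative only (the helper name dot_simd128 and the scalar tail are mine, not OpenCV's); it assumes a build where CV_SIMD128 is enabled and where v_float32x4, its arithmetic operators, and ::nlanes are available, exactly as the new code relies on.

// Illustrative sketch, not part of the patch: the universal-intrinsics
// dot-product pattern used by VBLAS<float>::dot above, plus a scalar tail.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

static float dot_simd128(const float* a, const float* b, int n)
{
    float sum = 0.f;
    int k = 0;
#if CV_SIMD128
    cv::v_float32x4 s0 = cv::v_setzero_f32();
    for( ; k <= n - cv::v_float32x4::nlanes; k += cv::v_float32x4::nlanes )
        s0 += cv::v_load(a + k) * cv::v_load(b + k);   // 4 lanes per iteration
    sum = cv::v_reduce_sum(s0);                        // horizontal add of the lanes
#endif
    for( ; k < n; k++ )                                // leftover elements
        sum += a[k]*b[k];
    return sum;
}

The same structure carries over to v_float64x2, which is only available when CV_SIMD128_64F is set; that is why the double specializations above gain the extra guard.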
@@ -905,24 +897,24 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
                 d = 1./d;

 #if CV_SSE2
-                if(USE_SSE2)
-                {
-                    __m128 zero = _mm_setzero_ps();
-                    __m128 t0 = _mm_loadl_pi(zero, (const __m64*)srcdata); // t0 = sf(0,0) sf(0,1)
-                    __m128 t1 = _mm_loadh_pi(zero, (const __m64*)(srcdata+srcstep)); // t1 = sf(1,0) sf(1,1)
-                    __m128 s0 = _mm_or_ps(t0, t1);
-                    __m128 det =_mm_set1_ps((float)d);
-                    s0 = _mm_mul_ps(s0, det);
-                    static const uchar CV_DECL_ALIGNED(16) inv[16] = {0,0,0,0,0,0,0,0x80,0,0,0,0x80,0,0,0,0};
-                    __m128 pattern = _mm_load_ps((const float*)inv);
-                    s0 = _mm_xor_ps(s0, pattern);// ==-1*s0
-                    s0 = _mm_shuffle_ps(s0, s0, _MM_SHUFFLE(0,2,1,3));
-                    _mm_storel_pi((__m64*)dstdata, s0);
-                    _mm_storeh_pi((__m64*)((float*)(dstdata+dststep)), s0);
-                }
-                else
+                if(USE_SSE2)
+                {
+                    __m128 zero = _mm_setzero_ps();
+                    __m128 t0 = _mm_loadl_pi(zero, (const __m64*)srcdata); // t0 = sf(0,0) sf(0,1)
+                    __m128 t1 = _mm_loadh_pi(zero, (const __m64*)(srcdata+srcstep)); // t1 = sf(1,0) sf(1,1)
+                    __m128 s0 = _mm_or_ps(t0, t1);
+                    __m128 det =_mm_set1_ps((float)d);
+                    s0 = _mm_mul_ps(s0, det);
+                    static const uchar CV_DECL_ALIGNED(16) inv[16] = {0,0,0,0,0,0,0,0x80,0,0,0,0x80,0,0,0,0};
+                    __m128 pattern = _mm_load_ps((const float*)inv);
+                    s0 = _mm_xor_ps(s0, pattern);// ==-1*s0
+                    s0 = _mm_shuffle_ps(s0, s0, _MM_SHUFFLE(0,2,1,3));
+                    _mm_storel_pi((__m64*)dstdata, s0);
+                    _mm_storeh_pi((__m64*)((float*)(dstdata+dststep)), s0);
+                }
+                else
 #endif
-                {
+                {
                     double t0, t1;
                     t0 = Sf(0,0)*d;
                     t1 = Sf(1,1)*d;
@@ -932,7 +924,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
                     t1 = -Sf(1,0)*d;
                     Df(0,1) = (float)t0;
                     Df(1,0) = (float)t1;
-                }
+                }

             }
         }
@@ -944,38 +936,38 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
                 result = true;
                 d = 1./d;
 #if CV_SSE2
-                if(USE_SSE2)
-                {
-                    __m128d s0 = _mm_loadu_pd((const double*)srcdata); // s0 = sf(0,0) sf(0,1)
-                    __m128d s1 = _mm_loadu_pd((const double*)(srcdata+srcstep));// s1 = sf(1,0) sf(1,1)
-                    __m128d sm = _mm_unpacklo_pd(s0, _mm_load_sd((const double*)(srcdata+srcstep)+1)); // sm = sf(0,0) sf(1,1) - main diagonal
-                    __m128d ss = _mm_shuffle_pd(s0, s1, _MM_SHUFFLE2(0,1)); // ss = sf(0,1) sf(1,0) - secondary diagonal
-                    __m128d det = _mm_load1_pd((const double*)&d);
-                    sm = _mm_mul_pd(sm, det);
-
-                    static const uchar CV_DECL_ALIGNED(16) inv[8] = {0,0,0,0,0,0,0,0x80};
-                    __m128d pattern = _mm_load1_pd((double*)inv);
-                    ss = _mm_mul_pd(ss, det);
-                    ss = _mm_xor_pd(ss, pattern);// ==-1*ss
-
-                    s0 = _mm_shuffle_pd(sm, ss, _MM_SHUFFLE2(0,1));
-                    s1 = _mm_shuffle_pd(ss, sm, _MM_SHUFFLE2(0,1));
-                    _mm_storeu_pd((double*)dstdata, s0);
-                    _mm_storeu_pd((double*)(dstdata+dststep), s1);
-                }
-                else
+                if(USE_SSE2)
+                {
+                    __m128d s0 = _mm_loadu_pd((const double*)srcdata); // s0 = sf(0,0) sf(0,1)
+                    __m128d s1 = _mm_loadu_pd((const double*)(srcdata+srcstep));// s1 = sf(1,0) sf(1,1)
+                    __m128d sm = _mm_unpacklo_pd(s0, _mm_load_sd((const double*)(srcdata+srcstep)+1)); // sm = sf(0,0) sf(1,1) - main diagonal
+                    __m128d ss = _mm_shuffle_pd(s0, s1, _MM_SHUFFLE2(0,1)); // ss = sf(0,1) sf(1,0) - secondary diagonal
+                    __m128d det = _mm_load1_pd((const double*)&d);
+                    sm = _mm_mul_pd(sm, det);
+
+                    static const uchar CV_DECL_ALIGNED(16) inv[8] = {0,0,0,0,0,0,0,0x80};
+                    __m128d pattern = _mm_load1_pd((double*)inv);
+                    ss = _mm_mul_pd(ss, det);
+                    ss = _mm_xor_pd(ss, pattern);// ==-1*ss
+
+                    s0 = _mm_shuffle_pd(sm, ss, _MM_SHUFFLE2(0,1));
+                    s1 = _mm_shuffle_pd(ss, sm, _MM_SHUFFLE2(0,1));
+                    _mm_storeu_pd((double*)dstdata, s0);
+                    _mm_storeu_pd((double*)(dstdata+dststep), s1);
+                }
+                else
 #endif
-                {
-                    double t0, t1;
-                    t0 = Sd(0,0)*d;
-                    t1 = Sd(1,1)*d;
-                    Dd(1,1) = t0;
-                    Dd(0,0) = t1;
-                    t0 = -Sd(0,1)*d;
-                    t1 = -Sd(1,0)*d;
-                    Dd(0,1) = t0;
-                    Dd(1,0) = t1;
-                }
+                {
+                    double t0, t1;
+                    t0 = Sd(0,0)*d;
+                    t1 = Sd(1,1)*d;
+                    Dd(1,1) = t0;
+                    Dd(0,0) = t1;
+                    t0 = -Sd(0,1)*d;
+                    t1 = -Sd(1,0)*d;
+                    Dd(0,1) = t0;
+                    Dd(1,0) = t1;
+                }
             }
         }
     }
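Note that the three cv::invert hunks above appear to adjust only the indentation of the 2x2 fast paths; the code is textually identical on both sides. For context, the branch they touch is the specialized 2x2 inversion, reachable through the public API as in this illustrative snippet (not part of the patch):

// Illustrative usage only: a 2x2 float matrix goes through the
// small-matrix branch of cv::invert shown above.
#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    cv::Matx22f m(4.f, 7.f,
                  2.f, 6.f);                 // det = 10, so invertible
    cv::Mat inv;
    double ok = cv::invert(cv::Mat(m), inv, cv::DECOMP_LU);
    std::cout << (ok != 0.) << "\n" << inv << std::endl;
    return 0;
}

For DECOMP_LU, the return value is non-zero when the inverse was computed and 0 when the source matrix is singular.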