@@ -551,7 +551,96 @@ void exp32f( const float *_x, float *y, int n )
     const Cv32suf* x = (const Cv32suf*)_x;
     Cv32suf buf[4];
 
-#if CV_SSE2
+#if CV_AVX2
+    if( n >= 8 )
+    {
+        static const __m256d prescale4 = _mm256_set1_pd(exp_prescale);
+        static const __m256 postscale8 = _mm256_set1_ps((float)exp_postscale);
+        static const __m128 maxval4 = _mm_set1_ps((float)(exp_max_val/exp_prescale));
+        static const __m128 minval4 = _mm_set1_ps((float)(-exp_max_val/exp_prescale));
+
+        static const __m256 mA1 = _mm256_set1_ps(A1);
+        static const __m256 mA2 = _mm256_set1_ps(A2);
+        static const __m256 mA3 = _mm256_set1_ps(A3);
+        static const __m256 mA4 = _mm256_set1_ps(A4);
+        bool y_aligned = (size_t)(void*)y % 32 == 0;
+
+        ushort CV_DECL_ALIGNED(32) tab_idx[16];
+
+        for( ; i <= n - 8; i += 8 )
+        {
+            __m256 xf;
+            __m128i xi0, xi1;
+
+            __m256d xd0 = _mm256_cvtps_pd(_mm_min_ps(_mm_max_ps(_mm_loadu_ps(&x[i].f), minval4), maxval4));
+            __m256d xd1 = _mm256_cvtps_pd(_mm_min_ps(_mm_max_ps(_mm_loadu_ps(&x[i+4].f), minval4), maxval4));
+
+            xd0 = _mm256_mul_pd(xd0, prescale4);
+            xd1 = _mm256_mul_pd(xd1, prescale4);
+
+            xi0 = _mm256_cvtpd_epi32(xd0);
+            xi1 = _mm256_cvtpd_epi32(xd1);
+
+            xd0 = _mm256_sub_pd(xd0, _mm256_cvtepi32_pd(xi0));
+            xd1 = _mm256_sub_pd(xd1, _mm256_cvtepi32_pd(xi1));
+
+            // gcc does not support _mm256_set_m128
+            // xf = _mm256_set_m128(_mm256_cvtpd_ps(xd1), _mm256_cvtpd_ps(xd0));
+            xf = _mm256_insertf128_ps(xf, _mm256_cvtpd_ps(xd0), 0);
+            xf = _mm256_insertf128_ps(xf, _mm256_cvtpd_ps(xd1), 1);
+
+            xf = _mm256_mul_ps(xf, postscale8);
+
+            xi0 = _mm_packs_epi32(xi0, xi1);
+
+            _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi16(EXPTAB_MASK)));
+
+            xi0 = _mm_add_epi16(_mm_srai_epi16(xi0, EXPTAB_SCALE), _mm_set1_epi16(127));
+            xi0 = _mm_max_epi16(xi0, _mm_setzero_si128());
+            xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(255));
+            xi1 = _mm_unpackhi_epi16(xi0, _mm_setzero_si128());
+            xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128());
+
+            __m256d yd0 = _mm256_set_pd(expTab[tab_idx[3]], expTab[tab_idx[2]], expTab[tab_idx[1]], expTab[tab_idx[0]]);
+            __m256d yd1 = _mm256_set_pd(expTab[tab_idx[7]], expTab[tab_idx[6]], expTab[tab_idx[5]], expTab[tab_idx[4]]);
+
+            // gcc does not support _mm256_set_m128
+            // __m256 yf = _mm256_set_m128(_mm256_cvtpd_ps(yd1), _mm256_cvtpd_ps(yd0));
+            __m256 yf;
+            yf = _mm256_insertf128_ps(yf, _mm256_cvtpd_ps(yd0), 0);
+            yf = _mm256_insertf128_ps(yf, _mm256_cvtpd_ps(yd1), 1);
+
+            // _mm256_set_m128i(xi1, xi0)
+            __m256i temp;
+            temp = _mm256_inserti128_si256(temp, xi0, 0);
+            temp = _mm256_inserti128_si256(temp, xi1, 1);
+
+            yf = _mm256_mul_ps(yf, _mm256_castsi256_ps(_mm256_slli_epi32(temp, 23)));
+
+            __m256 zf = _mm256_add_ps(xf, mA1);
+
+#if CV_FMA3
+            zf = _mm256_fmadd_ps(zf, xf, mA2);
+            zf = _mm256_fmadd_ps(zf, xf, mA3);
+            zf = _mm256_fmadd_ps(zf, xf, mA4);
+#else
+            zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA2);
+            zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA3);
+            zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA4);
+#endif
+            zf = _mm256_mul_ps(zf, yf);
+
+            if( y_aligned )
+            {
+                _mm256_store_ps(y + i, zf);
+            }
+            else
+            {
+                _mm256_storeu_ps(y + i, zf);
+            }
+        }
+    }
+#elif CV_SSE2
     if( n >= 8 )
     {
         static const __m128d prescale2 = _mm_set1_pd(exp_prescale);
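For reference (not part of the patch): a rough, self-contained scalar sketch of the decomposition the new AVX2 loop vectorizes. The limit value and the final exp2 calls below are illustrative stand-ins; the real code clamps against exp_max_val/exp_prescale, reads 2^(j/64) from expTab, and evaluates the A1..A4 polynomial for the remainder. Only the overall structure (prescale by 2^EXPTAB_SCALE/ln 2, split into integer and fractional parts, build 2^k from raw exponent bits, table lookup, polynomial) matches the intrinsics above.

// Illustrative only, not OpenCV code. Assumes EXPTAB_SCALE == 6.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>

static float exp32f_scalar_sketch(float xin)
{
    const int SCALE = 6, MASK = (1 << SCALE) - 1;
    const double prescale = 1.4426950408889634074 * (1 << SCALE); // 2^SCALE / ln(2)
    const double lim = 2000.0;                      // stand-in for exp_max_val/exp_prescale
    double xd = std::min(std::max((double)xin, -lim), lim) * prescale;
    int    xi = (int)std::lrint(xd);                // integer part (cvtpd_epi32 above)
    double f  = (xd - xi) / (1 << SCALE);           // remainder in log2 units (postscale)
    int    e  = std::min(std::max((xi >> SCALE) + 127, 0), 255); // biased, clamped exponent
    uint32_t bits = (uint32_t)e << 23;              // 2^(xi >> SCALE) from raw float bits
    float p2k;
    std::memcpy(&p2k, &bits, sizeof(p2k));
    double tab = std::exp2((xi & MASK) / 64.0);     // stand-in for expTab[xi & EXPTAB_MASK]
    return (float)(p2k * tab * std::exp2(f));       // real code: (((f+A1)*f+A2)*f+A3)*f+A4
}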