Skip to content

Commit a8763c1

Browse files
author
Woody Chow
committed
Optimize exp32f with AVX2
1 parent a83a1ca commit a8763c1

File tree

1 file changed

+90
-1
lines changed

1 file changed

+90
-1
lines changed

modules/core/src/mathfuncs_core.cpp

Lines changed: 90 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -551,7 +551,96 @@ void exp32f( const float *_x, float *y, int n )
551551
const Cv32suf* x = (const Cv32suf*)_x;
552552
Cv32suf buf[4];
553553

554-
#if CV_SSE2
554+
#if CV_AVX2
555+
if( n >= 8 )
556+
{
557+
static const __m256d prescale4 = _mm256_set1_pd(exp_prescale);
558+
static const __m256 postscale8 = _mm256_set1_ps((float)exp_postscale);
559+
static const __m128 maxval4 = _mm_set1_ps((float)(exp_max_val/exp_prescale));
560+
static const __m128 minval4 = _mm_set1_ps((float)(-exp_max_val/exp_prescale));
561+
562+
static const __m256 mA1 = _mm256_set1_ps(A1);
563+
static const __m256 mA2 = _mm256_set1_ps(A2);
564+
static const __m256 mA3 = _mm256_set1_ps(A3);
565+
static const __m256 mA4 = _mm256_set1_ps(A4);
566+
bool y_aligned = (size_t)(void*)y % 32 == 0;
567+
568+
ushort CV_DECL_ALIGNED(32) tab_idx[16];
569+
570+
for( ; i <= n - 8; i += 8 )
571+
{
572+
__m256 xf;
573+
__m128i xi0, xi1;
574+
575+
__m256d xd0 = _mm256_cvtps_pd(_mm_min_ps(_mm_max_ps(_mm_loadu_ps(&x[i].f), minval4), maxval4));
576+
__m256d xd1 = _mm256_cvtps_pd(_mm_min_ps(_mm_max_ps(_mm_loadu_ps(&x[i+4].f), minval4), maxval4));
577+
578+
xd0 = _mm256_mul_pd(xd0, prescale4);
579+
xd1 = _mm256_mul_pd(xd1, prescale4);
580+
581+
xi0 = _mm256_cvtpd_epi32(xd0);
582+
xi1 = _mm256_cvtpd_epi32(xd1);
583+
584+
xd0 = _mm256_sub_pd(xd0, _mm256_cvtepi32_pd(xi0));
585+
xd1 = _mm256_sub_pd(xd1, _mm256_cvtepi32_pd(xi1));
586+
587+
// gcc does not support _mm256_set_m128
588+
//xf = _mm256_set_m128(_mm256_cvtpd_ps(xd1), _mm256_cvtpd_ps(xd0));
589+
xf = _mm256_insertf128_ps(xf, _mm256_cvtpd_ps(xd0), 0);
590+
xf = _mm256_insertf128_ps(xf, _mm256_cvtpd_ps(xd1), 1);
591+
592+
xf = _mm256_mul_ps(xf, postscale8);
593+
594+
xi0 = _mm_packs_epi32(xi0, xi1);
595+
596+
_mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi16(EXPTAB_MASK)));
597+
598+
xi0 = _mm_add_epi16(_mm_srai_epi16(xi0, EXPTAB_SCALE), _mm_set1_epi16(127));
599+
xi0 = _mm_max_epi16(xi0, _mm_setzero_si128());
600+
xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(255));
601+
xi1 = _mm_unpackhi_epi16(xi0, _mm_setzero_si128());
602+
xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128());
603+
604+
__m256d yd0 = _mm256_set_pd(expTab[tab_idx[3]], expTab[tab_idx[2]], expTab[tab_idx[1]], expTab[tab_idx[0]]);
605+
__m256d yd1 = _mm256_set_pd(expTab[tab_idx[7]], expTab[tab_idx[6]], expTab[tab_idx[5]], expTab[tab_idx[4]]);
606+
607+
// gcc does not support _mm256_set_m128
608+
//__m256 yf = _mm256_set_m128(_mm256_cvtpd_ps(yd1), _mm256_cvtpd_ps(yd0));
609+
__m256 yf;
610+
yf = _mm256_insertf128_ps(yf, _mm256_cvtpd_ps(yd0), 0);
611+
yf = _mm256_insertf128_ps(yf, _mm256_cvtpd_ps(yd1), 1);
612+
613+
//_mm256_set_m128i(xi1, xi0)
614+
__m256i temp;
615+
temp = _mm256_inserti128_si256(temp, xi0, 0);
616+
temp = _mm256_inserti128_si256(temp, xi1, 1);
617+
618+
yf = _mm256_mul_ps(yf, _mm256_castsi256_ps(_mm256_slli_epi32(temp, 23)));
619+
620+
__m256 zf = _mm256_add_ps(xf, mA1);
621+
622+
#if CV_FMA3
623+
zf = _mm256_fmadd_ps(zf, xf, mA2);
624+
zf = _mm256_fmadd_ps(zf, xf, mA3);
625+
zf = _mm256_fmadd_ps(zf, xf, mA4);
626+
#else
627+
zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA2);
628+
zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA3);
629+
zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA4);
630+
#endif
631+
zf = _mm256_mul_ps(zf, yf);
632+
633+
if( y_aligned )
634+
{
635+
_mm256_store_ps(y + i, zf);
636+
}
637+
else
638+
{
639+
_mm256_storeu_ps(y + i, zf);
640+
}
641+
}
642+
}
643+
#elif CV_SSE2
555644
if( n >= 8 )
556645
{
557646
static const __m128d prescale2 = _mm_set1_pd(exp_prescale);

0 commit comments

Comments (0)