@@ -5523,6 +5523,9 @@ class WarpAffineInvoker :
         const int AB_BITS = MAX(10, (int)INTER_BITS);
         const int AB_SCALE = 1 << AB_BITS;
         int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
+    #if CV_AVX2
+        bool useAVX2 = checkHardwareSupport(CV_CPU_AVX2);
+    #endif
     #if CV_SSE2
         bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
     #endif
@@ -5603,6 +5606,42 @@ class WarpAffineInvoker :
             {
                 short* alpha = A + y1*bw;
                 x1 = 0;
+            #if CV_AVX2
+                if( useAVX2 )
+                {
+                    __m256i fxy_mask = _mm256_set1_epi32(INTER_TAB_SIZE - 1);
+                    __m256i XX = _mm256_set1_epi32(X0), YY = _mm256_set1_epi32(Y0);
+                    for( ; x1 <= bw - 16; x1 += 16 )
+                    {
+                        __m256i tx0, tx1, ty0, ty1;
+                        tx0 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(adelta + x + x1)), XX);
+                        ty0 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(bdelta + x + x1)), YY);
+                        tx1 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(adelta + x + x1 + 8)), XX);
+                        ty1 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(bdelta + x + x1 + 8)), YY);
+
+                        tx0 = _mm256_srai_epi32(tx0, AB_BITS - INTER_BITS);
+                        ty0 = _mm256_srai_epi32(ty0, AB_BITS - INTER_BITS);
+                        tx1 = _mm256_srai_epi32(tx1, AB_BITS - INTER_BITS);
+                        ty1 = _mm256_srai_epi32(ty1, AB_BITS - INTER_BITS);
+
+                        __m256i fx_ = _mm256_packs_epi32(_mm256_and_si256(tx0, fxy_mask),
+                                                         _mm256_and_si256(tx1, fxy_mask));
+                        __m256i fy_ = _mm256_packs_epi32(_mm256_and_si256(ty0, fxy_mask),
+                                                         _mm256_and_si256(ty1, fxy_mask));
+                        tx0 = _mm256_packs_epi32(_mm256_srai_epi32(tx0, INTER_BITS),
+                                                 _mm256_srai_epi32(tx1, INTER_BITS));
+                        ty0 = _mm256_packs_epi32(_mm256_srai_epi32(ty0, INTER_BITS),
+                                                 _mm256_srai_epi32(ty1, INTER_BITS));
+                        fx_ = _mm256_adds_epi16(fx_, _mm256_slli_epi16(fy_, INTER_BITS));
+                        fx_ = _mm256_permute4x64_epi64(fx_, (3 << 6) + (1 << 4) + (2 << 2) + 0);
+
+                        _mm256_storeu_si256((__m256i*)(xy + x1*2), _mm256_unpacklo_epi16(tx0, ty0));
+                        _mm256_storeu_si256((__m256i*)(xy + x1*2 + 16), _mm256_unpackhi_epi16(tx0, ty0));
+                        _mm256_storeu_si256((__m256i*)(alpha + x1), fx_);
+                    }
+                    _mm256_zeroupper();
+                }
+            #endif
             #if CV_SSE2
                 if( useSSE2 )
                 {
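
Note: the AVX2 branch added above computes 16 output pixels per iteration. Below is a minimal scalar sketch of the equivalent per-pixel arithmetic, written as a hypothetical standalone helper rather than code taken from this patch. It assumes pointers already offset to the current row block, the default INTER_BITS = 5, and it uses plain casts where the vector code relies on the saturating _mm256_packs_epi32 / _mm256_adds_epi16 intrinsics.

// Hypothetical scalar sketch of the per-pixel work the AVX2 loop vectorizes
// (illustration only, not part of the patch).
static void warp_row_coords_scalar(const int* adelta, const int* bdelta,
                                   int X0, int Y0, int bw,
                                   short* xy, short* alpha)
{
    const int INTER_BITS = 5;                    // OpenCV default
    const int INTER_TAB_SIZE = 1 << INTER_BITS;  // 32
    const int AB_BITS = 10;                      // MAX(10, INTER_BITS)
    for( int x1 = 0; x1 < bw; x1++ )
    {
        // fixed-point source coordinates, kept with INTER_BITS fractional bits
        int X = (X0 + adelta[x1]) >> (AB_BITS - INTER_BITS);
        int Y = (Y0 + bdelta[x1]) >> (AB_BITS - INTER_BITS);
        xy[x1*2]   = (short)(X >> INTER_BITS);   // integer part of x
        xy[x1*2+1] = (short)(Y >> INTER_BITS);   // integer part of y
        // packed fractional offsets, used as an index into the interpolation tables
        alpha[x1]  = (short)((Y & (INTER_TAB_SIZE - 1))*INTER_TAB_SIZE +
                             (X & (INTER_TAB_SIZE - 1)));
    }
}

In the vector version, _mm256_packs_epi32 narrows within each 128-bit lane, so the _mm256_permute4x64_epi64 with immediate (3 << 6) + (1 << 4) + (2 << 2) + 0 (i.e. 0xD8) reorders the 64-bit chunks of fx_ back into linear pixel order before the store to alpha.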