Skip to content

Commit 833832a

Browse files
committed
Merge pull request opencv#8391 from woodychow:warpAffine_avx2
2 parents e00d052 + 9a29fc2 commit 833832a

File tree

1 file changed

+39
-0
lines changed

1 file changed

+39
-0
lines changed

modules/imgproc/src/imgwarp.cpp

Lines changed: 39 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5523,6 +5523,9 @@ class WarpAffineInvoker :
55235523
const int AB_BITS = MAX(10, (int)INTER_BITS);
55245524
const int AB_SCALE = 1 << AB_BITS;
55255525
int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
5526+
#if CV_AVX2
5527+
bool useAVX2 = checkHardwareSupport(CV_CPU_AVX2);
5528+
#endif
55265529
#if CV_SSE2
55275530
bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
55285531
#endif
@@ -5603,6 +5606,42 @@ class WarpAffineInvoker :
56035606
{
56045607
short* alpha = A + y1*bw;
56055608
x1 = 0;
5609+
#if CV_AVX2
5610+
if ( useAVX2 )
5611+
{
5612+
__m256i fxy_mask = _mm256_set1_epi32(INTER_TAB_SIZE - 1);
5613+
__m256i XX = _mm256_set1_epi32(X0), YY = _mm256_set1_epi32(Y0);
5614+
for( ; x1 <= bw - 16; x1 += 16 )
5615+
{
5616+
__m256i tx0, tx1, ty0, ty1;
5617+
tx0 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(adelta + x + x1)), XX);
5618+
ty0 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(bdelta + x + x1)), YY);
5619+
tx1 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(adelta + x + x1 + 8)), XX);
5620+
ty1 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(bdelta + x + x1 + 8)), YY);
5621+
5622+
tx0 = _mm256_srai_epi32(tx0, AB_BITS - INTER_BITS);
5623+
ty0 = _mm256_srai_epi32(ty0, AB_BITS - INTER_BITS);
5624+
tx1 = _mm256_srai_epi32(tx1, AB_BITS - INTER_BITS);
5625+
ty1 = _mm256_srai_epi32(ty1, AB_BITS - INTER_BITS);
5626+
5627+
__m256i fx_ = _mm256_packs_epi32(_mm256_and_si256(tx0, fxy_mask),
5628+
_mm256_and_si256(tx1, fxy_mask));
5629+
__m256i fy_ = _mm256_packs_epi32(_mm256_and_si256(ty0, fxy_mask),
5630+
_mm256_and_si256(ty1, fxy_mask));
5631+
tx0 = _mm256_packs_epi32(_mm256_srai_epi32(tx0, INTER_BITS),
5632+
_mm256_srai_epi32(tx1, INTER_BITS));
5633+
ty0 = _mm256_packs_epi32(_mm256_srai_epi32(ty0, INTER_BITS),
5634+
_mm256_srai_epi32(ty1, INTER_BITS));
5635+
fx_ = _mm256_adds_epi16(fx_, _mm256_slli_epi16(fy_, INTER_BITS));
5636+
fx_ = _mm256_permute4x64_epi64(fx_, (3 << 6) + (1 << 4) + (2 << 2) + 0);
5637+
5638+
_mm256_storeu_si256((__m256i*)(xy + x1*2), _mm256_unpacklo_epi16(tx0, ty0));
5639+
_mm256_storeu_si256((__m256i*)(xy + x1*2 + 16), _mm256_unpackhi_epi16(tx0, ty0));
5640+
_mm256_storeu_si256((__m256i*)(alpha + x1), fx_);
5641+
}
5642+
_mm256_zeroupper();
5643+
}
5644+
#endif
56065645
#if CV_SSE2
56075646
if( useSSE2 )
56085647
{

0 commit comments

Comments
 (0)