
Commit e72e34e

Merge pull request opencv#8843 from terfendail:resizenn_patch
2 parents: a3189e3 + 2de1aac

File tree

1 file changed (+330, -2 lines)

modules/imgproc/src/imgwarp.cpp

Lines changed: 330 additions & 2 deletions
@@ -417,6 +417,300 @@ class resizeNNInvoker :
     resizeNNInvoker& operator=(const resizeNNInvoker&);
 };
 
+#if CV_AVX2
+class resizeNNInvokerAVX4 :
+    public ParallelLoopBody
+{
+public:
+    resizeNNInvokerAVX4(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
+        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
+        ify(_ify)
+    {
+    }
+
+#pragma optimization_parameter target_arch=AVX
+    virtual void operator() (const Range& range) const
+    {
+        Size ssize = src.size(), dsize = dst.size();
+        int y, x, pix_size = (int)src.elemSize();
+        int width = dsize.width;
+        int avxWidth = width - (width & 0x7);
+        const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1);
+        if(((int64)(dst.data + dst.step) & 0x1f) == 0)
+        {
+            for(y = range.start; y < range.end; y++)
+            {
+                uchar* D = dst.data + dst.step*y;
+                uchar* Dstart = D;
+                int sy = std::min(cvFloor(y*ify), ssize.height-1);
+                const uchar* S = src.data + sy*src.step;
+#pragma unroll(4)
+                for(x = 0; x < avxWidth; x += 8)
+                {
+                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
+                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
+                    __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
+                    _mm256_maskstore_epi32((int*)D, mask, pixels);
+                    D += 32;
+                }
+                for(; x < width; x++)
+                {
+                    *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
+                }
+            }
+        }
+        else
+        {
+            for(y = range.start; y < range.end; y++)
+            {
+                uchar* D = dst.data + dst.step*y;
+                uchar* Dstart = D;
+                int sy = std::min(cvFloor(y*ify), ssize.height-1);
+                const uchar* S = src.data + sy*src.step;
+#pragma unroll(4)
+                for(x = 0; x < avxWidth; x += 8)
+                {
+                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
+                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
+                    __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
+                    _mm256_storeu_si256((__m256i*)D, pixels);
+                    D += 32;
+                }
+                for(; x < width; x++)
+                {
+                    *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
+                }
+            }
+        }
+    }
+
+private:
+    const Mat src;
+    Mat dst;
+    int* x_ofs, pix_size4;
+    double ify;
+
+    resizeNNInvokerAVX4(const resizeNNInvokerAVX4&);
+    resizeNNInvokerAVX4& operator=(const resizeNNInvokerAVX4&);
+};
+
+class resizeNNInvokerAVX2 :
+    public ParallelLoopBody
+{
+public:
+    resizeNNInvokerAVX2(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
+        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
+        ify(_ify)
+    {
+    }
+
+#pragma optimization_parameter target_arch=AVX
+    virtual void operator() (const Range& range) const
+    {
+        Size ssize = src.size(), dsize = dst.size();
+        int y, x, pix_size = (int)src.elemSize();
+        int width = dsize.width;
+        //int avxWidth = (width - 1) - ((width - 1) & 0x7);
+        int avxWidth = width - (width & 0xf);
+        const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1);
+        const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
+                                                                         15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
+        const __m256i CV_DECL_ALIGNED(64) permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
+        const __m256i CV_DECL_ALIGNED(64) shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,
+                                                                               13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2);
+        if(((int64)(dst.data + dst.step) & 0x1f) == 0)
+        {
+            for(y = range.start; y < range.end; y++)
+            {
+                uchar* D = dst.data + dst.step*y;
+                uchar* Dstart = D;
+                int sy = std::min(cvFloor(y*ify), ssize.height-1);
+                const uchar* S = src.data + sy*src.step;
+                const uchar* S2 = S - 2;
+#pragma unroll(4)
+                for(x = 0; x < avxWidth; x += 16)
+                {
+                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
+                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
+                    __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
+                    const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
+                    __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2);
+                    __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
+                    __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);
+
+                    __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
+                    __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
+                    _mm256_maskstore_epi32((int*)D, mask, ints_permuted);
+                    D += 32;
+                }
+                for(; x < width; x++)
+                {
+                    *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
+                }
+
+            }
+        }
+        else
+        {
+            for(y = range.start; y < range.end; y++)
+            {
+                uchar* D = dst.data + dst.step*y;
+                uchar* Dstart = D;
+                int sy = std::min(cvFloor(y*ify), ssize.height-1);
+                const uchar* S = src.data + sy*src.step;
+                const uchar* S2 = S - 2;
+#pragma unroll(4)
+                for(x = 0; x < avxWidth; x += 16)
+                {
+                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
+                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
+                    __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
+                    const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
+                    __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2);
+                    __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
+                    __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);
+
+                    __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
+                    __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
+                    _mm256_storeu_si256((__m256i*)D, ints_permuted);
+                    D += 32;
+                }
+                for(; x < width; x++)
+                {
+                    *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
+                }
+            }
+        }
+    }
+
+private:
+    const Mat src;
+    Mat dst;
+    int* x_ofs, pix_size4;
+    double ify;
+
+    resizeNNInvokerAVX2(const resizeNNInvokerAVX2&);
+    resizeNNInvokerAVX2& operator=(const resizeNNInvokerAVX2&);
+};
+#endif
+
+#if CV_SSE4_1
+class resizeNNInvokerSSE2 :
+    public ParallelLoopBody
+{
+public:
+    resizeNNInvokerSSE2(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
+        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
+        ify(_ify)
+    {
+    }
+
+#pragma optimization_parameter target_arch=SSE4.2
+    virtual void operator() (const Range& range) const
+    {
+        Size ssize = src.size(), dsize = dst.size();
+        int y, x, pix_size = (int)src.elemSize();
+        int width = dsize.width;
+        int sseWidth = width - (width & 0x7);
+        for(y = range.start; y < range.end; y++)
+        {
+            uchar* D = dst.data + dst.step*y;
+            uchar* Dstart = D;
+            int sy = std::min(cvFloor(y*ify), ssize.height-1);
+            const uchar* S = src.data + sy*src.step;
+            __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0);
+            for(x = 0; x < sseWidth; x += 8)
+            {
+                ushort imm = *(ushort*)(S + x_ofs[x + 0]);
+                pixels = _mm_insert_epi16(pixels, imm, 0);
+                imm = *(ushort*)(S + x_ofs[x + 1]);
+                pixels = _mm_insert_epi16(pixels, imm, 1);
+                imm = *(ushort*)(S + x_ofs[x + 2]);
+                pixels = _mm_insert_epi16(pixels, imm, 2);
+                imm = *(ushort*)(S + x_ofs[x + 3]);
+                pixels = _mm_insert_epi16(pixels, imm, 3);
+                imm = *(ushort*)(S + x_ofs[x + 4]);
+                pixels = _mm_insert_epi16(pixels, imm, 4);
+                imm = *(ushort*)(S + x_ofs[x + 5]);
+                pixels = _mm_insert_epi16(pixels, imm, 5);
+                imm = *(ushort*)(S + x_ofs[x + 6]);
+                pixels = _mm_insert_epi16(pixels, imm, 6);
+                imm = *(ushort*)(S + x_ofs[x + 7]);
+                pixels = _mm_insert_epi16(pixels, imm, 7);
+                _mm_storeu_si128((__m128i*)D, pixels);
+                D += 16;
+            }
+            for(; x < width; x++)
+            {
+                *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
+            }
+        }
+    }
+
+private:
+    const Mat src;
+    Mat dst;
+    int* x_ofs, pix_size4;
+    double ify;
+
+    resizeNNInvokerSSE2(const resizeNNInvokerSSE2&);
+    resizeNNInvokerSSE2& operator=(const resizeNNInvokerSSE2&);
+};
+
+class resizeNNInvokerSSE4 :
+    public ParallelLoopBody
+{
+public:
+    resizeNNInvokerSSE4(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
+        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
+        ify(_ify)
+    {
+    }
+#pragma optimization_parameter target_arch=SSE4.2
+    virtual void operator() (const Range& range) const
+    {
+        Size ssize = src.size(), dsize = dst.size();
+        int y, x, pix_size = (int)src.elemSize();
+        int width = dsize.width;
+        int sseWidth = width - (width & 0x3);
+        for(y = range.start; y < range.end; y++)
+        {
+            uchar* D = dst.data + dst.step*y;
+            uchar* Dstart = D;
+            int sy = std::min(cvFloor(y*ify), ssize.height-1);
+            const uchar* S = src.data + sy*src.step;
+            __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0);
+            for(x = 0; x < sseWidth; x += 4)
+            {
+                int imm = *(int*)(S + x_ofs[x + 0]);
+                pixels = _mm_insert_epi32(pixels, imm, 0);
+                imm = *(int*)(S + x_ofs[x + 1]);
+                pixels = _mm_insert_epi32(pixels, imm, 1);
+                imm = *(int*)(S + x_ofs[x + 2]);
+                pixels = _mm_insert_epi32(pixels, imm, 2);
+                imm = *(int*)(S + x_ofs[x + 3]);
+                pixels = _mm_insert_epi32(pixels, imm, 3);
+                _mm_storeu_si128((__m128i*)D, pixels);
+                D += 16;
+            }
+            for(; x < width; x++)
+            {
+                *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
+            }
+        }
+    }
+
+private:
+    const Mat src;
+    Mat dst;
+    int* x_ofs, pix_size4;
+    double ify;
+
+    resizeNNInvokerSSE4(const resizeNNInvokerSSE4&);
+    resizeNNInvokerSSE4& operator=(const resizeNNInvokerSSE4&);
+};
+#endif
+
 static void
 resizeNN( const Mat& src, Mat& dst, double fx, double fy )
 {
@@ -435,8 +729,42 @@ resizeNN( const Mat& src, Mat& dst, double fx, double fy )
     }
 
     Range range(0, dsize.height);
-    resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify);
-    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
+#if CV_AVX2
+    if(checkHardwareSupport(CV_CPU_AVX2) && ((pix_size == 2) || (pix_size == 4)))
+    {
+        if(pix_size == 2)
+        {
+            resizeNNInvokerAVX2 invoker(src, dst, x_ofs, pix_size4, ify);
+            parallel_for_(range, invoker, dst.total()/(double)(1<<16));
+        }
+        else if (pix_size == 4)
+        {
+            resizeNNInvokerAVX4 invoker(src, dst, x_ofs, pix_size4, ify);
+            parallel_for_(range, invoker, dst.total()/(double)(1<<16));
+        }
+    }
+    else
+#endif
+#if CV_SSE4_1
+    if(checkHardwareSupport(CV_CPU_SSE4_1) && ((pix_size == 2) || (pix_size == 4)))
+    {
+        if(pix_size == 2)
+        {
+            resizeNNInvokerSSE2 invoker(src, dst, x_ofs, pix_size4, ify);
+            parallel_for_(range, invoker, dst.total()/(double)(1<<16));
+        }
+        else if(pix_size == 4)
+        {
+            resizeNNInvokerSSE4 invoker(src, dst, x_ofs, pix_size4, ify);
+            parallel_for_(range, invoker, dst.total()/(double)(1<<16));
+        }
+    }
+    else
+#endif
+    {
+        resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify);
+        parallel_for_(range, invoker, dst.total()/(double)(1<<16));
+    }
 }
 
 
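For context (not part of the commit): cv::resize() with INTER_NEAREST funnels into resizeNN(), and the new invokers are selected when checkHardwareSupport() reports AVX2 or SSE4.1 at runtime and the source element size is 2 or 4 bytes; per the dispatch above, the "2" suffix covers 2-byte elements and the "4" suffix covers 4-byte elements. A minimal sketch of calls that would exercise both paths, assuming an OpenCV build with CV_AVX2/CV_SSE4_1 enabled; sizes and types below are illustrative only:

    // Illustrative sketch -- not part of the diff. Assumes the build enables the
    // AVX2/SSE4.1 code paths and the CPU reports the feature at runtime.
    #include <opencv2/imgproc.hpp>

    int main()
    {
        // elemSize() == 4 (e.g. CV_8UC4): resizeNNInvokerAVX4 / resizeNNInvokerSSE4
        cv::Mat rgba(480, 640, CV_8UC4, cv::Scalar(0, 128, 255, 255));
        // elemSize() == 2 (e.g. CV_16UC1): resizeNNInvokerAVX2 / resizeNNInvokerSSE2
        cv::Mat gray16(480, 640, CV_16UC1, cv::Scalar(1000));

        cv::Mat up4, up2;
        cv::resize(rgba,   up4, cv::Size(1280, 960), 0, 0, cv::INTER_NEAREST);
        cv::resize(gray16, up2, cv::Size(1280, 960), 0, 0, cv::INTER_NEAREST);
        return 0;
    }

Any other element size (for example 3-byte CV_8UC3), or a CPU without the required instruction sets, still falls through to the generic resizeNNInvoker in the final else branch.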