@@ -417,6 +417,300 @@ class resizeNNInvoker :
     resizeNNInvoker& operator=(const resizeNNInvoker&);
 };
 
+#if CV_AVX2
+class resizeNNInvokerAVX4 :
+    public ParallelLoopBody
+{
+public:
+    resizeNNInvokerAVX4(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
+        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
+        ify(_ify)
+    {
+    }
+
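+    // Nearest-neighbour resize of 4-byte pixels: each iteration gathers eight
+    // 32-bit pixels from the source row at the precomputed byte offsets in
+    // x_ofs and writes them with a single 256-bit store; leftover columns are
+    // copied with scalar loads. Rows that pass the 32-byte alignment check use
+    // _mm256_maskstore_epi32 with an all-ones mask, the rest _mm256_storeu_si256.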
+#pragma optimization_parameter target_arch=AVX
+    virtual void operator() (const Range& range) const
+    {
+        Size ssize = src.size(), dsize = dst.size();
+        int y, x, pix_size = (int)src.elemSize();
+        int width = dsize.width;
+        int avxWidth = width - (width & 0x7);
+        const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1);
+        if(((int64)(dst.data + dst.step) & 0x1f) == 0)
+        {
+            for(y = range.start; y < range.end; y++)
+            {
+                uchar* D = dst.data + dst.step*y;
+                uchar* Dstart = D;
+                int sy = std::min(cvFloor(y*ify), ssize.height-1);
+                const uchar* S = src.data + sy*src.step;
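+                // x_ofs holds byte offsets into the source row (the scalar tail
+                // below uses S + x_ofs[x] directly), so the gather uses a scale of 1.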
+#pragma unroll(4)
+                for(x = 0; x < avxWidth; x += 8)
+                {
+                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
+                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
+                    __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
+                    _mm256_maskstore_epi32((int*)D, mask, pixels);
+                    D += 32;
+                }
+                for(; x < width; x++)
+                {
+                    *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
+                }
+            }
+        }
+        else
+        {
+            for(y = range.start; y < range.end; y++)
+            {
+                uchar* D = dst.data + dst.step*y;
+                uchar* Dstart = D;
+                int sy = std::min(cvFloor(y*ify), ssize.height-1);
+                const uchar* S = src.data + sy*src.step;
+#pragma unroll(4)
+                for(x = 0; x < avxWidth; x += 8)
+                {
+                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
+                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
+                    __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
+                    _mm256_storeu_si256((__m256i*)D, pixels);
+                    D += 32;
+                }
+                for(; x < width; x++)
+                {
+                    *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
+                }
+            }
+        }
+    }
+
+private:
+    const Mat src;
+    Mat dst;
+    int* x_ofs, pix_size4;
+    double ify;
+
+    resizeNNInvokerAVX4(const resizeNNInvokerAVX4&);
+    resizeNNInvokerAVX4& operator=(const resizeNNInvokerAVX4&);
+};
+
+class resizeNNInvokerAVX2 :
+    public ParallelLoopBody
+{
+public:
+    resizeNNInvokerAVX2(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
+        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
+        ify(_ify)
+    {
+    }
+
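+    // Nearest-neighbour resize of 2-byte pixels. AVX2 has no 16-bit gather, so
+    // each iteration does two 32-bit gathers: one from S for pixels x..x+7
+    // (pixel in the low 16 bits of each lane) and one from S - 2 for pixels
+    // x+8..x+15 (pixel in the high 16 bits of each lane). A 16-bit blend, a
+    // byte shuffle and a cross-lane permute then arrange the sixteen pixels in
+    // output order for a single 256-bit store; the tail is copied scalar.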
+#pragma optimization_parameter target_arch=AVX
+    virtual void operator() (const Range& range) const
+    {
+        Size ssize = src.size(), dsize = dst.size();
+        int y, x, pix_size = (int)src.elemSize();
+        int width = dsize.width;
+        //int avxWidth = (width - 1) - ((width - 1) & 0x7);
+        int avxWidth = width - (width & 0xf);
+        const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1);
+        const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
+                                                                         15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
+        const __m256i CV_DECL_ALIGNED(64) permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
+        const __m256i CV_DECL_ALIGNED(64) shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,
+                                                                               13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2);
+        if(((int64)(dst.data + dst.step) & 0x1f) == 0)
+        {
+            for(y = range.start; y < range.end; y++)
+            {
+                uchar* D = dst.data + dst.step*y;
+                uchar* Dstart = D;
+                int sy = std::min(cvFloor(y*ify), ssize.height-1);
+                const uchar* S = src.data + sy*src.step;
+                const uchar* S2 = S - 2;
+#pragma unroll(4)
+                for(x = 0; x < avxWidth; x += 16)
+                {
+                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
+                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
+                    __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
+                    const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
+                    __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2);
+                    __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
+                    __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);
+
+                    __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
+                    __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
+                    _mm256_maskstore_epi32((int*)D, mask, ints_permuted);
+                    D += 32;
+                }
+                for(; x < width; x++)
+                {
+                    *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
+                }
+
+            }
+        }
+        else
+        {
+            for(y = range.start; y < range.end; y++)
+            {
+                uchar* D = dst.data + dst.step*y;
+                uchar* Dstart = D;
+                int sy = std::min(cvFloor(y*ify), ssize.height-1);
+                const uchar* S = src.data + sy*src.step;
+                const uchar* S2 = S - 2;
+#pragma unroll(4)
+                for(x = 0; x < avxWidth; x += 16)
+                {
+                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
+                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
+                    __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
+                    const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
+                    __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2);
+                    __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
+                    __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);
+
+                    __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
+                    __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
+                    _mm256_storeu_si256((__m256i*)D, ints_permuted);
+                    D += 32;
+                }
+                for(; x < width; x++)
+                {
+                    *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
+                }
+            }
+        }
+    }
+
+private:
+    const Mat src;
+    Mat dst;
+    int* x_ofs, pix_size4;
+    double ify;
+
+    resizeNNInvokerAVX2(const resizeNNInvokerAVX2&);
+    resizeNNInvokerAVX2& operator=(const resizeNNInvokerAVX2&);
+};
+#endif
+
+#if CV_SSE4_1
+class resizeNNInvokerSSE2 :
+    public ParallelLoopBody
+{
+public:
+    resizeNNInvokerSSE2(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
+        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
+        ify(_ify)
+    {
+    }
+
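+    // Nearest-neighbour resize of 2-byte pixels when only SSE4.1 is available:
+    // there is no gather instruction, so eight pixels are fetched with scalar
+    // loads, packed into a 128-bit register via _mm_insert_epi16 and written
+    // with one unaligned store; the remaining columns are copied scalar.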
+#pragma optimization_parameter target_arch=SSE4.2
+    virtual void operator() (const Range& range) const
+    {
+        Size ssize = src.size(), dsize = dst.size();
+        int y, x, pix_size = (int)src.elemSize();
+        int width = dsize.width;
+        int sseWidth = width - (width & 0x7);
+        for(y = range.start; y < range.end; y++)
+        {
+            uchar* D = dst.data + dst.step*y;
+            uchar* Dstart = D;
+            int sy = std::min(cvFloor(y*ify), ssize.height-1);
+            const uchar* S = src.data + sy*src.step;
+            __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0);
+            for(x = 0; x < sseWidth; x += 8)
+            {
+                ushort imm = *(ushort*)(S + x_ofs[x + 0]);
+                pixels = _mm_insert_epi16(pixels, imm, 0);
+                imm = *(ushort*)(S + x_ofs[x + 1]);
+                pixels = _mm_insert_epi16(pixels, imm, 1);
+                imm = *(ushort*)(S + x_ofs[x + 2]);
+                pixels = _mm_insert_epi16(pixels, imm, 2);
+                imm = *(ushort*)(S + x_ofs[x + 3]);
+                pixels = _mm_insert_epi16(pixels, imm, 3);
+                imm = *(ushort*)(S + x_ofs[x + 4]);
+                pixels = _mm_insert_epi16(pixels, imm, 4);
+                imm = *(ushort*)(S + x_ofs[x + 5]);
+                pixels = _mm_insert_epi16(pixels, imm, 5);
+                imm = *(ushort*)(S + x_ofs[x + 6]);
+                pixels = _mm_insert_epi16(pixels, imm, 6);
+                imm = *(ushort*)(S + x_ofs[x + 7]);
+                pixels = _mm_insert_epi16(pixels, imm, 7);
+                _mm_storeu_si128((__m128i*)D, pixels);
+                D += 16;
+            }
+            for(; x < width; x++)
+            {
+                *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
+            }
+        }
+    }
+
+private:
+    const Mat src;
+    Mat dst;
+    int* x_ofs, pix_size4;
+    double ify;
+
+    resizeNNInvokerSSE2(const resizeNNInvokerSSE2&);
+    resizeNNInvokerSSE2& operator=(const resizeNNInvokerSSE2&);
+};
+
+class resizeNNInvokerSSE4 :
+    public ParallelLoopBody
+{
+public:
+    resizeNNInvokerSSE4(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
+        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
+        ify(_ify)
+    {
+    }
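+    // Nearest-neighbour resize of 4-byte pixels: four pixels per iteration are
+    // packed with _mm_insert_epi32 (SSE4.1) and written with one unaligned
+    // 128-bit store; the tail is copied scalar.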
+#pragma optimization_parameter target_arch=SSE4.2
+    virtual void operator() (const Range& range) const
+    {
+        Size ssize = src.size(), dsize = dst.size();
+        int y, x, pix_size = (int)src.elemSize();
+        int width = dsize.width;
+        int sseWidth = width - (width & 0x3);
+        for(y = range.start; y < range.end; y++)
+        {
+            uchar* D = dst.data + dst.step*y;
+            uchar* Dstart = D;
+            int sy = std::min(cvFloor(y*ify), ssize.height-1);
+            const uchar* S = src.data + sy*src.step;
+            __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0);
+            for(x = 0; x < sseWidth; x += 4)
+            {
+                int imm = *(int*)(S + x_ofs[x + 0]);
+                pixels = _mm_insert_epi32(pixels, imm, 0);
+                imm = *(int*)(S + x_ofs[x + 1]);
+                pixels = _mm_insert_epi32(pixels, imm, 1);
+                imm = *(int*)(S + x_ofs[x + 2]);
+                pixels = _mm_insert_epi32(pixels, imm, 2);
+                imm = *(int*)(S + x_ofs[x + 3]);
+                pixels = _mm_insert_epi32(pixels, imm, 3);
+                _mm_storeu_si128((__m128i*)D, pixels);
+                D += 16;
+            }
+            for(; x < width; x++)
+            {
+                *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
+            }
+        }
+    }
+
+private:
+    const Mat src;
+    Mat dst;
+    int* x_ofs, pix_size4;
+    double ify;
+
+    resizeNNInvokerSSE4(const resizeNNInvokerSSE4&);
+    resizeNNInvokerSSE4& operator=(const resizeNNInvokerSSE4&);
+};
+#endif
+
 static void
 resizeNN( const Mat& src, Mat& dst, double fx, double fy )
 {
@@ -435,8 +729,42 @@ resizeNN( const Mat& src, Mat& dst, double fx, double fy )
     }
 
     Range range(0, dsize.height);
-    resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify);
-    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
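+    // Dispatch: the SIMD invokers are compiled in only when CV_AVX2 / CV_SSE4_1
+    // are enabled at build time, and used only if checkHardwareSupport() confirms
+    // the instruction set at run time and the pixel size is 2 or 4 bytes; all
+    // other cases fall back to the generic resizeNNInvoker.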
+#if CV_AVX2
+    if(checkHardwareSupport(CV_CPU_AVX2) && ((pix_size == 2) || (pix_size == 4)))
+    {
+        if(pix_size == 2)
+        {
+            resizeNNInvokerAVX2 invoker(src, dst, x_ofs, pix_size4, ify);
+            parallel_for_(range, invoker, dst.total()/(double)(1<<16));
+        }
+        else if(pix_size == 4)
+        {
+            resizeNNInvokerAVX4 invoker(src, dst, x_ofs, pix_size4, ify);
+            parallel_for_(range, invoker, dst.total()/(double)(1<<16));
+        }
+    }
+    else
+#endif
+#if CV_SSE4_1
+    if(checkHardwareSupport(CV_CPU_SSE4_1) && ((pix_size == 2) || (pix_size == 4)))
+    {
+        if(pix_size == 2)
+        {
+            resizeNNInvokerSSE2 invoker(src, dst, x_ofs, pix_size4, ify);
+            parallel_for_(range, invoker, dst.total()/(double)(1<<16));
+        }
+        else if(pix_size == 4)
+        {
+            resizeNNInvokerSSE4 invoker(src, dst, x_ofs, pix_size4, ify);
+            parallel_for_(range, invoker, dst.total()/(double)(1<<16));
+        }
+    }
+    else
+#endif
+    {
+        resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify);
+        parallel_for_(range, invoker, dst.total()/(double)(1<<16));
+    }
 }
 
 