@@ -435,13 +435,7 @@ class resizeNNInvokerAVX4 :
         int y, x, pix_size = (int)src.elemSize();
         int width = dsize.width;
         int avxWidth = width - (width & 0x7);
-#if (defined WIN32 || defined _WIN32 || defined __CYGWIN__)
-        const __declspec(align(64)) __m256i mask = _mm256_set1_epi32(-1);
-#elif defined __GNUC__ && __GNUC__ >= 4
-        const __m256i mask __attribute__ ((aligned (64))) = _mm256_set1_epi32(-1);
-#else
-        const __m256i mask = _mm256_set1_epi32(-1);
-#endif
+        const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1);
         if (((int64)(dst.data + dst.step) & 0x1f) == 0)
         {
             for (y = range.start; y < range.end; y++)
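For context: CV_DECL_ALIGNED is the portable alignment macro from OpenCV's opencv2/core/cvdef.h, and it is what lets each three-way #if chain in this file collapse to one line. A minimal sketch of its definition (simplified; the real header also handles Doxygen builds and falls back to a no-op on unknown compilers):

    // Sketch of CV_DECL_ALIGNED from opencv2/core/cvdef.h: one portable
    // spelling for "align this declaration to x bytes".
    #ifdef _MSC_VER
    #  define CV_DECL_ALIGNED(x) __declspec(align(x))
    #elif defined(__GNUC__)
    #  define CV_DECL_ALIGNED(x) __attribute__ ((aligned (x)))
    #else
    #  define CV_DECL_ALIGNED(x)
    #endif

Written between the type and the variable name, both expansions are accepted by their respective compilers, so a single declaration covers MSVC, GCC/Clang, and (as a no-op) everything else.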
@@ -453,19 +447,9 @@ class resizeNNInvokerAVX4 :
 #pragma unroll(4)
                 for (x = 0; x < avxWidth; x += 8)
                 {
-#if (defined WIN32 || defined _WIN32 || defined __CYGWIN__)
-                    __declspec(align(64)) const __m256i *addr = (__m256i*)(x_ofs + x);
-                    __declspec(align(64)) __m256i indices = _mm256_lddqu_si256(addr);
-                    __declspec(align(64)) __m256i pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
-#elif defined __GNUC__ && __GNUC__ >= 4
-                    const __m256i *addr __attribute__ ((aligned (64))) = (__m256i*)(x_ofs + x);
-                    __m256i indices __attribute__ ((aligned (64))) = _mm256_lddqu_si256(addr);
-                    __m256i pixels __attribute__ ((aligned (64))) = _mm256_i32gather_epi32((const int*)S, indices, 1);
-#else
-                    const __m256i *addr = (__m256i*)(x_ofs + x);
-                    __m256i indices = _mm256_lddqu_si256(addr);
-                    __m256i pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
-#endif
+                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
+                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
+                    __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
                     _mm256_maskstore_epi32((int*)D, mask, pixels);
                     D += 32;
                 }
@@ -486,19 +470,9 @@ class resizeNNInvokerAVX4 :
 #pragma unroll(4)
                 for (x = 0; x < avxWidth; x += 8)
                 {
-#if (defined WIN32 || defined _WIN32 || defined __CYGWIN__)
-                    __declspec(align(64)) const __m256i *addr = (__m256i*)(x_ofs + x);
-                    __declspec(align(64)) __m256i indices = _mm256_lddqu_si256(addr);
-                    __declspec(align(64)) __m256i pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
-#elif defined __GNUC__ && __GNUC__ >= 4
-                    const __m256i *addr __attribute__ ((aligned (64))) = (__m256i*)(x_ofs + x);
-                    __m256i indices __attribute__ ((aligned (64))) = _mm256_lddqu_si256(addr);
-                    __m256i pixels __attribute__ ((aligned (64))) = _mm256_i32gather_epi32((const int*)S, indices, 1);
-#else
-                    const __m256i *addr = (__m256i*)(x_ofs + x);
-                    __m256i indices = _mm256_lddqu_si256(addr);
-                    __m256i pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
-#endif
+                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
+                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
+                    __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
                     _mm256_storeu_si256((__m256i*)D, pixels);
                     D += 32;
                 }
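Both AVX4 hunks perform the same work and differ only in the final store (masked for the aligned-destination branch, unaligned otherwise): indices holds eight byte offsets loaded from x_ofs, and the gather with scale 1 fetches one 4-byte pixel per offset. A scalar sketch of one iteration, under the assumption (consistent with the surrounding code) that x_ofs holds byte offsets of the nearest source pixels:

    // Hypothetical scalar equivalent of one 8-pixel AVX4 iteration.
    for (int i = 0; i < 8; i++)
        memcpy(D + 4*i, S + x_ofs[x + i], 4);  // copy one 4-byte pixel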
@@ -538,29 +512,12 @@ class resizeNNInvokerAVX2 :
         int width = dsize.width;
         // int avxWidth = (width - 1) - ((width - 1) & 0x7);
         int avxWidth = width - (width & 0xf);
-#if (defined WIN32 || defined _WIN32 || defined __CYGWIN__)
-        const __declspec(align(64)) __m256i mask = _mm256_set1_epi32(-1);
-        const __declspec(align(64)) __m256i shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
-                                                                           15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
-        const __declspec(align(64)) __m256i permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
-        const __declspec(align(64)) __m256i shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,
-                                                                                 13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2);
-#elif defined __GNUC__ && __GNUC__ >= 4
-        const __m256i mask __attribute__ ((aligned (64))) = _mm256_set1_epi32(-1);
-        const __m256i shuffle_mask __attribute__ ((aligned (64))) = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
-                                                                                    15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
-        const __m256i permute_mask __attribute__ ((aligned (64))) = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
-        const __m256i shift_shuffle_mask __attribute__ ((aligned (64))) = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,
-                                                                                          13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2);
-
-#else
-        const __m256i mask = _mm256_set1_epi32(-1);
-        const __m256i shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
-                                                     15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
-        const __m256i permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
-        const __m256i shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,
-                                                           13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2);
-#endif
+        const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1);
+        const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
+                                                                         15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
+        const __m256i CV_DECL_ALIGNED(64) permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
+        const __m256i CV_DECL_ALIGNED(64) shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,
+                                                                               13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2);
         if (((int64)(dst.data + dst.step) & 0x1f) == 0)
         {
             for (y = range.start; y < range.end; y++)
@@ -573,40 +530,16 @@ class resizeNNInvokerAVX2 :
 #pragma unroll(4)
                 for (x = 0; x < avxWidth; x += 16)
                 {
-#if (defined WIN32 || defined _WIN32 || defined __CYGWIN__)
-                    __declspec(align(64)) const __m256i *addr = (__m256i*)(x_ofs + x);
-                    __declspec(align(64)) __m256i indices = _mm256_lddqu_si256(addr);
-                    __declspec(align(64)) __m256i pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
-                    __declspec(align(64)) const __m256i *addr2 = (__m256i*)(x_ofs + x + 8);
-                    __declspec(align(64)) __m256i indices2 = _mm256_lddqu_si256(addr2);
-                    __declspec(align(64)) __m256i pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
-                    __declspec(align(64)) __m256i unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);
-
-                    __declspec(align(64)) __m256i bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
-                    __declspec(align(64)) __m256i ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
-#elif defined __GNUC__ && __GNUC__ >= 4
-                    const __m256i *addr __attribute__ ((aligned (64))) = (__m256i*)(x_ofs + x);
-                    __m256i indices __attribute__ ((aligned (64))) = _mm256_lddqu_si256(addr);
-                    __m256i pixels1 __attribute__ ((aligned (64))) = _mm256_i32gather_epi32((const int*)S, indices, 1);
-                    const __m256i *addr2 __attribute__ ((aligned (64))) = (__m256i*)(x_ofs + x + 8);
-                    __m256i indices2 __attribute__ ((aligned (64))) = _mm256_lddqu_si256(addr2);
-                    __m256i pixels2 __attribute__ ((aligned (64))) = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
-                    __m256i unpacked __attribute__ ((aligned (64))) = _mm256_blend_epi16(pixels1, pixels2, 0xaa);
-
-                    __m256i bytes_shuffled __attribute__ ((aligned (64))) = _mm256_shuffle_epi8(unpacked, shuffle_mask);
-                    __m256i ints_permuted __attribute__ ((aligned (64))) = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
-#else
-                    const __m256i *addr = (__m256i*)(x_ofs + x);
-                    __m256i indices = _mm256_lddqu_si256(addr);
-                    __m256i pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
-                    const __m256i *addr2 = (__m256i*)(x_ofs + x + 8);
-                    __m256i indices2 = _mm256_lddqu_si256(addr2);
-                    __m256i pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
-                    __m256i unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);
-
-                    __m256i bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
-                    __m256i ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
-#endif
+                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
+                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
+                    __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
+                    const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
+                    __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2);
+                    __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
+                    __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);
+
+                    __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
+                    __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
                     _mm256_maskstore_epi32((int*)D, mask, ints_permuted);
                     D += 32;
                 }
@@ -629,40 +562,16 @@ class resizeNNInvokerAVX2 :
 #pragma unroll(4)
                 for (x = 0; x < avxWidth; x += 16)
                 {
-#if (defined WIN32 || defined _WIN32 || defined __CYGWIN__)
-                    __declspec(align(64)) const __m256i *addr = (__m256i*)(x_ofs + x);
-                    __declspec(align(64)) __m256i indices = _mm256_lddqu_si256(addr);
-                    __declspec(align(64)) __m256i pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
-                    __declspec(align(64)) const __m256i *addr2 = (__m256i*)(x_ofs + x + 8);
-                    __declspec(align(64)) __m256i indices2 = _mm256_lddqu_si256(addr2);
-                    __declspec(align(64)) __m256i pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
-                    __declspec(align(64)) __m256i unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);
-
-                    __declspec(align(64)) __m256i bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
-                    __declspec(align(64)) __m256i ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
-#elif defined __GNUC__ && __GNUC__ >= 4
-                    const __m256i *addr __attribute__ ((aligned (64))) = (__m256i*)(x_ofs + x);
-                    __m256i indices __attribute__ ((aligned (64))) = _mm256_lddqu_si256(addr);
-                    __m256i pixels1 __attribute__ ((aligned (64))) = _mm256_i32gather_epi32((const int*)S, indices, 1);
-                    const __m256i *addr2 __attribute__ ((aligned (64))) = (__m256i*)(x_ofs + x + 8);
-                    __m256i indices2 __attribute__ ((aligned (64))) = _mm256_lddqu_si256(addr2);
-                    __m256i pixels2 __attribute__ ((aligned (64))) = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
-                    __m256i unpacked __attribute__ ((aligned (64))) = _mm256_blend_epi16(pixels1, pixels2, 0xaa);
-
-                    __m256i bytes_shuffled __attribute__ ((aligned (64))) = _mm256_shuffle_epi8(unpacked, shuffle_mask);
-                    __m256i ints_permuted __attribute__ ((aligned (64))) = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
-#else
-                    const __m256i *addr = (__m256i*)(x_ofs + x);
-                    __m256i indices = _mm256_lddqu_si256(addr);
-                    __m256i pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
-                    const __m256i *addr2 = (__m256i*)(x_ofs + x + 8);
-                    __m256i indices2 = _mm256_lddqu_si256(addr2);
-                    __m256i pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
-                    __m256i unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);
-
-                    __m256i bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
-                    __m256i ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
-#endif
+                    const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
+                    __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr);
+                    __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
+                    const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8);
+                    __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2);
+                    __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
+                    __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);
+
+                    __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
+                    __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
                     _mm256_storeu_si256((__m256i*)D, ints_permuted);
                     D += 32;
                 }
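The AVX2 invoker handles 2-byte pixels, which is why it needs the extra blend/shuffle/permute work: a 32-bit gather fetches two bytes too many per pixel, so the code gathers twice from staggered base pointers (S and S2), blends per 16-bit lane with 0xaa, then byte-shuffles and dword-permutes the result back into sequential order. The net effect of one iteration, as a scalar sketch (same byte-offset assumption as above):

    // Hypothetical scalar equivalent of one 16-pixel AVX2 iteration.
    for (int i = 0; i < 16; i++)
        memcpy(D + 2*i, S + x_ofs[x + i], 2);  // copy one 2-byte pixel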
@@ -709,13 +618,7 @@ class resizeNNInvokerSSE2 :
             uchar* Dstart = D;
             int sy = std::min(cvFloor(y*ify), ssize.height - 1);
             const uchar* S = src.data + sy*src.step;
-#if (defined WIN32 || defined _WIN32 || defined __CYGWIN__)
-            __declspec(align(64)) __m128i pixels = _mm_set1_epi16(0);
-#elif defined __GNUC__ && __GNUC__ >= 4
-            __m128i pixels __attribute__ ((aligned (64))) = _mm_set1_epi16(0);
-#else
-            __m128i pixels = _mm_set1_epi16(0);
-#endif
+            __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0);
             for (x = 0; x < sseWidth; x += 8)
             {
                 ushort imm = *(ushort*)(S + x_ofs[x + 0]);
@@ -776,13 +679,7 @@ class resizeNNInvokerSSE4 :
             uchar* Dstart = D;
             int sy = std::min(cvFloor(y*ify), ssize.height - 1);
            const uchar* S = src.data + sy*src.step;
-#if (defined WIN32 || defined _WIN32 || defined __CYGWIN__)
-            __declspec(align(64)) __m128i pixels = _mm_set1_epi16(0);
-#elif defined __GNUC__ && __GNUC__ >= 4
-            __m128i pixels __attribute__ ((aligned (64))) = _mm_set1_epi16(0);
-#else
-            __m128i pixels = _mm_set1_epi16(0);
-#endif
+            __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0);
             for (x = 0; x < sseWidth; x += 4)
             {
                 int imm = *(int*)(S + x_ofs[x + 0]);
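A quick standalone check, hypothetical and not part of this patch, that the macro really produces 64-byte-aligned locals under both GCC/Clang and MSVC (compile as C++ with OpenCV's include path on the search path):

    // align_check.cpp -- hypothetical sanity check for CV_DECL_ALIGNED.
    #include <cassert>
    #include <cstdint>
    #include <emmintrin.h>
    #include <opencv2/core/cvdef.h>

    int main()
    {
        __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0);
        // An aligned(64) object has the low 6 bits of its address clear.
        assert(((std::uintptr_t)&pixels & 63) == 0);
        (void)pixels;
        return 0;
    }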