@@ -356,7 +356,7 @@ void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 1>(uint8_t* src, int, int *o
    }
    for (; i < dst_max - 7 && ofst[i + 7] + 15 <= ofst[dst_width - 1]; i += 8, m += 16, dst += 8)
    {
-        v_uint32x4 v_src01 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + ofst[i])), v_reinterpret_as_u32(v_load_expand(src + ofst[i + 1])));
+        v_uint32x4 v_src01 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + ofst[i    ])), v_reinterpret_as_u32(v_load_expand(src + ofst[i + 1])));
        v_uint32x4 v_src23 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + ofst[i + 2])), v_reinterpret_as_u32(v_load_expand(src + ofst[i + 3])));
        v_uint32x4 v_src45 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + ofst[i + 4])), v_reinterpret_as_u32(v_load_expand(src + ofst[i + 5])));
        v_uint32x4 v_src67 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + ofst[i + 6])), v_reinterpret_as_u32(v_load_expand(src + ofst[i + 7])));
@@ -390,47 +390,250 @@ void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 1>(uint8_t* src, int, int *o
        *(dst++) = src_0;
    }
}
+template <>
+void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 2>(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width)
+{
+    int i = 0;
+    ufixedpoint16 srccn[8] = { src[0], src[1], src[0], src[1], src[0], src[1], src[0], src[1] };
+    v_uint16x8 v_srccn = v_load((uint16_t*)srccn);
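+    // The leftmost source pixel pair is replicated four times so that a single vector
+    // store fills four 2-channel destination points at once.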
+    for (; i < dst_min - 3; i += 4, m += 8, dst += 8) // Points that fall to the left of the src image are set to the leftmost src point
+    {
+        v_store((uint16_t*)dst, v_srccn);
+    }
+    for (; i < dst_min; i++, m += 2)
+    {
+        *(dst++) = srccn[0];
+        *(dst++) = srccn[1];
+    }
+    for (; i < dst_max - 3 && ofst[i + 3] + 7 <= ofst[dst_width - 1]; i += 4, m += 8, dst += 8)
+    {
+        v_uint32x4 v_src0 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + 2 * ofst[i    ])), v_reinterpret_as_u32(v_load_expand(src + 2 * ofst[i + 1])));
+        v_uint32x4 v_src1 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + 2 * ofst[i + 2])), v_reinterpret_as_u32(v_load_expand(src + 2 * ofst[i + 3])));
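+        // Each u32 lane now holds one source pixel (two u16 channels): v_src0 = A0 A1 B0 B1,
+        // v_src1 = C0 C1 D0 D1, where X0/X1 are the left/right taps of destination point X.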
+
+        v_uint32x4 v_zip0, v_zip1;
+        v_zip(v_src0, v_src1, v_zip0, v_zip1);
+        v_zip(v_zip0, v_zip1, v_src0, v_src1);
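+        // After the double zip: v_src0 = A0 B0 C0 D0 (left taps), v_src1 = A1 B1 C1 D1 (right taps)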
+
+        v_int16x8 v_src0123, v_src4567;
+        v_zip(v_reinterpret_as_s16(v_src0), v_reinterpret_as_s16(v_src1), v_src0123, v_src4567);
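+        // The 16-bit zip pairs every left tap with its right tap, which is the layout v_dotprod expects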
+
+        v_uint32x4 v_mul = v_load((uint32_t*)m);// AaBbCcDd
+        v_zip(v_mul, v_mul, v_zip0, v_zip1);// AaAaBbBb CcCcDdDd
+        v_uint32x4 v_res0 = v_reinterpret_as_u32(v_dotprod(v_src0123, v_reinterpret_as_s16(v_zip0)));
+        v_uint32x4 v_res1 = v_reinterpret_as_u32(v_dotprod(v_src4567, v_reinterpret_as_s16(v_zip1)));
+        v_store((uint16_t*)dst, v_pack(v_res0, v_res1));// A1A2B1B2C1C2D1D2
+    }
+    for (; i < dst_max; i += 1, m += 2)
+    {
+        uint8_t* px = src + 2 * ofst[i];
+        *(dst++) = m[0] * px[0] + m[1] * px[2];
+        *(dst++) = m[0] * px[1] + m[1] * px[3];
+    }
+    srccn[0] = (src + 2 * ofst[dst_width - 1])[0]; srccn[1] = (src + 2 * ofst[dst_width - 1])[1]; srccn[2] = (src + 2 * ofst[dst_width - 1])[0]; srccn[3] = (src + 2 * ofst[dst_width - 1])[1];
+    srccn[4] = (src + 2 * ofst[dst_width - 1])[0]; srccn[5] = (src + 2 * ofst[dst_width - 1])[1]; srccn[6] = (src + 2 * ofst[dst_width - 1])[0]; srccn[7] = (src + 2 * ofst[dst_width - 1])[1];
+    v_srccn = v_load((uint16_t*)srccn);
+    for (; i < dst_width - 3; i += 4, dst += 8) // Points that fall to the right of the src image are set to the rightmost src point
+    {
+        v_store((uint16_t*)dst, v_srccn);
+    }
+    for (; i < dst_width; i++)
+    {
+        *(dst++) = srccn[0];
+        *(dst++) = srccn[1];
+    }
+}
+template <>
+void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 3>(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width)
+{
+    int i = 0;
+    ufixedpoint16 srccn[3] = { src[0], src[1], src[2] };
+    v_uint16x8 v_srccn0 = v_setall_u16(((uint16_t*)srccn)[0]);
+    v_uint16x8 v_srccn1 = v_setall_u16(((uint16_t*)srccn)[1]);
+    v_uint16x8 v_srccn2 = v_setall_u16(((uint16_t*)srccn)[2]);
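+    // One splat per channel lets v_store_interleave fill eight 3-channel border points per iteration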
+    for (; i < dst_min - 7; i += 8, m += 16, dst += 24) // Points that fall to the left of the src image are set to the leftmost src point
+    {
+        v_store_interleave((uint16_t*)dst, v_srccn0, v_srccn1, v_srccn2);
+    }
+    for (; i < dst_min; i++, m += 2)
+    {
+        *(dst++) = srccn[0];
+        *(dst++) = srccn[1];
+        *(dst++) = srccn[2];
+    }
+    for (; i < dst_max - 7 && ofst[i + 7] + 15 <= ofst[dst_width - 1]; i += 8, m += 16, dst += 24)
+    {
+        v_uint8x16 v_src0_c1, v_src0_c2, v_src0_c3;
+        v_load_deinterleave(src + 3 * ofst[i    ], v_src0_c1, v_src0_c2, v_src0_c3);
+        v_uint8x16 v_src1_c1, v_src1_c2, v_src1_c3;
+        v_load_deinterleave(src + 3 * ofst[i + 1], v_src1_c1, v_src1_c2, v_src1_c3);
+        v_uint8x16 v_src2_c1, v_src2_c2, v_src2_c3;
+        v_load_deinterleave(src + 3 * ofst[i + 2], v_src2_c1, v_src2_c2, v_src2_c3);
+        v_uint8x16 v_src3_c1, v_src3_c2, v_src3_c3;
+        v_load_deinterleave(src + 3 * ofst[i + 3], v_src3_c1, v_src3_c2, v_src3_c3);
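+        // Only the first u16 lane of each deinterleaved register is used below: it packs
+        // the left and right tap bytes of one destination point for that channel.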
+
+        v_uint16x8 v_tmp0, v_tmp1, v_tmp2;
+        v_uint16x8 v_src0123_c1, v_src0123_c2, v_src0123_c3;
+        v_zip(v_reinterpret_as_u16(v_src0_c1), v_reinterpret_as_u16(v_src2_c1), v_tmp0, v_tmp1);
+        v_zip(v_reinterpret_as_u16(v_src1_c1), v_reinterpret_as_u16(v_src3_c1), v_tmp1, v_tmp2);
+        v_zip(v_tmp0, v_tmp1, v_src0123_c1, v_tmp2);
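+        // The zip cascade gathers lane 0 of the four loads: v_src0123_c1 now starts with the
+        // tap-byte pairs of destination points 0..3 for channel 1.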
+        v_zip(v_reinterpret_as_u16(v_src0_c2), v_reinterpret_as_u16(v_src2_c2), v_tmp0, v_tmp1);
+        v_zip(v_reinterpret_as_u16(v_src1_c2), v_reinterpret_as_u16(v_src3_c2), v_tmp1, v_tmp2);
+        v_zip(v_tmp0, v_tmp1, v_src0123_c2, v_tmp2);
+        v_zip(v_reinterpret_as_u16(v_src0_c3), v_reinterpret_as_u16(v_src2_c3), v_tmp0, v_tmp1);
+        v_zip(v_reinterpret_as_u16(v_src1_c3), v_reinterpret_as_u16(v_src3_c3), v_tmp1, v_tmp2);
+        v_zip(v_tmp0, v_tmp1, v_src0123_c3, v_tmp2);
+
+        v_load_deinterleave(src + 3 * ofst[i + 4], v_src0_c1, v_src0_c2, v_src0_c3);
+        v_load_deinterleave(src + 3 * ofst[i + 5], v_src1_c1, v_src1_c2, v_src1_c3);
+        v_load_deinterleave(src + 3 * ofst[i + 6], v_src2_c1, v_src2_c2, v_src2_c3);
+        v_load_deinterleave(src + 3 * ofst[i + 7], v_src3_c1, v_src3_c2, v_src3_c3);
+
+        v_uint16x8 v_src4567_c1, v_src4567_c2, v_src4567_c3;
+        v_zip(v_reinterpret_as_u16(v_src0_c1), v_reinterpret_as_u16(v_src2_c1), v_tmp0, v_tmp1);
+        v_zip(v_reinterpret_as_u16(v_src1_c1), v_reinterpret_as_u16(v_src3_c1), v_tmp1, v_tmp2);
+        v_zip(v_tmp0, v_tmp1, v_src4567_c1, v_tmp2);
+        v_zip(v_reinterpret_as_u16(v_src0_c2), v_reinterpret_as_u16(v_src2_c2), v_tmp0, v_tmp1);
+        v_zip(v_reinterpret_as_u16(v_src1_c2), v_reinterpret_as_u16(v_src3_c2), v_tmp1, v_tmp2);
+        v_zip(v_tmp0, v_tmp1, v_src4567_c2, v_tmp2);
+        v_zip(v_reinterpret_as_u16(v_src0_c3), v_reinterpret_as_u16(v_src2_c3), v_tmp0, v_tmp1);
+        v_zip(v_reinterpret_as_u16(v_src1_c3), v_reinterpret_as_u16(v_src3_c3), v_tmp1, v_tmp2);
+        v_zip(v_tmp0, v_tmp1, v_src4567_c3, v_tmp2);
+
+        v_expand(v_reinterpret_as_u8(v_combine_low(v_src0123_c1, v_src4567_c1)),
+            v_src0123_c1, v_src4567_c1
+        );
+        v_expand(v_reinterpret_as_u8(v_combine_low(v_src0123_c2, v_src4567_c2)),
+            v_src0123_c2, v_src4567_c2
+        );
+        v_expand(v_reinterpret_as_u8(v_combine_low(v_src0123_c3, v_src4567_c3)),
+            v_src0123_c3, v_src4567_c3
+        );
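+        // v_combine_low keeps the eight tap pairs per channel and v_expand widens them to u16,
+        // yielding the left/right taps of points 0..3 and 4..7 in separate registers.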
+        v_int16x8 v_mul0123 = v_load((int16_t*)m);
+        v_int16x8 v_mul4567 = v_load((int16_t*)m + 8);
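+        // m holds one fixed-point weight pair per destination point; v_dotprod folds
+        // m[0]*left + m[1]*right into a single u32 per point and channel.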
+        v_uint32x4 v_res0123_c1 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0123_c1), v_mul0123));
+        v_uint32x4 v_res0123_c2 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0123_c2), v_mul0123));
+        v_uint32x4 v_res0123_c3 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0123_c3), v_mul0123));
+        v_uint32x4 v_res4567_c1 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src4567_c1), v_mul4567));
+        v_uint32x4 v_res4567_c2 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src4567_c2), v_mul4567));
+        v_uint32x4 v_res4567_c3 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src4567_c3), v_mul4567));
+
+        v_store_interleave((uint16_t*)dst, v_pack(v_res0123_c1, v_res4567_c1), v_pack(v_res0123_c2, v_res4567_c2), v_pack(v_res0123_c3, v_res4567_c3));
+    }
+    for (; i < dst_max; i += 1, m += 2)
+    {
+        uint8_t* px = src + 3 * ofst[i];
+        *(dst++) = m[0] * px[0] + m[1] * px[3];
+        *(dst++) = m[0] * px[1] + m[1] * px[4];
+        *(dst++) = m[0] * px[2] + m[1] * px[5];
+    }
+    srccn[0] = (src + 3 * ofst[dst_width - 1])[0]; v_srccn0 = v_setall_u16(((uint16_t*)srccn)[0]);
+    srccn[1] = (src + 3 * ofst[dst_width - 1])[1]; v_srccn1 = v_setall_u16(((uint16_t*)srccn)[1]);
+    srccn[2] = (src + 3 * ofst[dst_width - 1])[2]; v_srccn2 = v_setall_u16(((uint16_t*)srccn)[2]);
+    for (; i < dst_width - 7; i += 8, dst += 24) // Points that fall to the right of the src image are set to the rightmost src point
+    {
+        v_store_interleave((uint16_t*)dst, v_srccn0, v_srccn1, v_srccn2);
+    }
+    for (; i < dst_width; i++)
+    {
+        *(dst++) = srccn[0];
+        *(dst++) = srccn[1];
+        *(dst++) = srccn[2];
+    }
+}
+template <>
+void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 4>(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width)
+{
+    int i = 0;
+    ufixedpoint16 srccn[8] = { src[0], src[1], src[2], src[3], src[0], src[1], src[2], src[3] };
+    v_uint16x8 v_srccn = v_load((uint16_t*)srccn);
+    for (; i < dst_min - 1; i += 2, m += 4, dst += 8) // Points that fall to the left of the src image are set to the leftmost src point
+    {
+        v_store((uint16_t*)dst, v_srccn);
+    }
+    if (i < dst_min) // Points that fall to the left of the src image are set to the leftmost src point
+    {
+        *(dst++) = srccn[0];
+        *(dst++) = srccn[1];
+        *(dst++) = srccn[2];
+        *(dst++) = srccn[3];
+        i++; m += 2;
+    }
+    for (; i < dst_max - 1 && ofst[i + 1] + 3 <= ofst[dst_width - 1]; i += 2, m += 4, dst += 8)
+    {
+        v_int16x8 v_src01 = v_reinterpret_as_s16(v_load_expand(src + 4 * ofst[i    ]));
+        v_int16x8 v_src23 = v_reinterpret_as_s16(v_load_expand(src + 4 * ofst[i + 1]));
+
+        v_int16x8 v_tmp0, v_tmp1;
+        v_recombine(v_src01, v_src23, v_tmp0, v_tmp1);
+        v_zip(v_tmp0, v_tmp1, v_src01, v_src23);
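+        // v_recombine + v_zip interleave the taps channel-wise:
+        // v_src01 = A0c0 A1c0 A0c1 A1c1 A0c2 A1c2 A0c3 A1c3 for destination point A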
+
+        v_int16x8 v_mul01 = v_reinterpret_as_s16(v_setall_u32(((uint32_t*)m)[0]));// AaAaAaAa
+        v_int16x8 v_mul23 = v_reinterpret_as_s16(v_setall_u32(((uint32_t*)m)[1]));// BbBbBbBb
+        v_uint32x4 v_res0 = v_reinterpret_as_u32(v_dotprod(v_src01, v_mul01));
+        v_uint32x4 v_res1 = v_reinterpret_as_u32(v_dotprod(v_src23, v_mul23));
+        v_store((uint16_t*)dst, v_pack(v_res0, v_res1));// A1A2A3A4B1B2B3B4
+    }
+    for (; i < dst_max; i += 1, m += 2)
+    {
+        uint8_t* px = src + 4 * ofst[i];
+        *(dst++) = m[0] * px[0] + m[1] * px[4];
+        *(dst++) = m[0] * px[1] + m[1] * px[5];
+        *(dst++) = m[0] * px[2] + m[1] * px[6];
+        *(dst++) = m[0] * px[3] + m[1] * px[7];
+    }
+    srccn[0] = (src + 4 * ofst[dst_width - 1])[0]; srccn[1] = (src + 4 * ofst[dst_width - 1])[1]; srccn[2] = (src + 4 * ofst[dst_width - 1])[2]; srccn[3] = (src + 4 * ofst[dst_width - 1])[3];
+    srccn[4] = (src + 4 * ofst[dst_width - 1])[0]; srccn[5] = (src + 4 * ofst[dst_width - 1])[1]; srccn[6] = (src + 4 * ofst[dst_width - 1])[2]; srccn[7] = (src + 4 * ofst[dst_width - 1])[3];
+    v_srccn = v_load((uint16_t*)srccn);
+    for (; i < dst_width - 1; i += 2, dst += 8) // Points that fall to the right of the src image are set to the rightmost src point
+    {
+        v_store((uint16_t*)dst, v_srccn);
+    }
+    if (i < dst_width)
+    {
+        *(dst++) = srccn[0];
+        *(dst++) = srccn[1];
+        *(dst++) = srccn[2];
+        *(dst++) = srccn[3];
+    }
+}
template <>
void hlineResizeCn<uint16_t, ufixedpoint32, 2, true, 1>(uint16_t* src, int, int *ofst, ufixedpoint32* m, ufixedpoint32* dst, int dst_min, int dst_max, int dst_width)
{
-    typedef v_uint32x4 v_fixedtype;
-    typedef uint32_t lanetype;
    int i = 0;
    ufixedpoint32 src_0(src[0]);
-    v_fixedtype v_src_0 = v_setall_u32(*((lanetype*)&src_0));
+    v_uint32x4 v_src_0 = v_setall_u32(*((uint32_t*)&src_0));
    for (; i < dst_min - 3; i += 4, m += 8, dst += 4) // Points that fall to the left of the src image are set to the leftmost src point
    {
-        v_store((lanetype*)dst, v_src_0);
+        v_store((uint32_t*)dst, v_src_0);
    }
    for (; i < dst_min; i++, m += 2)
    {
        *(dst++) = src_0;
    }
    for (; i < dst_max - 3 && ofst[i + 3] + 8 <= ofst[dst_width - 1]; i += 4, m += 8, dst += 4)
    {
-        v_fixedtype v_src0 = v_combine_low(v_load_expand(src + ofst[i]), v_load_expand(src + ofst[i + 1]));
-        v_fixedtype v_mul0 = v_load((lanetype*)m);
-        v_fixedtype v_src1 = v_combine_low(v_load_expand(src + ofst[i + 2]), v_load_expand(src + ofst[i + 3]));
-        v_fixedtype v_mul1 = v_load((lanetype*)m + 4);
-        v_fixedtype v_res0 = v_src0 * v_mul0;// a1a2b1b2
-        v_fixedtype v_res1 = v_src1 * v_mul1;// c1c2d1d2
-        v_fixedtype v_tmp0, v_tmp1;
+        v_uint32x4 v_src0 = v_combine_low(v_load_expand(src + ofst[i]), v_load_expand(src + ofst[i + 1]));
+        v_uint32x4 v_mul0 = v_load((uint32_t*)m);
+        v_uint32x4 v_src1 = v_combine_low(v_load_expand(src + ofst[i + 2]), v_load_expand(src + ofst[i + 3]));
+        v_uint32x4 v_mul1 = v_load((uint32_t*)m + 4);
+        v_uint32x4 v_res0 = v_src0 * v_mul0;// a1a2b1b2
+        v_uint32x4 v_res1 = v_src1 * v_mul1;// c1c2d1d2
+        v_uint32x4 v_tmp0, v_tmp1;
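+        // 32-bit lanes have no v_dotprod counterpart, so the weighted terms are
+        // transposed with recombine/zip below and summed with a plain vector add.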
        v_recombine(v_res0, v_res1, v_tmp0, v_tmp1);// a1a2c1c2 b1b2d1d2
        v_zip(v_tmp0, v_tmp1, v_res0, v_res1);// a1b1a2b2 c1d1c2d2
        v_recombine(v_res0, v_res1, v_tmp0, v_tmp1);// a1b1c1d1 a2b2c2d2
-        v_store((lanetype*)dst, v_tmp0 + v_tmp1);// abcd
+        v_store((uint32_t*)dst, v_tmp0 + v_tmp1);// abcd
    }
    for (; i < dst_max; i += 1, m += 2)
    {
        uint16_t* px = src + ofst[i];
        *(dst++) = m[0] * px[0] + m[1] * px[1];
    }
    src_0 = (src + ofst[dst_width - 1])[0];
-    v_src_0 = v_setall_u32(*((lanetype*)&src_0));
+    v_src_0 = v_setall_u32(*((uint32_t*)&src_0));
    for (; i < dst_width - 3; i += 4, dst += 4)
    {
-        v_store((lanetype*)dst, v_src_0);
+        v_store((uint32_t*)dst, v_src_0);
    }
    for (; i < dst_width; i++)
    {
@@ -3714,7 +3917,7 @@ void resize(int src_type,
{
    // when inv_scale_x and inv_scale_y are both equal to 0.5,
    // INTER_AREA (fast) matches bit-exact INTER_LINEAR
-    if (is_area_fast && iscale_x == 2 && iscale_y == 2)
+    if (is_area_fast && iscale_x == 2 && iscale_y == 2 && cn != 2) // Area resize implementation for 2-channel images isn't bit-exact
        interpolation = INTER_AREA;
    else
    {