@@ -1617,13 +1617,13 @@ NPY_NO_EXPORT void
  * when updating also update similar complex floats summation
  */
 static @type@
-pairwise_sum_@TYPE@(@dtype@ *a, npy_uintp n, npy_intp stride)
+pairwise_sum_@TYPE@(char *a, npy_uintp n, npy_intp stride)
 {
     if (n < 8) {
         npy_intp i;
         @type@ res = 0.;
         for (i = 0; i < n; i++) {
-            res += @trf@(a[i * stride]);
+            res += @trf@(*((@dtype@ *)(a + i * stride)));
         }
         return res;
     }
@@ -1636,26 +1636,26 @@ pairwise_sum_@TYPE@(@dtype@ *a, npy_uintp n, npy_intp stride)
          * 8 times unroll reduces blocksize to 16 and allows vectorization with
          * avx without changing summation ordering
          */
-        r[0] = @trf@(a[0 * stride]);
-        r[1] = @trf@(a[1 * stride]);
-        r[2] = @trf@(a[2 * stride]);
-        r[3] = @trf@(a[3 * stride]);
-        r[4] = @trf@(a[4 * stride]);
-        r[5] = @trf@(a[5 * stride]);
-        r[6] = @trf@(a[6 * stride]);
-        r[7] = @trf@(a[7 * stride]);
+        r[0] = @trf@(*((@dtype@ *)(a + 0 * stride)));
+        r[1] = @trf@(*((@dtype@ *)(a + 1 * stride)));
+        r[2] = @trf@(*((@dtype@ *)(a + 2 * stride)));
+        r[3] = @trf@(*((@dtype@ *)(a + 3 * stride)));
+        r[4] = @trf@(*((@dtype@ *)(a + 4 * stride)));
+        r[5] = @trf@(*((@dtype@ *)(a + 5 * stride)));
+        r[6] = @trf@(*((@dtype@ *)(a + 6 * stride)));
+        r[7] = @trf@(*((@dtype@ *)(a + 7 * stride)));
 
         for (i = 8; i < n - (n % 8); i += 8) {
             /* small blocksizes seems to mess with hardware prefetch */
-            NPY_PREFETCH(&a[(i + 512 / sizeof(a[0])) * stride], 0, 3);
-            r[0] += @trf@(a[(i + 0) * stride]);
-            r[1] += @trf@(a[(i + 1) * stride]);
-            r[2] += @trf@(a[(i + 2) * stride]);
-            r[3] += @trf@(a[(i + 3) * stride]);
-            r[4] += @trf@(a[(i + 4) * stride]);
-            r[5] += @trf@(a[(i + 5) * stride]);
-            r[6] += @trf@(a[(i + 6) * stride]);
-            r[7] += @trf@(a[(i + 7) * stride]);
+            NPY_PREFETCH(a + (i + 512 / sizeof(@dtype@)) * stride, 0, 3);
+            r[0] += @trf@(*((@dtype@ *)(a + (i + 0) * stride)));
+            r[1] += @trf@(*((@dtype@ *)(a + (i + 1) * stride)));
+            r[2] += @trf@(*((@dtype@ *)(a + (i + 2) * stride)));
+            r[3] += @trf@(*((@dtype@ *)(a + (i + 3) * stride)));
+            r[4] += @trf@(*((@dtype@ *)(a + (i + 4) * stride)));
+            r[5] += @trf@(*((@dtype@ *)(a + (i + 5) * stride)));
+            r[6] += @trf@(*((@dtype@ *)(a + (i + 6) * stride)));
+            r[7] += @trf@(*((@dtype@ *)(a + (i + 7) * stride)));
         }
 
         /* accumulate now to avoid stack spills for single peel loop */
@@ -1664,7 +1664,7 @@ pairwise_sum_@TYPE@(@dtype@ *a, npy_uintp n, npy_intp stride)
 
         /* do non multiple of 8 rest */
         for (; i < n; i++) {
-            res += @trf@(a[i * stride]);
+            res += @trf@(*((@dtype@ *)(a + i * stride)));
         }
         return res;
     }
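
[Editor's note] The hunks above change the real-valued pairwise_sum_@TYPE@ helper from taking a typed @dtype@ pointer with an element stride to taking a raw char pointer with a byte stride, so every load becomes a byte offset plus a cast. The following is a minimal standalone sketch of that addressing pattern in plain C; the names (byte_stride_sum, data) are illustrative only and not part of the templated NumPy source.

#include <stddef.h>
#include <stdio.h>

/* Illustrative only: sum n elements reached through a byte stride, the same
 * addressing pattern the patch introduces (char *base + i * byte_stride,
 * then cast to the element type). */
static float
byte_stride_sum(char *a, size_t n, ptrdiff_t stride)
{
    float res = 0.f;
    size_t i;
    for (i = 0; i < n; i++) {
        res += *((float *)(a + i * stride));
    }
    return res;
}

int main(void)
{
    float data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    /* contiguous array: the byte stride is sizeof(float) */
    printf("%g\n", byte_stride_sum((char *)data, 8, sizeof(float)));
    /* every other element: byte stride of 2 * sizeof(float) */
    printf("%g\n", byte_stride_sum((char *)data, 4, 2 * sizeof(float)));
    return 0;
}
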
@@ -1701,8 +1701,7 @@ NPY_NO_EXPORT void
         @type@ * iop1 = (@type@ *)args[0];
         npy_intp n = dimensions[0];
 
-        *iop1 @OP@= pairwise_sum_@TYPE@((@type@ *)args[1], n,
-                                        steps[1] / (npy_intp)sizeof(@type@));
+        *iop1 @OP@= pairwise_sum_@TYPE@(args[1], n, steps[1]);
 #else
         BINARY_REDUCE_LOOP(@type@) {
             io1 @OP@= *(@type@ *)ip2;
@@ -2058,8 +2057,7 @@ HALF_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
 #if @PW@
         npy_intp n = dimensions[0];
 
-        io1 @OP@= pairwise_sum_HALF((npy_half *)args[1], n,
-                                    steps[1] / (npy_intp)sizeof(npy_half));
+        io1 @OP@= pairwise_sum_HALF(args[1], n, steps[1]);
 #else
         BINARY_REDUCE_LOOP_INNER {
             io1 @OP@= npy_half_to_float(*(npy_half *)ip2);
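
[Editor's note] On the caller side (the two reduce hunks above), the inner-loop byte step steps[1] is now forwarded unchanged rather than being divided by the element size. A self-contained sketch of that calling convention follows; the names (reduce_add, byte_stride_sum) are made up for the example and are not the actual ufunc loop.

#include <stddef.h>
#include <stdio.h>

/* stand-in for the pairwise routine: byte address in, byte stride in */
static float
byte_stride_sum(char *a, size_t n, ptrdiff_t stride)
{
    float res = 0.f;
    size_t i;
    for (i = 0; i < n; i++) {
        res += *((float *)(a + i * stride));
    }
    return res;
}

/* shape of a ufunc-style reduce caller: args are char **, steps are byte steps */
static void
reduce_add(char **args, ptrdiff_t *dimensions, ptrdiff_t *steps)
{
    float *out = (float *)args[0];
    size_t n = (size_t)dimensions[0];

    /* before the patch: *out += typed_sum((float *)args[1], n,
     *                                     steps[1] / (ptrdiff_t)sizeof(float));
     * after the patch:  the byte pointer and the byte step pass through as-is */
    *out += byte_stride_sum(args[1], n, steps[1]);
}

int main(void)
{
    float in[4] = {1, 2, 3, 4};
    float out = 0.f;
    char *args[2] = {(char *)&out, (char *)in};
    ptrdiff_t dims[1] = {4};
    ptrdiff_t steps[2] = {0, (ptrdiff_t)sizeof(float)};

    reduce_add(args, dims, steps);
    printf("%g\n", out);    /* prints 10 */
    return 0;
}
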
@@ -2389,7 +2387,7 @@ HALF_ldexp_long(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UN
 
 /* similar to pairwise sum of real floats */
 static void
-pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
+pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_uintp n,
                     npy_intp stride)
 {
     assert(n % 2 == 0);
@@ -2398,8 +2396,8 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
         *rr = 0.;
         *ri = 0.;
         for (i = 0; i < n; i += 2) {
-            *rr += a[i * stride + 0];
-            *ri += a[i * stride + 1];
+            *rr += *((@ftype@ *)(a + i * stride + 0));
+            *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@)));
         }
         return;
     }
@@ -2412,26 +2410,26 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
          * 8 times unroll reduces blocksize to 16 and allows vectorization with
          * avx without changing summation ordering
          */
-        r[0] = a[0 * stride];
-        r[1] = a[0 * stride + 1];
-        r[2] = a[2 * stride];
-        r[3] = a[2 * stride + 1];
-        r[4] = a[4 * stride];
-        r[5] = a[4 * stride + 1];
-        r[6] = a[6 * stride];
-        r[7] = a[6 * stride + 1];
+        r[0] = *((@ftype@ *)(a + 0 * stride));
+        r[1] = *((@ftype@ *)(a + 0 * stride + sizeof(@ftype@)));
+        r[2] = *((@ftype@ *)(a + 2 * stride));
+        r[3] = *((@ftype@ *)(a + 2 * stride + sizeof(@ftype@)));
+        r[4] = *((@ftype@ *)(a + 4 * stride));
+        r[5] = *((@ftype@ *)(a + 4 * stride + sizeof(@ftype@)));
+        r[6] = *((@ftype@ *)(a + 6 * stride));
+        r[7] = *((@ftype@ *)(a + 6 * stride + sizeof(@ftype@)));
 
         for (i = 8; i < n - (n % 8); i += 8) {
             /* small blocksizes seems to mess with hardware prefetch */
-            NPY_PREFETCH(&a[(i + 512 / sizeof(a[0])) * stride], 0, 3);
-            r[0] += a[(i + 0) * stride];
-            r[1] += a[(i + 0) * stride + 1];
-            r[2] += a[(i + 2) * stride];
-            r[3] += a[(i + 2) * stride + 1];
-            r[4] += a[(i + 4) * stride];
-            r[5] += a[(i + 4) * stride + 1];
-            r[6] += a[(i + 6) * stride];
-            r[7] += a[(i + 6) * stride + 1];
+            NPY_PREFETCH(a + (i + 512 / sizeof(@ftype@)) * stride, 0, 3);
+            r[0] += *((@ftype@ *)(a + (i + 0) * stride));
+            r[1] += *((@ftype@ *)(a + (i + 0) * stride + sizeof(@ftype@)));
+            r[2] += *((@ftype@ *)(a + (i + 2) * stride));
+            r[3] += *((@ftype@ *)(a + (i + 2) * stride + sizeof(@ftype@)));
+            r[4] += *((@ftype@ *)(a + (i + 4) * stride));
+            r[5] += *((@ftype@ *)(a + (i + 4) * stride + sizeof(@ftype@)));
+            r[6] += *((@ftype@ *)(a + (i + 6) * stride));
+            r[7] += *((@ftype@ *)(a + (i + 6) * stride + sizeof(@ftype@)));
         }
 
         /* accumulate now to avoid stack spills for single peel loop */
@@ -2440,8 +2438,8 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
 
         /* do non multiple of 8 rest */
         for (; i < n; i += 2) {
-            *rr += a[i * stride + 0];
-            *ri += a[i * stride + 1];
+            *rr += *((@ftype@ *)(a + i * stride + 0));
+            *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@)));
         }
         return;
     }
@@ -2473,8 +2471,7 @@ NPY_NO_EXPORT void
         @ftype@ * oi = ((@ftype@ *)args[0]) + 1;
         @ftype@ rr, ri;
 
-        pairwise_sum_@TYPE@(&rr, &ri, (@ftype@ *)args[1], n * 2,
-                            steps[1] / (npy_intp)sizeof(@ftype@) / 2);
+        pairwise_sum_@TYPE@(&rr, &ri, args[1], n * 2, steps[1] / 2);
         *or @OP@= rr;
         *oi @OP@= ri;
         return;
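
[Editor's note] In the complex hunks, the interleaved real/imaginary pair is addressed with byte offsets: the imaginary part now sits at byte offset sizeof(@ftype@) from the real part rather than at index + 1, and the caller passes steps[1] / 2 as the per-scalar byte stride. Below is a hedged, standalone illustration in plain C; names such as complex_byte_stride_sum are invented for the example.

#include <stddef.h>
#include <stdio.h>

/* Illustrative complex accumulation: n counts scalars (two per complex value),
 * stride is the byte step per scalar, and the imaginary part of each pair is
 * read at byte offset sizeof(float) from the real part. */
static void
complex_byte_stride_sum(float *rr, float *ri, char *a, size_t n, ptrdiff_t stride)
{
    size_t i;
    *rr = 0.f;
    *ri = 0.f;
    for (i = 0; i < n; i += 2) {
        *rr += *((float *)(a + i * stride + 0));
        *ri += *((float *)(a + i * stride + sizeof(float)));
    }
}

int main(void)
{
    /* two complex values, (1 + 2i) and (3 + 4i), stored interleaved */
    float data[4] = {1.f, 2.f, 3.f, 4.f};
    float rr, ri;

    /* contiguous complex: 8 bytes per value, so the per-scalar byte stride is 4,
     * mirroring the steps[1] / 2 passed by the patched caller */
    complex_byte_stride_sum(&rr, &ri, (char *)data, 4, (ptrdiff_t)sizeof(float));
    printf("%g %g\n", rr, ri);    /* prints 4 6 */
    return 0;
}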