@@ -164,9 +164,23 @@ run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
164
164
165
165
#if defined HAVE_ATTRIBUTE_TARGET_ @ISA @_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
166
166
static NPY_INLINE void
167
- @ISA @_sincos_FLOAT (npy_float * , npy_float * , const npy_intp n , char * );
167
+ @ISA @_sincos_FLOAT (npy_float * , npy_float * , const npy_intp n , const npy_intp steps , char * );
168
168
#endif
169
169
170
+ static NPY_INLINE int
171
+ run_unary_ @isa @_sincos_FLOAT (char * * args , npy_intp * dimensions , npy_intp * steps , char * mychar )
172
+ {
173
+ #if defined HAVE_ATTRIBUTE_TARGET_ @ISA @_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
174
+ if (IS_OUTPUT_BLOCKABLE_UNARY (sizeof (npy_float ), @REGISTER_SIZE @)) {
175
+ @ISA @_sincos_FLOAT ((npy_float * )args [1 ], (npy_float * )args [0 ], dimensions [0 ], steps [0 ], mychar );
176
+ return 1 ;
177
+ }
178
+ else
179
+ return 0 ;
180
+ #endif
181
+ return 0 ;
182
+ }
183
+
170
184
/**end repeat**/
171
185
172
186
@@ -1473,9 +1487,13 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
1473
1487
1474
1488
#if defined HAVE_ATTRIBUTE_TARGET_ @ISA @_WITH_INTRINSICS
1475
1489
static NPY_GCC_OPT_3 NPY_GCC_TARGET_ @ISA @ void
1476
- @ISA @_sincos_FLOAT (npy_float * op , npy_float * ip , const npy_intp array_size ,
1477
- char * operation )
1490
+ @ISA @_sincos_FLOAT (npy_float * op ,
1491
+ npy_float * ip ,
1492
+ const npy_intp array_size ,
1493
+ const npy_intp steps ,
1494
+ char * operation )
1478
1495
{
1496
+ const npy_intp stride = steps /sizeof (npy_float );
1479
1497
const npy_int num_lanes = @BYTES @/sizeof (npy_float );
1480
1498
npy_int compute_cos = 1 ;
1481
1499
npy_float large_number = 71476.0625f ;
@@ -1508,13 +1526,26 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
1508
1526
@mask @ nan_mask , glibc_mask , sine_mask , negate_mask ;
1509
1527
@mask @ load_mask = @isa @_get_full_load_mask ();
1510
1528
npy_intp num_remaining_elements = array_size ;
1529
+ npy_int indexarr [16 ];
1530
+ for (npy_int ii = 0 ; ii < 16 ; ii ++ ) {
1531
+ indexarr [ii ] = ii * stride ;
1532
+ }
1533
+ @vtype @i vindex = _mm @vsize @_loadu_si @vsize @((@vtype @i * )& indexarr [0 ]);
1511
1534
1512
1535
while (num_remaining_elements > 0 ) {
1513
1536
1514
- if (num_remaining_elements < num_lanes )
1537
+ if (num_remaining_elements < num_lanes ) {
1515
1538
load_mask = @isa @_get_partial_load_mask (num_remaining_elements ,
1516
1539
num_lanes );
1517
- @vtype @ x = @isa @_masked_load (load_mask , ip );
1540
+ }
1541
+
1542
+ @vtype @ x ;
1543
+ if (stride == 1 ) {
1544
+ x = @isa @_masked_load (load_mask , ip );
1545
+ }
1546
+ else {
1547
+ x = @isa @_masked_gather (zero_f , ip , vindex , load_mask );
1548
+ }
1518
1549
1519
1550
/*
1520
1551
* For elements outside of this range, Cody-Waite's range reduction
@@ -1565,19 +1596,21 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
1565
1596
/* process elements using glibc for large elements */
1566
1597
if (compute_cos ) {
1567
1598
for (int ii = 0 ; iglibc_mask != 0 ; ii ++ ) {
1568
- if (iglibc_mask & 0x01 )
1599
+ if (iglibc_mask & 0x01 ) {
1569
1600
op [ii ] = npy_cosf (ip [ii ]);
1601
+ }
1570
1602
iglibc_mask = iglibc_mask >> 1 ;
1571
1603
}
1572
1604
}
1573
1605
else {
1574
1606
for (int ii = 0 ; iglibc_mask != 0 ; ii ++ ) {
1575
- if (iglibc_mask & 0x01 )
1607
+ if (iglibc_mask & 0x01 ) {
1576
1608
op [ii ] = npy_sinf (ip [ii ]);
1609
+ }
1577
1610
iglibc_mask = iglibc_mask >> 1 ;
1578
1611
}
1579
1612
}
1580
- ip += num_lanes ;
1613
+ ip += num_lanes * stride ;
1581
1614
op += num_lanes ;
1582
1615
num_remaining_elements -= num_lanes ;
1583
1616
}
0 commit comments