Skip to content

Commit 9cae09c

Browse files
committed
BUG: use strides and process strided arrays using AVX
1 parent bca9628 commit 9cae09c

File tree

2 files changed

+49
-21
lines changed

2 files changed

+49
-21
lines changed

numpy/core/src/umath/loops.c.src

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1651,22 +1651,17 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
16511651
NPY_NO_EXPORT NPY_GCC_OPT_3 void
16521652
FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
16531653
{
1654-
#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
16551654
char str[] = "@func@";
1656-
@ISA@_sincos_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], str);
1655+
if (!run_unary_@isa@_sincos_FLOAT(args, dimensions, steps, str)) {
1656+
UNARY_LOOP {
1657+
#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
1658+
@ISA@_sincos_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0], str);
16571659
#else
1658-
/*
1659-
* This is the path it would take if ISA was runtime detected, but not
1660-
* compiled for. It fixes the error on clang6.0 which fails to compile
1661-
* AVX512F version. Not sure if I like this idea, if during runtime it
1662-
* detects AVX512F, it will end up running the scalar version instead
1663-
* of AVX2.
1664-
*/
1665-
UNARY_LOOP {
1666-
const npy_float in1 = *(npy_float *)ip1;
1667-
*(npy_float *)op1 = @scalarf@(in1);
1668-
}
1660+
const npy_float in1 = *(npy_float *)ip1;
1661+
*(npy_float *)op1 = @scalarf@(in1);
16691662
#endif
1663+
}
1664+
}
16701665
}
16711666

16721667
/**end repeat1**/

numpy/core/src/umath/simd.inc.src

Lines changed: 41 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -164,9 +164,23 @@ run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
164164

165165
#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
166166
static NPY_INLINE void
167-
@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_intp n, char*);
167+
@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp steps, char*);
168168
#endif
169169

170+
static NPY_INLINE int
171+
run_unary_@isa@_sincos_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps, char* mychar)
172+
{
173+
#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
174+
if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
175+
@ISA@_sincos_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0], mychar);
176+
return 1;
177+
}
178+
else
179+
return 0;
180+
#endif
181+
return 0;
182+
}
183+
170184
/**end repeat**/
171185

172186

@@ -1473,9 +1487,13 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
14731487

14741488
#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
14751489
static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
1476-
@ISA@_sincos_FLOAT(npy_float * op, npy_float * ip, const npy_intp array_size,
1477-
char* operation)
1490+
@ISA@_sincos_FLOAT(npy_float * op,
1491+
npy_float * ip,
1492+
const npy_intp array_size,
1493+
const npy_intp steps,
1494+
char* operation)
14781495
{
1496+
const npy_intp stride = steps/sizeof(npy_float);
14791497
const npy_int num_lanes = @BYTES@/sizeof(npy_float);
14801498
npy_int compute_cos = 1;
14811499
npy_float large_number = 71476.0625f;
@@ -1508,13 +1526,26 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
15081526
@mask@ nan_mask, glibc_mask, sine_mask, negate_mask;
15091527
@mask@ load_mask = @isa@_get_full_load_mask();
15101528
npy_intp num_remaining_elements = array_size;
1529+
npy_int indexarr[16];
1530+
for (npy_int ii = 0; ii < 16; ii++) {
1531+
indexarr[ii] = ii*stride;
1532+
}
1533+
@vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
15111534

15121535
while (num_remaining_elements > 0) {
15131536

1514-
if (num_remaining_elements < num_lanes)
1537+
if (num_remaining_elements < num_lanes) {
15151538
load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
15161539
num_lanes);
1517-
@vtype@ x = @isa@_masked_load(load_mask, ip);
1540+
}
1541+
1542+
@vtype@ x;
1543+
if (stride == 1) {
1544+
x = @isa@_masked_load(load_mask, ip);
1545+
}
1546+
else {
1547+
x = @isa@_masked_gather(zero_f, ip, vindex, load_mask);
1548+
}
15181549

15191550
/*
15201551
* For elements outside of this range, Cody-Waite's range reduction
@@ -1565,19 +1596,21 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
15651596
/* process elements using glibc for large elements */
15661597
if (compute_cos) {
15671598
for (int ii = 0; iglibc_mask != 0; ii++) {
1568-
if (iglibc_mask & 0x01)
1599+
if (iglibc_mask & 0x01) {
15691600
op[ii] = npy_cosf(ip[ii]);
1601+
}
15701602
iglibc_mask = iglibc_mask >> 1;
15711603
}
15721604
}
15731605
else {
15741606
for (int ii = 0; iglibc_mask != 0; ii++) {
1575-
if (iglibc_mask & 0x01)
1607+
if (iglibc_mask & 0x01) {
15761608
op[ii] = npy_sinf(ip[ii]);
1609+
}
15771610
iglibc_mask = iglibc_mask >> 1;
15781611
}
15791612
}
1580-
ip += num_lanes;
1613+
ip += num_lanes*stride;
15811614
op += num_lanes;
15821615
num_remaining_elements -= num_lanes;
15831616
}

0 commit comments

Comments
 (0)