
Commit bab617d

ahaldane authored and charris committed
BUG: add.reduce gives wrong results for arrays with funny strides
Fixes numpy#9876
1 parent 4509875 commit bab617d
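
For context, a minimal Python reproduction of gh-9876, mirroring the regression test added below. The 'b' field of the packed struct dtype is viewed with a 12-byte stride, which is not a multiple of its 8-byte itemsize; before this fix the pairwise reduction read from the wrong addresses (the exact wrong value is platform dependent):

    import numpy as np

    # Packed struct: a 4-byte int followed by an 8-byte double, 12 bytes per record,
    # so the 'b' field view strides by 12 bytes -- not a multiple of sizeof(double).
    a = np.zeros(2, dtype=[('a', np.int32), ('b', np.float64)])
    a['a'] = -1                     # fill the int fields with 0xFFFFFFFF bit patterns
    print(a['b'].strides)           # (12,)
    print(np.add.reduce(a['b']))    # expected 0.0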

File tree: 2 files changed, +57 −49 lines

- numpy/core/src/umath/loops.c.src
- numpy/core/tests/test_umath.py


numpy/core/src/umath/loops.c.src

Lines changed: 45 additions & 48 deletions
@@ -1617,13 +1617,13 @@ NPY_NO_EXPORT void
  * when updating also update similar complex floats summation
  */
 static @type@
-pairwise_sum_@TYPE@(@dtype@ *a, npy_uintp n, npy_intp stride)
+pairwise_sum_@TYPE@(char *a, npy_uintp n, npy_intp stride)
 {
     if (n < 8) {
         npy_intp i;
         @type@ res = 0.;
         for (i = 0; i < n; i++) {
-            res += @trf@(a[i * stride]);
+            res += @trf@(*((@dtype@*)(a + i * stride)));
         }
         return res;
     }
@@ -1636,26 +1636,26 @@ pairwise_sum_@TYPE@(@dtype@ *a, npy_uintp n, npy_intp stride)
          * 8 times unroll reduces blocksize to 16 and allows vectorization with
          * avx without changing summation ordering
          */
-        r[0] = @trf@(a[0 * stride]);
-        r[1] = @trf@(a[1 * stride]);
-        r[2] = @trf@(a[2 * stride]);
-        r[3] = @trf@(a[3 * stride]);
-        r[4] = @trf@(a[4 * stride]);
-        r[5] = @trf@(a[5 * stride]);
-        r[6] = @trf@(a[6 * stride]);
-        r[7] = @trf@(a[7 * stride]);
+        r[0] = @trf@(*((@dtype@ *)(a + 0 * stride)));
+        r[1] = @trf@(*((@dtype@ *)(a + 1 * stride)));
+        r[2] = @trf@(*((@dtype@ *)(a + 2 * stride)));
+        r[3] = @trf@(*((@dtype@ *)(a + 3 * stride)));
+        r[4] = @trf@(*((@dtype@ *)(a + 4 * stride)));
+        r[5] = @trf@(*((@dtype@ *)(a + 5 * stride)));
+        r[6] = @trf@(*((@dtype@ *)(a + 6 * stride)));
+        r[7] = @trf@(*((@dtype@ *)(a + 7 * stride)));

         for (i = 8; i < n - (n % 8); i += 8) {
             /* small blocksizes seems to mess with hardware prefetch */
-            NPY_PREFETCH(&a[(i + 512 / sizeof(a[0])) * stride], 0, 3);
-            r[0] += @trf@(a[(i + 0) * stride]);
-            r[1] += @trf@(a[(i + 1) * stride]);
-            r[2] += @trf@(a[(i + 2) * stride]);
-            r[3] += @trf@(a[(i + 3) * stride]);
-            r[4] += @trf@(a[(i + 4) * stride]);
-            r[5] += @trf@(a[(i + 5) * stride]);
-            r[6] += @trf@(a[(i + 6) * stride]);
-            r[7] += @trf@(a[(i + 7) * stride]);
+            NPY_PREFETCH(a + (i + 512 / sizeof(@dtype@)) * stride, 0, 3);
+            r[0] += @trf@(*((@dtype@ *)(a + (i + 0) * stride)));
+            r[1] += @trf@(*((@dtype@ *)(a + (i + 1) * stride)));
+            r[2] += @trf@(*((@dtype@ *)(a + (i + 2) * stride)));
+            r[3] += @trf@(*((@dtype@ *)(a + (i + 3) * stride)));
+            r[4] += @trf@(*((@dtype@ *)(a + (i + 4) * stride)));
+            r[5] += @trf@(*((@dtype@ *)(a + (i + 5) * stride)));
+            r[6] += @trf@(*((@dtype@ *)(a + (i + 6) * stride)));
+            r[7] += @trf@(*((@dtype@ *)(a + (i + 7) * stride)));
         }

         /* accumulate now to avoid stack spills for single peel loop */
@@ -1664,7 +1664,7 @@ pairwise_sum_@TYPE@(@dtype@ *a, npy_uintp n, npy_intp stride)

         /* do non multiple of 8 rest */
         for (; i < n; i++) {
-            res += @trf@(a[i * stride]);
+            res += @trf@(*((@dtype@ *)(a + i * stride)));
         }
         return res;
     }
@@ -1701,8 +1701,7 @@ NPY_NO_EXPORT void
         @type@ * iop1 = (@type@ *)args[0];
         npy_intp n = dimensions[0];

-        *iop1 @OP@= pairwise_sum_@TYPE@((@type@ *)args[1], n,
-                                        steps[1] / (npy_intp)sizeof(@type@));
+        *iop1 @OP@= pairwise_sum_@TYPE@(args[1], n, steps[1]);
 #else
         BINARY_REDUCE_LOOP(@type@) {
             io1 @OP@= *(@type@ *)ip2;
@@ -2058,8 +2057,7 @@ HALF_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
 #if @PW@
         npy_intp n = dimensions[0];

-        io1 @OP@= pairwise_sum_HALF((npy_half *)args[1], n,
-                                    steps[1] / (npy_intp)sizeof(npy_half));
+        io1 @OP@= pairwise_sum_HALF(args[1], n, steps[1]);
 #else
         BINARY_REDUCE_LOOP_INNER {
             io1 @OP@= npy_half_to_float(*(npy_half *)ip2);
@@ -2389,7 +2387,7 @@ HALF_ldexp_long(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UN

 /* similar to pairwise sum of real floats */
 static void
-pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
+pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_uintp n,
                     npy_intp stride)
 {
     assert(n % 2 == 0);
@@ -2398,8 +2396,8 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
         *rr = 0.;
         *ri = 0.;
         for (i = 0; i < n; i += 2) {
-            *rr += a[i * stride + 0];
-            *ri += a[i * stride + 1];
+            *rr += *((@ftype@ *)(a + i * stride + 0));
+            *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@)));
         }
         return;
     }
@@ -2412,26 +2410,26 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
          * 8 times unroll reduces blocksize to 16 and allows vectorization with
          * avx without changing summation ordering
          */
-        r[0] = a[0 * stride];
-        r[1] = a[0 * stride + 1];
-        r[2] = a[2 * stride];
-        r[3] = a[2 * stride + 1];
-        r[4] = a[4 * stride];
-        r[5] = a[4 * stride + 1];
-        r[6] = a[6 * stride];
-        r[7] = a[6 * stride + 1];
+        r[0] = *((@ftype@ *)(a + 0 * stride));
+        r[1] = *((@ftype@ *)(a + 0 * stride + sizeof(@ftype@)));
+        r[2] = *((@ftype@ *)(a + 2 * stride));
+        r[3] = *((@ftype@ *)(a + 2 * stride + sizeof(@ftype@)));
+        r[4] = *((@ftype@ *)(a + 4 * stride));
+        r[5] = *((@ftype@ *)(a + 4 * stride + sizeof(@ftype@)));
+        r[6] = *((@ftype@ *)(a + 6 * stride));
+        r[7] = *((@ftype@ *)(a + 6 * stride + sizeof(@ftype@)));

         for (i = 8; i < n - (n % 8); i += 8) {
             /* small blocksizes seems to mess with hardware prefetch */
-            NPY_PREFETCH(&a[(i + 512 / sizeof(a[0])) * stride], 0, 3);
-            r[0] += a[(i + 0) * stride];
-            r[1] += a[(i + 0) * stride + 1];
-            r[2] += a[(i + 2) * stride];
-            r[3] += a[(i + 2) * stride + 1];
-            r[4] += a[(i + 4) * stride];
-            r[5] += a[(i + 4) * stride + 1];
-            r[6] += a[(i + 6) * stride];
-            r[7] += a[(i + 6) * stride + 1];
+            NPY_PREFETCH(a + (i + 512 / sizeof(@ftype@)) * stride, 0, 3);
+            r[0] += *((@ftype@ *)(a + (i + 0) * stride));
+            r[1] += *((@ftype@ *)(a + (i + 0) * stride + sizeof(@ftype@)));
+            r[2] += *((@ftype@ *)(a + (i + 2) * stride));
+            r[3] += *((@ftype@ *)(a + (i + 2) * stride + sizeof(@ftype@)));
+            r[4] += *((@ftype@ *)(a + (i + 4) * stride));
+            r[5] += *((@ftype@ *)(a + (i + 4) * stride + sizeof(@ftype@)));
+            r[6] += *((@ftype@ *)(a + (i + 6) * stride));
+            r[7] += *((@ftype@ *)(a + (i + 6) * stride + sizeof(@ftype@)));
         }

         /* accumulate now to avoid stack spills for single peel loop */
@@ -2440,8 +2438,8 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,

         /* do non multiple of 8 rest */
         for (; i < n; i+=2) {
-            *rr += a[i * stride + 0];
-            *ri += a[i * stride + 1];
+            *rr += *((@ftype@ *)(a + i * stride + 0));
+            *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@)));
         }
         return;
     }
@@ -2473,8 +2471,7 @@ NPY_NO_EXPORT void
         @ftype@ * oi = ((@ftype@ *)args[0]) + 1;
         @ftype@ rr, ri;

-        pairwise_sum_@TYPE@(&rr, &ri, (@ftype@ *)args[1], n * 2,
-                            steps[1] / (npy_intp)sizeof(@ftype@) / 2);
+        pairwise_sum_@TYPE@(&rr, &ri, args[1], n * 2, steps[1] / 2);
         *or @OP@= rr;
         *oi @OP@= ri;
         return;
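
The caller-side changes above are where the unit change happens: previously the byte step supplied by the ufunc machinery was converted to an element stride by an integer division with sizeof(@type@) (or sizeof(@ftype@) for complex), which silently truncates whenever the step is not a multiple of the itemsize; now the raw byte step is passed through and the loop body does its own char *-based pointer arithmetic. A small sketch of the failing arithmetic, using the same dtype as the new test (illustrative only):

    import numpy as np

    a = np.zeros(2, dtype=[('a', np.int32), ('b', np.float64)])
    b = a['b']
    byte_step = b.strides[0]            # 12: the packed record size
    itemsize = b.dtype.itemsize         # 8
    print(byte_step // itemsize)        # 1  <- the old "element stride"
    print(byte_step % itemsize)         # 4  <- the remainder the old division threw away

With an element stride of 1, consecutive reads land 8 bytes apart instead of 12, so every element after the first is read from the wrong offset.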

numpy/core/tests/test_umath.py

Lines changed: 12 additions & 1 deletion
@@ -214,7 +214,18 @@ def __ne__(self, other):
         assert_equal(np.not_equal(a, a), [True])


-class TestDivision(TestCase):
+class TestAdd(object):
+    def test_reduce_alignment(self):
+        # gh-9876
+        # make sure arrays with weird strides work with the optimizations in
+        # pairwise_sum_@TYPE@. On x86, the 'b' field will count as aligned at a
+        # 4 byte offset, even though its itemsize is 8.
+        a = np.zeros(2, dtype=[('a', np.int32), ('b', np.float64)])
+        a['a'] = -1
+        assert_equal(a['b'].sum(), 0)
+
+
+class TestDivision(object):
     def test_division_int(self):
         # int division should follow Python
         x = np.array([5, 10, 90, 100, -5, -10, -90, -100, -120])
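
To see why the test comment talks about a 4-byte offset, one can inspect where the 'b' field actually sits in memory. The 4-byte offset follows from the packed 12-byte record layout; the absolute addresses (and hence the modulo below) are platform and allocator dependent:

    import numpy as np

    a = np.zeros(2, dtype=[('a', np.int32), ('b', np.float64)])
    base = a.__array_interface__['data'][0]        # address of the whole array
    baddr = a['b'].__array_interface__['data'][0]  # address of the 'b' field view
    print(baddr - base)   # 4: 'b' starts 4 bytes into each 12-byte record
    print(baddr % 8)      # typically 4: not naturally aligned for a float64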
