@@ -1617,13 +1617,13 @@ NPY_NO_EXPORT void
  * when updating also update similar complex floats summation
  */
 static @type@
-pairwise_sum_@TYPE@(@dtype@ *a, npy_uintp n, npy_intp stride)
+pairwise_sum_@TYPE@(char *a, npy_uintp n, npy_intp stride)
 {
     if (n < 8) {
         npy_intp i;
         @type@ res = 0.;
         for (i = 0; i < n; i++) {
-            res += @trf@(a[i * stride]);
+            res += @trf@(*((@dtype@ *)(a + i * stride)));
         }
         return res;
     }
@@ -1636,26 +1636,26 @@ pairwise_sum_@TYPE@(@dtype@ *a, npy_uintp n, npy_intp stride)
          * 8 times unroll reduces blocksize to 16 and allows vectorization with
          * avx without changing summation ordering
          */
-        r[0] = @trf@(a[0 * stride]);
-        r[1] = @trf@(a[1 * stride]);
-        r[2] = @trf@(a[2 * stride]);
-        r[3] = @trf@(a[3 * stride]);
-        r[4] = @trf@(a[4 * stride]);
-        r[5] = @trf@(a[5 * stride]);
-        r[6] = @trf@(a[6 * stride]);
-        r[7] = @trf@(a[7 * stride]);
+        r[0] = @trf@(*((@dtype@ *)(a + 0 * stride)));
+        r[1] = @trf@(*((@dtype@ *)(a + 1 * stride)));
+        r[2] = @trf@(*((@dtype@ *)(a + 2 * stride)));
+        r[3] = @trf@(*((@dtype@ *)(a + 3 * stride)));
+        r[4] = @trf@(*((@dtype@ *)(a + 4 * stride)));
+        r[5] = @trf@(*((@dtype@ *)(a + 5 * stride)));
+        r[6] = @trf@(*((@dtype@ *)(a + 6 * stride)));
+        r[7] = @trf@(*((@dtype@ *)(a + 7 * stride)));
 
         for (i = 8; i < n - (n % 8); i += 8) {
             /* small blocksizes seems to mess with hardware prefetch */
-            NPY_PREFETCH(&a[(i + 512 / sizeof(a[0])) * stride], 0, 3);
-            r[0] += @trf@(a[(i + 0) * stride]);
-            r[1] += @trf@(a[(i + 1) * stride]);
-            r[2] += @trf@(a[(i + 2) * stride]);
-            r[3] += @trf@(a[(i + 3) * stride]);
-            r[4] += @trf@(a[(i + 4) * stride]);
-            r[5] += @trf@(a[(i + 5) * stride]);
-            r[6] += @trf@(a[(i + 6) * stride]);
-            r[7] += @trf@(a[(i + 7) * stride]);
+            NPY_PREFETCH(a + (i + 512 / sizeof(@dtype@)) * stride, 0, 3);
+            r[0] += @trf@(*((@dtype@ *)(a + (i + 0) * stride)));
+            r[1] += @trf@(*((@dtype@ *)(a + (i + 1) * stride)));
+            r[2] += @trf@(*((@dtype@ *)(a + (i + 2) * stride)));
+            r[3] += @trf@(*((@dtype@ *)(a + (i + 3) * stride)));
+            r[4] += @trf@(*((@dtype@ *)(a + (i + 4) * stride)));
+            r[5] += @trf@(*((@dtype@ *)(a + (i + 5) * stride)));
+            r[6] += @trf@(*((@dtype@ *)(a + (i + 6) * stride)));
+            r[7] += @trf@(*((@dtype@ *)(a + (i + 7) * stride)));
         }
 
         /* accumulate now to avoid stack spills for single peel loop */
@@ -1664,7 +1664,7 @@ pairwise_sum_@TYPE@(@dtype@ *a, npy_uintp n, npy_intp stride)
 
         /* do non multiple of 8 rest */
         for (; i < n; i++) {
-            res += @trf@(a[i * stride]);
+            res += @trf@(*((@dtype@ *)(a + i * stride)));
         }
         return res;
     }
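
[Editor's note] The hunks above change the real-valued pairwise_sum_@TYPE@ helper from taking a typed @dtype@ pointer with an element stride to taking a raw char pointer with a byte stride, so every load becomes a byte offset plus a cast. The following is a minimal standalone sketch of that addressing pattern in plain C; the names (byte_stride_sum, data) are illustrative only and not part of the templated NumPy source.

#include <stddef.h>
#include <stdio.h>

/* Illustrative only: sum n elements reached through a byte stride, the same
 * addressing pattern the patch introduces (char *base + i * byte_stride,
 * then cast to the element type). */
static float
byte_stride_sum(char *a, size_t n, ptrdiff_t stride)
{
    float res = 0.f;
    size_t i;
    for (i = 0; i < n; i++) {
        res += *((float *)(a + i * stride));
    }
    return res;
}

int main(void)
{
    float data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    /* contiguous array: the byte stride is sizeof(float) */
    printf("%g\n", byte_stride_sum((char *)data, 8, sizeof(float)));
    /* every other element: byte stride of 2 * sizeof(float) */
    printf("%g\n", byte_stride_sum((char *)data, 4, 2 * sizeof(float)));
    return 0;
}
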
@@ -1701,8 +1701,7 @@ NPY_NO_EXPORT void
         @type@ * iop1 = (@type@ *)args[0];
         npy_intp n = dimensions[0];
 
-        *iop1 @OP@= pairwise_sum_@TYPE@((@type@ *)args[1], n,
-                                        steps[1] / (npy_intp)sizeof(@type@));
+        *iop1 @OP@= pairwise_sum_@TYPE@(args[1], n, steps[1]);
 #else
         BINARY_REDUCE_LOOP(@type@) {
             io1 @OP@= *(@type@ *)ip2;
@@ -2058,8 +2057,7 @@ HALF_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
 #if @PW@
         npy_intp n = dimensions[0];
 
-        io1 @OP@= pairwise_sum_HALF((npy_half *)args[1], n,
-                                    steps[1] / (npy_intp)sizeof(npy_half));
+        io1 @OP@= pairwise_sum_HALF(args[1], n, steps[1]);
 #else
         BINARY_REDUCE_LOOP_INNER {
             io1 @OP@= npy_half_to_float(*(npy_half *)ip2);
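
[Editor's note] On the caller side (the two reduce hunks above), the inner-loop byte step steps[1] is now forwarded unchanged rather than being divided by the element size. A self-contained sketch of that calling convention follows; the names (reduce_add, byte_stride_sum) are made up for the example and are not the actual ufunc loop.

#include <stddef.h>
#include <stdio.h>

/* stand-in for the pairwise routine: byte address in, byte stride in */
static float
byte_stride_sum(char *a, size_t n, ptrdiff_t stride)
{
    float res = 0.f;
    size_t i;
    for (i = 0; i < n; i++) {
        res += *((float *)(a + i * stride));
    }
    return res;
}

/* shape of a ufunc-style reduce caller: args are char **, steps are byte steps */
static void
reduce_add(char **args, ptrdiff_t *dimensions, ptrdiff_t *steps)
{
    float *out = (float *)args[0];
    size_t n = (size_t)dimensions[0];

    /* before the patch: *out += typed_sum((float *)args[1], n,
     *                                     steps[1] / (ptrdiff_t)sizeof(float));
     * after the patch:  the byte pointer and the byte step pass through as-is */
    *out += byte_stride_sum(args[1], n, steps[1]);
}

int main(void)
{
    float in[4] = {1, 2, 3, 4};
    float out = 0.f;
    char *args[2] = {(char *)&out, (char *)in};
    ptrdiff_t dims[1] = {4};
    ptrdiff_t steps[2] = {0, (ptrdiff_t)sizeof(float)};

    reduce_add(args, dims, steps);
    printf("%g\n", out);    /* prints 10 */
    return 0;
}
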
@@ -2389,7 +2387,7 @@ HALF_ldexp_long(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UN
 
 /* similar to pairwise sum of real floats */
 static void
-pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
+pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_uintp n,
                     npy_intp stride)
 {
     assert(n % 2 == 0);
@@ -2398,8 +2396,8 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
         *rr = 0.;
         *ri = 0.;
         for (i = 0; i < n; i += 2) {
-            *rr += a[i * stride + 0];
-            *ri += a[i * stride + 1];
+            *rr += *((@ftype@ *)(a + i * stride + 0));
+            *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@)));
         }
         return;
     }
@@ -2412,26 +2410,26 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
          * 8 times unroll reduces blocksize to 16 and allows vectorization with
          * avx without changing summation ordering
          */
-        r[0] = a[0 * stride];
-        r[1] = a[0 * stride + 1];
-        r[2] = a[2 * stride];
-        r[3] = a[2 * stride + 1];
-        r[4] = a[4 * stride];
-        r[5] = a[4 * stride + 1];
-        r[6] = a[6 * stride];
-        r[7] = a[6 * stride + 1];
+        r[0] = *((@ftype@ *)(a + 0 * stride));
+        r[1] = *((@ftype@ *)(a + 0 * stride + sizeof(@ftype@)));
+        r[2] = *((@ftype@ *)(a + 2 * stride));
+        r[3] = *((@ftype@ *)(a + 2 * stride + sizeof(@ftype@)));
+        r[4] = *((@ftype@ *)(a + 4 * stride));
+        r[5] = *((@ftype@ *)(a + 4 * stride + sizeof(@ftype@)));
+        r[6] = *((@ftype@ *)(a + 6 * stride));
+        r[7] = *((@ftype@ *)(a + 6 * stride + sizeof(@ftype@)));
 
         for (i = 8; i < n - (n % 8); i += 8) {
             /* small blocksizes seems to mess with hardware prefetch */
-            NPY_PREFETCH(&a[(i + 512 / sizeof(a[0])) * stride], 0, 3);
-            r[0] += a[(i + 0) * stride];
-            r[1] += a[(i + 0) * stride + 1];
-            r[2] += a[(i + 2) * stride];
-            r[3] += a[(i + 2) * stride + 1];
-            r[4] += a[(i + 4) * stride];
-            r[5] += a[(i + 4) * stride + 1];
-            r[6] += a[(i + 6) * stride];
-            r[7] += a[(i + 6) * stride + 1];
+            NPY_PREFETCH(a + (i + 512 / sizeof(@ftype@)) * stride, 0, 3);
+            r[0] += *((@ftype@ *)(a + (i + 0) * stride));
+            r[1] += *((@ftype@ *)(a + (i + 0) * stride + sizeof(@ftype@)));
+            r[2] += *((@ftype@ *)(a + (i + 2) * stride));
+            r[3] += *((@ftype@ *)(a + (i + 2) * stride + sizeof(@ftype@)));
+            r[4] += *((@ftype@ *)(a + (i + 4) * stride));
+            r[5] += *((@ftype@ *)(a + (i + 4) * stride + sizeof(@ftype@)));
+            r[6] += *((@ftype@ *)(a + (i + 6) * stride));
+            r[7] += *((@ftype@ *)(a + (i + 6) * stride + sizeof(@ftype@)));
         }
 
         /* accumulate now to avoid stack spills for single peel loop */
@@ -2440,8 +2438,8 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
 
         /* do non multiple of 8 rest */
         for (; i < n; i += 2) {
-            *rr += a[i * stride + 0];
-            *ri += a[i * stride + 1];
+            *rr += *((@ftype@ *)(a + i * stride + 0));
+            *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@)));
         }
         return;
     }
@@ -2473,8 +2471,7 @@ NPY_NO_EXPORT void
         @ftype@ * oi = ((@ftype@ *)args[0]) + 1;
         @ftype@ rr, ri;
 
-        pairwise_sum_@TYPE@(&rr, &ri, (@ftype@ *)args[1], n * 2,
-                            steps[1] / (npy_intp)sizeof(@ftype@) / 2);
+        pairwise_sum_@TYPE@(&rr, &ri, args[1], n * 2, steps[1] / 2);
         *or @OP@= rr;
         *oi @OP@= ri;
         return;
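
[Editor's note] In the complex hunks, the interleaved real/imaginary pair is addressed with byte offsets: the imaginary part now sits at byte offset sizeof(@ftype@) from the real part rather than at index + 1, and the caller passes steps[1] / 2 as the per-scalar byte stride. Below is a hedged, standalone illustration in plain C; names such as complex_byte_stride_sum are invented for the example.

#include <stddef.h>
#include <stdio.h>

/* Illustrative complex accumulation: n counts scalars (two per complex value),
 * stride is the byte step per scalar, and the imaginary part of each pair is
 * read at byte offset sizeof(float) from the real part. */
static void
complex_byte_stride_sum(float *rr, float *ri, char *a, size_t n, ptrdiff_t stride)
{
    size_t i;
    *rr = 0.f;
    *ri = 0.f;
    for (i = 0; i < n; i += 2) {
        *rr += *((float *)(a + i * stride + 0));
        *ri += *((float *)(a + i * stride + sizeof(float)));
    }
}

int main(void)
{
    /* two complex values, (1 + 2i) and (3 + 4i), stored interleaved */
    float data[4] = {1.f, 2.f, 3.f, 4.f};
    float rr, ri;

    /* contiguous complex: 8 bytes per value, so the per-scalar byte stride is 4,
     * mirroring the steps[1] / 2 passed by the patched caller */
    complex_byte_stride_sum(&rr, &ri, (char *)data, 4, (ptrdiff_t)sizeof(float));
    printf("%g %g\n", rr, ri);    /* prints 4 6 */
    return 0;
}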