From b80411d2eaf6eda8075764359ea32b59fcc6c468 Mon Sep 17 00:00:00 2001
From: Qiyu8 <fangchunlin@huawei.com>
Date: Thu, 17 Dec 2020 14:53:43 +0800
Subject: [PATCH 1/5] Optimize the performance of einsum's submodule sum.

---
 .../core/src/multiarray/einsum_sumprod.c.src  | 504 +++---------------
 1 file changed, 78 insertions(+), 426 deletions(-)

diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src
index 86d5b82fc818..03d2d614cf3c 100644
--- a/numpy/core/src/multiarray/einsum_sumprod.c.src
+++ b/numpy/core/src/multiarray/einsum_sumprod.c.src
@@ -94,6 +94,58 @@
  *             0*3#
  */
 
+#if !@complex@
+static @temptype@ @name@_sum_of_arr(@type@ **data, npy_intp count)
+{
+    @temptype@ accum = 0;
+#if @NPYV_CHK@ // NPYV check for @type@
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(*data);
+    const int vstep = npyv_nlanes_@sfx@;
+    npyv_@sfx@ vaccum = npyv_zero_@sfx@();
+    const npy_intp vstepx4 = vstep * 4;
+
+    /**begin repeat1
+     * #cond = if(is_aligned), else#
+     * #ld = loada, load#
+     * #st = storea, store#
+     */
+    @cond@ {
+        for (; count >= vstepx4; count -= vstepx4, *data += vstepx4) {
+            /**begin repeat2
+             * #i = 0, 1, 2, 3#
+             */
+            npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(*data + vstep * @i@);
+            /**end repeat2**/
+            npyv_@sfx@ a01   = npyv_add_@sfx@(a0, a1);
+            npyv_@sfx@ a23   = npyv_add_@sfx@(a2, a3);
+            npyv_@sfx@ a0123 = npyv_add_@sfx@(a01, a23);
+                      vaccum = npyv_add_@sfx@(a0123, vaccum);
+        }
+    }
+    /**end repeat1**/
+    for (; count > 0; count -= vstep, *data += vstep) {
+        npyv_@sfx@ a = npyv_load_tillz_@sfx@(*data, count);
+        vaccum = npyv_add_@sfx@(a, vaccum);
+    }
+    accum = npyv_sum_@sfx@(vaccum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, *data += 4) {
+        const @temptype@ a01 = @from@(**data) + @from@(*(*data + 1));
+        const @temptype@ a23 = @from@(*(*data + 2)) + @from@(*(*data + 3));
+        accum +=  a01 + a23;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, *data += 1) {
+        accum += @from@(**data);
+    }
+#endif // NPYV check for @type@
+    return accum;
+}
+#endif
+
 /**begin repeat1
  * #nop = 1, 2, 3, 1000#
  * #noplabel = one, two, three, any#
@@ -657,139 +709,10 @@ static void
 @name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                 npy_intp const *NPY_UNUSED(strides), npy_intp count)
 {
-    @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
     @type@ *data1 = (@type@ *)dataptr[1];
-    @temptype@ accum = 0;
-
-#if EINSUM_USE_SSE1 && @float32@
-    __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
-    __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
-    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n",
-                                                    (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
-    switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
-        case @i@+1:
-            accum += @from@(data1[@i@]);
-/**end repeat2**/
-        case 0:
-            *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum);
-            return;
-    }
-
-#if EINSUM_USE_SSE1 && @float32@
-    /* Use aligned instructions if possible */
-    if (EINSUM_IS_SSE_ALIGNED(data1)) {
-        /* Unroll the loop by 8 */
-        while (count >= 8) {
-            count -= 8;
-
-/**begin repeat2
- * #i = 0, 4#
- */
-            /*
-             * NOTE: This accumulation changes the order, so will likely
-             *       produce slightly different results.
-             */
-            accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data1+@i@));
-/**end repeat2**/
-            data1 += 8;
-        }
-        /* Add the four SSE values and put in accum */
-        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
-        accum_sse = _mm_add_ps(a, accum_sse);
-        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
-        accum_sse = _mm_add_ps(a, accum_sse);
-        _mm_store_ss(&accum, accum_sse);
-
-        /* Finish off the loop */
-        goto finish_after_unrolled_loop;
-    }
-#elif EINSUM_USE_SSE2 && @float64@
-    /* Use aligned instructions if possible */
-    if (EINSUM_IS_SSE_ALIGNED(data1)) {
-        /* Unroll the loop by 8 */
-        while (count >= 8) {
-            count -= 8;
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
-            /*
-             * NOTE: This accumulation changes the order, so will likely
-             *       produce slightly different results.
-             */
-            accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data1+@i@));
-/**end repeat2**/
-            data1 += 8;
-        }
-        /* Add the two SSE2 values and put in accum */
-        a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
-        accum_sse = _mm_add_pd(a, accum_sse);
-        _mm_store_sd(&accum, accum_sse);
-
-        /* Finish off the loop */
-        goto finish_after_unrolled_loop;
-    }
-#endif
-
-    /* Unroll the loop by 8 */
-    while (count >= 8) {
-        count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
-/**begin repeat2
- * #i = 0, 4#
- */
-        /*
-         * NOTE: This accumulation changes the order, so will likely
-         *       produce slightly different results.
-         */
-        accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@));
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
-        /*
-         * NOTE: This accumulation changes the order, so will likely
-         *       produce slightly different results.
-         */
-        accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data1+@i@));
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
-        accum += @from@(data1[@i@]);
-/**end repeat2**/
-#endif
-        data1 += 8;
-    }
-
-#if EINSUM_USE_SSE1 && @float32@
-    /* Add the four SSE values and put in accum */
-    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
-    accum_sse = _mm_add_ps(a, accum_sse);
-    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
-    accum_sse = _mm_add_ps(a, accum_sse);
-    _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
-    /* Add the two SSE2 values and put in accum */
-    a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
-    accum_sse = _mm_add_pd(a, accum_sse);
-    _mm_store_sd(&accum, accum_sse);
-#endif
-
-    /* Finish off the loop */
-    goto finish_after_unrolled_loop;
+    @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
+    @temptype@ accum = @name@_sum_of_arr(&data1, count);
+    *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum);
 }
 
 static void
@@ -798,135 +721,8 @@ static void
 {
     @type@ *data0 = (@type@ *)dataptr[0];
     @temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
-    @temptype@ accum = 0;
-
-#if EINSUM_USE_SSE1 && @float32@
-    __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
-    __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
-    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n",
-                                                    (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
-    switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
-        case @i@+1:
-            accum += @from@(data0[@i@]);
-/**end repeat2**/
-        case 0:
-            *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum * value1);
-            return;
-    }
-
-#if EINSUM_USE_SSE1 && @float32@
-    /* Use aligned instructions if possible */
-    if (EINSUM_IS_SSE_ALIGNED(data0)) {
-        /* Unroll the loop by 8 */
-        while (count >= 8) {
-            count -= 8;
-
-/**begin repeat2
- * #i = 0, 4#
- */
-            /*
-             * NOTE: This accumulation changes the order, so will likely
-             *       produce slightly different results.
-             */
-            accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
-/**end repeat2**/
-            data0 += 8;
-        }
-        /* Add the four SSE values and put in accum */
-        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
-        accum_sse = _mm_add_ps(a, accum_sse);
-        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
-        accum_sse = _mm_add_ps(a, accum_sse);
-        _mm_store_ss(&accum, accum_sse);
-        /* Finish off the loop */
-        goto finish_after_unrolled_loop;
-    }
-#elif EINSUM_USE_SSE2 && @float64@
-    /* Use aligned instructions if possible */
-    if (EINSUM_IS_SSE_ALIGNED(data0)) {
-        /* Unroll the loop by 8 */
-        while (count >= 8) {
-            count -= 8;
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
-            /*
-             * NOTE: This accumulation changes the order, so will likely
-             *       produce slightly different results.
-             */
-            accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@));
-/**end repeat2**/
-            data0 += 8;
-        }
-        /* Add the two SSE2 values and put in accum */
-        a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
-        accum_sse = _mm_add_pd(a, accum_sse);
-        _mm_store_sd(&accum, accum_sse);
-        /* Finish off the loop */
-        goto finish_after_unrolled_loop;
-    }
-#endif
-
-    /* Unroll the loop by 8 */
-    while (count >= 8) {
-        count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
-/**begin repeat2
- * #i = 0, 4#
- */
-        /*
-         * NOTE: This accumulation changes the order, so will likely
-         *       produce slightly different results.
-         */
-        accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
-        /*
-         * NOTE: This accumulation changes the order, so will likely
-         *       produce slightly different results.
-         */
-        accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
-        accum += @from@(data0[@i@]);
-/**end repeat2**/
-#endif
-        data0 += 8;
-    }
-
-#if EINSUM_USE_SSE1 && @float32@
-    /* Add the four SSE values and put in accum */
-    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
-    accum_sse = _mm_add_ps(a, accum_sse);
-    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
-    accum_sse = _mm_add_ps(a, accum_sse);
-    _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
-    /* Add the two SSE2 values and put in accum */
-    a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
-    accum_sse = _mm_add_pd(a, accum_sse);
-    _mm_store_sd(&accum, accum_sse);
-#endif
-
-    /* Finish off the loop */
-    goto finish_after_unrolled_loop;
+    @temptype@ accum = @name@_sum_of_arr(&data0, count);
+    *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value1 * accum);
 }
 
 #elif @nop@ == 3 && !@complex@
@@ -1032,175 +828,31 @@ static void
 @name@_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                 npy_intp const *strides, npy_intp count)
 {
-#if @complex@
-    @temptype@ accum_re = 0, accum_im = 0;
-    @temptype@ *data0 = (@temptype@ *)dataptr[0];
-#else
-    @temptype@ accum = 0;
-    @type@ *data0 = (@type@ *)dataptr[0];
-#endif
-
-#if EINSUM_USE_SSE1 && @float32@
-    __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
-    __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
-
-    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n",
-                                                    (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
-    switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
-        case @i@+1:
+    NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
 #if !@complex@
-            accum += @from@(data0[@i@]);
-#else /* complex */
-            accum_re += data0[2*@i@+0];
-            accum_im += data0[2*@i@+1];
-#endif
-/**end repeat2**/
-        case 0:
-#if @complex@
-            ((@temptype@ *)dataptr[1])[0] += accum_re;
-            ((@temptype@ *)dataptr[1])[1] += accum_im;
+    @type@ *data = (@type@ *)dataptr[0];
+    @temptype@ accum = @name@_sum_of_arr(&data, count);
+    *((@type@ *)dataptr[1]) = @to@(accum + @from@(*((@type@ *)dataptr[1])));
 #else
-            *((@type@ *)dataptr[1]) = @to@(accum +
-                                    @from@(*((@type@ *)dataptr[1])));
-#endif
-            return;
-    }
-
-#if EINSUM_USE_SSE1 && @float32@
-    /* Use aligned instructions if possible */
-    if (EINSUM_IS_SSE_ALIGNED(data0)) {
-        /* Unroll the loop by 8 */
-        while (count >= 8) {
-            count -= 8;
-
-            _mm_prefetch(data0 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 4#
- */
-            /*
-             * NOTE: This accumulation changes the order, so will likely
-             *       produce slightly different results.
-             */
-            accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
-/**end repeat2**/
-            data0 += 8;
-        }
-
-        /* Add the four SSE values and put in accum */
-        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
-        accum_sse = _mm_add_ps(a, accum_sse);
-        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
-        accum_sse = _mm_add_ps(a, accum_sse);
-        _mm_store_ss(&accum, accum_sse);
-
-        /* Finish off the loop */
-        goto finish_after_unrolled_loop;
-    }
-#elif EINSUM_USE_SSE2 && @float64@
-    /* Use aligned instructions if possible */
-    if (EINSUM_IS_SSE_ALIGNED(data0)) {
-        /* Unroll the loop by 8 */
-        while (count >= 8) {
-            count -= 8;
-
-            _mm_prefetch(data0 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
-            /*
-             * NOTE: This accumulation changes the order, so will likely
-             *       produce slightly different results.
-             */
-            accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@));
-/**end repeat2**/
-            data0 += 8;
-        }
-
-        /* Add the two SSE2 values and put in accum */
-        a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
-        accum_sse = _mm_add_pd(a, accum_sse);
-        _mm_store_sd(&accum, accum_sse);
-
-        /* Finish off the loop */
-        goto finish_after_unrolled_loop;
+    @temptype@ accum_re = 0, accum_im = 0;
+    @temptype@ *data0 = (@temptype@ *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) {
+        const @temptype@ re01 = data0[0] + data0[2];
+        const @temptype@ re23 = data0[4] + data0[6];
+        const @temptype@ im13 = data0[1] + data0[3];
+        const @temptype@ im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
     }
-#endif
-
-    /* Unroll the loop by 8 */
-    while (count >= 8) {
-        count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
-        _mm_prefetch(data0 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 4#
- */
-        /*
-         * NOTE: This accumulation changes the order, so will likely
-         *       produce slightly different results.
-         */
-        accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
-        _mm_prefetch(data0 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
-        /*
-         * NOTE: This accumulation changes the order, so will likely
-         *       produce slightly different results.
-         */
-        accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
-#  if !@complex@
-        accum += @from@(data0[@i@]);
-#  else /* complex */
-        accum_re += data0[2*@i@+0];
-        accum_im += data0[2*@i@+1];
-#  endif
-/**end repeat2**/
-#endif
-
-#if !@complex@
-        data0 += 8;
-#else
-        data0 += 8*2;
-#endif
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
     }
-
-#if EINSUM_USE_SSE1 && @float32@
-    /* Add the four SSE values and put in accum */
-    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
-    accum_sse = _mm_add_ps(a, accum_sse);
-    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
-    accum_sse = _mm_add_ps(a, accum_sse);
-    _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
-    /* Add the two SSE2 values and put in accum */
-    a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
-    accum_sse = _mm_add_pd(a, accum_sse);
-    _mm_store_sd(&accum, accum_sse);
-#endif
-
-    /* Finish off the loop */
-    goto finish_after_unrolled_loop;
+    ((@temptype@ *)dataptr[1])[0] += accum_re;
+    ((@temptype@ *)dataptr[1])[1] += accum_im;
+#endif // !@complex@
 }
 
 #endif /* @nop@ == 1 */

From f3608c32c2d225dfc0ee0107d6310e6f1caef75f Mon Sep 17 00:00:00 2001
From: Qiyu8 <fangchunlin@huawei.com>
Date: Thu, 17 Dec 2020 15:05:18 +0800
Subject: [PATCH 2/5] Mark the refactored sum-of-products kernels with NPY_GCC_OPT_3.

---
 numpy/core/src/multiarray/einsum_sumprod.c.src | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src
index 03d2d614cf3c..5c45e5ec7890 100644
--- a/numpy/core/src/multiarray/einsum_sumprod.c.src
+++ b/numpy/core/src/multiarray/einsum_sumprod.c.src
@@ -705,7 +705,7 @@ static NPY_GCC_OPT_3 void
     *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum);
 }
 
-static void
+static NPY_GCC_OPT_3 void
 @name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                 npy_intp const *NPY_UNUSED(strides), npy_intp count)
 {
@@ -715,7 +715,7 @@ static void
     *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum);
 }
 
-static void
+static NPY_GCC_OPT_3 void
 @name@_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                 npy_intp const *NPY_UNUSED(strides), npy_intp count)
 {
@@ -824,7 +824,7 @@ static void
 
 #if @nop@ == 1
 
-static void
+static NPY_GCC_OPT_3 void
 @name@_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                 npy_intp const *strides, npy_intp count)
 {

From 1c0ea7369d908f6c740e6f5fe95e0534938d98d3 Mon Sep 17 00:00:00 2001
From: Qiyu8 <fangchunlin@huawei.com>
Date: Fri, 18 Dec 2020 11:42:10 +0800
Subject: [PATCH 3/5] Add the missing NPY_GCC_OPT_3 to the sum_of_arr helper.

---
 numpy/core/src/multiarray/einsum_sumprod.c.src | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src
index 5c45e5ec7890..a3d2b127f938 100644
--- a/numpy/core/src/multiarray/einsum_sumprod.c.src
+++ b/numpy/core/src/multiarray/einsum_sumprod.c.src
@@ -95,7 +95,7 @@
  */
 
 #if !@complex@
-static @temptype@ @name@_sum_of_arr(@type@ **data, npy_intp count)
+static NPY_GCC_OPT_3 @temptype@ @name@_sum_of_arr(@type@ **data, npy_intp count)
 {
     @temptype@ accum = 0;
 #if @NPYV_CHK@ // NPYV check for @type@

From 9ed8d5dd3dfe7b7945ec2ac5d7c37f1b8c4e4ab7 Mon Sep 17 00:00:00 2001
From: Qiyu8 <fangchunlin@huawei.com>
Date: Wed, 23 Dec 2020 11:46:59 +0800
Subject: [PATCH 4/5] Pass the data pointer to sum_of_arr by value, not by address.

---
 .../core/src/multiarray/einsum_sumprod.c.src  | 28 +++++++++----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src
index a3d2b127f938..88b73759837f 100644
--- a/numpy/core/src/multiarray/einsum_sumprod.c.src
+++ b/numpy/core/src/multiarray/einsum_sumprod.c.src
@@ -95,12 +95,12 @@
  */
 
 #if !@complex@
-static NPY_GCC_OPT_3 @temptype@ @name@_sum_of_arr(@type@ **data, npy_intp count)
+static NPY_GCC_OPT_3 @temptype@ @name@_sum_of_arr(@type@ *data, npy_intp count)
 {
     @temptype@ accum = 0;
 #if @NPYV_CHK@ // NPYV check for @type@
     /* Use aligned instructions if possible */
-    const int is_aligned = EINSUM_IS_ALIGNED(*data);
+    const int is_aligned = EINSUM_IS_ALIGNED(data);
     const int vstep = npyv_nlanes_@sfx@;
     npyv_@sfx@ vaccum = npyv_zero_@sfx@();
     const npy_intp vstepx4 = vstep * 4;
@@ -111,11 +111,11 @@ static NPY_GCC_OPT_3 @temptype@ @name@_sum_of_arr(@type@ **data, npy_intp count)
      * #st = storea, store#
      */
     @cond@ {
-        for (; count >= vstepx4; count -= vstepx4, *data += vstepx4) {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
             /**begin repeat2
              * #i = 0, 1, 2, 3#
              */
-            npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(*data + vstep * @i@);
+            npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data + vstep * @i@);
             /**end repeat2**/
             npyv_@sfx@ a01   = npyv_add_@sfx@(a0, a1);
             npyv_@sfx@ a23   = npyv_add_@sfx@(a2, a3);
@@ -124,22 +124,22 @@ static NPY_GCC_OPT_3 @temptype@ @name@_sum_of_arr(@type@ **data, npy_intp count)
         }
     }
     /**end repeat1**/
-    for (; count > 0; count -= vstep, *data += vstep) {
-        npyv_@sfx@ a = npyv_load_tillz_@sfx@(*data, count);
+    for (; count > 0; count -= vstep, data += vstep) {
+        npyv_@sfx@ a = npyv_load_tillz_@sfx@(data, count);
         vaccum = npyv_add_@sfx@(a, vaccum);
     }
     accum = npyv_sum_@sfx@(vaccum);
     npyv_cleanup();
 #else
 #ifndef NPY_DISABLE_OPTIMIZATION
-    for (; count > 4; count -= 4, *data += 4) {
-        const @temptype@ a01 = @from@(**data) + @from@(*(*data + 1));
-        const @temptype@ a23 = @from@(*(*data + 2)) + @from@(*(*data + 3));
+    for (; count > 4; count -= 4, data += 4) {
+        const @temptype@ a01 = @from@(*data) + @from@(*(data + 1));
+        const @temptype@ a23 = @from@(*(data + 2)) + @from@(*(data + 3));
         accum +=  a01 + a23;
     }
 #endif // !NPY_DISABLE_OPTIMIZATION
-    for (; count > 0; --count, *data += 1) {
-        accum += @from@(**data);
+    for (; count > 0; --count, data += 1) {
+        accum += @from@(*data);
     }
 #endif // NPYV check for @type@
     return accum;
@@ -711,7 +711,7 @@ static NPY_GCC_OPT_3 void
 {
     @type@ *data1 = (@type@ *)dataptr[1];
     @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
-    @temptype@ accum = @name@_sum_of_arr(&data1, count);
+    @temptype@ accum = @name@_sum_of_arr(data1, count);
     *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum);
 }
 
@@ -721,7 +721,7 @@ static NPY_GCC_OPT_3 void
 {
     @type@ *data0 = (@type@ *)dataptr[0];
     @temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
-    @temptype@ accum = @name@_sum_of_arr(&data0, count);
+    @temptype@ accum = @name@_sum_of_arr(data0, count);
     *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value1 * accum);
 }
 
@@ -831,7 +831,7 @@ static NPY_GCC_OPT_3 void
     NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
 #if !@complex@
     @type@ *data = (@type@ *)dataptr[0];
-    @temptype@ accum = @name@_sum_of_arr(&data, count);
+    @temptype@ accum = @name@_sum_of_arr(data, count);
     *((@type@ *)dataptr[1]) = @to@(accum + @from@(*((@type@ *)dataptr[1])));
 #else
     @temptype@ accum_re = 0, accum_im = 0;

From c016f636e175910a2a419bbbfed2944911a10d64 Mon Sep 17 00:00:00 2001
From: Qiyu8 <fangchunlin@huawei.com>
Date: Wed, 23 Dec 2020 17:15:59 +0800
Subject: [PATCH 5/5] Simplify pointer indexing in the non-SIMD sum loops.

---
 numpy/core/src/multiarray/einsum_sumprod.c.src | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src
index 88b73759837f..d1b76de4e437 100644
--- a/numpy/core/src/multiarray/einsum_sumprod.c.src
+++ b/numpy/core/src/multiarray/einsum_sumprod.c.src
@@ -133,12 +133,12 @@ static NPY_GCC_OPT_3 @temptype@ @name@_sum_of_arr(@type@ *data, npy_intp count)
 #else
 #ifndef NPY_DISABLE_OPTIMIZATION
     for (; count > 4; count -= 4, data += 4) {
-        const @temptype@ a01 = @from@(*data) + @from@(*(data + 1));
-        const @temptype@ a23 = @from@(*(data + 2)) + @from@(*(data + 3));
+        const @temptype@ a01 = @from@(*data) + @from@(data[1]);
+        const @temptype@ a23 = @from@(data[2]) + @from@(data[3]);
         accum +=  a01 + a23;
     }
 #endif // !NPY_DISABLE_OPTIMIZATION
-    for (; count > 0; --count, data += 1) {
+    for (; count > 0; --count, data++) {
         accum += @from@(*data);
     }
 #endif // NPYV check for @type@