From b80411d2eaf6eda8075764359ea32b59fcc6c468 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Thu, 17 Dec 2020 14:53:43 +0800 Subject: [PATCH 1/5] Optimize the performance of einsum's submodule sum. --- .../core/src/multiarray/einsum_sumprod.c.src | 504 +++--------------- 1 file changed, 78 insertions(+), 426 deletions(-) diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index 86d5b82fc818..03d2d614cf3c 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -94,6 +94,58 @@ * 0*3# */ +#if !@complex@ +static @temptype@ @name@_sum_of_arr(@type@ **data, npy_intp count) +{ + @temptype@ accum = 0; +#if @NPYV_CHK@ // NPYV check for @type@ + /* Use aligned instructions if possible */ + const int is_aligned = EINSUM_IS_ALIGNED(*data); + const int vstep = npyv_nlanes_@sfx@; + npyv_@sfx@ vaccum = npyv_zero_@sfx@(); + const npy_intp vstepx4 = vstep * 4; + + /**begin repeat1 + * #cond = if(is_aligned), else# + * #ld = loada, load# + * #st = storea, store# + */ + @cond@ { + for (; count >= vstepx4; count -= vstepx4, *data += vstepx4) { + /**begin repeat2 + * #i = 0, 1, 2, 3# + */ + npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(*data + vstep * @i@); + /**end repeat2**/ + npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1); + npyv_@sfx@ a23 = npyv_add_@sfx@(a2, a3); + npyv_@sfx@ a0123 = npyv_add_@sfx@(a01, a23); + vaccum = npyv_add_@sfx@(a0123, vaccum); + } + } + /**end repeat1**/ + for (; count > 0; count -= vstep, *data += vstep) { + npyv_@sfx@ a = npyv_load_tillz_@sfx@(*data, count); + vaccum = npyv_add_@sfx@(a, vaccum); + } + accum = npyv_sum_@sfx@(vaccum); + npyv_cleanup(); +#else +#ifndef NPY_DISABLE_OPTIMIZATION + for (; count > 4; count -= 4, *data += 4) { + const @temptype@ a01 = @from@(**data) + @from@(*(*data + 1)); + const @temptype@ a23 = @from@(*(*data + 2)) + @from@(*(*data + 3)); + accum += a01 + a23; + } +#endif // !NPY_DISABLE_OPTIMIZATION + for (; count > 0; --count, *data += 1) { + accum += @from@(**data); + } +#endif // NPYV check for @type@ + return accum; +} +#endif + /**begin repeat1 * #nop = 1, 2, 3, 1000# * #noplabel = one, two, three, any# @@ -657,139 +709,10 @@ static void @name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr, npy_intp const *NPY_UNUSED(strides), npy_intp count) { - @temptype@ value0 = @from@(*(@type@ *)dataptr[0]); @type@ *data1 = (@type@ *)dataptr[1]; - @temptype@ accum = 0; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - accum += @from@(data1[@i@]); -/**end repeat2**/ - case 0: - *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum); - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data1+@i@)); -/**end repeat2**/ - data1 += 8; - } - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data1)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data1+@i@)); -/**end repeat2**/ - data1 += 8; - } - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@)); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data1+@i@)); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - accum += @from@(data1[@i@]); -/**end repeat2**/ -#endif - data1 += 8; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; + @temptype@ value0 = @from@(*(@type@ *)dataptr[0]); + @temptype@ accum = @name@_sum_of_arr(&data1, count); + *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum); } static void @@ -798,135 +721,8 @@ static void { @type@ *data0 = (@type@ *)dataptr[0]; @temptype@ value1 = @from@(*(@type@ *)dataptr[1]); - @temptype@ accum = 0; - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: - accum += @from@(data0[@i@]); -/**end repeat2**/ - case 0: - *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum * value1); - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@)); -/**end repeat2**/ - data0 += 8; - } - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@)); -/**end repeat2**/ - data0 += 8; - } - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@)); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@)); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - accum += @from@(data0[@i@]); -/**end repeat2**/ -#endif - data0 += 8; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; + @temptype@ accum = @name@_sum_of_arr(&data0, count); + *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value1 * accum); } #elif @nop@ == 3 && !@complex@ @@ -1032,175 +828,31 @@ static void @name@_sum_of_products_contig_outstride0_one(int nop, char **dataptr, npy_intp const *strides, npy_intp count) { -#if @complex@ - @temptype@ accum_re = 0, accum_im = 0; - @temptype@ *data0 = (@temptype@ *)dataptr[0]; -#else - @temptype@ accum = 0; - @type@ *data0 = (@type@ *)dataptr[0]; -#endif - -#if EINSUM_USE_SSE1 && @float32@ - __m128 a, accum_sse = _mm_setzero_ps(); -#elif EINSUM_USE_SSE2 && @float64@ - __m128d a, accum_sse = _mm_setzero_pd(); -#endif - - - NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n", - (int)count); - -/* This is placed before the main loop to make small counts faster */ -finish_after_unrolled_loop: - switch (count) { -/**begin repeat2 - * #i = 6, 5, 4, 3, 2, 1, 0# - */ - case @i@+1: + NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n", (int)count); #if !@complex@ - accum += @from@(data0[@i@]); -#else /* complex */ - accum_re += data0[2*@i@+0]; - accum_im += data0[2*@i@+1]; -#endif -/**end repeat2**/ - case 0: -#if @complex@ - ((@temptype@ *)dataptr[1])[0] += accum_re; - ((@temptype@ *)dataptr[1])[1] += accum_im; + @type@ *data = (@type@ *)dataptr[0]; + @temptype@ accum = @name@_sum_of_arr(&data, count); + *((@type@ *)dataptr[1]) = @to@(accum + @from@(*((@type@ *)dataptr[1]))); #else - *((@type@ *)dataptr[1]) = @to@(accum + - @from@(*((@type@ *)dataptr[1]))); -#endif - return; - } - -#if EINSUM_USE_SSE1 && @float32@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@)); -/**end repeat2**/ - data0 += 8; - } - - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; - } -#elif EINSUM_USE_SSE2 && @float64@ - /* Use aligned instructions if possible */ - if (EINSUM_IS_SSE_ALIGNED(data0)) { - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@)); -/**end repeat2**/ - data0 += 8; - } - - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); - - /* Finish off the loop */ - goto finish_after_unrolled_loop; + @temptype@ accum_re = 0, accum_im = 0; + @temptype@ *data0 = (@temptype@ *)dataptr[0]; +#ifndef NPY_DISABLE_OPTIMIZATION + for (; count > 4; count -= 4, data0 += 4*2) { + const @temptype@ re01 = data0[0] + data0[2]; + const @temptype@ re23 = data0[4] + data0[6]; + const @temptype@ im13 = data0[1] + data0[3]; + const @temptype@ im57 = data0[5] + data0[7]; + accum_re += re01 + re23; + accum_im += im13 + im57; } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; - -#if EINSUM_USE_SSE1 && @float32@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 4# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@)); -/**end repeat2**/ -#elif EINSUM_USE_SSE2 && @float64@ - _mm_prefetch(data0 + 512, _MM_HINT_T0); - -/**begin repeat2 - * #i = 0, 2, 4, 6# - */ - /* - * NOTE: This accumulation changes the order, so will likely - * produce slightly different results. - */ - accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@)); -/**end repeat2**/ -#else -/**begin repeat2 - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ -# if !@complex@ - accum += @from@(data0[@i@]); -# else /* complex */ - accum_re += data0[2*@i@+0]; - accum_im += data0[2*@i@+1]; -# endif -/**end repeat2**/ -#endif - -#if !@complex@ - data0 += 8; -#else - data0 += 8*2; -#endif +#endif // !NPY_DISABLE_OPTIMIZATION + for (; count > 0; --count, data0 += 2) { + accum_re += data0[0]; + accum_im += data0[1]; } - -#if EINSUM_USE_SSE1 && @float32@ - /* Add the four SSE values and put in accum */ - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); - accum_sse = _mm_add_ps(a, accum_sse); - a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); - accum_sse = _mm_add_ps(a, accum_sse); - _mm_store_ss(&accum, accum_sse); -#elif EINSUM_USE_SSE2 && @float64@ - /* Add the two SSE2 values and put in accum */ - a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); - accum_sse = _mm_add_pd(a, accum_sse); - _mm_store_sd(&accum, accum_sse); -#endif - - /* Finish off the loop */ - goto finish_after_unrolled_loop; + ((@temptype@ *)dataptr[1])[0] += accum_re; + ((@temptype@ *)dataptr[1])[1] += accum_im; +#endif // !@complex@ } #endif /* @nop@ == 1 */ From f3608c32c2d225dfc0ee0107d6310e6f1caef75f Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Thu, 17 Dec 2020 15:05:18 +0800 Subject: [PATCH 2/5] add NPY_GCC_OPT_3 option. --- numpy/core/src/multiarray/einsum_sumprod.c.src | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index 03d2d614cf3c..5c45e5ec7890 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -705,7 +705,7 @@ static NPY_GCC_OPT_3 void *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum); } -static void +static NPY_GCC_OPT_3 void @name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr, npy_intp const *NPY_UNUSED(strides), npy_intp count) { @@ -715,7 +715,7 @@ static void *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum); } -static void +static NPY_GCC_OPT_3 void @name@_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr, npy_intp const *NPY_UNUSED(strides), npy_intp count) { @@ -824,7 +824,7 @@ static void #if @nop@ == 1 -static void +static NPY_GCC_OPT_3 void @name@_sum_of_products_contig_outstride0_one(int nop, char **dataptr, npy_intp const *strides, npy_intp count) { From 1c0ea7369d908f6c740e6f5fe95e0534938d98d3 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Fri, 18 Dec 2020 11:42:10 +0800 Subject: [PATCH 3/5] add missing opt 3 flag. --- numpy/core/src/multiarray/einsum_sumprod.c.src | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index 5c45e5ec7890..a3d2b127f938 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -95,7 +95,7 @@ */ #if !@complex@ -static @temptype@ @name@_sum_of_arr(@type@ **data, npy_intp count) +static NPY_GCC_OPT_3 @temptype@ @name@_sum_of_arr(@type@ **data, npy_intp count) { @temptype@ accum = 0; #if @NPYV_CHK@ // NPYV check for @type@ From 9ed8d5dd3dfe7b7945ec2ac5d7c37f1b8c4e4ab7 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Wed, 23 Dec 2020 11:46:59 +0800 Subject: [PATCH 4/5] passing pointer not the address. --- .../core/src/multiarray/einsum_sumprod.c.src | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index a3d2b127f938..88b73759837f 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -95,12 +95,12 @@ */ #if !@complex@ -static NPY_GCC_OPT_3 @temptype@ @name@_sum_of_arr(@type@ **data, npy_intp count) +static NPY_GCC_OPT_3 @temptype@ @name@_sum_of_arr(@type@ *data, npy_intp count) { @temptype@ accum = 0; #if @NPYV_CHK@ // NPYV check for @type@ /* Use aligned instructions if possible */ - const int is_aligned = EINSUM_IS_ALIGNED(*data); + const int is_aligned = EINSUM_IS_ALIGNED(data); const int vstep = npyv_nlanes_@sfx@; npyv_@sfx@ vaccum = npyv_zero_@sfx@(); const npy_intp vstepx4 = vstep * 4; @@ -111,11 +111,11 @@ static NPY_GCC_OPT_3 @temptype@ @name@_sum_of_arr(@type@ **data, npy_intp count) * #st = storea, store# */ @cond@ { - for (; count >= vstepx4; count -= vstepx4, *data += vstepx4) { + for (; count >= vstepx4; count -= vstepx4, data += vstepx4) { /**begin repeat2 * #i = 0, 1, 2, 3# */ - npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(*data + vstep * @i@); + npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data + vstep * @i@); /**end repeat2**/ npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1); npyv_@sfx@ a23 = npyv_add_@sfx@(a2, a3); @@ -124,22 +124,22 @@ static NPY_GCC_OPT_3 @temptype@ @name@_sum_of_arr(@type@ **data, npy_intp count) } } /**end repeat1**/ - for (; count > 0; count -= vstep, *data += vstep) { - npyv_@sfx@ a = npyv_load_tillz_@sfx@(*data, count); + for (; count > 0; count -= vstep, data += vstep) { + npyv_@sfx@ a = npyv_load_tillz_@sfx@(data, count); vaccum = npyv_add_@sfx@(a, vaccum); } accum = npyv_sum_@sfx@(vaccum); npyv_cleanup(); #else #ifndef NPY_DISABLE_OPTIMIZATION - for (; count > 4; count -= 4, *data += 4) { - const @temptype@ a01 = @from@(**data) + @from@(*(*data + 1)); - const @temptype@ a23 = @from@(*(*data + 2)) + @from@(*(*data + 3)); + for (; count > 4; count -= 4, data += 4) { + const @temptype@ a01 = @from@(*data) + @from@(*(data + 1)); + const @temptype@ a23 = @from@(*(data + 2)) + @from@(*(data + 3)); accum += a01 + a23; } #endif // !NPY_DISABLE_OPTIMIZATION - for (; count > 0; --count, *data += 1) { - accum += @from@(**data); + for (; count > 0; --count, data += 1) { + accum += @from@(*data); } #endif // NPYV check for @type@ return accum; @@ -711,7 +711,7 @@ static NPY_GCC_OPT_3 void { @type@ *data1 = (@type@ *)dataptr[1]; @temptype@ value0 = @from@(*(@type@ *)dataptr[0]); - @temptype@ accum = @name@_sum_of_arr(&data1, count); + @temptype@ accum = @name@_sum_of_arr(data1, count); *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum); } @@ -721,7 +721,7 @@ static NPY_GCC_OPT_3 void { @type@ *data0 = (@type@ *)dataptr[0]; @temptype@ value1 = @from@(*(@type@ *)dataptr[1]); - @temptype@ accum = @name@_sum_of_arr(&data0, count); + @temptype@ accum = @name@_sum_of_arr(data0, count); *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value1 * accum); } @@ -831,7 +831,7 @@ static NPY_GCC_OPT_3 void NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n", (int)count); #if !@complex@ @type@ *data = (@type@ *)dataptr[0]; - @temptype@ accum = @name@_sum_of_arr(&data, count); + @temptype@ accum = @name@_sum_of_arr(data, count); *((@type@ *)dataptr[1]) = @to@(accum + @from@(*((@type@ *)dataptr[1]))); #else @temptype@ accum_re = 0, accum_im = 0; From c016f636e175910a2a419bbbfed2944911a10d64 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Wed, 23 Dec 2020 17:15:59 +0800 Subject: [PATCH 5/5] simplify the index related code. --- numpy/core/src/multiarray/einsum_sumprod.c.src | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index 88b73759837f..d1b76de4e437 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -133,12 +133,12 @@ static NPY_GCC_OPT_3 @temptype@ @name@_sum_of_arr(@type@ *data, npy_intp count) #else #ifndef NPY_DISABLE_OPTIMIZATION for (; count > 4; count -= 4, data += 4) { - const @temptype@ a01 = @from@(*data) + @from@(*(data + 1)); - const @temptype@ a23 = @from@(*(data + 2)) + @from@(*(data + 3)); + const @temptype@ a01 = @from@(*data) + @from@(data[1]); + const @temptype@ a23 = @from@(data[2]) + @from@(data[3]); accum += a01 + a23; } #endif // !NPY_DISABLE_OPTIMIZATION - for (; count > 0; --count, data += 1) { + for (; count > 0; --count, data++) { accum += @from@(*data); } #endif // NPYV check for @type@