From 0d3bd92b07d605f604ae62b743192b2683bee3c3 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Tue, 6 Apr 2021 19:41:13 +0530 Subject: [PATCH 01/10] SIMD: Removed umath code --- numpy/core/src/umath/loops.c.src | 86 -------------------------------- 1 file changed, 86 deletions(-) diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 04665dc5296e..683bd0178bf0 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -843,92 +843,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0)); } -/* Libdivide only supports 32 and 64 bit types - * We try to pick the best possible one */ -#if NPY_BITSOF_@TYPE@ <= 32 -#define libdivide_@type@_t libdivide_s32_t -#define libdivide_@type@_gen libdivide_s32_gen -#define libdivide_@type@_do libdivide_s32_do -#else -#define libdivide_@type@_t libdivide_s64_t -#define libdivide_@type@_gen libdivide_s64_gen -#define libdivide_@type@_do libdivide_s64_do -#endif - -NPY_NO_EXPORT void -@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - BINARY_DEFS - - /* When the divisor is a constant, use libdivide for faster division */ - if (steps[1] == 0) { - /* In case of empty array, just return */ - if (n == 0) { - return; - } - - const @type@ in2 = *(@type@ *)ip2; - - /* If divisor is 0, we need not compute anything */ - if (in2 == 0) { - npy_set_floatstatus_divbyzero(); - BINARY_LOOP_SLIDING { - *((@type@ *)op1) = 0; - } - } - else { - struct libdivide_@type@_t fast_d = libdivide_@type@_gen(in2); - BINARY_LOOP_SLIDING { - const @type@ in1 = *(@type@ *)ip1; - /* - * FIXME: On x86 at least, dividing the smallest representable integer - * by -1 causes a SIFGPE (division overflow). We treat this case here - * (to avoid a SIGFPE crash at python level), but a good solution would - * be to treat integer division problems separately from FPU exceptions - * (i.e. a different approach than npy_set_floatstatus_divbyzero()). - */ - if (in1 == NPY_MIN_@TYPE@ && in2 == -1) { - npy_set_floatstatus_divbyzero(); - *((@type@ *)op1) = 0; - } - else { - *((@type@ *)op1) = libdivide_@type@_do(in1, &fast_d); - - /* Negative quotients needs to be rounded down */ - if (((in1 > 0) != (in2 > 0)) && (*((@type@ *)op1) * in2 != in1)) { - *((@type@ *)op1) = *((@type@ *)op1) - 1; - } - } - } - } - } - else { - BINARY_LOOP_SLIDING { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - /* - * FIXME: On x86 at least, dividing the smallest representable integer - * by -1 causes a SIFGPE (division overflow). We treat this case here - * (to avoid a SIGFPE crash at python level), but a good solution would - * be to treat integer division problems separately from FPU exceptions - * (i.e. a different approach than npy_set_floatstatus_divbyzero()). 
- */ - if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) { - npy_set_floatstatus_divbyzero(); - *((@type@ *)op1) = 0; - } - else { - *((@type@ *)op1) = in1/in2; - - /* Negative quotients needs to be rounded down */ - if (((in1 > 0) != (in2 > 0)) && (*((@type@ *)op1) * in2 != in1)) { - *((@type@ *)op1) = *((@type@ *)op1) - 1; - } - } - } - } -} - NPY_NO_EXPORT void @TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { From b000d5d45a94982d330305744737c95557d13317 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Tue, 6 Apr 2021 19:42:27 +0530 Subject: [PATCH 02/10] SIMD: Add signed to dispatch --- numpy/core/src/umath/loops.h.src | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index 0301aa5ed7b8..bb07e047c372 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -58,7 +58,8 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void #endif /**begin repeat - * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + BYTE, SHORT, INT, LONG, LONGLONG# */ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) @@ -151,9 +152,6 @@ NPY_NO_EXPORT void NPY_NO_EXPORT void @S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -NPY_NO_EXPORT void -@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - NPY_NO_EXPORT void @S@@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); From 9bb27e8559877eca5159b43b92b7436ffcb621d7 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Tue, 6 Apr 2021 19:43:01 +0530 Subject: [PATCH 03/10] SIMD: Add dispatch to generate_umath --- numpy/core/code_generators/generate_umath.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 6b6a0fe64ad5..9e94f9cccc47 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -328,7 +328,7 @@ def english_upper(s): docstrings.get('numpy.core.umath.floor_divide'), 'PyUFunc_DivisionTypeResolver', TD(ints, cfunc_alias='divide', - dispatch=[('loops_arithmetic', 'BHILQ')]), + dispatch=[('loops_arithmetic', 'bBhHiIlLqQ')]), TD(flts + cmplx), [TypeDescription('m', FullTypeDescr, 'mq', 'm'), TypeDescription('m', FullTypeDescr, 'md', 'm'), From 7f9d342324a730185cdf215c66d73530033436ab Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Tue, 6 Apr 2021 19:44:48 +0530 Subject: [PATCH 04/10] SIMD: Added dispatch code for signed --- .../src/umath/loops_arithmetic.dispatch.c.src | 59 +++++++++++++------ 1 file changed, 40 insertions(+), 19 deletions(-) diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src index 7e9f464636c5..c071edb3b656 100644 --- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src @@ -23,7 +23,8 @@ ********************************************************************************/ #if NPY_SIMD /**begin repeat - * #sfx = u8, u16, u32, u64# + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64# + * #signed = 0*4, 1*4# */ static NPY_INLINE void 
simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) @@ -43,6 +44,12 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) for (; len > 0; --len, ++src, ++dst) { const npyv_lanetype_@sfx@ a = *src; *dst = a / scalar; +#if @signed@ + /* Negative quotients needs to be rounded down */ + if (((a > 0) != (scalar > 0)) && (*dst * scalar != a)) { + *dst = *dst - 1; + } +#endif } npyv_cleanup(); @@ -56,18 +63,25 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) /**begin repeat * Unsigned types - * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# - * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# - * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG# + * #type = byte, short, int, long, longlong# + * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# + */ + +/**begin repeat1 + * #s = , u# + * #S = , U# + * #slen = s, u# + * #signed = 1, 0# */ + #undef TO_SIMD_SFX #if 0 -/**begin repeat1 +/**begin repeat2 * #len = 8, 16, 32, 64# */ -#elif NPY_BITSOF_@STYPE@ == @len@ - #define TO_SIMD_SFX(X) X##_u@len@ -/**end repeat1**/ +#elif NPY_BITSOF_@TYPE@ == @len@ + #define TO_SIMD_SFX(X) X##_@slen@@len@ +/**end repeat2**/ #endif /* * For 64-bit division on Armv7, Aarch64, and IBM/Power, NPYV fall-backs to the scalar division @@ -77,15 +91,15 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) * Power10(VSX4) is an exception here since it has native support for integer vector division, * note neither infrastructure nor NPYV has supported VSX4 yet. */ -#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON)) +#if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON)) #undef TO_SIMD_SFX #endif -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@S@@TYPE@_divide) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP(@type@) { - const @type@ d = *(@type@ *)ip2; + BINARY_REDUCE_LOOP(npy_@s@@type@) { + const npy_@s@@type@ d = *(npy_@s@@type@ *)ip2; if (NPY_UNLIKELY(d == 0)) { npy_set_floatstatus_divbyzero(); io1 = 0; @@ -93,26 +107,33 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) io1 /= d; } } - *((@type@ *)iop1) = io1; + *((npy_@s@@type@ *)iop1) = io1; } #if NPY_SIMD && defined(TO_SIMD_SFX) // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) && - (*(@type@ *)args[1]) != 0) { + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_@s@@type@), NPY_SIMD_WIDTH) && + (*(npy_@s@@type@ *)args[1]) != 0) { TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]); } #endif else { BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; + const npy_@s@@type@ in1 = *(npy_@s@@type@ *)ip1; + const npy_@s@@type@ in2 = *(npy_@s@@type@ *)ip2; if (NPY_UNLIKELY(in2 == 0)) { npy_set_floatstatus_divbyzero(); - *((@type@ *)op1) = 0; + *((npy_@s@@type@ *)op1) = 0; } else{ - *((@type@ *)op1) = in1 / in2; + *((npy_@s@@type@ *)op1) = in1 / in2; +#if @signed@ + /* Negative quotients needs to be rounded down */ + if (((in1 > 0) != (in2 > 0)) && (*((npy_@type@ *)op1) * in2 != in1)) { + *((npy_@type@ *)op1) = *((npy_@type@ *)op1) - 1; + } +#endif } } } } +/**end repeat1**/ /**end repeat**/ From 0b8838ef0c2e4c5d9e66163d260dc30902cc6170 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Tue, 13 Apr 2021 21:08:48 +0530 Subject: [PATCH 05/10] 
SIMD: Added floor divide logic for signed --- .../src/umath/loops_arithmetic.dispatch.c.src | 107 +++++++++++++----- 1 file changed, 81 insertions(+), 26 deletions(-) diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src index c071edb3b656..55066589f4a3 100644 --- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src @@ -23,9 +23,53 @@ ********************************************************************************/ #if NPY_SIMD /**begin repeat - * #sfx = u8, u16, u32, u64, s8, s16, s32, s64# + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64# + * #len = 8, 16, 32, 64, 8, 16, 32, 64# * #signed = 0*4, 1*4# */ +#if @signed@ +static NPY_INLINE npyv_@sfx@ +simd_floor_divide_@sfx@(npyv_lanetype_@sfx@ *src, const npyv_@sfx@x3 divisor, npyv_lanetype_@sfx@ scalar) +{ + npyv_@sfx@ a, nsign_d, nsign_a, diff_sign, to_ninf, trunc, floor; + npyv_b@len@ greater_min, noverflow; + + nsign_d = npyv_setall_@sfx@(scalar < 0); + a = npyv_load_@sfx@(src); + nsign_a = npyv_cvt_@sfx@_b@len@(npyv_cmplt_@sfx@(a, nsign_d)); + nsign_a = npyv_and_@sfx@((npyv_@sfx@)nsign_a, npyv_setall_@sfx@(1)); + diff_sign = npyv_sub_@sfx@((npyv_@sfx@)nsign_a, nsign_d); + to_ninf = npyv_xor_@sfx@((npyv_@sfx@)nsign_a, nsign_d); + trunc = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor); + + if (NPY_UNLIKELY(-1 == scalar)) { + greater_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@)); + noverflow = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1)); + noverflow = npyv_and_b@len@(noverflow, greater_min); + if (npyv_tobits_b@len@(noverflow) != (((npy_uint64)1) << npyv_nlanes_@sfx@)-1) { + npy_set_floatstatus_divbyzero(); + } + floor = npyv_ifsub_@sfx@(greater_min, trunc, to_ninf, npyv_zero_@sfx@()); + } + else { + floor = npyv_sub_@sfx@(trunc, to_ninf); + } + + return floor; +} +#else +static NPY_INLINE npyv_@sfx@ +simd_floor_divide_@sfx@(npyv_lanetype_@sfx@ *src, const npyv_@sfx@x3 divisor, npyv_lanetype_@sfx@ scalar) +{ + npyv_@sfx@ a, c; + + a = npyv_load_@sfx@(src); + c = npyv_divc_@sfx@(a, divisor); + + return c; +} +#endif + static NPY_INLINE void simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) { @@ -36,20 +80,24 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar); for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - npyv_@sfx@ a = npyv_load_@sfx@(src); - npyv_@sfx@ c = npyv_divc_@sfx@(a, divisor); + npyv_@sfx@ c = simd_floor_divide_@sfx@(src, divisor, scalar); npyv_store_@sfx@(dst, c); } for (; len > 0; --len, ++src, ++dst) { const npyv_lanetype_@sfx@ a = *src; - *dst = a / scalar; + if (scalar == 0 || (a == (npyv_lanetype_@sfx@)NPY_MIN_INT@len@ && scalar == (npyv_lanetype_@sfx@)-1)) { + npy_set_floatstatus_divbyzero(); + *dst = 0; + } else { + *dst = a / scalar; #if @signed@ - /* Negative quotients needs to be rounded down */ - if (((a > 0) != (scalar > 0)) && (*dst * scalar != a)) { - *dst = *dst - 1; - } + /* Negative quotients needs to be rounded down */ + if (((a > 0) != (scalar > 0)) && (*dst * scalar != a)) { + *dst = *dst - 1; + } #endif + } } npyv_cleanup(); @@ -68,19 +116,24 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) */ /**begin repeat1 - * #s = , u# - * #S = , U# - * #slen = s, u# * #signed = 1, 0# */ #undef TO_SIMD_SFX +#undef SIMD_TYPE +#undef SIMD_DIVIDE #if 0 /**begin repeat2 * #len = 8, 16, 32, 64# */ +#elif NPY_BITSOF_@TYPE@ == @len@ && @signed@ + #define 
TO_SIMD_SFX(X) X##_s@len@ + #define SIMD_TYPE npy_@type@ + #define SIMD_DIVIDE @TYPE@_divide #elif NPY_BITSOF_@TYPE@ == @len@ - #define TO_SIMD_SFX(X) X##_@slen@@len@ + #define TO_SIMD_SFX(X) X##_u@len@ + #define SIMD_TYPE npy_u@type@ + #define SIMD_DIVIDE U@TYPE@_divide /**end repeat2**/ #endif /* @@ -93,42 +146,44 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) */ #if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON)) #undef TO_SIMD_SFX + #undef SIMD_TYPE + #undef SIMD_DIVIDE #endif -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@S@@TYPE@_divide) +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SIMD_DIVIDE) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP(npy_@s@@type@) { - const npy_@s@@type@ d = *(npy_@s@@type@ *)ip2; - if (NPY_UNLIKELY(d == 0)) { + BINARY_REDUCE_LOOP(SIMD_TYPE) { + const SIMD_TYPE d = *(SIMD_TYPE *)ip2; + if (NPY_UNLIKELY(d == 0 || (io1 == (SIMD_TYPE)NPY_MIN_@TYPE@ && d == (SIMD_TYPE)-1))) { npy_set_floatstatus_divbyzero(); io1 = 0; } else { io1 /= d; } } - *((npy_@s@@type@ *)iop1) = io1; + *((SIMD_TYPE *)iop1) = io1; } #if NPY_SIMD && defined(TO_SIMD_SFX) // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_@s@@type@), NPY_SIMD_WIDTH) && - (*(npy_@s@@type@ *)args[1]) != 0) { + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(SIMD_TYPE), NPY_SIMD_WIDTH) && + (*(SIMD_TYPE *)args[1]) != 0) { TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]); } #endif else { BINARY_LOOP { - const npy_@s@@type@ in1 = *(npy_@s@@type@ *)ip1; - const npy_@s@@type@ in2 = *(npy_@s@@type@ *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { + const SIMD_TYPE in1 = *(SIMD_TYPE *)ip1; + const SIMD_TYPE in2 = *(SIMD_TYPE *)ip2; + if (NPY_UNLIKELY(in2 == 0 || (in1 == (SIMD_TYPE)NPY_MIN_@TYPE@ && in2 == (SIMD_TYPE)-1))) { npy_set_floatstatus_divbyzero(); - *((npy_@s@@type@ *)op1) = 0; + *((SIMD_TYPE *)op1) = 0; } else{ - *((npy_@s@@type@ *)op1) = in1 / in2; + *((SIMD_TYPE *)op1) = in1 / in2; #if @signed@ /* Negative quotients needs to be rounded down */ - if (((in1 > 0) != (in2 > 0)) && (*((npy_@type@ *)op1) * in2 != in1)) { - *((npy_@type@ *)op1) = *((npy_@type@ *)op1) - 1; + if (((in1 > 0) != (in2 > 0)) && (*((SIMD_TYPE *)op1) * in2 != in1)) { + *((SIMD_TYPE *)op1) = *((SIMD_TYPE *)op1) - 1; } #endif } From b1c3c98bfa13699dda51642723e3ce849d5950eb Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Tue, 13 Apr 2021 21:09:12 +0530 Subject: [PATCH 06/10] DOC: Added floor divide doc --- .../core/src/umath/loops_arithmetic.dispatch.c.src | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src index 55066589f4a3..30d7a2a99026 100644 --- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src @@ -20,6 +20,19 @@ //############################################################################### /******************************************************************************** ** Defining the SIMD kernels + * + * Floor division of signed is based on T. Granlund and P. L. 
Montgomery + * “Division by invariant integers using multiplication(see [Figure 6.1] + * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556)" + * For details on TRUNC division see simd/intdiv.h for more clarification + *********************************************************************************** + ** Figure 6.1: Signed division by run–time invariant divisor, rounded towards -INF + *********************************************************************************** + * For q = FLOOR(a/d), all sword: + * sword −dsign = SRL(d, N − 1); + * uword −nsign = (n < −dsign); + * uword −qsign = EOR(−nsign, −dsign); + * q = TRUNC((n − (−dsign ) + (−nsign))/d) − (−qsign); ********************************************************************************/ #if NPY_SIMD /**begin repeat From b6b32674d634b6dfe9d92212e8a6ced0f1e14319 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Wed, 21 Apr 2021 22:01:17 +0530 Subject: [PATCH 07/10] SIMD: Refined signed and unsigned floor divide --- .../src/umath/loops_arithmetic.dispatch.c.src | 108 +++++++++++------- 1 file changed, 68 insertions(+), 40 deletions(-) diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src index 30d7a2a99026..5e54a45de2d9 100644 --- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src @@ -41,48 +41,82 @@ * #signed = 0*4, 1*4# */ #if @signed@ -static NPY_INLINE npyv_@sfx@ -simd_floor_divide_@sfx@(npyv_lanetype_@sfx@ *src, const npyv_@sfx@x3 divisor, npyv_lanetype_@sfx@ scalar) +static NPY_INLINE void +simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) { - npyv_@sfx@ a, nsign_d, nsign_a, diff_sign, to_ninf, trunc, floor; + npyv_@sfx@ a, nsign_d, nsign_a, diff_sign, to_ninf, trunc, floor, neg, vzero; npyv_b@len@ greater_min, noverflow; + npy_bool raise; + npy_uint64 tobits; - nsign_d = npyv_setall_@sfx@(scalar < 0); - a = npyv_load_@sfx@(src); - nsign_a = npyv_cvt_@sfx@_b@len@(npyv_cmplt_@sfx@(a, nsign_d)); - nsign_a = npyv_and_@sfx@((npyv_@sfx@)nsign_a, npyv_setall_@sfx@(1)); - diff_sign = npyv_sub_@sfx@((npyv_@sfx@)nsign_a, nsign_d); - to_ninf = npyv_xor_@sfx@((npyv_@sfx@)nsign_a, nsign_d); - trunc = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor); + npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0]; + npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1]; + npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2]; + const int vstep = npyv_nlanes_@sfx@; + const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar); if (NPY_UNLIKELY(-1 == scalar)) { - greater_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@)); - noverflow = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1)); - noverflow = npyv_and_b@len@(noverflow, greater_min); - if (npyv_tobits_b@len@(noverflow) != (((npy_uint64)1) << npyv_nlanes_@sfx@)-1) { + noverflow = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1)); + vzero = npyv_zero_@sfx@(); + for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { + a = npyv_load_@sfx@(src); + greater_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@)); + noverflow = npyv_and_b@len@(noverflow, greater_min); + neg = npyv_ifsub_@sfx@(greater_min, vzero, a, vzero); + + npyv_store_@sfx@(dst, neg); + } + tobits = npyv_tobits_b@len@(noverflow); + #if npyv_nlanes_@sfx@ == 64 + raise = (~tobits) != 0; + #else + raise = tobits != (1ULL << vstep)-1; + #endif + + for (; len > 0; --len, ++src, ++dst) { + npyv_lanetype_@sfx@ a = *src; + if (a == 
NPY_MIN_INT@len@) { + raise = NPY_TRUE; + *dst = 0; + } else { + *dst = -a; + } + } + if (raise) { npy_set_floatstatus_divbyzero(); } - floor = npyv_ifsub_@sfx@(greater_min, trunc, to_ninf, npyv_zero_@sfx@()); - } - else { - floor = npyv_sub_@sfx@(trunc, to_ninf); - } + } else { + for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { + nsign_d = npyv_setall_@sfx@(scalar < 0); + a = npyv_load_@sfx@(src); + nsign_a = npyv_cvt_@sfx@_b@len@(npyv_cmplt_@sfx@(a, nsign_d)); + nsign_a = npyv_and_@sfx@(nsign_a, npyv_setall_@sfx@(1)); + diff_sign = npyv_sub_@sfx@(nsign_a, nsign_d); + to_ninf = npyv_xor_@sfx@(nsign_a, nsign_d); + trunc = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor); + floor = npyv_sub_@sfx@(trunc, to_ninf); - return floor; -} -#else -static NPY_INLINE npyv_@sfx@ -simd_floor_divide_@sfx@(npyv_lanetype_@sfx@ *src, const npyv_@sfx@x3 divisor, npyv_lanetype_@sfx@ scalar) -{ - npyv_@sfx@ a, c; + npyv_store_@sfx@(dst, floor); + } - a = npyv_load_@sfx@(src); - c = npyv_divc_@sfx@(a, divisor); + for (; len > 0; --len, ++src, ++dst) { + const npyv_lanetype_@sfx@ a = *src; + if (scalar == 0 || (a == (npyv_lanetype_@sfx@)NPY_MIN_INT@len@ && scalar == (npyv_lanetype_@sfx@)-1)) { + npy_set_floatstatus_divbyzero(); + *dst = 0; + } else { + *dst = a / scalar; + /* Negative quotients needs to be rounded down */ + if (((a > 0) != (scalar > 0)) && (*dst * scalar != a)) { + *dst = *dst - 1; + } + } + } + } - return c; + npyv_cleanup(); } -#endif - +#else static NPY_INLINE void simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) { @@ -93,7 +127,8 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar); for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - npyv_@sfx@ c = simd_floor_divide_@sfx@(src, divisor, scalar); + npyv_@sfx@ a = npyv_load_@sfx@(src); + npyv_@sfx@ c = npyv_divc_@sfx@(a, divisor); npyv_store_@sfx@(dst, c); } @@ -104,17 +139,12 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) *dst = 0; } else { *dst = a / scalar; -#if @signed@ - /* Negative quotients needs to be rounded down */ - if (((a > 0) != (scalar > 0)) && (*dst * scalar != a)) { - *dst = *dst - 1; - } -#endif } } npyv_cleanup(); } +#endif /**end repeat**/ #endif @@ -159,8 +189,6 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) */ #if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON)) #undef TO_SIMD_SFX - #undef SIMD_TYPE - #undef SIMD_DIVIDE #endif NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SIMD_DIVIDE) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) From 7c163672933d42e76dd643065acbe36a7274dc00 Mon Sep 17 00:00:00 2001 From: Ganesh Kathiresan Date: Tue, 11 May 2021 21:38:51 +0530 Subject: [PATCH 08/10] SIMD: Separate signed and unsigned loops --- .../src/umath/loops_arithmetic.dispatch.c.src | 181 ++++++++++-------- 1 file changed, 105 insertions(+), 76 deletions(-) diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src index 5e54a45de2d9..a52bb36b7031 100644 --- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src @@ -36,41 +36,35 @@ ********************************************************************************/ #if NPY_SIMD /**begin repeat - * #sfx = u8, u16, u32, u64, s8, s16, s32, s64# - * #len = 8, 16, 32, 64, 8, 16, 32, 64# - * #signed = 0*4, 1*4# + * Signed types + 
* #sfx = s8, s16, s32, s64# + * #len = 8, 16, 32, 64# */ -#if @signed@ static NPY_INLINE void simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) { - npyv_@sfx@ a, nsign_d, nsign_a, diff_sign, to_ninf, trunc, floor, neg, vzero; - npyv_b@len@ greater_min, noverflow; - npy_bool raise; - npy_uint64 tobits; - npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0]; npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1]; npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2]; const int vstep = npyv_nlanes_@sfx@; const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar); - if (NPY_UNLIKELY(-1 == scalar)) { - noverflow = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1)); - vzero = npyv_zero_@sfx@(); + if (scalar == (npyv_lanetype_@sfx@)-1) { + npyv_b@len@ noverflow = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1)); + npyv_@sfx@ vzero = npyv_zero_@sfx@(); for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - a = npyv_load_@sfx@(src); - greater_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@)); - noverflow = npyv_and_b@len@(noverflow, greater_min); - neg = npyv_ifsub_@sfx@(greater_min, vzero, a, vzero); + npyv_@sfx@ a = npyv_load_@sfx@(src); + npyv_b@len@ greater_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@)); + noverflow = npyv_and_b@len@(noverflow, greater_min); + npyv_@sfx@ neg = npyv_ifsub_@sfx@(greater_min, vzero, a, vzero); npyv_store_@sfx@(dst, neg); } - tobits = npyv_tobits_b@len@(noverflow); + npy_uint64 tobits = npyv_tobits_b@len@(noverflow); #if npyv_nlanes_@sfx@ == 64 - raise = (~tobits) != 0; + int raise = (~tobits) != 0; #else - raise = tobits != (1ULL << vstep)-1; + int raise = tobits != (1ULL << vstep)-1; #endif for (; len > 0; --len, ++src, ++dst) { @@ -87,36 +81,37 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) } } else { for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - nsign_d = npyv_setall_@sfx@(scalar < 0); - a = npyv_load_@sfx@(src); - nsign_a = npyv_cvt_@sfx@_b@len@(npyv_cmplt_@sfx@(a, nsign_d)); - nsign_a = npyv_and_@sfx@(nsign_a, npyv_setall_@sfx@(1)); - diff_sign = npyv_sub_@sfx@(nsign_a, nsign_d); - to_ninf = npyv_xor_@sfx@(nsign_a, nsign_d); - trunc = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor); - floor = npyv_sub_@sfx@(trunc, to_ninf); + npyv_@sfx@ nsign_d = npyv_setall_@sfx@(scalar < 0); + npyv_@sfx@ a = npyv_load_@sfx@(src); + npyv_@sfx@ nsign_a = npyv_cvt_@sfx@_b@len@(npyv_cmplt_@sfx@(a, nsign_d)); + nsign_a = npyv_and_@sfx@(nsign_a, npyv_setall_@sfx@(1)); + npyv_@sfx@ diff_sign = npyv_sub_@sfx@(nsign_a, nsign_d); + npyv_@sfx@ to_ninf = npyv_xor_@sfx@(nsign_a, nsign_d); + npyv_@sfx@ trunc = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor); + npyv_@sfx@ floor = npyv_sub_@sfx@(trunc, to_ninf); npyv_store_@sfx@(dst, floor); } for (; len > 0; --len, ++src, ++dst) { const npyv_lanetype_@sfx@ a = *src; - if (scalar == 0 || (a == (npyv_lanetype_@sfx@)NPY_MIN_INT@len@ && scalar == (npyv_lanetype_@sfx@)-1)) { - npy_set_floatstatus_divbyzero(); - *dst = 0; - } else { - *dst = a / scalar; - /* Negative quotients needs to be rounded down */ - if (((a > 0) != (scalar > 0)) && (*dst * scalar != a)) { - *dst = *dst - 1; - } + *dst = a / scalar; + /* Negative quotients needs to be rounded down */ + if (((a > 0) != (scalar > 0)) && (*dst * scalar != a)) { + *dst = *dst - 1; } } } npyv_cleanup(); } -#else +/**end repeat**/ + +/**begin repeat + * Unsigned types + * #sfx = u8, u16, u32, u64# + * #len = 8, 16, 32, 64# + */ static NPY_INLINE void 
simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) { @@ -134,17 +129,11 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) for (; len > 0; --len, ++src, ++dst) { const npyv_lanetype_@sfx@ a = *src; - if (scalar == 0 || (a == (npyv_lanetype_@sfx@)NPY_MIN_INT@len@ && scalar == (npyv_lanetype_@sfx@)-1)) { - npy_set_floatstatus_divbyzero(); - *dst = 0; - } else { - *dst = a / scalar; - } + *dst = a / scalar; } npyv_cleanup(); } -#endif /**end repeat**/ #endif @@ -153,31 +142,78 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) ********************************************************************************/ /**begin repeat - * Unsigned types + * Signed types * #type = byte, short, int, long, longlong# * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# */ - +#undef TO_SIMD_SFX +#if 0 /**begin repeat1 - * #signed = 1, 0# + * #len = 8, 16, 32, 64# + */ +#elif NPY_BITSOF_@TYPE@ == @len@ + #define TO_SIMD_SFX(X) X##_s@len@ +/**end repeat1**/ +#endif + +#if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON)) + #undef TO_SIMD_SFX +#endif +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (IS_BINARY_REDUCE) { + BINARY_REDUCE_LOOP(npy_@type@) { + const npy_@type@ d = *(npy_@type@ *)ip2; + if (NPY_UNLIKELY(d == 0 || (io1 == (npy_@type@)NPY_MIN_@TYPE@ && d == (npy_@type@)-1))) { + npy_set_floatstatus_divbyzero(); + io1 = 0; + } else { + io1 /= d; + } + } + *((npy_@type@ *)iop1) = io1; + } +#if NPY_SIMD && defined(TO_SIMD_SFX) + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_@type@), NPY_SIMD_WIDTH) && + (*(npy_@type@ *)args[1]) != 0) { + TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]); + } +#endif + else { + BINARY_LOOP { + const npy_@type@ in1 = *(npy_@type@ *)ip1; + const npy_@type@ in2 = *(npy_@type@ *)ip2; + if (NPY_UNLIKELY(in2 == 0 || (in1 == (npy_@type@)NPY_MIN_@TYPE@ && in2 == (npy_@type@)-1))) { + npy_set_floatstatus_divbyzero(); + *((npy_@type@ *)op1) = 0; + } else{ + *((npy_@type@ *)op1) = in1 / in2; + /* Negative quotients needs to be rounded down */ + if (((in1 > 0) != (in2 > 0)) && (*((npy_@type@ *)op1) * in2 != in1)) { + *((npy_@type@ *)op1) = *((npy_@type@ *)op1) - 1; + } + } + } + } +} +/**end repeat**/ + +/**begin repeat + * Unsigned types + * #type = byte, short, int, long, longlong# + * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# */ #undef TO_SIMD_SFX -#undef SIMD_TYPE -#undef SIMD_DIVIDE #if 0 -/**begin repeat2 +/**begin repeat1 * #len = 8, 16, 32, 64# */ -#elif NPY_BITSOF_@TYPE@ == @len@ && @signed@ - #define TO_SIMD_SFX(X) X##_s@len@ - #define SIMD_TYPE npy_@type@ - #define SIMD_DIVIDE @TYPE@_divide #elif NPY_BITSOF_@TYPE@ == @len@ #define TO_SIMD_SFX(X) X##_u@len@ - #define SIMD_TYPE npy_u@type@ - #define SIMD_DIVIDE U@TYPE@_divide -/**end repeat2**/ +/**end repeat1**/ #endif /* * For 64-bit division on Armv7, Aarch64, and IBM/Power, NPYV fall-backs to the scalar division @@ -190,46 +226,39 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) #if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON)) #undef TO_SIMD_SFX #endif -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SIMD_DIVIDE) +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(U@TYPE@_divide) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { if (IS_BINARY_REDUCE) { - 
BINARY_REDUCE_LOOP(SIMD_TYPE) { - const SIMD_TYPE d = *(SIMD_TYPE *)ip2; - if (NPY_UNLIKELY(d == 0 || (io1 == (SIMD_TYPE)NPY_MIN_@TYPE@ && d == (SIMD_TYPE)-1))) { + BINARY_REDUCE_LOOP(npy_u@type@) { + const npy_u@type@ d = *(npy_u@type@ *)ip2; + if (NPY_UNLIKELY(d == 0 || (io1 == (npy_u@type@)NPY_MIN_@TYPE@ && d == (npy_u@type@)-1))) { npy_set_floatstatus_divbyzero(); io1 = 0; } else { io1 /= d; } } - *((SIMD_TYPE *)iop1) = io1; + *((npy_u@type@ *)iop1) = io1; } #if NPY_SIMD && defined(TO_SIMD_SFX) // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(SIMD_TYPE), NPY_SIMD_WIDTH) && - (*(SIMD_TYPE *)args[1]) != 0) { + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_u@type@), NPY_SIMD_WIDTH) && + (*(npy_u@type@ *)args[1]) != 0) { TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]); } #endif else { BINARY_LOOP { - const SIMD_TYPE in1 = *(SIMD_TYPE *)ip1; - const SIMD_TYPE in2 = *(SIMD_TYPE *)ip2; - if (NPY_UNLIKELY(in2 == 0 || (in1 == (SIMD_TYPE)NPY_MIN_@TYPE@ && in2 == (SIMD_TYPE)-1))) { + const npy_u@type@ in1 = *(npy_u@type@ *)ip1; + const npy_u@type@ in2 = *(npy_u@type@ *)ip2; + if (NPY_UNLIKELY(in2 == 0 || (in1 == (npy_u@type@)NPY_MIN_@TYPE@ && in2 == (npy_u@type@)-1))) { npy_set_floatstatus_divbyzero(); - *((SIMD_TYPE *)op1) = 0; + *((npy_u@type@ *)op1) = 0; } else{ - *((SIMD_TYPE *)op1) = in1 / in2; -#if @signed@ - /* Negative quotients needs to be rounded down */ - if (((in1 > 0) != (in2 > 0)) && (*((SIMD_TYPE *)op1) * in2 != in1)) { - *((SIMD_TYPE *)op1) = *((SIMD_TYPE *)op1) - 1; - } -#endif + *((npy_u@type@ *)op1) = in1 / in2; } } } } -/**end repeat1**/ /**end repeat**/ From 4619081d1faca58dd3e25db76164c1c7ad928a0f Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Tue, 11 May 2021 21:08:39 +0200 Subject: [PATCH 09/10] MAINT, SIMD: Several fixes to integer division - Revert unsigned integer division changes, overflow check only required by signed division. 
- Fix floor round for reduce divison - cleanup - revert fixme comment --- .../src/umath/loops_arithmetic.dispatch.c.src | 124 +++++++++--------- 1 file changed, 61 insertions(+), 63 deletions(-) diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src index a52bb36b7031..19e05f2b57b0 100644 --- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src @@ -34,6 +34,7 @@ * uword −qsign = EOR(−nsign, −dsign); * q = TRUNC((n − (−dsign ) + (−nsign))/d) − (−qsign); ********************************************************************************/ + #if NPY_SIMD /**begin repeat * Signed types @@ -49,34 +50,28 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) const int vstep = npyv_nlanes_@sfx@; const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar); - if (scalar == (npyv_lanetype_@sfx@)-1) { + if (scalar == -1) { npyv_b@len@ noverflow = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1)); npyv_@sfx@ vzero = npyv_zero_@sfx@(); for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - npyv_@sfx@ a = npyv_load_@sfx@(src); - npyv_b@len@ greater_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@)); - noverflow = npyv_and_b@len@(noverflow, greater_min); - npyv_@sfx@ neg = npyv_ifsub_@sfx@(greater_min, vzero, a, vzero); - + npyv_@sfx@ a = npyv_load_@sfx@(src); + npyv_b@len@ gt_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@)); + noverflow = npyv_and_b@len@(noverflow, gt_min); + npyv_@sfx@ neg = npyv_ifsub_@sfx@(gt_min, vzero, a, vzero); npyv_store_@sfx@(dst, neg); } - npy_uint64 tobits = npyv_tobits_b@len@(noverflow); - #if npyv_nlanes_@sfx@ == 64 - int raise = (~tobits) != 0; - #else - int raise = tobits != (1ULL << vstep)-1; - #endif + int raise_err = npyv_tobits_b@len@(npyv_not_b@len@(noverflow)) != 0; for (; len > 0; --len, ++src, ++dst) { npyv_lanetype_@sfx@ a = *src; if (a == NPY_MIN_INT@len@) { - raise = NPY_TRUE; + raise_err = 1; *dst = 0; } else { *dst = -a; } } - if (raise) { + if (raise_err) { npy_set_floatstatus_divbyzero(); } } else { @@ -89,20 +84,19 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) npyv_@sfx@ to_ninf = npyv_xor_@sfx@(nsign_a, nsign_d); npyv_@sfx@ trunc = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor); npyv_@sfx@ floor = npyv_sub_@sfx@(trunc, to_ninf); - npyv_store_@sfx@(dst, floor); } for (; len > 0; --len, ++src, ++dst) { const npyv_lanetype_@sfx@ a = *src; - *dst = a / scalar; - /* Negative quotients needs to be rounded down */ - if (((a > 0) != (scalar > 0)) && (*dst * scalar != a)) { - *dst = *dst - 1; + npyv_lanetype_@sfx@ r = a / scalar; + // Negative quotients needs to be rounded down + if (((a > 0) != (scalar > 0)) && ((r * scalar) != a)) { + r--; } + *dst = r; } } - npyv_cleanup(); } /**end repeat**/ @@ -131,7 +125,6 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) const npyv_lanetype_@sfx@ a = *src; *dst = a / scalar; } - npyv_cleanup(); } /**end repeat**/ @@ -143,8 +136,8 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) /**begin repeat * Signed types - * #type = byte, short, int, long, longlong# - * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# + * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong# + * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# */ #undef TO_SIMD_SFX #if 0 @@ -159,42 +152,47 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) #if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || 
defined(NPY_HAVE_NEON)) #undef TO_SIMD_SFX #endif + +NPY_FINLINE @type@ floor_div_@TYPE@(const @type@ n, const @type@ d) +{ + /* + * FIXME: On x86 at least, dividing the smallest representable integer + * by -1 causes a SIFGPE (division overflow). We treat this case here + * (to avoid a SIGFPE crash at python level), but a good solution would + * be to treat integer division problems separately from FPU exceptions + * (i.e. a different approach than npy_set_floatstatus_divbyzero()). + */ + if (NPY_UNLIKELY(d == 0 || (n == NPY_MIN_@TYPE@ && d == -1))) { + npy_set_floatstatus_divbyzero(); + return 0; + } + @type@ r = n / d; + // Negative quotients needs to be rounded down + if (((n > 0) != (d > 0)) && ((r * d) != n)) { + r--; + } + return r; +} + NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP(npy_@type@) { - const npy_@type@ d = *(npy_@type@ *)ip2; - if (NPY_UNLIKELY(d == 0 || (io1 == (npy_@type@)NPY_MIN_@TYPE@ && d == (npy_@type@)-1))) { - npy_set_floatstatus_divbyzero(); - io1 = 0; - } else { - io1 /= d; - } + BINARY_REDUCE_LOOP(@type@) { + io1 = floor_div_@TYPE@(io1, *(@type@*)ip2); } - *((npy_@type@ *)iop1) = io1; + *((@type@ *)iop1) = io1; } #if NPY_SIMD && defined(TO_SIMD_SFX) // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_@type@), NPY_SIMD_WIDTH) && - (*(npy_@type@ *)args[1]) != 0) { + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) && + (*(@type@ *)args[1]) != 0) { TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]); } #endif else { BINARY_LOOP { - const npy_@type@ in1 = *(npy_@type@ *)ip1; - const npy_@type@ in2 = *(npy_@type@ *)ip2; - if (NPY_UNLIKELY(in2 == 0 || (in1 == (npy_@type@)NPY_MIN_@TYPE@ && in2 == (npy_@type@)-1))) { - npy_set_floatstatus_divbyzero(); - *((npy_@type@ *)op1) = 0; - } else{ - *((npy_@type@ *)op1) = in1 / in2; - /* Negative quotients needs to be rounded down */ - if (((in1 > 0) != (in2 > 0)) && (*((npy_@type@ *)op1) * in2 != in1)) { - *((npy_@type@ *)op1) = *((npy_@type@ *)op1) - 1; - } - } + *((@type@ *)op1) = floor_div_@TYPE@(*(@type@*)ip1, *(@type@*)ip2); } } } @@ -202,16 +200,16 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) /**begin repeat * Unsigned types - * #type = byte, short, int, long, longlong# - * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# + * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# + * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG# */ - #undef TO_SIMD_SFX #if 0 /**begin repeat1 * #len = 8, 16, 32, 64# */ -#elif NPY_BITSOF_@TYPE@ == @len@ +#elif NPY_BITSOF_@STYPE@ == @len@ #define TO_SIMD_SFX(X) X##_u@len@ /**end repeat1**/ #endif @@ -223,40 +221,40 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) * Power10(VSX4) is an exception here since it has native support for integer vector division, * note neither infrastructure nor NPYV has supported VSX4 yet. 
*/ -#if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON)) +#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON)) #undef TO_SIMD_SFX #endif -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(U@TYPE@_divide) +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP(npy_u@type@) { - const npy_u@type@ d = *(npy_u@type@ *)ip2; - if (NPY_UNLIKELY(d == 0 || (io1 == (npy_u@type@)NPY_MIN_@TYPE@ && d == (npy_u@type@)-1))) { + BINARY_REDUCE_LOOP(@type@) { + const @type@ d = *(@type@ *)ip2; + if (NPY_UNLIKELY(d == 0)) { npy_set_floatstatus_divbyzero(); io1 = 0; } else { io1 /= d; } } - *((npy_u@type@ *)iop1) = io1; + *((@type@ *)iop1) = io1; } #if NPY_SIMD && defined(TO_SIMD_SFX) // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_u@type@), NPY_SIMD_WIDTH) && - (*(npy_u@type@ *)args[1]) != 0) { + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) && + (*(@type@ *)args[1]) != 0) { TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]); } #endif else { BINARY_LOOP { - const npy_u@type@ in1 = *(npy_u@type@ *)ip1; - const npy_u@type@ in2 = *(npy_u@type@ *)ip2; - if (NPY_UNLIKELY(in2 == 0 || (in1 == (npy_u@type@)NPY_MIN_@TYPE@ && in2 == (npy_u@type@)-1))) { + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { npy_set_floatstatus_divbyzero(); - *((npy_u@type@ *)op1) = 0; + *((@type@ *)op1) = 0; } else{ - *((npy_u@type@ *)op1) = in1 / in2; + *((@type@ *)op1) = in1 / in2; } } } From f74f5003090a4c7ec83dead46567bd96de6dfce8 Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Fri, 21 May 2021 04:40:29 +0200 Subject: [PATCH 10/10] SIMD: Fix computing the fast int32 division parameters When the divisor is equal to the minimum integer value, It was affected by gcc 9.3 only and under certain conditions of aggressive optimization. 
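As a scalar illustration of the corner case (the helper names below are made up and are not part of this diff; <stdint.h> types stand in for npy_int32/npy_int64): abs()/llabs() of the most negative integer is undefined, which gcc 9.3 could exploit under aggressive optimization, so the fix tests the divisor's bit pattern before any absolute value is taken and hard-codes the (multiplier, shift) pair for that single value.

#include <stdint.h>

/* True only for the one divisor whose absolute value is not representable,
 * checked on the bit pattern so abs()/llabs() is never evaluated for it. */
static inline int abs_overflows_s32(int32_t d)
{
    return (uint32_t)d == 0x80000000U;              /* d == INT32_MIN */
}

static inline int abs_overflows_s64(int64_t d)
{
    return (uint64_t)d == 0x8000000000000000ULL;    /* d == INT64_MIN */
}
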
--- numpy/core/src/common/simd/intdiv.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/numpy/core/src/common/simd/intdiv.h b/numpy/core/src/common/simd/intdiv.h index 1ce3b4df834d..f6ea9abf254e 100644 --- a/numpy/core/src/common/simd/intdiv.h +++ b/numpy/core/src/common/simd/intdiv.h @@ -368,18 +368,18 @@ NPY_FINLINE npyv_s32x3 npyv_divisor_s32(npy_int32 d) { npy_int32 d1 = abs(d); npy_int32 sh, m; - if (d1 > 1) { + // Handel abs overflow + if ((npy_uint32)d == 0x80000000U) { + m = 0x80000001; + sh = 30; + } + else if (d1 > 1) { sh = npyv__bitscan_revnz_u32(d1 - 1); // ceil(log2(abs(d))) - 1 m = (1ULL << (32 + sh)) / d1 + 1; // multiplier } else if (d1 == 1) { sh = 0; m = 1; } - // fix abs overflow - else if (d == (1 << 31)) { - m = d + 1; - sh = 30; - } else { // raise arithmetic exception for d == 0 sh = m = 1 / ((npy_int32 volatile *)&d)[0]; // LCOV_EXCL_LINE @@ -445,18 +445,18 @@ NPY_FINLINE npyv_s64x3 npyv_divisor_s64(npy_int64 d) #else npy_int64 d1 = llabs(d); npy_int64 sh, m; - if (d1 > 1) { + // Handel abs overflow + if ((npy_uint64)d == 0x8000000000000000ULL) { + m = 0x8000000000000001LL; + sh = 62; + } + else if (d1 > 1) { sh = npyv__bitscan_revnz_u64(d1 - 1); // ceil(log2(abs(d))) - 1 m = npyv__divh128_u64(1ULL << sh, d1) + 1; // multiplier } else if (d1 == 1) { sh = 0; m = 1; } - // fix abs overflow - else if (d == (1LL << 63)) { - m = d + 1; - sh = 62; - } else { // raise arithmetic exception for d == 0 sh = m = 1 / ((npy_int64 volatile *)&d)[0]; // LCOV_EXCL_LINE
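
For reference, a scalar sketch (illustrative only, not taken from the patches; names are made up) of the Granlund-Montgomery style floor-division sequence that the signed SIMD kernel performs lane-wise, assuming the d == 0 and INT_MIN / -1 overflow cases are filtered out beforehand, as the dispatched loops above do.

#include <stdint.h>

/* Scalar model of the per-lane steps in simd_divide_by_scalar_contig for the
 * signed path: bias the dividend by the sign difference, divide with C's
 * truncating division, then subtract one when the quotient must round toward
 * negative infinity. */
static inline int32_t
floor_div_model_s32(int32_t a, int32_t d)
{
    int32_t nsign_d   = (d < 0);              /* 1 if the divisor is negative */
    int32_t nsign_a   = (a < nsign_d);        /* 1 if a < (d < 0 ? 1 : 0)     */
    int32_t diff_sign = nsign_a - nsign_d;
    int32_t to_ninf   = nsign_a ^ nsign_d;    /* 1 when rounding down is needed */
    int32_t trunc     = (a + diff_sign) / d;  /* truncating C division        */
    return trunc - to_ninf;
}

/* e.g. floor_div_model_s32(-7, 3) == -3 and floor_div_model_s32(7, -3) == -3,
 * matching Python-style floor division rather than C's truncation. */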