ENH: Vectorize mod/divide operations using the universal intrinsics

rafaelcfsousa · rafaelcfsousa · commit cb74ca62894e · 2022-02-26T20:37:22.000-06:00
This commit uses the VSX4/Power10 integer vector division/modulo
instructions to optimize the operations below:
 - fmod (signed/unsigned integers)
 - remainder (signed/unsigned integers)
 - divmod (signed/unsigned integers)
 - floor_divide (signed integers)

See the improvements below (maximum speedup):
 - numpy.fmod
   - arr OP arr:    signed (1.17x), unsigned (1.13x)
   - arr OP scalar: signed (1.34x), unsigned (1.29x)
 - numpy.remainder
   - arr OP arr:    signed (4.19x), unsigned (1.17x)
   - arr OP scalar: signed (4.87x), unsigned (1.29x)
 - numpy.divmod
   - arr OP arr:    signed (4.73x), unsigned (1.23x)
   - arr OP scalar: signed (5.05x), unsigned (1.31x)
 - numpy.floor_divide
   - arr OP arr:    signed (4.44x)

The speedups above were measured using the benchmark tool available in NumPy.
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
@@ -364,7 +364,7 @@ def english_upper(s):
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.fmod'),
           None,
-          TD(ints),
+          TD(ints, dispatch=[('loops_arithmetic', ints)]),
           TD(flts, f='fmod', astype={'e': 'f'}),
           TD(P, f='fmod'),
           ),
@@ -884,15 +884,17 @@ def english_upper(s):
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.remainder'),
           'PyUFunc_RemainderTypeResolver',
-          TD(intflt),
+          TD(ints, dispatch=[('loops_arithmetic', ints)]),
+          TD(flts),
           [TypeDescription('m', FullTypeDescr, 'mm', 'm')],
           TD(O, f='PyNumber_Remainder'),
           ),
 'divmod':
     Ufunc(2, 2, None,
           docstrings.get('numpy.core.umath.divmod'),
           'PyUFunc_DivmodTypeResolver',
-          TD(intflt),
+          TD(ints, dispatch=[('loops_arithmetic', ints)]),
+          TD(flts),
           [TypeDescription('m', FullTypeDescr, 'mm', 'qm')],
           # TD(O, f='PyNumber_Divmod'),  # gh-9730
           ),
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
@@ -764,23 +764,6 @@ NPY_NO_EXPORT void
     }
 }
 
-NPY_NO_EXPORT void
-@TYPE@_fmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    BINARY_LOOP {
-        const @type@ in1 = *(@type@ *)ip1;
-        const @type@ in2 = *(@type@ *)ip2;
-        if (in2 == 0) {
-            npy_set_floatstatus_divbyzero();
-            *((@type@ *)op1) = 0;
-        }
-        else {
-            *((@type@ *)op1)= in1 % in2;
-        }
-
-    }
-}
-
 /**begin repeat1
  * #kind = isnan, isinf, isfinite#
  * #func = npy_isnan, npy_isinf, npy_isfinite#
@@ -817,57 +800,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void
     UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
 }
 
-NPY_NO_EXPORT void
-@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    BINARY_LOOP {
-        const @type@ in1 = *(@type@ *)ip1;
-        const @type@ in2 = *(@type@ *)ip2;
-        if (in2 == 0) {
-            npy_set_floatstatus_divbyzero();
-            *((@type@ *)op1) = 0;
-        }
-        else {
-            /* handle mixed case the way Python does */
-            const @type@ rem = in1 % in2;
-            if ((in1 > 0) == (in2 > 0) || rem == 0) {
-                *((@type@ *)op1) = rem;
-            }
-            else {
-                *((@type@ *)op1) = rem + in2;
-            }
-        }
-    }
-}
-
-NPY_NO_EXPORT void
-@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    BINARY_LOOP_TWO_OUT {
-        const @type@ in1 = *(@type@ *)ip1;
-        const @type@ in2 = *(@type@ *)ip2;
-        /* see FIXME note for divide above */
-        if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) {
-            npy_set_floatstatus_divbyzero();
-            *((@type@ *)op1) = 0;
-            *((@type@ *)op2) = 0;
-        }
-        else {
-            /* handle mixed case the way Python does */
-            const @type@ quo = in1 / in2;
-            const @type@ rem = in1 % in2;
-            if ((in1 > 0) == (in2 > 0) || rem == 0) {
-                *((@type@ *)op1) = quo;
-                *((@type@ *)op2) = rem;
-            }
-            else {
-                *((@type@ *)op1) = quo - 1;
-                *((@type@ *)op2) = rem + in2;
-            }
-        }
-    }
-}
-
 /**begin repeat1
  * #kind = gcd, lcm#
  **/
@@ -902,40 +834,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void
     UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : 0);
 }
 
-NPY_NO_EXPORT void
-@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    BINARY_LOOP {
-        const @type@ in1 = *(@type@ *)ip1;
-        const @type@ in2 = *(@type@ *)ip2;
-        if (in2 == 0) {
-            npy_set_floatstatus_divbyzero();
-            *((@type@ *)op1) = 0;
-        }
-        else {
-            *((@type@ *)op1) = in1 % in2;
-        }
-    }
-}
-
-NPY_NO_EXPORT void
-@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    BINARY_LOOP_TWO_OUT {
-        const @type@ in1 = *(@type@ *)ip1;
-        const @type@ in2 = *(@type@ *)ip2;
-        if (in2 == 0) {
-            npy_set_floatstatus_divbyzero();
-            *((@type@ *)op1) = 0;
-            *((@type@ *)op2) = 0;
-        }
-        else {
-            *((@type@ *)op1)= in1/in2;
-            *((@type@ *)op2) = in1 % in2;
-        }
-    }
-}
-
 /**begin repeat1
  * #kind = gcd, lcm#
  **/
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
@@ -62,6 +62,15 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
  */
  NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide,
      (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_fmod,
+     (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_remainder,
+     (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divmod,
+     (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
 /**end repeat**/
 
 /**begin repeat
@@ -142,21 +151,12 @@ NPY_NO_EXPORT void
 NPY_NO_EXPORT void
 @S@@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
-NPY_NO_EXPORT void
-@S@@TYPE@_fmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
 NPY_NO_EXPORT void
 @S@@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
 NPY_NO_EXPORT void
 @S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
-NPY_NO_EXPORT void
-@S@@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
 NPY_NO_EXPORT void
 @S@@TYPE@_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src