Merge pull request #20131 from Developer-Ecosystem-Engineering/as_min_max

mattip · web-flow · commit 2d749723319e · 2022-01-11T09:14:53.000+02:00
BUG: min/max is slow, re-implement using NEON (#17989)
diff --git a/.gitignore b/.gitignore
@@ -218,6 +218,7 @@ numpy/core/src/_simd/_simd_inc.h
 numpy/core/src/umath/loops_unary_fp.dispatch.c
 numpy/core/src/umath/loops_arithm_fp.dispatch.c
 numpy/core/src/umath/loops_arithmetic.dispatch.c
+numpy/core/src/umath/loops_minmax.dispatch.c
 numpy/core/src/umath/loops_trigonometric.dispatch.c
 numpy/core/src/umath/loops_exponent_log.dispatch.c
 numpy/core/src/umath/loops_umath_fp.dispatch.c
diff --git a/benchmarks/benchmarks/bench_reduce.py b/benchmarks/benchmarks/bench_reduce.py
@@ -46,7 +46,8 @@ def time_any_slow(self):
 
 
 class MinMax(Benchmark):
-    params = [np.float32, np.float64, np.intp]
+    params = [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32,
+              np.int64, np.uint64, np.float32, np.float64, np.intp]
     param_names = ['dtype']
 
     def setup(self, dtype):
@@ -58,8 +59,21 @@ def time_min(self, dtype):
     def time_max(self, dtype):
         np.max(self.d)
 
+class FMinMax(Benchmark):
+    params = [np.float32, np.float64]
+    param_names = ['dtype']
+
+    def setup(self, dtype):
+        self.d = np.ones(20000, dtype=dtype)
+
+    def time_min(self, dtype):
+        np.fmin.reduce(self.d)
+
+    def time_max(self, dtype):
+        np.fmax.reduce(self.d)
+
 class ArgMax(Benchmark):
-    params = [np.float32, bool]
+    params = [np.float32, np.float64, bool]
     param_names = ['dtype']
 
     def setup(self, dtype):
diff --git a/benchmarks/benchmarks/bench_ufunc_strides.py b/benchmarks/benchmarks/bench_ufunc_strides.py
@@ -44,27 +44,40 @@ def setup(self, stride, dtype):
     def time_log(self, stride, dtype):
         np.log(self.arr[::stride])
 
-avx_bfuncs = ['maximum',
-              'minimum']
 
-class AVX_BFunc(Benchmark):
+binary_ufuncs = [
+    'maximum', 'minimum', 'fmax', 'fmin'
+]
+binary_dtype = ['f', 'd']
 
-    params = [avx_bfuncs, dtype, stride]
-    param_names = ['avx_based_bfunc', 'dtype', 'stride']
+class Binary(Benchmark):
+    param_names = ['ufunc', 'stride_in0', 'stride_in1', 'stride_out', 'dtype']
+    params = [binary_ufuncs, stride, stride, stride_out, binary_dtype]
     timeout = 10
 
-    def setup(self, ufuncname, dtype, stride):
+    def setup(self, ufuncname, stride_in0, stride_in1, stride_out, dtype):
         np.seterr(all='ignore')
         try:
             self.f = getattr(np, ufuncname)
         except AttributeError:
             raise NotImplementedError(f"No ufunc {ufuncname} found") from None
-        N = 10000
-        self.arr1 = np.array(np.random.rand(stride*N), dtype=dtype)
-        self.arr2 = np.array(np.random.rand(stride*N), dtype=dtype)
+        N = 100000
+        self.arr1 = np.array(np.random.rand(stride_in0*N), dtype=dtype)
+        self.arr2 = np.array(np.random.rand(stride_in1*N), dtype=dtype)
+        self.arr_out = np.empty(stride_out*N, dtype)
 
-    def time_ufunc(self, ufuncname, dtype, stride):
-        self.f(self.arr1[::stride], self.arr2[::stride])
+    def time_ufunc(self, ufuncname, stride_in0, stride_in1, stride_out, dtype):
+        self.f(self.arr1[::stride_in0], self.arr2[::stride_in1],
+               self.arr_out[::stride_out])
+
+
+binary_int_ufuncs = ['maximum', 'minimum']
+binary_int_dtype = ['b', 'B', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q']
+
+class BinaryInt(Binary):
+
+    param_names = ['ufunc', 'stride_in0', 'stride_in1', 'stride_out', 'dtype']
+    params = [binary_int_ufuncs, stride, stride, stride_out, binary_int_dtype]
 
 class AVX_ldexp(Benchmark):
 
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
@@ -522,14 +522,14 @@ def english_upper(s):
     Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.maximum'),
           'PyUFunc_SimpleUniformOperationTypeResolver',
-          TD(noobj, simd=[('avx512f', 'fd')]),
+          TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]),
           TD(O, f='npy_ObjectMax')
           ),
 'minimum':
     Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.minimum'),
           'PyUFunc_SimpleUniformOperationTypeResolver',
-          TD(noobj, simd=[('avx512f', 'fd')]),
+          TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]),
           TD(O, f='npy_ObjectMin')
           ),
 'clip':
@@ -543,13 +543,15 @@ def english_upper(s):
     Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.fmax'),
           'PyUFunc_SimpleUniformOperationTypeResolver',
+          TD('fdg', dispatch=[('loops_minmax', 'fdg')]),
           TD(noobj),
           TD(O, f='npy_ObjectMax')
           ),
 'fmin':
     Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.fmin'),
           'PyUFunc_SimpleUniformOperationTypeResolver',
+          TD('fdg', dispatch=[('loops_minmax', 'fdg')]),
           TD(noobj),
           TD(O, f='npy_ObjectMin')
           ),
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
@@ -999,6 +999,7 @@ def generate_umath_doc_header(ext, build_dir):
             join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
             join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'),
             join('src', 'umath', 'loops_arithmetic.dispatch.c.src'),
+            join('src', 'umath', 'loops_minmax.dispatch.c.src'),
             join('src', 'umath', 'loops_trigonometric.dispatch.c.src'),
             join('src', 'umath', 'loops_umath_fp.dispatch.c.src'),
             join('src', 'umath', 'loops_exponent_log.dispatch.c.src'),
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
@@ -724,32 +724,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
 
 /**end repeat1**/
 
-/**begin repeat1
- * #kind = maximum, minimum#
- * #OP =  >, <#
- **/
-
-NPY_NO_EXPORT void
-@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    if (IS_BINARY_REDUCE) {
-        BINARY_REDUCE_LOOP(@type@) {
-            const @type@ in2 = *(@type@ *)ip2;
-            io1 = (io1 @OP@ in2) ? io1 : in2;
-        }
-        *((@type@ *)iop1) = io1;
-    }
-    else {
-        BINARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            const @type@ in2 = *(@type@ *)ip2;
-            *((@type@ *)op1) = (in1 @OP@ in2) ? in1 : in2;
-        }
-    }
-}
-
-/**end repeat1**/
-
 NPY_NO_EXPORT void
 @TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
@@ -1684,93 +1658,6 @@ NPY_NO_EXPORT void
     }
 }
 
-/**begin repeat1
- * #kind = maximum, minimum#
- * #OP =  >=, <=#
- **/
-NPY_NO_EXPORT void
-@TYPE@_@kind@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    /*  */
-    if (IS_BINARY_REDUCE) {
-        if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) {
-            BINARY_REDUCE_LOOP(@type@) {
-                const @type@ in2 = *(@type@ *)ip2;
-                /* Order of operations important for MSVC 2015 */
-                io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2;
-            }
-            *((@type@ *)iop1) = io1;
-        }
-    }
-    else {
-        if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
-            BINARY_LOOP {
-                @type@ in1 = *(@type@ *)ip1;
-                const @type@ in2 = *(@type@ *)ip2;
-                /* Order of operations important for MSVC 2015 */
-                in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? in1 : in2;
-                *((@type@ *)op1) = in1;
-            }
-        }
-    }
-    npy_clear_floatstatus_barrier((char*)dimensions);
-}
-
-NPY_NO_EXPORT void
-@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    /*  */
-    if (IS_BINARY_REDUCE) {
-        if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) {
-            BINARY_REDUCE_LOOP(@type@) {
-                const @type@ in2 = *(@type@ *)ip2;
-                /* Order of operations important for MSVC 2015 */
-                io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2;
-            }
-            *((@type@ *)iop1) = io1;
-        }
-    }
-    else {
-        BINARY_LOOP {
-            @type@ in1 = *(@type@ *)ip1;
-            const @type@ in2 = *(@type@ *)ip2;
-            /* Order of operations important for MSVC 2015 */
-            in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? in1 : in2;
-            *((@type@ *)op1) = in1;
-        }
-    }
-    npy_clear_floatstatus_barrier((char*)dimensions);
-}
-/**end repeat1**/
-
-/**begin repeat1
- * #kind = fmax, fmin#
- * #OP =  >=, <=#
- **/
-NPY_NO_EXPORT void
-@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    /*  */
-    if (IS_BINARY_REDUCE) {
-        BINARY_REDUCE_LOOP(@type@) {
-            const @type@ in2 = *(@type@ *)ip2;
-            /* Order of operations important for MSVC 2015 */
-            io1 = (io1 @OP@ in2 || npy_isnan(in2)) ? io1 : in2;
-        }
-        *((@type@ *)iop1) = io1;
-    }
-    else {
-        BINARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            const @type@ in2 = *(@type@ *)ip2;
-            /* Order of operations important for MSVC 2015 */
-            *((@type@ *)op1) = (in1 @OP@ in2 || npy_isnan(in2)) ? in1 : in2;
-        }
-    }
-    npy_clear_floatstatus_barrier((char*)dimensions);
-}
-/**end repeat1**/
-
 NPY_NO_EXPORT void
 @TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
@@ -22,7 +22,6 @@
 #define BOOL_fmax BOOL_maximum
 #define BOOL_fmin BOOL_minimum
 
-
 /*
  *****************************************************************************
  **                             BOOLEAN LOOPS                               **
@@ -658,6 +657,43 @@ OBJECT_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void
 NPY_NO_EXPORT void
 PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func);
 
+/*
+ *****************************************************************************
+ **                            MIN/MAX LOOPS                                **
+ *****************************************************************************
+ */
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_minmax.dispatch.h"
+#endif
+
+//---------- Integers ----------
+
+/**begin repeat
+ * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ *         LONG, ULONG, LONGLONG, ULONGLONG#
+ */
+/**begin repeat1
+ * #kind = maximum, minimum#
+ */
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/**end repeat1**/
+/**end repeat**/
+
+//---------- Float ----------
+
+ /**begin repeat
+  * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
+  */
+/**begin repeat1
+ * #kind = maximum, minimum, fmax, fmin#
+ */
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/**end repeat1**/
+/**end repeat**/
+
 /*
  *****************************************************************************
  **                              END LOOPS                                  **
diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src