Skip to content

BUG: min/max is slow, re-implement using NEON (#17989) #20131

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
10 commits merged on Jan 11, 2022
Merged
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ numpy/core/src/_simd/_simd_inc.h
numpy/core/src/umath/loops_unary_fp.dispatch.c
numpy/core/src/umath/loops_arithm_fp.dispatch.c
numpy/core/src/umath/loops_arithmetic.dispatch.c
numpy/core/src/umath/loops_minmax.dispatch.c
numpy/core/src/umath/loops_trigonometric.dispatch.c
numpy/core/src/umath/loops_exponent_log.dispatch.c
numpy/core/src/umath/loops_umath_fp.dispatch.c
18 changes: 16 additions & 2 deletions benchmarks/benchmarks/bench_reduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ def time_any_slow(self):


class MinMax(Benchmark):
params = [np.float32, np.float64, np.intp]
params = [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32,
np.int64, np.uint64, np.float32, np.float64, np.intp]
param_names = ['dtype']

def setup(self, dtype):
Expand All @@ -58,8 +59,21 @@ def time_min(self, dtype):
def time_max(self, dtype):
np.max(self.d)

class FMinMax(Benchmark):
params = [np.float32, np.float64]
param_names = ['dtype']

def setup(self, dtype):
self.d = np.ones(20000, dtype=dtype)

def time_min(self, dtype):
np.fmin.reduce(self.d)

def time_max(self, dtype):
np.fmax.reduce(self.d)

class ArgMax(Benchmark):
params = [np.float32, bool]
params = [np.float32, np.float64, bool]
param_names = ['dtype']

def setup(self, dtype):
Expand Down
35 changes: 24 additions & 11 deletions benchmarks/benchmarks/bench_ufunc_strides.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,27 +44,40 @@ def setup(self, stride, dtype):
def time_log(self, stride, dtype):
np.log(self.arr[::stride])

avx_bfuncs = ['maximum',
'minimum']

class AVX_BFunc(Benchmark):
binary_ufuncs = [
'maximum', 'minimum', 'fmax', 'fmin'
]
binary_dtype = ['f', 'd']

params = [avx_bfuncs, dtype, stride]
param_names = ['avx_based_bfunc', 'dtype', 'stride']
class Binary(Benchmark):
param_names = ['ufunc', 'stride_in0', 'stride_in1', 'stride_out', 'dtype']
params = [binary_ufuncs, stride, stride, stride_out, binary_dtype]
timeout = 10

def setup(self, ufuncname, dtype, stride):
def setup(self, ufuncname, stride_in0, stride_in1, stride_out, dtype):
np.seterr(all='ignore')
try:
self.f = getattr(np, ufuncname)
except AttributeError:
raise NotImplementedError(f"No ufunc {ufuncname} found") from None
N = 10000
self.arr1 = np.array(np.random.rand(stride*N), dtype=dtype)
self.arr2 = np.array(np.random.rand(stride*N), dtype=dtype)
N = 100000
self.arr1 = np.array(np.random.rand(stride_in0*N), dtype=dtype)
self.arr2 = np.array(np.random.rand(stride_in1*N), dtype=dtype)
self.arr_out = np.empty(stride_out*N, dtype)

def time_ufunc(self, ufuncname, dtype, stride):
self.f(self.arr1[::stride], self.arr2[::stride])
def time_ufunc(self, ufuncname, stride_in0, stride_in1, stride_out, dtype):
self.f(self.arr1[::stride_in0], self.arr2[::stride_in1],
self.arr_out[::stride_out])


binary_int_ufuncs = ['maximum', 'minimum']
binary_int_dtype = ['b', 'B', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q']

class BinaryInt(Binary):

param_names = ['ufunc', 'stride_in0', 'stride_in1', 'stride_out', 'dtype']
params = [binary_int_ufuncs, stride, stride, stride_out, binary_int_dtype]

class AVX_ldexp(Benchmark):

Expand Down
6 changes: 4 additions & 2 deletions numpy/core/code_generators/generate_umath.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,14 +516,14 @@ def english_upper(s):
Ufunc(2, 1, ReorderableNone,
docstrings.get('numpy.core.umath.maximum'),
'PyUFunc_SimpleUniformOperationTypeResolver',
TD(noobj, simd=[('avx512f', 'fd')]),
TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]),
TD(O, f='npy_ObjectMax')
),
'minimum':
Ufunc(2, 1, ReorderableNone,
docstrings.get('numpy.core.umath.minimum'),
'PyUFunc_SimpleUniformOperationTypeResolver',
TD(noobj, simd=[('avx512f', 'fd')]),
TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]),
TD(O, f='npy_ObjectMin')
),
'clip':
Expand All @@ -537,13 +537,15 @@ def english_upper(s):
Ufunc(2, 1, ReorderableNone,
docstrings.get('numpy.core.umath.fmax'),
'PyUFunc_SimpleUniformOperationTypeResolver',
TD('fdg', dispatch=[('loops_minmax', 'fdg')]),
TD(noobj),
TD(O, f='npy_ObjectMax')
),
'fmin':
Ufunc(2, 1, ReorderableNone,
docstrings.get('numpy.core.umath.fmin'),
'PyUFunc_SimpleUniformOperationTypeResolver',
TD('fdg', dispatch=[('loops_minmax', 'fdg')]),
TD(noobj),
TD(O, f='npy_ObjectMin')
),
Expand Down
1 change: 1 addition & 0 deletions numpy/core/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -976,6 +976,7 @@ def generate_umath_c(ext, build_dir):
join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'),
join('src', 'umath', 'loops_arithmetic.dispatch.c.src'),
join('src', 'umath', 'loops_minmax.dispatch.c.src'),
join('src', 'umath', 'loops_trigonometric.dispatch.c.src'),
join('src', 'umath', 'loops_umath_fp.dispatch.c.src'),
join('src', 'umath', 'loops_exponent_log.dispatch.c.src'),
Expand Down
113 changes: 0 additions & 113 deletions numpy/core/src/umath/loops.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -724,32 +724,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void

/**end repeat1**/

/**begin repeat1
* #kind = maximum, minimum#
* #OP = >, <#
**/

NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
if (IS_BINARY_REDUCE) {
BINARY_REDUCE_LOOP(@type@) {
const @type@ in2 = *(@type@ *)ip2;
io1 = (io1 @OP@ in2) ? io1 : in2;
}
*((@type@ *)iop1) = io1;
}
else {
BINARY_LOOP {
const @type@ in1 = *(@type@ *)ip1;
const @type@ in2 = *(@type@ *)ip2;
*((@type@ *)op1) = (in1 @OP@ in2) ? in1 : in2;
}
}
}

/**end repeat1**/

NPY_NO_EXPORT void
@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
Expand Down Expand Up @@ -1684,93 +1658,6 @@ NPY_NO_EXPORT void
}
}

/**begin repeat1
* #kind = maximum, minimum#
* #OP = >=, <=#
**/
NPY_NO_EXPORT void
@TYPE@_@kind@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
/* */
if (IS_BINARY_REDUCE) {
if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) {
BINARY_REDUCE_LOOP(@type@) {
const @type@ in2 = *(@type@ *)ip2;
/* Order of operations important for MSVC 2015 */
io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2;
}
*((@type@ *)iop1) = io1;
}
}
else {
if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
BINARY_LOOP {
@type@ in1 = *(@type@ *)ip1;
const @type@ in2 = *(@type@ *)ip2;
/* Order of operations important for MSVC 2015 */
in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? in1 : in2;
*((@type@ *)op1) = in1;
}
}
}
npy_clear_floatstatus_barrier((char*)dimensions);
}

NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
/* */
if (IS_BINARY_REDUCE) {
if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) {
BINARY_REDUCE_LOOP(@type@) {
const @type@ in2 = *(@type@ *)ip2;
/* Order of operations important for MSVC 2015 */
io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2;
}
*((@type@ *)iop1) = io1;
}
}
else {
BINARY_LOOP {
@type@ in1 = *(@type@ *)ip1;
const @type@ in2 = *(@type@ *)ip2;
/* Order of operations important for MSVC 2015 */
in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? in1 : in2;
*((@type@ *)op1) = in1;
}
}
npy_clear_floatstatus_barrier((char*)dimensions);
}
/**end repeat1**/

/**begin repeat1
* #kind = fmax, fmin#
* #OP = >=, <=#
**/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
/* */
if (IS_BINARY_REDUCE) {
BINARY_REDUCE_LOOP(@type@) {
const @type@ in2 = *(@type@ *)ip2;
/* Order of operations important for MSVC 2015 */
io1 = (io1 @OP@ in2 || npy_isnan(in2)) ? io1 : in2;
}
*((@type@ *)iop1) = io1;
}
else {
BINARY_LOOP {
const @type@ in1 = *(@type@ *)ip1;
const @type@ in2 = *(@type@ *)ip2;
/* Order of operations important for MSVC 2015 */
*((@type@ *)op1) = (in1 @OP@ in2 || npy_isnan(in2)) ? in1 : in2;
}
}
npy_clear_floatstatus_barrier((char*)dimensions);
}
/**end repeat1**/

NPY_NO_EXPORT void
@TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
Expand Down
38 changes: 37 additions & 1 deletion numpy/core/src/umath/loops.h.src
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
#define BOOL_fmax BOOL_maximum
#define BOOL_fmin BOOL_minimum


/*
*****************************************************************************
** BOOLEAN LOOPS **
Expand Down Expand Up @@ -658,6 +657,43 @@ OBJECT_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void
NPY_NO_EXPORT void
PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func);

/*
*****************************************************************************
** MIN/MAX LOOPS **
*****************************************************************************
*/

#ifndef NPY_DISABLE_OPTIMIZATION
#include "loops_minmax.dispatch.h"
#endif

//---------- Integers ----------

/**begin repeat
* #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
* LONG, ULONG, LONGLONG, ULONGLONG#
*/
/**begin repeat1
* #kind = maximum, minimum#
*/
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
/**end repeat1**/
/**end repeat**/

//---------- Float ----------

/**begin repeat
* #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
*/
/**begin repeat1
* #kind = maximum, minimum, fmax, fmin#
*/
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
/**end repeat1**/
/**end repeat**/

/*
*****************************************************************************
** END LOOPS **
Expand Down
Loading