Skip to content

Commit 2d74972

Browse files
authored
Merge pull request #20131 from Developer-Ecosystem-Engineering/as_min_max
BUG: min/max is slow, re-implement using NEON (#17989)
2 parents acf33eb + 5fca2bf commit 2d74972

File tree

9 files changed

+637
-314
lines changed

.gitignore

+1
Original file line number | Diff line number | Diff line change
@@ -218,6 +218,7 @@ numpy/core/src/_simd/_simd_inc.h
218218
numpy/core/src/umath/loops_unary_fp.dispatch.c
219219
numpy/core/src/umath/loops_arithm_fp.dispatch.c
220220
numpy/core/src/umath/loops_arithmetic.dispatch.c
221+
numpy/core/src/umath/loops_minmax.dispatch.c
221222
numpy/core/src/umath/loops_trigonometric.dispatch.c
222223
numpy/core/src/umath/loops_exponent_log.dispatch.c
223224
numpy/core/src/umath/loops_umath_fp.dispatch.c

benchmarks/benchmarks/bench_reduce.py

+16 -2
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,8 @@ def time_any_slow(self):
4646

4747

4848
class MinMax(Benchmark):
49-
params = [np.float32, np.float64, np.intp]
49+
params = [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32,
50+
np.int64, np.uint64, np.float32, np.float64, np.intp]
5051
param_names = ['dtype']
5152

5253
def setup(self, dtype):
@@ -58,8 +59,21 @@ def time_min(self, dtype):
5859
def time_max(self, dtype):
5960
np.max(self.d)
6061

62+
class FMinMax(Benchmark):
63+
params = [np.float32, np.float64]
64+
param_names = ['dtype']
65+
66+
def setup(self, dtype):
67+
self.d = np.ones(20000, dtype=dtype)
68+
69+
def time_min(self, dtype):
70+
np.fmin.reduce(self.d)
71+
72+
def time_max(self, dtype):
73+
np.fmax.reduce(self.d)
74+
6175
class ArgMax(Benchmark):
62-
params = [np.float32, bool]
76+
params = [np.float32, np.float64, bool]
6377
param_names = ['dtype']
6478

6579
def setup(self, dtype):

benchmarks/benchmarks/bench_ufunc_strides.py

+24 -11
Original file line number | Diff line number | Diff line change
@@ -44,27 +44,40 @@ def setup(self, stride, dtype):
4444
def time_log(self, stride, dtype):
4545
np.log(self.arr[::stride])
4646

47-
avx_bfuncs = ['maximum',
48-
'minimum']
4947

50-
class AVX_BFunc(Benchmark):
48+
binary_ufuncs = [
49+
'maximum', 'minimum', 'fmax', 'fmin'
50+
]
51+
binary_dtype = ['f', 'd']
5152

52-
params = [avx_bfuncs, dtype, stride]
53-
param_names = ['avx_based_bfunc', 'dtype', 'stride']
53+
class Binary(Benchmark):
54+
param_names = ['ufunc', 'stride_in0', 'stride_in1', 'stride_out', 'dtype']
55+
params = [binary_ufuncs, stride, stride, stride_out, binary_dtype]
5456
timeout = 10
5557

56-
def setup(self, ufuncname, dtype, stride):
58+
def setup(self, ufuncname, stride_in0, stride_in1, stride_out, dtype):
5759
np.seterr(all='ignore')
5860
try:
5961
self.f = getattr(np, ufuncname)
6062
except AttributeError:
6163
raise NotImplementedError(f"No ufunc {ufuncname} found") from None
62-
N = 10000
63-
self.arr1 = np.array(np.random.rand(stride*N), dtype=dtype)
64-
self.arr2 = np.array(np.random.rand(stride*N), dtype=dtype)
64+
N = 100000
65+
self.arr1 = np.array(np.random.rand(stride_in0*N), dtype=dtype)
66+
self.arr2 = np.array(np.random.rand(stride_in1*N), dtype=dtype)
67+
self.arr_out = np.empty(stride_out*N, dtype)
6568

66-
def time_ufunc(self, ufuncname, dtype, stride):
67-
self.f(self.arr1[::stride], self.arr2[::stride])
69+
def time_ufunc(self, ufuncname, stride_in0, stride_in1, stride_out, dtype):
70+
self.f(self.arr1[::stride_in0], self.arr2[::stride_in1],
71+
self.arr_out[::stride_out])
72+
73+
74+
binary_int_ufuncs = ['maximum', 'minimum']
75+
binary_int_dtype = ['b', 'B', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q']
76+
77+
class BinaryInt(Binary):
78+
79+
param_names = ['ufunc', 'stride_in0', 'stride_in1', 'stride_out', 'dtype']
80+
params = [binary_int_ufuncs, stride, stride, stride_out, binary_int_dtype]
6881

6982
class AVX_ldexp(Benchmark):
7083

numpy/core/code_generators/generate_umath.py

+4 -2
Original file line number | Diff line number | Diff line change
@@ -522,14 +522,14 @@ def english_upper(s):
522522
Ufunc(2, 1, ReorderableNone,
523523
docstrings.get('numpy.core.umath.maximum'),
524524
'PyUFunc_SimpleUniformOperationTypeResolver',
525-
TD(noobj, simd=[('avx512f', 'fd')]),
525+
TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]),
526526
TD(O, f='npy_ObjectMax')
527527
),
528528
'minimum':
529529
Ufunc(2, 1, ReorderableNone,
530530
docstrings.get('numpy.core.umath.minimum'),
531531
'PyUFunc_SimpleUniformOperationTypeResolver',
532-
TD(noobj, simd=[('avx512f', 'fd')]),
532+
TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]),
533533
TD(O, f='npy_ObjectMin')
534534
),
535535
'clip':
@@ -543,13 +543,15 @@ def english_upper(s):
543543
Ufunc(2, 1, ReorderableNone,
544544
docstrings.get('numpy.core.umath.fmax'),
545545
'PyUFunc_SimpleUniformOperationTypeResolver',
546+
TD('fdg', dispatch=[('loops_minmax', 'fdg')]),
546547
TD(noobj),
547548
TD(O, f='npy_ObjectMax')
548549
),
549550
'fmin':
550551
Ufunc(2, 1, ReorderableNone,
551552
docstrings.get('numpy.core.umath.fmin'),
552553
'PyUFunc_SimpleUniformOperationTypeResolver',
554+
TD('fdg', dispatch=[('loops_minmax', 'fdg')]),
553555
TD(noobj),
554556
TD(O, f='npy_ObjectMin')
555557
),

numpy/core/setup.py

+1
Original file line number | Diff line number | Diff line change
@@ -999,6 +999,7 @@ def generate_umath_doc_header(ext, build_dir):
999999
join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
10001000
join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'),
10011001
join('src', 'umath', 'loops_arithmetic.dispatch.c.src'),
1002+
join('src', 'umath', 'loops_minmax.dispatch.c.src'),
10021003
join('src', 'umath', 'loops_trigonometric.dispatch.c.src'),
10031004
join('src', 'umath', 'loops_umath_fp.dispatch.c.src'),
10041005
join('src', 'umath', 'loops_exponent_log.dispatch.c.src'),

numpy/core/src/umath/loops.c.src

-113
Original file line number | Diff line number | Diff line change
@@ -724,32 +724,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
724724

725725
/**end repeat1**/
726726

727-
/**begin repeat1
728-
* #kind = maximum, minimum#
729-
* #OP = >, <#
730-
**/
731-
732-
NPY_NO_EXPORT void
733-
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
734-
{
735-
if (IS_BINARY_REDUCE) {
736-
BINARY_REDUCE_LOOP(@type@) {
737-
const @type@ in2 = *(@type@ *)ip2;
738-
io1 = (io1 @OP@ in2) ? io1 : in2;
739-
}
740-
*((@type@ *)iop1) = io1;
741-
}
742-
else {
743-
BINARY_LOOP {
744-
const @type@ in1 = *(@type@ *)ip1;
745-
const @type@ in2 = *(@type@ *)ip2;
746-
*((@type@ *)op1) = (in1 @OP@ in2) ? in1 : in2;
747-
}
748-
}
749-
}
750-
751-
/**end repeat1**/
752-
753727
NPY_NO_EXPORT void
754728
@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
755729
{
@@ -1684,93 +1658,6 @@ NPY_NO_EXPORT void
16841658
}
16851659
}
16861660

1687-
/**begin repeat1
1688-
* #kind = maximum, minimum#
1689-
* #OP = >=, <=#
1690-
**/
1691-
NPY_NO_EXPORT void
1692-
@TYPE@_@kind@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
1693-
{
1694-
/* */
1695-
if (IS_BINARY_REDUCE) {
1696-
if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) {
1697-
BINARY_REDUCE_LOOP(@type@) {
1698-
const @type@ in2 = *(@type@ *)ip2;
1699-
/* Order of operations important for MSVC 2015 */
1700-
io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2;
1701-
}
1702-
*((@type@ *)iop1) = io1;
1703-
}
1704-
}
1705-
else {
1706-
if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
1707-
BINARY_LOOP {
1708-
@type@ in1 = *(@type@ *)ip1;
1709-
const @type@ in2 = *(@type@ *)ip2;
1710-
/* Order of operations important for MSVC 2015 */
1711-
in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? in1 : in2;
1712-
*((@type@ *)op1) = in1;
1713-
}
1714-
}
1715-
}
1716-
npy_clear_floatstatus_barrier((char*)dimensions);
1717-
}
1718-
1719-
NPY_NO_EXPORT void
1720-
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
1721-
{
1722-
/* */
1723-
if (IS_BINARY_REDUCE) {
1724-
if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) {
1725-
BINARY_REDUCE_LOOP(@type@) {
1726-
const @type@ in2 = *(@type@ *)ip2;
1727-
/* Order of operations important for MSVC 2015 */
1728-
io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2;
1729-
}
1730-
*((@type@ *)iop1) = io1;
1731-
}
1732-
}
1733-
else {
1734-
BINARY_LOOP {
1735-
@type@ in1 = *(@type@ *)ip1;
1736-
const @type@ in2 = *(@type@ *)ip2;
1737-
/* Order of operations important for MSVC 2015 */
1738-
in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? in1 : in2;
1739-
*((@type@ *)op1) = in1;
1740-
}
1741-
}
1742-
npy_clear_floatstatus_barrier((char*)dimensions);
1743-
}
1744-
/**end repeat1**/
1745-
1746-
/**begin repeat1
1747-
* #kind = fmax, fmin#
1748-
* #OP = >=, <=#
1749-
**/
1750-
NPY_NO_EXPORT void
1751-
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
1752-
{
1753-
/* */
1754-
if (IS_BINARY_REDUCE) {
1755-
BINARY_REDUCE_LOOP(@type@) {
1756-
const @type@ in2 = *(@type@ *)ip2;
1757-
/* Order of operations important for MSVC 2015 */
1758-
io1 = (io1 @OP@ in2 || npy_isnan(in2)) ? io1 : in2;
1759-
}
1760-
*((@type@ *)iop1) = io1;
1761-
}
1762-
else {
1763-
BINARY_LOOP {
1764-
const @type@ in1 = *(@type@ *)ip1;
1765-
const @type@ in2 = *(@type@ *)ip2;
1766-
/* Order of operations important for MSVC 2015 */
1767-
*((@type@ *)op1) = (in1 @OP@ in2 || npy_isnan(in2)) ? in1 : in2;
1768-
}
1769-
}
1770-
npy_clear_floatstatus_barrier((char*)dimensions);
1771-
}
1772-
/**end repeat1**/
1773-
17741661
NPY_NO_EXPORT void
17751662
@TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
17761663
{

numpy/core/src/umath/loops.h.src

+37 -1
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,6 @@
2222
#define BOOL_fmax BOOL_maximum
2323
#define BOOL_fmin BOOL_minimum
2424

25-
2625
/*
2726
*****************************************************************************
2827
** BOOLEAN LOOPS **
@@ -658,6 +657,43 @@ OBJECT_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void
658657
NPY_NO_EXPORT void
659658
PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func);
660659

660+
/*
661+
*****************************************************************************
662+
** MIN/MAX LOOPS **
663+
*****************************************************************************
664+
*/
665+
666+
#ifndef NPY_DISABLE_OPTIMIZATION
667+
#include "loops_minmax.dispatch.h"
668+
#endif
669+
670+
//---------- Integers ----------
671+
672+
/**begin repeat
673+
* #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
674+
* LONG, ULONG, LONGLONG, ULONGLONG#
675+
*/
676+
/**begin repeat1
677+
* #kind = maximum, minimum#
678+
*/
679+
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
680+
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
681+
/**end repeat1**/
682+
/**end repeat**/
683+
684+
//---------- Float ----------
685+
686+
/**begin repeat
687+
* #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
688+
*/
689+
/**begin repeat1
690+
* #kind = maximum, minimum, fmax, fmin#
691+
*/
692+
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
693+
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
694+
/**end repeat1**/
695+
/**end repeat**/
696+
661697
/*
662698
*****************************************************************************
663699
** END LOOPS **

0 commit comments

Comments
 (0)