Skip to content

Commit cb74ca6

Browse files
committed
ENH: Vectorize mod/divide operations using the universal intrinsics
This commit optimizes the operations below:
- fmod (signed/unsigned integers)
- remainder (signed/unsigned integers)
- divmod (signed/unsigned integers)
- floor_divide (signed integers)

using the VSX4/Power10 integer vector division/modulo instructions.

See the improvements below (maximum speedup):
- numpy.fmod
  - arr OP arr: signed (1.17x), unsigned (1.13x)
  - arr OP scalar: signed (1.34x), unsigned (1.29x)
- numpy.remainder
  - arr OP arr: signed (4.19x), unsigned (1.17x)
  - arr OP scalar: signed (4.87x), unsigned (1.29x)
- numpy.divmod
  - arr OP arr: signed (4.73x), unsigned (1.23x)
  - arr OP scalar: signed (5.05x), unsigned (1.31x)
- numpy.floor_divide
  - arr OP arr: signed (4.44x)

The times above were collected using the benchmark tool available in NumPy.
1 parent 5b9b939 commit cb74ca6

File tree

4 files changed

+673
-172
lines changed

4 files changed

+673
-172
lines changed

numpy/core/code_generators/generate_umath.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -364,7 +364,7 @@ def english_upper(s):
364364
Ufunc(2, 1, None,
365365
docstrings.get('numpy.core.umath.fmod'),
366366
None,
367-
TD(ints),
367+
TD(ints, dispatch=[('loops_arithmetic', ints)]),
368368
TD(flts, f='fmod', astype={'e': 'f'}),
369369
TD(P, f='fmod'),
370370
),
@@ -884,15 +884,17 @@ def english_upper(s):
884884
Ufunc(2, 1, None,
885885
docstrings.get('numpy.core.umath.remainder'),
886886
'PyUFunc_RemainderTypeResolver',
887-
TD(intflt),
887+
TD(ints, dispatch=[('loops_arithmetic', ints)]),
888+
TD(flts),
888889
[TypeDescription('m', FullTypeDescr, 'mm', 'm')],
889890
TD(O, f='PyNumber_Remainder'),
890891
),
891892
'divmod':
892893
Ufunc(2, 2, None,
893894
docstrings.get('numpy.core.umath.divmod'),
894895
'PyUFunc_DivmodTypeResolver',
895-
TD(intflt),
896+
TD(ints, dispatch=[('loops_arithmetic', ints)]),
897+
TD(flts),
896898
[TypeDescription('m', FullTypeDescr, 'mm', 'qm')],
897899
# TD(O, f='PyNumber_Divmod'), # gh-9730
898900
),

numpy/core/src/umath/loops.c.src

Lines changed: 0 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -764,23 +764,6 @@ NPY_NO_EXPORT void
764764
}
765765
}
766766

767-
NPY_NO_EXPORT void
768-
@TYPE@_fmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
769-
{
770-
BINARY_LOOP {
771-
const @type@ in1 = *(@type@ *)ip1;
772-
const @type@ in2 = *(@type@ *)ip2;
773-
if (in2 == 0) {
774-
npy_set_floatstatus_divbyzero();
775-
*((@type@ *)op1) = 0;
776-
}
777-
else {
778-
*((@type@ *)op1)= in1 % in2;
779-
}
780-
781-
}
782-
}
783-
784767
/**begin repeat1
785768
* #kind = isnan, isinf, isfinite#
786769
* #func = npy_isnan, npy_isinf, npy_isfinite#
@@ -817,57 +800,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void
817800
UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
818801
}
819802

820-
NPY_NO_EXPORT void
821-
@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
822-
{
823-
BINARY_LOOP {
824-
const @type@ in1 = *(@type@ *)ip1;
825-
const @type@ in2 = *(@type@ *)ip2;
826-
if (in2 == 0) {
827-
npy_set_floatstatus_divbyzero();
828-
*((@type@ *)op1) = 0;
829-
}
830-
else {
831-
/* handle mixed case the way Python does */
832-
const @type@ rem = in1 % in2;
833-
if ((in1 > 0) == (in2 > 0) || rem == 0) {
834-
*((@type@ *)op1) = rem;
835-
}
836-
else {
837-
*((@type@ *)op1) = rem + in2;
838-
}
839-
}
840-
}
841-
}
842-
843-
NPY_NO_EXPORT void
844-
@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
845-
{
846-
BINARY_LOOP_TWO_OUT {
847-
const @type@ in1 = *(@type@ *)ip1;
848-
const @type@ in2 = *(@type@ *)ip2;
849-
/* see FIXME note for divide above */
850-
if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) {
851-
npy_set_floatstatus_divbyzero();
852-
*((@type@ *)op1) = 0;
853-
*((@type@ *)op2) = 0;
854-
}
855-
else {
856-
/* handle mixed case the way Python does */
857-
const @type@ quo = in1 / in2;
858-
const @type@ rem = in1 % in2;
859-
if ((in1 > 0) == (in2 > 0) || rem == 0) {
860-
*((@type@ *)op1) = quo;
861-
*((@type@ *)op2) = rem;
862-
}
863-
else {
864-
*((@type@ *)op1) = quo - 1;
865-
*((@type@ *)op2) = rem + in2;
866-
}
867-
}
868-
}
869-
}
870-
871803
/**begin repeat1
872804
* #kind = gcd, lcm#
873805
**/
@@ -902,40 +834,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void
902834
UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : 0);
903835
}
904836

905-
NPY_NO_EXPORT void
906-
@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
907-
{
908-
BINARY_LOOP {
909-
const @type@ in1 = *(@type@ *)ip1;
910-
const @type@ in2 = *(@type@ *)ip2;
911-
if (in2 == 0) {
912-
npy_set_floatstatus_divbyzero();
913-
*((@type@ *)op1) = 0;
914-
}
915-
else {
916-
*((@type@ *)op1) = in1 % in2;
917-
}
918-
}
919-
}
920-
921-
NPY_NO_EXPORT void
922-
@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
923-
{
924-
BINARY_LOOP_TWO_OUT {
925-
const @type@ in1 = *(@type@ *)ip1;
926-
const @type@ in2 = *(@type@ *)ip2;
927-
if (in2 == 0) {
928-
npy_set_floatstatus_divbyzero();
929-
*((@type@ *)op1) = 0;
930-
*((@type@ *)op2) = 0;
931-
}
932-
else {
933-
*((@type@ *)op1)= in1/in2;
934-
*((@type@ *)op2) = in1 % in2;
935-
}
936-
}
937-
}
938-
939837
/**begin repeat1
940838
* #kind = gcd, lcm#
941839
**/

numpy/core/src/umath/loops.h.src

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,15 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
6262
*/
6363
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide,
6464
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
65+
66+
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_fmod,
67+
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
68+
69+
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_remainder,
70+
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
71+
72+
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divmod,
73+
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
6574
/**end repeat**/
6675

6776
/**begin repeat
@@ -142,21 +151,12 @@ NPY_NO_EXPORT void
142151
NPY_NO_EXPORT void
143152
@S@@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
144153

145-
NPY_NO_EXPORT void
146-
@S@@TYPE@_fmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
147-
148154
NPY_NO_EXPORT void
149155
@S@@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
150156

151157
NPY_NO_EXPORT void
152158
@S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
153159

154-
NPY_NO_EXPORT void
155-
@S@@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
156-
157-
NPY_NO_EXPORT void
158-
@S@@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
159-
160160
NPY_NO_EXPORT void
161161
@S@@TYPE@_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
162162

0 commit comments

Comments
 (0)