Skip to content
2 changes: 1 addition & 1 deletion numpy/core/code_generators/generate_umath.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,7 @@ def english_upper(s):
docstrings.get('numpy.core.umath.floor_divide'),
'PyUFunc_DivisionTypeResolver',
TD(ints, cfunc_alias='divide',
dispatch=[('loops_arithmetic', 'BHILQ')]),
dispatch=[('loops_arithmetic', 'bBhHiIlLqQ')]),
TD(flts + cmplx),
[TypeDescription('m', FullTypeDescr, 'mq', 'm'),
TypeDescription('m', FullTypeDescr, 'md', 'm'),
Expand Down
24 changes: 12 additions & 12 deletions numpy/core/src/common/simd/intdiv.h
Original file line number Diff line number Diff line change
Expand Up @@ -368,18 +368,18 @@ NPY_FINLINE npyv_s32x3 npyv_divisor_s32(npy_int32 d)
{
npy_int32 d1 = abs(d);
npy_int32 sh, m;
if (d1 > 1) {
// Handle abs overflow
if ((npy_uint32)d == 0x80000000U) {
m = 0x80000001;
sh = 30;
}
else if (d1 > 1) {
sh = npyv__bitscan_revnz_u32(d1 - 1); // ceil(log2(abs(d))) - 1
m = (1ULL << (32 + sh)) / d1 + 1; // multiplier
}
else if (d1 == 1) {
sh = 0; m = 1;
}
// fix abs overflow
else if (d == (1 << 31)) {
m = d + 1;
sh = 30;
}
else {
// raise arithmetic exception for d == 0
sh = m = 1 / ((npy_int32 volatile *)&d)[0]; // LCOV_EXCL_LINE
Expand Down Expand Up @@ -445,18 +445,18 @@ NPY_FINLINE npyv_s64x3 npyv_divisor_s64(npy_int64 d)
#else
npy_int64 d1 = llabs(d);
npy_int64 sh, m;
if (d1 > 1) {
// Handle abs overflow
if ((npy_uint64)d == 0x8000000000000000ULL) {
m = 0x8000000000000001LL;
sh = 62;
}
else if (d1 > 1) {
sh = npyv__bitscan_revnz_u64(d1 - 1); // ceil(log2(abs(d))) - 1
m = npyv__divh128_u64(1ULL << sh, d1) + 1; // multiplier
}
else if (d1 == 1) {
sh = 0; m = 1;
}
// fix abs overflow
else if (d == (1LL << 63)) {
m = d + 1;
sh = 62;
}
else {
// raise arithmetic exception for d == 0
sh = m = 1 / ((npy_int64 volatile *)&d)[0]; // LCOV_EXCL_LINE
Expand Down
86 changes: 0 additions & 86 deletions numpy/core/src/umath/loops.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -843,92 +843,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void
UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
}

/*
 * Libdivide only supports 32 and 64 bit types, so the per-type libdivide
 * names used by the division loop are aliased to one of those two variants:
 * types of at most 32 bits use the signed 32-bit API, wider types use the
 * signed 64-bit API.
 */
#if NPY_BITSOF_@TYPE@ <= 32
#define libdivide_@type@_t libdivide_s32_t
#define libdivide_@type@_gen libdivide_s32_gen
#define libdivide_@type@_do libdivide_s32_do
#else
#define libdivide_@type@_t libdivide_s64_t
#define libdivide_@type@_gen libdivide_s64_gen
#define libdivide_@type@_do libdivide_s64_do
#endif

/*
 * Inner loop for floor division of signed integers.
 *
 * A zero divisor, and the smallest representable value divided by -1, both
 * store 0 and set the divide-by-zero floating point status flag. When the
 * divisor has a zero stride (steps[1] == 0) it is read once and libdivide is
 * used for the repeated division; otherwise every element is divided with
 * the plain C operator. In both cases the truncated quotient is corrected
 * so that the result is rounded towards negative infinity.
 */
NPY_NO_EXPORT void
@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_DEFS

    if (steps[1] != 0) {
        /* Generic path: a fresh divisor is read for every element */
        BINARY_LOOP_SLIDING {
            const @type@ num = *(@type@ *)ip1;
            const @type@ den = *(@type@ *)ip2;
            /*
             * FIXME: On x86 at least, dividing the smallest representable
             * integer by -1 causes a SIGFPE (division overflow). We treat
             * this case here (to avoid a SIGFPE crash at python level), but
             * a good solution would be to treat integer division problems
             * separately from FPU exceptions (i.e. a different approach than
             * npy_set_floatstatus_divbyzero()).
             */
            if (den == 0 || (num == NPY_MIN_@TYPE@ && den == -1)) {
                npy_set_floatstatus_divbyzero();
                *((@type@ *)op1) = 0;
            }
            else {
                @type@ quot = num / den;

                /* Negative quotients need to be rounded down */
                if (((num > 0) != (den > 0)) && (quot * den != num)) {
                    quot = quot - 1;
                }
                *((@type@ *)op1) = quot;
            }
        }
        return;
    }

    /* The divisor is a constant from here on: use libdivide */

    /* In case of empty array, just return */
    if (n == 0) {
        return;
    }

    const @type@ den = *(@type@ *)ip2;

    /* If divisor is 0, we need not compute anything */
    if (den == 0) {
        npy_set_floatstatus_divbyzero();
        BINARY_LOOP_SLIDING {
            *((@type@ *)op1) = 0;
        }
        return;
    }

    struct libdivide_@type@_t fast_d = libdivide_@type@_gen(den);
    BINARY_LOOP_SLIDING {
        const @type@ num = *(@type@ *)ip1;
        /*
         * FIXME: same overflow handling as in the generic path above:
         * the smallest representable integer divided by -1 would raise a
         * SIGFPE on x86, so it is reported through the divide-by-zero
         * status flag and yields 0.
         */
        if (num == NPY_MIN_@TYPE@ && den == -1) {
            npy_set_floatstatus_divbyzero();
            *((@type@ *)op1) = 0;
        }
        else {
            @type@ quot = libdivide_@type@_do(num, &fast_d);

            /* Negative quotients need to be rounded down */
            if (((num > 0) != (den > 0)) && (quot * den != num)) {
                quot = quot - 1;
            }
            *((@type@ *)op1) = quot;
        }
    }
}

NPY_NO_EXPORT void
@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
Expand Down
6 changes: 2 additions & 4 deletions numpy/core/src/umath/loops.h.src
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
#endif

/**begin repeat
* #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
* #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
BYTE, SHORT, INT, LONG, LONGLONG#
*/
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide,
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
Expand Down Expand Up @@ -151,9 +152,6 @@ NPY_NO_EXPORT void
NPY_NO_EXPORT void
@S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));

NPY_NO_EXPORT void
@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));

NPY_NO_EXPORT void
@S@@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));

Expand Down
148 changes: 146 additions & 2 deletions numpy/core/src/umath/loops_arithmetic.dispatch.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,91 @@
//###############################################################################
/********************************************************************************
** Defining the SIMD kernels
*
* Floor division of signed integers is based on T. Granlund and P. L. Montgomery,
* "Division by invariant integers using multiplication" (see [Figure 6.1]
* http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556).
* For details on TRUNC division, see simd/intdiv.h.
***********************************************************************************
** Figure 6.1: Signed division by run–time invariant divisor, rounded towards -INF
***********************************************************************************
* For q = FLOOR(n/d), all sword:
* sword −dsign = SRL(d, N − 1);
* uword −nsign = (n < −dsign);
* uword −qsign = EOR(−nsign, −dsign);
* q = TRUNC((n − (−dsign ) + (−nsign))/d) − (−qsign);
********************************************************************************/

#if NPY_SIMD
/**begin repeat
* #sfx = u8, u16, u32, u64#
* Signed types
* #sfx = s8, s16, s32, s64#
* #len = 8, 16, 32, 64#
*/
/*
 * Floor-divide a contiguous run of signed integers by a single scalar.
 *
 * args[0] points to the dividends, args[1] to the scalar divisor and
 * args[2] to the output; `len` is the number of elements. Whole vectors are
 * processed first and the remaining elements by a scalar tail loop.
 * No zero check is made on the divisor in this function.
 */
static NPY_INLINE void
simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
{
    npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0];
    npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1];
    npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2];
    const int vstep = npyv_nlanes_@sfx@;

    if (scalar == -1) {
        /*
         * Dividing by -1 is a negation, except for the minimum value whose
         * negation is not representable: such elements produce 0 and the
         * divide-by-zero status flag is set once after both loops.
         */
        npyv_b@len@ noverflow = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1));
        npyv_@sfx@ vzero = npyv_zero_@sfx@();
        // loop-invariant broadcast of the minimum value, built only once
        npyv_@sfx@ vmin = npyv_setall_@sfx@(NPY_MIN_INT@len@);
        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
            npyv_@sfx@ a = npyv_load_@sfx@(src);
            npyv_b@len@ gt_min = npyv_cmpgt_@sfx@(a, vmin);
            // stays all-true only while no lane has hit the minimum value
            noverflow = npyv_and_b@len@(noverflow, gt_min);
            // mirrors the scalar tail below: -a where a > min, otherwise 0
            npyv_@sfx@ neg = npyv_ifsub_@sfx@(gt_min, vzero, a, vzero);
            npyv_store_@sfx@(dst, neg);
        }

        int raise_err = npyv_tobits_b@len@(npyv_not_b@len@(noverflow)) != 0;
        for (; len > 0; --len, ++src, ++dst) {
            npyv_lanetype_@sfx@ a = *src;
            if (a == NPY_MIN_INT@len@) {
                raise_err = 1;
                *dst = 0;
            } else {
                *dst = -a;
            }
        }
        if (raise_err) {
            npy_set_floatstatus_divbyzero();
        }
    } else {
        // the precomputed divisor is only needed on this path
        const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar);
        // loop-invariant broadcasts, built only once
        npyv_@sfx@ nsign_d = npyv_setall_@sfx@(scalar < 0);
        npyv_@sfx@ vone = npyv_setall_@sfx@(1);
        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
            npyv_@sfx@ a = npyv_load_@sfx@(src);
            // 1 in every lane where a < (scalar < 0), otherwise 0
            npyv_@sfx@ nsign_a = npyv_cvt_@sfx@_b@len@(npyv_cmplt_@sfx@(a, nsign_d));
            nsign_a = npyv_and_@sfx@(nsign_a, vone);
            // adjustment added to the dividend before the truncating division
            npyv_@sfx@ diff_sign = npyv_sub_@sfx@(nsign_a, nsign_d);
            // 1 where the two sign terms differ: subtracted from the quotient
            npyv_@sfx@ to_ninf = npyv_xor_@sfx@(nsign_a, nsign_d);
            npyv_@sfx@ trunc = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor);
            npyv_@sfx@ floor = npyv_sub_@sfx@(trunc, to_ninf);
            npyv_store_@sfx@(dst, floor);
        }

        for (; len > 0; --len, ++src, ++dst) {
            const npyv_lanetype_@sfx@ a = *src;
            npyv_lanetype_@sfx@ r = a / scalar;
            // Negative quotients need to be rounded down
            if (((a > 0) != (scalar > 0)) && ((r * scalar) != a)) {
                r--;
            }
            *dst = r;
        }
    }
    npyv_cleanup();
}
/**end repeat**/

/**begin repeat
* Unsigned types
* #sfx = u8, u16, u32, u64#
* #len = 8, 16, 32, 64#
*/
static NPY_INLINE void
simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
Expand All @@ -44,7 +125,6 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
const npyv_lanetype_@sfx@ a = *src;
*dst = a / scalar;
}

npyv_cleanup();
}
/**end repeat**/
Expand All @@ -54,6 +134,70 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
** Defining ufunc inner functions
********************************************************************************/

/**begin repeat
* Signed types
* #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
* #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
*/
/*
 * Define TO_SIMD_SFX(X) so that it appends the signed SIMD suffix whose
 * width matches the bit size of the current type. The `#if 0` opens a
 * chain to which the nested repeat below contributes one `#elif` per width.
 */
#undef TO_SIMD_SFX
#if 0
/**begin repeat1
* #len = 8, 16, 32, 64#
*/
#elif NPY_BITSOF_@TYPE@ == @len@
#define TO_SIMD_SFX(X) X##_s@len@
/**end repeat1**/
#endif

/*
 * Leave TO_SIMD_SFX undefined for 64-bit types on VSX (without VSX4) and on
 * NEON, which disables the SIMD branch of the loop below for them.
 * NOTE(review): presumably these targets have no efficient 64-bit integer
 * SIMD division -- confirm.
 */
#if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
#undef TO_SIMD_SFX
#endif

/*
 * Scalar floor division of n by d for signed integers.
 *
 * Returns the quotient rounded towards negative infinity. A zero divisor,
 * or the smallest representable value divided by -1, returns 0 and sets the
 * divide-by-zero floating point status flag.
 */
NPY_FINLINE @type@ floor_div_@TYPE@(const @type@ n, const @type@ d)
{
    /*
     * FIXME: On x86 at least, dividing the smallest representable integer
     * by -1 causes a SIGFPE (division overflow). We treat this case here
     * (to avoid a SIGFPE crash at python level), but a good solution would
     * be to treat integer division problems separately from FPU exceptions
     * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
     */
    if (NPY_UNLIKELY(d == 0 || (d == -1 && n == NPY_MIN_@TYPE@))) {
        npy_set_floatstatus_divbyzero();
        return 0;
    }

    const @type@ quot = n / d;
    const int signs_differ = (n > 0) != (d > 0);

    // C division truncates towards zero, so an inexact quotient with
    // operands of differing sign has to be rounded down by one
    return (signs_differ && (quot * d) != n) ? quot - 1 : quot;
}

/*
 * Dispatched inner loop for floor division of signed integers.
 *
 * Three cases are handled in order: a binary reduction, a SIMD kernel for a
 * blockable input divided by a non-zero scalar (only compiled in when SIMD
 * is enabled and TO_SIMD_SFX is defined for this type), and a generic
 * element-wise loop built on the scalar floor division helper.
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
        BINARY_REDUCE_LOOP(@type@) {
            io1 = floor_div_@TYPE@(io1, *(@type@*)ip2);
        }
        *((@type@ *)iop1) = io1;
        return;
    }

#if NPY_SIMD && defined(TO_SIMD_SFX)
    // for contiguous block of memory, divisor is a scalar and not 0
    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) &&
            (*(@type@ *)args[1]) != 0) {
        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif

    // generic fallback: one scalar floor division per element
    BINARY_LOOP {
        *((@type@ *)op1) = floor_div_@TYPE@(*(@type@*)ip1, *(@type@*)ip2);
    }
}
/**end repeat**/

/**begin repeat
* Unsigned types
* #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
Expand Down