ENH: Add SIMD implementation for heaviside #19780

Status: Closed · wants to merge 4 commits

4 changes: 3 additions & 1 deletion numpy/core/code_generators/generate_umath.py
@@ -605,7 +605,9 @@ def english_upper(s):
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.heaviside'),
None,
TD(flts, f='heaviside', astype={'e':'f'}),
TD('e', f='heaviside', astype={'e': 'f'}),
TD('fd', dispatch=[('loops_binary_fp', 'fd')]),
TD('g', f='heaviside', astype={'e': 'f'}),
),
'degrees':
Ufunc(1, 1, None,
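
With this split, the half ('e') and long double ('g') loops keep going through the generic f='heaviside' path backed by npy_math, while the float/double ('fd') loops are routed to the new dispatch source below. For reference, here is a minimal sketch of the contract the SIMD kernels have to reproduce (heaviside_ref is illustrative only, not the npy_math implementation):

/* heaviside(x, h):
 *   x <  0   -> 0
 *   x == 0   -> h (the second operand)
 *   x >  0   -> 1
 *   x is NaN -> NaN (propagated)
 */
static double
heaviside_ref(double x, double h)
{
    if (x != x) {        /* NaN: propagate the input */
        return x;
    }
    if (x < 0.0) {
        return 0.0;
    }
    if (x > 0.0) {
        return 1.0;
    }
    return h;            /* x == 0.0, which also covers -0.0 */
}
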
1 change: 1 addition & 0 deletions numpy/core/setup.py
@@ -920,6 +920,7 @@ def generate_umath_c(ext, build_dir):
join('src', 'umath', 'loops_utils.h.src'),
join('src', 'umath', 'loops.c.src'),
join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
join('src', 'umath', 'loops_binary_fp.dispatch.c.src'),
join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'),
join('src', 'umath', 'loops_arithmetic.dispatch.c.src'),
join('src', 'umath', 'loops_trigonometric.dispatch.c.src'),
11 changes: 11 additions & 0 deletions numpy/core/src/umath/loops.h.src
@@ -248,6 +248,17 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (
/**end repeat1**/
/**end repeat**/

#ifndef NPY_DISABLE_OPTIMIZATION
#include "loops_binary_fp.dispatch.h"
#endif
/**begin repeat
* #TYPE = FLOAT, DOUBLE#
*/
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_heaviside,
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
/**end repeat**/


/**begin repeat
* #func = rint, ceil, floor, trunc#
*/
176 changes: 176 additions & 0 deletions numpy/core/src/umath/loops_binary_fp.dispatch.c.src
@@ -0,0 +1,176 @@
/*@targets
** $maxopt baseline
** sse2 avx2 avx512f
** neon
** vsx2
**/
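// The @targets block above drives NumPy's CPU dispatch: this file is built
// once per listed feature set and the best variant supported by the host CPU
// is picked at runtime through the generated loops_binary_fp.dispatch.h.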
#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"

/*************************************************************************
** SIMD intrinsics
*************************************************************************/
#if NPY_SIMD
NPY_FINLINE npyv_f32
simd_heaviside_f32(npyv_f32 x, npyv_f32 h)
{
npyv_f32 zeros = npyv_setall_f32(0.0f);
npyv_f32 ones = npyv_setall_f32(1.0f);
npyv_b32 lt_zero = npyv_cmplt_f32(x, zeros);
npyv_b32 gt_zero = npyv_cmpgt_f32(x, zeros);
npyv_b32 eq_zero = npyv_cmpeq_f32(x, zeros);
npyv_b32 is_nan = npyv_not_b32(npyv_cmpeq_f32(x, x));
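// Each comparison gives a per-lane mask of all-ones/all-zero bits. ANDing a
// mask with the value chosen for that case and ORing the partial results
// together blends the final answer without branches; NaN lanes fail every
// ordered comparison and are routed through the separate is_nan mask instead.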
npyv_f32 lt_zero_res = npyv_and_f32(npyv_cvt_f32_b32(lt_zero), zeros);
npyv_f32 gt_zero_res = npyv_and_f32(npyv_cvt_f32_b32(gt_zero), ones);
npyv_f32 eq_zero_res = npyv_and_f32(npyv_cvt_f32_b32(eq_zero), h);
npyv_f32 is_nan_res = npyv_and_f32(npyv_cvt_f32_b32(is_nan), x);

return npyv_or_f32(npyv_or_f32(lt_zero_res, npyv_or_f32(gt_zero_res, eq_zero_res)), is_nan_res);
}
#endif

#if NPY_SIMD_F64
NPY_FINLINE npyv_f64
simd_heaviside_f64(npyv_f64 x, npyv_f64 h)
{
npyv_f64 zeros = npyv_setall_f64(0.0);
npyv_f64 ones = npyv_setall_f64(1.0);
npyv_b64 lt_zero = npyv_cmplt_f64(x, zeros);
npyv_b64 gt_zero = npyv_cmpgt_f64(x, zeros);
npyv_b64 eq_zero = npyv_cmpeq_f64(x, zeros);
npyv_b64 is_nan = npyv_not_b64(npyv_cmpeq_f64(x, x));
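// Same branch-free mask/AND/OR blend as the f32 variant above; NaN lanes are
// selected by is_nan and pass the original x (the NaN) through unchanged.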
npyv_f64 lt_zero_res = npyv_and_f64(npyv_cvt_f64_b64(lt_zero), zeros);
npyv_f64 gt_zero_res = npyv_and_f64(npyv_cvt_f64_b64(gt_zero), ones);
npyv_f64 eq_zero_res = npyv_and_f64(npyv_cvt_f64_b64(eq_zero), h);
npyv_f64 is_nan_res = npyv_and_f64(npyv_cvt_f64_b64(is_nan), x);

return npyv_or_f64(npyv_or_f64(lt_zero_res, npyv_or_f64(gt_zero_res, eq_zero_res)), is_nan_res);
}
#endif
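
The same per-lane blend could also be written with nested selects. Below is a minimal sketch assuming the universal intrinsic npyv_select_f32(mask, a, b), which picks a where the mask is set and b elsewhere; it is shown only as an alternative formulation, not as part of this patch:

#if NPY_SIMD
NPY_FINLINE npyv_f32
simd_heaviside_select_f32(npyv_f32 x, npyv_f32 h)
{
    npyv_f32 zeros = npyv_setall_f32(0.0f);
    npyv_f32 ones  = npyv_setall_f32(1.0f);
    npyv_b32 gt_zero = npyv_cmpgt_f32(x, zeros);
    npyv_b32 eq_zero = npyv_cmpeq_f32(x, zeros);
    npyv_b32 is_nan  = npyv_not_b32(npyv_cmpeq_f32(x, x));
    npyv_f32 r = zeros;                       /* default: x < 0 -> 0 */
    r = npyv_select_f32(gt_zero, ones, r);    /* x > 0  -> 1 */
    r = npyv_select_f32(eq_zero, h, r);       /* x == 0 -> h */
    r = npyv_select_f32(is_nan, x, r);        /* NaN    -> propagate x */
    return r;
}
#endif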

/*************************************************************************
** SIMD kernels
*************************************************************************/
/**begin repeat
* #type = float, double#
* #sfx = f32, f64#
* #VCHK = NPY_SIMD, NPY_SIMD_F64#
*/
#if @VCHK@
/**begin repeat1
* #kind = heaviside#
*/
static void
simd_kernel_@kind@_@sfx@(const @type@ *src0, npy_intp ssrc0,
const @type@ *src1, npy_intp ssrc1,
@type@ *dst, npy_intp sdst, npy_intp len)
{
const int vstep = npyv_nlanes_@sfx@;
// unroll by x2.
const int wstep = vstep*2;
// contig contig contig
if (ssrc0 == 1 && ssrc1 == 1 && sdst == 1) {
for (; len >= wstep; len -= wstep, src0 += wstep,
src1 += wstep, dst += wstep
) {
npyv_@sfx@ a0 = npyv_load_@sfx@(src0);
npyv_@sfx@ a1 = npyv_load_@sfx@(src0 + vstep);
npyv_@sfx@ b0 = npyv_load_@sfx@(src1);
npyv_@sfx@ b1 = npyv_load_@sfx@(src1 + vstep);
npyv_@sfx@ r0 = simd_@kind@_@sfx@(a0, b0);
npyv_@sfx@ r1 = simd_@kind@_@sfx@(a1, b1);
npyv_store_@sfx@(dst, r0);
npyv_store_@sfx@(dst + vstep, r1);
}
}
// contig scalar contig
else if (ssrc0 == 1 && ssrc1 == 0 && sdst == 1) {
npyv_@sfx@ scalar = npyv_setall_@sfx@(*src1);
for (; len >= wstep; len -= wstep, src0 += wstep, dst += wstep) {
npyv_@sfx@ a0 = npyv_load_@sfx@(src0);
npyv_@sfx@ a1 = npyv_load_@sfx@(src0 + vstep);
npyv_@sfx@ r0 = simd_@kind@_@sfx@(a0, scalar);
npyv_@sfx@ r1 = simd_@kind@_@sfx@(a1, scalar);
npyv_store_@sfx@(dst, r0);
npyv_store_@sfx@(dst + vstep, r1);
}
}
// scalar contig contig
else if (ssrc0 == 0 && ssrc1 == 1 && sdst == 1) {
npyv_@sfx@ scalar = npyv_setall_@sfx@(*src0);
for (; len >= wstep; len -= wstep, src1 += wstep, dst += wstep) {
npyv_@sfx@ b0 = npyv_load_@sfx@(src1);
npyv_@sfx@ b1 = npyv_load_@sfx@(src1 + vstep);
npyv_@sfx@ r0 = simd_@kind@_@sfx@(scalar, b0);
npyv_@sfx@ r1 = simd_@kind@_@sfx@(scalar, b1);
npyv_store_@sfx@(dst, r0);
npyv_store_@sfx@(dst + vstep, r1);
}
}
// remaining scalar elements and non-specialized strides
for (; len > 0; len -= vstep, src0 += ssrc0*vstep,
src1 += ssrc1*vstep, dst += sdst*vstep
) {
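// Tail and strided fall-through: npyv_loadn_tillz_* loads up to `len` lanes
// with the given element stride and zero-fills the remaining lanes;
// npyv_storen_till_* writes back only the first `len` lanes.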
npyv_@sfx@ a = npyv_loadn_tillz_@sfx@(src0, ssrc0, len);
npyv_@sfx@ b = npyv_loadn_tillz_@sfx@(src1, ssrc1, len);
npyv_@sfx@ r = simd_@kind@_@sfx@(a, b);
npyv_storen_till_@sfx@(dst, sdst, len, r);
}
}
/**end repeat1**/
#endif
/**end repeat**/

/********************************************************************
** ufunc inner functions
********************************************************************/
/**begin repeat
* #TYPE = FLOAT, DOUBLE#
* #type = float, double#
* #sfx = f32, f64#
* #csfx = f, #
* #VCHK = NPY_SIMD, NPY_SIMD_F64#
*/
/**begin repeat1
* #kind = heaviside#
*/
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_heaviside)
(char **args, npy_intp const *dimensions, npy_intp const *steps,
void *NPY_UNUSED(func))
{
const char *src0 = args[0]; const char *src1 = args[1];
char *dst = args[2];

const npy_intp src0_step = steps[0];
const npy_intp src1_step = steps[1];
const npy_intp dst_step = steps[2];
npy_intp len = dimensions[0];
#if @VCHK@
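// The byte strides from the ufunc machinery are converted to element strides
// for the universal intrinsics; the SIMD kernel is only used when neither
// input overlaps the output and every stride is loadable/storable.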
const int lsize = sizeof(@type@);
const npy_intp ssrc0 = src0_step / lsize;
const npy_intp ssrc1 = src1_step / lsize;
const npy_intp sdst = dst_step / lsize;
if (!is_mem_overlap(src0, src0_step, dst, dst_step, len) &&
!is_mem_overlap(src1, src1_step, dst, dst_step, len) &&
npyv_loadable_stride_@sfx@(ssrc0) &&
npyv_loadable_stride_@sfx@(ssrc1) &&
npyv_storable_stride_@sfx@(sdst)
) {
simd_kernel_@kind@_@sfx@((const @type@*)src0, ssrc0,
(const @type@*)src1, ssrc1,
(@type@*)dst, sdst, len);
return;
}
#endif
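// Scalar fallback: overlapping buffers, unsupported strides, or builds
// without SIMD support take this path and call npy_heaviside/npy_heavisidef.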
for (; len > 0; --len, src0 += src0_step, src1 += src1_step,
dst += dst_step
) {
const @type@ s0 = *(@type@*)src0;
const @type@ s1 = *(@type@*)src1;
*(@type@*)dst = npy_@kind@@csfx@(s0, s1);
}
}
/**end repeat1**/
/**end repeat**/