From 6864da9457b161dbddabcc8f2c6daba8a6157256 Mon Sep 17 00:00:00 2001 From: HowJMay Date: Sun, 29 Aug 2021 16:57:44 +0800 Subject: [PATCH 1/4] ENH: Add SIMD implementation for heaviside --- numpy/core/src/_simd/_simd.dispatch.c.src | 4 ++-- numpy/core/src/common/simd/avx2/math.h | 17 +++++++++++++++++ numpy/core/src/common/simd/avx512/math.h | 16 ++++++++++++++++ numpy/core/src/common/simd/neon/math.h | 16 ++++++++++++++++ numpy/core/src/common/simd/sse/math.h | 17 +++++++++++++++++ numpy/core/src/common/simd/vsx/math.h | 17 +++++++++++++++++ numpy/core/tests/test_simd.py | 17 +++++++++++++++++ 7 files changed, 102 insertions(+), 2 deletions(-) diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src index 54770959c362..6d398d4bfb4a 100644 --- a/numpy/core/src/_simd/_simd.dispatch.c.src +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -395,7 +395,7 @@ SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@) #if @fp_only@ /**begin repeat1 - * #intrin = maxp, minp# + * #intrin = maxp, minp, heaviside# */ SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@) /**end repeat1**/ @@ -629,7 +629,7 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@) #if @fp_only@ /**begin repeat1 - * #intrin = maxp, minp# + * #intrin = maxp, minp, heaviside# */ SIMD_INTRIN_DEF(@intrin@_@sfx@) /**end repeat1**/ diff --git a/numpy/core/src/common/simd/avx2/math.h b/numpy/core/src/common/simd/avx2/math.h index 9460183df5bb..bd5fb14e6d33 100644 --- a/numpy/core/src/common/simd/avx2/math.h +++ b/numpy/core/src/common/simd/avx2/math.h @@ -4,6 +4,9 @@ #ifndef _NPY_SIMD_AVX2_MATH_H #define _NPY_SIMD_AVX2_MATH_H + +#include "operators.h" + /*************************** * Elementary ***************************/ @@ -105,4 +108,18 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return _mm256_blendv_epi8(a, b, _mm256_cmpgt_epi64(a, b)); } +// heaviside +NPY_FINLINE npyv_f32 npyv_heaviside_f32(npyv_f32 a, npyv_f32 b) +{ + npyv_s32 not_a = _mm256_xor_si256(_mm256_castps_si256(a), _mm256_set1_epi32(-1)); + npyv_f32 not_zero_ret_val = _mm256_castsi256_ps(_mm256_and_si256(npyv_shri_s32(not_a, 8), _mm256_set1_epi32(0x3F800000))); + return _mm256_blendv_ps(not_zero_ret_val, b, _mm256_castsi256_ps(npyv_cmpeq_f32(a, npyv_setall_f32(0.0)))); +} +NPY_FINLINE npyv_f64 npyv_heaviside_f64(npyv_f64 a, npyv_f64 b) +{ + npyv_s64 not_a = _mm256_xor_si256(_mm256_castpd_si256(a), _mm256_set1_epi64x(-1)); + npyv_f64 not_zero_ret_val = _mm256_castsi256_pd(_mm256_and_si256(npyv_shri_s64(not_a, 11), _mm256_set1_epi64x(0x3FF0000000000000))); + return _mm256_blendv_pd(not_zero_ret_val, b, _mm256_castsi256_pd(npyv_cmpeq_f64(a, npyv_setall_f64(0.0)))); +} + #endif // _NPY_SIMD_AVX2_MATH_H diff --git a/numpy/core/src/common/simd/avx512/math.h b/numpy/core/src/common/simd/avx512/math.h index 0141396d06a3..b6587662e1fd 100644 --- a/numpy/core/src/common/simd/avx512/math.h +++ b/numpy/core/src/common/simd/avx512/math.h @@ -5,6 +5,8 @@ #ifndef _NPY_SIMD_AVX512_MATH_H #define _NPY_SIMD_AVX512_MATH_H +#include "operators.h" + /*************************** * Elementary ***************************/ @@ -112,4 +114,18 @@ NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b) #define npyv_min_u64 _mm512_min_epu64 #define npyv_min_s64 _mm512_min_epi64 +// heaviside +NPY_FINLINE npyv_f32 npyv_heaviside_f32(npyv_f32 a, npyv_f32 b) +{ + npyv_s32 not_a = npyv_xor_s32(_mm512_castps_si512(a), _mm512_set1_epi32(-1)); + npyv_f32 not_zero_ret_val = _mm512_castsi512_ps(_mm512_and_si512(npyv_shri_s32(not_a, 8), 
_mm512_set1_epi32(0x3F800000))); + return _mm512_mask_blend_ps(npyv_cmpeq_f32(a, npyv_setall_f32(0.0)), not_zero_ret_val, b); +} +NPY_FINLINE npyv_f64 npyv_heaviside_f64(npyv_f64 a, npyv_f64 b) +{ + npyv_s64 not_a = npyv_xor_s64(_mm512_castpd_si512(a), _mm512_set1_epi64(-1)); + npyv_f64 not_zero_ret_val = _mm512_castsi512_pd(_mm512_and_si512(npyv_shri_s64(not_a, 11), _mm512_set1_epi64(0x3FF0000000000000))); + return _mm512_mask_blend_pd(npyv_cmpeq_f64(a, npyv_setall_f64(0.0)), not_zero_ret_val, b); +} + #endif // _NPY_SIMD_AVX512_MATH_H diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h index 19ea6f22fab0..1cfc1b5362e0 100644 --- a/numpy/core/src/common/simd/neon/math.h +++ b/numpy/core/src/common/simd/neon/math.h @@ -153,4 +153,20 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return vbslq_s64(npyv_cmplt_s64(a, b), a, b); } +// heaviside +NPY_FINLINE npyv_f32 npyv_heaviside_f32(npyv_f32 a, npyv_f32 b) +{ + npyv_s32 not_a = vmvnq_s32(vreinterpretq_s32_f32(a)); + npyv_f32 not_zero_ret_val = vreinterpretq_f32_s32(vandq_s32(npyv_shri_s32(not_a, 8), vdupq_n_s32(0x3F800000))); + return npyv_select_f32(npyv_cmpeq_f32(a, npyv_setall_f32(0.0)), b, (not_zero_ret_val)); +} +#if NPY_SIMD_F64 + NPY_FINLINE npyv_f64 npyv_heaviside_f64(npyv_f64 a, npyv_f64 b) + { + npyv_s64 not_a = vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_f64(a))); + npyv_f64 not_zero_ret_val = vreinterpretq_f64_s64(vandq_s64(npyv_shri_s64(not_a, 11), vdupq_n_s64(0x3FF0000000000000))); + return npyv_select_f64(npyv_cmpeq_f64(a, npyv_setall_f64(0.0)), b, (not_zero_ret_val)); + } +#endif // NPY_SIMD_F64 + #endif // _NPY_SIMD_NEON_MATH_H diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h index 97d35afc5e04..d0dda1c67993 100644 --- a/numpy/core/src/common/simd/sse/math.h +++ b/numpy/core/src/common/simd/sse/math.h @@ -4,6 +4,9 @@ #ifndef _NPY_SIMD_SSE_MATH_H #define _NPY_SIMD_SSE_MATH_H + +#include "misc.h" + /*************************** * Elementary ***************************/ @@ -143,4 +146,18 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return npyv_select_s64(npyv_cmplt_s64(a, b), a, b); } +// heaviside +NPY_FINLINE npyv_f32 npyv_heaviside_f32(npyv_f32 a, npyv_f32 b) +{ + npyv_s32 not_a = _mm_xor_si128(_mm_castps_si128(a), _mm_set1_epi32(-1)); + npyv_f32 not_zero_ret_val = _mm_castsi128_ps(_mm_and_si128(_mm_srai_epi32(not_a, 8), _mm_set1_epi32(0x3F800000))); + return npyv_select_f32((npyv_cmpeq_f32(a, _mm_set1_ps(0.0))), b, not_zero_ret_val); +} +NPY_FINLINE npyv_f64 npyv_heaviside_f64(npyv_f64 a, npyv_f64 b) +{ + npyv_s64 not_a = _mm_xor_si128(_mm_castpd_si128(a), _mm_set1_epi64x(-1)); + npyv_f64 not_zero_ret_val = _mm_castsi128_pd(_mm_and_si128(npyv_shri_s64(not_a, 11), _mm_set1_epi64x(0x3FF0000000000000))); + return npyv_select_f64((npyv_cmpeq_f64(a, npyv_setall_f64(0.0))), b, not_zero_ret_val); +} + #endif // _NPY_SIMD_SSE_MATH_H diff --git a/numpy/core/src/common/simd/vsx/math.h b/numpy/core/src/common/simd/vsx/math.h index b2e393c7cf77..bc98396d7192 100644 --- a/numpy/core/src/common/simd/vsx/math.h +++ b/numpy/core/src/common/simd/vsx/math.h @@ -4,6 +4,9 @@ #ifndef _NPY_SIMD_VSX_MATH_H #define _NPY_SIMD_VSX_MATH_H + +#include "misc.h" + /*************************** * Elementary ***************************/ @@ -69,4 +72,18 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_min_u64 vec_min #define npyv_min_s64 vec_min +// heaviside +NPY_FINLINE npyv_f32 npyv_heaviside_f32(npyv_f32 a, npyv_f32 b) +{ 
+ npyv_s32 not_a = (npyv_s32)npyv_not_f32((a)); + npyv_f32 not_zero_ret_val = (npyv_f32)(vec_and(npyv_shri_s32(not_a, 8), npyv_setall_s32(0x3F800000))); + return npyv_select_f32(npyv_cmpeq_f32(a, npyv_setall_f32(0.0)), b, not_zero_ret_val); +} +NPY_FINLINE npyv_f64 npyv_heaviside_f64(npyv_f64 a, npyv_f64 b) +{ + npyv_s64 not_a = (npyv_s64)npyv_not_f64((a)); + npyv_f64 not_zero_ret_val = (npyv_f64)(vec_and(npyv_shri_s64(not_a, 11), npyv_setall_s64(0x3FF0000000000000))); + return npyv_select_f64(npyv_cmpeq_f32(a, npyv_setall_f64(0.0)), b, not_zero_ret_val); +} + #endif // _NPY_SIMD_VSX_MATH_H diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py index f0c60953bae5..628f4742b6a5 100644 --- a/numpy/core/tests/test_simd.py +++ b/numpy/core/tests/test_simd.py @@ -415,6 +415,23 @@ def test_special_cases(self): nnan = self.notnan(self.setall(self._nan())) assert nnan == [0]*self.nlanes + def test_heaviside(self): + data_a = self._data() + data_b = self._data(self.nlanes) + vdata_a, vdata_b = self.load(data_a), self.load(data_b) + + def _heaviside(vector_a, vector_b): + if vector_a < 0: + return 0 + elif vector_a == 0: + return vector_b + elif vector_a > 0: + return 1 + + data_heaviside = [_heaviside(a, b) for a, b in zip(data_a, data_b)] + vheaviside = self.heaviside(vdata_a, vdata_b) + assert vheaviside == data_heaviside + class _SIMD_ALL(_Test_Utility): """ To test all vector types at once From 9e4b7bcbdabc6e8a4fe0ee08590d5ee27b784fe5 Mon Sep 17 00:00:00 2001 From: HowJMay Date: Tue, 31 Aug 2021 01:35:15 +0800 Subject: [PATCH 2/4] ENH: Use cmp intrinsics --- numpy/core/src/common/simd/avx2/math.h | 24 ++++++++++++++++++------ numpy/core/src/common/simd/sse/math.h | 24 ++++++++++++++++++------ 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/numpy/core/src/common/simd/avx2/math.h b/numpy/core/src/common/simd/avx2/math.h index bd5fb14e6d33..491c9449bcdb 100644 --- a/numpy/core/src/common/simd/avx2/math.h +++ b/numpy/core/src/common/simd/avx2/math.h @@ -111,15 +111,27 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) // heaviside NPY_FINLINE npyv_f32 npyv_heaviside_f32(npyv_f32 a, npyv_f32 b) { - npyv_s32 not_a = _mm256_xor_si256(_mm256_castps_si256(a), _mm256_set1_epi32(-1)); - npyv_f32 not_zero_ret_val = _mm256_castsi256_ps(_mm256_and_si256(npyv_shri_s32(not_a, 8), _mm256_set1_epi32(0x3F800000))); - return _mm256_blendv_ps(not_zero_ret_val, b, _mm256_castsi256_ps(npyv_cmpeq_f32(a, npyv_setall_f32(0.0)))); + npyv_f32 zeros = npyv_setall_f32(0.0); + npyv_f32 ones = npyv_setall_f32(1.0); + npyv_f32 lt_zero = _mm256_cmp_ps(a, zeros, _CMP_LT_OQ); + npyv_f32 gt_zero = _mm256_cmp_ps(a, zeros, _CMP_GT_OQ); + npyv_f32 eq_zero = _mm256_cmp_ps(a, zeros, _CMP_EQ_OQ); + npyv_f32 lt_zero_res = npyv_and_f32(lt_zero, zeros); + npyv_f32 gt_zero_res = npyv_and_f32(gt_zero, ones); + npyv_f32 eq_zero_res = npyv_and_f32(eq_zero, b); + return npyv_or_f32(lt_zero_res, npyv_or_f32(gt_zero_res, eq_zero_res)); } NPY_FINLINE npyv_f64 npyv_heaviside_f64(npyv_f64 a, npyv_f64 b) { - npyv_s64 not_a = _mm256_xor_si256(_mm256_castpd_si256(a), _mm256_set1_epi64x(-1)); - npyv_f64 not_zero_ret_val = _mm256_castsi256_pd(_mm256_and_si256(npyv_shri_s64(not_a, 11), _mm256_set1_epi64x(0x3FF0000000000000))); - return _mm256_blendv_pd(not_zero_ret_val, b, _mm256_castsi256_pd(npyv_cmpeq_f64(a, npyv_setall_f64(0.0)))); + npyv_f64 zeros = npyv_setall_f64(0.0); + npyv_f64 ones = npyv_setall_f64(1.0); + npyv_f64 lt_zero = _mm256_cmp_pd(a, zeros, _CMP_LT_OQ); + npyv_f64 gt_zero = _mm256_cmp_pd(a, 
zeros, _CMP_GT_OQ); + npyv_f64 eq_zero = _mm256_cmp_pd(a, zeros, _CMP_EQ_OQ); + npyv_f64 lt_zero_res = npyv_and_f64(lt_zero, zeros); + npyv_f64 gt_zero_res = npyv_and_f64(gt_zero, ones); + npyv_f64 eq_zero_res = npyv_and_f64(eq_zero, b); + return npyv_or_f64(lt_zero_res, npyv_or_f64(gt_zero_res, eq_zero_res)); } #endif // _NPY_SIMD_AVX2_MATH_H diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h index d0dda1c67993..171f02dd66b1 100644 --- a/numpy/core/src/common/simd/sse/math.h +++ b/numpy/core/src/common/simd/sse/math.h @@ -149,15 +149,27 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) // heaviside NPY_FINLINE npyv_f32 npyv_heaviside_f32(npyv_f32 a, npyv_f32 b) { - npyv_s32 not_a = _mm_xor_si128(_mm_castps_si128(a), _mm_set1_epi32(-1)); - npyv_f32 not_zero_ret_val = _mm_castsi128_ps(_mm_and_si128(_mm_srai_epi32(not_a, 8), _mm_set1_epi32(0x3F800000))); - return npyv_select_f32((npyv_cmpeq_f32(a, _mm_set1_ps(0.0))), b, not_zero_ret_val); + npyv_f32 zeros = npyv_setall_f32(0.0); + npyv_f32 ones = npyv_setall_f32(1.0); + npyv_f32 lt_zero = _mm_cmplt_ps(a, zeros); + npyv_f32 gt_zero = _mm_cmpgt_ps(a, zeros); + npyv_f32 eq_zero = _mm_cmpeq_ps(a, zeros); + npyv_f32 lt_zero_res = npyv_and_f32(lt_zero, zeros); + npyv_f32 gt_zero_res = npyv_and_f32(gt_zero, ones); + npyv_f32 eq_zero_res = npyv_and_f32(eq_zero, b); + return npyv_or_f32(lt_zero_res, npyv_or_f32(gt_zero_res, eq_zero_res)); } NPY_FINLINE npyv_f64 npyv_heaviside_f64(npyv_f64 a, npyv_f64 b) { - npyv_s64 not_a = _mm_xor_si128(_mm_castpd_si128(a), _mm_set1_epi64x(-1)); - npyv_f64 not_zero_ret_val = _mm_castsi128_pd(_mm_and_si128(npyv_shri_s64(not_a, 11), _mm_set1_epi64x(0x3FF0000000000000))); - return npyv_select_f64((npyv_cmpeq_f64(a, npyv_setall_f64(0.0))), b, not_zero_ret_val); + npyv_f64 zeros = npyv_setall_f64(0.0); + npyv_f64 ones = npyv_setall_f64(1.0); + npyv_f64 lt_zero = _mm_cmplt_pd(a, zeros); + npyv_f64 gt_zero = _mm_cmpgt_pd(a, zeros); + npyv_f64 eq_zero = _mm_cmpeq_pd(a, zeros); + npyv_f64 lt_zero_res = npyv_and_f64(lt_zero, zeros); + npyv_f64 gt_zero_res = npyv_and_f64(gt_zero, ones); + npyv_f64 eq_zero_res = npyv_and_f64(eq_zero, b); + return npyv_or_f64(lt_zero_res, npyv_or_f64(gt_zero_res, eq_zero_res)); } #endif // _NPY_SIMD_SSE_MATH_H From 3210b1352e285971ffae8e0a55e21d9fabe597b9 Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Sat, 4 Sep 2021 23:18:21 +0200 Subject: [PATCH 3/4] ENH, SIMD: Prepare a new dispatchable source for general binary fp operations such heaviside. 
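The new file, loops_binary_fp.dispatch.c.src, is built for the baseline plus the SSE2, AVX2, AVX512F, NEON and VSX2 targets and provides the FLOAT_heaviside/DOUBLE_heaviside inner loops. The generic kernel specializes the common stride patterns (contiguous/contiguous, contiguous/scalar and scalar/contiguous) with a main loop unrolled by two, and handles the remaining elements and non-specialized strides through the partial load/store intrinsics; memory overlap or non-loadable strides fall back to the plain scalar loop around npy_heaviside. generate_umath.py keeps the C loops for half and long double and routes single and double precision through the new dispatchable source. The SIMD bodies themselves are left as TODO stubs to be filled in by a follow-up patch.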
--- numpy/core/code_generators/generate_umath.py | 4 +- numpy/core/setup.py | 1 + numpy/core/src/umath/loops.h.src | 11 ++ .../src/umath/loops_binary_fp.dispatch.c.src | 154 ++++++++++++++++++ 4 files changed, 169 insertions(+), 1 deletion(-) create mode 100644 numpy/core/src/umath/loops_binary_fp.dispatch.c.src diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 4891e8f2318a..fbc047e39988 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -605,7 +605,9 @@ def english_upper(s): Ufunc(2, 1, None, docstrings.get('numpy.core.umath.heaviside'), None, - TD(flts, f='heaviside', astype={'e':'f'}), + TD('e', f='heaviside', astype={'e':'f'}), + TD('fd', dispatch=[('loops_binary_fp', 'fd')]), + TD('g', f='heaviside', astype={'e':'f'}), ), 'degrees': Ufunc(1, 1, None, diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 2f495c48b745..c6fb6b24ea16 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -920,6 +920,7 @@ def generate_umath_c(ext, build_dir): join('src', 'umath', 'loops_utils.h.src'), join('src', 'umath', 'loops.c.src'), join('src', 'umath', 'loops_unary_fp.dispatch.c.src'), + join('src', 'umath', 'loops_binary_fp.dispatch.c.src'), join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'), join('src', 'umath', 'loops_arithmetic.dispatch.c.src'), join('src', 'umath', 'loops_trigonometric.dispatch.c.src'), diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index 02d749a5eb5b..9345280b326d 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -248,6 +248,17 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( /**end repeat1**/ /**end repeat**/ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_binary_fp.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_heaviside, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat**/ + + /**begin repeat * #func = rint, ceil, floor, trunc# */ diff --git a/numpy/core/src/umath/loops_binary_fp.dispatch.c.src b/numpy/core/src/umath/loops_binary_fp.dispatch.c.src new file mode 100644 index 000000000000..c95ad5d0625b --- /dev/null +++ b/numpy/core/src/umath/loops_binary_fp.dispatch.c.src @@ -0,0 +1,154 @@ +/*@targets + ** $maxopt baseline + ** sse2 avx2 avx512f + ** neon + ** vsx2 + **/ +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" + +/************************************************************************* + ** SIMD intrinsics + *************************************************************************/ +#if NPY_SIMD +NPY_FINLINE npyv_f32 +simd_heaviside_f32(npyv_f32 x, npyv_f32 h) +{ + // TODO: your implmentation +} +#endif + +#if NPY_SIMD_F64 +NPY_FINLINE npyv_f64 +simd_heaviside_f64(npyv_f64 x, npyv_f64 h) +{ + // TODO: your implmentation +} +#endif + +/************************************************************************* + ** SIMD kernels + *************************************************************************/ +/**begin repeat + * #type = float, double# + * #sfx = f32, f64# + * #VCHK = NPY_SIMD, NPY_SIMD_F64# + */ +#if @VCHK@ +/**begin repeat1 + * #kind = heaviside# + */ +static void +simd_kernel_@kind@_@sfx@(const @type@ *src0, npy_intp ssrc0, + const @type@ *src1, npy_intp ssrc1, + @type@ *dst, npy_intp sdst, npy_intp len) +{ + const int vstep = npyv_nlanes_@sfx@; + // unroll by x2. 
+ const int wstep = vstep*2; + // contig contig contig + if (ssrc0 == 1 && ssrc1 == 1 && sdst == 1) { + for (; len >= wstep; len -= wstep, src0 += wstep, + src1 += wstep, dst += wstep + ) { + npyv_@sfx@ a0 = npyv_load_@sfx@(src0); + npyv_@sfx@ a1 = npyv_load_@sfx@(src0 + vstep); + npyv_@sfx@ b0 = npyv_load_@sfx@(src1); + npyv_@sfx@ b1 = npyv_load_@sfx@(src1 + vstep); + npyv_@sfx@ r0 = simd_@kind@_@sfx@(a0, b0); + npyv_@sfx@ r1 = simd_@kind@_@sfx@(a1, b1); + npyv_store_@sfx@(dst, r0); + npyv_store_@sfx@(dst + vstep, r1); + } + } + // contig scalar contig + else if (ssrc0 == 1 && ssrc1 == 0 && sdst == 1) { + npyv_@sfx@ scalar = npyv_setall_@sfx@(*src1); + for (; len >= wstep; len -= wstep, src0 += wstep, dst += wstep) { + npyv_@sfx@ a0 = npyv_load_@sfx@(src0); + npyv_@sfx@ a1 = npyv_load_@sfx@(src0 + vstep); + npyv_@sfx@ r0 = simd_@kind@_@sfx@(a0, scalar); + npyv_@sfx@ r1 = simd_@kind@_@sfx@(a1, scalar); + npyv_store_@sfx@(dst, r0); + npyv_store_@sfx@(dst + vstep, r1); + } + } + // scalar contig contig + else if (ssrc0 == 0 && ssrc1 == 1 && sdst == 1) { + npyv_@sfx@ scalar = npyv_setall_@sfx@(*src0); + for (; len >= wstep; len -= wstep, src1 += wstep, dst += wstep) { + npyv_@sfx@ b0 = npyv_load_@sfx@(src1); + npyv_@sfx@ b1 = npyv_load_@sfx@(src1 + vstep); + npyv_@sfx@ r0 = simd_@kind@_@sfx@(scalar, b0); + npyv_@sfx@ r1 = simd_@kind@_@sfx@(scalar, b1); + npyv_store_@sfx@(dst, r0); + npyv_store_@sfx@(dst + vstep, r1); + } + } + // remaind scalars and non-specialized strides + for (; len > 0; len -= vstep, src0 += ssrc0*vstep, + src1 += ssrc1*vstep, dst += sdst*vstep + ) { + npyv_@sfx@ a = npyv_loadn_tillz_@sfx@(src0, ssrc0, len); + npyv_@sfx@ b = npyv_loadn_tillz_@sfx@(src1, ssrc1, len); + npyv_@sfx@ r = simd_@kind@_@sfx@(a, b); + npyv_storen_till_@sfx@(dst, sdst, len, r); + } +} +/**end repeat1**/ +#endif +/**end repeat**/ + +/******************************************************************** + ** ufunc inner functions + ********************************************************************/ +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + * #type = float, double# + * #sfx = f32, f64# + * #csfx = f, # + * #VCHK = NPY_SIMD, NPY_SIMD_F64# + */ +/**begin repeat1 + * #kind = heaviside# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_heaviside) +(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + const char *src0 = args[0]; const char *src1 = args[1]; + char *dst = args[2]; + + const npy_intp src0_step = steps[0]; + const npy_intp src1_step = steps[1]; + const npy_intp dst_step = steps[2]; + npy_intp len = dimensions[0]; +#if @VCHK@ + const int lsize = sizeof(@type@); + const npy_intp ssrc0 = src0_step / lsize; + const npy_intp ssrc1 = src1_step / lsize; + const npy_intp sdst = dst_step / lsize; + if (!is_mem_overlap(src0, src0_step, dst, dst_step, len) && + !is_mem_overlap(src1, src1_step, dst, dst_step, len) && + npyv_loadable_stride_@sfx@(ssrc0) && + npyv_loadable_stride_@sfx@(ssrc1) && + npyv_storable_stride_@sfx@(sdst) + ) { + simd_kernel_@kind@_@sfx@((const @type@*)src0, ssrc0, + (const @type@*)src1, ssrc1, + (@type@*)dst, sdst, len); + return; + } +#endif + for (; len > 0; --len, src0 += src0_step, src1 += src1_step, + dst += dst_step + ) { + const @type@ s0 = *(@type@*)src0; + const @type@ s1 = *(@type@*)src1; + *(@type@*)dst = npy_@kind@@csfx@(s0, s1); + } +} +/**end repeat1**/ +/**end repeat**/ + From 10fe327aff50ada3f671e3e9b5656409fdb9c1fa Mon Sep 17 00:00:00 2001 From: HowJMay Date: Sat, 2 Oct 2021 17:22:39 +0800 Subject: [PATCH 4/4] 
ENH, SIMD: Add heaviside in dispatchable source --- numpy/core/code_generators/generate_umath.py | 4 +-- numpy/core/src/_simd/_simd.dispatch.c.src | 4 +-- numpy/core/src/common/simd/avx2/math.h | 29 ------------------- numpy/core/src/common/simd/avx512/math.h | 16 ---------- numpy/core/src/common/simd/neon/math.h | 16 ---------- numpy/core/src/common/simd/sse/math.h | 29 ------------------- numpy/core/src/common/simd/vsx/math.h | 17 ----------- .../src/umath/loops_binary_fp.dispatch.c.src | 26 +++++++++++++++-- numpy/core/tests/test_simd.py | 17 ----------- 9 files changed, 28 insertions(+), 130 deletions(-) diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index fbc047e39988..43dd9eea557c 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -605,9 +605,9 @@ def english_upper(s): Ufunc(2, 1, None, docstrings.get('numpy.core.umath.heaviside'), None, - TD('e', f='heaviside', astype={'e':'f'}), + TD('e', f='heaviside', astype={'e': 'f'}), TD('fd', dispatch=[('loops_binary_fp', 'fd')]), - TD('g', f='heaviside', astype={'e':'f'}), + TD('g', f='heaviside', astype={'e': 'f'}), ), 'degrees': Ufunc(1, 1, None, diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src index 6d398d4bfb4a..54770959c362 100644 --- a/numpy/core/src/_simd/_simd.dispatch.c.src +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -395,7 +395,7 @@ SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@) #if @fp_only@ /**begin repeat1 - * #intrin = maxp, minp, heaviside# + * #intrin = maxp, minp# */ SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@) /**end repeat1**/ @@ -629,7 +629,7 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@) #if @fp_only@ /**begin repeat1 - * #intrin = maxp, minp, heaviside# + * #intrin = maxp, minp# */ SIMD_INTRIN_DEF(@intrin@_@sfx@) /**end repeat1**/ diff --git a/numpy/core/src/common/simd/avx2/math.h b/numpy/core/src/common/simd/avx2/math.h index 491c9449bcdb..9460183df5bb 100644 --- a/numpy/core/src/common/simd/avx2/math.h +++ b/numpy/core/src/common/simd/avx2/math.h @@ -4,9 +4,6 @@ #ifndef _NPY_SIMD_AVX2_MATH_H #define _NPY_SIMD_AVX2_MATH_H - -#include "operators.h" - /*************************** * Elementary ***************************/ @@ -108,30 +105,4 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return _mm256_blendv_epi8(a, b, _mm256_cmpgt_epi64(a, b)); } -// heaviside -NPY_FINLINE npyv_f32 npyv_heaviside_f32(npyv_f32 a, npyv_f32 b) -{ - npyv_f32 zeros = npyv_setall_f32(0.0); - npyv_f32 ones = npyv_setall_f32(1.0); - npyv_f32 lt_zero = _mm256_cmp_ps(a, zeros, _CMP_LT_OQ); - npyv_f32 gt_zero = _mm256_cmp_ps(a, zeros, _CMP_GT_OQ); - npyv_f32 eq_zero = _mm256_cmp_ps(a, zeros, _CMP_EQ_OQ); - npyv_f32 lt_zero_res = npyv_and_f32(lt_zero, zeros); - npyv_f32 gt_zero_res = npyv_and_f32(gt_zero, ones); - npyv_f32 eq_zero_res = npyv_and_f32(eq_zero, b); - return npyv_or_f32(lt_zero_res, npyv_or_f32(gt_zero_res, eq_zero_res)); -} -NPY_FINLINE npyv_f64 npyv_heaviside_f64(npyv_f64 a, npyv_f64 b) -{ - npyv_f64 zeros = npyv_setall_f64(0.0); - npyv_f64 ones = npyv_setall_f64(1.0); - npyv_f64 lt_zero = _mm256_cmp_pd(a, zeros, _CMP_LT_OQ); - npyv_f64 gt_zero = _mm256_cmp_pd(a, zeros, _CMP_GT_OQ); - npyv_f64 eq_zero = _mm256_cmp_pd(a, zeros, _CMP_EQ_OQ); - npyv_f64 lt_zero_res = npyv_and_f64(lt_zero, zeros); - npyv_f64 gt_zero_res = npyv_and_f64(gt_zero, ones); - npyv_f64 eq_zero_res = npyv_and_f64(eq_zero, b); - return npyv_or_f64(lt_zero_res, 
npyv_or_f64(gt_zero_res, eq_zero_res)); -} - #endif // _NPY_SIMD_AVX2_MATH_H diff --git a/numpy/core/src/common/simd/avx512/math.h b/numpy/core/src/common/simd/avx512/math.h index b6587662e1fd..0141396d06a3 100644 --- a/numpy/core/src/common/simd/avx512/math.h +++ b/numpy/core/src/common/simd/avx512/math.h @@ -5,8 +5,6 @@ #ifndef _NPY_SIMD_AVX512_MATH_H #define _NPY_SIMD_AVX512_MATH_H -#include "operators.h" - /*************************** * Elementary ***************************/ @@ -114,18 +112,4 @@ NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b) #define npyv_min_u64 _mm512_min_epu64 #define npyv_min_s64 _mm512_min_epi64 -// heaviside -NPY_FINLINE npyv_f32 npyv_heaviside_f32(npyv_f32 a, npyv_f32 b) -{ - npyv_s32 not_a = npyv_xor_s32(_mm512_castps_si512(a), _mm512_set1_epi32(-1)); - npyv_f32 not_zero_ret_val = _mm512_castsi512_ps(_mm512_and_si512(npyv_shri_s32(not_a, 8), _mm512_set1_epi32(0x3F800000))); - return _mm512_mask_blend_ps(npyv_cmpeq_f32(a, npyv_setall_f32(0.0)), not_zero_ret_val, b); -} -NPY_FINLINE npyv_f64 npyv_heaviside_f64(npyv_f64 a, npyv_f64 b) -{ - npyv_s64 not_a = npyv_xor_s64(_mm512_castpd_si512(a), _mm512_set1_epi64(-1)); - npyv_f64 not_zero_ret_val = _mm512_castsi512_pd(_mm512_and_si512(npyv_shri_s64(not_a, 11), _mm512_set1_epi64(0x3FF0000000000000))); - return _mm512_mask_blend_pd(npyv_cmpeq_f64(a, npyv_setall_f64(0.0)), not_zero_ret_val, b); -} - #endif // _NPY_SIMD_AVX512_MATH_H diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h index 1cfc1b5362e0..19ea6f22fab0 100644 --- a/numpy/core/src/common/simd/neon/math.h +++ b/numpy/core/src/common/simd/neon/math.h @@ -153,20 +153,4 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return vbslq_s64(npyv_cmplt_s64(a, b), a, b); } -// heaviside -NPY_FINLINE npyv_f32 npyv_heaviside_f32(npyv_f32 a, npyv_f32 b) -{ - npyv_s32 not_a = vmvnq_s32(vreinterpretq_s32_f32(a)); - npyv_f32 not_zero_ret_val = vreinterpretq_f32_s32(vandq_s32(npyv_shri_s32(not_a, 8), vdupq_n_s32(0x3F800000))); - return npyv_select_f32(npyv_cmpeq_f32(a, npyv_setall_f32(0.0)), b, (not_zero_ret_val)); -} -#if NPY_SIMD_F64 - NPY_FINLINE npyv_f64 npyv_heaviside_f64(npyv_f64 a, npyv_f64 b) - { - npyv_s64 not_a = vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_f64(a))); - npyv_f64 not_zero_ret_val = vreinterpretq_f64_s64(vandq_s64(npyv_shri_s64(not_a, 11), vdupq_n_s64(0x3FF0000000000000))); - return npyv_select_f64(npyv_cmpeq_f64(a, npyv_setall_f64(0.0)), b, (not_zero_ret_val)); - } -#endif // NPY_SIMD_F64 - #endif // _NPY_SIMD_NEON_MATH_H diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h index 171f02dd66b1..97d35afc5e04 100644 --- a/numpy/core/src/common/simd/sse/math.h +++ b/numpy/core/src/common/simd/sse/math.h @@ -4,9 +4,6 @@ #ifndef _NPY_SIMD_SSE_MATH_H #define _NPY_SIMD_SSE_MATH_H - -#include "misc.h" - /*************************** * Elementary ***************************/ @@ -146,30 +143,4 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return npyv_select_s64(npyv_cmplt_s64(a, b), a, b); } -// heaviside -NPY_FINLINE npyv_f32 npyv_heaviside_f32(npyv_f32 a, npyv_f32 b) -{ - npyv_f32 zeros = npyv_setall_f32(0.0); - npyv_f32 ones = npyv_setall_f32(1.0); - npyv_f32 lt_zero = _mm_cmplt_ps(a, zeros); - npyv_f32 gt_zero = _mm_cmpgt_ps(a, zeros); - npyv_f32 eq_zero = _mm_cmpeq_ps(a, zeros); - npyv_f32 lt_zero_res = npyv_and_f32(lt_zero, zeros); - npyv_f32 gt_zero_res = npyv_and_f32(gt_zero, ones); - npyv_f32 eq_zero_res = npyv_and_f32(eq_zero, b); - 
return npyv_or_f32(lt_zero_res, npyv_or_f32(gt_zero_res, eq_zero_res)); -} -NPY_FINLINE npyv_f64 npyv_heaviside_f64(npyv_f64 a, npyv_f64 b) -{ - npyv_f64 zeros = npyv_setall_f64(0.0); - npyv_f64 ones = npyv_setall_f64(1.0); - npyv_f64 lt_zero = _mm_cmplt_pd(a, zeros); - npyv_f64 gt_zero = _mm_cmpgt_pd(a, zeros); - npyv_f64 eq_zero = _mm_cmpeq_pd(a, zeros); - npyv_f64 lt_zero_res = npyv_and_f64(lt_zero, zeros); - npyv_f64 gt_zero_res = npyv_and_f64(gt_zero, ones); - npyv_f64 eq_zero_res = npyv_and_f64(eq_zero, b); - return npyv_or_f64(lt_zero_res, npyv_or_f64(gt_zero_res, eq_zero_res)); -} - #endif // _NPY_SIMD_SSE_MATH_H diff --git a/numpy/core/src/common/simd/vsx/math.h b/numpy/core/src/common/simd/vsx/math.h index bc98396d7192..b2e393c7cf77 100644 --- a/numpy/core/src/common/simd/vsx/math.h +++ b/numpy/core/src/common/simd/vsx/math.h @@ -4,9 +4,6 @@ #ifndef _NPY_SIMD_VSX_MATH_H #define _NPY_SIMD_VSX_MATH_H - -#include "misc.h" - /*************************** * Elementary ***************************/ @@ -72,18 +69,4 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_min_u64 vec_min #define npyv_min_s64 vec_min -// heaviside -NPY_FINLINE npyv_f32 npyv_heaviside_f32(npyv_f32 a, npyv_f32 b) -{ - npyv_s32 not_a = (npyv_s32)npyv_not_f32((a)); - npyv_f32 not_zero_ret_val = (npyv_f32)(vec_and(npyv_shri_s32(not_a, 8), npyv_setall_s32(0x3F800000))); - return npyv_select_f32(npyv_cmpeq_f32(a, npyv_setall_f32(0.0)), b, not_zero_ret_val); -} -NPY_FINLINE npyv_f64 npyv_heaviside_f64(npyv_f64 a, npyv_f64 b) -{ - npyv_s64 not_a = (npyv_s64)npyv_not_f64((a)); - npyv_f64 not_zero_ret_val = (npyv_f64)(vec_and(npyv_shri_s64(not_a, 11), npyv_setall_s64(0x3FF0000000000000))); - return npyv_select_f64(npyv_cmpeq_f32(a, npyv_setall_f64(0.0)), b, not_zero_ret_val); -} - #endif // _NPY_SIMD_VSX_MATH_H diff --git a/numpy/core/src/umath/loops_binary_fp.dispatch.c.src b/numpy/core/src/umath/loops_binary_fp.dispatch.c.src index c95ad5d0625b..c618059397da 100644 --- a/numpy/core/src/umath/loops_binary_fp.dispatch.c.src +++ b/numpy/core/src/umath/loops_binary_fp.dispatch.c.src @@ -15,7 +15,18 @@ NPY_FINLINE npyv_f32 simd_heaviside_f32(npyv_f32 x, npyv_f32 h) { - // TODO: your implmentation + npyv_f32 zeros = npyv_setall_f32(0.0); + npyv_f32 ones = npyv_setall_f32(1.0); + npyv_b32 lt_zero = npyv_cmplt_f32(x, zeros); + npyv_b32 gt_zero = npyv_cmpgt_f32(x, zeros); + npyv_b32 eq_zero = npyv_cmpeq_f32(x, zeros); + npyv_b32 is_nan = npyv_not_b32(npyv_cmpeq_f32(x, x)); + npyv_f32 lt_zero_res = npyv_and_f32(npyv_cvt_f32_b32(lt_zero), zeros); + npyv_f32 gt_zero_res = npyv_and_f32(npyv_cvt_f32_b32(gt_zero), ones); + npyv_f32 eq_zero_res = npyv_and_f32(npyv_cvt_f32_b32(eq_zero), h); + npyv_f32 is_nan_res = npyv_and_f32(npyv_cvt_f32_b32(is_nan), x); + + return npyv_or_f32(npyv_or_f32(lt_zero_res, npyv_or_f32(gt_zero_res, eq_zero_res)), is_nan_res); } #endif @@ -23,7 +34,18 @@ simd_heaviside_f32(npyv_f32 x, npyv_f32 h) NPY_FINLINE npyv_f64 simd_heaviside_f64(npyv_f64 x, npyv_f64 h) { - // TODO: your implmentation + npyv_f64 zeros = npyv_setall_f64(0.0); + npyv_f64 ones = npyv_setall_f64(1.0); + npyv_b64 lt_zero = npyv_cmplt_f64(x, zeros); + npyv_b64 gt_zero = npyv_cmpgt_f64(x, zeros); + npyv_b64 eq_zero = npyv_cmpeq_f64(x, zeros); + npyv_b64 is_nan = npyv_not_b64(npyv_cmpeq_f64(x, x)); + npyv_f64 lt_zero_res = npyv_and_f64(npyv_cvt_f64_b64(lt_zero), zeros); + npyv_f64 gt_zero_res = npyv_and_f64(npyv_cvt_f64_b64(gt_zero), ones); + npyv_f64 eq_zero_res = npyv_and_f64(npyv_cvt_f64_b64(eq_zero), h); + npyv_f64 is_nan_res = 
npyv_and_f64(npyv_cvt_f64_b64(is_nan), x); + + return npyv_or_f64(npyv_or_f64(lt_zero_res, npyv_or_f64(gt_zero_res, eq_zero_res)), is_nan_res); } #endif diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py index 628f4742b6a5..f0c60953bae5 100644 --- a/numpy/core/tests/test_simd.py +++ b/numpy/core/tests/test_simd.py @@ -415,23 +415,6 @@ def test_special_cases(self): nnan = self.notnan(self.setall(self._nan())) assert nnan == [0]*self.nlanes - def test_heaviside(self): - data_a = self._data() - data_b = self._data(self.nlanes) - vdata_a, vdata_b = self.load(data_a), self.load(data_b) - - def _heaviside(vector_a, vector_b): - if vector_a < 0: - return 0 - elif vector_a == 0: - return vector_b - elif vector_a > 0: - return 1 - - data_heaviside = [_heaviside(a, b) for a, b in zip(data_a, data_b)] - vheaviside = self.heaviside(vdata_a, vdata_b) - assert vheaviside == data_heaviside - class _SIMD_ALL(_Test_Utility): """ To test all vector types at once
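Note: per lane, the four-mask select used by simd_heaviside_f32/f64 in PATCH 4/4 (lt_zero_res | gt_zero_res | eq_zero_res | is_nan_res) reduces to the scalar sketch below; it is illustrative only and not part of the patches themselves.

    #include <math.h>
    #include <stdio.h>

    /* Scalar reference for the lane-wise semantics of simd_heaviside_f32/f64:
     * exactly one of the four masks (x < 0, x > 0, x == 0, x != x) is true
     * for each lane, and the masked partial results are OR-ed together. */
    static double heaviside_ref(double x, double h)
    {
        if (isnan(x)) {
            return x;       /* is_nan_res: the NaN in x propagates unchanged */
        }
        if (x < 0.0) {
            return 0.0;     /* lt_zero_res */
        }
        if (x > 0.0) {
            return 1.0;     /* gt_zero_res */
        }
        return h;           /* eq_zero_res: x is +0.0 or -0.0 */
    }

    int main(void)
    {
        /* mirrors the cases of the removed test_heaviside plus a NaN input */
        printf("%g %g %g %g\n",
               heaviside_ref(-2.0, 0.5), heaviside_ref(0.0, 0.5),
               heaviside_ref(3.0, 0.5), heaviside_ref(NAN, 0.5));
        return 0;
    }

This matches the behaviour exercised by the removed test_simd.py test above, with the addition of explicit NaN propagation for the x argument.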