From 09486f982754ef9c374c7fba8a4e2bd74881b75c Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Fri, 15 Aug 2025 09:51:43 +0800 Subject: [PATCH 01/15] ENH, SIMD: Optimize the argmax/argmin implementation based on Highway wrapper Signed-off-by: Wang Yang --- numpy/_core/meson.build | 8 +- .../src/multiarray/argfunc.dispatch.c.src | 390 --------------- .../_core/src/multiarray/argfunc.dispatch.cpp | 466 ++++++++++++++++++ 3 files changed, 471 insertions(+), 393 deletions(-) delete mode 100644 numpy/_core/src/multiarray/argfunc.dispatch.c.src create mode 100644 numpy/_core/src/multiarray/argfunc.dispatch.cpp diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index b4c769810ad8..703fba58b1f1 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -817,12 +817,13 @@ multiarray_gen_headers = [ foreach gen_mtargets : [ [ 'argfunc.dispatch.h', - src_file.process('src/multiarray/argfunc.dispatch.c.src'), + 'src/multiarray/argfunc.dispatch.cpp', [ AVX512_SKX, AVX2, XOP, SSE42, SSE2, VSX2, ASIMD, NEON, - VXE, VX + VXE, VX, + RVV ] ], ] @@ -840,7 +841,8 @@ foreach gen_mtargets : [ 'src/multiarray', 'src/multiarray/stringdtype', 'src/npymath', - 'src/umath' + 'src/umath', + 'src/highway' ] ) if not is_variable('multiarray_umath_mtargets') diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.c.src b/numpy/_core/src/multiarray/argfunc.dispatch.c.src deleted file mode 100644 index 79dc111d2438..000000000000 --- a/numpy/_core/src/multiarray/argfunc.dispatch.c.src +++ /dev/null @@ -1,390 +0,0 @@ -/* -*- c -*- */ -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#include "simd/simd.h" -#include "numpy/npy_math.h" - -#include "arraytypes.h" - -#define MIN(a,b) (((a)<(b))?(a):(b)) - -#if NPY_SIMD -#if NPY_SIMD > 512 || NPY_SIMD < 0 - #error "the following 8/16-bit argmax kernel isn't applicable for larger SIMD" - // TODO: add special loop for large SIMD width. - // i.e avoid unroll by x4 should be numerically safe till 2048-bit SIMD width - // or maybe expand the indices to 32|64-bit vectors(slower). 
-#endif -/**begin repeat - * #sfx = u8, s8, u16, s16# - * #usfx = u8, u8, u16, u16# - * #bsfx = b8, b8, b16, b16# - * #idx_max = NPY_MAX_UINT8*2, NPY_MAX_UINT16*2# - */ -/**begin repeat1 - * #intrin = cmpgt, cmplt# - * #func = argmax, argmin# - * #op = >, <# - */ -static inline npy_intp -simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len) -{ - npyv_lanetype_@sfx@ s_acc = *ip; - npy_intp ret_idx = 0, i = 0; - - const int vstep = npyv_nlanes_@sfx@; - const int wstep = vstep*4; - npyv_lanetype_@usfx@ d_vindices[npyv_nlanes_@sfx@*4]; - for (int vi = 0; vi < wstep; ++vi) { - d_vindices[vi] = vi; - } - const npyv_@usfx@ vindices_0 = npyv_load_@usfx@(d_vindices); - const npyv_@usfx@ vindices_1 = npyv_load_@usfx@(d_vindices + vstep); - const npyv_@usfx@ vindices_2 = npyv_load_@usfx@(d_vindices + vstep*2); - const npyv_@usfx@ vindices_3 = npyv_load_@usfx@(d_vindices + vstep*3); - - const npy_intp max_block = @idx_max@*wstep & -wstep; - npy_intp len0 = len & -wstep; - while (i < len0) { - npyv_@sfx@ acc = npyv_setall_@sfx@(s_acc); - npyv_@usfx@ acc_indices = npyv_zero_@usfx@(); - npyv_@usfx@ acc_indices_scale = npyv_zero_@usfx@(); - - npy_intp n = i + MIN(len0 - i, max_block); - npy_intp ik = i, i2 = 0; - for (; i < n; i += wstep, ++i2) { - npyv_@usfx@ vi = npyv_setall_@usfx@((npyv_lanetype_@usfx@)i2); - npyv_@sfx@ a = npyv_load_@sfx@(ip + i); - npyv_@sfx@ b = npyv_load_@sfx@(ip + i + vstep); - npyv_@sfx@ c = npyv_load_@sfx@(ip + i + vstep*2); - npyv_@sfx@ d = npyv_load_@sfx@(ip + i + vstep*3); - - // reverse to put lowest index first in case of matched values - npyv_@bsfx@ m_ba = npyv_@intrin@_@sfx@(b, a); - npyv_@bsfx@ m_dc = npyv_@intrin@_@sfx@(d, c); - npyv_@sfx@ x_ba = npyv_select_@sfx@(m_ba, b, a); - npyv_@sfx@ x_dc = npyv_select_@sfx@(m_dc, d, c); - npyv_@bsfx@ m_dcba = npyv_@intrin@_@sfx@(x_dc, x_ba); - npyv_@sfx@ x_dcba = npyv_select_@sfx@(m_dcba, x_dc, x_ba); - - npyv_@usfx@ idx_ba = npyv_select_@usfx@(m_ba, vindices_1, vindices_0); - npyv_@usfx@ idx_dc = npyv_select_@usfx@(m_dc, vindices_3, vindices_2); - npyv_@usfx@ idx_dcba = npyv_select_@usfx@(m_dcba, idx_dc, idx_ba); - npyv_@bsfx@ m_acc = npyv_@intrin@_@sfx@(x_dcba, acc); - acc = npyv_select_@sfx@(m_acc, x_dcba, acc); - acc_indices = npyv_select_@usfx@(m_acc, idx_dcba, acc_indices); - acc_indices_scale = npyv_select_@usfx@(m_acc, vi, acc_indices_scale); - } - // reduce - npyv_lanetype_@sfx@ dacc[npyv_nlanes_@sfx@]; - npyv_lanetype_@usfx@ dacc_i[npyv_nlanes_@sfx@]; - npyv_lanetype_@usfx@ dacc_s[npyv_nlanes_@sfx@]; - npyv_store_@sfx@(dacc, acc); - npyv_store_@usfx@(dacc_i, acc_indices); - npyv_store_@usfx@(dacc_s, acc_indices_scale); - - for (int vi = 0; vi < vstep; ++vi) { - if (dacc[vi] @op@ s_acc) { - s_acc = dacc[vi]; - ret_idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi]; - } - } - // get the lowest index in case of matched values - for (int vi = 0; vi < vstep; ++vi) { - npy_intp idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi]; - if (s_acc == dacc[vi] && ret_idx > idx) { - ret_idx = idx; - } - } - } - for (; i < len; ++i) { - npyv_lanetype_@sfx@ a = ip[i]; - if (a @op@ s_acc) { - s_acc = a; - ret_idx = i; - } - } - return ret_idx; -} -/**end repeat1**/ -/**end repeat**/ -#endif - -/**begin repeat - * #sfx = u32, s32, u64, s64, f32, f64# - * #usfx = u32, u32, u64, u64, u32, u64# - * #bsfx = b32, b32, b64, b64, b32, b64# - * #is_fp = 0*4, 1*2# - * #is_idx32 = 1*2, 0*2, 1, 0# - * #chk_simd = NPY_SIMD*4, NPY_SIMD_F32, NPY_SIMD_F64# - */ -#if @chk_simd@ -/**begin repeat1 - * #intrin = cmpgt, cmplt# - * #func = argmax, argmin# 
- * #op = >, <# - * #iop = <, ># - */ -static inline npy_intp -simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len) -{ - npyv_lanetype_@sfx@ s_acc = *ip; - npy_intp ret_idx = 0, i = 0; - const int vstep = npyv_nlanes_@sfx@; - const int wstep = vstep*4; - // loop by a scalar will perform better for small arrays - if (len < wstep) { - goto scalar_loop; - } - npy_intp len0 = len; - // guard against wraparound vector addition for 32-bit indices - // in case of the array length is larger than 16gb -#if @is_idx32@ - if (len0 > NPY_MAX_UINT32) { - len0 = NPY_MAX_UINT32; - } -#endif - // create index for vector indices - npyv_lanetype_@usfx@ d_vindices[npyv_nlanes_@sfx@*4]; - for (int vi = 0; vi < wstep; ++vi) { - d_vindices[vi] = vi; - } - const npyv_@usfx@ vindices_0 = npyv_load_@usfx@(d_vindices); - const npyv_@usfx@ vindices_1 = npyv_load_@usfx@(d_vindices + vstep); - const npyv_@usfx@ vindices_2 = npyv_load_@usfx@(d_vindices + vstep*2); - const npyv_@usfx@ vindices_3 = npyv_load_@usfx@(d_vindices + vstep*3); - // initialize vector accumulator for highest values and its indexes - npyv_@usfx@ acc_indices = npyv_zero_@usfx@(); - npyv_@sfx@ acc = npyv_setall_@sfx@(s_acc); - for (npy_intp n = len0 & -wstep; i < n; i += wstep) { - npyv_@usfx@ vi = npyv_setall_@usfx@((npyv_lanetype_@usfx@)i); - npyv_@sfx@ a = npyv_load_@sfx@(ip + i); - npyv_@sfx@ b = npyv_load_@sfx@(ip + i + vstep); - npyv_@sfx@ c = npyv_load_@sfx@(ip + i + vstep*2); - npyv_@sfx@ d = npyv_load_@sfx@(ip + i + vstep*3); - - // reverse to put lowest index first in case of matched values - npyv_@bsfx@ m_ba = npyv_@intrin@_@sfx@(b, a); - npyv_@bsfx@ m_dc = npyv_@intrin@_@sfx@(d, c); - npyv_@sfx@ x_ba = npyv_select_@sfx@(m_ba, b, a); - npyv_@sfx@ x_dc = npyv_select_@sfx@(m_dc, d, c); - npyv_@bsfx@ m_dcba = npyv_@intrin@_@sfx@(x_dc, x_ba); - npyv_@sfx@ x_dcba = npyv_select_@sfx@(m_dcba, x_dc, x_ba); - - npyv_@usfx@ idx_ba = npyv_select_@usfx@(m_ba, vindices_1, vindices_0); - npyv_@usfx@ idx_dc = npyv_select_@usfx@(m_dc, vindices_3, vindices_2); - npyv_@usfx@ idx_dcba = npyv_select_@usfx@(m_dcba, idx_dc, idx_ba); - npyv_@bsfx@ m_acc = npyv_@intrin@_@sfx@(x_dcba, acc); - acc = npyv_select_@sfx@(m_acc, x_dcba, acc); - acc_indices = npyv_select_@usfx@(m_acc, npyv_add_@usfx@(vi, idx_dcba), acc_indices); - - #if @is_fp@ - npyv_@bsfx@ nnan_a = npyv_notnan_@sfx@(a); - npyv_@bsfx@ nnan_b = npyv_notnan_@sfx@(b); - npyv_@bsfx@ nnan_c = npyv_notnan_@sfx@(c); - npyv_@bsfx@ nnan_d = npyv_notnan_@sfx@(d); - npyv_@bsfx@ nnan_ab = npyv_and_@bsfx@(nnan_a, nnan_b); - npyv_@bsfx@ nnan_cd = npyv_and_@bsfx@(nnan_c, nnan_d); - npy_uint64 nnan = npyv_tobits_@bsfx@(npyv_and_@bsfx@(nnan_ab, nnan_cd)); - if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) { - npy_uint64 nnan_4[4]; - nnan_4[0] = npyv_tobits_@bsfx@(nnan_a); - nnan_4[1] = npyv_tobits_@bsfx@(nnan_b); - nnan_4[2] = npyv_tobits_@bsfx@(nnan_c); - nnan_4[3] = npyv_tobits_@bsfx@(nnan_d); - for (int ni = 0; ni < 4; ++ni) { - for (int vi = 0; vi < vstep; ++vi) { - if (!((nnan_4[ni] >> vi) & 1)) { - return i + ni*vstep + vi; - } - } - } - } - #endif - } - for (npy_intp n = len0 & -vstep; i < n; i += vstep) { - npyv_@usfx@ vi = npyv_setall_@usfx@((npyv_lanetype_@usfx@)i); - npyv_@sfx@ a = npyv_load_@sfx@(ip + i); - npyv_@bsfx@ m_acc = npyv_@intrin@_@sfx@(a, acc); - acc = npyv_select_@sfx@(m_acc, a, acc); - acc_indices = npyv_select_@usfx@(m_acc, npyv_add_@usfx@(vi, vindices_0), acc_indices); - #if @is_fp@ - npyv_@bsfx@ nnan_a = npyv_notnan_@sfx@(a); - npy_uint64 nnan = npyv_tobits_@bsfx@(nnan_a); - if 
((unsigned long long int)nnan != ((1ULL << vstep) - 1)) { - for (int vi = 0; vi < vstep; ++vi) { - if (!((nnan >> vi) & 1)) { - return i + vi; - } - } - } - #endif - } - - // reduce - npyv_lanetype_@sfx@ dacc[npyv_nlanes_@sfx@]; - npyv_lanetype_@usfx@ dacc_i[npyv_nlanes_@sfx@]; - npyv_store_@usfx@(dacc_i, acc_indices); - npyv_store_@sfx@(dacc, acc); - - s_acc = dacc[0]; - ret_idx = dacc_i[0]; - for (int vi = 1; vi < vstep; ++vi) { - if (dacc[vi] @op@ s_acc) { - s_acc = dacc[vi]; - ret_idx = (npy_intp)dacc_i[vi]; - } - } - // get the lowest index in case of matched values - for (int vi = 0; vi < vstep; ++vi) { - if (s_acc == dacc[vi] && ret_idx > (npy_intp)dacc_i[vi]) { - ret_idx = dacc_i[vi]; - } - } -scalar_loop: - for (; i < len; ++i) { - npyv_lanetype_@sfx@ a = ip[i]; - #if @is_fp@ - if (!(a @iop@= s_acc)) { // negated, for correct nan handling - #else - if (a @op@ s_acc) { - #endif - s_acc = a; - ret_idx = i; - #if @is_fp@ - if (npy_isnan(s_acc)) { - // nan encountered, it's maximal - return ret_idx; - } - #endif - } - } - return ret_idx; -} -/**end repeat1**/ -#endif // chk_simd -/**end repeat**/ - -/**begin repeat - * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, - * BYTE, SHORT, INT, LONG, LONGLONG, - * FLOAT, DOUBLE, LONGDOUBLE# - * - * #BTYPE = BYTE, SHORT, INT, LONG, LONGLONG, - * BYTE, SHORT, INT, LONG, LONGLONG, - * FLOAT, DOUBLE, LONGDOUBLE# - * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, - * npy_byte, npy_short, npy_int, npy_long, npy_longlong, - * npy_float, npy_double, npy_longdouble# - * - * #is_fp = 0*10, 1*3# - * #is_unsigned = 1*5, 0*5, 0*3# - */ -#undef TO_SIMD_SFX -#if 0 -/**begin repeat1 - * #len = 8, 16, 32, 64# - */ -#elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@ - #if @is_fp@ - #define TO_SIMD_SFX(X) X##_f@len@ - #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64 - #undef TO_SIMD_SFX - #endif - #if NPY_BITSOF_@BTYPE@ == 32 && !NPY_SIMD_F32 - #undef TO_SIMD_SFX - #endif - #elif @is_unsigned@ - #define TO_SIMD_SFX(X) X##_u@len@ - #else - #define TO_SIMD_SFX(X) X##_s@len@ - #endif -/**end repeat1**/ -#endif - -/**begin repeat1 - * #func = argmax, argmin# - * #op = >, <# - * #iop = <, ># - */ -NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@) -(@type@ *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip)) -{ -#if @is_fp@ - if (npy_isnan(*ip)) { - // nan encountered; it's maximal|minimal - *mindx = 0; - return 0; - } -#endif -#ifdef TO_SIMD_SFX - *mindx = TO_SIMD_SFX(simd_@func@)((TO_SIMD_SFX(npyv_lanetype)*)ip, n); - npyv_cleanup(); -#else - @type@ mp = *ip; - *mindx = 0; - npy_intp i = 1; - - for (; i < n; ++i) { - @type@ a = ip[i]; - #if @is_fp@ - if (!(a @iop@= mp)) { // negated, for correct nan handling - #else - if (a @op@ mp) { - #endif - mp = a; - *mindx = i; - #if @is_fp@ - if (npy_isnan(mp)) { - // nan encountered, it's maximal|minimal - break; - } - #endif - } - } -#endif // TO_SIMD_SFX - return 0; -} -/**end repeat1**/ -/**end repeat**/ - -NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BOOL_argmax) -(npy_bool *ip, npy_intp len, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip)) - -{ - npy_intp i = 0; -#if NPY_SIMD - const npyv_u8 zero = npyv_zero_u8(); - const int vstep = npyv_nlanes_u8; - const int wstep = vstep * 4; - for (npy_intp n = len & -wstep; i < n; i += wstep) { - npyv_u8 a = npyv_load_u8(ip + i + vstep*0); - npyv_u8 b = npyv_load_u8(ip + i + vstep*1); - npyv_u8 c = npyv_load_u8(ip + i + vstep*2); - npyv_u8 d = npyv_load_u8(ip + i + vstep*3); - npyv_b8 m_a = npyv_cmpeq_u8(a, zero); - npyv_b8 m_b = npyv_cmpeq_u8(b, zero); 
- npyv_b8 m_c = npyv_cmpeq_u8(c, zero); - npyv_b8 m_d = npyv_cmpeq_u8(d, zero); - npyv_b8 m_ab = npyv_and_b8(m_a, m_b); - npyv_b8 m_cd = npyv_and_b8(m_c, m_d); - npy_uint64 m = npyv_tobits_b8(npyv_and_b8(m_ab, m_cd)); - #if NPY_SIMD == 512 - if (m != NPY_MAX_UINT64) { - #else - if ((npy_int64)m != ((1LL << vstep) - 1)) { - #endif - break; - } - } - npyv_cleanup(); -#endif // NPY_SIMD - for (; i < len; ++i) { - if (ip[i]) { - *mindx = i; - return 0; - } - } - *mindx = 0; - return 0; -} diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp new file mode 100644 index 000000000000..498f8ea861ff --- /dev/null +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -0,0 +1,466 @@ +#include "simd/simd.h" +#include "numpy/npy_math.h" +#include "numpy/npy_common.h" +#include "common.hpp" +#include "arraytypes.h" +#include "simd/simd.hpp" +#include + +#define MIN(a,b) (((a)<(b))?(a):(b)) + +namespace { +using namespace np::simd; + +template +struct OpGt { +#if NPY_HWY + template >> + HWY_INLINE HWY_ATTR auto operator()(const V &a, const V &b) { + return hn::Gt(a, b); + } +#endif + HWY_INLINE bool operator()(T a, T b) { + return a > b; + } + + HWY_INLINE bool negated_op(T a, T b) { + return a <= b; + } +}; + +template <> +struct OpGt { + HWY_INLINE bool operator()(long double a, long double b) { + return a > b; + } + + HWY_INLINE bool negated_op(long double a, long double b) { + return a <= b; + } +}; + +template +struct OpLt { +#if NPY_HWY + template >> + HWY_INLINE HWY_ATTR auto operator()(const V &a, const V &b) { + return hn::Lt(a, b); + } +#endif + HWY_INLINE bool operator()(T a, T b) { + return a < b; + } + + HWY_INLINE bool negated_op(T a, T b) { + return a >= b; + } +}; + +template <> +struct OpLt { + HWY_INLINE bool operator()(long double a, long double b) { + return a < b; + } + + HWY_INLINE bool negated_op(long double a, long double b) { + return a >= b; + } +}; + +#if NPY_HWY +template +static HWY_INLINE HWY_ATTR npy_intp +simd_argfunc_small(T *ip, npy_intp len) +{ + //static_assert(kMaxLanes <= 64, + // "the following 8/16-bit argmax kernel isn't applicable for larger SIMD"); + /* TODO: add special loop for large SIMD width. + i.e avoid unroll by x4 should be numerically safe till 2048-bit SIMD width + or maybe expand the indices to 32|64-bit vectors(slower). */ + + using UnsignedT = std::conditional_t; + constexpr npy_intp idx_max = (sizeof(T) == 1) ? 
NPY_MAX_UINT8 : NPY_MAX_UINT16; + + Op op_func; + T s_acc = *ip; + npy_intp ret_idx = 0, i = 0; + + const int vstep = Lanes(); + const int wstep = vstep*4; + UnsignedT d_vindices[Lanes()*4]; + for (int vi = 0; vi < wstep; ++vi) { + d_vindices[vi] = vi; + } + const auto vindices_0 = LoadU(d_vindices); + const auto vindices_1 = LoadU(d_vindices + vstep); + const auto vindices_2 = LoadU(d_vindices + vstep*2); + const auto vindices_3 = LoadU(d_vindices + vstep*3); + + const npy_intp max_block = idx_max*wstep & -wstep; + npy_intp len0 = len & -wstep; + while (i < len0) { + auto acc = Set(T(s_acc)); + auto acc_indices = Zero(); + auto acc_indices_scale = Zero(); + + npy_intp n = i + MIN(len0 - i, max_block); + npy_intp ik = i, i2 = 0; + for (; i < n; i += wstep, ++i2) { + auto vi = Set(UnsignedT(i2)); + auto a = LoadU(ip + i); + auto b = LoadU(ip + i + vstep); + auto c = LoadU(ip + i + vstep*2); + auto d = LoadU(ip + i + vstep*3); + + // reverse to put lowest index first in case of matched values + auto m_ba = op_func(b, a); + auto m_dc = op_func(d, c); + auto x_ba = hn::IfThenElse(hn::RebindMask(_Tag(), m_ba), b, a); + auto x_dc = hn::IfThenElse(hn::RebindMask(_Tag(), m_dc), d, c); + auto m_dcba = op_func(x_dc, x_ba); + auto x_dcba = hn::IfThenElse(hn::RebindMask(_Tag(), m_dcba), x_dc, x_ba); + + auto idx_ba = hn::IfThenElse(hn::RebindMask(_Tag(), m_ba), vindices_1, vindices_0); + auto idx_dc = hn::IfThenElse(hn::RebindMask(_Tag(), m_dc), vindices_3, vindices_2); + auto idx_dcba = hn::IfThenElse(hn::RebindMask(_Tag(), m_dcba), idx_dc, idx_ba); + auto m_acc = op_func(x_dcba, acc); + acc = hn::IfThenElse(hn::RebindMask(_Tag(), m_acc), x_dcba, acc); + acc_indices = hn::IfThenElse(hn::RebindMask(_Tag(), m_acc), idx_dcba, acc_indices); + acc_indices_scale = hn::IfThenElse(hn::RebindMask(_Tag(), m_acc), vi, acc_indices_scale); + } + // reduce + T dacc[Lanes()]; + UnsignedT dacc_i[Lanes()]; + UnsignedT dacc_s[Lanes()]; + + StoreU(acc, dacc); + StoreU(acc_indices, dacc_i); + StoreU(acc_indices_scale, dacc_s); + + for (int vi = 0; vi < vstep; ++vi) { + if (op_func(dacc[vi], s_acc)) { + s_acc = dacc[vi]; + ret_idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi]; + } + } + // get the lowest index in case of matched values + for (int vi = 0; vi < vstep; ++vi) { + npy_intp idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi]; + if (s_acc == dacc[vi] && ret_idx > idx) { + ret_idx = idx; + } + } + } + + for (; i < len; ++i) { + T a = ip[i]; + if (op_func(a, s_acc)) { + s_acc = a; + ret_idx = i; + } + } + return ret_idx; +} + +template > +static HWY_INLINE HWY_ATTR npy_intp +simd_argfunc_large(T *ip, npy_intp len) +{ + using UnsignedT = std::conditional_t; + constexpr bool is_idx32 = sizeof(T) <= 4; + + Op op_func; + T s_acc = *ip; + npy_intp ret_idx = 0, i = 0; + const int vstep = Lanes(); + const int wstep = vstep*4; + + // loop by a scalar will perform better for small arrays + if (len >= wstep) { + npy_intp len0 = len; + // guard against wraparound vector addition for 32-bit indices + // in case of the array length is larger than 16gb + if constexpr (is_idx32) { + if (len0 > NPY_MAX_UINT32) { + len0 = NPY_MAX_UINT32; + } + } + // create index for vector indices + UnsignedT d_vindices[Lanes()*4]; + for (int vi = 0; vi < wstep; ++vi) { + d_vindices[vi] = vi; + } + const auto vindices_0 = LoadU(d_vindices); + const auto vindices_1 = LoadU(d_vindices + vstep); + const auto vindices_2 = LoadU(d_vindices + vstep*2); + const auto vindices_3 = LoadU(d_vindices + vstep*3); + + // initialize vector accumulator for 
highest values and its indexes + auto acc_indices = Zero(); + auto acc = Set(T(s_acc)); + for (npy_intp n = len0 & -wstep; i < n; i += wstep) { + auto vi = Set(UnsignedT(i)); + auto a = LoadU(ip + i); + auto b = LoadU(ip + i + vstep); + auto c = LoadU(ip + i + vstep*2); + auto d = LoadU(ip + i + vstep*3); + + // reverse to put lowest index first in case of matched values + auto m_ba = op_func(b, a); + auto m_dc = op_func(d, c); + auto x_ba = hn::IfThenElse(hn::RebindMask(_Tag(), m_ba), b, a); + auto x_dc = hn::IfThenElse(hn::RebindMask(_Tag(), m_dc), d, c); + auto m_dcba = op_func(x_dc, x_ba); + auto x_dcba = hn::IfThenElse(hn::RebindMask(_Tag(), m_dcba), x_dc, x_ba); + + auto idx_ba = hn::IfThenElse(hn::RebindMask(_Tag(), m_ba), vindices_1, vindices_0); + auto idx_dc = hn::IfThenElse(hn::RebindMask(_Tag(), m_dc), vindices_3, vindices_2); + auto idx_dcba = hn::IfThenElse(hn::RebindMask(_Tag(), m_dcba), idx_dc, idx_ba); + auto m_acc = op_func(x_dcba, acc); + acc = hn::IfThenElse(hn::RebindMask(_Tag(), m_acc), x_dcba, acc); + acc_indices = hn::IfThenElse(hn::RebindMask(_Tag(), m_acc), hn::Add(vi, idx_dcba), acc_indices); + + if constexpr (IsFloatingPoint) { + auto nnan_a = hn::Not(hn::IsNaN(a)); + auto nnan_b = hn::Not(hn::IsNaN(b)); + auto nnan_c = hn::Not(hn::IsNaN(c)); + auto nnan_d = hn::Not(hn::IsNaN(d)); + auto nnan_ab = hn::And(nnan_a, nnan_b); + auto nnan_cd = hn::And(nnan_c, nnan_d); + + npy_uint64 nnan = 0; + hn::StoreMaskBits(_Tag(), hn::And(nnan_ab, nnan_cd), (uint8_t*)&nnan); + + if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) { + npy_uint64 nnan_4[4]; + hn::StoreMaskBits(_Tag(), nnan_a, (uint8_t*)&(nnan_4[0])); + hn::StoreMaskBits(_Tag(), nnan_b, (uint8_t*)&(nnan_4[1])); + hn::StoreMaskBits(_Tag(), nnan_c, (uint8_t*)&(nnan_4[2])); + hn::StoreMaskBits(_Tag(), nnan_d, (uint8_t*)&(nnan_4[3])); + for (int ni = 0; ni < 4; ++ni) { + for (int vi = 0; vi < vstep; ++vi) { + if (!((nnan_4[ni] >> vi) & 1)) { + return i + ni*vstep + vi; + } + } + } + } + } + } + + for (npy_intp n = len0 & -vstep; i < n; i += vstep) { + auto vi = Set(UnsignedT(i)); + auto a = LoadU(ip + i); + auto m_acc = op_func(a, acc); + + acc = hn::IfThenElse(hn::RebindMask(_Tag(), m_acc), a, acc); + acc_indices = hn::IfThenElse(hn::RebindMask(_Tag(), m_acc), hn::Add(vi, vindices_0), acc_indices); + + if constexpr (IsFloatingPoint) { + auto nnan_a = hn::Not(hn::IsNaN(a)); + + npy_uint64 nnan = 0; + hn::StoreMaskBits(_Tag(), nnan_a, (uint8_t*)&nnan); + + if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) { + for (int vi = 0; vi < vstep; ++vi) { + if (!((nnan >> vi) & 1)) { + return i + vi; + } + } + } + } + } + + // reduce + T dacc[Lanes()]; + UnsignedT dacc_i[Lanes()]; + + StoreU(acc_indices, dacc_i); + StoreU(acc, dacc); + + s_acc = dacc[0]; + ret_idx = dacc_i[0]; + for (int vi = 1; vi < vstep; ++vi) { + if (op_func(dacc[vi], s_acc)) { + s_acc = dacc[vi]; + ret_idx = (npy_intp)dacc_i[vi]; + } + } + // get the lowest index in case of matched values + for (int vi = 0; vi < vstep; ++vi) { + if (s_acc == dacc[vi] && ret_idx > (npy_intp)dacc_i[vi]) { + ret_idx = dacc_i[vi]; + } + } + } + + //scalar loop + for (; i < len; ++i) { + T a = ip[i]; + if constexpr (IsFloatingPoint) { + if (!op_func.negated_op(a, s_acc)) { // negated, for correct nan handling + s_acc = a; + ret_idx = i; + if (npy_isnan(s_acc)) { + // nan encountered, it's maximal + return ret_idx; + } + } + } else { + if (op_func(a, s_acc)) { + s_acc = a; + ret_idx = i; + } + } + } + return ret_idx; +} +#endif //NPY_HWY + +template +HWY_INLINE 
HWY_ATTR int +arg_max_min_func(T *ip, npy_intp n, npy_intp *mindx) +{ + Op op_func; + + if constexpr (std::is_floating_point_v) { + if (npy_isnan(*ip)){ + // nan encountered; it's maximal | minimal + *mindx = 0; + return 0; + } + } + +#if NPY_HWY + if constexpr (kSupportLane) { + if constexpr (sizeof(T) <= 2) { + *mindx = simd_argfunc_small(ip, n); + } else if constexpr (sizeof(long double) != sizeof(double) || !std::is_same_v) { + *mindx = simd_argfunc_large(ip, n); + } + return 0; + } +#endif + + T mp = *ip; + *mindx = 0; + npy_intp i = 1; + + for (; i < n; ++i) { + T a = ip[i]; + if constexpr (std::is_floating_point_v) { + if (!op_func.negated_op(a, mp)) { // negated, for correct nan handling + mp = a; + *mindx = i; + if (npy_isnan(mp)){ + // nan encountered, it's maximal|minimal + break; + } + } + } else { + if (op_func(a, mp)) { + mp = a; + *mindx = i; + } + } + } + + return 0; +} +} // namespace anonymous + +/*********************************************************************************** + ** Defining argfunc inner functions + ***********************************************************************************/ +#define DEFINE_ARGFUNC_INNER_FUNCTION(TYPE, KIND, INTR, T) \ +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) \ +(T *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *NPY_UNUSED(aip)) \ +{ \ + using FixedType = typename np::meta::FixedWidth::Type; \ + arg_max_min_func>(reinterpret_cast(ip), n, max_ind); \ + return 0; \ +} + +#define DEFINE_ARGFUNC_INNER_FUNCTION_LD(TYPE, KIND, INTR) \ +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) \ +(long double *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *NPY_UNUSED(aip)) \ +{ \ + arg_max_min_func>(ip, n, max_ind); \ + return 0; \ +} + +DEFINE_ARGFUNC_INNER_FUNCTION(UBYTE, argmax, Gt, npy_ubyte) +DEFINE_ARGFUNC_INNER_FUNCTION(USHORT, argmax, Gt, npy_ushort) +DEFINE_ARGFUNC_INNER_FUNCTION(UINT, argmax, Gt, npy_uint) +DEFINE_ARGFUNC_INNER_FUNCTION(ULONG, argmax, Gt, npy_ulong) +DEFINE_ARGFUNC_INNER_FUNCTION(ULONGLONG, argmax, Gt, npy_ulonglong) +DEFINE_ARGFUNC_INNER_FUNCTION(BYTE, argmax, Gt, npy_byte) +DEFINE_ARGFUNC_INNER_FUNCTION(SHORT, argmax, Gt, npy_short) +DEFINE_ARGFUNC_INNER_FUNCTION(INT, argmax, Gt, npy_int) +DEFINE_ARGFUNC_INNER_FUNCTION(LONG, argmax, Gt, npy_long) +DEFINE_ARGFUNC_INNER_FUNCTION(LONGLONG, argmax, Gt, npy_longlong) +DEFINE_ARGFUNC_INNER_FUNCTION(FLOAT, argmax, Gt, npy_float) +DEFINE_ARGFUNC_INNER_FUNCTION(DOUBLE, argmax, Gt, npy_double) +DEFINE_ARGFUNC_INNER_FUNCTION(UBYTE, argmin, Lt, npy_ubyte) +DEFINE_ARGFUNC_INNER_FUNCTION(USHORT, argmin, Lt, npy_ushort) +DEFINE_ARGFUNC_INNER_FUNCTION(UINT, argmin, Lt, npy_uint) +DEFINE_ARGFUNC_INNER_FUNCTION(ULONG, argmin, Lt, npy_ulong) +DEFINE_ARGFUNC_INNER_FUNCTION(ULONGLONG, argmin, Lt, npy_ulonglong) +DEFINE_ARGFUNC_INNER_FUNCTION(BYTE, argmin, Lt, npy_byte) +DEFINE_ARGFUNC_INNER_FUNCTION(SHORT, argmin, Lt, npy_short) +DEFINE_ARGFUNC_INNER_FUNCTION(INT, argmin, Lt, npy_int) +DEFINE_ARGFUNC_INNER_FUNCTION(LONG, argmin, Lt, npy_long) +DEFINE_ARGFUNC_INNER_FUNCTION(LONGLONG, argmin, Lt, npy_longlong) +DEFINE_ARGFUNC_INNER_FUNCTION(FLOAT, argmin, Lt, npy_float) +DEFINE_ARGFUNC_INNER_FUNCTION(DOUBLE, argmin, Lt, npy_double) +DEFINE_ARGFUNC_INNER_FUNCTION_LD(LONGDOUBLE, argmax, Gt) +DEFINE_ARGFUNC_INNER_FUNCTION_LD(LONGDOUBLE, argmin, Lt) + +#undef DEFINE_ARGFUNC_INNER_FUNCTION +#undef DEFINE_ARGFUNC_INNER_FUNCTION_LD + + +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BOOL_argmax) +(npy_bool *ip, npy_intp len, npy_intp *max_ind, PyArrayObject 
*NPY_UNUSED(aip)) +{ + npy_intp i = 0; +#if NPY_HWY + const auto zero = Zero(); + const int vstep = Lanes(); + const int wstep = vstep * 4; + for (npy_intp n = len & -wstep; i < n; i += wstep) { + auto a = LoadU(ip + i + vstep*0); + auto b = LoadU(ip + i + vstep*1); + auto c = LoadU(ip + i + vstep*2); + auto d = LoadU(ip + i + vstep*3); + auto m_a = hn::Eq(a, zero); + auto m_b = hn::Eq(b, zero); + auto m_c = hn::Eq(c, zero); + auto m_d = hn::Eq(d, zero); + auto m_ab = hn::And(m_a, m_b); + auto m_cd = hn::And(m_c, m_d); + + npy_uint64 m = 0; + hn::StoreMaskBits(_Tag(), hn::And(m_ab, m_cd), (uint8_t*)&m); + + if constexpr (kMaxLanes == 512) { + if (m != NPY_MAX_UINT64) + break; + }else{ + if ((npy_int64)m != ((1LL << vstep) - 1)) + break; + } + } + +#endif // NPY_HWY + + for (; i < len; ++i) { + if (ip[i]) { + *max_ind = i; + return 0; + } + } + *max_ind = 0; + return 0; +} From 405c24fe11c586d3ba769855d251a2d2f26052f7 Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Fri, 15 Aug 2025 13:54:59 +0800 Subject: [PATCH 02/15] fix compile error C2131: expression did not evaluate to a constant --- .../_core/src/multiarray/argfunc.dispatch.cpp | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index 498f8ea861ff..beed57b3caf7 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -19,7 +19,7 @@ struct OpGt { return hn::Gt(a, b); } #endif - HWY_INLINE bool operator()(T a, T b) { + HWY_INLINE bool operator()(T a, T b) { return a > b; } @@ -85,16 +85,16 @@ simd_argfunc_small(T *ip, npy_intp len) T s_acc = *ip; npy_intp ret_idx = 0, i = 0; - const int vstep = Lanes(); + HWY_LANES_CONSTEXPR size_t vstep = Lanes(); const int wstep = vstep*4; - UnsignedT d_vindices[Lanes()*4]; + std::vector d_vindices(vstep*4); for (int vi = 0; vi < wstep; ++vi) { d_vindices[vi] = vi; } - const auto vindices_0 = LoadU(d_vindices); - const auto vindices_1 = LoadU(d_vindices + vstep); - const auto vindices_2 = LoadU(d_vindices + vstep*2); - const auto vindices_3 = LoadU(d_vindices + vstep*3); + const auto vindices_0 = LoadU(d_vindices.data()); + const auto vindices_1 = LoadU(d_vindices.data()+vstep); + const auto vindices_2 = LoadU(d_vindices.data()+vstep*2); + const auto vindices_3 = LoadU(d_vindices.data()+vstep*3); const npy_intp max_block = idx_max*wstep & -wstep; npy_intp len0 = len & -wstep; @@ -129,13 +129,13 @@ simd_argfunc_small(T *ip, npy_intp len) acc_indices_scale = hn::IfThenElse(hn::RebindMask(_Tag(), m_acc), vi, acc_indices_scale); } // reduce - T dacc[Lanes()]; - UnsignedT dacc_i[Lanes()]; - UnsignedT dacc_s[Lanes()]; + std::vector dacc(vstep); + std::vector dacc_i(vstep); + std::vector dacc_s(vstep); - StoreU(acc, dacc); - StoreU(acc_indices, dacc_i); - StoreU(acc_indices_scale, dacc_s); + StoreU(acc, dacc.data()); + StoreU(acc_indices, dacc_i.data()); + StoreU(acc_indices_scale, dacc_s.data()); for (int vi = 0; vi < vstep; ++vi) { if (op_func(dacc[vi], s_acc)) { @@ -172,7 +172,7 @@ simd_argfunc_large(T *ip, npy_intp len) Op op_func; T s_acc = *ip; npy_intp ret_idx = 0, i = 0; - const int vstep = Lanes(); + HWY_LANES_CONSTEXPR size_t vstep = Lanes(); const int wstep = vstep*4; // loop by a scalar will perform better for small arrays @@ -186,14 +186,14 @@ simd_argfunc_large(T *ip, npy_intp len) } } // create index for vector indices - UnsignedT d_vindices[Lanes()*4]; + std::vector d_vindices(vstep*4); 
for (int vi = 0; vi < wstep; ++vi) { d_vindices[vi] = vi; } - const auto vindices_0 = LoadU(d_vindices); - const auto vindices_1 = LoadU(d_vindices + vstep); - const auto vindices_2 = LoadU(d_vindices + vstep*2); - const auto vindices_3 = LoadU(d_vindices + vstep*3); + const auto vindices_0 = LoadU(d_vindices.data()); + const auto vindices_1 = LoadU(d_vindices.data()+vstep); + const auto vindices_2 = LoadU(d_vindices.data()+vstep*2); + const auto vindices_3 = LoadU(d_vindices.data()+vstep*3); // initialize vector accumulator for highest values and its indexes auto acc_indices = Zero(); @@ -273,11 +273,11 @@ simd_argfunc_large(T *ip, npy_intp len) } // reduce - T dacc[Lanes()]; - UnsignedT dacc_i[Lanes()]; + std::vector dacc(vstep); + std::vector dacc_i(vstep); - StoreU(acc_indices, dacc_i); - StoreU(acc, dacc); + StoreU(acc, dacc.data()); + StoreU(acc_indices, dacc_i.data()); s_acc = dacc[0]; ret_idx = dacc_i[0]; @@ -444,7 +444,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BOOL_argmax) npy_uint64 m = 0; hn::StoreMaskBits(_Tag(), hn::And(m_ab, m_cd), (uint8_t*)&m); - if constexpr (kMaxLanes == 512) { + if constexpr (kMaxLanes == 64) { if (m != NPY_MAX_UINT64) break; }else{ From a6af88278c7af4369a480aed00b07353a8e6ccb2 Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Fri, 15 Aug 2025 15:41:27 +0800 Subject: [PATCH 03/15] fix compile error LNK2001: unresolved external symbol _LONGDOUBLE_argmax --- numpy/_core/src/multiarray/argfunc.dispatch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index beed57b3caf7..31e10109cdda 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -336,7 +336,7 @@ arg_max_min_func(T *ip, npy_intp n, npy_intp *mindx) if constexpr (kSupportLane) { if constexpr (sizeof(T) <= 2) { *mindx = simd_argfunc_small(ip, n); - } else if constexpr (sizeof(long double) != sizeof(double) || !std::is_same_v) { + } else { *mindx = simd_argfunc_large(ip, n); } return 0; @@ -447,7 +447,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BOOL_argmax) if constexpr (kMaxLanes == 64) { if (m != NPY_MAX_UINT64) break; - }else{ + }else if constexpr(kMaxLanes < 64){ if ((npy_int64)m != ((1LL << vstep) - 1)) break; } From c53bd0fa248554bfc2a0ea764637073e6ca4ef7d Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Fri, 15 Aug 2025 16:23:27 +0800 Subject: [PATCH 04/15] fix compile error when sizeof(long double)==sizeof(double) --- numpy/_core/src/multiarray/argfunc.dispatch.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index 31e10109cdda..353991d6bd3b 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -85,7 +85,7 @@ simd_argfunc_small(T *ip, npy_intp len) T s_acc = *ip; npy_intp ret_idx = 0, i = 0; - HWY_LANES_CONSTEXPR size_t vstep = Lanes(); + HWY_LANES_CONSTEXPR int vstep = Lanes(); const int wstep = vstep*4; std::vector d_vindices(vstep*4); for (int vi = 0; vi < wstep; ++vi) { @@ -172,7 +172,7 @@ simd_argfunc_large(T *ip, npy_intp len) Op op_func; T s_acc = *ip; npy_intp ret_idx = 0, i = 0; - HWY_LANES_CONSTEXPR size_t vstep = Lanes(); + HWY_LANES_CONSTEXPR int vstep = Lanes(); const int wstep = vstep*4; // loop by a scalar will perform better for small arrays @@ -336,10 +336,11 @@ 
arg_max_min_func(T *ip, npy_intp n, npy_intp *mindx) if constexpr (kSupportLane) { if constexpr (sizeof(T) <= 2) { *mindx = simd_argfunc_small(ip, n); - } else { + return 0; + } else if constexpr (sizeof(long double) != sizeof(double)){ *mindx = simd_argfunc_large(ip, n); + return 0; } - return 0; } #endif From c683293443da9ed5eea33d7591c6c1b5be864615 Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Fri, 15 Aug 2025 18:03:24 +0800 Subject: [PATCH 05/15] fix compile error when sizeof(long double)==sizeof(double)[2] --- .../_core/src/multiarray/argfunc.dispatch.cpp | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index 353991d6bd3b..4950d4aace0a 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -337,7 +337,7 @@ arg_max_min_func(T *ip, npy_intp n, npy_intp *mindx) if constexpr (sizeof(T) <= 2) { *mindx = simd_argfunc_small(ip, n); return 0; - } else if constexpr (sizeof(long double) != sizeof(double)){ + } else { *mindx = simd_argfunc_large(ip, n); return 0; } @@ -383,13 +383,23 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) return 0; \ } -#define DEFINE_ARGFUNC_INNER_FUNCTION_LD(TYPE, KIND, INTR) \ -NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) \ -(long double *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *NPY_UNUSED(aip)) \ -{ \ - arg_max_min_func>(ip, n, max_ind); \ - return 0; \ -} +#if NPY_SIZEOF_LONGDOUBLE != NPY_SIZEOF_DOUBLE + #define DEFINE_ARGFUNC_INNER_FUNCTION_LD(TYPE, KIND, INTR) \ + NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) \ + (long double *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *NPY_UNUSED(aip)) \ + { \ + arg_max_min_func>(ip, n, max_ind); \ + return 0; \ + } +#else + #define DEFINE_ARGFUNC_INNER_FUNCTION_LD(TYPE, KIND, INTR) \ + NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) \ + (long double *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *NPY_UNUSED(aip)) \ + { \ + arg_max_min_func>(reinterpret_cast(ip), n, max_ind); \ + return 0; \ + } +#endif DEFINE_ARGFUNC_INNER_FUNCTION(UBYTE, argmax, Gt, npy_ubyte) DEFINE_ARGFUNC_INNER_FUNCTION(USHORT, argmax, Gt, npy_ushort) From 544b3a281e97f6426e2a5cd05080d50f6fa33108 Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Mon, 18 Aug 2025 13:55:19 +0800 Subject: [PATCH 06/15] fix compile error when sizeof(long double)==sizeof(double)[3] --- .../_core/src/multiarray/argfunc.dispatch.cpp | 79 +++++++++---------- numpy/_core/src/multiarray/arraytypes.h.src | 4 + 2 files changed, 40 insertions(+), 43 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index 4950d4aace0a..271f6b8e6b86 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -337,7 +337,7 @@ arg_max_min_func(T *ip, npy_intp n, npy_intp *mindx) if constexpr (sizeof(T) <= 2) { *mindx = simd_argfunc_small(ip, n); return 0; - } else { + } else if constexpr (sizeof(long double) != sizeof(double) || !std::is_same_v) { *mindx = simd_argfunc_large(ip, n); return 0; } @@ -383,23 +383,13 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) return 0; \ } -#if NPY_SIZEOF_LONGDOUBLE != NPY_SIZEOF_DOUBLE - #define DEFINE_ARGFUNC_INNER_FUNCTION_LD(TYPE, KIND, INTR) \ - NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) \ - (long double *ip, npy_intp n, npy_intp 
*max_ind, PyArrayObject *NPY_UNUSED(aip)) \ - { \ - arg_max_min_func>(ip, n, max_ind); \ - return 0; \ - } -#else - #define DEFINE_ARGFUNC_INNER_FUNCTION_LD(TYPE, KIND, INTR) \ - NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) \ - (long double *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *NPY_UNUSED(aip)) \ - { \ - arg_max_min_func>(reinterpret_cast(ip), n, max_ind); \ - return 0; \ - } -#endif +#define DEFINE_ARGFUNC_INNER_FUNCTION_LD(TYPE, KIND, INTR) \ +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) \ +(long double *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *NPY_UNUSED(aip)) \ +{ \ + arg_max_min_func>(ip, n, max_ind); \ + return 0; \ +} DEFINE_ARGFUNC_INNER_FUNCTION(UBYTE, argmax, Gt, npy_ubyte) DEFINE_ARGFUNC_INNER_FUNCTION(USHORT, argmax, Gt, npy_ushort) @@ -437,31 +427,34 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BOOL_argmax) { npy_intp i = 0; #if NPY_HWY - const auto zero = Zero(); - const int vstep = Lanes(); - const int wstep = vstep * 4; - for (npy_intp n = len & -wstep; i < n; i += wstep) { - auto a = LoadU(ip + i + vstep*0); - auto b = LoadU(ip + i + vstep*1); - auto c = LoadU(ip + i + vstep*2); - auto d = LoadU(ip + i + vstep*3); - auto m_a = hn::Eq(a, zero); - auto m_b = hn::Eq(b, zero); - auto m_c = hn::Eq(c, zero); - auto m_d = hn::Eq(d, zero); - auto m_ab = hn::And(m_a, m_b); - auto m_cd = hn::And(m_c, m_d); - - npy_uint64 m = 0; - hn::StoreMaskBits(_Tag(), hn::And(m_ab, m_cd), (uint8_t*)&m); - - if constexpr (kMaxLanes == 64) { - if (m != NPY_MAX_UINT64) - break; - }else if constexpr(kMaxLanes < 64){ - if ((npy_int64)m != ((1LL << vstep) - 1)) - break; - } + constexpr int simd_width = kMaxLanes; + if constexpr(simd_width <= 64){ + const auto zero = Zero(); + const int vstep = Lanes(); + const int wstep = vstep * 4; + for (npy_intp n = len & -wstep; i < n; i += wstep) { + auto a = LoadU(ip + i + vstep*0); + auto b = LoadU(ip + i + vstep*1); + auto c = LoadU(ip + i + vstep*2); + auto d = LoadU(ip + i + vstep*3); + auto m_a = hn::Eq(a, zero); + auto m_b = hn::Eq(b, zero); + auto m_c = hn::Eq(c, zero); + auto m_d = hn::Eq(d, zero); + auto m_ab = hn::And(m_a, m_b); + auto m_cd = hn::And(m_c, m_d); + + npy_uint64 m = 0; + hn::StoreMaskBits(_Tag(), hn::And(m_ab, m_cd), (uint8_t*)&m); + + if constexpr (simd_width == 64) { + if (m != NPY_MAX_UINT64) + break; + }else if constexpr(simd_width < 64){ + if ((npy_int64)m != ((1LL << vstep) - 1)) + break; + } + } } #endif // NPY_HWY diff --git a/numpy/_core/src/multiarray/arraytypes.h.src b/numpy/_core/src/multiarray/arraytypes.h.src index ca8dbeaa67eb..d7fb26bd3b70 100644 --- a/numpy/_core/src/multiarray/arraytypes.h.src +++ b/numpy/_core/src/multiarray/arraytypes.h.src @@ -1,6 +1,10 @@ #ifndef NUMPY_CORE_SRC_MULTIARRAY_ARRAYTYPES_H_ #define NUMPY_CORE_SRC_MULTIARRAY_ARRAYTYPES_H_ +#ifndef NPY_NO_EXPORT + #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN +#endif + #ifdef __cplusplus extern "C" { #endif From 80752bfeaa4361af80b8fb57f79ea8a3c2d64c75 Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Mon, 18 Aug 2025 15:17:43 +0800 Subject: [PATCH 07/15] fix compile error when sizeof(long double)==sizeof(double)[4] --- numpy/_core/src/multiarray/arraytypes.c.src | 32 +++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/numpy/_core/src/multiarray/arraytypes.c.src b/numpy/_core/src/multiarray/arraytypes.c.src index 52c9bdfb6bcc..fdba8f46e054 100644 --- a/numpy/_core/src/multiarray/arraytypes.c.src +++ b/numpy/_core/src/multiarray/arraytypes.c.src @@ -3428,6 +3428,38 @@ static int #define 
VOID_argmin NULL +/**begin repeat + * #func = argmax, argmin# + * #iop = <, ># + */ +NPY_NO_EXPORT int +LONGDOUBLE_@func@(npy_longdouble *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip)) +{ + if (npy_isnan(*ip)) { + // nan encountered; it's maximal|minimal + *mindx = 0; + return 0; + } + + npy_longdouble mp = *ip; + *mindx = 0; + npy_intp i = 1; + + for (; i < n; ++i) { + npy_longdouble a = ip[i]; + if (!(a @iop@= mp)) { // negated, for correct nan handling + mp = a; + *mindx = i; + if (npy_isnan(mp)) { + // nan encountered, it's maximal|minimal + break; + } + } + } + + return 0; +} +/**end repeat**/ /* ***************************************************************************** From 6ee215ad83011dd71075bc0852e3a7311a90468b Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Mon, 18 Aug 2025 17:19:51 +0800 Subject: [PATCH 08/15] fix compile error when sizeof(long double)==sizeof(double)[5] --- .../_core/src/multiarray/argfunc.dispatch.cpp | 24 +------------- numpy/_core/src/multiarray/arraytypes.c.src | 33 ------------------- 2 files changed, 1 insertion(+), 56 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index 271f6b8e6b86..db7d32ca3713 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -28,17 +28,6 @@ struct OpGt { } }; -template <> -struct OpGt { - HWY_INLINE bool operator()(long double a, long double b) { - return a > b; - } - - HWY_INLINE bool negated_op(long double a, long double b) { - return a <= b; - } -}; - template struct OpLt { #if NPY_HWY @@ -56,17 +45,6 @@ struct OpLt { } }; -template <> -struct OpLt { - HWY_INLINE bool operator()(long double a, long double b) { - return a < b; - } - - HWY_INLINE bool negated_op(long double a, long double b) { - return a >= b; - } -}; - #if NPY_HWY template static HWY_INLINE HWY_ATTR npy_intp @@ -337,7 +315,7 @@ arg_max_min_func(T *ip, npy_intp n, npy_intp *mindx) if constexpr (sizeof(T) <= 2) { *mindx = simd_argfunc_small(ip, n); return 0; - } else if constexpr (sizeof(long double) != sizeof(double) || !std::is_same_v) { + } else { *mindx = simd_argfunc_large(ip, n); return 0; } diff --git a/numpy/_core/src/multiarray/arraytypes.c.src b/numpy/_core/src/multiarray/arraytypes.c.src index fdba8f46e054..8b16343e6e61 100644 --- a/numpy/_core/src/multiarray/arraytypes.c.src +++ b/numpy/_core/src/multiarray/arraytypes.c.src @@ -3428,39 +3428,6 @@ static int #define VOID_argmin NULL -/**begin repeat - * #func = argmax, argmin# - * #iop = <, ># - */ -NPY_NO_EXPORT int -LONGDOUBLE_@func@(npy_longdouble *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip)) -{ - if (npy_isnan(*ip)) { - // nan encountered; it's maximal|minimal - *mindx = 0; - return 0; - } - - npy_longdouble mp = *ip; - *mindx = 0; - npy_intp i = 1; - - for (; i < n; ++i) { - npy_longdouble a = ip[i]; - if (!(a @iop@= mp)) { // negated, for correct nan handling - mp = a; - *mindx = i; - if (npy_isnan(mp)) { - // nan encountered, it's maximal|minimal - break; - } - } - } - - return 0; -} -/**end repeat**/ - /* ***************************************************************************** ** DOT ** From c41b68540ddcb725ee7e604fca21d64156c5aab1 Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Mon, 18 Aug 2025 18:03:37 +0800 Subject: [PATCH 09/15] fix compile error when sizeof(long double)==sizeof(double)[6] --- numpy/_core/src/multiarray/argfunc.dispatch.cpp | 10 ++++++---- 1 file changed, 6 
insertions(+), 4 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index db7d32ca3713..c51da790edf0 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -13,10 +13,11 @@ using namespace np::simd; template struct OpGt { + using Degraded = std::conditional_t, OpGt, OpGt>; #if NPY_HWY template >> HWY_INLINE HWY_ATTR auto operator()(const V &a, const V &b) { - return hn::Gt(a, b); + return hn::Gt(a, b); } #endif HWY_INLINE bool operator()(T a, T b) { @@ -30,6 +31,7 @@ struct OpGt { template struct OpLt { + using Degraded = std::conditional_t, OpLt, OpLt>; #if NPY_HWY template >> HWY_INLINE HWY_ATTR auto operator()(const V &a, const V &b) { @@ -296,7 +298,7 @@ simd_argfunc_large(T *ip, npy_intp len) } #endif //NPY_HWY -template +template , double, T>> HWY_INLINE HWY_ATTR int arg_max_min_func(T *ip, npy_intp n, npy_intp *mindx) { @@ -313,10 +315,10 @@ arg_max_min_func(T *ip, npy_intp n, npy_intp *mindx) #if NPY_HWY if constexpr (kSupportLane) { if constexpr (sizeof(T) <= 2) { - *mindx = simd_argfunc_small(ip, n); + *mindx = simd_argfunc_small(ip, n); return 0; } else { - *mindx = simd_argfunc_large(ip, n); + *mindx = simd_argfunc_large(ip, n); return 0; } } From 4d0d623553ef9bcbe78b1a937a2e93a69ca3485e Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Mon, 18 Aug 2025 20:30:08 +0800 Subject: [PATCH 10/15] fix compile error when sizeof(long double)==sizeof(double)[7] --- numpy/_core/src/multiarray/argfunc.dispatch.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index c51da790edf0..92a94a4be3ef 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -15,8 +15,8 @@ template struct OpGt { using Degraded = std::conditional_t, OpGt, OpGt>; #if NPY_HWY - template >> - HWY_INLINE HWY_ATTR auto operator()(const V &a, const V &b) { + template >, typename V = Vec> + HWY_INLINE HWY_ATTR auto operator()(const V &a, const V &b) const { return hn::Gt(a, b); } #endif @@ -33,8 +33,8 @@ template struct OpLt { using Degraded = std::conditional_t, OpLt, OpLt>; #if NPY_HWY - template >> - HWY_INLINE HWY_ATTR auto operator()(const V &a, const V &b) { + template >, typename V = Vec> + HWY_INLINE HWY_ATTR auto operator()(const V &a, const V &b) const { return hn::Lt(a, b); } #endif From d1d2c648582ce6a22bb95be93c35d132bd441b51 Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Mon, 18 Aug 2025 21:49:09 +0800 Subject: [PATCH 11/15] fix compile error when sizeof(long double)==sizeof(double)[8] --- numpy/_core/src/multiarray/argfunc.dispatch.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index 92a94a4be3ef..6a8adbb687a1 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -15,7 +15,10 @@ template struct OpGt { using Degraded = std::conditional_t, OpGt, OpGt>; #if NPY_HWY - template >, typename V = Vec> + template < + typename D = T, + typename = std::enable_if_t && !std::is_same_v, + typename V = Vec > HWY_INLINE HWY_ATTR auto operator()(const V &a, const V &b) const { return hn::Gt(a, b); } @@ -33,7 +36,10 @@ template struct OpLt { using Degraded = std::conditional_t, OpLt, 
OpLt>; #if NPY_HWY - template >, typename V = Vec> + template < + typename D = T, + typename = std::enable_if_t && !std::is_same_v, + typename V = Vec > HWY_INLINE HWY_ATTR auto operator()(const V &a, const V &b) const { return hn::Lt(a, b); } @@ -313,7 +319,7 @@ arg_max_min_func(T *ip, npy_intp n, npy_intp *mindx) } #if NPY_HWY - if constexpr (kSupportLane) { + if constexpr (kSupportLane && std::is_same_v) { if constexpr (sizeof(T) <= 2) { *mindx = simd_argfunc_small(ip, n); return 0; From c88e8ac35c6660440ff1497427234caf912a366b Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Mon, 18 Aug 2025 23:20:09 +0800 Subject: [PATCH 12/15] fix compile error when sizeof(long double)==sizeof(double)[9] --- .../_core/src/multiarray/argfunc.dispatch.cpp | 42 ++++++++++++------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index 6a8adbb687a1..8597ab0ada29 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -13,12 +13,8 @@ using namespace np::simd; template struct OpGt { - using Degraded = std::conditional_t, OpGt, OpGt>; #if NPY_HWY - template < - typename D = T, - typename = std::enable_if_t && !std::is_same_v, - typename V = Vec > + template >> HWY_INLINE HWY_ATTR auto operator()(const V &a, const V &b) const { return hn::Gt(a, b); } @@ -32,14 +28,21 @@ struct OpGt { } }; +template <> +struct OpGt { + HWY_INLINE bool operator()(long double a, long double b) { + return a > b; + } + + HWY_INLINE bool negated_op(long double a, long double b) { + return a <= b; + } +}; + template struct OpLt { - using Degraded = std::conditional_t, OpLt, OpLt>; #if NPY_HWY - template < - typename D = T, - typename = std::enable_if_t && !std::is_same_v, - typename V = Vec > + template >> HWY_INLINE HWY_ATTR auto operator()(const V &a, const V &b) const { return hn::Lt(a, b); } @@ -53,6 +56,17 @@ struct OpLt { } }; +template <> +struct OpLt { + HWY_INLINE bool operator()(long double a, long double b) { + return a < b; + } + + HWY_INLINE bool negated_op(long double a, long double b) { + return a >= b; + } +}; + #if NPY_HWY template static HWY_INLINE HWY_ATTR npy_intp @@ -304,7 +318,7 @@ simd_argfunc_large(T *ip, npy_intp len) } #endif //NPY_HWY -template , double, T>> +template HWY_INLINE HWY_ATTR int arg_max_min_func(T *ip, npy_intp n, npy_intp *mindx) { @@ -319,12 +333,12 @@ arg_max_min_func(T *ip, npy_intp n, npy_intp *mindx) } #if NPY_HWY - if constexpr (kSupportLane && std::is_same_v) { + if constexpr (kSupportLane && !std::is_same_v) { if constexpr (sizeof(T) <= 2) { - *mindx = simd_argfunc_small(ip, n); + *mindx = simd_argfunc_small(ip, n); return 0; } else { - *mindx = simd_argfunc_large(ip, n); + *mindx = simd_argfunc_large(ip, n); return 0; } } From def237222c2485676a81cec6aeb2395342608b57 Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Mon, 18 Aug 2025 23:56:18 +0800 Subject: [PATCH 13/15] fix compile error when sizeof(long double)==sizeof(double)[10] --- .../_core/src/multiarray/argfunc.dispatch.cpp | 67 ++++++++++++++++--- 1 file changed, 56 insertions(+), 11 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index 8597ab0ada29..e93881614f8b 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -383,14 +383,6 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) 
return 0; \ } -#define DEFINE_ARGFUNC_INNER_FUNCTION_LD(TYPE, KIND, INTR) \ -NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) \ -(long double *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *NPY_UNUSED(aip)) \ -{ \ - arg_max_min_func>(ip, n, max_ind); \ - return 0; \ -} - DEFINE_ARGFUNC_INNER_FUNCTION(UBYTE, argmax, Gt, npy_ubyte) DEFINE_ARGFUNC_INNER_FUNCTION(USHORT, argmax, Gt, npy_ushort) DEFINE_ARGFUNC_INNER_FUNCTION(UINT, argmax, Gt, npy_uint) @@ -415,13 +407,66 @@ DEFINE_ARGFUNC_INNER_FUNCTION(LONG, argmin, Lt, npy_long) DEFINE_ARGFUNC_INNER_FUNCTION(LONGLONG, argmin, Lt, npy_longlong) DEFINE_ARGFUNC_INNER_FUNCTION(FLOAT, argmin, Lt, npy_float) DEFINE_ARGFUNC_INNER_FUNCTION(DOUBLE, argmin, Lt, npy_double) -DEFINE_ARGFUNC_INNER_FUNCTION_LD(LONGDOUBLE, argmax, Gt) -DEFINE_ARGFUNC_INNER_FUNCTION_LD(LONGDOUBLE, argmin, Lt) #undef DEFINE_ARGFUNC_INNER_FUNCTION -#undef DEFINE_ARGFUNC_INNER_FUNCTION_LD +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_argmax) +(npy_longdouble *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip)) +{ + if (npy_isnan(*ip)) { + // nan encountered; it's maximal|minimal + *mindx = 0; + return 0; + } + + npy_longdouble mp = *ip; + *mindx = 0; + npy_intp i = 1; + + for (; i < n; ++i) { + npy_longdouble a = ip[i]; + if (!(a <= mp)) { // negated, for correct nan handling + mp = a; + *mindx = i; + if (npy_isnan(mp)) { + // nan encountered, it's maximal|minimal + break; + } + } + } + + return 0; +} + +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_argmin) +(npy_longdouble *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip)) +{ + if (npy_isnan(*ip)) { + // nan encountered; it's maximal|minimal + *mindx = 0; + return 0; + } + + npy_longdouble mp = *ip; + *mindx = 0; + npy_intp i = 1; + + for (; i < n; ++i) { + npy_longdouble a = ip[i]; + if (!(a >= mp)) { // negated, for correct nan handling + mp = a; + *mindx = i; + if (npy_isnan(mp)) { + // nan encountered, it's maximal|minimal + break; + } + } + } + + return 0; +} + NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BOOL_argmax) (npy_bool *ip, npy_intp len, npy_intp *max_ind, PyArrayObject *NPY_UNUSED(aip)) { From 7c048104da15ed108578efeb4ea561384aa7c88c Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Tue, 19 Aug 2025 13:29:40 +0800 Subject: [PATCH 14/15] fix s390x test[1] --- .../_core/src/multiarray/argfunc.dispatch.cpp | 26 +++++++------------ 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index e93881614f8b..b583dd9922a2 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -87,14 +87,10 @@ simd_argfunc_small(T *ip, npy_intp len) HWY_LANES_CONSTEXPR int vstep = Lanes(); const int wstep = vstep*4; - std::vector d_vindices(vstep*4); - for (int vi = 0; vi < wstep; ++vi) { - d_vindices[vi] = vi; - } - const auto vindices_0 = LoadU(d_vindices.data()); - const auto vindices_1 = LoadU(d_vindices.data()+vstep); - const auto vindices_2 = LoadU(d_vindices.data()+vstep*2); - const auto vindices_3 = LoadU(d_vindices.data()+vstep*3); + const auto vindices_0 = hn::Iota(_Tag(), UnsignedT(0)); + const auto vindices_1 = hn::Iota(_Tag(), UnsignedT(vstep)); + const auto vindices_2 = hn::Iota(_Tag(), UnsignedT(vstep*2)); + const auto vindices_3 = hn::Iota(_Tag(), UnsignedT(vstep*3)); const npy_intp max_block = idx_max*wstep & -wstep; npy_intp len0 = len & -wstep; @@ -185,15 +181,11 @@ simd_argfunc_large(T *ip, 
npy_intp len) len0 = NPY_MAX_UINT32; } } - // create index for vector indices - std::vector d_vindices(vstep*4); - for (int vi = 0; vi < wstep; ++vi) { - d_vindices[vi] = vi; - } - const auto vindices_0 = LoadU(d_vindices.data()); - const auto vindices_1 = LoadU(d_vindices.data()+vstep); - const auto vindices_2 = LoadU(d_vindices.data()+vstep*2); - const auto vindices_3 = LoadU(d_vindices.data()+vstep*3); + + const auto vindices_0 = hn::Iota(_Tag(), UnsignedT(0)); + const auto vindices_1 = hn::Iota(_Tag(), UnsignedT(vstep)); + const auto vindices_2 = hn::Iota(_Tag(), UnsignedT(vstep*2)); + const auto vindices_3 = hn::Iota(_Tag(), UnsignedT(vstep*3)); // initialize vector accumulator for highest values and its indexes auto acc_indices = Zero(); From 7c6a68333080584ecfc0f22435e555654cca8b91 Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Tue, 19 Aug 2025 15:28:18 +0800 Subject: [PATCH 15/15] fix s390x test[2] --- .../_core/src/multiarray/argfunc.dispatch.cpp | 67 ++++++++++++------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index b583dd9922a2..9b126dfaf683 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -222,6 +222,12 @@ simd_argfunc_large(T *ip, npy_intp len) npy_uint64 nnan = 0; hn::StoreMaskBits(_Tag(), hn::And(nnan_ab, nnan_cd), (uint8_t*)&nnan); +#if HWY_IS_BIG_ENDIAN + static_assert(kMaxLanes <= 8, + "This conversion is not supported for SIMD widths " + "larger than 256 bits."); + nnan = ((uint8_t *)&nnan)[0]; +#endif if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) { npy_uint64 nnan_4[4]; @@ -229,6 +235,12 @@ simd_argfunc_large(T *ip, npy_intp len) hn::StoreMaskBits(_Tag(), nnan_b, (uint8_t*)&(nnan_4[1])); hn::StoreMaskBits(_Tag(), nnan_c, (uint8_t*)&(nnan_4[2])); hn::StoreMaskBits(_Tag(), nnan_d, (uint8_t*)&(nnan_4[3])); +#if HWY_IS_BIG_ENDIAN + nnan_4[0] = ((uint8_t *)&nnan_4[0])[0]; + nnan_4[1] = ((uint8_t *)&nnan_4[1])[0]; + nnan_4[2] = ((uint8_t *)&nnan_4[2])[0]; + nnan_4[3] = ((uint8_t *)&nnan_4[3])[0]; +#endif for (int ni = 0; ni < 4; ++ni) { for (int vi = 0; vi < vstep; ++vi) { if (!((nnan_4[ni] >> vi) & 1)) { @@ -253,6 +265,9 @@ simd_argfunc_large(T *ip, npy_intp len) npy_uint64 nnan = 0; hn::StoreMaskBits(_Tag(), nnan_a, (uint8_t*)&nnan); +#if HWY_IS_BIG_ENDIAN + nnan = ((uint8_t *)&nnan)[0]; +#endif if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) { for (int vi = 0; vi < vstep; ++vi) { @@ -466,32 +481,32 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BOOL_argmax) #if NPY_HWY constexpr int simd_width = kMaxLanes; if constexpr(simd_width <= 64){ - const auto zero = Zero(); - const int vstep = Lanes(); - const int wstep = vstep * 4; - for (npy_intp n = len & -wstep; i < n; i += wstep) { - auto a = LoadU(ip + i + vstep*0); - auto b = LoadU(ip + i + vstep*1); - auto c = LoadU(ip + i + vstep*2); - auto d = LoadU(ip + i + vstep*3); - auto m_a = hn::Eq(a, zero); - auto m_b = hn::Eq(b, zero); - auto m_c = hn::Eq(c, zero); - auto m_d = hn::Eq(d, zero); - auto m_ab = hn::And(m_a, m_b); - auto m_cd = hn::And(m_c, m_d); - - npy_uint64 m = 0; - hn::StoreMaskBits(_Tag(), hn::And(m_ab, m_cd), (uint8_t*)&m); - - if constexpr (simd_width == 64) { - if (m != NPY_MAX_UINT64) - break; - }else if constexpr(simd_width < 64){ - if ((npy_int64)m != ((1LL << vstep) - 1)) - break; - } - } + const auto zero = Zero(); + const int vstep = Lanes(); + const int wstep = vstep * 4; + for 
(npy_intp n = len & -wstep; i < n; i += wstep) { + auto a = LoadU(ip + i + vstep*0); + auto b = LoadU(ip + i + vstep*1); + auto c = LoadU(ip + i + vstep*2); + auto d = LoadU(ip + i + vstep*3); + auto m_a = hn::Eq(a, zero); + auto m_b = hn::Eq(b, zero); + auto m_c = hn::Eq(c, zero); + auto m_d = hn::Eq(d, zero); + auto m_ab = hn::And(m_a, m_b); + auto m_cd = hn::And(m_c, m_d); + + npy_uint64 m = 0; + hn::StoreMaskBits(_Tag(), hn::And(m_ab, m_cd), (uint8_t*)&m); + + if constexpr (simd_width == 64) { + if (m != NPY_MAX_UINT64) + break; + }else if constexpr(simd_width < 64){ + if ((npy_int64)m != ((1LL << vstep) - 1)) + break; + } + } } #endif // NPY_HWY
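
As a side note for reviewers: the hunks above keep a scalar fallback for the cases where no SIMD path applies (long double, unsupported lane types, and loop tails), and its key detail is the negated comparison, !(a <= mp) for argmax, which is true for NaN so the first NaN index wins while ties keep the lowest index. Below is a minimal standalone C++ sketch of that fallback logic only; it is not part of the patch series and the names (scalar_argmax) are illustrative.

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Scalar argmax with NumPy-style NaN handling: the negated comparison
    // `!(a <= mp)` is true when `a` is NaN, so the loop records the first NaN
    // index and stops; on ties the lowest index is kept because a strictly
    // greater value is required to replace `mp`.
    static std::size_t scalar_argmax(const double *ip, std::size_t n)
    {
        if (std::isnan(ip[0])) {
            return 0;                      // NaN at the front is already maximal
        }
        double mp = ip[0];
        std::size_t idx = 0;
        for (std::size_t i = 1; i < n; ++i) {
            const double a = ip[i];
            if (!(a <= mp)) {              // negated, for correct NaN handling
                mp = a;
                idx = i;
                if (std::isnan(mp)) {
                    break;                 // first NaN encountered is the answer
                }
            }
        }
        return idx;
    }

    int main()
    {
        const std::vector<double> v = {1.0, 3.0, 2.0, 3.0};
        // Expect index 1: ties between equal maxima keep the lowest index.
        return scalar_argmax(v.data(), v.size()) == 1 ? 0 : 1;
    }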