From 09486f982754ef9c374c7fba8a4e2bd74881b75c Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Fri, 15 Aug 2025 09:51:43 +0800 Subject: [PATCH 01/15] ENH, SIMD: Optimize the argmax/argmin implementation based on Highway wrapper Signed-off-by: Wang Yang --- numpy/_core/meson.build | 8 +- .../src/multiarray/argfunc.dispatch.c.src | 390 --------------- .../_core/src/multiarray/argfunc.dispatch.cpp | 466 ++++++++++++++++++ 3 files changed, 471 insertions(+), 393 deletions(-) delete mode 100644 numpy/_core/src/multiarray/argfunc.dispatch.c.src create mode 100644 numpy/_core/src/multiarray/argfunc.dispatch.cpp diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index b4c769810ad8..703fba58b1f1 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -817,12 +817,13 @@ multiarray_gen_headers = [ foreach gen_mtargets : [ [ 'argfunc.dispatch.h', - src_file.process('src/multiarray/argfunc.dispatch.c.src'), + 'src/multiarray/argfunc.dispatch.cpp', [ AVX512_SKX, AVX2, XOP, SSE42, SSE2, VSX2, ASIMD, NEON, - VXE, VX + VXE, VX, + RVV ] ], ] @@ -840,7 +841,8 @@ foreach gen_mtargets : [ 'src/multiarray', 'src/multiarray/stringdtype', 'src/npymath', - 'src/umath' + 'src/umath', + 'src/highway' ] ) if not is_variable('multiarray_umath_mtargets') diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.c.src b/numpy/_core/src/multiarray/argfunc.dispatch.c.src deleted file mode 100644 index 79dc111d2438..000000000000 --- a/numpy/_core/src/multiarray/argfunc.dispatch.c.src +++ /dev/null @@ -1,390 +0,0 @@ -/* -*- c -*- */ -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#include "simd/simd.h" -#include "numpy/npy_math.h" - -#include "arraytypes.h" - -#define MIN(a,b) (((a)<(b))?(a):(b)) - -#if NPY_SIMD -#if NPY_SIMD > 512 || NPY_SIMD < 0 - #error "the following 8/16-bit argmax kernel isn't applicable for larger SIMD" - // TODO: add special loop for large SIMD width. - // i.e avoid unroll by x4 should be numerically safe till 2048-bit SIMD width - // or maybe expand the indices to 32|64-bit vectors(slower). 
-#endif -/**begin repeat - * #sfx = u8, s8, u16, s16# - * #usfx = u8, u8, u16, u16# - * #bsfx = b8, b8, b16, b16# - * #idx_max = NPY_MAX_UINT8*2, NPY_MAX_UINT16*2# - */ -/**begin repeat1 - * #intrin = cmpgt, cmplt# - * #func = argmax, argmin# - * #op = >, <# - */ -static inline npy_intp -simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len) -{ - npyv_lanetype_@sfx@ s_acc = *ip; - npy_intp ret_idx = 0, i = 0; - - const int vstep = npyv_nlanes_@sfx@; - const int wstep = vstep*4; - npyv_lanetype_@usfx@ d_vindices[npyv_nlanes_@sfx@*4]; - for (int vi = 0; vi < wstep; ++vi) { - d_vindices[vi] = vi; - } - const npyv_@usfx@ vindices_0 = npyv_load_@usfx@(d_vindices); - const npyv_@usfx@ vindices_1 = npyv_load_@usfx@(d_vindices + vstep); - const npyv_@usfx@ vindices_2 = npyv_load_@usfx@(d_vindices + vstep*2); - const npyv_@usfx@ vindices_3 = npyv_load_@usfx@(d_vindices + vstep*3); - - const npy_intp max_block = @idx_max@*wstep & -wstep; - npy_intp len0 = len & -wstep; - while (i < len0) { - npyv_@sfx@ acc = npyv_setall_@sfx@(s_acc); - npyv_@usfx@ acc_indices = npyv_zero_@usfx@(); - npyv_@usfx@ acc_indices_scale = npyv_zero_@usfx@(); - - npy_intp n = i + MIN(len0 - i, max_block); - npy_intp ik = i, i2 = 0; - for (; i < n; i += wstep, ++i2) { - npyv_@usfx@ vi = npyv_setall_@usfx@((npyv_lanetype_@usfx@)i2); - npyv_@sfx@ a = npyv_load_@sfx@(ip + i); - npyv_@sfx@ b = npyv_load_@sfx@(ip + i + vstep); - npyv_@sfx@ c = npyv_load_@sfx@(ip + i + vstep*2); - npyv_@sfx@ d = npyv_load_@sfx@(ip + i + vstep*3); - - // reverse to put lowest index first in case of matched values - npyv_@bsfx@ m_ba = npyv_@intrin@_@sfx@(b, a); - npyv_@bsfx@ m_dc = npyv_@intrin@_@sfx@(d, c); - npyv_@sfx@ x_ba = npyv_select_@sfx@(m_ba, b, a); - npyv_@sfx@ x_dc = npyv_select_@sfx@(m_dc, d, c); - npyv_@bsfx@ m_dcba = npyv_@intrin@_@sfx@(x_dc, x_ba); - npyv_@sfx@ x_dcba = npyv_select_@sfx@(m_dcba, x_dc, x_ba); - - npyv_@usfx@ idx_ba = npyv_select_@usfx@(m_ba, vindices_1, vindices_0); - npyv_@usfx@ idx_dc = npyv_select_@usfx@(m_dc, vindices_3, vindices_2); - npyv_@usfx@ idx_dcba = npyv_select_@usfx@(m_dcba, idx_dc, idx_ba); - npyv_@bsfx@ m_acc = npyv_@intrin@_@sfx@(x_dcba, acc); - acc = npyv_select_@sfx@(m_acc, x_dcba, acc); - acc_indices = npyv_select_@usfx@(m_acc, idx_dcba, acc_indices); - acc_indices_scale = npyv_select_@usfx@(m_acc, vi, acc_indices_scale); - } - // reduce - npyv_lanetype_@sfx@ dacc[npyv_nlanes_@sfx@]; - npyv_lanetype_@usfx@ dacc_i[npyv_nlanes_@sfx@]; - npyv_lanetype_@usfx@ dacc_s[npyv_nlanes_@sfx@]; - npyv_store_@sfx@(dacc, acc); - npyv_store_@usfx@(dacc_i, acc_indices); - npyv_store_@usfx@(dacc_s, acc_indices_scale); - - for (int vi = 0; vi < vstep; ++vi) { - if (dacc[vi] @op@ s_acc) { - s_acc = dacc[vi]; - ret_idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi]; - } - } - // get the lowest index in case of matched values - for (int vi = 0; vi < vstep; ++vi) { - npy_intp idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi]; - if (s_acc == dacc[vi] && ret_idx > idx) { - ret_idx = idx; - } - } - } - for (; i < len; ++i) { - npyv_lanetype_@sfx@ a = ip[i]; - if (a @op@ s_acc) { - s_acc = a; - ret_idx = i; - } - } - return ret_idx; -} -/**end repeat1**/ -/**end repeat**/ -#endif - -/**begin repeat - * #sfx = u32, s32, u64, s64, f32, f64# - * #usfx = u32, u32, u64, u64, u32, u64# - * #bsfx = b32, b32, b64, b64, b32, b64# - * #is_fp = 0*4, 1*2# - * #is_idx32 = 1*2, 0*2, 1, 0# - * #chk_simd = NPY_SIMD*4, NPY_SIMD_F32, NPY_SIMD_F64# - */ -#if @chk_simd@ -/**begin repeat1 - * #intrin = cmpgt, cmplt# - * #func = argmax, argmin# 
- * #op = >, <# - * #iop = <, ># - */ -static inline npy_intp -simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len) -{ - npyv_lanetype_@sfx@ s_acc = *ip; - npy_intp ret_idx = 0, i = 0; - const int vstep = npyv_nlanes_@sfx@; - const int wstep = vstep*4; - // loop by a scalar will perform better for small arrays - if (len < wstep) { - goto scalar_loop; - } - npy_intp len0 = len; - // guard against wraparound vector addition for 32-bit indices - // in case of the array length is larger than 16gb -#if @is_idx32@ - if (len0 > NPY_MAX_UINT32) { - len0 = NPY_MAX_UINT32; - } -#endif - // create index for vector indices - npyv_lanetype_@usfx@ d_vindices[npyv_nlanes_@sfx@*4]; - for (int vi = 0; vi < wstep; ++vi) { - d_vindices[vi] = vi; - } - const npyv_@usfx@ vindices_0 = npyv_load_@usfx@(d_vindices); - const npyv_@usfx@ vindices_1 = npyv_load_@usfx@(d_vindices + vstep); - const npyv_@usfx@ vindices_2 = npyv_load_@usfx@(d_vindices + vstep*2); - const npyv_@usfx@ vindices_3 = npyv_load_@usfx@(d_vindices + vstep*3); - // initialize vector accumulator for highest values and its indexes - npyv_@usfx@ acc_indices = npyv_zero_@usfx@(); - npyv_@sfx@ acc = npyv_setall_@sfx@(s_acc); - for (npy_intp n = len0 & -wstep; i < n; i += wstep) { - npyv_@usfx@ vi = npyv_setall_@usfx@((npyv_lanetype_@usfx@)i); - npyv_@sfx@ a = npyv_load_@sfx@(ip + i); - npyv_@sfx@ b = npyv_load_@sfx@(ip + i + vstep); - npyv_@sfx@ c = npyv_load_@sfx@(ip + i + vstep*2); - npyv_@sfx@ d = npyv_load_@sfx@(ip + i + vstep*3); - - // reverse to put lowest index first in case of matched values - npyv_@bsfx@ m_ba = npyv_@intrin@_@sfx@(b, a); - npyv_@bsfx@ m_dc = npyv_@intrin@_@sfx@(d, c); - npyv_@sfx@ x_ba = npyv_select_@sfx@(m_ba, b, a); - npyv_@sfx@ x_dc = npyv_select_@sfx@(m_dc, d, c); - npyv_@bsfx@ m_dcba = npyv_@intrin@_@sfx@(x_dc, x_ba); - npyv_@sfx@ x_dcba = npyv_select_@sfx@(m_dcba, x_dc, x_ba); - - npyv_@usfx@ idx_ba = npyv_select_@usfx@(m_ba, vindices_1, vindices_0); - npyv_@usfx@ idx_dc = npyv_select_@usfx@(m_dc, vindices_3, vindices_2); - npyv_@usfx@ idx_dcba = npyv_select_@usfx@(m_dcba, idx_dc, idx_ba); - npyv_@bsfx@ m_acc = npyv_@intrin@_@sfx@(x_dcba, acc); - acc = npyv_select_@sfx@(m_acc, x_dcba, acc); - acc_indices = npyv_select_@usfx@(m_acc, npyv_add_@usfx@(vi, idx_dcba), acc_indices); - - #if @is_fp@ - npyv_@bsfx@ nnan_a = npyv_notnan_@sfx@(a); - npyv_@bsfx@ nnan_b = npyv_notnan_@sfx@(b); - npyv_@bsfx@ nnan_c = npyv_notnan_@sfx@(c); - npyv_@bsfx@ nnan_d = npyv_notnan_@sfx@(d); - npyv_@bsfx@ nnan_ab = npyv_and_@bsfx@(nnan_a, nnan_b); - npyv_@bsfx@ nnan_cd = npyv_and_@bsfx@(nnan_c, nnan_d); - npy_uint64 nnan = npyv_tobits_@bsfx@(npyv_and_@bsfx@(nnan_ab, nnan_cd)); - if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) { - npy_uint64 nnan_4[4]; - nnan_4[0] = npyv_tobits_@bsfx@(nnan_a); - nnan_4[1] = npyv_tobits_@bsfx@(nnan_b); - nnan_4[2] = npyv_tobits_@bsfx@(nnan_c); - nnan_4[3] = npyv_tobits_@bsfx@(nnan_d); - for (int ni = 0; ni < 4; ++ni) { - for (int vi = 0; vi < vstep; ++vi) { - if (!((nnan_4[ni] >> vi) & 1)) { - return i + ni*vstep + vi; - } - } - } - } - #endif - } - for (npy_intp n = len0 & -vstep; i < n; i += vstep) { - npyv_@usfx@ vi = npyv_setall_@usfx@((npyv_lanetype_@usfx@)i); - npyv_@sfx@ a = npyv_load_@sfx@(ip + i); - npyv_@bsfx@ m_acc = npyv_@intrin@_@sfx@(a, acc); - acc = npyv_select_@sfx@(m_acc, a, acc); - acc_indices = npyv_select_@usfx@(m_acc, npyv_add_@usfx@(vi, vindices_0), acc_indices); - #if @is_fp@ - npyv_@bsfx@ nnan_a = npyv_notnan_@sfx@(a); - npy_uint64 nnan = npyv_tobits_@bsfx@(nnan_a); - if 
((unsigned long long int)nnan != ((1ULL << vstep) - 1)) { - for (int vi = 0; vi < vstep; ++vi) { - if (!((nnan >> vi) & 1)) { - return i + vi; - } - } - } - #endif - } - - // reduce - npyv_lanetype_@sfx@ dacc[npyv_nlanes_@sfx@]; - npyv_lanetype_@usfx@ dacc_i[npyv_nlanes_@sfx@]; - npyv_store_@usfx@(dacc_i, acc_indices); - npyv_store_@sfx@(dacc, acc); - - s_acc = dacc[0]; - ret_idx = dacc_i[0]; - for (int vi = 1; vi < vstep; ++vi) { - if (dacc[vi] @op@ s_acc) { - s_acc = dacc[vi]; - ret_idx = (npy_intp)dacc_i[vi]; - } - } - // get the lowest index in case of matched values - for (int vi = 0; vi < vstep; ++vi) { - if (s_acc == dacc[vi] && ret_idx > (npy_intp)dacc_i[vi]) { - ret_idx = dacc_i[vi]; - } - } -scalar_loop: - for (; i < len; ++i) { - npyv_lanetype_@sfx@ a = ip[i]; - #if @is_fp@ - if (!(a @iop@= s_acc)) { // negated, for correct nan handling - #else - if (a @op@ s_acc) { - #endif - s_acc = a; - ret_idx = i; - #if @is_fp@ - if (npy_isnan(s_acc)) { - // nan encountered, it's maximal - return ret_idx; - } - #endif - } - } - return ret_idx; -} -/**end repeat1**/ -#endif // chk_simd -/**end repeat**/ - -/**begin repeat - * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, - * BYTE, SHORT, INT, LONG, LONGLONG, - * FLOAT, DOUBLE, LONGDOUBLE# - * - * #BTYPE = BYTE, SHORT, INT, LONG, LONGLONG, - * BYTE, SHORT, INT, LONG, LONGLONG, - * FLOAT, DOUBLE, LONGDOUBLE# - * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, - * npy_byte, npy_short, npy_int, npy_long, npy_longlong, - * npy_float, npy_double, npy_longdouble# - * - * #is_fp = 0*10, 1*3# - * #is_unsigned = 1*5, 0*5, 0*3# - */ -#undef TO_SIMD_SFX -#if 0 -/**begin repeat1 - * #len = 8, 16, 32, 64# - */ -#elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@ - #if @is_fp@ - #define TO_SIMD_SFX(X) X##_f@len@ - #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64 - #undef TO_SIMD_SFX - #endif - #if NPY_BITSOF_@BTYPE@ == 32 && !NPY_SIMD_F32 - #undef TO_SIMD_SFX - #endif - #elif @is_unsigned@ - #define TO_SIMD_SFX(X) X##_u@len@ - #else - #define TO_SIMD_SFX(X) X##_s@len@ - #endif -/**end repeat1**/ -#endif - -/**begin repeat1 - * #func = argmax, argmin# - * #op = >, <# - * #iop = <, ># - */ -NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@) -(@type@ *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip)) -{ -#if @is_fp@ - if (npy_isnan(*ip)) { - // nan encountered; it's maximal|minimal - *mindx = 0; - return 0; - } -#endif -#ifdef TO_SIMD_SFX - *mindx = TO_SIMD_SFX(simd_@func@)((TO_SIMD_SFX(npyv_lanetype)*)ip, n); - npyv_cleanup(); -#else - @type@ mp = *ip; - *mindx = 0; - npy_intp i = 1; - - for (; i < n; ++i) { - @type@ a = ip[i]; - #if @is_fp@ - if (!(a @iop@= mp)) { // negated, for correct nan handling - #else - if (a @op@ mp) { - #endif - mp = a; - *mindx = i; - #if @is_fp@ - if (npy_isnan(mp)) { - // nan encountered, it's maximal|minimal - break; - } - #endif - } - } -#endif // TO_SIMD_SFX - return 0; -} -/**end repeat1**/ -/**end repeat**/ - -NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BOOL_argmax) -(npy_bool *ip, npy_intp len, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip)) - -{ - npy_intp i = 0; -#if NPY_SIMD - const npyv_u8 zero = npyv_zero_u8(); - const int vstep = npyv_nlanes_u8; - const int wstep = vstep * 4; - for (npy_intp n = len & -wstep; i < n; i += wstep) { - npyv_u8 a = npyv_load_u8(ip + i + vstep*0); - npyv_u8 b = npyv_load_u8(ip + i + vstep*1); - npyv_u8 c = npyv_load_u8(ip + i + vstep*2); - npyv_u8 d = npyv_load_u8(ip + i + vstep*3); - npyv_b8 m_a = npyv_cmpeq_u8(a, zero); - npyv_b8 m_b = npyv_cmpeq_u8(b, zero); 
- npyv_b8 m_c = npyv_cmpeq_u8(c, zero); - npyv_b8 m_d = npyv_cmpeq_u8(d, zero); - npyv_b8 m_ab = npyv_and_b8(m_a, m_b); - npyv_b8 m_cd = npyv_and_b8(m_c, m_d); - npy_uint64 m = npyv_tobits_b8(npyv_and_b8(m_ab, m_cd)); - #if NPY_SIMD == 512 - if (m != NPY_MAX_UINT64) { - #else - if ((npy_int64)m != ((1LL << vstep) - 1)) { - #endif - break; - } - } - npyv_cleanup(); -#endif // NPY_SIMD - for (; i < len; ++i) { - if (ip[i]) { - *mindx = i; - return 0; - } - } - *mindx = 0; - return 0; -} diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp new file mode 100644 index 000000000000..498f8ea861ff --- /dev/null +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -0,0 +1,466 @@ +#include "simd/simd.h" +#include "numpy/npy_math.h" +#include "numpy/npy_common.h" +#include "common.hpp" +#include "arraytypes.h" +#include "simd/simd.hpp" +#include + +#define MIN(a,b) (((a)<(b))?(a):(b)) + +namespace { +using namespace np::simd; + +template +struct OpGt { +#if NPY_HWY + template >> + HWY_INLINE HWY_ATTR auto operator()(const V &a, const V &b) { + return hn::Gt(a, b); + } +#endif + HWY_INLINE bool operator()(T a, T b) { + return a > b; + } + + HWY_INLINE bool negated_op(T a, T b) { + return a <= b; + } +}; + +template <> +struct OpGt { + HWY_INLINE bool operator()(long double a, long double b) { + return a > b; + } + + HWY_INLINE bool negated_op(long double a, long double b) { + return a <= b; + } +}; + +template +struct OpLt { +#if NPY_HWY + template >> + HWY_INLINE HWY_ATTR auto operator()(const V &a, const V &b) { + return hn::Lt(a, b); + } +#endif + HWY_INLINE bool operator()(T a, T b) { + return a < b; + } + + HWY_INLINE bool negated_op(T a, T b) { + return a >= b; + } +}; + +template <> +struct OpLt { + HWY_INLINE bool operator()(long double a, long double b) { + return a < b; + } + + HWY_INLINE bool negated_op(long double a, long double b) { + return a >= b; + } +}; + +#if NPY_HWY +template +static HWY_INLINE HWY_ATTR npy_intp +simd_argfunc_small(T *ip, npy_intp len) +{ + //static_assert(kMaxLanes <= 64, + // "the following 8/16-bit argmax kernel isn't applicable for larger SIMD"); + /* TODO: add special loop for large SIMD width. + i.e avoid unroll by x4 should be numerically safe till 2048-bit SIMD width + or maybe expand the indices to 32|64-bit vectors(slower). */ + + using UnsignedT = std::conditional_t; + constexpr npy_intp idx_max = (sizeof(T) == 1) ? 
NPY_MAX_UINT8 : NPY_MAX_UINT16; + + Op op_func; + T s_acc = *ip; + npy_intp ret_idx = 0, i = 0; + + const int vstep = Lanes(); + const int wstep = vstep*4; + UnsignedT d_vindices[Lanes()*4]; + for (int vi = 0; vi < wstep; ++vi) { + d_vindices[vi] = vi; + } + const auto vindices_0 = LoadU(d_vindices); + const auto vindices_1 = LoadU(d_vindices + vstep); + const auto vindices_2 = LoadU(d_vindices + vstep*2); + const auto vindices_3 = LoadU(d_vindices + vstep*3); + + const npy_intp max_block = idx_max*wstep & -wstep; + npy_intp len0 = len & -wstep; + while (i < len0) { + auto acc = Set(T(s_acc)); + auto acc_indices = Zero(); + auto acc_indices_scale = Zero(); + + npy_intp n = i + MIN(len0 - i, max_block); + npy_intp ik = i, i2 = 0; + for (; i < n; i += wstep, ++i2) { + auto vi = Set(UnsignedT(i2)); + auto a = LoadU(ip + i); + auto b = LoadU(ip + i + vstep); + auto c = LoadU(ip + i + vstep*2); + auto d = LoadU(ip + i + vstep*3); + + // reverse to put lowest index first in case of matched values + auto m_ba = op_func(b, a); + auto m_dc = op_func(d, c); + auto x_ba = hn::IfThenElse(hn::RebindMask(_Tag(), m_ba), b, a); + auto x_dc = hn::IfThenElse(hn::RebindMask(_Tag(), m_dc), d, c); + auto m_dcba = op_func(x_dc, x_ba); + auto x_dcba = hn::IfThenElse(hn::RebindMask(_Tag(), m_dcba), x_dc, x_ba); + + auto idx_ba = hn::IfThenElse(hn::RebindMask(_Tag(), m_ba), vindices_1, vindices_0); + auto idx_dc = hn::IfThenElse(hn::RebindMask(_Tag(), m_dc), vindices_3, vindices_2); + auto idx_dcba = hn::IfThenElse(hn::RebindMask(_Tag(), m_dcba), idx_dc, idx_ba); + auto m_acc = op_func(x_dcba, acc); + acc = hn::IfThenElse(hn::RebindMask(_Tag(), m_acc), x_dcba, acc); + acc_indices = hn::IfThenElse(hn::RebindMask(_Tag(), m_acc), idx_dcba, acc_indices); + acc_indices_scale = hn::IfThenElse(hn::RebindMask(_Tag(), m_acc), vi, acc_indices_scale); + } + // reduce + T dacc[Lanes()]; + UnsignedT dacc_i[Lanes()]; + UnsignedT dacc_s[Lanes()]; + + StoreU(acc, dacc); + StoreU(acc_indices, dacc_i); + StoreU(acc_indices_scale, dacc_s); + + for (int vi = 0; vi < vstep; ++vi) { + if (op_func(dacc[vi], s_acc)) { + s_acc = dacc[vi]; + ret_idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi]; + } + } + // get the lowest index in case of matched values + for (int vi = 0; vi < vstep; ++vi) { + npy_intp idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi]; + if (s_acc == dacc[vi] && ret_idx > idx) { + ret_idx = idx; + } + } + } + + for (; i < len; ++i) { + T a = ip[i]; + if (op_func(a, s_acc)) { + s_acc = a; + ret_idx = i; + } + } + return ret_idx; +} + +template > +static HWY_INLINE HWY_ATTR npy_intp +simd_argfunc_large(T *ip, npy_intp len) +{ + using UnsignedT = std::conditional_t; + constexpr bool is_idx32 = sizeof(T) <= 4; + + Op op_func; + T s_acc = *ip; + npy_intp ret_idx = 0, i = 0; + const int vstep = Lanes(); + const int wstep = vstep*4; + + // loop by a scalar will perform better for small arrays + if (len >= wstep) { + npy_intp len0 = len; + // guard against wraparound vector addition for 32-bit indices + // in case of the array length is larger than 16gb + if constexpr (is_idx32) { + if (len0 > NPY_MAX_UINT32) { + len0 = NPY_MAX_UINT32; + } + } + // create index for vector indices + UnsignedT d_vindices[Lanes()*4]; + for (int vi = 0; vi < wstep; ++vi) { + d_vindices[vi] = vi; + } + const auto vindices_0 = LoadU(d_vindices); + const auto vindices_1 = LoadU(d_vindices + vstep); + const auto vindices_2 = LoadU(d_vindices + vstep*2); + const auto vindices_3 = LoadU(d_vindices + vstep*3); + + // initialize vector accumulator for 
highest values and its indexes + auto acc_indices = Zero(); + auto acc = Set(T(s_acc)); + for (npy_intp n = len0 & -wstep; i < n; i += wstep) { + auto vi = Set(UnsignedT(i)); + auto a = LoadU(ip + i); + auto b = LoadU(ip + i + vstep); + auto c = LoadU(ip + i + vstep*2); + auto d = LoadU(ip + i + vstep*3); + + // reverse to put lowest index first in case of matched values + auto m_ba = op_func(b, a); + auto m_dc = op_func(d, c); + auto x_ba = hn::IfThenElse(hn::RebindMask(_Tag(), m_ba), b, a); + auto x_dc = hn::IfThenElse(hn::RebindMask(_Tag(), m_dc), d, c); + auto m_dcba = op_func(x_dc, x_ba); + auto x_dcba = hn::IfThenElse(hn::RebindMask(_Tag(), m_dcba), x_dc, x_ba); + + auto idx_ba = hn::IfThenElse(hn::RebindMask(_Tag(), m_ba), vindices_1, vindices_0); + auto idx_dc = hn::IfThenElse(hn::RebindMask(_Tag(), m_dc), vindices_3, vindices_2); + auto idx_dcba = hn::IfThenElse(hn::RebindMask(_Tag(), m_dcba), idx_dc, idx_ba); + auto m_acc = op_func(x_dcba, acc); + acc = hn::IfThenElse(hn::RebindMask(_Tag(), m_acc), x_dcba, acc); + acc_indices = hn::IfThenElse(hn::RebindMask(_Tag(), m_acc), hn::Add(vi, idx_dcba), acc_indices); + + if constexpr (IsFloatingPoint) { + auto nnan_a = hn::Not(hn::IsNaN(a)); + auto nnan_b = hn::Not(hn::IsNaN(b)); + auto nnan_c = hn::Not(hn::IsNaN(c)); + auto nnan_d = hn::Not(hn::IsNaN(d)); + auto nnan_ab = hn::And(nnan_a, nnan_b); + auto nnan_cd = hn::And(nnan_c, nnan_d); + + npy_uint64 nnan = 0; + hn::StoreMaskBits(_Tag(), hn::And(nnan_ab, nnan_cd), (uint8_t*)&nnan); + + if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) { + npy_uint64 nnan_4[4]; + hn::StoreMaskBits(_Tag(), nnan_a, (uint8_t*)&(nnan_4[0])); + hn::StoreMaskBits(_Tag(), nnan_b, (uint8_t*)&(nnan_4[1])); + hn::StoreMaskBits(_Tag(), nnan_c, (uint8_t*)&(nnan_4[2])); + hn::StoreMaskBits(_Tag(), nnan_d, (uint8_t*)&(nnan_4[3])); + for (int ni = 0; ni < 4; ++ni) { + for (int vi = 0; vi < vstep; ++vi) { + if (!((nnan_4[ni] >> vi) & 1)) { + return i + ni*vstep + vi; + } + } + } + } + } + } + + for (npy_intp n = len0 & -vstep; i < n; i += vstep) { + auto vi = Set(UnsignedT(i)); + auto a = LoadU(ip + i); + auto m_acc = op_func(a, acc); + + acc = hn::IfThenElse(hn::RebindMask(_Tag(), m_acc), a, acc); + acc_indices = hn::IfThenElse(hn::RebindMask(_Tag(), m_acc), hn::Add(vi, vindices_0), acc_indices); + + if constexpr (IsFloatingPoint) { + auto nnan_a = hn::Not(hn::IsNaN(a)); + + npy_uint64 nnan = 0; + hn::StoreMaskBits(_Tag(), nnan_a, (uint8_t*)&nnan); + + if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) { + for (int vi = 0; vi < vstep; ++vi) { + if (!((nnan >> vi) & 1)) { + return i + vi; + } + } + } + } + } + + // reduce + T dacc[Lanes()]; + UnsignedT dacc_i[Lanes()]; + + StoreU(acc_indices, dacc_i); + StoreU(acc, dacc); + + s_acc = dacc[0]; + ret_idx = dacc_i[0]; + for (int vi = 1; vi < vstep; ++vi) { + if (op_func(dacc[vi], s_acc)) { + s_acc = dacc[vi]; + ret_idx = (npy_intp)dacc_i[vi]; + } + } + // get the lowest index in case of matched values + for (int vi = 0; vi < vstep; ++vi) { + if (s_acc == dacc[vi] && ret_idx > (npy_intp)dacc_i[vi]) { + ret_idx = dacc_i[vi]; + } + } + } + + //scalar loop + for (; i < len; ++i) { + T a = ip[i]; + if constexpr (IsFloatingPoint) { + if (!op_func.negated_op(a, s_acc)) { // negated, for correct nan handling + s_acc = a; + ret_idx = i; + if (npy_isnan(s_acc)) { + // nan encountered, it's maximal + return ret_idx; + } + } + } else { + if (op_func(a, s_acc)) { + s_acc = a; + ret_idx = i; + } + } + } + return ret_idx; +} +#endif //NPY_HWY + +template +HWY_INLINE 
HWY_ATTR int +arg_max_min_func(T *ip, npy_intp n, npy_intp *mindx) +{ + Op op_func; + + if constexpr (std::is_floating_point_v) { + if (npy_isnan(*ip)){ + // nan encountered; it's maximal | minimal + *mindx = 0; + return 0; + } + } + +#if NPY_HWY + if constexpr (kSupportLane) { + if constexpr (sizeof(T) <= 2) { + *mindx = simd_argfunc_small(ip, n); + } else if constexpr (sizeof(long double) != sizeof(double) || !std::is_same_v) { + *mindx = simd_argfunc_large(ip, n); + } + return 0; + } +#endif + + T mp = *ip; + *mindx = 0; + npy_intp i = 1; + + for (; i < n; ++i) { + T a = ip[i]; + if constexpr (std::is_floating_point_v) { + if (!op_func.negated_op(a, mp)) { // negated, for correct nan handling + mp = a; + *mindx = i; + if (npy_isnan(mp)){ + // nan encountered, it's maximal|minimal + break; + } + } + } else { + if (op_func(a, mp)) { + mp = a; + *mindx = i; + } + } + } + + return 0; +} +} // namespace anonymous + +/*********************************************************************************** + ** Defining argfunc inner functions + ***********************************************************************************/ +#define DEFINE_ARGFUNC_INNER_FUNCTION(TYPE, KIND, INTR, T) \ +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) \ +(T *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *NPY_UNUSED(aip)) \ +{ \ + using FixedType = typename np::meta::FixedWidth::Type; \ + arg_max_min_func>(reinterpret_cast(ip), n, max_ind); \ + return 0; \ +} + +#define DEFINE_ARGFUNC_INNER_FUNCTION_LD(TYPE, KIND, INTR) \ +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) \ +(long double *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *NPY_UNUSED(aip)) \ +{ \ + arg_max_min_func>(ip, n, max_ind); \ + return 0; \ +} + +DEFINE_ARGFUNC_INNER_FUNCTION(UBYTE, argmax, Gt, npy_ubyte) +DEFINE_ARGFUNC_INNER_FUNCTION(USHORT, argmax, Gt, npy_ushort) +DEFINE_ARGFUNC_INNER_FUNCTION(UINT, argmax, Gt, npy_uint) +DEFINE_ARGFUNC_INNER_FUNCTION(ULONG, argmax, Gt, npy_ulong) +DEFINE_ARGFUNC_INNER_FUNCTION(ULONGLONG, argmax, Gt, npy_ulonglong) +DEFINE_ARGFUNC_INNER_FUNCTION(BYTE, argmax, Gt, npy_byte) +DEFINE_ARGFUNC_INNER_FUNCTION(SHORT, argmax, Gt, npy_short) +DEFINE_ARGFUNC_INNER_FUNCTION(INT, argmax, Gt, npy_int) +DEFINE_ARGFUNC_INNER_FUNCTION(LONG, argmax, Gt, npy_long) +DEFINE_ARGFUNC_INNER_FUNCTION(LONGLONG, argmax, Gt, npy_longlong) +DEFINE_ARGFUNC_INNER_FUNCTION(FLOAT, argmax, Gt, npy_float) +DEFINE_ARGFUNC_INNER_FUNCTION(DOUBLE, argmax, Gt, npy_double) +DEFINE_ARGFUNC_INNER_FUNCTION(UBYTE, argmin, Lt, npy_ubyte) +DEFINE_ARGFUNC_INNER_FUNCTION(USHORT, argmin, Lt, npy_ushort) +DEFINE_ARGFUNC_INNER_FUNCTION(UINT, argmin, Lt, npy_uint) +DEFINE_ARGFUNC_INNER_FUNCTION(ULONG, argmin, Lt, npy_ulong) +DEFINE_ARGFUNC_INNER_FUNCTION(ULONGLONG, argmin, Lt, npy_ulonglong) +DEFINE_ARGFUNC_INNER_FUNCTION(BYTE, argmin, Lt, npy_byte) +DEFINE_ARGFUNC_INNER_FUNCTION(SHORT, argmin, Lt, npy_short) +DEFINE_ARGFUNC_INNER_FUNCTION(INT, argmin, Lt, npy_int) +DEFINE_ARGFUNC_INNER_FUNCTION(LONG, argmin, Lt, npy_long) +DEFINE_ARGFUNC_INNER_FUNCTION(LONGLONG, argmin, Lt, npy_longlong) +DEFINE_ARGFUNC_INNER_FUNCTION(FLOAT, argmin, Lt, npy_float) +DEFINE_ARGFUNC_INNER_FUNCTION(DOUBLE, argmin, Lt, npy_double) +DEFINE_ARGFUNC_INNER_FUNCTION_LD(LONGDOUBLE, argmax, Gt) +DEFINE_ARGFUNC_INNER_FUNCTION_LD(LONGDOUBLE, argmin, Lt) + +#undef DEFINE_ARGFUNC_INNER_FUNCTION +#undef DEFINE_ARGFUNC_INNER_FUNCTION_LD + + +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BOOL_argmax) +(npy_bool *ip, npy_intp len, npy_intp *max_ind, PyArrayObject 
*NPY_UNUSED(aip)) +{ + npy_intp i = 0; +#if NPY_HWY + const auto zero = Zero(); + const int vstep = Lanes(); + const int wstep = vstep * 4; + for (npy_intp n = len & -wstep; i < n; i += wstep) { + auto a = LoadU(ip + i + vstep*0); + auto b = LoadU(ip + i + vstep*1); + auto c = LoadU(ip + i + vstep*2); + auto d = LoadU(ip + i + vstep*3); + auto m_a = hn::Eq(a, zero); + auto m_b = hn::Eq(b, zero); + auto m_c = hn::Eq(c, zero); + auto m_d = hn::Eq(d, zero); + auto m_ab = hn::And(m_a, m_b); + auto m_cd = hn::And(m_c, m_d); + + npy_uint64 m = 0; + hn::StoreMaskBits(_Tag(), hn::And(m_ab, m_cd), (uint8_t*)&m); + + if constexpr (kMaxLanes == 512) { + if (m != NPY_MAX_UINT64) + break; + }else{ + if ((npy_int64)m != ((1LL << vstep) - 1)) + break; + } + } + +#endif // NPY_HWY + + for (; i < len; ++i) { + if (ip[i]) { + *max_ind = i; + return 0; + } + } + *max_ind = 0; + return 0; +} From 405c24fe11c586d3ba769855d251a2d2f26052f7 Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Fri, 15 Aug 2025 13:54:59 +0800 Subject: [PATCH 02/15] fix compile error C2131: expression did not evaluate to a constant --- .../_core/src/multiarray/argfunc.dispatch.cpp | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index 498f8ea861ff..beed57b3caf7 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -19,7 +19,7 @@ struct OpGt { return hn::Gt(a, b); } #endif - HWY_INLINE bool operator()(T a, T b) { + HWY_INLINE bool operator()(T a, T b) { return a > b; } @@ -85,16 +85,16 @@ simd_argfunc_small(T *ip, npy_intp len) T s_acc = *ip; npy_intp ret_idx = 0, i = 0; - const int vstep = Lanes(); + HWY_LANES_CONSTEXPR size_t vstep = Lanes(); const int wstep = vstep*4; - UnsignedT d_vindices[Lanes()*4]; + std::vector d_vindices(vstep*4); for (int vi = 0; vi < wstep; ++vi) { d_vindices[vi] = vi; } - const auto vindices_0 = LoadU(d_vindices); - const auto vindices_1 = LoadU(d_vindices + vstep); - const auto vindices_2 = LoadU(d_vindices + vstep*2); - const auto vindices_3 = LoadU(d_vindices + vstep*3); + const auto vindices_0 = LoadU(d_vindices.data()); + const auto vindices_1 = LoadU(d_vindices.data()+vstep); + const auto vindices_2 = LoadU(d_vindices.data()+vstep*2); + const auto vindices_3 = LoadU(d_vindices.data()+vstep*3); const npy_intp max_block = idx_max*wstep & -wstep; npy_intp len0 = len & -wstep; @@ -129,13 +129,13 @@ simd_argfunc_small(T *ip, npy_intp len) acc_indices_scale = hn::IfThenElse(hn::RebindMask(_Tag(), m_acc), vi, acc_indices_scale); } // reduce - T dacc[Lanes()]; - UnsignedT dacc_i[Lanes()]; - UnsignedT dacc_s[Lanes()]; + std::vector dacc(vstep); + std::vector dacc_i(vstep); + std::vector dacc_s(vstep); - StoreU(acc, dacc); - StoreU(acc_indices, dacc_i); - StoreU(acc_indices_scale, dacc_s); + StoreU(acc, dacc.data()); + StoreU(acc_indices, dacc_i.data()); + StoreU(acc_indices_scale, dacc_s.data()); for (int vi = 0; vi < vstep; ++vi) { if (op_func(dacc[vi], s_acc)) { @@ -172,7 +172,7 @@ simd_argfunc_large(T *ip, npy_intp len) Op op_func; T s_acc = *ip; npy_intp ret_idx = 0, i = 0; - const int vstep = Lanes(); + HWY_LANES_CONSTEXPR size_t vstep = Lanes(); const int wstep = vstep*4; // loop by a scalar will perform better for small arrays @@ -186,14 +186,14 @@ simd_argfunc_large(T *ip, npy_intp len) } } // create index for vector indices - UnsignedT d_vindices[Lanes()*4]; + std::vector d_vindices(vstep*4); 
for (int vi = 0; vi < wstep; ++vi) { d_vindices[vi] = vi; } - const auto vindices_0 = LoadU(d_vindices); - const auto vindices_1 = LoadU(d_vindices + vstep); - const auto vindices_2 = LoadU(d_vindices + vstep*2); - const auto vindices_3 = LoadU(d_vindices + vstep*3); + const auto vindices_0 = LoadU(d_vindices.data()); + const auto vindices_1 = LoadU(d_vindices.data()+vstep); + const auto vindices_2 = LoadU(d_vindices.data()+vstep*2); + const auto vindices_3 = LoadU(d_vindices.data()+vstep*3); // initialize vector accumulator for highest values and its indexes auto acc_indices = Zero(); @@ -273,11 +273,11 @@ simd_argfunc_large(T *ip, npy_intp len) } // reduce - T dacc[Lanes()]; - UnsignedT dacc_i[Lanes()]; + std::vector dacc(vstep); + std::vector dacc_i(vstep); - StoreU(acc_indices, dacc_i); - StoreU(acc, dacc); + StoreU(acc, dacc.data()); + StoreU(acc_indices, dacc_i.data()); s_acc = dacc[0]; ret_idx = dacc_i[0]; @@ -444,7 +444,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BOOL_argmax) npy_uint64 m = 0; hn::StoreMaskBits(_Tag(), hn::And(m_ab, m_cd), (uint8_t*)&m); - if constexpr (kMaxLanes == 512) { + if constexpr (kMaxLanes == 64) { if (m != NPY_MAX_UINT64) break; }else{ From a6af88278c7af4369a480aed00b07353a8e6ccb2 Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Fri, 15 Aug 2025 15:41:27 +0800 Subject: [PATCH 03/15] fix compile error LNK2001: unresolved external symbol _LONGDOUBLE_argmax --- numpy/_core/src/multiarray/argfunc.dispatch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index beed57b3caf7..31e10109cdda 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -336,7 +336,7 @@ arg_max_min_func(T *ip, npy_intp n, npy_intp *mindx) if constexpr (kSupportLane) { if constexpr (sizeof(T) <= 2) { *mindx = simd_argfunc_small(ip, n); - } else if constexpr (sizeof(long double) != sizeof(double) || !std::is_same_v) { + } else { *mindx = simd_argfunc_large(ip, n); } return 0; @@ -447,7 +447,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BOOL_argmax) if constexpr (kMaxLanes == 64) { if (m != NPY_MAX_UINT64) break; - }else{ + }else if constexpr(kMaxLanes < 64){ if ((npy_int64)m != ((1LL << vstep) - 1)) break; } From c53bd0fa248554bfc2a0ea764637073e6ca4ef7d Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Fri, 15 Aug 2025 16:23:27 +0800 Subject: [PATCH 04/15] fix compile error when sizeof(long double)==sizeof(double) --- numpy/_core/src/multiarray/argfunc.dispatch.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index 31e10109cdda..353991d6bd3b 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -85,7 +85,7 @@ simd_argfunc_small(T *ip, npy_intp len) T s_acc = *ip; npy_intp ret_idx = 0, i = 0; - HWY_LANES_CONSTEXPR size_t vstep = Lanes(); + HWY_LANES_CONSTEXPR int vstep = Lanes(); const int wstep = vstep*4; std::vector d_vindices(vstep*4); for (int vi = 0; vi < wstep; ++vi) { @@ -172,7 +172,7 @@ simd_argfunc_large(T *ip, npy_intp len) Op op_func; T s_acc = *ip; npy_intp ret_idx = 0, i = 0; - HWY_LANES_CONSTEXPR size_t vstep = Lanes(); + HWY_LANES_CONSTEXPR int vstep = Lanes(); const int wstep = vstep*4; // loop by a scalar will perform better for small arrays @@ -336,10 +336,11 @@ 
arg_max_min_func(T *ip, npy_intp n, npy_intp *mindx) if constexpr (kSupportLane) { if constexpr (sizeof(T) <= 2) { *mindx = simd_argfunc_small(ip, n); - } else { + return 0; + } else if constexpr (sizeof(long double) != sizeof(double)){ *mindx = simd_argfunc_large(ip, n); + return 0; } - return 0; } #endif From c683293443da9ed5eea33d7591c6c1b5be864615 Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Fri, 15 Aug 2025 18:03:24 +0800 Subject: [PATCH 05/15] fix compile error when sizeof(long double)==sizeof(double)[2] --- .../_core/src/multiarray/argfunc.dispatch.cpp | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index 353991d6bd3b..4950d4aace0a 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -337,7 +337,7 @@ arg_max_min_func(T *ip, npy_intp n, npy_intp *mindx) if constexpr (sizeof(T) <= 2) { *mindx = simd_argfunc_small(ip, n); return 0; - } else if constexpr (sizeof(long double) != sizeof(double)){ + } else { *mindx = simd_argfunc_large(ip, n); return 0; } @@ -383,13 +383,23 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) return 0; \ } -#define DEFINE_ARGFUNC_INNER_FUNCTION_LD(TYPE, KIND, INTR) \ -NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) \ -(long double *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *NPY_UNUSED(aip)) \ -{ \ - arg_max_min_func>(ip, n, max_ind); \ - return 0; \ -} +#if NPY_SIZEOF_LONGDOUBLE != NPY_SIZEOF_DOUBLE + #define DEFINE_ARGFUNC_INNER_FUNCTION_LD(TYPE, KIND, INTR) \ + NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) \ + (long double *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *NPY_UNUSED(aip)) \ + { \ + arg_max_min_func>(ip, n, max_ind); \ + return 0; \ + } +#else + #define DEFINE_ARGFUNC_INNER_FUNCTION_LD(TYPE, KIND, INTR) \ + NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) \ + (long double *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *NPY_UNUSED(aip)) \ + { \ + arg_max_min_func>(reinterpret_cast(ip), n, max_ind); \ + return 0; \ + } +#endif DEFINE_ARGFUNC_INNER_FUNCTION(UBYTE, argmax, Gt, npy_ubyte) DEFINE_ARGFUNC_INNER_FUNCTION(USHORT, argmax, Gt, npy_ushort) From 544b3a281e97f6426e2a5cd05080d50f6fa33108 Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Mon, 18 Aug 2025 13:55:19 +0800 Subject: [PATCH 06/15] fix compile error when sizeof(long double)==sizeof(double)[3] --- .../_core/src/multiarray/argfunc.dispatch.cpp | 79 +++++++++---------- numpy/_core/src/multiarray/arraytypes.h.src | 4 + 2 files changed, 40 insertions(+), 43 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index 4950d4aace0a..271f6b8e6b86 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -337,7 +337,7 @@ arg_max_min_func(T *ip, npy_intp n, npy_intp *mindx) if constexpr (sizeof(T) <= 2) { *mindx = simd_argfunc_small(ip, n); return 0; - } else { + } else if constexpr (sizeof(long double) != sizeof(double) || !std::is_same_v) { *mindx = simd_argfunc_large(ip, n); return 0; } @@ -383,23 +383,13 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) return 0; \ } -#if NPY_SIZEOF_LONGDOUBLE != NPY_SIZEOF_DOUBLE - #define DEFINE_ARGFUNC_INNER_FUNCTION_LD(TYPE, KIND, INTR) \ - NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) \ - (long double *ip, npy_intp n, npy_intp 
*max_ind, PyArrayObject *NPY_UNUSED(aip)) \ - { \ - arg_max_min_func>(ip, n, max_ind); \ - return 0; \ - } -#else - #define DEFINE_ARGFUNC_INNER_FUNCTION_LD(TYPE, KIND, INTR) \ - NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) \ - (long double *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *NPY_UNUSED(aip)) \ - { \ - arg_max_min_func>(reinterpret_cast(ip), n, max_ind); \ - return 0; \ - } -#endif +#define DEFINE_ARGFUNC_INNER_FUNCTION_LD(TYPE, KIND, INTR) \ +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) \ +(long double *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *NPY_UNUSED(aip)) \ +{ \ + arg_max_min_func>(ip, n, max_ind); \ + return 0; \ +} DEFINE_ARGFUNC_INNER_FUNCTION(UBYTE, argmax, Gt, npy_ubyte) DEFINE_ARGFUNC_INNER_FUNCTION(USHORT, argmax, Gt, npy_ushort) @@ -437,31 +427,34 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BOOL_argmax) { npy_intp i = 0; #if NPY_HWY - const auto zero = Zero(); - const int vstep = Lanes(); - const int wstep = vstep * 4; - for (npy_intp n = len & -wstep; i < n; i += wstep) { - auto a = LoadU(ip + i + vstep*0); - auto b = LoadU(ip + i + vstep*1); - auto c = LoadU(ip + i + vstep*2); - auto d = LoadU(ip + i + vstep*3); - auto m_a = hn::Eq(a, zero); - auto m_b = hn::Eq(b, zero); - auto m_c = hn::Eq(c, zero); - auto m_d = hn::Eq(d, zero); - auto m_ab = hn::And(m_a, m_b); - auto m_cd = hn::And(m_c, m_d); - - npy_uint64 m = 0; - hn::StoreMaskBits(_Tag(), hn::And(m_ab, m_cd), (uint8_t*)&m); - - if constexpr (kMaxLanes == 64) { - if (m != NPY_MAX_UINT64) - break; - }else if constexpr(kMaxLanes < 64){ - if ((npy_int64)m != ((1LL << vstep) - 1)) - break; - } + constexpr int simd_width = kMaxLanes; + if constexpr(simd_width <= 64){ + const auto zero = Zero(); + const int vstep = Lanes(); + const int wstep = vstep * 4; + for (npy_intp n = len & -wstep; i < n; i += wstep) { + auto a = LoadU(ip + i + vstep*0); + auto b = LoadU(ip + i + vstep*1); + auto c = LoadU(ip + i + vstep*2); + auto d = LoadU(ip + i + vstep*3); + auto m_a = hn::Eq(a, zero); + auto m_b = hn::Eq(b, zero); + auto m_c = hn::Eq(c, zero); + auto m_d = hn::Eq(d, zero); + auto m_ab = hn::And(m_a, m_b); + auto m_cd = hn::And(m_c, m_d); + + npy_uint64 m = 0; + hn::StoreMaskBits(_Tag(), hn::And(m_ab, m_cd), (uint8_t*)&m); + + if constexpr (simd_width == 64) { + if (m != NPY_MAX_UINT64) + break; + }else if constexpr(simd_width < 64){ + if ((npy_int64)m != ((1LL << vstep) - 1)) + break; + } + } } #endif // NPY_HWY diff --git a/numpy/_core/src/multiarray/arraytypes.h.src b/numpy/_core/src/multiarray/arraytypes.h.src index ca8dbeaa67eb..d7fb26bd3b70 100644 --- a/numpy/_core/src/multiarray/arraytypes.h.src +++ b/numpy/_core/src/multiarray/arraytypes.h.src @@ -1,6 +1,10 @@ #ifndef NUMPY_CORE_SRC_MULTIARRAY_ARRAYTYPES_H_ #define NUMPY_CORE_SRC_MULTIARRAY_ARRAYTYPES_H_ +#ifndef NPY_NO_EXPORT + #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN +#endif + #ifdef __cplusplus extern "C" { #endif From 80752bfeaa4361af80b8fb57f79ea8a3c2d64c75 Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Mon, 18 Aug 2025 15:17:43 +0800 Subject: [PATCH 07/15] fix compile error when sizeof(long double)==sizeof(double)[4] --- numpy/_core/src/multiarray/arraytypes.c.src | 32 +++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/numpy/_core/src/multiarray/arraytypes.c.src b/numpy/_core/src/multiarray/arraytypes.c.src index 52c9bdfb6bcc..fdba8f46e054 100644 --- a/numpy/_core/src/multiarray/arraytypes.c.src +++ b/numpy/_core/src/multiarray/arraytypes.c.src @@ -3428,6 +3428,38 @@ static int #define 
VOID_argmin NULL +/**begin repeat + * #func = argmax, argmin# + * #iop = <, ># + */ +NPY_NO_EXPORT int +LONGDOUBLE_@func@(npy_longdouble *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip)) +{ + if (npy_isnan(*ip)) { + // nan encountered; it's maximal|minimal + *mindx = 0; + return 0; + } + + npy_longdouble mp = *ip; + *mindx = 0; + npy_intp i = 1; + + for (; i < n; ++i) { + npy_longdouble a = ip[i]; + if (!(a @iop@= mp)) { // negated, for correct nan handling + mp = a; + *mindx = i; + if (npy_isnan(mp)) { + // nan encountered, it's maximal|minimal + break; + } + } + } + + return 0; +} +/**end repeat**/ /* ***************************************************************************** From 6ee215ad83011dd71075bc0852e3a7311a90468b Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Mon, 18 Aug 2025 17:19:51 +0800 Subject: [PATCH 08/15] fix compile error when sizeof(long double)==sizeof(double)[5] --- .../_core/src/multiarray/argfunc.dispatch.cpp | 24 +------------- numpy/_core/src/multiarray/arraytypes.c.src | 33 ------------------- 2 files changed, 1 insertion(+), 56 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index 271f6b8e6b86..db7d32ca3713 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -28,17 +28,6 @@ struct OpGt { } }; -template <> -struct OpGt { - HWY_INLINE bool operator()(long double a, long double b) { - return a > b; - } - - HWY_INLINE bool negated_op(long double a, long double b) { - return a <= b; - } -}; - template struct OpLt { #if NPY_HWY @@ -56,17 +45,6 @@ struct OpLt { } }; -template <> -struct OpLt { - HWY_INLINE bool operator()(long double a, long double b) { - return a < b; - } - - HWY_INLINE bool negated_op(long double a, long double b) { - return a >= b; - } -}; - #if NPY_HWY template static HWY_INLINE HWY_ATTR npy_intp @@ -337,7 +315,7 @@ arg_max_min_func(T *ip, npy_intp n, npy_intp *mindx) if constexpr (sizeof(T) <= 2) { *mindx = simd_argfunc_small(ip, n); return 0; - } else if constexpr (sizeof(long double) != sizeof(double) || !std::is_same_v) { + } else { *mindx = simd_argfunc_large(ip, n); return 0; } diff --git a/numpy/_core/src/multiarray/arraytypes.c.src b/numpy/_core/src/multiarray/arraytypes.c.src index fdba8f46e054..8b16343e6e61 100644 --- a/numpy/_core/src/multiarray/arraytypes.c.src +++ b/numpy/_core/src/multiarray/arraytypes.c.src @@ -3428,39 +3428,6 @@ static int #define VOID_argmin NULL -/**begin repeat - * #func = argmax, argmin# - * #iop = <, ># - */ -NPY_NO_EXPORT int -LONGDOUBLE_@func@(npy_longdouble *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip)) -{ - if (npy_isnan(*ip)) { - // nan encountered; it's maximal|minimal - *mindx = 0; - return 0; - } - - npy_longdouble mp = *ip; - *mindx = 0; - npy_intp i = 1; - - for (; i < n; ++i) { - npy_longdouble a = ip[i]; - if (!(a @iop@= mp)) { // negated, for correct nan handling - mp = a; - *mindx = i; - if (npy_isnan(mp)) { - // nan encountered, it's maximal|minimal - break; - } - } - } - - return 0; -} -/**end repeat**/ - /* ***************************************************************************** ** DOT ** From c41b68540ddcb725ee7e604fca21d64156c5aab1 Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Mon, 18 Aug 2025 18:03:37 +0800 Subject: [PATCH 09/15] fix compile error when sizeof(long double)==sizeof(double)[6] --- numpy/_core/src/multiarray/argfunc.dispatch.cpp | 10 ++++++---- 1 file changed, 6 
insertions(+), 4 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index db7d32ca3713..c51da790edf0 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -13,10 +13,11 @@ using namespace np::simd; template struct OpGt { + using Degraded = std::conditional_t, OpGt, OpGt>; #if NPY_HWY template >> HWY_INLINE HWY_ATTR auto operator()(const V &a, const V &b) { - return hn::Gt(a, b); + return hn::Gt(a, b); } #endif HWY_INLINE bool operator()(T a, T b) { @@ -30,6 +31,7 @@ struct OpGt { template struct OpLt { + using Degraded = std::conditional_t, OpLt, OpLt>; #if NPY_HWY template >> HWY_INLINE HWY_ATTR auto operator()(const V &a, const V &b) { @@ -296,7 +298,7 @@ simd_argfunc_large(T *ip, npy_intp len) } #endif //NPY_HWY -template +template , double, T>> HWY_INLINE HWY_ATTR int arg_max_min_func(T *ip, npy_intp n, npy_intp *mindx) { @@ -313,10 +315,10 @@ arg_max_min_func(T *ip, npy_intp n, npy_intp *mindx) #if NPY_HWY if constexpr (kSupportLane) { if constexpr (sizeof(T) <= 2) { - *mindx = simd_argfunc_small(ip, n); + *mindx = simd_argfunc_small(ip, n); return 0; } else { - *mindx = simd_argfunc_large(ip, n); + *mindx = simd_argfunc_large(ip, n); return 0; } } From 4d0d623553ef9bcbe78b1a937a2e93a69ca3485e Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Mon, 18 Aug 2025 20:30:08 +0800 Subject: [PATCH 10/15] fix compile error when sizeof(long double)==sizeof(double)[7] --- numpy/_core/src/multiarray/argfunc.dispatch.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index c51da790edf0..92a94a4be3ef 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -15,8 +15,8 @@ template struct OpGt { using Degraded = std::conditional_t, OpGt, OpGt>; #if NPY_HWY - template >> - HWY_INLINE HWY_ATTR auto operator()(const V &a, const V &b) { + template >, typename V = Vec> + HWY_INLINE HWY_ATTR auto operator()(const V &a, const V &b) const { return hn::Gt(a, b); } #endif @@ -33,8 +33,8 @@ template struct OpLt { using Degraded = std::conditional_t, OpLt, OpLt>; #if NPY_HWY - template >> - HWY_INLINE HWY_ATTR auto operator()(const V &a, const V &b) { + template >, typename V = Vec> + HWY_INLINE HWY_ATTR auto operator()(const V &a, const V &b) const { return hn::Lt(a, b); } #endif From d1d2c648582ce6a22bb95be93c35d132bd441b51 Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Mon, 18 Aug 2025 21:49:09 +0800 Subject: [PATCH 11/15] fix compile error when sizeof(long double)==sizeof(double)[8] --- numpy/_core/src/multiarray/argfunc.dispatch.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index 92a94a4be3ef..6a8adbb687a1 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -15,7 +15,10 @@ template struct OpGt { using Degraded = std::conditional_t, OpGt, OpGt>; #if NPY_HWY - template >, typename V = Vec> + template < + typename D = T, + typename = std::enable_if_t && !std::is_same_v, + typename V = Vec > HWY_INLINE HWY_ATTR auto operator()(const V &a, const V &b) const { return hn::Gt(a, b); } @@ -33,7 +36,10 @@ template struct OpLt { using Degraded = std::conditional_t, OpLt, 
OpLt>; #if NPY_HWY - template >, typename V = Vec> + template < + typename D = T, + typename = std::enable_if_t && !std::is_same_v, + typename V = Vec > HWY_INLINE HWY_ATTR auto operator()(const V &a, const V &b) const { return hn::Lt(a, b); } @@ -313,7 +319,7 @@ arg_max_min_func(T *ip, npy_intp n, npy_intp *mindx) } #if NPY_HWY - if constexpr (kSupportLane) { + if constexpr (kSupportLane && std::is_same_v) { if constexpr (sizeof(T) <= 2) { *mindx = simd_argfunc_small(ip, n); return 0; From c88e8ac35c6660440ff1497427234caf912a366b Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Mon, 18 Aug 2025 23:20:09 +0800 Subject: [PATCH 12/15] fix compile error when sizeof(long double)==sizeof(double)[9] --- .../_core/src/multiarray/argfunc.dispatch.cpp | 42 ++++++++++++------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index 6a8adbb687a1..8597ab0ada29 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -13,12 +13,8 @@ using namespace np::simd; template struct OpGt { - using Degraded = std::conditional_t, OpGt, OpGt>; #if NPY_HWY - template < - typename D = T, - typename = std::enable_if_t && !std::is_same_v, - typename V = Vec > + template >> HWY_INLINE HWY_ATTR auto operator()(const V &a, const V &b) const { return hn::Gt(a, b); } @@ -32,14 +28,21 @@ struct OpGt { } }; +template <> +struct OpGt { + HWY_INLINE bool operator()(long double a, long double b) { + return a > b; + } + + HWY_INLINE bool negated_op(long double a, long double b) { + return a <= b; + } +}; + template struct OpLt { - using Degraded = std::conditional_t, OpLt, OpLt>; #if NPY_HWY - template < - typename D = T, - typename = std::enable_if_t && !std::is_same_v, - typename V = Vec > + template >> HWY_INLINE HWY_ATTR auto operator()(const V &a, const V &b) const { return hn::Lt(a, b); } @@ -53,6 +56,17 @@ struct OpLt { } }; +template <> +struct OpLt { + HWY_INLINE bool operator()(long double a, long double b) { + return a < b; + } + + HWY_INLINE bool negated_op(long double a, long double b) { + return a >= b; + } +}; + #if NPY_HWY template static HWY_INLINE HWY_ATTR npy_intp @@ -304,7 +318,7 @@ simd_argfunc_large(T *ip, npy_intp len) } #endif //NPY_HWY -template , double, T>> +template HWY_INLINE HWY_ATTR int arg_max_min_func(T *ip, npy_intp n, npy_intp *mindx) { @@ -319,12 +333,12 @@ arg_max_min_func(T *ip, npy_intp n, npy_intp *mindx) } #if NPY_HWY - if constexpr (kSupportLane && std::is_same_v) { + if constexpr (kSupportLane && !std::is_same_v) { if constexpr (sizeof(T) <= 2) { - *mindx = simd_argfunc_small(ip, n); + *mindx = simd_argfunc_small(ip, n); return 0; } else { - *mindx = simd_argfunc_large(ip, n); + *mindx = simd_argfunc_large(ip, n); return 0; } } From def237222c2485676a81cec6aeb2395342608b57 Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Mon, 18 Aug 2025 23:56:18 +0800 Subject: [PATCH 13/15] fix compile error when sizeof(long double)==sizeof(double)[10] --- .../_core/src/multiarray/argfunc.dispatch.cpp | 67 ++++++++++++++++--- 1 file changed, 56 insertions(+), 11 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index 8597ab0ada29..e93881614f8b 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -383,14 +383,6 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) 
return 0; \ } -#define DEFINE_ARGFUNC_INNER_FUNCTION_LD(TYPE, KIND, INTR) \ -NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_##KIND) \ -(long double *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *NPY_UNUSED(aip)) \ -{ \ - arg_max_min_func>(ip, n, max_ind); \ - return 0; \ -} - DEFINE_ARGFUNC_INNER_FUNCTION(UBYTE, argmax, Gt, npy_ubyte) DEFINE_ARGFUNC_INNER_FUNCTION(USHORT, argmax, Gt, npy_ushort) DEFINE_ARGFUNC_INNER_FUNCTION(UINT, argmax, Gt, npy_uint) @@ -415,13 +407,66 @@ DEFINE_ARGFUNC_INNER_FUNCTION(LONG, argmin, Lt, npy_long) DEFINE_ARGFUNC_INNER_FUNCTION(LONGLONG, argmin, Lt, npy_longlong) DEFINE_ARGFUNC_INNER_FUNCTION(FLOAT, argmin, Lt, npy_float) DEFINE_ARGFUNC_INNER_FUNCTION(DOUBLE, argmin, Lt, npy_double) -DEFINE_ARGFUNC_INNER_FUNCTION_LD(LONGDOUBLE, argmax, Gt) -DEFINE_ARGFUNC_INNER_FUNCTION_LD(LONGDOUBLE, argmin, Lt) #undef DEFINE_ARGFUNC_INNER_FUNCTION -#undef DEFINE_ARGFUNC_INNER_FUNCTION_LD +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_argmax) +(npy_longdouble *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip)) +{ + if (npy_isnan(*ip)) { + // nan encountered; it's maximal|minimal + *mindx = 0; + return 0; + } + + npy_longdouble mp = *ip; + *mindx = 0; + npy_intp i = 1; + + for (; i < n; ++i) { + npy_longdouble a = ip[i]; + if (!(a <= mp)) { // negated, for correct nan handling + mp = a; + *mindx = i; + if (npy_isnan(mp)) { + // nan encountered, it's maximal|minimal + break; + } + } + } + + return 0; +} + +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_argmin) +(npy_longdouble *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip)) +{ + if (npy_isnan(*ip)) { + // nan encountered; it's maximal|minimal + *mindx = 0; + return 0; + } + + npy_longdouble mp = *ip; + *mindx = 0; + npy_intp i = 1; + + for (; i < n; ++i) { + npy_longdouble a = ip[i]; + if (!(a >= mp)) { // negated, for correct nan handling + mp = a; + *mindx = i; + if (npy_isnan(mp)) { + // nan encountered, it's maximal|minimal + break; + } + } + } + + return 0; +} + NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BOOL_argmax) (npy_bool *ip, npy_intp len, npy_intp *max_ind, PyArrayObject *NPY_UNUSED(aip)) { From 7c048104da15ed108578efeb4ea561384aa7c88c Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Tue, 19 Aug 2025 13:29:40 +0800 Subject: [PATCH 14/15] fix s390x test[1] --- .../_core/src/multiarray/argfunc.dispatch.cpp | 26 +++++++------------ 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index e93881614f8b..b583dd9922a2 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -87,14 +87,10 @@ simd_argfunc_small(T *ip, npy_intp len) HWY_LANES_CONSTEXPR int vstep = Lanes(); const int wstep = vstep*4; - std::vector d_vindices(vstep*4); - for (int vi = 0; vi < wstep; ++vi) { - d_vindices[vi] = vi; - } - const auto vindices_0 = LoadU(d_vindices.data()); - const auto vindices_1 = LoadU(d_vindices.data()+vstep); - const auto vindices_2 = LoadU(d_vindices.data()+vstep*2); - const auto vindices_3 = LoadU(d_vindices.data()+vstep*3); + const auto vindices_0 = hn::Iota(_Tag(), UnsignedT(0)); + const auto vindices_1 = hn::Iota(_Tag(), UnsignedT(vstep)); + const auto vindices_2 = hn::Iota(_Tag(), UnsignedT(vstep*2)); + const auto vindices_3 = hn::Iota(_Tag(), UnsignedT(vstep*3)); const npy_intp max_block = idx_max*wstep & -wstep; npy_intp len0 = len & -wstep; @@ -185,15 +181,11 @@ simd_argfunc_large(T *ip, 
npy_intp len) len0 = NPY_MAX_UINT32; } } - // create index for vector indices - std::vector d_vindices(vstep*4); - for (int vi = 0; vi < wstep; ++vi) { - d_vindices[vi] = vi; - } - const auto vindices_0 = LoadU(d_vindices.data()); - const auto vindices_1 = LoadU(d_vindices.data()+vstep); - const auto vindices_2 = LoadU(d_vindices.data()+vstep*2); - const auto vindices_3 = LoadU(d_vindices.data()+vstep*3); + + const auto vindices_0 = hn::Iota(_Tag(), UnsignedT(0)); + const auto vindices_1 = hn::Iota(_Tag(), UnsignedT(vstep)); + const auto vindices_2 = hn::Iota(_Tag(), UnsignedT(vstep*2)); + const auto vindices_3 = hn::Iota(_Tag(), UnsignedT(vstep*3)); // initialize vector accumulator for highest values and its indexes auto acc_indices = Zero(); From 7c6a68333080584ecfc0f22435e555654cca8b91 Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Tue, 19 Aug 2025 15:28:18 +0800 Subject: [PATCH 15/15] fix s390x test[2] --- .../_core/src/multiarray/argfunc.dispatch.cpp | 67 ++++++++++++------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/numpy/_core/src/multiarray/argfunc.dispatch.cpp b/numpy/_core/src/multiarray/argfunc.dispatch.cpp index b583dd9922a2..9b126dfaf683 100644 --- a/numpy/_core/src/multiarray/argfunc.dispatch.cpp +++ b/numpy/_core/src/multiarray/argfunc.dispatch.cpp @@ -222,6 +222,12 @@ simd_argfunc_large(T *ip, npy_intp len) npy_uint64 nnan = 0; hn::StoreMaskBits(_Tag(), hn::And(nnan_ab, nnan_cd), (uint8_t*)&nnan); +#if HWY_IS_BIG_ENDIAN + static_assert(kMaxLanes <= 8, + "This conversion is not supported for SIMD widths " + "larger than 256 bits."); + nnan = ((uint8_t *)&nnan)[0]; +#endif if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) { npy_uint64 nnan_4[4]; @@ -229,6 +235,12 @@ simd_argfunc_large(T *ip, npy_intp len) hn::StoreMaskBits(_Tag(), nnan_b, (uint8_t*)&(nnan_4[1])); hn::StoreMaskBits(_Tag(), nnan_c, (uint8_t*)&(nnan_4[2])); hn::StoreMaskBits(_Tag(), nnan_d, (uint8_t*)&(nnan_4[3])); +#if HWY_IS_BIG_ENDIAN + nnan_4[0] = ((uint8_t *)&nnan_4[0])[0]; + nnan_4[1] = ((uint8_t *)&nnan_4[1])[0]; + nnan_4[2] = ((uint8_t *)&nnan_4[2])[0]; + nnan_4[3] = ((uint8_t *)&nnan_4[3])[0]; +#endif for (int ni = 0; ni < 4; ++ni) { for (int vi = 0; vi < vstep; ++vi) { if (!((nnan_4[ni] >> vi) & 1)) { @@ -253,6 +265,9 @@ simd_argfunc_large(T *ip, npy_intp len) npy_uint64 nnan = 0; hn::StoreMaskBits(_Tag(), nnan_a, (uint8_t*)&nnan); +#if HWY_IS_BIG_ENDIAN + nnan = ((uint8_t *)&nnan)[0]; +#endif if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) { for (int vi = 0; vi < vstep; ++vi) { @@ -466,32 +481,32 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BOOL_argmax) #if NPY_HWY constexpr int simd_width = kMaxLanes; if constexpr(simd_width <= 64){ - const auto zero = Zero(); - const int vstep = Lanes(); - const int wstep = vstep * 4; - for (npy_intp n = len & -wstep; i < n; i += wstep) { - auto a = LoadU(ip + i + vstep*0); - auto b = LoadU(ip + i + vstep*1); - auto c = LoadU(ip + i + vstep*2); - auto d = LoadU(ip + i + vstep*3); - auto m_a = hn::Eq(a, zero); - auto m_b = hn::Eq(b, zero); - auto m_c = hn::Eq(c, zero); - auto m_d = hn::Eq(d, zero); - auto m_ab = hn::And(m_a, m_b); - auto m_cd = hn::And(m_c, m_d); - - npy_uint64 m = 0; - hn::StoreMaskBits(_Tag(), hn::And(m_ab, m_cd), (uint8_t*)&m); - - if constexpr (simd_width == 64) { - if (m != NPY_MAX_UINT64) - break; - }else if constexpr(simd_width < 64){ - if ((npy_int64)m != ((1LL << vstep) - 1)) - break; - } - } + const auto zero = Zero(); + const int vstep = Lanes(); + const int wstep = vstep * 4; + for 
(npy_intp n = len & -wstep; i < n; i += wstep) { + auto a = LoadU(ip + i + vstep*0); + auto b = LoadU(ip + i + vstep*1); + auto c = LoadU(ip + i + vstep*2); + auto d = LoadU(ip + i + vstep*3); + auto m_a = hn::Eq(a, zero); + auto m_b = hn::Eq(b, zero); + auto m_c = hn::Eq(c, zero); + auto m_d = hn::Eq(d, zero); + auto m_ab = hn::And(m_a, m_b); + auto m_cd = hn::And(m_c, m_d); + + npy_uint64 m = 0; + hn::StoreMaskBits(_Tag(), hn::And(m_ab, m_cd), (uint8_t*)&m); + + if constexpr (simd_width == 64) { + if (m != NPY_MAX_UINT64) + break; + }else if constexpr(simd_width < 64){ + if ((npy_int64)m != ((1LL << vstep) - 1)) + break; + } + } } #endif // NPY_HWY
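
As a side note for reviewers: the hunks above keep a scalar fallback for the cases where no SIMD path applies (long double, unsupported lane types, and loop tails), and its key detail is the negated comparison, !(a <= mp) for argmax, which is true for NaN so the first NaN index wins while ties keep the lowest index. Below is a minimal standalone C++ sketch of that fallback logic only; it is not part of the patch series and the names (scalar_argmax) are illustrative.

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Scalar argmax with NumPy-style NaN handling: the negated comparison
    // `!(a <= mp)` is true when `a` is NaN, so the loop records the first NaN
    // index and stops; on ties the lowest index is kept because a strictly
    // greater value is required to replace `mp`.
    static std::size_t scalar_argmax(const double *ip, std::size_t n)
    {
        if (std::isnan(ip[0])) {
            return 0;                      // NaN at the front is already maximal
        }
        double mp = ip[0];
        std::size_t idx = 0;
        for (std::size_t i = 1; i < n; ++i) {
            const double a = ip[i];
            if (!(a <= mp)) {              // negated, for correct NaN handling
                mp = a;
                idx = i;
                if (std::isnan(mp)) {
                    break;                 // first NaN encountered is the answer
                }
            }
        }
        return idx;
    }

    int main()
    {
        const std::vector<double> v = {1.0, 3.0, 2.0, 3.0};
        // Expect index 1: ties between equal maxima keep the lowest index.
        return scalar_argmax(v.data(), v.size()) == 1 ? 0 : 1;
    }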