pytorch
diff --git a/aten/src/ATen/cpu/vec/functional_base.h b/aten/src/ATen/cpu/vec/functional_base.h
Lines changed: 27 additions & 29 deletions
diff --git a/aten/src/ATen/cpu/vec/sve/sve_helper.h b/aten/src/ATen/cpu/vec/sve/sve_helper.h
Lines changed: 10 additions & 2 deletions
diff --git a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
Lines changed: 2 additions & 56 deletions
@@ -102,8 +102,31 @@ struct VecReduceAllSIMD<float, Op> {
 #endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) &&
        // !defined(C10_MOBILE)
 
-#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
-    !defined(CPU_CAPABILITY_SVE)
+#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
+#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE512)
+// Reduces all lanes of an SVE float vector (SVE256 or SVE512) to one scalar.
+template <typename Op>
+struct VecReduceAllSIMD<float, Op> {
+  static inline float apply(
+      const Op& vec_fun,
+      const Vectorized<float>& acc_vec) {
+    using Vec = Vectorized<float>;
+    Vec v = acc_vec;
+    // Halving tree: each pass folds lanes [stride, 2*stride) onto lanes
+    // [0, stride). A fixed 4/2/1 shuffle sequence only reaches lanes 0-7 and
+    // would drop lanes 8-15 of a 512-bit vector.
+    for (uint32_t stride = static_cast<uint32_t>(Vec::size()) / 2; stride > 0; stride /= 2) {
+      // ind = {stride, stride + 1, ...}; svtbl returns 0 for out-of-range
+      // indices, which only reach lanes >= stride that are never read again.
+      svuint32_t ind = svindex_u32(stride, 1);
+      Vec v1 = svtbl_f32(v, ind);
+      v = vec_fun(v, v1);
+    }
+    // With an all-false predicate svlasta returns the first element (lane 0).
+    return svlasta(svpfalse(), v);
+  }
+};
+#else
 template <typename Op>
 struct VecReduceAllSIMD<float, Op> {
   static inline float apply(
@@ -140,35 +163,10 @@ struct VecReduceAllSIMD<float, std::plus<Vectorized<float>>> {
     return vaddvq_f32(acc_vec);
   }
 };
+#endif // defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE512)
 #endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
-       // && !defined(CPU_CAPABILITY_SVE)
 
-#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
-    defined(CPU_CAPABILITY_SVE256)
-template <typename Op>
-struct VecReduceAllSIMD<float, Op> {
-  static inline float apply(
-      const Op& vec_fun,
-      const Vectorized<float>& acc_vec) {
-    using Vec = Vectorized<float>;
-    Vec v = acc_vec;
-    // 128-bit shuffle
-    svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
-    Vec v1 = svtbl_f32(v, ind);
-    v = vec_fun(v, v1);
-    // 64-bit shuffle
-    ind = svdupq_n_u32(2, 3, 0, 1);
-    v1 = svtbl_f32(v, ind);
-    v = vec_fun(v, v1);
-    // 32-bit shuffle
-    ind = svdupq_n_u32(1, 0, 2, 3);
-    v1 = svtbl_f32(v, ind);
-    v = vec_fun(v, v1);
-    return svlasta(svpfalse(), v);
-  }
-};
-#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
-       // && defined(CPU_CAPABILITY_SVE256)
+
 
 template <typename scalar_t, typename Op>
 inline scalar_t vec_reduce_all(
 
@@ -1,13 +1,21 @@
 #pragma once
 
-#include <cstdint>
 #include <c10/macros/Macros.h>
+#include <cstdint>
 
 #include <ATen/cpu/vec/intrinsics.h>
 
 #include <ATen/cpu/vec/vec_base.h>
 
-#if defined(CPU_CAPABILITY_SVE)
+#if defined(__aarch64__) && (defined(AT_BUILD_ARM_VEC256_WITH_SLEEF) || defined(AT_BUILD_ARM_VECSVE_WITH_SLEEF))
+#define SLEEF_STATIC_LIBS
+#include <sleef.h>
+#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
+#else // SLEEF not enabled for this build: USE_SLEEF keeps only the fallback
+#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
+#endif // __aarch64__ && (AT_BUILD_ARM_VEC256_WITH_SLEEF || AT_BUILD_ARM_VECSVE_WITH_SLEEF)
+
+#if defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE128)
 
 // Define the data type of VLS(vector-length specific).
 typedef svbool_t vls_pred_t
 
@@ -2,11 +2,6 @@
 
 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/sve/sve_helper.h>
-#include <ATen/cpu/vec/sve/vec_common_sve.h>
-#include <ATen/cpu/vec/sve/vec_float.h>
-#include <ATen/cpu/vec/intrinsics.h>
-#include <ATen/cpu/vec/vec_base.h>
-#include <c10/util/bit_cast.h>
 #include <cmath>
 namespace at {
 namespace vec {
@@ -20,43 +15,8 @@ namespace vec {
 // accessed as `at::vec`.
 inline namespace CPU_CAPABILITY {
 
-#define CONVERT_NON_VECTORIZED_INIT(type, name) \
-inline std::tuple<Vectorized<float>, Vectorized<float>> convert_##name##_float(const Vectorized<type>& a) { \
-  constexpr int64_t K = Vectorized<type>::size(); \
-  __at_align__ float arr[K]; \
-  __at_align__ type arr2[K]; \
-  a.store(arr2); \
-  convert(arr2, arr, K); \
-  return std::make_tuple( \
-      Vectorized<float>::loadu(arr), \
-      Vectorized<float>::loadu(arr + Vectorized<float>::size())); \
-} \
-inline Vectorized<type> convert_float_##name(const Vectorized<float>& a, const Vectorized<float>& b) { \
-  constexpr int64_t K = Vectorized<type>::size(); \
-  __at_align__ float arr[K]; \
-  __at_align__ type arr2[K]; \
-  a.store(arr); \
-  b.store(arr + Vectorized<float>::size()); \
-  convert(arr, arr2, K); \
-  return Vectorized<type>::loadu(arr2); \
-}
-
-#define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \
-inline void load_fp32_from_##name(const type *data, Vectorized<float>& out) { \
-  __at_align__ float values[Vectorized<float>::size()]; \
-  for (const auto k : c10::irange(Vectorized<float>::size())) { \
-    values[k] = data[k]; \
-  } \
-  out = Vectorized<float>::loadu(values); \
-} \
-\
-inline void load_fp32_from_##name(const type *data, Vectorized<float>& out1, Vectorized<float>& out2) { \
-  load_fp32_from_##name(data, out1); \
-  data += Vectorized<float>::size(); \
-  load_fp32_from_##name(data, out2); \
-}
-
-#if defined(CPU_CAPABILITY_SVE) && defined(__ARM_FEATURE_BF16)
+#if (defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE512)) && \
+    defined(__ARM_FEATURE_BF16)
 
 template <>
 struct is_vec_specialized_for<BFloat16> : std::bool_constant<true> {};
@@ -635,20 +595,6 @@ LOAD_FP32_NON_VECTORIZED_INIT(Half, fp16);
 
 #else // defined(CPU_CAPABILITY_SVE) && defined(__ARM_FEATURE_BF16)
 
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wignored-qualifiers"
-
-CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16);
-CONVERT_NON_VECTORIZED_INIT(Half, half);
-
-LOAD_FP32_NON_VECTORIZED_INIT(BFloat16, bf16);
-LOAD_FP32_NON_VECTORIZED_INIT(Half, fp16);
-
-#pragma GCC diagnostic pop
-
-#endif // defined(CPU_CAPABILITY_SVE) && defined(__ARM_FEATURE_BF16)
-
-
 } // namespace CPU_CAPABILITY
 } // namespace vec
 } // namespace at