[Caffe2] Build perfkernels targeting SVE128

Nicoshev · facebook-github-bot · commit e24bfba108c7 · 2025-07-28T11:00:41.000-07:00
Summary: We are now building perfkernels using SVE/Neon enhancements

Test Plan:
Sigrid Predictor canary

Rollback Plan:

Differential Revision: D78902495
diff --git a/aten/src/ATen/cpu/vec/functional_base.h b/aten/src/ATen/cpu/vec/functional_base.h
@@ -102,31 +102,59 @@ struct VecReduceAllSIMD<float, Op> {
 #endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) &&
        // !defined(C10_MOBILE)
 
-#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
-    !defined(CPU_CAPABILITY_SVE)
+#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
+#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE512)
 template <typename Op>
 struct VecReduceAllSIMD<float, Op> {
   static inline float apply(
       const Op& vec_fun,
       const Vectorized<float>& acc_vec) {
     using Vec = Vectorized<float>;
     Vec v = acc_vec;
+    // 128-bit shuffle
+    svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
+    Vec v1 = svtbl_f32(v, ind);
+    v = vec_fun(v, v1);
+    // 64-bit shuffle
+    ind = svdupq_n_u32(2, 3, 0, 1);
+    v1 = svtbl_f32(v, ind);
+    v = vec_fun(v, v1);
+    // 32-bit shuffle
+    ind = svdupq_n_u32(1, 0, 2, 3);
+    v1 = svtbl_f32(v, ind);
+    v = vec_fun(v, v1);
+    return svlasta(svpfalse(), v);
+  }
+};
+#else
+template <typename Op>
+struct VecReduceAllSIMD<float, Op> {
+  static inline float apply(
+      const Op& vec_fun,
+      const Vectorized<float>& acc_vec) {
+    using Vec = Vectorized<float>;
+    Vec v = acc_vec;
+    float32x4_t vReg = vld1q_f32(reinterpret_cast<const float*>(acc_vec.as_bytes()));
 
     // 64-bit shuffle: [a1+a5, a2+a6, a3+a7, a4+a8, -, -, -, -] -> [a3+a7,
     // a4+a8, a1+a5, a2+a6, -, -, -, -]
-    float32x4_t v1_1 = vextq_f32(v, v, 2);
-    Vec v1 = v1_1;
+    float32x4_t v1_1 = vextq_f32(vReg, vReg, 2);
+
+     __at_align__ float v1[4];
+    vst1q_f32(reinterpret_cast<float*>(&v1), v1_1);
     // [a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, -, -, -, -]
-    v = vec_fun(v, v1);
+    at::vec::Vectorized<float> vf1(v1);
+    v = vec_fun(v, vf1);
 
     // 32-bit shuffle: [a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, -,
     // -, -, -] -> [a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, -, -, -,
     // -]
-    v1_1 = vrev64q_f32(v);
-    v1 = v1_1;
+    v1_1 = vrev64q_f32(vld1q_f32(reinterpret_cast<const float*>(v.as_bytes()))); // shuffle the partially reduced v (not acc_vec) so every lane reaches v[0]
+    vst1q_f32(reinterpret_cast<float*>(&v1), v1_1);
     // [a1+a2+a3+a4+a5+a6+a7+a8, a1+a2+a3+a4+a5+a6+a7+a8,
     // a1+a2+a3+a4+a5+a6+a7+a8, a1+a2+a3+a4+a5+a6+a7+a8, -, -, -, -]
-    v = vec_fun(v, v1);
+    at::vec::Vectorized<float> vf2(v1);
+    v = vec_fun(v, vf2);
 
     return v[0];
   }
@@ -137,38 +165,13 @@ struct VecReduceAllSIMD<float, std::plus<Vectorized<float>>> {
   static inline float apply(
       const std::plus<Vectorized<float>>& vec_fun,
       const Vectorized<float>& acc_vec) {
-    return vaddvq_f32(acc_vec);
+    return vaddvq_f32(vld1q_f32(reinterpret_cast<const float*>(acc_vec.as_bytes())));
   }
 };
+#endif // defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE512)
 #endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
-       // && !defined(CPU_CAPABILITY_SVE)
 
-#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
-    defined(CPU_CAPABILITY_SVE256)
-template <typename Op>
-struct VecReduceAllSIMD<float, Op> {
-  static inline float apply(
-      const Op& vec_fun,
-      const Vectorized<float>& acc_vec) {
-    using Vec = Vectorized<float>;
-    Vec v = acc_vec;
-    // 128-bit shuffle
-    svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
-    Vec v1 = svtbl_f32(v, ind);
-    v = vec_fun(v, v1);
-    // 64-bit shuffle
-    ind = svdupq_n_u32(2, 3, 0, 1);
-    v1 = svtbl_f32(v, ind);
-    v = vec_fun(v, v1);
-    // 32-bit shuffle
-    ind = svdupq_n_u32(1, 0, 2, 3);
-    v1 = svtbl_f32(v, ind);
-    v = vec_fun(v, v1);
-    return svlasta(svpfalse(), v);
-  }
-};
-#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
-       // && defined(CPU_CAPABILITY_SVE256)
+
 
 template <typename scalar_t, typename Op>
 inline scalar_t vec_reduce_all(
diff --git a/aten/src/ATen/cpu/vec/sve/vec_common_sve.h b/aten/src/ATen/cpu/vec/sve/vec_common_sve.h
@@ -12,7 +12,8 @@
 #include <ATen/cpu/vec/sve/vec_double.h>
 #include <ATen/cpu/vec/sve/vec_float.h>
 #include <ATen/cpu/vec/sve/vec_int.h>
-#include <ATen/cpu/vec/sve/vec_qint.h>
+// The SVE qint implementation is broken; include the vec256 qint header instead.
+#include <ATen/cpu/vec/vec256/vec256_qint.h>
 #include <ATen/cpu/vec/sve/vec_bfloat16.h>
 
 namespace at::vec {
@@ -74,12 +75,6 @@ DEFINE_SVE_CAST(int32_t, s32, float, f32)
 DEFINE_SVE_CAST(int16_t, s16, float, f32)
 DEFINE_SVE_CAST(float, f32, double, f64)
 
-#ifdef __ARM_FEATURE_BF16
-DEFINE_SVE_CAST(int64_t, s64, c10::BFloat16, bf16)
-DEFINE_SVE_CAST(int32_t, s32, c10::BFloat16, bf16)
-DEFINE_SVE_CAST(int16_t, s16, c10::BFloat16, bf16)
-#endif // __ARM_FEATURE_BF16
-
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 template<int64_t scale = 1>
@@ -183,9 +178,13 @@ std::pair<
   // group cols crossing lanes:
   //   return {a0, b0, a1, b1, a2, b2, a3, b3}
   //          {a4, b4, a5, b5, a6, b6, a7, b7}
-  return std::make_pair(
-      Vectorized<c10::BFloat16>(svzip1_bf16(a, b)),
-      Vectorized<c10::BFloat16>(svzip2_bf16(a, b)));
+  Vectorized<c10::BFloat16> c;
+  Vectorized<c10::BFloat16> d;
+  svbfloat16_t aReg = svreinterpret_bf16_u64(a.getSve());
+  svbfloat16_t bReg = svreinterpret_bf16_u64(b.getSve());
+  c.setSve(svreinterpret_u64_bf16(svzip1_bf16(aReg, bReg)));
+  d.setSve(svreinterpret_u64_bf16(svzip2_bf16(aReg, bReg)));
+  return std::make_pair(c, d);
 }
 #endif // __ARM_FEATURE_BF16
 
@@ -234,9 +233,13 @@ std::pair<
   // swap lanes:
   //   return {a0, a1, a2, a3, a4, a5, a6, a7}
   //          {b0, b1, b2, b3, b4, b5, b6, b7}
-  return std::make_pair(
-      Vectorized<c10::BFloat16>(svuzp1_bf16((svbfloat16_t)a, (svbfloat16_t)b)),
-      Vectorized<c10::BFloat16>(svuzp2_bf16((svbfloat16_t)a, (svbfloat16_t)b)));
+  Vectorized<c10::BFloat16> c;
+  Vectorized<c10::BFloat16> d;
+  svbfloat16_t aReg = svreinterpret_bf16_u64(a.getSve());
+  svbfloat16_t bReg = svreinterpret_bf16_u64(b.getSve());
+  c.setSve(svreinterpret_u64_bf16(svuzp1_bf16(aReg, bReg)));
+  d.setSve(svreinterpret_u64_bf16(svuzp2_bf16(aReg, bReg)));
+  return std::make_pair(c, d);
 }
 #endif // __ARM_FEATURE_BF16
 
diff --git a/aten/src/ATen/cpu/vec/vec.h b/aten/src/ATen/cpu/vec/vec.h
@@ -2,7 +2,7 @@
 
 #if defined(CPU_CAPABILITY_AVX512)
 #include <ATen/cpu/vec/vec512/vec512.h>
-#elif defined(CPU_CAPABILITY_SVE)
+#elif defined(__aarch64__)
 #include <ATen/cpu/vec/sve/vec_common_sve.h>
 #else
 #include <ATen/cpu/vec/vec128/vec128.h>
diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h
@@ -1375,48 +1375,64 @@ Vectorized<c10::quint8> inline maximum(
 #endif // if defined(CPU_CAPABILITY_AVX2)
 
 #if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
-std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
+std::pair<at::vec::Vectorized<float>, at::vec::Vectorized<float>> inline convert_int8_to_float(
     at::vec::Vectorized<int8_t> src) {
   auto s8x8 = vld1_s8(src.operator const int8_t*());
   auto s16x8 = vmovl_s8(s8x8);
 
   auto s32x4_hi = vmovl_s16(vget_high_s16(s16x8));
   auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8));
 
-  return std::make_pair(
-      Vectorized<float>(vcvtq_f32_s32(s32x4_lo)),
-      Vectorized<float>(vcvtq_f32_s32(s32x4_hi)));
+  __at_align__ float a[4];
+  __at_align__ float b[4];
+
+  vst1q_f32(reinterpret_cast<float*>(&a), vcvtq_f32_s32(s32x4_lo));
+  vst1q_f32(reinterpret_cast<float*>(&b), vcvtq_f32_s32(s32x4_hi));
+
+  return std::make_pair(at::vec::Vectorized<float>(a), at::vec::Vectorized<float>(b));
 }
 
-std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
+std::pair<at::vec::Vectorized<float>, at::vec::Vectorized<float>> inline convert_int8_to_float(
     at::vec::Vectorized<uint8_t> src) {
   auto u8x8 = vld1_u8(src.operator const uint8_t*());
   auto u16x8 = vmovl_u8(u8x8);
   auto u32x4_hi = vmovl_u16(vget_high_u16(u16x8));
   auto u32x4_lo = vmovl_u16(vget_low_u16(u16x8));
 
-  return std::make_pair(
-      Vectorized<float>(vcvtq_f32_u32(u32x4_lo)),
-      Vectorized<float>(vcvtq_f32_u32(u32x4_hi)));
+  __at_align__ float a[4];
+  __at_align__ float b[4];
+
+  vst1q_f32(reinterpret_cast<float*>(&a), vcvtq_f32_u32(u32x4_lo));
+  vst1q_f32(reinterpret_cast<float*>(&b), vcvtq_f32_u32(u32x4_hi));
+
+  return std::make_pair(at::vec::Vectorized<float>(a), at::vec::Vectorized<float>(b));
 }
 
-Vectorized<float> inline convert_int8_half_register_to_float(
+at::vec::Vectorized<float> inline convert_int8_half_register_to_float(
     at::vec::Vectorized<int8_t> src) {
   auto s8x8 = vld1_s8(src.operator const int8_t*());
   auto s16x8 = vmovl_s8(s8x8);
 
   auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8));
 
-  return Vectorized<float>(vcvtq_f32_s32(s32x4_lo));
+  __at_align__ float r[4];
+
+  vst1q_f32(reinterpret_cast<float*>(&r), vcvtq_f32_s32(s32x4_lo));
+
+  return at::vec::Vectorized<float>(r);
 }
 
-Vectorized<float> inline convert_int8_half_register_to_float(
+at::vec::Vectorized<float> inline convert_int8_half_register_to_float(
     at::vec::Vectorized<uint8_t> src) {
   auto u8x8 = vld1_u8(src.operator const uint8_t*());
   auto u16x8 = vmovl_u8(u8x8);
   auto u32x4_lo = vmovl_u16(vget_low_u16(u16x8));
 
-  return Vectorized<float>(vcvtq_f32_u32(u32x4_lo));
+  __at_align__ float r[4];
+
+  vst1q_f32(reinterpret_cast<float*>(&r), vcvtq_f32_u32(u32x4_lo));
+
+  return at::vec::Vectorized<float>(r);
 }
 
 #endif
diff --git a/aten/src/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.cpp b/aten/src/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.cpp
@@ -187,8 +187,7 @@ dot_with_fp32_arith_main_inner_loop_bfdot(
   const auto temp_vec2 = vld1q_bf16(
       reinterpret_cast<const bfloat16_t*>(
           &vec2[registerPairIndex * vec::Vectorized<BFloat16>::size()]));
-  sum[registerPairIndex] =
-    vbfdotq_f32(sum[registerPairIndex], temp_vec1, temp_vec2);
+  sum[registerPairIndex].setNeon(vreinterpretq_p128_f32(vbfdotq_f32(vreinterpretq_f32_p128(sum[registerPairIndex].getNeon()), temp_vec1, temp_vec2)));
 }
 
 TARGET_ARM_BF16_ATTRIBUTE C10_ALWAYS_INLINE
@@ -200,7 +199,7 @@ void dot_with_fp32_arith_vectorized_tail_inner_loop_bfdot(
   // See NOTE[Intrinsics in bfdot variant] above.
   const auto temp_vec1 = vld1q_bf16(reinterpret_cast<const bfloat16_t*>(&vec1[idx]));
   const auto temp_vec2 = vld1q_bf16(reinterpret_cast<const bfloat16_t*>(&vec2[idx]));
-  *tail_sum = vbfdotq_f32(*tail_sum, temp_vec1, temp_vec2);
+  tail_sum->setNeon(vreinterpretq_p128_f32(vbfdotq_f32(vreinterpretq_f32_p128(tail_sum->getNeon()), temp_vec1, temp_vec2)));
 }
 
 #else
@@ -214,8 +213,14 @@ std::pair<vec::Vectorized<float>, vec::Vectorized<float>> fmadd(
     const vec::Vectorized<c10::Half>& b,
     const vec::Vectorized<float>& acc_low,
     const vec::Vectorized<float>& acc_high) {
-#if defined(__ARM_FEATURE_FP16_FML) && !defined(CPU_CAPABILITY_SVE)
-  return std::make_pair(vfmlalq_low_f16(acc_low, a, b), vfmlalq_high_f16(acc_high, a, b));
+#if defined(__aarch64__) && ((defined(__ARM_FEATURE_FP16_FML) && !defined(__ARM_FEATURE_SVE)) || (defined(CPU_CAPABILITY_SVE128)))
+  float16x8_t aReg = vreinterpretq_f16_p128(a.getNeon());
+  float16x8_t bReg = vreinterpretq_f16_p128(b.getNeon());
+  vec::Vectorized<float> c;
+  vec::Vectorized<float> d;
+  c.setNeon(vreinterpretq_p128_f32(vfmlalq_low_f16(vreinterpretq_f32_p128(acc_low.getNeon()), aReg, bReg)));
+  d.setNeon(vreinterpretq_p128_f32(vfmlalq_high_f16(vreinterpretq_f32_p128(acc_high.getNeon()), aReg, bReg)));
+  return std::make_pair(c, d);
 #else
   const auto [a_float_low, a_float_high] = convert_half_float(a);
   const auto [b_float_low, b_float_high] = convert_half_float(b);
@@ -235,13 +240,18 @@ std::pair<vec::Vectorized<float>, vec::Vectorized<float>> fmadd(
 
 // Return a + b_low * c_low + b_high * c_high
 vec::Vectorized<float> fmadd(vec::Vectorized<float> a, vec::Vectorized<Half> b, vec::Vectorized<Half> c) {
-#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_FML) && !defined(__ARM_FEATURE_SVE)
+#if defined(__aarch64__) && ((defined(__ARM_FEATURE_FP16_FML) && !defined(__ARM_FEATURE_SVE)) || (defined(CPU_CAPABILITY_SVE128)))
   // NOTE: this instruction is an optional instruction in ARM v8.2 and
   // v8.3, but mandatory in v8.4 per
   // https://developer.arm.com/documentation/ddi0596/2021-03/SIMD-FP-Instructions/FMLAL--FMLAL2--vector---Floating-point-fused-Multiply-Add-Long-to-accumulator--vector--?lang=en
   // I'm not certain that I have the right feature test macro.
-  vec::Vectorized<float> first = vfmlalq_low_f16(a, b, c);
-  return vfmlalq_high_f16(first, b, c);
+  float32x4_t aReg = vreinterpretq_f32_p128(a.getNeon());
+  float16x8_t bReg = vreinterpretq_f16_p128(b.getNeon()); // b holds Half lanes: vfmlalq_*_f16 takes float16x8_t multiplicands
+  float16x8_t cReg = vreinterpretq_f16_p128(c.getNeon());
+  vec::Vectorized<float> res;
+  res.setNeon(vreinterpretq_p128_f32(vfmlalq_low_f16(aReg, bReg, cReg)));
+  res.setNeon(vreinterpretq_p128_f32(vfmlalq_high_f16(vreinterpretq_f32_p128(res.getNeon()), bReg, cReg)));
+  return res;
 #else
   const auto [b_float_low, b_float_high] = convert_half_float(b);
   const auto [c_float_low, c_float_high] = convert_half_float(c);
diff --git a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp
@@ -34,6 +34,9 @@
 #if defined(__ARM_NEON__) || defined(__aarch64__)
 #include <ATen/quantized/Quantizer.h>
 #include <arm_neon.h>
+#if defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE128)
+#include <ATen/cpu/vec/sve/vec_common_sve.h>
+#endif
 #endif
 
 
diff --git a/aten/src/ATen/native/quantized/cpu/qrelu.cpp b/aten/src/ATen/native/quantized/cpu/qrelu.cpp
@@ -26,6 +26,10 @@
 #include <ATen/ops/relu_native.h>
 #endif
 
+#if defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE128)
+#include <ATen/cpu/vec/sve/vec_common_sve.h>
+#endif
+
 #include <algorithm>
 
 namespace at::native {
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
@@ -6,7 +6,6 @@
 #include <ATen/Parallel.h>
 #include <ATen/ScalarOps.h>
 #include <ATen/TensorUtils.h>
-#include <ATen/cpu/vec/functional.h>
 #include <ATen/cpu/vec/vec.h>
 #include <ATen/native/Fill.h>
 #include <ATen/native/IndexingUtils.h>
diff --git a/torch/nativert/kernels/GeneratedNativeStaticDispatchKernels.cpp b/torch/nativert/kernels/GeneratedNativeStaticDispatchKernels.cpp
@@ -6,7 +6,6 @@
 #include <ATen/Parallel.h>
 #include <ATen/ScalarOps.h>
 #include <ATen/TensorUtils.h>
-#include <ATen/cpu/vec/functional.h>
 #include <ATen/cpu/vec/vec.h>
 #include <ATen/native/EmbeddingBag.h>
 #include <ATen/native/Fill.h>