Skip to content

[Caffe2] Add float batch Box-Cox SVE128 implementation #159778

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion aten/src/ATen/Version.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,13 @@ std::string get_cpu_capability() {
#elif defined(HAVE_ZVECTOR_CPU_DEFINITION)
case native::CPUCapability::ZVECTOR:
return "Z VECTOR";
#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
#elif defined(HAVE_SVE_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
case native::CPUCapability::SVE128:
return "SVE128";
case native::CPUCapability::SVE256:
return "SVE256";
case native::CPUCapability::SVE512:
return "SVE512";
#else
case native::CPUCapability::AVX2:
return "AVX2";
Expand Down
56 changes: 27 additions & 29 deletions aten/src/ATen/cpu/vec/functional_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,31 @@ struct VecReduceAllSIMD<float, Op> {
#endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) &&
// !defined(C10_MOBILE)

#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
!defined(CPU_CAPABILITY_SVE)
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE512)
// Horizontal reduction of a Vectorized<float> for the fixed-width SVE builds
// selected by the enclosing guard (CPU_CAPABILITY_SVE256 / CPU_CAPABILITY_SVE512).
//
// The vector is folded in halves: at each step the upper part of the still
// relevant lanes is permuted down with svtbl and combined with the lower part
// through `vec_fun`, until the complete result sits in lane 0.
//
// vec_fun: binary, lane-wise combiner applied to two Vectorized<float>.
// acc_vec: the vector whose lanes are reduced.
// Returns lane 0 of the fully folded vector.
template <typename Op>
struct VecReduceAllSIMD<float, Op> {
  static inline float apply(
      const Op& vec_fun,
      const Vectorized<float>& acc_vec) {
    using Vec = Vectorized<float>;
    Vec v = acc_vec;
#if defined(CPU_CAPABILITY_SVE512)
    // 256-bit shuffle
    // NOTE(review): assumes Vectorized<float> holds 16 lanes in the SVE512
    // build - confirm. Without this step lanes 8..15 never reach lane 0,
    // because the svdupq patterns below only address lanes 0..7.
    // svindex_u32(8, 1) yields indices 8, 9, 10, ...; indices past the end of
    // the vector make svtbl return 0, which only affects lanes >= 8 and those
    // are not read again.
    {
      svuint32_t ind256 = svindex_u32(8, 1);
      Vec v_hi = svtbl_f32(v, ind256);
      v = vec_fun(v, v_hi);
    }
#endif
    // 128-bit shuffle (svdupq repeats the 4-index pattern in every 128-bit
    // segment, so lanes 0..3 receive lanes 4..7)
    svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
    Vec v1 = svtbl_f32(v, ind);
    v = vec_fun(v, v1);
    // 64-bit shuffle (lanes 0..1 receive lanes 2..3)
    ind = svdupq_n_u32(2, 3, 0, 1);
    v1 = svtbl_f32(v, ind);
    v = vec_fun(v, v1);
    // 32-bit shuffle (lane 0 receives lane 1)
    ind = svdupq_n_u32(1, 0, 2, 3);
    v1 = svtbl_f32(v, ind);
    v = vec_fun(v, v1);
    // With an all-false predicate svlasta extracts element 0.
    return svlasta(svpfalse(), v);
  }
};
#else
template <typename Op>
struct VecReduceAllSIMD<float, Op> {
static inline float apply(
Expand Down Expand Up @@ -140,35 +163,10 @@ struct VecReduceAllSIMD<float, std::plus<Vectorized<float>>> {
return vaddvq_f32(acc_vec);
}
};
#endif // defined(CPU_CAPABILITY_SVE256)
#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
// && !defined(CPU_CAPABILITY_SVE)

#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
defined(CPU_CAPABILITY_SVE256)
template <typename Op>
struct VecReduceAllSIMD<float, Op> {
static inline float apply(
const Op& vec_fun,
const Vectorized<float>& acc_vec) {
using Vec = Vectorized<float>;
Vec v = acc_vec;
// 128-bit shuffle
svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
Vec v1 = svtbl_f32(v, ind);
v = vec_fun(v, v1);
// 64-bit shuffle
ind = svdupq_n_u32(2, 3, 0, 1);
v1 = svtbl_f32(v, ind);
v = vec_fun(v, v1);
// 32-bit shuffle
ind = svdupq_n_u32(1, 0, 2, 3);
v1 = svtbl_f32(v, ind);
v = vec_fun(v, v1);
return svlasta(svpfalse(), v);
}
};
#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
// && defined(CPU_CAPABILITY_SVE256)


template <typename scalar_t, typename Op>
inline scalar_t vec_reduce_all(
Expand Down
13 changes: 12 additions & 1 deletion aten/src/ATen/cpu/vec/sve/sve_helper.h
Original file line number Diff line number Diff line change
@@ -1,10 +1,21 @@
#pragma once

#include <c10/macros/Macros.h>
#include <cstdint>

#include <ATen/cpu/vec/intrinsics.h>

#include <ATen/cpu/vec/vec_base.h>

#if defined(CPU_CAPABILITY_SVE)
#if defined(__aarch64__) && (defined(AT_BUILD_ARM_VEC256_WITH_SLEEF) || defined(AT_BUILD_ARM_VECSVE_WITH_SLEEF))
#define SLEEF_STATIC_LIBS
#include <sleef.h>
#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
#else
#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
#endif

#if defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE128)

// Define the data type of VLS(vector-length specific).
typedef svbool_t vls_pred_t
Expand Down
19 changes: 13 additions & 6 deletions aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,6 @@

#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/sve/sve_helper.h>
#include <ATen/cpu/vec/sve/vec_common_sve.h>
#include <ATen/cpu/vec/sve/vec_float.h>
#include <ATen/cpu/vec/vec_base.h>
#include <c10/util/bit_cast.h>
#include <cmath>
namespace at {
namespace vec {
Expand All @@ -19,7 +15,8 @@ namespace vec {
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {

#if defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16)
#if (defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE512)) && \
defined(__ARM_FEATURE_BF16)

// Trait specialization: reports `true` for BFloat16 in this build
// configuration.
template <>
struct is_vec_specialized_for<BFloat16> : std::true_type {};
Expand All @@ -39,6 +36,8 @@ class Vectorized<BFloat16> {

Vectorized();
Vectorized(svbfloat16_t v) : values(v) {}
Vectorized(float val);
Vectorized(double val);
Vectorized(int val);
Vectorized(BFloat16 val);

Expand Down Expand Up @@ -586,7 +585,15 @@ Vectorized<BFloat16> inline fmadd(
return a * b + c;
}

#endif // defined(CPU_CAPABILITY_SVE) && defined(__ARM_FEATURE_BF16)
// The two macro invocations below are compiled with -Wignored-qualifiers
// disabled; the previous diagnostic state is restored by the trailing `pop`.
// NOTE(review): the macros are defined elsewhere - presumably they emit the
// non-vectorized Half conversion/load helpers; confirm against their definition.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wignored-qualifiers"

CONVERT_NON_VECTORIZED_INIT(Half, half);
LOAD_FP32_NON_VECTORIZED_INIT(Half, fp16);

#pragma GCC diagnostic pop

#else // defined(CPU_CAPABILITY_SVE) && defined(__ARM_FEATURE_BF16)

} // namespace CPU_CAPABILITY
} // namespace vec
Expand Down
Loading
Loading