Skip to content

Commit bfca1bc

Browse files
Nicolas De Carli and facebook-github-bot
authored and committed
[Caffe2] Differentiate SVE128 from SVE for vectorization
Summary: We are introducing the SVE128 vectorized<> layer. The idea is to differentiate SVE128 perfkernels from the general SVE implementation. Mixing NEON and SVE should maximize performance on SVE128 CPUs. Test Plan: Sigrid Predictor canary. Differential Revision: D78902495
1 parent 685e673 commit bfca1bc

19 files changed

+543
-256
lines changed

aten/src/ATen/cpu/vec/functional_base.h

Lines changed: 27 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -102,8 +102,31 @@ struct VecReduceAllSIMD<float, Op> {
102102
#endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) &&
103103
// !defined(C10_MOBILE)
104104

105-
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
106-
!defined(CPU_CAPABILITY_SVE)
105+
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
106+
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE512)
107+
template <typename Op>
108+
struct VecReduceAllSIMD<float, Op> {
109+
static inline float apply(
110+
const Op& vec_fun,
111+
const Vectorized<float>& acc_vec) {
112+
using Vec = Vectorized<float>;
113+
Vec v = acc_vec;
114+
// 128-bit shuffle
115+
svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
116+
Vec v1 = svtbl_f32(v, ind);
117+
v = vec_fun(v, v1);
118+
// 64-bit shuffle
119+
ind = svdupq_n_u32(2, 3, 0, 1);
120+
v1 = svtbl_f32(v, ind);
121+
v = vec_fun(v, v1);
122+
// 32-bit shuffle
123+
ind = svdupq_n_u32(1, 0, 2, 3);
124+
v1 = svtbl_f32(v, ind);
125+
v = vec_fun(v, v1);
126+
return svlasta(svpfalse(), v);
127+
}
128+
};
129+
#else
107130
template <typename Op>
108131
struct VecReduceAllSIMD<float, Op> {
109132
static inline float apply(
@@ -140,35 +163,10 @@ struct VecReduceAllSIMD<float, std::plus<Vectorized<float>>> {
140163
return vaddvq_f32(acc_vec);
141164
}
142165
};
166+
#endif // defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE512)
143167
#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
144-
// && !defined(CPU_CAPABILITY_SVE)
145168

146-
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
147-
defined(CPU_CAPABILITY_SVE256)
148-
template <typename Op>
149-
struct VecReduceAllSIMD<float, Op> {
150-
static inline float apply(
151-
const Op& vec_fun,
152-
const Vectorized<float>& acc_vec) {
153-
using Vec = Vectorized<float>;
154-
Vec v = acc_vec;
155-
// 128-bit shuffle
156-
svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
157-
Vec v1 = svtbl_f32(v, ind);
158-
v = vec_fun(v, v1);
159-
// 64-bit shuffle
160-
ind = svdupq_n_u32(2, 3, 0, 1);
161-
v1 = svtbl_f32(v, ind);
162-
v = vec_fun(v, v1);
163-
// 32-bit shuffle
164-
ind = svdupq_n_u32(1, 0, 2, 3);
165-
v1 = svtbl_f32(v, ind);
166-
v = vec_fun(v, v1);
167-
return svlasta(svpfalse(), v);
168-
}
169-
};
170-
#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
171-
// && defined(CPU_CAPABILITY_SVE256)
169+
172170

173171
template <typename scalar_t, typename Op>
174172
inline scalar_t vec_reduce_all(

aten/src/ATen/cpu/vec/sve/sve_helper.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,21 @@
11
#pragma once
22

3-
#include <cstdint>
43
#include <c10/macros/Macros.h>
4+
#include <cstdint>
55

66
#include <ATen/cpu/vec/intrinsics.h>
77

88
#include <ATen/cpu/vec/vec_base.h>
99

10-
#if defined(CPU_CAPABILITY_SVE)
10+
#if defined(__aarch64__) && (defined(AT_BUILD_ARM_VEC256_WITH_SLEEF) || defined(AT_BUILD_ARM_VECSVE_WITH_SLEEF))
11+
#define SLEEF_STATIC_LIBS
12+
#include <sleef.h>
13+
#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
14+
#else
15+
#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
16+
#endif
17+
18+
#if defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE128)
1119

1220
// Define the data type of VLS(vector-length specific).
1321
typedef svbool_t vls_pred_t

aten/src/ATen/cpu/vec/sve/vec_bfloat16.h

Lines changed: 2 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,6 @@
22

33
#include <ATen/cpu/vec/intrinsics.h>
44
#include <ATen/cpu/vec/sve/sve_helper.h>
5-
#include <ATen/cpu/vec/sve/vec_common_sve.h>
6-
#include <ATen/cpu/vec/sve/vec_float.h>
7-
#include <ATen/cpu/vec/intrinsics.h>
8-
#include <ATen/cpu/vec/vec_base.h>
9-
#include <c10/util/bit_cast.h>
105
#include <cmath>
116
namespace at {
127
namespace vec {
@@ -20,43 +15,8 @@ namespace vec {
2015
// accessed as `at::vec`.
2116
inline namespace CPU_CAPABILITY {
2217

23-
#define CONVERT_NON_VECTORIZED_INIT(type, name) \
24-
inline std::tuple<Vectorized<float>, Vectorized<float>> convert_##name##_float(const Vectorized<type>& a) { \
25-
constexpr int64_t K = Vectorized<type>::size(); \
26-
__at_align__ float arr[K]; \
27-
__at_align__ type arr2[K]; \
28-
a.store(arr2); \
29-
convert(arr2, arr, K); \
30-
return std::make_tuple( \
31-
Vectorized<float>::loadu(arr), \
32-
Vectorized<float>::loadu(arr + Vectorized<float>::size())); \
33-
} \
34-
inline Vectorized<type> convert_float_##name(const Vectorized<float>& a, const Vectorized<float>& b) { \
35-
constexpr int64_t K = Vectorized<type>::size(); \
36-
__at_align__ float arr[K]; \
37-
__at_align__ type arr2[K]; \
38-
a.store(arr); \
39-
b.store(arr + Vectorized<float>::size()); \
40-
convert(arr, arr2, K); \
41-
return Vectorized<type>::loadu(arr2); \
42-
}
43-
44-
#define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \
45-
inline void load_fp32_from_##name(const type *data, Vectorized<float>& out) { \
46-
__at_align__ float values[Vectorized<float>::size()]; \
47-
for (const auto k : c10::irange(Vectorized<float>::size())) { \
48-
values[k] = data[k]; \
49-
} \
50-
out = Vectorized<float>::loadu(values); \
51-
} \
52-
\
53-
inline void load_fp32_from_##name(const type *data, Vectorized<float>& out1, Vectorized<float>& out2) { \
54-
load_fp32_from_##name(data, out1); \
55-
data += Vectorized<float>::size(); \
56-
load_fp32_from_##name(data, out2); \
57-
}
58-
59-
#if defined(CPU_CAPABILITY_SVE) && defined(__ARM_FEATURE_BF16)
18+
#if (defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE512)) && \
19+
defined(__ARM_FEATURE_BF16)
6020

6121
template <>
6222
struct is_vec_specialized_for<BFloat16> : std::bool_constant<true> {};
@@ -635,20 +595,6 @@ LOAD_FP32_NON_VECTORIZED_INIT(Half, fp16);
635595

636596
#else // (defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE512)) && defined(__ARM_FEATURE_BF16)
637597

638-
#pragma GCC diagnostic push
639-
#pragma GCC diagnostic ignored "-Wignored-qualifiers"
640-
641-
CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16);
642-
CONVERT_NON_VECTORIZED_INIT(Half, half);
643-
644-
LOAD_FP32_NON_VECTORIZED_INIT(BFloat16, bf16);
645-
LOAD_FP32_NON_VECTORIZED_INIT(Half, fp16);
646-
647-
#pragma GCC diagnostic pop
648-
649-
#endif // defined(CPU_CAPABILITY_SVE) && defined(__ARM_FEATURE_BF16)
650-
651-
652598
} // namespace CPU_CAPABILITY
653599
} // namespace vec
654600
} // namespace at

0 commit comments

Comments
 (0)