#if !defined(_DEFAULT_FN_ATTRS)
#if defined(__HIP__) || defined(__CUDA__)
#define _DEFAULT_FN_ATTRS __attribute__((device))
#else
#define _DEFAULT_FN_ATTRS
#endif
#endif
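// For illustration only (not part of the header): because of the guard
// above, a translation unit may pre-define _DEFAULT_FN_ATTRS before the
// include to choose its own attributes. A hedged sketch, assuming a HIP or
// CUDA compile where the wrappers should be callable from host and device:
//
//   #define _DEFAULT_FN_ATTRS __attribute__((host)) __attribute__((device))
//   #include <gpuintrin.h>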
#include <stdint.h>

#if !defined(__cplusplus)
_Pragma("push_macro(\"bool\")");
#define bool _Bool
#endif
_Pragma("omp begin declare target device_type(nohost)");
_Pragma("omp begin declare variant match(device = {kind(gpu)})");

// Forward declares the generic fallback implementations defined later so the
// target-specific headers below can refer to them.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x);
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x);
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x);
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x);

_Pragma("omp end declare variant");
_Pragma("omp end declare target");

// Pick up the implementation for the architecture being compiled for.
#if defined(__NVPTX__)
#include <nvptxintrin.h>
#elif defined(__AMDGPU__)
#include <amdgpuintrin.h>
#elif !defined(_OPENMP)
#error "This header is only meant to be used on GPU architectures."
#endif

_Pragma("omp begin declare target device_type(nohost)");
_Pragma("omp begin declare variant match(device = {kind(gpu)})");
// Each grid query dispatches on the requested dimension; __gpu_block_id,
// __gpu_num_threads, and __gpu_thread_id follow the same pattern, each
// ending in __builtin_unreachable() for an invalid dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks(int __dim) {
  switch (__dim) {
  case 0: return __gpu_num_blocks_x();
  case 1: return __gpu_num_blocks_y();
  case 2: return __gpu_num_blocks_z();
  default: __builtin_unreachable();
  }
}
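// For illustration only (not part of the header): the conventional flattened
// global thread id in the x-dimension, a sketch built on the dimension
// queries above; the __example_* name is hypothetical.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
__example_global_thread_id_x(void) {
  return __gpu_block_id(0) * __gpu_num_threads(0) + __gpu_thread_id(0);
}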
// Get the id of the first active thread inside the lane.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_first_lane_id(uint64_t __lane_mask) {
  return __builtin_ffsll(__lane_mask) - 1;
}
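// For illustration only (not part of the header): electing a single leader
// lane so that one thread per wavefront performs an atomic update. A sketch;
// the __example_* name and the counter parameter are hypothetical.
_DEFAULT_FN_ATTRS static __inline__ void
__example_add_once(uint32_t *__counter, uint32_t __v) {
  uint64_t __mask = __gpu_lane_mask();
  if (__gpu_lane_id() == __gpu_first_lane_id(__mask))
    __atomic_fetch_add(__counter, __v, __ATOMIC_RELAXED);
}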
// Copies the value from the first active thread to the rest, splitting the
// 64-bit value into two 32-bit broadcasts.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
  uint32_t __hi = (uint32_t)(__x >> 32ull);
  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFFull);
  return ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __hi) << 32ull) |
         ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __lo) &
          0xFFFFFFFFull);
}
// Gets the first float value from the active lanes by bit-casting through
// the 32-bit integer broadcast.
_DEFAULT_FN_ATTRS static __inline__ float
__gpu_read_first_lane_f32(uint64_t __lane_mask, float __x) {
  return __builtin_bit_cast(
      float, __gpu_read_first_lane_u32(__lane_mask,
                                       __builtin_bit_cast(uint32_t, __x)));
}
// Gets the first double value from the active lanes the same way, through
// the 64-bit integer broadcast.
_DEFAULT_FN_ATTRS static __inline__ double
__gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
  return __builtin_bit_cast(
      double, __gpu_read_first_lane_u64(__lane_mask,
                                        __builtin_bit_cast(uint64_t, __x)));
}
// Shuffles a 64-bit value between lanes as two 32-bit shuffles.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
                      uint32_t __width) {
  uint32_t __hi = (uint32_t)(__x >> 32ull);
  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFFull);
  uint32_t __mask = (uint32_t)__lane_mask;
  return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width)
          << 32ull) |
         ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
}
// Shuffles a float between lanes by bit-casting through the u32 shuffle.
_DEFAULT_FN_ATTRS static __inline__ float
__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x,
                      uint32_t __width) {
  return __builtin_bit_cast(
      float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
                                   __builtin_bit_cast(uint32_t, __x),
                                   __width));
}
// Shuffles a double between lanes by bit-casting through the u64 shuffle.
_DEFAULT_FN_ATTRS static __inline__ double
__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
                      uint32_t __width) {
  return __builtin_bit_cast(
      double, __gpu_shuffle_idx_u64(__lane_mask, __idx,
                                    __builtin_bit_cast(uint64_t, __x),
                                    __width));
}
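// For illustration only (not part of the header): broadcasting lane zero's
// value to the whole wavefront with the shuffle above; a sketch that uses
// the full wavefront as the shuffle width.
_DEFAULT_FN_ATTRS static __inline__ float
__example_broadcast_f32(float __x) {
  return __gpu_shuffle_idx_f32(__gpu_lane_mask(), 0, __x, __gpu_num_lanes());
}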
// Gets the accumulated scan (inclusive prefix sum) of the lanes in the warp
// or wavefront.
#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix)                       \
  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_scan_##__suffix(       \
      uint64_t __lane_mask, __type __x) {                                      \
    /* Fall back to a serialized scan if the active lanes are not one */      \
    /* contiguous run of bits. */                                              \
    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask);            \
    bool __divergent = __gpu_read_first_lane_##__suffix(                       \
        __lane_mask, __first & (__first + 1));                                 \
    if (__divergent) {                                                         \
      __type __accum = 0;                                                      \
      for (uint64_t __mask = __lane_mask; __mask; __mask &= __mask - 1) {      \
        uint32_t __index = __builtin_ctzll(__mask);                            \
        __type __tmp = __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
                                                    __gpu_num_lanes());        \
        __x = __gpu_lane_id() == __index ? __accum + __tmp : __x;              \
        __accum += __tmp;                                                      \
      }                                                                        \
    } else {                                                                   \
      /* Hillis-Steele scan: each lane adds the value __step below it; the */  \
      /* -__bitmask trick zeroes the addend for lanes without a partner. */    \
      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {     \
        uint32_t __index = __gpu_lane_id() - __step;                           \
        __bitmask_type __bitmask = __gpu_lane_id() >= __step;                  \
        __x += __builtin_bit_cast(                                             \
            __type,                                                            \
            -__bitmask & __builtin_bit_cast(__bitmask_type,                    \
                                            __gpu_shuffle_idx_##__suffix(      \
                                                __lane_mask, __index, __x,     \
                                                __gpu_num_lanes())));          \
      }                                                                        \
    }                                                                          \
    return __x;                                                                \
  }
__DO_LANE_SCAN(uint32_t, uint32_t, u32); // uint32_t __gpu_lane_scan_u32(m, x)
__DO_LANE_SCAN(uint64_t, uint64_t, u64); // uint64_t __gpu_lane_scan_u64(m, x)
__DO_LANE_SCAN(float, uint32_t, f32);    // float __gpu_lane_scan_f32(m, x)
__DO_LANE_SCAN(double, uint64_t, f64);   // double __gpu_lane_scan_f64(m, x)
#undef __DO_LANE_SCAN
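// For illustration only (not part of the header): deriving an exclusive rank
// from the inclusive scan; a sketch assuming every participating lane passes
// the full lane mask.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
__example_exclusive_rank(uint32_t __x) {
  // The inclusive prefix sum includes the lane's own value, so subtract it.
  return __gpu_lane_scan_u32(__gpu_lane_mask(), __x) - __x;
}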
// Gets the sum of all lanes inside the warp or wavefront.
#define __DO_LANE_SUM(__type, __suffix)                                        \
  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix(        \
      uint64_t __lane_mask, __type __x) {                                      \
    /* If the active lanes are not contiguous, reuse the scan and take the */  \
    /* total from the last active lane. */                                     \
    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask);            \
    bool __divergent = __gpu_read_first_lane_##__suffix(                       \
        __lane_mask, __first & (__first + 1));                                 \
    if (__divergent) {                                                         \
      return __gpu_shuffle_idx_##__suffix(                                     \
          __lane_mask, 63 - __builtin_clzll(__lane_mask),                      \
          __gpu_lane_scan_##__suffix(__lane_mask, __x), __gpu_num_lanes());    \
    } else {                                                                   \
      /* Offset reduction: each lane adds the value __step lanes above it. */  \
      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {     \
        uint32_t __index = __step + __gpu_lane_id();                           \
        __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x,         \
                                            __gpu_num_lanes());                \
      }                                                                        \
      return __gpu_read_first_lane_##__suffix(__lane_mask, __x);               \
    }                                                                          \
  }
__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
__DO_LANE_SUM(float, f32);    // float __gpu_lane_sum_f32(m, x)
__DO_LANE_SUM(double, f64);   // double __gpu_lane_sum_f64(m, x)
#undef __DO_LANE_SUM
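// For illustration only (not part of the header): counting the active lanes
// that satisfy a predicate with the reduction above; a hedged sketch, since
// __gpu_ballot plus a popcount would work equally well.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
__example_count_if(bool __pred) {
  return __gpu_lane_sum_u32(__gpu_lane_mask(), __pred ? 1u : 0u);
}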
// Returns a bitmask marking all lanes that have the same value of __x. Each
// iteration, the remaining lanes broadcast the first unmatched value, and
// every lane holding that value records its peers and drops out.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x) {
  uint64_t __match_mask = 0;

  bool __done = 0;
  for (uint64_t __active_mask = __lane_mask; __active_mask;
       __active_mask = __gpu_ballot(__lane_mask, !__done)) {
    if (!__done) {
      uint32_t __first = __gpu_read_first_lane_u32(__active_mask, __x);
      if (__first == __x) {
        // Only the matching lanes execute this branch, so the current lane
        // mask is exactly the set of peers.
        __match_mask = __gpu_lane_mask();
        __done = 1;
      }
    }
  }
  __gpu_sync_lane(__lane_mask);
  return __match_mask;
}

// __gpu_match_any_u64_impl follows the same loop with uint64_t values and
// __gpu_read_first_lane_u64.
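// For illustration only (not part of the header): using the match fallback
// to partition lanes by key and elect one leader per group; a sketch,
// assuming the generic _impl version is the active implementation (targets
// may remap it to a hardware match instruction).
_DEFAULT_FN_ATTRS static __inline__ bool
__example_is_group_leader(uint32_t __key) {
  uint64_t __peers = __gpu_match_any_u32_impl(__gpu_lane_mask(), __key);
  return __gpu_lane_id() == __gpu_first_lane_id(__peers);
}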
_Pragma("omp end declare variant");
_Pragma("omp end declare target");
#if !defined(__cplusplus)
_Pragma("pop_macro(\"bool\")");
#endif
#undef _DEFAULT_FN_ATTRS
// Summary of the interface provided by this header.

// Grid and thread queries.
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks(int __dim)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id(int __dim)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads(int __dim)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id(int __dim)

// Lane queries and synchronization.
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_lane_id(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_lane_mask(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_first_lane_id(uint64_t __lane_mask)
static _DEFAULT_FN_ATTRS __inline__ bool __gpu_is_first_in_lane(uint64_t __lane_mask)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask, bool __x)
static _DEFAULT_FN_ATTRS __inline__ void __gpu_sync_lane(uint64_t __lane_mask)

// Broadcasts and shuffles.
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x)
static _DEFAULT_FN_ATTRS __inline__ float __gpu_read_first_lane_f32(uint64_t __lane_mask, float __x)
static _DEFAULT_FN_ATTRS __inline__ double __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x, uint32_t __width)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x, uint32_t __width)
static _DEFAULT_FN_ATTRS __inline__ float __gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x, uint32_t __width)
static _DEFAULT_FN_ATTRS __inline__ double __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x, uint32_t __width)

// Matching.
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x)

// Macros.
#define _DEFAULT_FN_ATTRS
#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix)
#define __DO_LANE_SUM(__type, __suffix)