9#ifndef __AMDGPUINTRIN_H
10#define __AMDGPUINTRIN_H
13#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
17#error "Never use <amdgpuintrin.h> directly; include <gpuintrin.h> instead"
20_Pragma(
"omp begin declare target device_type(nohost)");
21_Pragma(
"omp begin declare variant match(device = {arch(amdgcn)})");
24#define __gpu_private __attribute__((address_space(5)))
25#define __gpu_constant __attribute__((address_space(4)))
26#define __gpu_local __attribute__((address_space(3)))
27#define __gpu_global __attribute__((address_space(1)))
28#define __gpu_generic __attribute__((address_space(0)))
31#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
35 return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
40 return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
45 return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
50 return __builtin_amdgcn_workgroup_id_x();
55 return __builtin_amdgcn_workgroup_id_y();
60 return __builtin_amdgcn_workgroup_id_z();
65 return __builtin_amdgcn_workgroup_size_x();
70 return __builtin_amdgcn_workgroup_size_y();
75 return __builtin_amdgcn_workgroup_size_z();
80 return __builtin_amdgcn_workitem_id_x();
85 return __builtin_amdgcn_workitem_id_y();
90 return __builtin_amdgcn_workitem_id_z();
96 return __builtin_amdgcn_wavefrontsize();
101 return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
106 return __builtin_amdgcn_read_exec();
112 return __builtin_amdgcn_readfirstlane(__x);
120 return __lane_mask & __builtin_amdgcn_ballot_w64(__x);
125 __builtin_amdgcn_s_barrier();
126 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST,
"workgroup");
131 __builtin_amdgcn_wave_barrier();
138 uint32_t __lane = __idx + (
__gpu_lane_id() & ~(__width - 1));
139 return __builtin_amdgcn_ds_bpermute(__lane << 2, __x);
168 return __builtin_amdgcn_is_shared((
void [[clang::address_space(0)]] *)((
174 return __builtin_amdgcn_is_private((
void [[clang::address_space(0)]] *)((
180 __builtin_amdgcn_endpgm();
185 __builtin_amdgcn_s_sleep(2);
static _DEFAULT_FN_ATTRS __inline__ void __gpu_thread_suspend(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_lane_id(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_lane_mask(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x, uint32_t __width)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_z(void)
static _DEFAULT_FN_ATTRS __inline__ void __gpu_sync_lane(uint64_t __lane_mask)
static _DEFAULT_FN_ATTRS __inline__ bool __gpu_is_ptr_private(void *ptr)
static _DEFAULT_FN_ATTRS __inline__ bool __gpu_is_ptr_local(void *ptr)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_y(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_threads_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_lanes(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_thread_id_y(void)
_Pragma("omp begin declare target device_type(nohost)")
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_z(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_block_id_x(void)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask, bool __x)
static _DEFAULT_FN_ATTRS __inline__ void __gpu_exit(void)
static _DEFAULT_FN_ATTRS __inline__ void __gpu_sync_threads(void)
static _DEFAULT_FN_ATTRS __inline__ uint32_t __gpu_num_blocks_y(void)
#define _DEFAULT_FN_ATTRS
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x)
static _DEFAULT_FN_ATTRS __inline__ uint64_t __gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x)