gpuintrin.h (clang 22.0.0git)
//===-- gpuintrin.h - Generic GPU intrinsic functions ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Provides wrappers around the clang builtins for accessing GPU hardware
// features. The interface is intended to be portable between architectures, but
// some targets may provide different implementations. This header can be
// included for all the common GPU programming languages, namely OpenMP, HIP,
// CUDA, and OpenCL.
//
//===----------------------------------------------------------------------===//

#ifndef __GPUINTRIN_H
#define __GPUINTRIN_H

#if !defined(_DEFAULT_FN_ATTRS)
#if defined(__HIP__) || defined(__CUDA__)
#define _DEFAULT_FN_ATTRS __attribute__((device))
#else
#define _DEFAULT_FN_ATTRS
#endif
#endif

#include <stdint.h>

#if !defined(__cplusplus)
_Pragma("push_macro(\"bool\")");
#define bool _Bool
#endif

_Pragma("omp begin declare target device_type(nohost)");
_Pragma("omp begin declare variant match(device = {kind(gpu)})");

// Forward declare a few functions for the implementation header.

// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x);

// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x);

// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x);

// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x);

_Pragma("omp end declare variant");
_Pragma("omp end declare target");

#if defined(__NVPTX__)
#include <nvptxintrin.h>
#elif defined(__AMDGPU__)
#include <amdgpuintrin.h>
#elif !defined(_OPENMP)
#error "This header is only meant to be used on GPU architectures."
#endif

_Pragma("omp begin declare target device_type(nohost)");
_Pragma("omp begin declare variant match(device = {kind(gpu)})");

#define __GPU_X_DIM 0
#define __GPU_Y_DIM 1
#define __GPU_Z_DIM 2

// Returns the number of blocks in the requested dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks(int __dim) {
  switch (__dim) {
  case 0:
    return __gpu_num_blocks_x();
  case 1:
    return __gpu_num_blocks_y();
  case 2:
    return __gpu_num_blocks_z();
  default:
    __builtin_unreachable();
  }
}

// Returns the block id in the requested dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id(int __dim) {
  switch (__dim) {
  case 0:
    return __gpu_block_id_x();
  case 1:
    return __gpu_block_id_y();
  case 2:
    return __gpu_block_id_z();
  default:
    __builtin_unreachable();
  }
}

// Returns the number of threads in the requested dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads(int __dim) {
  switch (__dim) {
  case 0:
    return __gpu_num_threads_x();
  case 1:
    return __gpu_num_threads_y();
  case 2:
    return __gpu_num_threads_z();
  default:
    __builtin_unreachable();
  }
}

// Returns the thread id in the requested dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id(int __dim) {
  switch (__dim) {
  case 0:
    return __gpu_thread_id_x();
  case 1:
    return __gpu_thread_id_y();
  case 2:
    return __gpu_thread_id_z();
  default:
    __builtin_unreachable();
  }
}
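
// Illustrative usage sketch (not part of the upstream header): flattening the
// x dimension into a single global thread index, the usual starting point for
// grid-stride loops. The `__gpu_example_*` name is hypothetical.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_example_global_id_x(void) {
  return (uint64_t)__gpu_block_id(__GPU_X_DIM) *
             __gpu_num_threads(__GPU_X_DIM) +
         __gpu_thread_id(__GPU_X_DIM);
}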

// Gets the id of the first active thread in the lane mask.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_first_lane_id(uint64_t __lane_mask) {
  return __builtin_ffsll(__lane_mask) - 1;
}

// Conditional that is only true for a single thread in a lane.
_DEFAULT_FN_ATTRS static __inline__ bool
__gpu_is_first_in_lane(uint64_t __lane_mask) {
  return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
}
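
// Illustrative usage sketch (not part of the upstream header): electing one
// leader per warp or wavefront so a single lane performs an atomic update on
// behalf of the group. `__gpu_example_count_groups` is a hypothetical name.
_DEFAULT_FN_ATTRS static __inline__ void
__gpu_example_count_groups(uint32_t *__counter) {
  uint64_t __mask = __gpu_lane_mask();
  if (__gpu_is_first_in_lane(__mask))
    __atomic_fetch_add(__counter, 1, __ATOMIC_RELAXED);
}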

// Copies the value from the first active thread to the rest.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
  uint32_t __hi = (uint32_t)(__x >> 32ull);
  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFFull);
  return ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __hi) << 32ull) |
         ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __lo) &
          0xFFFFFFFFull);
}
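
// Illustrative usage sketch (not part of the upstream header): broadcasting a
// pointer from the first active lane so every lane loads through the same
// address. Assumes 64-bit pointers, as on current AMDGPU and NVPTX targets.
_DEFAULT_FN_ATTRS static __inline__ void *
__gpu_example_broadcast_ptr(uint64_t __lane_mask, void *__ptr) {
  return (void *)__gpu_read_first_lane_u64(__lane_mask, (uintptr_t)__ptr);
}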

// Gets the first floating point value from the active lanes.
_DEFAULT_FN_ATTRS static __inline__ float
__gpu_read_first_lane_f32(uint64_t __lane_mask, float __x) {
  return __builtin_bit_cast(
      float, __gpu_read_first_lane_u32(__lane_mask,
                                       __builtin_bit_cast(uint32_t, __x)));
}

// Gets the first floating point value from the active lanes.
_DEFAULT_FN_ATTRS static __inline__ double
__gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
  return __builtin_bit_cast(
      double, __gpu_read_first_lane_u64(__lane_mask,
                                        __builtin_bit_cast(uint64_t, __x)));
}

// Shuffles the lanes according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
                      uint32_t __width) {
  uint32_t __hi = (uint32_t)(__x >> 32ull);
  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFFull);
  uint32_t __mask = (uint32_t)__lane_mask;
  return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width)
          << 32ull) |
         ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
}

// Shuffles the lanes according to the given index.
_DEFAULT_FN_ATTRS static __inline__ float
__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x,
                      uint32_t __width) {
  return __builtin_bit_cast(
      float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
                                   __builtin_bit_cast(uint32_t, __x), __width));
}

// Shuffles the lanes according to the given index.
_DEFAULT_FN_ATTRS static __inline__ double
__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
                      uint32_t __width) {
  return __builtin_bit_cast(
      double,
      __gpu_shuffle_idx_u64(__lane_mask, __idx,
                            __builtin_bit_cast(uint64_t, __x), __width));
}

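// Illustrative usage sketch (not part of the upstream header): rotating each
// lane's value to the next lane, wrapping around within the warp or wavefront.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
__gpu_example_rotate_u32(uint64_t __lane_mask, uint32_t __x) {
  uint32_t __next = (__gpu_lane_id() + 1) % __gpu_num_lanes();
  return __gpu_shuffle_idx_u32(__lane_mask, __next, __x, __gpu_num_lanes());
}
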
// Gets the inclusive scan (running accumulation) of the threads in the warp or
// wavefront.
#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix)                       \
  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_scan_##__suffix(      \
      uint64_t __lane_mask, __type __x) {                                     \
    /* Shifting out the trailing zeros leaves all ones iff the active lanes   \
       are contiguous, in which case __first & (__first + 1) is zero. */      \
    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask);           \
    bool __divergent = __gpu_read_first_lane_u64(                             \
        __lane_mask, __first & (__first + 1));                                \
    if (__divergent) {                                                        \
      __type __accum = 0;                                                     \
      for (uint64_t __mask = __lane_mask; __mask; __mask &= __mask - 1) {     \
        uint32_t __index = __builtin_ctzll(__mask);                           \
        __type __tmp = __gpu_shuffle_idx_##__suffix(__lane_mask, __index,     \
                                                    __x, __gpu_num_lanes());  \
        __x = __gpu_lane_id() == __index ? __accum + __tmp : __x;             \
        __accum += __tmp;                                                     \
      }                                                                       \
    } else {                                                                  \
      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {    \
        uint32_t __index = __gpu_lane_id() - __step;                          \
        __bitmask_type __bitmask = __gpu_lane_id() >= __step;                 \
        __x += __builtin_bit_cast(                                            \
            __type,                                                           \
            -__bitmask & __builtin_bit_cast(__bitmask_type,                   \
                                            __gpu_shuffle_idx_##__suffix(     \
                                                __lane_mask, __index, __x,    \
                                                __gpu_num_lanes())));         \
      }                                                                       \
    }                                                                         \
    return __x;                                                               \
  }
__DO_LANE_SCAN(uint32_t, uint32_t, u32); // uint32_t __gpu_lane_scan_u32(m, x)
__DO_LANE_SCAN(uint64_t, uint64_t, u64); // uint64_t __gpu_lane_scan_u64(m, x)
__DO_LANE_SCAN(float, uint32_t, f32);    // float __gpu_lane_scan_f32(m, x)
__DO_LANE_SCAN(double, uint64_t, f64);   // double __gpu_lane_scan_f64(m, x)
#undef __DO_LANE_SCAN
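
// Illustrative usage sketch (not part of the upstream header): turning the
// inclusive scan into an exclusive prefix sum to compute per-lane output
// offsets for stream compaction. `__gpu_example_compaction_offset` is a
// hypothetical name.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
__gpu_example_compaction_offset(uint64_t __lane_mask, bool __keep) {
  // Each lane that keeps its element contributes 1; subtracting the lane's
  // own contribution converts the inclusive scan into an exclusive one.
  return __gpu_lane_scan_u32(__lane_mask, __keep) - __keep;
}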

// Gets the sum of all lanes inside the warp or wavefront.
#define __DO_LANE_SUM(__type, __suffix)                                        \
  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix(       \
      uint64_t __lane_mask, __type __x) {                                     \
    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask);           \
    bool __divergent = __gpu_read_first_lane_u64(                             \
        __lane_mask, __first & (__first + 1));                                \
    if (__divergent) {                                                        \
      return __gpu_shuffle_idx_##__suffix(                                    \
          __lane_mask, 63 - __builtin_clzll(__lane_mask),                     \
          __gpu_lane_scan_##__suffix(__lane_mask, __x), __gpu_num_lanes());   \
    } else {                                                                  \
      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {    \
        uint32_t __index = __step + __gpu_lane_id();                          \
        __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x,        \
                                            __gpu_num_lanes());               \
      }                                                                       \
      return __gpu_read_first_lane_##__suffix(__lane_mask, __x);              \
    }                                                                         \
  }
__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
__DO_LANE_SUM(float, f32);    // float __gpu_lane_sum_f32(m, x)
__DO_LANE_SUM(double, f64);   // double __gpu_lane_sum_f64(m, x)
#undef __DO_LANE_SUM
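
// Illustrative usage sketch (not part of the upstream header): averaging a
// value across the active lanes; the popcount of the lane mask is the number
// of participating lanes. `__gpu_example_lane_average` is a hypothetical name.
_DEFAULT_FN_ATTRS static __inline__ float
__gpu_example_lane_average(uint64_t __lane_mask, float __x) {
  return __gpu_lane_sum_f32(__lane_mask, __x) /
         (float)__builtin_popcountll(__lane_mask);
}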

// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x) {
  uint64_t __match_mask = 0;

  bool __done = 0;
  for (uint64_t __active_mask = __lane_mask; __active_mask;
       __active_mask = __gpu_ballot(__lane_mask, !__done)) {
    if (!__done) {
      uint32_t __first = __gpu_read_first_lane_u32(__active_mask, __x);
      if (__first == __x) {
        __match_mask = __gpu_lane_mask();
        __done = 1;
      }
    }
  }
  __gpu_sync_lane(__lane_mask);
  return __match_mask;
}

// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x) {
  uint64_t __match_mask = 0;

  bool __done = 0;
  for (uint64_t __active_mask = __lane_mask; __active_mask;
       __active_mask = __gpu_ballot(__lane_mask, !__done)) {
    if (!__done) {
      uint64_t __first = __gpu_read_first_lane_u64(__active_mask, __x);
      if (__first == __x) {
        __match_mask = __gpu_lane_mask();
        __done = 1;
      }
    }
  }
  __gpu_sync_lane(__lane_mask);
  return __match_mask;
}
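
// Illustrative usage sketch (not part of the upstream header): using the
// generic match_any fallback to elect one leader per distinct key, e.g. to
// coalesce atomic updates to the same histogram bucket. Hypothetical name.
_DEFAULT_FN_ATTRS static __inline__ bool
__gpu_example_is_bucket_leader(uint64_t __lane_mask, uint32_t __bucket) {
  uint64_t __peers = __gpu_match_any_u32_impl(__lane_mask, __bucket);
  return __gpu_lane_id() == __gpu_first_lane_id(__peers);
}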

// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x) {
  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
  __gpu_sync_lane(__lane_mask);
  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
}

// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x) {
  uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
  __gpu_sync_lane(__lane_mask);
  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
}
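
// Illustrative usage sketch (not part of the upstream header): checking that
// every active lane agrees on a value before taking a uniform fast path.
// Hypothetical name.
_DEFAULT_FN_ATTRS static __inline__ bool
__gpu_example_is_uniform_u32(uint64_t __lane_mask, uint32_t __x) {
  return __gpu_match_all_u32_impl(__lane_mask, __x) != 0;
}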

_Pragma("omp end declare variant");
_Pragma("omp end declare target");

#if !defined(__cplusplus)
_Pragma("pop_macro(\"bool\")");
#endif

#undef _DEFAULT_FN_ATTRS

#endif // __GPUINTRIN_H