Skip to content

Commit c076f34

Browse files
committed
move CUDA kernels to runtime(nvrtc) compilation
1 parent 90fe0f7 commit c076f34

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

84 files changed

+2931
-2250
lines changed

src/backend/common/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ target_sources(afcommon_interface
5252
${CMAKE_CURRENT_SOURCE_DIR}/half.hpp
5353
${CMAKE_CURRENT_SOURCE_DIR}/host_memory.cpp
5454
${CMAKE_CURRENT_SOURCE_DIR}/host_memory.hpp
55+
${CMAKE_CURRENT_SOURCE_DIR}/internal_enums.hpp
5556
${CMAKE_CURRENT_SOURCE_DIR}/kernel_type.hpp
5657
${CMAKE_CURRENT_SOURCE_DIR}/module_loading.hpp
5758
${CMAKE_CURRENT_SOURCE_DIR}/sparse_helpers.hpp

src/backend/common/defines.hpp

Lines changed: 2 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99

1010
#pragma once
1111

12+
#include <common/internal_enums.hpp>
13+
1214
#include <mutex>
1315
#include <string>
1416

@@ -41,22 +43,6 @@ inline std::string clipFilePath(std::string path, std::string str) {
4143
#define __AF_FILENAME__ (clipFilePath(__FILE__, "src/").c_str())
4244
#endif
4345

44-
typedef enum {
45-
AF_BATCH_UNSUPPORTED = -1, /* invalid inputs */
46-
AF_BATCH_NONE, /* one signal, one filter */
47-
AF_BATCH_LHS, /* many signal, one filter */
48-
AF_BATCH_RHS, /* one signal, many filter */
49-
AF_BATCH_SAME, /* signal and filter have same batch size */
50-
AF_BATCH_DIFF, /* signal and filter have different batch size */
51-
} AF_BATCH_KIND;
52-
53-
enum class kJITHeuristics {
54-
Pass = 0, /* no eval necessary */
55-
TreeHeight = 1, /* eval due to jit tree height */
56-
KernelParameterSize = 2, /* eval due to many kernel parameters */
57-
MemoryPressure = 3 /* eval due to memory pressure */
58-
};
59-
6046
#ifdef OS_WIN
6147
#include <Windows.h>
6248
using LibHandle = HMODULE;

src/backend/common/internal_enums.hpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
/*******************************************************
2+
* Copyright (c) 2020, ArrayFire
3+
* All rights reserved.
4+
*
5+
* This file is distributed under 3-clause BSD license.
6+
* The complete license agreement can be obtained at:
7+
* http://arrayfire.com/licenses/BSD-3-Clause
8+
********************************************************/
9+
10+
#pragma once
11+
12+
// TODO AF_BATCH_UNSUPPORTED is not required and shouldn't happen
13+
// Code changes are required to handle all cases properly
14+
// and this enum value should be removed.
15+
// Describes how the signal and filter operands are batched relative to each
// other. AF_BATCH_UNSUPPORTED is pinned to -1, so the remaining enumerators
// take consecutive values starting at 0.
typedef enum {
16+
AF_BATCH_UNSUPPORTED = -1, /* invalid inputs */
17+
AF_BATCH_NONE, /* one signal, one filter */
18+
AF_BATCH_LHS, /* many signals, one filter */
19+
AF_BATCH_RHS, /* one signal, many filters */
20+
AF_BATCH_SAME, /* signal and filter have same batch size */
21+
AF_BATCH_DIFF, /* signal and filter have different batch size */
22+
} AF_BATCH_KIND;

src/backend/common/jit/Node.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,13 @@
1919
#include <unordered_map>
2020
#include <vector>
2121

22+
// Result of the JIT heuristic check: Pass means no evaluation is needed;
// every other value names the reason an evaluation is triggered.
enum class kJITHeuristics {
23+
Pass = 0, /* no eval necessary */
24+
TreeHeight = 1, /* eval due to jit tree height */
25+
KernelParameterSize = 2, /* eval due to many kernel parameters */
26+
MemoryPressure = 3 /* eval due to memory pressure */
27+
};
28+
2229
namespace common {
2330
class Node;
2431
struct Node_ids;

src/backend/cpu/Array.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <Param.hpp>
1313
#include <common/ArrayInfo.hpp>
1414
#include <common/MemoryManagerBase.hpp>
15+
#include <common/jit/Node.hpp>
1516
#include <jit/Node.hpp>
1617
#include <memory.hpp>
1718
#include <platform.hpp>

src/backend/cuda/CMakeLists.txt

Lines changed: 55 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -87,45 +87,74 @@ set(nvrtc_src
8787
${PROJECT_BINARY_DIR}/include/af/version.h
8888

8989
${CMAKE_CURRENT_SOURCE_DIR}/Param.hpp
90+
${CMAKE_CURRENT_SOURCE_DIR}/assign_kernel_param.hpp
9091
${CMAKE_CURRENT_SOURCE_DIR}/backend.hpp
92+
${CMAKE_CURRENT_SOURCE_DIR}/dims_param.hpp
9193
${CMAKE_CURRENT_SOURCE_DIR}/kernel/interp.hpp
9294
${CMAKE_CURRENT_SOURCE_DIR}/kernel/shared.hpp
9395
${CMAKE_CURRENT_SOURCE_DIR}/math.hpp
96+
${CMAKE_CURRENT_SOURCE_DIR}/minmax_op.hpp
9497
${CMAKE_CURRENT_SOURCE_DIR}/utility.hpp
9598
${CMAKE_CURRENT_SOURCE_DIR}/types.hpp
9699
${CMAKE_CURRENT_SOURCE_DIR}/../common/half.hpp
100+
${CMAKE_CURRENT_SOURCE_DIR}/../common/internal_enums.hpp
97101
${CMAKE_CURRENT_SOURCE_DIR}/../common/kernel_type.hpp
98102

99103
${CMAKE_CURRENT_SOURCE_DIR}/kernel/anisotropic_diffusion.cuh
100104
${CMAKE_CURRENT_SOURCE_DIR}/kernel/approx1.cuh
101105
${CMAKE_CURRENT_SOURCE_DIR}/kernel/approx2.cuh
106+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/assign.cuh
102107
${CMAKE_CURRENT_SOURCE_DIR}/kernel/bilateral.cuh
103108
${CMAKE_CURRENT_SOURCE_DIR}/kernel/canny.cuh
104109
${CMAKE_CURRENT_SOURCE_DIR}/kernel/convolve1.cuh
105110
${CMAKE_CURRENT_SOURCE_DIR}/kernel/convolve2.cuh
106111
${CMAKE_CURRENT_SOURCE_DIR}/kernel/convolve3.cuh
107112
${CMAKE_CURRENT_SOURCE_DIR}/kernel/convolve_separable.cuh
113+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/copy.cuh
114+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/diagonal.cuh
115+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/diff.cuh
108116
${CMAKE_CURRENT_SOURCE_DIR}/kernel/exampleFunction.cuh
117+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/fftconvolve.cuh
109118
${CMAKE_CURRENT_SOURCE_DIR}/kernel/flood_fill.cuh
119+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/gradient.cuh
110120
${CMAKE_CURRENT_SOURCE_DIR}/kernel/histogram.cuh
111121
${CMAKE_CURRENT_SOURCE_DIR}/kernel/hsv_rgb.cuh
122+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/identity.cuh
123+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/iir.cuh
124+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/index.cuh
125+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/iota.cuh
126+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/ireduce.cuh
127+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/join.cuh
128+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/lookup.cuh
129+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/lu_split.cuh
112130
${CMAKE_CURRENT_SOURCE_DIR}/kernel/match_template.cuh
113131
${CMAKE_CURRENT_SOURCE_DIR}/kernel/meanshift.cuh
114132
${CMAKE_CURRENT_SOURCE_DIR}/kernel/medfilt.cuh
133+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/memcopy.cuh
115134
${CMAKE_CURRENT_SOURCE_DIR}/kernel/moments.cuh
116135
${CMAKE_CURRENT_SOURCE_DIR}/kernel/morph.cuh
117136
${CMAKE_CURRENT_SOURCE_DIR}/kernel/pad_array_borders.cuh
137+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/range.cuh
118138
${CMAKE_CURRENT_SOURCE_DIR}/kernel/resize.cuh
139+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/reorder.cuh
119140
${CMAKE_CURRENT_SOURCE_DIR}/kernel/rotate.cuh
141+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/select.cuh
120142
${CMAKE_CURRENT_SOURCE_DIR}/kernel/scan_dim.cuh
121143
${CMAKE_CURRENT_SOURCE_DIR}/kernel/scan_dim_by_key.cuh
122144
${CMAKE_CURRENT_SOURCE_DIR}/kernel/scan_first.cuh
123145
${CMAKE_CURRENT_SOURCE_DIR}/kernel/scan_first_by_key.cuh
124146
${CMAKE_CURRENT_SOURCE_DIR}/kernel/sobel.cuh
147+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/sparse.cuh
148+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/sparse_arith.cuh
149+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/susan.cuh
150+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/tile.cuh
125151
${CMAKE_CURRENT_SOURCE_DIR}/kernel/transform.cuh
126152
${CMAKE_CURRENT_SOURCE_DIR}/kernel/transpose.cuh
127153
${CMAKE_CURRENT_SOURCE_DIR}/kernel/transpose_inplace.cuh
154+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/triangle.cuh
155+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/unwrap.cuh
128156
${CMAKE_CURRENT_SOURCE_DIR}/kernel/where.cuh
157+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/wrap.cuh
129158
)
130159

131160
file_to_string(
@@ -220,13 +249,9 @@ cuda_add_library(afcuda
220249
anisotropic_diffusion.cpp
221250
any.cu
222251
approx.cpp
223-
assign.cu
224252
bilateral.cpp
225253
canny.cpp
226-
copy.cu
227254
count.cu
228-
diagonal.cu
229-
diff.cu
230255
dilate.cpp
231256
dilate3d.cpp
232257
erode.cpp
@@ -235,20 +260,10 @@ cuda_add_library(afcuda
235260
Event.hpp
236261
exampleFunction.cpp
237262
fast.cu
238-
fftconvolve.cu
239-
gradient.cu
240263
harris.cu
241264
histogram.cpp
242265
homography.cu
243266
hsv_rgb.cpp
244-
identity.cu
245-
iir.cu
246-
index.cu
247-
iota.cu
248-
ireduce.cu
249-
join.cu
250-
lookup.cu
251-
lu.cu
252267
match_template.cpp
253268
max.cu
254269
mean.cu
@@ -260,32 +275,21 @@ cuda_add_library(afcuda
260275
orb.cu
261276
pad_array_borders.cpp
262277
product.cu
263-
qr.cu
264278
random_engine.cu
265-
range.cu
266279
regions.cu
267-
reorder.cu
268280
resize.cpp
269281
rotate.cpp
270-
select.cu
271282
set.cu
272283
sift.cu
273284
sobel.cpp
274285
sort.cu
275286
sort_by_key.cu
276287
sort_index.cu
277-
sparse.cu
278-
sparse_arith.cu
279288
sum.cu
280-
susan.cu
281-
tile.cu
282289
topk.cu
283290
transform.cpp
284291
transpose.cpp
285292
transpose_inplace.cpp
286-
triangle.cu
287-
unwrap.cu
288-
wrap.cu
289293

290294
kernel/anisotropic_diffusion.hpp
291295
kernel/approx.hpp
@@ -373,6 +377,7 @@ cuda_add_library(afcuda
373377
anisotropic_diffusion.hpp
374378
approx.hpp
375379
arith.hpp
380+
assign.cpp
376381
assign.hpp
377382
backend.hpp
378383
bilateral.hpp
@@ -386,6 +391,7 @@ cuda_add_library(afcuda
386391
complex.hpp
387392
convolve.cpp
388393
convolve.hpp
394+
copy.cpp
389395
copy.hpp
390396
cublas.cpp
391397
cublas.hpp
@@ -403,7 +409,9 @@ cuda_add_library(afcuda
403409
device_manager.hpp
404410
debug_cuda.hpp
405411
debug_thrust.hpp
412+
diagonal.cpp
406413
diagonal.hpp
414+
diff.cpp
407415
diff.hpp
408416
driver.cpp
409417
err_cuda.hpp
@@ -413,11 +421,13 @@ cuda_add_library(afcuda
413421
fast_pyramid.hpp
414422
fft.cpp
415423
fft.hpp
424+
fftconvolve.cpp
416425
fftconvolve.hpp
417426
flood_fill.cpp
418427
flood_fill.hpp
419428
GraphicsResourceManager.cpp
420429
GraphicsResourceManager.hpp
430+
gradient.cpp
421431
gradient.hpp
422432
handle.cpp
423433
harris.hpp
@@ -426,19 +436,27 @@ cuda_add_library(afcuda
426436
histogram.hpp
427437
homography.hpp
428438
hsv_rgb.hpp
439+
identity.cpp
429440
identity.hpp
441+
iir.cpp
430442
iir.hpp
431443
image.cpp
432444
image.hpp
445+
index.cpp
433446
index.hpp
434447
inverse.cpp
435448
inverse.hpp
449+
iota.cpp
436450
iota.hpp
451+
ireduce.cpp
437452
ireduce.hpp
438453
jit.cpp
454+
join.cpp
439455
join.hpp
440456
logic.hpp
457+
lookup.cpp
441458
lookup.hpp
459+
lu.cpp
442460
lu.hpp
443461
match_template.hpp
444462
math.hpp
@@ -447,6 +465,7 @@ cuda_add_library(afcuda
447465
medfilt.hpp
448466
memory.cpp
449467
memory.hpp
468+
minmax_op.hpp
450469
moments.hpp
451470
morph.hpp
452471
morph3d_impl.hpp
@@ -458,12 +477,15 @@ cuda_add_library(afcuda
458477
plot.cpp
459478
plot.hpp
460479
print.hpp
480+
qr.cpp
461481
qr.hpp
462482
random_engine.hpp
483+
range.cpp
463484
range.hpp
464485
reduce.hpp
465486
reduce_impl.hpp
466487
regions.hpp
488+
reorder.cpp
467489
reorder.hpp
468490
resize.hpp
469491
rotate.hpp
@@ -472,6 +494,7 @@ cuda_add_library(afcuda
472494
scan.hpp
473495
scan_by_key.cpp
474496
scan_by_key.hpp
497+
select.cpp
475498
select.hpp
476499
set.hpp
477500
shift.cpp
@@ -482,30 +505,37 @@ cuda_add_library(afcuda
482505
solve.hpp
483506
sort_by_key.hpp
484507
sort_index.hpp
508+
sparse.cpp
485509
sparse.hpp
510+
sparse_arith.cpp
486511
sparse_arith.hpp
487512
sparse_blas.cpp
488513
sparse_blas.hpp
489514
surface.cpp
490515
surface.hpp
516+
susan.cpp
491517
susan.hpp
492518
svd.cpp
493519
svd.hpp
520+
tile.cpp
494521
tile.hpp
495522
topk.hpp
496523
traits.hpp
497524
transform.hpp
498525
transpose.hpp
526+
triangle.cpp
499527
triangle.hpp
500528
types.hpp
501529
unary.hpp
530+
unwrap.cpp
502531
unwrap.hpp
503532
utility.cpp
504533
utility.hpp
505534
vector_field.cpp
506535
vector_field.hpp
507536
where.cpp
508537
where.hpp
538+
wrap.cpp
509539
wrap.hpp
510540

511541
jit/BufferNode.hpp

src/backend/cuda/assign.cu renamed to src/backend/cuda/assign.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ namespace cuda {
2323

2424
template<typename T>
2525
void assign(Array<T>& out, const af_index_t idxrs[], const Array<T>& rhs) {
26-
kernel::AssignKernelParam_t p;
26+
AssignKernelParam p;
2727
std::vector<af_seq> seqs(4, af_span);
2828
// create seq vector to retrieve output
2929
// dimensions, offsets & strides
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
/*******************************************************
2+
* Copyright (c) 2020, ArrayFire
3+
* All rights reserved.
4+
*
5+
* This file is distributed under 3-clause BSD license.
6+
* The complete license agreement can be obtained at:
7+
* http://arrayfire.com/licenses/BSD-3-Clause
8+
********************************************************/
9+
10+
#pragma once
11+
12+
namespace cuda {
13+
14+
// Plain aggregate of four 4-element arrays, shared under two names
// (AssignKernelParam and IndexKernelParam).
// NOTE(review): the per-field meanings below are inferred from the field
// names only — confirm against the kernels that consume this struct.
typedef struct {
15+
// presumably one offset per dimension — TODO confirm
int offs[4];
16+
// presumably one stride per dimension — TODO confirm
int strds[4];
17+
// presumably true where the index for that dimension is a sequence — TODO confirm
bool isSeq[4];
18+
// presumably index-array pointers used where isSeq is false — TODO confirm
unsigned int* ptr[4];
19+
} AssignKernelParam;
20+
21+
// Alias: IndexKernelParam is the same type as AssignKernelParam.
using IndexKernelParam = AssignKernelParam;
22+
23+
} // namespace cuda

0 commit comments

Comments
 (0)