Skip to content

Commit 0eba1b0

Browse files
committed
move CUDA kernels to runtime(nvrtc) compilation
1 parent 24b3498 commit 0eba1b0

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

84 files changed

+2931
-2250
lines changed

src/backend/common/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ target_sources(afcommon_interface
5252
${CMAKE_CURRENT_SOURCE_DIR}/half.hpp
5353
${CMAKE_CURRENT_SOURCE_DIR}/host_memory.cpp
5454
${CMAKE_CURRENT_SOURCE_DIR}/host_memory.hpp
55+
${CMAKE_CURRENT_SOURCE_DIR}/internal_enums.hpp
5556
${CMAKE_CURRENT_SOURCE_DIR}/kernel_type.hpp
5657
${CMAKE_CURRENT_SOURCE_DIR}/module_loading.hpp
5758
${CMAKE_CURRENT_SOURCE_DIR}/sparse_helpers.hpp

src/backend/common/defines.hpp

Lines changed: 2 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99

1010
#pragma once
1111

12+
#include <common/internal_enums.hpp>
13+
1214
#include <mutex>
1315
#include <string>
1416

@@ -41,22 +43,6 @@ inline std::string clipFilePath(std::string path, std::string str) {
4143
#define __AF_FILENAME__ (clipFilePath(__FILE__, "src/").c_str())
4244
#endif
4345

44-
typedef enum {
45-
AF_BATCH_UNSUPPORTED = -1, /* invalid inputs */
46-
AF_BATCH_NONE, /* one signal, one filter */
47-
AF_BATCH_LHS, /* many signal, one filter */
48-
AF_BATCH_RHS, /* one signal, many filter */
49-
AF_BATCH_SAME, /* signal and filter have same batch size */
50-
AF_BATCH_DIFF, /* signal and filter have different batch size */
51-
} AF_BATCH_KIND;
52-
53-
enum class kJITHeuristics {
54-
Pass = 0, /* no eval necessary */
55-
TreeHeight = 1, /* eval due to jit tree height */
56-
KernelParameterSize = 2, /* eval due to many kernel parameters */
57-
MemoryPressure = 3 /* eval due to memory pressure */
58-
};
59-
6046
#ifdef OS_WIN
6147
#include <Windows.h>
6248
using LibHandle = HMODULE;

src/backend/common/internal_enums.hpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
/*******************************************************
2+
* Copyright (c) 2020, ArrayFire
3+
* All rights reserved.
4+
*
5+
* This file is distributed under 3-clause BSD license.
6+
* The complete license agreement can be obtained at:
7+
* http://arrayfire.com/licenses/BSD-3-Clause
8+
********************************************************/
9+
10+
#pragma once
11+
12+
// TODO: AF_BATCH_UNSUPPORTED is not required and shouldn't happen.
13+
// Code changes are required to handle all cases properly,
14+
// after which this enum value should be removed.
15+
typedef enum { /* how the batches of a signal and a filter pair up */
16+
AF_BATCH_UNSUPPORTED = -1, /* invalid inputs */
17+
AF_BATCH_NONE, /* one signal, one filter */
18+
AF_BATCH_LHS, /* many signals, one filter */
19+
AF_BATCH_RHS, /* one signal, many filters */
20+
AF_BATCH_SAME, /* signal and filter have the same batch size */
21+
AF_BATCH_DIFF, /* signal and filter have different batch sizes */
22+
} AF_BATCH_KIND;

src/backend/common/jit/Node.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,13 @@
1919
#include <unordered_map>
2020
#include <vector>
2121

22+
enum class kJITHeuristics { /* reason, if any, that an eval is needed */
23+
Pass = 0, /* no eval necessary */
24+
TreeHeight = 1, /* eval due to JIT tree height */
25+
KernelParameterSize = 2, /* eval due to many kernel parameters */
26+
MemoryPressure = 3 /* eval due to memory pressure */
27+
};
28+
2229
namespace common {
2330
class Node;
2431
struct Node_ids;

src/backend/cpu/Array.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <Param.hpp>
1313
#include <common/ArrayInfo.hpp>
1414
#include <common/MemoryManagerBase.hpp>
15+
#include <common/jit/Node.hpp>
1516
#include <jit/Node.hpp>
1617
#include <memory.hpp>
1718
#include <platform.hpp>

src/backend/cuda/CMakeLists.txt

Lines changed: 55 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -89,45 +89,74 @@ set(nvrtc_src
8989
${PROJECT_BINARY_DIR}/include/af/version.h
9090

9191
${CMAKE_CURRENT_SOURCE_DIR}/Param.hpp
92+
${CMAKE_CURRENT_SOURCE_DIR}/assign_kernel_param.hpp
9293
${CMAKE_CURRENT_SOURCE_DIR}/backend.hpp
94+
${CMAKE_CURRENT_SOURCE_DIR}/dims_param.hpp
9395
${CMAKE_CURRENT_SOURCE_DIR}/kernel/interp.hpp
9496
${CMAKE_CURRENT_SOURCE_DIR}/kernel/shared.hpp
9597
${CMAKE_CURRENT_SOURCE_DIR}/math.hpp
98+
${CMAKE_CURRENT_SOURCE_DIR}/minmax_op.hpp
9699
${CMAKE_CURRENT_SOURCE_DIR}/utility.hpp
97100
${CMAKE_CURRENT_SOURCE_DIR}/types.hpp
98101
${CMAKE_CURRENT_SOURCE_DIR}/../common/half.hpp
102+
${CMAKE_CURRENT_SOURCE_DIR}/../common/internal_enums.hpp
99103
${CMAKE_CURRENT_SOURCE_DIR}/../common/kernel_type.hpp
100104

101105
${CMAKE_CURRENT_SOURCE_DIR}/kernel/anisotropic_diffusion.cuh
102106
${CMAKE_CURRENT_SOURCE_DIR}/kernel/approx1.cuh
103107
${CMAKE_CURRENT_SOURCE_DIR}/kernel/approx2.cuh
108+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/assign.cuh
104109
${CMAKE_CURRENT_SOURCE_DIR}/kernel/bilateral.cuh
105110
${CMAKE_CURRENT_SOURCE_DIR}/kernel/canny.cuh
106111
${CMAKE_CURRENT_SOURCE_DIR}/kernel/convolve1.cuh
107112
${CMAKE_CURRENT_SOURCE_DIR}/kernel/convolve2.cuh
108113
${CMAKE_CURRENT_SOURCE_DIR}/kernel/convolve3.cuh
109114
${CMAKE_CURRENT_SOURCE_DIR}/kernel/convolve_separable.cuh
115+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/copy.cuh
116+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/diagonal.cuh
117+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/diff.cuh
110118
${CMAKE_CURRENT_SOURCE_DIR}/kernel/exampleFunction.cuh
119+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/fftconvolve.cuh
111120
${CMAKE_CURRENT_SOURCE_DIR}/kernel/flood_fill.cuh
121+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/gradient.cuh
112122
${CMAKE_CURRENT_SOURCE_DIR}/kernel/histogram.cuh
113123
${CMAKE_CURRENT_SOURCE_DIR}/kernel/hsv_rgb.cuh
124+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/identity.cuh
125+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/iir.cuh
126+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/index.cuh
127+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/iota.cuh
128+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/ireduce.cuh
129+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/join.cuh
130+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/lookup.cuh
131+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/lu_split.cuh
114132
${CMAKE_CURRENT_SOURCE_DIR}/kernel/match_template.cuh
115133
${CMAKE_CURRENT_SOURCE_DIR}/kernel/meanshift.cuh
116134
${CMAKE_CURRENT_SOURCE_DIR}/kernel/medfilt.cuh
135+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/memcopy.cuh
117136
${CMAKE_CURRENT_SOURCE_DIR}/kernel/moments.cuh
118137
${CMAKE_CURRENT_SOURCE_DIR}/kernel/morph.cuh
119138
${CMAKE_CURRENT_SOURCE_DIR}/kernel/pad_array_borders.cuh
139+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/range.cuh
120140
${CMAKE_CURRENT_SOURCE_DIR}/kernel/resize.cuh
141+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/reorder.cuh
121142
${CMAKE_CURRENT_SOURCE_DIR}/kernel/rotate.cuh
143+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/select.cuh
122144
${CMAKE_CURRENT_SOURCE_DIR}/kernel/scan_dim.cuh
123145
${CMAKE_CURRENT_SOURCE_DIR}/kernel/scan_dim_by_key.cuh
124146
${CMAKE_CURRENT_SOURCE_DIR}/kernel/scan_first.cuh
125147
${CMAKE_CURRENT_SOURCE_DIR}/kernel/scan_first_by_key.cuh
126148
${CMAKE_CURRENT_SOURCE_DIR}/kernel/sobel.cuh
149+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/sparse.cuh
150+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/sparse_arith.cuh
151+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/susan.cuh
152+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/tile.cuh
127153
${CMAKE_CURRENT_SOURCE_DIR}/kernel/transform.cuh
128154
${CMAKE_CURRENT_SOURCE_DIR}/kernel/transpose.cuh
129155
${CMAKE_CURRENT_SOURCE_DIR}/kernel/transpose_inplace.cuh
156+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/triangle.cuh
157+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/unwrap.cuh
130158
${CMAKE_CURRENT_SOURCE_DIR}/kernel/where.cuh
159+
${CMAKE_CURRENT_SOURCE_DIR}/kernel/wrap.cuh
131160
)
132161

133162
file_to_string(
@@ -222,13 +251,9 @@ cuda_add_library(afcuda
222251
anisotropic_diffusion.cpp
223252
any.cu
224253
approx.cpp
225-
assign.cu
226254
bilateral.cpp
227255
canny.cpp
228-
copy.cu
229256
count.cu
230-
diagonal.cu
231-
diff.cu
232257
dilate.cpp
233258
dilate3d.cpp
234259
erode.cpp
@@ -237,20 +262,10 @@ cuda_add_library(afcuda
237262
Event.hpp
238263
exampleFunction.cpp
239264
fast.cu
240-
fftconvolve.cu
241-
gradient.cu
242265
harris.cu
243266
histogram.cpp
244267
homography.cu
245268
hsv_rgb.cpp
246-
identity.cu
247-
iir.cu
248-
index.cu
249-
iota.cu
250-
ireduce.cu
251-
join.cu
252-
lookup.cu
253-
lu.cu
254269
match_template.cpp
255270
max.cu
256271
mean.cu
@@ -262,32 +277,21 @@ cuda_add_library(afcuda
262277
orb.cu
263278
pad_array_borders.cpp
264279
product.cu
265-
qr.cu
266280
random_engine.cu
267-
range.cu
268281
regions.cu
269-
reorder.cu
270282
resize.cpp
271283
rotate.cpp
272-
select.cu
273284
set.cu
274285
sift.cu
275286
sobel.cpp
276287
sort.cu
277288
sort_by_key.cu
278289
sort_index.cu
279-
sparse.cu
280-
sparse_arith.cu
281290
sum.cu
282-
susan.cu
283-
tile.cu
284291
topk.cu
285292
transform.cpp
286293
transpose.cpp
287294
transpose_inplace.cpp
288-
triangle.cu
289-
unwrap.cu
290-
wrap.cu
291295

292296
kernel/anisotropic_diffusion.hpp
293297
kernel/approx.hpp
@@ -375,6 +379,7 @@ cuda_add_library(afcuda
375379
anisotropic_diffusion.hpp
376380
approx.hpp
377381
arith.hpp
382+
assign.cpp
378383
assign.hpp
379384
backend.hpp
380385
bilateral.hpp
@@ -388,6 +393,7 @@ cuda_add_library(afcuda
388393
complex.hpp
389394
convolve.cpp
390395
convolve.hpp
396+
copy.cpp
391397
copy.hpp
392398
cublas.cpp
393399
cublas.hpp
@@ -405,7 +411,9 @@ cuda_add_library(afcuda
405411
device_manager.hpp
406412
debug_cuda.hpp
407413
debug_thrust.hpp
414+
diagonal.cpp
408415
diagonal.hpp
416+
diff.cpp
409417
diff.hpp
410418
driver.cpp
411419
err_cuda.hpp
@@ -415,11 +423,13 @@ cuda_add_library(afcuda
415423
fast_pyramid.hpp
416424
fft.cpp
417425
fft.hpp
426+
fftconvolve.cpp
418427
fftconvolve.hpp
419428
flood_fill.cpp
420429
flood_fill.hpp
421430
GraphicsResourceManager.cpp
422431
GraphicsResourceManager.hpp
432+
gradient.cpp
423433
gradient.hpp
424434
handle.cpp
425435
harris.hpp
@@ -428,19 +438,27 @@ cuda_add_library(afcuda
428438
histogram.hpp
429439
homography.hpp
430440
hsv_rgb.hpp
441+
identity.cpp
431442
identity.hpp
443+
iir.cpp
432444
iir.hpp
433445
image.cpp
434446
image.hpp
447+
index.cpp
435448
index.hpp
436449
inverse.cpp
437450
inverse.hpp
451+
iota.cpp
438452
iota.hpp
453+
ireduce.cpp
439454
ireduce.hpp
440455
jit.cpp
456+
join.cpp
441457
join.hpp
442458
logic.hpp
459+
lookup.cpp
443460
lookup.hpp
461+
lu.cpp
444462
lu.hpp
445463
match_template.hpp
446464
math.hpp
@@ -449,6 +467,7 @@ cuda_add_library(afcuda
449467
medfilt.hpp
450468
memory.cpp
451469
memory.hpp
470+
minmax_op.hpp
452471
moments.hpp
453472
morph.hpp
454473
morph3d_impl.hpp
@@ -460,12 +479,15 @@ cuda_add_library(afcuda
460479
plot.cpp
461480
plot.hpp
462481
print.hpp
482+
qr.cpp
463483
qr.hpp
464484
random_engine.hpp
485+
range.cpp
465486
range.hpp
466487
reduce.hpp
467488
reduce_impl.hpp
468489
regions.hpp
490+
reorder.cpp
469491
reorder.hpp
470492
resize.hpp
471493
rotate.hpp
@@ -474,6 +496,7 @@ cuda_add_library(afcuda
474496
scan.hpp
475497
scan_by_key.cpp
476498
scan_by_key.hpp
499+
select.cpp
477500
select.hpp
478501
set.hpp
479502
shift.cpp
@@ -484,30 +507,37 @@ cuda_add_library(afcuda
484507
solve.hpp
485508
sort_by_key.hpp
486509
sort_index.hpp
510+
sparse.cpp
487511
sparse.hpp
512+
sparse_arith.cpp
488513
sparse_arith.hpp
489514
sparse_blas.cpp
490515
sparse_blas.hpp
491516
surface.cpp
492517
surface.hpp
518+
susan.cpp
493519
susan.hpp
494520
svd.cpp
495521
svd.hpp
522+
tile.cpp
496523
tile.hpp
497524
topk.hpp
498525
traits.hpp
499526
transform.hpp
500527
transpose.hpp
528+
triangle.cpp
501529
triangle.hpp
502530
types.hpp
503531
unary.hpp
532+
unwrap.cpp
504533
unwrap.hpp
505534
utility.cpp
506535
utility.hpp
507536
vector_field.cpp
508537
vector_field.hpp
509538
where.cpp
510539
where.hpp
540+
wrap.cpp
511541
wrap.hpp
512542

513543
jit/BufferNode.hpp

src/backend/cuda/assign.cu renamed to src/backend/cuda/assign.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ namespace cuda {
2323

2424
template<typename T>
2525
void assign(Array<T>& out, const af_index_t idxrs[], const Array<T>& rhs) {
26-
kernel::AssignKernelParam_t p;
26+
AssignKernelParam p;
2727
std::vector<af_seq> seqs(4, af_span);
2828
// create seq vector to retrieve output
2929
// dimensions, offsets & offsets
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
/*******************************************************
2+
* Copyright (c) 2020, ArrayFire
3+
* All rights reserved.
4+
*
5+
* This file is distributed under 3-clause BSD license.
6+
* The complete license agreement can be obtained at:
7+
* http://arrayfire.com/licenses/BSD-3-Clause
8+
********************************************************/
9+
10+
#pragma once
11+
12+
namespace cuda {
13+
14+
typedef struct { // plain parameter bundle; every member is a 4-element array
15+
int offs[4]; // presumably one offset per dimension -- TODO confirm against callers
16+
int strds[4]; // presumably one stride per dimension -- TODO confirm against callers
17+
bool isSeq[4]; // NOTE(review): looks like a per-dimension "indexed by sequence" flag -- confirm
18+
unsigned int* ptr[4]; // NOTE(review): likely index-array pointers for non-sequence dimensions -- confirm
19+
} AssignKernelParam;
20+
21+
using IndexKernelParam = AssignKernelParam; // second name for the same struct type
22+
23+
} // namespace cuda

0 commit comments

Comments
 (0)