Skip to content

Commit 7e5df44

Browse files
committed
Add support to cuDNN Dependency module to load verion 8 when available
1 parent b5eb73c commit 7e5df44

File tree

6 files changed

+247
-84
lines changed

6 files changed

+247
-84
lines changed

CMakeModules/FindcuDNN.cmake

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,8 @@ find_package(CUDA QUIET)
5454
find_path(cuDNN_INCLUDE_DIRS
5555
NAMES cudnn.h
5656
HINTS
57-
${PC_CUDNN_INCLUDE_DIRS}
5857
${cuDNN_ROOT_DIR}
58+
${PC_CUDNN_INCLUDE_DIRS}
5959
${CUDA_TOOLKIT_INCLUDE}
6060
PATH_SUFFIXES include
6161
DOC "cuDNN include directory path." )
@@ -64,6 +64,12 @@ if(cuDNN_INCLUDE_DIRS)
6464
file(READ ${cuDNN_INCLUDE_DIRS}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
6565
string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
6666
CUDNN_MAJOR_VERSION "${CUDNN_VERSION_FILE_CONTENTS}")
67+
list(LENGTH CUDNN_MAJOR_VERSION cudnn_ver_matches)
68+
if(${cudnn_ver_matches} EQUAL 0)
69+
file(READ ${cuDNN_INCLUDE_DIRS}/cudnn_version.h CUDNN_VERSION_FILE_CONTENTS)
70+
string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
71+
CUDNN_MAJOR_VERSION "${CUDNN_VERSION_FILE_CONTENTS}")
72+
endif()
6773
string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1"
6874
CUDNN_MAJOR_VERSION "${CUDNN_MAJOR_VERSION}")
6975
string(REGEX MATCH "define CUDNN_MINOR * +([0-9]+)"
@@ -94,10 +100,10 @@ if(cuDNN_INCLUDE_DIRS)
94100
libcudnn.${cudnn_ver_suffix}.dylib
95101
cudnn
96102
PATHS
97-
$ENV{LD_LIBRARY_PATH}
98-
${libpath_cudart}
99103
${cuDNN_ROOT_DIR}
100104
${PC_CUDNN_LIBRARY_DIRS}
105+
$ENV{LD_LIBRARY_PATH}
106+
${libpath_cudart}
101107
${CMAKE_INSTALL_PREFIX}
102108
PATH_SUFFIXES lib lib64 bin lib/x64 bin/x64
103109
DOC "cuDNN link library." )
@@ -106,10 +112,10 @@ if(cuDNN_INCLUDE_DIRS)
106112
find_file(cuDNN_DLL_LIBRARY
107113
NAMES cudnn64_${cudnn_ver_suffix}${CMAKE_SHARED_LIBRARY_SUFFIX}
108114
PATHS
109-
$ENV{PATH}
110-
${libpath_cudart}
111115
${cuDNN_ROOT_DIR}
112116
${PC_CUDNN_LIBRARY_DIRS}
117+
$ENV{PATH}
118+
${libpath_cudart}
113119
${CMAKE_INSTALL_PREFIX}
114120
PATH_SUFFIXES lib lib64 bin lib/x64 bin/x64
115121
DOC "cuDNN Windows DLL." )

src/backend/cuda/convolveNN.cpp

Lines changed: 67 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,15 @@
2828
#include <af/dim4.hpp>
2929

3030
#include <type_traits>
31+
#include <vector>
3132

3233
using af::dim4;
3334
using common::flip;
3435
using common::half;
3536
using common::make_handle;
3637
using std::conditional;
3738
using std::is_same;
39+
using std::vector;
3840

3941
namespace cuda {
4042

@@ -88,19 +90,40 @@ Array<T> convolve2_cudnn(const Array<T> &signal, const Array<T> &filter,
8890
auto output_descriptor = toCudnn<cudnnTensorDescriptor_t>(out);
8991

9092
// get convolution algorithm
91-
const int memory_limit =
92-
0; // TODO: set to remaining space in memory manager?
9393
cudnnConvolutionFwdAlgo_t convolution_algorithm;
94-
CUDNN_CHECK(cuda::cudnnGetConvolutionForwardAlgorithm(
95-
cudnn, input_descriptor, filter_descriptor, convolution_descriptor,
96-
output_descriptor, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, memory_limit,
97-
&convolution_algorithm));
98-
99-
// figure out scratch space memory requirements
100-
size_t workspace_bytes;
101-
CUDNN_CHECK(cuda::cudnnGetConvolutionForwardWorkspaceSize(
102-
cudnn, input_descriptor, filter_descriptor, convolution_descriptor,
103-
output_descriptor, convolution_algorithm, &workspace_bytes));
94+
size_t workspace_bytes = 0;
95+
96+
auto version = getCudnnPlugin().getVersion();
97+
if (std::get<0>(version) >= 8) {
98+
int maxAlgoCount = 0;
99+
CUDNN_CHECK(cuda::cudnnGetConvolutionForwardAlgorithmMaxCount(
100+
cudnn, &maxAlgoCount));
101+
102+
vector<cudnnConvolutionFwdAlgoPerf_t> perfResults(maxAlgoCount);
103+
int returnAlgoCount = 0;
104+
CUDNN_CHECK(cuda::cudnnFindConvolutionForwardAlgorithm(
105+
cudnn, input_descriptor, filter_descriptor, convolution_descriptor,
106+
output_descriptor, maxAlgoCount, &returnAlgoCount,
107+
perfResults.data()));
108+
109+
for (int i = 0; i < returnAlgoCount; ++i) {
110+
if (perfResults[i].status == CUDNN_STATUS_SUCCESS) {
111+
convolution_algorithm = perfResults[i].algo;
112+
workspace_bytes = perfResults[i].memory;
113+
break;
114+
}
115+
}
116+
} else {
117+
const int memory_limit =
118+
0; // TODO: set to remaining space in memory manager?
119+
CUDNN_CHECK(cuda::cudnnGetConvolutionForwardAlgorithm(
120+
cudnn, input_descriptor, filter_descriptor, convolution_descriptor,
121+
output_descriptor, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
122+
memory_limit, &convolution_algorithm));
123+
CUDNN_CHECK(cuda::cudnnGetConvolutionForwardWorkspaceSize(
124+
cudnn, input_descriptor, filter_descriptor, convolution_descriptor,
125+
output_descriptor, convolution_algorithm, &workspace_bytes));
126+
}
104127

105128
auto workspace_buffer = memAlloc<char>(workspace_bytes);
106129

@@ -384,19 +407,40 @@ Array<T> filter_gradient_cudnn(const Array<T> &incoming_gradient,
384407

385408
// determine algorithm to use
386409
cudnnConvolutionBwdFilterAlgo_t bwd_filt_convolution_algorithm;
387-
CUDNN_CHECK(cuda::cudnnGetConvolutionBackwardFilterAlgorithm(
388-
cudnn, x_descriptor, dy_descriptor, convolution_descriptor,
389-
dw_descriptor, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0,
390-
&bwd_filt_convolution_algorithm));
391-
392410
// figure out scratch space memory requirements
393-
size_t workspace_bytes;
394-
CUDNN_CHECK(cuda::cudnnGetConvolutionBackwardFilterWorkspaceSize(
395-
cudnn, x_descriptor, dy_descriptor, convolution_descriptor,
396-
dw_descriptor, bwd_filt_convolution_algorithm, &workspace_bytes));
397-
// prepare output array and scratch space
398-
Array<T> out = createEmptyArray<T>(fDims);
411+
size_t workspace_bytes = 0;
412+
413+
auto version = getCudnnPlugin().getVersion();
414+
if (std::get<0>(version) >= 8) {
415+
int maxAlgoCount = 0;
416+
CUDNN_CHECK(cuda::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
417+
cudnn, &maxAlgoCount));
418+
419+
vector<cudnnConvolutionBwdFilterAlgoPerf_t> perfResults(maxAlgoCount);
420+
int returnAlgoCount = 0;
421+
CUDNN_CHECK(cuda::cudnnFindConvolutionBackwardFilterAlgorithm(
422+
cudnn, x_descriptor, dy_descriptor, convolution_descriptor,
423+
dw_descriptor, maxAlgoCount, &returnAlgoCount, perfResults.data()));
424+
425+
for (int i = 0; i < returnAlgoCount; ++i) {
426+
if (perfResults[i].status == CUDNN_STATUS_SUCCESS) {
427+
bwd_filt_convolution_algorithm = perfResults[i].algo;
428+
workspace_bytes = perfResults[i].memory;
429+
break;
430+
}
431+
}
432+
} else {
433+
CUDNN_CHECK(cuda::cudnnGetConvolutionBackwardFilterAlgorithm(
434+
cudnn, x_descriptor, dy_descriptor, convolution_descriptor,
435+
dw_descriptor, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0,
436+
&bwd_filt_convolution_algorithm));
437+
CUDNN_CHECK(cuda::cudnnGetConvolutionBackwardFilterWorkspaceSize(
438+
cudnn, x_descriptor, dy_descriptor, convolution_descriptor,
439+
dw_descriptor, bwd_filt_convolution_algorithm, &workspace_bytes));
440+
}
399441

442+
// prepare output array and scratch space
443+
Array<T> out = createEmptyArray<T>(fDims);
400444
auto workspace_buffer = memAlloc<char>(workspace_bytes);
401445

402446
// perform convolution

src/backend/cuda/cudnn.cpp

Lines changed: 83 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -171,16 +171,16 @@ cudnnStatus_t cudnnGetConvolutionNdForwardOutputDim(
171171
convDesc, inputTensorDesc, filterDesc, nbDims, tensorOuputDimA);
172172
}
173173

174-
cudnnStatus_t cudnnGetConvolutionForwardAlgorithm(
175-
cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
176-
const cudnnFilterDescriptor_t wDesc,
177-
const cudnnConvolutionDescriptor_t convDesc,
178-
const cudnnTensorDescriptor_t yDesc,
179-
cudnnConvolutionFwdPreference_t preference, size_t memoryLimitInBytes,
180-
cudnnConvolutionFwdAlgo_t *algo) {
181-
return getCudnnPlugin().cudnnGetConvolutionForwardAlgorithm(
182-
handle, xDesc, wDesc, convDesc, yDesc, preference, memoryLimitInBytes,
183-
algo);
174+
cudnnStatus_t cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle,
175+
int *count) {
176+
return getCudnnPlugin().cudnnGetConvolutionForwardAlgorithmMaxCount(handle,
177+
count);
178+
}
179+
180+
cudnnStatus_t cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
181+
cudnnHandle_t handle, int *count) {
182+
return getCudnnPlugin().cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
183+
handle, count);
184184
}
185185

186186
cudnnStatus_t cudnnGetConvolutionForwardWorkspaceSize(
@@ -193,16 +193,57 @@ cudnnStatus_t cudnnGetConvolutionForwardWorkspaceSize(
193193
handle, xDesc, wDesc, convDesc, yDesc, algo, sizeInBytes);
194194
}
195195

196-
cudnnStatus_t cudnnConvolutionForward(
197-
cudnnHandle_t handle, const void *alpha,
198-
const cudnnTensorDescriptor_t xDesc, const void *x,
199-
const cudnnFilterDescriptor_t wDesc, const void *w,
200-
const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo,
201-
void *workSpace, size_t workSpaceSizeInBytes, const void *beta,
202-
const cudnnTensorDescriptor_t yDesc, void *y) {
203-
return getCudnnPlugin().cudnnConvolutionForward(
204-
handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace,
205-
workSpaceSizeInBytes, beta, yDesc, y);
196+
cudnnStatus_t cudnnGetConvolutionBackwardFilterWorkspaceSize(
197+
cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
198+
const cudnnTensorDescriptor_t dyDesc,
199+
const cudnnConvolutionDescriptor_t convDesc,
200+
const cudnnFilterDescriptor_t gradDesc,
201+
cudnnConvolutionBwdFilterAlgo_t algo, size_t *sizeInBytes) {
202+
return getCudnnPlugin().cudnnGetConvolutionBackwardFilterWorkspaceSize(
203+
handle, xDesc, dyDesc, convDesc, gradDesc, algo, sizeInBytes);
204+
}
205+
206+
cudnnStatus_t cudnnFindConvolutionForwardAlgorithm(
207+
cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
208+
const cudnnFilterDescriptor_t wDesc,
209+
const cudnnConvolutionDescriptor_t convDesc,
210+
const cudnnTensorDescriptor_t yDesc, const int requestedAlgoCount,
211+
int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) {
212+
return getCudnnPlugin().cudnnFindConvolutionForwardAlgorithm(
213+
handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount,
214+
returnedAlgoCount, perfResults);
215+
}
216+
217+
cudnnStatus_t cudnnFindConvolutionBackwardFilterAlgorithm(
218+
cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
219+
const cudnnTensorDescriptor_t dyDesc,
220+
const cudnnConvolutionDescriptor_t convDesc,
221+
const cudnnFilterDescriptor_t dwDesc, const int requestedAlgoCount,
222+
int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) {
223+
return getCudnnPlugin().cudnnFindConvolutionBackwardFilterAlgorithm(
224+
handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount,
225+
returnedAlgoCount, perfResults);
226+
}
227+
228+
cudnnStatus_t cudnnGetConvolutionForwardAlgorithm(
229+
cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
230+
const cudnnFilterDescriptor_t wDesc,
231+
const cudnnConvolutionDescriptor_t convDesc,
232+
const cudnnTensorDescriptor_t yDesc,
233+
cudnnConvolutionFwdPreference_t preference, size_t memoryLimitInBytes,
234+
cudnnConvolutionFwdAlgo_t *algo) {
235+
auto version = getCudnnPlugin().getVersion();
236+
if (std::get<0>(version) < 8) {
237+
return getCudnnPlugin().cudnnGetConvolutionForwardAlgorithm(
238+
handle, xDesc, wDesc, convDesc, yDesc, preference,
239+
memoryLimitInBytes, algo);
240+
} else {
241+
AF_ERROR(
242+
"cudnnGetConvolutionForwardAlgorithm has been removed since cuDNN "
243+
"8",
244+
AF_ERR_NOT_SUPPORTED);
245+
return CUDNN_STATUS_SUCCESS;
246+
}
206247
}
207248

208249
cudnnStatus_t cudnnGetConvolutionBackwardFilterAlgorithm(
@@ -212,19 +253,30 @@ cudnnStatus_t cudnnGetConvolutionBackwardFilterAlgorithm(
212253
const cudnnFilterDescriptor_t dwDesc,
213254
cudnnConvolutionBwdFilterPreference_t preference, size_t memoryLimitInBytes,
214255
cudnnConvolutionBwdFilterAlgo_t *algo) {
215-
return getCudnnPlugin().cudnnGetConvolutionBackwardFilterAlgorithm(
216-
handle, xDesc, dyDesc, convDesc, dwDesc, preference, memoryLimitInBytes,
217-
algo);
256+
auto version = getCudnnPlugin().getVersion();
257+
if (std::get<0>(version) < 8) {
258+
return getCudnnPlugin().cudnnGetConvolutionBackwardFilterAlgorithm(
259+
handle, xDesc, dyDesc, convDesc, dwDesc, preference,
260+
memoryLimitInBytes, algo);
261+
} else {
262+
AF_ERROR(
263+
"cudnnGetConvolutionBackwardFilterAlgorithm has been removed since "
264+
"cuDNN 8",
265+
AF_ERR_NOT_SUPPORTED);
266+
return CUDNN_STATUS_SUCCESS;
267+
}
218268
}
219269

220-
cudnnStatus_t cudnnGetConvolutionBackwardFilterWorkspaceSize(
221-
cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
222-
const cudnnTensorDescriptor_t dyDesc,
223-
const cudnnConvolutionDescriptor_t convDesc,
224-
const cudnnFilterDescriptor_t gradDesc,
225-
cudnnConvolutionBwdFilterAlgo_t algo, size_t *sizeInBytes) {
226-
return getCudnnPlugin().cudnnGetConvolutionBackwardFilterWorkspaceSize(
227-
handle, xDesc, dyDesc, convDesc, gradDesc, algo, sizeInBytes);
270+
cudnnStatus_t cudnnConvolutionForward(
271+
cudnnHandle_t handle, const void *alpha,
272+
const cudnnTensorDescriptor_t xDesc, const void *x,
273+
const cudnnFilterDescriptor_t wDesc, const void *w,
274+
const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo,
275+
void *workSpace, size_t workSpaceSizeInBytes, const void *beta,
276+
const cudnnTensorDescriptor_t yDesc, void *y) {
277+
return getCudnnPlugin().cudnnConvolutionForward(
278+
handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace,
279+
workSpaceSizeInBytes, beta, yDesc, y);
228280
}
229281

230282
cudnnStatus_t cudnnConvolutionBackwardFilter(

src/backend/cuda/cudnn.hpp

Lines changed: 39 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,40 @@ cudnnStatus_t cudnnGetConvolutionNdForwardOutputDim(
116116
const cudnnFilterDescriptor_t filterDesc, int nbDims,
117117
int tensorOuputDimA[]);
118118

119+
cudnnStatus_t cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle,
120+
int *count);
121+
122+
cudnnStatus_t cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
123+
cudnnHandle_t handle, int *count);
124+
125+
cudnnStatus_t cudnnGetConvolutionForwardWorkspaceSize(
126+
cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
127+
const cudnnFilterDescriptor_t wDesc,
128+
const cudnnConvolutionDescriptor_t convDesc,
129+
const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo,
130+
size_t *sizeInBytes);
131+
132+
cudnnStatus_t cudnnGetConvolutionBackwardFilterWorkspaceSize(
133+
cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
134+
const cudnnTensorDescriptor_t dyDesc,
135+
const cudnnConvolutionDescriptor_t convDesc,
136+
const cudnnFilterDescriptor_t gradDesc,
137+
cudnnConvolutionBwdFilterAlgo_t algo, size_t *sizeInBytes);
138+
139+
cudnnStatus_t cudnnFindConvolutionForwardAlgorithm(
140+
cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
141+
const cudnnFilterDescriptor_t wDesc,
142+
const cudnnConvolutionDescriptor_t convDesc,
143+
const cudnnTensorDescriptor_t yDesc, const int requestedAlgoCount,
144+
int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults);
145+
146+
cudnnStatus_t cudnnFindConvolutionBackwardFilterAlgorithm(
147+
cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
148+
const cudnnTensorDescriptor_t dyDesc,
149+
const cudnnConvolutionDescriptor_t convDesc,
150+
const cudnnFilterDescriptor_t dwDesc, const int requestedAlgoCount,
151+
int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults);
152+
119153
cudnnStatus_t cudnnGetConvolutionForwardAlgorithm(
120154
cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
121155
const cudnnFilterDescriptor_t wDesc,
@@ -124,12 +158,13 @@ cudnnStatus_t cudnnGetConvolutionForwardAlgorithm(
124158
cudnnConvolutionFwdPreference_t preference, size_t memoryLimitInBytes,
125159
cudnnConvolutionFwdAlgo_t *algo);
126160

127-
cudnnStatus_t cudnnGetConvolutionForwardWorkspaceSize(
161+
cudnnStatus_t cudnnGetConvolutionBackwardFilterAlgorithm(
128162
cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
129-
const cudnnFilterDescriptor_t wDesc,
163+
const cudnnTensorDescriptor_t dyDesc,
130164
const cudnnConvolutionDescriptor_t convDesc,
131-
const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo,
132-
size_t *sizeInBytes);
165+
const cudnnFilterDescriptor_t dwDesc,
166+
cudnnConvolutionBwdFilterPreference_t preference, size_t memoryLimitInBytes,
167+
cudnnConvolutionBwdFilterAlgo_t *algo);
133168

134169
cudnnStatus_t cudnnConvolutionForward(
135170
cudnnHandle_t handle, const void *alpha,
@@ -139,21 +174,6 @@ cudnnStatus_t cudnnConvolutionForward(
139174
void *workSpace, size_t workSpaceSizeInBytes, const void *beta,
140175
const cudnnTensorDescriptor_t yDesc, void *y);
141176

142-
cudnnStatus_t cudnnGetConvolutionBackwardFilterAlgorithm(
143-
cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
144-
const cudnnTensorDescriptor_t dyDesc,
145-
const cudnnConvolutionDescriptor_t convDesc,
146-
const cudnnFilterDescriptor_t dwDesc,
147-
cudnnConvolutionBwdFilterPreference_t preference, size_t memoryLimitInBytes,
148-
cudnnConvolutionBwdFilterAlgo_t *algo);
149-
150-
cudnnStatus_t cudnnGetConvolutionBackwardFilterWorkspaceSize(
151-
cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc,
152-
const cudnnTensorDescriptor_t dyDesc,
153-
const cudnnConvolutionDescriptor_t convDesc,
154-
const cudnnFilterDescriptor_t gradDesc,
155-
cudnnConvolutionBwdFilterAlgo_t algo, size_t *sizeInBytes);
156-
157177
cudnnStatus_t cudnnConvolutionBackwardFilter(
158178
cudnnHandle_t handle, const void *alpha,
159179
const cudnnTensorDescriptor_t xDesc, const void *x,

0 commit comments

Comments
 (0)