Skip to content

Commit 436d7e4

Browse files
committed
add depthwise convolution kernel
Signed-off-by: Li Peng <peng.li@intel.com>
1 parent 910d7da commit 436d7e4

File tree

3 files changed

+160
-4
lines changed

3 files changed

+160
-4
lines changed

modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,9 @@ class OCL4DNNConvSpatial
215215
bool createGEMMLikeConvKernel(int32_t blockWidth,
216216
int32_t blockHeight,
217217
int32_t blockDepth);
218+
bool createDWConvKernel(int32_t blockWidth,
219+
int32_t blockHeight,
220+
int32_t blockDepth);
218221
void CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
219222
int32_t offset, int32_t size, bool write_only);
220223
bool convolve(const UMat &bottom, UMat &top,
@@ -282,6 +285,8 @@ class OCL4DNNConvSpatial
282285
int32_t M_;
283286

284287
bool tuned_;
288+
bool dwconv_;
289+
285290
std::string key_, key_sanitized_;
286291
std::string short_key_;
287292
std::string kernel_name_;

modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp

Lines changed: 98 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
103103
top_dim_ = num_output_ * output_w_ * output_h_;
104104

105105
cache_path_ = utils::getConfigurationParameterString("OPENCV_OCL4DNN_CONFIG_PATH", "");
106+
dwconv_ = (num_output_ == channels_ && channels_ == group_);
106107

107108
use_cache_path_ = false;
108109
if (!cache_path_.empty())
@@ -203,7 +204,8 @@ void OCL4DNNConvSpatial<Dtype>::collectCommonInformation()
203204
typedef enum {
204205
KERNEL_TYPE_INTEL_IDLF = 2,
205206
KERNEL_TYPE_BASIC = 4,
206-
KERNEL_TYPE_GEMM_LIKE = 5
207+
KERNEL_TYPE_GEMM_LIKE = 5,
208+
KERNEL_TYPE_DWCONV = 6
207209
} ocl4dnnConvSpatialKernelType_t;
208210

209211
template<typename Dtype>
@@ -313,6 +315,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
313315
if (clOptionSupport("-cl-no-subgroup-ifp"))
314316
options_ << " -cl-no-subgroup-ifp ";
315317

318+
addDef("KERNEL_GEMM_LIKE");
316319
addDef("INPUT_DEPTH", channels_);
317320
addDef("WIDTH1", M_);
318321
addDef("OUT_PADDING_LEFT", 0);
@@ -329,6 +332,28 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
329332
setFusionDefine(fused_activ_, fused_eltwise_);
330333
src_ = ocl::dnn::conv_layer_spatial_oclsrc;
331334
}
335+
else if (kernelType == KERNEL_TYPE_DWCONV)
336+
{
337+
kernelUKey = generateSpecificKey(KERNEL_TYPE_DWCONV, blockM, blockK, blockN);
338+
kernel_name_ = "DWCONV_";
339+
kernel_name_ += kernelUKey.c_str();
340+
341+
options_ << " -cl-fast-relaxed-math ";
342+
if (clOptionSupport("-cl-no-subgroup-ifp"))
343+
options_ << " -cl-no-subgroup-ifp ";
344+
345+
addDef("KERNEL_DWCONV");
346+
addDef("KERNEL_SIZE", kernel_w_ * kernel_h_);
347+
addDef("KERNEL_W", kernel_w_);
348+
addDef("KERNEL_H", kernel_h_);
349+
addDef("APPLY_BIAS", bias_term_);
350+
addDef("OUTPUT_Z", num_output_ * num_);
351+
addDef("CHANNELS", num_output_);
352+
setFusionDefine(fused_activ_, fused_eltwise_);
353+
354+
options_ << " -D DWCONV=" << kernel_name_;
355+
src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
356+
}
332357
}
333358

334359
template<typename Dtype>
@@ -906,6 +931,33 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
906931
return false;
907932
}
908933
}
934+
} else if (config->kernelType == KERNEL_TYPE_DWCONV) {
935+
ocl::Kernel kernel(config->kernelName.c_str(), program);
936+
if (kernel.empty())
937+
return false;
938+
939+
cl_uint argIdx = 0;
940+
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
941+
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
942+
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
943+
if (bias_term_)
944+
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
945+
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
946+
kernel.set(argIdx++, (uint16_t)width_);
947+
kernel.set(argIdx++, (uint16_t)height_);
948+
kernel.set(argIdx++, (uint16_t)output_w_);
949+
kernel.set(argIdx++, (uint16_t)output_h_);
950+
951+
size_t global_size[3];
952+
global_size[0] = output_w_;
953+
global_size[1] = output_h_;
954+
global_size[2] = num_output_ * num_;
955+
956+
if (!kernel.run(3, global_size, NULL, false))
957+
{
958+
std::cout << "DWCONV kernel run failed." << std::endl;
959+
return false;
960+
}
909961
} else {
910962
for (int32_t n = 0; n < numImages; ++n) {
911963
for (int32_t g = 0; g < group_; ++g) {
@@ -1222,6 +1274,39 @@ bool OCL4DNNConvSpatial<float>::createIDLFKernel(int32_t blockWidth,
12221274
return false;
12231275
}
12241276

1277+
template<>
1278+
bool OCL4DNNConvSpatial<float>::createDWConvKernel(int32_t blockWidth,
1279+
int32_t blockHeight,
1280+
int32_t blockDepth)
1281+
{
1282+
if (!dwconv_)
1283+
return false;
1284+
1285+
int workItemOutput[3] = { 1, 1, 1 };
1286+
size_t local_size[3] = { 1, 1, 1 };
1287+
size_t global_size[3];
1288+
global_size[0] = divUp(output_w_, workItemOutput[0]);
1289+
global_size[1] = divUp(output_h_, workItemOutput[1]);
1290+
global_size[2] = divUp(M_ * num_, workItemOutput[2]);
1291+
1292+
kernelType_ = KERNEL_TYPE_DWCONV;
1293+
blockM_ = blockWidth;
1294+
blockK_ = blockHeight;
1295+
blockN_ = blockDepth;
1296+
1297+
setupKernel();
1298+
1299+
ocl::Program program = compileKernel();
1300+
if (program.ptr())
1301+
{
1302+
kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &global_size[0], &local_size[0],
1303+
&workItemOutput[0], false, KERNEL_TYPE_DWCONV));
1304+
return true;
1305+
}
1306+
else
1307+
return false;
1308+
}
1309+
12251310
template<>
12261311
bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
12271312
int32_t blockWidth,
@@ -1238,6 +1323,8 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
12381323
return createBasicKernel(blockWidth, blockHeight, blockDepth);
12391324
else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
12401325
return createGEMMLikeConvKernel(blockWidth, blockHeight, blockDepth);
1326+
else if (kernelType == KERNEL_TYPE_DWCONV)
1327+
return createDWConvKernel(blockWidth, blockHeight, blockDepth);
12411328
else
12421329
CV_Assert(0 && "Internal error");
12431330
return false;
@@ -1246,7 +1333,16 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
12461333
template<>
12471334
void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
12481335
{
1249-
if (ocl::Device::getDefault().intelSubgroupsSupport()) {
1336+
if (ocl::Device::getDefault().intelSubgroupsSupport())
1337+
{
1338+
//depth_wise kernels
1339+
if (dwconv_)
1340+
{
1341+
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_DWCONV, 1, 1, 1));
1342+
if (group_ > 8)
1343+
return;
1344+
}
1345+
12501346
/* IDLF kernels are using Intel specific extension which make
12511347
them intel only. */
12521348
// Generates static key_

modules/dnn/src/opencl/conv_layer_spatial.cl

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -383,7 +383,7 @@ convolve_simd(
383383
}
384384
}
385385

386-
#else // KERNEL_GEMM_LIKE
386+
#elif defined KERNEL_GEMM_LIKE
387387

388388
#if APPLY_BIAS
389389
// Dtype bias[4];
@@ -1501,4 +1501,59 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
15011501
INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0);
15021502
}
15031503
#endif
1504-
#endif // KERNEL_BASIC/IDLF/GEMM_LIKE
1504+
1505+
#elif defined KERNEL_DWCONV
1506+
1507+
__kernel void DWCONV(
1508+
ELTWISE_DATA_ARG
1509+
NEGATIVE_SLOPE_ARG
1510+
__global Dtype* image_data,
1511+
__global Dtype* kernel_data,
1512+
BIAS_KERNEL_ARG
1513+
__global Dtype* convolved_image,
1514+
const ushort input_width,
1515+
const ushort input_height,
1516+
const ushort output_width,
1517+
const ushort output_height) {
1518+
1519+
const int outputX = get_global_id(0);
1520+
const int outputY = get_global_id(1);
1521+
const int outputZ = get_global_id(2);
1522+
if(outputX < output_width && outputY < output_height)
1523+
{
1524+
Dtype sum = 0.;
1525+
1526+
const int org_y = outputY * STRIDE_Y - INPUT_PAD_H;
1527+
const int org_x = outputX * STRIDE_X - INPUT_PAD_W;
1528+
const int currentKernelOffset = KERNEL_SIZE*(outputZ%CHANNELS);
1529+
const int biasIndex=outputZ%CHANNELS;
1530+
const int local_image_offset = org_y*input_width + org_x;
1531+
const int imageSize = input_width*input_height;
1532+
1533+
__global Dtype* image_dataPtrFloat = (image_data + (imageSize*outputZ + local_image_offset));
1534+
__global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset));
1535+
1536+
for(int y = 0; y < KERNEL_H; y++)
1537+
{
1538+
for(int x = 0; x < KERNEL_W; x++)
1539+
{
1540+
if(!(org_y + y * DILATION_Y >= 0 && org_y + y * DILATION_Y < input_height && org_x + x * DILATION_X >= 0 && org_x + x * DILATION_X < input_width))
1541+
{
1542+
continue;
1543+
}
1544+
sum += image_dataPtrFloat[x * DILATION_X] * kernel_dataPtrFloat[x];
1545+
}
1546+
image_dataPtrFloat += input_width * DILATION_Y;
1547+
kernel_dataPtrFloat += KERNEL_W;
1548+
}
1549+
1550+
#if APPLY_BIAS
1551+
int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
1552+
ACTIVATION_FUNCTION(convolved_image, offset, sum + biases_base[biasIndex], biasIndex);
1553+
#else
1554+
int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
1555+
ACTIVATION_FUNCTION(convolved_image, offset, sum, biasIndex);
1556+
#endif
1557+
}
1558+
}
1559+
#endif // KERNEL_BASIC/IDLF/GEMM_LIKE/DWCONV

0 commit comments

Comments
 (0)