@@ -103,6 +103,7 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
103
103
top_dim_ = num_output_ * output_w_ * output_h_;
104
104
105
105
cache_path_ = utils::getConfigurationParameterString (" OPENCV_OCL4DNN_CONFIG_PATH" , " " );
106
+ dwconv_ = (num_output_ == channels_ && channels_ == group_);
106
107
107
108
use_cache_path_ = false ;
108
109
if (!cache_path_.empty ())
@@ -203,7 +204,8 @@ void OCL4DNNConvSpatial<Dtype>::collectCommonInformation()
203
204
// Identifiers for the convolution kernel families this file can build.
// createConvolutionKernel() dispatches on these values to the matching
// create*Kernel() routine, and setupKernelDetails() passes the value to
// generateSpecificKey() when composing a kernel name.
// NOTE(review): because the value feeds generateSpecificKey(), renumbering
// presumably changes generated kernel keys -- confirm before editing values.
typedef enum {
204
205
KERNEL_TYPE_INTEL_IDLF = 2 ,
205
206
KERNEL_TYPE_BASIC = 4 ,
206
- KERNEL_TYPE_GEMM_LIKE = 5
207
+ KERNEL_TYPE_GEMM_LIKE = 5 ,
208
// Depth-wise convolution kernel; generateTunerItems() only queues it when
// dwconv_ is set.
+ KERNEL_TYPE_DWCONV = 6
207
209
} ocl4dnnConvSpatialKernelType_t;
208
210
209
211
template <typename Dtype>
@@ -313,6 +315,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
313
315
if (clOptionSupport (" -cl-no-subgroup-ifp" ))
314
316
options_ << " -cl-no-subgroup-ifp " ;
315
317
318
+ addDef (" KERNEL_GEMM_LIKE" );
316
319
addDef (" INPUT_DEPTH" , channels_);
317
320
addDef (" WIDTH1" , M_);
318
321
addDef (" OUT_PADDING_LEFT" , 0 );
@@ -329,6 +332,28 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
329
332
setFusionDefine (fused_activ_, fused_eltwise_);
330
333
src_ = ocl::dnn::conv_layer_spatial_oclsrc;
331
334
}
335
+ else if (kernelType == KERNEL_TYPE_DWCONV)
336
+ {
337
+ kernelUKey = generateSpecificKey (KERNEL_TYPE_DWCONV, blockM, blockK, blockN);
338
+ kernel_name_ = " DWCONV_" ;
339
+ kernel_name_ += kernelUKey.c_str ();
340
+
341
+ options_ << " -cl-fast-relaxed-math " ;
342
+ if (clOptionSupport (" -cl-no-subgroup-ifp" ))
343
+ options_ << " -cl-no-subgroup-ifp " ;
344
+
345
+ addDef (" KERNEL_DWCONV" );
346
+ addDef (" KERNEL_SIZE" , kernel_w_ * kernel_h_);
347
+ addDef (" KERNEL_W" , kernel_w_);
348
+ addDef (" KERNEL_H" , kernel_h_);
349
+ addDef (" APPLY_BIAS" , bias_term_);
350
+ addDef (" OUTPUT_Z" , num_output_ * num_);
351
+ addDef (" CHANNELS" , num_output_);
352
+ setFusionDefine (fused_activ_, fused_eltwise_);
353
+
354
+ options_ << " -D DWCONV=" << kernel_name_;
355
+ src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
356
+ }
332
357
}
333
358
334
359
template <typename Dtype>
@@ -906,6 +931,33 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
906
931
return false ;
907
932
}
908
933
}
934
+ } else if (config->kernelType == KERNEL_TYPE_DWCONV) {
935
+ ocl::Kernel kernel (config->kernelName .c_str (), program);
936
+ if (kernel.empty ())
937
+ return false ;
938
+
939
+ cl_uint argIdx = 0 ;
940
+ setFusionArg (fused_activ_, fused_eltwise_, kernel, argIdx);
941
+ kernel.set (argIdx++, ocl::KernelArg::PtrReadOnly (bottom));
942
+ kernel.set (argIdx++, ocl::KernelArg::PtrReadOnly (weight));
943
+ if (bias_term_)
944
+ kernel.set (argIdx++, ocl::KernelArg::PtrReadOnly (bias));
945
+ kernel.set (argIdx++, ocl::KernelArg::PtrWriteOnly (top));
946
+ kernel.set (argIdx++, (uint16_t )width_);
947
+ kernel.set (argIdx++, (uint16_t )height_);
948
+ kernel.set (argIdx++, (uint16_t )output_w_);
949
+ kernel.set (argIdx++, (uint16_t )output_h_);
950
+
951
+ size_t global_size[3 ];
952
+ global_size[0 ] = output_w_;
953
+ global_size[1 ] = output_h_;
954
+ global_size[2 ] = num_output_ * num_;
955
+
956
+ if (!kernel.run (3 , global_size, NULL , false ))
957
+ {
958
+ std::cout << " DWCONV kernel run failed." << std::endl;
959
+ return false ;
960
+ }
909
961
} else {
910
962
for (int32_t n = 0 ; n < numImages; ++n) {
911
963
for (int32_t g = 0 ; g < group_; ++g) {
@@ -1222,6 +1274,39 @@ bool OCL4DNNConvSpatial<float>::createIDLFKernel(int32_t blockWidth,
1222
1274
return false ;
1223
1275
}
1224
1276
1277
+ template <>
1278
+ bool OCL4DNNConvSpatial<float >::createDWConvKernel(int32_t blockWidth,
1279
+ int32_t blockHeight,
1280
+ int32_t blockDepth)
1281
+ {
1282
+ if (!dwconv_)
1283
+ return false ;
1284
+
1285
+ int workItemOutput[3 ] = { 1 , 1 , 1 };
1286
+ size_t local_size[3 ] = { 1 , 1 , 1 };
1287
+ size_t global_size[3 ];
1288
+ global_size[0 ] = divUp (output_w_, workItemOutput[0 ]);
1289
+ global_size[1 ] = divUp (output_h_, workItemOutput[1 ]);
1290
+ global_size[2 ] = divUp (M_ * num_, workItemOutput[2 ]);
1291
+
1292
+ kernelType_ = KERNEL_TYPE_DWCONV;
1293
+ blockM_ = blockWidth;
1294
+ blockK_ = blockHeight;
1295
+ blockN_ = blockDepth;
1296
+
1297
+ setupKernel ();
1298
+
1299
+ ocl::Program program = compileKernel ();
1300
+ if (program.ptr ())
1301
+ {
1302
+ kernelQueue.push_back (makePtr<kernelConfig>(kernel_name_, &global_size[0 ], &local_size[0 ],
1303
+ &workItemOutput[0 ], false , KERNEL_TYPE_DWCONV));
1304
+ return true ;
1305
+ }
1306
+ else
1307
+ return false ;
1308
+ }
1309
+
1225
1310
template <>
1226
1311
bool OCL4DNNConvSpatial<float >::createConvolutionKernel(int32_t kernelType,
1227
1312
int32_t blockWidth,
@@ -1238,6 +1323,8 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
1238
1323
return createBasicKernel (blockWidth, blockHeight, blockDepth);
1239
1324
else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
1240
1325
return createGEMMLikeConvKernel (blockWidth, blockHeight, blockDepth);
1326
+ else if (kernelType == KERNEL_TYPE_DWCONV)
1327
+ return createDWConvKernel (blockWidth, blockHeight, blockDepth);
1241
1328
else
1242
1329
CV_Assert (0 && " Internal error" );
1243
1330
return false ;
@@ -1246,7 +1333,16 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
1246
1333
template <>
1247
1334
void OCL4DNNConvSpatial<float >::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
1248
1335
{
1249
- if (ocl::Device::getDefault ().intelSubgroupsSupport ()) {
1336
+ if (ocl::Device::getDefault ().intelSubgroupsSupport ())
1337
+ {
1338
+ // depth_wise kernels
1339
+ if (dwconv_)
1340
+ {
1341
+ tunerItems.push_back (makePtr<tunerParam>(KERNEL_TYPE_DWCONV, 1 , 1 , 1 ));
1342
+ if (group_ > 8 )
1343
+ return ;
1344
+ }
1345
+
1250
1346
/* IDLF kernels are using Intel specific extension which make
1251
1347
them intel only. */
1252
1348
// Generates static key_
0 commit comments