Skip to content

Commit dcdd6af

Browse files
committed
Merge pull request opencv#10341 from pengli:dnn
2 parents badc3bd + 3b84acf commit dcdd6af

File tree

6 files changed

+453
-4
lines changed

6 files changed

+453
-4
lines changed

modules/dnn/src/layers/prior_box_layer.cpp

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
#include <float.h>
4646
#include <algorithm>
4747
#include <cmath>
48+
#include "opencl_kernels_dnn.hpp"
4849

4950
namespace cv
5051
{
@@ -270,11 +271,108 @@ class PriorBoxLayerImpl : public PriorBoxLayer
270271
return false;
271272
}
272273

274+
#ifdef HAVE_OPENCL
275+
bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
276+
{
277+
std::vector<UMat> inputs;
278+
std::vector<UMat> outputs;
279+
280+
inps.getUMatVector(inputs);
281+
outs.getUMatVector(outputs);
282+
283+
int _layerWidth = inputs[0].size[3];
284+
int _layerHeight = inputs[0].size[2];
285+
286+
int _imageWidth = inputs[1].size[3];
287+
int _imageHeight = inputs[1].size[2];
288+
289+
float stepX, stepY;
290+
if (_stepX == 0 || _stepY == 0)
291+
{
292+
stepX = static_cast<float>(_imageWidth) / _layerWidth;
293+
stepY = static_cast<float>(_imageHeight) / _layerHeight;
294+
} else {
295+
stepX = _stepX;
296+
stepY = _stepY;
297+
}
298+
299+
if (umat_offsetsX.empty())
300+
{
301+
Mat offsetsX(1, _offsetsX.size(), CV_32FC1, &_offsetsX[0]);
302+
Mat offsetsY(1, _offsetsX.size(), CV_32FC1, &_offsetsY[0]);
303+
Mat aspectRatios(1, _aspectRatios.size(), CV_32FC1, &_aspectRatios[0]);
304+
Mat variance(1, _variance.size(), CV_32FC1, &_variance[0]);
305+
306+
offsetsX.copyTo(umat_offsetsX);
307+
offsetsY.copyTo(umat_offsetsY);
308+
aspectRatios.copyTo(umat_aspectRatios);
309+
variance.copyTo(umat_variance);
310+
311+
int real_numPriors = _numPriors / pow(2, _offsetsX.size() - 1);
312+
umat_scales = UMat(1, &real_numPriors, CV_32F, 1.0f);
313+
}
314+
315+
size_t nthreads = _layerHeight * _layerWidth;
316+
317+
ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc);
318+
kernel.set(0, (int)nthreads);
319+
kernel.set(1, (float)stepX);
320+
kernel.set(2, (float)stepY);
321+
kernel.set(3, (float)_minSize);
322+
kernel.set(4, (float)_maxSize);
323+
kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_offsetsX));
324+
kernel.set(6, ocl::KernelArg::PtrReadOnly(umat_offsetsY));
325+
kernel.set(7, (int)_offsetsX.size());
326+
kernel.set(8, ocl::KernelArg::PtrReadOnly(umat_aspectRatios));
327+
kernel.set(9, (int)_aspectRatios.size());
328+
kernel.set(10, ocl::KernelArg::PtrReadOnly(umat_scales));
329+
kernel.set(11, ocl::KernelArg::PtrWriteOnly(outputs[0]));
330+
kernel.set(12, (int)_layerHeight);
331+
kernel.set(13, (int)_layerWidth);
332+
kernel.set(14, (int)_imageHeight);
333+
kernel.set(15, (int)_imageWidth);
334+
kernel.run(1, &nthreads, NULL, false);
335+
336+
// clip the prior's coordidate such that it is within [0, 1]
337+
if (_clip)
338+
{
339+
Mat mat = outputs[0].getMat(ACCESS_READ);
340+
int aspect_count = (_maxSize > 0) ? 1 : 0;
341+
int offset = nthreads * 4 * _offsetsX.size() * (1 + aspect_count + _aspectRatios.size());
342+
float* outputPtr = mat.ptr<float>() + offset;
343+
int _outChannelSize = _layerHeight * _layerWidth * _numPriors * 4;
344+
for (size_t d = 0; d < _outChannelSize; ++d)
345+
{
346+
outputPtr[d] = std::min<float>(std::max<float>(outputPtr[d], 0.), 1.);
347+
}
348+
}
349+
350+
// set the variance.
351+
{
352+
ocl::Kernel kernel("set_variance", ocl::dnn::prior_box_oclsrc);
353+
int offset = total(shape(outputs[0]), 2);
354+
size_t nthreads = _layerHeight * _layerWidth * _numPriors;
355+
kernel.set(0, (int)nthreads);
356+
kernel.set(1, (int)offset);
357+
kernel.set(2, (int)_variance.size());
358+
kernel.set(3, ocl::KernelArg::PtrReadOnly(umat_variance));
359+
kernel.set(4, ocl::KernelArg::PtrWriteOnly(outputs[0]));
360+
if (!kernel.run(1, &nthreads, NULL, false))
361+
return false;
362+
}
363+
return true;
364+
}
365+
#endif
366+
273367
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
274368
{
275369
CV_TRACE_FUNCTION();
276370
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
277371

372+
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
373+
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
374+
forward_ocl(inputs_arr, outputs_arr, internals_arr))
375+
278376
Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
279377
}
280378

@@ -441,6 +539,14 @@ class PriorBoxLayerImpl : public PriorBoxLayer
441539
std::vector<float> _offsetsX;
442540
std::vector<float> _offsetsY;
443541

542+
#ifdef HAVE_OPENCL
543+
UMat umat_offsetsX;
544+
UMat umat_offsetsY;
545+
UMat umat_aspectRatios;
546+
UMat umat_scales;
547+
UMat umat_variance;
548+
#endif
549+
444550
bool _flip;
445551
bool _clip;
446552
bool _explicitSizes;

modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,9 @@ class OCL4DNNConvSpatial
215215
bool createGEMMLikeConvKernel(int32_t blockWidth,
216216
int32_t blockHeight,
217217
int32_t blockDepth);
218+
bool createDWConvKernel(int32_t blockWidth,
219+
int32_t blockHeight,
220+
int32_t blockDepth);
218221
void CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
219222
int32_t offset, int32_t size, bool write_only);
220223
bool convolve(const UMat &bottom, UMat &top,
@@ -282,6 +285,8 @@ class OCL4DNNConvSpatial
282285
int32_t M_;
283286

284287
bool tuned_;
288+
bool dwconv_;
289+
285290
std::string key_, key_sanitized_;
286291
std::string short_key_;
287292
std::string kernel_name_;

modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp

Lines changed: 98 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
103103
top_dim_ = num_output_ * output_w_ * output_h_;
104104

105105
cache_path_ = utils::getConfigurationParameterString("OPENCV_OCL4DNN_CONFIG_PATH", "");
106+
dwconv_ = (num_output_ == channels_ && channels_ == group_);
106107

107108
use_cache_path_ = false;
108109
if (!cache_path_.empty())
@@ -203,7 +204,8 @@ void OCL4DNNConvSpatial<Dtype>::collectCommonInformation()
203204
typedef enum {
204205
KERNEL_TYPE_INTEL_IDLF = 2,
205206
KERNEL_TYPE_BASIC = 4,
206-
KERNEL_TYPE_GEMM_LIKE = 5
207+
KERNEL_TYPE_GEMM_LIKE = 5,
208+
KERNEL_TYPE_DWCONV = 6
207209
} ocl4dnnConvSpatialKernelType_t;
208210

209211
template<typename Dtype>
@@ -313,6 +315,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
313315
if (clOptionSupport("-cl-no-subgroup-ifp"))
314316
options_ << " -cl-no-subgroup-ifp ";
315317

318+
addDef("KERNEL_GEMM_LIKE");
316319
addDef("INPUT_DEPTH", channels_);
317320
addDef("WIDTH1", M_);
318321
addDef("OUT_PADDING_LEFT", 0);
@@ -329,6 +332,28 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
329332
setFusionDefine(fused_activ_, fused_eltwise_);
330333
src_ = ocl::dnn::conv_layer_spatial_oclsrc;
331334
}
335+
else if (kernelType == KERNEL_TYPE_DWCONV)
336+
{
337+
kernelUKey = generateSpecificKey(KERNEL_TYPE_DWCONV, blockM, blockK, blockN);
338+
kernel_name_ = "DWCONV_";
339+
kernel_name_ += kernelUKey.c_str();
340+
341+
options_ << " -cl-fast-relaxed-math ";
342+
if (clOptionSupport("-cl-no-subgroup-ifp"))
343+
options_ << " -cl-no-subgroup-ifp ";
344+
345+
addDef("KERNEL_DWCONV");
346+
addDef("KERNEL_SIZE", kernel_w_ * kernel_h_);
347+
addDef("KERNEL_W", kernel_w_);
348+
addDef("KERNEL_H", kernel_h_);
349+
addDef("APPLY_BIAS", bias_term_);
350+
addDef("OUTPUT_Z", num_output_ * num_);
351+
addDef("CHANNELS", num_output_);
352+
setFusionDefine(fused_activ_, fused_eltwise_);
353+
354+
options_ << " -D DWCONV=" << kernel_name_;
355+
src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
356+
}
332357
}
333358

334359
template<typename Dtype>
@@ -906,6 +931,33 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
906931
return false;
907932
}
908933
}
934+
} else if (config->kernelType == KERNEL_TYPE_DWCONV) {
935+
ocl::Kernel kernel(config->kernelName.c_str(), program);
936+
if (kernel.empty())
937+
return false;
938+
939+
cl_uint argIdx = 0;
940+
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
941+
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
942+
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
943+
if (bias_term_)
944+
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
945+
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
946+
kernel.set(argIdx++, (uint16_t)width_);
947+
kernel.set(argIdx++, (uint16_t)height_);
948+
kernel.set(argIdx++, (uint16_t)output_w_);
949+
kernel.set(argIdx++, (uint16_t)output_h_);
950+
951+
size_t global_size[3];
952+
global_size[0] = output_w_;
953+
global_size[1] = output_h_;
954+
global_size[2] = num_output_ * num_;
955+
956+
if (!kernel.run(3, global_size, NULL, false))
957+
{
958+
std::cout << "DWCONV kernel run failed." << std::endl;
959+
return false;
960+
}
909961
} else {
910962
for (int32_t n = 0; n < numImages; ++n) {
911963
for (int32_t g = 0; g < group_; ++g) {
@@ -1222,6 +1274,39 @@ bool OCL4DNNConvSpatial<float>::createIDLFKernel(int32_t blockWidth,
12221274
return false;
12231275
}
12241276

1277+
template<>
1278+
bool OCL4DNNConvSpatial<float>::createDWConvKernel(int32_t blockWidth,
1279+
int32_t blockHeight,
1280+
int32_t blockDepth)
1281+
{
1282+
if (!dwconv_)
1283+
return false;
1284+
1285+
int workItemOutput[3] = { 1, 1, 1 };
1286+
size_t local_size[3] = { 1, 1, 1 };
1287+
size_t global_size[3];
1288+
global_size[0] = divUp(output_w_, workItemOutput[0]);
1289+
global_size[1] = divUp(output_h_, workItemOutput[1]);
1290+
global_size[2] = divUp(M_ * num_, workItemOutput[2]);
1291+
1292+
kernelType_ = KERNEL_TYPE_DWCONV;
1293+
blockM_ = blockWidth;
1294+
blockK_ = blockHeight;
1295+
blockN_ = blockDepth;
1296+
1297+
setupKernel();
1298+
1299+
ocl::Program program = compileKernel();
1300+
if (program.ptr())
1301+
{
1302+
kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &global_size[0], &local_size[0],
1303+
&workItemOutput[0], false, KERNEL_TYPE_DWCONV));
1304+
return true;
1305+
}
1306+
else
1307+
return false;
1308+
}
1309+
12251310
template<>
12261311
bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
12271312
int32_t blockWidth,
@@ -1238,6 +1323,8 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
12381323
return createBasicKernel(blockWidth, blockHeight, blockDepth);
12391324
else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
12401325
return createGEMMLikeConvKernel(blockWidth, blockHeight, blockDepth);
1326+
else if (kernelType == KERNEL_TYPE_DWCONV)
1327+
return createDWConvKernel(blockWidth, blockHeight, blockDepth);
12411328
else
12421329
CV_Assert(0 && "Internal error");
12431330
return false;
@@ -1246,7 +1333,16 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
12461333
template<>
12471334
void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
12481335
{
1249-
if (ocl::Device::getDefault().intelSubgroupsSupport()) {
1336+
if (ocl::Device::getDefault().intelSubgroupsSupport())
1337+
{
1338+
//depth_wise kernels
1339+
if (dwconv_)
1340+
{
1341+
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_DWCONV, 1, 1, 1));
1342+
if (group_ > 8)
1343+
return;
1344+
}
1345+
12501346
/* IDLF kernels are using Intel specific extension which make
12511347
them intel only. */
12521348
// Generates static key_

modules/dnn/src/opencl/conv_layer_spatial.cl

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -383,7 +383,7 @@ convolve_simd(
383383
}
384384
}
385385

386-
#else // KERNEL_GEMM_LIKE
386+
#elif defined KERNEL_GEMM_LIKE
387387

388388
#if APPLY_BIAS
389389
// Dtype bias[4];
@@ -1501,4 +1501,59 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
15011501
INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0);
15021502
}
15031503
#endif
1504-
#endif // KERNEL_BASIC/IDLF/GEMM_LIKE
1504+
1505+
#elif defined KERNEL_DWCONV
1506+
1507+
__kernel void DWCONV(
1508+
ELTWISE_DATA_ARG
1509+
NEGATIVE_SLOPE_ARG
1510+
__global Dtype* image_data,
1511+
__global Dtype* kernel_data,
1512+
BIAS_KERNEL_ARG
1513+
__global Dtype* convolved_image,
1514+
const ushort input_width,
1515+
const ushort input_height,
1516+
const ushort output_width,
1517+
const ushort output_height) {
1518+
1519+
const int outputX = get_global_id(0);
1520+
const int outputY = get_global_id(1);
1521+
const int outputZ = get_global_id(2);
1522+
if(outputX < output_width && outputY < output_height)
1523+
{
1524+
Dtype sum = 0.;
1525+
1526+
const int org_y = outputY * STRIDE_Y - INPUT_PAD_H;
1527+
const int org_x = outputX * STRIDE_X - INPUT_PAD_W;
1528+
const int currentKernelOffset = KERNEL_SIZE*(outputZ%CHANNELS);
1529+
const int biasIndex=outputZ%CHANNELS;
1530+
const int local_image_offset = org_y*input_width + org_x;
1531+
const int imageSize = input_width*input_height;
1532+
1533+
__global Dtype* image_dataPtrFloat = (image_data + (imageSize*outputZ + local_image_offset));
1534+
__global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset));
1535+
1536+
for(int y = 0; y < KERNEL_H; y++)
1537+
{
1538+
for(int x = 0; x < KERNEL_W; x++)
1539+
{
1540+
if(!(org_y + y * DILATION_Y >= 0 && org_y + y * DILATION_Y < input_height && org_x + x * DILATION_X >= 0 && org_x + x * DILATION_X < input_width))
1541+
{
1542+
continue;
1543+
}
1544+
sum += image_dataPtrFloat[x * DILATION_X] * kernel_dataPtrFloat[x];
1545+
}
1546+
image_dataPtrFloat += input_width * DILATION_Y;
1547+
kernel_dataPtrFloat += KERNEL_W;
1548+
}
1549+
1550+
#if APPLY_BIAS
1551+
int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
1552+
ACTIVATION_FUNCTION(convolved_image, offset, sum + biases_base[biasIndex], biasIndex);
1553+
#else
1554+
int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
1555+
ACTIVATION_FUNCTION(convolved_image, offset, sum, biasIndex);
1556+
#endif
1557+
}
1558+
}
1559+
#endif // KERNEL_BASIC/IDLF/GEMM_LIKE/DWCONV

0 commit comments

Comments
 (0)