Skip to content

Commit a2620f7

Browse files
committed
Merge pull request opencv#10370 from pengli:dnn
2 parents 047ad4f + c5fc8e0 commit a2620f7

File tree

3 files changed

+124
-72
lines changed

3 files changed

+124
-72
lines changed

modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,12 @@ class OCL4DNNConvSpatial
258258
int lx, int ly, int lz,
259259
bool swizzle, bool nullLocal);
260260
void generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems);
261+
void generate_dwconv_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
262+
int blockM, int blockK, int blockN);
263+
void generate_gemmlike_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
264+
int blockM, int blockK, int blockN);
265+
void generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
266+
int blockM, int blockK, int simd_size);
261267
void setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise);
262268
void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx);
263269

modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp

Lines changed: 114 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -257,11 +257,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
257257
addDef("INPUT_DEPTH", channels_ / group_);
258258
addDef("TOTAL_INPUT_DEPTH_SIZE", channels_);
259259
addDef("TOTAL_OUTPUT_DEPTH", num_output_);
260-
addDef("INPUT_START_X", 0);
261-
addDef("INPUT_START_Y", 0);
262-
addDef("INPUT_START_Z", 0);
263260
addDef("NUM_FILTERS", M_);
264-
addDef("OUT_BUFF_OFFSET", 0);
265261
addDef("TILE_X", tile_x);
266262
addDef("TILE_Y", tile_y);
267263
addDef("TILE_Y_STRIDE", tile_y_stride);
@@ -1330,76 +1326,128 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
13301326
return false;
13311327
}
13321328

1329+
template<>
1330+
void OCL4DNNConvSpatial<float>::generate_gemmlike_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
1331+
int blockM, int blockK, int blockN)
1332+
{
1333+
if (group_ != 1 || ((M_ % 8 != 0) || (M_ % 32 == 24)))
1334+
return;
1335+
1336+
if (blockM != 1 && blockM != 2)
1337+
return;
1338+
1339+
if (blockN != 32)
1340+
return;
1341+
1342+
if (blockK != 8 && blockK != 16)
1343+
return;
1344+
1345+
if (blockK == 16)
1346+
{
1347+
if ((blockM == 1 && (kernel_w_ > 4)) || M_ % 32 != 0)
1348+
return;
1349+
if ((blockM == 2) || M_ % 32 != 0)
1350+
return;
1351+
}
1352+
1353+
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, blockM, blockK, blockN));
1354+
}
1355+
1356+
template<>
1357+
void OCL4DNNConvSpatial<float>::generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
1358+
int blockM, int blockK, int simd_size)
1359+
{
1360+
int max_compute_units = ocl::Device::getDefault().maxComputeUnits();
1361+
1362+
if (simd_size != 8 && simd_size != 16)
1363+
return;
1364+
1365+
if (simd_size == 8 && !((group_ == 1 || M_ % 8 == 0)))
1366+
return;
1367+
1368+
if (simd_size == 16 && !(group_ == 1 || M_ % 16 == 0))
1369+
return;
1370+
1371+
int width_max, height_max, block_size_max;
1372+
width_max = 14;
1373+
height_max = 14;
1374+
block_size_max = 32;
1375+
1376+
if (blockM > width_max)
1377+
return;
1378+
if (blockK > height_max)
1379+
return;
1380+
1381+
if (blockM > output_w_)
1382+
return;
1383+
if (blockK > output_h_)
1384+
return;
1385+
1386+
// Only when the work items count is less than the device
1387+
// max work items or the M_ is less than 16, we will tune
1388+
// for simd 8.
1389+
if (simd_size == 8 && M_ >= 16 &&
1390+
((num_ * M_ * output_w_ * output_h_ / static_cast<float>(blockM * blockK)) >=
1391+
max_compute_units * 7 * 16))
1392+
return;
1393+
1394+
int actual_tile_x = kernel_w_ * dilation_w_ + (blockM - 1) * stride_w_ ;
1395+
int tile_x = alignSize(actual_tile_x, 4);
1396+
int tile_y = kernel_h_ * dilation_h_ + (blockK - 1) * stride_h_;
1397+
if (tile_x > (4 * simd_size))
1398+
return;
1399+
1400+
if ((blockM * blockK + divUp(tile_x * tile_y, simd_size)) > block_size_max)
1401+
return;
1402+
1403+
int tile_y_stride = (4 * simd_size) / tile_x;
1404+
int invec_size = divUp(tile_y, tile_y_stride);
1405+
if (invec_size > 4)
1406+
return;
1407+
1408+
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_INTEL_IDLF, blockM, blockK, simd_size));
1409+
}
1410+
1411+
template<>
1412+
void OCL4DNNConvSpatial<float>::generate_dwconv_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
1413+
int blockM, int blockK, int blockN)
1414+
{
1415+
if (!dwconv_)
1416+
return;
1417+
1418+
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_DWCONV, blockM, blockK, blockN));
1419+
}
1420+
13331421
template<>
13341422
void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
13351423
{
13361424
if (ocl::Device::getDefault().intelSubgroupsSupport())
13371425
{
1338-
//depth_wise kernels
1339-
if (dwconv_)
1426+
// depthwise kernel
1427+
generate_dwconv_tuneritems(tunerItems, 1, 1, 1);
1428+
if (tunerItems.size() > 0 && group_ > 8)
1429+
return;
1430+
1431+
// gemm like kernel
1432+
generate_gemmlike_tuneritems(tunerItems, 1, 8, 32);
1433+
generate_gemmlike_tuneritems(tunerItems, 2, 8, 32);
1434+
generate_gemmlike_tuneritems(tunerItems, 1, 16, 32);
1435+
1436+
// idlf kernel
1437+
for (int simd_size = 8; simd_size <= 16; simd_size += 8)
13401438
{
1341-
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_DWCONV, 1, 1, 1));
1342-
if (group_ > 8)
1343-
return;
1344-
}
1345-
1346-
/* IDLF kernels are using Intel specific extension which make
1347-
them intel only. */
1348-
// Generates static key_
1349-
int max_compute_units = ocl::Device::getDefault().maxComputeUnits();
1350-
int kernelCnt = 0;
1351-
if (group_ == 1 && ((M_ % 8 == 0) && (M_ % 32 != 24))) {
1352-
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 1, 8, 32));
1353-
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 2, 8, 32));
1354-
1355-
if (kernel_w_ < 4 && M_ % 32 == 0)
1356-
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 1, 16, 32));
1357-
}
1358-
1359-
for (int simd_size = 8; simd_size <= 16; simd_size += 8) {
1360-
if (simd_size == 8 && !((group_ == 1 || M_ % 8 == 0)))
1361-
continue;
1362-
if (simd_size == 16 && !(group_ == 1 || M_ % 16 == 0))
1363-
continue;
1364-
const int width_max = 14, height_max = 8, block_size_max = 32;
1365-
for (uint32_t width = width_max; width > 0; width--) {
1366-
int candidate = 0;
1367-
if (width > output_w_)
1368-
continue;
1369-
for (uint32_t height = height_max; height > 0; height--) {
1370-
if (width * height > block_size_max || height > output_h_)
1371-
continue;
1372-
// Only when the work items count is less than the device
1373-
// max work items or the M_ is less than 16, we will tune
1374-
// for simd 8.
1375-
if (simd_size == 8 &&
1376-
M_ >= 16 &&
1377-
((num_ * M_ * output_w_ * output_h_ / static_cast<float>(width * height)) >=
1378-
max_compute_units * 7 * 16))
1379-
continue;
1380-
int actual_tile_x = kernel_w_ * dilation_w_ + (width - 1) * stride_w_;
1381-
int tile_x = alignSize(actual_tile_x, 4);
1382-
int tile_y = kernel_h_ * dilation_h_ + (height - 1) * stride_h_;
1383-
if (tile_x > (4 * simd_size))
1384-
continue;
1385-
// If actual_tile_x is multiple of 4, we may waste some IO bandwidth.
1386-
// This could reduce 75% tuning candidates. It has slightly performance
1387-
// impact for the final tuning result, less than 2% for most cases.
1388-
if (actual_tile_x % 4 != 0)
1389-
continue;
1390-
if ((width * height + divUp(tile_x * tile_y, simd_size)) > block_size_max)
1391-
continue;
1392-
int tile_y_stride = (4 * simd_size) / tile_x;
1393-
1394-
if (divUp(tile_y, tile_y_stride) < 4) {
1395-
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_INTEL_IDLF, width, height, simd_size));
1396-
candidate++;
1397-
}
1398-
if (candidate >= 4 && height == 2)
1439+
int width_max, height_max;
1440+
width_max = 14;
1441+
height_max = 14;
1442+
for (uint32_t width = width_max; width > 0; width--)
1443+
{
1444+
for (uint32_t height = height_max; height > 0; height--)
1445+
{
1446+
generate_idlf_tuneritems(tunerItems, width, height, simd_size);
1447+
if (tunerItems.size() >= 8 && height == 2)
13991448
break;
14001449
}
1401-
kernelCnt += candidate;
1402-
if (kernelCnt >= 12 && width == 2)
1450+
if (tunerItems.size() >= 12 && width == 2)
14031451
break;
14041452
}
14051453
}

modules/dnn/src/opencl/conv_layer_spatial.cl

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -189,10 +189,8 @@ __kernel void ConvolveBasic(
189189
// NDRange: (output_width+pad)/ OUT_BLOCK_WIDTH, (output_height+pad)/OUT_BLOCK_HEIGHT, NUM_FILTERS/OUT_BLOCK_DEPTH
190190

191191
// NOTE: for beignet this reqd_work_group_size does not guarantee that SIMD16 mode will be used, the compiler could choose to use two SIMD8 threads, and if that happens the code will break.
192-
#ifndef __BEIGNET__
193192
__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
194193
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
195-
#endif
196194
__kernel void
197195
convolve_simd(
198196
ELTWISE_DATA_ARG
@@ -232,12 +230,12 @@ convolve_simd(
232230

233231
int curr_local_y = ( lid / ( TILE_X / 4 ) );
234232
int curr_local_x = ( lid % ( TILE_X / 4 ) ) * 4;
235-
int curr_y = or * STRIDE_Y + INPUT_START_Y + curr_local_y;
236-
int curr_x = oc * STRIDE_X + INPUT_START_X + curr_local_x;
233+
int curr_y = or * STRIDE_Y + curr_local_y;
234+
int curr_x = oc * STRIDE_X + curr_local_x;
237235
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0
238236
int saved_y = curr_y;
239237
#endif
240-
in_addr = input_batch_offset + INPUT_START_Z * input_height * input_width
238+
in_addr = input_batch_offset
241239
+ (curr_y - INPUT_PAD_H) * input_width // y tile offset
242240
+ curr_x - INPUT_PAD_W; // x tile offset
243241
union {
@@ -363,7 +361,7 @@ convolve_simd(
363361
fm = fm % ALIGNED_NUM_FILTERS;
364362

365363
if ((ALIGNED_NUM_FILTERS == NUM_FILTERS || fm < NUM_FILTERS)) {
366-
unsigned int out_addr = OUT_BUFF_OFFSET + ( num_in_batch * TOTAL_OUTPUT_DEPTH + fm ) * output_width * output_height;
364+
unsigned int out_addr = ( num_in_batch * TOTAL_OUTPUT_DEPTH + fm ) * output_width * output_height;
367365
out_addr += or * output_width + oc;
368366
// we need this address calculation for biases because we support views and batching
369367
#if APPLY_BIAS

0 commit comments

Comments
 (0)