@@ -257,11 +257,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
257
257
addDef (" INPUT_DEPTH" , channels_ / group_);
258
258
addDef (" TOTAL_INPUT_DEPTH_SIZE" , channels_);
259
259
addDef (" TOTAL_OUTPUT_DEPTH" , num_output_);
260
- addDef (" INPUT_START_X" , 0 );
261
- addDef (" INPUT_START_Y" , 0 );
262
- addDef (" INPUT_START_Z" , 0 );
263
260
addDef (" NUM_FILTERS" , M_);
264
- addDef (" OUT_BUFF_OFFSET" , 0 );
265
261
addDef (" TILE_X" , tile_x);
266
262
addDef (" TILE_Y" , tile_y);
267
263
addDef (" TILE_Y_STRIDE" , tile_y_stride);
@@ -1330,76 +1326,128 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
1330
1326
return false ;
1331
1327
}
1332
1328
1329
+ template <>
1330
+ void OCL4DNNConvSpatial<float >::generate_gemmlike_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
1331
+ int blockM, int blockK, int blockN)
1332
+ {
1333
+ if (group_ != 1 || ((M_ % 8 != 0 ) || (M_ % 32 == 24 )))
1334
+ return ;
1335
+
1336
+ if (blockM != 1 && blockM != 2 )
1337
+ return ;
1338
+
1339
+ if (blockN != 32 )
1340
+ return ;
1341
+
1342
+ if (blockK != 8 && blockK != 16 )
1343
+ return ;
1344
+
1345
+ if (blockK == 16 )
1346
+ {
1347
+ if ((blockM == 1 && (kernel_w_ > 4 )) || M_ % 32 != 0 )
1348
+ return ;
1349
+ if ((blockM == 2 ) || M_ % 32 != 0 )
1350
+ return ;
1351
+ }
1352
+
1353
+ tunerItems.push_back (makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, blockM, blockK, blockN));
1354
+ }
1355
+
1356
+ template <>
1357
+ void OCL4DNNConvSpatial<float >::generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
1358
+ int blockM, int blockK, int simd_size)
1359
+ {
1360
+ int max_compute_units = ocl::Device::getDefault ().maxComputeUnits ();
1361
+
1362
+ if (simd_size != 8 && simd_size != 16 )
1363
+ return ;
1364
+
1365
+ if (simd_size == 8 && !((group_ == 1 || M_ % 8 == 0 )))
1366
+ return ;
1367
+
1368
+ if (simd_size == 16 && !(group_ == 1 || M_ % 16 == 0 ))
1369
+ return ;
1370
+
1371
+ int width_max, height_max, block_size_max;
1372
+ width_max = 14 ;
1373
+ height_max = 14 ;
1374
+ block_size_max = 32 ;
1375
+
1376
+ if (blockM > width_max)
1377
+ return ;
1378
+ if (blockK > height_max)
1379
+ return ;
1380
+
1381
+ if (blockM > output_w_)
1382
+ return ;
1383
+ if (blockK > output_h_)
1384
+ return ;
1385
+
1386
+ // Only when the work items count is less than the device
1387
+ // max work items or the M_ is less than 16, we will tune
1388
+ // for simd 8.
1389
+ if (simd_size == 8 && M_ >= 16 &&
1390
+ ((num_ * M_ * output_w_ * output_h_ / static_cast <float >(blockM * blockK)) >=
1391
+ max_compute_units * 7 * 16 ))
1392
+ return ;
1393
+
1394
+ int actual_tile_x = kernel_w_ * dilation_w_ + (blockM - 1 ) * stride_w_ ;
1395
+ int tile_x = alignSize (actual_tile_x, 4 );
1396
+ int tile_y = kernel_h_ * dilation_h_ + (blockK - 1 ) * stride_h_;
1397
+ if (tile_x > (4 * simd_size))
1398
+ return ;
1399
+
1400
+ if ((blockM * blockK + divUp (tile_x * tile_y, simd_size)) > block_size_max)
1401
+ return ;
1402
+
1403
+ int tile_y_stride = (4 * simd_size) / tile_x;
1404
+ int invec_size = divUp (tile_y, tile_y_stride);
1405
+ if (invec_size > 4 )
1406
+ return ;
1407
+
1408
+ tunerItems.push_back (makePtr<tunerParam>(KERNEL_TYPE_INTEL_IDLF, blockM, blockK, simd_size));
1409
+ }
1410
+
1411
+ template <>
1412
+ void OCL4DNNConvSpatial<float >::generate_dwconv_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
1413
+ int blockM, int blockK, int blockN)
1414
+ {
1415
+ if (!dwconv_)
1416
+ return ;
1417
+
1418
+ tunerItems.push_back (makePtr<tunerParam>(KERNEL_TYPE_DWCONV, blockM, blockK, blockN));
1419
+ }
1420
+
1333
1421
template <>
1334
1422
void OCL4DNNConvSpatial<float >::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
1335
1423
{
1336
1424
if (ocl::Device::getDefault ().intelSubgroupsSupport ())
1337
1425
{
1338
- // depth_wise kernels
1339
- if (dwconv_)
1426
+ // depthwise kernel
1427
+ generate_dwconv_tuneritems (tunerItems, 1 , 1 , 1 );
1428
+ if (tunerItems.size () > 0 && group_ > 8 )
1429
+ return ;
1430
+
1431
+ // gemm like kernel
1432
+ generate_gemmlike_tuneritems (tunerItems, 1 , 8 , 32 );
1433
+ generate_gemmlike_tuneritems (tunerItems, 2 , 8 , 32 );
1434
+ generate_gemmlike_tuneritems (tunerItems, 1 , 16 , 32 );
1435
+
1436
+ // idlf kernel
1437
+ for (int simd_size = 8 ; simd_size <= 16 ; simd_size += 8 )
1340
1438
{
1341
- tunerItems.push_back (makePtr<tunerParam>(KERNEL_TYPE_DWCONV, 1 , 1 , 1 ));
1342
- if (group_ > 8 )
1343
- return ;
1344
- }
1345
-
1346
- /* IDLF kernels are using Intel specific extension which make
1347
- them intel only. */
1348
- // Generates static key_
1349
- int max_compute_units = ocl::Device::getDefault ().maxComputeUnits ();
1350
- int kernelCnt = 0 ;
1351
- if (group_ == 1 && ((M_ % 8 == 0 ) && (M_ % 32 != 24 ))) {
1352
- tunerItems.push_back (makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 1 , 8 , 32 ));
1353
- tunerItems.push_back (makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 2 , 8 , 32 ));
1354
-
1355
- if (kernel_w_ < 4 && M_ % 32 == 0 )
1356
- tunerItems.push_back (makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 1 , 16 , 32 ));
1357
- }
1358
-
1359
- for (int simd_size = 8 ; simd_size <= 16 ; simd_size += 8 ) {
1360
- if (simd_size == 8 && !((group_ == 1 || M_ % 8 == 0 )))
1361
- continue ;
1362
- if (simd_size == 16 && !(group_ == 1 || M_ % 16 == 0 ))
1363
- continue ;
1364
- const int width_max = 14 , height_max = 8 , block_size_max = 32 ;
1365
- for (uint32_t width = width_max; width > 0 ; width--) {
1366
- int candidate = 0 ;
1367
- if (width > output_w_)
1368
- continue ;
1369
- for (uint32_t height = height_max; height > 0 ; height--) {
1370
- if (width * height > block_size_max || height > output_h_)
1371
- continue ;
1372
- // Only when the work items count is less than the device
1373
- // max work items or the M_ is less than 16, we will tune
1374
- // for simd 8.
1375
- if (simd_size == 8 &&
1376
- M_ >= 16 &&
1377
- ((num_ * M_ * output_w_ * output_h_ / static_cast <float >(width * height)) >=
1378
- max_compute_units * 7 * 16 ))
1379
- continue ;
1380
- int actual_tile_x = kernel_w_ * dilation_w_ + (width - 1 ) * stride_w_;
1381
- int tile_x = alignSize (actual_tile_x, 4 );
1382
- int tile_y = kernel_h_ * dilation_h_ + (height - 1 ) * stride_h_;
1383
- if (tile_x > (4 * simd_size))
1384
- continue ;
1385
- // If actual_tile_x is multiple of 4, we may waste some IO bandwidth.
1386
- // This could reduce 75% tuning candidates. It has slightly performance
1387
- // impact for the final tuning result, less than 2% for most cases.
1388
- if (actual_tile_x % 4 != 0 )
1389
- continue ;
1390
- if ((width * height + divUp (tile_x * tile_y, simd_size)) > block_size_max)
1391
- continue ;
1392
- int tile_y_stride = (4 * simd_size) / tile_x;
1393
-
1394
- if (divUp (tile_y, tile_y_stride) < 4 ) {
1395
- tunerItems.push_back (makePtr<tunerParam>(KERNEL_TYPE_INTEL_IDLF, width, height, simd_size));
1396
- candidate++;
1397
- }
1398
- if (candidate >= 4 && height == 2 )
1439
+ int width_max, height_max;
1440
+ width_max = 14 ;
1441
+ height_max = 14 ;
1442
+ for (uint32_t width = width_max; width > 0 ; width--)
1443
+ {
1444
+ for (uint32_t height = height_max; height > 0 ; height--)
1445
+ {
1446
+ generate_idlf_tuneritems (tunerItems, width, height, simd_size);
1447
+ if (tunerItems.size () >= 8 && height == 2 )
1399
1448
break ;
1400
1449
}
1401
- kernelCnt += candidate;
1402
- if (kernelCnt >= 12 && width == 2 )
1450
+ if (tunerItems.size () >= 12 && width == 2 )
1403
1451
break ;
1404
1452
}
1405
1453
}
0 commit comments