@@ -1330,76 +1330,128 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
1330
1330
return false ;
1331
1331
}
1332
1332
1333
+ template <>
1334
+ void OCL4DNNConvSpatial<float >::generate_gemmlike_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
1335
+ int blockM, int blockK, int blockN)
1336
+ {
1337
+ if (group_ != 1 || ((M_ % 8 != 0 ) || (M_ % 32 == 24 )))
1338
+ return ;
1339
+
1340
+ if (blockM != 1 && blockM != 2 )
1341
+ return ;
1342
+
1343
+ if (blockN != 32 )
1344
+ return ;
1345
+
1346
+ if (blockK != 8 && blockK != 16 )
1347
+ return ;
1348
+
1349
+ if (blockK == 16 )
1350
+ {
1351
+ if ((blockM == 1 && (kernel_w_ > 4 )) || M_ % 32 != 0 )
1352
+ return ;
1353
+ if ((blockM == 2 ) || M_ % 32 != 0 )
1354
+ return ;
1355
+ }
1356
+
1357
+ tunerItems.push_back (makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, blockM, blockK, blockN));
1358
+ }
1359
+
1360
+ template <>
1361
+ void OCL4DNNConvSpatial<float >::generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
1362
+ int blockM, int blockK, int simd_size)
1363
+ {
1364
+ int max_compute_units = ocl::Device::getDefault ().maxComputeUnits ();
1365
+
1366
+ if (simd_size != 8 && simd_size != 16 )
1367
+ return ;
1368
+
1369
+ if (simd_size == 8 && !((group_ == 1 || M_ % 8 == 0 )))
1370
+ return ;
1371
+
1372
+ if (simd_size == 16 && !(group_ == 1 || M_ % 16 == 0 ))
1373
+ return ;
1374
+
1375
+ int width_max, height_max, block_size_max;
1376
+ width_max = 14 ;
1377
+ height_max = 14 ;
1378
+ block_size_max = 32 ;
1379
+
1380
+ if (blockM > width_max)
1381
+ return ;
1382
+ if (blockK > height_max)
1383
+ return ;
1384
+
1385
+ if (blockM > output_w_)
1386
+ return ;
1387
+ if (blockK > output_h_)
1388
+ return ;
1389
+
1390
+ // Only when the work items count is less than the device
1391
+ // max work items or the M_ is less than 16, we will tune
1392
+ // for simd 8.
1393
+ if (simd_size == 8 && M_ >= 16 &&
1394
+ ((num_ * M_ * output_w_ * output_h_ / static_cast <float >(blockM * blockK)) >=
1395
+ max_compute_units * 7 * 16 ))
1396
+ return ;
1397
+
1398
+ int actual_tile_x = kernel_w_ * dilation_w_ + (blockM - 1 ) * stride_w_ ;
1399
+ int tile_x = alignSize (actual_tile_x, 4 );
1400
+ int tile_y = kernel_h_ * dilation_h_ + (blockK - 1 ) * stride_h_;
1401
+ if (tile_x > (4 * simd_size))
1402
+ return ;
1403
+
1404
+ if ((blockM * blockK + divUp (tile_x * tile_y, simd_size)) > block_size_max)
1405
+ return ;
1406
+
1407
+ int tile_y_stride = (4 * simd_size) / tile_x;
1408
+ int invec_size = divUp (tile_y, tile_y_stride);
1409
+ if (invec_size > 4 )
1410
+ return ;
1411
+
1412
+ tunerItems.push_back (makePtr<tunerParam>(KERNEL_TYPE_INTEL_IDLF, blockM, blockK, simd_size));
1413
+ }
1414
+
1415
+ template <>
1416
+ void OCL4DNNConvSpatial<float >::generate_dwconv_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
1417
+ int blockM, int blockK, int blockN)
1418
+ {
1419
+ if (!dwconv_)
1420
+ return ;
1421
+
1422
+ tunerItems.push_back (makePtr<tunerParam>(KERNEL_TYPE_DWCONV, blockM, blockK, blockN));
1423
+ }
1424
+
1333
1425
template <>
1334
1426
void OCL4DNNConvSpatial<float >::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
1335
1427
{
1336
1428
if (ocl::Device::getDefault ().intelSubgroupsSupport ())
1337
1429
{
1338
- // depth_wise kernels
1339
- if (dwconv_)
1430
+ // depthwise kernel
1431
+ generate_dwconv_tuneritems (tunerItems, 1 , 1 , 1 );
1432
+ if (tunerItems.size () > 0 && group_ > 8 )
1433
+ return ;
1434
+
1435
+ // gemm like kernel
1436
+ generate_gemmlike_tuneritems (tunerItems, 1 , 8 , 32 );
1437
+ generate_gemmlike_tuneritems (tunerItems, 2 , 8 , 32 );
1438
+ generate_gemmlike_tuneritems (tunerItems, 1 , 16 , 32 );
1439
+
1440
+ // idlf kernel
1441
+ for (int simd_size = 8 ; simd_size <= 16 ; simd_size += 8 )
1340
1442
{
1341
- tunerItems.push_back (makePtr<tunerParam>(KERNEL_TYPE_DWCONV, 1 , 1 , 1 ));
1342
- if (group_ > 8 )
1343
- return ;
1344
- }
1345
-
1346
- /* IDLF kernels are using Intel specific extension which make
1347
- them intel only. */
1348
- // Generates static key_
1349
- int max_compute_units = ocl::Device::getDefault ().maxComputeUnits ();
1350
- int kernelCnt = 0 ;
1351
- if (group_ == 1 && ((M_ % 8 == 0 ) && (M_ % 32 != 24 ))) {
1352
- tunerItems.push_back (makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 1 , 8 , 32 ));
1353
- tunerItems.push_back (makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 2 , 8 , 32 ));
1354
-
1355
- if (kernel_w_ < 4 && M_ % 32 == 0 )
1356
- tunerItems.push_back (makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 1 , 16 , 32 ));
1357
- }
1358
-
1359
- for (int simd_size = 8 ; simd_size <= 16 ; simd_size += 8 ) {
1360
- if (simd_size == 8 && !((group_ == 1 || M_ % 8 == 0 )))
1361
- continue ;
1362
- if (simd_size == 16 && !(group_ == 1 || M_ % 16 == 0 ))
1363
- continue ;
1364
- const int width_max = 14 , height_max = 8 , block_size_max = 32 ;
1365
- for (uint32_t width = width_max; width > 0 ; width--) {
1366
- int candidate = 0 ;
1367
- if (width > output_w_)
1368
- continue ;
1369
- for (uint32_t height = height_max; height > 0 ; height--) {
1370
- if (width * height > block_size_max || height > output_h_)
1371
- continue ;
1372
- // Only when the work items count is less than the device
1373
- // max work items or the M_ is less than 16, we will tune
1374
- // for simd 8.
1375
- if (simd_size == 8 &&
1376
- M_ >= 16 &&
1377
- ((num_ * M_ * output_w_ * output_h_ / static_cast <float >(width * height)) >=
1378
- max_compute_units * 7 * 16 ))
1379
- continue ;
1380
- int actual_tile_x = kernel_w_ * dilation_w_ + (width - 1 ) * stride_w_;
1381
- int tile_x = alignSize (actual_tile_x, 4 );
1382
- int tile_y = kernel_h_ * dilation_h_ + (height - 1 ) * stride_h_;
1383
- if (tile_x > (4 * simd_size))
1384
- continue ;
1385
- // If actual_tile_x is multiple of 4, we may waste some IO bandwidth.
1386
- // This could reduce 75% tuning candidates. It has slightly performance
1387
- // impact for the final tuning result, less than 2% for most cases.
1388
- if (actual_tile_x % 4 != 0 )
1389
- continue ;
1390
- if ((width * height + divUp (tile_x * tile_y, simd_size)) > block_size_max)
1391
- continue ;
1392
- int tile_y_stride = (4 * simd_size) / tile_x;
1393
-
1394
- if (divUp (tile_y, tile_y_stride) < 4 ) {
1395
- tunerItems.push_back (makePtr<tunerParam>(KERNEL_TYPE_INTEL_IDLF, width, height, simd_size));
1396
- candidate++;
1397
- }
1398
- if (candidate >= 4 && height == 2 )
1443
+ int width_max, height_max;
1444
+ width_max = 14 ;
1445
+ height_max = 14 ;
1446
+ for (uint32_t width = width_max; width > 0 ; width--)
1447
+ {
1448
+ for (uint32_t height = height_max; height > 0 ; height--)
1449
+ {
1450
+ generate_idlf_tuneritems (tunerItems, width, height, simd_size);
1451
+ if (tunerItems.size () >= 8 && height == 2 )
1399
1452
break ;
1400
1453
}
1401
- kernelCnt += candidate;
1402
- if (kernelCnt >= 12 && width == 2 )
1454
+ if (tunerItems.size () >= 12 && width == 2 )
1403
1455
break ;
1404
1456
}
1405
1457
}
0 commit comments