Skip to content

Commit 59e825e

Browse files
committed
Merge pull request opencv#10385 from pengli:dnn
2 parents 97af608 + 181b448 commit 59e825e

File tree

2 files changed

+236
-11
lines changed

2 files changed

+236
-11
lines changed

modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1432,6 +1432,7 @@ void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerPar
14321432
generate_gemmlike_tuneritems(tunerItems, 1, 8, 32);
14331433
generate_gemmlike_tuneritems(tunerItems, 2, 8, 32);
14341434
generate_gemmlike_tuneritems(tunerItems, 1, 16, 32);
1435+
generate_gemmlike_tuneritems(tunerItems, 2, 16, 32);
14351436

14361437
// idlf kernel
14371438
for (int simd_size = 8; simd_size <= 16; simd_size += 8)

modules/dnn/src/opencl/conv_layer_spatial.cl

Lines changed: 235 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -384,7 +384,6 @@ convolve_simd(
384384
#elif defined KERNEL_GEMM_LIKE
385385

386386
#if APPLY_BIAS
387-
// Dtype bias[4];
388387
#define SUBGROUP_GET_BIAS(k, i) intel_sub_group_shuffle(bias[k], i)
389388
#else
390389
#define SUBGROUP_GET_BIAS(k, i) ((Dtype)0)
@@ -446,9 +445,7 @@ typedef struct float0 { float s0; } float0; //never used but makes compiler happ
446445
#define TILE_K KERNEL_WIDTH
447446
#define TILE_N 32
448447

449-
#ifndef __BEIGNET__
450448
__attribute__((intel_reqd_sub_group_size(8)))
451-
#endif
452449
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
453450
{
454451
const int group_x = get_group_id(0);
@@ -608,6 +605,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
608605
Dtype4 *bias_vec;
609606
bias_vec = (Dtype4*)bias;
610607
*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
608+
if (group_x > 0xFFFFFFFEul) {
609+
dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
610+
}
611+
#else
612+
const Dtype bias[4] = {0, 0, 0, 0};
611613
#endif
612614
if (global_y * TILE_M < output_width * output_height )
613615
{
@@ -768,6 +770,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
768770
Dtype4 *bias_vec;
769771
bias_vec = (Dtype4*)bias;
770772
*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
773+
if (group_x > 0xFFFFFFFEul) {
774+
dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
775+
}
776+
#else
777+
const Dtype bias[4] = {0, 0, 0, 0};
771778
#endif
772779

773780
if (global_y * TILE_M < output_width * output_height )
@@ -813,9 +820,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
813820
#define TILE_K KERNEL_WIDTH
814821
#define TILE_N 32
815822

816-
#ifndef __BEIGNET__
817823
__attribute__((intel_reqd_sub_group_size(8)))
818-
#endif
819824
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
820825
{
821826
const int group_x = get_group_id(0);
@@ -1012,6 +1017,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
10121017
Dtype4 *bias_vec;
10131018
bias_vec = (Dtype4*)bias;
10141019
*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
1020+
if (group_x > 0xFFFFFFFEul) {
1021+
dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
1022+
}
1023+
#else
1024+
const Dtype bias[4] = {0, 0, 0, 0};
10151025
#endif
10161026

10171027
if( global_y * TILE_M < output_width * output_height )
@@ -1221,6 +1231,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
12211231
Dtype4 *bias_vec;
12221232
bias_vec = (Dtype4*)bias;
12231233
*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
1234+
if (group_x > 0xFFFFFFFEul) {
1235+
dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
1236+
}
1237+
#else
1238+
const Dtype bias[4] = {0, 0, 0, 0};
12241239
#endif
12251240
if( global_y * TILE_M < output_width * output_height )
12261241
{
@@ -1334,9 +1349,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
13341349
#define TILE_K KERNEL_WIDTH
13351350
#define TILE_N 32
13361351

1337-
#ifndef __BEIGNET__
13381352
__attribute__((intel_reqd_sub_group_size(16)))
1339-
#endif
13401353
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
13411354
{
13421355
const int group_x = get_group_id(0);
@@ -1396,18 +1409,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
13961409
// Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
13971410
// and KERNEL_WIDTH/2 rows of interleaved filter.
13981411
int patch_depth = 0;
1399-
#ifndef __BEIGNET__
14001412
__attribute__((opencl_unroll_hint(1)))
1401-
#endif
14021413
do
14031414
{
14041415
int patch_row = 0;
14051416
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
14061417
curr_y = saved_y;
14071418
#endif
1408-
#ifndef __BEIGNET__
14091419
__attribute__((opencl_unroll_hint(1)))
1410-
#endif
14111420
do
14121421
{
14131422
// Load atile and btile.
@@ -1495,11 +1504,226 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
14951504
Dtype2 *bias_vec;
14961505
bias_vec = (Dtype2*)bias;
14971506
*bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N));
1507+
if (group_x > 0xFFFFFFFEul) {
1508+
dst[0] = bias[0] + bias[1];
1509+
}
1510+
#else
1511+
const Dtype bias[2] = {0, 0};
14981512
#endif
14991513
INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0);
15001514
}
15011515
#endif
15021516

1517+
#ifdef GEMM_LIKE_CONV_32_2_SIMD16
1518+
1519+
//////////////////////////////////////////////////////////////////////////////
1520+
// Conv_Interleaved_32_2_SIMD16
1521+
//
1522+
// Convolution: each workitem computes 1 patch x 32 filters worth of output
1523+
// data.
1524+
#define TILE_M 2
1525+
#define TILE_K KERNEL_WIDTH
1526+
#define TILE_N 32
1527+
1528+
__attribute__((intel_reqd_sub_group_size(16)))
1529+
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
1530+
{
1531+
const int group_x = get_group_id(0);
1532+
const int group_y = get_group_id(1);
1533+
const int global_x = get_global_id(0);
1534+
const int global_y = get_global_id(1);
1535+
const int global_z = get_global_id(2);
1536+
int interleaved_y;
1537+
int kernel_y;
1538+
int kernel_idx;
1539+
#define DOT_PRODUCT_16( _result, _rowA, colB ) \
1540+
{ \
1541+
_result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \
1542+
_result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \
1543+
_result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \
1544+
_result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \
1545+
_result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \
1546+
_result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \
1547+
_result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \
1548+
_result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \
1549+
_result.s8 = mad( _rowA, sub_group_broadcast( colB, 8 ), _result.s8 ); \
1550+
_result.s9 = mad( _rowA, sub_group_broadcast( colB, 9 ), _result.s9 ); \
1551+
_result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa ); \
1552+
_result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb ); \
1553+
_result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc ); \
1554+
_result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd ); \
1555+
_result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se ); \
1556+
_result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf ); \
1557+
}
1558+
typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;
1559+
1560+
// True for all threads if filter_width is multiple of TILE_N
1561+
// else, true for all but right-most column of threads.
1562+
{
1563+
// Result ctile (*dst) is M rows x N columns
1564+
// LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile.
1565+
Dtype16 blockC00 = 0.f;
1566+
Dtype16 blockC10 = 0.f;
1567+
Dtype16 blockC01 = 0.f;
1568+
Dtype16 blockC11 = 0.f;
1569+
1570+
// Src0 (patch input) is directly used as atile.
1571+
// Each work item points to the start of a different patch.
1572+
// atile is M rows x K columns.
1573+
int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;
1574+
int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
1575+
int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
1576+
int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
1577+
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
1578+
int saved_y0 = curr_y0;
1579+
int saved_y1 = curr_y1;
1580+
#endif
1581+
const __global Dtype *src0_read0 = src0
1582+
+ aligned_input_size * global_z // batch offset
1583+
+ (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset
1584+
+ curr_x0 - INPUT_PAD_W; // x offset
1585+
const __global Dtype *src0_read1 = src0
1586+
+ aligned_input_size * global_z // batch offset
1587+
+ (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset
1588+
+ curr_x1 - INPUT_PAD_W; // x offset
1589+
1590+
// Src1 (filter) is directly used as btile.
1591+
// It starts at the top of src1 and walks down.
1592+
// btile is K rows x N columns.
1593+
const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);
1594+
1595+
// Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
1596+
// Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
1597+
// and KERNEL_WIDTH/2 rows of interleaved filter.
1598+
int patch_depth = 0;
1599+
do
1600+
{
1601+
int patch_row = 0;
1602+
do
1603+
{
1604+
// Load atile and btile.
1605+
// Kernel data is partially interleaved. Every 2 rows are interleaved at Dtype8 granularity.
1606+
// The exception is that if KERNEL_WIDTH is odd the last row is not interleaved. The non
1607+
// interleaved row is padded with zero to ensure same size as interleaved rows. This
1608+
// interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the
1609+
// kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
1610+
// (0, 0) (8, 0) (16, 0) (24, 0) ... (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
1611+
// (0, 1) (8, 1) (16, 1) (24, 1) ... => (0, 2) (8, 2) (16, 2) (24, 2) ...
1612+
// (0, 2) (8, 2) (16, 2) (24, 2) ... ...
1613+
// ...
1614+
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
1615+
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1
1616+
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH;
1617+
Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH;
1618+
Dtype* pblockA00 = (Dtype*)(&blockA00);
1619+
Dtype* pblockA01 = (Dtype*)(&blockA01);
1620+
#else
1621+
Dtype_t blockA00;
1622+
Dtype* pblockA00 = (Dtype*)(&blockA00);
1623+
int pos = 0;
1624+
LOOP(KERNEL_WIDTH, pos,
1625+
{
1626+
if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
1627+
pblockA00[pos] = src0_read0[pos * DILATION_X];
1628+
else
1629+
pblockA00[pos] = 0;
1630+
})
1631+
curr_y0 += DILATION_Y;
1632+
Dtype_t blockA01;
1633+
Dtype* pblockA01 = (Dtype*)(&blockA01);
1634+
pos = 0;
1635+
LOOP(KERNEL_WIDTH, pos,
1636+
{
1637+
if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
1638+
pblockA01[pos] = src0_read1[pos * DILATION_X];
1639+
else
1640+
pblockA01[pos] = 0;
1641+
})
1642+
curr_y1 += DILATION_Y;
1643+
src0_read0 += (ROW_PITCH * DILATION_Y);
1644+
src0_read1 += (ROW_PITCH * DILATION_Y);
1645+
#endif
1646+
Dtype blockB00[KERNEL_WIDTH*2];
1647+
Dtype4* p4BlockB00 = (Dtype4*)blockB00;
1648+
Dtype2* p2BlockB00 = (Dtype2*)blockB00;
1649+
Dtype* pBlockB00 = (Dtype* )blockB00;
1650+
1651+
interleaved_y = 0;
1652+
LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
1653+
{
1654+
p4BlockB00[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
1655+
src1_read += WIDTH1 * 2;
1656+
} )
1657+
if ( kernel_width_is_odd )
1658+
{
1659+
p2BlockB00[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
1660+
src1_read += WIDTH1 * 2;
1661+
}
1662+
// Perform MADs
1663+
kernel_idx = 0;
1664+
interleaved_y = 0;
1665+
LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
1666+
{
1667+
kernel_y = interleaved_y * 2;
1668+
DOT_PRODUCT_16( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] );
1669+
DOT_PRODUCT_16( blockC01, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;
1670+
DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
1671+
DOT_PRODUCT_16( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
1672+
DOT_PRODUCT_16( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] );
1673+
DOT_PRODUCT_16( blockC11, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;
1674+
DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
1675+
DOT_PRODUCT_16( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
1676+
} )
1677+
if ( kernel_width_is_odd )
1678+
{
1679+
kernel_y = interleaved_y * 2;
1680+
DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] );
1681+
DOT_PRODUCT_16( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
1682+
DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] );
1683+
DOT_PRODUCT_16( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
1684+
}
1685+
}
1686+
1687+
//while( ++patch_row < 1 ); //debug
1688+
while( ++patch_row < KERNEL_HEIGHT );
1689+
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1
1690+
curr_y0 = saved_y0;
1691+
curr_y1 = saved_y1;
1692+
#endif
1693+
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); // reset to start of next slice of patch
1694+
src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);
1695+
}
1696+
//while ( ++patch_depth < 1 ); //debug
1697+
while ( ++patch_depth < INPUT_DEPTH );
1698+
1699+
// Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
1700+
// (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
1701+
int out0_offset = global_z * out_pitch_z // batch offset
1702+
+ ( group_x * TILE_N ) * out_pitch_y // channel offset
1703+
+ ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
1704+
+ ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset
1705+
int out1_offset = global_z * out_pitch_z // batch offset
1706+
+ ( group_x * TILE_N ) * out_pitch_y // channel offset
1707+
+ ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
1708+
+ ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset
1709+
1710+
#if APPLY_BIAS
1711+
Dtype bias[2];
1712+
Dtype2 *bias_vec;
1713+
bias_vec = (Dtype2*)bias;
1714+
*bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N));
1715+
if (group_x > 0xFFFFFFFEul) {
1716+
dst[0] = bias[0] + bias[1];
1717+
}
1718+
#else
1719+
const Dtype bias[2] = {0, 0};
1720+
#endif
1721+
INTERLEAVED_SIMD16_OUTPUT(dst, out0_offset, 0);
1722+
INTERLEAVED_SIMD16_OUTPUT(dst, out1_offset, 1);
1723+
}
1724+
}
1725+
#endif
1726+
15031727
#elif defined KERNEL_DWCONV
15041728

15051729
__kernel void DWCONV(

0 commit comments

Comments
 (0)