@@ -384,7 +384,6 @@ convolve_simd(
384
384
#elif defined KERNEL_GEMM_LIKE
385
385
386
386
#if APPLY_BIAS
387
- // Dtype bias[4];
388
387
#define SUBGROUP_GET_BIAS (k , i ) intel_sub_group_shuffle(bias[k], i)
389
388
#else
390
389
#define SUBGROUP_GET_BIAS (k , i ) ((Dtype)0)
@@ -446,9 +445,7 @@ typedef struct float0 { float s0; } float0; //never used but makes compiler happ
446
445
#define TILE_K KERNEL_WIDTH
447
446
#define TILE_N 32
448
447
449
- #ifndef __BEIGNET__
450
448
__attribute__((intel_reqd_sub_group_size (8 )))
451
- #endif
452
449
__kernel void Conv_Interleaved (GEMM_LIKE_KERNEL_ARGS )
453
450
{
454
451
const int group_x = get_group_id (0 );
@@ -608,6 +605,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
608
605
Dtype4 * bias_vec ;
609
606
bias_vec = (Dtype4 * )bias ;
610
607
* bias_vec = as_Dtype4 (SUB_GROUP_BLOCK_READ4 ((__global INT_TYPE * )biases_base + group_x * TILE_N ));
608
+ if (group_x > 0xFFFFFFFEul ) {
609
+ dst [0 ] = bias [0 ] + bias [1 ] + bias [2 ] + bias [3 ];
610
+ }
611
+ #else
612
+ const Dtype bias [4 ] = {0 , 0 , 0 , 0 };
611
613
#endif
612
614
if (global_y * TILE_M < output_width * output_height )
613
615
{
@@ -768,6 +770,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
768
770
Dtype4 * bias_vec ;
769
771
bias_vec = (Dtype4 * )bias ;
770
772
* bias_vec = as_Dtype4 (SUB_GROUP_BLOCK_READ4 ((__global INT_TYPE * )biases_base + group_x * TILE_N ));
773
+ if (group_x > 0xFFFFFFFEul ) {
774
+ dst [0 ] = bias [0 ] + bias [1 ] + bias [2 ] + bias [3 ];
775
+ }
776
+ #else
777
+ const Dtype bias [4 ] = {0 , 0 , 0 , 0 };
771
778
#endif
772
779
773
780
if (global_y * TILE_M < output_width * output_height )
@@ -813,9 +820,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
813
820
#define TILE_K KERNEL_WIDTH
814
821
#define TILE_N 32
815
822
816
- #ifndef __BEIGNET__
817
823
__attribute__((intel_reqd_sub_group_size (8 )))
818
- #endif
819
824
__kernel void Conv_Interleaved (GEMM_LIKE_KERNEL_ARGS )
820
825
{
821
826
const int group_x = get_group_id (0 );
@@ -1012,6 +1017,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
1012
1017
Dtype4 * bias_vec ;
1013
1018
bias_vec = (Dtype4 * )bias ;
1014
1019
* bias_vec = as_Dtype4 (SUB_GROUP_BLOCK_READ4 ((__global INT_TYPE * )biases_base + group_x * TILE_N ));
1020
+ if (group_x > 0xFFFFFFFEul ) {
1021
+ dst [0 ] = bias [0 ] + bias [1 ] + bias [2 ] + bias [3 ];
1022
+ }
1023
+ #else
1024
+ const Dtype bias [4 ] = {0 , 0 , 0 , 0 };
1015
1025
#endif
1016
1026
1017
1027
if ( global_y * TILE_M < output_width * output_height )
@@ -1221,6 +1231,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
1221
1231
Dtype4 * bias_vec ;
1222
1232
bias_vec = (Dtype4 * )bias ;
1223
1233
* bias_vec = as_Dtype4 (SUB_GROUP_BLOCK_READ4 ((__global INT_TYPE * )biases_base + group_x * TILE_N ));
1234
+ if (group_x > 0xFFFFFFFEul ) {
1235
+ dst [0 ] = bias [0 ] + bias [1 ] + bias [2 ] + bias [3 ];
1236
+ }
1237
+ #else
1238
+ const Dtype bias [4 ] = {0 , 0 , 0 , 0 };
1224
1239
#endif
1225
1240
if ( global_y * TILE_M < output_width * output_height )
1226
1241
{
@@ -1334,9 +1349,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
1334
1349
#define TILE_K KERNEL_WIDTH
1335
1350
#define TILE_N 32
1336
1351
1337
- #ifndef __BEIGNET__
1338
1352
__attribute__((intel_reqd_sub_group_size (16 )))
1339
- #endif
1340
1353
__kernel void Conv_Interleaved (GEMM_LIKE_KERNEL_ARGS )
1341
1354
{
1342
1355
const int group_x = get_group_id (0 );
@@ -1396,18 +1409,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
1396
1409
// Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
1397
1410
// and KERNEL_WIDTH/2 rows of interleaved filter.
1398
1411
int patch_depth = 0 ;
1399
- #ifndef __BEIGNET__
1400
1412
__attribute__((opencl_unroll_hint (1 )))
1401
- #endif
1402
1413
do
1403
1414
{
1404
1415
int patch_row = 0 ;
1405
1416
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
1406
1417
curr_y = saved_y ;
1407
1418
#endif
1408
- #ifndef __BEIGNET__
1409
1419
__attribute__((opencl_unroll_hint (1 )))
1410
- #endif
1411
1420
do
1412
1421
{
1413
1422
// Load atile and btile.
@@ -1495,11 +1504,226 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
1495
1504
Dtype2 * bias_vec ;
1496
1505
bias_vec = (Dtype2 * )bias ;
1497
1506
* bias_vec = as_Dtype2 (SUB_GROUP_BLOCK_READ2 ((__global INT_TYPE * )biases_base + group_x * TILE_N ));
1507
+ if (group_x > 0xFFFFFFFEul ) {
1508
+ dst [0 ] = bias [0 ] + bias [1 ];
1509
+ }
1510
+ #else
1511
+ const Dtype bias [2 ] = {0 , 0 };
1498
1512
#endif
1499
1513
INTERLEAVED_SIMD16_OUTPUT (dst , out_offset , 0 );
1500
1514
}
1501
1515
#endif
1502
1516
1517
#ifdef GEMM_LIKE_CONV_32_2_SIMD16

//////////////////////////////////////////////////////////////////////////////
// Conv_Interleaved_32_2_SIMD16
//
// GEMM-like convolution, SIMD16 variant: each work-item contributes to
// TILE_M (= 2) output patches x TILE_N (= 32) filters of output data.
// Input patches act as the A tile, interleaved filter weights as the B tile.
#define TILE_M 2
#define TILE_K KERNEL_WIDTH
#define TILE_N 32

__attribute__((intel_reqd_sub_group_size(16)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
    const int group_x  = get_group_id(0);
    const int group_y  = get_group_id(1);
    const int global_x = get_global_id(0);
    const int global_y = get_global_id(1);
    const int global_z = get_global_id(2);
    int interleaved_y;
    int kernel_y;
    int kernel_idx;

    // Broadcast one A value across the subgroup and accumulate it against all
    // 16 lanes of a B column: _result.sN += _rowA * colB(lane N).
#define DOT_PRODUCT_16( _result, _rowA, colB ) \
    { \
        _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \
        _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \
        _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \
        _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \
        _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \
        _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \
        _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \
        _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \
        _result.s8 = mad( _rowA, sub_group_broadcast( colB, 8 ), _result.s8 ); \
        _result.s9 = mad( _rowA, sub_group_broadcast( colB, 9 ), _result.s9 ); \
        _result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa ); \
        _result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb ); \
        _result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc ); \
        _result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd ); \
        _result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se ); \
        _result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf ); \
    }
    typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;

    // True for all threads if filter_width is multiple of TILE_N
    // else, true for all but right-most column of threads.
    {
        // Result ctile (*dst) is M rows x N columns.
        // blockC{row}{col}: accumulators for the two patches (01/11 are the
        // second patch, 10/11 the second group of 16 filters).
        Dtype16 blockC00 = 0.f;
        Dtype16 blockC10 = 0.f;
        Dtype16 blockC01 = 0.f;
        Dtype16 blockC11 = 0.f;

        // Src0 (patch input) is directly used as atile.
        // Each work item points to the start of a different patch.
        // atile is M rows x K columns; patch 0 and patch 1 coordinates:
        int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;
        int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
        int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
        int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
        // Saved so each depth slice can restart from the patch top row.
        int saved_y0 = curr_y0;
        int saved_y1 = curr_y1;
#endif
        const __global Dtype *src0_read0 = src0
            + aligned_input_size * global_z           // batch offset
            + (curr_y0 - INPUT_PAD_H) * ROW_PITCH     // y offset
            + curr_x0 - INPUT_PAD_W;                  // x offset
        const __global Dtype *src0_read1 = src0
            + aligned_input_size * global_z           // batch offset
            + (curr_y1 - INPUT_PAD_H) * ROW_PITCH     // y offset
            + curr_x1 - INPUT_PAD_W;                  // x offset

        // Src1 (filter) is directly used as btile.
        // It starts at the top of src1 and walks down.
        // btile is K rows x N columns.
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2 );

        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
        // and KERNEL_WIDTH/2 rows of interleaved filter.
        int patch_depth = 0;
        do
        {
            int patch_row = 0;
            do
            {
                // Load atile and btile.
                // Kernel data is partially interleaved: every 2 rows are
                // interleaved at Dtype8 granularity. If KERNEL_WIDTH is odd the
                // last row is not interleaved but zero-padded to the same size.
                // The interleaving avoids GDR bank conflicts.
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1
                // Fast path: no padding/dilation, rows can be loaded whole.
                Dtype_t blockA00 = ( (const __global Dtype_t *)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH;
                Dtype_t blockA01 = ( (const __global Dtype_t *)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH;
                Dtype *pblockA00 = (Dtype *)(&blockA00);
                Dtype *pblockA01 = (Dtype *)(&blockA01);
#else
                // Generic path: gather element-wise, substituting 0 outside
                // the valid (unpadded) input region.
                Dtype_t blockA00;
                Dtype *pblockA00 = (Dtype *)(&blockA00);
                int pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA00[pos] = src0_read0[pos * DILATION_X];
                    else
                        pblockA00[pos] = 0;
                })
                curr_y0 += DILATION_Y;
                Dtype_t blockA01;
                Dtype *pblockA01 = (Dtype *)(&blockA01);
                pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA01[pos] = src0_read1[pos * DILATION_X];
                    else
                        pblockA01[pos] = 0;
                })
                curr_y1 += DILATION_Y;
                src0_read0 += (ROW_PITCH * DILATION_Y);
                src0_read1 += (ROW_PITCH * DILATION_Y);
#endif
                Dtype blockB00[KERNEL_WIDTH * 2];
                Dtype4 *p4BlockB00 = (Dtype4 *)blockB00;
                Dtype2 *p2BlockB00 = (Dtype2 *)blockB00;
                Dtype  *pBlockB00  = (Dtype  *)blockB00;

                // Block-read interleaved filter row pairs (plus a half-width
                // read for the trailing odd row, if any).
                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    p4BlockB00[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE *)src1_read ) );
                    src1_read += WIDTH1 * 2;
                } )
                if ( kernel_width_is_odd )
                {
                    p2BlockB00[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE *)src1_read ) );
                    src1_read += WIDTH1 * 2;
                }

                // Perform MADs: both patches against both filter halves.
                kernel_idx = 0;
                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_16( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_16( blockC01, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_16( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_16( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_16( blockC11, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_16( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                } )
                if ( kernel_width_is_odd )
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_16( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_16( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                }
            }
            //while( ++patch_row < 1 ); //debug
            while ( ++patch_row < KERNEL_HEIGHT );
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1
            curr_y0 = saved_y0;
            curr_y1 = saved_y1;
#endif
            // Reset to start of next slice of patch.
            src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
            src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
        }
        //while ( ++patch_depth < 1 ); //debug
        while ( ++patch_depth < INPUT_DEPTH );

        // Dst resembles a cube of width x height x (output channel * batches).
        // Each tile writes (SIMD * TILE_M) x 1 x TILE_N; partial writes are
        // likely when padding is used.
        int out0_offset = global_z * out_pitch_z                                                     // batch offset
            + ( group_x * TILE_N ) * out_pitch_y                                                     // channel offset
            + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X        // y offset
            + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;                       // x offset
        int out1_offset = global_z * out_pitch_z                                                     // batch offset
            + ( group_x * TILE_N ) * out_pitch_y                                                     // channel offset
            + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X        // y offset
            + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;                       // x offset

#if APPLY_BIAS
        Dtype bias[2];
        Dtype2 *bias_vec;
        bias_vec = (Dtype2 *)bias;
        *bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N));
        // NOTE(review): group_x is a non-negative int, so this guard is never
        // taken at runtime; it mirrors the identical dead store in the sibling
        // kernels — presumably a workaround to keep the bias read from being
        // optimized away. TODO confirm before removing.
        if (group_x > 0xFFFFFFFEul) {
            dst[0] = bias[0] + bias[1];
        }
#else
        const Dtype bias[2] = {0, 0};
#endif
        INTERLEAVED_SIMD16_OUTPUT(dst, out0_offset, 0);
        INTERLEAVED_SIMD16_OUTPUT(dst, out1_offset, 1);
    }
}
#endif

1503
1727
#elif defined KERNEL_DWCONV
1504
1728
1505
1729
__kernel void DWCONV (
0 commit comments