Merge pull request opencv#10385 from pengli:dnn

alalek · alalek · commit 59e825ee0234 · 2017-12-22T15:48:40.000Z
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
@@ -1432,6 +1432,7 @@ void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerPar
         generate_gemmlike_tuneritems(tunerItems, 1, 8, 32);
         generate_gemmlike_tuneritems(tunerItems, 2, 8, 32);
         generate_gemmlike_tuneritems(tunerItems, 1, 16, 32);
+        generate_gemmlike_tuneritems(tunerItems, 2, 16, 32);
 
         // idlf kernel
         for (int simd_size = 8; simd_size <= 16; simd_size += 8)
diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl
@@ -384,7 +384,6 @@ convolve_simd(
 #elif defined KERNEL_GEMM_LIKE
 
 #if APPLY_BIAS
-// Dtype bias[4];
 #define SUBGROUP_GET_BIAS(k, i) intel_sub_group_shuffle(bias[k], i)
 #else
 #define SUBGROUP_GET_BIAS(k, i) ((Dtype)0)
@@ -446,9 +445,7 @@ typedef struct float0 { float s0; } float0; //never used but makes compiler happ
 #define TILE_K          KERNEL_WIDTH
 #define TILE_N          32
 
-#ifndef __BEIGNET__
 __attribute__((intel_reqd_sub_group_size(8)))
-#endif
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
     const int group_x = get_group_id(0);
@@ -608,6 +605,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
         Dtype4 *bias_vec;
         bias_vec = (Dtype4*)bias;
         *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
+        if (group_x > 0xFFFFFFFEul) {
+          dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
+        }
+#else
+        const Dtype bias[4] = {0, 0, 0, 0};
 #endif
         if (global_y * TILE_M < output_width * output_height )
         {
@@ -768,6 +770,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
         Dtype4 *bias_vec;
         bias_vec = (Dtype4*)bias;
         *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
+        if (group_x > 0xFFFFFFFEul) {
+          dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
+        }
+#else
+        const Dtype bias[4] = {0, 0, 0, 0};
 #endif
 
         if (global_y * TILE_M < output_width * output_height )
@@ -813,9 +820,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 #define TILE_K          KERNEL_WIDTH
 #define TILE_N          32
 
-#ifndef __BEIGNET__
 __attribute__((intel_reqd_sub_group_size(8)))
-#endif
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
     const int group_x = get_group_id(0);
@@ -1012,6 +1017,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
         Dtype4 *bias_vec;
         bias_vec = (Dtype4*)bias;
         *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
+        if (group_x > 0xFFFFFFFEul) {
+          dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
+        }
+#else
+        const Dtype bias[4] = {0, 0, 0, 0};
 #endif
 
         if( global_y * TILE_M < output_width * output_height )
@@ -1221,6 +1231,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
         Dtype4 *bias_vec;
         bias_vec = (Dtype4*)bias;
         *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
+        if (group_x > 0xFFFFFFFEul) {
+          dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
+        }
+#else
+        const Dtype bias[4] = {0, 0, 0, 0};
 #endif
         if( global_y * TILE_M < output_width * output_height )
         {
@@ -1334,9 +1349,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 #define TILE_K          KERNEL_WIDTH
 #define TILE_N          32
 
-#ifndef __BEIGNET__
 __attribute__((intel_reqd_sub_group_size(16)))
-#endif
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
     const int group_x = get_group_id(0);
@@ -1396,18 +1409,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
     // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
     // and KERNEL_WIDTH/2 rows of interleaved filter.
     int patch_depth = 0;
-#ifndef __BEIGNET__
     __attribute__((opencl_unroll_hint(1)))
-#endif
     do
     {
         int patch_row = 0;
 #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
         curr_y = saved_y;
 #endif
-#ifndef __BEIGNET__
         __attribute__((opencl_unroll_hint(1)))
-#endif
         do
         {
             // Load atile and btile.
@@ -1495,11 +1504,226 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
     Dtype2 *bias_vec;
     bias_vec = (Dtype2*)bias;
     *bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N));
+    if (group_x > 0xFFFFFFFEul) {
+      dst[0] = bias[0] + bias[1];
+    }
+#else
+    const Dtype bias[2] = {0, 0};
 #endif
     INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0);
 }
 #endif
 
+#ifdef GEMM_LIKE_CONV_32_2_SIMD16
+
+//////////////////////////////////////////////////////////////////////////////
+// Conv_Interleaved_32_2_SIMD16
+//
+// Convolution: each workitem computes 1 patch x 32 filters worth of output
+// data.
+#define TILE_M          2
+#define TILE_K          KERNEL_WIDTH
+#define TILE_N          32
+
+__attribute__((intel_reqd_sub_group_size(16)))
+__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
+{
+    const int group_x = get_group_id(0);
+    const int group_y = get_group_id(1);
+    const int global_x = get_global_id(0);
+    const int global_y = get_global_id(1);
+    const int global_z = get_global_id(2);
+    int interleaved_y;
+    int kernel_y;
+    int kernel_idx;
+#define DOT_PRODUCT_16( _result, _rowA, colB )    \
+    {   \
+        _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 );  \
+        _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 );  \
+        _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 );  \
+        _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 );  \
+        _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 );  \
+        _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 );  \
+        _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 );  \
+        _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 );  \
+        _result.s8 = mad( _rowA, sub_group_broadcast( colB, 8 ), _result.s8 );  \
+        _result.s9 = mad( _rowA, sub_group_broadcast( colB, 9 ), _result.s9 );  \
+        _result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa );  \
+        _result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb );  \
+        _result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc );  \
+        _result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd );  \
+        _result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se );  \
+        _result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf );  \
+    }
+        typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;
+
+    // True for all threads if filter_width is multiple of TILE_N
+    // else, true for all but right-most column of threads.
+    {
+        // Result ctile (*dst) is M rows x N columns
+        // LWG size is 1x8.  Thus each thread calculates 8*M rows x N cols of ctile.
+        Dtype16  blockC00 = 0.f;
+        Dtype16  blockC10 = 0.f;
+        Dtype16  blockC01 = 0.f;
+        Dtype16  blockC11 = 0.f;
+
+        // Src0 (patch input) is directly used as atile.
+        // Each work item points to the start of a different patch.
+        // atile is M rows x K columns.
+        int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;
+        int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
+        int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
+        int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
+#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
+        int saved_y0 = curr_y0;
+        int saved_y1 = curr_y1;
+#endif
+        const __global Dtype *src0_read0 = src0
+         + aligned_input_size * global_z                                            // batch offset
+         + (curr_y0 - INPUT_PAD_H) * ROW_PITCH   // y offset
+         + curr_x0 - INPUT_PAD_W;                // x offset
+        const __global Dtype *src0_read1 = src0
+         + aligned_input_size * global_z                                            // batch offset
+         + (curr_y1 - INPUT_PAD_H) * ROW_PITCH   // y offset
+         + curr_x1 - INPUT_PAD_W;                // x offset
+
+        // Src1 (filter) is directly used as btile.
+        // It starts at the top of src1 and walks down.
+        // btile is K rows x N columns.
+        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);
+
+        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
+        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
+        // and KERNEL_WIDTH/2 rows of interleaved filter.
+        int patch_depth = 0;
+        do
+        {
+            int patch_row = 0;
+            do
+            {
+                // Load atile and btile.
+                // Kernel data is partially interleaved.  Every 2 rows are interleaved at Dtype8 granularity.
+                // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved.  The non
+                // interleaved row is padded with zero to ensure same size as interleaved rows. This
+                // interleaving is done to ensure 0% GDR bank conflicts.  For example, this is how the
+                // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
+                // (0, 0) (8, 0) (16, 0) (24, 0) ...       (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
+                // (0, 1) (8, 1) (16, 1) (24, 1) ... =>    (0, 2) (8, 2) (16, 2) (24, 2) ...
+                // (0, 2) (8, 2) (16, 2) (24, 2) ...       ...
+                // ...
+                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
+#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1
+                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[  0  ]; src0_read0 += ROW_PITCH;
+                Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[  0  ]; src0_read1 += ROW_PITCH;
+                Dtype*  pblockA00 = (Dtype*)(&blockA00);
+                Dtype*  pblockA01 = (Dtype*)(&blockA01);
+#else
+                Dtype_t blockA00;
+                Dtype*  pblockA00 = (Dtype*)(&blockA00);
+                int pos = 0;
+                LOOP(KERNEL_WIDTH, pos,
+                {
+                  if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
+                    pblockA00[pos] = src0_read0[pos * DILATION_X];
+                  else
+                    pblockA00[pos] = 0;
+                })
+                curr_y0 += DILATION_Y;
+                Dtype_t blockA01;
+                Dtype*  pblockA01 = (Dtype*)(&blockA01);
+                pos = 0;
+                LOOP(KERNEL_WIDTH, pos,
+                {
+                  if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
+                    pblockA01[pos] = src0_read1[pos * DILATION_X];
+                  else
+                    pblockA01[pos] = 0;
+                })
+                curr_y1 += DILATION_Y;
+                src0_read0 += (ROW_PITCH * DILATION_Y);
+                src0_read1 += (ROW_PITCH * DILATION_Y);
+#endif
+                Dtype blockB00[KERNEL_WIDTH*2];
+                Dtype4* p4BlockB00 = (Dtype4*)blockB00;
+                Dtype2* p2BlockB00 = (Dtype2*)blockB00;
+                Dtype*  pBlockB00 =  (Dtype* )blockB00;
+
+                interleaved_y = 0;
+                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
+                {
+                    p4BlockB00[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
+                    src1_read += WIDTH1 * 2;
+                } )
+                if ( kernel_width_is_odd )
+                {
+                    p2BlockB00[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
+                    src1_read += WIDTH1 * 2;
+                }
+                // Perform MADs
+                kernel_idx = 0;
+                interleaved_y = 0;
+                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
+                {
+                    kernel_y = interleaved_y * 2;
+                    DOT_PRODUCT_16( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
+                    DOT_PRODUCT_16( blockC01, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
+                    DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
+                    DOT_PRODUCT_16( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
+                    DOT_PRODUCT_16( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
+                    DOT_PRODUCT_16( blockC11, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
+                    DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
+                    DOT_PRODUCT_16( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
+                } )
+                if ( kernel_width_is_odd )
+                {
+                    kernel_y = interleaved_y * 2;
+                    DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] );
+                    DOT_PRODUCT_16( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
+                    DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] );
+                    DOT_PRODUCT_16( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
+                }
+            }
+
+            //while( ++patch_row < 1 ); //debug
+            while( ++patch_row < KERNEL_HEIGHT );
+#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1
+            curr_y0 = saved_y0;
+            curr_y1 = saved_y1;
+#endif
+            src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); // reset to start of next slice of patch
+            src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);
+        }
+        //while ( ++patch_depth < 1 );  //debug
+        while ( ++patch_depth < INPUT_DEPTH );
+
+        // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
+        // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
+        int out0_offset = global_z * out_pitch_z                                                       // batch offset
+         + ( group_x * TILE_N ) * out_pitch_y                                           // channel offset
+         + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
+         + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;               // x offset
+        int out1_offset = global_z * out_pitch_z                                                       // batch offset
+         + ( group_x * TILE_N ) * out_pitch_y                                           // channel offset
+         + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
+         + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;               // x offset
+
+#if APPLY_BIAS
+        Dtype bias[2];
+        Dtype2 *bias_vec;
+        bias_vec = (Dtype2*)bias;
+        *bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N));
+        if (group_x > 0xFFFFFFFEul) {
+          dst[0] = bias[0] + bias[1];
+        }
+#else
+        const Dtype bias[2] = {0, 0};
+#endif
+        INTERLEAVED_SIMD16_OUTPUT(dst, out0_offset, 0);
+        INTERLEAVED_SIMD16_OUTPUT(dst, out1_offset, 1);
+    }
+}
+#endif
+
 #elif defined KERNEL_DWCONV
 
 __kernel void DWCONV(