Merge pull request opencv#10341 from pengli:dnn

alalek · alalek · commit dcdd6af5a856 · 2017-12-19T14:04:55.000Z
diff --git a/modules/dnn/src/layers/prior_box_layer.cpp b/modules/dnn/src/layers/prior_box_layer.cpp
@@ -45,6 +45,7 @@
 #include <float.h>
 #include <algorithm>
 #include <cmath>
+#include "opencl_kernels_dnn.hpp"
 
 namespace cv
 {
@@ -270,11 +271,108 @@ class PriorBoxLayerImpl : public PriorBoxLayer
         return false;
     }
 
+#ifdef HAVE_OPENCL
+    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
+    {
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+
+        inps.getUMatVector(inputs);
+        outs.getUMatVector(outputs);
+
+        int _layerWidth = inputs[0].size[3];
+        int _layerHeight = inputs[0].size[2];
+
+        int _imageWidth = inputs[1].size[3];
+        int _imageHeight = inputs[1].size[2];
+
+        float stepX, stepY;
+        if (_stepX == 0 || _stepY == 0)
+        {
+            stepX = static_cast<float>(_imageWidth) / _layerWidth;
+            stepY = static_cast<float>(_imageHeight) / _layerHeight;
+        } else {
+            stepX = _stepX;
+            stepY = _stepY;
+        }
+
+        if (umat_offsetsX.empty())
+        {
+            Mat offsetsX(1, _offsetsX.size(), CV_32FC1, &_offsetsX[0]);
+            Mat offsetsY(1, _offsetsX.size(), CV_32FC1, &_offsetsY[0]);
+            Mat aspectRatios(1, _aspectRatios.size(), CV_32FC1, &_aspectRatios[0]);
+            Mat variance(1, _variance.size(), CV_32FC1, &_variance[0]);
+
+            offsetsX.copyTo(umat_offsetsX);
+            offsetsY.copyTo(umat_offsetsY);
+            aspectRatios.copyTo(umat_aspectRatios);
+            variance.copyTo(umat_variance);
+
+            int real_numPriors = _numPriors / pow(2, _offsetsX.size() - 1);
+            umat_scales = UMat(1, &real_numPriors, CV_32F, 1.0f);
+        }
+
+        size_t nthreads = _layerHeight * _layerWidth;
+
+        ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc);
+        kernel.set(0, (int)nthreads);
+        kernel.set(1, (float)stepX);
+        kernel.set(2, (float)stepY);
+        kernel.set(3, (float)_minSize);
+        kernel.set(4, (float)_maxSize);
+        kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_offsetsX));
+        kernel.set(6, ocl::KernelArg::PtrReadOnly(umat_offsetsY));
+        kernel.set(7, (int)_offsetsX.size());
+        kernel.set(8, ocl::KernelArg::PtrReadOnly(umat_aspectRatios));
+        kernel.set(9, (int)_aspectRatios.size());
+        kernel.set(10, ocl::KernelArg::PtrReadOnly(umat_scales));
+        kernel.set(11, ocl::KernelArg::PtrWriteOnly(outputs[0]));
+        kernel.set(12, (int)_layerHeight);
+        kernel.set(13, (int)_layerWidth);
+        kernel.set(14, (int)_imageHeight);
+        kernel.set(15, (int)_imageWidth);
+        kernel.run(1, &nthreads, NULL, false);
+
+        // clip the prior's coordidate such that it is within [0, 1]
+        if (_clip)
+        {
+            Mat mat = outputs[0].getMat(ACCESS_READ);
+            int aspect_count = (_maxSize > 0) ? 1 : 0;
+            int offset = nthreads * 4 * _offsetsX.size() * (1 + aspect_count + _aspectRatios.size());
+            float* outputPtr = mat.ptr<float>() + offset;
+            int _outChannelSize = _layerHeight * _layerWidth * _numPriors * 4;
+            for (size_t d = 0; d < _outChannelSize; ++d)
+            {
+                outputPtr[d] = std::min<float>(std::max<float>(outputPtr[d], 0.), 1.);
+            }
+        }
+
+        // set the variance.
+        {
+            ocl::Kernel kernel("set_variance", ocl::dnn::prior_box_oclsrc);
+            int offset = total(shape(outputs[0]), 2);
+            size_t nthreads = _layerHeight * _layerWidth * _numPriors;
+            kernel.set(0, (int)nthreads);
+            kernel.set(1, (int)offset);
+            kernel.set(2, (int)_variance.size());
+            kernel.set(3, ocl::KernelArg::PtrReadOnly(umat_variance));
+            kernel.set(4, ocl::KernelArg::PtrWriteOnly(outputs[0]));
+            if (!kernel.run(1, &nthreads, NULL, false))
+                return false;
+        }
+        return true;
+    }
+#endif
+
     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
     {
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
+        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
+
         Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
     }
 
@@ -441,6 +539,14 @@ class PriorBoxLayerImpl : public PriorBoxLayer
     std::vector<float> _offsetsX;
     std::vector<float> _offsetsY;
 
+#ifdef HAVE_OPENCL
+    UMat umat_offsetsX;
+    UMat umat_offsetsY;
+    UMat umat_aspectRatios;
+    UMat umat_scales;
+    UMat umat_variance;
+#endif
+
     bool _flip;
     bool _clip;
     bool _explicitSizes;
diff --git a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
@@ -215,6 +215,9 @@ class OCL4DNNConvSpatial
         bool createGEMMLikeConvKernel(int32_t blockWidth,
                                       int32_t blockHeight,
                                       int32_t blockDepth);
+        bool createDWConvKernel(int32_t blockWidth,
+                                int32_t blockHeight,
+                                int32_t blockDepth);
         void CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
                              int32_t offset, int32_t size, bool write_only);
         bool convolve(const UMat &bottom, UMat &top,
@@ -282,6 +285,8 @@ class OCL4DNNConvSpatial
         int32_t M_;
 
         bool tuned_;
+        bool dwconv_;
+
         std::string key_, key_sanitized_;
         std::string short_key_;
         std::string kernel_name_;
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
@@ -103,6 +103,7 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
     top_dim_ = num_output_ * output_w_ * output_h_;
 
     cache_path_ = utils::getConfigurationParameterString("OPENCV_OCL4DNN_CONFIG_PATH", "");
+    dwconv_ = (num_output_ == channels_ && channels_ == group_);
 
     use_cache_path_ = false;
     if (!cache_path_.empty())
@@ -203,7 +204,8 @@ void OCL4DNNConvSpatial<Dtype>::collectCommonInformation()
 typedef enum {
     KERNEL_TYPE_INTEL_IDLF = 2,
     KERNEL_TYPE_BASIC = 4,
-    KERNEL_TYPE_GEMM_LIKE = 5
+    KERNEL_TYPE_GEMM_LIKE = 5,
+    KERNEL_TYPE_DWCONV = 6
 } ocl4dnnConvSpatialKernelType_t;
 
 template<typename Dtype>
@@ -313,6 +315,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
         if (clOptionSupport("-cl-no-subgroup-ifp"))
             options_ << " -cl-no-subgroup-ifp ";
 
+        addDef("KERNEL_GEMM_LIKE");
         addDef("INPUT_DEPTH", channels_);
         addDef("WIDTH1", M_);
         addDef("OUT_PADDING_LEFT", 0);
@@ -329,6 +332,28 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
         setFusionDefine(fused_activ_, fused_eltwise_);
         src_ = ocl::dnn::conv_layer_spatial_oclsrc;
     }
+    else if (kernelType == KERNEL_TYPE_DWCONV)
+    {
+        kernelUKey = generateSpecificKey(KERNEL_TYPE_DWCONV, blockM, blockK, blockN);
+        kernel_name_ = "DWCONV_";
+        kernel_name_ += kernelUKey.c_str();
+
+        options_ << " -cl-fast-relaxed-math ";
+        if (clOptionSupport("-cl-no-subgroup-ifp"))
+            options_ << " -cl-no-subgroup-ifp ";
+
+        addDef("KERNEL_DWCONV");
+        addDef("KERNEL_SIZE", kernel_w_ * kernel_h_);
+        addDef("KERNEL_W", kernel_w_);
+        addDef("KERNEL_H", kernel_h_);
+        addDef("APPLY_BIAS", bias_term_);
+        addDef("OUTPUT_Z", num_output_ * num_);
+        addDef("CHANNELS", num_output_);
+        setFusionDefine(fused_activ_, fused_eltwise_);
+
+        options_ << " -D DWCONV=" << kernel_name_;
+        src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
+    }
 }
 
 template<typename Dtype>
@@ -906,6 +931,33 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                 return false;
             }
         }
+    } else if (config->kernelType == KERNEL_TYPE_DWCONV) {
+        ocl::Kernel kernel(config->kernelName.c_str(), program);
+        if (kernel.empty())
+            return false;
+
+        cl_uint argIdx = 0;
+        setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
+        kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+        kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
+        if (bias_term_)
+            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
+        kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+        kernel.set(argIdx++, (uint16_t)width_);
+        kernel.set(argIdx++, (uint16_t)height_);
+        kernel.set(argIdx++, (uint16_t)output_w_);
+        kernel.set(argIdx++, (uint16_t)output_h_);
+
+        size_t global_size[3];
+        global_size[0] = output_w_;
+        global_size[1] = output_h_;
+        global_size[2] = num_output_ * num_;
+
+        if (!kernel.run(3, global_size, NULL, false))
+        {
+            std::cout << "DWCONV kernel run failed." << std::endl;
+            return false;
+        }
     } else {
         for (int32_t n = 0; n < numImages; ++n) {
             for (int32_t g = 0; g < group_; ++g) {
@@ -1222,6 +1274,39 @@ bool OCL4DNNConvSpatial<float>::createIDLFKernel(int32_t blockWidth,
         return false;
 }
 
+template<>
+bool OCL4DNNConvSpatial<float>::createDWConvKernel(int32_t blockWidth,
+                                                   int32_t blockHeight,
+                                                   int32_t blockDepth)
+{
+    if (!dwconv_)
+        return false;
+
+    int workItemOutput[3] = { 1, 1, 1 };
+    size_t local_size[3] = { 1, 1, 1 };
+    size_t global_size[3];
+    global_size[0] = divUp(output_w_, workItemOutput[0]);
+    global_size[1] = divUp(output_h_, workItemOutput[1]);
+    global_size[2] = divUp(M_ * num_, workItemOutput[2]);
+
+    kernelType_ = KERNEL_TYPE_DWCONV;
+    blockM_ = blockWidth;
+    blockK_ = blockHeight;
+    blockN_ = blockDepth;
+
+    setupKernel();
+
+    ocl::Program program = compileKernel();
+    if (program.ptr())
+    {
+        kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &global_size[0], &local_size[0],
+                              &workItemOutput[0], false, KERNEL_TYPE_DWCONV));
+        return true;
+    }
+    else
+        return false;
+}
+
 template<>
 bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
                                                         int32_t blockWidth,
@@ -1238,6 +1323,8 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
         return createBasicKernel(blockWidth, blockHeight, blockDepth);
     else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
         return createGEMMLikeConvKernel(blockWidth, blockHeight, blockDepth);
+    else if (kernelType == KERNEL_TYPE_DWCONV)
+        return createDWConvKernel(blockWidth, blockHeight, blockDepth);
     else
         CV_Assert(0 && "Internal error");
     return false;
@@ -1246,7 +1333,16 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
 template<>
 void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
 {
-    if (ocl::Device::getDefault().intelSubgroupsSupport()) {
+    if (ocl::Device::getDefault().intelSubgroupsSupport())
+    {
+        //depth_wise kernels
+        if (dwconv_)
+        {
+            tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_DWCONV, 1, 1, 1));
+            if (group_ > 8)
+                return;
+        }
+
         /* IDLF kernels are using Intel specific extension which make
            them intel only. */
         // Generates static key_
diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl
@@ -383,7 +383,7 @@ convolve_simd(
   }
 }
 
-#else // KERNEL_GEMM_LIKE
+#elif defined KERNEL_GEMM_LIKE
 
 #if APPLY_BIAS
 // Dtype bias[4];
@@ -1501,4 +1501,59 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
     INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0);
 }
 #endif
-#endif // KERNEL_BASIC/IDLF/GEMM_LIKE
+
+#elif defined KERNEL_DWCONV
+
+__kernel void DWCONV(
+    ELTWISE_DATA_ARG
+    NEGATIVE_SLOPE_ARG
+    __global Dtype* image_data,
+    __global Dtype* kernel_data,
+    BIAS_KERNEL_ARG
+    __global Dtype* convolved_image,
+    const ushort input_width,
+    const ushort input_height,
+    const ushort output_width,
+    const ushort output_height) {
+
+  const int outputX = get_global_id(0);
+  const int outputY = get_global_id(1);
+  const int outputZ = get_global_id(2);
+  if(outputX < output_width && outputY < output_height)
+  {
+    Dtype sum = 0.;
+
+    const int org_y = outputY * STRIDE_Y - INPUT_PAD_H;
+    const int org_x = outputX * STRIDE_X - INPUT_PAD_W;
+    const int currentKernelOffset = KERNEL_SIZE*(outputZ%CHANNELS);
+    const int biasIndex=outputZ%CHANNELS;
+    const int local_image_offset = org_y*input_width + org_x;
+    const int imageSize = input_width*input_height;
+
+    __global Dtype* image_dataPtrFloat = (image_data + (imageSize*outputZ + local_image_offset));
+    __global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset));
+
+    for(int y = 0; y < KERNEL_H; y++)
+    {
+      for(int x = 0; x < KERNEL_W; x++)
+      {
+        if(!(org_y + y * DILATION_Y >= 0 && org_y + y * DILATION_Y < input_height && org_x + x * DILATION_X >= 0 && org_x + x * DILATION_X < input_width))
+        {
+          continue;
+        }
+        sum += image_dataPtrFloat[x * DILATION_X] * kernel_dataPtrFloat[x];
+      }
+      image_dataPtrFloat += input_width * DILATION_Y;
+      kernel_dataPtrFloat += KERNEL_W;
+    }
+
+    #if APPLY_BIAS
+    int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
+    ACTIVATION_FUNCTION(convolved_image, offset, sum + biases_base[biasIndex], biasIndex);
+    #else
+    int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
+    ACTIVATION_FUNCTION(convolved_image, offset, sum, biasIndex);
+    #endif
+  }
+}
+#endif // KERNEL_BASIC/IDLF/GEMM_LIKE/DWCONV
diff --git a/modules/dnn/src/opencl/prior_box.cl b/modules/dnn/src/opencl/prior_box.cl
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp