Commit bd6a669

committed
Merge pull request opencv#10150 from alalek:dnn_ocl_refactor_pooling
2 parents f071a48 + 13f3746

File tree

3 files changed: +107 -116 lines

modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp

Lines changed: 0 additions & 3 deletions

@@ -351,8 +351,6 @@ class OCL4DNNPool
                  UMat& top_data,
                  UMat& top_mask);
     private:
-        UMat mask_idx_;
-
         // Pooling parameters
         std::vector<int32_t> pad_;
         std::vector<int32_t> stride_;
@@ -362,7 +360,6 @@ class OCL4DNNPool
 
         ocl4dnnPoolingMethod_t pool_method_;
         int32_t count_;
-        int32_t batch_size_;
         int32_t channels_;
         int32_t kernel_h_;
         int32_t kernel_w_;

modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp

Lines changed: 63 additions & 63 deletions

@@ -54,7 +54,6 @@ OCL4DNNPool<Dtype>::OCL4DNNPool(OCL4DNNPoolConfig config)
     int dims = config.in_shape.size();
     int spatial_dims = 2;
 
-    batch_size_ = config.in_shape[0];
     channels_ = config.channels;
     pool_method_ = config.pool_method;
 
@@ -88,7 +87,7 @@ OCL4DNNPool<Dtype>::OCL4DNNPool(OCL4DNNPoolConfig config)
 template<typename Dtype>
 OCL4DNNPool<Dtype>::~OCL4DNNPool()
 {
-    mask_idx_.release();
+    // nothing
 }
 
 template<typename Dtype>
@@ -99,99 +98,100 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
     bool ret = true;
     size_t global[] = { 128 * 128 };
     size_t local[] = { 128 };
-    cl_uint argIdx = 0;
 
     // support 2D case
     switch (pool_method_)
     {
     case LIBDNN_POOLING_METHOD_MAX:
         {
-            if (top_mask.empty() && mask_idx_.empty())
-            {
-                mask_idx_.create(1, count_, CV_32FC1);
-            }
-            ocl::Kernel oclk_max_pool_forward(CL_KERNEL_SELECT("max_pool_forward"),
-                                              cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+            bool haveMask = !top_mask.empty();
+            ocl::Kernel oclk_max_pool_forward(
+                haveMask ? CL_KERNEL_SELECT("max_pool_forward_mask") : CL_KERNEL_SELECT("max_pool_forward"),
+                ocl::dnn::ocl4dnn_pooling_oclsrc,
+                format("-D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
+                       " -D STRIDE_W=%d -D STRIDE_H=%d"
+                       " -D PAD_W=%d -D PAD_H=%d%s",
+                       kernel_w_, kernel_h_,
+                       stride_w_, stride_h_,
+                       pad_w_, pad_h_,
+                       haveMask ? " -D HAVE_MASK=1" : ""
+                ));
 
             if (oclk_max_pool_forward.empty())
                 return false;
 
-            argIdx = 0;
-            oclk_max_pool_forward.set(argIdx++, count_);
-            oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
-            oclk_max_pool_forward.set(argIdx++, batch_size_);
-            oclk_max_pool_forward.set(argIdx++, channels_);
-            oclk_max_pool_forward.set(argIdx++, height_);
-            oclk_max_pool_forward.set(argIdx++, width_);
-            oclk_max_pool_forward.set(argIdx++, pooled_height_);
-            oclk_max_pool_forward.set(argIdx++, pooled_width_);
-            oclk_max_pool_forward.set(argIdx++, kernel_h_);
-            oclk_max_pool_forward.set(argIdx++, kernel_w_);
-            oclk_max_pool_forward.set(argIdx++, stride_h_);
-            oclk_max_pool_forward.set(argIdx++, stride_w_);
-            oclk_max_pool_forward.set(argIdx++, pad_h_);
-            oclk_max_pool_forward.set(argIdx++, pad_w_);
-            oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
-            oclk_max_pool_forward.set(argIdx++, mask_idx_.empty() ? 0 : 1);
-            if (mask_idx_.empty())
-                oclk_max_pool_forward.set(argIdx++, (void *)NULL);
-            else
-                oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(mask_idx_));
-            oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top_mask));
+            oclk_max_pool_forward.args(
+                count_,
+                ocl::KernelArg::PtrReadOnly(bottom),
+                channels_,
+                height_,
+                width_,
+                pooled_height_,
+                pooled_width_,
+                ocl::KernelArg::PtrWriteOnly(top),
+                ocl::KernelArg::PtrWriteOnly(top_mask)
+            );
 
             ret = oclk_max_pool_forward.run(1, global, local, false);
         }
         break;
     case LIBDNN_POOLING_METHOD_AVE:
         {
+            CV_Assert(top_mask.empty());
+
             ocl::Kernel oclk_ave_pool_forward(CL_KERNEL_SELECT("ave_pool_forward"),
-                                              cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+                ocl::dnn::ocl4dnn_pooling_oclsrc,
+                format("-D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
+                       " -D STRIDE_W=%d -D STRIDE_H=%d"
+                       " -D PAD_W=%d -D PAD_H=%d",
+                       kernel_w_, kernel_h_,
+                       stride_w_, stride_h_,
+                       pad_w_, pad_h_
+                ));
 
             if (oclk_ave_pool_forward.empty())
                 return false;
 
-            argIdx = 0;
-            oclk_ave_pool_forward.set(argIdx++, count_);
-            oclk_ave_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
-            oclk_ave_pool_forward.set(argIdx++, batch_size_);
-            oclk_ave_pool_forward.set(argIdx++, channels_);
-            oclk_ave_pool_forward.set(argIdx++, height_);
-            oclk_ave_pool_forward.set(argIdx++, width_);
-            oclk_ave_pool_forward.set(argIdx++, pooled_height_);
-            oclk_ave_pool_forward.set(argIdx++, pooled_width_);
-            oclk_ave_pool_forward.set(argIdx++, kernel_h_);
-            oclk_ave_pool_forward.set(argIdx++, kernel_w_);
-            oclk_ave_pool_forward.set(argIdx++, stride_h_);
-            oclk_ave_pool_forward.set(argIdx++, stride_w_);
-            oclk_ave_pool_forward.set(argIdx++, pad_h_);
-            oclk_ave_pool_forward.set(argIdx++, pad_w_);
-            oclk_ave_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+            oclk_ave_pool_forward.args(
+                count_,
+                ocl::KernelArg::PtrReadOnly(bottom),
+                channels_,
+                height_,
+                width_,
+                pooled_height_,
+                pooled_width_,
+                ocl::KernelArg::PtrWriteOnly(top)
+            );
 
             ret = oclk_ave_pool_forward.run(1, global, local, false);
         }
         break;
     case LIBDNN_POOLING_METHOD_STO:
         {
+            CV_Assert(top_mask.empty());
+
             ocl::Kernel oclk_sto_pool_forward(CL_KERNEL_SELECT("sto_pool_forward_test"),
-                                              cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+                ocl::dnn::ocl4dnn_pooling_oclsrc,
+                format("-D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
+                       " -D STRIDE_W=%d -D STRIDE_H=%d",
+                       kernel_w_, kernel_h_,
+                       stride_w_, stride_h_
+                ));
+
 
             if (oclk_sto_pool_forward.empty())
                 return false;
 
-            argIdx = 0;
-            oclk_sto_pool_forward.set(argIdx++, count_);
-            oclk_sto_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
-            oclk_sto_pool_forward.set(argIdx++, batch_size_);
-            oclk_sto_pool_forward.set(argIdx++, channels_);
-            oclk_sto_pool_forward.set(argIdx++, height_);
-            oclk_sto_pool_forward.set(argIdx++, width_);
-            oclk_sto_pool_forward.set(argIdx++, pooled_height_);
-            oclk_sto_pool_forward.set(argIdx++, pooled_width_);
-            oclk_sto_pool_forward.set(argIdx++, kernel_h_);
-            oclk_sto_pool_forward.set(argIdx++, kernel_w_);
-            oclk_sto_pool_forward.set(argIdx++, stride_h_);
-            oclk_sto_pool_forward.set(argIdx++, stride_w_);
-            oclk_sto_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+            oclk_sto_pool_forward.args(
+                count_,
+                ocl::KernelArg::PtrReadOnly(bottom),
+                channels_,
+                height_,
+                width_,
+                pooled_height_,
+                pooled_width_,
+                ocl::KernelArg::PtrWriteOnly(top)
+            );
 
             ret = oclk_sto_pool_forward.run(1, global, local, false);
         }
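For context, the host-side refactor above does two things: it moves the fixed pooling geometry into OpenCL build options (so the .cl code sees KERNEL_W, STRIDE_H, PAD_W, ... as compile-time macros) and it replaces the long chain of indexed set(argIdx++, ...) calls with a single variadic args(...) call. A minimal sketch of that same cv::ocl::Kernel pattern follows; the kernel source, function name and parameters in it are illustrative only and are not part of this commit, and it assumes a preallocated CV_32F dst of the same size as src.

    // Sketch only: ocl::Kernel with build options + args(), mirroring the pattern above.
    #include <opencv2/core.hpp>
    #include <opencv2/core/ocl.hpp>

    static bool runScaleKernel(const cv::UMat& src, cv::UMat& dst, float scale)
    {
        // Specialize the program at build time via -D defines,
        // as the pooling code does with KERNEL_*/STRIDE_*/PAD_*.
        cv::String buildOpts = cv::format("-D SCALE=%ff", scale);
        static const char* kernelSrc =
            "__kernel void scale_data(__global const float* src, __global float* dst, int n) {\n"
            "  int i = get_global_id(0);\n"
            "  if (i < n) dst[i] = src[i] * SCALE;\n"
            "}\n";
        cv::ocl::ProgramSource source(kernelSrc);
        cv::ocl::Kernel k("scale_data", source, buildOpts);
        if (k.empty())
            return false;  // no OpenCL device or build failure

        int n = (int)src.total();
        // args() replaces the error-prone sequence of set(argIdx++, ...) calls.
        k.args(cv::ocl::KernelArg::PtrReadOnly(src),
               cv::ocl::KernelArg::PtrWriteOnly(dst),
               n);
        size_t global[] = { (size_t)n };
        return k.run(1, global, NULL, false);  // non-blocking enqueue, like the pooling code
    }

Because the geometry is baked in at build time, a kernel compiled for one pooling configuration is reused only for that configuration; changing kernel size, stride or padding triggers a rebuild with different build options.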

modules/dnn/src/opencl/ocl4dnn_pooling.cl

Lines changed: 44 additions & 50 deletions
@@ -44,14 +44,23 @@
 #define TEMPLATE(name,type) CONCAT(name,type)
 #define Dtype float
 
-void TEMPLATE(max_pool_forward_impl, Dtype)(
-    const int nthreads, __global const Dtype* bottom_data, const int num,
+#if defined KERNEL_MAX_POOL
+
+__kernel void
+#ifdef HAVE_MASK
+TEMPLATE(max_pool_forward_mask, Dtype)
+#else
+TEMPLATE(max_pool_forward, Dtype)
+#endif
+(
+    const int nthreads, __global const Dtype* bottom_data,
     const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int kernel_h,
-    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-    const int pad_w,
-    __global Dtype* top_data,
-    const int use_mask, __global int* mask, __global Dtype* top_mask, bool no_mask)
+    const int pooled_height, const int pooled_width,
+    __global Dtype* top_data
+#ifdef HAVE_MASK
+    , __global Dtype* mask
+#endif
+)
 {
   for (int index = get_global_id(0); index < nthreads;
       index += get_global_size(0))
@@ -60,10 +69,10 @@ void TEMPLATE(max_pool_forward_impl, Dtype)(
     const int ph = (index / pooled_width) % pooled_height;
     const int c = (index / pooled_width / pooled_height) % channels;
     const int n = index / pooled_width / pooled_height / channels;
-    int hstart = ph * stride_h - pad_h;
-    int wstart = pw * stride_w - pad_w;
-    const int hend = min(hstart + kernel_h, height);
-    const int wend = min(wstart + kernel_w, width);
+    int hstart = ph * STRIDE_H - PAD_H;
+    int wstart = pw * STRIDE_W - PAD_W;
+    const int hend = min(hstart + KERNEL_H, height);
+    const int wend = min(wstart + KERNEL_W, width);
     hstart = max(hstart, (int)0);
     wstart = max(wstart, (int)0);
     Dtype maxval = -FLT_MAX;
@@ -79,38 +88,19 @@ void TEMPLATE(max_pool_forward_impl, Dtype)(
       }
     }
     top_data[index] = maxval;
-    if (!no_mask) {
-      if (use_mask == 1) {
-        mask[index] = maxidx;
-      } else {
-        top_mask[index] = maxidx;
-      }
-    }
+#ifdef HAVE_MASK
+    mask[index] = maxidx;
+#endif
   }
 }
 
-__kernel void TEMPLATE(max_pool_forward, Dtype)(
-    const int nthreads, __global const Dtype* bottom_data, const int num,
-    const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int kernel_h,
-    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-    const int pad_w,
-    __global Dtype* top_data,
-    const int use_mask, __global int* mask, __global Dtype* top_mask)
-{
-  TEMPLATE(max_pool_forward_impl, Dtype)(
-      nthreads, bottom_data, num, channels, height, width,
-      pooled_height, pooled_width, kernel_h,
-      kernel_w, stride_h, stride_w, pad_h, pad_w, top_data, use_mask, mask, top_mask, false
-  );
-}
+#elif defined KERNEL_AVE_POOL
 
 __kernel void TEMPLATE(ave_pool_forward, Dtype)(
-    const int nthreads, __global const Dtype* const bottom_data, const int num,
+    const int nthreads, __global const Dtype* const bottom_data,
     const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int kernel_h,
-    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-    const int pad_w, __global Dtype* top_data)
+    const int pooled_height, const int pooled_width,
+    __global Dtype* top_data)
 {
   for (int index = get_global_id(0); index < nthreads;
       index += get_global_size(0))
@@ -120,10 +110,10 @@ __kernel void TEMPLATE(ave_pool_forward, Dtype)(
     const int ph = (index / pooled_width) % pooled_height;
     const int c = (index / pooled_width / pooled_height) % channels;
     const int n = index / pooled_width / pooled_height / channels;
-    int hstart = ph * stride_h - pad_h;
-    int wstart = pw * stride_w - pad_w;
-    int hend = min(hstart + kernel_h, height + pad_h);
-    int wend = min(wstart + kernel_w, width + pad_w);
+    int hstart = ph * STRIDE_H - PAD_H;
+    int wstart = pw * STRIDE_W - PAD_W;
+    int hend = min(hstart + KERNEL_H, height + PAD_H);
+    int wend = min(wstart + KERNEL_W, width + PAD_W);
     const int pool_size = (hend - hstart) * (wend - wstart);
     hstart = max(hstart, (int)0);
     wstart = max(wstart, (int)0);
@@ -142,11 +132,12 @@ __kernel void TEMPLATE(ave_pool_forward, Dtype)(
   }
 }
 
+#elif defined KERNEL_STO_POOL
+
 __kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
-    const int nthreads, __global const Dtype* const bottom_data, const int num,
+    const int nthreads, __global const Dtype* const bottom_data,
     const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int kernel_h,
-    const int kernel_w, const int stride_h, const int stride_w,
+    const int pooled_height, const int pooled_width,
     __global Dtype* top_data)
 {
   for (int index = get_global_id(0); index < nthreads;
@@ -156,10 +147,10 @@ __kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
     const int ph = (index / pooled_width) % pooled_height;
     const int c = (index / pooled_width / pooled_height) % channels;
     const int n = index / pooled_width / pooled_height / channels;
-    const int hstart = ph * stride_h;
-    const int hend = min(hstart + kernel_h, height);
-    const int wstart = pw * stride_w;
-    const int wend = min(wstart + kernel_w, width);
+    const int hstart = ph * STRIDE_H;
+    const int hend = min(hstart + KERNEL_H, height);
+    const int wstart = pw * STRIDE_W;
+    const int wend = min(wstart + KERNEL_W, width);
     // We set cumsum to be 0 to avoid divide-by-zero problems
     Dtype cumsum = FLT_MIN;
     Dtype cumvalues = 0.;
@@ -168,10 +159,13 @@ __kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
     // First pass: get sum
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
-        cumsum += bottom_slice[h * width + w];
-        cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w];
+        Dtype v = bottom_slice[h * width + w];
+        cumsum += v;
+        cumvalues += v * v;
       }
     }
     top_data[index] = cumvalues / cumsum;
   }
 }
+
+#endif // KERNEL_*
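With this change, a single .cl source yields three specialized programs, selected at build time via -D KERNEL_MAX_POOL, -D KERNEL_AVE_POOL or -D KERNEL_STO_POOL, and the window geometry comes from the KERNEL_*/STRIDE_*/PAD_* macros instead of kernel arguments. As a reference for the arithmetic each work item performs, here is a plain C++ sketch (not from this commit; all names are hypothetical) of the same flat-index decomposition and clamped-window max pooling:

    // Reference-only sketch of the per-work-item index math in the kernels above.
    #include <algorithm>
    #include <cfloat>
    #include <vector>

    void maxPoolForwardRef(const std::vector<float>& bottom, std::vector<float>& top,
                           int num, int channels, int height, int width,
                           int pooledH, int pooledW,
                           int kernelH, int kernelW, int strideH, int strideW,
                           int padH, int padW)
    {
        const int count = num * channels * pooledH * pooledW;
        top.assign(count, 0.f);
        for (int index = 0; index < count; ++index)
        {
            // Same decomposition as the kernel: flat index -> (n, c, ph, pw)
            const int pw = index % pooledW;
            const int ph = (index / pooledW) % pooledH;
            const int c  = (index / pooledW / pooledH) % channels;
            const int n  = index / pooledW / pooledH / channels;

            // Pooling window, clamped to the input exactly as in the kernel
            int hstart = ph * strideH - padH;
            int wstart = pw * strideW - padW;
            const int hend = std::min(hstart + kernelH, height);
            const int wend = std::min(wstart + kernelW, width);
            hstart = std::max(hstart, 0);
            wstart = std::max(wstart, 0);

            const float* slice = &bottom[(size_t)(n * channels + c) * height * width];
            float maxval = -FLT_MAX;
            for (int h = hstart; h < hend; ++h)
                for (int w = wstart; w < wend; ++w)
                    maxval = std::max(maxval, slice[h * width + w]);
            top[index] = maxval;
        }
    }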
