pythonwebcoder
diff --git a/‎modules/dnn/src/layers/concat_layer.cpp
Lines changed: 6 additions & 5 deletions b/‎modules/dnn/src/layers/concat_layer.cpp
Lines changed: 6 additions & 5 deletions
diff --git a/‎modules/dnn/src/layers/detection_output_layer.cpp
Lines changed: 166 additions & 16 deletions b/‎modules/dnn/src/layers/detection_output_layer.cpp
Lines changed: 166 additions & 16 deletions
diff --git a/‎modules/dnn/src/layers/permute_layer.cpp
Lines changed: 59 additions & 0 deletions b/‎modules/dnn/src/layers/permute_layer.cpp
Lines changed: 59 additions & 0 deletions
@@ -185,12 +185,13 @@ class ConcatLayerImpl : public ConcatLayer
         outs.getUMatVector(outputs);
 
         int cAxis = clamp(axis, inputs[0].dims);
-        if (!(cAxis == 1 && outputs[0].dims == 4 && !padding))
+        if (padding)
             return false;
 
         int bottom_concat_axis;
-        int concat_size = inputs[0].size[2] * inputs[0].size[3];
-        int top_concat_axis = outputs[0].size[1];
+        int concat_size = total(shape(inputs[0]), cAxis + 1);
+        int top_concat_axis = outputs[0].size[cAxis];
+        int num_concats = total(shape(inputs[0]), 0, cAxis);
         int offset_concat_axis = 0;
         UMat& outMat = outputs[0];
         String buildopt = String("-DDtype=") + ocl::typeToStr(inputs[0].type()) + String(" ");
@@ -202,12 +203,12 @@ class ConcatLayerImpl : public ConcatLayer
                 return false;
 
             UMat& inpMat = inputs[i];
-            bottom_concat_axis = inputs[i].size[1];
+            bottom_concat_axis = inputs[i].size[cAxis];
             size_t nthreads = inputs[i].total();
 
             kernel.set(0, (int)nthreads);
             kernel.set(1, ocl::KernelArg::PtrReadOnly(inpMat));
-            kernel.set(2, (int)inputs[i].size[0]);
+            kernel.set(2, (int)num_concats);
             kernel.set(3, (int)concat_size);
             kernel.set(4, (int)top_concat_axis);
             kernel.set(5, (int)bottom_concat_axis);
 
@@ -45,6 +45,7 @@
 #include <float.h>
 #include <string>
 #include "../nms.inl.hpp"
+#include "opencl_kernels_dnn.hpp"
 
 namespace cv
 {
@@ -211,11 +212,160 @@ class DetectionOutputLayerImpl : public DetectionOutputLayer
         return false;
     }
 
+#ifdef HAVE_OPENCL
+    // Decode all bboxes in a batch
+    bool ocl_DecodeBBoxesAll(UMat& loc_mat, UMat& prior_mat,
+                             const int num, const int numPriors, const bool share_location,
+                             const int num_loc_classes, const int background_label_id,
+                             const cv::String& code_type, const bool variance_encoded_in_target,
+                             const bool clip, std::vector<LabelBBox>& all_decode_bboxes)
+    {
+        UMat outmat = UMat(loc_mat.dims, loc_mat.size, CV_32F);
+        size_t nthreads = loc_mat.total();
+        String kernel_name;
+
+        if (code_type == "CORNER")
+            kernel_name = "DecodeBBoxesCORNER";
+        else if (code_type == "CENTER_SIZE")
+            kernel_name = "DecodeBBoxesCENTER_SIZE";
+        else
+            return false;
+
+        for (int i = 0; i < num; ++i)
+        {
+            ocl::Kernel kernel(kernel_name.c_str(), ocl::dnn::detection_output_oclsrc);
+            kernel.set(0, (int)nthreads);
+            kernel.set(1, ocl::KernelArg::PtrReadOnly(loc_mat));
+            kernel.set(2, ocl::KernelArg::PtrReadOnly(prior_mat));
+            kernel.set(3, (int)variance_encoded_in_target);
+            kernel.set(4, (int)numPriors);
+            kernel.set(5, (int)share_location);
+            kernel.set(6, (int)num_loc_classes);
+            kernel.set(7, (int)background_label_id);
+            kernel.set(8, (int)clip);
+            kernel.set(9, ocl::KernelArg::PtrWriteOnly(outmat));
+
+            if (!kernel.run(1, &nthreads, NULL, false))
+                return false;
+        }
+
+        all_decode_bboxes.clear();
+        all_decode_bboxes.resize(num);
+        {
+            Mat mat = outmat.getMat(ACCESS_READ);
+            const float* decode_data = mat.ptr<float>();
+            for (int i = 0; i < num; ++i)
+            {
+                LabelBBox& decode_bboxes = all_decode_bboxes[i];
+                for (int c = 0; c < num_loc_classes; ++c)
+                {
+                    int label = share_location ? -1 : c;
+                    decode_bboxes[label].resize(numPriors);
+                    for (int p = 0; p < numPriors; ++p)
+                    {
+                        int startIdx = p * num_loc_classes * 4;
+                        util::NormalizedBBox& bbox = decode_bboxes[label][p];
+                        bbox.xmin = decode_data[startIdx + c * 4];
+                        bbox.ymin = decode_data[startIdx + c * 4 + 1];
+                        bbox.xmax = decode_data[startIdx + c * 4 + 2];
+                        bbox.ymax = decode_data[startIdx + c * 4 + 3];
+                    }
+                }
+            }
+        }
+        return true;
+    }
+
+    void ocl_GetConfidenceScores(const UMat& inp1, const int num,
+                                 const int numPredsPerClass, const int numClasses,
+                                 std::vector<Mat>& confPreds)
+    {
+        int shape[] = { numClasses, numPredsPerClass };
+        for (int i = 0; i < num; i++)
+            confPreds.push_back(Mat(2, shape, CV_32F));
+
+        UMat umat = inp1.reshape(1, num * numPredsPerClass);
+        for (int i = 0; i < num; ++i)
+        {
+            Range ranges[] = { Range(i * numPredsPerClass, (i + 1) * numPredsPerClass), Range::all() };
+            transpose(umat(ranges), confPreds[i]);
+        }
+    }
+
+    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
+    {
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+
+        inps.getUMatVector(inputs);
+        outs.getUMatVector(outputs);
+
+        std::vector<LabelBBox> allDecodedBBoxes;
+        std::vector<Mat> allConfidenceScores;
+
+        int num = inputs[0].size[0];
+
+        // extract predictions from input layers
+        {
+            int numPriors = inputs[2].size[2] / 4;
+
+            // Retrieve all confidences
+            ocl_GetConfidenceScores(inputs[1], num, numPriors, _numClasses, allConfidenceScores);
+
+            // Decode all loc predictions to bboxes
+            bool ret = ocl_DecodeBBoxesAll(inputs[0], inputs[2], num, numPriors,
+                                           _shareLocation, _numLocClasses, _backgroundLabelId,
+                                           _codeType, _varianceEncodedInTarget, false,
+                                           allDecodedBBoxes);
+            if (!ret)
+                return false;
+        }
+
+        size_t numKept = 0;
+        std::vector<std::map<int, std::vector<int> > > allIndices;
+        for (int i = 0; i < num; ++i)
+        {
+            numKept += processDetections_(allDecodedBBoxes[i], allConfidenceScores[i], allIndices);
+        }
+
+        if (numKept == 0)
+        {
+            // Set confidences to zeros.
+            Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)};
+            outputs[0](ranges).setTo(0);
+            return true;
+        }
+        int outputShape[] = {1, 1, (int)numKept, 7};
+        UMat umat = UMat(4, outputShape, CV_32F);
+        {
+            Mat mat = umat.getMat(ACCESS_WRITE);
+            float* outputsData = mat.ptr<float>();
+
+            size_t count = 0;
+            for (int i = 0; i < num; ++i)
+            {
+                count += outputDetections_(i, &outputsData[count * 7],
+                                           allDecodedBBoxes[i], allConfidenceScores[i],
+                                           allIndices[i]);
+            }
+            CV_Assert(count == numKept);
+        }
+        outputs.clear();
+        outputs.push_back(umat);
+        outs.assign(outputs);
+        return true;
+    }
+#endif
+
     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
     {
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
+        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
+
         Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
     }
 
@@ -225,7 +375,7 @@ class DetectionOutputLayerImpl : public DetectionOutputLayer
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
         std::vector<LabelBBox> allDecodedBBoxes;
-        std::vector<std::vector<std::vector<float> > > allConfidenceScores;
+        std::vector<Mat> allConfidenceScores;
 
         int num = inputs[0]->size[0];
 
@@ -286,17 +436,17 @@ class DetectionOutputLayerImpl : public DetectionOutputLayer
 
     size_t outputDetections_(
             const int i, float* outputsData,
-            const LabelBBox& decodeBBoxes, const std::vector<std::vector<float> >& confidenceScores,
+            const LabelBBox& decodeBBoxes, Mat& confidenceScores,
             const std::map<int, std::vector<int> >& indicesMap
     )
     {
         size_t count = 0;
         for (std::map<int, std::vector<int> >::const_iterator it = indicesMap.begin(); it != indicesMap.end(); ++it)
         {
             int label = it->first;
-            if (confidenceScores.size() <= label)
+            if (confidenceScores.rows <= label)
                 CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find confidence predictions for label %d", label));
-            const std::vector<float>& scores = confidenceScores[label];
+            const std::vector<float>& scores = confidenceScores.row(label);
             int locLabel = _shareLocation ? -1 : label;
             LabelBBox::const_iterator label_bboxes = decodeBBoxes.find(locLabel);
             if (label_bboxes == decodeBBoxes.end())
@@ -320,7 +470,7 @@ class DetectionOutputLayerImpl : public DetectionOutputLayer
     }
 
     size_t processDetections_(
-            const LabelBBox& decodeBBoxes, const std::vector<std::vector<float> >& confidenceScores,
+            const LabelBBox& decodeBBoxes, Mat& confidenceScores,
             std::vector<std::map<int, std::vector<int> > >& allIndices
     )
     {
@@ -330,10 +480,10 @@ class DetectionOutputLayerImpl : public DetectionOutputLayer
         {
             if (c == _backgroundLabelId)
                 continue; // Ignore background class.
-            if (c >= confidenceScores.size())
+            if (c >= confidenceScores.rows)
                 CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find confidence predictions for label %d", c));
 
-            const std::vector<float>& scores = confidenceScores[c];
+            const std::vector<float> scores = confidenceScores.row(c);
             int label = _shareLocation ? -1 : c;
 
             LabelBBox::const_iterator label_bboxes = decodeBBoxes.find(label);
@@ -351,9 +501,9 @@ class DetectionOutputLayerImpl : public DetectionOutputLayer
             {
                 int label = it->first;
                 const std::vector<int>& labelIndices = it->second;
-                if (label >= confidenceScores.size())
+                if (label >= confidenceScores.rows)
                     CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find location predictions for label %d", label));
-                const std::vector<float>& scores = confidenceScores[label];
+                const std::vector<float>& scores = confidenceScores.row(label);
                 for (size_t j = 0; j < labelIndices.size(); ++j)
                 {
                     size_t idx = labelIndices[j];
@@ -630,20 +780,20 @@ class DetectionOutputLayerImpl : public DetectionOutputLayer
     //      confidence prediction for an image.
     static void GetConfidenceScores(const float* confData, const int num,
                              const int numPredsPerClass, const int numClasses,
-                             std::vector<std::vector<std::vector<float> > >& confPreds)
+                             std::vector<Mat>& confPreds)
     {
-        confPreds.clear(); confPreds.resize(num);
+        int shape[] = { numClasses, numPredsPerClass };
+        for (int i = 0; i < num; i++)
+            confPreds.push_back(Mat(2, shape, CV_32F));
+
         for (int i = 0; i < num; ++i, confData += numPredsPerClass * numClasses)
         {
-            std::vector<std::vector<float> >& labelScores = confPreds[i];
-            labelScores.resize(numClasses);
+            Mat labelScores = confPreds[i];
             for (int c = 0; c < numClasses; ++c)
             {
-                std::vector<float>& classLabelScores = labelScores[c];
-                classLabelScores.resize(numPredsPerClass);
                 for (int p = 0; p < numPredsPerClass; ++p)
                 {
-                    classLabelScores[p] = confData[p * numClasses + c];
+                    labelScores.at<float>(c, p) = confData[p * numClasses + c];
                 }
             }
         }
 
@@ -44,6 +44,7 @@
 #include "layers_common.hpp"
 #include <float.h>
 #include <algorithm>
+#include "opencl_kernels_dnn.hpp"
 
 namespace cv
 {
@@ -173,6 +174,24 @@ class PermuteLayerImpl : public PermuteLayer
         CV_Assert((int)_numAxes == inp0.dims);
 
         computeStrides(shape(*inputs[0]), shape(outputs[0]));
+
+#ifdef HAVE_OPENCL
+        if (uorder.empty())
+        {
+            std::vector<int> orderVec(_order.begin(), _order.end());;
+            Mat morder(1, orderVec.size(), CV_32SC1, &orderVec[0]);
+
+            std::vector<int> oldStrideVec(_oldStride.begin(), _oldStride.end());
+            Mat mold_stride(1, _oldStride.size(), CV_32SC1, &oldStrideVec[0]);
+
+            std::vector<int> newStrideVec(_newStride.begin(), _newStride.end());
+            Mat mnew_stride(1, newStrideVec.size(), CV_32SC1, &newStrideVec[0]);
+
+            morder.copyTo(uorder);
+            mold_stride.copyTo(uold_stride);
+            mnew_stride.copyTo(unew_stride);
+        }
+#endif
     }
 
     class PermuteInvoker : public ParallelLoopBody
@@ -247,11 +266,47 @@ class PermuteLayerImpl : public PermuteLayer
         }
     };
 
+#ifdef HAVE_OPENCL
+    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
+    {
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+
+        inps.getUMatVector(inputs);
+        outs.getUMatVector(outputs);
+
+        if (!_needsPermute)
+            return false;
+
+        for (size_t i = 0; i < inputs.size(); i++)
+        {
+            ocl::Kernel kernel("permute", ocl::dnn::permute_oclsrc);
+
+            kernel.set(0, (int)_count);
+            kernel.set(1, ocl::KernelArg::PtrReadOnly(inputs[i]));
+            kernel.set(2, ocl::KernelArg::PtrReadOnly(uorder));
+            kernel.set(3, ocl::KernelArg::PtrReadOnly(uold_stride));
+            kernel.set(4, ocl::KernelArg::PtrReadOnly(unew_stride));
+            kernel.set(5, (int)_numAxes);
+            kernel.set(6, ocl::KernelArg::PtrWriteOnly(outputs[i]));
+
+            if (!kernel.run(1, &_count, NULL, false))
+                return false;
+        }
+
+        return true;
+    }
+#endif
+
     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
     {
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
+        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
+
         Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
     }
 
@@ -325,6 +380,10 @@ class PermuteLayerImpl : public PermuteLayer
     std::vector<size_t> _newStride;
     bool _needsPermute;
 
+#ifdef HAVE_OPENCL
+    UMat uorder, uold_stride, unew_stride;
+#endif
+
     size_t _numAxes;
 };