pythonwebcoder
diff --git a/‎modules/dnn/src/dnn.cpp
Lines changed: 26 additions & 10 deletions b/‎modules/dnn/src/dnn.cpp
Lines changed: 26 additions & 10 deletions
diff --git a/‎modules/dnn/src/layers/convolution_layer.cpp
Lines changed: 82 additions & 15 deletions b/‎modules/dnn/src/layers/convolution_layer.cpp
Lines changed: 82 additions & 15 deletions
@@ -1023,7 +1023,7 @@ struct Net::Impl
 
     void fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
     {
-        if( !fusion || !(preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_CPU))
+        if( !fusion || preferableBackend != DNN_BACKEND_DEFAULT)
             return;
 
         CV_TRACE_FUNCTION();
@@ -1051,6 +1051,11 @@ struct Net::Impl
             // with the current layer if they follow it. Normally, the are fused with the convolution layer,
             // but some of them (like activation) may be fused with fully-connected, elemwise (+) and
             // some other layers.
+
+            // TODO: OpenCL target support more fusion styles.
+            if ( preferableTarget == DNN_TARGET_OPENCL && ld.layerInstance->type.compare("Convolution") )
+                continue;
+
             Ptr<Layer>& currLayer = ld.layerInstance;
             if( ld.consumers.size() == 1 && pinsToKeep.count(LayerPin(lid, 0)) == 0 )
             {
@@ -1095,16 +1100,27 @@ struct Net::Impl
                     }
                 }
 
-                Ptr<ActivationLayer> nextActivLayer;
-                if( nextData )
-                    nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
-
-                if( !nextActivLayer.empty() && pinsToKeep.count(lpNext) == 0
-                        && currLayer->setActivation(nextActivLayer) )
+                // For now,  OpenCL target only support fusion with activation of ReLU/ChannelsPReLU
+                if ( preferableTarget != DNN_TARGET_OPENCL ||
+                        (preferableTarget == DNN_TARGET_OPENCL &&
+                         nextData &&
+                        (!nextData->type.compare("ReLU") ||
+                         !nextData->type.compare("ChannelsPReLU"))) )
                 {
-                    printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
-                    nextData->skipFlags[DNN_BACKEND_DEFAULT] = true;
-                    ld.outputBlobs = layers[lpNext.lid].outputBlobs;
+
+                    Ptr<ActivationLayer> nextActivLayer;
+
+                    if( nextData )
+                        nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
+
+                    if( !nextActivLayer.empty() && pinsToKeep.count(lpNext) == 0
+                            && currLayer->setActivation(nextActivLayer) )
+                    {
+                        LayerData *activData = nextData;
+                        printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
+                        activData->skipFlags[DNN_BACKEND_DEFAULT] = true;
+                        ld.outputBlobs = layers[lpNext.lid].outputBlobs;
+                    }
                 }
             }
 
 
@@ -157,7 +157,20 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
 #ifdef HAVE_OPENCL
     Ptr<OCL4DNNConvSpatial<float> > convolutionOp;
     std::vector<UMat> umat_blobs;
+    bool fusedBias;
+    bool newWeightAndBias;
+    bool newActiv;
+    ocl4dnnFusedActiv_t activType;
 #endif
+    ConvolutionLayerImpl()
+    {
+#ifdef HAVE_OPENCL
+        fusedBias = false;
+        newWeightAndBias = false;
+        newActiv = false;
+        activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
+#endif
+    }
 
     MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const
     {
@@ -209,6 +222,10 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
         activ = layer;
         if (activ.empty())
             reluslope.clear();
+#ifdef HAVE_OPENCL
+        newActiv = true;
+        activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
+#endif
         return !activ.empty();
     }
 
@@ -221,6 +238,10 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
         // we will need to re-compute the weights with the batch
         // norm coefficients taken into account
         weightsMat.release();
+#ifdef HAVE_OPENCL
+        newWeightAndBias = true;
+        fusedBias = false;
+#endif
         return !bnorm.empty();
     }
 
@@ -230,6 +251,10 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
         // we will need to re-compute the weights with the scaling
         // coefficients taken into account
         weightsMat.release();
+#ifdef HAVE_OPENCL
+        newWeightAndBias = true;
+        fusedBias = false;
+#endif
         return !scaleLayer.empty();
     }
 
@@ -665,19 +690,49 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
             convolutionOp = Ptr<OCL4DNNConvSpatial<float> >(new OCL4DNNConvSpatial<float>(config));
         }
 
-        for (size_t ii = 0; ii < outputs.size(); ii++)
+        if ( newWeightAndBias )
         {
-            UMat inpMat, outMat;
-            inpMat = inputs[ii]->getUMat(ACCESS_READ);
-            outMat = outputs[ii].getUMat(ACCESS_WRITE);
-
-            int batch_size = inpMat.size[0];
+            weightsMat.copyTo(umat_blobs[0]);
+            if ( fusedBias )
+            {
+                if ( umat_blobs.size() < 2 )
+                    umat_blobs.resize(2);
+                umat_blobs[1] = UMat(biasvec, true);
+            }
+            convolutionOp->setBias(fusedBias || hasBias());
+            newWeightAndBias = false;
+        }
 
-            if (!convolutionOp->Forward(inpMat, umat_blobs[0], hasBias() ? umat_blobs[1] : UMat(),
-                                        outMat, batch_size))
-               return false;
+        if ( newActiv )
+        {
+            if ( activType == OCL4DNN_CONV_FUSED_ACTIV_RELU )
+            {
+                CV_Assert(!reluslope.empty());
+                convolutionOp->setActivReLU(true, reluslope[0]);
+            }
+            else if ( activType == OCL4DNN_CONV_FUSED_ACTIV_PRELU)
+            {
+                CV_Assert(!reluslope.empty());
+                convolutionOp->setActivPReLU(true, reluslope);
+            }
+            else
+            {
+                convolutionOp->setActivReLU(false, 0);
+                convolutionOp->setActivPReLU(false, reluslope);
+            }
+            newActiv = false;
         }
-        return true;
+
+        UMat inpMat, outMat;
+        inpMat = inputs[0]->getUMat(ACCESS_READ);
+        outMat = outputs[0].getUMat(ACCESS_WRITE);
+        int batch_size = inpMat.size[0];
+
+        return convolutionOp->Forward(inpMat,
+                                      umat_blobs[0],
+                                      (hasBias() || fusedBias) ? umat_blobs[1] : UMat(),
+                                      outMat,
+                                      batch_size);
     }
 #endif
 
@@ -693,11 +748,6 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
         CV_Assert(inputs.size() == (size_t)1 && inputs[0]->size[1] % blobs[0].size[1] == 0);
         int ngroups = inputs[0]->size[1]/blobs[0].size[1];
         CV_Assert(outputs[0].size[1] % ngroups == 0);
-
-        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
-                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
-                   forward_ocl(inputs, outputs, internals))
-
         int k, outCn = blobs[0].size[0];
 
         if( weightsMat.empty() )
@@ -761,6 +811,11 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
                     }
                 }
 
+#ifdef HAVE_OPENCL
+                if (shiftptr || shiftptr2)
+                    fusedBias = true;
+#endif
+
                 for( int i = 0; i < outCn; i++ )
                 {
                     float s1 = scaleptr ? scaleptr[i] : 1.f;
@@ -784,7 +839,12 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
         {
             Ptr<ReLULayer> activ_relu = activ.dynamicCast<ReLULayer>();
             if( !activ_relu.empty() )
+            {
                 reluslope.assign(outCn+2, activ_relu->negativeSlope);
+#ifdef HAVE_OPENCL
+                activType = OCL4DNN_CONV_FUSED_ACTIV_RELU;
+#endif
+            }
 
             Ptr<ChannelsPReLULayer> activ_chprelu = activ.dynamicCast<ChannelsPReLULayer>();
             if( !activ_chprelu.empty() )
@@ -795,9 +855,16 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
                 reluslope.resize(outCn+2);
                 std::copy(mdata, mdata + outCn, reluslope.begin());
                 reluslope[outCn] = reluslope[outCn+1] = reluslope[outCn-1];
+#ifdef HAVE_OPENCL
+                activType = OCL4DNN_CONV_FUSED_ACTIV_PRELU;
+#endif
             }
         }
 
+        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+                   forward_ocl(inputs, outputs, internals))
+
         int nstripes = std::max(getNumThreads(), 1);
 
         ParallelConv::run(*inputs[0], outputs[0], weightsMat, biasvec, reluslope,