Commit e0e4040
Merge pull request opencv#9847 from wzw-intel:ocl4dnn_fusion
2 parents ff037eb + 2d8f2c2

File tree: 6 files changed, +721 / -707 lines

modules/dnn/src/dnn.cpp
Lines changed: 26 additions & 10 deletions
@@ -1028,7 +1028,7 @@ struct Net::Impl

     void fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
     {
-        if( !fusion || !(preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_CPU))
+        if( !fusion || preferableBackend != DNN_BACKEND_DEFAULT)
             return;

         CV_TRACE_FUNCTION();
@@ -1056,6 +1056,11 @@ struct Net::Impl
             // with the current layer if they follow it. Normally, the are fused with the convolution layer,
             // but some of them (like activation) may be fused with fully-connected, elemwise (+) and
             // some other layers.
+
+            // TODO: OpenCL target support more fusion styles.
+            if ( preferableTarget == DNN_TARGET_OPENCL && ld.layerInstance->type.compare("Convolution") )
+                continue;
+
             Ptr<Layer>& currLayer = ld.layerInstance;
             if( ld.consumers.size() == 1 && pinsToKeep.count(LayerPin(lid, 0)) == 0 )
             {
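Taken together with the gate change in the first hunk, fusion now runs for every target of the default backend, and the OpenCL target is then restricted to fusion anchored at Convolution layers. A minimal standalone sketch of the combined check (the enums and the function are illustrative stand-ins, not the dnn.cpp internals):

#include <string>

enum Backend { DNN_BACKEND_DEFAULT };
enum Target  { DNN_TARGET_CPU, DNN_TARGET_OPENCL };

// Is this layer a candidate for fusion at all?
static bool fusionCandidate(bool fusion, Backend backend, Target target,
                            const std::string& layerType)
{
    if (!fusion || backend != DNN_BACKEND_DEFAULT)
        return false;                     // fusion only on the default backend
    if (target == DNN_TARGET_OPENCL && layerType != "Convolution")
        return false;                     // OpenCL: convolution-anchored fusion only
    return true;
}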
@@ -1100,16 +1105,27 @@ struct Net::Impl
                 }
             }

-            Ptr<ActivationLayer> nextActivLayer;
-            if( nextData )
-                nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
-
-            if( !nextActivLayer.empty() && pinsToKeep.count(lpNext) == 0
-                && currLayer->setActivation(nextActivLayer) )
+            // For now, OpenCL target only support fusion with activation of ReLU/ChannelsPReLU
+            if ( preferableTarget != DNN_TARGET_OPENCL ||
+                 (preferableTarget == DNN_TARGET_OPENCL &&
+                  nextData &&
+                  (!nextData->type.compare("ReLU") ||
+                   !nextData->type.compare("ChannelsPReLU"))) )
             {
-                printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
-                nextData->skipFlags[DNN_BACKEND_DEFAULT] = true;
-                ld.outputBlobs = layers[lpNext.lid].outputBlobs;
+
+                Ptr<ActivationLayer> nextActivLayer;
+
+                if( nextData )
+                    nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
+
+                if( !nextActivLayer.empty() && pinsToKeep.count(lpNext) == 0
+                    && currLayer->setActivation(nextActivLayer) )
+                {
+                    LayerData *activData = nextData;
+                    printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
+                    activData->skipFlags[DNN_BACKEND_DEFAULT] = true;
+                    ld.outputBlobs = layers[lpNext.lid].outputBlobs;
+                }
             }
         }

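The OpenCL restriction recurs for the fused activation: the following layer's type string must be "ReLU" or "ChannelsPReLU", while other targets keep the old behaviour of offering any activation layer to setActivation(). A hedged sketch of the whitelist, reusing the illustrative Target enum from the sketch above:

// May an activation of type `nextType` be fused on this target? (sketch only)
static bool activationFusable(Target target, const std::string& nextType)
{
    if (target != DNN_TARGET_OPENCL)
        return true;  // non-OpenCL targets: let setActivation() decide
    return nextType == "ReLU" || nextType == "ChannelsPReLU";
}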

modules/dnn/src/layers/convolution_layer.cpp
Lines changed: 82 additions & 15 deletions
@@ -157,7 +157,20 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
 #ifdef HAVE_OPENCL
     Ptr<OCL4DNNConvSpatial<float> > convolutionOp;
     std::vector<UMat> umat_blobs;
+    bool fusedBias;
+    bool newWeightAndBias;
+    bool newActiv;
+    ocl4dnnFusedActiv_t activType;
 #endif
+    ConvolutionLayerImpl()
+    {
+#ifdef HAVE_OPENCL
+        fusedBias = false;
+        newWeightAndBias = false;
+        newActiv = false;
+        activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
+#endif
+    }

     MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const
     {
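The four new members implement a dirty-flag scheme: the fusion setters in the next hunks only record that the weights/bias or the activation changed, and forward_ocl() pushes the new state into the OCL4DNN convolution object on its next run. A reduced sketch of the pattern (an illustrative struct, not the real class):

// Dirty-flag pattern used by ConvolutionLayerImpl (sketch only).
struct ConvFusionState
{
    bool newWeightAndBias = false; // weights/bias changed since last upload
    bool newActiv = false;         // fused activation changed since last upload
    bool fusedBias = false;        // a bias was folded in from batch-norm/scale

    void onFuseWeights()   { newWeightAndBias = true; fusedBias = false; }
    void onSetActivation() { newActiv = true; }

    void forward()
    {
        if (newWeightAndBias) { /* re-upload weights and bias */  newWeightAndBias = false; }
        if (newActiv)         { /* reconfigure fused activation */ newActiv = false; }
        // ... launch the convolution kernel ...
    }
};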
@@ -209,6 +222,10 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
         activ = layer;
         if (activ.empty())
             reluslope.clear();
+#ifdef HAVE_OPENCL
+        newActiv = true;
+        activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
+#endif
         return !activ.empty();
     }

@@ -221,6 +238,10 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
         // we will need to re-compute the weights with the batch
         // norm coefficients taken into account
         weightsMat.release();
+#ifdef HAVE_OPENCL
+        newWeightAndBias = true;
+        fusedBias = false;
+#endif
         return !bnorm.empty();
     }

@@ -230,6 +251,10 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
         // we will need to re-compute the weights with the scaling
         // coefficients taken into account
         weightsMat.release();
+#ifdef HAVE_OPENCL
+        newWeightAndBias = true;
+        fusedBias = false;
+#endif
         return !scaleLayer.empty();
     }
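Note that both setBatchNorm() and setScale() reset fusedBias to false: whether the folded coefficients actually contribute a bias term is only known once the weights are re-computed on the CPU path, which sets fusedBias again when a shift term is present (see the shiftptr hunk below).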

@@ -665,19 +690,49 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
             convolutionOp = Ptr<OCL4DNNConvSpatial<float> >(new OCL4DNNConvSpatial<float>(config));
         }

-        for (size_t ii = 0; ii < outputs.size(); ii++)
+        if ( newWeightAndBias )
         {
-            UMat inpMat, outMat;
-            inpMat = inputs[ii]->getUMat(ACCESS_READ);
-            outMat = outputs[ii].getUMat(ACCESS_WRITE);
-
-            int batch_size = inpMat.size[0];
+            weightsMat.copyTo(umat_blobs[0]);
+            if ( fusedBias )
+            {
+                if ( umat_blobs.size() < 2 )
+                    umat_blobs.resize(2);
+                umat_blobs[1] = UMat(biasvec, true);
+            }
+            convolutionOp->setBias(fusedBias || hasBias());
+            newWeightAndBias = false;
+        }

-            if (!convolutionOp->Forward(inpMat, umat_blobs[0], hasBias() ? umat_blobs[1] : UMat(),
-                                        outMat, batch_size))
-                return false;
+        if ( newActiv )
+        {
+            if ( activType == OCL4DNN_CONV_FUSED_ACTIV_RELU )
+            {
+                CV_Assert(!reluslope.empty());
+                convolutionOp->setActivReLU(true, reluslope[0]);
+            }
+            else if ( activType == OCL4DNN_CONV_FUSED_ACTIV_PRELU)
+            {
+                CV_Assert(!reluslope.empty());
+                convolutionOp->setActivPReLU(true, reluslope);
+            }
+            else
+            {
+                convolutionOp->setActivReLU(false, 0);
+                convolutionOp->setActivPReLU(false, reluslope);
+            }
+            newActiv = false;
         }
-        return true;
+
+        UMat inpMat, outMat;
+        inpMat = inputs[0]->getUMat(ACCESS_READ);
+        outMat = outputs[0].getUMat(ACCESS_WRITE);
+        int batch_size = inpMat.size[0];
+
+        return convolutionOp->Forward(inpMat,
+                                      umat_blobs[0],
+                                      (hasBias() || fusedBias) ? umat_blobs[1] : UMat(),
+                                      outMat,
+                                      batch_size);
     }
 #endif
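Two behavioural changes in forward_ocl() are worth noting. First, reconfiguration is lazy: weights and the (possibly fused) bias are re-uploaded only when newWeightAndBias is set, and the fused activation is reprogrammed only when newActiv is set, so steady-state forwards skip both branches. Second, the bias UMat is now passed when either hasBias() or fusedBias holds, so a bias folded in from batch normalization or scaling reaches the kernel even if the layer itself has no bias blob; the old per-output loop is replaced by a single Forward() call on inputs[0]/outputs[0].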

@@ -693,11 +748,6 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
         CV_Assert(inputs.size() == (size_t)1 && inputs[0]->size[1] % blobs[0].size[1] == 0);
         int ngroups = inputs[0]->size[1]/blobs[0].size[1];
         CV_Assert(outputs[0].size[1] % ngroups == 0);
-
-        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
-                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
-                   forward_ocl(inputs, outputs, internals))
-
         int k, outCn = blobs[0].size[0];

         if( weightsMat.empty() )
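The CV_OCL_RUN dispatch is not removed outright; it moves further down in forward() (see the last hunk) so that the fused weights, reluslope, and activType are all populated before forward_ocl() consumes them.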
@@ -761,6 +811,11 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
             }
         }

+#ifdef HAVE_OPENCL
+        if (shiftptr || shiftptr2)
+            fusedBias = true;
+#endif
+
         for( int i = 0; i < outCn; i++ )
         {
             float s1 = scaleptr ? scaleptr[i] : 1.f;
@@ -784,7 +839,12 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
         {
             Ptr<ReLULayer> activ_relu = activ.dynamicCast<ReLULayer>();
             if( !activ_relu.empty() )
+            {
                 reluslope.assign(outCn+2, activ_relu->negativeSlope);
+#ifdef HAVE_OPENCL
+                activType = OCL4DNN_CONV_FUSED_ACTIV_RELU;
+#endif
+            }

             Ptr<ChannelsPReLULayer> activ_chprelu = activ.dynamicCast<ChannelsPReLULayer>();
             if( !activ_chprelu.empty() )
@@ -795,9 +855,16 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
                 reluslope.resize(outCn+2);
                 std::copy(mdata, mdata + outCn, reluslope.begin());
                 reluslope[outCn] = reluslope[outCn+1] = reluslope[outCn-1];
+#ifdef HAVE_OPENCL
+                activType = OCL4DNN_CONV_FUSED_ACTIV_PRELU;
+#endif
             }
         }

+        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+                   forward_ocl(inputs, outputs, internals))
+
         int nstripes = std::max(getNumThreads(), 1);

         ParallelConv::run(*inputs[0], outputs[0], weightsMat, biasvec, reluslope,
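The relocation matters because forward_ocl() now reads state that the top of forward() computes. A hedged sketch of the resulting control flow (the helper names are hypothetical stand-ins for code that stays inlined in the real method):

// Sketch of the new forward() ordering; helpers are hypothetical.
static void prepareWeightsAndBias()   { /* fold batch-norm/scale; may set fusedBias */ }
static void prepareActivationSlopes() { /* fill reluslope; set activType */ }
static bool tryForwardOpenCL()        { /* the relocated CV_OCL_RUN dispatch */ return false; }
static void forwardCpu()              { /* ParallelConv::run(...) */ }

static void forwardSketch()
{
    prepareWeightsAndBias();
    prepareActivationSlopes();  // must run before the OpenCL dispatch...
    if (tryForwardOpenCL())     // ...so forward_ocl() sees valid fused state
        return;
    forwardCpu();
}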
