@@ -157,7 +157,20 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
157
157
// State used only by the OpenCL execution path (compiled when HAVE_OPENCL is defined).
#ifdef HAVE_OPENCL
158
158
// OpenCL convolution operator; built from a config in the OpenCL forward path
// and invoked through Forward().
Ptr<OCL4DNNConvSpatial<float > > convolutionOp;
159
159
// UMat blobs handed to convolutionOp: [0] receives a copy of weightsMat,
// [1] is passed as the bias (rebuilt from biasvec when a bias has been fused).
std::vector<UMat> umat_blobs;
160
// True once the forward pass has seen a non-null shiftptr/shiftptr2; reset to
// false whenever the batch-norm or scale fusion hooks release weightsMat.
+ bool fusedBias;
161
// Dirty flag: set by the batch-norm/scale fusion hooks; while set, the OpenCL
// forward path re-copies weights (and fused bias) into umat_blobs, then clears it.
+ bool newWeightAndBias;
162
// Dirty flag: set when the activation layer is (re)assigned; while set, the
// OpenCL forward path re-applies the ReLU/PReLU settings on convolutionOp, then clears it.
+ bool newActiv;
163
// Activation currently fused into the OpenCL convolution
// (OCL4DNN_CONV_FUSED_ACTIV_NONE / _RELU / _PRELU in the visible code).
+ ocl4dnnFusedActiv_t activType;
160
164
#endif
165
+ ConvolutionLayerImpl ()
166
+ {
167
+ #ifdef HAVE_OPENCL
168
+ fusedBias = false ;
169
+ newWeightAndBias = false ;
170
+ newActiv = false ;
171
+ activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
172
+ #endif
173
+ }
161
174
162
175
MatShape computeColRowShape (const MatShape &inpShape, const MatShape &outShape) const
163
176
{
@@ -209,6 +222,10 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
209
222
activ = layer;
210
223
if (activ.empty ())
211
224
reluslope.clear ();
225
+ #ifdef HAVE_OPENCL
226
+ newActiv = true ;
227
+ activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
228
+ #endif
212
229
return !activ.empty ();
213
230
}
214
231
@@ -221,6 +238,10 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
221
238
// we will need to re-compute the weights with the batch
222
239
// norm coefficients taken into account
223
240
weightsMat.release ();
241
+ #ifdef HAVE_OPENCL
242
+ newWeightAndBias = true ;
243
+ fusedBias = false ;
244
+ #endif
224
245
return !bnorm.empty ();
225
246
}
226
247
@@ -230,6 +251,10 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
230
251
// we will need to re-compute the weights with the scaling
231
252
// coefficients taken into account
232
253
weightsMat.release ();
254
+ #ifdef HAVE_OPENCL
255
+ newWeightAndBias = true ;
256
+ fusedBias = false ;
257
+ #endif
233
258
return !scaleLayer.empty ();
234
259
}
235
260
@@ -665,19 +690,49 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
665
690
convolutionOp = Ptr<OCL4DNNConvSpatial<float > >(new OCL4DNNConvSpatial<float >(config));
666
691
}
667
692
668
- for ( size_t ii = 0 ; ii < outputs. size (); ii++ )
693
+ if ( newWeightAndBias )
669
694
{
670
- UMat inpMat, outMat;
671
- inpMat = inputs[ii]->getUMat (ACCESS_READ);
672
- outMat = outputs[ii].getUMat (ACCESS_WRITE);
673
-
674
- int batch_size = inpMat.size [0 ];
695
+ weightsMat.copyTo (umat_blobs[0 ]);
696
+ if ( fusedBias )
697
+ {
698
+ if ( umat_blobs.size () < 2 )
699
+ umat_blobs.resize (2 );
700
+ umat_blobs[1 ] = UMat (biasvec, true );
701
+ }
702
+ convolutionOp->setBias (fusedBias || hasBias ());
703
+ newWeightAndBias = false ;
704
+ }
675
705
676
- if (!convolutionOp->Forward (inpMat, umat_blobs[0 ], hasBias () ? umat_blobs[1 ] : UMat (),
677
- outMat, batch_size))
678
- return false ;
706
+ if ( newActiv )
707
+ {
708
+ if ( activType == OCL4DNN_CONV_FUSED_ACTIV_RELU )
709
+ {
710
+ CV_Assert (!reluslope.empty ());
711
+ convolutionOp->setActivReLU (true , reluslope[0 ]);
712
+ }
713
+ else if ( activType == OCL4DNN_CONV_FUSED_ACTIV_PRELU)
714
+ {
715
+ CV_Assert (!reluslope.empty ());
716
+ convolutionOp->setActivPReLU (true , reluslope);
717
+ }
718
+ else
719
+ {
720
+ convolutionOp->setActivReLU (false , 0 );
721
+ convolutionOp->setActivPReLU (false , reluslope);
722
+ }
723
+ newActiv = false ;
679
724
}
680
- return true ;
725
+
726
+ UMat inpMat, outMat;
727
+ inpMat = inputs[0 ]->getUMat (ACCESS_READ);
728
+ outMat = outputs[0 ].getUMat (ACCESS_WRITE);
729
+ int batch_size = inpMat.size [0 ];
730
+
731
+ return convolutionOp->Forward (inpMat,
732
+ umat_blobs[0 ],
733
+ (hasBias () || fusedBias) ? umat_blobs[1 ] : UMat (),
734
+ outMat,
735
+ batch_size);
681
736
}
682
737
#endif
683
738
@@ -693,11 +748,6 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
693
748
CV_Assert (inputs.size () == (size_t )1 && inputs[0 ]->size [1 ] % blobs[0 ].size [1 ] == 0 );
694
749
int ngroups = inputs[0 ]->size [1 ]/blobs[0 ].size [1 ];
695
750
CV_Assert (outputs[0 ].size [1 ] % ngroups == 0 );
696
-
697
- CV_OCL_RUN ((preferableTarget == DNN_TARGET_OPENCL) &&
698
- OCL_PERFORMANCE_CHECK (ocl::Device::getDefault ().isIntel ()),
699
- forward_ocl (inputs, outputs, internals))
700
-
701
751
int k, outCn = blobs[0 ].size [0 ];
702
752
703
753
if ( weightsMat.empty () )
@@ -761,6 +811,11 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
761
811
}
762
812
}
763
813
814
+ #ifdef HAVE_OPENCL
815
+ if (shiftptr || shiftptr2)
816
+ fusedBias = true ;
817
+ #endif
818
+
764
819
for ( int i = 0 ; i < outCn; i++ )
765
820
{
766
821
float s1 = scaleptr ? scaleptr[i] : 1 .f ;
@@ -784,7 +839,12 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
784
839
{
785
840
Ptr<ReLULayer> activ_relu = activ.dynamicCast <ReLULayer>();
786
841
if ( !activ_relu.empty () )
842
+ {
787
843
reluslope.assign (outCn+2 , activ_relu->negativeSlope );
844
+ #ifdef HAVE_OPENCL
845
+ activType = OCL4DNN_CONV_FUSED_ACTIV_RELU;
846
+ #endif
847
+ }
788
848
789
849
Ptr<ChannelsPReLULayer> activ_chprelu = activ.dynamicCast <ChannelsPReLULayer>();
790
850
if ( !activ_chprelu.empty () )
@@ -795,9 +855,16 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
795
855
reluslope.resize (outCn+2 );
796
856
std::copy (mdata, mdata + outCn, reluslope.begin ());
797
857
reluslope[outCn] = reluslope[outCn+1 ] = reluslope[outCn-1 ];
858
+ #ifdef HAVE_OPENCL
859
+ activType = OCL4DNN_CONV_FUSED_ACTIV_PRELU;
860
+ #endif
798
861
}
799
862
}
800
863
864
+ CV_OCL_RUN ((preferableTarget == DNN_TARGET_OPENCL) &&
865
+ OCL_PERFORMANCE_CHECK (ocl::Device::getDefault ().isIntel ()),
866
+ forward_ocl (inputs, outputs, internals))
867
+
801
868
int nstripes = std::max (getNumThreads (), 1 );
802
869
803
870
ParallelConv::run (*inputs[0 ], outputs[0 ], weightsMat, biasvec, reluslope,
0 commit comments