Skip to content

Commit 8b3d660

Browse files
authored
another round of dnn optimization (opencv#9011)
* another round of dnn optimization: * increased malloc alignment across OpenCV from 16 to 64 bytes to make it AVX2 and even AVX-512 friendly * improved SIMD optimization of pooling layer, optimized average pooling * cleaned up convolution layer implementation * made activation layer "attachable" to all other layers, including fully connected and addition layer. * fixed bug in the fusion algorithm: "LayerData::consumers" should not be cleared, because it describes the topology. * greatly optimized permutation layer, which improved SSD performance * parallelized element-wise binary/ternary/... ops (sum, prod, max) * also, added missing copyrights to many of the layer implementation files * temporarily disabled (again) the check for intermediate blobs consistency; fixed warnings from various builders
1 parent 82ec76c commit 8b3d660

29 files changed

+696
-336
lines changed

modules/core/include/opencv2/core/private.hpp

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ namespace cv
131131
\****************************************************************************************/
132132

133133
/* the alignment of all the allocated buffers */
134-
#define CV_MALLOC_ALIGN 16
134+
#define CV_MALLOC_ALIGN 64
135135

136136
/* IEEE754 constants and macros */
137137
#define CV_TOGGLE_FLT(x) ((x)^((int)(x) < 0 ? 0x7fffffff : 0))
@@ -241,11 +241,6 @@ CV_EXPORTS void scalarToRawData(const cv::Scalar& s, void* buf, int type, int un
241241
#include "iw++/iw.hpp"
242242
#endif
243243

244-
#ifdef CV_MALLOC_ALIGN
245-
#undef CV_MALLOC_ALIGN
246-
#endif
247-
#define CV_MALLOC_ALIGN 32 // required for AVX optimization
248-
249244
#if IPP_VERSION_X100 >= 201700
250245
#define CV_IPP_MALLOC(SIZE) ippMalloc_L(SIZE)
251246
#else

modules/dnn/include/opencv2/dnn/all_layers.hpp

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -201,15 +201,9 @@ namespace dnn
201201
String padMode;
202202
};
203203

204-
class CV_EXPORTS ActivationLayer;
205-
class CV_EXPORTS BatchNormLayer;
206-
207204
class CV_EXPORTS ConvolutionLayer : public BaseConvolutionLayer
208205
{
209206
public:
210-
virtual bool setActivation(const Ptr<ActivationLayer>& layer) = 0;
211-
virtual bool setBatchNorm(const Ptr<BatchNormLayer>& layer) = 0;
212-
213207
static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
214208
};
215209

modules/dnn/include/opencv2/dnn/dnn.hpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,9 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
148148
int targetId; //!< Target identifier.
149149
};
150150

151+
class CV_EXPORTS ActivationLayer;
152+
class CV_EXPORTS BatchNormLayer;
153+
151154
/** @brief This interface class allows to build new Layers - are building blocks of networks.
152155
*
153156
* Each class, derived from Layer, must implement allocate() methods to declare own outputs and forward() to compute outputs.
@@ -248,6 +251,22 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
248251
*/
249252
virtual Ptr<BackendNode> tryAttach(const Ptr<BackendNode>& node);
250253

254+
/**
255+
* @brief Tries to attach to the layer the subsequent activation layer, i.e. do the layer fusion in a partial case.
256+
* @param[in] layer The subsequent activation layer.
257+
*
258+
* Returns true if the activation layer has been attached successfully.
259+
*/
260+
virtual bool setActivation(const Ptr<ActivationLayer>& layer);
261+
262+
/**
263+
* @brief Tries to attach to the layer the subsequent batch normalization layer, i.e. do the layer fusion in a partial case.
264+
* @param[in] layer The subsequent batch normalization layer.
265+
*
266+
* Returns true if the batch normalization layer has been attached successfully.
267+
*/
268+
virtual bool setBatchNorm(const Ptr<BatchNormLayer>& layer);
269+
251270
virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
252271
const int requiredOutputs,
253272
std::vector<MatShape> &outputs,

modules/dnn/src/dnn.cpp

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -674,16 +674,16 @@ struct Net::Impl
674674
it->second.internals.clear();
675675
}
676676
it->second.skipFlags.clear();
677-
it->second.consumers.clear();
678-
Ptr<ConvolutionLayer> convLayer = it->second.layerInstance.dynamicCast<ConvolutionLayer>();
677+
//it->second.consumers.clear();
678+
Ptr<Layer> currLayer = it->second.layerInstance;
679679

680-
if( !convLayer.empty() )
681-
{
682-
convLayer->setActivation(Ptr<ActivationLayer>());
683-
convLayer->setBatchNorm(Ptr<BatchNormLayer>());
684-
}
680+
if( currLayer.empty() )
681+
continue;
682+
683+
currLayer->setActivation(Ptr<ActivationLayer>());
684+
currLayer->setBatchNorm(Ptr<BatchNormLayer>());
685685

686-
Ptr<PoolingLayer> poolingLayer = it->second.layerInstance.dynamicCast<PoolingLayer>();
686+
Ptr<PoolingLayer> poolingLayer = currLayer.dynamicCast<PoolingLayer>();
687687
if( !poolingLayer.empty() )
688688
{
689689
poolingLayer->computeMaxIdx = true;
@@ -1042,10 +1042,9 @@ struct Net::Impl
10421042
}
10431043
if( ld.consumers.size() == 0 )
10441044
outnames.push_back(ld.layerInstance->name);
1045-
Ptr<ConvolutionLayer> convLayer = ld.layerInstance.dynamicCast<ConvolutionLayer>();
1046-
LayerPin lp(lid, 0);
1047-
if( !convLayer.empty() && ld.consumers.size() == 1 &&
1048-
pinsToKeep.count(lp) == 0 )
1045+
1046+
Ptr<Layer>& currLayer = ld.layerInstance;
1047+
if( ld.consumers.size() == 1 && pinsToKeep.count(LayerPin(lid, 0)) == 0 )
10491048
{
10501049
LayerData* nextData = &layers[ld.consumers[0].lid];
10511050
Ptr<BatchNormLayer> nextBNormLayer =
@@ -1055,7 +1054,7 @@ struct Net::Impl
10551054
{
10561055
LayerData* bnormData = nextData;
10571056
nextData = 0;
1058-
if( convLayer->setBatchNorm(nextBNormLayer) )
1057+
if( currLayer->setBatchNorm(nextBNormLayer) )
10591058
{
10601059
bnormData->skipFlags[DNN_BACKEND_DEFAULT] = true;
10611060
ld.outputBlobs = layers[lpNext.lid].outputBlobs;
@@ -1068,8 +1067,9 @@ struct Net::Impl
10681067
if( nextData )
10691068
nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
10701069

1071-
if( !nextActivLayer.empty() && convLayer->setActivation(nextActivLayer) )
1070+
if( !nextActivLayer.empty() && currLayer->setActivation(nextActivLayer) )
10721071
{
1072+
//printf("successfully merged %s and %s\n", currLayer->name.c_str(), nextActivLayer->name.c_str());
10731073
nextData->skipFlags[DNN_BACKEND_DEFAULT] = true;
10741074
ld.outputBlobs = layers[lpNext.lid].outputBlobs;
10751075
}
@@ -1084,7 +1084,10 @@ struct Net::Impl
10841084
// if there is no layer that takes the second output pin of the pooling layer
10851085
// on input then we don't need to compute the indices
10861086
if( i >= nconsumers )
1087+
{
10871088
poolingLayer->computeMaxIdx = false;
1089+
//printf("simplified pooling layer %s\n", poolingLayer->name.c_str());
1090+
}
10881091
}
10891092
}
10901093
}
@@ -1875,6 +1878,9 @@ Ptr<BackendNode> Layer::tryAttach(const Ptr<BackendNode>& node)
18751878
return Ptr<BackendNode>();
18761879
}
18771880

1881+
bool Layer::setActivation(const Ptr<ActivationLayer>&) { return false; }
1882+
bool Layer::setBatchNorm(const Ptr<BatchNormLayer>&) { return false; }
1883+
18781884
template <typename T>
18791885
static void vecToPVec(const std::vector<T> &v, std::vector<T*> &pv)
18801886
{

modules/dnn/src/layers/blank_layer.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
// For Open Source Computer Vision Library
1212
//
1313
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14+
// Copyright (C) 2017, Intel Corporation, all rights reserved.
1415
// Third party copyrights are property of their respective owners.
1516
//
1617
// Redistribution and use in source and binary forms, with or without modification,

modules/dnn/src/layers/concat_layer.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
// For Open Source Computer Vision Library
1212
//
1313
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14+
// Copyright (C) 2017, Intel Corporation, all rights reserved.
1415
// Third party copyrights are property of their respective owners.
1516
//
1617
// Redistribution and use in source and binary forms, with or without modification,

modules/dnn/src/layers/convolution_layer.cpp

Lines changed: 69 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
// For Open Source Computer Vision Library
1212
//
1313
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14+
// Copyright (C) 2017, Intel Corporation, all rights reserved.
1415
// Third party copyrights are property of their respective owners.
1516
//
1617
// Redistribution and use in source and binary forms, with or without modification,
@@ -95,8 +96,6 @@ class BaseConvolutionLayerImpl : public ConvolutionLayer
9596
(stride.height == 1 && stride.width == 1) &&
9697
(dilation.height == 1 && dilation.width == 1);
9798
}
98-
bool setActivation(const Ptr<ActivationLayer>& ) { return false; }
99-
bool setBatchNorm(const Ptr<BatchNormLayer>& ) { return false; }
10099

101100
virtual void applyHalideScheduler(Ptr<BackendNode>& node,
102101
const std::vector<Mat*> &inputs,
@@ -195,14 +194,19 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
195194
return false;
196195
}
197196

198-
bool setActivation(const Ptr<ActivationLayer>& layer) { activ = layer; return true; }
197+
bool setActivation(const Ptr<ActivationLayer>& layer)
198+
{
199+
activ = layer;
200+
return !activ.empty();
201+
}
202+
199203
bool setBatchNorm(const Ptr<BatchNormLayer>& layer )
200204
{
201205
bnorm = layer;
202206
// we will need to re-compute the weights with the batch
203207
// norm coefficients taken into account
204208
weightsMat.release();
205-
return true;
209+
return !bnorm.empty();
206210
}
207211

208212
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
@@ -289,7 +293,7 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
289293
const std::vector<float>& biasvec,
290294
const std::vector<float>& reluslope,
291295
Size kernel, Size pad, Size stride, Size dilation,
292-
int ngroups, int nstripes, const ActivationLayer* activ )
296+
const ActivationLayer* activ, int ngroups, int nstripes )
293297
{
294298
CV_Assert( input.dims == 4 && output.dims == 4 &&
295299
input.size[0] == output.size[0] &&
@@ -315,7 +319,7 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
315319
int inpCnAll = input.size[1], width = input.size[3], height = input.size[2];
316320
int inpCn = inpCnAll / ngroups;
317321
p.is1x1_ = kernel == Size(0,0) && pad == Size(0, 0);
318-
p.useAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
322+
p.useAVX2 = checkHardwareSupport(CPU_AVX2);
319323

320324
int ncn = std::min(inpCn, (int)BLK_SIZE_CN);
321325
p.ofstab_.resize(kernel.width*kernel.height*ncn);
@@ -418,64 +422,73 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
418422
for( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += BLK_SIZE )
419423
{
420424
int ofs, ofs1 = std::min(ofs0 + BLK_SIZE, stripeEnd);
425+
int out_i = ofs0 / outW;
426+
int out_j = ofs0 - out_i * outW;
421427

422428
// do im2row for a part of input tensor
423-
if( is1x1 )
429+
float* rowbuf = rowbuf0;
430+
for( ofs = ofs0; ofs < ofs1; out_j = 0, ++out_i )
424431
{
425-
for( ofs = ofs0; ofs < ofs1; ofs++ )
432+
int delta = std::min(ofs1 - ofs, outW - out_j);
433+
int out_j1 = out_j + delta;
434+
int in_i = out_i * stride_h - pad_h;
435+
int in_j = out_j * stride_w - pad_w;
436+
const float* imgptr = data_inp0 + (cn0*height + in_i)*width + in_j;
437+
ofs += delta;
438+
439+
// do im2row for a part of input tensor
440+
if( is1x1 )
426441
{
427-
int out_i = ofs / outW;
428-
int out_j = ofs - out_i * outW;
429-
float* rowbuf = rowbuf0 + (ofs - ofs0)*vsz_a;
430-
431-
int in_i = out_i * stride_h - pad_h;
432-
int in_j = out_j * stride_w - pad_w;
433-
const float* imgptr = data_inp0 + (cn0*height + in_i)*width + in_j;
434-
435-
for( k = 0; k < vsz; k++ )
436-
rowbuf[k] = imgptr[k*inpPlaneSize];
437-
}
438-
}
439-
else
440-
{
441-
for( ofs = ofs0; ofs < ofs1; ofs++ )
442-
{
443-
int out_i = ofs / outW;
444-
int out_j = ofs - out_i * outW;
445-
float* rowbuf = rowbuf0 + (ofs - ofs0)*vsz_a;
446-
447-
int in_i = out_i * stride_h - pad_h;
448-
int in_j = out_j * stride_w - pad_w;
449-
const float* imgptr = data_inp0 + (cn0*height + in_i)*width + in_j;
450-
451-
// this condition should be true for most of the tensor elements, i.e.
452-
// most of the time the kernel aperture is inside the tensor X-Y plane.
453-
if( 0 <= in_i && in_i < height - (kernel_h-1)*dilation_h &&
454-
0 <= in_j && in_j < width - (kernel_w-1)*dilation_w )
442+
for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w )
455443
{
456444
for( k = 0; k < vsz; k++ )
457-
rowbuf[k] = imgptr[ofstab[k]];
445+
rowbuf[k] = imgptr[k*inpPlaneSize];
458446
}
459-
else
447+
}
448+
else
449+
{
450+
bool ok_i = 0 <= in_i && in_i < height - (kernel_h-1)*dilation_h;
451+
int i0 = std::max(0, (-in_i + dilation_h-1)/dilation_h);
452+
int i1 = std::min(kernel_h, (height - in_i + dilation_h-1)/dilation_h);
453+
454+
for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w )
460455
{
461-
int i0 = std::max(0, (-in_i + dilation_h-1)/dilation_h);
462-
int i1 = std::min(kernel_h, (height - in_i + dilation_h-1)/dilation_h);
463-
int j0 = std::max(0, (-in_j + dilation_w-1)/dilation_w);
464-
int j1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w);
465-
466-
// here some non-continous sub-row of the row will not be
467-
// filled from the tensor; we need to make sure that the uncovered
468-
// elements are explicitly set to 0's. the easiest way is to
469-
// set all the elements to 0's before the loop.
470-
memset(rowbuf, 0, vsz*sizeof(rowbuf[0]));
471-
for( k = 0; k < ncn; k++, imgptr += width*height )
456+
// this condition should be true for most of the tensor elements, i.e.
457+
// most of the time the kernel aperture is inside the tensor X-Y plane.
458+
if( ok_i && out_j + 2 <= out_j1 && 0 <= in_j && in_j + stride_w*2 <= width - (kernel_w-1)*dilation_w )
459+
{
460+
for( k = 0; k < vsz; k++ )
461+
{
462+
int k1 = ofstab[k];
463+
float v0 = imgptr[k1];
464+
float v1 = imgptr[k1 + stride_w];
465+
rowbuf[k] = v0;
466+
rowbuf[k+vsz_a] = v1;
467+
}
468+
out_j++;
469+
rowbuf += vsz_a;
470+
imgptr += stride_w;
471+
in_j += stride_w;
472+
}
473+
else
472474
{
473-
for( i = i0; i < i1; i++ )
475+
int j0 = std::max(0, (-in_j + dilation_w-1)/dilation_w);
476+
int j1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w);
477+
478+
// here some non-continous sub-row of the row will not be
479+
// filled from the tensor; we need to make sure that the uncovered
480+
// elements are explicitly set to 0's. the easiest way is to
481+
// set all the elements to 0's before the loop.
482+
memset(rowbuf, 0, vsz*sizeof(rowbuf[0]));
483+
for( k = 0; k < ncn; k++ )
474484
{
475-
for( j = j0; j < j1; j++ )
485+
for( i = i0; i < i1; i++ )
476486
{
477-
int imgofs = i*(dilation_h*width) + j*dilation_w;
478-
rowbuf[(k*kernel_h + i)*kernel_w + j] = imgptr[imgofs];
487+
for( j = j0; j < j1; j++ )
488+
{
489+
int imgofs = k*(width*height) + i*(dilation_h*width) + j*dilation_w;
490+
rowbuf[(k*kernel_h + i)*kernel_w + j] = imgptr[imgofs];
491+
}
479492
}
480493
}
481494
}
@@ -625,7 +638,7 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
625638
{
626639
// prepare weightsMat where each row is aligned and has enough zero padding on the right to
627640
// use vectorized (i.e. with intrinsics) loops without tail processing
628-
Mat wm = blobs[0].reshape(1, outCn).clone();
641+
Mat wm = blobs[0].reshape(1, outCn);
629642
if( wm.step1() % VEC_ALIGN != 0 )
630643
{
631644
int newcols = (int)alignSize(wm.step1(), VEC_ALIGN);
@@ -698,7 +711,7 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
698711
int nstripes = std::max(getNumThreads(), 1);
699712

700713
ParallelConv::run(*inputs[0], outputs[0], weightsMat, biasvec, reluslope,
701-
kernel, pad, stride, dilation, ngroups, nstripes, activ.get());
714+
kernel, pad, stride, dilation, activ.get(), ngroups, nstripes);
702715
}
703716

704717
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
@@ -776,7 +789,7 @@ class DeConvolutionLayerImpl : public BaseConvolutionLayerImpl
776789
b_ = &b;
777790
c_ = &c;
778791
nstripes_ = nstripes;
779-
useAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
792+
useAVX2 = checkHardwareSupport(CPU_AVX2);
780793
}
781794

782795
void operator()(const Range& range_) const

modules/dnn/src/layers/crop_layer.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
// For Open Source Computer Vision Library
1212
//
1313
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14+
// Copyright (C) 2017, Intel Corporation, all rights reserved.
1415
// Third party copyrights are property of their respective owners.
1516
//
1617
// Redistribution and use in source and binary forms, with or without modification,

modules/dnn/src/layers/detection_output_layer.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
// For Open Source Computer Vision Library
1212
//
1313
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14+
// Copyright (C) 2017, Intel Corporation, all rights reserved.
1415
// Third party copyrights are property of their respective owners.
1516
//
1617
// Redistribution and use in source and binary forms, with or without modification,

0 commit comments

Comments
 (0)