Skip to content

Commit 8b3d660

Browse files
authored
another round of dnn optimization (opencv#9011)
* another round of dnn optimization: * increased malloc alignment across OpenCV from 16 to 64 bytes to make it AVX2 and even AVX-512 friendly * improved SIMD optimization of pooling layer, optimized average pooling * cleaned up convolution layer implementation * made activation layer "attachable" to all other layers, including fully connected and addition layer. * fixed bug in the fusion algorithm: "LayerData::consumers" should not be cleared, because it describes the topology. * greatly optimized permutation layer, which improved SSD performance * parallelized element-wise binary/ternary/... ops (sum, prod, max) * also, added missing copyrights to many of the layer implementation files * temporarily disabled (again) the check for intermediate blobs consistency; fixed warnings from various builders
1 parent 82ec76c commit 8b3d660

29 files changed

+696
-336
lines changed

modules/core/include/opencv2/core/private.hpp

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ namespace cv
131131
\****************************************************************************************/
132132

133133
/* the alignment of all the allocated buffers */
134-
#define CV_MALLOC_ALIGN 16
134+
#define CV_MALLOC_ALIGN 64
135135

136136
/* IEEE754 constants and macros */
137137
#define CV_TOGGLE_FLT(x) ((x)^((int)(x) < 0 ? 0x7fffffff : 0))
@@ -241,11 +241,6 @@ CV_EXPORTS void scalarToRawData(const cv::Scalar& s, void* buf, int type, int un
241241
#include "iw++/iw.hpp"
242242
#endif
243243

244-
#ifdef CV_MALLOC_ALIGN
245-
#undef CV_MALLOC_ALIGN
246-
#endif
247-
#define CV_MALLOC_ALIGN 32 // required for AVX optimization
248-
249244
#if IPP_VERSION_X100 >= 201700
250245
#define CV_IPP_MALLOC(SIZE) ippMalloc_L(SIZE)
251246
#else

modules/dnn/include/opencv2/dnn/all_layers.hpp

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -201,15 +201,9 @@ namespace dnn
201201
String padMode;
202202
};
203203

204-
class CV_EXPORTS ActivationLayer;
205-
class CV_EXPORTS BatchNormLayer;
206-
207204
class CV_EXPORTS ConvolutionLayer : public BaseConvolutionLayer
208205
{
209206
public:
210-
virtual bool setActivation(const Ptr<ActivationLayer>& layer) = 0;
211-
virtual bool setBatchNorm(const Ptr<BatchNormLayer>& layer) = 0;
212-
213207
static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
214208
};
215209

modules/dnn/include/opencv2/dnn/dnn.hpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,9 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
148148
int targetId; //!< Target identifier.
149149
};
150150

151+
class CV_EXPORTS ActivationLayer;
152+
class CV_EXPORTS BatchNormLayer;
153+
151154
/** @brief This interface class allows to build new Layers - are building blocks of networks.
152155
*
153156
* Each class, derived from Layer, must implement allocate() methods to declare own outputs and forward() to compute outputs.
@@ -248,6 +251,22 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
248251
*/
249252
virtual Ptr<BackendNode> tryAttach(const Ptr<BackendNode>& node);
250253

254+
/**
255+
* @brief Tries to attach to the layer the subsequent activation layer, i.e. do the layer fusion in a partial case.
256+
* @param[in] layer The subsequent activation layer.
257+
*
258+
* Returns true if the activation layer has been attached successfully.
259+
*/
260+
virtual bool setActivation(const Ptr<ActivationLayer>& layer);
261+
262+
/**
263+
* @brief Tries to attach to the layer the subsequent batch normalization layer, i.e. do the layer fusion in a partial case.
264+
* @param[in] layer The subsequent batch normalization layer.
265+
*
266+
* Returns true if the batch normalization layer has been attached successfully.
267+
*/
268+
virtual bool setBatchNorm(const Ptr<BatchNormLayer>& layer);
269+
251270
virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
252271
const int requiredOutputs,
253272
std::vector<MatShape> &outputs,

modules/dnn/src/dnn.cpp

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -674,16 +674,16 @@ struct Net::Impl
674674
it->second.internals.clear();
675675
}
676676
it->second.skipFlags.clear();
677-
it->second.consumers.clear();
678-
Ptr<ConvolutionLayer> convLayer = it->second.layerInstance.dynamicCast<ConvolutionLayer>();
677+
//it->second.consumers.clear();
678+
Ptr<Layer> currLayer = it->second.layerInstance;
679679

680-
if( !convLayer.empty() )
681-
{
682-
convLayer->setActivation(Ptr<ActivationLayer>());
683-
convLayer->setBatchNorm(Ptr<BatchNormLayer>());
684-
}
680+
if( currLayer.empty() )
681+
continue;
682+
683+
currLayer->setActivation(Ptr<ActivationLayer>());
684+
currLayer->setBatchNorm(Ptr<BatchNormLayer>());
685685

686-
Ptr<PoolingLayer> poolingLayer = it->second.layerInstance.dynamicCast<PoolingLayer>();
686+
Ptr<PoolingLayer> poolingLayer = currLayer.dynamicCast<PoolingLayer>();
687687
if( !poolingLayer.empty() )
688688
{
689689
poolingLayer->computeMaxIdx = true;
@@ -1042,10 +1042,9 @@ struct Net::Impl
10421042
}
10431043
if( ld.consumers.size() == 0 )
10441044
outnames.push_back(ld.layerInstance->name);
1045-
Ptr<ConvolutionLayer> convLayer = ld.layerInstance.dynamicCast<ConvolutionLayer>();
1046-
LayerPin lp(lid, 0);
1047-
if( !convLayer.empty() && ld.consumers.size() == 1 &&
1048-
pinsToKeep.count(lp) == 0 )
1045+
1046+
Ptr<Layer>& currLayer = ld.layerInstance;
1047+
if( ld.consumers.size() == 1 && pinsToKeep.count(LayerPin(lid, 0)) == 0 )
10491048
{
10501049
LayerData* nextData = &layers[ld.consumers[0].lid];
10511050
Ptr<BatchNormLayer> nextBNormLayer =
@@ -1055,7 +1054,7 @@ struct Net::Impl
10551054
{
10561055
LayerData* bnormData = nextData;
10571056
nextData = 0;
1058-
if( convLayer->setBatchNorm(nextBNormLayer) )
1057+
if( currLayer->setBatchNorm(nextBNormLayer) )
10591058
{
10601059
bnormData->skipFlags[DNN_BACKEND_DEFAULT] = true;
10611060
ld.outputBlobs = layers[lpNext.lid].outputBlobs;
@@ -1068,8 +1067,9 @@ struct Net::Impl
10681067
if( nextData )
10691068
nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
10701069

1071-
if( !nextActivLayer.empty() && convLayer->setActivation(nextActivLayer) )
1070+
if( !nextActivLayer.empty() && currLayer->setActivation(nextActivLayer) )
10721071
{
1072+
//printf("successfully merged %s and %s\n", currLayer->name.c_str(), nextActivLayer->name.c_str());
10731073
nextData->skipFlags[DNN_BACKEND_DEFAULT] = true;
10741074
ld.outputBlobs = layers[lpNext.lid].outputBlobs;
10751075
}
@@ -1084,7 +1084,10 @@ struct Net::Impl
10841084
// if there is no layer that takes the second output pin of the pooling layer
10851085
// on input then we don't need to compute the indices
10861086
if( i >= nconsumers )
1087+
{
10871088
poolingLayer->computeMaxIdx = false;
1089+
//printf("simplified pooling layer %s\n", poolingLayer->name.c_str());
1090+
}
10881091
}
10891092
}
10901093
}
@@ -1875,6 +1878,9 @@ Ptr<BackendNode> Layer::tryAttach(const Ptr<BackendNode>& node)
18751878
return Ptr<BackendNode>();
18761879
}
18771880

1881+
bool Layer::setActivation(const Ptr<ActivationLayer>&) { return false; }
1882+
bool Layer::setBatchNorm(const Ptr<BatchNormLayer>&) { return false; }
1883+
18781884
template <typename T>
18791885
static void vecToPVec(const std::vector<T> &v, std::vector<T*> &pv)
18801886
{

modules/dnn/src/layers/blank_layer.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
// For Open Source Computer Vision Library
1212
//
1313
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14+
// Copyright (C) 2017, Intel Corporation, all rights reserved.
1415
// Third party copyrights are property of their respective owners.
1516
//
1617
// Redistribution and use in source and binary forms, with or without modification,

modules/dnn/src/layers/concat_layer.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
// For Open Source Computer Vision Library
1212
//
1313
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14+
// Copyright (C) 2017, Intel Corporation, all rights reserved.
1415
// Third party copyrights are property of their respective owners.
1516
//
1617
// Redistribution and use in source and binary forms, with or without modification,

modules/dnn/src/layers/convolution_layer.cpp

Lines changed: 69 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
// For Open Source Computer Vision Library
1212
//
1313
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14+
// Copyright (C) 2017, Intel Corporation, all rights reserved.
1415
// Third party copyrights are property of their respective owners.
1516
//
1617
// Redistribution and use in source and binary forms, with or without modification,
@@ -95,8 +96,6 @@ class BaseConvolutionLayerImpl : public ConvolutionLayer
9596
(stride.height == 1 && stride.width == 1) &&
9697
(dilation.height == 1 && dilation.width == 1);
9798
}
98-
bool setActivation(const Ptr<ActivationLayer>& ) { return false; }
99-
bool setBatchNorm(const Ptr<BatchNormLayer>& ) { return false; }
10099

101100
virtual void applyHalideScheduler(Ptr<BackendNode>& node,
102101
const std::vector<Mat*> &inputs,
@@ -195,14 +194,19 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
195194
return false;
196195
}
197196

198-
bool setActivation(const Ptr<ActivationLayer>& layer) { activ = layer; return true; }
197+
bool setActivation(const Ptr<ActivationLayer>& layer)
198+
{
199+
activ = layer;
200+
return !activ.empty();
201+
}
202+
199203
bool setBatchNorm(const Ptr<BatchNormLayer>& layer )
200204
{
201205
bnorm = layer;
202206
// we will need to re-compute the weights with the batch
203207
// norm coefficients taken into account
204208
weightsMat.release();
205-
return true;
209+
return !bnorm.empty();
206210
}
207211

208212
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
@@ -289,7 +293,7 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
289293
const std::vector<float>& biasvec,
290294
const std::vector<float>& reluslope,
291295
Size kernel, Size pad, Size stride, Size dilation,
292-
int ngroups, int nstripes, const ActivationLayer* activ )
296+
const ActivationLayer* activ, int ngroups, int nstripes )
293297
{
294298
CV_Assert( input.dims == 4 && output.dims == 4 &&
295299
input.size[0] == output.size[0] &&
@@ -315,7 +319,7 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
315319
int inpCnAll = input.size[1], width = input.size[3], height = input.size[2];
316320
int inpCn = inpCnAll / ngroups;
317321
p.is1x1_ = kernel == Size(0,0) && pad == Size(0, 0);
318-
p.useAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
322+
p.useAVX2 = checkHardwareSupport(CPU_AVX2);
319323

320324
int ncn = std::min(inpCn, (int)BLK_SIZE_CN);
321325
p.ofstab_.resize(kernel.width*kernel.height*ncn);
@@ -418,64 +422,73 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
418422
for( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += BLK_SIZE )
419423
{
420424
int ofs, ofs1 = std::min(ofs0 + BLK_SIZE, stripeEnd);
425+
int out_i = ofs0 / outW;
426+
int out_j = ofs0 - out_i * outW;
421427

422428
// do im2row for a part of input tensor
423-
if( is1x1 )
429+
float* rowbuf = rowbuf0;
430+
for( ofs = ofs0; ofs < ofs1; out_j = 0, ++out_i )
424431
{
425-
for( ofs = ofs0; ofs < ofs1; ofs++ )
432+
int delta = std::min(ofs1 - ofs, outW - out_j);
433+
int out_j1 = out_j + delta;
434+
int in_i = out_i * stride_h - pad_h;
435+
int in_j = out_j * stride_w - pad_w;
436+
const float* imgptr = data_inp0 + (cn0*height + in_i)*width + in_j;
437+
ofs += delta;
438+
439+
// do im2row for a part of input tensor
440+
if( is1x1 )
426441
{
427-
int out_i = ofs / outW;
428-
int out_j = ofs - out_i * outW;
429-
float* rowbuf = rowbuf0 + (ofs - ofs0)*vsz_a;
430-
431-
int in_i = out_i * stride_h - pad_h;
432-
int in_j = out_j * stride_w - pad_w;
433-
const float* imgptr = data_inp0 + (cn0*height + in_i)*width + in_j;
434-
435-
for( k = 0; k < vsz; k++ )
436-
rowbuf[k] = imgptr[k*inpPlaneSize];
437-
}
438-
}
439-
else
440-
{
441-
for( ofs = ofs0; ofs < ofs1; ofs++ )
442-
{
443-
int out_i = ofs / outW;
444-
int out_j = ofs - out_i * outW;
445-
float* rowbuf = rowbuf0 + (ofs - ofs0)*vsz_a;
446-
447-
int in_i = out_i * stride_h - pad_h;
448-
int in_j = out_j * stride_w - pad_w;
449-
const float* imgptr = data_inp0 + (cn0*height + in_i)*width + in_j;
450-
451-
// this condition should be true for most of the tensor elements, i.e.
452-
// most of the time the kernel aperture is inside the tensor X-Y plane.
453-
if( 0 <= in_i && in_i < height - (kernel_h-1)*dilation_h &&
454-
0 <= in_j && in_j < width - (kernel_w-1)*dilation_w )
442+
for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w )
455443
{
456444
for( k = 0; k < vsz; k++ )
457-
rowbuf[k] = imgptr[ofstab[k]];
445+
rowbuf[k] = imgptr[k*inpPlaneSize];
458446
}
459-
else
447+
}
448+
else
449+
{
450+
bool ok_i = 0 <= in_i && in_i < height - (kernel_h-1)*dilation_h;
451+
int i0 = std::max(0, (-in_i + dilation_h-1)/dilation_h);
452+
int i1 = std::min(kernel_h, (height - in_i + dilation_h-1)/dilation_h);
453+
454+
for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w )
460455
{
461-
int i0 = std::max(0, (-in_i + dilation_h-1)/dilation_h);
462-
int i1 = std::min(kernel_h, (height - in_i + dilation_h-1)/dilation_h);
463-
int j0 = std::max(0, (-in_j + dilation_w-1)/dilation_w);
464-
int j1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w);
465-
466-
// here some non-continous sub-row of the row will not be
467-
// filled from the tensor; we need to make sure that the uncovered
468-
// elements are explicitly set to 0's. the easiest way is to
469-
// set all the elements to 0's before the loop.
470-
memset(rowbuf, 0, vsz*sizeof(rowbuf[0]));
471-
for( k = 0; k < ncn; k++, imgptr += width*height )
456+
// this condition should be true for most of the tensor elements, i.e.
457+
// most of the time the kernel aperture is inside the tensor X-Y plane.
458+
if( ok_i && out_j + 2 <= out_j1 && 0 <= in_j && in_j + stride_w*2 <= width - (kernel_w-1)*dilation_w )
459+
{
460+
for( k = 0; k < vsz; k++ )
461+
{
462+
int k1 = ofstab[k];
463+
float v0 = imgptr[k1];
464+
float v1 = imgptr[k1 + stride_w];
465+
rowbuf[k] = v0;
466+
rowbuf[k+vsz_a] = v1;
467+
}
468+
out_j++;
469+
rowbuf += vsz_a;
470+
imgptr += stride_w;
471+
in_j += stride_w;
472+
}
473+
else
472474
{
473-
for( i = i0; i < i1; i++ )
475+
int j0 = std::max(0, (-in_j + dilation_w-1)/dilation_w);
476+
int j1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w);
477+
478+
// here some non-continous sub-row of the row will not be
479+
// filled from the tensor; we need to make sure that the uncovered
480+
// elements are explicitly set to 0's. the easiest way is to
481+
// set all the elements to 0's before the loop.
482+
memset(rowbuf, 0, vsz*sizeof(rowbuf[0]));
483+
for( k = 0; k < ncn; k++ )
474484
{
475-
for( j = j0; j < j1; j++ )
485+
for( i = i0; i < i1; i++ )
476486
{
477-
int imgofs = i*(dilation_h*width) + j*dilation_w;
478-
rowbuf[(k*kernel_h + i)*kernel_w + j] = imgptr[imgofs];
487+
for( j = j0; j < j1; j++ )
488+
{
489+
int imgofs = k*(width*height) + i*(dilation_h*width) + j*dilation_w;
490+
rowbuf[(k*kernel_h + i)*kernel_w + j] = imgptr[imgofs];
491+
}
479492
}
480493
}
481494
}
@@ -625,7 +638,7 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
625638
{
626639
// prepare weightsMat where each row is aligned and has enough zero padding on the right to
627640
// use vectorized (i.e. with intrinsics) loops without tail processing
628-
Mat wm = blobs[0].reshape(1, outCn).clone();
641+
Mat wm = blobs[0].reshape(1, outCn);
629642
if( wm.step1() % VEC_ALIGN != 0 )
630643
{
631644
int newcols = (int)alignSize(wm.step1(), VEC_ALIGN);
@@ -698,7 +711,7 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
698711
int nstripes = std::max(getNumThreads(), 1);
699712

700713
ParallelConv::run(*inputs[0], outputs[0], weightsMat, biasvec, reluslope,
701-
kernel, pad, stride, dilation, ngroups, nstripes, activ.get());
714+
kernel, pad, stride, dilation, activ.get(), ngroups, nstripes);
702715
}
703716

704717
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
@@ -776,7 +789,7 @@ class DeConvolutionLayerImpl : public BaseConvolutionLayerImpl
776789
b_ = &b;
777790
c_ = &c;
778791
nstripes_ = nstripes;
779-
useAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
792+
useAVX2 = checkHardwareSupport(CPU_AVX2);
780793
}
781794

782795
void operator()(const Range& range_) const

modules/dnn/src/layers/crop_layer.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
// For Open Source Computer Vision Library
1212
//
1313
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14+
// Copyright (C) 2017, Intel Corporation, all rights reserved.
1415
// Third party copyrights are property of their respective owners.
1516
//
1617
// Redistribution and use in source and binary forms, with or without modification,

modules/dnn/src/layers/detection_output_layer.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
// For Open Source Computer Vision Library
1212
//
1313
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14+
// Copyright (C) 2017, Intel Corporation, all rights reserved.
1415
// Third party copyrights are property of their respective owners.
1516
//
1617
// Redistribution and use in source and binary forms, with or without modification,

0 commit comments

Comments
 (0)