 // For Open Source Computer Vision Library
 //
 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -95,8 +96,6 @@ class BaseConvolutionLayerImpl : public ConvolutionLayer
                 (stride.height == 1 && stride.width == 1) &&
                 (dilation.height == 1 && dilation.width == 1);
     }
-    bool setActivation(const Ptr<ActivationLayer>&) { return false; }
-    bool setBatchNorm(const Ptr<BatchNormLayer>&) { return false; }

     virtual void applyHalideScheduler(Ptr<BackendNode>& node,
                                       const std::vector<Mat*> &inputs,
@@ -195,14 +194,19 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
         return false;
     }

-    bool setActivation(const Ptr<ActivationLayer>& layer) { activ = layer; return true; }
+    bool setActivation(const Ptr<ActivationLayer>& layer)
+    {
+        activ = layer;
+        return !activ.empty();
+    }
+
     bool setBatchNorm(const Ptr<BatchNormLayer>& layer)
     {
         bnorm = layer;
         // we will need to re-compute the weights with the batch
         // norm coefficients taken into account
         weightsMat.release();
-        return true;
+        return !bnorm.empty();
     }

     virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
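The new contract: setActivation()/setBatchNorm() now report whether the layer actually
accepted the op for fusion (an empty pointer is rejected) instead of returning a
hard-coded true. A minimal sketch of how a fusion pass might key off the return value;
the surrounding names (candidate, removeLayer, reluNodeId) are illustrative, not part
of this patch:

    // Hypothetical caller; only the setActivation() contract comes from this patch.
    Ptr<ActivationLayer> relu = candidate.dynamicCast<ActivationLayer>();
    if( !relu.empty() && convLayer->setActivation(relu) )
    {
        // The convolution applies the activation itself from now on,
        // so the standalone activation node can be dropped from the graph.
        removeLayer(reluNodeId);
    }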
@@ -289,7 +293,7 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
                     const std::vector<float>& biasvec,
                     const std::vector<float>& reluslope,
                     Size kernel, Size pad, Size stride, Size dilation,
-                    int ngroups, int nstripes, const ActivationLayer* activ )
+                    const ActivationLayer* activ, int ngroups, int nstripes )
     {
         CV_Assert( input.dims == 4 && output.dims == 4 &&
                    input.size[0] == output.size[0] &&
@@ -315,7 +319,7 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
         int inpCnAll = input.size[1], width = input.size[3], height = input.size[2];
         int inpCn = inpCnAll / ngroups;
         p.is1x1_ = kernel == Size(1,1) && pad == Size(0, 0);
-        p.useAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
+        p.useAVX2 = checkHardwareSupport(CPU_AVX2);

         int ncn = std::min(inpCn, (int)BLK_SIZE_CN);
         p.ofstab_.resize(kernel.width*kernel.height*ncn);
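Unlike the compile-time CV_CPU_HAS_SUPPORT_AVX2 macro, checkHardwareSupport() makes the
decision at run time, so a single binary takes the AVX2 path only on CPUs that report
the feature. A sketch of the dispatch pattern; convRowAVX2/convRowGeneric are
placeholder names, not functions from this patch:

    #include <opencv2/core.hpp>

    // Placeholders standing in for the real vectorized/portable inner loops.
    void convRowAVX2(const float* src, float* dst, int n);
    void convRowGeneric(const float* src, float* dst, int n);

    void convRow(const float* src, float* dst, int n)
    {
        // Queried once; cv::checkHardwareSupport() also returns false after
        // cv::setUseOptimized(false), which helps A/B benchmarking.
        static const bool useAVX2 = cv::checkHardwareSupport(cv::CPU_AVX2);
        (useAVX2 ? convRowAVX2 : convRowGeneric)(src, dst, n);
    }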
@@ -418,64 +422,73 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
             for( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += BLK_SIZE )
             {
                 int ofs, ofs1 = std::min(ofs0 + BLK_SIZE, stripeEnd);
+                int out_i = ofs0 / outW;
+                int out_j = ofs0 - out_i * outW;

                 // do im2row for a part of input tensor
-                if( is1x1 )
+                float* rowbuf = rowbuf0;
+                for( ofs = ofs0; ofs < ofs1; out_j = 0, ++out_i )
                 {
-                    for( ofs = ofs0; ofs < ofs1; ofs++ )
+                    int delta = std::min(ofs1 - ofs, outW - out_j);
+                    int out_j1 = out_j + delta;
+                    int in_i = out_i * stride_h - pad_h;
+                    int in_j = out_j * stride_w - pad_w;
+                    const float* imgptr = data_inp0 + (cn0*height + in_i)*width + in_j;
+                    ofs += delta;
+
+                    // do im2row for a part of input tensor
+                    if( is1x1 )
                     {
-                        int out_i = ofs / outW;
-                        int out_j = ofs - out_i * outW;
-                        float* rowbuf = rowbuf0 + (ofs - ofs0)*vsz_a;
-
-                        int in_i = out_i * stride_h - pad_h;
-                        int in_j = out_j * stride_w - pad_w;
-                        const float* imgptr = data_inp0 + (cn0*height + in_i)*width + in_j;
-
-                        for( k = 0; k < vsz; k++ )
-                            rowbuf[k] = imgptr[k*inpPlaneSize];
-                    }
-                }
-                else
-                {
-                    for( ofs = ofs0; ofs < ofs1; ofs++ )
-                    {
-                        int out_i = ofs / outW;
-                        int out_j = ofs - out_i * outW;
-                        float* rowbuf = rowbuf0 + (ofs - ofs0)*vsz_a;
-
-                        int in_i = out_i * stride_h - pad_h;
-                        int in_j = out_j * stride_w - pad_w;
-                        const float* imgptr = data_inp0 + (cn0*height + in_i)*width + in_j;
-
-                        // this condition should be true for most of the tensor elements, i.e.
-                        // most of the time the kernel aperture is inside the tensor X-Y plane.
-                        if( 0 <= in_i && in_i < height - (kernel_h-1)*dilation_h &&
-                            0 <= in_j && in_j < width - (kernel_w-1)*dilation_w )
+                        for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w )
                         {
                             for( k = 0; k < vsz; k++ )
-                                rowbuf[k] = imgptr[ofstab[k]];
+                                rowbuf[k] = imgptr[k*inpPlaneSize];
                         }
-                        else
+                    }
+                    else
+                    {
+                        bool ok_i = 0 <= in_i && in_i < height - (kernel_h-1)*dilation_h;
+                        int i0 = std::max(0, (-in_i + dilation_h-1)/dilation_h);
+                        int i1 = std::min(kernel_h, (height - in_i + dilation_h-1)/dilation_h);
+
+                        for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w )
                         {
-                            int i0 = std::max(0, (-in_i + dilation_h-1)/dilation_h);
-                            int i1 = std::min(kernel_h, (height - in_i + dilation_h-1)/dilation_h);
-                            int j0 = std::max(0, (-in_j + dilation_w-1)/dilation_w);
-                            int j1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w);
-
-                            // here some non-continuous sub-row of the row will not be
-                            // filled from the tensor; we need to make sure that the uncovered
-                            // elements are explicitly set to 0's. the easiest way is to
-                            // set all the elements to 0's before the loop.
-                            memset(rowbuf, 0, vsz*sizeof(rowbuf[0]));
-                            for( k = 0; k < ncn; k++, imgptr += width*height )
+                            // this condition should be true for most of the tensor elements, i.e.
+                            // most of the time the kernel aperture is inside the tensor X-Y plane.
+                            if( ok_i && out_j + 2 <= out_j1 && 0 <= in_j && in_j + stride_w*2 <= width - (kernel_w-1)*dilation_w )
+                            {
+                                for( k = 0; k < vsz; k++ )
+                                {
+                                    int k1 = ofstab[k];
+                                    float v0 = imgptr[k1];
+                                    float v1 = imgptr[k1 + stride_w];
+                                    rowbuf[k] = v0;
+                                    rowbuf[k+vsz_a] = v1;
+                                }
+                                out_j++;
+                                rowbuf += vsz_a;
+                                imgptr += stride_w;
+                                in_j += stride_w;
+                            }
+                            else
                             {
-                                for( i = i0; i < i1; i++ )
+                                int j0 = std::max(0, (-in_j + dilation_w-1)/dilation_w);
+                                int j1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w);
+
+                                // here some non-continuous sub-row of the row will not be
+                                // filled from the tensor; we need to make sure that the uncovered
+                                // elements are explicitly set to 0's. the easiest way is to
+                                // set all the elements to 0's before the loop.
+                                memset(rowbuf, 0, vsz*sizeof(rowbuf[0]));
+                                for( k = 0; k < ncn; k++ )
                                 {
-                                    for( j = j0; j < j1; j++ )
+                                    for( i = i0; i < i1; i++ )
                                     {
-                                        int imgofs = i*(dilation_h*width) + j*dilation_w;
-                                        rowbuf[(k*kernel_h + i)*kernel_w + j] = imgptr[imgofs];
+                                        for( j = j0; j < j1; j++ )
+                                        {
+                                            int imgofs = k*(width*height) + i*(dilation_h*width) + j*dilation_w;
+                                            rowbuf[(k*kernel_h + i)*kernel_w + j] = imgptr[imgofs];
+                                        }
                                     }
                                 }
                             }
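The rewritten loop walks each horizontal run of output positions exactly once: the row
coordinate and the vertical clip bounds (in_i, ok_i, i0, i1) are computed per run rather
than per element, rowbuf/imgptr advance by fixed strides instead of being re-derived
from ofs, and the fully-in-bounds fast path fills two output columns per iteration
(rowbuf[k] and rowbuf[k+vsz_a]). For orientation, a minimal self-contained im2row
sketch (single channel, no dilation, none of the unrolling); the names and layout here
are illustrative, not the patch's code:

    #include <algorithm>
    #include <vector>

    // Each row of 'rows' receives the kernel-sized input patch for one output
    // position; out-of-image taps stay zero, mirroring the memset() above.
    void im2row(const float* img, int height, int width,
                int kernel_h, int kernel_w, int stride_h, int stride_w,
                int pad_h, int pad_w, std::vector<float>& rows)
    {
        int outH = (height + 2*pad_h - kernel_h)/stride_h + 1;
        int outW = (width + 2*pad_w - kernel_w)/stride_w + 1;
        rows.assign((size_t)outH*outW*kernel_h*kernel_w, 0.f);

        for( int out_i = 0; out_i < outH; out_i++ )
        {
            int in_i = out_i*stride_h - pad_h;
            // vertical clipping hoisted out of the column loop, as in the patch
            int i0 = std::max(0, -in_i), i1 = std::min(kernel_h, height - in_i);
            for( int out_j = 0; out_j < outW; out_j++ )
            {
                int in_j = out_j*stride_w - pad_w;
                int j0 = std::max(0, -in_j), j1 = std::min(kernel_w, width - in_j);
                float* row = &rows[((size_t)out_i*outW + out_j)*kernel_h*kernel_w];
                for( int i = i0; i < i1; i++ )
                    for( int j = j0; j < j1; j++ )
                        row[i*kernel_w + j] = img[(in_i + i)*width + (in_j + j)];
            }
        }
    }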
@@ -625,7 +638,7 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
         {
             // prepare weightsMat where each row is aligned and has enough zero padding on the right to
             // use vectorized (i.e. with intrinsics) loops without tail processing
-            Mat wm = blobs[0].reshape(1, outCn).clone();
+            Mat wm = blobs[0].reshape(1, outCn);
             if( wm.step1() % VEC_ALIGN != 0 )
             {
                 int newcols = (int)alignSize(wm.step1(), VEC_ALIGN);
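Without the clone(), wm is still a cheap header over blobs[0] at this point; a deep copy
now happens only inside the if, when the row stride must be re-padded to a multiple of
VEC_ALIGN. The rest of that branch lies outside this hunk; a sketch of what it has to
do, reusing the names above (assumed, not shown in the diff):

    Mat wm_aligned = Mat::zeros(outCn, newcols, wm.type()); // zero tail on every row
    wm.copyTo(wm_aligned.colRange(0, wm.cols));             // the only deep copy left
    wm = wm_aligned;                                        // rows now VEC_ALIGN-aligned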
@@ -698,7 +711,7 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
         int nstripes = std::max(getNumThreads(), 1);

         ParallelConv::run(*inputs[0], outputs[0], weightsMat, biasvec, reluslope,
-                          kernel, pad, stride, dilation, ngroups, nstripes, activ.get());
+                          kernel, pad, stride, dilation, activ.get(), ngroups, nstripes);
     }

     virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
@@ -776,7 +789,7 @@ class DeConvolutionLayerImpl : public BaseConvolutionLayerImpl
             b_ = &b;
             c_ = &c;
             nstripes_ = nstripes;
-            useAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
+            useAVX2 = checkHardwareSupport(CPU_AVX2);
         }

         void operator()(const Range& range_) const