
Commit 4784c7b

dnn: cleanup dispatched code, fix SIMD128 types

1 parent c3e6de2

7 files changed: 49 additions & 156 deletions


modules/dnn/CMakeLists.txt

Lines changed: 2 additions & 0 deletions

@@ -9,6 +9,8 @@ endif()
 
 set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass")
 
+ocv_add_dispatched_file("layers/layers_common" AVX AVX2)
+
 ocv_add_module(dnn opencv_core opencv_imgproc WRAP python matlab java)
 ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-shadow -Wno-parentheses -Wmaybe-uninitialized -Wsign-promo
                                      -Wmissing-declarations -Wmissing-prototypes
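
The added ocv_add_dispatched_file() entry drives the rest of this commit: it tells the OpenCV build to compile the shared source layers/layers_common.simd.hpp once per listed ISA (AVX and AVX2 here), each time with the matching compiler flags, and to generate a layers_common.simd_declarations.hpp header through which the module reaches the per-ISA entry points. A rough sketch of what that generated header exposes to callers follows; it is illustrative only (the real file is produced by the build system), but it matches how the call sites in this commit use it.

#include <cstddef>

// Illustrative approximation of the generated layers_common.simd_declarations.hpp:
// one namespace of entry points per ISA listed in ocv_add_dispatched_file(),
// plus compile-time flags (CV_TRY_AVX, CV_TRY_AVX2) saying which variants
// actually exist in this build.
namespace cv { namespace dnn {
namespace opt_AVX {
void fastConv(const float* weights, std::size_t wstep, const float* bias,
              const float* rowbuf, float* output, const int* outShape,
              int blockSize, int vecsize, int vecsize_aligned,
              const float* relu, bool initOutput);
// fastGEMM1T and fastGEMM are declared the same way
}
namespace opt_AVX2 {
void fastConv(const float* weights, std::size_t wstep, const float* bias,
              const float* rowbuf, float* output, const int* outShape,
              int blockSize, int vecsize, int vecsize_aligned,
              const float* relu, bool initOutput);
// fastGEMM1T and fastGEMM are declared the same way
}
}} // namespace cv::dnn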

modules/dnn/src/layers/convolution_layer.cpp

Lines changed: 4 additions & 4 deletions

@@ -506,13 +506,13 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
         int bsz = ofs1 - ofs0;
     #if CV_TRY_AVX2
         if(useAVX2)
-            fastConv_avx2(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
+            opt_AVX2::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
                           outShape, bsz, vsz, vsz_a, relu, cn0 == 0);
         else
     #endif
     #if CV_TRY_AVX
         if(useAVX)
-            fastConv_avx(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
+            opt_AVX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
                          outShape, bsz, vsz, vsz_a, relu, cn0 == 0);
         else
     #endif

@@ -824,12 +824,12 @@ class DeConvolutionLayerImpl : public BaseConvolutionLayerImpl
 
     #if CV_TRY_AVX2
         if( useAVX2 )
-            fastGEMM_avx2( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax );
+            opt_AVX2::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax );
         else
     #endif
     #if CV_TRY_AVX
         if( useAVX )
-            fastGEMM_avx( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax );
+            opt_AVX::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax );
         else
     #endif
         for( m = 0; m < mmax; m += 2 )
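
Only the callee names change in these hunks: the hand-written fastConv_avx/fastConv_avx2 and fastGEMM_avx/fastGEMM_avx2 become the dispatched opt_AVX::fastConv / opt_AVX2::fastConv and opt_AVX::fastGEMM / opt_AVX2::fastGEMM. Note the two-level guard around each call: CV_TRY_AVX2 is a compile-time flag meaning "an AVX2 variant exists in this build", while useAVX2 must still come from a runtime CPU check. A minimal sketch of that check, assuming the plain cv::checkHardwareSupport() API (the layer implementations initialize their useAVX/useAVX2 members elsewhere, not shown in this diff):

#include <opencv2/core.hpp>

// Sketch only: CV_TRY_* answers "was this variant compiled?",
// checkHardwareSupport() answers "can this CPU actually run it?".
static bool canUseAVX()  { return CV_TRY_AVX  && cv::checkHardwareSupport(CV_CPU_AVX);  }
static bool canUseAVX2() { return CV_TRY_AVX2 && cv::checkHardwareSupport(CV_CPU_AVX2); }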

modules/dnn/src/layers/fully_connected_layer.cpp

Lines changed: 6 additions & 6 deletions

@@ -177,12 +177,12 @@ class FullyConnectedLayerImpl : public InnerProductLayer
 
     #if CV_TRY_AVX2
         if( useAVX2 )
-            fastGEMM1T_avx2( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
+            opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
         else
     #endif
     #if CV_TRY_AVX
         if( useAVX )
-            fastGEMM1T_avx( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
+            opt_AVX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
         else
     #endif
         {

@@ -191,19 +191,19 @@ class FullyConnectedLayerImpl : public InnerProductLayer
     #if CV_SIMD128
         for( ; i <= nw - 4; i += 4, wptr += 4*wstep )
         {
-            vfloat32x4 vs0 = v_setall_f32(0.f), vs1 = v_setall_f32(0.f);
-            vfloat32x4 vs2 = v_setall_f32(0.f), vs3 = v_setall_f32(0.f);
+            v_float32x4 vs0 = v_setall_f32(0.f), vs1 = v_setall_f32(0.f);
+            v_float32x4 vs2 = v_setall_f32(0.f), vs3 = v_setall_f32(0.f);
 
             for( k = 0; k < vecsize; k += 4 )
             {
-                vfloat32x4 v = v_load_aligned(sptr + k);
+                v_float32x4 v = v_load_aligned(sptr + k);
                 vs0 += v*v_load_aligned(wptr + k);
                 vs1 += v*v_load_aligned(wptr + wstep + k);
                 vs2 += v*v_load_aligned(wptr + wstep*2 + k);
                 vs3 += v*v_load_aligned(wptr + wstep*3 + k);
             }
 
-            vfloat32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3);
+            v_float32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3);
             s += v_load(biasptr + i);
             v_store(dptr + i, s);
         }
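
The second hunk is the "fix SIMD128 types" part of the commit message: vfloat32x4 is not an OpenCV type; the 128-bit float vector of the universal intrinsics is spelled v_float32x4, so the portable fallback path could not compile as written. Below is a standalone sketch of the same pattern with the corrected type, mirroring the loop above (hypothetical dot4() helper; vecsize assumed to be a multiple of 4; plain v_load is used instead of v_load_aligned, which requires 16-byte-aligned pointers):

#include <opencv2/core/hal/intrin.hpp>
#include <cstddef>

#if CV_SIMD128
// Multiply one input vector of length vecsize against four consecutive weight
// rows (row stride wstep), add the bias, and store the four dot products.
static void dot4(const float* src, const float* w, std::size_t wstep,
                 int vecsize, const float* bias, float* dst)
{
    cv::v_float32x4 vs0 = cv::v_setall_f32(0.f), vs1 = cv::v_setall_f32(0.f);
    cv::v_float32x4 vs2 = cv::v_setall_f32(0.f), vs3 = cv::v_setall_f32(0.f);
    for (int k = 0; k < vecsize; k += 4)
    {
        cv::v_float32x4 v = cv::v_load(src + k);
        vs0 += v * cv::v_load(w + k);
        vs1 += v * cv::v_load(w + wstep + k);
        vs2 += v * cv::v_load(w + wstep*2 + k);
        vs3 += v * cv::v_load(w + wstep*3 + k);
    }
    // v_reduce_sum4 packs the four horizontal sums into a single v_float32x4
    cv::v_float32x4 s = cv::v_reduce_sum4(vs0, vs1, vs2, vs3) + cv::v_load(bias);
    cv::v_store(dst, s);
}
#endif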

modules/dnn/src/layers/layers_common.avx.cpp

Lines changed: 0 additions & 54 deletions
This file was deleted.

modules/dnn/src/layers/layers_common.avx2.cpp

Lines changed: 0 additions & 51 deletions
This file was deleted.

modules/dnn/src/layers/layers_common.hpp

Lines changed: 4 additions & 26 deletions

@@ -45,6 +45,10 @@
 #include <opencv2/dnn.hpp>
 #include <opencv2/dnn/shape_utils.hpp>
 
+// dispatched AVX/AVX2 optimizations
+#include "layers/layers_common.simd.hpp"
+#include "layers/layers_common.simd_declarations.hpp"
+
 namespace cv
 {
 namespace dnn

@@ -64,32 +68,6 @@ void getConvPoolPaddings(const Size& inp, const Size& out,
                          const Size &kernel, const Size &stride,
                          const String &padMode, Size &pad);
 
-#if CV_TRY_AVX
-void fastConv_avx(const float* weights, size_t wstep, const float* bias,
-                  const float* rowbuf, float* output, const int* outShape,
-                  int blockSize, int vecsize, int vecsize_aligned,
-                  const float* relu, bool initOutput);
-void fastGEMM1T_avx( const float* vec, const float* weights,
-                     size_t wstep, const float* bias,
-                     float* dst, int nvecs, int vecsize );
-void fastGEMM_avx( const float* aptr, size_t astep, const float* bptr0,
-                   size_t bstep, float* cptr, size_t cstep,
-                   int ma, int na, int nb );
-#endif
-
-#if CV_TRY_AVX2
-void fastConv_avx2(const float* weights, size_t wstep, const float* bias,
-                   const float* rowbuf, float* output, const int* outShape,
-                   int blockSize, int vecsize, int vecsize_aligned,
-                   const float* relu, bool initOutput);
-void fastGEMM1T_avx2( const float* vec, const float* weights,
-                      size_t wstep, const float* bias,
-                      float* dst, int nvecs, int vecsize );
-void fastGEMM_avx2( const float* aptr, size_t astep, const float* bptr0,
-                    size_t bstep, float* cptr, size_t cstep,
-                    int ma, int na, int nb );
-#endif
-
 }
 }
 
modules/dnn/src/layers/layers_common.simd.hpp

Lines changed: 33 additions & 15 deletions

@@ -40,16 +40,34 @@
 //
 //M*/
 
-#ifndef __DNN_LAYERS_COMMON_SIMD_HPP__
-#define __DNN_LAYERS_COMMON_SIMD_HPP__
+#include "opencv2/core/hal/intrin.hpp"
 
 namespace cv {
 namespace dnn {
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+
+void fastConv( const float* weights, size_t wstep, const float* bias,
+               const float* rowbuf, float* output, const int* outShape,
+               int blockSize, int vecsize, int vecsize_aligned,
+               const float* relu, bool initOutput );
+void fastGEMM1T( const float* vec, const float* weights,
+                 size_t wstep, const float* bias,
+                 float* dst, int nvecs, int vecsize );
+void fastGEMM( const float* aptr, size_t astep, const float* bptr,
+               size_t bstep, float* cptr, size_t cstep,
+               int ma, int na, int nb );
+
+#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX
+
+#if !CV_FMA // AVX workaround
+#undef _mm256_fmadd_ps
+#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
+#endif
 
-void fastConv_some_avx( const float* weights, size_t wstep, const float* bias,
-                        const float* rowbuf, float* output, const int* outShape,
-                        int blockSize, int vecsize, int vecsize_aligned,
-                        const float* relu, bool initOutput )
+void fastConv( const float* weights, size_t wstep, const float* bias,
+               const float* rowbuf, float* output, const int* outShape,
+               int blockSize, int vecsize, int vecsize_aligned,
+               const float* relu, bool initOutput )
 {
     int outCn = outShape[1];
     size_t outPlaneSize = outShape[2]*outShape[3];

@@ -214,9 +232,9 @@ void fastConv_some_avx( const float* weights, size_t wstep, const float* bias,
 }
 
 // dst = vec * weights^t + bias
-void fastGEMM1T_some_avx( const float* vec, const float* weights,
-                          size_t wstep, const float* bias,
-                          float* dst, int nvecs, int vecsize )
+void fastGEMM1T( const float* vec, const float* weights,
+                 size_t wstep, const float* bias,
+                 float* dst, int nvecs, int vecsize )
 {
     int i = 0;
 
@@ -276,9 +294,9 @@ void fastGEMM1T_some_avx( const float* vec, const float* weights,
     _mm256_zeroupper();
 }
 
-void fastGEMM_some_avx( const float* aptr, size_t astep, const float* bptr,
-                        size_t bstep, float* cptr, size_t cstep,
-                        int ma, int na, int nb )
+void fastGEMM( const float* aptr, size_t astep, const float* bptr,
+               size_t bstep, float* cptr, size_t cstep,
+               int ma, int na, int nb )
 {
     int n = 0;
     for( ; n <= nb - 16; n += 16 )

@@ -346,7 +364,7 @@ void fastGEMM_some_avx( const float* aptr, size_t astep, const float* bptr,
     _mm256_zeroupper();
 }
 
-}
-}
+#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
-#endif
+CV_CPU_OPTIMIZATION_NAMESPACE_END
+}} // namespace
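
Two details in the rewritten header are worth spelling out. The include guard and the bare namespace braces are gone because the file is now multiply included on purpose: with CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY defined it contributes prototypes only, and without it (in the per-ISA translation units) it contributes the AVX bodies, wrapped in CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN/END so each build flavour lands in its own namespace. The !CV_FMA branch is a small portability workaround: it lets the AVX variant build and run on CPUs that have AVX but no FMA3 (Sandy Bridge/Ivy Bridge class) by rewriting every fused multiply-add into a separate multiply and add, the same arithmetic up to one extra rounding step:

#include <immintrin.h>

// What _mm256_fmadd_ps(a, b, c) expands to under the workaround:
// compute a*b, then add c -- two instructions instead of one fused op.
static inline __m256 fmadd_fallback(__m256 a, __m256 b, __m256 c)
{
    return _mm256_add_ps(c, _mm256_mul_ps(a, b));
}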
