Skip to content

Commit 1c4fb41

Browse files
committed
Merge pull request opencv#9007 from alalek:issue_9001
2 parents 58c5be0 + f8a75c4 commit 1c4fb41

File tree

6 files changed: 47 additions and 19 deletions.

cmake/OpenCVCompilerOptimizations.cmake

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -647,12 +647,15 @@ macro(ocv_compiler_optimization_fill_cpu_config)
647647
if(NOT DEFINED CPU_${OPT}_FEATURE_ALIAS OR NOT "x${CPU_${OPT}_FEATURE_ALIAS}" STREQUAL "x")
648648
set(OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE}
649649
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_${OPT}
650+
# define CV_TRY_${OPT} 1
650651
# define CV_CPU_HAS_SUPPORT_${OPT} 1
651652
# define CV_CPU_CALL_${OPT}(fn, args) return (opt_${OPT}::fn args)
652653
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_${OPT}
654+
# define CV_TRY_${OPT} 1
653655
# define CV_CPU_HAS_SUPPORT_${OPT} (cv::checkHardwareSupport(CV_CPU_${OPT}))
654656
# define CV_CPU_CALL_${OPT}(fn, args) if (CV_CPU_HAS_SUPPORT_${OPT}) return (opt_${OPT}::fn args)
655657
#else
658+
# define CV_TRY_${OPT} 0
656659
# define CV_CPU_HAS_SUPPORT_${OPT} 0
657660
# define CV_CPU_CALL_${OPT}(fn, args)
658661
#endif

modules/core/include/opencv2/core/cv_cpu_helper.h

Lines changed: 36 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,144 +1,180 @@
11
// AUTOGENERATED, DO NOT EDIT
22

33
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE
4+
# define CV_TRY_SSE 1
45
# define CV_CPU_HAS_SUPPORT_SSE 1
56
# define CV_CPU_CALL_SSE(fn, args) return (opt_SSE::fn args)
67
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE
8+
# define CV_TRY_SSE 1
79
# define CV_CPU_HAS_SUPPORT_SSE (cv::checkHardwareSupport(CV_CPU_SSE))
810
# define CV_CPU_CALL_SSE(fn, args) if (CV_CPU_HAS_SUPPORT_SSE) return (opt_SSE::fn args)
911
#else
12+
# define CV_TRY_SSE 0
1013
# define CV_CPU_HAS_SUPPORT_SSE 0
1114
# define CV_CPU_CALL_SSE(fn, args)
1215
#endif
1316
#define __CV_CPU_DISPATCH_CHAIN_SSE(fn, args, mode, ...) CV_CPU_CALL_SSE(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
1417

1518
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE2
19+
# define CV_TRY_SSE2 1
1620
# define CV_CPU_HAS_SUPPORT_SSE2 1
1721
# define CV_CPU_CALL_SSE2(fn, args) return (opt_SSE2::fn args)
1822
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE2
23+
# define CV_TRY_SSE2 1
1924
# define CV_CPU_HAS_SUPPORT_SSE2 (cv::checkHardwareSupport(CV_CPU_SSE2))
2025
# define CV_CPU_CALL_SSE2(fn, args) if (CV_CPU_HAS_SUPPORT_SSE2) return (opt_SSE2::fn args)
2126
#else
27+
# define CV_TRY_SSE2 0
2228
# define CV_CPU_HAS_SUPPORT_SSE2 0
2329
# define CV_CPU_CALL_SSE2(fn, args)
2430
#endif
2531
#define __CV_CPU_DISPATCH_CHAIN_SSE2(fn, args, mode, ...) CV_CPU_CALL_SSE2(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
2632

2733
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE3
34+
# define CV_TRY_SSE3 1
2835
# define CV_CPU_HAS_SUPPORT_SSE3 1
2936
# define CV_CPU_CALL_SSE3(fn, args) return (opt_SSE3::fn args)
3037
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE3
38+
# define CV_TRY_SSE3 1
3139
# define CV_CPU_HAS_SUPPORT_SSE3 (cv::checkHardwareSupport(CV_CPU_SSE3))
3240
# define CV_CPU_CALL_SSE3(fn, args) if (CV_CPU_HAS_SUPPORT_SSE3) return (opt_SSE3::fn args)
3341
#else
42+
# define CV_TRY_SSE3 0
3443
# define CV_CPU_HAS_SUPPORT_SSE3 0
3544
# define CV_CPU_CALL_SSE3(fn, args)
3645
#endif
3746
#define __CV_CPU_DISPATCH_CHAIN_SSE3(fn, args, mode, ...) CV_CPU_CALL_SSE3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
3847

3948
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSSE3
49+
# define CV_TRY_SSSE3 1
4050
# define CV_CPU_HAS_SUPPORT_SSSE3 1
4151
# define CV_CPU_CALL_SSSE3(fn, args) return (opt_SSSE3::fn args)
4252
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSSE3
53+
# define CV_TRY_SSSE3 1
4354
# define CV_CPU_HAS_SUPPORT_SSSE3 (cv::checkHardwareSupport(CV_CPU_SSSE3))
4455
# define CV_CPU_CALL_SSSE3(fn, args) if (CV_CPU_HAS_SUPPORT_SSSE3) return (opt_SSSE3::fn args)
4556
#else
57+
# define CV_TRY_SSSE3 0
4658
# define CV_CPU_HAS_SUPPORT_SSSE3 0
4759
# define CV_CPU_CALL_SSSE3(fn, args)
4860
#endif
4961
#define __CV_CPU_DISPATCH_CHAIN_SSSE3(fn, args, mode, ...) CV_CPU_CALL_SSSE3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
5062

5163
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE4_1
64+
# define CV_TRY_SSE4_1 1
5265
# define CV_CPU_HAS_SUPPORT_SSE4_1 1
5366
# define CV_CPU_CALL_SSE4_1(fn, args) return (opt_SSE4_1::fn args)
5467
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE4_1
68+
# define CV_TRY_SSE4_1 1
5569
# define CV_CPU_HAS_SUPPORT_SSE4_1 (cv::checkHardwareSupport(CV_CPU_SSE4_1))
5670
# define CV_CPU_CALL_SSE4_1(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_1) return (opt_SSE4_1::fn args)
5771
#else
72+
# define CV_TRY_SSE4_1 0
5873
# define CV_CPU_HAS_SUPPORT_SSE4_1 0
5974
# define CV_CPU_CALL_SSE4_1(fn, args)
6075
#endif
6176
#define __CV_CPU_DISPATCH_CHAIN_SSE4_1(fn, args, mode, ...) CV_CPU_CALL_SSE4_1(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
6277

6378
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE4_2
79+
# define CV_TRY_SSE4_2 1
6480
# define CV_CPU_HAS_SUPPORT_SSE4_2 1
6581
# define CV_CPU_CALL_SSE4_2(fn, args) return (opt_SSE4_2::fn args)
6682
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE4_2
83+
# define CV_TRY_SSE4_2 1
6784
# define CV_CPU_HAS_SUPPORT_SSE4_2 (cv::checkHardwareSupport(CV_CPU_SSE4_2))
6885
# define CV_CPU_CALL_SSE4_2(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_2) return (opt_SSE4_2::fn args)
6986
#else
87+
# define CV_TRY_SSE4_2 0
7088
# define CV_CPU_HAS_SUPPORT_SSE4_2 0
7189
# define CV_CPU_CALL_SSE4_2(fn, args)
7290
#endif
7391
#define __CV_CPU_DISPATCH_CHAIN_SSE4_2(fn, args, mode, ...) CV_CPU_CALL_SSE4_2(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
7492

7593
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_POPCNT
94+
# define CV_TRY_POPCNT 1
7695
# define CV_CPU_HAS_SUPPORT_POPCNT 1
7796
# define CV_CPU_CALL_POPCNT(fn, args) return (opt_POPCNT::fn args)
7897
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_POPCNT
98+
# define CV_TRY_POPCNT 1
7999
# define CV_CPU_HAS_SUPPORT_POPCNT (cv::checkHardwareSupport(CV_CPU_POPCNT))
80100
# define CV_CPU_CALL_POPCNT(fn, args) if (CV_CPU_HAS_SUPPORT_POPCNT) return (opt_POPCNT::fn args)
81101
#else
102+
# define CV_TRY_POPCNT 0
82103
# define CV_CPU_HAS_SUPPORT_POPCNT 0
83104
# define CV_CPU_CALL_POPCNT(fn, args)
84105
#endif
85106
#define __CV_CPU_DISPATCH_CHAIN_POPCNT(fn, args, mode, ...) CV_CPU_CALL_POPCNT(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
86107

87108
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX
109+
# define CV_TRY_AVX 1
88110
# define CV_CPU_HAS_SUPPORT_AVX 1
89111
# define CV_CPU_CALL_AVX(fn, args) return (opt_AVX::fn args)
90112
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX
113+
# define CV_TRY_AVX 1
91114
# define CV_CPU_HAS_SUPPORT_AVX (cv::checkHardwareSupport(CV_CPU_AVX))
92115
# define CV_CPU_CALL_AVX(fn, args) if (CV_CPU_HAS_SUPPORT_AVX) return (opt_AVX::fn args)
93116
#else
117+
# define CV_TRY_AVX 0
94118
# define CV_CPU_HAS_SUPPORT_AVX 0
95119
# define CV_CPU_CALL_AVX(fn, args)
96120
#endif
97121
#define __CV_CPU_DISPATCH_CHAIN_AVX(fn, args, mode, ...) CV_CPU_CALL_AVX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
98122

99123
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_FP16
124+
# define CV_TRY_FP16 1
100125
# define CV_CPU_HAS_SUPPORT_FP16 1
101126
# define CV_CPU_CALL_FP16(fn, args) return (opt_FP16::fn args)
102127
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_FP16
128+
# define CV_TRY_FP16 1
103129
# define CV_CPU_HAS_SUPPORT_FP16 (cv::checkHardwareSupport(CV_CPU_FP16))
104130
# define CV_CPU_CALL_FP16(fn, args) if (CV_CPU_HAS_SUPPORT_FP16) return (opt_FP16::fn args)
105131
#else
132+
# define CV_TRY_FP16 0
106133
# define CV_CPU_HAS_SUPPORT_FP16 0
107134
# define CV_CPU_CALL_FP16(fn, args)
108135
#endif
109136
#define __CV_CPU_DISPATCH_CHAIN_FP16(fn, args, mode, ...) CV_CPU_CALL_FP16(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
110137

111138
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX2
139+
# define CV_TRY_AVX2 1
112140
# define CV_CPU_HAS_SUPPORT_AVX2 1
113141
# define CV_CPU_CALL_AVX2(fn, args) return (opt_AVX2::fn args)
114142
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX2
143+
# define CV_TRY_AVX2 1
115144
# define CV_CPU_HAS_SUPPORT_AVX2 (cv::checkHardwareSupport(CV_CPU_AVX2))
116145
# define CV_CPU_CALL_AVX2(fn, args) if (CV_CPU_HAS_SUPPORT_AVX2) return (opt_AVX2::fn args)
117146
#else
147+
# define CV_TRY_AVX2 0
118148
# define CV_CPU_HAS_SUPPORT_AVX2 0
119149
# define CV_CPU_CALL_AVX2(fn, args)
120150
#endif
121151
#define __CV_CPU_DISPATCH_CHAIN_AVX2(fn, args, mode, ...) CV_CPU_CALL_AVX2(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
122152

123153
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_FMA3
154+
# define CV_TRY_FMA3 1
124155
# define CV_CPU_HAS_SUPPORT_FMA3 1
125156
# define CV_CPU_CALL_FMA3(fn, args) return (opt_FMA3::fn args)
126157
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_FMA3
158+
# define CV_TRY_FMA3 1
127159
# define CV_CPU_HAS_SUPPORT_FMA3 (cv::checkHardwareSupport(CV_CPU_FMA3))
128160
# define CV_CPU_CALL_FMA3(fn, args) if (CV_CPU_HAS_SUPPORT_FMA3) return (opt_FMA3::fn args)
129161
#else
162+
# define CV_TRY_FMA3 0
130163
# define CV_CPU_HAS_SUPPORT_FMA3 0
131164
# define CV_CPU_CALL_FMA3(fn, args)
132165
#endif
133166
#define __CV_CPU_DISPATCH_CHAIN_FMA3(fn, args, mode, ...) CV_CPU_CALL_FMA3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
134167

135168
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON
169+
# define CV_TRY_NEON 1
136170
# define CV_CPU_HAS_SUPPORT_NEON 1
137171
# define CV_CPU_CALL_NEON(fn, args) return (opt_NEON::fn args)
138172
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON
173+
# define CV_TRY_NEON 1
139174
# define CV_CPU_HAS_SUPPORT_NEON (cv::checkHardwareSupport(CV_CPU_NEON))
140175
# define CV_CPU_CALL_NEON(fn, args) if (CV_CPU_HAS_SUPPORT_NEON) return (opt_NEON::fn args)
141176
#else
177+
# define CV_TRY_NEON 0
142178
# define CV_CPU_HAS_SUPPORT_NEON 0
143179
# define CV_CPU_CALL_NEON(fn, args)
144180
#endif

modules/dnn/src/layers/convolution_layer.cpp

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -315,7 +315,7 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
315315
int inpCnAll = input.size[1], width = input.size[3], height = input.size[2];
316316
int inpCn = inpCnAll / ngroups;
317317
p.is1x1_ = kernel == Size(0,0) && pad == Size(0, 0);
318-
p.useAVX2 = checkHardwareSupport(CPU_AVX2);
318+
p.useAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
319319

320320
int ncn = std::min(inpCn, (int)BLK_SIZE_CN);
321321
p.ofstab_.resize(kernel.width*kernel.height*ncn);
@@ -486,7 +486,7 @@ class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
486486
// now compute dot product of the weights
487487
// and im2row-transformed part of the tensor
488488
int bsz = ofs1 - ofs0;
489-
#if CV_DNN_TRY_AVX2
489+
#if CV_TRY_AVX2
490490
if(useAVX2)
491491
fastConv_avx2(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
492492
outShape, bsz, vsz, vsz_a, relu, cn0 == 0);
@@ -776,7 +776,7 @@ class DeConvolutionLayerImpl : public BaseConvolutionLayerImpl
776776
b_ = &b;
777777
c_ = &c;
778778
nstripes_ = nstripes;
779-
useAVX2 = checkHardwareSupport(CPU_AVX2);
779+
useAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
780780
}
781781

782782
void operator()(const Range& range_) const
@@ -794,7 +794,7 @@ class DeConvolutionLayerImpl : public BaseConvolutionLayerImpl
794794
size_t bstep = b_->step1();
795795
size_t cstep = c_->step1();
796796

797-
#if CV_DNN_TRY_AVX2
797+
#if CV_TRY_AVX2
798798
if( useAVX2 )
799799
fastGEMM_avx2( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax );
800800
else

modules/dnn/src/layers/fully_connected_layer.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -127,7 +127,7 @@ class FullyConnectedLayerImpl : public InnerProductLayer
127127
biasMat_ = &biasMat;
128128
dstMat_ = &dstMat;
129129
nstripes_ = nstripes;
130-
useAVX2_ = checkHardwareSupport(CPU_AVX2);
130+
useAVX2_ = CV_CPU_HAS_SUPPORT_AVX2;
131131
}
132132

133133
void operator()(const Range& r) const
@@ -161,7 +161,7 @@ class FullyConnectedLayerImpl : public InnerProductLayer
161161

162162
memcpy(sptr, sptr_, vecsize*sizeof(sptr[0]));
163163

164-
#if CV_DNN_TRY_AVX2
164+
#if CV_TRY_AVX2
165165
if( useAVX2_ )
166166
fastGEMM1T_avx2( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
167167
else

modules/dnn/src/layers/layers_common.avx2.cpp

Lines changed: 1 addition & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -43,10 +43,6 @@
4343
#include "layers_common.hpp"
4444
#include "opencv2/core/hal/intrin.hpp"
4545

46-
#if CV_DNN_TRY_AVX2
47-
48-
#include <immintrin.h>
49-
5046
namespace cv {
5147
namespace dnn {
5248

@@ -334,7 +330,6 @@ void fastGEMM_avx2( const float* aptr, size_t astep, const float* bptr,
334330
_mm256_storeu_ps(cptr3 + n + 8, d31);
335331
}
336332
}
337-
_mm256_zeroupper();
338333

339334
for( ; n < nb; n++ )
340335
{
@@ -350,9 +345,8 @@ void fastGEMM_avx2( const float* aptr, size_t astep, const float* bptr,
350345
cptr0[n] = d0;
351346
}
352347
}
348+
_mm256_zeroupper();
353349
}
354350

355351
}
356352
}
357-
358-
#endif

modules/dnn/src/layers/layers_common.hpp

Lines changed: 1 addition & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -63,9 +63,7 @@ void getConvPoolPaddings(const Size& inp, const Size& out,
6363
const Size &kernel, const Size &stride,
6464
const String &padMode, Size &pad);
6565

66-
#if CV_SSE2
67-
#define CV_DNN_TRY_AVX2 1
68-
66+
#if CV_TRY_AVX2
6967
void fastConv_avx2(const float* weights, size_t wstep, const float* bias,
7068
const float* rowbuf, float* output, const int* outShape,
7169
int blockSize, int vecsize, int vecsize_aligned,
@@ -76,9 +74,6 @@ void fastGEMM1T_avx2( const float* vec, const float* weights,
7674
void fastGEMM_avx2( const float* aptr, size_t astep, const float* bptr0,
7775
size_t bstep, float* cptr, size_t cstep,
7876
int ma, int na, int nb );
79-
80-
#else
81-
#define CV_DNN_TRY_AVX2 0
8277
#endif
8378

8479
}

0 commit comments

Comments (0)