
Commit 11feae6

Merge pull request opencv#9041 from terfendail:filter_avx
AVX-optimized implementation of separable filters migrated
2 parents eef78f5 + 4d0f789 commit 11feae6
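
The two passes of a separable filter are what this change vectorizes: a horizontal row pass over interleaved channels, then a vertical column pass over the buffered row results. For orientation, here is a minimal scalar sketch of the row pass (illustration only, not OpenCV code; the function name and the absence of border handling are assumptions of the sketch):

// Scalar row pass of a separable filter: dst[i] = sum_k kx[k] * src[i + k*cn].
// 'width' is assumed to be pre-multiplied by the channel count cn, as in
// RowVec_32f below.
static void rowPassScalar(const float* src, const float* kx, float* dst,
                          int width, int cn, int ksize)
{
    for (int i = 0; i < width; i++)
    {
        float s = 0.f;
        for (int k = 0; k < ksize; k++)
            s += kx[k] * src[i + k * cn];
        dst[i] = s;
    }
}

The new filter.avx2.cpp computes the same sums eight outputs at a time in __m256 registers, using a fused multiply-add when CV_FMA3 is available, and filter.cpp now dispatches to it at runtime instead of carrying inline #if CV_AVX2 blocks.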

File tree

3 files changed: +274 additions, -109 deletions

modules/imgproc/src/filter.avx2.cpp

Lines changed: 200 additions & 0 deletions
@@ -0,0 +1,200 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
#include "filter.hpp"

namespace cv
{

int RowVec_32f_AVX(const float* src0, const float* _kx, float* dst, int width, int cn, int _ksize)
{
    int i = 0, k;
    for (; i <= width - 8; i += 8)
    {
        const float* src = src0 + i;
        __m256 f, x0;
        __m256 s0 = _mm256_set1_ps(0.0f);
        for (k = 0; k < _ksize; k++, src += cn)
        {
            f = _mm256_set1_ps(_kx[k]);
            x0 = _mm256_loadu_ps(src);
#if CV_FMA3
            s0 = _mm256_fmadd_ps(x0, f, s0);
#else
            s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
#endif
        }
        _mm256_storeu_ps(dst + i, s0);
    }
    _mm256_zeroupper();
    return i;
}

int SymmColumnVec_32f_Symm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2)
{
    int i = 0, k;
    const float *S, *S2;
    const __m128 d4 = _mm_set1_ps(delta);
    const __m256 d8 = _mm256_set1_ps(delta);

    for( ; i <= width - 16; i += 16 )
    {
        __m256 f = _mm256_set1_ps(ky[0]);
        __m256 s0, s1;
        __m256 x0;
        S = src[0] + i;
        s0 = _mm256_loadu_ps(S);
#if CV_FMA3
        s0 = _mm256_fmadd_ps(s0, f, d8);
#else
        s0 = _mm256_add_ps(_mm256_mul_ps(s0, f), d8);
#endif
        s1 = _mm256_loadu_ps(S+8);
#if CV_FMA3
        s1 = _mm256_fmadd_ps(s1, f, d8);
#else
        s1 = _mm256_add_ps(_mm256_mul_ps(s1, f), d8);
#endif

        for( k = 1; k <= ksize2; k++ )
        {
            S = src[k] + i;
            S2 = src[-k] + i;
            f = _mm256_set1_ps(ky[k]);
            x0 = _mm256_add_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2));
#if CV_FMA3
            s0 = _mm256_fmadd_ps(x0, f, s0);
#else
            s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
#endif
            x0 = _mm256_add_ps(_mm256_loadu_ps(S+8), _mm256_loadu_ps(S2+8));
#if CV_FMA3
            s1 = _mm256_fmadd_ps(x0, f, s1);
#else
            s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f));
#endif
        }

        _mm256_storeu_ps(dst + i, s0);
        _mm256_storeu_ps(dst + i + 8, s1);
    }

    for( ; i <= width - 4; i += 4 )
    {
        __m128 f = _mm_set1_ps(ky[0]);
        __m128 x0, s0 = _mm_load_ps(src[0] + i);
        s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);

        for( k = 1; k <= ksize2; k++ )
        {
            f = _mm_set1_ps(ky[k]);
            S = src[k] + i;
            S2 = src[-k] + i;
            x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
            s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
        }

        _mm_storeu_ps(dst + i, s0);
    }

    _mm256_zeroupper();
    return i;
}

int SymmColumnVec_32f_Unsymm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2)
{
    int i = 0, k;
    const float *S, *S2;
    const __m128 d4 = _mm_set1_ps(delta);
    const __m256 d8 = _mm256_set1_ps(delta);

    for (; i <= width - 16; i += 16)
    {
        __m256 f, s0 = d8, s1 = d8;
        __m256 x0;
        S = src[0] + i;

        for (k = 1; k <= ksize2; k++)
        {
            S = src[k] + i;
            S2 = src[-k] + i;
            f = _mm256_set1_ps(ky[k]);
            x0 = _mm256_sub_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2));
#if CV_FMA3
            s0 = _mm256_fmadd_ps(x0, f, s0);
#else
            s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
#endif
            x0 = _mm256_sub_ps(_mm256_loadu_ps(S + 8), _mm256_loadu_ps(S2 + 8));
#if CV_FMA3
            s1 = _mm256_fmadd_ps(x0, f, s1);
#else
            s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f));
#endif
        }

        _mm256_storeu_ps(dst + i, s0);
        _mm256_storeu_ps(dst + i + 8, s1);
    }

    for (; i <= width - 4; i += 4)
    {
        __m128 f, x0, s0 = d4;

        for (k = 1; k <= ksize2; k++)
        {
            f = _mm_set1_ps(ky[k]);
            x0 = _mm_sub_ps(_mm_load_ps(src[k] + i), _mm_load_ps(src[-k] + i));
            s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
        }

        _mm_storeu_ps(dst + i, s0);
    }

    _mm256_zeroupper();
    return i;
}

}

/* End of file. */
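
Both column helpers above exploit kernel symmetry: a symmetric kernel lets the rows at +k and -k be added before a single multiply by ky[k], while the anti-symmetric variant subtracts them and never touches the centre tap. A scalar sketch of what they compute (illustration only; the names are made up, and unlike the real functions, which return i and leave the tail columns to the caller, the sketch covers the full width):

// 'rows' points at the centre row of a window of row pointers, so rows[-k] and
// rows[k] are the mirrored neighbours, as with 'src' in the functions above.
static void symmColumnScalar(const float** rows, const float* ky, float* dst,
                             float delta, int width, int ksize2)
{
    for (int i = 0; i < width; i++)
    {
        float s = delta + ky[0] * rows[0][i];
        for (int k = 1; k <= ksize2; k++)
            s += ky[k] * (rows[k][i] + rows[-k][i]);   // fold mirrored rows first
        dst[i] = s;
    }
}

static void asymmColumnScalar(const float** rows, const float* ky, float* dst,
                              float delta, int width, int ksize2)
{
    for (int i = 0; i < width; i++)
    {
        float s = delta;                               // centre tap skipped, as above
        for (int k = 1; k <= ksize2; k++)
            s += ky[k] * (rows[k][i] - rows[-k][i]);
        dst[i] = s;
    }
}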

modules/imgproc/src/filter.cpp

Lines changed: 17 additions & 109 deletions
@@ -44,6 +44,8 @@
 #include "opencv2/core/opencl/ocl_defs.hpp"
 #include "opencl_kernels_imgproc.hpp"
 #include "hal_replacement.hpp"
+#include "filter.hpp"
+
 
 /****************************************************************************************\
                                      Base Image Filter
@@ -1356,7 +1358,7 @@ struct RowVec_32f
     RowVec_32f()
     {
         haveSSE = checkHardwareSupport(CV_CPU_SSE);
-        haveAVX2 = checkHardwareSupport(CV_CPU_AVX2);
+        haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
 #if defined USE_IPP_SEP_FILTERS
         bufsz = -1;
 #endif
@@ -1366,7 +1368,7 @@ struct RowVec_32f
     {
         kernel = _kernel;
         haveSSE = checkHardwareSupport(CV_CPU_SSE);
-        haveAVX2 = checkHardwareSupport(CV_CPU_AVX2);
+        haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
 #if defined USE_IPP_SEP_FILTERS
         bufsz = -1;
 #endif
@@ -1393,28 +1395,9 @@ struct RowVec_32f
         int i = 0, k;
         width *= cn;
 
-#if CV_AVX2
-        if ( haveAVX2 )
-        {
-            for( ; i <= width - 8; i += 8 )
-            {
-                const float* src = src0 + i;
-                __m256 f, x0;
-                __m256 s0 = _mm256_set1_ps(0.0f);
-                for( k = 0; k < _ksize; k++, src += cn )
-                {
-                    f = _mm256_set1_ps(_kx[k]);
-                    x0 = _mm256_loadu_ps(src);
-#if CV_FMA3
-                    s0 = _mm256_fmadd_ps(x0, f, s0);
-#else
-                    s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
-#endif
-                }
-                _mm256_storeu_ps(dst + i, s0);
-            }
-            return i;
-        }
+#if CV_TRY_AVX2
+        if (haveAVX2)
+            return RowVec_32f_AVX(src0, _kx, dst, width, cn, _ksize);
 #endif
         for( ; i <= width - 8; i += 8 )
         {
@@ -1679,7 +1662,7 @@ struct SymmColumnVec_32f
     SymmColumnVec_32f() {
         symmetryType=0;
         haveSSE = checkHardwareSupport(CV_CPU_SSE);
-        haveAVX2 = checkHardwareSupport(CV_CPU_AVX2);
+        haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
         delta = 0;
     }
     SymmColumnVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
@@ -1688,7 +1671,7 @@ struct SymmColumnVec_32f
         kernel = _kernel;
         delta = (float)_delta;
         haveSSE = checkHardwareSupport(CV_CPU_SSE);
-        haveAVX2 = checkHardwareSupport(CV_CPU_AVX2);
+        haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
     }
 
@@ -1704,61 +1687,15 @@ struct SymmColumnVec_32f
         const float** src = (const float**)_src;
         const float *S, *S2;
         float* dst = (float*)_dst;
-        const __m128 d4 = _mm_set1_ps(delta);
-#if CV_AVX2
-        const __m256 d8 = _mm256_set1_ps(delta);
-#endif
 
         if( symmetrical )
         {
 
-#if CV_AVX2
-            if ( haveAVX2 )
-            {
-                for( ; i <= width - 16; i += 16 )
-                {
-                    __m256 f = _mm256_set1_ps(ky[0]);
-                    __m256 s0, s1;
-                    __m256 x0;
-                    S = src[0] + i;
-                    s0 = _mm256_loadu_ps(S);
-#if CV_FMA3
-                    s0 = _mm256_fmadd_ps(s0, f, d8);
-#else
-                    s0 = _mm256_add_ps(_mm256_mul_ps(s0, f), d8);
-#endif
-                    s1 = _mm256_loadu_ps(S+8);
-#if CV_FMA3
-                    s1 = _mm256_fmadd_ps(s1, f, d8);
-#else
-                    s1 = _mm256_add_ps(_mm256_mul_ps(s1, f), d8);
-#endif
-
-                    for( k = 1; k <= ksize2; k++ )
-                    {
-                        S = src[k] + i;
-                        S2 = src[-k] + i;
-                        f = _mm256_set1_ps(ky[k]);
-                        x0 = _mm256_add_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2));
-#if CV_FMA3
-                        s0 = _mm256_fmadd_ps(x0, f, s0);
-#else
-                        s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
-#endif
-                        x0 = _mm256_add_ps(_mm256_loadu_ps(S+8), _mm256_loadu_ps(S2+8));
-#if CV_FMA3
-                        s1 = _mm256_fmadd_ps(x0, f, s1);
-#else
-                        s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f));
-#endif
-                    }
-
-                    _mm256_storeu_ps(dst + i, s0);
-                    _mm256_storeu_ps(dst + i + 8, s1);
-                }
-                _mm256_zeroupper();
-            }
+#if CV_TRY_AVX2
+            if (haveAVX2)
+                return SymmColumnVec_32f_Symm_AVX(src, ky, dst, delta, width, ksize2);
 #endif
+            const __m128 d4 = _mm_set1_ps(delta);
             for( ; i <= width - 16; i += 16 )
             {
                 __m128 f = _mm_set1_ps(ky[0]);
@@ -1815,40 +1752,11 @@ if ( haveAVX2 )
         }
         else
         {
-#if CV_AVX2
-            if ( haveAVX2 )
-            {
-                for( ; i <= width - 16; i += 16 )
-                {
-                    __m256 f, s0 = d8, s1 = d8;
-                    __m256 x0;
-                    S = src[0] + i;
-
-                    for( k = 1; k <= ksize2; k++ )
-                    {
-                        S = src[k] + i;
-                        S2 = src[-k] + i;
-                        f = _mm256_set1_ps(ky[k]);
-                        x0 = _mm256_sub_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2));
-#if CV_FMA3
-                        s0 = _mm256_fmadd_ps(x0, f, s0);
-#else
-                        s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
-#endif
-                        x0 = _mm256_sub_ps(_mm256_loadu_ps(S+8), _mm256_loadu_ps(S2+8));
-#if CV_FMA3
-                        s1 = _mm256_fmadd_ps(x0, f, s1);
-#else
-                        s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f));
-#endif
-                    }
-
-                    _mm256_storeu_ps(dst + i, s0);
-                    _mm256_storeu_ps(dst + i + 8, s1);
-                }
-                _mm256_zeroupper();
-            }
+#if CV_TRY_AVX2
+            if (haveAVX2)
+                return SymmColumnVec_32f_Unsymm_AVX(src, ky, dst, delta, width, ksize2);
 #endif
+            const __m128 d4 = _mm_set1_ps(delta);
             for( ; i <= width - 16; i += 16 )
             {
                 __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
