@@ -1354,12 +1354,14 @@ struct RowVec_32f
     RowVec_32f()
     {
         haveSSE = checkHardwareSupport(CV_CPU_SSE);
+        haveAVX2 = checkHardwareSupport(CV_CPU_AVX2);
     }
 
     RowVec_32f( const Mat& _kernel )
     {
         kernel = _kernel;
         haveSSE = checkHardwareSupport(CV_CPU_SSE);
+        haveAVX2 = checkHardwareSupport(CV_CPU_AVX2);
 #if defined USE_IPP_SEP_FILTERS
         bufsz = -1;
 #endif
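Both constructors cache the checkHardwareSupport() results in member flags, so operator() can branch on a plain bool instead of re-querying CPU features for every image row. A minimal sketch of this cache-then-dispatch pattern, with hypothetical helper bodies that are not part of the patch:

    #include <opencv2/core/utility.hpp>   // cv::checkHardwareSupport, CV_CPU_*

    struct RowFilterSketch
    {
        bool haveSSE, haveAVX2;

        RowFilterSketch()
        {
            // Query the CPU once at construction time.
            haveSSE  = cv::checkHardwareSupport(CV_CPU_SSE);
            haveAVX2 = cv::checkHardwareSupport(CV_CPU_AVX2);
        }

        int operator()(const float* src, float* dst, int width) const
        {
            if( haveAVX2 )     { /* 8-lane AVX2 loop */ }
            else if( haveSSE ) { /* 4-lane SSE loop  */ }
            return 0;          // number of pixels handled by the vector path
        }
    };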
@@ -1386,14 +1388,36 @@ struct RowVec_32f
         int i = 0, k;
         width *= cn;
 
+#if CV_AVX2
+        if ( haveAVX2 )
+        {
+            for( ; i <= width - 8; i += 8 )
+            {
+                const float* src = src0 + i;
+                __m256 f, x0;
+                __m256 s0 = _mm256_set1_ps(0.0f);
+                for( k = 0; k < _ksize; k++, src += cn )
+                {
+                    f = _mm256_set1_ps(_kx[k]);
+                    x0 = _mm256_loadu_ps(src);
+#if CV_FMA3
+                    s0 = _mm256_fmadd_ps(x0, f, s0);
+#else
+                    s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
+#endif
+                }
+                _mm256_storeu_ps(dst + i, s0);
+            }
+            return i;
+        }
+#endif
         for( ; i <= width - 8; i += 8 )
         {
             const float* src = src0 + i;
             __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
             for( k = 0; k < _ksize; k++, src += cn )
             {
-                f = _mm_load_ss(_kx+k);
-                f = _mm_shuffle_ps(f, f, 0);
+                f = _mm_set1_ps(_kx[k]);
 
                 x0 = _mm_loadu_ps(src);
                 x1 = _mm_loadu_ps(src + 4);
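The AVX2 branch accumulates eight floats per iteration in a single __m256 register (the SSE loop below needs two __m128 registers for the same width) and returns i so the scalar caller only finishes the tail. Each #if CV_FMA3 pair selects between a fused multiply-add and a separate multiply plus add; a minimal standalone sketch of that accumulation step, using the compiler's __FMA__ macro as a stand-in for CV_FMA3:

    #include <immintrin.h>

    // acc += x * coeff over 8 packed floats. With FMA this is one fused instruction;
    // without it the product is rounded before the add, which the fallback accepts.
    static inline __m256 mul_add8(__m256 x, __m256 coeff, __m256 acc)
    {
    #if defined(__FMA__)
        return _mm256_fmadd_ps(x, coeff, acc);
    #else
        return _mm256_add_ps(acc, _mm256_mul_ps(x, coeff));
    #endif
    }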
@@ -1408,6 +1432,7 @@ struct RowVec_32f
 
     Mat kernel;
     bool haveSSE;
+    bool haveAVX2;
 #if defined USE_IPP_SEP_FILTERS
 private:
     mutable int bufsz;
@@ -1646,18 +1671,24 @@ struct SymmRowSmallVec_32f
 
 struct SymmColumnVec_32f
 {
-    SymmColumnVec_32f() { symmetryType=0; }
+    SymmColumnVec_32f() {
+        symmetryType=0;
+        haveSSE = checkHardwareSupport(CV_CPU_SSE);
+        haveAVX2 = checkHardwareSupport(CV_CPU_AVX2);
+    }
     SymmColumnVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
     {
         symmetryType = _symmetryType;
         kernel = _kernel;
         delta = (float)_delta;
+        haveSSE = checkHardwareSupport(CV_CPU_SSE);
+        haveAVX2 = checkHardwareSupport(CV_CPU_AVX2);
         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
     }
 
     int operator()(const uchar** _src, uchar* _dst, int width) const
     {
-        if( !checkHardwareSupport(CV_CPU_SSE) )
+        if( !haveSSE )
             return 0;
 
         int ksize2 = (kernel.rows + kernel.cols - 1)/2;
@@ -1667,14 +1698,64 @@ struct SymmColumnVec_32f
         const float** src = (const float**)_src;
         const float *S, *S2;
         float* dst = (float*)_dst;
-        __m128 d4 = _mm_set1_ps(delta);
+        const __m128 d4 = _mm_set1_ps(delta);
+#if CV_AVX2
+        const __m256 d8 = _mm256_set1_ps(delta);
+#endif
 
         if( symmetrical )
         {
+
+#if CV_AVX2
+            if ( haveAVX2 )
+            {
             for( ; i <= width - 16; i += 16 )
             {
-                __m128 f = _mm_load_ss(ky);
-                f = _mm_shuffle_ps(f, f, 0);
+                __m256 f = _mm256_set1_ps(ky[0]);
+                __m256 s0, s1;
+                __m256 x0;
+                S = src[0] + i;
+                s0 = _mm256_loadu_ps(S);
+#if CV_FMA3
+                s0 = _mm256_fmadd_ps(s0, f, d8);
+#else
+                s0 = _mm256_add_ps(_mm256_mul_ps(s0, f), d8);
+#endif
+                s1 = _mm256_loadu_ps(S+8);
+#if CV_FMA3
+                s1 = _mm256_fmadd_ps(s1, f, d8);
+#else
+                s1 = _mm256_add_ps(_mm256_mul_ps(s1, f), d8);
+#endif
+
+                for( k = 1; k <= ksize2; k++ )
+                {
+                    S = src[k] + i;
+                    S2 = src[-k] + i;
+                    f = _mm256_set1_ps(ky[k]);
+                    x0 = _mm256_add_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2));
+#if CV_FMA3
+                    s0 = _mm256_fmadd_ps(x0, f, s0);
+#else
+                    s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
+#endif
+                    x0 = _mm256_add_ps(_mm256_loadu_ps(S+8), _mm256_loadu_ps(S2+8));
+#if CV_FMA3
+                    s1 = _mm256_fmadd_ps(x0, f, s1);
+#else
+                    s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f));
+#endif
+                }
+
+                _mm256_storeu_ps(dst + i, s0);
+                _mm256_storeu_ps(dst + i + 8, s1);
+            }
+            _mm256_zeroupper();
+            }
+#endif
+            for( ; i <= width - 16; i += 16 )
+            {
+                __m128 f = _mm_set1_ps(ky[0]);
                 __m128 s0, s1, s2, s3;
                 __m128 x0, x1;
                 S = src[0] + i;
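Each AVX2 loop ends with _mm256_zeroupper() before execution falls through to the existing 128-bit SSE loops; clearing the upper YMM halves avoids the AVX-to-SSE transition penalty on CPUs that track a dirty upper register state. A minimal sketch of the same pattern outside the filter code (illustrative function, not from the patch):

    #include <immintrin.h>

    void avx_then_sse(const float* a, const float* b, float* out)
    {
        // 256-bit work first...
        __m256 v = _mm256_add_ps(_mm256_loadu_ps(a), _mm256_loadu_ps(b));
        _mm256_storeu_ps(out, v);
        _mm256_zeroupper();   // clear the upper YMM state before legacy SSE code

        // ...then the 128-bit SSE tail runs without a transition stall.
        __m128 w = _mm_add_ps(_mm_loadu_ps(a + 8), _mm_loadu_ps(b + 8));
        _mm_storeu_ps(out + 8, w);
    }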
@@ -1691,8 +1772,7 @@ struct SymmColumnVec_32f
                 {
                     S = src[k] + i;
                     S2 = src[-k] + i;
-                    f = _mm_load_ss(ky+k);
-                    f = _mm_shuffle_ps(f, f, 0);
+                    f = _mm_set1_ps(ky[k]);
                     x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
                     x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
@@ -1711,15 +1791,13 @@ struct SymmColumnVec_32f
 
             for( ; i <= width - 4; i += 4 )
             {
-                __m128 f = _mm_load_ss(ky);
-                f = _mm_shuffle_ps(f, f, 0);
+                __m128 f = _mm_set1_ps(ky[0]);
                 __m128 x0, s0 = _mm_load_ps(src[0] + i);
                 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
 
                 for( k = 1; k <= ksize2; k++ )
                 {
-                    f = _mm_load_ss(ky+k);
-                    f = _mm_shuffle_ps(f, f, 0);
+                    f = _mm_set1_ps(ky[k]);
                     S = src[k] + i;
                     S2 = src[-k] + i;
                     x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
@@ -1731,6 +1809,40 @@ struct SymmColumnVec_32f
             }
             else
             {
+#if CV_AVX2
+            if ( haveAVX2 )
+            {
+                for( ; i <= width - 16; i += 16 )
+                {
+                    __m256 f, s0 = d8, s1 = d8;
+                    __m256 x0;
+                    S = src[0] + i;
+
+                    for( k = 1; k <= ksize2; k++ )
+                    {
+                        S = src[k] + i;
+                        S2 = src[-k] + i;
+                        f = _mm256_set1_ps(ky[k]);
+                        x0 = _mm256_sub_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2));
+#if CV_FMA3
+                        s0 = _mm256_fmadd_ps(x0, f, s0);
+#else
+                        s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
+#endif
+                        x0 = _mm256_sub_ps(_mm256_loadu_ps(S+8), _mm256_loadu_ps(S2+8));
+#if CV_FMA3
+                        s1 = _mm256_fmadd_ps(x0, f, s1);
+#else
+                        s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f));
+#endif
+                    }
+
+                    _mm256_storeu_ps(dst + i, s0);
+                    _mm256_storeu_ps(dst + i + 8, s1);
+                }
+                _mm256_zeroupper();
+            }
+#endif
             for( ; i <= width - 16; i += 16 )
             {
                 __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
@@ -1741,8 +1853,7 @@ struct SymmColumnVec_32f
                 {
                     S = src[k] + i;
                     S2 = src[-k] + i;
-                    f = _mm_load_ss(ky+k);
-                    f = _mm_shuffle_ps(f, f, 0);
+                    f = _mm_set1_ps(ky[k]);
                     x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
                     x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
@@ -1765,8 +1876,7 @@ struct SymmColumnVec_32f
 
                 for( k = 1; k <= ksize2; k++ )
                 {
-                    f = _mm_load_ss(ky+k);
-                    f = _mm_shuffle_ps(f, f, 0);
+                    f = _mm_set1_ps(ky[k]);
                     x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                 }
@@ -1781,6 +1891,8 @@ struct SymmColumnVec_32f
     int symmetryType;
     float delta;
     Mat kernel;
+    bool haveSSE;
+    bool haveAVX2;
 };
 
 
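Independently of the AVX2 additions, the SSE paths replace every _mm_load_ss(ky+k) / _mm_shuffle_ps(f, f, 0) pair with _mm_set1_ps(ky[k]). Both forms broadcast one kernel coefficient to all four lanes; the single intrinsic states the intent directly and leaves the instruction choice to the compiler. A small sketch of the equivalence (helper names are illustrative only):

    #include <xmmintrin.h>

    static inline __m128 broadcast_old(const float* p)
    {
        __m128 f = _mm_load_ss(p);         // { *p, 0, 0, 0 }
        return _mm_shuffle_ps(f, f, 0);    // { *p, *p, *p, *p }
    }

    static inline __m128 broadcast_new(const float* p)
    {
        return _mm_set1_ps(*p);            // same four-lane broadcast
    }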