Skip to content

Commit 92a3dbe

Browse files
committed
more ICV_HLINE optimization
Added a 64-bit optimization for the 3-channel case. Did not add a 64-bit optimization for the 4-channel case, since timings did not show any improvement. Split the ICV_HLINE cases into inline functions instead of a macro to reduce code size, without a significant speed drawback at first sight.
1 parent afbcc07 commit 92a3dbe

File tree

1 file changed

+226
-9
lines changed

1 file changed

+226
-9
lines changed

modules/imgproc/src/drawing.cpp

Lines changed: 226 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1097,6 +1097,7 @@ static const int opencvOne = 1;
10971097
# include <intrin.h>
10981098
# pragma intrinsic(_byteswap_ushort)
10991099
# pragma intrinsic(_byteswap_ulong)
1100+
# pragma intrinsic(_byteswap_uint64)
11001101
# pragma intrinsic(_ReadWriteBarrier)
11011102
# else
11021103
# include <cmnintrin.h>
@@ -1134,20 +1135,20 @@ static inline uint32_t opencvBigToHost32(uint32_t x){
11341135
/* Reads the four bytes at p, which hold a 32-bit value stored least
   significant byte first, and returns that value in host byte order.
   p does not have to be aligned: bytes are fetched with memcpy or one at a
   time, never through a cast pointer. */
static inline uint32_t opencvLittleToHost32(const uchar* p){
#if OPENCV_BYTEORDER==1234
    /* host is little-endian: the in-memory layout already is the value */
    uint32_t x;
    memcpy(&x,p,sizeof(x));
    return x;
#elif OPENCV_BYTEORDER==4321 && defined(__GNUC__)
    uint32_t x;
    memcpy(&x,p,sizeof(x));
    return __builtin_bswap32(x);
#elif OPENCV_BYTEORDER==4321 && defined(_MSC_VER) && _MSC_VER>=1300
    uint32_t x;
    memcpy(&x,p,sizeof(x));
    return _byteswap_ulong(x);
#elif OPENCV_LITTLEENDIAN
    /* this branch used to return an undeclared variable */
    uint32_t x;
    memcpy(&x,p,sizeof(x));
    return x;
#else
    /* Byte order unknown: assemble the value arithmetically, p[0] being the
       least significant byte. Every operand is widened to uint32_t before
       shifting so that no shift can overflow a signed int. */
    return  (uint32_t)p[0]
         | ((uint32_t)p[1]<<8)
         | ((uint32_t)p[2]<<16)
         | ((uint32_t)p[3]<<24);
#endif
}
11531154

@@ -1159,7 +1160,33 @@ static inline uint32_t opencvLittleToHost32(uint32_t x){
11591160
#endif
11601161
}
11611162

1163+
/* Reads the eight bytes at p, which hold a 64-bit value stored least
   significant byte first, and returns that value in host byte order.
   p does not have to be aligned: bytes are fetched with memcpy or one at a
   time, never through a cast pointer. */
static inline uint64_t opencvLittleToHost64(const uchar* p){
#if OPENCV_BYTEORDER==1234
    /* host is little-endian: the in-memory layout already is the value */
    uint64_t x;
    memcpy(&x,p,sizeof(x));
    return x;
#elif OPENCV_BYTEORDER==4321 && defined(__GNUC__)
    uint64_t x;
    memcpy(&x,p,sizeof(x));
    return __builtin_bswap64(x);
#elif OPENCV_BYTEORDER==4321 && defined(_MSC_VER) && _MSC_VER>=1300
    uint64_t x;
    memcpy(&x,p,sizeof(x));
    return _byteswap_uint64(x);
#elif OPENCV_LITTLEENDIAN
    /* this branch used to return an undeclared variable */
    uint64_t x;
    memcpy(&x,p,sizeof(x));
    return x;
#else
    /* Byte order unknown: assemble the value arithmetically, p[0] being the
       least significant byte. Each byte is widened to uint64_t first; shifting
       the promoted int by 32 or more bits would be undefined. */
    return  (uint64_t)p[0]
         | ((uint64_t)p[1]<<8)
         | ((uint64_t)p[2]<<16)
         | ((uint64_t)p[3]<<24)
         | ((uint64_t)p[4]<<32)
         | ((uint64_t)p[5]<<40)
         | ((uint64_t)p[6]<<48)
         | ((uint64_t)p[7]<<56);
#endif
}
11621182

1183+
static inline uint64_t opencvLittleToHost64(uint64_t x){
1184+
#if OPENCV_LITTLEENDIAN
1185+
return x;
1186+
#else
1187+
return opencvLittleToHost64((uchar*)&x);
1188+
#endif
1189+
}
11631190

11641191
/* helper macros: filling horizontal row */
11651192
#define is_aligned(POINTER, BYTE_COUNT) (((uintptr_t)(const void *)(POINTER)) % (BYTE_COUNT) == 0)
@@ -1179,6 +1206,38 @@ static inline uint32_t opencvLittleToHost32(uint32_t x){
11791206
} \
11801207
}*/
11811208

1209+
/*
1210+
template <unsigned pix_size_forced>
1211+
static inline void icv_hline_impl(uchar* ptr, size_t xl, size_t xr, const uchar* color, unsigned pix_size_)
1212+
{
1213+
const unsigned pix_size = pix_size_forced ? pix_size_forced : pix_size_;
1214+
1215+
uchar* hline_ptr = ptr + xl*pix_size;
1216+
uchar* hline_max_ptr = ptr + xr*pix_size;
1217+
1218+
for ( ; hline_ptr <= hline_max_ptr; hline_ptr += pix_size)
1219+
{
1220+
for (unsigned c = 0; c < pix_size; c++)
1221+
{
1222+
hline_ptr[c] = color[c];
1223+
}
1224+
}
1225+
}
1226+
1227+
#define ICV_HLINE( ptr, xl, xr, color, pix_size ) \
1228+
{ \
1229+
if (pix_size == 1) \
1230+
icv_hline_impl<1>((uchar*)ptr, (xl), (xr), (const uchar*)color,pix_size); \
1231+
else if (pix_size == 3) \
1232+
icv_hline_impl<3>((uchar*)ptr, (xl), (xr), (const uchar*)color, pix_size); \
1233+
else if (pix_size == 4) \
1234+
icv_hline_impl<4>((uchar*)ptr, (xl), (xr), (const uchar*)color, pix_size); \
1235+
else \
1236+
icv_hline_impl<0>((uchar*)ptr, (xl), (xr), (const uchar*)color, pix_size); \
1237+
}
1238+
*/
1239+
1240+
/*
11821241
#define ICV_HLINE( ptr, xl, xr, color, pix_size ) \
11831242
if((pix_size) == 1) \
11841243
{ \
@@ -1192,9 +1251,36 @@ else if((pix_size) == 3) \
11921251
{ \
11931252
uchar* hline_ptr = (uchar*)(ptr) + (xl)*3; \
11941253
uchar* hline_end = (uchar*)(ptr) + (xr+1)*3; \
1254+
uchar* hbody24_start = std::min(hline_end, (uchar*)(24*(((uintptr_t)(hline_ptr)+23)/24))); \
1255+
uchar* hbody24_end = std::min(hline_end, (uchar*)(24*(((uintptr_t)(hline_end))/24))); \
11951256
uchar* hbody12_start = std::min(hline_end, (uchar*)(12*(((uintptr_t)(hline_ptr)+11)/12))); \
1196-
uchar* hbody12_end = std::min(hline_end, (uchar*)(12*(((uintptr_t)(hline_end))/12))); \
1197-
if ((hbody12_start < hbody12_end)) \
1257+
uchar* hbody12_end = std::min(hline_end, (uchar*)(12*(((uintptr_t)(hline_end))/12))); \
1258+
if (hbody24_start < hbody24_end) \
1259+
{ \
1260+
int offset = ((uintptr_t)(hbody24_start-hline_ptr))%3; \
1261+
uint64_t c4[3]; \
1262+
uchar* ptrC4 = reinterpret_cast<uchar*>(&c4); \
1263+
ptrC4[0] = ((uchar*)(color))[(offset++)%3]; \
1264+
ptrC4[1] = ((uchar*)(color))[(offset++)%3]; \
1265+
ptrC4[2] = ((uchar*)(color))[(offset++)%3]; \
1266+
memcpy(&ptrC4[3], &ptrC4[0], 3); \
1267+
memcpy(&ptrC4[6], &ptrC4[0], 6); \
1268+
memcpy(&ptrC4[12], &ptrC4[0], 12); \
1269+
c4[0] = opencvLittleToHost64(c4[0]); \
1270+
c4[1] = opencvLittleToHost64(c4[1]); \
1271+
c4[2] = opencvLittleToHost64(c4[2]); \
1272+
for(offset = 0 ; hline_ptr < hbody24_start; offset = (offset+1)%3)\
1273+
*hline_ptr++ = ((uchar*)(color))[offset]; \
1274+
for(uint64_t* ptr64 = reinterpret_cast<uint64_t*>(hbody24_start), *ptr64End = reinterpret_cast<uint64_t*>(hbody24_end) ; ptr64<ptr64End ; ) \
1275+
{ \
1276+
*ptr64++ = c4[0]; \
1277+
*ptr64++ = c4[1]; \
1278+
*ptr64++ = c4[2]; \
1279+
} \
1280+
for(offset = ((uintptr_t)(hbody24_end-(uchar*)(ptr)))%3, hline_ptr = hbody24_end ; hline_ptr < hline_end ; offset = (offset+1)%3) \
1281+
*hline_ptr++ = ((uchar*)(color))[offset]; \
1282+
} \
1283+
else if (hbody12_start < hbody12_end) \
11981284
{ \
11991285
int offset = ((uintptr_t)(hbody12_start-hline_ptr))%3; \
12001286
uint32_t c4[3]; \
@@ -1233,9 +1319,8 @@ else if(((pix_size) == 4) && is_aligned(((uchar*)(ptr) + (xl)*4), 0x4)) \
12331319
uint32_t c = opencvLittleToHost32((uchar*)(color)); \
12341320
uint32_t* hline_ptr = (uint32_t*)(ptr) + xl; \
12351321
uint32_t* hline_max_ptr = (uint32_t*)(ptr) + xr; \
1236-
\
1237-
for( ; hline_ptr <= hline_max_ptr; ++hline_ptr ) \
1238-
*hline_ptr = c; \
1322+
for( ; hline_ptr <= hline_max_ptr; ) \
1323+
*hline_ptr++ = c; \
12391324
} \
12401325
else \
12411326
{ \
@@ -1251,6 +1336,138 @@ else \
12511336
} \
12521337
} \
12531338
}
1339+
*/
1340+
1341+
/* Generic horizontal-line fill for an arbitrary pixel size.
   ptr      - start of the image row
   xl, xr   - first and last pixel index to fill, both inclusive;
              nothing is written when xr < xl
   color    - pix_size bytes holding one pixel
   pix_size - bytes per pixel; a non-positive value is ignored

   Each pixel receives exactly pix_size bytes. The previous version always
   copied 4 bytes, which wrote past the pixel (and past the row end) for
   sizes below 4 and left the upper bytes untouched for sizes above 4. */
static inline void ICV_HLINE_0(uchar* ptr, int xl, int xr, const uchar* color, int pix_size)
{
    /* a zero step would never leave the loop below */
    if (pix_size <= 0)
        return;

    uchar* hline_ptr = ptr + xl*pix_size;
    uchar* const hline_max_ptr = ptr + xr*pix_size;
    for( ; hline_ptr <= hline_max_ptr; hline_ptr += pix_size)
        memcpy(hline_ptr, color, (size_t)pix_size);
}
1354+
//end ICV_HLINE_0()
1355+
1356+
/* Horizontal-line fill for single-byte pixels.
   ptr    - start of the image row
   xl, xr - first and last pixel index to fill, both inclusive
   color  - pointer to the byte value to write

   An empty span (xr < xl) writes nothing. Without the guard, a span with
   xr < xl-1 yields a negative length that memset receives as a huge size_t. */
static inline void ICV_HLINE_1(uchar* ptr, int xl, int xr, const uchar* color)
{
    if (xr < xl)
        return;

    memset(ptr + xl, *color, (size_t)(xr - xl) + 1);
}
1363+
1364+
/* Horizontal-line fill for 3-byte pixels.
   ptr    - start of the image row
   xl, xr - first and last pixel index to fill, both inclusive;
            nothing is written when xr < xl
   color  - three bytes holding one pixel

   Strategy: write single bytes until the destination is 8-byte aligned, then
   copy a pre-built 24-byte pattern (eight pixels, three 64-bit words) as long
   as a whole pattern fits, then finish with the leading part of the pattern.

   The pattern is laid out in memory order and copied with memcpy, so the
   bytes that reach the image are the colour bytes on any host byte order.
   The previous version passed the pattern words through a little-endian to
   host conversion before storing them, which reverses the bytes on
   big-endian hosts, and stored them through cast uint64_t / uint32_t
   pointers. */
static inline void ICV_HLINE_3(uchar* ptr, int xl, int xr, const uchar* color)
{
    if (xr < xl)
        return;

    uchar* hline_ptr = ptr + xl*3;
    size_t remaining = (size_t)(xr - xl + 1)*3;

    /* bytes needed to reach the next 8-byte boundary, capped at the span */
    size_t head = (size_t)((8 - (uintptr_t)hline_ptr % 8) % 8);
    if (head > remaining)
        head = remaining;
    remaining -= head;

    /* channel index of the next byte to write; the span starts on channel 0 */
    size_t phase = 0;
    for ( ; head > 0; --head)
    {
        *hline_ptr++ = color[phase];
        phase = (phase + 1) % 3;
    }

    /* eight pixels' worth of bytes, rotated so that it starts at `phase` */
    uchar pattern[24];
    for (size_t i = 0; i < sizeof(pattern); i++)
        pattern[i] = color[(phase + i) % 3];

    /* 24 is a multiple of 3, so the channel phase is the same after every
       full pattern and the same buffer can be reused unchanged */
    for ( ; remaining >= sizeof(pattern); remaining -= sizeof(pattern))
    {
        memcpy(hline_ptr, pattern, sizeof(pattern));
        hline_ptr += sizeof(pattern);
    }

    /* tail: fewer than 24 bytes left, taken from the front of the pattern */
    memcpy(hline_ptr, pattern, remaining);
}
1431+
//end ICV_HLINE_3()
1432+
1433+
/* Horizontal-line fill for 4-byte pixels.
   ptr    - start of the image row
   xl, xr - first and last pixel index to fill, both inclusive;
            nothing is written when xr < xl
   color  - four bytes holding one pixel

   When the first destination pixel is 4-byte aligned, the colour is loaded
   once into a 32-bit word and written with word stores; otherwise each pixel
   is copied byte-wise with memcpy.

   The word is loaded with memcpy, i.e. in native byte order, so storing it
   reproduces the colour bytes exactly. Loading it as a little-endian value
   instead reverses the channel order on big-endian hosts. */
static inline void ICV_HLINE_4(uchar* ptr, int xl, int xr, const uchar* color)
{
    uchar* hline_ptr = ptr + xl*4;
    uchar* const hline_max_ptr = ptr + xr*4;

    if ((uintptr_t)(const void*)hline_ptr % sizeof(uint32_t) == 0)
    {
        uint32_t c;
        memcpy(&c, color, sizeof(c));

        /* the row base is aligned as well, because xl*4 is a multiple of 4 */
        uint32_t* hline_ptr32 = (uint32_t*)(ptr) + xl;
        uint32_t* const hline_max_ptr32 = (uint32_t*)(ptr) + xr;
        while (hline_ptr32 <= hline_max_ptr32)
            *hline_ptr32++ = c;
    }
    else
    {
        for ( ; hline_ptr <= hline_max_ptr; hline_ptr += 4)
            memcpy(hline_ptr, color, 4);
    }
}
1457+
//end ICV_HLINE_4()
1458+
1459+
/* Fills pixels xl..xr (inclusive) of the row starting at ptr with the pixel
   value stored at color, dispatching on the pixel size in bytes: sizes 1, 3
   and 4 go to their dedicated routines, every other size to the generic
   ICV_HLINE_0. */
static inline void ICV_HLINE(uchar* ptr, int xl, int xr, const void* color, int pix_size)
{
    const uchar* const color_bytes = (const uchar*)color;

    switch (pix_size)
    {
    case 1:
        ICV_HLINE_1(ptr, xl, xr, color_bytes);
        break;
    case 3:
        ICV_HLINE_3(ptr, xl, xr, color_bytes);
        break;
    case 4:
        ICV_HLINE_4(ptr, xl, xr, color_bytes);
        break;
    default:
        ICV_HLINE_0(ptr, xl, xr, color_bytes, pix_size);
        break;
    }
}
1470+
//end ICV_HLINE()
12541471

12551472
/* filling convex polygon. v - array of vertices, ntps - number of points */
12561473
static void

0 commit comments

Comments
 (0)