Skip to content

Commit 6cb7335

Browse files
committed
laplacian ocl kernel optimization
This ocl kernel is 46%~171% faster than current laplacian 3x3 ocl kernel in the perf test, with image format "CV_8UC1". Signed-off-by: Li Peng <peng.li@intel.com>
1 parent c93fb14 commit 6cb7335

File tree

3 files changed

+248
-22
lines changed

3 files changed

+248
-22
lines changed

modules/imgproc/src/deriv.cpp

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -807,8 +807,57 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst,
807807
return k.run(2, globalsize, NULL, false);
808808
}
809809

810+
static bool ocl_Laplacian3_8UC1(InputArray _src, OutputArray _dst, int ddepth,
811+
InputArray _kernel, double delta, int borderType)
812+
{
813+
const ocl::Device & dev = ocl::Device::getDefault();
814+
int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
815+
816+
if ( !(dev.isIntel() && (type == CV_8UC1) && (ddepth == CV_8U) &&
817+
(borderType != BORDER_WRAP) &&
818+
(_src.offset() == 0) && (_src.step() % 4 == 0) &&
819+
(_src.cols() % 16 == 0) && (_src.rows() % 2 == 0)) )
820+
return false;
821+
822+
Mat kernel = _kernel.getMat().reshape(1, 1);
823+
824+
if (ddepth < 0)
825+
ddepth = sdepth;
826+
827+
Size size = _src.size();
828+
size_t globalsize[2] = { 0, 0 };
829+
size_t localsize[2] = { 0, 0 };
830+
831+
globalsize[0] = size.width / 16;
832+
globalsize[1] = size.height / 2;
833+
834+
const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" };
835+
char build_opts[1024];
836+
sprintf(build_opts, "-D %s %s", borderMap[borderType],
837+
ocl::kernelToStr(kernel, CV_32F, "KERNEL_MATRIX").c_str());
838+
839+
ocl::Kernel k("laplacian3_8UC1_cols16_rows2", cv::ocl::imgproc::laplacian3_oclsrc, build_opts);
840+
if (k.empty())
841+
return false;
842+
843+
UMat src = _src.getUMat();
844+
_dst.create(size, CV_MAKETYPE(ddepth, cn));
845+
if (!(_dst.offset() == 0 && _dst.step() % 4 == 0))
846+
return false;
847+
UMat dst = _dst.getUMat();
848+
849+
int idxArg = k.set(0, ocl::KernelArg::PtrReadOnly(src));
850+
idxArg = k.set(idxArg, (int)src.step);
851+
idxArg = k.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst));
852+
idxArg = k.set(idxArg, (int)dst.step);
853+
idxArg = k.set(idxArg, (int)dst.rows);
854+
idxArg = k.set(idxArg, (int)dst.cols);
855+
idxArg = k.set(idxArg, static_cast<float>(delta));
856+
857+
return k.run(2, globalsize, (localsize[0] == 0) ? NULL : localsize, false);
810858
}
811859

860+
}
812861
#endif
813862

814863
#if defined(HAVE_IPP)
@@ -892,6 +941,22 @@ void cv::Laplacian( InputArray _src, OutputArray _dst, int ddepth, int ksize,
892941
ddepth = sdepth;
893942
_dst.create( _src.size(), CV_MAKETYPE(ddepth, cn) );
894943

944+
if( ksize == 1 || ksize == 3 )
945+
{
946+
float K[2][9] =
947+
{
948+
{ 0, 1, 0, 1, -4, 1, 0, 1, 0 },
949+
{ 2, 0, 2, 0, -8, 0, 2, 0, 2 }
950+
};
951+
952+
Mat kernel(3, 3, CV_32F, K[ksize == 3]);
953+
if( scale != 1 )
954+
kernel *= scale;
955+
956+
CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
957+
ocl_Laplacian3_8UC1(_src, _dst, ddepth, kernel, delta, borderType));
958+
}
959+
895960
CV_IPP_RUN((ksize == 3 || ksize == 5) && ((borderType & BORDER_ISOLATED) != 0 || !_src.isSubmatrix()) &&
896961
((stype == CV_8UC1 && ddepth == CV_16S) || (ddepth == CV_32F && stype == CV_32FC1)) && (!cv::ocl::useOpenCL()),
897962
ipp_Laplacian(_src, _dst, ddepth, ksize, scale, delta, borderType));
@@ -920,6 +985,7 @@ void cv::Laplacian( InputArray _src, OutputArray _dst, int ddepth, int ksize,
920985
Mat kernel(3, 3, CV_32F, K[ksize == 3]);
921986
if( scale != 1 )
922987
kernel *= scale;
988+
923989
filter2D( _src, _dst, ddepth, kernel, Point(-1, -1), delta, borderType );
924990
}
925991
else
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
// This file is part of OpenCV project.
2+
// It is subject to the license terms in the LICENSE file found in the top-level directory
3+
// of this distribution and at http://opencv.org/license.html.
4+
5+
#define DIG(a) a,
6+
__constant float kx[] = { KERNEL_MATRIX };
7+
8+
#define OP(delta, x) (convert_float16(arr[delta + x]) * kx[x])
9+
10+
__kernel void laplacian3_8UC1_cols16_rows2(__global const uint* src, int src_step,
11+
__global uint* dst, int dst_step,
12+
int rows, int cols, float delta)
13+
{
14+
int block_x = get_global_id(0);
15+
int y = get_global_id(1) * 2;
16+
int ssx, dsx;
17+
18+
if ((block_x * 16) >= cols || y >= rows) return;
19+
20+
uint4 line[4];
21+
uint4 line_out[2];
22+
uchar a; uchar16 b; uchar c;
23+
uchar d; uchar16 e; uchar f;
24+
uchar g; uchar16 h; uchar i;
25+
uchar j; uchar16 k; uchar l;
26+
27+
ssx = dsx = 1;
28+
int src_index = block_x * 4 * ssx + (y - 1) * (src_step / 4);
29+
line[1] = vload4(0, src + src_index + (src_step / 4));
30+
line[2] = vload4(0, src + src_index + 2 * (src_step / 4));
31+
32+
#ifdef BORDER_CONSTANT
33+
line[0] = (y == 0) ? (uint4)0 : vload4(0, src + src_index);
34+
line[3] = (y == (rows - 2)) ? (uint4)0 : vload4(0, src + src_index + 3 * (src_step / 4));
35+
#elif defined BORDER_REFLECT_101
36+
line[0] = (y == 0) ? line[2] : vload4(0, src + src_index);
37+
line[3] = (y == (rows - 2)) ? line[1] : vload4(0, src + src_index + 3 * (src_step / 4));
38+
#elif defined (BORDER_REPLICATE) || defined(BORDER_REFLECT)
39+
line[0] = (y == 0) ? line[1] : vload4(0, src + src_index);
40+
line[3] = (y == (rows - 2)) ? line[2] : vload4(0, src + src_index + 3 * (src_step / 4));
41+
#endif
42+
43+
__global uchar *src_p = (__global uchar *)src;
44+
45+
src_index = block_x * 16 * ssx + (y - 1) * src_step;
46+
bool line_end = ((block_x + 1) * 16 == cols);
47+
48+
b = as_uchar16(line[0]);
49+
e = as_uchar16(line[1]);
50+
h = as_uchar16(line[2]);
51+
k = as_uchar16(line[3]);
52+
53+
#ifdef BORDER_CONSTANT
54+
a = (block_x == 0 || y == 0) ? 0 : src_p[src_index - 1];
55+
c = (line_end || y == 0) ? 0 : src_p[src_index + 16];
56+
57+
d = (block_x == 0) ? 0 : src_p[src_index + src_step - 1];
58+
f = line_end ? 0 : src_p[src_index + src_step + 16];
59+
60+
g = (block_x == 0) ? 0 : src_p[src_index + 2 * src_step - 1];
61+
i = line_end ? 0 : src_p[src_index + 2 * src_step + 16];
62+
63+
j = (block_x == 0 || y == (rows - 2)) ? 0 : src_p[src_index + 3 * src_step - 1];
64+
l = (line_end || y == (rows - 2))? 0 : src_p[src_index + 3 * src_step + 16];
65+
66+
#elif defined BORDER_REFLECT_101
67+
int offset;
68+
offset = (y == 0) ? (2 * src_step) : 0;
69+
70+
a = (block_x == 0) ? src_p[src_index + offset + 1] : src_p[src_index + offset - 1];
71+
c = line_end ? src_p[src_index + offset + 14] : src_p[src_index + offset + 16];
72+
73+
d = (block_x == 0) ? src_p[src_index + src_step + 1] : src_p[src_index + src_step - 1];
74+
f = line_end ? src_p[src_index + src_step + 14] : src_p[src_index + src_step + 16];
75+
76+
g = (block_x == 0) ? src_p[src_index + 2 * src_step + 1] : src_p[src_index + 2 * src_step - 1];
77+
i = line_end ? src_p[src_index + 2 * src_step + 14] : src_p[src_index + 2 * src_step + 16];
78+
79+
offset = (y == (rows - 2)) ? (1 * src_step) : (3 * src_step);
80+
81+
j = (block_x == 0) ? src_p[src_index + offset + 1] : src_p[src_index + offset - 1];
82+
l = line_end ? src_p[src_index + offset + 14] : src_p[src_index + offset + 16];
83+
84+
#elif defined (BORDER_REPLICATE) || defined(BORDER_REFLECT)
85+
int offset;
86+
offset = (y == 0) ? (1 * src_step) : 0;
87+
88+
a = (block_x == 0) ? src_p[src_index + offset] : src_p[src_index + offset - 1];
89+
c = line_end ? src_p[src_index + offset + 15] : src_p[src_index + offset + 16];
90+
91+
d = (block_x == 0) ? src_p[src_index + src_step] : src_p[src_index + src_step - 1];
92+
f = line_end ? src_p[src_index + src_step + 15] : src_p[src_index + src_step + 16];
93+
94+
g = (block_x == 0) ? src_p[src_index + 2 * src_step] : src_p[src_index + 2 * src_step - 1];
95+
i = line_end ? src_p[src_index + 2 * src_step + 15] : src_p[src_index + 2 * src_step + 16];
96+
97+
offset = (y == (rows - 2)) ? (2 * src_step) : (3 * src_step);
98+
99+
j = (block_x == 0) ? src_p[src_index + offset] : src_p[src_index + offset - 1];
100+
l = line_end ? src_p[src_index + offset + 15] : src_p[src_index + offset + 16];
101+
102+
#endif
103+
104+
uchar16 arr[12];
105+
float16 sum[2];
106+
107+
arr[0] = (uchar16)(a, b.s0123, b.s456789ab, b.scde);
108+
arr[1] = b;
109+
arr[2] = (uchar16)(b.s123, b.s4567, b.s89abcdef, c);
110+
arr[3] = (uchar16)(d, e.s0123, e.s456789ab, e.scde);
111+
arr[4] = e;
112+
arr[5] = (uchar16)(e.s123, e.s4567, e.s89abcdef, f);
113+
arr[6] = (uchar16)(g, h.s0123, h.s456789ab, h.scde);
114+
arr[7] = h;
115+
arr[8] = (uchar16)(h.s123, h.s4567, h.s89abcdef, i);
116+
arr[9] = (uchar16)(j, k.s0123, k.s456789ab, k.scde);
117+
arr[10] = k;
118+
arr[11] = (uchar16)(k.s123, k.s4567, k.s89abcdef, l);
119+
120+
sum[0] = OP(0, 0) + OP(0, 1) + OP(0, 2) +
121+
OP(0, 3) + OP(0, 4) + OP(0, 5) +
122+
OP(0, 6) + OP(0, 7) + OP(0, 8);
123+
124+
sum[1] = OP(3, 0) + OP(3, 1) + OP(3, 2) +
125+
OP(3, 3) + OP(3, 4) + OP(3, 5) +
126+
OP(3, 6) + OP(3, 7) + OP(3, 8);
127+
128+
line_out[0] = as_uint4(convert_uchar16_sat_rte(sum[0] + delta));
129+
line_out[1] = as_uint4(convert_uchar16_sat_rte(sum[1] + delta));
130+
131+
int dst_index = block_x * 4 * dsx + y * (dst_step / 4);
132+
vstore4(line_out[0], 0, dst + dst_index);
133+
vstore4(line_out[1], 0, dst + dst_index + (dst_step / 4));
134+
}

modules/imgproc/test/ocl/test_filters.cpp

Lines changed: 48 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -165,28 +165,6 @@ OCL_TEST_P(LaplacianTest, Accuracy)
165165
}
166166
}
167167

168-
169-
/////////////////////////////////////////////////////////////////////////////////////////////////
170-
// Sobel
171-
172-
typedef FilterTestBase SobelTest;
173-
174-
OCL_TEST_P(SobelTest, Mat)
175-
{
176-
int dx = size.width, dy = size.height;
177-
double scale = param;
178-
179-
for (int j = 0; j < test_loop_times; j++)
180-
{
181-
random_roi();
182-
183-
OCL_OFF(cv::Sobel(src_roi, dst_roi, -1, dx, dy, ksize, scale, /* delta */0, borderType));
184-
OCL_ON(cv::Sobel(usrc_roi, udst_roi, -1, dx, dy, ksize, scale, /* delta */0, borderType));
185-
186-
Near();
187-
}
188-
}
189-
190168
PARAM_TEST_CASE(Deriv3x3_cols16_rows2_Base, MatType,
191169
int, // kernel size
192170
Size, // dx, dy
@@ -247,6 +225,45 @@ PARAM_TEST_CASE(Deriv3x3_cols16_rows2_Base, MatType,
247225
}
248226
};
249227

228+
typedef Deriv3x3_cols16_rows2_Base Laplacian3_cols16_rows2;
229+
230+
OCL_TEST_P(Laplacian3_cols16_rows2, Accuracy)
231+
{
232+
double scale = param;
233+
234+
for (int j = 0; j < test_loop_times; j++)
235+
{
236+
random_roi();
237+
238+
OCL_OFF(cv::Laplacian(src_roi, dst_roi, -1, ksize, scale, 10, borderType));
239+
OCL_ON(cv::Laplacian(usrc_roi, udst_roi, -1, ksize, scale, 10, borderType));
240+
241+
Near();
242+
}
243+
}
244+
245+
246+
/////////////////////////////////////////////////////////////////////////////////////////////////
247+
// Sobel
248+
249+
typedef FilterTestBase SobelTest;
250+
251+
OCL_TEST_P(SobelTest, Mat)
252+
{
253+
int dx = size.width, dy = size.height;
254+
double scale = param;
255+
256+
for (int j = 0; j < test_loop_times; j++)
257+
{
258+
random_roi();
259+
260+
OCL_OFF(cv::Sobel(src_roi, dst_roi, -1, dx, dy, ksize, scale, /* delta */0, borderType));
261+
OCL_ON(cv::Sobel(usrc_roi, udst_roi, -1, dx, dy, ksize, scale, /* delta */0, borderType));
262+
263+
Near();
264+
}
265+
}
266+
250267
typedef Deriv3x3_cols16_rows2_Base Sobel3x3_cols16_rows2;
251268

252269
OCL_TEST_P(Sobel3x3_cols16_rows2, Mat)
@@ -639,6 +656,15 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, LaplacianTest, Combine(
639656
Bool(),
640657
Values(1))); // not used
641658

659+
OCL_INSTANTIATE_TEST_CASE_P(Filter, Laplacian3_cols16_rows2, Combine(
660+
Values((MatType)CV_8UC1),
661+
Values(3), // kernel size
662+
Values(Size(0, 0)), // not used
663+
FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
664+
Values(1.0, 0.2, 3.0), // kernel scale
665+
Bool(),
666+
Values(1))); // not used
667+
642668
OCL_INSTANTIATE_TEST_CASE_P(Filter, SobelTest, Combine(
643669
FILTER_TYPES,
644670
Values(3, 5), // kernel size

0 commit comments

Comments
 (0)