Skip to content

Commit af8fc3e

Browse files
committed
Merge pull request opencv#9604 from alalek:ocl_kernel_profiling
2 parents 2246759 + e5890fc commit af8fc3e

File tree

3 files changed

+152
-26
lines changed

3 files changed

+152
-26
lines changed

modules/core/include/opencv2/core/ocl.hpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -333,8 +333,12 @@ class CV_EXPORTS Queue
333333
void* ptr() const;
334334
static Queue& getDefault();
335335

336+
/// @brief Returns OpenCL command queue with enable profiling mode support
337+
const Queue& getProfilingQueue() const;
338+
339+
struct Impl; friend struct Impl;
340+
inline Impl* getImpl() const { return p; }
336341
protected:
337-
struct Impl;
338342
Impl* p;
339343
};
340344

@@ -569,6 +573,12 @@ class CV_EXPORTS Kernel
569573
size_t localsize[], bool sync, const Queue& q=Queue());
570574
bool runTask(bool sync, const Queue& q=Queue());
571575

576+
/** @brief Similar to synchronized run() call with returning of kernel execution time
577+
* Separate OpenCL command queue may be used (with CL_QUEUE_PROFILING_ENABLE)
578+
* @return Execution time in nanoseconds or negative number on error
579+
*/
580+
int64 runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q=Queue());
581+
572582
size_t workGroupSize() const;
573583
size_t preferedWorkGroupSizeMultiple() const;
574584
bool compileWorkGroupSize(size_t wsz[]) const;

modules/core/src/ocl.cpp

Lines changed: 120 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1840,9 +1840,35 @@ void initializeContextFromHandle(Context& ctx, void* platform, void* _context, v
18401840

18411841
struct Queue::Impl
18421842
{
1843-
Impl(const Context& c, const Device& d)
1843+
inline void __init()
18441844
{
18451845
refcount = 1;
1846+
handle = 0;
1847+
isProfilingQueue_ = false;
1848+
}
1849+
1850+
Impl(cl_command_queue q)
1851+
{
1852+
__init();
1853+
handle = q;
1854+
1855+
cl_command_queue_properties props = 0;
1856+
cl_int result = clGetCommandQueueInfo(handle, CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &props, NULL);
1857+
CV_Assert(result && "clGetCommandQueueInfo(CL_QUEUE_PROPERTIES)");
1858+
isProfilingQueue_ = !!(props & CL_QUEUE_PROFILING_ENABLE);
1859+
}
1860+
1861+
Impl(cl_command_queue q, bool isProfilingQueue)
1862+
{
1863+
__init();
1864+
handle = q;
1865+
isProfilingQueue_ = isProfilingQueue;
1866+
}
1867+
1868+
Impl(const Context& c, const Device& d, bool withProfiling = false)
1869+
{
1870+
__init();
1871+
18461872
const Context* pc = &c;
18471873
cl_context ch = (cl_context)pc->ptr();
18481874
if( !ch )
@@ -1854,8 +1880,10 @@ struct Queue::Impl
18541880
if( !dh )
18551881
dh = (cl_device_id)pc->device(0).ptr();
18561882
cl_int retval = 0;
1857-
handle = clCreateCommandQueue(ch, dh, 0, &retval);
1883+
cl_command_queue_properties props = withProfiling ? CL_QUEUE_PROFILING_ENABLE : 0;
1884+
handle = clCreateCommandQueue(ch, dh, props, &retval);
18581885
CV_OclDbgAssert(retval == CL_SUCCESS);
1886+
isProfilingQueue_ = withProfiling;
18591887
}
18601888

18611889
~Impl()
@@ -1873,9 +1901,37 @@ struct Queue::Impl
18731901
}
18741902
}
18751903

1904+
const cv::ocl::Queue& getProfilingQueue(const cv::ocl::Queue& self)
1905+
{
1906+
if (isProfilingQueue_)
1907+
return self;
1908+
1909+
if (profiling_queue_.ptr())
1910+
return profiling_queue_;
1911+
1912+
cl_context ctx = 0;
1913+
CV_Assert(CL_SUCCESS == clGetCommandQueueInfo(handle, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, NULL));
1914+
1915+
cl_device_id device = 0;
1916+
CV_Assert(CL_SUCCESS == clGetCommandQueueInfo(handle, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL));
1917+
1918+
cl_int result = CL_SUCCESS;
1919+
cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE;
1920+
cl_command_queue q = clCreateCommandQueue(ctx, device, props, &result);
1921+
CV_Assert(result == CL_SUCCESS && "clCreateCommandQueue(with CL_QUEUE_PROFILING_ENABLE)");
1922+
1923+
Queue queue;
1924+
queue.p = new Impl(q, true);
1925+
profiling_queue_ = queue;
1926+
1927+
return profiling_queue_;
1928+
}
1929+
18761930
IMPLEMENT_REFCOUNTABLE();
18771931

18781932
cl_command_queue handle;
1933+
bool isProfilingQueue_;
1934+
cv::ocl::Queue profiling_queue_;
18791935
};
18801936

18811937
Queue::Queue()
@@ -1929,6 +1985,12 @@ void Queue::finish()
19291985
}
19301986
}
19311987

1988+
const Queue& Queue::getProfilingQueue() const
1989+
{
1990+
CV_Assert(p);
1991+
return p->getProfilingQueue(*this);
1992+
}
1993+
19321994
void* Queue::ptr() const
19331995
{
19341996
return p ? p->handle : 0;
@@ -2032,6 +2094,9 @@ struct Kernel::Impl
20322094
release();
20332095
}
20342096

2097+
bool run(int dims, size_t _globalsize[], size_t _localsize[],
2098+
bool sync, int64* timeNS, const Queue& q);
2099+
20352100
~Impl()
20362101
{
20372102
if(handle)
@@ -2259,19 +2324,15 @@ int Kernel::set(int i, const KernelArg& arg)
22592324
return i+1;
22602325
}
22612326

2262-
22632327
bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
22642328
bool sync, const Queue& q)
22652329
{
2266-
CV_INSTRUMENT_REGION_OPENCL_RUN(p->name.c_str());
2267-
2268-
if(!p || !p->handle || p->isInProgress)
2330+
if (!p)
22692331
return false;
22702332

2271-
cl_command_queue qq = getQueue(q);
22722333
size_t globalsize[CV_MAX_DIM] = {1,1,1};
22732334
size_t total = 1;
2274-
CV_Assert(_globalsize != 0);
2335+
CV_Assert(_globalsize != NULL);
22752336
for (int i = 0; i < dims; i++)
22762337
{
22772338
size_t val = _localsize ? _localsize[i] :
@@ -2283,29 +2344,60 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
22832344
globalsize[i] = divUp(_globalsize[i], (unsigned int)val) * val;
22842345
}
22852346
CV_Assert(total > 0);
2286-
if( p->haveTempDstUMats )
2347+
2348+
return p->run(dims, globalsize, _localsize, sync, NULL, q);
2349+
}
2350+
2351+
2352+
bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[],
2353+
bool sync, int64* timeNS, const Queue& q)
2354+
{
2355+
CV_INSTRUMENT_REGION_OPENCL_RUN(p->name.c_str());
2356+
2357+
if (!handle || isInProgress)
2358+
return false;
2359+
2360+
cl_command_queue qq = getQueue(q);
2361+
if (haveTempDstUMats)
2362+
sync = true;
2363+
if (timeNS)
22872364
sync = true;
22882365
cl_event asyncEvent = 0;
2289-
cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
2290-
NULL, globalsize, _localsize, 0, 0,
2291-
sync ? 0 : &asyncEvent);
2366+
cl_int retval = clEnqueueNDRangeKernel(qq, handle, (cl_uint)dims,
2367+
NULL, globalsize, localsize, 0, 0,
2368+
(sync && !timeNS) ? 0 : &asyncEvent);
22922369
#if CV_OPENCL_SHOW_RUN_ERRORS
22932370
if (retval != CL_SUCCESS)
22942371
{
22952372
printf("OpenCL program returns error: %d\n", retval);
22962373
fflush(stdout);
22972374
}
22982375
#endif
2299-
if( sync || retval != CL_SUCCESS )
2376+
if (sync || retval != CL_SUCCESS)
23002377
{
23012378
CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
2302-
p->cleanupUMats();
2379+
if (timeNS)
2380+
{
2381+
if (retval == CL_SUCCESS)
2382+
{
2383+
clWaitForEvents(1, &asyncEvent);
2384+
cl_ulong startTime, stopTime;
2385+
CV_Assert(CL_SUCCESS == clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL));
2386+
CV_Assert(CL_SUCCESS == clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_END, sizeof(stopTime), &stopTime, NULL));
2387+
*timeNS = (int64)(stopTime - startTime);
2388+
}
2389+
else
2390+
{
2391+
*timeNS = -1;
2392+
}
2393+
}
2394+
cleanupUMats();
23032395
}
23042396
else
23052397
{
2306-
p->addref();
2307-
p->isInProgress = true;
2308-
CV_OclDbgAssert(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
2398+
addref();
2399+
isInProgress = true;
2400+
CV_OclDbgAssert(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, this) == CL_SUCCESS);
23092401
}
23102402
if (asyncEvent)
23112403
clReleaseEvent(asyncEvent);
@@ -2336,6 +2428,17 @@ bool Kernel::runTask(bool sync, const Queue& q)
23362428
return retval == CL_SUCCESS;
23372429
}
23382430

2431+
int64 Kernel::runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q_)
2432+
{
2433+
CV_Assert(p && p->handle && !p->isInProgress);
2434+
Queue q = q_.ptr() ? q_ : Queue::getDefault();
2435+
CV_Assert(q.ptr());
2436+
q.finish(); // call clFinish() on base queue
2437+
Queue profilingQueue = q.getProfilingQueue();
2438+
int64 timeNs = -1;
2439+
bool res = p->run(dims, globalsize, localsize, true, &timeNs, profilingQueue);
2440+
return res ? timeNs : -1;
2441+
}
23392442

23402443
size_t Kernel::workGroupSize() const
23412444
{

modules/core/src/ocl_deprecated.hpp

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -759,15 +759,15 @@ OCL_FUNC_P(cl_mem, clCreateBuffer,
759759

760760
/*
761761
OCL_FUNC(cl_int, clRetainCommandQueue, (cl_command_queue command_queue), (command_queue))
762-
762+
*/
763763
OCL_FUNC(cl_int, clGetCommandQueueInfo,
764-
(cl_command_queue command_queue,
765-
cl_command_queue_info param_name,
766-
size_t param_value_size,
767-
void * param_value,
768-
size_t * param_value_size_ret),
769-
(command_queue, param_name, param_value_size, param_value, param_value_size_ret))
770-
764+
(cl_command_queue command_queue,
765+
cl_command_queue_info param_name,
766+
size_t param_value_size,
767+
void * param_value,
768+
size_t * param_value_size_ret),
769+
(command_queue, param_name, param_value_size, param_value, param_value_size_ret))
770+
/*
771771
OCL_FUNC_P(cl_mem, clCreateSubBuffer,
772772
(cl_mem buffer,
773773
cl_mem_flags flags,
@@ -1202,6 +1202,19 @@ OCL_FUNC(cl_int, clSetEventCallback,
12021202

12031203
OCL_FUNC(cl_int, clReleaseEvent, (cl_event event), (event))
12041204

1205+
OCL_FUNC(cl_int, clWaitForEvents,
1206+
(cl_uint num_events, const cl_event *event_list),
1207+
(num_events, event_list))
1208+
1209+
1210+
OCL_FUNC(cl_int, clGetEventProfilingInfo, (
1211+
cl_event event,
1212+
cl_profiling_info param_name,
1213+
size_t param_value_size,
1214+
void *param_value,
1215+
size_t *param_value_size_ret),
1216+
(event, param_name, param_value_size, param_value, param_value_size_ret))
1217+
12051218
}
12061219

12071220
#endif

0 commit comments

Comments
 (0)