@@ -1840,9 +1840,35 @@ void initializeContextFromHandle(Context& ctx, void* platform, void* _context, void* _device)

struct Queue::Impl
{
-    Impl(const Context& c, const Device& d)
+    inline void __init()
    {
        refcount = 1;
+        handle = 0;
+        isProfilingQueue_ = false;
+    }
+
+    Impl(cl_command_queue q)
+    {
+        __init();
+        handle = q;
+
+        cl_command_queue_properties props = 0;
+        cl_int result = clGetCommandQueueInfo(handle, CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &props, NULL);
+        CV_Assert(result == CL_SUCCESS && "clGetCommandQueueInfo(CL_QUEUE_PROPERTIES)");
+        isProfilingQueue_ = !!(props & CL_QUEUE_PROFILING_ENABLE);
+    }
+
+    Impl(cl_command_queue q, bool isProfilingQueue)
+    {
+        __init();
+        handle = q;
+        isProfilingQueue_ = isProfilingQueue;
+    }
+
+    Impl(const Context& c, const Device& d, bool withProfiling = false)
+    {
+        __init();
+
        const Context* pc = &c;
        cl_context ch = (cl_context)pc->ptr();
        if( !ch )
@@ -1854,8 +1880,10 @@ struct Queue::Impl
        if( !dh )
            dh = (cl_device_id)pc->device(0).ptr();
        cl_int retval = 0;
-        handle = clCreateCommandQueue(ch, dh, 0, &retval);
+        cl_command_queue_properties props = withProfiling ? CL_QUEUE_PROFILING_ENABLE : 0;
+        handle = clCreateCommandQueue(ch, dh, props, &retval);
        CV_OclDbgAssert(retval == CL_SUCCESS);
+        isProfilingQueue_ = withProfiling;
    }

    ~Impl()
@@ -1873,9 +1901,37 @@ struct Queue::Impl
        }
    }

+    const cv::ocl::Queue& getProfilingQueue(const cv::ocl::Queue& self)
+    {
+        if (isProfilingQueue_)
+            return self;
+
+        if (profiling_queue_.ptr())
+            return profiling_queue_;
+
+        cl_context ctx = 0;
+        CV_Assert(CL_SUCCESS == clGetCommandQueueInfo(handle, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, NULL));
+
+        cl_device_id device = 0;
+        CV_Assert(CL_SUCCESS == clGetCommandQueueInfo(handle, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL));
+
+        cl_int result = CL_SUCCESS;
+        cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE;
+        cl_command_queue q = clCreateCommandQueue(ctx, device, props, &result);
+        CV_Assert(result == CL_SUCCESS && "clCreateCommandQueue(with CL_QUEUE_PROFILING_ENABLE)");
+
+        Queue queue;
+        queue.p = new Impl(q, true);
+        profiling_queue_ = queue;
+
+        return profiling_queue_;
+    }
+
    IMPLEMENT_REFCOUNTABLE();

    cl_command_queue handle;
+    bool isProfilingQueue_;
+    cv::ocl::Queue profiling_queue_;
};

Queue::Queue()
@@ -1929,6 +1985,12 @@ void Queue::finish()
    }
}

+const Queue& Queue::getProfilingQueue() const
+{
+    CV_Assert(p);
+    return p->getProfilingQueue(*this);
+}
+
void* Queue::ptr() const
{
    return p ? p->handle : 0;
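Note: `Queue::getProfilingQueue()` lazily creates a second command queue on the same context and device with `CL_QUEUE_PROFILING_ENABLE` set, and caches it in `Queue::Impl`, so repeated profiling runs reuse one queue. A minimal caller-side sketch (not part of this patch):

    cv::ocl::Queue base = cv::ocl::Queue::getDefault();
    const cv::ocl::Queue& prof = base.getProfilingQueue();   // created on first call, then cached
    cl_command_queue raw = (cl_command_queue)prof.ptr();     // raw handle, usable with clEnqueue* + event profiling
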
@@ -2032,6 +2094,9 @@ struct Kernel::Impl
        release();
    }

+    bool run(int dims, size_t _globalsize[], size_t _localsize[],
+             bool sync, int64* timeNS, const Queue& q);
+
    ~Impl()
    {
        if (handle)
@@ -2259,19 +2324,15 @@ int Kernel::set(int i, const KernelArg& arg)
    return i+1;
}

-
bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
                 bool sync, const Queue& q)
{
-    CV_INSTRUMENT_REGION_OPENCL_RUN(p->name.c_str());
-
-    if (!p || !p->handle || p->isInProgress)
+    if (!p)
        return false;

-    cl_command_queue qq = getQueue(q);
    size_t globalsize[CV_MAX_DIM] = {1,1,1};
    size_t total = 1;
-    CV_Assert(_globalsize != 0);
+    CV_Assert(_globalsize != NULL);
    for (int i = 0; i < dims; i++)
    {
        size_t val = _localsize ? _localsize[i] :
@@ -2283,29 +2344,60 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
        globalsize[i] = divUp(_globalsize[i], (unsigned int)val) * val;
    }
    CV_Assert(total > 0);
-    if( p->haveTempDstUMats )
+
+    return p->run(dims, globalsize, _localsize, sync, NULL, q);
+}
+
+
+bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[],
+                       bool sync, int64* timeNS, const Queue& q)
+{
+    CV_INSTRUMENT_REGION_OPENCL_RUN(name.c_str());
+
+    if (!handle || isInProgress)
+        return false;
+
+    cl_command_queue qq = getQueue(q);
+    if (haveTempDstUMats)
+        sync = true;
+    if (timeNS)
        sync = true;
    cl_event asyncEvent = 0;
-    cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
-                                           NULL, globalsize, _localsize, 0, 0,
-                                           sync ? 0 : &asyncEvent);
+    cl_int retval = clEnqueueNDRangeKernel(qq, handle, (cl_uint)dims,
+                                           NULL, globalsize, localsize, 0, 0,
+                                           (sync && !timeNS) ? 0 : &asyncEvent);
#if CV_OPENCL_SHOW_RUN_ERRORS
    if (retval != CL_SUCCESS)
    {
        printf("OpenCL program returns error: %d\n", retval);
        fflush(stdout);
    }
#endif
-    if( sync || retval != CL_SUCCESS )
+    if (sync || retval != CL_SUCCESS)
    {
        CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
-        p->cleanupUMats();
+        if (timeNS)
+        {
+            if (retval == CL_SUCCESS)
+            {
+                clWaitForEvents(1, &asyncEvent);
+                cl_ulong startTime, stopTime;
+                CV_Assert(CL_SUCCESS == clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL));
+                CV_Assert(CL_SUCCESS == clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_END, sizeof(stopTime), &stopTime, NULL));
+                *timeNS = (int64)(stopTime - startTime);
+            }
+            else
+            {
+                *timeNS = -1;
+            }
+        }
+        cleanupUMats();
    }
    else
    {
-        p->addref();
-        p->isInProgress = true;
-        CV_OclDbgAssert(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
+        addref();
+        isInProgress = true;
+        CV_OclDbgAssert(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, this) == CL_SUCCESS);
    }
    if (asyncEvent)
        clReleaseEvent(asyncEvent);
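When timing is requested (`timeNS != NULL`), `Impl::run` forces a synchronous run, attaches an event to the enqueue, and reads `CL_PROFILING_COMMAND_START`/`END` from that event; their difference is the device-side execution time in nanoseconds. A standalone OpenCL-only sketch of the same pattern (assuming `queue` was created with `CL_QUEUE_PROFILING_ENABLE` and `kernel`/`global` are already set up; not part of this patch):

    cl_event ev = 0;
    cl_int err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, NULL, 0, NULL, &ev);
    if (err == CL_SUCCESS)
    {
        clWaitForEvents(1, &ev);                       // wait for the kernel to finish
        cl_ulong t0 = 0, t1 = 0;
        clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, sizeof(t0), &t0, NULL);
        clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END, sizeof(t1), &t1, NULL);
        int64_t elapsedNs = (int64_t)(t1 - t0);        // device execution time in nanoseconds
        (void)elapsedNs;
    }
    if (ev)
        clReleaseEvent(ev);
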
@@ -2336,6 +2428,17 @@ bool Kernel::runTask(bool sync, const Queue& q)
    return retval == CL_SUCCESS;
}

+int64 Kernel::runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q_)
+{
+    CV_Assert(p && p->handle && !p->isInProgress);
+    Queue q = q_.ptr() ? q_ : Queue::getDefault();
+    CV_Assert(q.ptr());
+    q.finish(); // call clFinish() on base queue
+    Queue profilingQueue = q.getProfilingQueue();
+    int64 timeNs = -1;
+    bool res = p->run(dims, globalsize, localsize, true, &timeNs, profilingQueue);
+    return res ? timeNs : -1;
+}

size_t Kernel::workGroupSize() const
{
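Putting it together, `Kernel::runProfiling()` drains the caller's queue, reruns the kernel synchronously on the cached profiling queue, and returns the measured time in nanoseconds (or -1 on failure). A hypothetical usage sketch (the kernel name, program, and UMats are placeholders, not from this patch):

    cv::ocl::Kernel k("my_kernel", program);           // 'program' is an already-built cv::ocl::Program
    k.args(cv::ocl::KernelArg::ReadOnly(src), cv::ocl::KernelArg::WriteOnly(dst));
    size_t globalsize[2] = { (size_t)dst.cols, (size_t)dst.rows };
    int64 timeNs = k.runProfiling(2, globalsize, NULL, cv::ocl::Queue::getDefault());
    if (timeNs >= 0)
        printf("kernel time: %.3f ms\n", timeNs * 1e-6);
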