@@ -2094,6 +2094,9 @@ struct Kernel::Impl
2094
2094
release ();
2095
2095
}
2096
2096
2097
+ bool run (int dims, size_t _globalsize[], size_t _localsize[],
2098
+ bool sync, int64* timeNS, const Queue& q);
2099
+
2097
2100
~Impl ()
2098
2101
{
2099
2102
if (handle)
@@ -2321,19 +2324,15 @@ int Kernel::set(int i, const KernelArg& arg)
2321
2324
return i+1 ;
2322
2325
}
2323
2326
2324
-
2325
2327
bool Kernel::run (int dims, size_t _globalsize[], size_t _localsize[],
2326
2328
bool sync, const Queue& q)
2327
2329
{
2328
- CV_INSTRUMENT_REGION_OPENCL_RUN (p->name .c_str ());
2329
-
2330
- if (!p || !p->handle || p->isInProgress )
2330
+ if (!p)
2331
2331
return false ;
2332
2332
2333
- cl_command_queue qq = getQueue (q);
2334
2333
size_t globalsize[CV_MAX_DIM] = {1 ,1 ,1 };
2335
2334
size_t total = 1 ;
2336
- CV_Assert (_globalsize != 0 );
2335
+ CV_Assert (_globalsize != NULL );
2337
2336
for (int i = 0 ; i < dims; i++)
2338
2337
{
2339
2338
size_t val = _localsize ? _localsize[i] :
@@ -2345,29 +2344,60 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
2345
2344
globalsize[i] = divUp (_globalsize[i], (unsigned int )val) * val;
2346
2345
}
2347
2346
CV_Assert (total > 0 );
2348
- if ( p->haveTempDstUMats )
2347
+
2348
+ return p->run (dims, globalsize, _localsize, sync, NULL , q);
2349
+ }
2350
+
2351
+
2352
+ bool Kernel::Impl::run (int dims, size_t globalsize[], size_t localsize[],
2353
+ bool sync, int64* timeNS, const Queue& q)
2354
+ {
2355
+ CV_INSTRUMENT_REGION_OPENCL_RUN (p->name .c_str ());
2356
+
2357
+ if (!handle || isInProgress)
2358
+ return false ;
2359
+
2360
+ cl_command_queue qq = getQueue (q);
2361
+ if (haveTempDstUMats)
2362
+ sync = true ;
2363
+ if (timeNS)
2349
2364
sync = true ;
2350
2365
cl_event asyncEvent = 0 ;
2351
- cl_int retval = clEnqueueNDRangeKernel (qq, p-> handle , (cl_uint)dims,
2352
- NULL , globalsize, _localsize , 0 , 0 ,
2353
- sync ? 0 : &asyncEvent);
2366
+ cl_int retval = clEnqueueNDRangeKernel (qq, handle, (cl_uint)dims,
2367
+ NULL , globalsize, localsize , 0 , 0 ,
2368
+ ( sync && !timeNS) ? 0 : &asyncEvent);
2354
2369
#if CV_OPENCL_SHOW_RUN_ERRORS
2355
2370
if (retval != CL_SUCCESS)
2356
2371
{
2357
2372
printf (" OpenCL program returns error: %d\n " , retval);
2358
2373
fflush (stdout);
2359
2374
}
2360
2375
#endif
2361
- if ( sync || retval != CL_SUCCESS )
2376
+ if ( sync || retval != CL_SUCCESS)
2362
2377
{
2363
2378
CV_OclDbgAssert (clFinish (qq) == CL_SUCCESS);
2364
- p->cleanupUMats ();
2379
+ if (timeNS)
2380
+ {
2381
+ if (retval == CL_SUCCESS)
2382
+ {
2383
+ clWaitForEvents (1 , &asyncEvent);
2384
+ cl_ulong startTime, stopTime;
2385
+ CV_Assert (CL_SUCCESS == clGetEventProfilingInfo (asyncEvent, CL_PROFILING_COMMAND_START, sizeof (startTime), &startTime, NULL ));
2386
+ CV_Assert (CL_SUCCESS == clGetEventProfilingInfo (asyncEvent, CL_PROFILING_COMMAND_END, sizeof (stopTime), &stopTime, NULL ));
2387
+ *timeNS = (int64)(stopTime - startTime);
2388
+ }
2389
+ else
2390
+ {
2391
+ *timeNS = -1 ;
2392
+ }
2393
+ }
2394
+ cleanupUMats ();
2365
2395
}
2366
2396
else
2367
2397
{
2368
- p-> addref ();
2369
- p-> isInProgress = true ;
2370
- CV_OclDbgAssert (clSetEventCallback (asyncEvent, CL_COMPLETE, oclCleanupCallback, p ) == CL_SUCCESS);
2398
+ addref ();
2399
+ isInProgress = true ;
2400
+ CV_OclDbgAssert (clSetEventCallback (asyncEvent, CL_COMPLETE, oclCleanupCallback, this ) == CL_SUCCESS);
2371
2401
}
2372
2402
if (asyncEvent)
2373
2403
clReleaseEvent (asyncEvent);
@@ -2398,6 +2428,17 @@ bool Kernel::runTask(bool sync, const Queue& q)
2398
2428
return retval == CL_SUCCESS;
2399
2429
}
2400
2430
2431
+ int64 Kernel::runProfiling (int dims, size_t globalsize[], size_t localsize[], const Queue& q_)
2432
+ {
2433
+ CV_Assert (p && p->handle && !p->isInProgress );
2434
+ Queue q = q_.ptr () ? q_ : Queue::getDefault ();
2435
+ CV_Assert (q.ptr ());
2436
+ q.finish (); // call clFinish() on base queue
2437
+ Queue profilingQueue = q.getProfilingQueue ();
2438
+ int64 timeNs = -1 ;
2439
+ bool res = p->run (dims, globalsize, localsize, true , &timeNs, profilingQueue);
2440
+ return res ? timeNs : -1 ;
2441
+ }
2401
2442
2402
2443
size_t Kernel::workGroupSize () const
2403
2444
{
0 commit comments