Skip to content

Commit 36e8017

Browse files
committed
Merge pull request opencv#8465 from claudiofantacci:enh/cudastreamhog
2 parents 4aa51f6 + 4709b9d commit 36e8017

File tree

2 files changed

+188
-131
lines changed

2 files changed

+188
-131
lines changed

modules/cudaobjdetect/src/cuda/hog.cu

Lines changed: 86 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -98,28 +98,31 @@ namespace cv { namespace cuda { namespace device
9898
}
9999

100100

101-
void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
102-
int nblocks_win_x, int nblocks_win_y, int ncells_block_x, int ncells_block_y)
101+
void set_up_constants(int nbins,
102+
int block_stride_x, int block_stride_y,
103+
int nblocks_win_x, int nblocks_win_y,
104+
int ncells_block_x, int ncells_block_y,
105+
const cudaStream_t& stream)
103106
{
104-
cudaSafeCall( cudaMemcpyToSymbol(cnbins, &nbins, sizeof(nbins)) );
105-
cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_x, &block_stride_x, sizeof(block_stride_x)) );
106-
cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_y, &block_stride_y, sizeof(block_stride_y)) );
107-
cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x)) );
108-
cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y)) );
109-
cudaSafeCall( cudaMemcpyToSymbol(cncells_block_x, &ncells_block_x, sizeof(ncells_block_x)) );
110-
cudaSafeCall( cudaMemcpyToSymbol(cncells_block_y, &ncells_block_y, sizeof(ncells_block_y)) );
107+
cudaSafeCall(cudaMemcpyToSymbolAsync(cnbins, &nbins, sizeof(nbins), 0, cudaMemcpyHostToDevice, stream));
108+
cudaSafeCall(cudaMemcpyToSymbolAsync(cblock_stride_x, &block_stride_x, sizeof(block_stride_x), 0, cudaMemcpyHostToDevice, stream));
109+
cudaSafeCall(cudaMemcpyToSymbolAsync(cblock_stride_y, &block_stride_y, sizeof(block_stride_y), 0, cudaMemcpyHostToDevice, stream));
110+
cudaSafeCall(cudaMemcpyToSymbolAsync(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x), 0, cudaMemcpyHostToDevice, stream));
111+
cudaSafeCall(cudaMemcpyToSymbolAsync(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y), 0, cudaMemcpyHostToDevice, stream));
112+
cudaSafeCall(cudaMemcpyToSymbolAsync(cncells_block_x, &ncells_block_x, sizeof(ncells_block_x), 0, cudaMemcpyHostToDevice, stream));
113+
cudaSafeCall(cudaMemcpyToSymbolAsync(cncells_block_y, &ncells_block_y, sizeof(ncells_block_y), 0, cudaMemcpyHostToDevice, stream));
111114

112115
int block_hist_size = nbins * ncells_block_x * ncells_block_y;
113-
cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size, &block_hist_size, sizeof(block_hist_size)) );
116+
cudaSafeCall(cudaMemcpyToSymbolAsync(cblock_hist_size, &block_hist_size, sizeof(block_hist_size), 0, cudaMemcpyHostToDevice, stream));
114117

115118
int block_hist_size_2up = power_2up(block_hist_size);
116-
cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size_2up, &block_hist_size_2up, sizeof(block_hist_size_2up)) );
119+
cudaSafeCall(cudaMemcpyToSymbolAsync(cblock_hist_size_2up, &block_hist_size_2up, sizeof(block_hist_size_2up), 0, cudaMemcpyHostToDevice, stream));
117120

118121
int descr_width = nblocks_win_x * block_hist_size;
119-
cudaSafeCall( cudaMemcpyToSymbol(cdescr_width, &descr_width, sizeof(descr_width)) );
122+
cudaSafeCall(cudaMemcpyToSymbolAsync(cdescr_width, &descr_width, sizeof(descr_width), 0, cudaMemcpyHostToDevice, stream));
120123

121124
int descr_size = descr_width * nblocks_win_y;
122-
cudaSafeCall( cudaMemcpyToSymbol(cdescr_size, &descr_size, sizeof(descr_size)) );
125+
cudaSafeCall(cudaMemcpyToSymbolAsync(cdescr_size, &descr_size, sizeof(descr_size), 0, cudaMemcpyHostToDevice, stream));
123126
}
124127

125128

@@ -230,10 +233,15 @@ namespace cv { namespace cuda { namespace device
230233
}
231234

232235
//declaration of variables and invoke the kernel with the calculated number of blocks
233-
void compute_hists(int nbins, int block_stride_x, int block_stride_y,
234-
int height, int width, const PtrStepSzf& grad,
235-
const PtrStepSzb& qangle, float sigma, float* block_hists,
236-
int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y)
236+
void compute_hists(int nbins,
237+
int block_stride_x, int block_stride_y,
238+
int height, int width,
239+
const PtrStepSzf& grad, const PtrStepSzb& qangle,
240+
float sigma,
241+
float* block_hists,
242+
int cell_size_x, int cell_size_y,
243+
int ncells_block_x, int ncells_block_y,
244+
const cudaStream_t& stream)
237245
{
238246
const int ncells_block = ncells_block_x * ncells_block_y;
239247
const int patch_side = cell_size_x / 4;
@@ -259,20 +267,15 @@ namespace cv { namespace cuda { namespace device
259267
int final_hists_size = (nbins * ncells_block * nblocks) * sizeof(float);
260268
int smem = hists_size + final_hists_size;
261269
if (nblocks == 4)
262-
compute_hists_kernel_many_blocks<4><<<grid, threads, smem>>>(
263-
img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
270+
compute_hists_kernel_many_blocks<4><<<grid, threads, smem, stream>>>(img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
264271
else if (nblocks == 3)
265-
compute_hists_kernel_many_blocks<3><<<grid, threads, smem>>>(
266-
img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
272+
compute_hists_kernel_many_blocks<3><<<grid, threads, smem, stream>>>(img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
267273
else if (nblocks == 2)
268-
compute_hists_kernel_many_blocks<2><<<grid, threads, smem>>>(
269-
img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
274+
compute_hists_kernel_many_blocks<2><<<grid, threads, smem, stream>>>(img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
270275
else
271-
compute_hists_kernel_many_blocks<1><<<grid, threads, smem>>>(
272-
img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
273-
cudaSafeCall( cudaGetLastError() );
276+
compute_hists_kernel_many_blocks<1><<<grid, threads, smem, stream>>>(img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
274277

275-
cudaSafeCall( cudaDeviceSynchronize() );
278+
cudaSafeCall( cudaGetLastError() );
276279
}
277280

278281

@@ -347,8 +350,14 @@ namespace cv { namespace cuda { namespace device
347350
}
348351

349352

350-
void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
351-
int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y)
353+
void normalize_hists(int nbins,
354+
int block_stride_x, int block_stride_y,
355+
int height, int width,
356+
float* block_hists,
357+
float threshold,
358+
int cell_size_x, int cell_size_y,
359+
int ncells_block_x, int ncells_block_y,
360+
const cudaStream_t& stream)
352361
{
353362
const int nblocks = 1;
354363

@@ -361,21 +370,19 @@ namespace cv { namespace cuda { namespace device
361370
dim3 grid(divUp(img_block_width, nblocks), img_block_height);
362371

363372
if (nthreads == 32)
364-
normalize_hists_kernel_many_blocks<32, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
373+
normalize_hists_kernel_many_blocks<32, nblocks><<<grid, threads, 0, stream>>>(block_hist_size, img_block_width, block_hists, threshold);
365374
else if (nthreads == 64)
366-
normalize_hists_kernel_many_blocks<64, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
375+
normalize_hists_kernel_many_blocks<64, nblocks><<<grid, threads, 0, stream>>>(block_hist_size, img_block_width, block_hists, threshold);
367376
else if (nthreads == 128)
368-
normalize_hists_kernel_many_blocks<128, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
377+
normalize_hists_kernel_many_blocks<128, nblocks><<<grid, threads, 0, stream>>>(block_hist_size, img_block_width, block_hists, threshold);
369378
else if (nthreads == 256)
370-
normalize_hists_kernel_many_blocks<256, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
379+
normalize_hists_kernel_many_blocks<256, nblocks><<<grid, threads, 0, stream>>>(block_hist_size, img_block_width, block_hists, threshold);
371380
else if (nthreads == 512)
372-
normalize_hists_kernel_many_blocks<512, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
381+
normalize_hists_kernel_many_blocks<512, nblocks><<<grid, threads, 0, stream>>>(block_hist_size, img_block_width, block_hists, threshold);
373382
else
374383
CV_Error(cv::Error::StsBadArg, "normalize_hists: histogram's size is too big, try to decrease number of bins");
375384

376385
cudaSafeCall( cudaGetLastError() );
377-
378-
cudaSafeCall( cudaDeviceSynchronize() );
379386
}
380387

381388

@@ -511,8 +518,10 @@ namespace cv { namespace cuda { namespace device
511518

512519

513520
template <int nthreads>
514-
__global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
515-
const float* block_hists, PtrStepf descriptors)
521+
__global__ void extract_descrs_by_rows_kernel(const int img_block_width,
522+
const int win_block_stride_x, const int win_block_stride_y,
523+
const float* block_hists,
524+
PtrStepf descriptors)
516525
{
517526
// Get left top corner of the window in src
518527
const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
@@ -531,8 +540,14 @@ namespace cv { namespace cuda { namespace device
531540
}
532541

533542

534-
void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,
535-
int height, int width, float* block_hists, int cell_size_x, int ncells_block_x, PtrStepSzf descriptors)
543+
void extract_descrs_by_rows(int win_height, int win_width,
544+
int block_stride_y, int block_stride_x,
545+
int win_stride_y, int win_stride_x,
546+
int height, int width,
547+
float* block_hists, int cell_size_x,
548+
int ncells_block_x,
549+
PtrStepSzf descriptors,
550+
const cudaStream_t& stream)
536551
{
537552
const int nthreads = 256;
538553

@@ -544,17 +559,16 @@ namespace cv { namespace cuda { namespace device
544559
dim3 grid(img_win_width, img_win_height);
545560

546561
int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x;
547-
extract_descrs_by_rows_kernel<nthreads><<<grid, threads>>>(
548-
img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
549-
cudaSafeCall( cudaGetLastError() );
562+
extract_descrs_by_rows_kernel<nthreads><<<grid, threads, 0, stream>>>(img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
550563

551-
cudaSafeCall( cudaDeviceSynchronize() );
564+
cudaSafeCall( cudaGetLastError() );
552565
}
553566

554567

555568
template <int nthreads>
556-
__global__ void extract_descrs_by_cols_kernel(const int img_block_width, const int win_block_stride_x,
557-
const int win_block_stride_y, const float* block_hists,
569+
__global__ void extract_descrs_by_cols_kernel(const int img_block_width,
570+
const int win_block_stride_x, const int win_block_stride_y,
571+
const float* block_hists,
558572
PtrStepf descriptors)
559573
{
560574
// Get left top corner of the window in src
@@ -579,9 +593,14 @@ namespace cv { namespace cuda { namespace device
579593
}
580594

581595

582-
void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
583-
int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x,
584-
PtrStepSzf descriptors)
596+
void extract_descrs_by_cols(int win_height, int win_width,
597+
int block_stride_y, int block_stride_x,
598+
int win_stride_y, int win_stride_x,
599+
int height, int width,
600+
float* block_hists,
601+
int cell_size_x, int ncells_block_x,
602+
PtrStepSzf descriptors,
603+
const cudaStream_t& stream)
585604
{
586605
const int nthreads = 256;
587606

@@ -593,11 +612,9 @@ namespace cv { namespace cuda { namespace device
593612
dim3 grid(img_win_width, img_win_height);
594613

595614
int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x;
596-
extract_descrs_by_cols_kernel<nthreads><<<grid, threads>>>(
597-
img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
598-
cudaSafeCall( cudaGetLastError() );
615+
extract_descrs_by_cols_kernel<nthreads><<<grid, threads, 0, stream>>>(img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
599616

600-
cudaSafeCall( cudaDeviceSynchronize() );
617+
cudaSafeCall( cudaGetLastError() );
601618
}
602619

603620
//----------------------------------------------------------------------------
@@ -707,8 +724,12 @@ namespace cv { namespace cuda { namespace device
707724
}
708725

709726

710-
void compute_gradients_8UC4(int nbins, int height, int width, const PtrStepSzb& img,
711-
float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma)
727+
void compute_gradients_8UC4(int nbins,
728+
int height, int width, const PtrStepSzb& img,
729+
float angle_scale,
730+
PtrStepSzf grad, PtrStepSzb qangle,
731+
bool correct_gamma,
732+
const cudaStream_t& stream)
712733
{
713734
(void)nbins;
714735
const int nthreads = 256;
@@ -717,13 +738,11 @@ namespace cv { namespace cuda { namespace device
717738
dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));
718739

719740
if (correct_gamma)
720-
compute_gradients_8UC4_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
741+
compute_gradients_8UC4_kernel<nthreads, 1><<<gdim, bdim, 0, stream>>>(height, width, img, angle_scale, grad, qangle);
721742
else
722-
compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
743+
compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim, 0, stream>>>(height, width, img, angle_scale, grad, qangle);
723744

724745
cudaSafeCall( cudaGetLastError() );
725-
726-
cudaSafeCall( cudaDeviceSynchronize() );
727746
}
728747

729748
template <int nthreads, int correct_gamma>
@@ -780,8 +799,12 @@ namespace cv { namespace cuda { namespace device
780799
}
781800

782801

783-
void compute_gradients_8UC1(int nbins, int height, int width, const PtrStepSzb& img,
784-
float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma)
802+
void compute_gradients_8UC1(int nbins,
803+
int height, int width, const PtrStepSzb& img,
804+
float angle_scale,
805+
PtrStepSzf grad, PtrStepSzb qangle,
806+
bool correct_gamma,
807+
const cudaStream_t& stream)
785808
{
786809
(void)nbins;
787810
const int nthreads = 256;
@@ -790,13 +813,11 @@ namespace cv { namespace cuda { namespace device
790813
dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));
791814

792815
if (correct_gamma)
793-
compute_gradients_8UC1_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
816+
compute_gradients_8UC1_kernel<nthreads, 1><<<gdim, bdim, 0, stream>>>(height, width, img, angle_scale, grad, qangle);
794817
else
795-
compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
818+
compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim, 0, stream>>>(height, width, img, angle_scale, grad, qangle);
796819

797820
cudaSafeCall( cudaGetLastError() );
798-
799-
cudaSafeCall( cudaDeviceSynchronize() );
800821
}
801822

802823

0 commit comments

Comments
 (0)