@@ -98,28 +98,31 @@ namespace cv { namespace cuda { namespace device
     }


-    void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
-                          int nblocks_win_x, int nblocks_win_y, int ncells_block_x, int ncells_block_y)
+    void set_up_constants(int nbins,
+                          int block_stride_x, int block_stride_y,
+                          int nblocks_win_x, int nblocks_win_y,
+                          int ncells_block_x, int ncells_block_y,
+                          const cudaStream_t& stream)
     {
-        cudaSafeCall( cudaMemcpyToSymbol(cnbins, &nbins, sizeof(nbins)) );
-        cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_x, &block_stride_x, sizeof(block_stride_x)) );
-        cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_y, &block_stride_y, sizeof(block_stride_y)) );
-        cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x)) );
-        cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y)) );
-        cudaSafeCall( cudaMemcpyToSymbol(cncells_block_x, &ncells_block_x, sizeof(ncells_block_x)) );
-        cudaSafeCall( cudaMemcpyToSymbol(cncells_block_y, &ncells_block_y, sizeof(ncells_block_y)) );
+        cudaSafeCall(cudaMemcpyToSymbolAsync(cnbins, &nbins, sizeof(nbins), 0, cudaMemcpyHostToDevice, stream));
+        cudaSafeCall(cudaMemcpyToSymbolAsync(cblock_stride_x, &block_stride_x, sizeof(block_stride_x), 0, cudaMemcpyHostToDevice, stream));
+        cudaSafeCall(cudaMemcpyToSymbolAsync(cblock_stride_y, &block_stride_y, sizeof(block_stride_y), 0, cudaMemcpyHostToDevice, stream));
+        cudaSafeCall(cudaMemcpyToSymbolAsync(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x), 0, cudaMemcpyHostToDevice, stream));
+        cudaSafeCall(cudaMemcpyToSymbolAsync(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y), 0, cudaMemcpyHostToDevice, stream));
+        cudaSafeCall(cudaMemcpyToSymbolAsync(cncells_block_x, &ncells_block_x, sizeof(ncells_block_x), 0, cudaMemcpyHostToDevice, stream));
+        cudaSafeCall(cudaMemcpyToSymbolAsync(cncells_block_y, &ncells_block_y, sizeof(ncells_block_y), 0, cudaMemcpyHostToDevice, stream));

         int block_hist_size = nbins * ncells_block_x * ncells_block_y;
-        cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size, &block_hist_size, sizeof(block_hist_size)) );
+        cudaSafeCall(cudaMemcpyToSymbolAsync(cblock_hist_size, &block_hist_size, sizeof(block_hist_size), 0, cudaMemcpyHostToDevice, stream));

         int block_hist_size_2up = power_2up(block_hist_size);
-        cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size_2up, &block_hist_size_2up, sizeof(block_hist_size_2up)) );
+        cudaSafeCall(cudaMemcpyToSymbolAsync(cblock_hist_size_2up, &block_hist_size_2up, sizeof(block_hist_size_2up), 0, cudaMemcpyHostToDevice, stream));

         int descr_width = nblocks_win_x * block_hist_size;
-        cudaSafeCall( cudaMemcpyToSymbol(cdescr_width, &descr_width, sizeof(descr_width)) );
+        cudaSafeCall(cudaMemcpyToSymbolAsync(cdescr_width, &descr_width, sizeof(descr_width), 0, cudaMemcpyHostToDevice, stream));

         int descr_size = descr_width * nblocks_win_y;
-        cudaSafeCall( cudaMemcpyToSymbol(cdescr_size, &descr_size, sizeof(descr_size)) );
+        cudaSafeCall(cudaMemcpyToSymbolAsync(cdescr_size, &descr_size, sizeof(descr_size), 0, cudaMemcpyHostToDevice, stream));
     }

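Note on the pattern above: each blocking cudaMemcpyToSymbol() becomes cudaMemcpyToSymbolAsync() with an explicit offset (0), an explicit direction (cudaMemcpyHostToDevice), and the caller's stream, so uploading the HOG constants no longer implies a device-wide sync. Since the sources here are pageable stack variables, the runtime stages them before returning, so the locals may safely go out of scope before the transfer completes. A minimal, self-contained sketch of the same idiom; the symbol, kernel, and names below are illustrative, not part of this patch:

    #include <cstdio>
    #include <cuda_runtime.h>

    __constant__ int c_nbins;                       // stand-in for cnbins

    __global__ void read_nbins(int* out) { *out = c_nbins; }

    int main()
    {
        cudaStream_t stream;
        cudaStreamCreate(&stream);

        int nbins = 9;                              // pageable host source
        // offset 0, explicit direction, enqueued on the caller's stream
        cudaMemcpyToSymbolAsync(c_nbins, &nbins, sizeof(nbins), 0,
                                cudaMemcpyHostToDevice, stream);

        int* d_out = nullptr;
        cudaMalloc(&d_out, sizeof(int));
        read_nbins<<<1, 1, 0, stream>>>(d_out);     // ordered after the copy

        int out = 0;
        cudaMemcpyAsync(&out, d_out, sizeof(out), cudaMemcpyDeviceToHost, stream);
        cudaStreamSynchronize(stream);              // wait once, at the end
        printf("c_nbins = %d\n", out);

        cudaFree(d_out);
        cudaStreamDestroy(stream);
        return 0;
    }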
@@ -230,10 +233,15 @@ namespace cv { namespace cuda { namespace device
     }

     // declaration of variables and invoke the kernel with the calculated number of blocks
-    void compute_hists(int nbins, int block_stride_x, int block_stride_y,
-                       int height, int width, const PtrStepSzf& grad,
-                       const PtrStepSzb& qangle, float sigma, float* block_hists,
-                       int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y)
+    void compute_hists(int nbins,
+                       int block_stride_x, int block_stride_y,
+                       int height, int width,
+                       const PtrStepSzf& grad, const PtrStepSzb& qangle,
+                       float sigma,
+                       float* block_hists,
+                       int cell_size_x, int cell_size_y,
+                       int ncells_block_x, int ncells_block_y,
+                       const cudaStream_t& stream)
     {
         const int ncells_block = ncells_block_x * ncells_block_y;
         const int patch_side = cell_size_x / 4;
@@ -259,20 +267,15 @@ namespace cv { namespace cuda { namespace device
         int final_hists_size = (nbins * ncells_block * nblocks) * sizeof(float);
         int smem = hists_size + final_hists_size;
         if (nblocks == 4)
-            compute_hists_kernel_many_blocks<4><<<grid, threads, smem>>>(
-                img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
+            compute_hists_kernel_many_blocks<4><<<grid, threads, smem, stream>>>(img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
         else if (nblocks == 3)
-            compute_hists_kernel_many_blocks<3><<<grid, threads, smem>>>(
-                img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
+            compute_hists_kernel_many_blocks<3><<<grid, threads, smem, stream>>>(img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
         else if (nblocks == 2)
-            compute_hists_kernel_many_blocks<2><<<grid, threads, smem>>>(
-                img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
+            compute_hists_kernel_many_blocks<2><<<grid, threads, smem, stream>>>(img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
         else
-            compute_hists_kernel_many_blocks<1><<<grid, threads, smem>>>(
-                img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
-        cudaSafeCall( cudaGetLastError() );
+            compute_hists_kernel_many_blocks<1><<<grid, threads, smem, stream>>>(img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);

-        cudaSafeCall( cudaDeviceSynchronize() );
+        cudaSafeCall( cudaGetLastError() );
     }

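The launch sites change in the same spirit: the stream becomes the fourth launch-configuration argument (<<<grid, threads, smem, stream>>>), and the trailing cudaDeviceSynchronize() is dropped. Note that cudaGetLastError() only reports launch-time failures (bad configuration, missing kernel image) and does not wait, so errors raised during kernel execution now surface at the caller's next synchronization point. A compact sketch of the dispatch-plus-check pattern, with an illustrative kernel in place of compute_hists_kernel_many_blocks:

    #include <cstdio>
    #include <cuda_runtime.h>

    template <int NBLOCKS>
    __global__ void work_kernel(float* data, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            data[i] = static_cast<float>(NBLOCKS);  // stand-in for real work
    }

    void launch_work(int nblocks, float* d_data, int n, size_t smem, cudaStream_t stream)
    {
        dim3 grid((n + 255) / 256), threads(256);
        if (nblocks == 4)
            work_kernel<4><<<grid, threads, smem, stream>>>(d_data, n);
        else if (nblocks == 3)
            work_kernel<3><<<grid, threads, smem, stream>>>(d_data, n);
        else if (nblocks == 2)
            work_kernel<2><<<grid, threads, smem, stream>>>(d_data, n);
        else
            work_kernel<1><<<grid, threads, smem, stream>>>(d_data, n);

        // Non-blocking check: launch errors only; execution errors appear
        // at the next cudaStreamSynchronize()/copy on this stream.
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess)
            fprintf(stderr, "launch failed: %s\n", cudaGetErrorString(err));
    }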
@@ -347,8 +350,14 @@ namespace cv { namespace cuda { namespace device
     }


-    void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
-                         int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y)
+    void normalize_hists(int nbins,
+                         int block_stride_x, int block_stride_y,
+                         int height, int width,
+                         float* block_hists,
+                         float threshold,
+                         int cell_size_x, int cell_size_y,
+                         int ncells_block_x, int ncells_block_y,
+                         const cudaStream_t& stream)
     {
         const int nblocks = 1;

@@ -361,21 +370,19 @@ namespace cv { namespace cuda { namespace device
         dim3 grid(divUp(img_block_width, nblocks), img_block_height);

         if (nthreads == 32)
-            normalize_hists_kernel_many_blocks<32, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
+            normalize_hists_kernel_many_blocks<32, nblocks><<<grid, threads, 0, stream>>>(block_hist_size, img_block_width, block_hists, threshold);
         else if (nthreads == 64)
-            normalize_hists_kernel_many_blocks<64, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
+            normalize_hists_kernel_many_blocks<64, nblocks><<<grid, threads, 0, stream>>>(block_hist_size, img_block_width, block_hists, threshold);
         else if (nthreads == 128)
-            normalize_hists_kernel_many_blocks<128, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
+            normalize_hists_kernel_many_blocks<128, nblocks><<<grid, threads, 0, stream>>>(block_hist_size, img_block_width, block_hists, threshold);
         else if (nthreads == 256)
-            normalize_hists_kernel_many_blocks<256, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
+            normalize_hists_kernel_many_blocks<256, nblocks><<<grid, threads, 0, stream>>>(block_hist_size, img_block_width, block_hists, threshold);
         else if (nthreads == 512)
-            normalize_hists_kernel_many_blocks<512, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
+            normalize_hists_kernel_many_blocks<512, nblocks><<<grid, threads, 0, stream>>>(block_hist_size, img_block_width, block_hists, threshold);
         else
             CV_Error(cv::Error::StsBadArg, "normalize_hists: histogram's size is too big, try to decrease number of bins");

         cudaSafeCall( cudaGetLastError() );
-
-        cudaSafeCall( cudaDeviceSynchronize() );
     }

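For reference, divUp above is OpenCV's integer ceiling-division helper, used to size grids so that a partial tile still gets a block. A standalone equivalent:

    // Ceiling division: smallest g such that g * grain >= total.
    static inline int divUp(int total, int grain)
    {
        return (total + grain - 1) / grain;
    }

    // e.g. img_block_width = 97 and nblocks = 1 give dim3 grid(97, img_block_height);
    // with nblocks = 4 it would be divUp(97, 4) = 25 blocks per grid row.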
@@ -511,8 +518,10 @@ namespace cv { namespace cuda { namespace device


     template <int nthreads>
-    __global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
-                                                  const float* block_hists, PtrStepf descriptors)
+    __global__ void extract_descrs_by_rows_kernel(const int img_block_width,
+                                                  const int win_block_stride_x, const int win_block_stride_y,
+                                                  const float* block_hists,
+                                                  PtrStepf descriptors)
     {
         // Get left top corner of the window in src
         const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
@@ -531,8 +540,14 @@ namespace cv { namespace cuda { namespace device
     }


-    void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,
-                                int height, int width, float* block_hists, int cell_size_x, int ncells_block_x, PtrStepSzf descriptors)
+    void extract_descrs_by_rows(int win_height, int win_width,
+                                int block_stride_y, int block_stride_x,
+                                int win_stride_y, int win_stride_x,
+                                int height, int width,
+                                float* block_hists, int cell_size_x,
+                                int ncells_block_x,
+                                PtrStepSzf descriptors,
+                                const cudaStream_t& stream)
     {
         const int nthreads = 256;

@@ -544,17 +559,16 @@ namespace cv { namespace cuda { namespace device
         dim3 grid(img_win_width, img_win_height);

         int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x;
-        extract_descrs_by_rows_kernel<nthreads><<<grid, threads>>>(
-            img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
-        cudaSafeCall( cudaGetLastError() );
+        extract_descrs_by_rows_kernel<nthreads><<<grid, threads, 0, stream>>>(img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);

-        cudaSafeCall( cudaDeviceSynchronize() );
+        cudaSafeCall( cudaGetLastError() );
     }


     template <int nthreads>
-    __global__ void extract_descrs_by_cols_kernel(const int img_block_width, const int win_block_stride_x,
-                                                  const int win_block_stride_y, const float* block_hists,
+    __global__ void extract_descrs_by_cols_kernel(const int img_block_width,
+                                                  const int win_block_stride_x, const int win_block_stride_y,
+                                                  const float* block_hists,
                                                   PtrStepf descriptors)
     {
         // Get left top corner of the window in src
@@ -579,9 +593,14 @@ namespace cv { namespace cuda { namespace device
     }


-    void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
-                                int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x,
-                                PtrStepSzf descriptors)
+    void extract_descrs_by_cols(int win_height, int win_width,
+                                int block_stride_y, int block_stride_x,
+                                int win_stride_y, int win_stride_x,
+                                int height, int width,
+                                float* block_hists,
+                                int cell_size_x, int ncells_block_x,
+                                PtrStepSzf descriptors,
+                                const cudaStream_t& stream)
     {
         const int nthreads = 256;

@@ -593,11 +612,9 @@ namespace cv { namespace cuda { namespace device
         dim3 grid(img_win_width, img_win_height);

         int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x;
-        extract_descrs_by_cols_kernel<nthreads><<<grid, threads>>>(
-            img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
-        cudaSafeCall( cudaGetLastError() );
+        extract_descrs_by_cols_kernel<nthreads><<<grid, threads, 0, stream>>>(img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);

-        cudaSafeCall( cudaDeviceSynchronize() );
+        cudaSafeCall( cudaGetLastError() );
     }

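Both descriptor extractors now follow the same recipe: enqueue on the caller's stream, check cudaGetLastError(), and leave synchronization to the caller. That is what makes the whole pipeline chainable on one stream with a single sync point. A hypothetical caller (function names from this patch, argument lists elided):

    cudaStream_t s;
    cudaSafeCall(cudaStreamCreate(&s));

    // In-stream ordering replaces the per-call cudaDeviceSynchronize()
    // of the old code:
    //   set_up_constants(..., s);
    //   compute_gradients_8UC1(..., s);
    //   compute_hists(..., s);
    //   normalize_hists(..., s);
    //   extract_descrs_by_rows(..., s);

    cudaSafeCall(cudaStreamSynchronize(s));  // one sync, when results are needed
    cudaSafeCall(cudaStreamDestroy(s));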
     // ----------------------------------------------------------------------------
@@ -707,8 +724,12 @@ namespace cv { namespace cuda { namespace device
     }


-    void compute_gradients_8UC4(int nbins, int height, int width, const PtrStepSzb& img,
-                                float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma)
+    void compute_gradients_8UC4(int nbins,
+                                int height, int width, const PtrStepSzb& img,
+                                float angle_scale,
+                                PtrStepSzf grad, PtrStepSzb qangle,
+                                bool correct_gamma,
+                                const cudaStream_t& stream)
     {
         (void)nbins;
         const int nthreads = 256;
@@ -717,13 +738,11 @@ namespace cv { namespace cuda { namespace device
         dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));

         if (correct_gamma)
-            compute_gradients_8UC4_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
+            compute_gradients_8UC4_kernel<nthreads, 1><<<gdim, bdim, 0, stream>>>(height, width, img, angle_scale, grad, qangle);
         else
-            compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
+            compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim, 0, stream>>>(height, width, img, angle_scale, grad, qangle);

         cudaSafeCall( cudaGetLastError() );
-
-        cudaSafeCall( cudaDeviceSynchronize() );
     }

     template <int nthreads, int correct_gamma>
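An aside on the <nthreads, 1> / <nthreads, 0> instantiations above: passing correct_gamma as a template argument makes the gamma branch a compile-time constant, so each instantiation runs without a per-pixel runtime branch. An illustrative reduction of the idiom (not the actual gradient kernel):

    #include <cuda_runtime.h>

    template <int CORRECT_GAMMA>
    __global__ void grad_kernel(const unsigned char* img, float* grad, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i >= n)
            return;
        float v = img[i] / 255.0f;
        if (CORRECT_GAMMA)       // resolved at compile time per instantiation
            v = sqrtf(v);        // gamma correction as a square root
        grad[i] = v;
    }

    // Host side picks the instantiation once per call:
    //   if (correct_gamma) grad_kernel<1><<<gdim, bdim, 0, stream>>>(img, grad, n);
    //   else               grad_kernel<0><<<gdim, bdim, 0, stream>>>(img, grad, n);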
@@ -780,8 +799,12 @@ namespace cv { namespace cuda { namespace device
     }


-    void compute_gradients_8UC1(int nbins, int height, int width, const PtrStepSzb& img,
-                                float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma)
+    void compute_gradients_8UC1(int nbins,
+                                int height, int width, const PtrStepSzb& img,
+                                float angle_scale,
+                                PtrStepSzf grad, PtrStepSzb qangle,
+                                bool correct_gamma,
+                                const cudaStream_t& stream)
     {
         (void)nbins;
         const int nthreads = 256;
@@ -790,13 +813,11 @@ namespace cv { namespace cuda { namespace device
         dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));

         if (correct_gamma)
-            compute_gradients_8UC1_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
+            compute_gradients_8UC1_kernel<nthreads, 1><<<gdim, bdim, 0, stream>>>(height, width, img, angle_scale, grad, qangle);
         else
-            compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
+            compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim, 0, stream>>>(height, width, img, angle_scale, grad, qangle);

         cudaSafeCall( cudaGetLastError() );
-
-        cudaSafeCall( cudaDeviceSynchronize() );
     }

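At the public-API layer, the raw cudaStream_t these device functions now take would come from a cv::cuda::Stream via the StreamAccessor header. A hypothetical host-side wrapper (the accessor is real OpenCV API; the wrapper and call sites are illustrative):

    #include <opencv2/core/cuda.hpp>
    #include <opencv2/core/cuda_stream_accessor.hpp>

    void hog_on_stream(cv::cuda::Stream& stream)
    {
        // Extract the underlying cudaStream_t; Stream::Null() maps to the
        // default stream.
        cudaStream_t s = cv::cuda::StreamAccessor::getStream(stream);
        //   hog::compute_gradients_8UC1(..., s);   // call sites elided
        (void)s;
    }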